Moving Nvidia interfaces to stats package so that they can be used outside of the accelerators package

Signed-off-by: Maciej "Iwan" Iwanowski <maciej.iwanowski@intel.com>
Author: Maciej "Iwan" Iwanowski
Date: 2020-03-17 09:49:50 +01:00
parent 9921cb3e21
commit adf41ba206
GPG Key ID: 2484258A4DD3EE84
5 changed files with 28 additions and 19 deletions

View File

@@ -16,6 +16,7 @@ package accelerators
import (
"bufio"
"fmt"
"github.com/google/cadvisor/stats"
"io/ioutil"
"os"
"path/filepath"
@@ -30,7 +31,7 @@ import (
"k8s.io/klog"
)
-type NvidiaManager struct {
+type nvidiaManager struct {
sync.Mutex
// true if there are NVIDIA devices present on the node
@@ -47,8 +48,12 @@ var sysFsPCIDevicesPath = "/sys/bus/pci/devices/"
const nvidiaVendorId = "0x10de"
+func NewNvidiaManager() stats.Manager {
+return &nvidiaManager{}
+}
// Setup initializes NVML if nvidia devices are present on the node.
-func (nm *NvidiaManager) Setup() {
+func (nm *nvidiaManager) Setup() {
if !detectDevices(nvidiaVendorId) {
klog.V(4).Info("No NVIDIA devices found.")
return
@@ -84,7 +89,7 @@ func detectDevices(vendorId string) bool {
// initializeNVML initializes the NVML library and sets up the nvmlDevices map.
// This is defined as a variable to help in testing.
-var initializeNVML = func(nm *NvidiaManager) {
+var initializeNVML = func(nm *nvidiaManager) {
if err := gonvml.Initialize(); err != nil {
// This is under a logging level because otherwise we may cause
// log spam if the drivers/nvml is not installed on the system.
@@ -115,7 +120,7 @@ var initializeNVML = func(nm *NvidiaManager) {
}
// Destroy shuts down NVML.
-func (nm *NvidiaManager) Destroy() {
+func (nm *nvidiaManager) Destroy() {
if nm.nvmlInitialized {
gonvml.Shutdown()
}
@@ -123,7 +128,7 @@ func (nm *NvidiaManager) Destroy() {
// GetCollector returns a collector that can fetch nvidia gpu metrics for nvidia devices
// present in the devices.list file in the given devicesCgroupPath.
-func (nm *NvidiaManager) GetCollector(devicesCgroupPath string) (AcceleratorCollector, error) {
+func (nm *nvidiaManager) GetCollector(devicesCgroupPath string) (stats.Collector, error) {
nc := &NvidiaCollector{}
if !nm.devicesPresent {

View File

@@ -72,13 +72,13 @@ func TestGetCollector(t *testing.T) {
}
parseDevicesCgroup = mockParser
originalInitializeNVML := initializeNVML
-initializeNVML = func(_ *NvidiaManager) {}
+initializeNVML = func(_ *nvidiaManager) {}
defer func() {
parseDevicesCgroup = originalParser
initializeNVML = originalInitializeNVML
}()
-nm := &NvidiaManager{}
+nm := &nvidiaManager{}
// When devicesPresent is false, empty collector should be returned.
ac, err := nm.GetCollector("does-not-matter")

View File

@@ -17,6 +17,7 @@ package manager
import (
"flag"
"fmt"
"github.com/google/cadvisor/stats"
"io/ioutil"
"math"
"math/rand"
@@ -29,7 +30,6 @@ import (
"sync"
"time"
"github.com/google/cadvisor/accelerators"
"github.com/google/cadvisor/cache/memory"
"github.com/google/cadvisor/collector"
"github.com/google/cadvisor/container"
@@ -90,7 +90,7 @@ type containerData struct {
collectorManager collector.CollectorManager
// nvidiaCollector updates stats for Nvidia GPUs attached to the container.
-nvidiaCollector accelerators.AcceleratorCollector
+nvidiaCollector stats.Collector
}
// jitter returns a time.Duration between duration and duration + maxFactor * duration,

View File

@@ -18,6 +18,7 @@ package manager
import (
"flag"
"fmt"
"github.com/google/cadvisor/stats"
"net/http"
"os"
"path"
@@ -181,7 +182,7 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, maxHousekeepingIn
containerWatchers: []watcher.ContainerWatcher{},
eventsChannel: eventsChannel,
collectorHttpClient: collectorHttpClient,
-nvidiaManager: &accelerators.NvidiaManager{},
+nvidiaManager: accelerators.NewNvidiaManager(),
rawContainerCgroupPathPrefixWhiteList: rawContainerCgroupPathPrefixWhiteList,
}
@@ -230,7 +231,7 @@ type manager struct {
containerWatchers []watcher.ContainerWatcher
eventsChannel chan watcher.ContainerEvent
collectorHttpClient *http.Client
-nvidiaManager accelerators.AcceleratorManager
+nvidiaManager stats.Manager
// List of raw container cgroup path prefix whitelist.
rawContainerCgroupPathPrefixWhiteList []string
}

View File

@@ -1,4 +1,4 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
+// Copyright 2020 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -11,22 +11,25 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
-package accelerators
+// Handling statistics that are fully controlled in cAdvisor
+package stats
import info "github.com/google/cadvisor/info/v1"
-// This is supposed to store global state about an accelerator metrics collector.
-// cadvisor manager will call Setup() when it starts and Destroy() when it stops.
-// For each container detected by the cadvisor manager, it will call
+// This is supposed to store global state about a cAdvisor metrics collector.
+// cAdvisor manager will call Setup() when it starts and Destroy() when it stops.
+// For each container detected by the cAdvisor manager, it will call
// GetCollector() with the devices cgroup path for that container.
// GetCollector() is supposed to return an object that can update
// accelerator stats for that container.
-type AcceleratorManager interface {
+type Manager interface {
Setup()
Destroy()
-GetCollector(deviceCgroup string) (AcceleratorCollector, error)
+GetCollector(deviceCgroup string) (Collector, error)
}
-type AcceleratorCollector interface {
+// Collector can update ContainerStats by adding more metrics.
+type Collector interface {
UpdateStats(*info.ContainerStats) error
}
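
With Manager and Collector now exported from the stats package, device-metrics collectors can be implemented outside of the accelerators package, which is the stated goal of this commit. Below is a minimal sketch of such an out-of-tree implementation against the relocated interfaces; the sampleaccel package and the noopManager/noopCollector names are hypothetical and used only for illustration, not part of this change.

// Hypothetical out-of-tree implementations of stats.Manager and stats.Collector.
package sampleaccel

import (
	info "github.com/google/cadvisor/info/v1"

	"github.com/google/cadvisor/stats"
)

// noopManager satisfies stats.Manager but tracks no devices.
type noopManager struct{}

// NewNoopManager returns a stats.Manager, mirroring accelerators.NewNvidiaManager().
func NewNoopManager() stats.Manager {
	return &noopManager{}
}

// Setup would initialize a vendor library when the cAdvisor manager starts.
func (m *noopManager) Setup() {}

// Destroy would shut the vendor library down when the cAdvisor manager stops.
func (m *noopManager) Destroy() {}

// GetCollector returns a Collector for the container owning the given devices cgroup.
func (m *noopManager) GetCollector(deviceCgroup string) (stats.Collector, error) {
	return &noopCollector{}, nil
}

// noopCollector satisfies stats.Collector but adds no metrics.
type noopCollector struct{}

// UpdateStats would add per-device metrics to the container's stats.
func (c *noopCollector) UpdateStats(_ *info.ContainerStats) error {
	return nil
}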