diff --git a/accelerators/nvidia.go b/accelerators/nvidia.go index b0c3c0c5..9f688ac1 100644 --- a/accelerators/nvidia.go +++ b/accelerators/nvidia.go @@ -16,6 +16,7 @@ package accelerators import ( "bufio" "fmt" + "github.com/google/cadvisor/stats" "io/ioutil" "os" "path/filepath" @@ -30,7 +31,7 @@ import ( "k8s.io/klog" ) -type NvidiaManager struct { +type nvidiaManager struct { sync.Mutex // true if there are NVIDIA devices present on the node @@ -47,8 +48,12 @@ var sysFsPCIDevicesPath = "/sys/bus/pci/devices/" const nvidiaVendorId = "0x10de" +func NewNvidiaManager() stats.Manager { + return &nvidiaManager{} +} + // Setup initializes NVML if nvidia devices are present on the node. -func (nm *NvidiaManager) Setup() { +func (nm *nvidiaManager) Setup() { if !detectDevices(nvidiaVendorId) { klog.V(4).Info("No NVIDIA devices found.") return @@ -84,7 +89,7 @@ func detectDevices(vendorId string) bool { // initializeNVML initializes the NVML library and sets up the nvmlDevices map. // This is defined as a variable to help in testing. -var initializeNVML = func(nm *NvidiaManager) { +var initializeNVML = func(nm *nvidiaManager) { if err := gonvml.Initialize(); err != nil { // This is under a logging level because otherwise we may cause // log spam if the drivers/nvml is not installed on the system. @@ -115,7 +120,7 @@ var initializeNVML = func(nm *NvidiaManager) { } // Destroy shuts down NVML. -func (nm *NvidiaManager) Destroy() { +func (nm *nvidiaManager) Destroy() { if nm.nvmlInitialized { gonvml.Shutdown() } @@ -123,7 +128,7 @@ func (nm *NvidiaManager) Destroy() { // GetCollector returns a collector that can fetch nvidia gpu metrics for nvidia devices // present in the devices.list file in the given devicesCgroupPath. 
-func (nm *NvidiaManager) GetCollector(devicesCgroupPath string) (AcceleratorCollector, error) { +func (nm *nvidiaManager) GetCollector(devicesCgroupPath string) (stats.Collector, error) { nc := &NvidiaCollector{} if !nm.devicesPresent { diff --git a/accelerators/nvidia_test.go b/accelerators/nvidia_test.go index b7e7c4d6..92f4f1af 100644 --- a/accelerators/nvidia_test.go +++ b/accelerators/nvidia_test.go @@ -72,13 +72,13 @@ func TestGetCollector(t *testing.T) { } parseDevicesCgroup = mockParser originalInitializeNVML := initializeNVML - initializeNVML = func(_ *NvidiaManager) {} + initializeNVML = func(_ *nvidiaManager) {} defer func() { parseDevicesCgroup = originalParser initializeNVML = originalInitializeNVML }() - nm := &NvidiaManager{} + nm := &nvidiaManager{} // When devicesPresent is false, empty collector should be returned. ac, err := nm.GetCollector("does-not-matter") diff --git a/manager/container.go b/manager/container.go index c0439293..749c1fc9 100644 --- a/manager/container.go +++ b/manager/container.go @@ -17,6 +17,7 @@ package manager import ( "flag" "fmt" + "github.com/google/cadvisor/stats" "io/ioutil" "math" "math/rand" @@ -29,7 +30,6 @@ import ( "sync" "time" - "github.com/google/cadvisor/accelerators" "github.com/google/cadvisor/cache/memory" "github.com/google/cadvisor/collector" "github.com/google/cadvisor/container" @@ -90,7 +90,7 @@ type containerData struct { collectorManager collector.CollectorManager // nvidiaCollector updates stats for Nvidia GPUs attached to the container. 
- nvidiaCollector accelerators.AcceleratorCollector + nvidiaCollector stats.Collector } // jitter returns a time.Duration between duration and duration + maxFactor * duration, diff --git a/manager/manager.go b/manager/manager.go index 4855cd33..d9925a4c 100644 --- a/manager/manager.go +++ b/manager/manager.go @@ -18,6 +18,7 @@ package manager import ( "flag" "fmt" + "github.com/google/cadvisor/stats" "net/http" "os" "path" @@ -181,7 +182,7 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, maxHousekeepingIn containerWatchers: []watcher.ContainerWatcher{}, eventsChannel: eventsChannel, collectorHttpClient: collectorHttpClient, - nvidiaManager: &accelerators.NvidiaManager{}, + nvidiaManager: accelerators.NewNvidiaManager(), rawContainerCgroupPathPrefixWhiteList: rawContainerCgroupPathPrefixWhiteList, } @@ -230,7 +231,7 @@ type manager struct { containerWatchers []watcher.ContainerWatcher eventsChannel chan watcher.ContainerEvent collectorHttpClient *http.Client - nvidiaManager accelerators.AcceleratorManager + nvidiaManager stats.Manager // List of raw container cgroup path prefix whitelist. rawContainerCgroupPathPrefixWhiteList []string } diff --git a/accelerators/types.go b/stats/types.go similarity index 63% rename from accelerators/types.go rename to stats/types.go index d577953a..8e497633 100644 --- a/accelerators/types.go +++ b/stats/types.go @@ -1,4 +1,4 @@ -// Copyright 2017 Google Inc. All Rights Reserved. +// Copyright 2020 Google Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -11,22 +11,25 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
-package accelerators + +// Package stats handles statistics that are fully controlled in cAdvisor. +package stats import info "github.com/google/cadvisor/info/v1" -// This is supposed to store global state about an accelerator metrics collector. -// cadvisor manager will call Setup() when it starts and Destroy() when it stops. -// For each container detected by the cadvisor manager, it will call +// This is supposed to store global state about a cAdvisor metrics collector. +// cAdvisor manager will call Setup() when it starts and Destroy() when it stops. +// For each container detected by the cAdvisor manager, it will call // GetCollector() with the devices cgroup path for that container. // GetCollector() is supposed to return an object that can update // accelerator stats for that container. -type AcceleratorManager interface { +type Manager interface { Setup() Destroy() - GetCollector(deviceCgroup string) (AcceleratorCollector, error) + GetCollector(deviceCgroup string) (Collector, error) } -type AcceleratorCollector interface { +// Collector can update ContainerStats by adding more metrics. +type Collector interface { UpdateStats(*info.ContainerStats) error }