From adf41ba206c4e1a4e478de5de28274faa1f5d29e Mon Sep 17 00:00:00 2001 From: "Maciej \"Iwan\" Iwanowski" Date: Tue, 17 Mar 2020 09:49:50 +0100 Subject: [PATCH 1/3] Moving Nvidia interfaces to stats package so that they can be used outside of the accelerators package Signed-off-by: Maciej "Iwan" Iwanowski --- accelerators/nvidia.go | 15 ++++++++++----- accelerators/nvidia_test.go | 4 ++-- manager/container.go | 4 ++-- manager/manager.go | 5 +++-- {accelerators => stats}/types.go | 19 +++++++++++-------- 5 files changed, 28 insertions(+), 19 deletions(-) rename {accelerators => stats}/types.go (63%) diff --git a/accelerators/nvidia.go b/accelerators/nvidia.go index b0c3c0c5..9f688ac1 100644 --- a/accelerators/nvidia.go +++ b/accelerators/nvidia.go @@ -16,6 +16,7 @@ package accelerators import ( "bufio" "fmt" + "github.com/google/cadvisor/stats" "io/ioutil" "os" "path/filepath" @@ -30,7 +31,7 @@ import ( "k8s.io/klog" ) -type NvidiaManager struct { +type nvidiaManager struct { sync.Mutex // true if there are NVIDIA devices present on the node @@ -47,8 +48,12 @@ var sysFsPCIDevicesPath = "/sys/bus/pci/devices/" const nvidiaVendorId = "0x10de" +func NewNvidiaManager() stats.Manager { + return &nvidiaManager{} +} + // Setup initializes NVML if nvidia devices are present on the node. -func (nm *NvidiaManager) Setup() { +func (nm *nvidiaManager) Setup() { if !detectDevices(nvidiaVendorId) { klog.V(4).Info("No NVIDIA devices found.") return @@ -84,7 +89,7 @@ func detectDevices(vendorId string) bool { // initializeNVML initializes the NVML library and sets up the nvmlDevices map. // This is defined as a variable to help in testing. -var initializeNVML = func(nm *NvidiaManager) { +var initializeNVML = func(nm *nvidiaManager) { if err := gonvml.Initialize(); err != nil { // This is under a logging level because otherwise we may cause // log spam if the drivers/nvml is not installed on the system. @@ -115,7 +120,7 @@ var initializeNVML = func(nm *NvidiaManager) { } // Destroy shuts down NVML. -func (nm *NvidiaManager) Destroy() { +func (nm *nvidiaManager) Destroy() { if nm.nvmlInitialized { gonvml.Shutdown() } @@ -123,7 +128,7 @@ func (nm *NvidiaManager) Destroy() { // GetCollector returns a collector that can fetch nvidia gpu metrics for nvidia devices // present in the devices.list file in the given devicesCgroupPath. -func (nm *NvidiaManager) GetCollector(devicesCgroupPath string) (AcceleratorCollector, error) { +func (nm *nvidiaManager) GetCollector(devicesCgroupPath string) (stats.Collector, error) { nc := &NvidiaCollector{} if !nm.devicesPresent { diff --git a/accelerators/nvidia_test.go b/accelerators/nvidia_test.go index b7e7c4d6..92f4f1af 100644 --- a/accelerators/nvidia_test.go +++ b/accelerators/nvidia_test.go @@ -72,13 +72,13 @@ func TestGetCollector(t *testing.T) { } parseDevicesCgroup = mockParser originalInitializeNVML := initializeNVML - initializeNVML = func(_ *NvidiaManager) {} + initializeNVML = func(_ *nvidiaManager) {} defer func() { parseDevicesCgroup = originalParser initializeNVML = originalInitializeNVML }() - nm := &NvidiaManager{} + nm := &nvidiaManager{} // When devicesPresent is false, empty collector should be returned. ac, err := nm.GetCollector("does-not-matter") diff --git a/manager/container.go b/manager/container.go index c0439293..749c1fc9 100644 --- a/manager/container.go +++ b/manager/container.go @@ -17,6 +17,7 @@ package manager import ( "flag" "fmt" + "github.com/google/cadvisor/stats" "io/ioutil" "math" "math/rand" @@ -29,7 +30,6 @@ import ( "sync" "time" - "github.com/google/cadvisor/accelerators" "github.com/google/cadvisor/cache/memory" "github.com/google/cadvisor/collector" "github.com/google/cadvisor/container" @@ -90,7 +90,7 @@ type containerData struct { collectorManager collector.CollectorManager // nvidiaCollector updates stats for Nvidia GPUs attached to the container. - nvidiaCollector accelerators.AcceleratorCollector + nvidiaCollector stats.Collector } // jitter returns a time.Duration between duration and duration + maxFactor * duration, diff --git a/manager/manager.go b/manager/manager.go index 4855cd33..d9925a4c 100644 --- a/manager/manager.go +++ b/manager/manager.go @@ -18,6 +18,7 @@ package manager import ( "flag" "fmt" + "github.com/google/cadvisor/stats" "net/http" "os" "path" @@ -181,7 +182,7 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, maxHousekeepingIn containerWatchers: []watcher.ContainerWatcher{}, eventsChannel: eventsChannel, collectorHttpClient: collectorHttpClient, - nvidiaManager: &accelerators.NvidiaManager{}, + nvidiaManager: accelerators.NewNvidiaManager(), rawContainerCgroupPathPrefixWhiteList: rawContainerCgroupPathPrefixWhiteList, } @@ -230,7 +231,7 @@ type manager struct { containerWatchers []watcher.ContainerWatcher eventsChannel chan watcher.ContainerEvent collectorHttpClient *http.Client - nvidiaManager accelerators.AcceleratorManager + nvidiaManager stats.Manager // List of raw container cgroup path prefix whitelist. rawContainerCgroupPathPrefixWhiteList []string } diff --git a/accelerators/types.go b/stats/types.go similarity index 63% rename from accelerators/types.go rename to stats/types.go index d577953a..8e497633 100644 --- a/accelerators/types.go +++ b/stats/types.go @@ -1,4 +1,4 @@ -// Copyright 2017 Google Inc. All Rights Reserved. +// Copyright 2020 Google Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -11,22 +11,25 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -package accelerators + +// Handling statistics that are fully controlled in cAdvisor +package stats import info "github.com/google/cadvisor/info/v1" -// This is supposed to store global state about an accelerator metrics collector. -// cadvisor manager will call Setup() when it starts and Destroy() when it stops. -// For each container detected by the cadvisor manager, it will call +// This is supposed to store global state about an cAdvisor metrics collector. +// cAdvisor manager will call Setup() when it starts and Destroy() when it stops. +// For each container detected by the cAdvisor manager, it will call // GetCollector() with the devices cgroup path for that container. // GetCollector() is supposed to return an object that can update // accelerator stats for that container. -type AcceleratorManager interface { +type Manager interface { Setup() Destroy() - GetCollector(deviceCgroup string) (AcceleratorCollector, error) + GetCollector(deviceCgroup string) (Collector, error) } -type AcceleratorCollector interface { +// Collector can update ContainerStats by adding more metrics. +type Collector interface { UpdateStats(*info.ContainerStats) error } From 89f15cb83315596c062a5231c66b4c1913ff0dd6 Mon Sep 17 00:00:00 2001 From: "Maciej \"Iwan\" Iwanowski" Date: Tue, 17 Mar 2020 12:10:57 +0100 Subject: [PATCH 2/3] Not exposing Nvidia collector outside accelerators package Signed-off-by: Maciej "Iwan" Iwanowski --- accelerators/nvidia.go | 16 ++++++++++------ accelerators/nvidia_test.go | 20 ++++++++++---------- manager/container_test.go | 4 ++-- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/accelerators/nvidia.go b/accelerators/nvidia.go index 9f688ac1..b3c0d139 100644 --- a/accelerators/nvidia.go +++ b/accelerators/nvidia.go @@ -129,7 +129,7 @@ func (nm *nvidiaManager) Destroy() { // GetCollector returns a collector that can fetch nvidia gpu metrics for nvidia devices // present in the devices.list file in the given devicesCgroupPath. func (nm *nvidiaManager) GetCollector(devicesCgroupPath string) (stats.Collector, error) { - nc := &NvidiaCollector{} + nc := &nvidiaCollector{} if !nm.devicesPresent { return nc, nil @@ -154,7 +154,7 @@ func (nm *nvidiaManager) GetCollector(devicesCgroupPath string) (stats.Collector if !ok { return nc, fmt.Errorf("nvidia device minor number %d not found in cached devices", minor) } - nc.Devices = append(nc.Devices, device) + nc.devices = append(nc.devices, device) } return nc, nil } @@ -213,14 +213,18 @@ var parseDevicesCgroup = func(devicesCgroupPath string) ([]int, error) { return nvidiaMinorNumbers, nil } -type NvidiaCollector struct { +type nvidiaCollector struct { // Exposed for testing - Devices []gonvml.Device + devices []gonvml.Device +} + +func NewNvidiaCollector(devices []gonvml.Device) stats.Collector { + return &nvidiaCollector{devices: devices} } // UpdateStats updates the stats for NVIDIA GPUs (if any) attached to the container. -func (nc *NvidiaCollector) UpdateStats(stats *info.ContainerStats) error { - for _, device := range nc.Devices { +func (nc *nvidiaCollector) UpdateStats(stats *info.ContainerStats) error { + for _, device := range nc.devices { model, err := device.Name() if err != nil { return fmt.Errorf("error while getting gpu name: %v", err) diff --git a/accelerators/nvidia_test.go b/accelerators/nvidia_test.go index 92f4f1af..5250c5e4 100644 --- a/accelerators/nvidia_test.go +++ b/accelerators/nvidia_test.go @@ -84,27 +84,27 @@ func TestGetCollector(t *testing.T) { ac, err := nm.GetCollector("does-not-matter") assert.Nil(t, err) assert.NotNil(t, ac) - nc, ok := ac.(*NvidiaCollector) + nc, ok := ac.(*nvidiaCollector) assert.True(t, ok) - assert.Equal(t, 0, len(nc.Devices)) + assert.Equal(t, 0, len(nc.devices)) // When nvmlInitialized is false, empty collector should be returned. nm.devicesPresent = true ac, err = nm.GetCollector("does-not-matter") assert.Nil(t, err) assert.NotNil(t, ac) - nc, ok = ac.(*NvidiaCollector) + nc, ok = ac.(*nvidiaCollector) assert.True(t, ok) - assert.Equal(t, 0, len(nc.Devices)) + assert.Equal(t, 0, len(nc.devices)) // When nvidiaDevices is empty, empty collector should be returned. nm.nvmlInitialized = true ac, err = nm.GetCollector("does-not-matter") assert.Nil(t, err) assert.NotNil(t, ac) - nc, ok = ac.(*NvidiaCollector) + nc, ok = ac.(*nvidiaCollector) assert.True(t, ok) - assert.Equal(t, 0, len(nc.Devices)) + assert.Equal(t, 0, len(nc.devices)) // nvidiaDevices contains devices but they are different than what // is returned by parseDevicesCgroup. We should get an error. @@ -112,9 +112,9 @@ func TestGetCollector(t *testing.T) { ac, err = nm.GetCollector("does-not-matter") assert.NotNil(t, err) assert.NotNil(t, ac) - nc, ok = ac.(*NvidiaCollector) + nc, ok = ac.(*nvidiaCollector) assert.True(t, ok) - assert.Equal(t, 0, len(nc.Devices)) + assert.Equal(t, 0, len(nc.devices)) // nvidiaDevices contains devices returned by parseDevicesCgroup. // No error should be returned and collectors devices array should be @@ -124,9 +124,9 @@ func TestGetCollector(t *testing.T) { ac, err = nm.GetCollector("does-not-matter") assert.Nil(t, err) assert.NotNil(t, ac) - nc, ok = ac.(*NvidiaCollector) + nc, ok = ac.(*nvidiaCollector) assert.True(t, ok) - assert.Equal(t, 2, len(nc.Devices)) + assert.Equal(t, 2, len(nc.devices)) } func TestParseDevicesCgroup(t *testing.T) { diff --git a/manager/container_test.go b/manager/container_test.go index 85793f05..a3b5d113 100644 --- a/manager/container_test.go +++ b/manager/container_test.go @@ -217,7 +217,7 @@ func TestUpdateNvidiaStats(t *testing.T) { stats := info.ContainerStats{} // When there are no devices, we should not get an error and stats should not change. - cd.nvidiaCollector = &accelerators.NvidiaCollector{} + cd.nvidiaCollector = accelerators.NewNvidiaCollector([]gonvml.Device{}) err := cd.nvidiaCollector.UpdateStats(&stats) assert.Nil(t, err) assert.Equal(t, info.ContainerStats{}, stats) @@ -225,7 +225,7 @@ func TestUpdateNvidiaStats(t *testing.T) { // This is an impossible situation (there are devices but nvml is not initialized). // Here I am testing that the CGo gonvml library doesn't panic when passed bad // input and instead returns an error. - cd.nvidiaCollector = &accelerators.NvidiaCollector{Devices: []gonvml.Device{{}, {}}} + cd.nvidiaCollector = accelerators.NewNvidiaCollector([]gonvml.Device{{}, {}}) err = cd.nvidiaCollector.UpdateStats(&stats) assert.NotNil(t, err) assert.Equal(t, info.ContainerStats{}, stats) From 8de9dfb4b6b937617d811608add3b6447741ff05 Mon Sep 17 00:00:00 2001 From: "Maciej \"Iwan\" Iwanowski" Date: Tue, 17 Mar 2020 19:42:29 +0100 Subject: [PATCH 3/3] Imports should be grouped Signed-off-by: Maciej "Iwan" Iwanowski --- accelerators/nvidia.go | 2 +- manager/container.go | 2 +- manager/manager.go | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/accelerators/nvidia.go b/accelerators/nvidia.go index b3c0d139..f3857022 100644 --- a/accelerators/nvidia.go +++ b/accelerators/nvidia.go @@ -16,7 +16,6 @@ package accelerators import ( "bufio" "fmt" - "github.com/google/cadvisor/stats" "io/ioutil" "os" "path/filepath" @@ -26,6 +25,7 @@ import ( "time" info "github.com/google/cadvisor/info/v1" + "github.com/google/cadvisor/stats" "github.com/mindprince/gonvml" "k8s.io/klog" diff --git a/manager/container.go b/manager/container.go index 749c1fc9..c249b09f 100644 --- a/manager/container.go +++ b/manager/container.go @@ -17,7 +17,6 @@ package manager import ( "flag" "fmt" - "github.com/google/cadvisor/stats" "io/ioutil" "math" "math/rand" @@ -35,6 +34,7 @@ import ( "github.com/google/cadvisor/container" info "github.com/google/cadvisor/info/v1" "github.com/google/cadvisor/info/v2" + "github.com/google/cadvisor/stats" "github.com/google/cadvisor/summary" "github.com/google/cadvisor/utils/cpuload" diff --git a/manager/manager.go b/manager/manager.go index d9925a4c..9e62f5c7 100644 --- a/manager/manager.go +++ b/manager/manager.go @@ -18,7 +18,6 @@ package manager import ( "flag" "fmt" - "github.com/google/cadvisor/stats" "net/http" "os" "path" @@ -38,6 +37,7 @@ import ( info "github.com/google/cadvisor/info/v1" "github.com/google/cadvisor/info/v2" "github.com/google/cadvisor/machine" + "github.com/google/cadvisor/stats" "github.com/google/cadvisor/utils/oomparser" "github.com/google/cadvisor/utils/sysfs" "github.com/google/cadvisor/version"