Merge pull request #2432 from iwankgb/stats_interfaces

Moving Nvidia interfaces: the AcceleratorManager and AcceleratorCollector interfaces move from the accelerators package into a new stats package as Manager and Collector, the NvidiaManager and NvidiaCollector implementations become unexported, and callers obtain them through the NewNvidiaManager and NewNvidiaCollector constructors.
David Ashpole 2020-03-17 12:00:11 -07:00 committed by GitHub
commit 59bc6fe2c4
6 changed files with 50 additions and 37 deletions

View File

@@ -25,12 +25,13 @@ import (
"time"
info "github.com/google/cadvisor/info/v1"
"github.com/google/cadvisor/stats"
"github.com/mindprince/gonvml"
"k8s.io/klog"
)
type NvidiaManager struct {
type nvidiaManager struct {
sync.Mutex
// true if there are NVIDIA devices present on the node
@@ -47,8 +48,12 @@ var sysFsPCIDevicesPath = "/sys/bus/pci/devices/"
const nvidiaVendorId = "0x10de"
func NewNvidiaManager() stats.Manager {
return &nvidiaManager{}
}
// Setup initializes NVML if nvidia devices are present on the node.
func (nm *NvidiaManager) Setup() {
func (nm *nvidiaManager) Setup() {
if !detectDevices(nvidiaVendorId) {
klog.V(4).Info("No NVIDIA devices found.")
return
@@ -84,7 +89,7 @@ func detectDevices(vendorId string) bool {
// initializeNVML initializes the NVML library and sets up the nvmlDevices map.
// This is defined as a variable to help in testing.
var initializeNVML = func(nm *NvidiaManager) {
var initializeNVML = func(nm *nvidiaManager) {
if err := gonvml.Initialize(); err != nil {
// This is under a logging level because otherwise we may cause
// log spam if the drivers/nvml is not installed on the system.
@@ -115,7 +120,7 @@ var initializeNVML = func(nm *NvidiaManager) {
}
// Destroy shuts down NVML.
func (nm *NvidiaManager) Destroy() {
func (nm *nvidiaManager) Destroy() {
if nm.nvmlInitialized {
gonvml.Shutdown()
}
@@ -123,8 +128,8 @@ func (nm *NvidiaManager) Destroy() {
// GetCollector returns a collector that can fetch nvidia gpu metrics for nvidia devices
// present in the devices.list file in the given devicesCgroupPath.
func (nm *NvidiaManager) GetCollector(devicesCgroupPath string) (AcceleratorCollector, error) {
nc := &NvidiaCollector{}
func (nm *nvidiaManager) GetCollector(devicesCgroupPath string) (stats.Collector, error) {
nc := &nvidiaCollector{}
if !nm.devicesPresent {
return nc, nil
@@ -149,7 +154,7 @@ func (nm *NvidiaManager) GetCollector(devicesCgroupPath string) (AcceleratorCollector, error) {
if !ok {
return nc, fmt.Errorf("nvidia device minor number %d not found in cached devices", minor)
}
nc.Devices = append(nc.Devices, device)
nc.devices = append(nc.devices, device)
}
return nc, nil
}
@@ -208,14 +213,18 @@ var parseDevicesCgroup = func(devicesCgroupPath string) ([]int, error) {
return nvidiaMinorNumbers, nil
}
type NvidiaCollector struct {
type nvidiaCollector struct {
// Exposed for testing
Devices []gonvml.Device
devices []gonvml.Device
}
func NewNvidiaCollector(devices []gonvml.Device) stats.Collector {
return &nvidiaCollector{devices: devices}
}
// UpdateStats updates the stats for NVIDIA GPUs (if any) attached to the container.
func (nc *NvidiaCollector) UpdateStats(stats *info.ContainerStats) error {
for _, device := range nc.Devices {
func (nc *nvidiaCollector) UpdateStats(stats *info.ContainerStats) error {
for _, device := range nc.devices {
model, err := device.Name()
if err != nil {
return fmt.Errorf("error while getting gpu name: %v", err)

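With NvidiaManager and NvidiaCollector now unexported, callers go through the NewNvidiaManager constructor and the stats interfaces. A minimal sketch of the intended call sequence, assuming a consumer outside the package (the devices cgroup path below is a made-up example):

package main

import (
	"log"

	"github.com/google/cadvisor/accelerators"
	info "github.com/google/cadvisor/info/v1"
)

func main() {
	// NewNvidiaManager returns a stats.Manager; the concrete nvidiaManager stays unexported.
	nm := accelerators.NewNvidiaManager()
	nm.Setup()
	defer nm.Destroy()

	// One collector per container, keyed by its devices cgroup path (hypothetical path).
	collector, err := nm.GetCollector("/sys/fs/cgroup/devices/docker/example-container")
	if err != nil {
		log.Printf("building nvidia collector: %v", err)
		return
	}

	// Fold GPU metrics for that container into a stats sample.
	containerStats := &info.ContainerStats{}
	if err := collector.UpdateStats(containerStats); err != nil {
		log.Printf("updating gpu stats: %v", err)
	}
}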
View File

@@ -72,39 +72,39 @@ func TestGetCollector(t *testing.T) {
}
parseDevicesCgroup = mockParser
originalInitializeNVML := initializeNVML
initializeNVML = func(_ *NvidiaManager) {}
initializeNVML = func(_ *nvidiaManager) {}
defer func() {
parseDevicesCgroup = originalParser
initializeNVML = originalInitializeNVML
}()
nm := &NvidiaManager{}
nm := &nvidiaManager{}
// When devicesPresent is false, empty collector should be returned.
ac, err := nm.GetCollector("does-not-matter")
assert.Nil(t, err)
assert.NotNil(t, ac)
nc, ok := ac.(*NvidiaCollector)
nc, ok := ac.(*nvidiaCollector)
assert.True(t, ok)
assert.Equal(t, 0, len(nc.Devices))
assert.Equal(t, 0, len(nc.devices))
// When nvmlInitialized is false, empty collector should be returned.
nm.devicesPresent = true
ac, err = nm.GetCollector("does-not-matter")
assert.Nil(t, err)
assert.NotNil(t, ac)
nc, ok = ac.(*NvidiaCollector)
nc, ok = ac.(*nvidiaCollector)
assert.True(t, ok)
assert.Equal(t, 0, len(nc.Devices))
assert.Equal(t, 0, len(nc.devices))
// When nvidiaDevices is empty, empty collector should be returned.
nm.nvmlInitialized = true
ac, err = nm.GetCollector("does-not-matter")
assert.Nil(t, err)
assert.NotNil(t, ac)
nc, ok = ac.(*NvidiaCollector)
nc, ok = ac.(*nvidiaCollector)
assert.True(t, ok)
assert.Equal(t, 0, len(nc.Devices))
assert.Equal(t, 0, len(nc.devices))
// nvidiaDevices contains devices but they are different than what
// is returned by parseDevicesCgroup. We should get an error.
@@ -112,9 +112,9 @@ func TestGetCollector(t *testing.T) {
ac, err = nm.GetCollector("does-not-matter")
assert.NotNil(t, err)
assert.NotNil(t, ac)
nc, ok = ac.(*NvidiaCollector)
nc, ok = ac.(*nvidiaCollector)
assert.True(t, ok)
assert.Equal(t, 0, len(nc.Devices))
assert.Equal(t, 0, len(nc.devices))
// nvidiaDevices contains devices returned by parseDevicesCgroup.
// No error should be returned and the collector's devices array should be
@@ -124,9 +124,9 @@ func TestGetCollector(t *testing.T) {
ac, err = nm.GetCollector("does-not-matter")
assert.Nil(t, err)
assert.NotNil(t, ac)
nc, ok = ac.(*NvidiaCollector)
nc, ok = ac.(*nvidiaCollector)
assert.True(t, ok)
assert.Equal(t, 2, len(nc.Devices))
assert.Equal(t, 2, len(nc.devices))
}
func TestParseDevicesCgroup(t *testing.T) {

View File

@@ -29,12 +29,12 @@ import (
"sync"
"time"
"github.com/google/cadvisor/accelerators"
"github.com/google/cadvisor/cache/memory"
"github.com/google/cadvisor/collector"
"github.com/google/cadvisor/container"
info "github.com/google/cadvisor/info/v1"
"github.com/google/cadvisor/info/v2"
"github.com/google/cadvisor/stats"
"github.com/google/cadvisor/summary"
"github.com/google/cadvisor/utils/cpuload"
@@ -90,7 +90,7 @@ type containerData struct {
collectorManager collector.CollectorManager
// nvidiaCollector updates stats for Nvidia GPUs attached to the container.
nvidiaCollector accelerators.AcceleratorCollector
nvidiaCollector stats.Collector
}
// jitter returns a time.Duration between duration and duration + maxFactor * duration,

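Typing the field as stats.Collector decouples container housekeeping from the accelerators package: the update path only needs the interface. A rough sketch of such a call site, assuming a hypothetical helper (updateGPUStats is an invented name, not the actual cAdvisor method):

package manager

import (
	info "github.com/google/cadvisor/info/v1"
	"github.com/google/cadvisor/stats"
)

// updateGPUStats folds accelerator metrics into a sample through the interface;
// any stats.Collector works, and the concrete Nvidia type is never referenced here.
func updateGPUStats(c stats.Collector, s *info.ContainerStats) error {
	if c == nil {
		return nil
	}
	return c.UpdateStats(s)
}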
View File

@@ -217,7 +217,7 @@ func TestUpdateNvidiaStats(t *testing.T) {
stats := info.ContainerStats{}
// When there are no devices, we should not get an error and stats should not change.
cd.nvidiaCollector = &accelerators.NvidiaCollector{}
cd.nvidiaCollector = accelerators.NewNvidiaCollector([]gonvml.Device{})
err := cd.nvidiaCollector.UpdateStats(&stats)
assert.Nil(t, err)
assert.Equal(t, info.ContainerStats{}, stats)
@@ -225,7 +225,7 @@ func TestUpdateNvidiaStats(t *testing.T) {
// This is an impossible situation (there are devices but nvml is not initialized).
// Here I am testing that the CGo gonvml library doesn't panic when passed bad
// input and instead returns an error.
cd.nvidiaCollector = &accelerators.NvidiaCollector{Devices: []gonvml.Device{{}, {}}}
cd.nvidiaCollector = accelerators.NewNvidiaCollector([]gonvml.Device{{}, {}})
err = cd.nvidiaCollector.UpdateStats(&stats)
assert.NotNil(t, err)
assert.Equal(t, info.ContainerStats{}, stats)

View File

@@ -37,6 +37,7 @@ import (
info "github.com/google/cadvisor/info/v1"
"github.com/google/cadvisor/info/v2"
"github.com/google/cadvisor/machine"
"github.com/google/cadvisor/stats"
"github.com/google/cadvisor/utils/oomparser"
"github.com/google/cadvisor/utils/sysfs"
"github.com/google/cadvisor/version"
@@ -181,7 +182,7 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, maxHousekeepingIn
containerWatchers: []watcher.ContainerWatcher{},
eventsChannel: eventsChannel,
collectorHttpClient: collectorHttpClient,
nvidiaManager: &accelerators.NvidiaManager{},
nvidiaManager: accelerators.NewNvidiaManager(),
rawContainerCgroupPathPrefixWhiteList: rawContainerCgroupPathPrefixWhiteList,
}
@@ -230,7 +231,7 @@ type manager struct {
containerWatchers []watcher.ContainerWatcher
eventsChannel chan watcher.ContainerEvent
collectorHttpClient *http.Client
nvidiaManager accelerators.AcceleratorManager
nvidiaManager stats.Manager
// List of raw container cgroup path prefix whitelist.
rawContainerCgroupPathPrefixWhiteList []string
}

View File

@@ -1,4 +1,4 @@
// Copyright 2017 Google Inc. All Rights Reserved.
// Copyright 2020 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -11,22 +11,25 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package accelerators
// Package stats handles statistics that are fully controlled in cAdvisor
package stats
import info "github.com/google/cadvisor/info/v1"
// This is supposed to store global state about an accelerator metrics collector.
// cadvisor manager will call Setup() when it starts and Destroy() when it stops.
// For each container detected by the cadvisor manager, it will call
// This is supposed to store global state about a cAdvisor metrics collector.
// cAdvisor manager will call Setup() when it starts and Destroy() when it stops.
// For each container detected by the cAdvisor manager, it will call
// GetCollector() with the devices cgroup path for that container.
// GetCollector() is supposed to return an object that can update
// accelerator stats for that container.
type AcceleratorManager interface {
type Manager interface {
Setup()
Destroy()
GetCollector(deviceCgroup string) (AcceleratorCollector, error)
GetCollector(deviceCgroup string) (Collector, error)
}
type AcceleratorCollector interface {
// Collector can update ContainerStats by adding more metrics.
type Collector interface {
UpdateStats(*info.ContainerStats) error
}
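The comments above spell out the lifecycle the cAdvisor manager drives: Setup() once at startup, GetCollector() once per detected container (with that container's devices cgroup path), UpdateStats() on every sample, and Destroy() at shutdown. A hypothetical non-GPU implementation, shown only to illustrate the contract (gpustats, staticManager and staticCollector are invented names for this sketch):

package gpustats

import (
	info "github.com/google/cadvisor/info/v1"
	"github.com/google/cadvisor/stats"
)

type staticManager struct{ ready bool }

// Setup is called once when the cAdvisor manager starts.
func (m *staticManager) Setup() { m.ready = true }

// Destroy is called once when the cAdvisor manager stops.
func (m *staticManager) Destroy() { m.ready = false }

// GetCollector is called for each detected container with its devices cgroup path.
func (m *staticManager) GetCollector(deviceCgroup string) (stats.Collector, error) {
	return &staticCollector{cgroup: deviceCgroup}, nil
}

type staticCollector struct{ cgroup string }

// UpdateStats is where a real implementation would append per-device metrics,
// e.g. entries in s.Accelerators, as the Nvidia collector does.
func (c *staticCollector) UpdateStats(s *info.ContainerStats) error {
	return nil
}

// Compile-time checks that the sketch satisfies the new interfaces.
var (
	_ stats.Manager   = (*staticManager)(nil)
	_ stats.Collector = (*staticCollector)(nil)
)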