Merge pull request #2432 from iwankgb/stats_interfaces
Moving Nvidia interfaces
commit 59bc6fe2c4
@@ -25,12 +25,13 @@ import (
 	"time"
 
 	info "github.com/google/cadvisor/info/v1"
+	"github.com/google/cadvisor/stats"
 
 	"github.com/mindprince/gonvml"
 	"k8s.io/klog"
 )
 
-type NvidiaManager struct {
+type nvidiaManager struct {
 	sync.Mutex
 
 	// true if there are NVIDIA devices present on the node
@@ -47,8 +48,12 @@ var sysFsPCIDevicesPath = "/sys/bus/pci/devices/"
 
 const nvidiaVendorId = "0x10de"
 
+func NewNvidiaManager() stats.Manager {
+	return &nvidiaManager{}
+}
+
 // Setup initializes NVML if nvidia devices are present on the node.
-func (nm *NvidiaManager) Setup() {
+func (nm *nvidiaManager) Setup() {
 	if !detectDevices(nvidiaVendorId) {
 		klog.V(4).Info("No NVIDIA devices found.")
 		return
@@ -84,7 +89,7 @@ func detectDevices(vendorId string) bool {
 
 // initializeNVML initializes the NVML library and sets up the nvmlDevices map.
 // This is defined as a variable to help in testing.
-var initializeNVML = func(nm *NvidiaManager) {
+var initializeNVML = func(nm *nvidiaManager) {
 	if err := gonvml.Initialize(); err != nil {
 		// This is under a logging level because otherwise we may cause
 		// log spam if the drivers/nvml is not installed on the system.
@@ -115,7 +120,7 @@ var initializeNVML = func(nm *NvidiaManager) {
 }
 
 // Destroy shuts down NVML.
-func (nm *NvidiaManager) Destroy() {
+func (nm *nvidiaManager) Destroy() {
 	if nm.nvmlInitialized {
 		gonvml.Shutdown()
 	}
@@ -123,8 +128,8 @@ func (nm *NvidiaManager) Destroy() {
 
 // GetCollector returns a collector that can fetch nvidia gpu metrics for nvidia devices
 // present in the devices.list file in the given devicesCgroupPath.
-func (nm *NvidiaManager) GetCollector(devicesCgroupPath string) (AcceleratorCollector, error) {
-	nc := &NvidiaCollector{}
+func (nm *nvidiaManager) GetCollector(devicesCgroupPath string) (stats.Collector, error) {
+	nc := &nvidiaCollector{}
 
 	if !nm.devicesPresent {
 		return nc, nil
@@ -149,7 +154,7 @@ func (nm *NvidiaManager) GetCollector(devicesCgroupPath string) (AcceleratorCollector, error) {
 		if !ok {
 			return nc, fmt.Errorf("nvidia device minor number %d not found in cached devices", minor)
 		}
-		nc.Devices = append(nc.Devices, device)
+		nc.devices = append(nc.devices, device)
 	}
 	return nc, nil
 }
@@ -208,14 +213,18 @@ var parseDevicesCgroup = func(devicesCgroupPath string) ([]int, error) {
 	return nvidiaMinorNumbers, nil
 }
 
-type NvidiaCollector struct {
+type nvidiaCollector struct {
 	// Exposed for testing
-	Devices []gonvml.Device
+	devices []gonvml.Device
 }
 
+func NewNvidiaCollector(devices []gonvml.Device) stats.Collector {
+	return &nvidiaCollector{devices: devices}
+}
+
 // UpdateStats updates the stats for NVIDIA GPUs (if any) attached to the container.
-func (nc *NvidiaCollector) UpdateStats(stats *info.ContainerStats) error {
-	for _, device := range nc.Devices {
+func (nc *nvidiaCollector) UpdateStats(stats *info.ContainerStats) error {
+	for _, device := range nc.devices {
 		model, err := device.Name()
 		if err != nil {
 			return fmt.Errorf("error while getting gpu name: %v", err)
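The hunks above rename the concrete Nvidia types to unexported identifiers and add constructors, so code outside the accelerators package now interacts with them only through the new stats interfaces. A minimal sketch of that calling pattern, assuming the cadvisor packages from this commit are available (the devices cgroup path and the error handling here are illustrative, not taken from the diff):

package main

import (
	"fmt"

	"github.com/google/cadvisor/accelerators"
	info "github.com/google/cadvisor/info/v1"
	"github.com/google/cadvisor/stats"
)

func main() {
	// NewNvidiaManager hides the unexported nvidiaManager behind stats.Manager.
	var nm stats.Manager = accelerators.NewNvidiaManager()
	nm.Setup()
	defer nm.Destroy()

	// The devices cgroup path is illustrative; cAdvisor derives it per container.
	c, err := nm.GetCollector("/sys/fs/cgroup/devices/docker/example")
	if err != nil {
		fmt.Println("could not build a GPU collector:", err)
		return
	}

	containerStats := &info.ContainerStats{}
	if err := c.UpdateStats(containerStats); err != nil {
		fmt.Println("could not update accelerator stats:", err)
		return
	}
	fmt.Println("accelerator stats updated")
}

NewNvidiaCollector plays the same role on the collector side: the updated tests later in the diff build collectors from a gonvml.Device slice through the constructor instead of setting the previously exported Devices field.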
@@ -72,39 +72,39 @@ func TestGetCollector(t *testing.T) {
 	}
 	parseDevicesCgroup = mockParser
 	originalInitializeNVML := initializeNVML
-	initializeNVML = func(_ *NvidiaManager) {}
+	initializeNVML = func(_ *nvidiaManager) {}
 	defer func() {
 		parseDevicesCgroup = originalParser
 		initializeNVML = originalInitializeNVML
 	}()
 
-	nm := &NvidiaManager{}
+	nm := &nvidiaManager{}
 
 	// When devicesPresent is false, empty collector should be returned.
 	ac, err := nm.GetCollector("does-not-matter")
 	assert.Nil(t, err)
 	assert.NotNil(t, ac)
-	nc, ok := ac.(*NvidiaCollector)
+	nc, ok := ac.(*nvidiaCollector)
 	assert.True(t, ok)
-	assert.Equal(t, 0, len(nc.Devices))
+	assert.Equal(t, 0, len(nc.devices))
 
 	// When nvmlInitialized is false, empty collector should be returned.
 	nm.devicesPresent = true
 	ac, err = nm.GetCollector("does-not-matter")
 	assert.Nil(t, err)
 	assert.NotNil(t, ac)
-	nc, ok = ac.(*NvidiaCollector)
+	nc, ok = ac.(*nvidiaCollector)
 	assert.True(t, ok)
-	assert.Equal(t, 0, len(nc.Devices))
+	assert.Equal(t, 0, len(nc.devices))
 
 	// When nvidiaDevices is empty, empty collector should be returned.
 	nm.nvmlInitialized = true
 	ac, err = nm.GetCollector("does-not-matter")
 	assert.Nil(t, err)
 	assert.NotNil(t, ac)
-	nc, ok = ac.(*NvidiaCollector)
+	nc, ok = ac.(*nvidiaCollector)
 	assert.True(t, ok)
-	assert.Equal(t, 0, len(nc.Devices))
+	assert.Equal(t, 0, len(nc.devices))
 
 	// nvidiaDevices contains devices but they are different than what
 	// is returned by parseDevicesCgroup. We should get an error.
@@ -112,9 +112,9 @@ func TestGetCollector(t *testing.T) {
 	ac, err = nm.GetCollector("does-not-matter")
 	assert.NotNil(t, err)
 	assert.NotNil(t, ac)
-	nc, ok = ac.(*NvidiaCollector)
+	nc, ok = ac.(*nvidiaCollector)
 	assert.True(t, ok)
-	assert.Equal(t, 0, len(nc.Devices))
+	assert.Equal(t, 0, len(nc.devices))
 
 	// nvidiaDevices contains devices returned by parseDevicesCgroup.
 	// No error should be returned and collectors devices array should be
@@ -124,9 +124,9 @@ func TestGetCollector(t *testing.T) {
 	ac, err = nm.GetCollector("does-not-matter")
 	assert.Nil(t, err)
 	assert.NotNil(t, ac)
-	nc, ok = ac.(*NvidiaCollector)
+	nc, ok = ac.(*nvidiaCollector)
 	assert.True(t, ok)
-	assert.Equal(t, 2, len(nc.Devices))
+	assert.Equal(t, 2, len(nc.devices))
 }
 
 func TestParseDevicesCgroup(t *testing.T) {
@@ -29,12 +29,12 @@ import (
 	"sync"
 	"time"
 
-	"github.com/google/cadvisor/accelerators"
 	"github.com/google/cadvisor/cache/memory"
 	"github.com/google/cadvisor/collector"
 	"github.com/google/cadvisor/container"
 	info "github.com/google/cadvisor/info/v1"
 	"github.com/google/cadvisor/info/v2"
+	"github.com/google/cadvisor/stats"
 	"github.com/google/cadvisor/summary"
 	"github.com/google/cadvisor/utils/cpuload"
 
@@ -90,7 +90,7 @@ type containerData struct {
 	collectorManager collector.CollectorManager
 
 	// nvidiaCollector updates stats for Nvidia GPUs attached to the container.
-	nvidiaCollector accelerators.AcceleratorCollector
+	nvidiaCollector stats.Collector
 }
 
 // jitter returns a time.Duration between duration and duration + maxFactor * duration,
@@ -217,7 +217,7 @@ func TestUpdateNvidiaStats(t *testing.T) {
 	stats := info.ContainerStats{}
 
 	// When there are no devices, we should not get an error and stats should not change.
-	cd.nvidiaCollector = &accelerators.NvidiaCollector{}
+	cd.nvidiaCollector = accelerators.NewNvidiaCollector([]gonvml.Device{})
 	err := cd.nvidiaCollector.UpdateStats(&stats)
 	assert.Nil(t, err)
 	assert.Equal(t, info.ContainerStats{}, stats)
@@ -225,7 +225,7 @@ func TestUpdateNvidiaStats(t *testing.T) {
 	// This is an impossible situation (there are devices but nvml is not initialized).
 	// Here I am testing that the CGo gonvml library doesn't panic when passed bad
 	// input and instead returns an error.
-	cd.nvidiaCollector = &accelerators.NvidiaCollector{Devices: []gonvml.Device{{}, {}}}
+	cd.nvidiaCollector = accelerators.NewNvidiaCollector([]gonvml.Device{{}, {}})
 	err = cd.nvidiaCollector.UpdateStats(&stats)
 	assert.NotNil(t, err)
 	assert.Equal(t, info.ContainerStats{}, stats)
@@ -37,6 +37,7 @@ import (
 	info "github.com/google/cadvisor/info/v1"
 	"github.com/google/cadvisor/info/v2"
 	"github.com/google/cadvisor/machine"
+	"github.com/google/cadvisor/stats"
 	"github.com/google/cadvisor/utils/oomparser"
 	"github.com/google/cadvisor/utils/sysfs"
 	"github.com/google/cadvisor/version"
@@ -181,7 +182,7 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, maxHousekeepingIn
 		containerWatchers: []watcher.ContainerWatcher{},
 		eventsChannel: eventsChannel,
 		collectorHttpClient: collectorHttpClient,
-		nvidiaManager: &accelerators.NvidiaManager{},
+		nvidiaManager: accelerators.NewNvidiaManager(),
 		rawContainerCgroupPathPrefixWhiteList: rawContainerCgroupPathPrefixWhiteList,
 	}
 
@@ -230,7 +231,7 @@ type manager struct {
 	containerWatchers []watcher.ContainerWatcher
 	eventsChannel chan watcher.ContainerEvent
 	collectorHttpClient *http.Client
-	nvidiaManager accelerators.AcceleratorManager
+	nvidiaManager stats.Manager
 	// List of raw container cgroup path prefix whitelist.
 	rawContainerCgroupPathPrefixWhiteList []string
 }
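With the manager field typed as stats.Manager and initialized through accelerators.NewNvidiaManager(), the rest of the manager depends only on the interface contract: Setup() once at startup, GetCollector() per detected container, Destroy() at shutdown. The sketch below is not the actual cAdvisor manager code, only an illustration of that wiring; the gpuTracker type and its method names are hypothetical.

package main

import (
	"log"

	"github.com/google/cadvisor/accelerators"
	info "github.com/google/cadvisor/info/v1"
	"github.com/google/cadvisor/stats"
)

// gpuTracker is a hypothetical stand-in for the parts of the cAdvisor manager
// that deal with accelerator metrics.
type gpuTracker struct {
	manager    stats.Manager
	collectors map[string]stats.Collector // one collector per container name
}

func newGPUTracker(m stats.Manager) *gpuTracker {
	m.Setup() // called once when the manager starts
	return &gpuTracker{manager: m, collectors: map[string]stats.Collector{}}
}

// addContainer asks the manager for a collector bound to the container's
// devices cgroup.
func (t *gpuTracker) addContainer(name, devicesCgroupPath string) error {
	c, err := t.manager.GetCollector(devicesCgroupPath)
	if err != nil {
		return err
	}
	t.collectors[name] = c
	return nil
}

// housekeeping lets the collector append accelerator metrics to the stats.
func (t *gpuTracker) housekeeping(name string, s *info.ContainerStats) error {
	c, ok := t.collectors[name]
	if !ok {
		return nil
	}
	return c.UpdateStats(s)
}

func (t *gpuTracker) stop() {
	t.manager.Destroy() // called once when the manager stops
}

func main() {
	t := newGPUTracker(accelerators.NewNvidiaManager())
	defer t.stop()

	if err := t.addContainer("example", "/sys/fs/cgroup/devices/docker/example"); err != nil {
		log.Println("no accelerator collector for container:", err)
	}
	if err := t.housekeeping("example", &info.ContainerStats{}); err != nil {
		log.Println("accelerator stats unavailable:", err)
	}
}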
@@ -1,4 +1,4 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
+// Copyright 2020 Google Inc. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -11,22 +11,25 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-package accelerators
+
+// Handling statistics that are fully controlled in cAdvisor
+package stats
 
 import info "github.com/google/cadvisor/info/v1"
 
-// This is supposed to store global state about an accelerator metrics collector.
-// cadvisor manager will call Setup() when it starts and Destroy() when it stops.
-// For each container detected by the cadvisor manager, it will call
+// This is supposed to store global state about an cAdvisor metrics collector.
+// cAdvisor manager will call Setup() when it starts and Destroy() when it stops.
+// For each container detected by the cAdvisor manager, it will call
 // GetCollector() with the devices cgroup path for that container.
 // GetCollector() is supposed to return an object that can update
 // accelerator stats for that container.
-type AcceleratorManager interface {
+type Manager interface {
 	Setup()
 	Destroy()
-	GetCollector(deviceCgroup string) (AcceleratorCollector, error)
+	GetCollector(deviceCgroup string) (Collector, error)
 }
 
-type AcceleratorCollector interface {
+// Collector can update ContainerStats by adding more metrics.
+type Collector interface {
 	UpdateStats(*info.ContainerStats) error
 }
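The interface comments above spell out the contract that any backend has to satisfy; the Nvidia manager in this commit is one implementation of it. As an illustration only (none of this is part of the diff), a hypothetical no-op backend satisfying stats.Manager and stats.Collector could look like this:

package main

import (
	"fmt"

	info "github.com/google/cadvisor/info/v1"
	"github.com/google/cadvisor/stats"
)

// noopManager and noopCollector are hypothetical types used only to show the
// shape of the interfaces; they track no devices and add no metrics.
type noopManager struct{}
type noopCollector struct{}

func (noopManager) Setup()   {}
func (noopManager) Destroy() {}

func (noopManager) GetCollector(deviceCgroup string) (stats.Collector, error) {
	// A real backend would inspect the devices cgroup here.
	return noopCollector{}, nil
}

func (noopCollector) UpdateStats(*info.ContainerStats) error { return nil }

// Compile-time checks that both types satisfy the moved interfaces.
var (
	_ stats.Manager   = noopManager{}
	_ stats.Collector = noopCollector{}
)

func main() {
	var m stats.Manager = noopManager{}
	m.Setup()
	defer m.Destroy()

	c, _ := m.GetCollector("/sys/fs/cgroup/devices/example")
	fmt.Println("update error:", c.UpdateStats(&info.ContainerStats{}))
}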