Return a NoopManager if the metric set does not contain the accelerator metrics

This is because the NVIDIA manager opens a handle on NVML as soon as
NewNvidiaManager is called.

This is problematic in a Kubernetes setting, where the kubelet then holds
an open handle on the NVIDIA driver, preventing the driver from being
updated unless the kubelet is restarted.

Additionally with the new metrics pipeline in Kubernetes, metrics are
now expected to be collected through a container rather than through the
kubelet itself.

Signed-off-by: Renaud Gaubert <rgaubert@nvidia.com>
This commit is contained in:
Renaud Gaubert 2020-06-04 23:24:47 +00:00
parent 366d59d3b6
commit 170bae8a5d
3 changed files with 10 additions and 3 deletions

View File

@ -24,6 +24,7 @@ import (
"sync"
"time"
"github.com/google/cadvisor/container"
info "github.com/google/cadvisor/info/v1"
"github.com/google/cadvisor/stats"
@ -48,7 +49,12 @@ var sysFsPCIDevicesPath = "/sys/bus/pci/devices/"
const nvidiaVendorID = "0x10de"
func NewNvidiaManager() stats.Manager {
func NewNvidiaManager(includedMetrics container.MetricSet) stats.Manager {
if !includedMetrics.Has(container.AcceleratorUsageMetrics) {
klog.V(2).Info("NVIDIA GPU metrics disabled")
return &stats.NoopManager{}
}
manager := &nvidiaManager{}
err := manager.setup()
if err != nil {

View File

@ -93,6 +93,7 @@ var (
// List of metrics that can be ignored.
ignoreWhitelist = container.MetricSet{
container.AcceleratorUsageMetrics: struct{}{},
container.DiskUsageMetrics: struct{}{},
container.DiskIOMetrics: struct{}{},
container.NetworkUsageMetrics: struct{}{},
@ -136,7 +137,7 @@ func (ml *metricSetValue) Set(value string) error {
}
func init() {
flag.Var(&ignoreMetrics, "disable_metrics", "comma-separated list of `metrics` to be disabled. Options are 'cpu_topology','disk', 'diskIO', 'network', 'tcp', 'udp', 'percpu', 'sched', 'process', 'hugetlb', 'referenced_memory'.")
flag.Var(&ignoreMetrics, "disable_metrics", "comma-separated list of `metrics` to be disabled. Options are 'accelerator', 'cpu_topology','disk', 'diskIO', 'network', 'tcp', 'udp', 'percpu', 'sched', 'process', 'hugetlb', 'referenced_memory'.")
// Default logging verbosity to V(2)
flag.Set("v", "2")

View File

@ -197,7 +197,7 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, houskeepingConfig
containerWatchers: []watcher.ContainerWatcher{},
eventsChannel: eventsChannel,
collectorHTTPClient: collectorHTTPClient,
nvidiaManager: accelerators.NewNvidiaManager(),
nvidiaManager: accelerators.NewNvidiaManager(includedMetricsSet),
rawContainerCgroupPathPrefixWhiteList: rawContainerCgroupPathPrefixWhiteList,
}