Return a NoopManager if the metric set does not contain the accelerator value
This is because the NVIDIA manager opens a handle on NVML when NewNvidiaManager is called. This is problematic in a Kubernetes setting, where the kubelet then holds an open handle on the NVIDIA driver, preventing an update of the NVIDIA driver unless the kubelet is restarted. Additionally, with the new metrics pipeline in Kubernetes, metrics are now expected to be collected through a container rather than through the kubelet itself. Signed-off-by: Renaud Gaubert <rgaubert@nvidia.com>
This commit is contained in:
parent
366d59d3b6
commit
170bae8a5d
@ -24,6 +24,7 @@ import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/google/cadvisor/container"
|
||||
info "github.com/google/cadvisor/info/v1"
|
||||
"github.com/google/cadvisor/stats"
|
||||
|
||||
@ -48,7 +49,12 @@ var sysFsPCIDevicesPath = "/sys/bus/pci/devices/"
|
||||
|
||||
const nvidiaVendorID = "0x10de"
|
||||
|
||||
func NewNvidiaManager() stats.Manager {
|
||||
func NewNvidiaManager(includedMetrics container.MetricSet) stats.Manager {
|
||||
if !includedMetrics.Has(container.AcceleratorUsageMetrics) {
|
||||
klog.V(2).Info("NVIDIA GPU metrics disabled")
|
||||
return &stats.NoopManager{}
|
||||
}
|
||||
|
||||
manager := &nvidiaManager{}
|
||||
err := manager.setup()
|
||||
if err != nil {
|
||||
|
@ -93,6 +93,7 @@ var (
|
||||
|
||||
// List of metrics that can be ignored.
|
||||
ignoreWhitelist = container.MetricSet{
|
||||
container.AcceleratorUsageMetrics: struct{}{},
|
||||
container.DiskUsageMetrics: struct{}{},
|
||||
container.DiskIOMetrics: struct{}{},
|
||||
container.NetworkUsageMetrics: struct{}{},
|
||||
@ -136,7 +137,7 @@ func (ml *metricSetValue) Set(value string) error {
|
||||
}
|
||||
|
||||
func init() {
|
||||
flag.Var(&ignoreMetrics, "disable_metrics", "comma-separated list of `metrics` to be disabled. Options are 'cpu_topology','disk', 'diskIO', 'network', 'tcp', 'udp', 'percpu', 'sched', 'process', 'hugetlb', 'referenced_memory'.")
|
||||
flag.Var(&ignoreMetrics, "disable_metrics", "comma-separated list of `metrics` to be disabled. Options are 'accelerator', 'cpu_topology','disk', 'diskIO', 'network', 'tcp', 'udp', 'percpu', 'sched', 'process', 'hugetlb', 'referenced_memory'.")
|
||||
|
||||
// Default logging verbosity to V(2)
|
||||
flag.Set("v", "2")
|
||||
|
@ -197,7 +197,7 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, houskeepingConfig
|
||||
containerWatchers: []watcher.ContainerWatcher{},
|
||||
eventsChannel: eventsChannel,
|
||||
collectorHTTPClient: collectorHTTPClient,
|
||||
nvidiaManager: accelerators.NewNvidiaManager(),
|
||||
nvidiaManager: accelerators.NewNvidiaManager(includedMetricsSet),
|
||||
rawContainerCgroupPathPrefixWhiteList: rawContainerCgroupPathPrefixWhiteList,
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user