Initialize NVML on demand.

Previously, if the NVIDIA driver was not installed when cAdvisor started,
we would start a goroutine that tried to initialize NVML every minute.

This resulted in a race. We could end up in a situation where:
- the goroutine tries to initialize NVML but fails, so it sleeps for a minute.
- the driver is installed.
- a container that uses NVIDIA devices is started.
This container would not get GPU stats because a minute has not yet passed
since the last failed initialization attempt, and so NVML is still not
initialized.
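
The fix in the diff below drops the retry goroutine: Setup() records that NVIDIA devices are present and makes one initialization attempt, and if that attempt fails (driver not yet installed), GetCollector() retries under a mutex whenever GPU stats are requested. The following is a minimal, self-contained sketch of that on-demand pattern; manager, initNVML, and getCollector are simplified stand-ins for the cAdvisor code shown in the diff, not its actual API.

package main

import (
	"fmt"
	"sync"
)

// manager is a simplified stand-in for cAdvisor's NvidiaManager.
type manager struct {
	sync.Mutex
	devicesPresent  bool // set at startup if NVIDIA devices exist on the node
	nvmlInitialized bool // set once NVML has been loaded successfully
}

// initNVML stands in for the real NVML initialization. It is a package-level
// variable so tests can swap it out, and it may keep failing until the
// driver is installed.
var initNVML = func(m *manager) error {
	// Pretend the driver is available now.
	m.nvmlInitialized = true
	return nil
}

// getCollector initializes NVML on demand: the first call that arrives after
// the driver is installed performs the initialization, so a container started
// right after the driver appears still gets GPU stats.
func (m *manager) getCollector() string {
	if !m.devicesPresent {
		return "empty collector: no NVIDIA devices on the node"
	}
	// The mutex makes sure initNVML is not called concurrently and is only
	// called while NVML is still uninitialized.
	m.Lock()
	defer m.Unlock()
	if !m.nvmlInitialized {
		if err := initNVML(m); err != nil {
			return "empty collector: NVML not ready yet"
		}
	}
	return "collector backed by NVML"
}

func main() {
	m := &manager{devicesPresent: true}
	fmt.Println(m.getCollector())
}

Because devicesPresent is checked before anything else, nodes without NVIDIA hardware never take the lock or touch NVML at all.
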
Author: Rohit Agarwal, 2018-06-15 17:30:41 -07:00
Parent: f834c0f4c1
Commit: 2ce41611a0
2 changed files with 36 additions and 29 deletions


@@ -31,7 +31,10 @@ import (
 )
 
 type NvidiaManager struct {
-	sync.RWMutex
+	sync.Mutex
+
+	// true if there are NVIDIA devices present on the node
+	devicesPresent bool
 
 	// true if the NVML library (libnvidia-ml.so.1) was loaded successfully
 	nvmlInitialized bool
@@ -51,20 +54,9 @@ func (nm *NvidiaManager) Setup() {
 		return
 	}
 
-	nm.initializeNVML()
-	if nm.nvmlInitialized {
-		return
-	}
-	go func() {
-		glog.V(2).Info("Starting goroutine to initialize NVML")
-		// TODO: use globalHousekeepingInterval
-		for range time.Tick(time.Minute) {
-			nm.initializeNVML()
-			if nm.nvmlInitialized {
-				return
-			}
-		}
-	}()
+	nm.devicesPresent = true
+
+	initializeNVML(nm)
 }
 
 // detectDevices returns true if a device with given pci id is present on the node.
@@ -91,20 +83,18 @@ func detectDevices(vendorId string) bool {
 }
 
 // initializeNVML initializes the NVML library and sets up the nvmlDevices map.
-func (nm *NvidiaManager) initializeNVML() {
+// This is defined as a variable to help in testing.
+var initializeNVML = func(nm *NvidiaManager) {
 	if err := gonvml.Initialize(); err != nil {
 		// This is under a logging level because otherwise we may cause
 		// log spam if the drivers/nvml is not installed on the system.
 		glog.V(4).Infof("Could not initialize NVML: %v", err)
 		return
 	}
+	nm.nvmlInitialized = true
 	numDevices, err := gonvml.DeviceCount()
 	if err != nil {
 		glog.Warningf("GPU metrics would not be available. Failed to get the number of nvidia devices: %v", err)
-		nm.Lock()
-		// Even though we won't have GPU metrics, the library was initialized and should be shutdown when exiting.
-		nm.nvmlInitialized = true
-		nm.Unlock()
 		return
 	}
 	glog.V(1).Infof("NVML initialized. Number of nvidia devices: %v", numDevices)
@@ -122,10 +112,6 @@ func (nm *NvidiaManager) initializeNVML() {
 		}
 		nm.nvidiaDevices[int(minorNumber)] = device
 	}
-	nm.Lock()
-	// Doing this at the end to avoid race in accessing nvidiaDevices in GetCollector.
-	nm.nvmlInitialized = true
-	nm.Unlock()
 }
 
 // Destroy shuts down NVML.
@@ -139,12 +125,21 @@ func (nm *NvidiaManager) Destroy() {
 // present in the devices.list file in the given devicesCgroupPath.
 func (nm *NvidiaManager) GetCollector(devicesCgroupPath string) (AcceleratorCollector, error) {
 	nc := &NvidiaCollector{}
-	nm.RLock()
-	if !nm.nvmlInitialized || len(nm.nvidiaDevices) == 0 {
-		nm.RUnlock()
+
+	if !nm.devicesPresent {
 		return nc, nil
 	}
-	nm.RUnlock()
+	// Makes sure that we don't call initializeNVML() concurrently and
+	// that we only call initializeNVML() when it's not initialized.
+	nm.Lock()
+	if !nm.nvmlInitialized {
+		initializeNVML(nm)
+	}
+	if !nm.nvmlInitialized || len(nm.nvidiaDevices) == 0 {
+		nm.Unlock()
+		return nc, nil
+	}
+	nm.Unlock()
 	nvidiaMinorNumbers, err := parseDevicesCgroup(devicesCgroupPath)
 	if err != nil {
 		return nc, err


@@ -71,13 +71,16 @@ func TestGetCollector(t *testing.T) {
 		return []int{2, 3}, nil
 	}
 	parseDevicesCgroup = mockParser
+	originalInitializeNVML := initializeNVML
+	initializeNVML = func(_ *NvidiaManager) {}
 	defer func() {
 		parseDevicesCgroup = originalParser
+		initializeNVML = originalInitializeNVML
 	}()
 
 	nm := &NvidiaManager{}
 
-	// When nvmlInitialized is false, empty collector should be returned.
+	// When devicesPresent is false, empty collector should be returned.
 	ac, err := nm.GetCollector("does-not-matter")
 	assert.Nil(t, err)
 	assert.NotNil(t, ac)
@@ -85,6 +88,15 @@ func TestGetCollector(t *testing.T) {
 	assert.True(t, ok)
 	assert.Equal(t, 0, len(nc.Devices))
 
+	// When nvmlInitialized is false, empty collector should be returned.
+	nm.devicesPresent = true
+	ac, err = nm.GetCollector("does-not-matter")
+	assert.Nil(t, err)
+	assert.NotNil(t, ac)
+	nc, ok = ac.(*NvidiaCollector)
+	assert.True(t, ok)
+	assert.Equal(t, 0, len(nc.Devices))
+
 	// When nvidiaDevices is empty, empty collector should be returned.
 	nm.nvmlInitialized = true
 	ac, err = nm.GetCollector("does-not-matter")