Avoid race in accessing nvidiaDevices between Setup() and GetCollector()

Rohit Agarwal 2017-11-20 22:19:23 -08:00
parent 7cb3faad02
commit 3c3845e92f
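Setup() first attempts a synchronous NVML initialization and, if that fails (for example because the driver is not loaded yet), keeps retrying from a background goroutine, while GetCollector() can be called concurrently at any time. Before this change, nvmlInitialized was set to true before nvidiaDevices was populated, so a reader could observe the flag while the map was still being filled. Below is a minimal, self-contained sketch of the publish-last-under-lock pattern this commit adopts; manager, initialize, and collector are illustrative stand-ins rather than cAdvisor APIs, and the device values are made up.

package main

import (
	"fmt"
	"sync"
)

// manager mirrors the synchronization-relevant parts of NvidiaManager:
// an embedded RWMutex guards the initialized flag, and the device map
// is shared between a background initializer and concurrent readers.
type manager struct {
	sync.RWMutex
	initialized bool
	devices     map[int]string // stand-in for map[int]gonvml.Device
}

// initialize fills the map first and publishes the flag last, under the
// write lock. A reader that observes initialized == true is therefore
// guaranteed to also observe the fully populated map.
func (m *manager) initialize() {
	m.devices = map[int]string{0: "gpu0", 1: "gpu1"} // hypothetical devices
	m.Lock()
	m.initialized = true
	m.Unlock()
}

// collector checks the flag and the map length under the read lock,
// mirroring the RLock/RUnlock added to GetCollector in this commit.
func (m *manager) collector() []string {
	m.RLock()
	if !m.initialized || len(m.devices) == 0 {
		m.RUnlock()
		return nil
	}
	m.RUnlock()
	// Safe without the lock: in this sketch the map is not written
	// again once the flag has been published.
	names := make([]string, 0, len(m.devices))
	for _, name := range m.devices {
		names = append(names, name)
	}
	return names
}

func main() {
	m := &manager{}
	var wg sync.WaitGroup
	wg.Add(1)
	go func() { // background initialization, as Setup() may do
		defer wg.Done()
		m.initialize()
	}()
	_ = m.collector() // may return nil, but never races with initialize()
	wg.Wait()
	fmt.Println(m.collector()) // e.g. [gpu0 gpu1] (map order is not fixed)
}

Publishing the flag last while holding the write lock means that a reader which sees the flag under RLock also has a happens-before relationship with the writes that populated the map, so the map can be read safely afterwards.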

@@ -21,6 +21,7 @@ import (
 	"path/filepath"
 	"strconv"
 	"strings"
+	"sync"
 	"time"
 
 	info "github.com/google/cadvisor/info/v1"
@@ -30,6 +31,8 @@ import (
 )
 
 type NvidiaManager struct {
+	sync.RWMutex
+
 	// true if the NVML library (libnvidia-ml.so.1) was loaded successfully
 	nvmlInitialized bool
@@ -48,12 +51,12 @@ func (nm *NvidiaManager) Setup() {
 		return
 	}
 
-	go func() {
-		glog.Info("Starting goroutine to initialize NVML")
 	nm.initializeNVML()
 	if nm.nvmlInitialized {
 		return
 	}
+	go func() {
+		glog.Info("Starting goroutine to initialize NVML")
 		// TODO: use globalHousekeepingInterval
 		for range time.Tick(time.Minute) {
 			nm.initializeNVML()
@@ -95,10 +98,13 @@ func (nm *NvidiaManager) initializeNVML() {
 		glog.V(3).Infof("Could not initialize NVML: %v", err)
 		return
 	}
-	nm.nvmlInitialized = true
 	numDevices, err := gonvml.DeviceCount()
 	if err != nil {
 		glog.Warningf("GPU metrics would not be available. Failed to get the number of nvidia devices: %v", err)
+		nm.Lock()
+		// Even though we won't have GPU metrics, the library was initialized and should be shutdown when exiting.
+		nm.nvmlInitialized = true
+		nm.Unlock()
 		return
 	}
 	glog.Infof("NVML initialized. Number of nvidia devices: %v", numDevices)
@@ -116,6 +122,10 @@ func (nm *NvidiaManager) initializeNVML() {
 		}
 		nm.nvidiaDevices[int(minorNumber)] = device
 	}
+	nm.Lock()
+	// Doing this at the end to avoid race in accessing nvidiaDevices in GetCollector.
+	nm.nvmlInitialized = true
+	nm.Unlock()
 }
 
 // Destroy shuts down NVML.
@@ -129,9 +139,12 @@ func (nm *NvidiaManager) Destroy() {
 // present in the devices.list file in the given devicesCgroupPath.
 func (nm *NvidiaManager) GetCollector(devicesCgroupPath string) (AcceleratorCollector, error) {
 	nc := &NvidiaCollector{}
+	nm.RLock()
 	if !nm.nvmlInitialized || len(nm.nvidiaDevices) == 0 {
+		nm.RUnlock()
 		return nc, nil
 	}
+	nm.RUnlock()
 	nvidiaMinorNumbers, err := parseDevicesCgroup(devicesCgroupPath)
 	if err != nil {
 		return nc, err