Avoid race in accessing nvidiaDevices between Setup() and GetCollector()
parent 7cb3faad02
commit 3c3845e92f
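For context on the race being fixed: Setup() starts a goroutine that runs initializeNVML(), which populates the nvidiaDevices map, while GetCollector() may read that map concurrently. The fix publishes the nvmlInitialized flag under a write lock only after the map is fully built, and checks it under a read lock. A minimal, self-contained sketch of that pattern (hypothetical names, not copied from the cAdvisor sources):

package main

import (
	"fmt"
	"sync"
)

// manager mirrors the shape of NvidiaManager: a "ready" flag and a map,
// both guarded by an embedded RWMutex.
type manager struct {
	sync.RWMutex
	initialized bool
	devices     map[int]string // hypothetical stand-in for nvidiaDevices
}

// initialize builds the map first and only then, under the write lock,
// flips the flag, so a reader that observes initialized == true is
// guaranteed to see the fully populated map.
func (m *manager) initialize() {
	m.devices = map[int]string{0: "gpu0", 1: "gpu1"}
	m.Lock()
	m.initialized = true
	m.Unlock()
}

// collector checks the flag under the read lock, as GetCollector now does.
func (m *manager) collector() int {
	m.RLock()
	defer m.RUnlock()
	if !m.initialized {
		return 0
	}
	return len(m.devices)
}

func main() {
	m := &manager{}
	go m.initialize() // as in Setup(), initialization runs in the background
	fmt.Println("devices visible:", m.collector())
}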
@@ -21,6 +21,7 @@ import (
 	"path/filepath"
 	"strconv"
 	"strings"
+	"sync"
 	"time"
 
 	info "github.com/google/cadvisor/info/v1"
@@ -30,6 +31,8 @@ import (
 )
 
 type NvidiaManager struct {
+	sync.RWMutex
+
 	// true if the NVML library (libnvidia-ml.so.1) was loaded successfully
 	nvmlInitialized bool
 
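A note on the struct change above: embedding sync.RWMutex (rather than declaring a named mutex field) promotes Lock/Unlock/RLock/RUnlock onto *NvidiaManager, which is why the hunks below call nm.Lock() directly. A tiny illustration of the idiom:

package main

import "sync"

type guarded struct {
	sync.RWMutex // embedded: the mutex methods are promoted onto guarded
	n            int
}

func (g *guarded) bump() {
	g.Lock() // equivalent to g.RWMutex.Lock()
	g.n++
	g.Unlock()
}

func main() {
	g := &guarded{}
	g.bump()
}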
@@ -48,12 +51,12 @@ func (nm *NvidiaManager) Setup() {
 		return
 	}
 
-	go func() {
-		glog.Info("Starting goroutine to initialize NVML")
-		nm.initializeNVML()
-		if nm.nvmlInitialized {
-			return
-		}
+	nm.initializeNVML()
+	if nm.nvmlInitialized {
+		return
+	}
+	go func() {
+		glog.Info("Starting goroutine to initialize NVML")
 		// TODO: use globalHousekeepingInterval
 		for range time.Tick(time.Minute) {
 			nm.initializeNVML()
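Setup() now attempts initialization once synchronously and starts the background retry loop only if that first attempt fails, so a successful first call never spawns the goroutine at all. A sketch of the resulting control flow, with tryInit as a hypothetical stand-in for nm.initializeNVML() followed by the nm.nvmlInitialized check:

package main

import "time"

func setup(tryInit func() bool) {
	if tryInit() {
		return // initialized on the first, synchronous attempt
	}
	go func() {
		// The original carries a TODO to derive this interval from
		// globalHousekeepingInterval instead of hardcoding a minute.
		for range time.Tick(time.Minute) {
			if tryInit() {
				return
			}
		}
	}()
}

func main() {
	setup(func() bool { return true })
}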
@@ -95,10 +98,13 @@ func (nm *NvidiaManager) initializeNVML() {
 		glog.V(3).Infof("Could not initialize NVML: %v", err)
 		return
 	}
-	nm.nvmlInitialized = true
 	numDevices, err := gonvml.DeviceCount()
 	if err != nil {
 		glog.Warningf("GPU metrics would not be available. Failed to get the number of nvidia devices: %v", err)
+		nm.Lock()
+		// Even though we won't have GPU metrics, the library was initialized and should be shutdown when exiting.
+		nm.nvmlInitialized = true
+		nm.Unlock()
 		return
 	}
 	glog.Infof("NVML initialized. Number of nvidia devices: %v", numDevices)
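Note that the error path above still sets nvmlInitialized under the lock: even without usable GPU metrics, the successfully loaded library must be shut down on exit. A hypothetical simplification of the Destroy() method this protects (assuming, per the "// Destroy shuts down NVML." comment below, that it gates shutdown on the flag):

// Shutdown must run whenever gonvml.Initialize() succeeded, even if no
// devices were discovered afterwards.
func (nm *NvidiaManager) Destroy() {
	if nm.nvmlInitialized {
		gonvml.Shutdown()
	}
}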
@@ -116,6 +122,10 @@ func (nm *NvidiaManager) initializeNVML() {
 		}
 		nm.nvidiaDevices[int(minorNumber)] = device
 	}
+	nm.Lock()
+	// Doing this at the end to avoid race in accessing nvidiaDevices in GetCollector.
+	nm.nvmlInitialized = true
+	nm.Unlock()
 }
 
 // Destroy shuts down NVML.
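Setting nvmlInitialized last, under the write lock, is the heart of the fix: a reader in GetCollector() that observes the flag as true under the read lock is also guaranteed, by the mutex's happens-before ordering, to see every earlier write into nvidiaDevices.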
@@ -129,9 +139,12 @@ func (nm *NvidiaManager) Destroy() {
 // present in the devices.list file in the given devicesCgroupPath.
 func (nm *NvidiaManager) GetCollector(devicesCgroupPath string) (AcceleratorCollector, error) {
 	nc := &NvidiaCollector{}
+	nm.RLock()
 	if !nm.nvmlInitialized || len(nm.nvidiaDevices) == 0 {
+		nm.RUnlock()
 		return nc, nil
 	}
+	nm.RUnlock()
 	nvidiaMinorNumbers, err := parseDevicesCgroup(devicesCgroupPath)
 	if err != nil {
 		return nc, err
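One way to exercise this fix is a regression test run under the race detector (go test -race), calling GetCollector() repeatedly while Setup()'s background initialization may still be in flight. An illustrative sketch, assuming it lives in the accelerators package; the test name and cgroup path are hypothetical, and on a machine without NVML it exercises the early-return path:

func TestNoRaceSetupGetCollector(t *testing.T) {
	nm := &NvidiaManager{}
	nm.Setup() // may start initializeNVML in a goroutine
	defer nm.Destroy()

	var wg sync.WaitGroup
	for i := 0; i < 10; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			// Without initialized devices GetCollector returns an empty
			// collector before ever parsing the (nonexistent) path.
			if _, err := nm.GetCollector("/nonexistent/devices/cgroup"); err != nil {
				t.Error(err)
			}
		}()
	}
	wg.Wait()
}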