When cAdvisor starts up, it would read the `vendor` files in `/sys/bus/pci/devices/*` to see if any NVIDIA devices (vendor ID: 0x10de) are attached to the node. If no NVIDIA devices are found, this code path would become dormant for the rest of cAdvisor lifetime. If NVIDIA devices are found, we would start a goroutine that would check for the presence of NVML by trying to dynamically load it at regular intervals. We need to do this regular checking instead of doing it just once because it may happen that cAdvisor is started before the NVIDIA drivers and NVML are installed. Once the NVML dynamic loading succeeds, we would use NVML’s query methods to find out how many devices exist on the node and create a map from their minor numbers to their handles and cache that map. The goroutine would exit at this point. If we detected the presence of NVML in the previous step, whenever a new container is detected by cAdvisor, cAdvisor would read the `devices.list` file from the container's devices cgroup. The `devices.list` file lists the major:minor number of all the devices that the container is allowed to access. If we find any device with major number 195 (which is the major number assigned to NVIDIA devices), we would cache the list of corresponding minor numbers for that container. During every housekeeping operation, in addition to collecting all the existing metrics, we will use the cached NVIDIA device minor numbers and the map from minor numbers to device handles to get metrics for GPU devices attached to the container.
169 lines
5.6 KiB
Go
169 lines
5.6 KiB
Go
// Copyright 2017 Google Inc. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
package accelerators
|
|
|
|
import (
|
|
"io/ioutil"
|
|
"os"
|
|
"path/filepath"
|
|
"testing"
|
|
|
|
"github.com/mindprince/gonvml"
|
|
"github.com/stretchr/testify/assert"
|
|
)
|
|
|
|
func updateFile(t *testing.T, fn string, content []byte) {
|
|
if err := ioutil.WriteFile(fn, content, 0666); err != nil {
|
|
t.Fatalf("Error writing to temporary file for testing: %v", err)
|
|
}
|
|
}
|
|
|
|
func TestDetectDevices(t *testing.T) {
|
|
sysFsPCIDevicesPath = "/non-existent-path"
|
|
detected := detectDevices("0x10de")
|
|
assert.False(t, detected)
|
|
|
|
var err error
|
|
// Create temporary directory to represent sysfs pci devices path
|
|
if sysFsPCIDevicesPath, err = ioutil.TempDir("", "sys-bus-pci-devices"); err != nil {
|
|
t.Fatalf("Error creating temporary directory for testing: %v", err)
|
|
}
|
|
defer os.RemoveAll(sysFsPCIDevicesPath)
|
|
|
|
device0 := filepath.Join(sysFsPCIDevicesPath, "device0")
|
|
device1 := filepath.Join(sysFsPCIDevicesPath, "device1")
|
|
device2 := filepath.Join(sysFsPCIDevicesPath, "device2")
|
|
for _, device := range []string{device0, device1, device2} {
|
|
if err = os.Mkdir(device, 0777); err != nil {
|
|
t.Fatalf("Error creating temporary directory for testing: %v", err)
|
|
}
|
|
}
|
|
|
|
// device0 directory is present to make sure that
|
|
// we handle bad device directories case correctly.
|
|
|
|
// A valid vendor file but different than what's being detected.
|
|
updateFile(t, filepath.Join(device1, "vendor"), []byte("0x8086\n"))
|
|
detected = detectDevices("0x10de")
|
|
assert.False(t, detected)
|
|
|
|
// vendor file for device being detected
|
|
updateFile(t, filepath.Join(device2, "vendor"), []byte("0x10de\n"))
|
|
detected = detectDevices("0x10de")
|
|
assert.True(t, detected)
|
|
}
|
|
|
|
func TestGetCollector(t *testing.T) {
|
|
// Mock parseDevicesCgroup.
|
|
originalParser := parseDevicesCgroup
|
|
mockParser := func(_ string) ([]int, error) {
|
|
return []int{2, 3}, nil
|
|
}
|
|
parseDevicesCgroup = mockParser
|
|
defer func() {
|
|
parseDevicesCgroup = originalParser
|
|
}()
|
|
|
|
nm := &NvidiaManager{}
|
|
|
|
// When nvmlInitialized is false, empty collector should be returned.
|
|
ac, err := nm.GetCollector("does-not-matter")
|
|
assert.Nil(t, err)
|
|
assert.NotNil(t, ac)
|
|
nc, ok := ac.(*NvidiaCollector)
|
|
assert.True(t, ok)
|
|
assert.Equal(t, 0, len(nc.Devices))
|
|
|
|
// When nvidiaDevices is empty, empty collector should be returned.
|
|
nm.nvmlInitialized = true
|
|
ac, err = nm.GetCollector("does-not-matter")
|
|
assert.Nil(t, err)
|
|
assert.NotNil(t, ac)
|
|
nc, ok = ac.(*NvidiaCollector)
|
|
assert.True(t, ok)
|
|
assert.Equal(t, 0, len(nc.Devices))
|
|
|
|
// nvidiaDevices contains devices but they are different than what
|
|
// is returned by parseDevicesCgroup. We should get an error.
|
|
nm.nvidiaDevices = map[int]gonvml.Device{0: {}, 1: {}}
|
|
ac, err = nm.GetCollector("does-not-matter")
|
|
assert.NotNil(t, err)
|
|
assert.NotNil(t, ac)
|
|
nc, ok = ac.(*NvidiaCollector)
|
|
assert.True(t, ok)
|
|
assert.Equal(t, 0, len(nc.Devices))
|
|
|
|
// nvidiaDevices contains devices returned by parseDevicesCgroup.
|
|
// No error should be returned and collectors devices array should be
|
|
// correctly initialized.
|
|
nm.nvidiaDevices[2] = gonvml.Device{}
|
|
nm.nvidiaDevices[3] = gonvml.Device{}
|
|
ac, err = nm.GetCollector("does-not-matter")
|
|
assert.Nil(t, err)
|
|
assert.NotNil(t, ac)
|
|
nc, ok = ac.(*NvidiaCollector)
|
|
assert.True(t, ok)
|
|
assert.Equal(t, 2, len(nc.Devices))
|
|
}
|
|
|
|
func TestParseDevicesCgroup(t *testing.T) {
|
|
// Test case for empty devices cgroup path
|
|
nvidiaMinorNumbers, err := parseDevicesCgroup("")
|
|
assert.NotNil(t, err)
|
|
assert.Equal(t, []int{}, nvidiaMinorNumbers)
|
|
|
|
// Test case for non-existent devices cgroup
|
|
nvidiaMinorNumbers, err = parseDevicesCgroup("/non-existent-path")
|
|
assert.NotNil(t, err)
|
|
assert.Equal(t, []int{}, nvidiaMinorNumbers)
|
|
|
|
// Create temporary directory to represent devices cgroup.
|
|
tmpDir, err := ioutil.TempDir("", "devices-cgroup")
|
|
if err != nil {
|
|
t.Fatalf("Error creating temporary directory for testing: %v", err)
|
|
}
|
|
defer os.RemoveAll(tmpDir)
|
|
tmpfn := filepath.Join(tmpDir, "devices.list")
|
|
|
|
// Test case when devices.list file has more than three fields.
|
|
updateFile(t, tmpfn, []byte("c 1:2 rwm badformat\n"))
|
|
nvidiaMinorNumbers, err = parseDevicesCgroup(tmpDir)
|
|
assert.NotNil(t, err)
|
|
assert.Equal(t, []int{}, nvidiaMinorNumbers)
|
|
|
|
// Test case when devices.list file's second field is not major:minor.
|
|
updateFile(t, tmpfn, []byte("c badformat rwm\n"))
|
|
nvidiaMinorNumbers, err = parseDevicesCgroup(tmpDir)
|
|
assert.NotNil(t, err)
|
|
assert.Equal(t, []int{}, nvidiaMinorNumbers)
|
|
|
|
// Test case with nvidia devices present
|
|
updateFile(t, tmpfn, []byte("c 195:0 rwm\nc 195:255 rwm\nc 195:1 rwm"))
|
|
nvidiaMinorNumbers, err = parseDevicesCgroup(tmpDir)
|
|
assert.Nil(t, err)
|
|
assert.Equal(t, []int{0, 1}, nvidiaMinorNumbers) // Note that 255 is not supposed to be returned.
|
|
|
|
// Test case with a common devices.list file
|
|
updateFile(t, tmpfn, []byte("a *:* rwm\n"))
|
|
nvidiaMinorNumbers, err = parseDevicesCgroup(tmpDir)
|
|
assert.Nil(t, err)
|
|
assert.Equal(t, []int{}, nvidiaMinorNumbers)
|
|
|
|
// Test case for empty devices.list file
|
|
updateFile(t, tmpfn, []byte(""))
|
|
nvidiaMinorNumbers, err = parseDevicesCgroup(tmpDir)
|
|
assert.Nil(t, err)
|
|
assert.Equal(t, []int{}, nvidiaMinorNumbers)
|
|
}
|