Merge pull request #1762 from mindprince/gpu-metrics-1436

Add per container GPU metrics

commit 9bc6590461

Godeps/Godeps.json (generated, 4 changes)
@@ -414,6 +414,10 @@
     {
       "ImportPath": "github.com/matttproud/golang_protobuf_extensions/pbutil",
       "Rev": "fc2b8d3a73c4867e51861bbdd5ae3c1f0869dd6a"
     },
+    {
+      "ImportPath": "github.com/mindprince/gonvml",
+      "Rev": "b49be04bdda2d9cd7544bb3a0bce7210ec3448c7"
+    },
     {
       "ImportPath": "github.com/mistifyio/go-zfs",
       "Comment": "v2.1.1-31-g166dd29",
accelerators/nvidia.go (new file, 239 lines)
@@ -0,0 +1,239 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package accelerators

import (
    "bufio"
    "fmt"
    "io/ioutil"
    "os"
    "path/filepath"
    "strconv"
    "strings"
    "time"

    info "github.com/google/cadvisor/info/v1"

    "github.com/golang/glog"
    "github.com/mindprince/gonvml"
)

type NvidiaManager struct {
    // true if the NVML library (libnvidia-ml.so.1) was loaded successfully
    nvmlInitialized bool

    // nvidiaDevices is a map from device minor number to a handle that can be used to get metrics about the device
    nvidiaDevices map[int]gonvml.Device
}

var sysFsPCIDevicesPath = "/sys/bus/pci/devices/"

const nvidiaVendorId = "0x10de"

// Setup initializes NVML if nvidia devices are present on the node.
func (nm *NvidiaManager) Setup() {
    if !detectDevices(nvidiaVendorId) {
        glog.Info("No NVIDIA devices found.")
        return
    }

    go func() {
        glog.Info("Starting goroutine to initialize NVML")
        nm.initializeNVML()
        if nm.nvmlInitialized {
            return
        }
        // TODO: use globalHousekeepingInterval
        for range time.Tick(time.Minute) {
            nm.initializeNVML()
            if nm.nvmlInitialized {
                return
            }
        }
    }()
}

// detectDevices returns true if a device with given pci id is present on the node.
func detectDevices(vendorId string) bool {
    devices, err := ioutil.ReadDir(sysFsPCIDevicesPath)
    if err != nil {
        glog.Warningf("error reading %q: %v", sysFsPCIDevicesPath, err)
        return false
    }

    for _, device := range devices {
        vendorPath := filepath.Join(sysFsPCIDevicesPath, device.Name(), "vendor")
        content, err := ioutil.ReadFile(vendorPath)
        if err != nil {
            glog.Infof("Error while reading %q: %v", vendorPath, err)
            continue
        }
        if strings.EqualFold(strings.TrimSpace(string(content)), vendorId) {
            glog.Infof("Found device with vendorId %q", vendorId)
            return true
        }
    }
    return false
}

// initializeNVML initializes the NVML library and sets up the nvidiaDevices map.
func (nm *NvidiaManager) initializeNVML() {
    if err := gonvml.Initialize(); err != nil {
        // This is under a logging level because otherwise we may cause
        // log spam if the drivers/nvml is not installed on the system.
        glog.V(3).Infof("Could not initialize NVML: %v", err)
        return
    }
    nm.nvmlInitialized = true
    numDevices, err := gonvml.DeviceCount()
    if err != nil {
        glog.Warningf("GPU metrics would not be available. Failed to get the number of nvidia devices: %v", err)
        return
    }
    glog.Infof("NVML initialized. Number of nvidia devices: %v", numDevices)
    nm.nvidiaDevices = make(map[int]gonvml.Device, numDevices)
    for i := 0; i < int(numDevices); i++ {
        device, err := gonvml.DeviceHandleByIndex(uint(i))
        if err != nil {
            glog.Warningf("Failed to get nvidia device handle %d: %v", i, err)
            continue
        }
        minorNumber, err := device.MinorNumber()
        if err != nil {
            glog.Warningf("Failed to get nvidia device minor number: %v", err)
            continue
        }
        nm.nvidiaDevices[int(minorNumber)] = device
    }
}

// Destroy shuts down NVML.
func (nm *NvidiaManager) Destroy() {
    if nm.nvmlInitialized {
        gonvml.Shutdown()
    }
}

// GetCollector returns a collector that can fetch nvidia gpu metrics for nvidia devices
// present in the devices.list file in the given devicesCgroupPath.
func (nm *NvidiaManager) GetCollector(devicesCgroupPath string) (AcceleratorCollector, error) {
    nc := &NvidiaCollector{}
    if !nm.nvmlInitialized || len(nm.nvidiaDevices) == 0 {
        return nc, nil
    }
    nvidiaMinorNumbers, err := parseDevicesCgroup(devicesCgroupPath)
    if err != nil {
        return nc, err
    }
    for _, minor := range nvidiaMinorNumbers {
        device, ok := nm.nvidiaDevices[minor]
        if !ok {
            return nc, fmt.Errorf("nvidia device minor number %d not found in cached devices", minor)
        }
        nc.Devices = append(nc.Devices, device)
    }
    return nc, nil
}

// parseDevicesCgroup parses the devices cgroup devices.list file for the container
// and returns a list of minor numbers corresponding to NVIDIA GPU devices that the
// container is allowed to access. In cases where the container has access to all
// devices or all NVIDIA devices but the devices are not enumerated separately in
// the devices.list file, we return an empty list.
// This is defined as a variable to help in testing.
var parseDevicesCgroup = func(devicesCgroupPath string) ([]int, error) {
    // Always return a non-nil slice
    nvidiaMinorNumbers := []int{}

    devicesList := filepath.Join(devicesCgroupPath, "devices.list")
    f, err := os.Open(devicesList)
    if err != nil {
        return nvidiaMinorNumbers, fmt.Errorf("error while opening devices cgroup file %q: %v", devicesList, err)
    }
    defer f.Close()

    s := bufio.NewScanner(f)

    // See https://www.kernel.org/doc/Documentation/cgroup-v1/devices.txt for the file format
    for s.Scan() {
        text := s.Text()

        fields := strings.Fields(text)
        if len(fields) != 3 {
            return nvidiaMinorNumbers, fmt.Errorf("invalid devices cgroup entry %q: must contain three whitespace-separated fields", text)
        }

        // Split the second field to find out major:minor numbers
        majorMinor := strings.Split(fields[1], ":")
        if len(majorMinor) != 2 {
            return nvidiaMinorNumbers, fmt.Errorf("invalid devices cgroup entry %q: second field should have one colon", text)
        }

        // NVIDIA graphics devices are character devices with major number 195.
        // https://github.com/torvalds/linux/blob/v4.13/Documentation/admin-guide/devices.txt#L2583
        if fields[0] == "c" && majorMinor[0] == "195" {
            minorNumber, err := strconv.Atoi(majorMinor[1])
            if err != nil {
                return nvidiaMinorNumbers, fmt.Errorf("invalid devices cgroup entry %q: minor number is not integer", text)
            }
            // We don't want devices like nvidiactl (195:255) and nvidia-modeset (195:254)
            if minorNumber < 128 {
                nvidiaMinorNumbers = append(nvidiaMinorNumbers, minorNumber)
            }
            // We are ignoring the "195:*" case
            // where the container has access to all NVIDIA devices on the machine.
        }
        // We are ignoring the "*:*" case
        // where the container has access to all devices on the machine.
    }
    return nvidiaMinorNumbers, nil
}

type NvidiaCollector struct {
    // Exposed for testing
    Devices []gonvml.Device
}

// UpdateStats updates the stats for NVIDIA GPUs (if any) attached to the container.
func (nc *NvidiaCollector) UpdateStats(stats *info.ContainerStats) error {
    for _, device := range nc.Devices {
        model, err := device.Name()
        if err != nil {
            return fmt.Errorf("error while getting gpu name: %v", err)
        }
        uuid, err := device.UUID()
        if err != nil {
            return fmt.Errorf("error while getting gpu uuid: %v", err)
        }
        memoryTotal, memoryUsed, err := device.MemoryInfo()
        if err != nil {
            return fmt.Errorf("error while getting gpu memory info: %v", err)
        }
        // TODO: Use housekeepingInterval
        utilizationGPU, err := device.AverageGPUUtilization(10 * time.Second)
        if err != nil {
            return fmt.Errorf("error while getting gpu utilization: %v", err)
        }

        stats.Accelerators = append(stats.Accelerators, info.AcceleratorStats{
            Make:        "nvidia",
            Model:       model,
            ID:          uuid,
            MemoryTotal: memoryTotal,
            MemoryUsed:  memoryUsed,
            DutyCycle:   uint64(utilizationGPU),
        })
    }
    return nil
}
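Worked example (editorial illustration, not part of the diff; the input values are the same ones exercised in nvidia_test.go below): for a container whose devices.list reads

    c 195:0 rwm
    c 195:255 rwm
    c 195:1 rwm

parseDevicesCgroup returns []int{0, 1}. Minor numbers 0 and 1 are GPUs (character devices with major number 195), while 195:255 is nvidiactl and is filtered out by the minorNumber < 128 check.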
accelerators/nvidia_test.go (new file, 168 lines)
@@ -0,0 +1,168 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package accelerators

import (
    "io/ioutil"
    "os"
    "path/filepath"
    "testing"

    "github.com/mindprince/gonvml"
    "github.com/stretchr/testify/assert"
)

func updateFile(t *testing.T, fn string, content []byte) {
    if err := ioutil.WriteFile(fn, content, 0666); err != nil {
        t.Fatalf("Error writing to temporary file for testing: %v", err)
    }
}

func TestDetectDevices(t *testing.T) {
    sysFsPCIDevicesPath = "/non-existent-path"
    detected := detectDevices("0x10de")
    assert.False(t, detected)

    var err error
    // Create temporary directory to represent sysfs pci devices path
    if sysFsPCIDevicesPath, err = ioutil.TempDir("", "sys-bus-pci-devices"); err != nil {
        t.Fatalf("Error creating temporary directory for testing: %v", err)
    }
    defer os.RemoveAll(sysFsPCIDevicesPath)

    device0 := filepath.Join(sysFsPCIDevicesPath, "device0")
    device1 := filepath.Join(sysFsPCIDevicesPath, "device1")
    device2 := filepath.Join(sysFsPCIDevicesPath, "device2")
    for _, device := range []string{device0, device1, device2} {
        if err = os.Mkdir(device, 0777); err != nil {
            t.Fatalf("Error creating temporary directory for testing: %v", err)
        }
    }

    // device0 directory is present to make sure that
    // we handle bad device directories case correctly.

    // A valid vendor file but different than what's being detected.
    updateFile(t, filepath.Join(device1, "vendor"), []byte("0x8086\n"))
    detected = detectDevices("0x10de")
    assert.False(t, detected)

    // vendor file for device being detected
    updateFile(t, filepath.Join(device2, "vendor"), []byte("0x10de\n"))
    detected = detectDevices("0x10de")
    assert.True(t, detected)
}

func TestGetCollector(t *testing.T) {
    // Mock parseDevicesCgroup.
    originalParser := parseDevicesCgroup
    mockParser := func(_ string) ([]int, error) {
        return []int{2, 3}, nil
    }
    parseDevicesCgroup = mockParser
    defer func() {
        parseDevicesCgroup = originalParser
    }()

    nm := &NvidiaManager{}

    // When nvmlInitialized is false, empty collector should be returned.
    ac, err := nm.GetCollector("does-not-matter")
    assert.Nil(t, err)
    assert.NotNil(t, ac)
    nc, ok := ac.(*NvidiaCollector)
    assert.True(t, ok)
    assert.Equal(t, 0, len(nc.Devices))

    // When nvidiaDevices is empty, empty collector should be returned.
    nm.nvmlInitialized = true
    ac, err = nm.GetCollector("does-not-matter")
    assert.Nil(t, err)
    assert.NotNil(t, ac)
    nc, ok = ac.(*NvidiaCollector)
    assert.True(t, ok)
    assert.Equal(t, 0, len(nc.Devices))

    // nvidiaDevices contains devices but they are different than what
    // is returned by parseDevicesCgroup. We should get an error.
    nm.nvidiaDevices = map[int]gonvml.Device{0: {}, 1: {}}
    ac, err = nm.GetCollector("does-not-matter")
    assert.NotNil(t, err)
    assert.NotNil(t, ac)
    nc, ok = ac.(*NvidiaCollector)
    assert.True(t, ok)
    assert.Equal(t, 0, len(nc.Devices))

    // nvidiaDevices contains devices returned by parseDevicesCgroup.
    // No error should be returned and collectors devices array should be
    // correctly initialized.
    nm.nvidiaDevices[2] = gonvml.Device{}
    nm.nvidiaDevices[3] = gonvml.Device{}
    ac, err = nm.GetCollector("does-not-matter")
    assert.Nil(t, err)
    assert.NotNil(t, ac)
    nc, ok = ac.(*NvidiaCollector)
    assert.True(t, ok)
    assert.Equal(t, 2, len(nc.Devices))
}

func TestParseDevicesCgroup(t *testing.T) {
    // Test case for empty devices cgroup path
    nvidiaMinorNumbers, err := parseDevicesCgroup("")
    assert.NotNil(t, err)
    assert.Equal(t, []int{}, nvidiaMinorNumbers)

    // Test case for non-existent devices cgroup
    nvidiaMinorNumbers, err = parseDevicesCgroup("/non-existent-path")
    assert.NotNil(t, err)
    assert.Equal(t, []int{}, nvidiaMinorNumbers)

    // Create temporary directory to represent devices cgroup.
    tmpDir, err := ioutil.TempDir("", "devices-cgroup")
    if err != nil {
        t.Fatalf("Error creating temporary directory for testing: %v", err)
    }
    defer os.RemoveAll(tmpDir)
    tmpfn := filepath.Join(tmpDir, "devices.list")

    // Test case when devices.list file has more than three fields.
    updateFile(t, tmpfn, []byte("c 1:2 rwm badformat\n"))
    nvidiaMinorNumbers, err = parseDevicesCgroup(tmpDir)
    assert.NotNil(t, err)
    assert.Equal(t, []int{}, nvidiaMinorNumbers)

    // Test case when devices.list file's second field is not major:minor.
    updateFile(t, tmpfn, []byte("c badformat rwm\n"))
    nvidiaMinorNumbers, err = parseDevicesCgroup(tmpDir)
    assert.NotNil(t, err)
    assert.Equal(t, []int{}, nvidiaMinorNumbers)

    // Test case with nvidia devices present
    updateFile(t, tmpfn, []byte("c 195:0 rwm\nc 195:255 rwm\nc 195:1 rwm"))
    nvidiaMinorNumbers, err = parseDevicesCgroup(tmpDir)
    assert.Nil(t, err)
    assert.Equal(t, []int{0, 1}, nvidiaMinorNumbers) // Note that 255 is not supposed to be returned.

    // Test case with a common devices.list file
    updateFile(t, tmpfn, []byte("a *:* rwm\n"))
    nvidiaMinorNumbers, err = parseDevicesCgroup(tmpDir)
    assert.Nil(t, err)
    assert.Equal(t, []int{}, nvidiaMinorNumbers)

    // Test case for empty devices.list file
    updateFile(t, tmpfn, []byte(""))
    nvidiaMinorNumbers, err = parseDevicesCgroup(tmpDir)
    assert.Nil(t, err)
    assert.Equal(t, []int{}, nvidiaMinorNumbers)
}
accelerators/types.go (new file, 32 lines)
@@ -0,0 +1,32 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package accelerators

import info "github.com/google/cadvisor/info/v1"

// This is supposed to store global state about an accelerator metrics collector.
// cadvisor manager will call Setup() when it starts and Destroy() when it stops.
// For each container detected by the cadvisor manager, it will call
// GetCollector() with the devices cgroup path for that container.
// GetCollector() is supposed to return an object that can update
// accelerator stats for that container.
type AcceleratorManager interface {
    Setup()
    Destroy()
    GetCollector(deviceCgroup string) (AcceleratorCollector, error)
}

type AcceleratorCollector interface {
    UpdateStats(*info.ContainerStats) error
}
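The sketch below is an editorial illustration (not part of the diff) of how these two interfaces are meant to be driven; it mirrors the manager wiring that appears later in this change. The devices cgroup path is illustrative only.

package main

import (
    info "github.com/google/cadvisor/info/v1"

    "github.com/google/cadvisor/accelerators"
)

func main() {
    // One AcceleratorManager lives for the whole cadvisor process.
    var am accelerators.AcceleratorManager = &accelerators.NvidiaManager{}
    am.Setup()         // at manager start: detect devices, initialize NVML
    defer am.Destroy() // at manager stop: shut NVML down

    // Per container: hand over that container's devices cgroup path.
    // The path below is illustrative, not taken from this change.
    collector, err := am.GetCollector("/sys/fs/cgroup/devices/docker/<id>")
    if err != nil {
        // GPU metrics may be unavailable or incomplete for this container.
    }

    // On each housekeeping pass the collector appends to stats.Accelerators.
    stats := &info.ContainerStats{}
    _ = collector.UpdateStats(stats)
}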
@@ -37,7 +37,6 @@ if [ "${go_version:0:3}" = "1.4" ]; then
 fi

 ldflags="
-  -extldflags '-static'
   -X ${repo_path}/version.Version${ldseparator}${version}
   -X ${repo_path}/version.Revision${ldseparator}${revision}
   -X ${repo_path}/version.Branch${ldseparator}${branch}
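A likely motivation for dropping -extldflags '-static' (an editorial note, not text from the commit): the vendored gonvml bindings added below link against libdl and dlopen() libnvidia-ml.so.1 at runtime, which is not reliable from a fully statically linked binary.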
@@ -84,6 +84,7 @@ var supportedSubsystems map[string]struct{} = map[string]struct{}{
     "memory": {},
     "cpuset": {},
     "blkio":  {},
+    "devices": {},
 }

 // Get cgroup and networking stats of the specified container
@@ -520,6 +520,29 @@ type FsStats struct {
     WeightedIoTime uint64 `json:"weighted_io_time"`
 }

+type AcceleratorStats struct {
+    // Make of the accelerator (nvidia, amd, google etc.)
+    Make string `json:"make"`
+
+    // Model of the accelerator (tesla-p100, tesla-k80 etc.)
+    Model string `json:"model"`
+
+    // ID of the accelerator.
+    ID string `json:"id"`
+
+    // Total accelerator memory.
+    // unit: bytes
+    MemoryTotal uint64 `json:"memory_total"`
+
+    // Total accelerator memory allocated.
+    // unit: bytes
+    MemoryUsed uint64 `json:"memory_used"`
+
+    // Percent of time over the past sample period during which
+    // the accelerator was actively processing.
+    DutyCycle uint64 `json:"duty_cycle"`
+}
+
 type ContainerStats struct {
     // The time of this stat point.
     Timestamp time.Time `json:"timestamp"`
@@ -534,6 +557,9 @@ type ContainerStats struct {
     // Task load stats
     TaskStats LoadStats `json:"task_stats,omitempty"`

+    // Metrics for Accelerators. Each Accelerator corresponds to one element in the array.
+    Accelerators []AcceleratorStats `json:"accelerators,omitempty"`
+
     // Custom metrics from all collectors
     CustomMetrics map[string][]MetricVal `json:"custom_metrics,omitempty"`
 }
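For illustration (not part of the diff), the snippet below marshals one AcceleratorStats value to show how it appears under the new accelerators key of a v1 ContainerStats; the field values are the fixtures used by the tests added in this change.

package main

import (
    "encoding/json"
    "fmt"

    info "github.com/google/cadvisor/info/v1"
)

func main() {
    // Values below are the fixtures used by the tests added in this change.
    s := info.AcceleratorStats{
        Make:        "nvidia",
        Model:       "tesla-p100",
        ID:          "GPU-deadbeef-1234-5678-90ab-feedfacecafe",
        MemoryTotal: 20304050607,
        MemoryUsed:  2030405060,
        DutyCycle:   12,
    }
    out, err := json.Marshal(s)
    if err != nil {
        panic(err)
    }
    fmt.Println(string(out))
    // Prints:
    // {"make":"nvidia","model":"tesla-p100","id":"GPU-deadbeef-1234-5678-90ab-feedfacecafe","memory_total":20304050607,"memory_used":2030405060,"duty_cycle":12}
}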
@@ -146,6 +146,8 @@ type ContainerStats struct {
     Filesystem *FilesystemStats `json:"filesystem,omitempty"`
     // Task load statistics
     Load *v1.LoadStats `json:"load_stats,omitempty"`
+    // Metrics for Accelerators. Each Accelerator corresponds to one element in the array.
+    Accelerators []v1.AcceleratorStats `json:"accelerators,omitempty"`
     // Custom Metrics
     CustomMetrics map[string][]v1.MetricVal `json:"custom_metrics,omitempty"`
 }
@@ -142,6 +142,9 @@ func ContainerStatsFromV1(containerName string, spec *v1.ContainerSpec, stats []
         if spec.HasCustomMetrics {
             stat.CustomMetrics = val.CustomMetrics
         }
+        if len(val.Accelerators) > 0 {
+            stat.Accelerators = val.Accelerators
+        }
         // TODO(rjnagal): Handle load stats.
         newStats = append(newStats, stat)
     }
@@ -176,6 +176,14 @@ func TestContainerStatsFromV1(t *testing.T) {
             Available:  300,
             InodesFree: 100,
         }},
+        Accelerators: []v1.AcceleratorStats{{
+            Make:        "nvidia",
+            Model:       "tesla-p100",
+            ID:          "GPU-deadbeef-1234-5678-90ab-feedfacecafe",
+            MemoryTotal: 20304050607,
+            MemoryUsed:  2030405060,
+            DutyCycle:   12,
+        }},
     }
     expectedV2Stats := ContainerStats{
         Timestamp: timestamp,
@@ -190,6 +198,7 @@ func TestContainerStatsFromV1(t *testing.T) {
             BaseUsageBytes: &v1Stats.Filesystem[0].BaseUsage,
             InodeUsage:     &v1Stats.Filesystem[0].Inodes,
         },
+        Accelerators: v1Stats.Accelerators,
     }

     v2Stats := ContainerStatsFromV1("test", &v1Spec, []*v1.ContainerStats{&v1Stats})
@@ -29,6 +29,7 @@ import (
     "sync"
     "time"

+    "github.com/google/cadvisor/accelerators"
     "github.com/google/cadvisor/cache/memory"
     "github.com/google/cadvisor/collector"
     "github.com/google/cadvisor/container"
@@ -78,6 +79,9 @@ type containerData struct {

     // Runs custom metric collectors.
     collectorManager collector.CollectorManager
+
+    // nvidiaCollector updates stats for Nvidia GPUs attached to the container.
+    nvidiaCollector accelerators.AcceleratorCollector
 }

 // jitter returns a time.Duration between duration and duration + maxFactor * duration,
@@ -557,6 +561,12 @@ func (c *containerData) updateStats() error {
         }
     }

+    var nvidiaStatsErr error
+    if c.nvidiaCollector != nil {
+        // This updates the Accelerators field of the stats struct
+        nvidiaStatsErr = c.nvidiaCollector.UpdateStats(stats)
+    }
+
     ref, err := c.handler.ContainerReference()
     if err != nil {
         // Ignore errors if the container is dead.
@@ -572,6 +582,9 @@ func (c *containerData) updateStats() error {
     if statsErr != nil {
         return statsErr
     }
+    if nvidiaStatsErr != nil {
+        return nvidiaStatsErr
+    }
     return customStatsErr
 }
@@ -29,6 +29,8 @@ import (
     info "github.com/google/cadvisor/info/v1"
     itest "github.com/google/cadvisor/info/v1/test"

+    "github.com/google/cadvisor/accelerators"
+    "github.com/mindprince/gonvml"
     "github.com/stretchr/testify/assert"
     "github.com/stretchr/testify/require"
 )
@@ -205,3 +207,22 @@ func TestGetInfo(t *testing.T) {
         t.Errorf("received wrong container name: received %v; should be %v", info.Name, mockHandler.Name)
     }
 }
+
+func TestUpdateNvidiaStats(t *testing.T) {
+    cd, _, _ := newTestContainerData(t)
+    stats := info.ContainerStats{}
+
+    // When there are no devices, we should not get an error and stats should not change.
+    cd.nvidiaCollector = &accelerators.NvidiaCollector{}
+    err := cd.nvidiaCollector.UpdateStats(&stats)
+    assert.Nil(t, err)
+    assert.Equal(t, info.ContainerStats{}, stats)
+
+    // This is an impossible situation (there are devices but nvml is not initialized).
+    // Here I am testing that the CGo gonvml library doesn't panic when passed bad
+    // input and instead returns an error.
+    cd.nvidiaCollector = &accelerators.NvidiaCollector{Devices: []gonvml.Device{{}, {}}}
+    err = cd.nvidiaCollector.UpdateStats(&stats)
+    assert.NotNil(t, err)
+    assert.Equal(t, info.ContainerStats{}, stats)
+}
@@ -18,6 +18,7 @@ package manager
 import (
     "flag"
     "fmt"
+    "net/http"
     "os"
     "path"
     "strconv"
@@ -25,6 +26,7 @@ import (
     "sync"
     "time"

+    "github.com/google/cadvisor/accelerators"
     "github.com/google/cadvisor/cache/memory"
     "github.com/google/cadvisor/collector"
     "github.com/google/cadvisor/container"
@@ -45,8 +47,6 @@ import (
     "github.com/google/cadvisor/utils/sysfs"
     "github.com/google/cadvisor/version"

-    "net/http"
-
     "github.com/golang/glog"
     "github.com/opencontainers/runc/libcontainer/cgroups"
 )
@@ -212,6 +212,7 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, maxHousekeepingIn
         containerWatchers:   []watcher.ContainerWatcher{},
         eventsChannel:       eventsChannel,
         collectorHttpClient: collectorHttpClient,
+        nvidiaManager:       &accelerators.NvidiaManager{},
     }

     machineInfo, err := machine.Info(sysfs, fsInfo, inHostNamespace)
@@ -257,6 +258,7 @@ type manager struct {
     containerWatchers   []watcher.ContainerWatcher
     eventsChannel       chan watcher.ContainerEvent
     collectorHttpClient *http.Client
+    nvidiaManager       accelerators.AcceleratorManager
 }

 // Start the container manager.
@@ -309,6 +311,9 @@ func (self *manager) Start() error {
         return nil
     }

+    // Setup collection of nvidia GPU metrics if any of them are attached to the machine.
+    self.nvidiaManager.Setup()
+
     // Create root and then recover all containers.
     err = self.createContainer("/", watcher.Raw)
     if err != nil {
@@ -338,6 +343,7 @@ func (self *manager) Start() error {
 }

 func (self *manager) Stop() error {
+    defer self.nvidiaManager.Destroy()
     // Stop and wait on all quit channels.
     for i, c := range self.quitChannels {
         // Send the exit signal and wait on the thread to exit (by closing the channel).
@@ -917,6 +923,15 @@ func (m *manager) createContainerLocked(containerName string, watchSource watche
     if err != nil {
         return err
     }
+    devicesCgroupPath, err := handler.GetCgroupPath("devices")
+    if err != nil {
+        glog.Infof("Error getting devices cgroup path: %v", err)
+    } else {
+        cont.nvidiaCollector, err = m.nvidiaManager.GetCollector(devicesCgroupPath)
+        if err != nil {
+            glog.Infof("GPU metrics may be unavailable/incomplete for container %q: %v", cont.info.Name, err)
+        }
+    }

     // Add collectors
     labels := handler.GetContainerLabels()
@@ -271,6 +271,51 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc) *PrometheusCo
             },
         }
     },
+    }, {
+        name:        "container_accelerator_memory_total_bytes",
+        help:        "Total accelerator memory.",
+        valueType:   prometheus.GaugeValue,
+        extraLabels: []string{"make", "model", "acc_id"},
+        getValues: func(s *info.ContainerStats) metricValues {
+            values := make(metricValues, 0, len(s.Accelerators))
+            for _, value := range s.Accelerators {
+                values = append(values, metricValue{
+                    value:  float64(value.MemoryTotal),
+                    labels: []string{value.Make, value.Model, value.ID},
+                })
+            }
+            return values
+        },
+    }, {
+        name:        "container_accelerator_memory_used_bytes",
+        help:        "Total accelerator memory allocated.",
+        valueType:   prometheus.GaugeValue,
+        extraLabels: []string{"make", "model", "acc_id"},
+        getValues: func(s *info.ContainerStats) metricValues {
+            values := make(metricValues, 0, len(s.Accelerators))
+            for _, value := range s.Accelerators {
+                values = append(values, metricValue{
+                    value:  float64(value.MemoryUsed),
+                    labels: []string{value.Make, value.Model, value.ID},
+                })
+            }
+            return values
+        },
+    }, {
+        name:        "container_accelerator_duty_cycle",
+        help:        "Percent of time over the past sample period during which the accelerator was actively processing.",
+        valueType:   prometheus.GaugeValue,
+        extraLabels: []string{"make", "model", "acc_id"},
+        getValues: func(s *info.ContainerStats) metricValues {
+            values := make(metricValues, 0, len(s.Accelerators))
+            for _, value := range s.Accelerators {
+                values = append(values, metricValue{
+                    value:  float64(value.DutyCycle),
+                    labels: []string{value.Make, value.Model, value.ID},
+                })
+            }
+            return values
+        },
     }, {
         name: "container_fs_inodes_free",
         help: "Number of available Inodes",
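As an illustrative note (not part of this change), the three new gauges compose in Prometheus as usual; for example, the expression container_accelerator_memory_used_bytes / container_accelerator_memory_total_bytes yields per-GPU memory utilization, broken out by the make, model, and acc_id labels added above.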
@@ -191,6 +191,24 @@ func (p testSubcontainersInfoProvider) SubcontainersInfo(string, *info.Container
             WeightedIoTime:  49,
         },
     },
+    Accelerators: []info.AcceleratorStats{
+        {
+            Make:        "nvidia",
+            Model:       "tesla-p100",
+            ID:          "GPU-deadbeef-1234-5678-90ab-feedfacecafe",
+            MemoryTotal: 20304050607,
+            MemoryUsed:  2030405060,
+            DutyCycle:   12,
+        },
+        {
+            Make:        "nvidia",
+            Model:       "tesla-k80",
+            ID:          "GPU-deadbeef-0123-4567-89ab-feedfacecafe",
+            MemoryTotal: 10203040506,
+            MemoryUsed:  1020304050,
+            DutyCycle:   6,
+        },
+    },
     TaskStats: info.LoadStats{
         NrSleeping: 50,
         NrRunning:  51,
metrics/testdata/prometheus_metrics (vendored, 12 changes)
@@ -1,6 +1,18 @@
 # HELP cadvisor_version_info A metric with a constant '1' value labeled by kernel version, OS version, docker version, cadvisor version & cadvisor revision.
 # TYPE cadvisor_version_info gauge
 cadvisor_version_info{cadvisorRevision="abcdef",cadvisorVersion="0.16.0",dockerVersion="1.8.1",kernelVersion="4.1.6-200.fc22.x86_64",osVersion="Fedora 22 (Twenty Two)"} 1
+# HELP container_accelerator_duty_cycle Percent of time over the past sample period during which the accelerator was actively processing.
+# TYPE container_accelerator_duty_cycle gauge
+container_accelerator_duty_cycle{acc_id="GPU-deadbeef-0123-4567-89ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-k80",name="testcontaineralias",zone_name="hello"} 6
+container_accelerator_duty_cycle{acc_id="GPU-deadbeef-1234-5678-90ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-p100",name="testcontaineralias",zone_name="hello"} 12
+# HELP container_accelerator_memory_total_bytes Total accelerator memory.
+# TYPE container_accelerator_memory_total_bytes gauge
+container_accelerator_memory_total_bytes{acc_id="GPU-deadbeef-0123-4567-89ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-k80",name="testcontaineralias",zone_name="hello"} 1.0203040506e+10
+container_accelerator_memory_total_bytes{acc_id="GPU-deadbeef-1234-5678-90ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-p100",name="testcontaineralias",zone_name="hello"} 2.0304050607e+10
+# HELP container_accelerator_memory_used_bytes Total accelerator memory allocated.
+# TYPE container_accelerator_memory_used_bytes gauge
+container_accelerator_memory_used_bytes{acc_id="GPU-deadbeef-0123-4567-89ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-k80",name="testcontaineralias",zone_name="hello"} 1.02030405e+09
+container_accelerator_memory_used_bytes{acc_id="GPU-deadbeef-1234-5678-90ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-p100",name="testcontaineralias",zone_name="hello"} 2.03040506e+09
 # HELP container_cpu_cfs_periods_total Number of elapsed enforcement period intervals.
 # TYPE container_cpu_cfs_periods_total counter
 container_cpu_cfs_periods_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 723
vendor/github.com/mindprince/gonvml/LICENSE (generated, vendored, new file, 202 lines)
@@ -0,0 +1,202 @@
[Full text of the Apache License, Version 2.0.]
vendor/github.com/mindprince/gonvml/Makefile (generated, vendored, new file, 20 lines)
@@ -0,0 +1,20 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

PKG=github.com/mindprince/gonvml

.PHONY: build
build:
	docker run -v $(shell pwd):/go/src/$(PKG) --workdir=/go/src/$(PKG) golang:1.8 go build cmd/example/example.go
vendor/github.com/mindprince/gonvml/NVML_NOTICE (generated, vendored, new file, 32 lines)
@@ -0,0 +1,32 @@
Copyright 1993-2016 NVIDIA Corporation. All rights reserved.

NOTICE TO USER:

This source code is subject to NVIDIA ownership rights under U.S. and
international Copyright laws. Users and possessors of this source code
are hereby granted a nonexclusive, royalty-free license to use this code
in individual and commercial software.

NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
OR PERFORMANCE OF THIS SOURCE CODE.

U.S. Government End Users. This source code is a "commercial item" as
that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
"commercial computer software" and "commercial computer software
documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
and is provided to the U.S. Government only as a commercial end item.
Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
source code with only those rights set forth herein.

Any use of this source code in individual and commercial software must
include, in the user documentation and internal comments to the code,
the above Disclaimer and U.S. Government End Users Notice.
19
vendor/github.com/mindprince/gonvml/README.md
generated
vendored
Normal file
19
vendor/github.com/mindprince/gonvml/README.md
generated
vendored
Normal file
@ -0,0 +1,19 @@
Go Bindings for NVML
--------------------

[NVML or NVIDIA Management
Library](https://developer.nvidia.com/nvidia-management-library-nvml) is a
C-based API that can be used for monitoring NVIDIA GPU devices. It's closed
source but can be downloaded as part of the [GPU Deployment
Kit](https://developer.nvidia.com/gpu-deployment-kit).

The [NVML API
Reference](http://docs.nvidia.com/deploy/nvml-api/nvml-api-reference.html)
describes the various methods that are available as part of NVML.

The `nvml.h` file is included in this repository so that we don't depend on
the presence of NVML in the build environment.

The `bindings.go` file is the cgo bridge which calls the NVML functions. The
cgo preamble in `bindings.go` uses `dlopen` to dynamically load NVML and makes
its functions available.
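The exported surface of the package (defined in `bindings.go` below) is deliberately small: initialize NVML, query what you need, shut down. As a minimal, hypothetical usage sketch (not part of the vendored code), assuming `libnvidia-ml.so.1` is present on the host so that `Initialize` can `dlopen` it:

```go
package main

import (
	"fmt"
	"log"

	"github.com/mindprince/gonvml"
)

func main() {
	// Initialize dlopens libnvidia-ml.so.1 and calls nvmlInit();
	// it returns an error if the driver library cannot be loaded.
	if err := gonvml.Initialize(); err != nil {
		log.Fatalf("could not initialize NVML: %v", err)
	}
	defer gonvml.Shutdown()

	driver, err := gonvml.SystemDriverVersion()
	if err != nil {
		log.Fatalf("could not read driver version: %v", err)
	}
	count, err := gonvml.DeviceCount()
	if err != nil {
		log.Fatalf("could not count devices: %v", err)
	}
	fmt.Printf("driver %s, %d NVIDIA device(s)\n", driver, count)
}
```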
280
vendor/github.com/mindprince/gonvml/bindings.go
generated
vendored
Normal file
@ -0,0 +1,280 @@
/*
Copyright 2017 Google Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package gonvml

// #cgo LDFLAGS: -ldl -Wl,--unresolved-symbols=ignore-in-object-files
/*
#include <stddef.h>
#include <dlfcn.h>
#include <stdlib.h>

#include "nvml.h"

// nvmlHandle is the handle for dynamically loaded libnvidia-ml.so
void *nvmlHandle;

// Loads the "libnvidia-ml.so.1" shared library and initializes NVML.
// Call this before calling any other methods.
nvmlReturn_t nvmlInit_dl(void) {
	nvmlHandle = dlopen("libnvidia-ml.so.1", RTLD_LAZY | RTLD_GLOBAL);
	if (nvmlHandle == NULL) {
		return (NVML_ERROR_LIBRARY_NOT_FOUND);
	}
	return (nvmlInit());
}

// Shuts down NVML and decrements the reference count on the dynamically loaded
// "libnvidia-ml.so.1" library.
// Call this once NVML is no longer being used.
nvmlReturn_t nvmlShutdown_dl(void) {
	if (nvmlHandle == NULL) {
		return NVML_SUCCESS;
	}
	nvmlReturn_t r = nvmlShutdown();
	if (r != NVML_SUCCESS) {
		return (r);
	}
	return (dlclose(nvmlHandle) ? NVML_ERROR_UNKNOWN : NVML_SUCCESS);
}

// This function is here because the API provided by NVML is not very user
// friendly. This function can be used to get average utilization.gpu or
// power.draw.
//
// `device`: The identifier of the target device.
// `type`: Type of sampling event. Only NVML_TOTAL_POWER_SAMPLES and NVML_GPU_UTILIZATION_SAMPLES are supported.
// `lastSeenTimeStamp`: Return the average using samples with a timestamp greater than this timestamp. Unix epoch in microseconds.
// `averageUsage`: Reference in which the average is returned.
//
// In my experiments, I found that the NVML_GPU_UTILIZATION_SAMPLES buffer stores
// 100 samples that are uniformly spread with ~6 samples per second. So the
// buffer stores the last ~16s of data.
// The NVML_TOTAL_POWER_SAMPLES buffer stores 120 samples, but in different runs I
// noticed them to be non-uniformly separated. Sometimes 120 samples only
// consisted of 10s of data and sometimes they were spread over 60s.
//
nvmlReturn_t nvmlDeviceGetAverageUsage(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp, unsigned int* averageUsage) {
	if (nvmlHandle == NULL) {
		return (NVML_ERROR_LIBRARY_NOT_FOUND);
	}

	// We don't really use this because both the metrics we support,
	// averagePowerUsage and averageGPUUtilization, are unsigned int.
	nvmlValueType_t sampleValType;

	// This will be set to the number of samples that can be queried. We would
	// need to allocate an array of this size to store the samples.
	unsigned int sampleCount;

	// Invoking this method with `samples` set to NULL sets the sampleCount.
	nvmlReturn_t r = nvmlDeviceGetSamples(device, type, lastSeenTimeStamp, &sampleValType, &sampleCount, NULL);
	if (r != NVML_SUCCESS) {
		return (r);
	}

	// Allocate memory to store sampleCount samples.
	// In my experiments, the sampleCount at this stage was always 120 for
	// NVML_TOTAL_POWER_SAMPLES and 100 for NVML_GPU_UTILIZATION_SAMPLES.
	nvmlSample_t* samples = (nvmlSample_t*) malloc(sampleCount * sizeof(nvmlSample_t));

	r = nvmlDeviceGetSamples(device, type, lastSeenTimeStamp, &sampleValType, &sampleCount, samples);
	if (r != NVML_SUCCESS) {
		free(samples);
		return (r);
	}

	int i = 0;
	unsigned int sum = 0;
	for (; i < sampleCount; i++) {
		sum += samples[i].sampleValue.uiVal;
	}
	*averageUsage = sum/sampleCount;

	free(samples);
	return (r);
}
*/
import "C"

import (
	"errors"
	"fmt"
	"time"
)

const (
	szDriver = C.NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE
	szName   = C.NVML_DEVICE_NAME_BUFFER_SIZE
	szUUID   = C.NVML_DEVICE_UUID_BUFFER_SIZE
)

var errLibraryNotLoaded = errors.New("could not load NVML library")

// Initialize initializes NVML.
// Call this before calling any other methods.
func Initialize() error {
	return errorString(C.nvmlInit_dl())
}

// Shutdown shuts down NVML.
// Call this once NVML is no longer being used.
func Shutdown() error {
	return errorString(C.nvmlShutdown_dl())
}

// errorString takes an nvmlReturn_t and converts it into a Go error.
// It uses an NVML method to convert it to a user-friendly error message.
func errorString(ret C.nvmlReturn_t) error {
	if ret == C.NVML_SUCCESS {
		return nil
	}
	// We need to special case this because if the NVML library is not found,
	// the nvmlErrorString() method will not work.
	if ret == C.NVML_ERROR_LIBRARY_NOT_FOUND || C.nvmlHandle == nil {
		return errLibraryNotLoaded
	}
	err := C.GoString(C.nvmlErrorString(ret))
	return fmt.Errorf("nvml: %v", err)
}
// SystemDriverVersion returns the driver version on the system.
func SystemDriverVersion() (string, error) {
	if C.nvmlHandle == nil {
		return "", errLibraryNotLoaded
	}
	var driver [szDriver]C.char
	r := C.nvmlSystemGetDriverVersion(&driver[0], szDriver)
	return C.GoString(&driver[0]), errorString(r)
}

// DeviceCount returns the number of nvidia devices on the system.
func DeviceCount() (uint, error) {
	if C.nvmlHandle == nil {
		return 0, errLibraryNotLoaded
	}
	var n C.uint
	r := C.nvmlDeviceGetCount(&n)
	return uint(n), errorString(r)
}

// Device is the handle for the device.
// This handle is obtained by calling DeviceHandleByIndex().
type Device struct {
	dev C.nvmlDevice_t
}

// DeviceHandleByIndex returns the device handle for a particular index.
// The indices range from 0 to DeviceCount()-1. The order in which NVML
// enumerates devices has no guarantees of consistency between reboots.
func DeviceHandleByIndex(idx uint) (Device, error) {
	if C.nvmlHandle == nil {
		return Device{}, errLibraryNotLoaded
	}
	var dev C.nvmlDevice_t
	r := C.nvmlDeviceGetHandleByIndex(C.uint(idx), &dev)
	return Device{dev}, errorString(r)
}

// MinorNumber returns the minor number for the device.
// The minor number for the device is such that the Nvidia device node
// file for each GPU will have the form /dev/nvidia[minor number].
func (d Device) MinorNumber() (uint, error) {
	if C.nvmlHandle == nil {
		return 0, errLibraryNotLoaded
	}
	var n C.uint
	r := C.nvmlDeviceGetMinorNumber(d.dev, &n)
	return uint(n), errorString(r)
}

// UUID returns the globally unique immutable UUID associated with this device.
func (d Device) UUID() (string, error) {
	if C.nvmlHandle == nil {
		return "", errLibraryNotLoaded
	}
	var uuid [szUUID]C.char
	r := C.nvmlDeviceGetUUID(d.dev, &uuid[0], szUUID)
	return C.GoString(&uuid[0]), errorString(r)
}

// Name returns the product name of the device.
func (d Device) Name() (string, error) {
	if C.nvmlHandle == nil {
		return "", errLibraryNotLoaded
	}
	var name [szName]C.char
	r := C.nvmlDeviceGetName(d.dev, &name[0], szName)
	return C.GoString(&name[0]), errorString(r)
}

// MemoryInfo returns the total and used memory (in bytes) of the device.
func (d Device) MemoryInfo() (uint64, uint64, error) {
	if C.nvmlHandle == nil {
		return 0, 0, errLibraryNotLoaded
	}
	var memory C.nvmlMemory_t
	r := C.nvmlDeviceGetMemoryInfo(d.dev, &memory)
	return uint64(memory.total), uint64(memory.used), errorString(r)
}

// UtilizationRates returns the percent of time over the past sample period during which:
// utilization.gpu: one or more kernels were executing on the GPU.
// utilization.memory: global (device) memory was being read or written.
func (d Device) UtilizationRates() (uint, uint, error) {
	if C.nvmlHandle == nil {
		return 0, 0, errLibraryNotLoaded
	}
	var utilization C.nvmlUtilization_t
	r := C.nvmlDeviceGetUtilizationRates(d.dev, &utilization)
	return uint(utilization.gpu), uint(utilization.memory), errorString(r)
}

// PowerUsage returns the power usage for this GPU and its associated circuitry
// in milliwatts. The reading is accurate to within +/- 5% of current power draw.
func (d Device) PowerUsage() (uint, error) {
	if C.nvmlHandle == nil {
		return 0, errLibraryNotLoaded
	}
	var n C.uint
	r := C.nvmlDeviceGetPowerUsage(d.dev, &n)
	return uint(n), errorString(r)
}

// AveragePowerUsage returns the power usage for this GPU and its associated circuitry
// in milliwatts averaged over the samples collected in the last `since` duration.
func (d Device) AveragePowerUsage(since time.Duration) (uint, error) {
	if C.nvmlHandle == nil {
		return 0, errLibraryNotLoaded
	}
	lastTs := C.ulonglong(time.Now().Add(-1*since).UnixNano() / 1000)
	var n C.uint
	r := C.nvmlDeviceGetAverageUsage(d.dev, C.NVML_TOTAL_POWER_SAMPLES, lastTs, &n)
	return uint(n), errorString(r)
}

// AverageGPUUtilization returns the utilization.gpu metric (percent of time
// one or more kernels were executing on the GPU) averaged over the samples
// collected in the last `since` duration.
func (d Device) AverageGPUUtilization(since time.Duration) (uint, error) {
	if C.nvmlHandle == nil {
		return 0, errLibraryNotLoaded
	}
	lastTs := C.ulonglong(time.Now().Add(-1*since).UnixNano() / 1000)
	var n C.uint
	r := C.nvmlDeviceGetAverageUsage(d.dev, C.NVML_GPU_UTILIZATION_SAMPLES, lastTs, &n)
	return uint(n), errorString(r)
}
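Taken together, the exported functions above are enough for an end-to-end sketch of per-device metric collection. This is illustrative only, not part of the vendored package; the 10-second averaging window and the chosen metric set are arbitrary assumptions for the example:

```go
package main

import (
	"fmt"
	"log"
	"time"

	"github.com/mindprince/gonvml"
)

func main() {
	if err := gonvml.Initialize(); err != nil {
		log.Fatalf("NVML init failed: %v", err)
	}
	defer gonvml.Shutdown()

	count, err := gonvml.DeviceCount()
	if err != nil {
		log.Fatalf("device count: %v", err)
	}
	for i := uint(0); i < count; i++ {
		dev, err := gonvml.DeviceHandleByIndex(i)
		if err != nil {
			log.Printf("device %d: %v", i, err)
			continue
		}
		// Instantaneous readings plus averages over the last 10 seconds.
		name, _ := dev.Name()
		minor, _ := dev.MinorNumber()
		total, used, _ := dev.MemoryInfo()
		gpuUtil, memUtil, _ := dev.UtilizationRates()
		avgGPU, _ := dev.AverageGPUUtilization(10 * time.Second)
		avgPower, _ := dev.AveragePowerUsage(10 * time.Second)
		fmt.Printf("/dev/nvidia%d (%s): mem %d/%d B, util gpu=%d%% mem=%d%%, 10s avg gpu=%d%% power=%d mW\n",
			minor, name, used, total, gpuUtil, memUtil, avgGPU, avgPower)
	}
}
```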
5605
vendor/github.com/mindprince/gonvml/nvml.h
generated
vendored
Normal file
File diff suppressed because it is too large