Merge pull request #1762 from mindprince/gpu-metrics-1436

Add per container GPU metrics
David Ashpole, 2017-11-06 12:34:13 -08:00, committed by GitHub
commit 9bc6590461
22 changed files with 6768 additions and 3 deletions

Godeps/Godeps.json (generated, 4 changed lines)

@ -414,6 +414,10 @@
"ImportPath": "github.com/matttproud/golang_protobuf_extensions/pbutil",
"Rev": "fc2b8d3a73c4867e51861bbdd5ae3c1f0869dd6a"
},
{
"ImportPath": "github.com/mindprince/gonvml",
"Rev": "b49be04bdda2d9cd7544bb3a0bce7210ec3448c7"
},
{
"ImportPath": "github.com/mistifyio/go-zfs",
"Comment": "v2.1.1-31-g166dd29",

accelerators/nvidia.go (new file, 239 lines)

@ -0,0 +1,239 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package accelerators
import (
"bufio"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"strings"
"time"
info "github.com/google/cadvisor/info/v1"
"github.com/golang/glog"
"github.com/mindprince/gonvml"
)
type NvidiaManager struct {
// true if the NVML library (libnvidia-ml.so.1) was loaded successfully
nvmlInitialized bool
// nvidiaDevices is a map from device minor number to a handle that can be used to get metrics about the device
nvidiaDevices map[int]gonvml.Device
}
var sysFsPCIDevicesPath = "/sys/bus/pci/devices/"
const nvidiaVendorId = "0x10de"
// Setup initializes NVML if nvidia devices are present on the node.
func (nm *NvidiaManager) Setup() {
if !detectDevices(nvidiaVendorId) {
glog.Info("No NVIDIA devices found.")
return
}
go func() {
glog.Info("Starting goroutine to initialize NVML")
nm.initializeNVML()
if nm.nvmlInitialized {
return
}
// TODO: use globalHousekeepingInterval
for range time.Tick(time.Minute) {
nm.initializeNVML()
if nm.nvmlInitialized {
return
}
}
}()
}
// detectDevices returns true if a PCI device with the given vendor id is present on the node.
func detectDevices(vendorId string) bool {
devices, err := ioutil.ReadDir(sysFsPCIDevicesPath)
if err != nil {
glog.Warningf("error reading %q: %v", sysFsPCIDevicesPath, err)
return false
}
for _, device := range devices {
vendorPath := filepath.Join(sysFsPCIDevicesPath, device.Name(), "vendor")
content, err := ioutil.ReadFile(vendorPath)
if err != nil {
glog.Infof("Error while reading %q: %v", vendorPath, err)
continue
}
if strings.EqualFold(strings.TrimSpace(string(content)), vendorId) {
glog.Infof("Found device with vendorId %q", vendorId)
return true
}
}
return false
}
// initializeNVML initializes the NVML library and sets up the nvidiaDevices map.
func (nm *NvidiaManager) initializeNVML() {
if err := gonvml.Initialize(); err != nil {
// This is logged at a higher verbosity level because otherwise we may cause
// log spam if the NVIDIA drivers/NVML are not installed on the system.
glog.V(3).Infof("Could not initialize NVML: %v", err)
return
}
nm.nvmlInitialized = true
numDevices, err := gonvml.DeviceCount()
if err != nil {
glog.Warningf("GPU metrics would not be available. Failed to get the number of nvidia devices: %v", err)
return
}
glog.Infof("NVML initialized. Number of nvidia devices: %v", numDevices)
nm.nvidiaDevices = make(map[int]gonvml.Device, numDevices)
for i := 0; i < int(numDevices); i++ {
device, err := gonvml.DeviceHandleByIndex(uint(i))
if err != nil {
glog.Warningf("Failed to get nvidia device handle %d: %v", i, err)
continue
}
minorNumber, err := device.MinorNumber()
if err != nil {
glog.Warningf("Failed to get nvidia device minor number: %v", err)
continue
}
nm.nvidiaDevices[int(minorNumber)] = device
}
}
// Destroy shuts down NVML.
func (nm *NvidiaManager) Destroy() {
if nm.nvmlInitialized {
gonvml.Shutdown()
}
}
// GetCollector returns a collector that can fetch nvidia gpu metrics for nvidia devices
// present in the devices.list file in the given devicesCgroupPath.
func (nm *NvidiaManager) GetCollector(devicesCgroupPath string) (AcceleratorCollector, error) {
nc := &NvidiaCollector{}
if !nm.nvmlInitialized || len(nm.nvidiaDevices) == 0 {
return nc, nil
}
nvidiaMinorNumbers, err := parseDevicesCgroup(devicesCgroupPath)
if err != nil {
return nc, err
}
for _, minor := range nvidiaMinorNumbers {
device, ok := nm.nvidiaDevices[minor]
if !ok {
return nc, fmt.Errorf("nvidia device minor number %d not found in cached devices", minor)
}
nc.Devices = append(nc.Devices, device)
}
return nc, nil
}
// parseDevicesCgroup parses the devices cgroup devices.list file for the container
// and returns a list of minor numbers corresponding to NVIDIA GPU devices that the
// container is allowed to access. In cases where the container has access to all
// devices or all NVIDIA devices but the devices are not enumerated separately in
// the devices.list file, we return an empty list.
// This is defined as a variable to help in testing.
var parseDevicesCgroup = func(devicesCgroupPath string) ([]int, error) {
// Always return a non-nil slice
nvidiaMinorNumbers := []int{}
devicesList := filepath.Join(devicesCgroupPath, "devices.list")
f, err := os.Open(devicesList)
if err != nil {
return nvidiaMinorNumbers, fmt.Errorf("error while opening devices cgroup file %q: %v", devicesList, err)
}
defer f.Close()
s := bufio.NewScanner(f)
// See https://www.kernel.org/doc/Documentation/cgroup-v1/devices.txt for the file format
for s.Scan() {
text := s.Text()
fields := strings.Fields(text)
if len(fields) != 3 {
return nvidiaMinorNumbers, fmt.Errorf("invalid devices cgroup entry %q: must contain three whitespace-separated fields", text)
}
// Split the second field to find out major:minor numbers
majorMinor := strings.Split(fields[1], ":")
if len(majorMinor) != 2 {
return nvidiaMinorNumbers, fmt.Errorf("invalid devices cgroup entry %q: second field should have one colon", text)
}
// NVIDIA graphics devices are character devices with major number 195.
// https://github.com/torvalds/linux/blob/v4.13/Documentation/admin-guide/devices.txt#L2583
if fields[0] == "c" && majorMinor[0] == "195" {
minorNumber, err := strconv.Atoi(majorMinor[1])
if err != nil {
return nvidiaMinorNumbers, fmt.Errorf("invalid devices cgroup entry %q: minor number is not integer", text)
}
// We don't want devices like nvidiactl (195:255) and nvidia-modeset (195:254)
if minorNumber < 128 {
nvidiaMinorNumbers = append(nvidiaMinorNumbers, minorNumber)
}
// We are ignoring the "195:*" case
// where the container has access to all NVIDIA devices on the machine.
}
// We are ignoring the "*:*" case
// where the container has access to all devices on the machine.
}
return nvidiaMinorNumbers, nil
}
type NvidiaCollector struct {
// Exposed for testing
Devices []gonvml.Device
}
// UpdateStats updates the stats for NVIDIA GPUs (if any) attached to the container.
func (nc *NvidiaCollector) UpdateStats(stats *info.ContainerStats) error {
for _, device := range nc.Devices {
model, err := device.Name()
if err != nil {
return fmt.Errorf("error while getting gpu name: %v", err)
}
uuid, err := device.UUID()
if err != nil {
return fmt.Errorf("error while getting gpu uuid: %v", err)
}
memoryTotal, memoryUsed, err := device.MemoryInfo()
if err != nil {
return fmt.Errorf("error while getting gpu memory info: %v", err)
}
//TODO: Use housekeepingInterval
utilizationGPU, err := device.AverageGPUUtilization(10 * time.Second)
if err != nil {
return fmt.Errorf("error while getting gpu utilization: %v", err)
}
stats.Accelerators = append(stats.Accelerators, info.AcceleratorStats{
Make: "nvidia",
Model: model,
ID: uuid,
MemoryTotal: memoryTotal,
MemoryUsed: memoryUsed,
DutyCycle: uint64(utilizationGPU),
})
}
return nil
}

accelerators/nvidia_test.go (new file, 168 lines)

@ -0,0 +1,168 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package accelerators
import (
"io/ioutil"
"os"
"path/filepath"
"testing"
"github.com/mindprince/gonvml"
"github.com/stretchr/testify/assert"
)
func updateFile(t *testing.T, fn string, content []byte) {
if err := ioutil.WriteFile(fn, content, 0666); err != nil {
t.Fatalf("Error writing to temporary file for testing: %v", err)
}
}
func TestDetectDevices(t *testing.T) {
sysFsPCIDevicesPath = "/non-existent-path"
detected := detectDevices("0x10de")
assert.False(t, detected)
var err error
// Create temporary directory to represent sysfs pci devices path
if sysFsPCIDevicesPath, err = ioutil.TempDir("", "sys-bus-pci-devices"); err != nil {
t.Fatalf("Error creating temporary directory for testing: %v", err)
}
defer os.RemoveAll(sysFsPCIDevicesPath)
device0 := filepath.Join(sysFsPCIDevicesPath, "device0")
device1 := filepath.Join(sysFsPCIDevicesPath, "device1")
device2 := filepath.Join(sysFsPCIDevicesPath, "device2")
for _, device := range []string{device0, device1, device2} {
if err = os.Mkdir(device, 0777); err != nil {
t.Fatalf("Error creating temporary directory for testing: %v", err)
}
}
// The device0 directory is present to make sure that
// we handle the case of bad device directories correctly.
// A valid vendor file but different than what's being detected.
updateFile(t, filepath.Join(device1, "vendor"), []byte("0x8086\n"))
detected = detectDevices("0x10de")
assert.False(t, detected)
// vendor file for device being detected
updateFile(t, filepath.Join(device2, "vendor"), []byte("0x10de\n"))
detected = detectDevices("0x10de")
assert.True(t, detected)
}
func TestGetCollector(t *testing.T) {
// Mock parseDevicesCgroup.
originalParser := parseDevicesCgroup
mockParser := func(_ string) ([]int, error) {
return []int{2, 3}, nil
}
parseDevicesCgroup = mockParser
defer func() {
parseDevicesCgroup = originalParser
}()
nm := &NvidiaManager{}
// When nvmlInitialized is false, empty collector should be returned.
ac, err := nm.GetCollector("does-not-matter")
assert.Nil(t, err)
assert.NotNil(t, ac)
nc, ok := ac.(*NvidiaCollector)
assert.True(t, ok)
assert.Equal(t, 0, len(nc.Devices))
// When nvidiaDevices is empty, empty collector should be returned.
nm.nvmlInitialized = true
ac, err = nm.GetCollector("does-not-matter")
assert.Nil(t, err)
assert.NotNil(t, ac)
nc, ok = ac.(*NvidiaCollector)
assert.True(t, ok)
assert.Equal(t, 0, len(nc.Devices))
// nvidiaDevices contains devices but they are different than what
// is returned by parseDevicesCgroup. We should get an error.
nm.nvidiaDevices = map[int]gonvml.Device{0: {}, 1: {}}
ac, err = nm.GetCollector("does-not-matter")
assert.NotNil(t, err)
assert.NotNil(t, ac)
nc, ok = ac.(*NvidiaCollector)
assert.True(t, ok)
assert.Equal(t, 0, len(nc.Devices))
// nvidiaDevices contains devices returned by parseDevicesCgroup.
// No error should be returned and collectors devices array should be
// correctly initialized.
nm.nvidiaDevices[2] = gonvml.Device{}
nm.nvidiaDevices[3] = gonvml.Device{}
ac, err = nm.GetCollector("does-not-matter")
assert.Nil(t, err)
assert.NotNil(t, ac)
nc, ok = ac.(*NvidiaCollector)
assert.True(t, ok)
assert.Equal(t, 2, len(nc.Devices))
}
func TestParseDevicesCgroup(t *testing.T) {
// Test case for empty devices cgroup path
nvidiaMinorNumbers, err := parseDevicesCgroup("")
assert.NotNil(t, err)
assert.Equal(t, []int{}, nvidiaMinorNumbers)
// Test case for non-existent devices cgroup
nvidiaMinorNumbers, err = parseDevicesCgroup("/non-existent-path")
assert.NotNil(t, err)
assert.Equal(t, []int{}, nvidiaMinorNumbers)
// Create temporary directory to represent devices cgroup.
tmpDir, err := ioutil.TempDir("", "devices-cgroup")
if err != nil {
t.Fatalf("Error creating temporary directory for testing: %v", err)
}
defer os.RemoveAll(tmpDir)
tmpfn := filepath.Join(tmpDir, "devices.list")
// Test case when devices.list file has more than three fields.
updateFile(t, tmpfn, []byte("c 1:2 rwm badformat\n"))
nvidiaMinorNumbers, err = parseDevicesCgroup(tmpDir)
assert.NotNil(t, err)
assert.Equal(t, []int{}, nvidiaMinorNumbers)
// Test case when devices.list file's second field is not major:minor.
updateFile(t, tmpfn, []byte("c badformat rwm\n"))
nvidiaMinorNumbers, err = parseDevicesCgroup(tmpDir)
assert.NotNil(t, err)
assert.Equal(t, []int{}, nvidiaMinorNumbers)
// Test case with nvidia devices present
updateFile(t, tmpfn, []byte("c 195:0 rwm\nc 195:255 rwm\nc 195:1 rwm"))
nvidiaMinorNumbers, err = parseDevicesCgroup(tmpDir)
assert.Nil(t, err)
assert.Equal(t, []int{0, 1}, nvidiaMinorNumbers) // Note that 255 is not supposed to be returned.
// Test case with a common devices.list file
updateFile(t, tmpfn, []byte("a *:* rwm\n"))
nvidiaMinorNumbers, err = parseDevicesCgroup(tmpDir)
assert.Nil(t, err)
assert.Equal(t, []int{}, nvidiaMinorNumbers)
// Test case for empty devices.list file
updateFile(t, tmpfn, []byte(""))
nvidiaMinorNumbers, err = parseDevicesCgroup(tmpDir)
assert.Nil(t, err)
assert.Equal(t, []int{}, nvidiaMinorNumbers)
}

accelerators/types.go (new file, 32 lines)

@ -0,0 +1,32 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package accelerators
import info "github.com/google/cadvisor/info/v1"
// AcceleratorManager stores global state for an accelerator metrics collector.
// The cadvisor manager calls Setup() when it starts and Destroy() when it stops.
// For each container it detects, the cadvisor manager calls GetCollector()
// with the devices cgroup path for that container. GetCollector() returns an
// object that can update accelerator stats for that container.
type AcceleratorManager interface {
Setup()
Destroy()
GetCollector(deviceCgroup string) (AcceleratorCollector, error)
}
type AcceleratorCollector interface {
UpdateStats(*info.ContainerStats) error
}
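
For illustration only (not part of this PR), here is a minimal sketch of how these interfaces can be satisfied. The noopManager and noopCollector names are hypothetical; the sketch just shows the contract the cadvisor manager relies on:

package accelerators

import info "github.com/google/cadvisor/info/v1"

// noopManager is a hypothetical AcceleratorManager that never finds devices.
type noopManager struct{}

func (m *noopManager) Setup()   {}
func (m *noopManager) Destroy() {}

// GetCollector returns a collector that leaves container stats untouched.
func (m *noopManager) GetCollector(devicesCgroupPath string) (AcceleratorCollector, error) {
	return &noopCollector{}, nil
}

// noopCollector is a hypothetical AcceleratorCollector with no devices to report on.
type noopCollector struct{}

func (c *noopCollector) UpdateStats(stats *info.ContainerStats) error {
	return nil
}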

View File

@ -37,7 +37,6 @@ if [ "${go_version:0:3}" = "1.4" ]; then
fi
ldflags="
-extldflags '-static'
-X ${repo_path}/version.Version${ldseparator}${version}
-X ${repo_path}/version.Revision${ldseparator}${revision}
-X ${repo_path}/version.Branch${ldseparator}${branch}

View File

@ -84,6 +84,7 @@ var supportedSubsystems map[string]struct{} = map[string]struct{}{
"memory": {},
"cpuset": {},
"blkio": {},
"devices": {},
}
// Get cgroup and networking stats of the specified container

View File

@ -520,6 +520,29 @@ type FsStats struct {
WeightedIoTime uint64 `json:"weighted_io_time"`
}
type AcceleratorStats struct {
// Make of the accelerator (nvidia, amd, google etc.)
Make string `json:"make"`
// Model of the accelerator (tesla-p100, tesla-k80 etc.)
Model string `json:"model"`
// ID of the accelerator.
ID string `json:"id"`
// Total accelerator memory.
// unit: bytes
MemoryTotal uint64 `json:"memory_total"`
// Total accelerator memory allocated.
// unit: bytes
MemoryUsed uint64 `json:"memory_used"`
// Percent of time over the past sample period during which
// the accelerator was actively processing.
DutyCycle uint64 `json:"duty_cycle"`
}
type ContainerStats struct {
// The time of this stat point.
Timestamp time.Time `json:"timestamp"`
@ -534,6 +557,9 @@ type ContainerStats struct {
// Task load stats
TaskStats LoadStats `json:"task_stats,omitempty"`
// Metrics for Accelerators. Each Accelerator corresponds to one element in the array.
Accelerators []AcceleratorStats `json:"accelerators,omitempty"`
// Custom metrics from all collectors
CustomMetrics map[string][]MetricVal `json:"custom_metrics,omitempty"`
}
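
For illustration only (not part of this diff), a short sketch of how the new field serializes. The numbers are made up; the snippet simply marshals a ContainerStats with one AcceleratorStats entry to JSON:

package main

import (
	"encoding/json"
	"fmt"
	"time"

	info "github.com/google/cadvisor/info/v1"
)

func main() {
	stats := info.ContainerStats{
		Timestamp: time.Now(),
		Accelerators: []info.AcceleratorStats{{
			Make:        "nvidia",
			Model:       "tesla-k80", // sample values only
			ID:          "GPU-00000000-0000-0000-0000-000000000000",
			MemoryTotal: 11995578368, // bytes
			MemoryUsed:  1073741824,  // bytes
			DutyCycle:   25,          // percent
		}},
	}
	// Because the field is tagged `json:"accelerators,omitempty"`, it only
	// appears in the output when at least one accelerator was sampled.
	out, _ := json.MarshalIndent(&stats, "", "  ")
	fmt.Println(string(out))
}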

View File

@ -146,6 +146,8 @@ type ContainerStats struct {
Filesystem *FilesystemStats `json:"filesystem,omitempty"`
// Task load statistics
Load *v1.LoadStats `json:"load_stats,omitempty"`
// Metrics for Accelerators. Each Accelerator corresponds to one element in the array.
Accelerators []v1.AcceleratorStats `json:"accelerators,omitempty"`
// Custom Metrics
CustomMetrics map[string][]v1.MetricVal `json:"custom_metrics,omitempty"`
}

View File

@ -142,6 +142,9 @@ func ContainerStatsFromV1(containerName string, spec *v1.ContainerSpec, stats []
if spec.HasCustomMetrics {
stat.CustomMetrics = val.CustomMetrics
}
if len(val.Accelerators) > 0 {
stat.Accelerators = val.Accelerators
}
// TODO(rjnagal): Handle load stats.
newStats = append(newStats, stat)
}

View File

@ -176,6 +176,14 @@ func TestContainerStatsFromV1(t *testing.T) {
Available: 300,
InodesFree: 100,
}},
Accelerators: []v1.AcceleratorStats{{
Make: "nvidia",
Model: "tesla-p100",
ID: "GPU-deadbeef-1234-5678-90ab-feedfacecafe",
MemoryTotal: 20304050607,
MemoryUsed: 2030405060,
DutyCycle: 12,
}},
}
expectedV2Stats := ContainerStats{
Timestamp: timestamp,
@ -190,6 +198,7 @@ func TestContainerStatsFromV1(t *testing.T) {
BaseUsageBytes: &v1Stats.Filesystem[0].BaseUsage,
InodeUsage: &v1Stats.Filesystem[0].Inodes,
},
Accelerators: v1Stats.Accelerators,
}
v2Stats := ContainerStatsFromV1("test", &v1Spec, []*v1.ContainerStats{&v1Stats})

View File

@ -29,6 +29,7 @@ import (
"sync"
"time"
"github.com/google/cadvisor/accelerators"
"github.com/google/cadvisor/cache/memory"
"github.com/google/cadvisor/collector"
"github.com/google/cadvisor/container"
@ -78,6 +79,9 @@ type containerData struct {
// Runs custom metric collectors.
collectorManager collector.CollectorManager
// nvidiaCollector updates stats for Nvidia GPUs attached to the container.
nvidiaCollector accelerators.AcceleratorCollector
}
// jitter returns a time.Duration between duration and duration + maxFactor * duration,
@ -557,6 +561,12 @@ func (c *containerData) updateStats() error {
}
}
var nvidiaStatsErr error
if c.nvidiaCollector != nil {
// This updates the Accelerators field of the stats struct
nvidiaStatsErr = c.nvidiaCollector.UpdateStats(stats)
}
ref, err := c.handler.ContainerReference()
if err != nil {
// Ignore errors if the container is dead.
@ -572,6 +582,9 @@ func (c *containerData) updateStats() error {
if statsErr != nil {
return statsErr
}
if nvidiaStatsErr != nil {
return nvidiaStatsErr
}
return customStatsErr
}

View File

@ -29,6 +29,8 @@ import (
info "github.com/google/cadvisor/info/v1"
itest "github.com/google/cadvisor/info/v1/test"
"github.com/google/cadvisor/accelerators"
"github.com/mindprince/gonvml"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
@ -205,3 +207,22 @@ func TestGetInfo(t *testing.T) {
t.Errorf("received wrong container name: received %v; should be %v", info.Name, mockHandler.Name)
}
}
func TestUpdateNvidiaStats(t *testing.T) {
cd, _, _ := newTestContainerData(t)
stats := info.ContainerStats{}
// When there are no devices, we should not get an error and stats should not change.
cd.nvidiaCollector = &accelerators.NvidiaCollector{}
err := cd.nvidiaCollector.UpdateStats(&stats)
assert.Nil(t, err)
assert.Equal(t, info.ContainerStats{}, stats)
// This is an impossible situation (there are devices but nvml is not initialized).
// Here I am testing that the CGo gonvml library doesn't panic when passed bad
// input and instead returns an error.
cd.nvidiaCollector = &accelerators.NvidiaCollector{Devices: []gonvml.Device{{}, {}}}
err = cd.nvidiaCollector.UpdateStats(&stats)
assert.NotNil(t, err)
assert.Equal(t, info.ContainerStats{}, stats)
}

View File

@ -18,6 +18,7 @@ package manager
import (
"flag"
"fmt"
"net/http"
"os"
"path"
"strconv"
@ -25,6 +26,7 @@ import (
"sync"
"time"
"github.com/google/cadvisor/accelerators"
"github.com/google/cadvisor/cache/memory"
"github.com/google/cadvisor/collector"
"github.com/google/cadvisor/container"
@ -45,8 +47,6 @@ import (
"github.com/google/cadvisor/utils/sysfs"
"github.com/google/cadvisor/version"
"net/http"
"github.com/golang/glog"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
@ -212,6 +212,7 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, maxHousekeepingIn
containerWatchers: []watcher.ContainerWatcher{},
eventsChannel: eventsChannel,
collectorHttpClient: collectorHttpClient,
nvidiaManager: &accelerators.NvidiaManager{},
}
machineInfo, err := machine.Info(sysfs, fsInfo, inHostNamespace)
@ -257,6 +258,7 @@ type manager struct {
containerWatchers []watcher.ContainerWatcher
eventsChannel chan watcher.ContainerEvent
collectorHttpClient *http.Client
nvidiaManager accelerators.AcceleratorManager
}
// Start the container manager.
@ -309,6 +311,9 @@ func (self *manager) Start() error {
return nil
}
// Set up collection of NVIDIA GPU metrics if any GPUs are attached to the machine.
self.nvidiaManager.Setup()
// Create root and then recover all containers.
err = self.createContainer("/", watcher.Raw)
if err != nil {
@ -338,6 +343,7 @@ func (self *manager) Start() error {
}
func (self *manager) Stop() error {
defer self.nvidiaManager.Destroy()
// Stop and wait on all quit channels.
for i, c := range self.quitChannels {
// Send the exit signal and wait on the thread to exit (by closing the channel).
@ -917,6 +923,15 @@ func (m *manager) createContainerLocked(containerName string, watchSource watche
if err != nil {
return err
}
devicesCgroupPath, err := handler.GetCgroupPath("devices")
if err != nil {
glog.Infof("Error getting devices cgroup path: %v", err)
} else {
cont.nvidiaCollector, err = m.nvidiaManager.GetCollector(devicesCgroupPath)
if err != nil {
glog.Infof("GPU metrics may be unavailable/incomplete for container %q: %v", cont.info.Name, err)
}
}
// Add collectors
labels := handler.GetContainerLabels()

View File

@ -271,6 +271,51 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc) *PrometheusCo
},
}
},
}, {
name: "container_accelerator_memory_total_bytes",
help: "Total accelerator memory.",
valueType: prometheus.GaugeValue,
extraLabels: []string{"make", "model", "acc_id"},
getValues: func(s *info.ContainerStats) metricValues {
values := make(metricValues, 0, len(s.Accelerators))
for _, value := range s.Accelerators {
values = append(values, metricValue{
value: float64(value.MemoryTotal),
labels: []string{value.Make, value.Model, value.ID},
})
}
return values
},
}, {
name: "container_accelerator_memory_used_bytes",
help: "Total accelerator memory allocated.",
valueType: prometheus.GaugeValue,
extraLabels: []string{"make", "model", "acc_id"},
getValues: func(s *info.ContainerStats) metricValues {
values := make(metricValues, 0, len(s.Accelerators))
for _, value := range s.Accelerators {
values = append(values, metricValue{
value: float64(value.MemoryUsed),
labels: []string{value.Make, value.Model, value.ID},
})
}
return values
},
}, {
name: "container_accelerator_duty_cycle",
help: "Percent of time over the past sample period during which the accelerator was actively processing.",
valueType: prometheus.GaugeValue,
extraLabels: []string{"make", "model", "acc_id"},
getValues: func(s *info.ContainerStats) metricValues {
values := make(metricValues, 0, len(s.Accelerators))
for _, value := range s.Accelerators {
values = append(values, metricValue{
value: float64(value.DutyCycle),
labels: []string{value.Make, value.Model, value.ID},
})
}
return values
},
}, {
name: "container_fs_inodes_free",
help: "Number of available Inodes",

View File

@ -191,6 +191,24 @@ func (p testSubcontainersInfoProvider) SubcontainersInfo(string, *info.Container
WeightedIoTime: 49,
},
},
Accelerators: []info.AcceleratorStats{
{
Make: "nvidia",
Model: "tesla-p100",
ID: "GPU-deadbeef-1234-5678-90ab-feedfacecafe",
MemoryTotal: 20304050607,
MemoryUsed: 2030405060,
DutyCycle: 12,
},
{
Make: "nvidia",
Model: "tesla-k80",
ID: "GPU-deadbeef-0123-4567-89ab-feedfacecafe",
MemoryTotal: 10203040506,
MemoryUsed: 1020304050,
DutyCycle: 6,
},
},
TaskStats: info.LoadStats{
NrSleeping: 50,
NrRunning: 51,

View File

@ -1,6 +1,18 @@
# HELP cadvisor_version_info A metric with a constant '1' value labeled by kernel version, OS version, docker version, cadvisor version & cadvisor revision.
# TYPE cadvisor_version_info gauge
cadvisor_version_info{cadvisorRevision="abcdef",cadvisorVersion="0.16.0",dockerVersion="1.8.1",kernelVersion="4.1.6-200.fc22.x86_64",osVersion="Fedora 22 (Twenty Two)"} 1
# HELP container_accelerator_duty_cycle Percent of time over the past sample period during which the accelerator was actively processing.
# TYPE container_accelerator_duty_cycle gauge
container_accelerator_duty_cycle{acc_id="GPU-deadbeef-0123-4567-89ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-k80",name="testcontaineralias",zone_name="hello"} 6
container_accelerator_duty_cycle{acc_id="GPU-deadbeef-1234-5678-90ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-p100",name="testcontaineralias",zone_name="hello"} 12
# HELP container_accelerator_memory_total_bytes Total accelerator memory.
# TYPE container_accelerator_memory_total_bytes gauge
container_accelerator_memory_total_bytes{acc_id="GPU-deadbeef-0123-4567-89ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-k80",name="testcontaineralias",zone_name="hello"} 1.0203040506e+10
container_accelerator_memory_total_bytes{acc_id="GPU-deadbeef-1234-5678-90ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-p100",name="testcontaineralias",zone_name="hello"} 2.0304050607e+10
# HELP container_accelerator_memory_used_bytes Total accelerator memory allocated.
# TYPE container_accelerator_memory_used_bytes gauge
container_accelerator_memory_used_bytes{acc_id="GPU-deadbeef-0123-4567-89ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-k80",name="testcontaineralias",zone_name="hello"} 1.02030405e+09
container_accelerator_memory_used_bytes{acc_id="GPU-deadbeef-1234-5678-90ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-p100",name="testcontaineralias",zone_name="hello"} 2.03040506e+09
# HELP container_cpu_cfs_periods_total Number of elapsed enforcement period intervals.
# TYPE container_cpu_cfs_periods_total counter
container_cpu_cfs_periods_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 723

vendor/github.com/mindprince/gonvml/LICENSE (generated, vendored, new file, 202 lines)

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

vendor/github.com/mindprince/gonvml/Makefile (generated, vendored, new file, 20 lines)

@ -0,0 +1,20 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
PKG=github.com/mindprince/gonvml
.PHONY: build
build:
docker run -v $(shell pwd):/go/src/$(PKG) --workdir=/go/src/$(PKG) golang:1.8 go build cmd/example/example.go

vendor/github.com/mindprince/gonvml/NVML_NOTICE (generated, vendored, new file, 32 lines)

@ -0,0 +1,32 @@
Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
NOTICE TO USER:
This source code is subject to NVIDIA ownership rights under U.S. and
international Copyright laws. Users and possessors of this source code
are hereby granted a nonexclusive, royalty-free license to use this code
in individual and commercial software.
NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
OR PERFORMANCE OF THIS SOURCE CODE.
U.S. Government End Users. This source code is a "commercial item" as
that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
"commercial computer software" and "commercial computer software
documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
and is provided to the U.S. Government only as a commercial end item.
Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
source code with only those rights set forth herein.
Any use of this source code in individual and commercial software must
include, in the user documentation and internal comments to the code,
the above Disclaimer and U.S. Government End Users Notice.

vendor/github.com/mindprince/gonvml/README.md (generated, vendored, new file, 19 lines)

@ -0,0 +1,19 @@
Go Bindings for NVML
--------------------
[NVML or NVIDIA Management
Library](https://developer.nvidia.com/nvidia-management-library-nvml) is a
C-based API that can be used for monitoring NVIDIA GPU devices. It's closed
source but can be downloaded as part of the [GPU Deployment
Kit](https://developer.nvidia.com/gpu-deployment-kit).
The [NVML API
Reference](http://docs.nvidia.com/deploy/nvml-api/nvml-api-reference.html)
describes the various methods that are available as part of NVML.
The `nvml.h` file is included in this repository so that we don't depend on
the presence of NVML in the build environment.
The `bindings.go` file is the cgo bridge which calls the NVML functions. The
cgo preamble in `bindings.go` uses `dlopen` to dynamically load NVML and makes
its functions available.
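
A minimal usage sketch (not part of this repository), based on the API defined in bindings.go below; it assumes libnvidia-ml.so.1 is present at run time and abbreviates error handling:

package main

import (
	"fmt"
	"log"
	"time"

	"github.com/mindprince/gonvml"
)

func main() {
	// Initialize dlopens libnvidia-ml.so.1 and calls nvmlInit().
	if err := gonvml.Initialize(); err != nil {
		log.Fatalf("could not initialize NVML: %v", err)
	}
	defer gonvml.Shutdown()

	count, err := gonvml.DeviceCount()
	if err != nil {
		log.Fatalf("could not get device count: %v", err)
	}
	for i := uint(0); i < count; i++ {
		dev, err := gonvml.DeviceHandleByIndex(i)
		if err != nil {
			log.Printf("device %d: %v", i, err)
			continue
		}
		name, _ := dev.Name()
		minor, _ := dev.MinorNumber()
		total, used, _ := dev.MemoryInfo()
		util, _ := dev.AverageGPUUtilization(10 * time.Second)
		fmt.Printf("/dev/nvidia%d %s: memory %d/%d bytes, gpu utilization %d%%\n",
			minor, name, used, total, util)
	}
}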

vendor/github.com/mindprince/gonvml/bindings.go (generated, vendored, new file, 280 lines)

@ -0,0 +1,280 @@
/*
Copyright 2017 Google Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package gonvml
// #cgo LDFLAGS: -ldl -Wl,--unresolved-symbols=ignore-in-object-files
/*
#include <stddef.h>
#include <dlfcn.h>
#include <stdlib.h>
#include "nvml.h"
// nvmlHandle is the handle for dynamically loaded libnvidia-ml.so
void *nvmlHandle;
// Loads the "libnvidia-ml.so.1" shared library and initializes NVML.
// Call this before calling any other methods.
nvmlReturn_t nvmlInit_dl(void) {
nvmlHandle = dlopen("libnvidia-ml.so.1", RTLD_LAZY | RTLD_GLOBAL);
if (nvmlHandle == NULL) {
return (NVML_ERROR_LIBRARY_NOT_FOUND);
}
return (nvmlInit());
}
// Shuts down NVML and decrements the reference count on the dynamically loaded
// "libnvidia-ml.so.1" library.
// Call this once NVML is no longer being used.
nvmlReturn_t nvmlShutdown_dl(void) {
if (nvmlHandle == NULL) {
return NVML_SUCCESS;
}
nvmlReturn_t r = nvmlShutdown();
if (r != NVML_SUCCESS) {
return (r);
}
return (dlclose(nvmlHandle) ? NVML_ERROR_UNKNOWN : NVML_SUCCESS);
}
// This function is here because the API provided by NVML is not very user
// friendly. This function can be used to get average utilization.gpu or
// power.draw.
//
// `device`: The identifier of the target device.
// `type`: Type of sampling event. Only NVML_TOTAL_POWER_SAMPLES and NVML_GPU_UTILIZATION_SAMPLES are supported.
// `lastSeenTimeStamp`: Return average using samples with timestamp greater than this timestamp. Unix epoch in microseconds.
// `averageUsage`: Reference in which average is returned.
//
// In my experiments, I found that NVML_GPU_UTILIZATION_SAMPLES buffer stores
// 100 samples that are uniformly spread with ~6 samples per second. So the
// buffer stores last ~16s of data.
// NVML_TOTAL_POWER_SAMPLES buffer stores 120 samples, but in different runs I
// noticed them to be non-uniformly separated. Sometimes 120 samples only
// consisted of 10s of data and sometimes they were spread over 60s.
//
nvmlReturn_t nvmlDeviceGetAverageUsage(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp, unsigned int* averageUsage) {
if (nvmlHandle == NULL) {
return (NVML_ERROR_LIBRARY_NOT_FOUND);
}
// We don't really use this because both the metrics we support
// averagePowerUsage and averageGPUUtilization are unsigned int.
nvmlValueType_t sampleValType;
// This will be set to the number of samples that can be queried. We would
// need to allocate an array of this size to store the samples.
unsigned int sampleCount;
// Invoking this method with `samples` set to NULL sets the sampleCount.
nvmlReturn_t r = nvmlDeviceGetSamples(device, type, lastSeenTimeStamp, &sampleValType, &sampleCount, NULL);
if (r != NVML_SUCCESS) {
return (r);
}
// Allocate memory to store sampleCount samples.
// In my experiments, the sampleCount at this stage was always 120 for
// NVML_TOTAL_POWER_SAMPLES and 100 for NVML_GPU_UTILIZATION_SAMPLES
nvmlSample_t* samples = (nvmlSample_t*) malloc(sampleCount * sizeof(nvmlSample_t));
r = nvmlDeviceGetSamples(device, type, lastSeenTimeStamp, &sampleValType, &sampleCount, samples);
if (r != NVML_SUCCESS) {
free(samples);
return (r);
}
int i = 0;
unsigned int sum = 0;
for (; i < sampleCount; i++) {
sum += samples[i].sampleValue.uiVal;
}
*averageUsage = sum/sampleCount;
free(samples);
return (r);
}
*/
import "C"
import (
"errors"
"fmt"
"time"
)
const (
szDriver = C.NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE
szName = C.NVML_DEVICE_NAME_BUFFER_SIZE
szUUID = C.NVML_DEVICE_UUID_BUFFER_SIZE
)
var errLibraryNotLoaded = errors.New("could not load NVML library")
// Initialize initializes NVML.
// Call this before calling any other methods.
func Initialize() error {
return errorString(C.nvmlInit_dl())
}
// Shutdown shuts down NVML.
// Call this once NVML is no longer being used.
func Shutdown() error {
return errorString(C.nvmlShutdown_dl())
}
// errorString takes a nvmlReturn_t and converts it into a golang error.
// It uses a nvml method to convert to a user friendly error message.
func errorString(ret C.nvmlReturn_t) error {
if ret == C.NVML_SUCCESS {
return nil
}
// We need to special case this because if nvml library is not found
// nvmlErrorString() method will not work.
if ret == C.NVML_ERROR_LIBRARY_NOT_FOUND || C.nvmlHandle == nil {
return errLibraryNotLoaded
}
err := C.GoString(C.nvmlErrorString(ret))
return fmt.Errorf("nvml: %v", err)
}
// SystemDriverVersion returns the driver version on the system.
func SystemDriverVersion() (string, error) {
if C.nvmlHandle == nil {
return "", errLibraryNotLoaded
}
var driver [szDriver]C.char
r := C.nvmlSystemGetDriverVersion(&driver[0], szDriver)
return C.GoString(&driver[0]), errorString(r)
}
// DeviceCount returns the number of nvidia devices on the system.
func DeviceCount() (uint, error) {
if C.nvmlHandle == nil {
return 0, errLibraryNotLoaded
}
var n C.uint
r := C.nvmlDeviceGetCount(&n)
return uint(n), errorString(r)
}
// Device is the handle for the device.
// This handle is obtained by calling DeviceHandleByIndex().
type Device struct {
dev C.nvmlDevice_t
}
// DeviceHandleByIndex returns the device handle for a particular index.
// The indices range from 0 to DeviceCount()-1. The order in which NVML
// enumerates devices has no guarantees of consistency between reboots.
func DeviceHandleByIndex(idx uint) (Device, error) {
if C.nvmlHandle == nil {
return Device{}, errLibraryNotLoaded
}
var dev C.nvmlDevice_t
r := C.nvmlDeviceGetHandleByIndex(C.uint(idx), &dev)
return Device{dev}, errorString(r)
}
// MinorNumber returns the minor number for the device.
// The minor number for the device is such that the Nvidia device node
// file for each GPU will have the form /dev/nvidia[minor number].
func (d Device) MinorNumber() (uint, error) {
if C.nvmlHandle == nil {
return 0, errLibraryNotLoaded
}
var n C.uint
r := C.nvmlDeviceGetMinorNumber(d.dev, &n)
return uint(n), errorString(r)
}
// UUID returns the globally unique immutable UUID associated with this device.
func (d Device) UUID() (string, error) {
if C.nvmlHandle == nil {
return "", errLibraryNotLoaded
}
var uuid [szUUID]C.char
r := C.nvmlDeviceGetUUID(d.dev, &uuid[0], szUUID)
return C.GoString(&uuid[0]), errorString(r)
}
// Name returns the product name of the device.
func (d Device) Name() (string, error) {
if C.nvmlHandle == nil {
return "", errLibraryNotLoaded
}
var name [szName]C.char
r := C.nvmlDeviceGetName(d.dev, &name[0], szName)
return C.GoString(&name[0]), errorString(r)
}
// MemoryInfo returns the total and used memory (in bytes) of the device.
func (d Device) MemoryInfo() (uint64, uint64, error) {
if C.nvmlHandle == nil {
return 0, 0, errLibraryNotLoaded
}
var memory C.nvmlMemory_t
r := C.nvmlDeviceGetMemoryInfo(d.dev, &memory)
return uint64(memory.total), uint64(memory.used), errorString(r)
}
// UtilizationRates returns the percent of time over the past sample period during which:
// utilization.gpu: one or more kernels were executing on the GPU.
// utilization.memory: global (device) memory was being read or written.
func (d Device) UtilizationRates() (uint, uint, error) {
if C.nvmlHandle == nil {
return 0, 0, errLibraryNotLoaded
}
var utilization C.nvmlUtilization_t
r := C.nvmlDeviceGetUtilizationRates(d.dev, &utilization)
return uint(utilization.gpu), uint(utilization.memory), errorString(r)
}
// PowerUsage returns the power usage for this GPU and its associated circuitry
// in milliwatts. The reading is accurate to within +/- 5% of current power draw.
func (d Device) PowerUsage() (uint, error) {
if C.nvmlHandle == nil {
return 0, errLibraryNotLoaded
}
var n C.uint
r := C.nvmlDeviceGetPowerUsage(d.dev, &n)
return uint(n), errorString(r)
}
// AveragePowerUsage returns the power usage for this GPU and its associated circuitry
// in milliwatts averaged over the samples collected in the last `since` duration.
func (d Device) AveragePowerUsage(since time.Duration) (uint, error) {
if C.nvmlHandle == nil {
return 0, errLibraryNotLoaded
}
lastTs := C.ulonglong(time.Now().Add(-1*since).UnixNano() / 1000)
var n C.uint
r := C.nvmlDeviceGetAverageUsage(d.dev, C.NVML_TOTAL_POWER_SAMPLES, lastTs, &n)
return uint(n), errorString(r)
}
// AverageGPUUtilization returns the utilization.gpu metric (percent of time
// one or more kernels were executing on the GPU) averaged over the samples
// collected in the last `since` duration.
func (d Device) AverageGPUUtilization(since time.Duration) (uint, error) {
if C.nvmlHandle == nil {
return 0, errLibraryNotLoaded
}
lastTs := C.ulonglong(time.Now().Add(-1*since).UnixNano() / 1000)
var n C.uint
r := C.nvmlDeviceGetAverageUsage(d.dev, C.NVML_GPU_UTILIZATION_SAMPLES, lastTs, &n)
return uint(n), errorString(r)
}

vendor/github.com/mindprince/gonvml/nvml.h (generated, vendored, new file, 5605 lines)

File diff suppressed because it is too large.