Moving Nvidia interfaces to stats package so that they can be used outside of the accelerators package

Signed-off-by: Maciej "Iwan" Iwanowski <maciej.iwanowski@intel.com>
This commit is contained in:
Maciej "Iwan" Iwanowski 2020-03-17 09:49:50 +01:00
parent 9921cb3e21
commit adf41ba206
No known key found for this signature in database
GPG Key ID: 2484258A4DD3EE84
5 changed files with 28 additions and 19 deletions

View File

@ -16,6 +16,7 @@ package accelerators
import ( import (
"bufio" "bufio"
"fmt" "fmt"
"github.com/google/cadvisor/stats"
"io/ioutil" "io/ioutil"
"os" "os"
"path/filepath" "path/filepath"
@ -30,7 +31,7 @@ import (
"k8s.io/klog" "k8s.io/klog"
) )
type NvidiaManager struct { type nvidiaManager struct {
sync.Mutex sync.Mutex
// true if there are NVIDIA devices present on the node // true if there are NVIDIA devices present on the node
@ -47,8 +48,12 @@ var sysFsPCIDevicesPath = "/sys/bus/pci/devices/"
const nvidiaVendorId = "0x10de" const nvidiaVendorId = "0x10de"
// NewNvidiaManager returns a pointer to a zero-valued nvidiaManager as a
// stats.Manager. The concrete type is unexported, so code outside this
// package obtains a manager through this constructor.
func NewNvidiaManager() stats.Manager {
return &nvidiaManager{}
}
// Setup initializes NVML if nvidia devices are present on the node. // Setup initializes NVML if nvidia devices are present on the node.
func (nm *NvidiaManager) Setup() { func (nm *nvidiaManager) Setup() {
if !detectDevices(nvidiaVendorId) { if !detectDevices(nvidiaVendorId) {
klog.V(4).Info("No NVIDIA devices found.") klog.V(4).Info("No NVIDIA devices found.")
return return
@ -84,7 +89,7 @@ func detectDevices(vendorId string) bool {
// initializeNVML initializes the NVML library and sets up the nvmlDevices map. // initializeNVML initializes the NVML library and sets up the nvmlDevices map.
// This is defined as a variable to help in testing. // This is defined as a variable to help in testing.
var initializeNVML = func(nm *NvidiaManager) { var initializeNVML = func(nm *nvidiaManager) {
if err := gonvml.Initialize(); err != nil { if err := gonvml.Initialize(); err != nil {
// This is under a logging level because otherwise we may cause // This is under a logging level because otherwise we may cause
// log spam if the drivers/nvml is not installed on the system. // log spam if the drivers/nvml is not installed on the system.
@ -115,7 +120,7 @@ var initializeNVML = func(nm *NvidiaManager) {
} }
// Destroy shuts down NVML. // Destroy shuts down NVML.
func (nm *NvidiaManager) Destroy() { func (nm *nvidiaManager) Destroy() {
if nm.nvmlInitialized { if nm.nvmlInitialized {
gonvml.Shutdown() gonvml.Shutdown()
} }
@ -123,7 +128,7 @@ func (nm *NvidiaManager) Destroy() {
// GetCollector returns a collector that can fetch nvidia gpu metrics for nvidia devices // GetCollector returns a collector that can fetch nvidia gpu metrics for nvidia devices
// present in the devices.list file in the given devicesCgroupPath. // present in the devices.list file in the given devicesCgroupPath.
func (nm *NvidiaManager) GetCollector(devicesCgroupPath string) (AcceleratorCollector, error) { func (nm *nvidiaManager) GetCollector(devicesCgroupPath string) (stats.Collector, error) {
nc := &NvidiaCollector{} nc := &NvidiaCollector{}
if !nm.devicesPresent { if !nm.devicesPresent {

View File

@ -72,13 +72,13 @@ func TestGetCollector(t *testing.T) {
} }
parseDevicesCgroup = mockParser parseDevicesCgroup = mockParser
originalInitializeNVML := initializeNVML originalInitializeNVML := initializeNVML
initializeNVML = func(_ *NvidiaManager) {} initializeNVML = func(_ *nvidiaManager) {}
defer func() { defer func() {
parseDevicesCgroup = originalParser parseDevicesCgroup = originalParser
initializeNVML = originalInitializeNVML initializeNVML = originalInitializeNVML
}() }()
nm := &NvidiaManager{} nm := &nvidiaManager{}
// When devicesPresent is false, empty collector should be returned. // When devicesPresent is false, empty collector should be returned.
ac, err := nm.GetCollector("does-not-matter") ac, err := nm.GetCollector("does-not-matter")

View File

@ -17,6 +17,7 @@ package manager
import ( import (
"flag" "flag"
"fmt" "fmt"
"github.com/google/cadvisor/stats"
"io/ioutil" "io/ioutil"
"math" "math"
"math/rand" "math/rand"
@ -29,7 +30,6 @@ import (
"sync" "sync"
"time" "time"
"github.com/google/cadvisor/accelerators"
"github.com/google/cadvisor/cache/memory" "github.com/google/cadvisor/cache/memory"
"github.com/google/cadvisor/collector" "github.com/google/cadvisor/collector"
"github.com/google/cadvisor/container" "github.com/google/cadvisor/container"
@ -90,7 +90,7 @@ type containerData struct {
collectorManager collector.CollectorManager collectorManager collector.CollectorManager
// nvidiaCollector updates stats for Nvidia GPUs attached to the container. // nvidiaCollector updates stats for Nvidia GPUs attached to the container.
nvidiaCollector accelerators.AcceleratorCollector nvidiaCollector stats.Collector
} }
// jitter returns a time.Duration between duration and duration + maxFactor * duration, // jitter returns a time.Duration between duration and duration + maxFactor * duration,

View File

@ -18,6 +18,7 @@ package manager
import ( import (
"flag" "flag"
"fmt" "fmt"
"github.com/google/cadvisor/stats"
"net/http" "net/http"
"os" "os"
"path" "path"
@ -181,7 +182,7 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, maxHousekeepingIn
containerWatchers: []watcher.ContainerWatcher{}, containerWatchers: []watcher.ContainerWatcher{},
eventsChannel: eventsChannel, eventsChannel: eventsChannel,
collectorHttpClient: collectorHttpClient, collectorHttpClient: collectorHttpClient,
nvidiaManager: &accelerators.NvidiaManager{}, nvidiaManager: accelerators.NewNvidiaManager(),
rawContainerCgroupPathPrefixWhiteList: rawContainerCgroupPathPrefixWhiteList, rawContainerCgroupPathPrefixWhiteList: rawContainerCgroupPathPrefixWhiteList,
} }
@ -230,7 +231,7 @@ type manager struct {
containerWatchers []watcher.ContainerWatcher containerWatchers []watcher.ContainerWatcher
eventsChannel chan watcher.ContainerEvent eventsChannel chan watcher.ContainerEvent
collectorHttpClient *http.Client collectorHttpClient *http.Client
nvidiaManager accelerators.AcceleratorManager nvidiaManager stats.Manager
// List of raw container cgroup path prefix whitelist. // List of raw container cgroup path prefix whitelist.
rawContainerCgroupPathPrefixWhiteList []string rawContainerCgroupPathPrefixWhiteList []string
} }

View File

@ -1,4 +1,4 @@
// Copyright 2017 Google Inc. All Rights Reserved. // Copyright 2020 Google Inc. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
@ -11,22 +11,25 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
package accelerators
// Handling statistics that are fully controlled in cAdvisor
package stats
import info "github.com/google/cadvisor/info/v1" import info "github.com/google/cadvisor/info/v1"
// This is supposed to store global state about an accelerator metrics collector. // This is supposed to store global state about a cAdvisor metrics collector.
// cadvisor manager will call Setup() when it starts and Destroy() when it stops. // cAdvisor manager will call Setup() when it starts and Destroy() when it stops.
// For each container detected by the cadvisor manager, it will call // For each container detected by the cAdvisor manager, it will call
// GetCollector() with the devices cgroup path for that container. // GetCollector() with the devices cgroup path for that container.
// GetCollector() is supposed to return an object that can update // GetCollector() is supposed to return an object that can update
// accelerator stats for that container. // accelerator stats for that container.
type AcceleratorManager interface { type Manager interface {
Setup() Setup()
Destroy() Destroy()
GetCollector(deviceCgroup string) (AcceleratorCollector, error) GetCollector(deviceCgroup string) (Collector, error)
} }
type AcceleratorCollector interface { // Collector can update ContainerStats by adding more metrics.
type Collector interface {
UpdateStats(*info.ContainerStats) error UpdateStats(*info.ContainerStats) error
} }