Moving Nvidia interfaces to stats package so that they can be used outside of the accelerators package
Signed-off-by: Maciej "Iwan" Iwanowski <maciej.iwanowski@intel.com>
This commit is contained in:
parent
9921cb3e21
commit
adf41ba206
@ -16,6 +16,7 @@ package accelerators
|
|||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"github.com/google/cadvisor/stats"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
@ -30,7 +31,7 @@ import (
|
|||||||
"k8s.io/klog"
|
"k8s.io/klog"
|
||||||
)
|
)
|
||||||
|
|
||||||
type NvidiaManager struct {
|
type nvidiaManager struct {
|
||||||
sync.Mutex
|
sync.Mutex
|
||||||
|
|
||||||
// true if there are NVIDIA devices present on the node
|
// true if there are NVIDIA devices present on the node
|
||||||
@ -47,8 +48,12 @@ var sysFsPCIDevicesPath = "/sys/bus/pci/devices/"
|
|||||||
|
|
||||||
const nvidiaVendorId = "0x10de"
|
const nvidiaVendorId = "0x10de"
|
||||||
|
|
||||||
|
func NewNvidiaManager() stats.Manager {
|
||||||
|
return &nvidiaManager{}
|
||||||
|
}
|
||||||
|
|
||||||
// Setup initializes NVML if nvidia devices are present on the node.
|
// Setup initializes NVML if nvidia devices are present on the node.
|
||||||
func (nm *NvidiaManager) Setup() {
|
func (nm *nvidiaManager) Setup() {
|
||||||
if !detectDevices(nvidiaVendorId) {
|
if !detectDevices(nvidiaVendorId) {
|
||||||
klog.V(4).Info("No NVIDIA devices found.")
|
klog.V(4).Info("No NVIDIA devices found.")
|
||||||
return
|
return
|
||||||
@ -84,7 +89,7 @@ func detectDevices(vendorId string) bool {
|
|||||||
|
|
||||||
// initializeNVML initializes the NVML library and sets up the nvmlDevices map.
|
// initializeNVML initializes the NVML library and sets up the nvmlDevices map.
|
||||||
// This is defined as a variable to help in testing.
|
// This is defined as a variable to help in testing.
|
||||||
var initializeNVML = func(nm *NvidiaManager) {
|
var initializeNVML = func(nm *nvidiaManager) {
|
||||||
if err := gonvml.Initialize(); err != nil {
|
if err := gonvml.Initialize(); err != nil {
|
||||||
// This is under a logging level because otherwise we may cause
|
// This is under a logging level because otherwise we may cause
|
||||||
// log spam if the drivers/nvml is not installed on the system.
|
// log spam if the drivers/nvml is not installed on the system.
|
||||||
@ -115,7 +120,7 @@ var initializeNVML = func(nm *NvidiaManager) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Destroy shuts down NVML.
|
// Destroy shuts down NVML.
|
||||||
func (nm *NvidiaManager) Destroy() {
|
func (nm *nvidiaManager) Destroy() {
|
||||||
if nm.nvmlInitialized {
|
if nm.nvmlInitialized {
|
||||||
gonvml.Shutdown()
|
gonvml.Shutdown()
|
||||||
}
|
}
|
||||||
@ -123,7 +128,7 @@ func (nm *NvidiaManager) Destroy() {
|
|||||||
|
|
||||||
// GetCollector returns a collector that can fetch nvidia gpu metrics for nvidia devices
|
// GetCollector returns a collector that can fetch nvidia gpu metrics for nvidia devices
|
||||||
// present in the devices.list file in the given devicesCgroupPath.
|
// present in the devices.list file in the given devicesCgroupPath.
|
||||||
func (nm *NvidiaManager) GetCollector(devicesCgroupPath string) (AcceleratorCollector, error) {
|
func (nm *nvidiaManager) GetCollector(devicesCgroupPath string) (stats.Collector, error) {
|
||||||
nc := &NvidiaCollector{}
|
nc := &NvidiaCollector{}
|
||||||
|
|
||||||
if !nm.devicesPresent {
|
if !nm.devicesPresent {
|
||||||
|
@ -72,13 +72,13 @@ func TestGetCollector(t *testing.T) {
|
|||||||
}
|
}
|
||||||
parseDevicesCgroup = mockParser
|
parseDevicesCgroup = mockParser
|
||||||
originalInitializeNVML := initializeNVML
|
originalInitializeNVML := initializeNVML
|
||||||
initializeNVML = func(_ *NvidiaManager) {}
|
initializeNVML = func(_ *nvidiaManager) {}
|
||||||
defer func() {
|
defer func() {
|
||||||
parseDevicesCgroup = originalParser
|
parseDevicesCgroup = originalParser
|
||||||
initializeNVML = originalInitializeNVML
|
initializeNVML = originalInitializeNVML
|
||||||
}()
|
}()
|
||||||
|
|
||||||
nm := &NvidiaManager{}
|
nm := &nvidiaManager{}
|
||||||
|
|
||||||
// When devicesPresent is false, empty collector should be returned.
|
// When devicesPresent is false, empty collector should be returned.
|
||||||
ac, err := nm.GetCollector("does-not-matter")
|
ac, err := nm.GetCollector("does-not-matter")
|
||||||
|
@ -17,6 +17,7 @@ package manager
|
|||||||
import (
|
import (
|
||||||
"flag"
|
"flag"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"github.com/google/cadvisor/stats"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"math"
|
"math"
|
||||||
"math/rand"
|
"math/rand"
|
||||||
@ -29,7 +30,6 @@ import (
|
|||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/google/cadvisor/accelerators"
|
|
||||||
"github.com/google/cadvisor/cache/memory"
|
"github.com/google/cadvisor/cache/memory"
|
||||||
"github.com/google/cadvisor/collector"
|
"github.com/google/cadvisor/collector"
|
||||||
"github.com/google/cadvisor/container"
|
"github.com/google/cadvisor/container"
|
||||||
@ -90,7 +90,7 @@ type containerData struct {
|
|||||||
collectorManager collector.CollectorManager
|
collectorManager collector.CollectorManager
|
||||||
|
|
||||||
// nvidiaCollector updates stats for Nvidia GPUs attached to the container.
|
// nvidiaCollector updates stats for Nvidia GPUs attached to the container.
|
||||||
nvidiaCollector accelerators.AcceleratorCollector
|
nvidiaCollector stats.Collector
|
||||||
}
|
}
|
||||||
|
|
||||||
// jitter returns a time.Duration between duration and duration + maxFactor * duration,
|
// jitter returns a time.Duration between duration and duration + maxFactor * duration,
|
||||||
|
@ -18,6 +18,7 @@ package manager
|
|||||||
import (
|
import (
|
||||||
"flag"
|
"flag"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"github.com/google/cadvisor/stats"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"path"
|
"path"
|
||||||
@ -181,7 +182,7 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, maxHousekeepingIn
|
|||||||
containerWatchers: []watcher.ContainerWatcher{},
|
containerWatchers: []watcher.ContainerWatcher{},
|
||||||
eventsChannel: eventsChannel,
|
eventsChannel: eventsChannel,
|
||||||
collectorHttpClient: collectorHttpClient,
|
collectorHttpClient: collectorHttpClient,
|
||||||
nvidiaManager: &accelerators.NvidiaManager{},
|
nvidiaManager: accelerators.NewNvidiaManager(),
|
||||||
rawContainerCgroupPathPrefixWhiteList: rawContainerCgroupPathPrefixWhiteList,
|
rawContainerCgroupPathPrefixWhiteList: rawContainerCgroupPathPrefixWhiteList,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -230,7 +231,7 @@ type manager struct {
|
|||||||
containerWatchers []watcher.ContainerWatcher
|
containerWatchers []watcher.ContainerWatcher
|
||||||
eventsChannel chan watcher.ContainerEvent
|
eventsChannel chan watcher.ContainerEvent
|
||||||
collectorHttpClient *http.Client
|
collectorHttpClient *http.Client
|
||||||
nvidiaManager accelerators.AcceleratorManager
|
nvidiaManager stats.Manager
|
||||||
// List of raw container cgroup path prefix whitelist.
|
// List of raw container cgroup path prefix whitelist.
|
||||||
rawContainerCgroupPathPrefixWhiteList []string
|
rawContainerCgroupPathPrefixWhiteList []string
|
||||||
}
|
}
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
// Copyright 2017 Google Inc. All Rights Reserved.
|
// Copyright 2020 Google Inc. All Rights Reserved.
|
||||||
//
|
//
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
// you may not use this file except in compliance with the License.
|
// you may not use this file except in compliance with the License.
|
||||||
@ -11,22 +11,25 @@
|
|||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
// See the License for the specific language governing permissions and
|
// See the License for the specific language governing permissions and
|
||||||
// limitations under the License.
|
// limitations under the License.
|
||||||
package accelerators
|
|
||||||
|
// Package stats handles statistics that are fully controlled in cAdvisor.
|
||||||
|
package stats
|
||||||
|
|
||||||
import info "github.com/google/cadvisor/info/v1"
|
import info "github.com/google/cadvisor/info/v1"
|
||||||
|
|
||||||
// This is supposed to store global state about an accelerator metrics collector.
|
// This is supposed to store global state about a cAdvisor metrics collector.
|
||||||
// cadvisor manager will call Setup() when it starts and Destroy() when it stops.
|
// cAdvisor manager will call Setup() when it starts and Destroy() when it stops.
|
||||||
// For each container detected by the cadvisor manager, it will call
|
// For each container detected by the cAdvisor manager, it will call
|
||||||
// GetCollector() with the devices cgroup path for that container.
|
// GetCollector() with the devices cgroup path for that container.
|
||||||
// GetCollector() is supposed to return an object that can update
|
// GetCollector() is supposed to return an object that can update
|
||||||
// accelerator stats for that container.
|
// accelerator stats for that container.
|
||||||
type AcceleratorManager interface {
|
type Manager interface {
|
||||||
Setup()
|
Setup()
|
||||||
Destroy()
|
Destroy()
|
||||||
GetCollector(deviceCgroup string) (AcceleratorCollector, error)
|
GetCollector(deviceCgroup string) (Collector, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
type AcceleratorCollector interface {
|
// Collector can update ContainerStats by adding more metrics.
|
||||||
|
type Collector interface {
|
||||||
UpdateStats(*info.ContainerStats) error
|
UpdateStats(*info.ContainerStats) error
|
||||||
}
|
}
|
Loading…
Reference in New Issue
Block a user