diff --git a/info/v1/container.go b/info/v1/container.go index 3930a230..189ac424 100644 --- a/info/v1/container.go +++ b/info/v1/container.go @@ -520,6 +520,29 @@ type FsStats struct { WeightedIoTime uint64 `json:"weighted_io_time"` } +type AcceleratorStats struct { + // Make of the accelerator (nvidia, amd, google etc.) + Make string `json:"make"` + + // Model of the accelerator (tesla-p100, tesla-k80 etc.) + Model string `json:"model"` + + // ID of the accelerator. + ID string `json:"id"` + + // Total accelerator memory. + // unit: bytes + MemoryTotal uint64 `json:"memory_total"` + + // Total accelerator memory allocated. + // unit: bytes + MemoryUsed uint64 `json:"memory_used"` + + // Percent of time over the past sample period during which + // the accelerator was actively processing. + DutyCycle uint64 `json:"duty_cycle"` +} + type ContainerStats struct { // The time of this stat point. Timestamp time.Time `json:"timestamp"` @@ -534,6 +557,9 @@ type ContainerStats struct { // Task load stats TaskStats LoadStats `json:"task_stats,omitempty"` + // Metrics for Accelerators. Each Accelerator corresponds to one element in the array. + Accelerators []AcceleratorStats `json:"accelerators,omitempty"` + // Custom metrics from all collectors CustomMetrics map[string][]MetricVal `json:"custom_metrics,omitempty"` } diff --git a/info/v2/container.go b/info/v2/container.go index ce102ec8..792db22f 100644 --- a/info/v2/container.go +++ b/info/v2/container.go @@ -146,6 +146,8 @@ type ContainerStats struct { Filesystem *FilesystemStats `json:"filesystem,omitempty"` // Task load statistics Load *v1.LoadStats `json:"load_stats,omitempty"` + // Metrics for Accelerators. Each Accelerator corresponds to one element in the array. + Accelerators []v1.AcceleratorStats `json:"accelerators,omitempty"` // Custom Metrics CustomMetrics map[string][]v1.MetricVal `json:"custom_metrics,omitempty"` } diff --git a/info/v2/conversion.go b/info/v2/conversion.go index 1c0f91f8..b137d30e 100644 --- a/info/v2/conversion.go +++ b/info/v2/conversion.go @@ -142,6 +142,9 @@ func ContainerStatsFromV1(containerName string, spec *v1.ContainerSpec, stats [] if spec.HasCustomMetrics { stat.CustomMetrics = val.CustomMetrics } + if len(val.Accelerators) > 0 { + stat.Accelerators = val.Accelerators + } // TODO(rjnagal): Handle load stats. newStats = append(newStats, stat) } diff --git a/info/v2/conversion_test.go b/info/v2/conversion_test.go index 445fcb09..092f7e3c 100644 --- a/info/v2/conversion_test.go +++ b/info/v2/conversion_test.go @@ -176,6 +176,14 @@ func TestContainerStatsFromV1(t *testing.T) { Available: 300, InodesFree: 100, }}, + Accelerators: []v1.AcceleratorStats{{ + Make: "nvidia", + Model: "tesla-p100", + ID: "GPU-deadbeef-1234-5678-90ab-feedfacecafe", + MemoryTotal: 20304050607, + MemoryUsed: 2030405060, + DutyCycle: 12, + }}, } expectedV2Stats := ContainerStats{ Timestamp: timestamp, @@ -190,6 +198,7 @@ func TestContainerStatsFromV1(t *testing.T) { BaseUsageBytes: &v1Stats.Filesystem[0].BaseUsage, InodeUsage: &v1Stats.Filesystem[0].Inodes, }, + Accelerators: v1Stats.Accelerators, } v2Stats := ContainerStatsFromV1("test", &v1Spec, []*v1.ContainerStats{&v1Stats}) diff --git a/metrics/prometheus.go b/metrics/prometheus.go index 9af0c80c..f401e127 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -271,6 +271,51 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc) *PrometheusCo }, } }, + }, { + name: "container_accelerator_memory_total_bytes", + help: "Total accelerator memory.", + valueType: prometheus.GaugeValue, + extraLabels: []string{"make", "model", "acc_id"}, + getValues: func(s *info.ContainerStats) metricValues { + values := make(metricValues, 0, len(s.Accelerators)) + for _, value := range s.Accelerators { + values = append(values, metricValue{ + value: float64(value.MemoryTotal), + labels: []string{value.Make, value.Model, value.ID}, + }) + } + return values + }, + }, { + name: "container_accelerator_memory_used_bytes", + help: "Total accelerator memory allocated.", + valueType: prometheus.GaugeValue, + extraLabels: []string{"make", "model", "acc_id"}, + getValues: func(s *info.ContainerStats) metricValues { + values := make(metricValues, 0, len(s.Accelerators)) + for _, value := range s.Accelerators { + values = append(values, metricValue{ + value: float64(value.MemoryUsed), + labels: []string{value.Make, value.Model, value.ID}, + }) + } + return values + }, + }, { + name: "container_accelerator_duty_cycle", + help: "Percent of time over the past sample period during which the accelerator was actively processing.", + valueType: prometheus.GaugeValue, + extraLabels: []string{"make", "model", "acc_id"}, + getValues: func(s *info.ContainerStats) metricValues { + values := make(metricValues, 0, len(s.Accelerators)) + for _, value := range s.Accelerators { + values = append(values, metricValue{ + value: float64(value.DutyCycle), + labels: []string{value.Make, value.Model, value.ID}, + }) + } + return values + }, }, { name: "container_fs_inodes_free", help: "Number of available Inodes", diff --git a/metrics/prometheus_test.go b/metrics/prometheus_test.go index 508749b4..d803f17d 100644 --- a/metrics/prometheus_test.go +++ b/metrics/prometheus_test.go @@ -191,6 +191,24 @@ func (p testSubcontainersInfoProvider) SubcontainersInfo(string, *info.Container WeightedIoTime: 49, }, }, + Accelerators: []info.AcceleratorStats{ + { + Make: "nvidia", + Model: "tesla-p100", + ID: "GPU-deadbeef-1234-5678-90ab-feedfacecafe", + MemoryTotal: 20304050607, + MemoryUsed: 2030405060, + DutyCycle: 12, + }, + { + Make: "nvidia", + Model: "tesla-k80", + ID: "GPU-deadbeef-0123-4567-89ab-feedfacecafe", + MemoryTotal: 10203040506, + MemoryUsed: 1020304050, + DutyCycle: 6, + }, + }, TaskStats: info.LoadStats{ NrSleeping: 50, NrRunning: 51, diff --git a/metrics/testdata/prometheus_metrics b/metrics/testdata/prometheus_metrics index d93816d5..5b8d0ac9 100644 --- a/metrics/testdata/prometheus_metrics +++ b/metrics/testdata/prometheus_metrics @@ -1,6 +1,18 @@ # HELP cadvisor_version_info A metric with a constant '1' value labeled by kernel version, OS version, docker version, cadvisor version & cadvisor revision. # TYPE cadvisor_version_info gauge cadvisor_version_info{cadvisorRevision="abcdef",cadvisorVersion="0.16.0",dockerVersion="1.8.1",kernelVersion="4.1.6-200.fc22.x86_64",osVersion="Fedora 22 (Twenty Two)"} 1 +# HELP container_accelerator_duty_cycle Percent of time over the past sample period during which the accelerator was actively processing. +# TYPE container_accelerator_duty_cycle gauge +container_accelerator_duty_cycle{acc_id="GPU-deadbeef-0123-4567-89ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-k80",name="testcontaineralias",zone_name="hello"} 6 +container_accelerator_duty_cycle{acc_id="GPU-deadbeef-1234-5678-90ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-p100",name="testcontaineralias",zone_name="hello"} 12 +# HELP container_accelerator_memory_total_bytes Total accelerator memory. +# TYPE container_accelerator_memory_total_bytes gauge +container_accelerator_memory_total_bytes{acc_id="GPU-deadbeef-0123-4567-89ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-k80",name="testcontaineralias",zone_name="hello"} 1.0203040506e+10 +container_accelerator_memory_total_bytes{acc_id="GPU-deadbeef-1234-5678-90ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-p100",name="testcontaineralias",zone_name="hello"} 2.0304050607e+10 +# HELP container_accelerator_memory_used_bytes Total accelerator memory allocated. +# TYPE container_accelerator_memory_used_bytes gauge +container_accelerator_memory_used_bytes{acc_id="GPU-deadbeef-0123-4567-89ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-k80",name="testcontaineralias",zone_name="hello"} 1.02030405e+09 +container_accelerator_memory_used_bytes{acc_id="GPU-deadbeef-1234-5678-90ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-p100",name="testcontaineralias",zone_name="hello"} 2.03040506e+09 # HELP container_cpu_cfs_periods_total Number of elapsed enforcement period intervals. # TYPE container_cpu_cfs_periods_total counter container_cpu_cfs_periods_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 723