Add accelerator metrics to the API.

The structure is generic enough to support most hardware accelerators,
such as GPUs and TPUs.

Note that the prometheus label for id is called acc_id, so that it
doesn't conflict with some other label that may be called id.
This commit is contained in:
Rohit Agarwal 2017-10-19 11:25:49 -07:00
parent 31694e6e1e
commit 126fb2232e
7 changed files with 115 additions and 0 deletions

View File

@ -520,6 +520,29 @@ type FsStats struct {
WeightedIoTime uint64 `json:"weighted_io_time"`
}
// AcceleratorStats holds the identity and usage metrics of a single hardware
// accelerator. The fields are vendor-neutral (make/model/id plus memory and
// duty-cycle figures) and are serialized to JSON under the tag names below.
type AcceleratorStats struct {
// Make of the accelerator (nvidia, amd, google etc.)
Make string `json:"make"`
// Model of the accelerator (tesla-p100, tesla-k80 etc.)
Model string `json:"model"`
// ID of the accelerator.
// NOTE(review): presumably a vendor-assigned identifier that is unique per
// device (e.g. a GPU UUID); confirm against the code that populates it.
ID string `json:"id"`
// Total accelerator memory.
// unit: bytes
MemoryTotal uint64 `json:"memory_total"`
// Total accelerator memory allocated.
// unit: bytes
MemoryUsed uint64 `json:"memory_used"`
// Percent of time over the past sample period during which
// the accelerator was actively processing.
// NOTE(review): presumably an integer percentage in [0, 100]; the length of
// the sample period is not defined here. Confirm against the producer.
DutyCycle uint64 `json:"duty_cycle"`
}
type ContainerStats struct {
// The time of this stat point.
Timestamp time.Time `json:"timestamp"`
@ -534,6 +557,9 @@ type ContainerStats struct {
// Task load stats
TaskStats LoadStats `json:"task_stats,omitempty"`
// Metrics for Accelerators. Each Accelerator corresponds to one element in the array.
Accelerators []AcceleratorStats `json:"accelerators,omitempty"`
// Custom metrics from all collectors
CustomMetrics map[string][]MetricVal `json:"custom_metrics,omitempty"`
}

View File

@ -146,6 +146,8 @@ type ContainerStats struct {
Filesystem *FilesystemStats `json:"filesystem,omitempty"`
// Task load statistics
Load *v1.LoadStats `json:"load_stats,omitempty"`
// Metrics for Accelerators. Each Accelerator corresponds to one element in the array.
Accelerators []v1.AcceleratorStats `json:"accelerators,omitempty"`
// Custom Metrics
CustomMetrics map[string][]v1.MetricVal `json:"custom_metrics,omitempty"`
}

View File

@ -142,6 +142,9 @@ func ContainerStatsFromV1(containerName string, spec *v1.ContainerSpec, stats []
if spec.HasCustomMetrics {
stat.CustomMetrics = val.CustomMetrics
}
if len(val.Accelerators) > 0 {
stat.Accelerators = val.Accelerators
}
// TODO(rjnagal): Handle load stats.
newStats = append(newStats, stat)
}

View File

@ -176,6 +176,14 @@ func TestContainerStatsFromV1(t *testing.T) {
Available: 300,
InodesFree: 100,
}},
Accelerators: []v1.AcceleratorStats{{
Make: "nvidia",
Model: "tesla-p100",
ID: "GPU-deadbeef-1234-5678-90ab-feedfacecafe",
MemoryTotal: 20304050607,
MemoryUsed: 2030405060,
DutyCycle: 12,
}},
}
expectedV2Stats := ContainerStats{
Timestamp: timestamp,
@ -190,6 +198,7 @@ func TestContainerStatsFromV1(t *testing.T) {
BaseUsageBytes: &v1Stats.Filesystem[0].BaseUsage,
InodeUsage: &v1Stats.Filesystem[0].Inodes,
},
Accelerators: v1Stats.Accelerators,
}
v2Stats := ContainerStatsFromV1("test", &v1Spec, []*v1.ContainerStats{&v1Stats})

View File

@ -271,6 +271,51 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc) *PrometheusCo
},
}
},
}, {
name: "container_accelerator_memory_total_bytes",
help: "Total accelerator memory.",
valueType: prometheus.GaugeValue,
extraLabels: []string{"make", "model", "acc_id"},
getValues: func(s *info.ContainerStats) metricValues {
values := make(metricValues, 0, len(s.Accelerators))
for _, value := range s.Accelerators {
values = append(values, metricValue{
value: float64(value.MemoryTotal),
labels: []string{value.Make, value.Model, value.ID},
})
}
return values
},
}, {
name: "container_accelerator_memory_used_bytes",
help: "Total accelerator memory allocated.",
valueType: prometheus.GaugeValue,
extraLabels: []string{"make", "model", "acc_id"},
getValues: func(s *info.ContainerStats) metricValues {
values := make(metricValues, 0, len(s.Accelerators))
for _, value := range s.Accelerators {
values = append(values, metricValue{
value: float64(value.MemoryUsed),
labels: []string{value.Make, value.Model, value.ID},
})
}
return values
},
}, {
name: "container_accelerator_duty_cycle",
help: "Percent of time over the past sample period during which the accelerator was actively processing.",
valueType: prometheus.GaugeValue,
extraLabels: []string{"make", "model", "acc_id"},
getValues: func(s *info.ContainerStats) metricValues {
values := make(metricValues, 0, len(s.Accelerators))
for _, value := range s.Accelerators {
values = append(values, metricValue{
value: float64(value.DutyCycle),
labels: []string{value.Make, value.Model, value.ID},
})
}
return values
},
}, {
name: "container_fs_inodes_free",
help: "Number of available Inodes",

View File

@ -191,6 +191,24 @@ func (p testSubcontainersInfoProvider) SubcontainersInfo(string, *info.Container
WeightedIoTime: 49,
},
},
Accelerators: []info.AcceleratorStats{
{
Make: "nvidia",
Model: "tesla-p100",
ID: "GPU-deadbeef-1234-5678-90ab-feedfacecafe",
MemoryTotal: 20304050607,
MemoryUsed: 2030405060,
DutyCycle: 12,
},
{
Make: "nvidia",
Model: "tesla-k80",
ID: "GPU-deadbeef-0123-4567-89ab-feedfacecafe",
MemoryTotal: 10203040506,
MemoryUsed: 1020304050,
DutyCycle: 6,
},
},
TaskStats: info.LoadStats{
NrSleeping: 50,
NrRunning: 51,

View File

@ -1,6 +1,18 @@
# HELP cadvisor_version_info A metric with a constant '1' value labeled by kernel version, OS version, docker version, cadvisor version & cadvisor revision.
# TYPE cadvisor_version_info gauge
cadvisor_version_info{cadvisorRevision="abcdef",cadvisorVersion="0.16.0",dockerVersion="1.8.1",kernelVersion="4.1.6-200.fc22.x86_64",osVersion="Fedora 22 (Twenty Two)"} 1
# HELP container_accelerator_duty_cycle Percent of time over the past sample period during which the accelerator was actively processing.
# TYPE container_accelerator_duty_cycle gauge
container_accelerator_duty_cycle{acc_id="GPU-deadbeef-0123-4567-89ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-k80",name="testcontaineralias",zone_name="hello"} 6
container_accelerator_duty_cycle{acc_id="GPU-deadbeef-1234-5678-90ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-p100",name="testcontaineralias",zone_name="hello"} 12
# HELP container_accelerator_memory_total_bytes Total accelerator memory.
# TYPE container_accelerator_memory_total_bytes gauge
container_accelerator_memory_total_bytes{acc_id="GPU-deadbeef-0123-4567-89ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-k80",name="testcontaineralias",zone_name="hello"} 1.0203040506e+10
container_accelerator_memory_total_bytes{acc_id="GPU-deadbeef-1234-5678-90ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-p100",name="testcontaineralias",zone_name="hello"} 2.0304050607e+10
# HELP container_accelerator_memory_used_bytes Total accelerator memory allocated.
# TYPE container_accelerator_memory_used_bytes gauge
container_accelerator_memory_used_bytes{acc_id="GPU-deadbeef-0123-4567-89ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-k80",name="testcontaineralias",zone_name="hello"} 1.02030405e+09
container_accelerator_memory_used_bytes{acc_id="GPU-deadbeef-1234-5678-90ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-p100",name="testcontaineralias",zone_name="hello"} 2.03040506e+09
# HELP container_cpu_cfs_periods_total Number of elapsed enforcement period intervals.
# TYPE container_cpu_cfs_periods_total counter
container_cpu_cfs_periods_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 723