Add accelerator metrics to the API.
The structure is generic enough to support most hardware accelerators, such as GPUs and TPUs. Note that the Prometheus label for the accelerator ID is called acc_id, so that it doesn't conflict with some other label that may be called id.
This commit is contained in:
parent
31694e6e1e
commit
126fb2232e
@ -520,6 +520,29 @@ type FsStats struct {
|
||||
WeightedIoTime uint64 `json:"weighted_io_time"`
|
||||
}
|
||||
|
||||
// AcceleratorStats holds the metrics reported for a single hardware
// accelerator. The structure is intentionally generic so that it can
// describe most kinds of accelerators (GPUs, TPUs etc.).
type AcceleratorStats struct {
|
||||
// Make of the accelerator (nvidia, amd, google etc.).
|
||||
Make string `json:"make"`
|
||||
|
||||
// Model of the accelerator (tesla-p100, tesla-k80 etc.).
|
||||
Model string `json:"model"`
|
||||
|
||||
// ID of the accelerator. It is exported to Prometheus under the
// "acc_id" label rather than "id", so that it does not collide with
// another label named "id".
|
||||
ID string `json:"id"`
|
||||
|
||||
// Total accelerator memory.
|
||||
// unit: bytes
|
||||
MemoryTotal uint64 `json:"memory_total"`
|
||||
|
||||
// Total accelerator memory allocated.
|
||||
// unit: bytes
|
||||
MemoryUsed uint64 `json:"memory_used"`
|
||||
|
||||
// Percent of time over the past sample period during which
|
||||
// the accelerator was actively processing.
// NOTE(review): presumably a whole number in the range 0-100 — confirm
// against the collector that fills this field.
|
||||
DutyCycle uint64 `json:"duty_cycle"`
|
||||
}
|
||||
|
||||
type ContainerStats struct {
|
||||
// The time of this stat point.
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
@ -534,6 +557,9 @@ type ContainerStats struct {
|
||||
// Task load stats
|
||||
TaskStats LoadStats `json:"task_stats,omitempty"`
|
||||
|
||||
// Metrics for Accelerators. Each Accelerator corresponds to one element in the array.
|
||||
Accelerators []AcceleratorStats `json:"accelerators,omitempty"`
|
||||
|
||||
// Custom metrics from all collectors
|
||||
CustomMetrics map[string][]MetricVal `json:"custom_metrics,omitempty"`
|
||||
}
|
||||
|
@ -146,6 +146,8 @@ type ContainerStats struct {
|
||||
Filesystem *FilesystemStats `json:"filesystem,omitempty"`
|
||||
// Task load statistics
|
||||
Load *v1.LoadStats `json:"load_stats,omitempty"`
|
||||
// Metrics for Accelerators. Each Accelerator corresponds to one element in the array.
|
||||
Accelerators []v1.AcceleratorStats `json:"accelerators,omitempty"`
|
||||
// Custom Metrics
|
||||
CustomMetrics map[string][]v1.MetricVal `json:"custom_metrics,omitempty"`
|
||||
}
|
||||
|
@ -142,6 +142,9 @@ func ContainerStatsFromV1(containerName string, spec *v1.ContainerSpec, stats []
|
||||
if spec.HasCustomMetrics {
|
||||
stat.CustomMetrics = val.CustomMetrics
|
||||
}
|
||||
if len(val.Accelerators) > 0 {
|
||||
stat.Accelerators = val.Accelerators
|
||||
}
|
||||
// TODO(rjnagal): Handle load stats.
|
||||
newStats = append(newStats, stat)
|
||||
}
|
||||
|
@ -176,6 +176,14 @@ func TestContainerStatsFromV1(t *testing.T) {
|
||||
Available: 300,
|
||||
InodesFree: 100,
|
||||
}},
|
||||
Accelerators: []v1.AcceleratorStats{{
|
||||
Make: "nvidia",
|
||||
Model: "tesla-p100",
|
||||
ID: "GPU-deadbeef-1234-5678-90ab-feedfacecafe",
|
||||
MemoryTotal: 20304050607,
|
||||
MemoryUsed: 2030405060,
|
||||
DutyCycle: 12,
|
||||
}},
|
||||
}
|
||||
expectedV2Stats := ContainerStats{
|
||||
Timestamp: timestamp,
|
||||
@ -190,6 +198,7 @@ func TestContainerStatsFromV1(t *testing.T) {
|
||||
BaseUsageBytes: &v1Stats.Filesystem[0].BaseUsage,
|
||||
InodeUsage: &v1Stats.Filesystem[0].Inodes,
|
||||
},
|
||||
Accelerators: v1Stats.Accelerators,
|
||||
}
|
||||
|
||||
v2Stats := ContainerStatsFromV1("test", &v1Spec, []*v1.ContainerStats{&v1Stats})
|
||||
|
@ -271,6 +271,51 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc) *PrometheusCo
|
||||
},
|
||||
}
|
||||
},
|
||||
}, {
|
||||
name: "container_accelerator_memory_total_bytes",
|
||||
help: "Total accelerator memory.",
|
||||
valueType: prometheus.GaugeValue,
|
||||
extraLabels: []string{"make", "model", "acc_id"},
|
||||
getValues: func(s *info.ContainerStats) metricValues {
|
||||
values := make(metricValues, 0, len(s.Accelerators))
|
||||
for _, value := range s.Accelerators {
|
||||
values = append(values, metricValue{
|
||||
value: float64(value.MemoryTotal),
|
||||
labels: []string{value.Make, value.Model, value.ID},
|
||||
})
|
||||
}
|
||||
return values
|
||||
},
|
||||
}, {
|
||||
name: "container_accelerator_memory_used_bytes",
|
||||
help: "Total accelerator memory allocated.",
|
||||
valueType: prometheus.GaugeValue,
|
||||
extraLabels: []string{"make", "model", "acc_id"},
|
||||
getValues: func(s *info.ContainerStats) metricValues {
|
||||
values := make(metricValues, 0, len(s.Accelerators))
|
||||
for _, value := range s.Accelerators {
|
||||
values = append(values, metricValue{
|
||||
value: float64(value.MemoryUsed),
|
||||
labels: []string{value.Make, value.Model, value.ID},
|
||||
})
|
||||
}
|
||||
return values
|
||||
},
|
||||
}, {
|
||||
name: "container_accelerator_duty_cycle",
|
||||
help: "Percent of time over the past sample period during which the accelerator was actively processing.",
|
||||
valueType: prometheus.GaugeValue,
|
||||
extraLabels: []string{"make", "model", "acc_id"},
|
||||
getValues: func(s *info.ContainerStats) metricValues {
|
||||
values := make(metricValues, 0, len(s.Accelerators))
|
||||
for _, value := range s.Accelerators {
|
||||
values = append(values, metricValue{
|
||||
value: float64(value.DutyCycle),
|
||||
labels: []string{value.Make, value.Model, value.ID},
|
||||
})
|
||||
}
|
||||
return values
|
||||
},
|
||||
}, {
|
||||
name: "container_fs_inodes_free",
|
||||
help: "Number of available Inodes",
|
||||
|
@ -191,6 +191,24 @@ func (p testSubcontainersInfoProvider) SubcontainersInfo(string, *info.Container
|
||||
WeightedIoTime: 49,
|
||||
},
|
||||
},
|
||||
Accelerators: []info.AcceleratorStats{
|
||||
{
|
||||
Make: "nvidia",
|
||||
Model: "tesla-p100",
|
||||
ID: "GPU-deadbeef-1234-5678-90ab-feedfacecafe",
|
||||
MemoryTotal: 20304050607,
|
||||
MemoryUsed: 2030405060,
|
||||
DutyCycle: 12,
|
||||
},
|
||||
{
|
||||
Make: "nvidia",
|
||||
Model: "tesla-k80",
|
||||
ID: "GPU-deadbeef-0123-4567-89ab-feedfacecafe",
|
||||
MemoryTotal: 10203040506,
|
||||
MemoryUsed: 1020304050,
|
||||
DutyCycle: 6,
|
||||
},
|
||||
},
|
||||
TaskStats: info.LoadStats{
|
||||
NrSleeping: 50,
|
||||
NrRunning: 51,
|
||||
|
12
metrics/testdata/prometheus_metrics
vendored
12
metrics/testdata/prometheus_metrics
vendored
@ -1,6 +1,18 @@
|
||||
# HELP cadvisor_version_info A metric with a constant '1' value labeled by kernel version, OS version, docker version, cadvisor version & cadvisor revision.
|
||||
# TYPE cadvisor_version_info gauge
|
||||
cadvisor_version_info{cadvisorRevision="abcdef",cadvisorVersion="0.16.0",dockerVersion="1.8.1",kernelVersion="4.1.6-200.fc22.x86_64",osVersion="Fedora 22 (Twenty Two)"} 1
|
||||
# HELP container_accelerator_duty_cycle Percent of time over the past sample period during which the accelerator was actively processing.
|
||||
# TYPE container_accelerator_duty_cycle gauge
|
||||
container_accelerator_duty_cycle{acc_id="GPU-deadbeef-0123-4567-89ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-k80",name="testcontaineralias",zone_name="hello"} 6
|
||||
container_accelerator_duty_cycle{acc_id="GPU-deadbeef-1234-5678-90ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-p100",name="testcontaineralias",zone_name="hello"} 12
|
||||
# HELP container_accelerator_memory_total_bytes Total accelerator memory.
|
||||
# TYPE container_accelerator_memory_total_bytes gauge
|
||||
container_accelerator_memory_total_bytes{acc_id="GPU-deadbeef-0123-4567-89ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-k80",name="testcontaineralias",zone_name="hello"} 1.0203040506e+10
|
||||
container_accelerator_memory_total_bytes{acc_id="GPU-deadbeef-1234-5678-90ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-p100",name="testcontaineralias",zone_name="hello"} 2.0304050607e+10
|
||||
# HELP container_accelerator_memory_used_bytes Total accelerator memory allocated.
|
||||
# TYPE container_accelerator_memory_used_bytes gauge
|
||||
container_accelerator_memory_used_bytes{acc_id="GPU-deadbeef-0123-4567-89ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-k80",name="testcontaineralias",zone_name="hello"} 1.02030405e+09
|
||||
container_accelerator_memory_used_bytes{acc_id="GPU-deadbeef-1234-5678-90ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-p100",name="testcontaineralias",zone_name="hello"} 2.03040506e+09
|
||||
# HELP container_cpu_cfs_periods_total Number of elapsed enforcement period intervals.
|
||||
# TYPE container_cpu_cfs_periods_total counter
|
||||
container_cpu_cfs_periods_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 723
|
||||
|
Loading…
Reference in New Issue
Block a user