Add accelerator metrics to the API.
The structure is generic so that it can support most hardware accelerators, such as GPUs, TPUs, etc. Note that the Prometheus label for the accelerator ID is called `acc_id` so that it does not conflict with any other label that may be called `id`.
This commit is contained in:
parent
31694e6e1e
commit
126fb2232e
@ -520,6 +520,29 @@ type FsStats struct {
|
|||||||
WeightedIoTime uint64 `json:"weighted_io_time"`
|
WeightedIoTime uint64 `json:"weighted_io_time"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// AcceleratorStats holds the metrics reported for a single hardware
// accelerator. The structure is intentionally generic so that it can
// describe different kinds of accelerators (GPUs, TPUs, etc.).
type AcceleratorStats struct {
|
||||||
|
// Make is the manufacturer of the accelerator (nvidia, amd, google, etc.).
|
||||||
|
Make string `json:"make"`
|
||||||
|
|
||||||
|
// Model is the model of the accelerator (tesla-p100, tesla-k80, etc.).
|
||||||
|
Model string `json:"model"`
|
||||||
|
|
||||||
|
// ID is the identifier of the accelerator. It is serialized as "id" in JSON; the Prometheus collector exports it under the label "acc_id".
|
||||||
|
ID string `json:"id"`
|
||||||
|
|
||||||
|
// MemoryTotal is the total accelerator memory.
|
||||||
|
// unit: bytes
|
||||||
|
MemoryTotal uint64 `json:"memory_total"`
|
||||||
|
|
||||||
|
// MemoryUsed is the total accelerator memory allocated.
|
||||||
|
// unit: bytes
|
||||||
|
MemoryUsed uint64 `json:"memory_used"`
|
||||||
|
|
||||||
|
// DutyCycle is the percent of time over the past sample period during which
|
||||||
|
// the accelerator was actively processing.
|
||||||
|
DutyCycle uint64 `json:"duty_cycle"`
|
||||||
|
}
|
||||||
|
|
||||||
type ContainerStats struct {
|
type ContainerStats struct {
|
||||||
// The time of this stat point.
|
// The time of this stat point.
|
||||||
Timestamp time.Time `json:"timestamp"`
|
Timestamp time.Time `json:"timestamp"`
|
||||||
@ -534,6 +557,9 @@ type ContainerStats struct {
|
|||||||
// Task load stats
|
// Task load stats
|
||||||
TaskStats LoadStats `json:"task_stats,omitempty"`
|
TaskStats LoadStats `json:"task_stats,omitempty"`
|
||||||
|
|
||||||
|
// Metrics for Accelerators. Each Accelerator corresponds to one element in the array.
|
||||||
|
Accelerators []AcceleratorStats `json:"accelerators,omitempty"`
|
||||||
|
|
||||||
// Custom metrics from all collectors
|
// Custom metrics from all collectors
|
||||||
CustomMetrics map[string][]MetricVal `json:"custom_metrics,omitempty"`
|
CustomMetrics map[string][]MetricVal `json:"custom_metrics,omitempty"`
|
||||||
}
|
}
|
||||||
|
@ -146,6 +146,8 @@ type ContainerStats struct {
|
|||||||
Filesystem *FilesystemStats `json:"filesystem,omitempty"`
|
Filesystem *FilesystemStats `json:"filesystem,omitempty"`
|
||||||
// Task load statistics
|
// Task load statistics
|
||||||
Load *v1.LoadStats `json:"load_stats,omitempty"`
|
Load *v1.LoadStats `json:"load_stats,omitempty"`
|
||||||
|
// Metrics for Accelerators. Each Accelerator corresponds to one element in the array.
|
||||||
|
Accelerators []v1.AcceleratorStats `json:"accelerators,omitempty"`
|
||||||
// Custom Metrics
|
// Custom Metrics
|
||||||
CustomMetrics map[string][]v1.MetricVal `json:"custom_metrics,omitempty"`
|
CustomMetrics map[string][]v1.MetricVal `json:"custom_metrics,omitempty"`
|
||||||
}
|
}
|
||||||
|
@ -142,6 +142,9 @@ func ContainerStatsFromV1(containerName string, spec *v1.ContainerSpec, stats []
|
|||||||
if spec.HasCustomMetrics {
|
if spec.HasCustomMetrics {
|
||||||
stat.CustomMetrics = val.CustomMetrics
|
stat.CustomMetrics = val.CustomMetrics
|
||||||
}
|
}
|
||||||
|
if len(val.Accelerators) > 0 {
|
||||||
|
stat.Accelerators = val.Accelerators
|
||||||
|
}
|
||||||
// TODO(rjnagal): Handle load stats.
|
// TODO(rjnagal): Handle load stats.
|
||||||
newStats = append(newStats, stat)
|
newStats = append(newStats, stat)
|
||||||
}
|
}
|
||||||
|
@ -176,6 +176,14 @@ func TestContainerStatsFromV1(t *testing.T) {
|
|||||||
Available: 300,
|
Available: 300,
|
||||||
InodesFree: 100,
|
InodesFree: 100,
|
||||||
}},
|
}},
|
||||||
|
Accelerators: []v1.AcceleratorStats{{
|
||||||
|
Make: "nvidia",
|
||||||
|
Model: "tesla-p100",
|
||||||
|
ID: "GPU-deadbeef-1234-5678-90ab-feedfacecafe",
|
||||||
|
MemoryTotal: 20304050607,
|
||||||
|
MemoryUsed: 2030405060,
|
||||||
|
DutyCycle: 12,
|
||||||
|
}},
|
||||||
}
|
}
|
||||||
expectedV2Stats := ContainerStats{
|
expectedV2Stats := ContainerStats{
|
||||||
Timestamp: timestamp,
|
Timestamp: timestamp,
|
||||||
@ -190,6 +198,7 @@ func TestContainerStatsFromV1(t *testing.T) {
|
|||||||
BaseUsageBytes: &v1Stats.Filesystem[0].BaseUsage,
|
BaseUsageBytes: &v1Stats.Filesystem[0].BaseUsage,
|
||||||
InodeUsage: &v1Stats.Filesystem[0].Inodes,
|
InodeUsage: &v1Stats.Filesystem[0].Inodes,
|
||||||
},
|
},
|
||||||
|
Accelerators: v1Stats.Accelerators,
|
||||||
}
|
}
|
||||||
|
|
||||||
v2Stats := ContainerStatsFromV1("test", &v1Spec, []*v1.ContainerStats{&v1Stats})
|
v2Stats := ContainerStatsFromV1("test", &v1Spec, []*v1.ContainerStats{&v1Stats})
|
||||||
|
@ -271,6 +271,51 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc) *PrometheusCo
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
}, {
|
||||||
|
name: "container_accelerator_memory_total_bytes",
|
||||||
|
help: "Total accelerator memory.",
|
||||||
|
valueType: prometheus.GaugeValue,
|
||||||
|
extraLabels: []string{"make", "model", "acc_id"},
|
||||||
|
getValues: func(s *info.ContainerStats) metricValues {
|
||||||
|
values := make(metricValues, 0, len(s.Accelerators))
|
||||||
|
for _, value := range s.Accelerators {
|
||||||
|
values = append(values, metricValue{
|
||||||
|
value: float64(value.MemoryTotal),
|
||||||
|
labels: []string{value.Make, value.Model, value.ID},
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return values
|
||||||
|
},
|
||||||
|
}, {
|
||||||
|
name: "container_accelerator_memory_used_bytes",
|
||||||
|
help: "Total accelerator memory allocated.",
|
||||||
|
valueType: prometheus.GaugeValue,
|
||||||
|
extraLabels: []string{"make", "model", "acc_id"},
|
||||||
|
getValues: func(s *info.ContainerStats) metricValues {
|
||||||
|
values := make(metricValues, 0, len(s.Accelerators))
|
||||||
|
for _, value := range s.Accelerators {
|
||||||
|
values = append(values, metricValue{
|
||||||
|
value: float64(value.MemoryUsed),
|
||||||
|
labels: []string{value.Make, value.Model, value.ID},
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return values
|
||||||
|
},
|
||||||
|
}, {
|
||||||
|
name: "container_accelerator_duty_cycle",
|
||||||
|
help: "Percent of time over the past sample period during which the accelerator was actively processing.",
|
||||||
|
valueType: prometheus.GaugeValue,
|
||||||
|
extraLabels: []string{"make", "model", "acc_id"},
|
||||||
|
getValues: func(s *info.ContainerStats) metricValues {
|
||||||
|
values := make(metricValues, 0, len(s.Accelerators))
|
||||||
|
for _, value := range s.Accelerators {
|
||||||
|
values = append(values, metricValue{
|
||||||
|
value: float64(value.DutyCycle),
|
||||||
|
labels: []string{value.Make, value.Model, value.ID},
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return values
|
||||||
|
},
|
||||||
}, {
|
}, {
|
||||||
name: "container_fs_inodes_free",
|
name: "container_fs_inodes_free",
|
||||||
help: "Number of available Inodes",
|
help: "Number of available Inodes",
|
||||||
|
@ -191,6 +191,24 @@ func (p testSubcontainersInfoProvider) SubcontainersInfo(string, *info.Container
|
|||||||
WeightedIoTime: 49,
|
WeightedIoTime: 49,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
Accelerators: []info.AcceleratorStats{
|
||||||
|
{
|
||||||
|
Make: "nvidia",
|
||||||
|
Model: "tesla-p100",
|
||||||
|
ID: "GPU-deadbeef-1234-5678-90ab-feedfacecafe",
|
||||||
|
MemoryTotal: 20304050607,
|
||||||
|
MemoryUsed: 2030405060,
|
||||||
|
DutyCycle: 12,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Make: "nvidia",
|
||||||
|
Model: "tesla-k80",
|
||||||
|
ID: "GPU-deadbeef-0123-4567-89ab-feedfacecafe",
|
||||||
|
MemoryTotal: 10203040506,
|
||||||
|
MemoryUsed: 1020304050,
|
||||||
|
DutyCycle: 6,
|
||||||
|
},
|
||||||
|
},
|
||||||
TaskStats: info.LoadStats{
|
TaskStats: info.LoadStats{
|
||||||
NrSleeping: 50,
|
NrSleeping: 50,
|
||||||
NrRunning: 51,
|
NrRunning: 51,
|
||||||
|
12
metrics/testdata/prometheus_metrics
vendored
12
metrics/testdata/prometheus_metrics
vendored
@ -1,6 +1,18 @@
|
|||||||
# HELP cadvisor_version_info A metric with a constant '1' value labeled by kernel version, OS version, docker version, cadvisor version & cadvisor revision.
|
# HELP cadvisor_version_info A metric with a constant '1' value labeled by kernel version, OS version, docker version, cadvisor version & cadvisor revision.
|
||||||
# TYPE cadvisor_version_info gauge
|
# TYPE cadvisor_version_info gauge
|
||||||
cadvisor_version_info{cadvisorRevision="abcdef",cadvisorVersion="0.16.0",dockerVersion="1.8.1",kernelVersion="4.1.6-200.fc22.x86_64",osVersion="Fedora 22 (Twenty Two)"} 1
|
cadvisor_version_info{cadvisorRevision="abcdef",cadvisorVersion="0.16.0",dockerVersion="1.8.1",kernelVersion="4.1.6-200.fc22.x86_64",osVersion="Fedora 22 (Twenty Two)"} 1
|
||||||
|
# HELP container_accelerator_duty_cycle Percent of time over the past sample period during which the accelerator was actively processing.
|
||||||
|
# TYPE container_accelerator_duty_cycle gauge
|
||||||
|
container_accelerator_duty_cycle{acc_id="GPU-deadbeef-0123-4567-89ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-k80",name="testcontaineralias",zone_name="hello"} 6
|
||||||
|
container_accelerator_duty_cycle{acc_id="GPU-deadbeef-1234-5678-90ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-p100",name="testcontaineralias",zone_name="hello"} 12
|
||||||
|
# HELP container_accelerator_memory_total_bytes Total accelerator memory.
|
||||||
|
# TYPE container_accelerator_memory_total_bytes gauge
|
||||||
|
container_accelerator_memory_total_bytes{acc_id="GPU-deadbeef-0123-4567-89ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-k80",name="testcontaineralias",zone_name="hello"} 1.0203040506e+10
|
||||||
|
container_accelerator_memory_total_bytes{acc_id="GPU-deadbeef-1234-5678-90ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-p100",name="testcontaineralias",zone_name="hello"} 2.0304050607e+10
|
||||||
|
# HELP container_accelerator_memory_used_bytes Total accelerator memory allocated.
|
||||||
|
# TYPE container_accelerator_memory_used_bytes gauge
|
||||||
|
container_accelerator_memory_used_bytes{acc_id="GPU-deadbeef-0123-4567-89ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-k80",name="testcontaineralias",zone_name="hello"} 1.02030405e+09
|
||||||
|
container_accelerator_memory_used_bytes{acc_id="GPU-deadbeef-1234-5678-90ab-feedfacecafe",container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",make="nvidia",model="tesla-p100",name="testcontaineralias",zone_name="hello"} 2.03040506e+09
|
||||||
# HELP container_cpu_cfs_periods_total Number of elapsed enforcement period intervals.
|
# HELP container_cpu_cfs_periods_total Number of elapsed enforcement period intervals.
|
||||||
# TYPE container_cpu_cfs_periods_total counter
|
# TYPE container_cpu_cfs_periods_total counter
|
||||||
container_cpu_cfs_periods_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 723
|
container_cpu_cfs_periods_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 723
|
||||||
|
Loading…
Reference in New Issue
Block a user