Aggregate perf metrics
Add documentation about core perf events aggregation Signed-off-by: Katarzyna Kujawa <katarzyna.kujawa@intel.com>
This commit is contained in:
parent
888dc21343
commit
538a6d5c01
@ -140,6 +140,12 @@ cAdvisor stores the latest historical data in memory. How long of a history it s
|
||||
--perf_events_config="" Path to a JSON file containing configuration of perf events to measure. Empty value disables perf events measuring.
|
||||
```
|
||||
|
||||
Core perf events can be exposed on the Prometheus endpoint per CPU or aggregated by event. This is controlled through the `--disable_metrics` parameter with the `percpu` option, e.g.:
|
||||
- `--disable_metrics="percpu"` - core perf events are aggregated
|
||||
- `--disable_metrics=""` - core perf events are exposed per CPU.
|
||||
|
||||
The aggregated form of core perf events significantly decreases the volume of data. In the aggregated form, the scaling ratio (`container_perf_events_scaling_ratio`) reports the lowest scaling ratio observed for a given event across all CPUs, i.e. the worst measurement precision.
|
||||
|
||||
### Perf subsystem introduction
|
||||
|
||||
One of the goals of kernel perf subsystem is to instrument CPU performance counters that allow to profile applications.
|
||||
|
@ -1577,41 +1577,48 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
|
||||
}...)
|
||||
}
|
||||
if includedMetrics.Has(container.PerfMetrics) {
|
||||
if includedMetrics.Has(container.PerCpuUsageMetrics) {
|
||||
c.containerMetrics = append(c.containerMetrics, []containerMetric{
|
||||
{
|
||||
name: "container_perf_events_total",
|
||||
help: "Perf event metric.",
|
||||
valueType: prometheus.CounterValue,
|
||||
extraLabels: []string{"cpu", "event"},
|
||||
getValues: func(s *info.ContainerStats) metricValues {
|
||||
return getPerCPUCorePerfEvents(s)
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "container_perf_events_scaling_ratio",
|
||||
help: "Perf event metric scaling ratio.",
|
||||
valueType: prometheus.GaugeValue,
|
||||
extraLabels: []string{"cpu", "event"},
|
||||
getValues: func(s *info.ContainerStats) metricValues {
|
||||
return getPerCPUCoreScalingRatio(s)
|
||||
},
|
||||
}}...)
|
||||
} else {
|
||||
c.containerMetrics = append(c.containerMetrics, []containerMetric{
|
||||
{
|
||||
name: "container_perf_events_total",
|
||||
help: "Perf event metric.",
|
||||
valueType: prometheus.CounterValue,
|
||||
extraLabels: []string{"cpu", "event"},
|
||||
getValues: func(s *info.ContainerStats) metricValues {
|
||||
return getAggregatedCorePerfEvents(s)
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "container_perf_events_scaling_ratio",
|
||||
help: "Perf event metric scaling ratio.",
|
||||
valueType: prometheus.GaugeValue,
|
||||
extraLabels: []string{"cpu", "event"},
|
||||
getValues: func(s *info.ContainerStats) metricValues {
|
||||
return getMinCoreScalingRatio(s)
|
||||
},
|
||||
}}...)
|
||||
}
|
||||
c.containerMetrics = append(c.containerMetrics, []containerMetric{
|
||||
{
|
||||
name: "container_perf_events_total",
|
||||
help: "Perf event metric.",
|
||||
valueType: prometheus.CounterValue,
|
||||
extraLabels: []string{"cpu", "event"},
|
||||
getValues: func(s *info.ContainerStats) metricValues {
|
||||
values := make(metricValues, 0, len(s.PerfStats))
|
||||
for _, metric := range s.PerfStats {
|
||||
values = append(values, metricValue{
|
||||
value: float64(metric.Value),
|
||||
labels: []string{strconv.Itoa(metric.Cpu), metric.Name},
|
||||
timestamp: s.Timestamp,
|
||||
})
|
||||
}
|
||||
return values
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "container_perf_events_scaling_ratio",
|
||||
help: "Perf event metric scaling ratio.",
|
||||
valueType: prometheus.GaugeValue,
|
||||
extraLabels: []string{"cpu", "event"},
|
||||
getValues: func(s *info.ContainerStats) metricValues {
|
||||
values := make(metricValues, 0, len(s.PerfStats))
|
||||
for _, metric := range s.PerfStats {
|
||||
values = append(values, metricValue{
|
||||
value: metric.ScalingRatio,
|
||||
labels: []string{strconv.Itoa(metric.Cpu), metric.Name},
|
||||
timestamp: s.Timestamp,
|
||||
})
|
||||
}
|
||||
return values
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "container_perf_uncore_events_total",
|
||||
help: "Perf uncore event metric.",
|
||||
@ -1940,3 +1947,70 @@ func getNumaStatsPerNode(nodeStats map[uint8]uint64, labels []string, timestamp
|
||||
}
|
||||
return mValues
|
||||
}
|
||||
|
||||
func getPerCPUCorePerfEvents(s *info.ContainerStats) metricValues {
|
||||
values := make(metricValues, 0, len(s.PerfStats))
|
||||
for _, metric := range s.PerfStats {
|
||||
values = append(values, metricValue{
|
||||
value: float64(metric.Value),
|
||||
labels: []string{strconv.Itoa(metric.Cpu), metric.Name},
|
||||
timestamp: s.Timestamp,
|
||||
})
|
||||
}
|
||||
return values
|
||||
}
|
||||
|
||||
func getPerCPUCoreScalingRatio(s *info.ContainerStats) metricValues {
|
||||
values := make(metricValues, 0, len(s.PerfStats))
|
||||
for _, metric := range s.PerfStats {
|
||||
values = append(values, metricValue{
|
||||
value: metric.ScalingRatio,
|
||||
labels: []string{strconv.Itoa(metric.Cpu), metric.Name},
|
||||
timestamp: s.Timestamp,
|
||||
})
|
||||
}
|
||||
return values
|
||||
}
|
||||
|
||||
func getAggregatedCorePerfEvents(s *info.ContainerStats) metricValues {
|
||||
values := make(metricValues, 0)
|
||||
|
||||
perfEventStatAgg := make(map[string]uint64)
|
||||
// aggregate by event
|
||||
for _, perfStat := range s.PerfStats {
|
||||
perfEventStatAgg[perfStat.Name] += perfStat.Value
|
||||
}
|
||||
// create aggregated metrics
|
||||
for perfEvent, perfValue := range perfEventStatAgg {
|
||||
values = append(values, metricValue{
|
||||
value: float64(perfValue),
|
||||
labels: []string{"", perfEvent},
|
||||
timestamp: s.Timestamp,
|
||||
})
|
||||
}
|
||||
return values
|
||||
}
|
||||
|
||||
func getMinCoreScalingRatio(s *info.ContainerStats) metricValues {
|
||||
values := make(metricValues, 0)
|
||||
perfEventStatMin := make(map[string]float64)
|
||||
// search for minimal value of scalin ratio for specific event
|
||||
for _, perfStat := range s.PerfStats {
|
||||
if _, ok := perfEventStatMin[perfStat.Name]; !ok {
|
||||
// found a new event
|
||||
perfEventStatMin[perfStat.Name] = perfStat.ScalingRatio
|
||||
} else if perfStat.ScalingRatio < perfEventStatMin[perfStat.Name] {
|
||||
// found a lower value of scaling ration so replace the minimal value
|
||||
perfEventStatMin[perfStat.Name] = perfStat.ScalingRatio
|
||||
}
|
||||
}
|
||||
|
||||
for perfEvent, perfScalingRatio := range perfEventStatMin {
|
||||
values = append(values, metricValue{
|
||||
value: perfScalingRatio,
|
||||
labels: []string{"", perfEvent},
|
||||
timestamp: s.Timestamp,
|
||||
})
|
||||
}
|
||||
return values
|
||||
}
|
||||
|
@ -44,6 +44,21 @@ func TestPrometheusCollector(t *testing.T) {
|
||||
testPrometheusCollector(t, reg, "testdata/prometheus_metrics")
|
||||
}
|
||||
|
||||
func TestPrometheusCollectorWithPerfAggregated(t *testing.T) {
|
||||
metrics := container.MetricSet{
|
||||
container.PerfMetrics: struct{}{},
|
||||
}
|
||||
c := NewPrometheusCollector(testSubcontainersInfoProvider{}, func(container *info.ContainerInfo) map[string]string {
|
||||
s := DefaultContainerLabels(container)
|
||||
s["zone.name"] = "hello"
|
||||
return s
|
||||
}, metrics, now, v2.RequestOptions{})
|
||||
reg := prometheus.NewRegistry()
|
||||
reg.MustRegister(c)
|
||||
|
||||
testPrometheusCollector(t, reg, "testdata/prometheus_metrics_perf_aggregated")
|
||||
}
|
||||
|
||||
func testPrometheusCollector(t *testing.T, gatherer prometheus.Gatherer, metricsFile string) {
|
||||
wantMetrics, err := os.Open(metricsFile)
|
||||
if err != nil {
|
||||
@ -122,3 +137,185 @@ func (m *mockInfoProvider) GetMachineInfo() (*info.MachineInfo, error) {
|
||||
func mockLabelFunc(*info.ContainerInfo) map[string]string {
|
||||
return map[string]string{}
|
||||
}
|
||||
|
||||
func TestGetPerCpuCorePerfEvents(t *testing.T) {
|
||||
containerStats := &info.ContainerStats{
|
||||
Timestamp: time.Unix(1395066367, 0),
|
||||
PerfStats: []info.PerfStat{
|
||||
{
|
||||
PerfValue: info.PerfValue{
|
||||
ScalingRatio: 1.0,
|
||||
Value: 123,
|
||||
Name: "instructions",
|
||||
},
|
||||
Cpu: 0,
|
||||
},
|
||||
{
|
||||
PerfValue: info.PerfValue{
|
||||
ScalingRatio: 0.5,
|
||||
Value: 456,
|
||||
Name: "instructions",
|
||||
},
|
||||
Cpu: 1,
|
||||
},
|
||||
{
|
||||
PerfValue: info.PerfValue{
|
||||
ScalingRatio: 0.7,
|
||||
Value: 321,
|
||||
Name: "instructions_retired"},
|
||||
Cpu: 0,
|
||||
},
|
||||
{
|
||||
PerfValue: info.PerfValue{
|
||||
ScalingRatio: 0.3,
|
||||
Value: 789,
|
||||
Name: "instructions_retired"},
|
||||
Cpu: 1,
|
||||
},
|
||||
},
|
||||
}
|
||||
metricVals := getPerCPUCorePerfEvents(containerStats)
|
||||
assert.Equal(t, 4, len(metricVals))
|
||||
values := []float64{}
|
||||
for _, metric := range metricVals {
|
||||
values = append(values, metric.value)
|
||||
}
|
||||
assert.Contains(t, values, 123.0)
|
||||
assert.Contains(t, values, 456.0)
|
||||
assert.Contains(t, values, 321.0)
|
||||
assert.Contains(t, values, 789.0)
|
||||
}
|
||||
|
||||
func TestGetPerCpuCoreScalingRatio(t *testing.T) {
|
||||
containerStats := &info.ContainerStats{
|
||||
Timestamp: time.Unix(1395066367, 0),
|
||||
PerfStats: []info.PerfStat{
|
||||
{
|
||||
PerfValue: info.PerfValue{
|
||||
ScalingRatio: 1.0,
|
||||
Value: 123,
|
||||
Name: "instructions"},
|
||||
Cpu: 0,
|
||||
},
|
||||
{
|
||||
PerfValue: info.PerfValue{
|
||||
ScalingRatio: 0.5,
|
||||
Value: 456,
|
||||
Name: "instructions"},
|
||||
Cpu: 1,
|
||||
},
|
||||
{
|
||||
PerfValue: info.PerfValue{
|
||||
ScalingRatio: 0.7,
|
||||
Value: 321,
|
||||
Name: "instructions_retired"},
|
||||
Cpu: 0,
|
||||
},
|
||||
{
|
||||
PerfValue: info.PerfValue{
|
||||
ScalingRatio: 0.3,
|
||||
Value: 789,
|
||||
Name: "instructions_retired"},
|
||||
Cpu: 1,
|
||||
},
|
||||
},
|
||||
}
|
||||
metricVals := getPerCPUCoreScalingRatio(containerStats)
|
||||
assert.Equal(t, 4, len(metricVals))
|
||||
values := []float64{}
|
||||
for _, metric := range metricVals {
|
||||
values = append(values, metric.value)
|
||||
}
|
||||
assert.Contains(t, values, 1.0)
|
||||
assert.Contains(t, values, 0.5)
|
||||
assert.Contains(t, values, 0.7)
|
||||
assert.Contains(t, values, 0.3)
|
||||
}
|
||||
|
||||
func TestGetAggCorePerfEvents(t *testing.T) {
|
||||
containerStats := &info.ContainerStats{
|
||||
Timestamp: time.Unix(1395066367, 0),
|
||||
PerfStats: []info.PerfStat{
|
||||
{
|
||||
PerfValue: info.PerfValue{
|
||||
ScalingRatio: 1.0,
|
||||
Value: 123,
|
||||
Name: "instructions"},
|
||||
Cpu: 0,
|
||||
},
|
||||
{
|
||||
PerfValue: info.PerfValue{
|
||||
ScalingRatio: 0.5,
|
||||
Value: 456,
|
||||
Name: "instructions"},
|
||||
Cpu: 1,
|
||||
},
|
||||
{
|
||||
PerfValue: info.PerfValue{
|
||||
ScalingRatio: 0.7,
|
||||
Value: 321,
|
||||
Name: "instructions_retired"},
|
||||
Cpu: 0,
|
||||
},
|
||||
{
|
||||
PerfValue: info.PerfValue{
|
||||
ScalingRatio: 0.3,
|
||||
Value: 789,
|
||||
Name: "instructions_retired"},
|
||||
Cpu: 1,
|
||||
},
|
||||
},
|
||||
}
|
||||
metricVals := getAggregatedCorePerfEvents(containerStats)
|
||||
assert.Equal(t, 2, len(metricVals))
|
||||
values := []float64{}
|
||||
for _, metric := range metricVals {
|
||||
values = append(values, metric.value)
|
||||
}
|
||||
assert.Contains(t, values, 579.0)
|
||||
assert.Contains(t, values, 1110.0)
|
||||
}
|
||||
|
||||
func TestGetMinCoreScalingRatio(t *testing.T) {
|
||||
containerStats := &info.ContainerStats{
|
||||
Timestamp: time.Unix(1395066367, 0),
|
||||
PerfStats: []info.PerfStat{
|
||||
{
|
||||
PerfValue: info.PerfValue{
|
||||
ScalingRatio: 1.0,
|
||||
Value: 123,
|
||||
Name: "instructions"},
|
||||
Cpu: 0,
|
||||
},
|
||||
{
|
||||
PerfValue: info.PerfValue{
|
||||
ScalingRatio: 0.5,
|
||||
Value: 456,
|
||||
Name: "instructions"},
|
||||
Cpu: 1,
|
||||
},
|
||||
{
|
||||
PerfValue: info.PerfValue{
|
||||
ScalingRatio: 0.7,
|
||||
Value: 321,
|
||||
Name: "instructions_retired"},
|
||||
Cpu: 0,
|
||||
},
|
||||
{
|
||||
PerfValue: info.PerfValue{
|
||||
ScalingRatio: 0.3,
|
||||
Value: 789,
|
||||
Name: "instructions_retired"},
|
||||
Cpu: 1,
|
||||
},
|
||||
},
|
||||
}
|
||||
metricVals := getMinCoreScalingRatio(containerStats)
|
||||
assert.Equal(t, 2, len(metricVals))
|
||||
values := []float64{}
|
||||
for _, metric := range metricVals {
|
||||
values = append(values, metric.value)
|
||||
}
|
||||
assert.Contains(t, values, 0.5)
|
||||
assert.Contains(t, values, 0.3)
|
||||
}
|
||||
|
37
metrics/testdata/prometheus_metrics_perf_aggregated
vendored
Normal file
37
metrics/testdata/prometheus_metrics_perf_aggregated
vendored
Normal file
@ -0,0 +1,37 @@
|
||||
# HELP cadvisor_version_info A metric with a constant '1' value labeled by kernel version, OS version, docker version, cadvisor version & cadvisor revision.
|
||||
# TYPE cadvisor_version_info gauge
|
||||
cadvisor_version_info{cadvisorRevision="abcdef",cadvisorVersion="0.16.0",dockerVersion="1.8.1",kernelVersion="4.1.6-200.fc22.x86_64",osVersion="Fedora 22 (Twenty Two)"} 1
|
||||
# HELP container_last_seen Last time a container was seen by the exporter
|
||||
# TYPE container_last_seen gauge
|
||||
container_last_seen{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1.395066363e+09 1395066363000
|
||||
# HELP container_perf_events_scaling_ratio Perf event metric scaling ratio.
|
||||
# TYPE container_perf_events_scaling_ratio gauge
|
||||
container_perf_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",cpu="",event="instructions",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.5 1395066363000
|
||||
container_perf_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",cpu="",event="instructions_retired",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.33333333333 1395066363000
|
||||
# HELP container_perf_events_total Perf event metric.
|
||||
# TYPE container_perf_events_total counter
|
||||
container_perf_events_total{container_env_foo_env="prod",container_label_foo_label="bar",cpu="",event="instructions",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 579 1395066363000
|
||||
container_perf_events_total{container_env_foo_env="prod",container_label_foo_label="bar",cpu="",event="instructions_retired",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1110 1395066363000
|
||||
# HELP container_perf_uncore_events_scaling_ratio Perf uncore event metric scaling ratio.
|
||||
# TYPE container_perf_uncore_events_scaling_ratio gauge
|
||||
container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="0",zone_name="hello"} 1 1395066363000
|
||||
container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="1",zone_name="hello"} 1 1395066363000
|
||||
# HELP container_perf_uncore_events_total Perf uncore event metric.
|
||||
# TYPE container_perf_uncore_events_total counter
|
||||
container_perf_uncore_events_total{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="0",zone_name="hello"} 1.231231512e+09 1395066363000
|
||||
container_perf_uncore_events_total{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="1",zone_name="hello"} 1.111231331e+09 1395066363000
|
||||
# HELP container_scrape_error 1 if there was an error while getting container metrics, 0 otherwise
|
||||
# TYPE container_scrape_error gauge
|
||||
container_scrape_error 0
|
||||
# HELP container_spec_cpu_period CPU period of the container.
|
||||
# TYPE container_spec_cpu_period gauge
|
||||
container_spec_cpu_period{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 100000
|
||||
# HELP container_spec_cpu_quota CPU quota of the container.
|
||||
# TYPE container_spec_cpu_quota gauge
|
||||
container_spec_cpu_quota{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 10000
|
||||
# HELP container_spec_cpu_shares CPU share of the container.
|
||||
# TYPE container_spec_cpu_shares gauge
|
||||
container_spec_cpu_shares{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1000
|
||||
# HELP container_start_time_seconds Start time of the container since unix epoch in seconds.
|
||||
# TYPE container_start_time_seconds gauge
|
||||
container_start_time_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1.257894e+09
|
Loading…
Reference in New Issue
Block a user