From 1653733ea7f6357280aa7d34723c6095f2c9eb55 Mon Sep 17 00:00:00 2001 From: Tobias Schmidt Date: Tue, 19 Jul 2016 01:58:19 -0400 Subject: [PATCH] Expose cpu cgroup CFS prometheus metrics If CPU quota is configured (cpu.cfs_quota != -1) the CFS will provide stats about elapsed periods and throttling in cpu.stats. This change makes this information available as container_cpu_cfs_* metrics. --- container/libcontainer/helpers.go | 32 ++++++++++++++++------------- info/v1/container.go | 20 +++++++++++++++--- metrics/prometheus.go | 28 +++++++++++++++++++++++++ metrics/prometheus_test.go | 7 ++++++- metrics/testdata/prometheus_metrics | 11 +++++++++- 5 files changed, 79 insertions(+), 19 deletions(-) diff --git a/container/libcontainer/helpers.go b/container/libcontainer/helpers.go index 65c72ab5..77fa1869 100644 --- a/container/libcontainer/helpers.go +++ b/container/libcontainer/helpers.go @@ -89,7 +89,7 @@ func GetStats(cgroupManager cgroups.Manager, rootFs string, pid int, ignoreMetri libcontainerStats := &libcontainer.Stats{ CgroupStats: cgroupStats, } - stats := toContainerStats(libcontainerStats) + stats := newContainerStats(libcontainerStats) // If we know the pid then get network stats from /proc//net/dev if pid == 0 { @@ -350,7 +350,7 @@ func DiskStatsCopy(blkio_stats []cgroups.BlkioStatEntry) (stat []info.PerDiskSta } // Convert libcontainer stats to info.ContainerStats. 
-func toContainerStats0(s *cgroups.Stats, ret *info.ContainerStats) { +func setCpuStats(s *cgroups.Stats, ret *info.ContainerStats) { ret.Cpu.Usage.User = s.CpuStats.CpuUsage.UsageInUsermode ret.Cpu.Usage.System = s.CpuStats.CpuUsage.UsageInKernelmode n := len(s.CpuStats.CpuUsage.PercpuUsage) @@ -361,9 +361,13 @@ func toContainerStats0(s *cgroups.Stats, ret *info.ContainerStats) { ret.Cpu.Usage.PerCpu[i] = s.CpuStats.CpuUsage.PercpuUsage[i] ret.Cpu.Usage.Total += s.CpuStats.CpuUsage.PercpuUsage[i] } + + ret.Cpu.CFS.Periods = s.CpuStats.ThrottlingData.Periods + ret.Cpu.CFS.ThrottledPeriods = s.CpuStats.ThrottlingData.ThrottledPeriods + ret.Cpu.CFS.ThrottledTime = s.CpuStats.ThrottlingData.ThrottledTime } -func toContainerStats1(s *cgroups.Stats, ret *info.ContainerStats) { +func setDiskIoStats(s *cgroups.Stats, ret *info.ContainerStats) { ret.DiskIo.IoServiceBytes = DiskStatsCopy(s.BlkioStats.IoServiceBytesRecursive) ret.DiskIo.IoServiced = DiskStatsCopy(s.BlkioStats.IoServicedRecursive) ret.DiskIo.IoQueued = DiskStatsCopy(s.BlkioStats.IoQueuedRecursive) @@ -374,7 +378,7 @@ func toContainerStats1(s *cgroups.Stats, ret *info.ContainerStats) { ret.DiskIo.IoTime = DiskStatsCopy(s.BlkioStats.IoTimeRecursive) } -func toContainerStats2(s *cgroups.Stats, ret *info.ContainerStats) { +func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) { ret.Memory.Usage = s.MemoryStats.Usage.Usage ret.Memory.Failcnt = s.MemoryStats.Usage.Failcnt ret.Memory.Cache = s.MemoryStats.Stats["cache"] @@ -399,7 +403,7 @@ func toContainerStats2(s *cgroups.Stats, ret *info.ContainerStats) { ret.Memory.WorkingSet = workingSet } -func toContainerStats3(libcontainerStats *libcontainer.Stats, ret *info.ContainerStats) { +func setNetworkStats(libcontainerStats *libcontainer.Stats, ret *info.ContainerStats) { ret.Network.Interfaces = make([]info.InterfaceStats, len(libcontainerStats.Interfaces)) for i := range libcontainerStats.Interfaces { ret.Network.Interfaces[i] = info.InterfaceStats{ @@ 
-421,18 +425,18 @@ func toContainerStats3(libcontainerStats *libcontainer.Stats, ret *info.Containe } } -func toContainerStats(libcontainerStats *libcontainer.Stats) *info.ContainerStats { - s := libcontainerStats.CgroupStats - ret := new(info.ContainerStats) - ret.Timestamp = time.Now() +func newContainerStats(libcontainerStats *libcontainer.Stats) *info.ContainerStats { + ret := &info.ContainerStats{ + Timestamp: time.Now(), + } - if s != nil { - toContainerStats0(s, ret) - toContainerStats1(s, ret) - toContainerStats2(s, ret) + if s := libcontainerStats.CgroupStats; s != nil { + setCpuStats(s, ret) + setDiskIoStats(s, ret) + setMemoryStats(s, ret) } if len(libcontainerStats.Interfaces) > 0 { - toContainerStats3(libcontainerStats, ret) + setNetworkStats(libcontainerStats, ret) } return ret } diff --git a/info/v1/container.go b/info/v1/container.go index f29f5bdf..cfbdd03a 100644 --- a/info/v1/container.go +++ b/info/v1/container.go @@ -266,7 +266,7 @@ type LoadStats struct { // CPU usage time statistics. type CpuUsage struct { // Total CPU usage. - // Units: nanoseconds + // Unit: nanoseconds. Total uint64 `json:"total"` // Per CPU/core usage of the container. @@ -274,17 +274,31 @@ type CpuUsage struct { PerCpu []uint64 `json:"per_cpu_usage,omitempty"` // Time spent in user space. - // Unit: nanoseconds + // Unit: nanoseconds. User uint64 `json:"user"` // Time spent in kernel space. - // Unit: nanoseconds + // Unit: nanoseconds. System uint64 `json:"system"` } +// Cpu Completely Fair Scheduler statistics. +type CpuCFS struct { + // Total number of elapsed enforcement intervals. + Periods uint64 `json:"periods"` + + // Total number of times tasks in the cgroup have been throttled. + ThrottledPeriods uint64 `json:"throttled_periods"` + + // Total time duration for which tasks in the cgroup have been throttled. + // Unit: nanoseconds. 
+ ThrottledTime uint64 `json:"throttled_time"` +} + // All CPU usage metrics are cumulative from the creation of the container type CpuStats struct { Usage CpuUsage `json:"usage"` + CFS CpuCFS `json:"cfs"` // Smoothed average of number of runnable threads x 1000. // We multiply by thousand to avoid using floats, but preserving precision. // Load is smoothed over the last 10 seconds. Instantaneous value can be read diff --git a/metrics/prometheus.go b/metrics/prometheus.go index 2688f808..96879f1f 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -63,6 +63,7 @@ type containerMetric struct { help string valueType prometheus.ValueType extraLabels []string + condition func(s info.ContainerSpec) bool getValues func(s *info.ContainerStats) metricValues } @@ -127,6 +128,30 @@ func NewPrometheusCollector(infoProvider infoProvider, f ContainerNameToLabelsFu } return values }, + }, { + name: "container_cpu_cfs_periods_total", + help: "Number of elapsed enforcement period intervals.", + valueType: prometheus.CounterValue, + condition: func(s info.ContainerSpec) bool { return s.Cpu.Quota != 0 }, + getValues: func(s *info.ContainerStats) metricValues { + return metricValues{{value: float64(s.Cpu.CFS.Periods)}} + }, + }, { + name: "container_cpu_cfs_throttled_periods_total", + help: "Number of throttled period intervals.", + valueType: prometheus.CounterValue, + condition: func(s info.ContainerSpec) bool { return s.Cpu.Quota != 0 }, + getValues: func(s *info.ContainerStats) metricValues { + return metricValues{{value: float64(s.Cpu.CFS.ThrottledPeriods)}} + }, + }, { + name: "container_cpu_cfs_throttled_seconds_total", + help: "Total time duration the container has been throttled.", + valueType: prometheus.CounterValue, + condition: func(s info.ContainerSpec) bool { return s.Cpu.Quota != 0 }, + getValues: func(s *info.ContainerStats) metricValues { + return metricValues{{value: float64(s.Cpu.CFS.ThrottledTime) / float64(time.Second)}} + }, }, { name: 
"container_memory_cache", help: "Number of bytes of page cache memory.", @@ -568,6 +593,9 @@ func (c *PrometheusCollector) collectContainersInfo(ch chan<- prometheus.Metric) // Now for the actual metrics stats := container.Stats[0] for _, cm := range c.containerMetrics { + if cm.condition != nil && !cm.condition(container.Spec) { + continue + } desc := cm.desc(baseLabels) for _, metricValue := range cm.getValues(stats) { ch <- prometheus.MustNewConstMetric(desc, cm.valueType, float64(metricValue.value), append(baseLabelValues, metricValue.labels...)...) diff --git a/metrics/prometheus_test.go b/metrics/prometheus_test.go index 119f0d7b..e1e18e9b 100644 --- a/metrics/prometheus_test.go +++ b/metrics/prometheus_test.go @@ -60,7 +60,7 @@ func (p testSubcontainersInfoProvider) SubcontainersInfo(string, *info.Container HasCpu: true, Cpu: info.CpuSpec{ Limit: 1000, - Period: 10, + Period: 100000, Quota: 10000, }, CreationTime: time.Unix(1257894000, 0), @@ -80,6 +80,11 @@ func (p testSubcontainersInfoProvider) SubcontainersInfo(string, *info.Container User: 6, System: 7, }, + CFS: info.CpuCFS{ + Periods: 723, + ThrottledPeriods: 18, + ThrottledTime: 1724314000, + }, }, Memory: info.MemoryStats{ Usage: 8, diff --git a/metrics/testdata/prometheus_metrics b/metrics/testdata/prometheus_metrics index dc37aa8f..298bd6e6 100644 --- a/metrics/testdata/prometheus_metrics +++ b/metrics/testdata/prometheus_metrics @@ -1,6 +1,15 @@ # HELP cadvisor_version_info A metric with a constant '1' value labeled by kernel version, OS version, docker version, cadvisor version & cadvisor revision. # TYPE cadvisor_version_info gauge cadvisor_version_info{cadvisorRevision="abcdef",cadvisorVersion="0.16.0",dockerVersion="1.8.1",kernelVersion="4.1.6-200.fc22.x86_64",osVersion="Fedora 22 (Twenty Two)"} 1 +# HELP container_cpu_cfs_periods_total Number of elapsed enforcement period intervals. 
+# TYPE container_cpu_cfs_periods_total counter +container_cpu_cfs_periods_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 723 +# HELP container_cpu_cfs_throttled_periods_total Number of throttled period intervals. +# TYPE container_cpu_cfs_throttled_periods_total counter +container_cpu_cfs_throttled_periods_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 18 +# HELP container_cpu_cfs_throttled_seconds_total Total time duration the container has been throttled. +# TYPE container_cpu_cfs_throttled_seconds_total counter +container_cpu_cfs_throttled_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1.724314 # HELP container_cpu_system_seconds_total Cumulative system cpu time consumed in seconds. # TYPE container_cpu_system_seconds_total counter container_cpu_system_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 7e-09 @@ -118,7 +127,7 @@ container_network_transmit_packets_total{container_env_foo_env="prod",container_ container_scrape_error 0 # HELP container_spec_cpu_period CPU period of the container. # TYPE container_spec_cpu_period gauge -container_spec_cpu_period{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 10 +container_spec_cpu_period{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 100000 # HELP container_spec_cpu_quota CPU quota of the container. 
# TYPE container_spec_cpu_quota gauge container_spec_cpu_quota{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 10000