Expose cpu cgroup CFS prometheus metrics

If a CPU quota is configured (cpu.cfs_quota_us != -1), the CFS reports
statistics about elapsed periods and throttling in cpu.stat. This change
makes this information available as container_cpu_cfs_* metrics.
This commit is contained in:
Tobias Schmidt 2016-07-19 01:58:19 -04:00
parent f7a30e0b43
commit 1653733ea7
5 changed files with 79 additions and 19 deletions

View File

@ -89,7 +89,7 @@ func GetStats(cgroupManager cgroups.Manager, rootFs string, pid int, ignoreMetri
libcontainerStats := &libcontainer.Stats{
CgroupStats: cgroupStats,
}
stats := toContainerStats(libcontainerStats)
stats := newContainerStats(libcontainerStats)
// If we know the pid then get network stats from /proc/<pid>/net/dev
if pid == 0 {
@ -350,7 +350,7 @@ func DiskStatsCopy(blkio_stats []cgroups.BlkioStatEntry) (stat []info.PerDiskSta
}
// Convert libcontainer stats to info.ContainerStats.
func toContainerStats0(s *cgroups.Stats, ret *info.ContainerStats) {
func setCpuStats(s *cgroups.Stats, ret *info.ContainerStats) {
ret.Cpu.Usage.User = s.CpuStats.CpuUsage.UsageInUsermode
ret.Cpu.Usage.System = s.CpuStats.CpuUsage.UsageInKernelmode
n := len(s.CpuStats.CpuUsage.PercpuUsage)
@ -361,9 +361,13 @@ func toContainerStats0(s *cgroups.Stats, ret *info.ContainerStats) {
ret.Cpu.Usage.PerCpu[i] = s.CpuStats.CpuUsage.PercpuUsage[i]
ret.Cpu.Usage.Total += s.CpuStats.CpuUsage.PercpuUsage[i]
}
ret.Cpu.CFS.Periods = s.CpuStats.ThrottlingData.Periods
ret.Cpu.CFS.ThrottledPeriods = s.CpuStats.ThrottlingData.ThrottledPeriods
ret.Cpu.CFS.ThrottledTime = s.CpuStats.ThrottlingData.ThrottledTime
}
func toContainerStats1(s *cgroups.Stats, ret *info.ContainerStats) {
func setDiskIoStats(s *cgroups.Stats, ret *info.ContainerStats) {
ret.DiskIo.IoServiceBytes = DiskStatsCopy(s.BlkioStats.IoServiceBytesRecursive)
ret.DiskIo.IoServiced = DiskStatsCopy(s.BlkioStats.IoServicedRecursive)
ret.DiskIo.IoQueued = DiskStatsCopy(s.BlkioStats.IoQueuedRecursive)
@ -374,7 +378,7 @@ func toContainerStats1(s *cgroups.Stats, ret *info.ContainerStats) {
ret.DiskIo.IoTime = DiskStatsCopy(s.BlkioStats.IoTimeRecursive)
}
func toContainerStats2(s *cgroups.Stats, ret *info.ContainerStats) {
func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) {
ret.Memory.Usage = s.MemoryStats.Usage.Usage
ret.Memory.Failcnt = s.MemoryStats.Usage.Failcnt
ret.Memory.Cache = s.MemoryStats.Stats["cache"]
@ -399,7 +403,7 @@ func toContainerStats2(s *cgroups.Stats, ret *info.ContainerStats) {
ret.Memory.WorkingSet = workingSet
}
func toContainerStats3(libcontainerStats *libcontainer.Stats, ret *info.ContainerStats) {
func setNetworkStats(libcontainerStats *libcontainer.Stats, ret *info.ContainerStats) {
ret.Network.Interfaces = make([]info.InterfaceStats, len(libcontainerStats.Interfaces))
for i := range libcontainerStats.Interfaces {
ret.Network.Interfaces[i] = info.InterfaceStats{
@ -421,18 +425,18 @@ func toContainerStats3(libcontainerStats *libcontainer.Stats, ret *info.Containe
}
}
func toContainerStats(libcontainerStats *libcontainer.Stats) *info.ContainerStats {
s := libcontainerStats.CgroupStats
ret := new(info.ContainerStats)
ret.Timestamp = time.Now()
func newContainerStats(libcontainerStats *libcontainer.Stats) *info.ContainerStats {
ret := &info.ContainerStats{
Timestamp: time.Now(),
}
if s != nil {
toContainerStats0(s, ret)
toContainerStats1(s, ret)
toContainerStats2(s, ret)
if s := libcontainerStats.CgroupStats; s != nil {
setCpuStats(s, ret)
setDiskIoStats(s, ret)
setMemoryStats(s, ret)
}
if len(libcontainerStats.Interfaces) > 0 {
toContainerStats3(libcontainerStats, ret)
setNetworkStats(libcontainerStats, ret)
}
return ret
}

View File

@ -266,7 +266,7 @@ type LoadStats struct {
// CPU usage time statistics.
type CpuUsage struct {
// Total CPU usage.
// Units: nanoseconds
// Unit: nanoseconds.
Total uint64 `json:"total"`
// Per CPU/core usage of the container.
@ -274,17 +274,31 @@ type CpuUsage struct {
PerCpu []uint64 `json:"per_cpu_usage,omitempty"`
// Time spent in user space.
// Unit: nanoseconds
// Unit: nanoseconds.
User uint64 `json:"user"`
// Time spent in kernel space.
// Unit: nanoseconds
// Unit: nanoseconds.
System uint64 `json:"system"`
}
// CpuCFS holds CPU Completely Fair Scheduler (CFS) statistics.
// The fields are populated from the cgroup CPU throttling data.
type CpuCFS struct {
// Total number of elapsed enforcement intervals.
Periods uint64 `json:"periods"`
// Total number of times tasks in the cgroup have been throttled.
ThrottledPeriods uint64 `json:"throttled_periods"`
// Total time duration for which tasks in the cgroup have been throttled.
// Unit: nanoseconds.
ThrottledTime uint64 `json:"throttled_time"`
}
// All CPU usage metrics are cumulative from the creation of the container
type CpuStats struct {
Usage CpuUsage `json:"usage"`
CFS CpuCFS `json:"cfs"`
// Smoothed average of number of runnable threads x 1000.
// We multiply by thousand to avoid using floats, but preserving precision.
// Load is smoothed over the last 10 seconds. Instantaneous value can be read

View File

@ -63,6 +63,7 @@ type containerMetric struct {
help string
valueType prometheus.ValueType
extraLabels []string
condition func(s info.ContainerSpec) bool
getValues func(s *info.ContainerStats) metricValues
}
@ -127,6 +128,30 @@ func NewPrometheusCollector(infoProvider infoProvider, f ContainerNameToLabelsFu
}
return values
},
}, {
name: "container_cpu_cfs_periods_total",
help: "Number of elapsed enforcement period intervals.",
valueType: prometheus.CounterValue,
condition: func(s info.ContainerSpec) bool { return s.Cpu.Quota != 0 },
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{{value: float64(s.Cpu.CFS.Periods)}}
},
}, {
name: "container_cpu_cfs_throttled_periods_total",
help: "Number of throttled period intervals.",
valueType: prometheus.CounterValue,
condition: func(s info.ContainerSpec) bool { return s.Cpu.Quota != 0 },
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{{value: float64(s.Cpu.CFS.ThrottledPeriods)}}
},
}, {
name: "container_cpu_cfs_throttled_seconds_total",
help: "Total time duration the container has been throttled.",
valueType: prometheus.CounterValue,
condition: func(s info.ContainerSpec) bool { return s.Cpu.Quota != 0 },
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{{value: float64(s.Cpu.CFS.ThrottledTime) / float64(time.Second)}}
},
}, {
name: "container_memory_cache",
help: "Number of bytes of page cache memory.",
@ -568,6 +593,9 @@ func (c *PrometheusCollector) collectContainersInfo(ch chan<- prometheus.Metric)
// Now for the actual metrics
stats := container.Stats[0]
for _, cm := range c.containerMetrics {
if cm.condition != nil && !cm.condition(container.Spec) {
continue
}
desc := cm.desc(baseLabels)
for _, metricValue := range cm.getValues(stats) {
ch <- prometheus.MustNewConstMetric(desc, cm.valueType, float64(metricValue.value), append(baseLabelValues, metricValue.labels...)...)

View File

@ -60,7 +60,7 @@ func (p testSubcontainersInfoProvider) SubcontainersInfo(string, *info.Container
HasCpu: true,
Cpu: info.CpuSpec{
Limit: 1000,
Period: 10,
Period: 100000,
Quota: 10000,
},
CreationTime: time.Unix(1257894000, 0),
@ -80,6 +80,11 @@ func (p testSubcontainersInfoProvider) SubcontainersInfo(string, *info.Container
User: 6,
System: 7,
},
CFS: info.CpuCFS{
Periods: 723,
ThrottledPeriods: 18,
ThrottledTime: 1724314000,
},
},
Memory: info.MemoryStats{
Usage: 8,

View File

@ -1,6 +1,15 @@
# HELP cadvisor_version_info A metric with a constant '1' value labeled by kernel version, OS version, docker version, cadvisor version & cadvisor revision.
# TYPE cadvisor_version_info gauge
cadvisor_version_info{cadvisorRevision="abcdef",cadvisorVersion="0.16.0",dockerVersion="1.8.1",kernelVersion="4.1.6-200.fc22.x86_64",osVersion="Fedora 22 (Twenty Two)"} 1
# HELP container_cpu_cfs_periods_total Number of elapsed enforcement period intervals.
# TYPE container_cpu_cfs_periods_total counter
container_cpu_cfs_periods_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 723
# HELP container_cpu_cfs_throttled_periods_total Number of throttled period intervals.
# TYPE container_cpu_cfs_throttled_periods_total counter
container_cpu_cfs_throttled_periods_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 18
# HELP container_cpu_cfs_throttled_seconds_total Total time duration the container has been throttled.
# TYPE container_cpu_cfs_throttled_seconds_total counter
container_cpu_cfs_throttled_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1.724314
# HELP container_cpu_system_seconds_total Cumulative system cpu time consumed in seconds.
# TYPE container_cpu_system_seconds_total counter
container_cpu_system_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 7e-09
@ -118,7 +127,7 @@ container_network_transmit_packets_total{container_env_foo_env="prod",container_
container_scrape_error 0
# HELP container_spec_cpu_period CPU period of the container.
# TYPE container_spec_cpu_period gauge
container_spec_cpu_period{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 10
container_spec_cpu_period{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 100000
# HELP container_spec_cpu_quota CPU quota of the container.
# TYPE container_spec_cpu_quota gauge
container_spec_cpu_quota{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 10000