Expose cpu cgroup CFS prometheus metrics
If a CPU quota is configured (cpu.cfs_quota_us != -1), the CFS provides stats about elapsed periods and throttling in cpu.stat. This change makes this information available as container_cpu_cfs_* metrics.
parent f7a30e0b43
commit 1653733ea7
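
The throttling counters behind these metrics come from the cgroup's cpu.stat file, which the kernel populates once CFS bandwidth control is active. For orientation, a minimal standalone sketch of reading that file directly, assuming cgroup v1 with the cpu controller mounted at /sys/fs/cgroup/cpu; the path and the readCFSStats helper are illustrative, not part of this change:

package main

import (
	"fmt"
	"os"
	"strconv"
	"strings"
)

// cfsStats mirrors the three counters the kernel exposes in cpu.stat:
//   nr_periods     - elapsed enforcement intervals
//   nr_throttled   - intervals in which the group was throttled
//   throttled_time - total throttled time in nanoseconds
type cfsStats struct {
	Periods          uint64
	ThrottledPeriods uint64
	ThrottledTime    uint64
}

// readCFSStats is a hypothetical helper, not part of cAdvisor.
func readCFSStats(cgroupPath string) (cfsStats, error) {
	var s cfsStats
	data, err := os.ReadFile(cgroupPath + "/cpu.stat")
	if err != nil {
		return s, err
	}
	for _, line := range strings.Split(strings.TrimSpace(string(data)), "\n") {
		fields := strings.Fields(line)
		if len(fields) != 2 {
			continue
		}
		v, err := strconv.ParseUint(fields[1], 10, 64)
		if err != nil {
			continue
		}
		switch fields[0] {
		case "nr_periods":
			s.Periods = v
		case "nr_throttled":
			s.ThrottledPeriods = v
		case "throttled_time":
			s.ThrottledTime = v
		}
	}
	return s, nil
}

func main() {
	// Illustrative path; substitute a real container's cgroup directory.
	s, err := readCFSStats("/sys/fs/cgroup/cpu/docker/<container-id>")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}
	fmt.Printf("periods=%d throttled=%d throttled_ns=%d\n",
		s.Periods, s.ThrottledPeriods, s.ThrottledTime)
}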
@@ -89,7 +89,7 @@ func GetStats(cgroupManager cgroups.Manager, rootFs string, pid int, ignoreMetri
 	libcontainerStats := &libcontainer.Stats{
 		CgroupStats: cgroupStats,
 	}
-	stats := toContainerStats(libcontainerStats)
+	stats := newContainerStats(libcontainerStats)

 	// If we know the pid then get network stats from /proc/<pid>/net/dev
 	if pid == 0 {
@@ -350,7 +350,7 @@ func DiskStatsCopy(blkio_stats []cgroups.BlkioStatEntry) (stat []info.PerDiskSta
 }

 // Convert libcontainer stats to info.ContainerStats.
-func toContainerStats0(s *cgroups.Stats, ret *info.ContainerStats) {
+func setCpuStats(s *cgroups.Stats, ret *info.ContainerStats) {
 	ret.Cpu.Usage.User = s.CpuStats.CpuUsage.UsageInUsermode
 	ret.Cpu.Usage.System = s.CpuStats.CpuUsage.UsageInKernelmode
 	n := len(s.CpuStats.CpuUsage.PercpuUsage)
@@ -361,9 +361,13 @@ func toContainerStats0(s *cgroups.Stats, ret *info.ContainerStats) {
 		ret.Cpu.Usage.PerCpu[i] = s.CpuStats.CpuUsage.PercpuUsage[i]
 		ret.Cpu.Usage.Total += s.CpuStats.CpuUsage.PercpuUsage[i]
 	}
+
+	ret.Cpu.CFS.Periods = s.CpuStats.ThrottlingData.Periods
+	ret.Cpu.CFS.ThrottledPeriods = s.CpuStats.ThrottlingData.ThrottledPeriods
+	ret.Cpu.CFS.ThrottledTime = s.CpuStats.ThrottlingData.ThrottledTime
 }

-func toContainerStats1(s *cgroups.Stats, ret *info.ContainerStats) {
+func setDiskIoStats(s *cgroups.Stats, ret *info.ContainerStats) {
 	ret.DiskIo.IoServiceBytes = DiskStatsCopy(s.BlkioStats.IoServiceBytesRecursive)
 	ret.DiskIo.IoServiced = DiskStatsCopy(s.BlkioStats.IoServicedRecursive)
 	ret.DiskIo.IoQueued = DiskStatsCopy(s.BlkioStats.IoQueuedRecursive)
@@ -374,7 +378,7 @@ func toContainerStats1(s *cgroups.Stats, ret *info.ContainerStats) {
 	ret.DiskIo.IoTime = DiskStatsCopy(s.BlkioStats.IoTimeRecursive)
 }

-func toContainerStats2(s *cgroups.Stats, ret *info.ContainerStats) {
+func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) {
 	ret.Memory.Usage = s.MemoryStats.Usage.Usage
 	ret.Memory.Failcnt = s.MemoryStats.Usage.Failcnt
 	ret.Memory.Cache = s.MemoryStats.Stats["cache"]
@@ -399,7 +403,7 @@ func toContainerStats2(s *cgroups.Stats, ret *info.ContainerStats) {
 	ret.Memory.WorkingSet = workingSet
 }

-func toContainerStats3(libcontainerStats *libcontainer.Stats, ret *info.ContainerStats) {
+func setNetworkStats(libcontainerStats *libcontainer.Stats, ret *info.ContainerStats) {
 	ret.Network.Interfaces = make([]info.InterfaceStats, len(libcontainerStats.Interfaces))
 	for i := range libcontainerStats.Interfaces {
 		ret.Network.Interfaces[i] = info.InterfaceStats{
@@ -421,18 +425,18 @@ func toContainerStats3(libcontainerStats *libcontainer.Stats, ret *info.Containe
 	}
 }

-func toContainerStats(libcontainerStats *libcontainer.Stats) *info.ContainerStats {
-	s := libcontainerStats.CgroupStats
-	ret := new(info.ContainerStats)
-	ret.Timestamp = time.Now()
+func newContainerStats(libcontainerStats *libcontainer.Stats) *info.ContainerStats {
+	ret := &info.ContainerStats{
+		Timestamp: time.Now(),
+	}

-	if s != nil {
-		toContainerStats0(s, ret)
-		toContainerStats1(s, ret)
-		toContainerStats2(s, ret)
+	if s := libcontainerStats.CgroupStats; s != nil {
+		setCpuStats(s, ret)
+		setDiskIoStats(s, ret)
+		setMemoryStats(s, ret)
 	}
 	if len(libcontainerStats.Interfaces) > 0 {
-		toContainerStats3(libcontainerStats, ret)
+		setNetworkStats(libcontainerStats, ret)
 	}
 	return ret
 }
@@ -266,7 +266,7 @@ type LoadStats struct {
 // CPU usage time statistics.
 type CpuUsage struct {
 	// Total CPU usage.
-	// Units: nanoseconds
+	// Unit: nanoseconds.
 	Total uint64 `json:"total"`

 	// Per CPU/core usage of the container.
@@ -274,17 +274,31 @@ type CpuUsage struct {
 	PerCpu []uint64 `json:"per_cpu_usage,omitempty"`

 	// Time spent in user space.
-	// Unit: nanoseconds
+	// Unit: nanoseconds.
 	User uint64 `json:"user"`

 	// Time spent in kernel space.
-	// Unit: nanoseconds
+	// Unit: nanoseconds.
 	System uint64 `json:"system"`
 }

+// Cpu Completely Fair Scheduler statistics.
+type CpuCFS struct {
+	// Total number of elapsed enforcement intervals.
+	Periods uint64 `json:"periods"`
+
+	// Total number of times tasks in the cgroup have been throttled.
+	ThrottledPeriods uint64 `json:"throttled_periods"`
+
+	// Total time duration for which tasks in the cgroup have been throttled.
+	// Unit: nanoseconds.
+	ThrottledTime uint64 `json:"throttled_time"`
+}
+
 // All CPU usage metrics are cumulative from the creation of the container
 type CpuStats struct {
 	Usage CpuUsage `json:"usage"`
+	CFS   CpuCFS   `json:"cfs"`
 	// Smoothed average of number of runnable threads x 1000.
 	// We multiply by thousand to avoid using floats, but preserving precision.
 	// Load is smoothed over the last 10 seconds. Instantaneous value can be read
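
Since all three CpuCFS fields are cumulative counters, a consumer typically diffs two samples to derive a throttling ratio over an interval. A small sketch under that assumption; throttleRatio is a hypothetical helper, not part of this change:

// throttleRatio returns the fraction of enforcement periods in which the
// cgroup was throttled between two cumulative CpuCFS samples.
func throttleRatio(prev, cur CpuCFS) float64 {
	elapsed := cur.Periods - prev.Periods
	if elapsed == 0 {
		return 0 // no periods elapsed in the interval
	}
	return float64(cur.ThrottledPeriods-prev.ThrottledPeriods) / float64(elapsed)
}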
@@ -63,6 +63,7 @@ type containerMetric struct {
 	help        string
 	valueType   prometheus.ValueType
 	extraLabels []string
+	condition   func(s info.ContainerSpec) bool
 	getValues   func(s *info.ContainerStats) metricValues
 }

@@ -127,6 +128,30 @@ func NewPrometheusCollector(infoProvider infoProvider, f ContainerNameToLabelsFu
 				}
 				return values
 			},
 		}, {
+			name:      "container_cpu_cfs_periods_total",
+			help:      "Number of elapsed enforcement period intervals.",
+			valueType: prometheus.CounterValue,
+			condition: func(s info.ContainerSpec) bool { return s.Cpu.Quota != 0 },
+			getValues: func(s *info.ContainerStats) metricValues {
+				return metricValues{{value: float64(s.Cpu.CFS.Periods)}}
+			},
+		}, {
+			name:      "container_cpu_cfs_throttled_periods_total",
+			help:      "Number of throttled period intervals.",
+			valueType: prometheus.CounterValue,
+			condition: func(s info.ContainerSpec) bool { return s.Cpu.Quota != 0 },
+			getValues: func(s *info.ContainerStats) metricValues {
+				return metricValues{{value: float64(s.Cpu.CFS.ThrottledPeriods)}}
+			},
+		}, {
+			name:      "container_cpu_cfs_throttled_seconds_total",
+			help:      "Total time duration the container has been throttled.",
+			valueType: prometheus.CounterValue,
+			condition: func(s info.ContainerSpec) bool { return s.Cpu.Quota != 0 },
+			getValues: func(s *info.ContainerStats) metricValues {
+				return metricValues{{value: float64(s.Cpu.CFS.ThrottledTime) / float64(time.Second)}}
+			},
+		}, {
 			name:      "container_memory_cache",
 			help:      "Number of bytes of page cache memory.",
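As a sanity check on the nanoseconds-to-seconds conversion used for container_cpu_cfs_throttled_seconds_total above (time.Second is 1e9 nanoseconds, so the kernel's throttled time is simply divided by 1e9), a standalone snippet reusing the value from the test fixture further down:

package main

import (
	"fmt"
	"time"
)

func main() {
	// The kernel reports throttled time in nanoseconds; dividing by
	// time.Second yields the Prometheus-conventional seconds.
	const throttledTimeNs = 1724314000 // value used in the test fixture
	fmt.Println(float64(throttledTimeNs) / float64(time.Second)) // 1.724314
}
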
@@ -568,6 +593,9 @@ func (c *PrometheusCollector) collectContainersInfo(ch chan<- prometheus.Metric)
 		// Now for the actual metrics
 		stats := container.Stats[0]
 		for _, cm := range c.containerMetrics {
+			if cm.condition != nil && !cm.condition(container.Spec) {
+				continue
+			}
 			desc := cm.desc(baseLabels)
 			for _, metricValue := range cm.getValues(stats) {
 				ch <- prometheus.MustNewConstMetric(desc, cm.valueType, float64(metricValue.value), append(baseLabelValues, metricValue.labels...)...)
@@ -60,7 +60,7 @@ func (p testSubcontainersInfoProvider) SubcontainersInfo(string, *info.Container
 			HasCpu: true,
 			Cpu: info.CpuSpec{
 				Limit: 1000,
-				Period: 10,
+				Period: 100000,
 				Quota: 10000,
 			},
 			CreationTime: time.Unix(1257894000, 0),
@@ -80,6 +80,11 @@ func (p testSubcontainersInfoProvider) SubcontainersInfo(string, *info.Container
 					User:   6,
 					System: 7,
 				},
+				CFS: info.CpuCFS{
+					Periods:          723,
+					ThrottledPeriods: 18,
+					ThrottledTime:    1724314000,
+				},
 			},
 			Memory: info.MemoryStats{
 				Usage: 8,
metrics/testdata/prometheus_metrics (vendored)
@@ -1,6 +1,15 @@
 # HELP cadvisor_version_info A metric with a constant '1' value labeled by kernel version, OS version, docker version, cadvisor version & cadvisor revision.
 # TYPE cadvisor_version_info gauge
 cadvisor_version_info{cadvisorRevision="abcdef",cadvisorVersion="0.16.0",dockerVersion="1.8.1",kernelVersion="4.1.6-200.fc22.x86_64",osVersion="Fedora 22 (Twenty Two)"} 1
+# HELP container_cpu_cfs_periods_total Number of elapsed enforcement period intervals.
+# TYPE container_cpu_cfs_periods_total counter
+container_cpu_cfs_periods_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 723
+# HELP container_cpu_cfs_throttled_periods_total Number of throttled period intervals.
+# TYPE container_cpu_cfs_throttled_periods_total counter
+container_cpu_cfs_throttled_periods_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 18
+# HELP container_cpu_cfs_throttled_seconds_total Total time duration the container has been throttled.
+# TYPE container_cpu_cfs_throttled_seconds_total counter
+container_cpu_cfs_throttled_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1.724314
 # HELP container_cpu_system_seconds_total Cumulative system cpu time consumed in seconds.
 # TYPE container_cpu_system_seconds_total counter
 container_cpu_system_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 7e-09
@@ -118,7 +127,7 @@ container_network_transmit_packets_total{container_env_foo_env="prod",container_
 container_scrape_error 0
 # HELP container_spec_cpu_period CPU period of the container.
 # TYPE container_spec_cpu_period gauge
-container_spec_cpu_period{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 10
+container_spec_cpu_period{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 100000
 # HELP container_spec_cpu_quota CPU quota of the container.
 # TYPE container_spec_cpu_quota gauge
 container_spec_cpu_quota{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 10000
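
With a quota configured, these series make throttling directly observable. For example, a PromQL expression such as rate(container_cpu_cfs_throttled_periods_total[5m]) / rate(container_cpu_cfs_periods_total[5m]) (an illustrative query, not part of this change) gives the fraction of recent enforcement periods in which a container was throttled.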