Adding /proc/<pid>/schedstat (#1872)

Add scheduler metrics collected from /proc/<pid>/schedstat.
nielsole 2018-03-08 18:27:06 +01:00 committed by David Ashpole
parent b817801307
commit 08f0c2397c
7 changed files with 130 additions and 20 deletions
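
For context on what the new code reads: per the kernel's sched-stats.txt (linked from the new CpuSchedstat struct in this diff), /proc/<pid>/schedstat holds three space-separated counters: time spent on the CPU and time spent waiting on a runqueue, both in nanoseconds, plus the number of timeslices run on that CPU. The standalone sketch below is an illustration only (not part of the commit); it parses those fields for the current process, roughly the same per-PID parse that schedulerStatsFromProcs in this diff performs before aggregating across a container's PIDs.

// Standalone sketch: read /proc/self/schedstat on a Linux kernel that exposes it.
// Fields per sched-stats.txt: run time on CPU (ns), runqueue wait time (ns),
// and the number of timeslices run on this CPU.
package main

import (
    "bytes"
    "fmt"
    "io/ioutil"
    "strconv"
)

func main() {
    contents, err := ioutil.ReadFile("/proc/self/schedstat")
    if err != nil {
        panic(err)
    }
    fields := bytes.Fields(bytes.TrimSpace(contents))
    if len(fields) != 3 {
        panic(fmt.Sprintf("unexpected schedstat format: %q", contents))
    }
    var vals [3]uint64
    for i, field := range fields {
        vals[i], err = strconv.ParseUint(string(field), 10, 64)
        if err != nil {
            panic(err)
        }
    }
    fmt.Printf("run_time=%dns runqueue_time=%dns run_periods=%d\n", vals[0], vals[1], vals[2])
}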

View File

@@ -64,6 +64,7 @@ var (
     ignoreMetrics metricSetValue = metricSetValue{container.MetricSet{
         container.NetworkTcpUsageMetrics: struct{}{},
         container.NetworkUdpUsageMetrics: struct{}{},
+        container.ProcessSchedulerMetrics: struct{}{},
     }}
 
     // List of metrics that can be ignored.
@@ -73,6 +74,7 @@ var (
         container.NetworkTcpUsageMetrics: struct{}{},
         container.NetworkUdpUsageMetrics: struct{}{},
         container.PerCpuUsageMetrics: struct{}{},
+        container.ProcessSchedulerMetrics: struct{}{},
     }
 )

View File

@@ -41,16 +41,17 @@ type ContainerHandlerFactory interface {
 type MetricKind string
 
 const (
     CpuUsageMetrics MetricKind = "cpu"
+    ProcessSchedulerMetrics MetricKind = "sched"
     PerCpuUsageMetrics MetricKind = "percpu"
     MemoryUsageMetrics MetricKind = "memory"
     CpuLoadMetrics MetricKind = "cpuLoad"
     DiskIOMetrics MetricKind = "diskIO"
     DiskUsageMetrics MetricKind = "disk"
     NetworkUsageMetrics MetricKind = "network"
     NetworkTcpUsageMetrics MetricKind = "tcp"
     NetworkUdpUsageMetrics MetricKind = "udp"
     AppMetrics MetricKind = "app"
 )
 
 func (mk MetricKind) String() string {

View File

@@ -28,6 +28,7 @@ import (
     "github.com/google/cadvisor/container"
     info "github.com/google/cadvisor/info/v1"
 
+    "bytes"
     "github.com/golang/glog"
     "github.com/opencontainers/runc/libcontainer"
     "github.com/opencontainers/runc/libcontainer/cgroups"
@@ -39,18 +40,20 @@ import (
 import "C"
 
 type Handler struct {
     cgroupManager cgroups.Manager
     rootFs string
     pid int
     ignoreMetrics container.MetricSet
+    pidMetricsCache map[int]*info.CpuSchedstat
 }
 
 func NewHandler(cgroupManager cgroups.Manager, rootFs string, pid int, ignoreMetrics container.MetricSet) *Handler {
     return &Handler{
         cgroupManager: cgroupManager,
         rootFs: rootFs,
         pid: pid,
         ignoreMetrics: ignoreMetrics,
+        pidMetricsCache: make(map[int]*info.CpuSchedstat),
     }
 }
@@ -66,6 +69,18 @@ func (h *Handler) GetStats() (*info.ContainerStats, error) {
     withPerCPU := !h.ignoreMetrics.Has(container.PerCpuUsageMetrics)
     stats := newContainerStats(libcontainerStats, withPerCPU)
 
+    if !h.ignoreMetrics.Has(container.ProcessSchedulerMetrics) {
+        pids, err := h.cgroupManager.GetAllPids()
+        if err != nil {
+            glog.V(4).Infof("Could not get PIDs for container %d: %v", h.pid, err)
+        } else {
+            stats.Cpu.Schedstat, err = schedulerStatsFromProcs(h.rootFs, pids, h.pidMetricsCache)
+            if err != nil {
+                glog.V(4).Infof("Unable to get Process Scheduler Stats: %v", err)
+            }
+        }
+    }
+
     // If we know the pid then get network stats from /proc/<pid>/net/dev
     if h.pid == 0 {
         return stats, nil
@@ -117,6 +132,50 @@ func (h *Handler) GetStats() (*info.ContainerStats, error) {
     return stats, nil
 }
 
+func schedulerStatsFromProcs(rootFs string, pids []int, pidMetricsCache map[int]*info.CpuSchedstat) (info.CpuSchedstat, error) {
+    for _, pid := range pids {
+        f, err := os.Open(path.Join(rootFs, "proc", strconv.Itoa(pid), "schedstat"))
+        if err != nil {
+            return info.CpuSchedstat{}, fmt.Errorf("couldn't open scheduler statistics for process %d: %v", pid, err)
+        }
+        defer f.Close()
+        contents, err := ioutil.ReadAll(f)
+        if err != nil {
+            return info.CpuSchedstat{}, fmt.Errorf("couldn't read scheduler statistics for process %d: %v", pid, err)
+        }
+        rawMetrics := bytes.Split(bytes.TrimRight(contents, "\n"), []byte(" "))
+        if len(rawMetrics) != 3 {
+            return info.CpuSchedstat{}, fmt.Errorf("unexpected number of metrics in schedstat file for process %d", pid)
+        }
+        cacheEntry, ok := pidMetricsCache[pid]
+        if !ok {
+            cacheEntry = &info.CpuSchedstat{}
+            pidMetricsCache[pid] = cacheEntry
+        }
+        for i, rawMetric := range rawMetrics {
+            metric, err := strconv.ParseUint(string(rawMetric), 10, 64)
+            if err != nil {
+                return info.CpuSchedstat{}, fmt.Errorf("parsing error while reading scheduler statistics for process: %d: %v", pid, err)
+            }
+            switch i {
+            case 0:
+                cacheEntry.RunTime = metric
+            case 1:
+                cacheEntry.RunqueueTime = metric
+            case 2:
+                cacheEntry.RunPeriods = metric
+            }
+        }
+    }
+    schedstats := info.CpuSchedstat{}
+    for _, v := range pidMetricsCache {
+        schedstats.RunPeriods += v.RunPeriods
+        schedstats.RunqueueTime += v.RunqueueTime
+        schedstats.RunTime += v.RunTime
+    }
+    return schedstats, nil
+}
+
 func networkStatsFromProc(rootFs string, pid int) ([]info.InterfaceStats, error) {
     netStatsFile := path.Join(rootFs, "proc", strconv.Itoa(pid), "/net/dev")
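
A design note on schedulerStatsFromProcs above: it keeps a per-PID cache (pidMetricsCache) and sums over every entry the cache has ever held, which presumably is what keeps the container-level counters monotonic when individual processes exit between scrapes; as the diff stands, nothing removes entries for exited PIDs, so their last observed values keep contributing to the totals. The toy sketch below (hypothetical code, not part of the commit; the struct, PIDs, and values are made up) shows that behaviour in isolation.

// Toy sketch: summing over a per-PID cache keeps the aggregate from dropping
// when a process exits between scrapes.
package main

import "fmt"

type schedstat struct{ runTime, runqueueTime, runPeriods uint64 }

func total(cache map[int]*schedstat) schedstat {
    var t schedstat
    for _, s := range cache {
        t.runTime += s.runTime
        t.runqueueTime += s.runqueueTime
        t.runPeriods += s.runPeriods
    }
    return t
}

func main() {
    cache := map[int]*schedstat{}

    // First scrape: two processes are alive in the cgroup.
    cache[101] = &schedstat{runTime: 100, runqueueTime: 10, runPeriods: 5}
    cache[102] = &schedstat{runTime: 200, runqueueTime: 20, runPeriods: 7}
    fmt.Println(total(cache)) // {300 30 12}

    // Second scrape: PID 102 has exited, so only PID 101 is refreshed.
    // Its last observed values stay in the cache, so the sum cannot decrease.
    cache[101] = &schedstat{runTime: 150, runqueueTime: 12, runPeriods: 6}
    fmt.Println(total(cache)) // {350 32 13}
}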

View File

@@ -293,10 +293,23 @@ type CpuCFS struct {
     ThrottledTime uint64 `json:"throttled_time"`
 }
 
+// Cpu Aggregated scheduler statistics
+type CpuSchedstat struct {
+    // https://www.kernel.org/doc/Documentation/scheduler/sched-stats.txt
+    // time spent on the cpu
+    RunTime uint64 `json:"run_time"`
+    // time spent waiting on a runqueue
+    RunqueueTime uint64 `json:"runqueue_time"`
+    // # of timeslices run on this cpu
+    RunPeriods uint64 `json:"run_periods"`
+}
+
 // All CPU usage metrics are cumulative from the creation of the container
 type CpuStats struct {
     Usage CpuUsage `json:"usage"`
     CFS CpuCFS `json:"cfs"`
+    Schedstat CpuSchedstat `json:"schedstat"`
     // Smoothed average of number of runnable threads x 1000.
     // We multiply by thousand to avoid using floats, but preserving precision.
     // Load is smoothed over the last 10 seconds. Instantaneous value can be read
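
A note on units, inferred from the Prometheus changes below rather than stated in the struct comments above: RunTime and RunqueueTime arrive from the kernel in nanoseconds, and the collector divides by time.Second to export seconds. A minimal sketch of that conversion, using the RunTime value that the test fixture later in this diff feeds through to 0.053643567 seconds:

// Sketch of the nanosecond-to-second conversion the collector applies.
package main

import (
    "fmt"
    "time"
)

func main() {
    runTimeNs := uint64(53643567) // RunTime value from the test fixture in this diff
    fmt.Println(float64(runTimeNs) / float64(time.Second)) // prints 0.053643567
}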

View File

@@ -197,6 +197,27 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc) *PrometheusCollector {
         getValues: func(s *info.ContainerStats) metricValues {
             return metricValues{{value: float64(s.Cpu.CFS.ThrottledTime) / float64(time.Second)}}
         },
+    }, {
+        name: "container_cpu_schedstat_run_seconds_total",
+        help: "Time duration the processes of the container have run on the CPU.",
+        valueType: prometheus.CounterValue,
+        getValues: func(s *info.ContainerStats) metricValues {
+            return metricValues{{value: float64(s.Cpu.Schedstat.RunTime) / float64(time.Second)}}
+        },
+    }, {
+        name: "container_cpu_schedstat_runqueue_seconds_total",
+        help: "Time duration processes of the container have been waiting on a runqueue.",
+        valueType: prometheus.CounterValue,
+        getValues: func(s *info.ContainerStats) metricValues {
+            return metricValues{{value: float64(s.Cpu.Schedstat.RunqueueTime) / float64(time.Second)}}
+        },
+    }, {
+        name: "container_cpu_schedstat_run_periods_total",
+        help: "Number of times processes of the cgroup have run on the cpu",
+        valueType: prometheus.CounterValue,
+        getValues: func(s *info.ContainerStats) metricValues {
+            return metricValues{{value: float64(s.Cpu.Schedstat.RunPeriods)}}
+        },
     }, {
         name: "container_cpu_load_average_10s",
         help: "Value of container cpu load average over the last 10 seconds.",

View File

@@ -90,6 +90,11 @@ func (p testSubcontainersInfoProvider) SubcontainersInfo(string, *info.Container
             ThrottledPeriods: 18,
             ThrottledTime: 1724314000,
         },
+        Schedstat: info.CpuSchedstat{
+            RunTime: 53643567,
+            RunqueueTime: 479424566378,
+            RunPeriods: 984285,
+        },
         LoadAverage: 2,
     },
     Memory: info.MemoryStats{

View File

@@ -25,6 +25,15 @@ container_cpu_cfs_throttled_seconds_total{container_env_foo_env="prod",container
 # HELP container_cpu_load_average_10s Value of container cpu load average over the last 10 seconds.
 # TYPE container_cpu_load_average_10s gauge
 container_cpu_load_average_10s{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 2
+# HELP container_cpu_schedstat_run_periods_total Number of times processes of the cgroup have run on the cpu
+# TYPE container_cpu_schedstat_run_periods_total counter
+container_cpu_schedstat_run_periods_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 984285
+# HELP container_cpu_schedstat_run_seconds_total Time duration the processes of the container have run on the CPU.
+# TYPE container_cpu_schedstat_run_seconds_total counter
+container_cpu_schedstat_run_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.053643567
+# HELP container_cpu_schedstat_runqueue_seconds_total Time duration processes of the container have been waiting on a runqueue.
+# TYPE container_cpu_schedstat_runqueue_seconds_total counter
+container_cpu_schedstat_runqueue_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 479.424566378
 # HELP container_cpu_system_seconds_total Cumulative system cpu time consumed in seconds.
 # TYPE container_cpu_system_seconds_total counter
 container_cpu_system_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 7e-09