Adding /proc/<pid>/schedstat (#1872)
Add /proc/<pid>/schedstat metrics for scheduler metrics
This commit is contained in:
parent
b817801307
commit
08f0c2397c
@ -64,6 +64,7 @@ var (
|
||||
ignoreMetrics metricSetValue = metricSetValue{container.MetricSet{
|
||||
container.NetworkTcpUsageMetrics: struct{}{},
|
||||
container.NetworkUdpUsageMetrics: struct{}{},
|
||||
container.ProcessSchedulerMetrics: struct{}{},
|
||||
}}
|
||||
|
||||
// List of metrics that can be ignored.
|
||||
@ -73,6 +74,7 @@ var (
|
||||
container.NetworkTcpUsageMetrics: struct{}{},
|
||||
container.NetworkUdpUsageMetrics: struct{}{},
|
||||
container.PerCpuUsageMetrics: struct{}{},
|
||||
container.ProcessSchedulerMetrics: struct{}{},
|
||||
}
|
||||
)
|
||||
|
||||
|
@ -41,16 +41,17 @@ type ContainerHandlerFactory interface {
|
||||
type MetricKind string
|
||||
|
||||
const (
|
||||
CpuUsageMetrics MetricKind = "cpu"
|
||||
PerCpuUsageMetrics MetricKind = "percpu"
|
||||
MemoryUsageMetrics MetricKind = "memory"
|
||||
CpuLoadMetrics MetricKind = "cpuLoad"
|
||||
DiskIOMetrics MetricKind = "diskIO"
|
||||
DiskUsageMetrics MetricKind = "disk"
|
||||
NetworkUsageMetrics MetricKind = "network"
|
||||
NetworkTcpUsageMetrics MetricKind = "tcp"
|
||||
NetworkUdpUsageMetrics MetricKind = "udp"
|
||||
AppMetrics MetricKind = "app"
|
||||
CpuUsageMetrics MetricKind = "cpu"
|
||||
ProcessSchedulerMetrics MetricKind = "sched"
|
||||
PerCpuUsageMetrics MetricKind = "percpu"
|
||||
MemoryUsageMetrics MetricKind = "memory"
|
||||
CpuLoadMetrics MetricKind = "cpuLoad"
|
||||
DiskIOMetrics MetricKind = "diskIO"
|
||||
DiskUsageMetrics MetricKind = "disk"
|
||||
NetworkUsageMetrics MetricKind = "network"
|
||||
NetworkTcpUsageMetrics MetricKind = "tcp"
|
||||
NetworkUdpUsageMetrics MetricKind = "udp"
|
||||
AppMetrics MetricKind = "app"
|
||||
)
|
||||
|
||||
func (mk MetricKind) String() string {
|
||||
|
@ -28,6 +28,7 @@ import (
|
||||
"github.com/google/cadvisor/container"
|
||||
info "github.com/google/cadvisor/info/v1"
|
||||
|
||||
"bytes"
|
||||
"github.com/golang/glog"
|
||||
"github.com/opencontainers/runc/libcontainer"
|
||||
"github.com/opencontainers/runc/libcontainer/cgroups"
|
||||
@ -39,18 +40,20 @@ import (
|
||||
import "C"
|
||||
|
||||
type Handler struct {
|
||||
cgroupManager cgroups.Manager
|
||||
rootFs string
|
||||
pid int
|
||||
ignoreMetrics container.MetricSet
|
||||
cgroupManager cgroups.Manager
|
||||
rootFs string
|
||||
pid int
|
||||
ignoreMetrics container.MetricSet
|
||||
pidMetricsCache map[int]*info.CpuSchedstat
|
||||
}
|
||||
|
||||
func NewHandler(cgroupManager cgroups.Manager, rootFs string, pid int, ignoreMetrics container.MetricSet) *Handler {
|
||||
return &Handler{
|
||||
cgroupManager: cgroupManager,
|
||||
rootFs: rootFs,
|
||||
pid: pid,
|
||||
ignoreMetrics: ignoreMetrics,
|
||||
cgroupManager: cgroupManager,
|
||||
rootFs: rootFs,
|
||||
pid: pid,
|
||||
ignoreMetrics: ignoreMetrics,
|
||||
pidMetricsCache: make(map[int]*info.CpuSchedstat),
|
||||
}
|
||||
}
|
||||
|
||||
@ -66,6 +69,18 @@ func (h *Handler) GetStats() (*info.ContainerStats, error) {
|
||||
withPerCPU := !h.ignoreMetrics.Has(container.PerCpuUsageMetrics)
|
||||
stats := newContainerStats(libcontainerStats, withPerCPU)
|
||||
|
||||
if !h.ignoreMetrics.Has(container.ProcessSchedulerMetrics) {
|
||||
pids, err := h.cgroupManager.GetAllPids()
|
||||
if err != nil {
|
||||
glog.V(4).Infof("Could not get PIDs for container %d: %v", h.pid, err)
|
||||
} else {
|
||||
stats.Cpu.Schedstat, err = schedulerStatsFromProcs(h.rootFs, pids, h.pidMetricsCache)
|
||||
if err != nil {
|
||||
glog.V(4).Infof("Unable to get Process Scheduler Stats: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we know the pid then get network stats from /proc/<pid>/net/dev
|
||||
if h.pid == 0 {
|
||||
return stats, nil
|
||||
@ -117,6 +132,50 @@ func (h *Handler) GetStats() (*info.ContainerStats, error) {
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
func schedulerStatsFromProcs(rootFs string, pids []int, pidMetricsCache map[int]*info.CpuSchedstat) (info.CpuSchedstat, error) {
|
||||
for _, pid := range pids {
|
||||
f, err := os.Open(path.Join(rootFs, "proc", strconv.Itoa(pid), "schedstat"))
|
||||
if err != nil {
|
||||
return info.CpuSchedstat{}, fmt.Errorf("couldn't open scheduler statistics for process %d: %v", pid, err)
|
||||
}
|
||||
defer f.Close()
|
||||
contents, err := ioutil.ReadAll(f)
|
||||
if err != nil {
|
||||
return info.CpuSchedstat{}, fmt.Errorf("couldn't read scheduler statistics for process %d: %v", pid, err)
|
||||
}
|
||||
rawMetrics := bytes.Split(bytes.TrimRight(contents, "\n"), []byte(" "))
|
||||
if len(rawMetrics) != 3 {
|
||||
return info.CpuSchedstat{}, fmt.Errorf("unexpected number of metrics in schedstat file for process %d", pid)
|
||||
}
|
||||
cacheEntry, ok := pidMetricsCache[pid]
|
||||
if !ok {
|
||||
cacheEntry = &info.CpuSchedstat{}
|
||||
pidMetricsCache[pid] = cacheEntry
|
||||
}
|
||||
for i, rawMetric := range rawMetrics {
|
||||
metric, err := strconv.ParseUint(string(rawMetric), 10, 64)
|
||||
if err != nil {
|
||||
return info.CpuSchedstat{}, fmt.Errorf("parsing error while reading scheduler statistics for process: %d: %v", pid, err)
|
||||
}
|
||||
switch i {
|
||||
case 0:
|
||||
cacheEntry.RunTime = metric
|
||||
case 1:
|
||||
cacheEntry.RunqueueTime = metric
|
||||
case 2:
|
||||
cacheEntry.RunPeriods = metric
|
||||
}
|
||||
}
|
||||
}
|
||||
schedstats := info.CpuSchedstat{}
|
||||
for _, v := range pidMetricsCache {
|
||||
schedstats.RunPeriods += v.RunPeriods
|
||||
schedstats.RunqueueTime += v.RunqueueTime
|
||||
schedstats.RunTime += v.RunTime
|
||||
}
|
||||
return schedstats, nil
|
||||
}
|
||||
|
||||
func networkStatsFromProc(rootFs string, pid int) ([]info.InterfaceStats, error) {
|
||||
netStatsFile := path.Join(rootFs, "proc", strconv.Itoa(pid), "/net/dev")
|
||||
|
||||
|
@ -293,10 +293,23 @@ type CpuCFS struct {
|
||||
ThrottledTime uint64 `json:"throttled_time"`
|
||||
}
|
||||
|
||||
// CpuSchedstat holds scheduler statistics aggregated (summed) over the
// processes of a container, as read from /proc/<pid>/schedstat.
|
||||
type CpuSchedstat struct {
|
||||
// Field meanings follow the /proc/<pid>/schedstat section of
// https://www.kernel.org/doc/Documentation/scheduler/sched-stats.txt
|
||||
|
||||
// RunTime is the time spent on the CPU.
// NOTE(review): presumably nanoseconds, since the Prometheus exporter
// divides this value by time.Second; confirm against the kernel docs.
|
||||
RunTime uint64 `json:"run_time"`
|
||||
// RunqueueTime is the time spent waiting on a runqueue.
// NOTE(review): presumably the same unit as RunTime; confirm.
|
||||
RunqueueTime uint64 `json:"runqueue_time"`
|
||||
// RunPeriods is the number of timeslices run on the CPU.
|
||||
RunPeriods uint64 `json:"run_periods"`
|
||||
}
|
||||
|
||||
// All CPU usage metrics are cumulative from the creation of the container
|
||||
type CpuStats struct {
|
||||
Usage CpuUsage `json:"usage"`
|
||||
CFS CpuCFS `json:"cfs"`
|
||||
Usage CpuUsage `json:"usage"`
|
||||
CFS CpuCFS `json:"cfs"`
|
||||
Schedstat CpuSchedstat `json:"schedstat"`
|
||||
// Smoothed average of number of runnable threads x 1000.
|
||||
// We multiply by thousand to avoid using floats, but preserving precision.
|
||||
// Load is smoothed over the last 10 seconds. Instantaneous value can be read
|
||||
|
@ -197,6 +197,27 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc) *PrometheusCo
|
||||
getValues: func(s *info.ContainerStats) metricValues {
|
||||
return metricValues{{value: float64(s.Cpu.CFS.ThrottledTime) / float64(time.Second)}}
|
||||
},
|
||||
}, {
|
||||
name: "container_cpu_schedstat_run_seconds_total",
|
||||
help: "Time duration the processes of the container have run on the CPU.",
|
||||
valueType: prometheus.CounterValue,
|
||||
getValues: func(s *info.ContainerStats) metricValues {
|
||||
return metricValues{{value: float64(s.Cpu.Schedstat.RunTime) / float64(time.Second)}}
|
||||
},
|
||||
}, {
|
||||
name: "container_cpu_schedstat_runqueue_seconds_total",
|
||||
help: "Time duration processes of the container have been waiting on a runqueue.",
|
||||
valueType: prometheus.CounterValue,
|
||||
getValues: func(s *info.ContainerStats) metricValues {
|
||||
return metricValues{{value: float64(s.Cpu.Schedstat.RunqueueTime) / float64(time.Second)}}
|
||||
},
|
||||
}, {
|
||||
name: "container_cpu_schedstat_run_periods_total",
|
||||
help: "Number of times processes of the cgroup have run on the cpu",
|
||||
valueType: prometheus.CounterValue,
|
||||
getValues: func(s *info.ContainerStats) metricValues {
|
||||
return metricValues{{value: float64(s.Cpu.Schedstat.RunPeriods)}}
|
||||
},
|
||||
}, {
|
||||
name: "container_cpu_load_average_10s",
|
||||
help: "Value of container cpu load average over the last 10 seconds.",
|
||||
|
@ -90,6 +90,11 @@ func (p testSubcontainersInfoProvider) SubcontainersInfo(string, *info.Container
|
||||
ThrottledPeriods: 18,
|
||||
ThrottledTime: 1724314000,
|
||||
},
|
||||
Schedstat: info.CpuSchedstat{
|
||||
RunTime: 53643567,
|
||||
RunqueueTime: 479424566378,
|
||||
RunPeriods: 984285,
|
||||
},
|
||||
LoadAverage: 2,
|
||||
},
|
||||
Memory: info.MemoryStats{
|
||||
|
9
metrics/testdata/prometheus_metrics
vendored
9
metrics/testdata/prometheus_metrics
vendored
@ -25,6 +25,15 @@ container_cpu_cfs_throttled_seconds_total{container_env_foo_env="prod",container
|
||||
# HELP container_cpu_load_average_10s Value of container cpu load average over the last 10 seconds.
|
||||
# TYPE container_cpu_load_average_10s gauge
|
||||
container_cpu_load_average_10s{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 2
|
||||
# HELP container_cpu_schedstat_run_periods_total Number of times processes of the cgroup have run on the cpu
|
||||
# TYPE container_cpu_schedstat_run_periods_total counter
|
||||
container_cpu_schedstat_run_periods_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 984285
|
||||
# HELP container_cpu_schedstat_run_seconds_total Time duration the processes of the container have run on the CPU.
|
||||
# TYPE container_cpu_schedstat_run_seconds_total counter
|
||||
container_cpu_schedstat_run_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.053643567
|
||||
# HELP container_cpu_schedstat_runqueue_seconds_total Time duration processes of the container have been waiting on a runqueue.
|
||||
# TYPE container_cpu_schedstat_runqueue_seconds_total counter
|
||||
container_cpu_schedstat_runqueue_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 479.424566378
|
||||
# HELP container_cpu_system_seconds_total Cumulative system cpu time consumed in seconds.
|
||||
# TYPE container_cpu_system_seconds_total counter
|
||||
container_cpu_system_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 7e-09
|
||||
|
Loading…
Reference in New Issue
Block a user