Adding /proc/<pid>/schedstat (#1872)
Add container scheduler metrics read from /proc/<pid>/schedstat
This commit is contained in:
parent
b817801307
commit
08f0c2397c
@ -64,6 +64,7 @@ var (
|
|||||||
ignoreMetrics metricSetValue = metricSetValue{container.MetricSet{
|
ignoreMetrics metricSetValue = metricSetValue{container.MetricSet{
|
||||||
container.NetworkTcpUsageMetrics: struct{}{},
|
container.NetworkTcpUsageMetrics: struct{}{},
|
||||||
container.NetworkUdpUsageMetrics: struct{}{},
|
container.NetworkUdpUsageMetrics: struct{}{},
|
||||||
|
container.ProcessSchedulerMetrics: struct{}{},
|
||||||
}}
|
}}
|
||||||
|
|
||||||
// List of metrics that can be ignored.
|
// List of metrics that can be ignored.
|
||||||
@ -73,6 +74,7 @@ var (
|
|||||||
container.NetworkTcpUsageMetrics: struct{}{},
|
container.NetworkTcpUsageMetrics: struct{}{},
|
||||||
container.NetworkUdpUsageMetrics: struct{}{},
|
container.NetworkUdpUsageMetrics: struct{}{},
|
||||||
container.PerCpuUsageMetrics: struct{}{},
|
container.PerCpuUsageMetrics: struct{}{},
|
||||||
|
container.ProcessSchedulerMetrics: struct{}{},
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -41,16 +41,17 @@ type ContainerHandlerFactory interface {
|
|||||||
type MetricKind string
|
type MetricKind string
|
||||||
|
|
||||||
const (
|
const (
|
||||||
CpuUsageMetrics MetricKind = "cpu"
|
CpuUsageMetrics MetricKind = "cpu"
|
||||||
PerCpuUsageMetrics MetricKind = "percpu"
|
ProcessSchedulerMetrics MetricKind = "sched"
|
||||||
MemoryUsageMetrics MetricKind = "memory"
|
PerCpuUsageMetrics MetricKind = "percpu"
|
||||||
CpuLoadMetrics MetricKind = "cpuLoad"
|
MemoryUsageMetrics MetricKind = "memory"
|
||||||
DiskIOMetrics MetricKind = "diskIO"
|
CpuLoadMetrics MetricKind = "cpuLoad"
|
||||||
DiskUsageMetrics MetricKind = "disk"
|
DiskIOMetrics MetricKind = "diskIO"
|
||||||
NetworkUsageMetrics MetricKind = "network"
|
DiskUsageMetrics MetricKind = "disk"
|
||||||
NetworkTcpUsageMetrics MetricKind = "tcp"
|
NetworkUsageMetrics MetricKind = "network"
|
||||||
NetworkUdpUsageMetrics MetricKind = "udp"
|
NetworkTcpUsageMetrics MetricKind = "tcp"
|
||||||
AppMetrics MetricKind = "app"
|
NetworkUdpUsageMetrics MetricKind = "udp"
|
||||||
|
AppMetrics MetricKind = "app"
|
||||||
)
|
)
|
||||||
|
|
||||||
func (mk MetricKind) String() string {
|
func (mk MetricKind) String() string {
|
||||||
|
@ -28,6 +28,7 @@ import (
|
|||||||
"github.com/google/cadvisor/container"
|
"github.com/google/cadvisor/container"
|
||||||
info "github.com/google/cadvisor/info/v1"
|
info "github.com/google/cadvisor/info/v1"
|
||||||
|
|
||||||
|
"bytes"
|
||||||
"github.com/golang/glog"
|
"github.com/golang/glog"
|
||||||
"github.com/opencontainers/runc/libcontainer"
|
"github.com/opencontainers/runc/libcontainer"
|
||||||
"github.com/opencontainers/runc/libcontainer/cgroups"
|
"github.com/opencontainers/runc/libcontainer/cgroups"
|
||||||
@ -39,18 +40,20 @@ import (
|
|||||||
import "C"
|
import "C"
|
||||||
|
|
||||||
type Handler struct {
|
type Handler struct {
|
||||||
cgroupManager cgroups.Manager
|
cgroupManager cgroups.Manager
|
||||||
rootFs string
|
rootFs string
|
||||||
pid int
|
pid int
|
||||||
ignoreMetrics container.MetricSet
|
ignoreMetrics container.MetricSet
|
||||||
|
pidMetricsCache map[int]*info.CpuSchedstat
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewHandler(cgroupManager cgroups.Manager, rootFs string, pid int, ignoreMetrics container.MetricSet) *Handler {
|
func NewHandler(cgroupManager cgroups.Manager, rootFs string, pid int, ignoreMetrics container.MetricSet) *Handler {
|
||||||
return &Handler{
|
return &Handler{
|
||||||
cgroupManager: cgroupManager,
|
cgroupManager: cgroupManager,
|
||||||
rootFs: rootFs,
|
rootFs: rootFs,
|
||||||
pid: pid,
|
pid: pid,
|
||||||
ignoreMetrics: ignoreMetrics,
|
ignoreMetrics: ignoreMetrics,
|
||||||
|
pidMetricsCache: make(map[int]*info.CpuSchedstat),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -66,6 +69,18 @@ func (h *Handler) GetStats() (*info.ContainerStats, error) {
|
|||||||
withPerCPU := !h.ignoreMetrics.Has(container.PerCpuUsageMetrics)
|
withPerCPU := !h.ignoreMetrics.Has(container.PerCpuUsageMetrics)
|
||||||
stats := newContainerStats(libcontainerStats, withPerCPU)
|
stats := newContainerStats(libcontainerStats, withPerCPU)
|
||||||
|
|
||||||
|
if !h.ignoreMetrics.Has(container.ProcessSchedulerMetrics) {
|
||||||
|
pids, err := h.cgroupManager.GetAllPids()
|
||||||
|
if err != nil {
|
||||||
|
glog.V(4).Infof("Could not get PIDs for container %d: %v", h.pid, err)
|
||||||
|
} else {
|
||||||
|
stats.Cpu.Schedstat, err = schedulerStatsFromProcs(h.rootFs, pids, h.pidMetricsCache)
|
||||||
|
if err != nil {
|
||||||
|
glog.V(4).Infof("Unable to get Process Scheduler Stats: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// If we know the pid then get network stats from /proc/<pid>/net/dev
|
// If we know the pid then get network stats from /proc/<pid>/net/dev
|
||||||
if h.pid == 0 {
|
if h.pid == 0 {
|
||||||
return stats, nil
|
return stats, nil
|
||||||
@ -117,6 +132,50 @@ func (h *Handler) GetStats() (*info.ContainerStats, error) {
|
|||||||
return stats, nil
|
return stats, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func schedulerStatsFromProcs(rootFs string, pids []int, pidMetricsCache map[int]*info.CpuSchedstat) (info.CpuSchedstat, error) {
|
||||||
|
for _, pid := range pids {
|
||||||
|
f, err := os.Open(path.Join(rootFs, "proc", strconv.Itoa(pid), "schedstat"))
|
||||||
|
if err != nil {
|
||||||
|
return info.CpuSchedstat{}, fmt.Errorf("couldn't open scheduler statistics for process %d: %v", pid, err)
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
contents, err := ioutil.ReadAll(f)
|
||||||
|
if err != nil {
|
||||||
|
return info.CpuSchedstat{}, fmt.Errorf("couldn't read scheduler statistics for process %d: %v", pid, err)
|
||||||
|
}
|
||||||
|
rawMetrics := bytes.Split(bytes.TrimRight(contents, "\n"), []byte(" "))
|
||||||
|
if len(rawMetrics) != 3 {
|
||||||
|
return info.CpuSchedstat{}, fmt.Errorf("unexpected number of metrics in schedstat file for process %d", pid)
|
||||||
|
}
|
||||||
|
cacheEntry, ok := pidMetricsCache[pid]
|
||||||
|
if !ok {
|
||||||
|
cacheEntry = &info.CpuSchedstat{}
|
||||||
|
pidMetricsCache[pid] = cacheEntry
|
||||||
|
}
|
||||||
|
for i, rawMetric := range rawMetrics {
|
||||||
|
metric, err := strconv.ParseUint(string(rawMetric), 10, 64)
|
||||||
|
if err != nil {
|
||||||
|
return info.CpuSchedstat{}, fmt.Errorf("parsing error while reading scheduler statistics for process: %d: %v", pid, err)
|
||||||
|
}
|
||||||
|
switch i {
|
||||||
|
case 0:
|
||||||
|
cacheEntry.RunTime = metric
|
||||||
|
case 1:
|
||||||
|
cacheEntry.RunqueueTime = metric
|
||||||
|
case 2:
|
||||||
|
cacheEntry.RunPeriods = metric
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
schedstats := info.CpuSchedstat{}
|
||||||
|
for _, v := range pidMetricsCache {
|
||||||
|
schedstats.RunPeriods += v.RunPeriods
|
||||||
|
schedstats.RunqueueTime += v.RunqueueTime
|
||||||
|
schedstats.RunTime += v.RunTime
|
||||||
|
}
|
||||||
|
return schedstats, nil
|
||||||
|
}
|
||||||
|
|
||||||
func networkStatsFromProc(rootFs string, pid int) ([]info.InterfaceStats, error) {
|
func networkStatsFromProc(rootFs string, pid int) ([]info.InterfaceStats, error) {
|
||||||
netStatsFile := path.Join(rootFs, "proc", strconv.Itoa(pid), "/net/dev")
|
netStatsFile := path.Join(rootFs, "proc", strconv.Itoa(pid), "/net/dev")
|
||||||
|
|
||||||
|
@ -293,10 +293,23 @@ type CpuCFS struct {
|
|||||||
ThrottledTime uint64 `json:"throttled_time"`
|
ThrottledTime uint64 `json:"throttled_time"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CpuSchedstat holds scheduler statistics aggregated over a container's processes, as read from /proc/<pid>/schedstat.
|
||||||
|
type CpuSchedstat struct {
|
||||||
|
// Field meanings follow https://www.kernel.org/doc/Documentation/scheduler/sched-stats.txt
|
||||||
|
|
||||||
|
// RunTime is the time spent on the cpu, in nanoseconds.
|
||||||
|
RunTime uint64 `json:"run_time"`
|
||||||
|
// RunqueueTime is the time spent waiting on a runqueue, in nanoseconds.
|
||||||
|
RunqueueTime uint64 `json:"runqueue_time"`
|
||||||
|
// RunPeriods is the number of timeslices run on the cpu.
|
||||||
|
RunPeriods uint64 `json:"run_periods"`
|
||||||
|
}
|
||||||
|
|
||||||
// All CPU usage metrics are cumulative from the creation of the container
|
// All CPU usage metrics are cumulative from the creation of the container
|
||||||
type CpuStats struct {
|
type CpuStats struct {
|
||||||
Usage CpuUsage `json:"usage"`
|
Usage CpuUsage `json:"usage"`
|
||||||
CFS CpuCFS `json:"cfs"`
|
CFS CpuCFS `json:"cfs"`
|
||||||
|
Schedstat CpuSchedstat `json:"schedstat"`
|
||||||
// Smoothed average of number of runnable threads x 1000.
|
// Smoothed average of number of runnable threads x 1000.
|
||||||
// We multiply by thousand to avoid using floats, but preserving precision.
|
// We multiply by thousand to avoid using floats, but preserving precision.
|
||||||
// Load is smoothed over the last 10 seconds. Instantaneous value can be read
|
// Load is smoothed over the last 10 seconds. Instantaneous value can be read
|
||||||
|
@ -197,6 +197,27 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc) *PrometheusCo
|
|||||||
getValues: func(s *info.ContainerStats) metricValues {
|
getValues: func(s *info.ContainerStats) metricValues {
|
||||||
return metricValues{{value: float64(s.Cpu.CFS.ThrottledTime) / float64(time.Second)}}
|
return metricValues{{value: float64(s.Cpu.CFS.ThrottledTime) / float64(time.Second)}}
|
||||||
},
|
},
|
||||||
|
}, {
|
||||||
|
name: "container_cpu_schedstat_run_seconds_total",
|
||||||
|
help: "Time duration the processes of the container have run on the CPU.",
|
||||||
|
valueType: prometheus.CounterValue,
|
||||||
|
getValues: func(s *info.ContainerStats) metricValues {
|
||||||
|
return metricValues{{value: float64(s.Cpu.Schedstat.RunTime) / float64(time.Second)}}
|
||||||
|
},
|
||||||
|
}, {
|
||||||
|
name: "container_cpu_schedstat_runqueue_seconds_total",
|
||||||
|
help: "Time duration processes of the container have been waiting on a runqueue.",
|
||||||
|
valueType: prometheus.CounterValue,
|
||||||
|
getValues: func(s *info.ContainerStats) metricValues {
|
||||||
|
return metricValues{{value: float64(s.Cpu.Schedstat.RunqueueTime) / float64(time.Second)}}
|
||||||
|
},
|
||||||
|
}, {
|
||||||
|
name: "container_cpu_schedstat_run_periods_total",
|
||||||
|
help: "Number of times processes of the cgroup have run on the cpu",
|
||||||
|
valueType: prometheus.CounterValue,
|
||||||
|
getValues: func(s *info.ContainerStats) metricValues {
|
||||||
|
return metricValues{{value: float64(s.Cpu.Schedstat.RunPeriods)}}
|
||||||
|
},
|
||||||
}, {
|
}, {
|
||||||
name: "container_cpu_load_average_10s",
|
name: "container_cpu_load_average_10s",
|
||||||
help: "Value of container cpu load average over the last 10 seconds.",
|
help: "Value of container cpu load average over the last 10 seconds.",
|
||||||
|
@ -90,6 +90,11 @@ func (p testSubcontainersInfoProvider) SubcontainersInfo(string, *info.Container
|
|||||||
ThrottledPeriods: 18,
|
ThrottledPeriods: 18,
|
||||||
ThrottledTime: 1724314000,
|
ThrottledTime: 1724314000,
|
||||||
},
|
},
|
||||||
|
Schedstat: info.CpuSchedstat{
|
||||||
|
RunTime: 53643567,
|
||||||
|
RunqueueTime: 479424566378,
|
||||||
|
RunPeriods: 984285,
|
||||||
|
},
|
||||||
LoadAverage: 2,
|
LoadAverage: 2,
|
||||||
},
|
},
|
||||||
Memory: info.MemoryStats{
|
Memory: info.MemoryStats{
|
||||||
|
9
metrics/testdata/prometheus_metrics
vendored
9
metrics/testdata/prometheus_metrics
vendored
@ -25,6 +25,15 @@ container_cpu_cfs_throttled_seconds_total{container_env_foo_env="prod",container
|
|||||||
# HELP container_cpu_load_average_10s Value of container cpu load average over the last 10 seconds.
|
# HELP container_cpu_load_average_10s Value of container cpu load average over the last 10 seconds.
|
||||||
# TYPE container_cpu_load_average_10s gauge
|
# TYPE container_cpu_load_average_10s gauge
|
||||||
container_cpu_load_average_10s{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 2
|
container_cpu_load_average_10s{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 2
|
||||||
|
# HELP container_cpu_schedstat_run_periods_total Number of times processes of the cgroup have run on the cpu
|
||||||
|
# TYPE container_cpu_schedstat_run_periods_total counter
|
||||||
|
container_cpu_schedstat_run_periods_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 984285
|
||||||
|
# HELP container_cpu_schedstat_run_seconds_total Time duration the processes of the container have run on the CPU.
|
||||||
|
# TYPE container_cpu_schedstat_run_seconds_total counter
|
||||||
|
container_cpu_schedstat_run_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.053643567
|
||||||
|
# HELP container_cpu_schedstat_runqueue_seconds_total Time duration processes of the container have been waiting on a runqueue.
|
||||||
|
# TYPE container_cpu_schedstat_runqueue_seconds_total counter
|
||||||
|
container_cpu_schedstat_runqueue_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 479.424566378
|
||||||
# HELP container_cpu_system_seconds_total Cumulative system cpu time consumed in seconds.
|
# HELP container_cpu_system_seconds_total Cumulative system cpu time consumed in seconds.
|
||||||
# TYPE container_cpu_system_seconds_total counter
|
# TYPE container_cpu_system_seconds_total counter
|
||||||
container_cpu_system_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 7e-09
|
container_cpu_system_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 7e-09
|
||||||
|
Loading…
Reference in New Issue
Block a user