Emit number of processes and file descriptors of a container
This commit is contained in:
parent
026bc840e1
commit
02ecf721f5
@ -68,6 +68,7 @@ var (
|
||||
container.NetworkTcpUsageMetrics: struct{}{},
|
||||
container.NetworkUdpUsageMetrics: struct{}{},
|
||||
container.ProcessSchedulerMetrics: struct{}{},
|
||||
container.ProcessMetrics: struct{}{},
|
||||
}}
|
||||
|
||||
// List of metrics that can be ignored.
|
||||
@ -78,6 +79,7 @@ var (
|
||||
container.NetworkUdpUsageMetrics: struct{}{},
|
||||
container.PerCpuUsageMetrics: struct{}{},
|
||||
container.ProcessSchedulerMetrics: struct{}{},
|
||||
container.ProcessMetrics: struct{}{},
|
||||
}
|
||||
)
|
||||
|
||||
@ -109,7 +111,7 @@ func (ml *metricSetValue) Set(value string) error {
|
||||
}
|
||||
|
||||
func init() {
|
||||
flag.Var(&ignoreMetrics, "disable_metrics", "comma-separated list of `metrics` to be disabled. Options are 'disk', 'network', 'tcp', 'udp', 'percpu'. Note: tcp and udp are disabled by default due to high CPU usage.")
|
||||
flag.Var(&ignoreMetrics, "disable_metrics", "comma-separated list of `metrics` to be disabled. Options are 'disk', 'network', 'tcp', 'udp', 'percpu', 'sched', 'process'. Note: tcp and udp are disabled by default due to high CPU usage.")
|
||||
|
||||
// Default logging verbosity to V(2)
|
||||
flag.Set("v", "2")
|
||||
@ -251,6 +253,7 @@ func toIncludedMetrics(ignoreMetrics container.MetricSet) container.MetricSet {
|
||||
container.NetworkUdpUsageMetrics,
|
||||
container.AcceleratorUsageMetrics,
|
||||
container.AppMetrics,
|
||||
container.ProcessMetrics,
|
||||
}
|
||||
for _, metric := range allMetrics {
|
||||
if !ignoreMetrics.Has(metric) {
|
||||
|
@ -53,6 +53,7 @@ const (
|
||||
NetworkUdpUsageMetrics MetricKind = "udp"
|
||||
AcceleratorUsageMetrics MetricKind = "accelerator"
|
||||
AppMetrics MetricKind = "app"
|
||||
ProcessMetrics MetricKind = "process"
|
||||
)
|
||||
|
||||
func (mk MetricKind) String() string {
|
||||
|
@ -29,6 +29,7 @@ Metric name | Type | Description | Unit (where applicable)
|
||||
`container_cpu_system_seconds_total` | Counter | Cumulative system cpu time consumed | seconds
|
||||
`container_cpu_usage_seconds_total` | Counter | Cumulative cpu time consumed | seconds
|
||||
`container_cpu_user_seconds_total` | Counter | Cumulative user cpu time consumed | seconds
|
||||
`container_file_descriptors` | Gauge | Number of open file descriptors for the container |
|
||||
`container_fs_inodes_free` | Gauge | Number of available Inodes |
|
||||
`container_fs_inodes_total` | Gauge | Total number of Inodes |
|
||||
`container_fs_io_current` | Gauge | Number of I/Os currently in progress |
|
||||
@ -66,6 +67,7 @@ Metric name | Type | Description | Unit (where applicable)
|
||||
`container_network_transmit_errors_total` | Counter | Cumulative count of errors encountered while transmitting |
|
||||
`container_network_tcp_usage_total` | Gauge | tcp connection usage statistic for container |
|
||||
`container_network_udp_usage_total` | Gauge | udp connection usage statistic for container |
|
||||
`container_processes` | Gauge | Number of processes running inside the container |
|
||||
`container_spec_cpu_period` | Gauge | CPU period of the container |
|
||||
`container_spec_cpu_quota` | Gauge | CPU quota of the container |
|
||||
`container_spec_cpu_shares` | Gauge | CPU share of the container |
|
||||
|
@ -254,6 +254,7 @@ type ProcessInfo struct {
|
||||
RunningTime string `json:"running_time"`
|
||||
CgroupPath string `json:"cgroup_path"`
|
||||
Cmd string `json:"cmd"`
|
||||
FdCount int `json:"fd_count"`
|
||||
}
|
||||
|
||||
type TcpStat struct {
|
||||
|
@ -47,7 +47,9 @@ import (
|
||||
// enableLoadReader is the "enable_load_reader" command-line flag (default
// false): whether to enable the cpu load reader.
var enableLoadReader = flag.Bool("enable_load_reader", false, "Whether to enable cpu load reader")
|
||||
// HousekeepingInterval is the "housekeeping_interval" command-line flag: the
// interval between container housekeepings. It defaults to one second.
var HousekeepingInterval = flag.Duration("housekeeping_interval", 1*time.Second, "Interval between container housekeepings")
|
||||
|
||||
var cgroupPathRegExp = regexp.MustCompile(`devices[^:]*:(.*?)[,;$]`)
|
||||
// cgroup type chosen to fetch the cgroup path of a process.
|
||||
// Memory has been chosen, as it is one of the default cgroups that is enabled for most containers.
|
||||
// cgroupPathRegExp captures the path that follows a `memory...:` entry in a
// cgroup listing. The capture stops at the first ',' or ';' separator, or at
// the end of the input when the memory entry is the last one listed.
//
// The terminator is written as an alternation on purpose: inside a character
// class ("[,;$]") the '$' is a literal dollar sign rather than an end-of-text
// anchor, which would make a trailing memory entry fail to match.
var cgroupPathRegExp = regexp.MustCompile(`memory[^:]*:(.*?)(?:[,;]|$)`)
|
||||
|
||||
type containerInfo struct {
|
||||
info.ContainerReference
|
||||
@ -185,8 +187,8 @@ func (c *containerData) getCgroupPath(cgroups string) (string, error) {
|
||||
}
|
||||
matches := cgroupPathRegExp.FindSubmatch([]byte(cgroups))
|
||||
if len(matches) != 2 {
|
||||
glog.V(3).Infof("failed to get devices cgroup path from %q", cgroups)
|
||||
// return root in case of failures - devices hierarchy might not be enabled.
|
||||
glog.V(3).Infof("failed to get memory cgroup path from %q", cgroups)
|
||||
// return root in case of failures - memory hierarchy might not be enabled.
|
||||
return "/", nil
|
||||
}
|
||||
return string(matches[1]), nil
|
||||
@ -266,6 +268,10 @@ func (c *containerData) getContainerPids(inHostNamespace bool) ([]string, error)
|
||||
func (c *containerData) GetProcessList(cadvisorContainer string, inHostNamespace bool) ([]v2.ProcessInfo, error) {
|
||||
// report all processes for root.
|
||||
isRoot := c.info.Name == "/"
|
||||
rootfs := "/"
|
||||
if !inHostNamespace {
|
||||
rootfs = "/rootfs"
|
||||
}
|
||||
format := "user,pid,ppid,stime,pcpu,pmem,rss,vsz,stat,time,comm,cgroup"
|
||||
out, err := c.getPsOutput(inHostNamespace, format)
|
||||
if err != nil {
|
||||
@ -324,6 +330,15 @@ func (c *containerData) GetProcessList(cadvisorContainer string, inHostNamespace
|
||||
cgroupPath = cgroup
|
||||
}
|
||||
|
||||
var fdCount int
|
||||
dirPath := path.Join(rootfs, "/proc", strconv.Itoa(pid), "fd")
|
||||
fds, err := ioutil.ReadDir(dirPath)
|
||||
if err != nil {
|
||||
glog.V(4).Infof("error while listing directory %q to measure fd count: %v", dirPath, err)
|
||||
continue
|
||||
}
|
||||
fdCount = len(fds)
|
||||
|
||||
if isRoot || c.info.Name == cgroup {
|
||||
processes = append(processes, v2.ProcessInfo{
|
||||
User: fields[0],
|
||||
@ -338,6 +353,7 @@ func (c *containerData) GetProcessList(cadvisorContainer string, inHostNamespace
|
||||
RunningTime: fields[9],
|
||||
Cmd: fields[10],
|
||||
CgroupPath: cgroupPath,
|
||||
FdCount: fdCount,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -21,6 +21,7 @@ import (
|
||||
|
||||
"github.com/google/cadvisor/container"
|
||||
info "github.com/google/cadvisor/info/v1"
|
||||
"github.com/google/cadvisor/info/v2"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
@ -35,6 +36,8 @@ type infoProvider interface {
|
||||
GetVersionInfo() (*info.VersionInfo, error)
|
||||
// GetMachineInfo provides information about the machine.
|
||||
GetMachineInfo() (*info.MachineInfo, error)
|
||||
// GetProcessList provides information about each container's processes
|
||||
GetProcessList(containerName string, options v2.RequestOptions) ([]v2.ProcessInfo, error)
|
||||
}
|
||||
|
||||
// metricValue describes a single metric value for a given set of label values
|
||||
@ -109,6 +112,7 @@ type PrometheusCollector struct {
|
||||
errors prometheus.Gauge
|
||||
containerMetrics []containerMetric
|
||||
containerLabelsFunc ContainerLabelsFunc
|
||||
includedMetrics container.MetricSet
|
||||
}
|
||||
|
||||
// NewPrometheusCollector returns a new PrometheusCollector. The passed
|
||||
@ -137,6 +141,7 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
|
||||
},
|
||||
},
|
||||
},
|
||||
includedMetrics: includedMetrics,
|
||||
}
|
||||
if includedMetrics.Has(container.CpuUsageMetrics) {
|
||||
c.containerMetrics = append(c.containerMetrics, []containerMetric{
|
||||
@ -926,10 +931,15 @@ func (c *PrometheusCollector) collectContainersInfo(ch chan<- prometheus.Metric)
|
||||
rawLabels[l] = struct{}{}
|
||||
}
|
||||
}
|
||||
for _, container := range containers {
|
||||
|
||||
psReqOpt := v2.RequestOptions{
|
||||
IdType: v2.TypeName,
|
||||
}
|
||||
|
||||
for _, cont := range containers {
|
||||
values := make([]string, 0, len(rawLabels))
|
||||
labels := make([]string, 0, len(rawLabels))
|
||||
containerLabels := c.containerLabelsFunc(container)
|
||||
containerLabels := c.containerLabelsFunc(cont)
|
||||
for l := range rawLabels {
|
||||
labels = append(labels, sanitizeLabelName(l))
|
||||
values = append(values, containerLabels[l])
|
||||
@ -937,35 +947,50 @@ func (c *PrometheusCollector) collectContainersInfo(ch chan<- prometheus.Metric)
|
||||
|
||||
// Container spec
|
||||
desc := prometheus.NewDesc("container_start_time_seconds", "Start time of the container since unix epoch in seconds.", labels, nil)
|
||||
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(container.Spec.CreationTime.Unix()), values...)
|
||||
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(cont.Spec.CreationTime.Unix()), values...)
|
||||
|
||||
if container.Spec.HasCpu {
|
||||
if cont.Spec.HasCpu {
|
||||
desc = prometheus.NewDesc("container_spec_cpu_period", "CPU period of the container.", labels, nil)
|
||||
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(container.Spec.Cpu.Period), values...)
|
||||
if container.Spec.Cpu.Quota != 0 {
|
||||
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(cont.Spec.Cpu.Period), values...)
|
||||
if cont.Spec.Cpu.Quota != 0 {
|
||||
desc = prometheus.NewDesc("container_spec_cpu_quota", "CPU quota of the container.", labels, nil)
|
||||
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(container.Spec.Cpu.Quota), values...)
|
||||
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(cont.Spec.Cpu.Quota), values...)
|
||||
}
|
||||
desc := prometheus.NewDesc("container_spec_cpu_shares", "CPU share of the container.", labels, nil)
|
||||
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(container.Spec.Cpu.Limit), values...)
|
||||
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(cont.Spec.Cpu.Limit), values...)
|
||||
|
||||
}
|
||||
if container.Spec.HasMemory {
|
||||
if cont.Spec.HasMemory {
|
||||
desc := prometheus.NewDesc("container_spec_memory_limit_bytes", "Memory limit for the container.", labels, nil)
|
||||
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, specMemoryValue(container.Spec.Memory.Limit), values...)
|
||||
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, specMemoryValue(cont.Spec.Memory.Limit), values...)
|
||||
desc = prometheus.NewDesc("container_spec_memory_swap_limit_bytes", "Memory swap limit for the container.", labels, nil)
|
||||
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, specMemoryValue(container.Spec.Memory.SwapLimit), values...)
|
||||
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, specMemoryValue(cont.Spec.Memory.SwapLimit), values...)
|
||||
desc = prometheus.NewDesc("container_spec_memory_reservation_limit_bytes", "Memory reservation limit for the container.", labels, nil)
|
||||
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, specMemoryValue(container.Spec.Memory.Reservation), values...)
|
||||
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, specMemoryValue(cont.Spec.Memory.Reservation), values...)
|
||||
}
|
||||
|
||||
if c.includedMetrics.Has(container.ProcessMetrics) {
|
||||
psList, err := c.infoProvider.GetProcessList(cont.Name, psReqOpt)
|
||||
if err == nil {
|
||||
desc = prometheus.NewDesc("container_processes", "Number of processes running inside the container.", labels, nil)
|
||||
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(len(psList)), values...)
|
||||
|
||||
var fd int
|
||||
for _, ps := range psList {
|
||||
fd += ps.FdCount
|
||||
}
|
||||
desc = prometheus.NewDesc("container_file_descriptors", "Number of open file descriptors for the container.", labels, nil)
|
||||
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(fd), values...)
|
||||
}
|
||||
}
|
||||
|
||||
// Now for the actual metrics
|
||||
if len(container.Stats) == 0 {
|
||||
if len(cont.Stats) == 0 {
|
||||
continue
|
||||
}
|
||||
stats := container.Stats[0]
|
||||
stats := cont.Stats[0]
|
||||
for _, cm := range c.containerMetrics {
|
||||
if cm.condition != nil && !cm.condition(container.Spec) {
|
||||
if cm.condition != nil && !cm.condition(cont.Spec) {
|
||||
continue
|
||||
}
|
||||
desc := cm.desc(labels)
|
||||
|
@ -26,6 +26,7 @@ import (
|
||||
|
||||
"github.com/google/cadvisor/container"
|
||||
info "github.com/google/cadvisor/info/v1"
|
||||
"github.com/google/cadvisor/info/v2"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
@ -49,6 +50,26 @@ func (p testSubcontainersInfoProvider) GetMachineInfo() (*info.MachineInfo, erro
|
||||
}, nil
|
||||
}
|
||||
|
||||
// GetProcessList is a test stub: it ignores containerName and options and
// always returns the same single-entry process list (FdCount 5) with a nil
// error.
func (p testSubcontainersInfoProvider) GetProcessList(containerName string, options v2.RequestOptions) ([]v2.ProcessInfo, error) {
|
||||
return []v2.ProcessInfo{
|
||||
{
|
||||
User: "user1",
|
||||
Pid: 1,
|
||||
Ppid: 2,
|
||||
StartTime: "OCT2018",
|
||||
PercentCpu: 0.0,
|
||||
PercentMemory: 0.0,
|
||||
RSS: 3,
|
||||
VirtualSize: 4,
|
||||
Status: "S",
|
||||
RunningTime: "00:00:00",
|
||||
Cmd: "cmd1",
|
||||
CgroupPath: "path",
|
||||
FdCount: 5,
|
||||
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
var allMetrics = container.MetricSet{
|
||||
container.CpuUsageMetrics: struct{}{},
|
||||
container.ProcessSchedulerMetrics: struct{}{},
|
||||
@ -61,6 +82,7 @@ var allMetrics = container.MetricSet{
|
||||
container.NetworkUsageMetrics: struct{}{},
|
||||
container.NetworkTcpUsageMetrics: struct{}{},
|
||||
container.NetworkUdpUsageMetrics: struct{}{},
|
||||
container.ProcessMetrics: struct{}{},
|
||||
}
|
||||
|
||||
func (p testSubcontainersInfoProvider) SubcontainersInfo(string, *info.ContainerInfoRequest) ([]*info.ContainerInfo, error) {
|
||||
@ -305,6 +327,13 @@ func (p *erroringSubcontainersInfoProvider) GetMachineInfo() (*info.MachineInfo,
|
||||
return p.successfulProvider.GetMachineInfo()
|
||||
}
|
||||
|
||||
// GetProcessList returns a nil list and an error when p.shouldFail is set;
// otherwise it forwards the call, with the same arguments, to
// p.successfulProvider.
func (p *erroringSubcontainersInfoProvider) GetProcessList(containerName string, options v2.RequestOptions) ([]v2.ProcessInfo, error) {
|
||||
if p.shouldFail {
|
||||
return nil, errors.New("Oops 2")
|
||||
}
|
||||
return p.successfulProvider.GetProcessList(containerName, options)
|
||||
}
|
||||
|
||||
func (p *erroringSubcontainersInfoProvider) SubcontainersInfo(
|
||||
a string, r *info.ContainerInfoRequest) ([]*info.ContainerInfo, error) {
|
||||
if p.shouldFail {
|
||||
|
6
metrics/testdata/prometheus_metrics
vendored
6
metrics/testdata/prometheus_metrics
vendored
@ -46,6 +46,9 @@ container_cpu_usage_seconds_total{container_env_foo_env="prod",container_label_f
|
||||
# HELP container_cpu_user_seconds_total Cumulative user cpu time consumed in seconds.
|
||||
# TYPE container_cpu_user_seconds_total counter
|
||||
container_cpu_user_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 6e-09
|
||||
# HELP container_file_descriptors Number of open file descriptors for the container.
|
||||
# TYPE container_file_descriptors gauge
|
||||
container_file_descriptors{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 5
|
||||
# HELP container_fs_inodes_free Number of available Inodes
|
||||
# TYPE container_fs_inodes_free gauge
|
||||
container_fs_inodes_free{container_env_foo_env="prod",container_label_foo_label="bar",device="sda1",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 524288
|
||||
@ -182,6 +185,9 @@ container_network_udp_usage_total{container_env_foo_env="prod",container_label_f
|
||||
container_network_udp_usage_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",udp_state="listen",zone_name="hello"} 0
|
||||
container_network_udp_usage_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",udp_state="rxqueued",zone_name="hello"} 0
|
||||
container_network_udp_usage_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",udp_state="txqueued",zone_name="hello"} 0
|
||||
# HELP container_processes Number of processes running inside the container.
|
||||
# TYPE container_processes gauge
|
||||
container_processes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1
|
||||
# HELP container_scrape_error 1 if there was an error while getting container metrics, 0 otherwise
|
||||
# TYPE container_scrape_error gauge
|
||||
container_scrape_error 0
|
||||
|
Loading…
Reference in New Issue
Block a user