From d8cdae80222ebd058724e0cb02d18ded4cce658f Mon Sep 17 00:00:00 2001
From: Yann Hodique
Date: Fri, 28 Sep 2018 08:48:12 -0700
Subject: [PATCH 1/2] replace golang.org/x/exp/inotify with standalone library

context: kubernetes/kubernetes#68478

The inotify code was removed from golang.org/x/exp several years ago.
Importing it from that path therefore prevents downstream consumers from
using any module that depends on more recent features of golang.org/x/exp.

Given that this code is by definition frozen, and that the long-term path is
to migrate to fsnotify, replacing the current code with an identical
standalone copy adds no maintenance cost and unblocks downstream consumers
such as Kubernetes.
---
 Godeps/Godeps.json                           | 8 ++++----
 container/common/inotify_watcher.go          | 2 +-
 manager/watcher/raw/raw.go                   | 2 +-
 utils/tail/tail.go                           | 2 +-
 vendor/github.com/sigma/go-inotify/README.md | 5 +++++
 .../sigma/go-inotify}/inotify_linux.go       | 0
 6 files changed, 12 insertions(+), 7 deletions(-)
 create mode 100644 vendor/github.com/sigma/go-inotify/README.md
 rename vendor/{golang.org/x/exp/inotify => github.com/sigma/go-inotify}/inotify_linux.go (100%)

diff --git a/Godeps/Godeps.json b/Godeps/Godeps.json
index db507074..99babe97 100644
--- a/Godeps/Godeps.json
+++ b/Godeps/Godeps.json
@@ -763,6 +763,10 @@
         {
             "ImportPath": "github.com/seccomp/libseccomp-golang",
             "Rev": "1b506fc7c24eec5a3693cdcbed40d9c226cfc6a1"
         },
+        {
+            "ImportPath": "github.com/sigma/go-inotify",
+            "Rev": "6e160422f7699ab1472cc86362da4f009a7efc60"
+        },
         {
             "ImportPath": "github.com/sirupsen/logrus",
             "Comment": "v1.0.3-11-g89742ae",
@@ -803,10 +807,6 @@
             "ImportPath": "golang.org/x/crypto/ssh/terminal",
             "Rev": "eb71ad9bd329b5ac0fd0148dd99bd62e8be8e035"
         },
-        {
-            "ImportPath": "golang.org/x/exp/inotify",
-            "Rev": "292a51b8d262487dab23a588950e8052d63d9113"
-        },
         {
             "ImportPath": "golang.org/x/net/context",
             "Rev": "7dcfb8076726a3fdd9353b6b8a1f1b6be6811bd6"
diff --git a/container/common/inotify_watcher.go b/container/common/inotify_watcher.go
index 16ac1456..787f599a 100644
--- a/container/common/inotify_watcher.go
+++ b/container/common/inotify_watcher.go
@@ -17,7 +17,7 @@ package common
 import (
     "sync"
 
-    "golang.org/x/exp/inotify"
+    inotify "github.com/sigma/go-inotify"
 )
 
 // Watcher for container-related inotify events in the cgroup hierarchy.
diff --git a/manager/watcher/raw/raw.go b/manager/watcher/raw/raw.go
index 5bf12829..fa3e5477 100644
--- a/manager/watcher/raw/raw.go
+++ b/manager/watcher/raw/raw.go
@@ -26,9 +26,9 @@ import (
     "github.com/google/cadvisor/container/common"
     "github.com/google/cadvisor/container/libcontainer"
     "github.com/google/cadvisor/manager/watcher"
+    inotify "github.com/sigma/go-inotify"
 
     "github.com/golang/glog"
-    "golang.org/x/exp/inotify"
 )
 
 type rawContainerWatcher struct {
diff --git a/utils/tail/tail.go b/utils/tail/tail.go
index 88c7f72b..ca62fe69 100644
--- a/utils/tail/tail.go
+++ b/utils/tail/tail.go
@@ -25,7 +25,7 @@ import (
     "time"
 
     "github.com/golang/glog"
-    "golang.org/x/exp/inotify"
+    inotify "github.com/sigma/go-inotify"
 )
 
 type Tail struct {
diff --git a/vendor/github.com/sigma/go-inotify/README.md b/vendor/github.com/sigma/go-inotify/README.md
new file mode 100644
index 00000000..0c723a8d
--- /dev/null
+++ b/vendor/github.com/sigma/go-inotify/README.md
@@ -0,0 +1,5 @@
+This is a fork of golang.org/x/exp/inotify before it was deleted.
+
+Please use gopkg.in/fsnotify.v0 instead.
+
+For updates, see: https://fsnotify.org/
diff --git a/vendor/golang.org/x/exp/inotify/inotify_linux.go b/vendor/github.com/sigma/go-inotify/inotify_linux.go
similarity index 100%
rename from vendor/golang.org/x/exp/inotify/inotify_linux.go
rename to vendor/github.com/sigma/go-inotify/inotify_linux.go
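Since the vendored file is an unmodified copy, callers only change the import path; the watcher API is expected to be the same one golang.org/x/exp/inotify exposed (NewWatcher, AddWatch, and the Event/Error channels), which is how cadvisor's own watchers use it. A rough sketch of a consumer after the swap, not part of the patch, with that API assumed unchanged per the commit message:

```go
package main

import (
	"log"

	inotify "github.com/sigma/go-inotify"
)

func main() {
	// Create a watcher and subscribe to create/delete events on a directory,
	// mirroring how cadvisor's raw watcher tracks cgroup subdirectories.
	watcher, err := inotify.NewWatcher()
	if err != nil {
		log.Fatalf("failed to create inotify watcher: %v", err)
	}
	defer watcher.Close()

	// Path and flags are illustrative; cadvisor watches its cgroup roots.
	if err := watcher.AddWatch("/sys/fs/cgroup/memory", inotify.IN_CREATE|inotify.IN_DELETE); err != nil {
		log.Fatalf("failed to add watch: %v", err)
	}

	for {
		select {
		case ev := <-watcher.Event:
			log.Printf("event: %s (mask %#x)", ev.Name, ev.Mask)
		case err := <-watcher.Error:
			log.Printf("watch error: %v", err)
		}
	}
}
```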
From 02ecf721f5e17d67fe6d1dee1653b045a9b6bb37 Mon Sep 17 00:00:00 2001
From: Sashank Appireddy
Date: Thu, 11 Oct 2018 14:35:48 -0700
Subject: [PATCH 2/2] Emit number of processes and file descriptors of a container

---
 cadvisor.go                         |  5 ++-
 container/factory.go                |  1 +
 docs/storage/prometheus.md          |  2 ++
 info/v2/container.go                |  1 +
 manager/container.go                | 22 ++++++++++--
 metrics/prometheus.go               | 55 +++++++++++++++++++++--------
 metrics/prometheus_test.go          | 29 +++++++++++++++
 metrics/testdata/prometheus_metrics |  6 ++++
 8 files changed, 102 insertions(+), 19 deletions(-)

diff --git a/cadvisor.go b/cadvisor.go
index 5336cb4b..6e47e0fa 100644
--- a/cadvisor.go
+++ b/cadvisor.go
@@ -68,6 +68,7 @@ var (
         container.NetworkTcpUsageMetrics: struct{}{},
         container.NetworkUdpUsageMetrics: struct{}{},
         container.ProcessSchedulerMetrics: struct{}{},
+        container.ProcessMetrics: struct{}{},
     }}
 
     // List of metrics that can be ignored.
@@ -78,6 +79,7 @@
         container.NetworkUdpUsageMetrics: struct{}{},
         container.PerCpuUsageMetrics: struct{}{},
         container.ProcessSchedulerMetrics: struct{}{},
+        container.ProcessMetrics: struct{}{},
     }
 )
 
@@ -109,7 +111,7 @@ func (ml *metricSetValue) Set(value string) error {
 }
 
 func init() {
-    flag.Var(&ignoreMetrics, "disable_metrics", "comma-separated list of `metrics` to be disabled. Options are 'disk', 'network', 'tcp', 'udp', 'percpu'. Note: tcp and udp are disabled by default due to high CPU usage.")
+    flag.Var(&ignoreMetrics, "disable_metrics", "comma-separated list of `metrics` to be disabled. Options are 'disk', 'network', 'tcp', 'udp', 'percpu', 'sched', 'process'. Note: tcp and udp are disabled by default due to high CPU usage.")
 
     // Default logging verbosity to V(2)
     flag.Set("v", "2")
@@ -251,6 +253,7 @@ func toIncludedMetrics(ignoreMetrics container.MetricSet) container.MetricSet {
         container.NetworkUdpUsageMetrics,
         container.AcceleratorUsageMetrics,
         container.AppMetrics,
+        container.ProcessMetrics,
     }
     for _, metric := range allMetrics {
         if !ignoreMetrics.Has(metric) {
diff --git a/container/factory.go b/container/factory.go
index 47847057..8e33ade6 100644
--- a/container/factory.go
+++ b/container/factory.go
@@ -53,6 +53,7 @@ const (
     NetworkUdpUsageMetrics  MetricKind = "udp"
     AcceleratorUsageMetrics MetricKind = "accelerator"
     AppMetrics              MetricKind = "app"
+    ProcessMetrics          MetricKind = "process"
 )
 
 func (mk MetricKind) String() string {
diff --git a/docs/storage/prometheus.md b/docs/storage/prometheus.md
index adc0e342..c6bae7d2 100644
--- a/docs/storage/prometheus.md
+++ b/docs/storage/prometheus.md
@@ -29,6 +29,7 @@ Metric name | Type | Description | Unit (where applicable)
 `container_cpu_system_seconds_total` | Counter | Cumulative system cpu time consumed | seconds
 `container_cpu_usage_seconds_total` | Counter | Cumulative cpu time consumed | seconds
 `container_cpu_user_seconds_total` | Counter | Cumulative user cpu time consumed | seconds
+`container_file_descriptors` | Gauge | Number of open file descriptors for the container |
 `container_fs_inodes_free` | Gauge | Number of available Inodes |
 `container_fs_inodes_total` | Gauge | Total number of Inodes |
 `container_fs_io_current` | Gauge | Number of I/Os currently in progress |
@@ -66,6 +67,7 @@ Metric name | Type | Description | Unit (where applicable)
 `container_network_transmit_errors_total` | Counter | Cumulative count of errors encountered while transmitting |
 `container_network_tcp_usage_total` | Gauge | tcp connection usage statistic for container |
 `container_network_udp_usage_total` | Gauge | udp connection usage statistic for container |
+`container_processes` | Gauge | Number of processes running inside the container |
 `container_spec_cpu_period` | Gauge | CPU period of the container |
 `container_spec_cpu_quota` | Gauge | CPU quota of the container |
 `container_spec_cpu_shares` | Gauge | CPU share of the container |
diff --git a/info/v2/container.go b/info/v2/container.go
index 0e2fc7ea..4288d003 100644
--- a/info/v2/container.go
+++ b/info/v2/container.go
@@ -254,6 +254,7 @@ type ProcessInfo struct {
     RunningTime   string `json:"running_time"`
     CgroupPath    string `json:"cgroup_path"`
     Cmd           string `json:"cmd"`
+    FdCount       int    `json:"fd_count"`
 }
 
 type TcpStat struct {
diff --git a/manager/container.go b/manager/container.go
index 295479f0..55d07501 100644
--- a/manager/container.go
+++ b/manager/container.go
@@ -47,7 +47,9 @@ import (
 var enableLoadReader = flag.Bool("enable_load_reader", false, "Whether to enable cpu load reader")
 var HousekeepingInterval = flag.Duration("housekeeping_interval", 1*time.Second, "Interval between container housekeepings")
 
-var cgroupPathRegExp = regexp.MustCompile(`devices[^:]*:(.*?)[,;$]`)
+// cgroup type chosen to fetch the cgroup path of a process.
+// Memory has been chosen, as it is one of the default cgroups that is enabled for most containers.
+var cgroupPathRegExp = regexp.MustCompile(`memory[^:]*:(.*?)[,;$]`)
 
 type containerInfo struct {
     info.ContainerReference
@@ -185,8 +187,8 @@ func (c *containerData) getCgroupPath(cgroups string) (string, error) {
     }
     matches := cgroupPathRegExp.FindSubmatch([]byte(cgroups))
     if len(matches) != 2 {
-        glog.V(3).Infof("failed to get devices cgroup path from %q", cgroups)
-        // return root in case of failures - devices hierarchy might not be enabled.
+        glog.V(3).Infof("failed to get memory cgroup path from %q", cgroups)
+        // return root in case of failures - memory hierarchy might not be enabled.
         return "/", nil
     }
     return string(matches[1]), nil
@@ -266,6 +268,10 @@ func (c *containerData) getContainerPids(inHostNamespace bool) ([]string, error)
 func (c *containerData) GetProcessList(cadvisorContainer string, inHostNamespace bool) ([]v2.ProcessInfo, error) {
     // report all processes for root.
     isRoot := c.info.Name == "/"
+    rootfs := "/"
+    if !inHostNamespace {
+        rootfs = "/rootfs"
+    }
     format := "user,pid,ppid,stime,pcpu,pmem,rss,vsz,stat,time,comm,cgroup"
     out, err := c.getPsOutput(inHostNamespace, format)
     if err != nil {
@@ -324,6 +330,15 @@ func (c *containerData) GetProcessList(cadvisorContainer string, inHostNamespace
             cgroupPath = cgroup
         }
 
+        var fdCount int
+        dirPath := path.Join(rootfs, "/proc", strconv.Itoa(pid), "fd")
+        fds, err := ioutil.ReadDir(dirPath)
+        if err != nil {
+            glog.V(4).Infof("error while listing directory %q to measure fd count: %v", dirPath, err)
+            continue
+        }
+        fdCount = len(fds)
+
         if isRoot || c.info.Name == cgroup {
             processes = append(processes, v2.ProcessInfo{
                 User:        fields[0],
@@ -338,6 +353,7 @@ func (c *containerData) GetProcessList(cadvisorContainer string, inHostNamespace
                 RunningTime: fields[9],
                 Cmd:         fields[10],
                 CgroupPath:  cgroupPath,
+                FdCount:     fdCount,
             })
         }
     }
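GetProcessList now derives each process's cgroup path from the memory hierarchy and counts its open file descriptors by listing /proc/<pid>/fd. The standalone sketch below restates those two steps outside cadvisor; the sample cgroup string and PID are made up for illustration, while the regex and the ReadDir approach match the patch:

```go
package main

import (
	"fmt"
	"io/ioutil"
	"path"
	"regexp"
	"strconv"
)

// Same pattern as the patched cgroupPathRegExp: capture the path of the
// memory cgroup entry, terminated by ',', ';' or a literal '$'.
var cgroupPathRegExp = regexp.MustCompile(`memory[^:]*:(.*?)[,;$]`)

func main() {
	// Hypothetical ps "cgroup" field for a containerized process.
	cgroups := "11:memory:/docker/abc123,10:devices:/docker/abc123,"
	if m := cgroupPathRegExp.FindStringSubmatch(cgroups); len(m) == 2 {
		fmt.Println("cgroup path:", m[1]) // -> /docker/abc123
	}

	// Count open file descriptors the same way the patch does:
	// every entry under /proc/<pid>/fd is one open descriptor.
	pid := 1
	fdDir := path.Join("/", "proc", strconv.Itoa(pid), "fd")
	fds, err := ioutil.ReadDir(fdDir)
	if err != nil {
		fmt.Println("could not list", fdDir, ":", err)
		return
	}
	fmt.Println("fd count:", len(fds))
}
```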
diff --git a/metrics/prometheus.go b/metrics/prometheus.go
index 2fff79a2..6f01d606 100644
--- a/metrics/prometheus.go
+++ b/metrics/prometheus.go
@@ -21,6 +21,7 @@ import (
 
     "github.com/google/cadvisor/container"
     info "github.com/google/cadvisor/info/v1"
+    "github.com/google/cadvisor/info/v2"
 
     "github.com/golang/glog"
     "github.com/prometheus/client_golang/prometheus"
@@ -35,6 +36,8 @@ type infoProvider interface {
     GetVersionInfo() (*info.VersionInfo, error)
     // GetMachineInfo provides information about the machine.
     GetMachineInfo() (*info.MachineInfo, error)
+    // GetProcessList provides information about each container's processes
+    GetProcessList(containerName string, options v2.RequestOptions) ([]v2.ProcessInfo, error)
 }
 
 // metricValue describes a single metric value for a given set of label values
@@ -109,6 +112,7 @@ type PrometheusCollector struct {
     errors              prometheus.Gauge
     containerMetrics    []containerMetric
     containerLabelsFunc ContainerLabelsFunc
+    includedMetrics     container.MetricSet
 }
 
 // NewPrometheusCollector returns a new PrometheusCollector. The passed
@@ -137,6 +141,7 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
                 },
             },
         },
+        includedMetrics: includedMetrics,
     }
     if includedMetrics.Has(container.CpuUsageMetrics) {
         c.containerMetrics = append(c.containerMetrics, []containerMetric{
@@ -926,10 +931,15 @@ func (c *PrometheusCollector) collectContainersInfo(ch chan<- prometheus.Metric)
             rawLabels[l] = struct{}{}
         }
     }
-    for _, container := range containers {
+
+    psReqOpt := v2.RequestOptions{
+        IdType: v2.TypeName,
+    }
+
+    for _, cont := range containers {
         values := make([]string, 0, len(rawLabels))
         labels := make([]string, 0, len(rawLabels))
-        containerLabels := c.containerLabelsFunc(container)
+        containerLabels := c.containerLabelsFunc(cont)
         for l := range rawLabels {
             labels = append(labels, sanitizeLabelName(l))
             values = append(values, containerLabels[l])
@@ -937,35 +947,50 @@ func (c *PrometheusCollector) collectContainersInfo(ch chan<- prometheus.Metric)
 
         // Container spec
         desc := prometheus.NewDesc("container_start_time_seconds", "Start time of the container since unix epoch in seconds.", labels, nil)
-        ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(container.Spec.CreationTime.Unix()), values...)
+        ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(cont.Spec.CreationTime.Unix()), values...)
 
-        if container.Spec.HasCpu {
+        if cont.Spec.HasCpu {
             desc = prometheus.NewDesc("container_spec_cpu_period", "CPU period of the container.", labels, nil)
-            ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(container.Spec.Cpu.Period), values...)
-            if container.Spec.Cpu.Quota != 0 {
+            ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(cont.Spec.Cpu.Period), values...)
+            if cont.Spec.Cpu.Quota != 0 {
                 desc = prometheus.NewDesc("container_spec_cpu_quota", "CPU quota of the container.", labels, nil)
-                ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(container.Spec.Cpu.Quota), values...)
+                ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(cont.Spec.Cpu.Quota), values...)
             }
             desc := prometheus.NewDesc("container_spec_cpu_shares", "CPU share of the container.", labels, nil)
-            ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(container.Spec.Cpu.Limit), values...)
+            ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(cont.Spec.Cpu.Limit), values...)
 
         }
-        if container.Spec.HasMemory {
+        if cont.Spec.HasMemory {
             desc := prometheus.NewDesc("container_spec_memory_limit_bytes", "Memory limit for the container.", labels, nil)
-            ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, specMemoryValue(container.Spec.Memory.Limit), values...)
+            ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, specMemoryValue(cont.Spec.Memory.Limit), values...)
             desc = prometheus.NewDesc("container_spec_memory_swap_limit_bytes", "Memory swap limit for the container.", labels, nil)
-            ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, specMemoryValue(container.Spec.Memory.SwapLimit), values...)
+            ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, specMemoryValue(cont.Spec.Memory.SwapLimit), values...)
             desc = prometheus.NewDesc("container_spec_memory_reservation_limit_bytes", "Memory reservation limit for the container.", labels, nil)
-            ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, specMemoryValue(container.Spec.Memory.Reservation), values...)
+            ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, specMemoryValue(cont.Spec.Memory.Reservation), values...)
+        }
+
+        if c.includedMetrics.Has(container.ProcessMetrics) {
+            psList, err := c.infoProvider.GetProcessList(cont.Name, psReqOpt)
+            if err == nil {
+                desc = prometheus.NewDesc("container_processes", "Number of processes running inside the container.", labels, nil)
+                ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(len(psList)), values...)
+
+                var fd int
+                for _, ps := range psList {
+                    fd += ps.FdCount
+                }
+                desc = prometheus.NewDesc("container_file_descriptors", "Number of open file descriptors for the container.", labels, nil)
+                ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(fd), values...)
+            }
         }
 
         // Now for the actual metrics
-        if len(container.Stats) == 0 {
+        if len(cont.Stats) == 0 {
             continue
         }
-        stats := container.Stats[0]
+        stats := cont.Stats[0]
         for _, cm := range c.containerMetrics {
-            if cm.condition != nil && !cm.condition(container.Spec) {
+            if cm.condition != nil && !cm.condition(cont.Spec) {
                 continue
             }
             desc := cm.desc(labels)
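Per container, the collector emits container_processes as the length of the returned process list and container_file_descriptors as the sum of the per-process FdCount values. The helper below is not part of the patch; it only restates that aggregation so the metric semantics are explicit (fd counts are summed across every process in the container):

```go
package main

import (
	"fmt"

	"github.com/google/cadvisor/info/v2"
)

// countProcessesAndFds restates the two gauge values emitted by the collector:
// container_processes = number of entries, container_file_descriptors = sum of FdCount.
func countProcessesAndFds(psList []v2.ProcessInfo) (processes, fds int) {
	processes = len(psList)
	for _, ps := range psList {
		fds += ps.FdCount
	}
	return processes, fds
}

func main() {
	// Two hypothetical processes in one container, with 5 and 3 open fds.
	psList := []v2.ProcessInfo{{Pid: 1, FdCount: 5}, {Pid: 42, FdCount: 3}}
	p, f := countProcessesAndFds(psList)
	fmt.Println(p, f) // 2 8
}
```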
diff --git a/metrics/prometheus_test.go b/metrics/prometheus_test.go
index c6af3a19..ddaab830 100644
--- a/metrics/prometheus_test.go
+++ b/metrics/prometheus_test.go
@@ -26,6 +26,7 @@ import (
 
     "github.com/google/cadvisor/container"
     info "github.com/google/cadvisor/info/v1"
+    "github.com/google/cadvisor/info/v2"
 
     "github.com/prometheus/client_golang/prometheus"
 )
@@ -49,6 +50,26 @@ func (p testSubcontainersInfoProvider) GetMachineInfo() (*info.MachineInfo, erro
     }, nil
 }
 
+func (p testSubcontainersInfoProvider) GetProcessList(containerName string, options v2.RequestOptions) ([]v2.ProcessInfo, error) {
+    return []v2.ProcessInfo{
+        {
+            User:          "user1",
+            Pid:           1,
+            Ppid:          2,
+            StartTime:     "OCT2018",
+            PercentCpu:    0.0,
+            PercentMemory: 0.0,
+            RSS:           3,
+            VirtualSize:   4,
+            Status:        "S",
+            RunningTime:   "00:00:00",
+            Cmd:           "cmd1",
+            CgroupPath:    "path",
+            FdCount:       5,
+        },
+    }, nil
+}
+
 var allMetrics = container.MetricSet{
     container.CpuUsageMetrics: struct{}{},
     container.ProcessSchedulerMetrics: struct{}{},
@@ -61,6 +82,7 @@ var allMetrics = container.MetricSet{
     container.NetworkUsageMetrics: struct{}{},
     container.NetworkTcpUsageMetrics: struct{}{},
     container.NetworkUdpUsageMetrics: struct{}{},
+    container.ProcessMetrics: struct{}{},
 }
 
 func (p testSubcontainersInfoProvider) SubcontainersInfo(string, *info.ContainerInfoRequest) ([]*info.ContainerInfo, error) {
@@ -305,6 +327,13 @@ func (p *erroringSubcontainersInfoProvider) GetMachineInfo() (*info.MachineInfo,
     return p.successfulProvider.GetMachineInfo()
 }
 
+func (p *erroringSubcontainersInfoProvider) GetProcessList(containerName string, options v2.RequestOptions) ([]v2.ProcessInfo, error) {
+    if p.shouldFail {
+        return nil, errors.New("Oops 2")
+    }
+    return p.successfulProvider.GetProcessList(containerName, options)
+}
+
 func (p *erroringSubcontainersInfoProvider) SubcontainersInfo(
     a string, r *info.ContainerInfoRequest) ([]*info.ContainerInfo, error) {
     if p.shouldFail {
diff --git a/metrics/testdata/prometheus_metrics b/metrics/testdata/prometheus_metrics
index d323fb7f..eb423059 100644
--- a/metrics/testdata/prometheus_metrics
+++ b/metrics/testdata/prometheus_metrics
@@ -46,6 +46,9 @@ container_cpu_usage_seconds_total{container_env_foo_env="prod",container_label_f
 # HELP container_cpu_user_seconds_total Cumulative user cpu time consumed in seconds.
 # TYPE container_cpu_user_seconds_total counter
 container_cpu_user_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 6e-09
+# HELP container_file_descriptors Number of open file descriptors for the container.
+# TYPE container_file_descriptors gauge
+container_file_descriptors{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 5
 # HELP container_fs_inodes_free Number of available Inodes
 # TYPE container_fs_inodes_free gauge
 container_fs_inodes_free{container_env_foo_env="prod",container_label_foo_label="bar",device="sda1",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 524288
@@ -182,6 +185,9 @@ container_network_udp_usage_total{container_env_foo_env="prod",container_label_f
 container_network_udp_usage_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",udp_state="listen",zone_name="hello"} 0
 container_network_udp_usage_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",udp_state="rxqueued",zone_name="hello"} 0
 container_network_udp_usage_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",udp_state="txqueued",zone_name="hello"} 0
+# HELP container_processes Number of processes running inside the container.
+# TYPE container_processes gauge
+container_processes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1
 # HELP container_scrape_error 1 if there was an error while getting container metrics, 0 otherwise
 # TYPE container_scrape_error gauge
 container_scrape_error 0
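After rebuilding with both patches, the two new families should appear on the Prometheus endpoint unless 'process' is listed in -disable_metrics. A quick spot-check from Go, assuming a local cadvisor on its default port 8080 and the /metrics path (adjust the address if your setup differs):

```go
package main

import (
	"bufio"
	"fmt"
	"log"
	"net/http"
	"strings"
)

func main() {
	// Scrape the local cadvisor endpoint and print the per-container
	// process and file-descriptor gauges added by this series.
	resp, err := http.Get("http://localhost:8080/metrics")
	if err != nil {
		log.Fatalf("scrape failed: %v", err)
	}
	defer resp.Body.Close()

	scanner := bufio.NewScanner(resp.Body)
	scanner.Buffer(make([]byte, 1024*1024), 1024*1024) // metric lines can be long
	for scanner.Scan() {
		line := scanner.Text()
		if strings.HasPrefix(line, "container_processes") ||
			strings.HasPrefix(line, "container_file_descriptors") {
			fmt.Println(line)
		}
	}
	if err := scanner.Err(); err != nil {
		log.Fatalf("reading response: %v", err)
	}
}
```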