diff --git a/cadvisor.go b/cadvisor.go index 9f7b8c81..6bc70cb3 100644 --- a/cadvisor.go +++ b/cadvisor.go @@ -43,6 +43,8 @@ var httpAuthRealm = flag.String("http_auth_realm", "localhost", "HTTP auth realm var httpDigestFile = flag.String("http_digest_file", "", "HTTP digest file for the web UI") var httpDigestRealm = flag.String("http_digest_realm", "localhost", "HTTP digest file for the web UI") +var prometheusEndpoint = flag.String("prometheus_endpoint", "/metrics", "Endpoint to expose Prometheus metrics on") + func main() { defer glog.Flush() flag.Parse() @@ -72,7 +74,7 @@ func main() { mux := http.DefaultServeMux // Register all HTTP handlers. - err = cadvisorHttp.RegisterHandlers(mux, containerManager, *httpAuthFile, *httpAuthRealm, *httpDigestFile, *httpDigestRealm) + err = cadvisorHttp.RegisterHandlers(mux, containerManager, *httpAuthFile, *httpAuthRealm, *httpDigestFile, *httpDigestRealm, *prometheusEndpoint) if err != nil { glog.Fatalf("Failed to register HTTP handlers: %v", err) } diff --git a/http/handlers.go b/http/handlers.go index 53d862d8..6ff5bad7 100644 --- a/http/handlers.go +++ b/http/handlers.go @@ -24,12 +24,14 @@ import ( "github.com/google/cadvisor/healthz" httpMux "github.com/google/cadvisor/http/mux" "github.com/google/cadvisor/manager" + "github.com/google/cadvisor/metrics" "github.com/google/cadvisor/pages" "github.com/google/cadvisor/pages/static" "github.com/google/cadvisor/validate" + "github.com/prometheus/client_golang/prometheus" ) -func RegisterHandlers(mux httpMux.Mux, containerManager manager.Manager, httpAuthFile, httpAuthRealm, httpDigestFile, httpDigestRealm string) error { +func RegisterHandlers(mux httpMux.Mux, containerManager manager.Manager, httpAuthFile, httpAuthRealm, httpDigestFile, httpDigestRealm, prometheusEndpoint string) error { // Basic health handler. if err := healthz.RegisterHandler(mux); err != nil { return fmt.Errorf("failed to register healthz handler: %s", err) @@ -83,6 +85,10 @@ func RegisterHandlers(mux httpMux.Mux, containerManager manager.Manager, httpAut } } + collector := metrics.NewPrometheusCollector(containerManager) + prometheus.MustRegister(collector) + http.Handle(prometheusEndpoint, prometheus.Handler()) + return nil } diff --git a/metrics/prometheus.go b/metrics/prometheus.go new file mode 100644 index 00000000..37b5f950 --- /dev/null +++ b/metrics/prometheus.go @@ -0,0 +1,366 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metrics + +import ( + "fmt" + "time" + + "github.com/golang/glog" + info "github.com/google/cadvisor/info/v1" + "github.com/google/cadvisor/manager" + "github.com/prometheus/client_golang/prometheus" +) + +type prometheusMetric struct { + valueType prometheus.ValueType + value float64 + labels []string +} + +// PrometheusCollector implements prometheus.Collector. +type PrometheusCollector struct { + manager manager.Manager + + errors prometheus.Gauge + lastSeen *prometheus.Desc + + cpuUsageUserSeconds *prometheus.Desc + cpuUsageSystemSeconds *prometheus.Desc + cpuUsageSecondsPerCPU *prometheus.Desc + + memoryUsageBytes *prometheus.Desc + memoryWorkingSet *prometheus.Desc + memoryFailures *prometheus.Desc + + fsLimit *prometheus.Desc + fsUsage *prometheus.Desc + fsReads *prometheus.Desc + fsReadsSectors *prometheus.Desc + fsReadsMerged *prometheus.Desc + fsReadTime *prometheus.Desc + + fsWrites *prometheus.Desc + fsWritesSectors *prometheus.Desc + fsWritesMerged *prometheus.Desc + fsWriteTime *prometheus.Desc + + fsIoInProgress *prometheus.Desc + fsIoTime *prometheus.Desc + + fsWeightedIoTime *prometheus.Desc + + networkRxBytes *prometheus.Desc + networkRxPackets *prometheus.Desc + networkRxErrors *prometheus.Desc + networkRxDropped *prometheus.Desc + networkTxBytes *prometheus.Desc + networkTxPackets *prometheus.Desc + networkTxErrors *prometheus.Desc + networkTxDropped *prometheus.Desc + + tasks *prometheus.Desc + + descs []*prometheus.Desc +} + +// NewPrometheusCollector returns a new PrometheusCollector. +func NewPrometheusCollector(manager manager.Manager) *PrometheusCollector { + c := &PrometheusCollector{ + manager: manager, + errors: prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: "container", + Name: "scrape_error", + Help: "1 if there was an error while getting container metrics, 0 otherwise", + }), + lastSeen: prometheus.NewDesc( + "container_last_seen", + "Last time a container was seen by the exporter", + []string{"name", "id"}, + nil), + cpuUsageUserSeconds: prometheus.NewDesc( + "container_cpu_user_seconds_total", + "Cumulative user cpu time consumed in seconds.", + []string{"name", "id"}, + nil), + cpuUsageSystemSeconds: prometheus.NewDesc( + "container_cpu_system_seconds_total", + "Cumulative system cpu time consumed in seconds.", + []string{"name", "id"}, + nil), + cpuUsageSecondsPerCPU: prometheus.NewDesc( + "container_cpu_usage_seconds_total", + "Cumulative cpu time consumed per cpu in seconds.", + []string{"name", "id", "cpu"}, + nil), + memoryUsageBytes: prometheus.NewDesc( + "container_memory_usage_bytes", + "Current memory usage in bytes.", + []string{"name", "id"}, + nil), + memoryWorkingSet: prometheus.NewDesc( + "container_memory_working_set_bytes", + "Current working set in bytes.", + []string{"name", "id"}, + nil), + memoryFailures: prometheus.NewDesc( + "container_memory_failures_total", + "Cumulative count of memory allocation failures.", + []string{"type", "scope", "name", "id"}, + nil), + + fsLimit: prometheus.NewDesc( + "container_fs_limit_bytes", + "Number of bytes that can be consumed by the container on this filesystem.", + []string{"name", "id", "device"}, + nil), + fsUsage: prometheus.NewDesc( + "container_fs_usage_bytes", + "Number of bytes that are consumed by the container on this filesystem.", + []string{"name", "id", "device"}, + nil), + fsReads: prometheus.NewDesc( + "container_fs_reads_total", + "Cumulative count of reads completed", + []string{"name", "id", "device"}, + nil), + fsReadsSectors: prometheus.NewDesc( + "container_fs_sector_reads_total", + "Cumulative count of sector reads completed", + []string{"name", "id", "device"}, + nil), + fsReadsMerged: prometheus.NewDesc( + "container_fs_reads_merged_total", + "Cumulative count of reads merged", + []string{"name", "id", "device"}, + nil), + fsReadTime: prometheus.NewDesc( + "container_fs_read_seconds_total", + "Cumulative count of seconds spent reading", + []string{"name", "id", "device"}, + nil), + fsWrites: prometheus.NewDesc( + "container_fs_writes_total", + "Cumulative count of writes completed", + []string{"name", "id", "device"}, + nil), + fsWritesSectors: prometheus.NewDesc( + "container_fs_sector_writes_total", + "Cumulative count of sector writes completed", + []string{"name", "id", "device"}, + nil), + fsWritesMerged: prometheus.NewDesc( + "container_fs_writes_merged_total", + "Cumulative count of writes merged", + []string{"name", "id", "device"}, + nil), + fsWriteTime: prometheus.NewDesc( + "container_fs_write_seconds_total", + "Cumulative count of seconds spent writing", + []string{"name", "id", "device"}, + nil), + fsIoInProgress: prometheus.NewDesc( + "container_fs_io_current", + "Number of I/Os currently in progress", + []string{"name", "id", "device"}, + nil), + fsIoTime: prometheus.NewDesc( + "container_fs_io_time_seconds_total", + "Cumulative count of seconds spent doing I/Os", + []string{"name", "id", "device"}, + nil), + fsWeightedIoTime: prometheus.NewDesc( + "container_fs_io_time_weighted_seconds_total", + "Cumulative count of seconds spent doing I/Os", + []string{"name", "id", "device"}, + nil), + networkRxBytes: prometheus.NewDesc( + "container_network_receive_bytes_total", + "Cumulative count of bytes received", + []string{"name", "id"}, + nil), + networkRxPackets: prometheus.NewDesc( + "container_network_receive_packets_total", + "Cumulative count of packets received", + []string{"name", "id"}, + nil), + networkRxDropped: prometheus.NewDesc( + "container_network_receive_packets_dropped_total", + "Cumulative count of bytes received", + []string{"name", "id"}, + nil), + networkRxErrors: prometheus.NewDesc( + "container_network_receive_errors_total", + "Cumulative count of errors encountered while receiving", + []string{"name", "id"}, + nil), + networkTxBytes: prometheus.NewDesc( + "container_network_transmit_bytes_total", + "Cumulative count of bytes transmitted", + []string{"name", "id"}, + nil), + networkTxPackets: prometheus.NewDesc( + "container_network_transmit_packets_total", + "Cumulative count of packets transmitted", + []string{"name", "id"}, + nil), + networkTxDropped: prometheus.NewDesc( + "container_network_transmit_packets_dropped_total", + "Cumulative count of bytes dropped", + []string{"name", "id"}, + nil), + networkTxErrors: prometheus.NewDesc( + "container_network_transmit_errors_total", + "Cumulative count of errors encountered while transmitting", + []string{"name", "id"}, + nil), + + tasks: prometheus.NewDesc( + "container_tasks_state", + "Number of tasks in given state", + []string{"state", "name", "id"}, + nil), + } + c.descs = []*prometheus.Desc{ + c.lastSeen, + + c.cpuUsageUserSeconds, + c.cpuUsageSystemSeconds, + + c.memoryUsageBytes, + c.memoryWorkingSet, + c.memoryFailures, + + c.fsLimit, + c.fsUsage, + c.fsReads, + c.fsReadsSectors, + c.fsReadsMerged, + c.fsReadTime, + c.fsWrites, + c.fsWritesSectors, + c.fsWritesMerged, + c.fsWriteTime, + c.fsIoInProgress, + c.fsIoTime, + c.fsWeightedIoTime, + + c.networkRxBytes, + c.networkRxPackets, + c.networkRxErrors, + c.networkRxDropped, + c.networkTxBytes, + c.networkTxPackets, + c.networkTxErrors, + c.networkTxDropped, + + c.tasks, + } + return c +} + +// Describe describes all the metrics ever exported by cadvisor. It +// implements prometheus.PrometheusCollector. +func (c *PrometheusCollector) Describe(ch chan<- *prometheus.Desc) { + c.errors.Describe(ch) + for _, d := range c.descs { + ch <- d + } +} + +// Collect fetches the stats from all containers and delivers them as +// Prometheus metrics. It implements prometheus.PrometheusCollector. +func (c *PrometheusCollector) Collect(ch chan<- prometheus.Metric) { + containers, err := c.manager.SubcontainersInfo("/", &info.ContainerInfoRequest{NumStats: 1}) + if err != nil { + c.errors.Set(1) + glog.Warning("Couldn't get containers: %s", err) + return + } + for _, container := range containers { + id := container.Name + name := id + if len(container.Aliases) > 0 { + name = container.Aliases[0] + } + stats := container.Stats[0] + + for desc, metrics := range map[*prometheus.Desc][]prometheusMetric{ + c.cpuUsageUserSeconds: {{valueType: prometheus.CounterValue, value: float64(stats.Cpu.Usage.User) / float64(time.Second)}}, + c.cpuUsageSystemSeconds: {{valueType: prometheus.CounterValue, value: float64(stats.Cpu.Usage.System) / float64(time.Second)}}, + + c.memoryFailures: { + {valueType: prometheus.CounterValue, labels: []string{"pgfault", "container"}, value: float64(stats.Memory.ContainerData.Pgfault)}, + {valueType: prometheus.CounterValue, labels: []string{"pgmajfault", "container"}, value: float64(stats.Memory.ContainerData.Pgmajfault)}, + {valueType: prometheus.CounterValue, labels: []string{"pgfault", "hierarchy"}, value: float64(stats.Memory.HierarchicalData.Pgfault)}, + {valueType: prometheus.CounterValue, labels: []string{"pgmajfault", "hierarchy"}, value: float64(stats.Memory.HierarchicalData.Pgmajfault)}, + }, + c.tasks: { + {valueType: prometheus.GaugeValue, labels: []string{"sleeping"}, value: float64(stats.TaskStats.NrSleeping)}, + {valueType: prometheus.GaugeValue, labels: []string{"running"}, value: float64(stats.TaskStats.NrRunning)}, + {valueType: prometheus.GaugeValue, labels: []string{"stopped"}, value: float64(stats.TaskStats.NrStopped)}, + {valueType: prometheus.GaugeValue, labels: []string{"uninterruptible"}, value: float64(stats.TaskStats.NrUinterruptible)}, + {valueType: prometheus.GaugeValue, labels: []string{"iowaiting"}, value: float64(stats.TaskStats.NrIoWait)}, + }, + + c.lastSeen: {{valueType: prometheus.GaugeValue, value: float64(time.Now().Unix())}}, + + c.memoryUsageBytes: {{valueType: prometheus.GaugeValue, value: float64(stats.Memory.Usage)}}, + c.memoryWorkingSet: {{valueType: prometheus.GaugeValue, value: float64(stats.Memory.WorkingSet)}}, + + c.networkRxBytes: {{valueType: prometheus.CounterValue, value: float64(stats.Network.RxBytes)}}, + c.networkRxPackets: {{valueType: prometheus.CounterValue, value: float64(stats.Network.RxPackets)}}, + c.networkRxErrors: {{valueType: prometheus.CounterValue, value: float64(stats.Network.RxErrors)}}, + c.networkRxDropped: {{valueType: prometheus.CounterValue, value: float64(stats.Network.RxDropped)}}, + c.networkTxBytes: {{valueType: prometheus.CounterValue, value: float64(stats.Network.TxBytes)}}, + c.networkTxPackets: {{valueType: prometheus.CounterValue, value: float64(stats.Network.TxPackets)}}, + c.networkTxErrors: {{valueType: prometheus.CounterValue, value: float64(stats.Network.TxErrors)}}, + c.networkTxDropped: {{valueType: prometheus.CounterValue, value: float64(stats.Network.TxDropped)}}, + } { + for _, m := range metrics { + ch <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue, float64(m.value), append(m.labels, name, id)...) + } + } + + // Metrics with dynamic labels + for i, value := range stats.Cpu.Usage.PerCpu { + ch <- prometheus.MustNewConstMetric(c.cpuUsageSecondsPerCPU, prometheus.CounterValue, float64(value)/float64(time.Second), name, id, fmt.Sprintf("cpu%02d", i)) + } + + for _, stat := range stats.Filesystem { + for desc, m := range map[*prometheus.Desc]prometheusMetric{ + c.fsReads: {valueType: prometheus.CounterValue, value: float64(stat.ReadsCompleted)}, + c.fsReadsSectors: {valueType: prometheus.CounterValue, value: float64(stat.SectorsRead)}, + c.fsReadsMerged: {valueType: prometheus.CounterValue, value: float64(stat.ReadsMerged)}, + c.fsReadTime: {valueType: prometheus.CounterValue, value: float64(stat.ReadTime) / float64(time.Second)}, + + c.fsWrites: {valueType: prometheus.CounterValue, value: float64(stat.WritesCompleted)}, + c.fsWritesSectors: {valueType: prometheus.CounterValue, value: float64(stat.SectorsWritten)}, + c.fsWritesMerged: {valueType: prometheus.CounterValue, value: float64(stat.WritesMerged)}, + c.fsWriteTime: {valueType: prometheus.CounterValue, value: float64(stat.WriteTime) / float64(time.Second)}, + + c.fsIoTime: {valueType: prometheus.CounterValue, value: float64(stat.IoInProgress) / float64(time.Second)}, + c.fsWeightedIoTime: {valueType: prometheus.CounterValue, value: float64(stat.IoTime) / float64(time.Second)}, + + c.fsIoInProgress: {valueType: prometheus.GaugeValue, value: float64(stat.IoInProgress)}, + c.fsLimit: {valueType: prometheus.GaugeValue, value: float64(stat.Limit)}, + c.fsUsage: {valueType: prometheus.GaugeValue, value: float64(stat.Usage)}, + } { + ch <- prometheus.MustNewConstMetric(desc, m.valueType, m.value, name, id, stat.Device) + } + } + } + c.errors.Collect(ch) +}