From 4e25a7951f1c9ad97df5e39978808a4334b74a30 Mon Sep 17 00:00:00 2001 From: Clayton Coleman Date: Wed, 19 Apr 2017 01:08:59 -0400 Subject: [PATCH] Report container FS metrics into prometheus /metrics PerDiskStats reported from cgroups were not being surfaced into prometheus. In order to properly correlate the metrics, we need to assign a device label to each metric (which is the FS or device path). Since blkio cgroup tracks devices, we create a synthetic device `/dev/NAME` for the metric. Assign a Device label to each PerDiskStat for the handlers up front, and then surface the PerDiskStat values into the prometheus metrics. Report two new metrics - total bytes read and total bytes written. --- container/common/helpers.go | 68 ++++++++++++++++++ container/docker/handler.go | 14 ++-- container/raw/handler.go | 27 +++++++ container/rkt/handler.go | 13 ++-- info/v1/container.go | 7 +- info/v1/machine.go | 4 ++ machine/info.go | 2 +- metrics/prometheus.go | 138 ++++++++++++++++++++++++++++-------- 8 files changed, 230 insertions(+), 43 deletions(-) diff --git a/container/common/helpers.go b/container/common/helpers.go index 89090fb2..33d9db37 100644 --- a/container/common/helpers.go +++ b/container/common/helpers.go @@ -223,3 +223,71 @@ func ListContainers(name string, cgroupPaths map[string]string, listType contain return ret, nil } + +// AssignDeviceNamesToDiskStats assigns the Device field on the provided DiskIoStats by looking up +// the device major and minor identifiers in the provided device namer. +func AssignDeviceNamesToDiskStats(namer DeviceNamer, stats *info.DiskIoStats) { + assignDeviceNamesToPerDiskStats( + namer, + stats.IoMerged, + stats.IoQueued, + stats.IoServiceBytes, + stats.IoServiceTime, + stats.IoServiced, + stats.IoTime, + stats.IoWaitTime, + stats.Sectors, + ) +} + +// assignDeviceNamesToPerDiskStats looks up device names for the provided stats, caching names +// if necessary. 
+func assignDeviceNamesToPerDiskStats(namer DeviceNamer, diskStats ...[]info.PerDiskStats) { + devices := make(deviceIdentifierMap) + for _, stats := range diskStats { + for i, stat := range stats { + stats[i].Device = devices.Find(stat.Major, stat.Minor, namer) + } + } +} + +// DeviceNamer returns string names for devices by their major and minor id. +type DeviceNamer interface { + // DeviceName returns the name of the device by its major and minor ids, or false if no + // such device is recognized. + DeviceName(major, minor uint64) (string, bool) +} + +type MachineInfoNamer info.MachineInfo + +func (n *MachineInfoNamer) DeviceName(major, minor uint64) (string, bool) { + for _, info := range n.DiskMap { + if info.Major == major && info.Minor == minor { + return "/dev/" + info.Name, true + } + } + for _, info := range n.Filesystems { + if info.DeviceMajor == major && info.DeviceMinor == minor { + return info.Device, true + } + } + return "", false +} + +type deviceIdentifier struct { + major uint64 + minor uint64 +} + +type deviceIdentifierMap map[deviceIdentifier]string + +// Find locates the device name by device identifier using namer, caching the result as necessary. 
+func (m deviceIdentifierMap) Find(major, minor uint64, namer DeviceNamer) string { + d := deviceIdentifier{major, minor} + if s, ok := m[d]; ok { + return s + } + s, _ := namer.DeviceName(major, minor) + m[d] = s + return s +} diff --git a/container/docker/handler.go b/container/docker/handler.go index e1409326..4a3db6f9 100644 --- a/container/docker/handler.go +++ b/container/docker/handler.go @@ -390,6 +390,15 @@ func (self *dockerContainerHandler) GetSpec() (info.ContainerSpec, error) { } func (self *dockerContainerHandler) getFsStats(stats *info.ContainerStats) error { + mi, err := self.machineInfoFactory.GetMachineInfo() + if err != nil { + return err + } + + if !self.ignoreMetrics.Has(container.DiskIOMetrics) { + common.AssignDeviceNamesToDiskStats((*common.MachineInfoNamer)(mi), &stats.DiskIo) + } + if self.ignoreMetrics.Has(container.DiskUsageMetrics) { return nil } @@ -411,11 +420,6 @@ func (self *dockerContainerHandler) getFsStats(stats *info.ContainerStats) error return nil } - mi, err := self.machineInfoFactory.GetMachineInfo() - if err != nil { - return err - } - var ( limit uint64 fsType string diff --git a/container/raw/handler.go b/container/raw/handler.go index 62c5c8c8..35614afd 100644 --- a/container/raw/handler.go +++ b/container/raw/handler.go @@ -197,6 +197,7 @@ func fsToFsStats(fs *fs.Fs) info.FsStats { } func (self *rawContainerHandler) getFsStats(stats *info.ContainerStats) error { + var allFs []fs.Fs // Get Filesystem information only for the root cgroup. 
if isRootCgroup(self.name) { filesystems, err := self.fsInfo.GetGlobalFsInfo() @@ -207,6 +208,7 @@ func (self *rawContainerHandler) getFsStats(stats *info.ContainerStats) error { fs := filesystems[i] stats.Filesystem = append(stats.Filesystem, fsToFsStats(&fs)) } + allFs = filesystems } else if len(self.externalMounts) > 0 { var mountSet map[string]struct{} mountSet = make(map[string]struct{}) @@ -221,7 +223,10 @@ func (self *rawContainerHandler) getFsStats(stats *info.ContainerStats) error { fs := filesystems[i] stats.Filesystem = append(stats.Filesystem, fsToFsStats(&fs)) } + allFs = filesystems } + + common.AssignDeviceNamesToDiskStats(&fsNamer{fs: allFs, factory: self.machineInfoFactory}, &stats.DiskIo) return nil } @@ -272,3 +277,25 @@ func (self *rawContainerHandler) Exists() bool { func (self *rawContainerHandler) Type() container.ContainerType { return container.ContainerTypeRaw } + +type fsNamer struct { + fs []fs.Fs + factory info.MachineInfoFactory + info common.DeviceNamer +} + +func (n *fsNamer) DeviceName(major, minor uint64) (string, bool) { + for _, info := range n.fs { + if uint64(info.Major) == major && uint64(info.Minor) == minor { + return info.Device, true + } + } + if n.info == nil { + mi, err := n.factory.GetMachineInfo() + if err != nil { + return "", false + } + n.info = (*common.MachineInfoNamer)(mi) + } + return n.info.DeviceName(major, minor) +} diff --git a/container/rkt/handler.go b/container/rkt/handler.go index b5708aa8..9caa0ce4 100644 --- a/container/rkt/handler.go +++ b/container/rkt/handler.go @@ -202,6 +202,15 @@ func (handler *rktContainerHandler) GetSpec() (info.ContainerSpec, error) { } func (handler *rktContainerHandler) getFsStats(stats *info.ContainerStats) error { + mi, err := handler.machineInfoFactory.GetMachineInfo() + if err != nil { + return err + } + + if !handler.ignoreMetrics.Has(container.DiskIOMetrics) { + common.AssignDeviceNamesToDiskStats((*common.MachineInfoNamer)(mi), &stats.DiskIo) + } + if 
handler.ignoreMetrics.Has(container.DiskUsageMetrics) { return nil } @@ -211,10 +220,6 @@ func (handler *rktContainerHandler) getFsStats(stats *info.ContainerStats) error return err } - mi, err := handler.machineInfoFactory.GetMachineInfo() - if err != nil { - return err - } var limit uint64 = 0 // Use capacity as limit. diff --git a/info/v1/container.go b/info/v1/container.go index 6127d881..038db841 100644 --- a/info/v1/container.go +++ b/info/v1/container.go @@ -307,9 +307,10 @@ type CpuStats struct { } type PerDiskStats struct { - Major uint64 `json:"major"` - Minor uint64 `json:"minor"` - Stats map[string]uint64 `json:"stats"` + Device string `json:"-"` + Major uint64 `json:"major"` + Minor uint64 `json:"minor"` + Stats map[string]uint64 `json:"stats"` } type DiskIoStats struct { diff --git a/info/v1/machine.go b/info/v1/machine.go index c259e0ba..9395ceaa 100644 --- a/info/v1/machine.go +++ b/info/v1/machine.go @@ -17,6 +17,10 @@ package v1 type FsInfo struct { // Block device associated with the filesystem. Device string `json:"device"` + // DeviceMajor is the major identifier of the device, used for correlation with blkio stats + DeviceMajor uint64 `json:"-"` + // DeviceMinor is the minor identifier of the device, used for correlation with blkio stats + DeviceMinor uint64 `json:"-"` // Total number of bytes available on the filesystem. 
Capacity uint64 `json:"capacity"` diff --git a/machine/info.go b/machine/info.go index 0478218e..b08e9a8e 100644 --- a/machine/info.go +++ b/machine/info.go @@ -116,7 +116,7 @@ func Info(sysFs sysfs.SysFs, fsInfo fs.FsInfo, inHostNamespace bool) (*info.Mach if fs.Inodes != nil { inodes = *fs.Inodes } - machineInfo.Filesystems = append(machineInfo.Filesystems, info.FsInfo{Device: fs.Device, Type: fs.Type.String(), Capacity: fs.Capacity, Inodes: inodes, HasInodes: fs.Inodes != nil}) + machineInfo.Filesystems = append(machineInfo.Filesystems, info.FsInfo{Device: fs.Device, DeviceMajor: uint64(fs.Major), DeviceMinor: uint64(fs.Minor), Type: fs.Type.String(), Capacity: fs.Capacity, Inodes: inodes, HasInodes: fs.Inodes != nil}) } return machineInfo, nil diff --git a/metrics/prometheus.go b/metrics/prometheus.go index 6f65cb45..87d53e94 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -45,6 +45,14 @@ type metricValue struct { type metricValues []metricValue +// asFloat64 converts a uint64 into a float64. +func asFloat64(v uint64) float64 { return float64(v) } + +// asNanosecondsToSeconds converts nanoseconds into a float64 representing seconds. +func asNanosecondsToSeconds(v uint64) float64 { + return float64(v) / float64(time.Second) +} + // fsValues is a helper method for assembling per-filesystem stats. func fsValues(fsStats []info.FsStats, valueFn func(*info.FsStats) float64) metricValues { values := make(metricValues, 0, len(fsStats)) @@ -57,6 +65,24 @@ func fsValues(fsStats []info.FsStats, valueFn func(*info.FsStats) float64) metri return values } +// ioValues is a helper method for assembling per-disk and per-filesystem stats. 
+func ioValues(ioStats []info.PerDiskStats, ioType string, ioValueFn func(uint64) float64, fsStats []info.FsStats, valueFn func(*info.FsStats) float64) metricValues { + values := make(metricValues, 0, len(ioStats)+len(fsStats)) + for _, stat := range ioStats { + values = append(values, metricValue{ + value: ioValueFn(stat.Stats[ioType]), + labels: []string{stat.Device}, + }) + } + for _, stat := range fsStats { + values = append(values, metricValue{ + value: valueFn(&stat), + labels: []string{stat.Device}, + }) + } + return values +} + // containerMetric describes a multi-dimensional metric used for exposing a // certain type of container statistic. type containerMetric struct { @@ -270,15 +296,29 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc) *PrometheusCo return float64(fs.Usage) }) }, + }, { + name: "container_fs_reads_bytes_total", + help: "Cumulative count of bytes read", + valueType: prometheus.CounterValue, + extraLabels: []string{"device"}, + getValues: func(s *info.ContainerStats) metricValues { + return ioValues( + s.DiskIo.IoServiceBytes, "Read", asFloat64, + nil, nil, + ) + }, }, { name: "container_fs_reads_total", help: "Cumulative count of reads completed", valueType: prometheus.CounterValue, extraLabels: []string{"device"}, getValues: func(s *info.ContainerStats) metricValues { - return fsValues(s.Filesystem, func(fs *info.FsStats) float64 { - return float64(fs.ReadsCompleted) - }) + return ioValues( + s.DiskIo.IoServiced, "Read", asFloat64, + s.Filesystem, func(fs *info.FsStats) float64 { + return float64(fs.ReadsCompleted) + }, + ) }, }, { name: "container_fs_sector_reads_total", @@ -286,9 +326,12 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc) *PrometheusCo valueType: prometheus.CounterValue, extraLabels: []string{"device"}, getValues: func(s *info.ContainerStats) metricValues { - return fsValues(s.Filesystem, func(fs *info.FsStats) float64 { - return float64(fs.SectorsRead) - }) + return ioValues( + 
s.DiskIo.Sectors, "Read", asFloat64, + s.Filesystem, func(fs *info.FsStats) float64 { + return float64(fs.SectorsRead) + }, + ) }, }, { name: "container_fs_reads_merged_total", @@ -296,9 +339,12 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc) *PrometheusCo valueType: prometheus.CounterValue, extraLabels: []string{"device"}, getValues: func(s *info.ContainerStats) metricValues { - return fsValues(s.Filesystem, func(fs *info.FsStats) float64 { - return float64(fs.ReadsMerged) - }) + return ioValues( + s.DiskIo.IoMerged, "Read", asFloat64, + s.Filesystem, func(fs *info.FsStats) float64 { + return float64(fs.ReadsMerged) + }, + ) }, }, { name: "container_fs_read_seconds_total", @@ -306,9 +352,23 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc) *PrometheusCo valueType: prometheus.CounterValue, extraLabels: []string{"device"}, getValues: func(s *info.ContainerStats) metricValues { - return fsValues(s.Filesystem, func(fs *info.FsStats) float64 { - return float64(fs.ReadTime) / float64(time.Second) - }) + return ioValues( + s.DiskIo.IoServiceTime, "Read", asNanosecondsToSeconds, + s.Filesystem, func(fs *info.FsStats) float64 { + return float64(fs.ReadTime) / float64(time.Second) + }, + ) + }, + }, { + name: "container_fs_writes_bytes_total", + help: "Cumulative count of bytes written", + valueType: prometheus.CounterValue, + extraLabels: []string{"device"}, + getValues: func(s *info.ContainerStats) metricValues { + return ioValues( + s.DiskIo.IoServiceBytes, "Write", asFloat64, + nil, nil, + ) }, }, { name: "container_fs_writes_total", @@ -316,9 +376,12 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc) *PrometheusCo valueType: prometheus.CounterValue, extraLabels: []string{"device"}, getValues: func(s *info.ContainerStats) metricValues { - return fsValues(s.Filesystem, func(fs *info.FsStats) float64 { - return float64(fs.WritesCompleted) - }) + return ioValues( + s.DiskIo.IoServiced, "Write", asFloat64, + 
s.Filesystem, func(fs *info.FsStats) float64 { + return float64(fs.WritesCompleted) + }, + ) }, }, { name: "container_fs_sector_writes_total", @@ -326,9 +389,12 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc) *PrometheusCo valueType: prometheus.CounterValue, extraLabels: []string{"device"}, getValues: func(s *info.ContainerStats) metricValues { - return fsValues(s.Filesystem, func(fs *info.FsStats) float64 { - return float64(fs.SectorsWritten) - }) + return ioValues( + s.DiskIo.Sectors, "Write", asFloat64, + s.Filesystem, func(fs *info.FsStats) float64 { + return float64(fs.SectorsWritten) + }, + ) }, }, { name: "container_fs_writes_merged_total", @@ -336,9 +402,12 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc) *PrometheusCo valueType: prometheus.CounterValue, extraLabels: []string{"device"}, getValues: func(s *info.ContainerStats) metricValues { - return fsValues(s.Filesystem, func(fs *info.FsStats) float64 { - return float64(fs.WritesMerged) - }) + return ioValues( + s.DiskIo.IoMerged, "Write", asFloat64, + s.Filesystem, func(fs *info.FsStats) float64 { + return float64(fs.WritesMerged) + }, + ) }, }, { name: "container_fs_write_seconds_total", @@ -346,9 +415,12 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc) *PrometheusCo valueType: prometheus.CounterValue, extraLabels: []string{"device"}, getValues: func(s *info.ContainerStats) metricValues { - return fsValues(s.Filesystem, func(fs *info.FsStats) float64 { - return float64(fs.WriteTime) / float64(time.Second) - }) + return ioValues( + s.DiskIo.IoServiceTime, "Write", asNanosecondsToSeconds, + s.Filesystem, func(fs *info.FsStats) float64 { + return float64(fs.WriteTime) / float64(time.Second) + }, + ) }, }, { name: "container_fs_io_current", @@ -356,9 +428,12 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc) *PrometheusCo valueType: prometheus.GaugeValue, extraLabels: []string{"device"}, getValues: func(s *info.ContainerStats) 
metricValues { - return fsValues(s.Filesystem, func(fs *info.FsStats) float64 { - return float64(fs.IoInProgress) - }) + return ioValues( + s.DiskIo.IoQueued, "Total", asFloat64, + s.Filesystem, func(fs *info.FsStats) float64 { + return float64(fs.IoInProgress) + }, + ) }, }, { name: "container_fs_io_time_seconds_total", @@ -366,9 +441,12 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc) *PrometheusCo valueType: prometheus.CounterValue, extraLabels: []string{"device"}, getValues: func(s *info.ContainerStats) metricValues { - return fsValues(s.Filesystem, func(fs *info.FsStats) float64 { - return float64(float64(fs.IoTime) / float64(time.Second)) - }) + return ioValues( + s.DiskIo.IoServiceTime, "Total", asNanosecondsToSeconds, + s.Filesystem, func(fs *info.FsStats) float64 { + return float64(float64(fs.IoTime) / float64(time.Second)) + }, + ) }, }, { name: "container_fs_io_time_weighted_seconds_total",