Extending Prometheus metrics by hardware metrics (#2444)
* Extending Prometheus metrics by hardware metrics: - machine_cpu_physical_cores - machine_cpu_sockets - machine_dimm_capacity_bytes - machine_dimm_count - machine_nvm_capacity Signed-off-by: Katarzyna Kujawa <katarzyna.kujawa@intel.com>
This commit is contained in:
parent
e0c7caafcc
commit
921d0824c4
@ -157,9 +157,10 @@ func main() {
|
||||
|
||||
collectorHttpClient := createCollectorHttpClient(*collectorCert, *collectorKey)
|
||||
|
||||
containerManager, err := manager.New(memoryStorage, sysFs, *maxHousekeepingInterval, *allowDynamicHousekeeping, includedMetrics, &collectorHttpClient, strings.Split(*rawCgroupPrefixWhiteList, ","))
|
||||
// Create a new manager for containers and machine
|
||||
resourceManager, err := manager.New(memoryStorage, sysFs, *maxHousekeepingInterval, *allowDynamicHousekeeping, includedMetrics, &collectorHttpClient, strings.Split(*rawCgroupPrefixWhiteList, ","))
|
||||
if err != nil {
|
||||
klog.Fatalf("Failed to create a Container Manager: %s", err)
|
||||
klog.Fatalf("Failed to create a manager: %s", err)
|
||||
}
|
||||
|
||||
mux := http.NewServeMux()
|
||||
@ -172,7 +173,7 @@ func main() {
|
||||
}
|
||||
|
||||
// Register all HTTP handlers.
|
||||
err = cadvisorhttp.RegisterHandlers(mux, containerManager, *httpAuthFile, *httpAuthRealm, *httpDigestFile, *httpDigestRealm, *urlBasePrefix)
|
||||
err = cadvisorhttp.RegisterHandlers(mux, resourceManager, *httpAuthFile, *httpAuthRealm, *httpDigestFile, *httpDigestRealm, *urlBasePrefix)
|
||||
if err != nil {
|
||||
klog.Fatalf("Failed to register HTTP handlers: %v", err)
|
||||
}
|
||||
@ -183,15 +184,16 @@ func main() {
|
||||
containerLabelFunc = metrics.BaseContainerLabels(whitelistedLabels)
|
||||
}
|
||||
|
||||
cadvisorhttp.RegisterPrometheusHandler(mux, containerManager, *prometheusEndpoint, containerLabelFunc, includedMetrics)
|
||||
// Register Prometheus collector to gather information about containers, Go runtime, processes, and machine
|
||||
cadvisorhttp.RegisterPrometheusHandler(mux, resourceManager, *prometheusEndpoint, containerLabelFunc, includedMetrics)
|
||||
|
||||
// Start the manager.
|
||||
if err := containerManager.Start(); err != nil {
|
||||
klog.Fatalf("Failed to start container manager: %v", err)
|
||||
if err := resourceManager.Start(); err != nil {
|
||||
klog.Fatalf("Failed to start manager: %v", err)
|
||||
}
|
||||
|
||||
// Install signal handler.
|
||||
installSignalHandler(containerManager)
|
||||
installSignalHandler(resourceManager)
|
||||
|
||||
klog.V(1).Infof("Starting cAdvisor version: %s-%s on port %d", version.Info["version"], version.Info["revision"], *argPort)
|
||||
|
||||
|
@ -92,11 +92,12 @@ func RegisterHandlers(mux httpmux.Mux, containerManager manager.Manager, httpAut
|
||||
|
||||
// RegisterPrometheusHandler creates a new PrometheusCollector and configures
|
||||
// the provided HTTP mux to handle the given Prometheus endpoint.
|
||||
func RegisterPrometheusHandler(mux httpmux.Mux, containerManager manager.Manager, prometheusEndpoint string,
|
||||
func RegisterPrometheusHandler(mux httpmux.Mux, resourceManager manager.Manager, prometheusEndpoint string,
|
||||
f metrics.ContainerLabelsFunc, includedMetrics container.MetricSet) {
|
||||
r := prometheus.NewRegistry()
|
||||
r.MustRegister(
|
||||
metrics.NewPrometheusCollector(containerManager, f, includedMetrics),
|
||||
metrics.NewPrometheusCollector(resourceManager, f, includedMetrics),
|
||||
metrics.NewPrometheusMachineCollector(resourceManager),
|
||||
prometheus.NewGoCollector(),
|
||||
prometheus.NewProcessCollector(prometheus.ProcessCollectorOpts{}),
|
||||
)
|
||||
|
@ -1,6 +1,6 @@
|
||||
# Monitoring cAdvisor with Prometheus
|
||||
|
||||
cAdvisor exposes container statistics as [Prometheus](https://prometheus.io) metrics out of the box. By default, these metrics are served under the `/metrics` HTTP endpoint. This endpoint may be customized by setting the `-prometheus_endpoint` command-line flag.
|
||||
cAdvisor exposes container and hardware statistics as [Prometheus](https://prometheus.io) metrics out of the box. By default, these metrics are served under the `/metrics` HTTP endpoint. This endpoint may be customized by setting the `-prometheus_endpoint` command-line flag.
|
||||
|
||||
To monitor cAdvisor with Prometheus, simply configure one or more jobs in Prometheus which scrape the relevant cAdvisor processes at that metrics endpoint. For details, see Prometheus's [Configuration](https://prometheus.io/docs/operating/configuration/) documentation, as well as the [Getting started](https://prometheus.io/docs/introduction/getting_started/) guide.
|
||||
|
||||
@ -10,9 +10,9 @@ To monitor cAdvisor with Prometheus, simply configure one or more jobs in Promet
|
||||
|
||||
* [vegasbrianc](https://github.com/vegasbrianc) provides a [starter project](https://github.com/vegasbrianc/prometheus) for cAdvisor and Prometheus monitoring, alongside a ready-to-use [Grafana dashboard](https://github.com/vegasbrianc/grafana_dashboard).
|
||||
|
||||
## Prometheus metrics
|
||||
## Prometheus container metrics
|
||||
|
||||
The table below lists the Prometheus metrics exposed by cAdvisor (in alphabetical order by metric name):
|
||||
The table below lists the Prometheus container metrics exposed by cAdvisor (in alphabetical order by metric name):
|
||||
|
||||
Metric name | Type | Description | Unit (where applicable)
|
||||
:-----------|:-----|:------------|:-----------------------
|
||||
@ -78,3 +78,17 @@ Metric name | Type | Description | Unit (where applicable)
|
||||
`container_spec_memory_reservation_limit_bytes` | Gauge | Memory reservation limit for the container | bytes
|
||||
`container_start_time_seconds` | Gauge | Start time of the container since unix epoch | seconds
|
||||
`container_tasks_state` | Gauge | Number of tasks in given state (`sleeping`, `running`, `stopped`, `uninterruptible`, or `ioawaiting`) |
|
||||
|
||||
## Prometheus hardware metrics
|
||||
|
||||
The table below lists the Prometheus hardware metrics exposed by cAdvisor (in alphabetical order by metric name):
|
||||
|
||||
Metric name | Type | Description | Unit (where applicable)
|
||||
:-----------|:-----|:------------|:-----------------------
|
||||
`machine_cpu_cores` | Gauge | Number of logical CPU cores |
|
||||
`machine_cpu_physical_cores` | Gauge | Number of physical CPU cores |
|
||||
`machine_cpu_sockets` | Gauge | Number of CPU sockets |
|
||||
`machine_dimm_capacity_bytes` | Gauge | Total RAM DIMM capacity (all types of memory modules), labeled by DIMM type.<br>Information is retrieved from the sysfs EDAC per-DIMM API (/sys/devices/system/edac/mc/), introduced in kernel 3.6 | bytes
|
||||
`machine_dimm_count` | Gauge | Number of RAM DIMMs (all types of memory modules), labeled by DIMM type.<br>Information is retrieved from the sysfs EDAC per-DIMM API (/sys/devices/system/edac/mc/), introduced in kernel 3.6 |
|
||||
`machine_memory_bytes` | Gauge | Amount of memory installed on the machine | bytes
|
||||
`machine_nvm_capacity` | Gauge | NVM capacity value labeled by NVM mode (memory mode or app direct mode) | bytes
|
||||
|
42
metrics/metrics.go
Normal file
42
metrics/metrics.go
Normal file
@ -0,0 +1,42 @@
|
||||
// Copyright 2020 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
info "github.com/google/cadvisor/info/v1"
|
||||
)
|
||||
|
||||
// metricValue describes a single metric value for a given set of label values
// within a parent metric (for example a containerMetric or a machineMetric).
type metricValue struct {
	// value is the sample value reported for the metric.
	value float64
	// labels holds the label values attached to this particular sample.
	labels []string
	// timestamp is the time associated with the sample.
	// NOTE(review): the machine metrics leave this unset — confirm how the
	// container metrics use it.
	timestamp time.Time
}
|
||||
|
||||
type metricValues []metricValue
|
||||
|
||||
// infoProvider will usually be manager.Manager, but can be swapped out for testing.
// It is the data source the Prometheus collectors in this package read from.
type infoProvider interface {
	// SubcontainersInfo provides information about all subcontainers of the
	// specified container including itself.
	SubcontainersInfo(containerName string, query *info.ContainerInfoRequest) ([]*info.ContainerInfo, error)
	// GetVersionInfo provides information about the version.
	GetVersionInfo() (*info.VersionInfo, error)
	// GetMachineInfo provides information about the machine.
	GetMachineInfo() (*info.MachineInfo, error)
}
|
@ -25,27 +25,6 @@ import (
|
||||
"k8s.io/klog"
|
||||
)
|
||||
|
||||
// infoProvider will usually be manager.Manager, but can be swapped out for testing.
|
||||
type infoProvider interface {
|
||||
// SubcontainersInfo provides information about all subcontainers of the
|
||||
// specified container including itself.
|
||||
SubcontainersInfo(containerName string, query *info.ContainerInfoRequest) ([]*info.ContainerInfo, error)
|
||||
// GetVersionInfo provides information about the version.
|
||||
GetVersionInfo() (*info.VersionInfo, error)
|
||||
// GetMachineInfo provides information about the machine.
|
||||
GetMachineInfo() (*info.MachineInfo, error)
|
||||
}
|
||||
|
||||
// metricValue describes a single metric value for a given set of label values
|
||||
// within a parent containerMetric.
|
||||
type metricValue struct {
|
||||
value float64
|
||||
labels []string
|
||||
timestamp time.Time
|
||||
}
|
||||
|
||||
type metricValues []metricValue
|
||||
|
||||
// asFloat64 converts a uint64 into a float64.
|
||||
func asFloat64(v uint64) float64 { return float64(v) }
|
||||
|
||||
@ -1568,11 +1547,7 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
|
||||
return c
|
||||
}
|
||||
|
||||
var (
|
||||
versionInfoDesc = prometheus.NewDesc("cadvisor_version_info", "A metric with a constant '1' value labeled by kernel version, OS version, docker version, cadvisor version & cadvisor revision.", []string{"kernelVersion", "osVersion", "dockerVersion", "cadvisorVersion", "cadvisorRevision"}, nil)
|
||||
machineInfoCoresDesc = prometheus.NewDesc("machine_cpu_cores", "Number of CPU cores on the machine.", nil, nil)
|
||||
machineInfoMemoryDesc = prometheus.NewDesc("machine_memory_bytes", "Amount of memory installed on the machine.", nil, nil)
|
||||
)
|
||||
var versionInfoDesc = prometheus.NewDesc("cadvisor_version_info", "A metric with a constant '1' value labeled by kernel version, OS version, docker version, cadvisor version & cadvisor revision.", []string{"kernelVersion", "osVersion", "dockerVersion", "cadvisorVersion", "cadvisorRevision"}, nil)
|
||||
|
||||
// Describe describes all the metrics ever exported by cadvisor. It
|
||||
// implements prometheus.PrometheusCollector.
|
||||
@ -1582,15 +1557,12 @@ func (c *PrometheusCollector) Describe(ch chan<- *prometheus.Desc) {
|
||||
ch <- cm.desc([]string{})
|
||||
}
|
||||
ch <- versionInfoDesc
|
||||
ch <- machineInfoCoresDesc
|
||||
ch <- machineInfoMemoryDesc
|
||||
}
|
||||
|
||||
// Collect fetches the stats from all containers and delivers them as
|
||||
// Prometheus metrics. It implements prometheus.PrometheusCollector.
|
||||
func (c *PrometheusCollector) Collect(ch chan<- prometheus.Metric) {
|
||||
c.errors.Set(0)
|
||||
c.collectMachineInfo(ch)
|
||||
c.collectVersionInfo(ch)
|
||||
c.collectContainersInfo(ch)
|
||||
c.errors.Collect(ch)
|
||||
@ -1758,17 +1730,6 @@ func (c *PrometheusCollector) collectVersionInfo(ch chan<- prometheus.Metric) {
|
||||
ch <- prometheus.MustNewConstMetric(versionInfoDesc, prometheus.GaugeValue, 1, []string{versionInfo.KernelVersion, versionInfo.ContainerOsVersion, versionInfo.DockerVersion, versionInfo.CadvisorVersion, versionInfo.CadvisorRevision}...)
|
||||
}
|
||||
|
||||
func (c *PrometheusCollector) collectMachineInfo(ch chan<- prometheus.Metric) {
|
||||
machineInfo, err := c.infoProvider.GetMachineInfo()
|
||||
if err != nil {
|
||||
c.errors.Set(1)
|
||||
klog.Warningf("Couldn't get machine info: %s", err)
|
||||
return
|
||||
}
|
||||
ch <- prometheus.MustNewConstMetric(machineInfoCoresDesc, prometheus.GaugeValue, float64(machineInfo.NumCores))
|
||||
ch <- prometheus.MustNewConstMetric(machineInfoMemoryDesc, prometheus.GaugeValue, float64(machineInfo.MemoryCapacity))
|
||||
}
|
||||
|
||||
// Size after which we consider memory to be "unlimited". This is not
|
||||
// MaxInt64 due to rounding by the kernel.
|
||||
const maxMemorySize = uint64(1 << 62)
|
||||
|
445
metrics/prometheus_fake.go
Normal file
445
metrics/prometheus_fake.go
Normal file
@ -0,0 +1,445 @@
|
||||
// Copyright 2020 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"time"
|
||||
|
||||
info "github.com/google/cadvisor/info/v1"
|
||||
)
|
||||
|
||||
// testSubcontainersInfoProvider is a fake info provider that returns fixed
// version, machine and container data.
type testSubcontainersInfoProvider struct{}
|
||||
|
||||
func (p testSubcontainersInfoProvider) GetVersionInfo() (*info.VersionInfo, error) {
|
||||
return &info.VersionInfo{
|
||||
KernelVersion: "4.1.6-200.fc22.x86_64",
|
||||
ContainerOsVersion: "Fedora 22 (Twenty Two)",
|
||||
DockerVersion: "1.8.1",
|
||||
CadvisorVersion: "0.16.0",
|
||||
CadvisorRevision: "abcdef",
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (p testSubcontainersInfoProvider) GetMachineInfo() (*info.MachineInfo, error) {
|
||||
return &info.MachineInfo{
|
||||
NumCores: 4,
|
||||
NumPhysicalCores: 1,
|
||||
NumSockets: 1,
|
||||
MemoryCapacity: 1024,
|
||||
MemoryByType: map[string]*info.MemoryInfo{
|
||||
"Non-volatile-RAM": {Capacity: 2168421613568, DimmCount: 8},
|
||||
"Unbuffered-DDR4": {Capacity: 412316860416, DimmCount: 12},
|
||||
},
|
||||
NVMInfo: info.NVMInfo{
|
||||
MemoryModeCapacity: 429496729600,
|
||||
AppDirectModeCapacity: 1735166787584,
|
||||
},
|
||||
MachineID: "machine-id-test",
|
||||
SystemUUID: "system-uuid-test",
|
||||
BootID: "boot-id-test",
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (p testSubcontainersInfoProvider) SubcontainersInfo(string, *info.ContainerInfoRequest) ([]*info.ContainerInfo, error) {
|
||||
return []*info.ContainerInfo{
|
||||
{
|
||||
ContainerReference: info.ContainerReference{
|
||||
Name: "testcontainer",
|
||||
Aliases: []string{"testcontaineralias"},
|
||||
},
|
||||
Spec: info.ContainerSpec{
|
||||
Image: "test",
|
||||
HasCpu: true,
|
||||
Cpu: info.CpuSpec{
|
||||
Limit: 1000,
|
||||
Period: 100000,
|
||||
Quota: 10000,
|
||||
},
|
||||
Memory: info.MemorySpec{
|
||||
Limit: 2048,
|
||||
Reservation: 1024,
|
||||
SwapLimit: 4096,
|
||||
},
|
||||
HasHugetlb: true,
|
||||
HasProcesses: true,
|
||||
Processes: info.ProcessSpec{
|
||||
Limit: 100,
|
||||
},
|
||||
CreationTime: time.Unix(1257894000, 0),
|
||||
Labels: map[string]string{
|
||||
"foo.label": "bar",
|
||||
},
|
||||
Envs: map[string]string{
|
||||
"foo+env": "prod",
|
||||
},
|
||||
},
|
||||
Stats: []*info.ContainerStats{
|
||||
{
|
||||
Timestamp: time.Unix(1395066363, 0),
|
||||
Cpu: info.CpuStats{
|
||||
Usage: info.CpuUsage{
|
||||
Total: 1,
|
||||
PerCpu: []uint64{2, 3, 4, 5},
|
||||
User: 6,
|
||||
System: 7,
|
||||
},
|
||||
CFS: info.CpuCFS{
|
||||
Periods: 723,
|
||||
ThrottledPeriods: 18,
|
||||
ThrottledTime: 1724314000,
|
||||
},
|
||||
Schedstat: info.CpuSchedstat{
|
||||
RunTime: 53643567,
|
||||
RunqueueTime: 479424566378,
|
||||
RunPeriods: 984285,
|
||||
},
|
||||
LoadAverage: 2,
|
||||
},
|
||||
Memory: info.MemoryStats{
|
||||
Usage: 8,
|
||||
MaxUsage: 8,
|
||||
WorkingSet: 9,
|
||||
ContainerData: info.MemoryStatsMemoryData{
|
||||
Pgfault: 10,
|
||||
Pgmajfault: 11,
|
||||
},
|
||||
HierarchicalData: info.MemoryStatsMemoryData{
|
||||
Pgfault: 12,
|
||||
Pgmajfault: 13,
|
||||
},
|
||||
Cache: 14,
|
||||
RSS: 15,
|
||||
MappedFile: 16,
|
||||
Swap: 8192,
|
||||
},
|
||||
Hugetlb: map[string]info.HugetlbStats{
|
||||
"2Mi": {
|
||||
Usage: 4,
|
||||
MaxUsage: 10,
|
||||
Failcnt: 1,
|
||||
},
|
||||
"1Gi": {
|
||||
Usage: 0,
|
||||
MaxUsage: 0,
|
||||
Failcnt: 0,
|
||||
},
|
||||
},
|
||||
Network: info.NetworkStats{
|
||||
InterfaceStats: info.InterfaceStats{
|
||||
Name: "eth0",
|
||||
RxBytes: 14,
|
||||
RxPackets: 15,
|
||||
RxErrors: 16,
|
||||
RxDropped: 17,
|
||||
TxBytes: 18,
|
||||
TxPackets: 19,
|
||||
TxErrors: 20,
|
||||
TxDropped: 21,
|
||||
},
|
||||
Interfaces: []info.InterfaceStats{
|
||||
{
|
||||
Name: "eth0",
|
||||
RxBytes: 14,
|
||||
RxPackets: 15,
|
||||
RxErrors: 16,
|
||||
RxDropped: 17,
|
||||
TxBytes: 18,
|
||||
TxPackets: 19,
|
||||
TxErrors: 20,
|
||||
TxDropped: 21,
|
||||
},
|
||||
},
|
||||
Tcp: info.TcpStat{
|
||||
Established: 13,
|
||||
SynSent: 0,
|
||||
SynRecv: 0,
|
||||
FinWait1: 0,
|
||||
FinWait2: 0,
|
||||
TimeWait: 0,
|
||||
Close: 0,
|
||||
CloseWait: 0,
|
||||
LastAck: 0,
|
||||
Listen: 3,
|
||||
Closing: 0,
|
||||
},
|
||||
Tcp6: info.TcpStat{
|
||||
Established: 11,
|
||||
SynSent: 0,
|
||||
SynRecv: 0,
|
||||
FinWait1: 0,
|
||||
FinWait2: 0,
|
||||
TimeWait: 0,
|
||||
Close: 0,
|
||||
CloseWait: 0,
|
||||
LastAck: 0,
|
||||
Listen: 3,
|
||||
Closing: 0,
|
||||
},
|
||||
TcpAdvanced: info.TcpAdvancedStat{
|
||||
TCPFullUndo: 2361,
|
||||
TCPMD5NotFound: 0,
|
||||
TCPDSACKRecv: 83680,
|
||||
TCPSackShifted: 2,
|
||||
TCPSackShiftFallback: 298,
|
||||
PFMemallocDrop: 0,
|
||||
EstabResets: 37,
|
||||
InSegs: 140370590,
|
||||
TCPPureAcks: 24251339,
|
||||
TCPDSACKOldSent: 15633,
|
||||
IPReversePathFilter: 0,
|
||||
TCPFastOpenPassiveFail: 0,
|
||||
InCsumErrors: 0,
|
||||
TCPRenoFailures: 43414,
|
||||
TCPMemoryPressuresChrono: 0,
|
||||
TCPDeferAcceptDrop: 0,
|
||||
TW: 10436427,
|
||||
TCPSpuriousRTOs: 0,
|
||||
TCPDSACKIgnoredNoUndo: 71885,
|
||||
RtoMax: 120000,
|
||||
ActiveOpens: 11038621,
|
||||
EmbryonicRsts: 0,
|
||||
RcvPruned: 0,
|
||||
TCPLossProbeRecovery: 401,
|
||||
TCPHPHits: 56096478,
|
||||
TCPPartialUndo: 3,
|
||||
TCPAbortOnMemory: 0,
|
||||
AttemptFails: 48997,
|
||||
RetransSegs: 462961,
|
||||
SyncookiesFailed: 0,
|
||||
OfoPruned: 0,
|
||||
TCPAbortOnLinger: 0,
|
||||
TCPAbortFailed: 0,
|
||||
TCPRenoReorder: 839,
|
||||
TCPRcvCollapsed: 0,
|
||||
TCPDSACKIgnoredOld: 0,
|
||||
TCPReqQFullDrop: 0,
|
||||
OutOfWindowIcmps: 0,
|
||||
TWKilled: 0,
|
||||
TCPLossProbes: 88648,
|
||||
TCPRenoRecoveryFail: 394,
|
||||
TCPFastOpenCookieReqd: 0,
|
||||
TCPHPAcks: 21490641,
|
||||
TCPSACKReneging: 0,
|
||||
TCPTSReorder: 3,
|
||||
TCPSlowStartRetrans: 290832,
|
||||
MaxConn: -1,
|
||||
SyncookiesRecv: 0,
|
||||
TCPSackFailures: 60,
|
||||
DelayedACKLocked: 90,
|
||||
TCPDSACKOfoSent: 1,
|
||||
TCPSynRetrans: 988,
|
||||
TCPDSACKOfoRecv: 10,
|
||||
TCPSACKDiscard: 0,
|
||||
TCPMD5Unexpected: 0,
|
||||
TCPSackMerged: 6,
|
||||
RtoMin: 200,
|
||||
CurrEstab: 22,
|
||||
TCPTimeWaitOverflow: 0,
|
||||
ListenOverflows: 0,
|
||||
DelayedACKs: 503975,
|
||||
TCPLossUndo: 61374,
|
||||
TCPOrigDataSent: 130698387,
|
||||
TCPBacklogDrop: 0,
|
||||
TCPReqQFullDoCookies: 0,
|
||||
TCPFastOpenPassive: 0,
|
||||
PAWSActive: 0,
|
||||
OutRsts: 91699,
|
||||
TCPSackRecoveryFail: 2,
|
||||
DelayedACKLost: 18843,
|
||||
TCPAbortOnData: 8,
|
||||
TCPMinTTLDrop: 0,
|
||||
PruneCalled: 0,
|
||||
TWRecycled: 0,
|
||||
ListenDrops: 0,
|
||||
TCPAbortOnTimeout: 0,
|
||||
SyncookiesSent: 0,
|
||||
TCPSACKReorder: 11,
|
||||
TCPDSACKUndo: 33,
|
||||
TCPMD5Failure: 0,
|
||||
TCPLostRetransmit: 0,
|
||||
TCPAbortOnClose: 7,
|
||||
TCPFastOpenListenOverflow: 0,
|
||||
OutSegs: 211580512,
|
||||
InErrs: 31,
|
||||
TCPTimeouts: 27422,
|
||||
TCPLossFailures: 729,
|
||||
TCPSackRecovery: 159,
|
||||
RtoAlgorithm: 1,
|
||||
PassiveOpens: 59,
|
||||
LockDroppedIcmps: 0,
|
||||
TCPRenoRecovery: 3519,
|
||||
TCPFACKReorder: 0,
|
||||
TCPFastRetrans: 11794,
|
||||
TCPRetransFail: 0,
|
||||
TCPMemoryPressures: 0,
|
||||
TCPFastOpenActive: 0,
|
||||
TCPFastOpenActiveFail: 0,
|
||||
PAWSEstab: 0,
|
||||
},
|
||||
Udp: info.UdpStat{
|
||||
Listen: 0,
|
||||
Dropped: 0,
|
||||
RxQueued: 0,
|
||||
TxQueued: 0,
|
||||
},
|
||||
Udp6: info.UdpStat{
|
||||
Listen: 0,
|
||||
Dropped: 0,
|
||||
RxQueued: 0,
|
||||
TxQueued: 0,
|
||||
},
|
||||
},
|
||||
Filesystem: []info.FsStats{
|
||||
{
|
||||
Device: "sda1",
|
||||
InodesFree: 524288,
|
||||
Inodes: 2097152,
|
||||
Limit: 22,
|
||||
Usage: 23,
|
||||
ReadsCompleted: 24,
|
||||
ReadsMerged: 25,
|
||||
SectorsRead: 26,
|
||||
ReadTime: 27,
|
||||
WritesCompleted: 28,
|
||||
WritesMerged: 39,
|
||||
SectorsWritten: 40,
|
||||
WriteTime: 41,
|
||||
IoInProgress: 42,
|
||||
IoTime: 43,
|
||||
WeightedIoTime: 44,
|
||||
},
|
||||
{
|
||||
Device: "sda2",
|
||||
InodesFree: 262144,
|
||||
Inodes: 2097152,
|
||||
Limit: 37,
|
||||
Usage: 38,
|
||||
ReadsCompleted: 39,
|
||||
ReadsMerged: 40,
|
||||
SectorsRead: 41,
|
||||
ReadTime: 42,
|
||||
WritesCompleted: 43,
|
||||
WritesMerged: 44,
|
||||
SectorsWritten: 45,
|
||||
WriteTime: 46,
|
||||
IoInProgress: 47,
|
||||
IoTime: 48,
|
||||
WeightedIoTime: 49,
|
||||
},
|
||||
},
|
||||
Accelerators: []info.AcceleratorStats{
|
||||
{
|
||||
Make: "nvidia",
|
||||
Model: "tesla-p100",
|
||||
ID: "GPU-deadbeef-1234-5678-90ab-feedfacecafe",
|
||||
MemoryTotal: 20304050607,
|
||||
MemoryUsed: 2030405060,
|
||||
DutyCycle: 12,
|
||||
},
|
||||
{
|
||||
Make: "nvidia",
|
||||
Model: "tesla-k80",
|
||||
ID: "GPU-deadbeef-0123-4567-89ab-feedfacecafe",
|
||||
MemoryTotal: 10203040506,
|
||||
MemoryUsed: 1020304050,
|
||||
DutyCycle: 6,
|
||||
},
|
||||
},
|
||||
Processes: info.ProcessStats{
|
||||
ProcessCount: 1,
|
||||
FdCount: 5,
|
||||
SocketCount: 3,
|
||||
ThreadsCurrent: 5,
|
||||
ThreadsMax: 100,
|
||||
Ulimits: []info.UlimitSpec{
|
||||
{
|
||||
Name: "max_open_files",
|
||||
SoftLimit: 16384,
|
||||
HardLimit: 16384,
|
||||
},
|
||||
},
|
||||
},
|
||||
TaskStats: info.LoadStats{
|
||||
NrSleeping: 50,
|
||||
NrRunning: 51,
|
||||
NrStopped: 52,
|
||||
NrUninterruptible: 53,
|
||||
NrIoWait: 54,
|
||||
},
|
||||
CustomMetrics: map[string][]info.MetricVal{
|
||||
"container_custom_app_metric_1": {
|
||||
{
|
||||
FloatValue: float64(1.1),
|
||||
Timestamp: time.Now(),
|
||||
Label: "testlabel_1_1_1",
|
||||
Labels: map[string]string{"test_label": "1_1", "test_label_2": "2_1"},
|
||||
},
|
||||
{
|
||||
FloatValue: float64(1.2),
|
||||
Timestamp: time.Now(),
|
||||
Label: "testlabel_1_1_2",
|
||||
Labels: map[string]string{"test_label": "1_2", "test_label_2": "2_2"},
|
||||
},
|
||||
},
|
||||
"container_custom_app_metric_2": {
|
||||
{
|
||||
FloatValue: float64(2),
|
||||
Timestamp: time.Now(),
|
||||
Label: "testlabel2",
|
||||
Labels: map[string]string{"test_label": "test_value"},
|
||||
},
|
||||
},
|
||||
"container_custom_app_metric_3": {
|
||||
{
|
||||
FloatValue: float64(3),
|
||||
Timestamp: time.Now(),
|
||||
Label: "testlabel3",
|
||||
Labels: map[string]string{"test_label": "test_value"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// erroringSubcontainersInfoProvider wraps testSubcontainersInfoProvider and
// makes every method return an error while shouldFail is set.
type erroringSubcontainersInfoProvider struct {
	// successfulProvider supplies the results returned when shouldFail is false.
	successfulProvider testSubcontainersInfoProvider
	// shouldFail makes every method return an error instead of delegating.
	shouldFail bool
}
|
||||
|
||||
func (p *erroringSubcontainersInfoProvider) GetVersionInfo() (*info.VersionInfo, error) {
|
||||
if p.shouldFail {
|
||||
return nil, errors.New("Oops 1")
|
||||
}
|
||||
return p.successfulProvider.GetVersionInfo()
|
||||
}
|
||||
|
||||
func (p *erroringSubcontainersInfoProvider) GetMachineInfo() (*info.MachineInfo, error) {
|
||||
if p.shouldFail {
|
||||
return nil, errors.New("Oops 2")
|
||||
}
|
||||
return p.successfulProvider.GetMachineInfo()
|
||||
}
|
||||
|
||||
func (p *erroringSubcontainersInfoProvider) SubcontainersInfo(
|
||||
a string, r *info.ContainerInfoRequest) ([]*info.ContainerInfo, error) {
|
||||
if p.shouldFail {
|
||||
return []*info.ContainerInfo{}, errors.New("Oops 3")
|
||||
}
|
||||
return p.successfulProvider.SubcontainersInfo(a, r)
|
||||
}
|
199
metrics/prometheus_machine.go
Normal file
199
metrics/prometheus_machine.go
Normal file
@ -0,0 +1,199 @@
|
||||
// Copyright 2020 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package metrics
|
||||
|
||||
import (
|
||||
info "github.com/google/cadvisor/info/v1"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
|
||||
"k8s.io/klog"
|
||||
)
|
||||
|
||||
// baseLabelsNames lists the label names attached to every machine metric at
// collection time; the matching values are the machine ID, system UUID and
// boot ID read from the machine info.
var baseLabelsNames = []string{"machine_id", "system_uuid", "boot_id"}

const (
	// Names of the extra labels used by metrics that are split by NVM mode
	// or by memory (DIMM) type.
	prometheusModeLabelName = "mode"
	prometheusTypeLabelName = "type"

	// Values of the "mode" label on the NVM capacity metric.
	nvmMemoryMode    = "memory_mode"
	nvmAppDirectMode = "app_direct_mode"

	// Property selectors accepted by getMemoryByType.
	memoryByTypeDimmCountKey    = "DimmCount"
	memoryByTypeDimmCapacityKey = "Capacity"
)
|
||||
|
||||
// machineMetric describes a multi-dimensional metric used for exposing a
// certain type of machine statistic.
type machineMetric struct {
	// name is the fully-qualified Prometheus metric name.
	name string
	// help is the help text attached to the metric descriptor.
	help string
	// valueType is the Prometheus value type the metric is exported as.
	valueType prometheus.ValueType
	// extraLabels names the labels this metric carries in addition to the
	// base labels; the matching values come from metricValue.labels.
	extraLabels []string
	// condition, when non-nil, must return true for the metric to be
	// collected for the given machine info.
	condition func(machineInfo *info.MachineInfo) bool
	// getValues extracts the metric's values from the machine info.
	getValues func(machineInfo *info.MachineInfo) metricValues
}
|
||||
|
||||
func (metric *machineMetric) desc(baseLabels []string) *prometheus.Desc {
|
||||
return prometheus.NewDesc(metric.name, metric.help, append(baseLabels, metric.extraLabels...), nil)
|
||||
}
|
||||
|
||||
// PrometheusMachineCollector implements prometheus.Collector.
type PrometheusMachineCollector struct {
	// infoProvider is the source of the machine info exported as metrics.
	infoProvider infoProvider
	// errors is set to 1 when the last collection could not get the machine
	// info, and to 0 otherwise.
	errors prometheus.Gauge
	// machineMetrics is the list of metrics exported by the collector.
	machineMetrics []machineMetric
}
|
||||
|
||||
// NewPrometheusMachineCollector returns a new PrometheusCollector.
|
||||
func NewPrometheusMachineCollector(i infoProvider) *PrometheusMachineCollector {
|
||||
c := &PrometheusMachineCollector{
|
||||
infoProvider: i,
|
||||
errors: prometheus.NewGauge(prometheus.GaugeOpts{
|
||||
Namespace: "machine",
|
||||
Name: "scrape_error",
|
||||
Help: "1 if there was an error while getting machine metrics, 0 otherwise.",
|
||||
}),
|
||||
machineMetrics: []machineMetric{
|
||||
{
|
||||
name: "machine_cpu_physical_cores",
|
||||
help: "Number of physical CPU cores.",
|
||||
valueType: prometheus.GaugeValue,
|
||||
getValues: func(machineInfo *info.MachineInfo) metricValues {
|
||||
return metricValues{{value: float64(machineInfo.NumPhysicalCores)}}
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "machine_cpu_cores",
|
||||
help: "Number of logical CPU cores.",
|
||||
valueType: prometheus.GaugeValue,
|
||||
getValues: func(machineInfo *info.MachineInfo) metricValues {
|
||||
return metricValues{{value: float64(machineInfo.NumCores)}}
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "machine_cpu_sockets",
|
||||
help: "Number of CPU sockets.",
|
||||
valueType: prometheus.GaugeValue,
|
||||
getValues: func(machineInfo *info.MachineInfo) metricValues {
|
||||
return metricValues{{value: float64(machineInfo.NumSockets)}}
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "machine_memory_bytes",
|
||||
help: "Amount of memory installed on the machine.",
|
||||
valueType: prometheus.GaugeValue,
|
||||
getValues: func(machineInfo *info.MachineInfo) metricValues {
|
||||
return metricValues{{value: float64(machineInfo.MemoryCapacity)}}
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "machine_dimm_count",
|
||||
help: "Number of RAM DIMM (all types memory modules) value labeled by dimm type.",
|
||||
valueType: prometheus.GaugeValue,
|
||||
extraLabels: []string{prometheusTypeLabelName},
|
||||
condition: func(machineInfo *info.MachineInfo) bool { return len(machineInfo.MemoryByType) != 0 },
|
||||
getValues: func(machineInfo *info.MachineInfo) metricValues {
|
||||
return getMemoryByType(machineInfo, memoryByTypeDimmCountKey)
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "machine_dimm_capacity_bytes",
|
||||
help: "Total RAM DIMM capacity (all types memory modules) value labeled by dimm type.",
|
||||
valueType: prometheus.GaugeValue,
|
||||
extraLabels: []string{prometheusTypeLabelName},
|
||||
condition: func(machineInfo *info.MachineInfo) bool { return len(machineInfo.MemoryByType) != 0 },
|
||||
getValues: func(machineInfo *info.MachineInfo) metricValues {
|
||||
return getMemoryByType(machineInfo, memoryByTypeDimmCapacityKey)
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "machine_nvm_capacity",
|
||||
help: "NVM capacity value labeled by NVM mode (memory mode or app direct mode).",
|
||||
valueType: prometheus.GaugeValue,
|
||||
extraLabels: []string{prometheusModeLabelName},
|
||||
getValues: func(machineInfo *info.MachineInfo) metricValues {
|
||||
return metricValues{
|
||||
{value: float64(machineInfo.NVMInfo.MemoryModeCapacity), labels: []string{nvmMemoryMode}},
|
||||
{value: float64(machineInfo.NVMInfo.AppDirectModeCapacity), labels: []string{nvmAppDirectMode}},
|
||||
}
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
return c
|
||||
}
|
||||
|
||||
// Describe describes all the machine metrics ever exported by cadvisor. It
|
||||
// implements prometheus.PrometheusCollector.
|
||||
func (collector *PrometheusMachineCollector) Describe(ch chan<- *prometheus.Desc) {
|
||||
collector.errors.Describe(ch)
|
||||
for _, metric := range collector.machineMetrics {
|
||||
ch <- metric.desc([]string{})
|
||||
}
|
||||
}
|
||||
|
||||
// Collect fetches information about machine and delivers them as
|
||||
// Prometheus metrics. It implements prometheus.PrometheusCollector.
|
||||
func (collector *PrometheusMachineCollector) Collect(ch chan<- prometheus.Metric) {
|
||||
collector.errors.Set(0)
|
||||
collector.collectMachineInfo(ch)
|
||||
collector.errors.Collect(ch)
|
||||
}
|
||||
|
||||
func (collector *PrometheusMachineCollector) collectMachineInfo(ch chan<- prometheus.Metric) {
|
||||
machineInfo, err := collector.infoProvider.GetMachineInfo()
|
||||
if err != nil {
|
||||
collector.errors.Set(1)
|
||||
klog.Warningf("Couldn't get machine info: %s", err)
|
||||
return
|
||||
}
|
||||
|
||||
baseLabelsValues := []string{machineInfo.MachineID, machineInfo.SystemUUID, machineInfo.BootID}
|
||||
|
||||
for _, metric := range collector.machineMetrics {
|
||||
if metric.condition != nil && !metric.condition(machineInfo) {
|
||||
continue
|
||||
}
|
||||
|
||||
for _, metricValue := range metric.getValues(machineInfo) {
|
||||
labelValues := make([]string, len(baseLabelsValues))
|
||||
copy(labelValues, baseLabelsValues)
|
||||
if len(metric.extraLabels) != 0 {
|
||||
labelValues = append(labelValues, metricValue.labels...)
|
||||
}
|
||||
ch <- prometheus.MustNewConstMetric(metric.desc(baseLabelsNames),
|
||||
metric.valueType, metricValue.value, labelValues...)
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
func getMemoryByType(machineInfo *info.MachineInfo, property string) metricValues {
|
||||
mValues := make(metricValues, 0, len(machineInfo.MemoryByType))
|
||||
for memoryType, memoryInfo := range machineInfo.MemoryByType {
|
||||
propertyValue := 0.0
|
||||
switch property {
|
||||
case memoryByTypeDimmCapacityKey:
|
||||
propertyValue = float64(memoryInfo.Capacity)
|
||||
case memoryByTypeDimmCountKey:
|
||||
propertyValue = float64(memoryInfo.DimmCount)
|
||||
default:
|
||||
klog.Warningf("Incorrect propery name for MemoryByType, property %s", property)
|
||||
return metricValues{}
|
||||
}
|
||||
mValues = append(mValues, metricValue{value: propertyValue, labels: []string{memoryType}})
|
||||
}
|
||||
return mValues
|
||||
}
|
89
metrics/prometheus_machine_test.go
Normal file
89
metrics/prometheus_machine_test.go
Normal file
@ -0,0 +1,89 @@
|
||||
// Copyright 2020 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io/ioutil"
|
||||
"testing"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/common/expfmt"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
// Golden files holding the expected text exposition of the machine collector.
const (
	// machineMetricsFile is compared against the output of a successful scrape.
	machineMetricsFile = "testdata/prometheus_machine_metrics"
	// machineMetricsFailureFile is compared against the output of a failing scrape.
	machineMetricsFailureFile = "testdata/prometheus_machine_metrics_failure"
)
|
||||
|
||||
func TestPrometheusMachineCollector(t *testing.T) {
|
||||
collector := NewPrometheusMachineCollector(testSubcontainersInfoProvider{})
|
||||
registry := prometheus.NewRegistry()
|
||||
registry.MustRegister(collector)
|
||||
|
||||
metricsFamily, err := registry.Gather()
|
||||
assert.Nil(t, err)
|
||||
|
||||
var metricBuffer bytes.Buffer
|
||||
for _, metricFamily := range metricsFamily {
|
||||
_, err := expfmt.MetricFamilyToText(&metricBuffer, metricFamily)
|
||||
assert.Nil(t, err)
|
||||
}
|
||||
collectedMetrics := string(metricBuffer.Bytes())
|
||||
expectedMetrics, err := ioutil.ReadFile(machineMetricsFile)
|
||||
assert.Nil(t, err)
|
||||
assert.Equal(t, string(expectedMetrics), collectedMetrics)
|
||||
}
|
||||
|
||||
func TestPrometheusMachineCollectorWithFailure(t *testing.T) {
|
||||
provider := &erroringSubcontainersInfoProvider{
|
||||
successfulProvider: testSubcontainersInfoProvider{},
|
||||
shouldFail: true,
|
||||
}
|
||||
collector := NewPrometheusMachineCollector(provider)
|
||||
registry := prometheus.NewRegistry()
|
||||
registry.MustRegister(collector)
|
||||
|
||||
metricsFamily, err := registry.Gather()
|
||||
assert.Nil(t, err)
|
||||
|
||||
var metricBuffer bytes.Buffer
|
||||
for _, metricFamily := range metricsFamily {
|
||||
_, err := expfmt.MetricFamilyToText(&metricBuffer, metricFamily)
|
||||
assert.Nil(t, err)
|
||||
}
|
||||
collectedMetrics := string(metricBuffer.Bytes())
|
||||
expectedMetrics, err := ioutil.ReadFile(machineMetricsFailureFile)
|
||||
assert.Nil(t, err)
|
||||
assert.Equal(t, string(expectedMetrics), collectedMetrics)
|
||||
}
|
||||
|
||||
func TestGetMemoryByType(t *testing.T) {
|
||||
machineInfo, err := testSubcontainersInfoProvider{}.GetMachineInfo()
|
||||
assert.Nil(t, err)
|
||||
|
||||
capacityMetrics := getMemoryByType(machineInfo, memoryByTypeDimmCapacityKey)
|
||||
assert.Equal(t, 2, len(capacityMetrics))
|
||||
|
||||
countMetrics := getMemoryByType(machineInfo, memoryByTypeDimmCountKey)
|
||||
assert.Equal(t, 2, len(countMetrics))
|
||||
}
|
||||
|
||||
func TestGetMemoryByTypeWithWrongProperty(t *testing.T) {
|
||||
machineInfo, err := testSubcontainersInfoProvider{}.GetMachineInfo()
|
||||
assert.Nil(t, err)
|
||||
|
||||
metricVals := getMemoryByType(machineInfo, "wrong_property_name")
|
||||
assert.Equal(t, 0, len(metricVals))
|
||||
}
|
@ -15,14 +15,12 @@
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"regexp"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/google/cadvisor/container"
|
||||
info "github.com/google/cadvisor/info/v1"
|
||||
@ -30,389 +28,6 @@ import (
|
||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||
)
|
||||
|
||||
// testSubcontainersInfoProvider is a stateless test stub whose methods return fixed, hard-coded version, machine and container data.
type testSubcontainersInfoProvider struct{}
|
||||
|
||||
func (p testSubcontainersInfoProvider) GetVersionInfo() (*info.VersionInfo, error) {
|
||||
return &info.VersionInfo{
|
||||
KernelVersion: "4.1.6-200.fc22.x86_64",
|
||||
ContainerOsVersion: "Fedora 22 (Twenty Two)",
|
||||
DockerVersion: "1.8.1",
|
||||
CadvisorVersion: "0.16.0",
|
||||
CadvisorRevision: "abcdef",
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (p testSubcontainersInfoProvider) GetMachineInfo() (*info.MachineInfo, error) {
|
||||
return &info.MachineInfo{
|
||||
NumCores: 4,
|
||||
MemoryCapacity: 1024,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (p testSubcontainersInfoProvider) SubcontainersInfo(string, *info.ContainerInfoRequest) ([]*info.ContainerInfo, error) {
|
||||
return []*info.ContainerInfo{
|
||||
{
|
||||
ContainerReference: info.ContainerReference{
|
||||
Name: "testcontainer",
|
||||
Aliases: []string{"testcontaineralias"},
|
||||
},
|
||||
Spec: info.ContainerSpec{
|
||||
Image: "test",
|
||||
HasCpu: true,
|
||||
Cpu: info.CpuSpec{
|
||||
Limit: 1000,
|
||||
Period: 100000,
|
||||
Quota: 10000,
|
||||
},
|
||||
Memory: info.MemorySpec{
|
||||
Limit: 2048,
|
||||
Reservation: 1024,
|
||||
SwapLimit: 4096,
|
||||
},
|
||||
HasHugetlb: true,
|
||||
HasProcesses: true,
|
||||
Processes: info.ProcessSpec{
|
||||
Limit: 100,
|
||||
},
|
||||
CreationTime: time.Unix(1257894000, 0),
|
||||
Labels: map[string]string{
|
||||
"foo.label": "bar",
|
||||
},
|
||||
Envs: map[string]string{
|
||||
"foo+env": "prod",
|
||||
},
|
||||
},
|
||||
Stats: []*info.ContainerStats{
|
||||
{
|
||||
Timestamp: time.Unix(1395066363, 0),
|
||||
Cpu: info.CpuStats{
|
||||
Usage: info.CpuUsage{
|
||||
Total: 1,
|
||||
PerCpu: []uint64{2, 3, 4, 5},
|
||||
User: 6,
|
||||
System: 7,
|
||||
},
|
||||
CFS: info.CpuCFS{
|
||||
Periods: 723,
|
||||
ThrottledPeriods: 18,
|
||||
ThrottledTime: 1724314000,
|
||||
},
|
||||
Schedstat: info.CpuSchedstat{
|
||||
RunTime: 53643567,
|
||||
RunqueueTime: 479424566378,
|
||||
RunPeriods: 984285,
|
||||
},
|
||||
LoadAverage: 2,
|
||||
},
|
||||
Memory: info.MemoryStats{
|
||||
Usage: 8,
|
||||
MaxUsage: 8,
|
||||
WorkingSet: 9,
|
||||
ContainerData: info.MemoryStatsMemoryData{
|
||||
Pgfault: 10,
|
||||
Pgmajfault: 11,
|
||||
},
|
||||
HierarchicalData: info.MemoryStatsMemoryData{
|
||||
Pgfault: 12,
|
||||
Pgmajfault: 13,
|
||||
},
|
||||
Cache: 14,
|
||||
RSS: 15,
|
||||
MappedFile: 16,
|
||||
Swap: 8192,
|
||||
},
|
||||
Hugetlb: map[string]info.HugetlbStats{
|
||||
"2Mi": {
|
||||
Usage: 4,
|
||||
MaxUsage: 10,
|
||||
Failcnt: 1,
|
||||
},
|
||||
"1Gi": {
|
||||
Usage: 0,
|
||||
MaxUsage: 0,
|
||||
Failcnt: 0,
|
||||
},
|
||||
},
|
||||
Network: info.NetworkStats{
|
||||
InterfaceStats: info.InterfaceStats{
|
||||
Name: "eth0",
|
||||
RxBytes: 14,
|
||||
RxPackets: 15,
|
||||
RxErrors: 16,
|
||||
RxDropped: 17,
|
||||
TxBytes: 18,
|
||||
TxPackets: 19,
|
||||
TxErrors: 20,
|
||||
TxDropped: 21,
|
||||
},
|
||||
Interfaces: []info.InterfaceStats{
|
||||
{
|
||||
Name: "eth0",
|
||||
RxBytes: 14,
|
||||
RxPackets: 15,
|
||||
RxErrors: 16,
|
||||
RxDropped: 17,
|
||||
TxBytes: 18,
|
||||
TxPackets: 19,
|
||||
TxErrors: 20,
|
||||
TxDropped: 21,
|
||||
},
|
||||
},
|
||||
Tcp: info.TcpStat{
|
||||
Established: 13,
|
||||
SynSent: 0,
|
||||
SynRecv: 0,
|
||||
FinWait1: 0,
|
||||
FinWait2: 0,
|
||||
TimeWait: 0,
|
||||
Close: 0,
|
||||
CloseWait: 0,
|
||||
LastAck: 0,
|
||||
Listen: 3,
|
||||
Closing: 0,
|
||||
},
|
||||
Tcp6: info.TcpStat{
|
||||
Established: 11,
|
||||
SynSent: 0,
|
||||
SynRecv: 0,
|
||||
FinWait1: 0,
|
||||
FinWait2: 0,
|
||||
TimeWait: 0,
|
||||
Close: 0,
|
||||
CloseWait: 0,
|
||||
LastAck: 0,
|
||||
Listen: 3,
|
||||
Closing: 0,
|
||||
},
|
||||
TcpAdvanced: info.TcpAdvancedStat{
|
||||
TCPFullUndo: 2361,
|
||||
TCPMD5NotFound: 0,
|
||||
TCPDSACKRecv: 83680,
|
||||
TCPSackShifted: 2,
|
||||
TCPSackShiftFallback: 298,
|
||||
PFMemallocDrop: 0,
|
||||
EstabResets: 37,
|
||||
InSegs: 140370590,
|
||||
TCPPureAcks: 24251339,
|
||||
TCPDSACKOldSent: 15633,
|
||||
IPReversePathFilter: 0,
|
||||
TCPFastOpenPassiveFail: 0,
|
||||
InCsumErrors: 0,
|
||||
TCPRenoFailures: 43414,
|
||||
TCPMemoryPressuresChrono: 0,
|
||||
TCPDeferAcceptDrop: 0,
|
||||
TW: 10436427,
|
||||
TCPSpuriousRTOs: 0,
|
||||
TCPDSACKIgnoredNoUndo: 71885,
|
||||
RtoMax: 120000,
|
||||
ActiveOpens: 11038621,
|
||||
EmbryonicRsts: 0,
|
||||
RcvPruned: 0,
|
||||
TCPLossProbeRecovery: 401,
|
||||
TCPHPHits: 56096478,
|
||||
TCPPartialUndo: 3,
|
||||
TCPAbortOnMemory: 0,
|
||||
AttemptFails: 48997,
|
||||
RetransSegs: 462961,
|
||||
SyncookiesFailed: 0,
|
||||
OfoPruned: 0,
|
||||
TCPAbortOnLinger: 0,
|
||||
TCPAbortFailed: 0,
|
||||
TCPRenoReorder: 839,
|
||||
TCPRcvCollapsed: 0,
|
||||
TCPDSACKIgnoredOld: 0,
|
||||
TCPReqQFullDrop: 0,
|
||||
OutOfWindowIcmps: 0,
|
||||
TWKilled: 0,
|
||||
TCPLossProbes: 88648,
|
||||
TCPRenoRecoveryFail: 394,
|
||||
TCPFastOpenCookieReqd: 0,
|
||||
TCPHPAcks: 21490641,
|
||||
TCPSACKReneging: 0,
|
||||
TCPTSReorder: 3,
|
||||
TCPSlowStartRetrans: 290832,
|
||||
MaxConn: -1,
|
||||
SyncookiesRecv: 0,
|
||||
TCPSackFailures: 60,
|
||||
DelayedACKLocked: 90,
|
||||
TCPDSACKOfoSent: 1,
|
||||
TCPSynRetrans: 988,
|
||||
TCPDSACKOfoRecv: 10,
|
||||
TCPSACKDiscard: 0,
|
||||
TCPMD5Unexpected: 0,
|
||||
TCPSackMerged: 6,
|
||||
RtoMin: 200,
|
||||
CurrEstab: 22,
|
||||
TCPTimeWaitOverflow: 0,
|
||||
ListenOverflows: 0,
|
||||
DelayedACKs: 503975,
|
||||
TCPLossUndo: 61374,
|
||||
TCPOrigDataSent: 130698387,
|
||||
TCPBacklogDrop: 0,
|
||||
TCPReqQFullDoCookies: 0,
|
||||
TCPFastOpenPassive: 0,
|
||||
PAWSActive: 0,
|
||||
OutRsts: 91699,
|
||||
TCPSackRecoveryFail: 2,
|
||||
DelayedACKLost: 18843,
|
||||
TCPAbortOnData: 8,
|
||||
TCPMinTTLDrop: 0,
|
||||
PruneCalled: 0,
|
||||
TWRecycled: 0,
|
||||
ListenDrops: 0,
|
||||
TCPAbortOnTimeout: 0,
|
||||
SyncookiesSent: 0,
|
||||
TCPSACKReorder: 11,
|
||||
TCPDSACKUndo: 33,
|
||||
TCPMD5Failure: 0,
|
||||
TCPLostRetransmit: 0,
|
||||
TCPAbortOnClose: 7,
|
||||
TCPFastOpenListenOverflow: 0,
|
||||
OutSegs: 211580512,
|
||||
InErrs: 31,
|
||||
TCPTimeouts: 27422,
|
||||
TCPLossFailures: 729,
|
||||
TCPSackRecovery: 159,
|
||||
RtoAlgorithm: 1,
|
||||
PassiveOpens: 59,
|
||||
LockDroppedIcmps: 0,
|
||||
TCPRenoRecovery: 3519,
|
||||
TCPFACKReorder: 0,
|
||||
TCPFastRetrans: 11794,
|
||||
TCPRetransFail: 0,
|
||||
TCPMemoryPressures: 0,
|
||||
TCPFastOpenActive: 0,
|
||||
TCPFastOpenActiveFail: 0,
|
||||
PAWSEstab: 0,
|
||||
},
|
||||
Udp: info.UdpStat{
|
||||
Listen: 0,
|
||||
Dropped: 0,
|
||||
RxQueued: 0,
|
||||
TxQueued: 0,
|
||||
},
|
||||
Udp6: info.UdpStat{
|
||||
Listen: 0,
|
||||
Dropped: 0,
|
||||
RxQueued: 0,
|
||||
TxQueued: 0,
|
||||
},
|
||||
},
|
||||
Filesystem: []info.FsStats{
|
||||
{
|
||||
Device: "sda1",
|
||||
InodesFree: 524288,
|
||||
Inodes: 2097152,
|
||||
Limit: 22,
|
||||
Usage: 23,
|
||||
ReadsCompleted: 24,
|
||||
ReadsMerged: 25,
|
||||
SectorsRead: 26,
|
||||
ReadTime: 27,
|
||||
WritesCompleted: 28,
|
||||
WritesMerged: 39,
|
||||
SectorsWritten: 40,
|
||||
WriteTime: 41,
|
||||
IoInProgress: 42,
|
||||
IoTime: 43,
|
||||
WeightedIoTime: 44,
|
||||
},
|
||||
{
|
||||
Device: "sda2",
|
||||
InodesFree: 262144,
|
||||
Inodes: 2097152,
|
||||
Limit: 37,
|
||||
Usage: 38,
|
||||
ReadsCompleted: 39,
|
||||
ReadsMerged: 40,
|
||||
SectorsRead: 41,
|
||||
ReadTime: 42,
|
||||
WritesCompleted: 43,
|
||||
WritesMerged: 44,
|
||||
SectorsWritten: 45,
|
||||
WriteTime: 46,
|
||||
IoInProgress: 47,
|
||||
IoTime: 48,
|
||||
WeightedIoTime: 49,
|
||||
},
|
||||
},
|
||||
Accelerators: []info.AcceleratorStats{
|
||||
{
|
||||
Make: "nvidia",
|
||||
Model: "tesla-p100",
|
||||
ID: "GPU-deadbeef-1234-5678-90ab-feedfacecafe",
|
||||
MemoryTotal: 20304050607,
|
||||
MemoryUsed: 2030405060,
|
||||
DutyCycle: 12,
|
||||
},
|
||||
{
|
||||
Make: "nvidia",
|
||||
Model: "tesla-k80",
|
||||
ID: "GPU-deadbeef-0123-4567-89ab-feedfacecafe",
|
||||
MemoryTotal: 10203040506,
|
||||
MemoryUsed: 1020304050,
|
||||
DutyCycle: 6,
|
||||
},
|
||||
},
|
||||
Processes: info.ProcessStats{
|
||||
ProcessCount: 1,
|
||||
FdCount: 5,
|
||||
SocketCount: 3,
|
||||
ThreadsCurrent: 5,
|
||||
ThreadsMax: 100,
|
||||
Ulimits: []info.UlimitSpec{
|
||||
{
|
||||
Name: "max_open_files",
|
||||
SoftLimit: 16384,
|
||||
HardLimit: 16384,
|
||||
},
|
||||
},
|
||||
},
|
||||
TaskStats: info.LoadStats{
|
||||
NrSleeping: 50,
|
||||
NrRunning: 51,
|
||||
NrStopped: 52,
|
||||
NrUninterruptible: 53,
|
||||
NrIoWait: 54,
|
||||
},
|
||||
CustomMetrics: map[string][]info.MetricVal{
|
||||
"container_custom_app_metric_1": {
|
||||
{
|
||||
FloatValue: float64(1.1),
|
||||
Timestamp: time.Now(),
|
||||
Label: "testlabel_1_1_1",
|
||||
Labels: map[string]string{"test_label": "1_1", "test_label_2": "2_1"},
|
||||
},
|
||||
{
|
||||
FloatValue: float64(1.2),
|
||||
Timestamp: time.Now(),
|
||||
Label: "testlabel_1_1_2",
|
||||
Labels: map[string]string{"test_label": "1_2", "test_label_2": "2_2"},
|
||||
},
|
||||
},
|
||||
"container_custom_app_metric_2": {
|
||||
{
|
||||
FloatValue: float64(2),
|
||||
Timestamp: time.Now(),
|
||||
Label: "testlabel2",
|
||||
Labels: map[string]string{"test_label": "test_value"},
|
||||
},
|
||||
},
|
||||
"container_custom_app_metric_3": {
|
||||
{
|
||||
FloatValue: float64(3),
|
||||
Timestamp: time.Now(),
|
||||
Label: "testlabel3",
|
||||
Labels: map[string]string{"test_label": "test_value"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
var (
|
||||
includeRe = regexp.MustCompile(`^(?:(?:# HELP |# TYPE )?container_|cadvisor_version_info\{)`)
|
||||
ignoreRe = regexp.MustCompile(`^container_last_seen\{`)
|
||||
@ -456,33 +71,6 @@ func testPrometheusCollector(t *testing.T, c *PrometheusCollector, metricsFile s
|
||||
}
|
||||
}
|
||||
|
||||
type erroringSubcontainersInfoProvider struct {
|
||||
successfulProvider testSubcontainersInfoProvider
|
||||
shouldFail bool
|
||||
}
|
||||
|
||||
func (p *erroringSubcontainersInfoProvider) GetVersionInfo() (*info.VersionInfo, error) {
|
||||
if p.shouldFail {
|
||||
return nil, errors.New("Oops 1")
|
||||
}
|
||||
return p.successfulProvider.GetVersionInfo()
|
||||
}
|
||||
|
||||
func (p *erroringSubcontainersInfoProvider) GetMachineInfo() (*info.MachineInfo, error) {
|
||||
if p.shouldFail {
|
||||
return nil, errors.New("Oops 2")
|
||||
}
|
||||
return p.successfulProvider.GetMachineInfo()
|
||||
}
|
||||
|
||||
func (p *erroringSubcontainersInfoProvider) SubcontainersInfo(
|
||||
a string, r *info.ContainerInfoRequest) ([]*info.ContainerInfo, error) {
|
||||
if p.shouldFail {
|
||||
return []*info.ContainerInfo{}, errors.New("Oops 3")
|
||||
}
|
||||
return p.successfulProvider.SubcontainersInfo(a, r)
|
||||
}
|
||||
|
||||
func TestPrometheusCollector_scrapeFailure(t *testing.T) {
|
||||
provider := &erroringSubcontainersInfoProvider{
|
||||
successfulProvider: testSubcontainersInfoProvider{},
|
||||
|
27
metrics/testdata/prometheus_machine_metrics
vendored
Normal file
27
metrics/testdata/prometheus_machine_metrics
vendored
Normal file
@ -0,0 +1,27 @@
|
||||
# HELP machine_cpu_cores Number of logical CPU cores.
|
||||
# TYPE machine_cpu_cores gauge
|
||||
machine_cpu_cores{boot_id="boot-id-test",machine_id="machine-id-test",system_uuid="system-uuid-test"} 4
|
||||
# HELP machine_cpu_physical_cores Number of physical CPU cores.
|
||||
# TYPE machine_cpu_physical_cores gauge
|
||||
machine_cpu_physical_cores{boot_id="boot-id-test",machine_id="machine-id-test",system_uuid="system-uuid-test"} 1
|
||||
# HELP machine_cpu_sockets Number of CPU sockets.
|
||||
# TYPE machine_cpu_sockets gauge
|
||||
machine_cpu_sockets{boot_id="boot-id-test",machine_id="machine-id-test",system_uuid="system-uuid-test"} 1
|
||||
# HELP machine_dimm_capacity_bytes Total RAM DIMM capacity (all types memory modules) value labeled by dimm type.
|
||||
# TYPE machine_dimm_capacity_bytes gauge
|
||||
machine_dimm_capacity_bytes{boot_id="boot-id-test",machine_id="machine-id-test",system_uuid="system-uuid-test",type="Non-volatile-RAM"} 2.168421613568e+12
|
||||
machine_dimm_capacity_bytes{boot_id="boot-id-test",machine_id="machine-id-test",system_uuid="system-uuid-test",type="Unbuffered-DDR4"} 4.12316860416e+11
|
||||
# HELP machine_dimm_count Number of RAM DIMM (all types memory modules) value labeled by dimm type.
|
||||
# TYPE machine_dimm_count gauge
|
||||
machine_dimm_count{boot_id="boot-id-test",machine_id="machine-id-test",system_uuid="system-uuid-test",type="Non-volatile-RAM"} 8
|
||||
machine_dimm_count{boot_id="boot-id-test",machine_id="machine-id-test",system_uuid="system-uuid-test",type="Unbuffered-DDR4"} 12
|
||||
# HELP machine_memory_bytes Amount of memory installed on the machine.
|
||||
# TYPE machine_memory_bytes gauge
|
||||
machine_memory_bytes{boot_id="boot-id-test",machine_id="machine-id-test",system_uuid="system-uuid-test"} 1024
|
||||
# HELP machine_nvm_capacity NVM capacity value labeled by NVM mode (memory mode or app direct mode).
|
||||
# TYPE machine_nvm_capacity gauge
|
||||
machine_nvm_capacity{boot_id="boot-id-test",machine_id="machine-id-test",mode="app_direct_mode",system_uuid="system-uuid-test"} 1.735166787584e+12
|
||||
machine_nvm_capacity{boot_id="boot-id-test",machine_id="machine-id-test",mode="memory_mode",system_uuid="system-uuid-test"} 4.294967296e+11
|
||||
# HELP machine_scrape_error 1 if there was an error while getting machine metrics, 0 otherwise.
|
||||
# TYPE machine_scrape_error gauge
|
||||
machine_scrape_error 0
|
3
metrics/testdata/prometheus_machine_metrics_failure
vendored
Normal file
3
metrics/testdata/prometheus_machine_metrics_failure
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
# HELP machine_scrape_error 1 if there was an error while getting machine metrics, 0 otherwise.
|
||||
# TYPE machine_scrape_error gauge
|
||||
machine_scrape_error 1
|
6
metrics/testdata/prometheus_metrics
vendored
6
metrics/testdata/prometheus_metrics
vendored
@ -364,12 +364,6 @@ container_threads_max{container_env_foo_env="prod",container_label_foo_label="ba
|
||||
# HELP container_ulimits_soft Soft ulimit values for the container root process. Unlimited if -1, except priority and nice
|
||||
# TYPE container_ulimits_soft gauge
|
||||
container_ulimits_soft{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",ulimit="max_open_files",zone_name="hello"} 16384 1395066363000
|
||||
# HELP machine_cpu_cores Number of CPU cores on the machine.
|
||||
# TYPE machine_cpu_cores gauge
|
||||
machine_cpu_cores 4
|
||||
# HELP machine_memory_bytes Amount of memory installed on the machine.
|
||||
# TYPE machine_memory_bytes gauge
|
||||
machine_memory_bytes 1024
|
||||
# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds.
|
||||
# TYPE process_cpu_seconds_total counter
|
||||
process_cpu_seconds_total 0
|
||||
|
Loading…
Reference in New Issue
Block a user