From f8f474e7915ff14c767bc1deb37a6f805c8c3469 Mon Sep 17 00:00:00 2001 From: Alexander Staubo Date: Thu, 21 Jul 2016 00:19:31 -0400 Subject: [PATCH] This fixes a bug where any errors encountered during Prometheus metric collection would stay counted in the gauge "container_scrape_errors", making that particular metric useless. Instead, it must be reset on every scrape. --- metrics/prometheus.go | 1 + metrics/prometheus_test.go | 54 ++++++++++++++++++++- metrics/testdata/prometheus_metrics_failure | 3 ++ 3 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 metrics/testdata/prometheus_metrics_failure diff --git a/metrics/prometheus.go b/metrics/prometheus.go index 74d36964..f6a898ff 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -493,6 +493,7 @@ func (c *PrometheusCollector) Describe(ch chan<- *prometheus.Desc) { // Collect fetches the stats from all containers and delivers them as // Prometheus metrics. It implements prometheus.PrometheusCollector. func (c *PrometheusCollector) Collect(ch chan<- prometheus.Metric) { + c.errors.Set(0) c.collectMachineInfo(ch) c.collectVersionInfo(ch) c.collectContainersInfo(ch) diff --git a/metrics/prometheus_test.go b/metrics/prometheus_test.go index 7aafc948..119f0d7b 100644 --- a/metrics/prometheus_test.go +++ b/metrics/prometheus_test.go @@ -15,6 +15,7 @@ package metrics import ( + "errors" "io/ioutil" "net/http" "net/http/httptest" @@ -181,10 +182,13 @@ func TestPrometheusCollector(t *testing.T) { prometheus.MustRegister(c) defer prometheus.Unregister(c) + testPrometheusCollector(t, c, "testdata/prometheus_metrics") +} + +func testPrometheusCollector(t *testing.T, c *PrometheusCollector, metricsFile string) { rw := httptest.NewRecorder() prometheus.Handler().ServeHTTP(rw, &http.Request{}) - metricsFile := "testdata/prometheus_metrics" wantMetrics, err := ioutil.ReadFile(metricsFile) if err != nil { t.Fatalf("unable to read input test file %s", metricsFile) @@ -206,3 +210,51 @@ func TestPrometheusCollector(t *testing.T) { } } } + +type erroringSubcontainersInfoProvider struct { + successfulProvider testSubcontainersInfoProvider + shouldFail bool +} + +func (p *erroringSubcontainersInfoProvider) GetVersionInfo() (*info.VersionInfo, error) { + if p.shouldFail { + return nil, errors.New("Oops 1") + } + return p.successfulProvider.GetVersionInfo() +} + +func (p *erroringSubcontainersInfoProvider) GetMachineInfo() (*info.MachineInfo, error) { + if p.shouldFail { + return nil, errors.New("Oops 2") + } + return p.successfulProvider.GetMachineInfo() +} + +func (p *erroringSubcontainersInfoProvider) SubcontainersInfo( + a string, r *info.ContainerInfoRequest) ([]*info.ContainerInfo, error) { + if p.shouldFail { + return []*info.ContainerInfo{}, errors.New("Oops 3") + } + return p.successfulProvider.SubcontainersInfo(a, r) +} + +func TestPrometheusCollector_scrapeFailure(t *testing.T) { + provider := &erroringSubcontainersInfoProvider{ + successfulProvider: testSubcontainersInfoProvider{}, + shouldFail: true, + } + + c := NewPrometheusCollector(provider, func(name string) map[string]string { + return map[string]string{ + "zone.name": "hello", + } + }) + prometheus.MustRegister(c) + defer prometheus.Unregister(c) + + testPrometheusCollector(t, c, "testdata/prometheus_metrics_failure") + + provider.shouldFail = false + + testPrometheusCollector(t, c, "testdata/prometheus_metrics") +} diff --git a/metrics/testdata/prometheus_metrics_failure b/metrics/testdata/prometheus_metrics_failure new file mode 100644 index 00000000..b5bc6705 --- /dev/null +++ b/metrics/testdata/prometheus_metrics_failure @@ -0,0 +1,3 @@ +# HELP container_scrape_error 1 if there was an error while getting container metrics, 0 otherwise +# TYPE container_scrape_error gauge +container_scrape_error 1