From e660d8b8b7cc76438a3e145bcb27ede4c935a0f4 Mon Sep 17 00:00:00 2001 From: James Ravn Date: Thu, 18 Jan 2018 18:05:27 +0000 Subject: [PATCH] Retry docker status on startup For https://github.com/google/cadvisor/issues/1866. --- container/docker/docker.go | 14 +++++++++++--- manager/manager.go | 37 ++++++++++++++++++++++++++++++++----- 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/container/docker/docker.go b/container/docker/docker.go index b0ed227d..f1cda2be 100644 --- a/container/docker/docker.go +++ b/container/docker/docker.go @@ -29,19 +29,27 @@ import ( "github.com/google/cadvisor/machine" ) -const defaultTimeout = time.Second * 5 +var dockerTimeout = 10 * time.Second func defaultContext() context.Context { - ctx, _ := context.WithTimeout(context.Background(), defaultTimeout) + ctx, _ := context.WithTimeout(context.Background(), dockerTimeout) return ctx } +func SetTimeout(timeout time.Duration) { + dockerTimeout = timeout +} + func Status() (v1.DockerStatus, error) { + return StatusWithContext(defaultContext()) +} + +func StatusWithContext(ctx context.Context) (v1.DockerStatus, error) { client, err := Client() if err != nil { return v1.DockerStatus{}, fmt.Errorf("unable to communicate with docker daemon: %v", err) } - dockerInfo, err := client.Info(defaultContext()) + dockerInfo, err := client.Info(ctx) if err != nil { return v1.DockerStatus{}, err } diff --git a/manager/manager.go b/manager/manager.go index 08955833..10a725f7 100644 --- a/manager/manager.go +++ b/manager/manager.go @@ -50,6 +50,7 @@ import ( "github.com/golang/glog" "github.com/opencontainers/runc/libcontainer/cgroups" + "golang.org/x/net/context" "k8s.io/utils/clock" ) @@ -59,6 +60,8 @@ var eventStorageAgeLimit = flag.String("event_storage_age_limit", "default=24h", var eventStorageEventLimit = flag.String("event_storage_event_limit", "default=100000", "Max number of events to store (per type). 
Value is a comma separated list of key values, where the keys are event types (e.g.: creation, oom) or \"default\" and the value is an integer. Default is applied to all non-specified event types") var applicationMetricsCountLimit = flag.Int("application_metrics_count_limit", 100, "Max number of application metrics to store (per container)") +const dockerClientTimeout = 10 * time.Second + // The Manager interface defines operations for starting a manager and getting // container and machine information. type Manager interface { @@ -154,11 +157,10 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, maxHousekeepingIn dockerStatus info.DockerStatus rktPath string ) - if tempDockerStatus, err := docker.Status(); err != nil { - glog.V(5).Infof("Docker not connected: %v", err) - } else { - dockerStatus = tempDockerStatus - } + docker.SetTimeout(dockerClientTimeout) + // Query docker status on startup, retrying for as long as the attempts time out. + dockerStatus = retryDockerStatus() + if tmpRktPath, err := rkt.RktPath(); err != nil { glog.V(5).Infof("Rkt not connected: %v", err) } else { @@ -234,6 +236,31 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, maxHousekeepingIn return newManager, nil } +func retryDockerStatus() info.DockerStatus { + startupTimeout := dockerClientTimeout + maxTimeout := 4 * startupTimeout + for { + ctx, _ := context.WithTimeout(context.Background(), startupTimeout) + dockerStatus, err := docker.StatusWithContext(ctx) + if err == nil { + return dockerStatus + } + + switch err { + case context.DeadlineExceeded: + glog.Warningf("Timeout trying to communicate with docker during initialization, will retry") + default: + glog.V(5).Infof("Docker not connected: %v", err) + return info.DockerStatus{} + } + + startupTimeout = 2 * startupTimeout + if startupTimeout > maxTimeout { + startupTimeout = maxTimeout + } + } +} + // A namespaced container name. type namespacedContainerName struct { // The namespace of the container. 
Can be empty for the root namespace.