Retry docker status on startup

For https://github.com/google/cadvisor/issues/1866.
This commit is contained in:
James Ravn 2018-01-18 18:05:27 +00:00
parent 828ac5eb11
commit e660d8b8b7
2 changed files with 43 additions and 8 deletions

View File

@ -29,19 +29,27 @@ import (
"github.com/google/cadvisor/machine"
)
// defaultTimeout is a fixed five-second duration.
// NOTE(review): appears to be superseded by dockerTimeout below — confirm
// whether anything else still depends on it.
const defaultTimeout = time.Second * 5
// dockerTimeout is the deadline defaultContext applies to the contexts it
// creates. It starts at ten seconds and can be replaced through SetTimeout.
var dockerTimeout = 10 * time.Second
func defaultContext() context.Context {
ctx, _ := context.WithTimeout(context.Background(), defaultTimeout)
ctx, _ := context.WithTimeout(context.Background(), dockerTimeout)
return ctx
}
// SetTimeout replaces the package-level dockerTimeout, which defaultContext
// uses as the deadline for the contexts it creates.
// NOTE(review): the assignment is unsynchronized; presumably this is meant to
// be called once during startup — confirm against callers.
func SetTimeout(timeout time.Duration) {
dockerTimeout = timeout
}
// Status reports the docker daemon's status. The query runs under the context
// produced by defaultContext, so it is bounded by the package timeout.
func Status() (v1.DockerStatus, error) {
	ctx := defaultContext()
	return StatusWithContext(ctx)
}
func StatusWithContext(ctx context.Context) (v1.DockerStatus, error) {
client, err := Client()
if err != nil {
return v1.DockerStatus{}, fmt.Errorf("unable to communicate with docker daemon: %v", err)
}
dockerInfo, err := client.Info(defaultContext())
dockerInfo, err := client.Info(ctx)
if err != nil {
return v1.DockerStatus{}, err
}

View File

@ -50,6 +50,7 @@ import (
"github.com/golang/glog"
"github.com/opencontainers/runc/libcontainer/cgroups"
"golang.org/x/net/context"
"k8s.io/utils/clock"
)
@ -59,6 +60,8 @@ var eventStorageAgeLimit = flag.String("event_storage_age_limit", "default=24h",
var eventStorageEventLimit = flag.String("event_storage_event_limit", "default=100000", "Max number of events to store (per type). Value is a comma separated list of key values, where the keys are event types (e.g.: creation, oom) or \"default\" and the value is an integer. Default is applied to all non-specified event types")
var applicationMetricsCountLimit = flag.Int("application_metrics_count_limit", 100, "Max number of application metrics to store (per container)")
// dockerClientTimeout is passed to docker.SetTimeout when a manager is
// created and is the initial per-attempt timeout used by retryDockerStatus.
const dockerClientTimeout = 10 * time.Second
// The Manager interface defines operations for starting a manager and getting
// container and machine information.
type Manager interface {
@ -154,11 +157,10 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, maxHousekeepingIn
dockerStatus info.DockerStatus
rktPath string
)
if tempDockerStatus, err := docker.Status(); err != nil {
glog.V(5).Infof("Docker not connected: %v", err)
} else {
dockerStatus = tempDockerStatus
}
docker.SetTimeout(dockerClientTimeout)
// Try to connect to docker indefinitely on startup.
dockerStatus = retryDockerStatus()
if tmpRktPath, err := rkt.RktPath(); err != nil {
glog.V(5).Infof("Rkt not connected: %v", err)
} else {
@ -234,6 +236,31 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, maxHousekeepingIn
return newManager, nil
}
// retryDockerStatus queries the docker daemon for its status during manager
// initialization, retrying for as long as the attempts time out.
//
// The first attempt is bounded by dockerClientTimeout; after every timeout the
// bound doubles, capped at four times dockerClientTimeout. A successful query
// returns the status docker reported. Any error other than a timeout is logged
// at verbosity 5 and an empty info.DockerStatus is returned.
func retryDockerStatus() info.DockerStatus {
	startupTimeout := dockerClientTimeout
	maxTimeout := 4 * startupTimeout
	for {
		ctx, cancel := context.WithTimeout(context.Background(), startupTimeout)
		dockerStatus, err := docker.StatusWithContext(ctx)
		// Release the context's timer right away; a defer would accumulate
		// one pending cancel per loop iteration.
		cancel()
		if err == nil {
			return dockerStatus
		}

		switch err {
		case context.DeadlineExceeded:
			glog.Warningf("Timeout trying to communicate with docker during initialization, will retry")
		default:
			// Not a timeout: retrying will not help, so give up on docker.
			glog.V(5).Infof("Docker not connected: %v", err)
			return info.DockerStatus{}
		}

		// Back off exponentially, but never wait longer than maxTimeout.
		startupTimeout = 2 * startupTimeout
		if startupTimeout > maxTimeout {
			startupTimeout = maxTimeout
		}
	}
}
// A namespaced container name.
type namespacedContainerName struct {
// The namespace of the container. Can be empty for the root namespace.