From 39fe454f3224a019489ce14d0f7a69839e3b71e7 Mon Sep 17 00:00:00 2001 From: "Tim St. Clair" Date: Mon, 16 May 2016 13:25:33 -0700 Subject: [PATCH] Make manager multi-container functions robust to partial failures --- api/versions.go | 15 +++++-- manager/manager.go | 62 +++++++++++++++++++---------- manager/manager_test.go | 86 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 24 deletions(-) diff --git a/api/versions.go b/api/versions.go index a4b39cda..ec920552 100644 --- a/api/versions.go +++ b/api/versions.go @@ -361,7 +361,10 @@ func (self *version2_0) HandleRequest(requestType string, request []string, m ma glog.V(4).Infof("Api - Stats: Looking for stats for container %q, options %+v", name, opt) infos, err := m.GetRequestedContainersInfo(name, opt) if err != nil { - return err + if len(infos) == 0 { + return err + } + glog.Errorf("Error calling GetRequestedContainersInfo: %v", err) } contStats := make(map[string][]v2.DeprecatedContainerStats, 0) for name, cinfo := range infos { @@ -482,7 +485,10 @@ func (self *version2_1) HandleRequest(requestType string, request []string, m ma glog.V(4).Infof("Api - MachineStats(%v)", request) cont, err := m.GetRequestedContainersInfo("/", opt) if err != nil { - return err + if len(cont) == 0 { + return err + } + glog.Errorf("Error calling GetRequestedContainersInfo: %v", err) } return writeResult(v2.MachineStatsFromV1(cont["/"]), w) case statsApi: @@ -490,7 +496,10 @@ func (self *version2_1) HandleRequest(requestType string, request []string, m ma glog.V(4).Infof("Api - Stats: Looking for stats for container %q, options %+v", name, opt) conts, err := m.GetRequestedContainersInfo(name, opt) if err != nil { - return err + if len(conts) == 0 { + return err + } + glog.Errorf("Error calling GetRequestedContainersInfo: %v", err) } contStats := make(map[string]v2.ContainerInfo, len(conts)) for name, cont := range conts { diff --git a/manager/manager.go b/manager/manager.go index 4e5a1a63..0cd0a401 100644 --- a/manager/manager.go +++ b/manager/manager.go @@ -69,6 +69,8 @@ type Manager interface { GetContainerInfo(containerName string, query *info.ContainerInfoRequest) (*info.ContainerInfo, error) // Get V2 information about a container. + // Recursive (subcontainer) requests are best-effort, and may return a partial result alongside an + // error in the partial failure case. GetContainerInfoV2(containerName string, options v2.RequestOptions) (map[string]v2.ContainerInfo, error) // Get information about all subcontainers of the specified container (includes self). @@ -387,15 +389,16 @@ func (self *manager) GetDerivedStats(containerName string, options v2.RequestOpt if err != nil { return nil, err } + var errs partialFailure stats := make(map[string]v2.DerivedStats) for name, cont := range conts { d, err := cont.DerivedStats() if err != nil { - return nil, err + errs.append(name, "DerivedStats", err) } stats[name] = d } - return stats, nil + return stats, errs } func (self *manager) GetContainerSpec(containerName string, options v2.RequestOptions) (map[string]v2.ContainerSpec, error) { @@ -403,16 +406,17 @@ func (self *manager) GetContainerSpec(containerName string, options v2.RequestOp if err != nil { return nil, err } + var errs partialFailure specs := make(map[string]v2.ContainerSpec) for name, cont := range conts { cinfo, err := cont.GetInfo() if err != nil { - return nil, err + errs.append(name, "GetInfo", err) } spec := self.getV2Spec(cinfo) specs[name] = spec } - return specs, nil + return specs, errs } // Get V2 container spec from v1 container info. @@ -448,26 +452,32 @@ func (self *manager) GetContainerInfoV2(containerName string, options v2.Request return nil, err } + var errs partialFailure + var nilTime time.Time // Ignored. + infos := make(map[string]v2.ContainerInfo, len(containers)) for name, container := range containers { + result := v2.ContainerInfo{} cinfo, err := container.GetInfo() if err != nil { - return nil, err + errs.append(name, "GetInfo", err) + infos[name] = result + continue } + result.Spec = self.getV2Spec(cinfo) - var nilTime time.Time // Ignored. stats, err := self.memoryCache.RecentStats(name, nilTime, nilTime, options.Count) if err != nil { - return nil, err + errs.append(name, "RecentStats", err) + infos[name] = result + continue } - infos[name] = v2.ContainerInfo{ - Spec: self.getV2Spec(cinfo), - Stats: v2.ContainerStatsFromV1(&cinfo.Spec, stats), - } + result.Stats = v2.ContainerStatsFromV1(&cinfo.Spec, stats) + infos[name] = result } - return infos, nil + return infos, errs } func (self *manager) containerDataToContainerInfo(cont *containerData, query *info.ContainerInfoRequest) (*info.ContainerInfo, error) { @@ -608,6 +618,7 @@ func (self *manager) GetRequestedContainersInfo(containerName string, options v2 if err != nil { return nil, err } + var errs partialFailure containersMap := make(map[string]*info.ContainerInfo) query := info.ContainerInfoRequest{ NumStats: options.Count, @@ -615,12 +626,11 @@ func (self *manager) GetRequestedContainersInfo(containerName string, options v2 for name, data := range containers { info, err := self.containerDataToContainerInfo(data, &query) if err != nil { - // Skip containers with errors, we try to degrade gracefully. - continue + errs.append(name, "containerDataToContainerInfo", err) } containersMap[name] = info } - return containersMap, nil + return containersMap, errs } func (self *manager) getRequestedContainers(containerName string, options v2.RequestOptions) (map[string]*containerData, error) { @@ -1016,19 +1026,18 @@ func (self *manager) watchForNewContainers(quit chan error) error { glog.Warningf("Failed to process watch event %+v: %v", event, err) } case <-quit: - errors := []string{} + var errs partialFailure // Stop processing events if asked to quit. - for _, watcher := range self.containerWatchers { + for i, watcher := range self.containerWatchers { err := watcher.Stop() if err != nil { - errors = append(errors, err.Error()) + errs.append(fmt.Sprintf("watcher %d", i), "Stop", err) } } - if len(errors) > 0 { - err_str := strings.Join(errors, ", ") - quit <- fmt.Errorf("Error quiting watchers: %v", err_str) + if len(errs) > 0 { + quit <- errs } else { quit <- nil glog.Infof("Exiting thread watching subcontainers") @@ -1202,3 +1211,14 @@ func getVersionInfo() (*info.VersionInfo, error) { CadvisorRevision: version.Info["revision"], }, nil } + +// Helper for accumulating partial failures. +type partialFailure []string + +func (f *partialFailure) append(id, operation string, err error) { + *f = append(*f, fmt.Sprintf("[%q: %s: %s]", id, operation, err)) +} + +func (f partialFailure) Error() string { + return fmt.Sprintf("partial failures: %s", strings.Join(f, ", ")) +} diff --git a/manager/manager_test.go b/manager/manager_test.go index cc426a52..0575c884 100644 --- a/manager/manager_test.go +++ b/manager/manager_test.go @@ -17,6 +17,7 @@ package manager import ( + "fmt" "reflect" "strings" "testing" @@ -28,7 +29,10 @@ import ( "github.com/google/cadvisor/container/docker" info "github.com/google/cadvisor/info/v1" itest "github.com/google/cadvisor/info/v1/test" + "github.com/google/cadvisor/info/v2" "github.com/google/cadvisor/utils/sysfs/fakesysfs" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) // TODO(vmarmol): Refactor these tests. @@ -151,6 +155,88 @@ func TestGetContainerInfo(t *testing.T) { } +func TestGetContainerInfoV2(t *testing.T) { + containers := []string{ + "/", + "/c1", + "/c2", + } + + options := v2.RequestOptions{ + IdType: v2.TypeName, + Count: 1, + Recursive: true, + } + query := &info.ContainerInfoRequest{ + NumStats: 2, + } + + m, _, handlerMap := expectManagerWithContainers(containers, query, t) + + infos, err := m.GetContainerInfoV2("/", options) + require.NoError(t, err, "Error calling GetContainerInfoV2") + + for container, handler := range handlerMap { + handler.AssertExpectations(t) + info, ok := infos[container] + assert.True(t, ok, "Missing info for container %q", container) + assert.NotEqual(t, v2.ContainerSpec{}, info.Spec, "Empty spec for container %q", container) + assert.NotEmpty(t, info.Stats, "Missing stats for container %q", container) + } +} + +func TestGetContainerInfoV2Failure(t *testing.T) { + successful := "/" + statless := "/c1" + failing := "/c2" + containers := []string{ + successful, statless, failing, + } + + options := v2.RequestOptions{ + IdType: v2.TypeName, + Count: 1, + Recursive: true, + } + query := &info.ContainerInfoRequest{ + NumStats: 2, + } + + m, _, handlerMap := expectManagerWithContainers(containers, query, t) + + // Remove /c1 stats + require.NoError(t, m.memoryCache.RemoveContainer(statless)) + + // Make GetSpec fail on /c2 + mockErr := fmt.Errorf("intentional GetSpec failure") + failingHandler := container.NewMockContainerHandler(failing) + failingHandler.On("GetSpec").Return(info.ContainerSpec{}, mockErr) + failingHandler.On("Exists").Return(true) + *handlerMap[failing] = *failingHandler + m.containers[namespacedContainerName{Name: failing}].lastUpdatedTime = time.Time{} // Force GetSpec. + + infos, err := m.GetContainerInfoV2("/", options) + assert.Error(t, err, "Expected error calling GetContainerInfoV2") + + // Successful containers still successful. + info, ok := infos[successful] + assert.True(t, ok, "Missing info for container %q", successful) + assert.NotEqual(t, v2.ContainerSpec{}, info.Spec, "Empty spec for container %q", successful) + assert.NotEmpty(t, info.Stats, "Missing stats for container %q", successful) + + // "/c1" present with spec. + info, ok = infos[statless] + assert.True(t, ok, "Missing info for container %q", statless) + assert.NotEqual(t, v2.ContainerSpec{}, info.Spec, "Empty spec for container %q", statless) + assert.Empty(t, info.Stats, "Missing stats for container %q", successful) + + // "/c2" should be present but empty. + info, ok = infos[failing] + assert.True(t, ok, "Missing info for failed container") + assert.Equal(t, v2.ContainerInfo{}, info, "Empty spec for failed container") + assert.Empty(t, info.Stats, "Missing stats for failed container") +} + func TestSubcontainersInfo(t *testing.T) { containers := []string{ "/c1",