diff --git a/container/docker/fsHandler.go b/container/docker/fsHandler.go index 215b5456..0792ebac 100644 --- a/container/docker/fsHandler.go +++ b/container/docker/fsHandler.go @@ -36,6 +36,7 @@ type realFsHandler struct { usageBytes uint64 baseUsageBytes uint64 period time.Duration + minPeriod time.Duration rootfs string extraDir string fsInfo fs.FsInfo @@ -43,7 +44,11 @@ type realFsHandler struct { stopChan chan struct{} } -const longDu = time.Second +const ( + longDu = time.Second + duTimeout = time.Minute + maxDuBackoffFactor = 20 +) var _ fsHandler = &realFsHandler{} @@ -53,6 +58,7 @@ func newFsHandler(period time.Duration, rootfs, extraDir string, fsInfo fs.FsInf usageBytes: 0, baseUsageBytes: 0, period: period, + minPeriod: period, rootfs: rootfs, extraDir: extraDir, fsInfo: fsInfo, @@ -60,18 +66,14 @@ func newFsHandler(period time.Duration, rootfs, extraDir string, fsInfo fs.FsInf } } -func (fh *realFsHandler) needsUpdate() bool { - return time.Now().After(fh.lastUpdate.Add(fh.period)) -} - func (fh *realFsHandler) update() error { // TODO(vishh): Add support for external mounts. - baseUsage, err := fh.fsInfo.GetDirUsage(fh.rootfs) + baseUsage, err := fh.fsInfo.GetDirUsage(fh.rootfs, duTimeout) if err != nil { return err } - extraDirUsage, err := fh.fsInfo.GetDirUsage(fh.extraDir) + extraDirUsage, err := fh.fsInfo.GetDirUsage(fh.extraDir, duTimeout) if err != nil { return err } @@ -93,11 +95,17 @@ func (fh *realFsHandler) trackUsage() { case <-time.After(fh.period): start := time.Now() if err := fh.update(); err != nil { - glog.V(2).Infof("failed to collect filesystem stats - %v", err) + glog.Errorf("failed to collect filesystem stats - %v", err) + fh.period = fh.period * 2 + if fh.period > maxDuBackoffFactor*fh.minPeriod { + fh.period = maxDuBackoffFactor * fh.minPeriod + } + } else { + fh.period = fh.minPeriod } duration := time.Since(start) if duration > longDu { - glog.V(3).Infof("`du` on following dirs took %v: %v", duration, []string{fh.rootfs, fh.extraDir}) + glog.V(2).Infof("`du` on following dirs took %v: %v", duration, []string{fh.rootfs, fh.extraDir}) } } } diff --git a/container/docker/handler.go b/container/docker/handler.go index 19daffa5..04856010 100644 --- a/container/docker/handler.go +++ b/container/docker/handler.go @@ -129,6 +129,7 @@ func newDockerContainerHandler( rootFs := "/" if !inHostNamespace { rootFs = "/rootfs" + storageDir = path.Join(rootFs, storageDir) } id := ContainerNameToDockerId(name) diff --git a/container/raw/handler.go b/container/raw/handler.go index 0f2a6e06..932cb730 100644 --- a/container/raw/handler.go +++ b/container/raw/handler.go @@ -210,7 +210,7 @@ func (self *rawContainerHandler) GetSpec() (info.ContainerSpec, error) { spec.Cpu.Period = readUInt64(cpuRoot, "cpu.cfs_period_us") quota := readString(cpuRoot, "cpu.cfs_quota_us") - if quota != "-1" { + if quota != "" && quota != "-1" { val, err := strconv.ParseUint(quota, 10, 64) if err != nil { glog.Errorf("raw driver: Failed to parse CPUQuota from %q: %s", path.Join(cpuRoot, "cpu.cfs_quota_us"), err) diff --git a/fs/fs.go b/fs/fs.go index c08fe445..5d0489e7 100644 --- a/fs/fs.go +++ b/fs/fs.go @@ -21,6 +21,7 @@ import ( "bufio" "encoding/json" "fmt" + "io/ioutil" "os" "os/exec" "path" @@ -29,6 +30,7 @@ import ( "strconv" "strings" "syscall" + "time" "github.com/docker/docker/pkg/mount" "github.com/golang/glog" @@ -355,14 +357,37 @@ func (self *RealFsInfo) GetDirFsDevice(dir string) (*DeviceInfo, error) { return nil, fmt.Errorf("could not find device with major: %d, minor: %d in cached partitions map", major, minor) } -func (self *RealFsInfo) GetDirUsage(dir string) (uint64, error) { - out, err := exec.Command("nice", "-n", "19", "du", "-s", dir).CombinedOutput() +func (self *RealFsInfo) GetDirUsage(dir string, timeout time.Duration) (uint64, error) { + cmd := exec.Command("nice", "-n", "19", "du", "-s", dir) + stdoutp, err := cmd.StdoutPipe() if err != nil { - return 0, fmt.Errorf("du command failed on %s with output %s - %s", dir, out, err) + return 0, fmt.Errorf("failed to setup stdout for cmd %v - %v", cmd.Args, err) } - usageInKb, err := strconv.ParseUint(strings.Fields(string(out))[0], 10, 64) + stderrp, err := cmd.StderrPipe() if err != nil { - return 0, fmt.Errorf("cannot parse 'du' output %s - %s", out, err) + return 0, fmt.Errorf("failed to setup stderr for cmd %v - %v", cmd.Args, err) + } + + if err := cmd.Start(); err != nil { + return 0, fmt.Errorf("failed to exec du - %v", err) + } + stdoutb, souterr := ioutil.ReadAll(stdoutp) + stderrb, _ := ioutil.ReadAll(stderrp) + timer := time.AfterFunc(timeout, func() { + glog.Infof("killing cmd %v due to timeout(%s)", cmd.Args, timeout.String()) + cmd.Process.Kill() + }) + if err := cmd.Wait(); err != nil { + return 0, fmt.Errorf("du command failed on %s with output stdout: %s, stderr: %s - %v", dir, string(stdoutb), string(stderrb), err) + } + stdout := string(stdoutb) + if souterr != nil { + glog.Errorf("failed to read from stdout for cmd %v - %v", cmd.Args, souterr) + } + timer.Stop() + usageInKb, err := strconv.ParseUint(strings.Fields(stdout)[0], 10, 64) + if err != nil { + return 0, fmt.Errorf("cannot parse 'du' output %s - %s", stdout, err) } return usageInKb * 1024, nil } diff --git a/fs/fs_test.go b/fs/fs_test.go index 91b179f0..850cf82a 100644 --- a/fs/fs_test.go +++ b/fs/fs_test.go @@ -20,6 +20,7 @@ import ( "os" "reflect" "testing" + "time" "github.com/stretchr/testify/assert" ) @@ -98,7 +99,7 @@ func TestDirUsage(t *testing.T) { fi, err := f.Stat() as.NoError(err) expectedSize := uint64(fi.Size()) - size, err := fsInfo.GetDirUsage(dir) + size, err := fsInfo.GetDirUsage(dir, time.Minute) as.NoError(err) as.True(expectedSize <= size, "expected dir size to be at-least %d; got size: %d", expectedSize, size) } diff --git a/fs/types.go b/fs/types.go index 37f0db2a..e3572b89 100644 --- a/fs/types.go +++ b/fs/types.go @@ -14,6 +14,8 @@ package fs +import "time" + type DeviceInfo struct { Device string Major uint @@ -50,7 +52,7 @@ type FsInfo interface { GetFsInfoForPath(mountSet map[string]struct{}) ([]Fs, error) // Returns number of bytes occupied by 'dir'. - GetDirUsage(dir string) (uint64, error) + GetDirUsage(dir string, timeout time.Duration) (uint64, error) // Returns the block device info of the filesystem on which 'dir' resides. GetDirFsDevice(dir string) (*DeviceInfo, error) diff --git a/integration/runner/runner.go b/integration/runner/runner.go index 8feef146..dfef3e13 100644 --- a/integration/runner/runner.go +++ b/integration/runner/runner.go @@ -39,13 +39,17 @@ import ( // must be able to ssh into hosts without password // godep go run ./integration/runner/runner.go --logtostderr --v 2 --ssh-config <.ssh/config file> -const cadvisorBinary = "cadvisor" +const ( + cadvisorBinary = "cadvisor" + testTimeout = 15 * time.Minute +) var cadvisorTimeout = flag.Duration("cadvisor_timeout", 15*time.Second, "Time to wait for cAdvisor to come up on the remote host") var port = flag.Int("port", 8080, "Port in which to start cAdvisor in the remote host") var testRetryCount = flag.Int("test-retry-count", 3, "Number of times to retry failed tests before failing.") var testRetryWhitelist = flag.String("test-retry-whitelist", "", "Path to newline separated list of regexexp for test failures that should be retried. If empty, no tests are retried.") var sshOptions = flag.String("ssh-options", "", "Commandline options passed to ssh.") +var testArgs = flag.String("test_args", "", "arguments to be passed to the integrationt tests") var retryRegex *regexp.Regexp func getAttributes(ipAddress, portStr string) (*cadvisorApi.Attributes, error) { @@ -158,7 +162,7 @@ func PushAndRunTests(host, testDir string) error { } // Run the command - err = RunCommand("godep", "go", "test", "github.com/google/cadvisor/integration/tests/...", "--host", host, "--port", portStr, "--ssh-options", *sshOptions) + err = RunCommand("godep", "go", "test", "--timeout", testTimeout.String(), *testArgs, "github.com/google/cadvisor/integration/tests/...", "--host", host, "--port", portStr, "--ssh-options", *sshOptions) if err == nil { // On success, break out of retry loop break @@ -166,7 +170,7 @@ func PushAndRunTests(host, testDir string) error { // Only retry on test failures caused by these known flaky failure conditions if retryRegex == nil || !retryRegex.Match([]byte(err.Error())) { - glog.Warningf("Skipping retry for tests on host %s because error is not whitelisted: %s", host, err.Error()) + glog.Warningf("Skipping retry for tests on host %s because error is not whitelisted", host) break } } diff --git a/integration/tests/api/docker_test.go b/integration/tests/api/docker_test.go index 14f1827c..2b728443 100644 --- a/integration/tests/api/docker_test.go +++ b/integration/tests/api/docker_test.go @@ -292,36 +292,62 @@ func TestDockerContainerNetworkStats(t *testing.T) { } func TestDockerFilesystemStats(t *testing.T) { - t.Skip("enable this once this test does not cause timeouts.") fm := framework.New(t) defer fm.Cleanup() - storageDriver := fm.Docker().StorageDriver() - switch storageDriver { - case framework.Aufs: - case framework.Overlay: - default: - t.Skip("skipping filesystem stats test") - } + const ( + ddUsage = uint64(1 << 3) // 1 KB + sleepDuration = 10 * time.Second + ) // Wait for the container to show up. - containerId := fm.Docker().RunBusybox("/bin/sh", "-c", "dd if=/dev/zero of=/file count=1 bs=1M & ping www.google.com") + containerId := fm.Docker().RunBusybox("/bin/sh", "-c", fmt.Sprintf("'dd if=/dev/zero of=/file count=2 bs=%d & sleep 10000'", ddUsage)) + waitForContainer(containerId, fm) - time.Sleep(time.Minute) request := &v2.RequestOptions{ IdType: v2.TypeDocker, Count: 1, } - containerInfo, err := fm.Cadvisor().ClientV2().Stats(containerId, request) - time.Sleep(time.Minute) - require.NoError(t, err) - require.True(t, len(containerInfo) == 1) - var info v2.ContainerInfo - for _, cInfo := range containerInfo { - info = cInfo + needsBaseUsageCheck := false + storageDriver := fm.Docker().StorageDriver() + switch storageDriver { + case framework.Aufs, framework.Overlay: + needsBaseUsageCheck = true + } + pass := false + // We need to wait for the `dd` operation to complete. + for i := 0; i < 10; i++ { + containerInfo, err := fm.Cadvisor().ClientV2().Stats(containerId, request) + require.NoError(t, err) + require.Equal(t, len(containerInfo), 1) + var info v2.ContainerInfo + // There is only one container in containerInfo. Since it is a map with unknown key, + // use the value blindly. + for _, cInfo := range containerInfo { + info = cInfo + } + sanityCheckV2(containerId, info, t) + + require.NotNil(t, info.Stats[0].Filesystem.TotalUsageBytes) + if *info.Stats[0].Filesystem.TotalUsageBytes >= ddUsage { + if !needsBaseUsageCheck { + pass = true + break + } + require.NotNil(t, info.Stats[0].Filesystem.BaseUsageBytes) + if *info.Stats[0].Filesystem.BaseUsageBytes >= ddUsage { + pass = true + break + } + } + t.Logf("expected total usage %d bytes to be greater than %d bytes", *info.Stats[0].Filesystem.TotalUsageBytes, ddUsage) + if needsBaseUsageCheck { + t.Logf("expected base %d bytes to be greater than %d bytes", *info.Stats[0].Filesystem.BaseUsageBytes, ddUsage) + } + t.Logf("retrying after %s...", sleepDuration.String()) + time.Sleep(sleepDuration) + } + + if !pass { + t.Fail() } - sanityCheckV2(containerId, info, t) - require.NotNil(t, info.Stats[0].Filesystem.BaseUsageBytes) - assert.True(t, *info.Stats[0].Filesystem.BaseUsageBytes > (1<<6), "expected base fs usage to be greater than 1MB") - require.NotNil(t, info.Stats[0].Filesystem.TotalUsageBytes) - assert.True(t, *info.Stats[0].Filesystem.TotalUsageBytes > (1<<6), "expected total fs usage to be greater than 1MB") }