Merge pull request #1117 from vishh/timeout-du

Timeout `du` after a minute.
This commit is contained in:
Vish Kannan 2016-02-23 17:48:49 -08:00
commit 5fdd1ee185
8 changed files with 109 additions and 42 deletions

View File

@ -36,6 +36,7 @@ type realFsHandler struct {
usageBytes uint64
baseUsageBytes uint64
period time.Duration
minPeriod time.Duration
rootfs string
extraDir string
fsInfo fs.FsInfo
@ -43,7 +44,11 @@ type realFsHandler struct {
stopChan chan struct{}
}
const longDu = time.Second
const (
longDu = time.Second
duTimeout = time.Minute
maxDuBackoffFactor = 20
)
var _ fsHandler = &realFsHandler{}
@ -53,6 +58,7 @@ func newFsHandler(period time.Duration, rootfs, extraDir string, fsInfo fs.FsInf
usageBytes: 0,
baseUsageBytes: 0,
period: period,
minPeriod: period,
rootfs: rootfs,
extraDir: extraDir,
fsInfo: fsInfo,
@ -60,18 +66,14 @@ func newFsHandler(period time.Duration, rootfs, extraDir string, fsInfo fs.FsInf
}
}
func (fh *realFsHandler) needsUpdate() bool {
return time.Now().After(fh.lastUpdate.Add(fh.period))
}
func (fh *realFsHandler) update() error {
// TODO(vishh): Add support for external mounts.
baseUsage, err := fh.fsInfo.GetDirUsage(fh.rootfs)
baseUsage, err := fh.fsInfo.GetDirUsage(fh.rootfs, duTimeout)
if err != nil {
return err
}
extraDirUsage, err := fh.fsInfo.GetDirUsage(fh.extraDir)
extraDirUsage, err := fh.fsInfo.GetDirUsage(fh.extraDir, duTimeout)
if err != nil {
return err
}
@ -93,11 +95,17 @@ func (fh *realFsHandler) trackUsage() {
case <-time.After(fh.period):
start := time.Now()
if err := fh.update(); err != nil {
glog.V(2).Infof("failed to collect filesystem stats - %v", err)
glog.Errorf("failed to collect filesystem stats - %v", err)
fh.period = fh.period * 2
if fh.period > maxDuBackoffFactor*fh.minPeriod {
fh.period = maxDuBackoffFactor * fh.minPeriod
}
} else {
fh.period = fh.minPeriod
}
duration := time.Since(start)
if duration > longDu {
glog.V(3).Infof("`du` on following dirs took %v: %v", duration, []string{fh.rootfs, fh.extraDir})
glog.V(2).Infof("`du` on following dirs took %v: %v", duration, []string{fh.rootfs, fh.extraDir})
}
}
}

View File

@ -129,6 +129,7 @@ func newDockerContainerHandler(
rootFs := "/"
if !inHostNamespace {
rootFs = "/rootfs"
storageDir = path.Join(rootFs, storageDir)
}
id := ContainerNameToDockerId(name)

View File

@ -210,7 +210,7 @@ func (self *rawContainerHandler) GetSpec() (info.ContainerSpec, error) {
spec.Cpu.Period = readUInt64(cpuRoot, "cpu.cfs_period_us")
quota := readString(cpuRoot, "cpu.cfs_quota_us")
if quota != "-1" {
if quota != "" && quota != "-1" {
val, err := strconv.ParseUint(quota, 10, 64)
if err != nil {
glog.Errorf("raw driver: Failed to parse CPUQuota from %q: %s", path.Join(cpuRoot, "cpu.cfs_quota_us"), err)

View File

@ -21,6 +21,7 @@ import (
"bufio"
"encoding/json"
"fmt"
"io/ioutil"
"os"
"os/exec"
"path"
@ -29,6 +30,7 @@ import (
"strconv"
"strings"
"syscall"
"time"
"github.com/docker/docker/pkg/mount"
"github.com/golang/glog"
@ -355,14 +357,37 @@ func (self *RealFsInfo) GetDirFsDevice(dir string) (*DeviceInfo, error) {
return nil, fmt.Errorf("could not find device with major: %d, minor: %d in cached partitions map", major, minor)
}
func (self *RealFsInfo) GetDirUsage(dir string) (uint64, error) {
out, err := exec.Command("nice", "-n", "19", "du", "-s", dir).CombinedOutput()
func (self *RealFsInfo) GetDirUsage(dir string, timeout time.Duration) (uint64, error) {
cmd := exec.Command("nice", "-n", "19", "du", "-s", dir)
stdoutp, err := cmd.StdoutPipe()
if err != nil {
return 0, fmt.Errorf("du command failed on %s with output %s - %s", dir, out, err)
return 0, fmt.Errorf("failed to setup stdout for cmd %v - %v", cmd.Args, err)
}
usageInKb, err := strconv.ParseUint(strings.Fields(string(out))[0], 10, 64)
stderrp, err := cmd.StderrPipe()
if err != nil {
return 0, fmt.Errorf("cannot parse 'du' output %s - %s", out, err)
return 0, fmt.Errorf("failed to setup stderr for cmd %v - %v", cmd.Args, err)
}
if err := cmd.Start(); err != nil {
return 0, fmt.Errorf("failed to exec du - %v", err)
}
stdoutb, souterr := ioutil.ReadAll(stdoutp)
stderrb, _ := ioutil.ReadAll(stderrp)
timer := time.AfterFunc(timeout, func() {
glog.Infof("killing cmd %v due to timeout(%s)", cmd.Args, timeout.String())
cmd.Process.Kill()
})
if err := cmd.Wait(); err != nil {
return 0, fmt.Errorf("du command failed on %s with output stdout: %s, stderr: %s - %v", dir, string(stdoutb), string(stderrb), err)
}
stdout := string(stdoutb)
if souterr != nil {
glog.Errorf("failed to read from stdout for cmd %v - %v", cmd.Args, souterr)
}
timer.Stop()
usageInKb, err := strconv.ParseUint(strings.Fields(stdout)[0], 10, 64)
if err != nil {
return 0, fmt.Errorf("cannot parse 'du' output %s - %s", stdout, err)
}
return usageInKb * 1024, nil
}

View File

@ -20,6 +20,7 @@ import (
"os"
"reflect"
"testing"
"time"
"github.com/stretchr/testify/assert"
)
@ -98,7 +99,7 @@ func TestDirUsage(t *testing.T) {
fi, err := f.Stat()
as.NoError(err)
expectedSize := uint64(fi.Size())
size, err := fsInfo.GetDirUsage(dir)
size, err := fsInfo.GetDirUsage(dir, time.Minute)
as.NoError(err)
as.True(expectedSize <= size, "expected dir size to be at-least %d; got size: %d", expectedSize, size)
}

View File

@ -14,6 +14,8 @@
package fs
import "time"
type DeviceInfo struct {
Device string
Major uint
@ -50,7 +52,7 @@ type FsInfo interface {
GetFsInfoForPath(mountSet map[string]struct{}) ([]Fs, error)
// Returns number of bytes occupied by 'dir'.
GetDirUsage(dir string) (uint64, error)
GetDirUsage(dir string, timeout time.Duration) (uint64, error)
// Returns the block device info of the filesystem on which 'dir' resides.
GetDirFsDevice(dir string) (*DeviceInfo, error)

View File

@ -39,13 +39,17 @@ import (
// must be able to ssh into hosts without password
// godep go run ./integration/runner/runner.go --logtostderr --v 2 --ssh-config <.ssh/config file> <list of hosts>
const cadvisorBinary = "cadvisor"
const (
cadvisorBinary = "cadvisor"
testTimeout = 15 * time.Minute
)
var cadvisorTimeout = flag.Duration("cadvisor_timeout", 15*time.Second, "Time to wait for cAdvisor to come up on the remote host")
var port = flag.Int("port", 8080, "Port in which to start cAdvisor in the remote host")
var testRetryCount = flag.Int("test-retry-count", 3, "Number of times to retry failed tests before failing.")
var testRetryWhitelist = flag.String("test-retry-whitelist", "", "Path to newline separated list of regexexp for test failures that should be retried. If empty, no tests are retried.")
var sshOptions = flag.String("ssh-options", "", "Commandline options passed to ssh.")
var testArgs = flag.String("test_args", "", "arguments to be passed to the integrationt tests")
var retryRegex *regexp.Regexp
func getAttributes(ipAddress, portStr string) (*cadvisorApi.Attributes, error) {
@ -158,7 +162,7 @@ func PushAndRunTests(host, testDir string) error {
}
// Run the command
err = RunCommand("godep", "go", "test", "github.com/google/cadvisor/integration/tests/...", "--host", host, "--port", portStr, "--ssh-options", *sshOptions)
err = RunCommand("godep", "go", "test", "--timeout", testTimeout.String(), *testArgs, "github.com/google/cadvisor/integration/tests/...", "--host", host, "--port", portStr, "--ssh-options", *sshOptions)
if err == nil {
// On success, break out of retry loop
break
@ -166,7 +170,7 @@ func PushAndRunTests(host, testDir string) error {
// Only retry on test failures caused by these known flaky failure conditions
if retryRegex == nil || !retryRegex.Match([]byte(err.Error())) {
glog.Warningf("Skipping retry for tests on host %s because error is not whitelisted: %s", host, err.Error())
glog.Warningf("Skipping retry for tests on host %s because error is not whitelisted", host)
break
}
}

View File

@ -292,36 +292,62 @@ func TestDockerContainerNetworkStats(t *testing.T) {
}
func TestDockerFilesystemStats(t *testing.T) {
t.Skip("enable this once this test does not cause timeouts.")
fm := framework.New(t)
defer fm.Cleanup()
storageDriver := fm.Docker().StorageDriver()
switch storageDriver {
case framework.Aufs:
case framework.Overlay:
default:
t.Skip("skipping filesystem stats test")
}
const (
ddUsage = uint64(1 << 3) // 1 KB
sleepDuration = 10 * time.Second
)
// Wait for the container to show up.
containerId := fm.Docker().RunBusybox("/bin/sh", "-c", "dd if=/dev/zero of=/file count=1 bs=1M & ping www.google.com")
containerId := fm.Docker().RunBusybox("/bin/sh", "-c", fmt.Sprintf("'dd if=/dev/zero of=/file count=2 bs=%d & sleep 10000'", ddUsage))
waitForContainer(containerId, fm)
time.Sleep(time.Minute)
request := &v2.RequestOptions{
IdType: v2.TypeDocker,
Count: 1,
}
containerInfo, err := fm.Cadvisor().ClientV2().Stats(containerId, request)
time.Sleep(time.Minute)
require.NoError(t, err)
require.True(t, len(containerInfo) == 1)
var info v2.ContainerInfo
for _, cInfo := range containerInfo {
info = cInfo
needsBaseUsageCheck := false
storageDriver := fm.Docker().StorageDriver()
switch storageDriver {
case framework.Aufs, framework.Overlay:
needsBaseUsageCheck = true
}
pass := false
// We need to wait for the `dd` operation to complete.
for i := 0; i < 10; i++ {
containerInfo, err := fm.Cadvisor().ClientV2().Stats(containerId, request)
require.NoError(t, err)
require.Equal(t, len(containerInfo), 1)
var info v2.ContainerInfo
// There is only one container in containerInfo. Since it is a map with unknown key,
// use the value blindly.
for _, cInfo := range containerInfo {
info = cInfo
}
sanityCheckV2(containerId, info, t)
require.NotNil(t, info.Stats[0].Filesystem.TotalUsageBytes)
if *info.Stats[0].Filesystem.TotalUsageBytes >= ddUsage {
if !needsBaseUsageCheck {
pass = true
break
}
require.NotNil(t, info.Stats[0].Filesystem.BaseUsageBytes)
if *info.Stats[0].Filesystem.BaseUsageBytes >= ddUsage {
pass = true
break
}
}
t.Logf("expected total usage %d bytes to be greater than %d bytes", *info.Stats[0].Filesystem.TotalUsageBytes, ddUsage)
if needsBaseUsageCheck {
t.Logf("expected base %d bytes to be greater than %d bytes", *info.Stats[0].Filesystem.BaseUsageBytes, ddUsage)
}
t.Logf("retrying after %s...", sleepDuration.String())
time.Sleep(sleepDuration)
}
if !pass {
t.Fail()
}
sanityCheckV2(containerId, info, t)
require.NotNil(t, info.Stats[0].Filesystem.BaseUsageBytes)
assert.True(t, *info.Stats[0].Filesystem.BaseUsageBytes > (1<<6), "expected base fs usage to be greater than 1MB")
require.NotNil(t, info.Stats[0].Filesystem.TotalUsageBytes)
assert.True(t, *info.Stats[0].Filesystem.TotalUsageBytes > (1<<6), "expected total fs usage to be greater than 1MB")
}