cadvisor/integration/runner/runner.go

290 lines
8.3 KiB
Go

// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
"bufio"
"bytes"
"encoding/json"
"errors"
"flag"
"fmt"
"io/ioutil"
"net/http"
"os"
"os/exec"
"path"
"regexp"
"strconv"
"strings"
"sync"
"time"
"github.com/google/cadvisor/integration/common"
cadvisorApi "github.com/google/cadvisor/info/v2"
"github.com/golang/glog"
)
// cadvisorBinary is the file name of the cAdvisor binary that is pushed to
// each host, started there, and later killed by name.
const cadvisorBinary = "cadvisor"

// Command-line flags controlling how cAdvisor is started on the remote host
// and how failed test runs are retried.
var (
	cadvisorTimeout    = flag.Duration("cadvisor_timeout", 15*time.Second, "Time to wait for cAdvisor to come up on the remote host")
	port               = flag.Int("port", 8080, "Port in which to start cAdvisor in the remote host")
	testRetryCount     = flag.Int("test-retry-count", 3, "Number of times to retry failed tests before failing.")
	testRetryWhitelist = flag.String("test-retry-whitelist", "", "Path to newline separated list of regexexp for test failures that should be retried. If empty, no tests are retried.")
)

// retryRegex matches the error text of test failures that may be retried.
// It is nil until initRetryWhitelist assigns it; a nil value disables retries.
var retryRegex *regexp.Regexp
// getAttributes queries the cAdvisor instance at ipAddress:portStr for its
// v2.1 attributes and returns them decoded from JSON.
//
// An error is returned when the request fails, when the server answers with
// anything other than 200 OK, or when the body cannot be read or decoded.
func getAttributes(ipAddress, portStr string) (*cadvisorApi.Attributes, error) {
	// Get host attributes and log attributes if the tests fail.
	resp, err := http.Get(fmt.Sprintf("http://%s:%s/api/v2.1/attributes", ipAddress, portStr))
	if err != nil {
		return nil, fmt.Errorf("failed to get attributes - %v", err)
	}
	// Register the close before inspecting the status so the body is released
	// on every return path, including the non-200 one below.
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("failed to get attributes. Status code - %v", resp.StatusCode)
	}
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("unable to read attributes response body - %v", err)
	}
	var attributes cadvisorApi.Attributes
	if err := json.Unmarshal(body, &attributes); err != nil {
		return nil, fmt.Errorf("failed to unmarshal attributes - %v", err)
	}
	return &attributes, nil
}
// RunCommand executes cmd with the given args and waits for it to finish.
// On success it returns nil and discards the output. On failure the returned
// error carries the command, its arguments, the underlying error, and the
// combined stdout/stderr of the process.
func RunCommand(cmd string, args ...string) error {
	combined, runErr := exec.Command(cmd, args...).CombinedOutput()
	if runErr == nil {
		return nil
	}
	return fmt.Errorf("command %q %q failed with error: %v and output: %q", cmd, args, runErr, combined)
}
// PushAndRunTests copies the cAdvisor binary to host, starts it there on the
// configured port, waits until its /healthz endpoint answers 200 OK, and then
// runs the integration tests against it.
//
// testDir is the remote directory used for the binary and its log file; it is
// created at the start and removed when the function returns. Test failures
// whose error text matches retryRegex are retried up to *testRetryCount times.
// When the tests ultimately fail, the remote cAdvisor log is fetched and
// logged, and the returned error includes the host attributes.
func PushAndRunTests(host, testDir string) error {
	// Push binary.
	glog.Infof("Pushing cAdvisor binary to %q...", host)
	args := common.GetGCComputeArgs("ssh", host, "--", "mkdir", "-p", testDir)
	err := RunCommand("gcloud", args...)
	if err != nil {
		return fmt.Errorf("failed to make remote testing directory: %v", err)
	}
	defer func() {
		rmArgs := common.GetGCComputeArgs("ssh", host, "--", "rm", "-rf", testDir)
		if rmErr := RunCommand("gcloud", rmArgs...); rmErr != nil {
			glog.Errorf("Failed to cleanup test directory: %v", rmErr)
		}
	}()
	args = common.GetGCComputeArgs("copy-files", cadvisorBinary, fmt.Sprintf("%s:%s", host, testDir))
	err = RunCommand("gcloud", args...)
	if err != nil {
		return fmt.Errorf("failed to copy binary: %v", err)
	}

	// Start cAdvisor.
	glog.Infof("Running cAdvisor on %q...", host)
	portStr := strconv.Itoa(*port)
	// Buffered so the goroutine can deliver a failure and exit even after this
	// function has stopped receiving (for example once cAdvisor is killed
	// during cleanup); an unbuffered send would block forever.
	errChan := make(chan error, 1)
	go func() {
		// Use goroutine-local variables: writing to the enclosing args/err
		// would race with the main flow and could overwrite its test result.
		runArgs := common.GetGCComputeArgs("ssh", host, "--", fmt.Sprintf("sudo %s --port %s --logtostderr &> %s/log.txt", path.Join(testDir, cadvisorBinary), portStr, testDir))
		if runErr := RunCommand("gcloud", runArgs...); runErr != nil {
			errChan <- fmt.Errorf("error running cAdvisor: %v", runErr)
		}
	}()
	defer func() {
		killArgs := common.GetGCComputeArgs("ssh", host, "--", "sudo", "pkill", cadvisorBinary)
		if killErr := RunCommand("gcloud", killArgs...); killErr != nil {
			glog.Errorf("Failed to cleanup: %v", killErr)
		}
	}()

	ipAddress, err := common.GetGceIp(host)
	if err != nil {
		return fmt.Errorf("failed to get GCE IP: %v", err)
	}

	// Wait for cAdvisor to come up.
	endTime := time.Now().Add(*cadvisorTimeout)
	done := false
	for endTime.After(time.Now()) && !done {
		select {
		case startErr := <-errChan:
			// Quit early if there was an error.
			return startErr
		case <-time.After(500 * time.Millisecond):
			// Stop waiting when cAdvisor is healthy.
			resp, healthErr := http.Get(fmt.Sprintf("http://%s:%s/healthz", ipAddress, portStr))
			if healthErr == nil {
				// Only the status is needed; close the body so each poll
				// does not leak a connection.
				resp.Body.Close()
				done = resp.StatusCode == http.StatusOK
			}
		}
	}
	if !done {
		return fmt.Errorf("timed out waiting for cAdvisor to come up at host %q", host)
	}

	// Get attributes for debugging purposes.
	attributes, err := getAttributes(ipAddress, portStr)
	if err != nil {
		return fmt.Errorf("%v - %q", err, host)
	}

	// Run the tests in a retry loop.
	glog.Infof("Running integration tests targeting %q...", host)
	for i := 0; i <= *testRetryCount; i++ {
		// Check if this is a retry.
		if i > 0 {
			time.Sleep(time.Second * 15) // Wait 15 seconds before retrying
			glog.Warningf("Retrying (%d of %d) tests on host %s due to error %v", i, *testRetryCount, host, err)
		}
		// Run the command.
		err = RunCommand("godep", "go", "test", "github.com/google/cadvisor/integration/tests/...", "--host", host, "--port", portStr)
		if err == nil {
			// On success, break out of retry loop.
			break
		}
		// Only retry on test failures caused by these known flaky failure conditions.
		if retryRegex == nil || !retryRegex.Match([]byte(err.Error())) {
			glog.Warningf("Skipping retry for tests on host %s because error is not whitelisted: %s", host, err.Error())
			break
		}
	}
	if err != nil {
		// Copy logs from the host.
		args = common.GetGCComputeArgs("copy-files", fmt.Sprintf("%s:%s/log.txt", host, testDir), "./")
		// Use a separate error variable so the test failure held in err is
		// preserved for the final message.
		err2 := RunCommand("gcloud", args...)
		if err2 != nil {
			return fmt.Errorf("error fetching logs: %v for %v", err2, err)
		}
		defer os.Remove("./log.txt")
		logs, err2 := ioutil.ReadFile("./log.txt")
		if err2 != nil {
			return fmt.Errorf("error reading local log file: %v for %v", err2, err)
		}
		glog.Errorf("----------------------\nLogs from Host: %q\n%v\n", host, string(logs))
		err = fmt.Errorf("error on host %s: %v\n%+v", host, err, attributes)
	}
	return err
}
// Run builds cAdvisor, runs PushAndRunTests against every host given as a
// positional command-line argument in parallel, and removes the built binary
// afterwards. It returns nil when every host passes; otherwise it returns a
// single error listing each host failure on its own "Error N: ..." line.
func Run() error {
	begin := time.Now()
	defer func() {
		glog.Infof("Execution time %v", time.Since(begin))
	}()
	defer glog.Flush()

	targets := flag.Args()
	// The process ID keeps the remote directory distinct per runner invocation.
	remoteDir := fmt.Sprintf("/tmp/cadvisor-%d", os.Getpid())
	glog.Infof("Running integration tests on host(s) %q", strings.Join(targets, ","))

	// Build cAdvisor.
	glog.Infof("Building cAdvisor...")
	if buildErr := RunCommand("godep", "go", "build", "github.com/google/cadvisor"); buildErr != nil {
		return buildErr
	}
	defer func() {
		if rmErr := RunCommand("rm", cadvisorBinary); rmErr != nil {
			glog.Error(rmErr)
		}
	}()

	// Run test on all hosts in parallel, collecting failures under a mutex.
	var (
		pending  sync.WaitGroup
		mu       sync.Mutex
		failures = make([]error, 0)
	)
	for _, host := range targets {
		pending.Add(1)
		go func(target string) {
			defer pending.Done()
			if testErr := PushAndRunTests(target, remoteDir); testErr != nil {
				mu.Lock()
				failures = append(failures, testErr)
				mu.Unlock()
			}
		}(host)
	}
	pending.Wait()

	if len(failures) == 0 {
		glog.Infof("All tests pass!")
		return nil
	}

	var report bytes.Buffer
	for idx, failure := range failures {
		fmt.Fprintf(&report, "Error %d: %s\n", idx, failure.Error())
	}
	return errors.New(report.String())
}
// initRetryWhitelist initializes the whitelist of test failures that can be retried.
//
// It reads the file named by the test-retry-whitelist flag, treats every
// non-empty line as a regular expression, and compiles their alternation into
// retryRegex. When the flag is empty, or the file contains no patterns,
// retryRegex is left nil so that no failures are retried. An unreadable file
// or an invalid pattern terminates the program via glog.Fatal.
func initRetryWhitelist() {
	if *testRetryWhitelist == "" {
		return
	}
	file, err := os.Open(*testRetryWhitelist)
	if err != nil {
		glog.Fatal(err)
	}
	defer file.Close()

	var retryStrings []string
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		if text := scanner.Text(); text != "" {
			retryStrings = append(retryStrings, text)
		}
	}
	if err := scanner.Err(); err != nil {
		glog.Fatal(err)
	}

	// Joining zero patterns yields the empty regexp, which matches every
	// error and would make all failures retryable. Keep retries disabled.
	if len(retryStrings) == 0 {
		return
	}

	// The patterns are user input, so report a bad one as a fatal error
	// rather than panicking as regexp.MustCompile would.
	compiled, err := regexp.Compile(strings.Join(retryStrings, "|"))
	if err != nil {
		glog.Fatalf("invalid retry whitelist %q: %v", *testRetryWhitelist, err)
	}
	retryRegex = compiled
}
// main parses the flags, requires at least one host as a positional argument,
// loads the retry whitelist, and runs the tests, exiting through glog.Fatal
// on a usage error or a test failure.
func main() {
	flag.Parse()

	// Check usage.
	if hosts := flag.Args(); len(hosts) == 0 {
		glog.Fatalf("USAGE: runner <hosts to test>")
	}

	initRetryWhitelist()

	// Run the tests.
	if runErr := Run(); runErr != nil {
		glog.Fatal(runErr)
	}
}