diff --git a/Godeps/Godeps.json b/Godeps/Godeps.json index 26df30c5..504531fe 100644 --- a/Godeps/Godeps.json +++ b/Godeps/Godeps.json @@ -327,78 +327,83 @@ }, { "ImportPath": "github.com/opencontainers/runc/libcontainer", - "Comment": "v0.0.7", - "Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" + "Comment": "v1.0.0-rc1-71-g4dedd09", + "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/apparmor", - "Comment": "v0.0.7", - "Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" + "Comment": "v1.0.0-rc1-71-g4dedd09", + "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups", - "Comment": "v0.0.7", - "Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" + "Comment": "v1.0.0-rc1-71-g4dedd09", + "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/fs", - "Comment": "v0.0.7", - "Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" + "Comment": "v1.0.0-rc1-71-g4dedd09", + "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/systemd", - "Comment": "v0.0.7", - "Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" + "Comment": "v1.0.0-rc1-71-g4dedd09", + "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/configs", - "Comment": "v0.0.7", - "Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" + "Comment": "v1.0.0-rc1-71-g4dedd09", + "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/configs/validate", - "Comment": "v0.0.7", - "Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" + "Comment": "v1.0.0-rc1-71-g4dedd09", + "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/criurpc", - "Comment": "v0.0.7", - "Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" + "Comment": "v1.0.0-rc1-71-g4dedd09", + "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079" + }, + { + "ImportPath": "github.com/opencontainers/runc/libcontainer/keys", + "Comment": "v1.0.0-rc1-71-g4dedd09", + "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/label", - "Comment": "v0.0.7", - "Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" + "Comment": "v1.0.0-rc1-71-g4dedd09", + "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/seccomp", - "Comment": "v0.0.7", - "Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" + "Comment": "v1.0.0-rc1-71-g4dedd09", + "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/selinux", - "Comment": "v0.0.7", - "Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" + "Comment": "v1.0.0-rc1-71-g4dedd09", + "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/stacktrace", - "Comment": "v0.0.7", - "Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" + "Comment": "v1.0.0-rc1-71-g4dedd09", + "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/system", - "Comment": "v0.0.7", - "Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" + "Comment": "v1.0.0-rc1-71-g4dedd09", + "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/user", - "Comment": "v0.0.7", - "Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" + "Comment": "v1.0.0-rc1-71-g4dedd09", + "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/utils", - "Comment": "v0.0.7", - "Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" + "Comment": "v1.0.0-rc1-71-g4dedd09", + "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079" }, { "ImportPath": "github.com/pborman/uuid", diff --git a/vendor/github.com/abbot/go-http-auth/auth.go b/vendor/github.com/abbot/go-http-auth/auth.go index e87ce6f0..c4eb5639 100644 --- a/vendor/github.com/abbot/go-http-auth/auth.go +++ b/vendor/github.com/abbot/go-http-auth/auth.go @@ -2,12 +2,12 @@ package auth import "net/http" -/* +/* Request handlers must take AuthenticatedRequest instead of http.Request */ type AuthenticatedRequest struct { http.Request - /* + /* Authenticated user name. Current API implies that Username is never empty, which means that authentication is always done before calling the request handler. diff --git a/vendor/github.com/abbot/go-http-auth/digest.go b/vendor/github.com/abbot/go-http-auth/digest.go index e52f9dfa..b3225ee4 100644 --- a/vendor/github.com/abbot/go-http-auth/digest.go +++ b/vendor/github.com/abbot/go-http-auth/digest.go @@ -22,7 +22,7 @@ type DigestAuth struct { Secrets SecretProvider PlainTextSecrets bool - /* + /* Approximate size of Client's Cache. When actual number of tracked client nonces exceeds ClientCacheSize+ClientCacheTolerance, ClientCacheTolerance*2 @@ -108,7 +108,7 @@ func DigestAuthParams(r *http.Request) map[string]string { return result } -/* +/* Check if request contains valid authentication data. Returns a pair of username, authinfo where username is the name of the authenticated user or an empty string and authinfo is the contents for the optional @@ -178,7 +178,7 @@ func (da *DigestAuth) CheckAuth(r *http.Request) (username string, authinfo *str const DefaultClientCacheSize = 1000 const DefaultClientCacheTolerance = 100 -/* +/* Wrap returns an Authenticator which uses HTTP Digest authentication. Arguments: @@ -201,7 +201,7 @@ func (a *DigestAuth) Wrap(wrapped AuthenticatedHandlerFunc) http.HandlerFunc { } } -/* +/* JustCheck returns function which converts an http.HandlerFunc into a http.HandlerFunc which requires authentication. Username is passed as an extra X-Authenticated-Username header. diff --git a/vendor/github.com/abbot/go-http-auth/users.go b/vendor/github.com/abbot/go-http-auth/users.go index 376a8ad6..5e7d0b8d 100644 --- a/vendor/github.com/abbot/go-http-auth/users.go +++ b/vendor/github.com/abbot/go-http-auth/users.go @@ -3,7 +3,7 @@ package auth import "encoding/csv" import "os" -/* +/* SecretProvider is used by authenticators. Takes user name and realm as an argument, returns secret required for authentication (HA1 for digest authentication, properly encrypted password for basic). diff --git a/vendor/github.com/kr/text/wrap.go b/vendor/github.com/kr/text/wrap.go index 1c85cd2c..ca885651 100755 --- a/vendor/github.com/kr/text/wrap.go +++ b/vendor/github.com/kr/text/wrap.go @@ -31,7 +31,7 @@ func WrapBytes(b []byte, lim int) []byte { // WrapWords is the low-level line-breaking algorithm, useful if you need more // control over the details of the text wrapping process. For most uses, either -// Wrap or WrapBytes will be sufficient and more convenient. +// Wrap or WrapBytes will be sufficient and more convenient. // // WrapWords splits a list of words into lines with minimal "raggedness", // treating each byte as one unit, accounting for spc units between adjacent diff --git a/vendor/github.com/opencontainers/runc/libcontainer/README.md b/vendor/github.com/opencontainers/runc/libcontainer/README.md index fc6b4b0b..6331010f 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/README.md +++ b/vendor/github.com/opencontainers/runc/libcontainer/README.md @@ -76,7 +76,7 @@ config := &configs.Config{ Name: "test-container", Parent: "system", Resources: &configs.Resources{ - MemorySwappiness: -1, + MemorySwappiness: nil, AllowAllDevices: false, AllowedDevices: configs.DefaultAllowedDevices, }, @@ -133,15 +133,15 @@ config := &configs.Config{ UidMappings: []configs.IDMap{ { ContainerID: 0, - Host: 1000, - size: 65536, + HostID: 1000, + Size: 65536, }, }, GidMappings: []configs.IDMap{ { ContainerID: 0, - Host: 1000, - size: 65536, + HostID: 1000, + Size: 65536, }, }, Networks: []*configs.Network{ @@ -186,8 +186,8 @@ process := &libcontainer.Process{ err := container.Start(process) if err != nil { - logrus.Fatal(err) container.Destroy() + logrus.Fatal(err) return } @@ -216,6 +216,12 @@ container.Pause() // resume all paused processes. container.Resume() + +// send signal to container's init process. +container.Signal(signal) + +// update container resource constraints. +container.Set(config) ``` diff --git a/vendor/github.com/opencontainers/runc/libcontainer/SPEC.md b/vendor/github.com/opencontainers/runc/libcontainer/SPEC.md index 61511125..32578f01 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/SPEC.md +++ b/vendor/github.com/opencontainers/runc/libcontainer/SPEC.md @@ -90,7 +90,7 @@ in tmpfs. After `/dev/null` has been setup we check for any external links between the container's io, STDIN, STDOUT, STDERR. If the container's io is pointing -to `/dev/null` outside the container we close and `dup2` the the `/dev/null` +to `/dev/null` outside the container we close and `dup2` the `/dev/null` that is local to the container's rootfs. @@ -142,6 +142,7 @@ system resources like cpu, memory, and device access. | perf_event | 1 | | freezer | 1 | | hugetlb | 1 | +| pids | 1 | All cgroup subsystem are joined so that statistics can be collected from @@ -199,7 +200,7 @@ provide a good default for security and flexibility for the applications. | CAP_SYS_BOOT | 0 | | CAP_LEASE | 0 | | CAP_WAKE_ALARM | 0 | -| CAP_BLOCK_SUSPE | 0 | +| CAP_BLOCK_SUSPEND | 0 | Additional security layers like [apparmor](https://wiki.ubuntu.com/AppArmor) @@ -296,7 +297,7 @@ a container. | -------------- | ------------------------------------------------------------------ | | Get processes | Return all the pids for processes running inside a container | | Get Stats | Return resource statistics for the container as a whole | -| Wait | Wait waits on the container's init process ( pid 1 ) | +| Wait | Waits on the container's init process ( pid 1 ) | | Wait Process | Wait on any of the container's processes returning the exit status | | Destroy | Kill the container's init process and remove any filesystem state | | Signal | Send a signal to the container's init process | diff --git a/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go b/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go index 22c17f52..82ed1a68 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go @@ -7,6 +7,7 @@ package apparmor // #include import "C" import ( + "fmt" "io/ioutil" "os" "unsafe" @@ -32,7 +33,7 @@ func ApplyProfile(name string) error { cName := C.CString(name) defer C.free(unsafe.Pointer(cName)) if _, err := C.aa_change_onexec(cName); err != nil { - return err + return fmt.Errorf("apparmor failed to apply profile: %s", err) } return nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go index c8f77965..274ab47d 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go @@ -9,7 +9,7 @@ import ( ) type Manager interface { - // Apply cgroup configuration to the process with the specified pid + // Applies cgroup configuration to the process with the specified pid Apply(pid int) error // Returns the PIDs inside the cgroup set diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go index 4da3b734..633ab042 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go @@ -14,6 +14,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" + libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" ) var ( @@ -30,6 +31,7 @@ var ( &NetPrioGroup{}, &PerfEventGroup{}, &FreezerGroup{}, + &NameGroup{GroupName: "name=systemd", Join: true}, } CgroupProcesses = "cgroup.procs" HugePageSizes, _ = cgroups.GetHugePageSize() @@ -94,11 +96,10 @@ func getCgroupRoot() (string, error) { } type cgroupData struct { - root string - parent string - name string - config *configs.Cgroup - pid int + root string + innerPath string + config *configs.Cgroup + pid int } func (m *Manager) Apply(pid int) (err error) { @@ -129,12 +130,9 @@ func (m *Manager) Apply(pid int) (err error) { return cgroups.EnterPid(m.Paths, pid) } + m.mu.Lock() + defer m.mu.Unlock() paths := make(map[string]string) - defer func() { - if err != nil { - cgroups.RemovePaths(paths) - } - }() for _, sys := range subsystems { if err := sys.Apply(d); err != nil { return err @@ -144,7 +142,9 @@ func (m *Manager) Apply(pid int) (err error) { // created then join consists of writing the process pids to cgroup.procs p, err := d.path(sys.Name()) if err != nil { - if cgroups.IsNotFound(err) { + // The non-presence of the devices subsystem is + // considered fatal for security reasons. + if cgroups.IsNotFound(err) && sys.Name() != "devices" { continue } return err @@ -267,45 +267,31 @@ func getCgroupPath(c *configs.Cgroup) (string, error) { return d.path("devices") } -// pathClean makes a path safe for use with filepath.Join. This is done by not -// only cleaning the path, but also (if the path is relative) adding a leading -// '/' and cleaning it (then removing the leading '/'). This ensures that a -// path resulting from prepending another path will always resolve to lexically -// be a subdirectory of the prefixed path. This is all done lexically, so paths -// that include symlinks won't be safe as a result of using pathClean. -func pathClean(path string) string { - // Ensure that all paths are cleaned (especially problematic ones like - // "/../../../../../" which can cause lots of issues). - path = filepath.Clean(path) - - // If the path isn't absolute, we need to do more processing to fix paths - // such as "../../../..//some/path". We also shouldn't convert absolute - // paths to relative ones. - if !filepath.IsAbs(path) { - path = filepath.Clean(string(os.PathSeparator) + path) - // This can't fail, as (by definition) all paths are relative to root. - path, _ = filepath.Rel(string(os.PathSeparator), path) - } - - // Clean the path again for good measure. - return filepath.Clean(path) -} - func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) { root, err := getCgroupRoot() if err != nil { return nil, err } - // Clean the parent slice path. - c.Parent = pathClean(c.Parent) + if (c.Name != "" || c.Parent != "") && c.Path != "" { + return nil, fmt.Errorf("cgroup: either Path or Name and Parent should be used") + } + + // XXX: Do not remove this code. Path safety is important! -- cyphar + cgPath := libcontainerUtils.CleanPath(c.Path) + cgParent := libcontainerUtils.CleanPath(c.Parent) + cgName := libcontainerUtils.CleanPath(c.Name) + + innerPath := cgPath + if innerPath == "" { + innerPath = filepath.Join(cgParent, cgName) + } return &cgroupData{ - root: root, - parent: c.Parent, - name: c.Name, - config: c, - pid: pid, + root: root, + innerPath: innerPath, + config: c, + pid: pid, }, nil } @@ -333,11 +319,10 @@ func (raw *cgroupData) path(subsystem string) (string, error) { return "", err } - cgPath := filepath.Join(raw.parent, raw.name) // If the cgroup name/path is absolute do not look relative to the cgroup of the init process. - if filepath.IsAbs(cgPath) { + if filepath.IsAbs(raw.innerPath) { // Sometimes subsystems can be mounted togethger as 'cpu,cpuacct'. - return filepath.Join(raw.root, filepath.Base(mnt), cgPath), nil + return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil } parentPath, err := raw.parentPath(subsystem, mnt, root) @@ -345,7 +330,7 @@ func (raw *cgroupData) path(subsystem string) (string, error) { return "", err } - return filepath.Join(parentPath, cgPath), nil + return filepath.Join(parentPath, raw.innerPath), nil } func (raw *cgroupData) join(subsystem string) (string, error) { @@ -366,9 +351,12 @@ func writeFile(dir, file, data string) error { // Normally dir should not be empty, one case is that cgroup subsystem // is not mounted, we will get empty dir, and we want it fail here. if dir == "" { - return fmt.Errorf("no such directory for %s.", file) + return fmt.Errorf("no such directory for %s", file) } - return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) + if err := ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700); err != nil { + return fmt.Errorf("failed to write %v to %v: %v", data, file, err) + } + return nil } func readFile(dir, file string) (string, error) { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go index ed100231..cbe62bd9 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go @@ -12,6 +12,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" + libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" ) type CpusetGroup struct { @@ -88,7 +89,7 @@ func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []b // it's parent. func (s *CpusetGroup) ensureParent(current, root string) error { parent := filepath.Dir(current) - if filepath.Clean(parent) == root { + if libcontainerUtils.CleanPath(parent) == root { return nil } // Avoid infinite recursion. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go index a41ce801..5f783310 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go @@ -5,6 +5,7 @@ package fs import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/system" ) type DevicesGroup struct { @@ -25,6 +26,23 @@ func (s *DevicesGroup) Apply(d *cgroupData) error { } func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error { + if system.RunningInUserNS() { + return nil + } + + devices := cgroup.Resources.Devices + if len(devices) > 0 { + for _, dev := range devices { + file := "devices.deny" + if dev.Allow { + file = "devices.allow" + } + if err := writeFile(path, file, dev.CgroupString()); err != nil { + return err + } + } + return nil + } if !cgroup.Resources.AllowAllDevices { if err := writeFile(path, "devices.deny", "a"); err != nil { return err diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go index 8c3e963f..b8371282 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go @@ -9,6 +9,7 @@ import ( "path/filepath" "strconv" "strings" + "time" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" @@ -26,38 +27,75 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) { if err != nil && !cgroups.IsNotFound(err) { return err } - if memoryAssigned(d.config) { - if path != "" { - if err := os.MkdirAll(path, 0755); err != nil { - return err - } - } - // We have to set kernel memory here, as we can't change it once - // processes have been attached. - if err := s.SetKernelMemory(path, d.config); err != nil { - return err - } + // reset error. + err = nil + if path == "" { + // Invalid input. + return fmt.Errorf("invalid path for memory cgroups: %+v", d) } - defer func() { if err != nil { os.RemoveAll(path) } }() - + if !cgroups.PathExists(path) { + if err = os.MkdirAll(path, 0755); err != nil { + return err + } + } + if memoryAssigned(d.config) { + // We have to set kernel memory here, as we can't change it once + // processes have been attached to the cgroup. + if err = s.SetKernelMemory(path, d.config); err != nil { + return err + } + } // We need to join memory cgroup after set memory limits, because // kmem.limit_in_bytes can only be set when the cgroup is empty. - _, err = d.join("memory") - if err != nil && !cgroups.IsNotFound(err) { + if _, jerr := d.join("memory"); jerr != nil && !cgroups.IsNotFound(jerr) { + err = jerr return err } return nil } +func getModifyTime(path string) (time.Time, error) { + stat, err := os.Stat(path) + if err != nil { + return time.Time{}, fmt.Errorf("failed to get memory cgroups creation time: %v", err) + } + return stat.ModTime(), nil +} + func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error { - // This has to be done separately because it has special constraints (it - // can't be done after there are processes attached to the cgroup). - if cgroup.Resources.KernelMemory > 0 { + // This has to be done separately because it has special + // constraints (it can only be initialized before setting up a + // hierarchy or adding a task to the cgroups. However, if + // sucessfully initialized, it can be updated anytime afterwards) + if cgroup.Resources.KernelMemory != 0 { + // Is kmem.limit_in_bytes already set? + // memory.kmem.max_usage_in_bytes is a read-only file. Use it to get cgroups creation time. + kmemCreationTime, err := getModifyTime(filepath.Join(path, "memory.kmem.max_usage_in_bytes")) + if err != nil { + return err + } + kmemLimitsUpdateTime, err := getModifyTime(filepath.Join(path, "memory.kmem.limit_in_bytes")) + if err != nil { + return err + } + // kmem.limit_in_bytes has already been set if its update time is after that of creation time. + // We use `!=` op instead of `>` because updates are losing precision compared to creation. + kmemInitialized := !kmemLimitsUpdateTime.Equal(kmemCreationTime) + if !kmemInitialized { + // If there's already tasks in the cgroup, we can't change the limit either + tasks, err := getCgroupParamString(path, "tasks") + if err != nil { + return err + } + if tasks != "" { + return fmt.Errorf("cannot set kmem.limit_in_bytes after task have joined this cgroup") + } + } if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil { return err } @@ -65,19 +103,65 @@ func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error return nil } -func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error { - if cgroup.Resources.Memory != 0 { - if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { +func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error { + // When memory and swap memory are both set, we need to handle the cases + // for updating container. + if cgroup.Resources.Memory != 0 && cgroup.Resources.MemorySwap > 0 { + memoryUsage, err := getMemoryData(path, "") + if err != nil { return err } + + // When update memory limit, we should adapt the write sequence + // for memory and swap memory, so it won't fail because the new + // value and the old value don't fit kernel's validation. + if memoryUsage.Limit < uint64(cgroup.Resources.MemorySwap) { + if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { + return err + } + if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { + return err + } + } else { + if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { + return err + } + if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { + return err + } + } + } else { + if cgroup.Resources.Memory != 0 { + if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { + return err + } + } + if cgroup.Resources.MemorySwap > 0 { + if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { + return err + } + } } + + return nil +} + +func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error { + if err := setMemoryAndSwap(path, cgroup); err != nil { + return err + } + + if err := s.SetKernelMemory(path, cgroup); err != nil { + return err + } + if cgroup.Resources.MemoryReservation != 0 { if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil { return err } } - if cgroup.Resources.MemorySwap > 0 { - if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { + if cgroup.Resources.KernelMemoryTCP != 0 { + if err := writeFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemoryTCP, 10)); err != nil { return err } } @@ -86,14 +170,14 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error { return err } } - if cgroup.Resources.MemorySwappiness >= 0 && cgroup.Resources.MemorySwappiness <= 100 { - if err := writeFile(path, "memory.swappiness", strconv.FormatInt(cgroup.Resources.MemorySwappiness, 10)); err != nil { + if cgroup.Resources.MemorySwappiness == nil || int64(*cgroup.Resources.MemorySwappiness) == -1 { + return nil + } else if int64(*cgroup.Resources.MemorySwappiness) >= 0 && int64(*cgroup.Resources.MemorySwappiness) <= 100 { + if err := writeFile(path, "memory.swappiness", strconv.FormatInt(*cgroup.Resources.MemorySwappiness, 10)); err != nil { return err } - } else if cgroup.Resources.MemorySwappiness == -1 { - return nil } else { - return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", cgroup.Resources.MemorySwappiness) + return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", int64(*cgroup.Resources.MemorySwappiness)) } return nil @@ -139,6 +223,11 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error { return err } stats.MemoryStats.KernelUsage = kernelUsage + kernelTCPUsage, err := getMemoryData(path, "kmem.tcp") + if err != nil { + return err + } + stats.MemoryStats.KernelTCPUsage = kernelTCPUsage return nil } @@ -148,8 +237,9 @@ func memoryAssigned(cgroup *configs.Cgroup) bool { cgroup.Resources.MemoryReservation != 0 || cgroup.Resources.MemorySwap > 0 || cgroup.Resources.KernelMemory > 0 || + cgroup.Resources.KernelMemoryTCP > 0 || cgroup.Resources.OomKillDisable || - cgroup.Resources.MemorySwappiness != -1 + (cgroup.Resources.MemorySwappiness != nil && *cgroup.Resources.MemorySwappiness != -1) } func getMemoryData(path, name string) (cgroups.MemoryData, error) { @@ -162,6 +252,7 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) { usage := strings.Join([]string{moduleName, "usage_in_bytes"}, ".") maxUsage := strings.Join([]string{moduleName, "max_usage_in_bytes"}, ".") failcnt := strings.Join([]string{moduleName, "failcnt"}, ".") + limit := strings.Join([]string{moduleName, "limit_in_bytes"}, ".") value, err := getCgroupParamUint(path, usage) if err != nil { @@ -187,6 +278,14 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) { return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err) } memoryData.Failcnt = value + value, err = getCgroupParamUint(path, limit) + if err != nil { + if moduleName != "memory" && os.IsNotExist(err) { + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err) + } + memoryData.Limit = value return memoryData, nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/name.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/name.go index 0e423f66..d8cf1d87 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/name.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/name.go @@ -9,6 +9,7 @@ import ( type NameGroup struct { GroupName string + Join bool } func (s *NameGroup) Name() string { @@ -16,6 +17,10 @@ func (s *NameGroup) Name() string { } func (s *NameGroup) Apply(d *cgroupData) error { + if s.Join { + // ignore errors if the named cgroup does not exist + d.join(s.GroupName) + } return nil } @@ -24,6 +29,9 @@ func (s *NameGroup) Set(path string, cgroup *configs.Cgroup) error { } func (s *NameGroup) Remove(d *cgroupData) error { + if s.Join { + removePath(d.path(s.GroupName)) + } return nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go index 8a4054ba..8e74b645 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go @@ -3,6 +3,8 @@ package fs import ( + "strconv" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" ) @@ -23,8 +25,8 @@ func (s *NetClsGroup) Apply(d *cgroupData) error { } func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error { - if cgroup.Resources.NetClsClassid != "" { - if err := writeFile(path, "net_cls.classid", cgroup.Resources.NetClsClassid); err != nil { + if cgroup.Resources.NetClsClassid != 0 { + if err := writeFile(path, "net_cls.classid", strconv.FormatUint(uint64(cgroup.Resources.NetClsClassid), 10)); err != nil { return err } } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go index 96cbb896..f1e37205 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go @@ -4,6 +4,7 @@ package fs import ( "fmt" + "path/filepath" "strconv" "github.com/opencontainers/runc/libcontainer/cgroups" @@ -47,11 +48,26 @@ func (s *PidsGroup) Remove(d *cgroupData) error { } func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error { - value, err := getCgroupParamUint(path, "pids.current") + current, err := getCgroupParamUint(path, "pids.current") if err != nil { return fmt.Errorf("failed to parse pids.current - %s", err) } - stats.PidsStats.Current = value + maxString, err := getCgroupParamString(path, "pids.max") + if err != nil { + return fmt.Errorf("failed to parse pids.max - %s", err) + } + + // Default if pids.max == "max" is 0 -- which represents "no limit". + var max uint64 + if maxString != "max" { + max, err = parseUint(maxString, 10, 64) + if err != nil { + return fmt.Errorf("failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q", maxString, filepath.Join(path, "pids.max")) + } + } + + stats.PidsStats.Current = current + stats.PidsStats.Limit = max return nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/utils.go index 852b1839..5ff0a161 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/utils.go @@ -12,7 +12,6 @@ import ( ) var ( - ErrNotSupportStat = errors.New("stats are not supported for subsystem") ErrNotValidFormat = errors.New("line is not a valid key value format") ) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go index 74c65abf..b483f1bf 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go @@ -11,6 +11,7 @@ type ThrottlingData struct { ThrottledTime uint64 `json:"throttled_time,omitempty"` } +// CpuUsage denotes the usage of a CPU. // All CPU stats are aggregate since container inception. type CpuUsage struct { // Total CPU time consumed. @@ -36,7 +37,9 @@ type MemoryData struct { Usage uint64 `json:"usage,omitempty"` MaxUsage uint64 `json:"max_usage,omitempty"` Failcnt uint64 `json:"failcnt"` + Limit uint64 `json:"limit"` } + type MemoryStats struct { // memory used for cache Cache uint64 `json:"cache,omitempty"` @@ -44,14 +47,19 @@ type MemoryStats struct { Usage MemoryData `json:"usage,omitempty"` // usage of memory + swap SwapUsage MemoryData `json:"swap_usage,omitempty"` - // usafe of kernel memory - KernelUsage MemoryData `json:"kernel_usage,omitempty"` - Stats map[string]uint64 `json:"stats,omitempty"` + // usage of kernel memory + KernelUsage MemoryData `json:"kernel_usage,omitempty"` + // usage of kernel TCP memory + KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"` + + Stats map[string]uint64 `json:"stats,omitempty"` } type PidsStats struct { // number of pids in the cgroup Current uint64 `json:"current,omitempty"` + // active pids hard limit + Limit uint64 `json:"limit,omitempty"` } type BlkioStatEntry struct { @@ -78,7 +86,7 @@ type HugetlbStats struct { Usage uint64 `json:"usage,omitempty"` // maximum usage ever recorded. MaxUsage uint64 `json:"max_usage,omitempty"` - // number of times htgetlb usage allocation failure. + // number of times hugetlb usage allocation failure. Failcnt uint64 `json:"failcnt"` } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go index db020a97..11e5ec04 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go @@ -74,6 +74,7 @@ var ( theConn *systemdDbus.Conn hasStartTransientUnit bool hasTransientDefaultDependencies bool + hasDelegate bool ) func newProp(name string, units interface{}) systemdDbus.Property { @@ -146,20 +147,24 @@ func UseSystemd() bool { // Not critical because of the stop unit logic above. theConn.StopUnit(scope, "replace", nil) + + // Assume StartTransientUnit on a scope allows Delegate + hasDelegate = true + dl := newProp("Delegate", true) + if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{dl}, nil); err != nil { + if dbusError, ok := err.(dbus.Error); ok { + if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") { + hasDelegate = false + } + } + } + + // Not critical because of the stop unit logic above. + theConn.StopUnit(scope, "replace", nil) } return hasStartTransientUnit } -func getIfaceForUnit(unitName string) string { - if strings.HasSuffix(unitName, ".scope") { - return "Scope" - } - if strings.HasSuffix(unitName, ".service") { - return "Service" - } - return "Unit" -} - func (m *Manager) Apply(pid int) error { var ( c = m.Cgroups @@ -195,6 +200,11 @@ func (m *Manager) Apply(pid int) error { newProp("PIDs", []uint32{uint32(pid)}), ) + if hasDelegate { + // This is only supported on systemd versions 218 and above. + properties = append(properties, newProp("Delegate", true)) + } + // Always enable accounting, this gets us the same behaviour as the fs implementation, // plus the kernel has some problems with joining the memory cgroup at a later time. properties = append(properties, @@ -222,11 +232,9 @@ func (m *Manager) Apply(pid int) error { newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight))) } - // We need to set kernel memory before processes join cgroup because - // kmem.limit_in_bytes can only be set when the cgroup is empty. - // And swap memory limit needs to be set after memory limit, only - // memory limit is handled by systemd, so it's kind of ugly here. - if c.Resources.KernelMemory > 0 { + // We have to set kernel memory here, as we can't change it once + // processes have been attached to the cgroup. + if c.Resources.KernelMemory != 0 { if err := setKernelMemory(c); err != nil { return err } @@ -236,53 +244,7 @@ func (m *Manager) Apply(pid int) error { return err } - if err := joinDevices(c, pid); err != nil { - return err - } - - // TODO: CpuQuota and CpuPeriod not available in systemd - // we need to manually join the cpu.cfs_quota_us and cpu.cfs_period_us - if err := joinCpu(c, pid); err != nil { - return err - } - - // TODO: MemoryReservation and MemorySwap not available in systemd - if err := joinMemory(c, pid); err != nil { - return err - } - - // we need to manually join the freezer, net_cls, net_prio, pids and cpuset cgroup in systemd - // because it does not currently support it via the dbus api. - if err := joinFreezer(c, pid); err != nil { - return err - } - - if err := joinNetPrio(c, pid); err != nil { - return err - } - if err := joinNetCls(c, pid); err != nil { - return err - } - - if err := joinPids(c, pid); err != nil { - return err - } - - if err := joinCpuset(c, pid); err != nil { - return err - } - - if err := joinHugetlb(c, pid); err != nil { - return err - } - - if err := joinPerfEvent(c, pid); err != nil { - return err - } - // FIXME: Systemd does have `BlockIODeviceWeight` property, but we got problem - // using that (at least on systemd 208, see https://github.com/opencontainers/runc/libcontainer/pull/354), - // so use fs work around for now. - if err := joinBlkio(c, pid); err != nil { + if err := joinCgroups(c, pid); err != nil { return err } @@ -327,7 +289,7 @@ func writeFile(dir, file, data string) error { // Normally dir should not be empty, one case is that cgroup subsystem // is not mounted, we will get empty dir, and we want it fail here. if dir == "" { - return fmt.Errorf("no such directory for %s.", file) + return fmt.Errorf("no such directory for %s", file) } return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) } @@ -347,43 +309,41 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) { return path, nil } -func joinCpu(c *configs.Cgroup, pid int) error { - _, err := join(c, "cpu", pid) - if err != nil && !cgroups.IsNotFound(err) { - return err +func joinCgroups(c *configs.Cgroup, pid int) error { + for _, sys := range subsystems { + name := sys.Name() + switch name { + case "name=systemd": + // let systemd handle this + break + case "cpuset": + path, err := getSubsystemPath(c, name) + if err != nil && !cgroups.IsNotFound(err) { + return err + } + s := &fs.CpusetGroup{} + if err := s.ApplyDir(path, c, pid); err != nil { + return err + } + break + default: + _, err := join(c, name, pid) + if err != nil { + // Even if it's `not found` error, we'll return err + // because devices cgroup is hard requirement for + // container security. + if name == "devices" { + return err + } + // For other subsystems, omit the `not found` error + // because they are optional. + if !cgroups.IsNotFound(err) { + return err + } + } + } } - return nil -} -func joinFreezer(c *configs.Cgroup, pid int) error { - _, err := join(c, "freezer", pid) - if err != nil && !cgroups.IsNotFound(err) { - return err - } - return nil -} - -func joinNetPrio(c *configs.Cgroup, pid int) error { - _, err := join(c, "net_prio", pid) - if err != nil && !cgroups.IsNotFound(err) { - return err - } - return nil -} - -func joinNetCls(c *configs.Cgroup, pid int) error { - _, err := join(c, "net_cls", pid) - if err != nil && !cgroups.IsNotFound(err) { - return err - } - return nil -} - -func joinPids(c *configs.Cgroup, pid int) error { - _, err := join(c, "pids", pid) - if err != nil && !cgroups.IsNotFound(err) { - return err - } return nil } @@ -392,9 +352,18 @@ func joinPids(c *configs.Cgroup, pid int) error { // test.slice/test-a.slice/test-a-b.slice. func expandSlice(slice string) (string, error) { suffix := ".slice" - sliceName := strings.TrimSuffix(slice, suffix) + // Name has to end with ".slice", but can't be just ".slice". + if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + // Path-separators are not allowed. + if strings.Contains(slice, "/") { + return "", fmt.Errorf("invalid slice name: %s", slice) + } var path, prefix string + sliceName := strings.TrimSuffix(slice, suffix) for _, component := range strings.Split(sliceName, "-") { // test--a.slice isn't permitted, nor is -test.slice. if component == "" { @@ -510,87 +479,11 @@ func getUnitName(c *configs.Cgroup) string { return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name) } -// Atm we can't use the systemd device support because of two missing things: -// * Support for wildcards to allow mknod on any device -// * Support for wildcards to allow /dev/pts support -// -// The second is available in more recent systemd as "char-pts", but not in e.g. v208 which is -// in wide use. When both these are available we will be able to switch, but need to keep the old -// implementation for backwards compat. -// -// Note: we can't use systemd to set up the initial limits, and then change the cgroup -// because systemd will re-write the device settings if it needs to re-apply the cgroup context. -// This happens at least for v208 when any sibling unit is started. -func joinDevices(c *configs.Cgroup, pid int) error { - _, err := join(c, "devices", pid) - // Even if it's `not found` error, we'll return err because devices cgroup - // is hard requirement for container security. - if err != nil { - return err - } - return nil -} - func setKernelMemory(c *configs.Cgroup) error { path, err := getSubsystemPath(c, "memory") if err != nil && !cgroups.IsNotFound(err) { return err } - if err := os.MkdirAll(path, 0755); err != nil { - return err - } - - // This doesn't get called by manager.Set, so we need to do it here. - s := &fs.MemoryGroup{} - return s.SetKernelMemory(path, c) -} - -func joinMemory(c *configs.Cgroup, pid int) error { - _, err := join(c, "memory", pid) - if err != nil && !cgroups.IsNotFound(err) { - return err - } - return nil -} - -// systemd does not atm set up the cpuset controller, so we must manually -// join it. Additionally that is a very finicky controller where each -// level must have a full setup as the default for a new directory is "no cpus" -func joinCpuset(c *configs.Cgroup, pid int) error { - path, err := getSubsystemPath(c, "cpuset") - if err != nil && !cgroups.IsNotFound(err) { - return err - } - - s := &fs.CpusetGroup{} - - return s.ApplyDir(path, c, pid) -} - -// `BlockIODeviceWeight` property of systemd does not work properly, and systemd -// expects device path instead of major minor numbers, which is also confusing -// for users. So we use fs work around for now. -func joinBlkio(c *configs.Cgroup, pid int) error { - _, err := join(c, "blkio", pid) - if err != nil { - return err - } - return nil -} - -func joinHugetlb(c *configs.Cgroup, pid int) error { - _, err := join(c, "hugetlb", pid) - if err != nil && !cgroups.IsNotFound(err) { - return err - } - return nil -} - -func joinPerfEvent(c *configs.Cgroup, pid int) error { - _, err := join(c, "perf_event", pid) - if err != nil && !cgroups.IsNotFound(err) { - return err - } - return nil + return os.MkdirAll(path, 0755) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go index 88620aae..1a7c4e1a 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go @@ -5,6 +5,7 @@ package cgroups import ( "bufio" "fmt" + "io" "io/ioutil" "os" "path/filepath" @@ -12,17 +13,19 @@ import ( "strings" "time" - "github.com/docker/docker/pkg/mount" "github.com/docker/go-units" ) const cgroupNamePrefix = "name=" -// https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt +// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt func FindCgroupMountpoint(subsystem string) (string, error) { // We are not using mount.GetMounts() because it's super-inefficient, // parsing it directly sped up x10 times because of not using Sscanf. // It was one of two major performance drawbacks in container start. + if !isSubsystemAvailable(subsystem) { + return "", NewNotFoundError(subsystem) + } f, err := os.Open("/proc/self/mountinfo") if err != nil { return "", err @@ -47,6 +50,9 @@ func FindCgroupMountpoint(subsystem string) (string, error) { } func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) { + if !isSubsystemAvailable(subsystem) { + return "", "", NewNotFoundError(subsystem) + } f, err := os.Open("/proc/self/mountinfo") if err != nil { return "", "", err @@ -70,6 +76,15 @@ func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) { return "", "", NewNotFoundError(subsystem) } +func isSubsystemAvailable(subsystem string) bool { + cgroups, err := ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return false + } + _, avail := cgroups[subsystem] + return avail +} + func FindCgroupMountpointDir() (string, error) { f, err := os.Open("/proc/self/mountinfo") if err != nil { @@ -121,42 +136,63 @@ func (m Mount) GetThisCgroupDir(cgroups map[string]string) (string, error) { return getControllerPath(m.Subsystems[0], cgroups) } +func getCgroupMountsHelper(ss map[string]bool, mi io.Reader) ([]Mount, error) { + res := make([]Mount, 0, len(ss)) + scanner := bufio.NewScanner(mi) + numFound := 0 + for scanner.Scan() && numFound < len(ss) { + txt := scanner.Text() + sepIdx := strings.Index(txt, " - ") + if sepIdx == -1 { + return nil, fmt.Errorf("invalid mountinfo format") + } + if txt[sepIdx+3:sepIdx+9] != "cgroup" { + continue + } + fields := strings.Split(txt, " ") + m := Mount{ + Mountpoint: fields[4], + Root: fields[3], + } + for _, opt := range strings.Split(fields[len(fields)-1], ",") { + if !ss[opt] { + continue + } + if strings.HasPrefix(opt, cgroupNamePrefix) { + m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):]) + } else { + m.Subsystems = append(m.Subsystems, opt) + } + numFound++ + } + res = append(res, m) + } + if err := scanner.Err(); err != nil { + return nil, err + } + return res, nil +} + func GetCgroupMounts() ([]Mount, error) { - mounts, err := mount.GetMounts() + f, err := os.Open("/proc/self/mountinfo") if err != nil { return nil, err } + defer f.Close() - all, err := GetAllSubsystems() + all, err := ParseCgroupFile("/proc/self/cgroup") if err != nil { return nil, err } allMap := make(map[string]bool) - for _, s := range all { + for s := range all { allMap[s] = true } - - res := []Mount{} - for _, mount := range mounts { - if mount.Fstype == "cgroup" { - m := Mount{Mountpoint: mount.Mountpoint, Root: mount.Root} - - for _, opt := range strings.Split(mount.VfsOpts, ",") { - if strings.HasPrefix(opt, cgroupNamePrefix) { - m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):]) - } - if allMap[opt] { - m.Subsystems = append(m.Subsystems, opt) - } - } - res = append(res, m) - } - } - return res, nil + return getCgroupMountsHelper(allMap, f) } -// Returns all the cgroup subsystems supported by the kernel +// GetAllSubsystems returns all the cgroup subsystems supported by the kernel func GetAllSubsystems() ([]string, error) { f, err := os.Open("/proc/cgroups") if err != nil { @@ -182,7 +218,7 @@ func GetAllSubsystems() ([]string, error) { return subsystems, nil } -// Returns the relative path to the cgroup docker is running in. +// GetThisCgroupDir returns the relative path to the cgroup docker is running in. func GetThisCgroupDir(subsystem string) (string, error) { cgroups, err := ParseCgroupFile("/proc/self/cgroup") if err != nil { @@ -226,6 +262,8 @@ func readProcsFile(dir string) ([]int, error) { return out, nil } +// ParseCgroupFile parses the given cgroup file, typically from +// /proc//cgroup, into a map of subgroups to cgroup names. func ParseCgroupFile(path string) (map[string]string, error) { f, err := os.Open(path) if err != nil { @@ -233,7 +271,12 @@ func ParseCgroupFile(path string) (map[string]string, error) { } defer f.Close() - s := bufio.NewScanner(f) + return parseCgroupFromReader(f) +} + +// helper function for ParseCgroupFile to make testing easier +func parseCgroupFromReader(r io.Reader) (map[string]string, error) { + s := bufio.NewScanner(r) cgroups := make(map[string]string) for s.Scan() { @@ -242,7 +285,16 @@ func ParseCgroupFile(path string) (map[string]string, error) { } text := s.Text() - parts := strings.Split(text, ":") + // from cgroups(7): + // /proc/[pid]/cgroup + // ... + // For each cgroup hierarchy ... there is one entry + // containing three colon-separated fields of the form: + // hierarchy-ID:subsystem-list:cgroup-path + parts := strings.SplitN(text, ":", 3) + if len(parts) < 3 { + return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text) + } for _, subs := range strings.Split(parts[1], ",") { cgroups[subs] = parts[2] @@ -309,7 +361,7 @@ func RemovePaths(paths map[string]string) (err error) { return nil } } - return fmt.Errorf("Failed to remove paths: %s", paths) + return fmt.Errorf("Failed to remove paths: %v", paths) } func GetHugePageSize() ([]string, error) { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go index c186d289..abbb92f5 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go @@ -11,15 +11,22 @@ const ( ) type Cgroup struct { - Name string `json:"name"` + // Deprecated, use Path instead + Name string `json:"name,omitempty"` - // name of parent cgroup or slice - Parent string `json:"parent"` + // name of parent of cgroup or slice + // Deprecated, use Path instead + Parent string `json:"parent,omitempty"` + + // Path specifies the path to cgroups that are created and/or joined by the container. + // The path is assumed to be relative to the host system cgroup mountpoint. + Path string `json:"path"` // ScopePrefix decribes prefix for the scope name ScopePrefix string `json:"scope_prefix"` - // Paths represent the cgroups paths to join + // Paths represent the absolute cgroups paths to join. + // This takes precedence over Path. Paths map[string]string // Resources contains various cgroups settings to apply @@ -28,11 +35,14 @@ type Cgroup struct { type Resources struct { // If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list. - AllowAllDevices bool `json:"allow_all_devices"` + // Deprecated + AllowAllDevices bool `json:"allow_all_devices,omitempty"` + // Deprecated + AllowedDevices []*Device `json:"allowed_devices,omitempty"` + // Deprecated + DeniedDevices []*Device `json:"denied_devices,omitempty"` - AllowedDevices []*Device `json:"allowed_devices"` - - DeniedDevices []*Device `json:"denied_devices"` + Devices []*Device `json:"devices"` // Memory limit (in bytes) Memory int64 `json:"memory"` @@ -46,6 +56,9 @@ type Resources struct { // Kernel memory limit (in bytes) KernelMemory int64 `json:"kernel_memory"` + // Kernel memory limit for TCP use (in bytes) + KernelMemoryTCP int64 `json:"kernel_memory_tcp"` + // CPU shares (relative weight vs. other containers) CpuShares int64 `json:"cpu_shares"` @@ -56,10 +69,10 @@ type Resources struct { CpuPeriod int64 `json:"cpu_period"` // How many time CPU will use in realtime scheduling (in usecs). - CpuRtRuntime int64 `json:"cpu_quota"` + CpuRtRuntime int64 `json:"cpu_rt_quota"` // CPU period to be used for realtime scheduling (in usecs). - CpuRtPeriod int64 `json:"cpu_period"` + CpuRtPeriod int64 `json:"cpu_rt_period"` // CPU to use CpusetCpus string `json:"cpuset_cpus"` @@ -101,11 +114,11 @@ type Resources struct { OomKillDisable bool `json:"oom_kill_disable"` // Tuning swappiness behaviour per cgroup - MemorySwappiness int64 `json:"memory_swappiness"` + MemorySwappiness *int64 `json:"memory_swappiness"` // Set priority of network traffic for container NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"` // Set class identifier for container's network packets - NetClsClassid string `json:"net_cls_classid"` + NetClsClassid uint32 `json:"net_cls_classid"` } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go index 069daae2..3c38191b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go @@ -3,7 +3,11 @@ package configs import ( "bytes" "encoding/json" + "fmt" "os/exec" + "time" + + "github.com/Sirupsen/logrus" ) type Rlimit struct { @@ -29,7 +33,7 @@ type Seccomp struct { Syscalls []*Syscall `json:"syscalls"` } -// An action to be taken upon rule match in Seccomp +// Action is taken upon rule match in Seccomp type Action int const ( @@ -40,7 +44,7 @@ const ( Trace ) -// A comparison operator to be used when matching syscall arguments in Seccomp +// Operator is a comparison operator to be used when matching syscall arguments in Seccomp type Operator int const ( @@ -53,7 +57,7 @@ const ( MaskEqualTo ) -// A rule to match a specific syscall argument in Seccomp +// Arg is a rule to match a specific syscall argument in Seccomp type Arg struct { Index uint `json:"index"` Value uint64 `json:"value"` @@ -61,7 +65,7 @@ type Arg struct { Op Operator `json:"op"` } -// An rule to match a syscall in Seccomp +// Syscall is a rule to match a syscall in Seccomp type Syscall struct { Name string `json:"name"` Action Action `json:"action"` @@ -128,15 +132,15 @@ type Config struct { // AppArmorProfile specifies the profile to apply to the process running in the container and is // change at the time the process is execed - AppArmorProfile string `json:"apparmor_profile"` + AppArmorProfile string `json:"apparmor_profile,omitempty"` // ProcessLabel specifies the label to apply to the process running in the container. It is // commonly used by selinux - ProcessLabel string `json:"process_label"` + ProcessLabel string `json:"process_label,omitempty"` // Rlimits specifies the resource limits, such as max open files, to set in the container // If Rlimits are not set, the container will inherit rlimits from the parent process - Rlimits []Rlimit `json:"rlimits"` + Rlimits []Rlimit `json:"rlimits,omitempty"` // OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores // for a process. Valid values are between the range [-1000, '1000'], where processes with @@ -144,10 +148,6 @@ type Config struct { // More information about kernel oom score calculation here: https://lwn.net/Articles/317814/ OomScoreAdj int `json:"oom_score_adj"` - // AdditionalGroups specifies the gids that should be added to supplementary groups - // in addition to those that the user belongs to. - AdditionalGroups []string `json:"additional_groups"` - // UidMappings is an array of User ID mappings for User Namespaces UidMappings []IDMap `json:"uid_mappings"` @@ -171,12 +171,22 @@ type Config struct { // A default action to be taken if no rules match is also given. Seccomp *Seccomp `json:"seccomp"` + // NoNewPrivileges controls whether processes in the container can gain additional privileges. + NoNewPrivileges bool `json:"no_new_privileges,omitempty"` + // Hooks are a collection of actions to perform at various container lifecycle events. - // Hooks are not able to be marshaled to json but they are also not needed to. - Hooks *Hooks `json:"-"` + // CommandHooks are serialized to JSON, but other hooks are not. + Hooks *Hooks // Version is the version of opencontainer specification that is supported. Version string `json:"version"` + + // Labels are user defined metadata that is stored in the config and populated on the state + Labels []string `json:"labels"` + + // NoNewKeyring will not allocated a new session keyring for the container. It will use the + // callers keyring in this case. + NoNewKeyring bool `json:"no_new_keyring"` } type Hooks struct { @@ -191,12 +201,59 @@ type Hooks struct { Poststop []Hook } +func (hooks *Hooks) UnmarshalJSON(b []byte) error { + var state struct { + Prestart []CommandHook + Poststart []CommandHook + Poststop []CommandHook + } + + if err := json.Unmarshal(b, &state); err != nil { + return err + } + + deserialize := func(shooks []CommandHook) (hooks []Hook) { + for _, shook := range shooks { + hooks = append(hooks, shook) + } + + return hooks + } + + hooks.Prestart = deserialize(state.Prestart) + hooks.Poststart = deserialize(state.Poststart) + hooks.Poststop = deserialize(state.Poststop) + return nil +} + +func (hooks Hooks) MarshalJSON() ([]byte, error) { + serialize := func(hooks []Hook) (serializableHooks []CommandHook) { + for _, hook := range hooks { + switch chook := hook.(type) { + case CommandHook: + serializableHooks = append(serializableHooks, chook) + default: + logrus.Warnf("cannot serialize hook of type %T, skipping", hook) + } + } + + return serializableHooks + } + + return json.Marshal(map[string]interface{}{ + "prestart": serialize(hooks.Prestart), + "poststart": serialize(hooks.Poststart), + "poststop": serialize(hooks.Poststop), + }) +} + // HookState is the payload provided to a hook on execution. type HookState struct { - Version string `json:"version"` - ID string `json:"id"` - Pid int `json:"pid"` - Root string `json:"root"` + Version string `json:"ociVersion"` + ID string `json:"id"` + Pid int `json:"pid"` + Root string `json:"root"` + BundlePath string `json:"bundlePath"` } type Hook interface { @@ -204,7 +261,7 @@ type Hook interface { Run(HookState) error } -// NewFunctionHooks will call the provided function when the hook is run. +// NewFunctionHook will call the provided function when the hook is run. func NewFunctionHook(f func(HookState) error) FuncHook { return FuncHook{ run: f, @@ -220,13 +277,14 @@ func (f FuncHook) Run(s HookState) error { } type Command struct { - Path string `json:"path"` - Args []string `json:"args"` - Env []string `json:"env"` - Dir string `json:"dir"` + Path string `json:"path"` + Args []string `json:"args"` + Env []string `json:"env"` + Dir string `json:"dir"` + Timeout *time.Duration `json:"timeout"` } -// NewCommandHooks will execute the provided command when the hook is run. +// NewCommandHook will execute the provided command when the hook is run. func NewCommandHook(cmd Command) CommandHook { return CommandHook{ Command: cmd, @@ -248,5 +306,23 @@ func (c Command) Run(s HookState) error { Env: c.Env, Stdin: bytes.NewReader(b), } - return cmd.Run() + errC := make(chan error, 1) + go func() { + out, err := cmd.CombinedOutput() + if err != nil { + err = fmt.Errorf("%s: %s", err, out) + } + errC <- err + }() + if c.Timeout != nil { + select { + case err := <-errC: + return err + case <-time.After(*c.Timeout): + cmd.Process.Kill() + cmd.Wait() + return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds()) + } + } + return <-errC } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config_unix.go index c447f3ef..a60554a7 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config_unix.go @@ -4,7 +4,7 @@ package configs import "fmt" -// Gets the root uid for the process on host which could be non-zero +// HostUID gets the root uid for the process on host which could be non-zero // when user namespaces are enabled. func (c Config) HostUID() (int, error) { if c.Namespaces.Contains(NEWUSER) { @@ -21,7 +21,7 @@ func (c Config) HostUID() (int, error) { return 0, nil } -// Gets the root gid for the process on host which could be non-zero +// HostGID gets the root gid for the process on host which could be non-zero // when user namespaces are enabled. func (c Config) HostGID() (int, error) { if c.Namespaces.Contains(NEWUSER) { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/device.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/device.go index a52a024a..8701bb21 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/device.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/device.go @@ -35,6 +35,9 @@ type Device struct { // Gid of the device. Gid uint32 `json:"gid"` + + // Write the file to the allowed list + Allow bool `json:"allow"` } func (d *Device) CgroupString() string { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go index e4529926..ba1f437f 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go @@ -3,7 +3,7 @@ package configs var ( - // These are devices that are to be both allowed and created. + // DefaultSimpleDevices are devices that are to be both allowed and created. DefaultSimpleDevices = []*Device{ // /dev/null and zero { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go index c962999e..fb4b8522 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go @@ -18,7 +18,7 @@ var namespaceInfo = map[NamespaceType]int{ } // CloneFlags parses the container's Namespaces options to set the correct -// flags on clone, unshare. This functions returns flags only for new namespaces. +// flags on clone, unshare. This function returns flags only for new namespaces. func (n *Namespaces) CloneFlags() uintptr { var flag int for _, v := range *n { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go index 1644588d..0547223a 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go @@ -8,7 +8,7 @@ func (n *Namespace) Syscall() int { } // CloneFlags parses the container's Namespaces options to set the correct -// flags on clone, unshare. This functions returns flags only for new namespaces. +// flags on clone, unshare. This function returns flags only for new namespaces. func (n *Namespaces) CloneFlags() uintptr { panic("No namespace syscall support") return uintptr(0) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unix.go index 7bc90854..b9c820d0 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unix.go @@ -2,7 +2,11 @@ package configs -import "fmt" +import ( + "fmt" + "os" + "sync" +) const ( NEWNET NamespaceType = "NEWNET" @@ -13,6 +17,51 @@ const ( NEWUSER NamespaceType = "NEWUSER" ) +var ( + nsLock sync.Mutex + supportedNamespaces = make(map[NamespaceType]bool) +) + +// nsToFile converts the namespace type to its filename +func nsToFile(ns NamespaceType) string { + switch ns { + case NEWNET: + return "net" + case NEWNS: + return "mnt" + case NEWPID: + return "pid" + case NEWIPC: + return "ipc" + case NEWUSER: + return "user" + case NEWUTS: + return "uts" + } + return "" +} + +// IsNamespaceSupported returns whether a namespace is available or +// not +func IsNamespaceSupported(ns NamespaceType) bool { + nsLock.Lock() + defer nsLock.Unlock() + supported, ok := supportedNamespaces[ns] + if ok { + return supported + } + nsFile := nsToFile(ns) + // if the namespace type is unknown, just return false + if nsFile == "" { + return false + } + _, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile)) + // a namespace is supported if it exists and we have permissions to read it + supported = err == nil + supportedNamespaces[ns] = supported + return supported +} + func NamespaceTypes() []NamespaceType { return []NamespaceType{ NEWNET, @@ -35,26 +84,7 @@ func (n *Namespace) GetPath(pid int) string { if n.Path != "" { return n.Path } - return fmt.Sprintf("/proc/%d/ns/%s", pid, n.file()) -} - -func (n *Namespace) file() string { - file := "" - switch n.Type { - case NEWNET: - file = "net" - case NEWNS: - file = "mnt" - case NEWPID: - file = "pid" - case NEWIPC: - file = "ipc" - case NEWUSER: - file = "user" - case NEWUTS: - file = "uts" - } - return file + return fmt.Sprintf("/proc/%d/ns/%s", pid, nsToFile(n.Type)) } func (n *Namespaces) Remove(t NamespaceType) bool { @@ -87,3 +117,11 @@ func (n *Namespaces) index(t NamespaceType) int { func (n *Namespaces) Contains(t NamespaceType) bool { return n.index(t) != -1 } + +func (n *Namespaces) PathOf(t NamespaceType) string { + i := n.index(t) + if i == -1 { + return "" + } + return (*n)[i].Path +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/config.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go similarity index 60% rename from vendor/github.com/opencontainers/runc/libcontainer/configs/validate/config.go rename to vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go index 848a67c3..448cde27 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/config.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go @@ -4,8 +4,10 @@ import ( "fmt" "os" "path/filepath" + "strings" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/selinux" ) type Validator interface { @@ -35,10 +37,13 @@ func (v *ConfigValidator) Validate(config *configs.Config) error { if err := v.usernamespace(config); err != nil { return err } + if err := v.sysctl(config); err != nil { + return err + } return nil } -// rootfs validates the the rootfs is an absolute path and is not a symlink +// rootfs validates if the rootfs is an absolute path and is not a symlink // to the container's root filesystem. func (v *ConfigValidator) rootfs(config *configs.Config) error { cleaned, err := filepath.Abs(config.Rootfs) @@ -48,7 +53,7 @@ func (v *ConfigValidator) rootfs(config *configs.Config) error { if cleaned, err = filepath.EvalSymlinks(cleaned); err != nil { return err } - if config.Rootfs != cleaned { + if filepath.Clean(config.Rootfs) != cleaned { return fmt.Errorf("%s is not an absolute path or is a symlink", config.Rootfs) } return nil @@ -76,6 +81,10 @@ func (v *ConfigValidator) security(config *configs.Config) error { !config.Namespaces.Contains(configs.NEWNS) { return fmt.Errorf("unable to restrict sys entries without a private MNT namespace") } + if config.ProcessLabel != "" && !selinux.SelinuxEnabled() { + return fmt.Errorf("selinux label is specified in config, but selinux is disabled or not supported") + } + return nil } @@ -91,3 +100,39 @@ func (v *ConfigValidator) usernamespace(config *configs.Config) error { } return nil } + +// sysctl validates that the specified sysctl keys are valid or not. +// /proc/sys isn't completely namespaced and depending on which namespaces +// are specified, a subset of sysctls are permitted. +func (v *ConfigValidator) sysctl(config *configs.Config) error { + validSysctlMap := map[string]bool{ + "kernel.msgmax": true, + "kernel.msgmnb": true, + "kernel.msgmni": true, + "kernel.sem": true, + "kernel.shmall": true, + "kernel.shmmax": true, + "kernel.shmmni": true, + "kernel.shm_rmid_forced": true, + } + + for s := range config.Sysctl { + if validSysctlMap[s] || strings.HasPrefix(s, "fs.mqueue.") { + if config.Namespaces.Contains(configs.NEWIPC) { + continue + } else { + return fmt.Errorf("sysctl %q is not allowed in the hosts ipc namespace", s) + } + } + if strings.HasPrefix(s, "net.") { + if config.Namespaces.Contains(configs.NEWNET) { + continue + } else { + return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s) + } + } + return fmt.Errorf("sysctl %q is not in a separate kernel namespace", s) + } + + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/console_solaris.go b/vendor/github.com/opencontainers/runc/libcontainer/console_solaris.go new file mode 100644 index 00000000..9e89f505 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/console_solaris.go @@ -0,0 +1,11 @@ +package libcontainer + +import ( + "errors" +) + +// NewConsole returns an initalized console that can be used within a container by copying bytes +// from the master side to the slave that is attached as the tty for the container's init process. +func NewConsole(uid, gid int) (Console, error) { + return nil, errors.New("libcontainer console is not supported on Solaris") +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container.go b/vendor/github.com/opencontainers/runc/libcontainer/container.go index 03c8c559..1a71179c 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/container.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/container.go @@ -1,4 +1,4 @@ -// Libcontainer provides a native Go implementation for creating containers +// Package libcontainer provides a native Go implementation for creating containers // with namespaces, cgroups, capabilities, and filesystem access controls. // It allows you to manage the lifecycle of the container performing additional operations // after the container is created. @@ -6,31 +6,25 @@ package libcontainer import ( "os" + "time" "github.com/opencontainers/runc/libcontainer/configs" ) -// The status of a container. +// Status is the status of a container. type Status int const ( - // The container exists but has not been run yet + // Created is the status that denotes the container exists but has not been run yet. Created Status = iota - - // The container exists and is running. + // Running is the status that denotes the container exists and is running. Running - - // The container exists, it is in the process of being paused. + // Pausing is the status that denotes the container exists, it is in the process of being paused. Pausing - - // The container exists, but all its processes are paused. + // Paused is the status that denotes the container exists, but all its processes are paused. Paused - - // The container exists, but its state is saved on disk - Checkpointed - - // The container does not exist. - Destroyed + // Stopped is the status that denotes the container does not have a created or running process. + Stopped ) func (s Status) String() string { @@ -43,10 +37,8 @@ func (s Status) String() string { return "pausing" case Paused: return "paused" - case Checkpointed: - return "checkpointed" - case Destroyed: - return "destroyed" + case Stopped: + return "stopped" default: return "unknown" } @@ -61,14 +53,17 @@ type BaseState struct { // InitProcessPid is the init process id in the parent namespace. InitProcessPid int `json:"init_process_pid"` - // InitProcessStartTime is the init process start time. + // InitProcessStartTime is the init process start time in clock cycles since boot time. InitProcessStartTime string `json:"init_process_start"` + // Created is the unix timestamp for the creation time of the container in UTC + Created time.Time `json:"created"` + // Config is the container's configuration. Config configs.Config `json:"config"` } -// A libcontainer container object. +// BaseContainer is a libcontainer container object. // // Each container is thread-safe within the same process. Since a container can // be destroyed by a separate process, any function may return that the container @@ -81,13 +76,13 @@ type BaseContainer interface { // // errors: // ContainerDestroyed - Container no longer exists, - // Systemerror - System error. + // SystemError - System error. Status() (Status, error) // State returns the current container's state information. // // errors: - // Systemerror - System error. + // SystemError - System error. State() (*State, error) // Returns the current config of the container. @@ -97,7 +92,7 @@ type BaseContainer interface { // // errors: // ContainerDestroyed - Container no longer exists, - // Systemerror - System error. + // SystemError - System error. // // Some of the returned PIDs may no longer refer to processes in the Container, unless // the Container state is PAUSED in which case every PID in the slice is valid. @@ -107,7 +102,7 @@ type BaseContainer interface { // // errors: // ContainerDestroyed - Container no longer exists, - // Systemerror - System error. + // SystemError - System error. Stats() (*Stats, error) // Set resources of container as configured @@ -115,7 +110,7 @@ type BaseContainer interface { // We can use this to change resources when containers are running. // // errors: - // Systemerror - System error. + // SystemError - System error. Set(config configs.Config) error // Start a process inside the container. Returns error if process fails to @@ -125,21 +120,38 @@ type BaseContainer interface { // ContainerDestroyed - Container no longer exists, // ConfigInvalid - config is invalid, // ContainerPaused - Container is paused, - // Systemerror - System error. + // SystemError - System error. Start(process *Process) (err error) + // Run immediatly starts the process inside the conatiner. Returns error if process + // fails to start. It does not block waiting for the exec fifo after start returns but + // opens the fifo after start returns. + // + // errors: + // ContainerDestroyed - Container no longer exists, + // ConfigInvalid - config is invalid, + // ContainerPaused - Container is paused, + // SystemError - System error. + Run(process *Process) (err error) + // Destroys the container after killing all running processes. // // Any event registrations are removed before the container is destroyed. // No error is returned if the container is already destroyed. // // errors: - // Systemerror - System error. + // SystemError - System error. Destroy() error // Signal sends the provided signal code to the container's initial process. // // errors: - // Systemerror - System error. + // SystemError - System error. Signal(s os.Signal) error + + // Exec signals the container to exec the users process at the end of the init. + // + // errors: + // SystemError - System error. + Exec() error } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go index 4015c957..70cbc635 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go @@ -15,30 +15,35 @@ import ( "strings" "sync" "syscall" + "time" "github.com/Sirupsen/logrus" "github.com/golang/protobuf/proto" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/criurpc" + "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/utils" + "github.com/syndtr/gocapability/capability" "github.com/vishvananda/netlink/nl" ) const stdioFdCount = 3 type linuxContainer struct { - id string - root string - config *configs.Config - cgroupManager cgroups.Manager - initPath string - initArgs []string - initProcess parentProcess - criuPath string - m sync.Mutex - criuVersion int - state containerState + id string + root string + config *configs.Config + cgroupManager cgroups.Manager + initPath string + initArgs []string + initProcess parentProcess + initProcessStartTime string + criuPath string + m sync.Mutex + criuVersion int + state containerState + created time.Time } // State represents a running container's state @@ -59,7 +64,7 @@ type State struct { ExternalDescriptors []string `json:"external_descriptors,omitempty"` } -// A libcontainer container object. +// Container is a libcontainer container object. // // Each container is thread-safe within the same process. Since a container can // be destroyed by a separate process, any function may return that the container @@ -75,13 +80,13 @@ type Container interface { // Systemerror - System error. Checkpoint(criuOpts *CriuOpts) error - // Restore restores the checkpointed container to a running state using the criu(8) utiity. + // Restore restores the checkpointed container to a running state using the criu(8) utility. // // errors: // Systemerror - System error. Restore(process *Process, criuOpts *CriuOpts) error - // If the Container state is RUNNING or PAUSING, sets the Container state to PAUSING and pauses + // If the Container state is RUNNING, sets the Container state to PAUSING and pauses // the execution of any user processes. Asynchronously, when the container finished being paused the // state is changed to PAUSED. // If the Container state is PAUSED, do nothing. @@ -138,7 +143,7 @@ func (c *linuxContainer) State() (*State, error) { func (c *linuxContainer) Processes() ([]int, error) { pids, err := c.cgroupManager.GetAllPids() if err != nil { - return nil, newSystemError(err) + return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups") } return pids, nil } @@ -149,14 +154,14 @@ func (c *linuxContainer) Stats() (*Stats, error) { stats = &Stats{} ) if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil { - return stats, newSystemError(err) + return stats, newSystemErrorWithCause(err, "getting container stats from cgroups") } for _, iface := range c.config.Networks { switch iface.Type { case "veth": istats, err := getNetworkInterfaceStats(iface.HostInterfaceName) if err != nil { - return stats, newSystemError(err) + return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName) } stats.Interfaces = append(stats.Interfaces, istats) } @@ -167,6 +172,13 @@ func (c *linuxContainer) Stats() (*Stats, error) { func (c *linuxContainer) Set(config configs.Config) error { c.m.Lock() defer c.m.Unlock() + status, err := c.currentStatus() + if err != nil { + return err + } + if status == Stopped { + return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning) + } c.config = &config return c.cgroupManager.Set(c.config) } @@ -178,38 +190,90 @@ func (c *linuxContainer) Start(process *Process) error { if err != nil { return err } - doInit := status == Destroyed - parent, err := c.newParentProcess(process, doInit) + return c.start(process, status == Stopped) +} + +func (c *linuxContainer) Run(process *Process) error { + c.m.Lock() + defer c.m.Unlock() + status, err := c.currentStatus() if err != nil { - return newSystemError(err) + return err + } + if err := c.start(process, status == Stopped); err != nil { + return err + } + if status == Stopped { + return c.exec() + } + return nil +} + +func (c *linuxContainer) Exec() error { + c.m.Lock() + defer c.m.Unlock() + return c.exec() +} + +func (c *linuxContainer) exec() error { + path := filepath.Join(c.root, execFifoFilename) + f, err := os.OpenFile(path, os.O_RDONLY, 0) + if err != nil { + return newSystemErrorWithCause(err, "open exec fifo for reading") + } + defer f.Close() + data, err := ioutil.ReadAll(f) + if err != nil { + return err + } + if len(data) > 0 { + os.Remove(path) + return nil + } + return fmt.Errorf("cannot start an already running container") +} + +func (c *linuxContainer) start(process *Process, isInit bool) error { + parent, err := c.newParentProcess(process, isInit) + if err != nil { + return newSystemErrorWithCause(err, "creating new parent process") } if err := parent.start(); err != nil { // terminate the process to ensure that it properly is reaped. if err := parent.terminate(); err != nil { logrus.Warn(err) } - return newSystemError(err) + return newSystemErrorWithCause(err, "starting container process") } + // generate a timestamp indicating when the container was started + c.created = time.Now().UTC() c.state = &runningState{ c: c, } - if doInit { - if err := c.updateState(parent); err != nil { + if isInit { + c.state = &createdState{ + c: c, + } + state, err := c.updateState(parent) + if err != nil { return err } + c.initProcessStartTime = state.InitProcessStartTime + if c.config.Hooks != nil { s := configs.HookState{ - Version: c.config.Version, - ID: c.id, - Pid: parent.pid(), - Root: c.config.Rootfs, + Version: c.config.Version, + ID: c.id, + Pid: parent.pid(), + Root: c.config.Rootfs, + BundlePath: utils.SearchLabels(c.config.Labels, "bundle"), } - for _, hook := range c.config.Hooks.Poststart { + for i, hook := range c.config.Hooks.Poststart { if err := hook.Run(s); err != nil { if err := parent.terminate(); err != nil { logrus.Warn(err) } - return newSystemError(err) + return newSystemErrorWithCausef(err, "running poststart hook %d", i) } } } @@ -219,7 +283,7 @@ func (c *linuxContainer) Start(process *Process) error { func (c *linuxContainer) Signal(s os.Signal) error { if err := c.initProcess.signal(s); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "signaling init process") } return nil } @@ -227,19 +291,23 @@ func (c *linuxContainer) Signal(s os.Signal) error { func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) { parentPipe, childPipe, err := newPipe() if err != nil { - return nil, newSystemError(err) + return nil, newSystemErrorWithCause(err, "creating new init pipe") } - cmd, err := c.commandTemplate(p, childPipe) + rootDir, err := os.Open(c.root) if err != nil { - return nil, newSystemError(err) + return nil, err + } + cmd, err := c.commandTemplate(p, childPipe, rootDir) + if err != nil { + return nil, newSystemErrorWithCause(err, "creating new command template") } if !doInit { - return c.newSetnsProcess(p, cmd, parentPipe, childPipe) + return c.newSetnsProcess(p, cmd, parentPipe, childPipe, rootDir) } - return c.newInitProcess(p, cmd, parentPipe, childPipe) + return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir) } -func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) { +func (c *linuxContainer) commandTemplate(p *Process, childPipe, rootDir *os.File) (*exec.Cmd, error) { cmd := &exec.Cmd{ Path: c.initPath, Args: c.initArgs, @@ -251,8 +319,10 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec. if cmd.SysProcAttr == nil { cmd.SysProcAttr = &syscall.SysProcAttr{} } - cmd.ExtraFiles = append(p.ExtraFiles, childPipe) - cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) + cmd.ExtraFiles = append(p.ExtraFiles, childPipe, rootDir) + cmd.Env = append(cmd.Env, + fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-2), + fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) // NOTE: when running a container with no PID namespace and the parent process spawning the container is // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason // even with the parent still running. @@ -262,38 +332,42 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec. return cmd, nil } -func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) { - t := "_LIBCONTAINER_INITTYPE=" + string(initStandard) - cloneFlags := c.config.Namespaces.CloneFlags() - if cloneFlags&syscall.CLONE_NEWUSER != 0 { - if err := c.addUidGidMappings(cmd.SysProcAttr); err != nil { - // user mappings are not supported - return nil, err - } - enableSetgroups(cmd.SysProcAttr) - // Default to root user when user namespaces are enabled. - if cmd.SysProcAttr.Credential == nil { - cmd.SysProcAttr.Credential = &syscall.Credential{} +func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) { + cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) + nsMaps := make(map[configs.NamespaceType]string) + for _, ns := range c.config.Namespaces { + if ns.Path != "" { + nsMaps[ns.Type] = ns.Path } } - cmd.Env = append(cmd.Env, t) - cmd.SysProcAttr.Cloneflags = cloneFlags + _, sharePidns := nsMaps[configs.NEWPID] + data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, "") + if err != nil { + return nil, err + } return &initProcess{ - cmd: cmd, - childPipe: childPipe, - parentPipe: parentPipe, - manager: c.cgroupManager, - config: c.newInitConfig(p), - container: c, - process: p, + cmd: cmd, + childPipe: childPipe, + parentPipe: parentPipe, + manager: c.cgroupManager, + config: c.newInitConfig(p), + container: c, + process: p, + bootstrapData: data, + sharePidns: sharePidns, + rootDir: rootDir, }, nil } -func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) { +func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*setnsProcess, error) { cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) + state, err := c.currentState() + if err != nil { + return nil, newSystemErrorWithCause(err, "getting container's current state") + } // for setns process, we dont have to set cloneflags as the process namespaces // will only be set via setns syscall - data, err := c.bootstrapData(0, c.initProcess.pid(), p.consolePath) + data, err := c.bootstrapData(0, state.NamespacePaths, p.consolePath) if err != nil { return nil, err } @@ -306,20 +380,41 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, config: c.newInitConfig(p), process: p, bootstrapData: data, + rootDir: rootDir, }, nil } func (c *linuxContainer) newInitConfig(process *Process) *initConfig { - return &initConfig{ + cfg := &initConfig{ Config: c.config, Args: process.Args, Env: process.Env, User: process.User, + AdditionalGroups: process.AdditionalGroups, Cwd: process.Cwd, Console: process.consolePath, Capabilities: process.Capabilities, PassedFilesCount: len(process.ExtraFiles), + ContainerId: c.ID(), + NoNewPrivileges: c.config.NoNewPrivileges, + AppArmorProfile: c.config.AppArmorProfile, + ProcessLabel: c.config.ProcessLabel, + Rlimits: c.config.Rlimits, + ExecFifoPath: filepath.Join(c.root, execFifoFilename), } + if process.NoNewPrivileges != nil { + cfg.NoNewPrivileges = *process.NoNewPrivileges + } + if process.AppArmorProfile != "" { + cfg.AppArmorProfile = process.AppArmorProfile + } + if process.Label != "" { + cfg.ProcessLabel = process.Label + } + if len(process.Rlimits) > 0 { + cfg.Rlimits = process.Rlimits + } + return cfg } func newPipe() (parent *os.File, child *os.File, err error) { @@ -343,15 +438,16 @@ func (c *linuxContainer) Pause() error { if err != nil { return err } - if status != Running { - return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning) + switch status { + case Running, Created: + if err := c.cgroupManager.Freeze(configs.Frozen); err != nil { + return err + } + return c.state.transition(&pausedState{ + c: c, + }) } - if err := c.cgroupManager.Freeze(configs.Frozen); err != nil { - return err - } - return c.state.transition(&pausedState{ - c: c, - }) + return newGenericError(fmt.Errorf("container not running: %s", status), ContainerNotRunning) } func (c *linuxContainer) Resume() error { @@ -380,23 +476,13 @@ func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struc return notifyMemoryPressure(c.cgroupManager.GetPaths(), level) } -// XXX debug support, remove when debugging done. -func addArgsFromEnv(evar string, args *[]string) { - if e := os.Getenv(evar); e != "" { - for _, f := range strings.Fields(e) { - *args = append(*args, f) - } - } - fmt.Printf(">>> criu %v\n", *args) -} - -// check Criu version greater than or equal to min_version -func (c *linuxContainer) checkCriuVersion(min_version string) error { +// checkCriuVersion checks Criu version greater than or equal to minVersion +func (c *linuxContainer) checkCriuVersion(minVersion string) error { var x, y, z, versionReq int - _, err := fmt.Sscanf(min_version, "%d.%d.%d\n", &x, &y, &z) // 1.5.2 + _, err := fmt.Sscanf(minVersion, "%d.%d.%d\n", &x, &y, &z) // 1.5.2 if err != nil { - _, err = fmt.Sscanf(min_version, "Version: %d.%d\n", &x, &y) // 1.6 + _, err = fmt.Sscanf(minVersion, "Version: %d.%d\n", &x, &y) // 1.6 } versionReq = x*10000 + y*100 + z @@ -441,7 +527,7 @@ func (c *linuxContainer) checkCriuVersion(min_version string) error { c.criuVersion = x*10000 + y*100 + z if c.criuVersion < versionReq { - return fmt.Errorf("CRIU version must be %s or higher", min_version) + return fmt.Errorf("CRIU version must be %s or higher", minVersion) } return nil @@ -514,6 +600,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { TcpEstablished: proto.Bool(criuOpts.TcpEstablished), ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), FileLocks: proto.Bool(criuOpts.FileLocks), + EmptyNs: proto.Uint32(criuOpts.EmptyNs), } // append optional criu opts, e.g., page-server and port @@ -529,7 +616,8 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { if err := c.checkCriuVersion("1.7"); err != nil { return err } - rpcOpts.ManageCgroupsMode = proto.Uint32(uint32(criuOpts.ManageCgroupsMode)) + mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode) + rpcOpts.ManageCgroupsMode = &mode } t := criurpc.CriuReqType_DUMP @@ -587,6 +675,27 @@ func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mo req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) } +func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) { + for _, iface := range c.config.Networks { + switch iface.Type { + case "veth": + veth := new(criurpc.CriuVethPair) + veth.IfOut = proto.String(iface.HostInterfaceName) + veth.IfIn = proto.String(iface.Name) + req.Opts.Veths = append(req.Opts.Veths, veth) + break + case "loopback": + break + } + } + for _, i := range criuOpts.VethPairs { + veth := new(criurpc.CriuVethPair) + veth.IfOut = proto.String(i.HostInterfaceName) + veth.IfIn = proto.String(i.ContainerInterfaceName) + req.Opts.Veths = append(req.Opts.Veths, veth) + } +} + func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { c.m.Lock() defer c.m.Unlock() @@ -650,6 +759,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), TcpEstablished: proto.Bool(criuOpts.TcpEstablished), FileLocks: proto.Bool(criuOpts.FileLocks), + EmptyNs: proto.Uint32(criuOpts.EmptyNs), }, } @@ -669,23 +779,9 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { break } } - for _, iface := range c.config.Networks { - switch iface.Type { - case "veth": - veth := new(criurpc.CriuVethPair) - veth.IfOut = proto.String(iface.HostInterfaceName) - veth.IfIn = proto.String(iface.Name) - req.Opts.Veths = append(req.Opts.Veths, veth) - break - case "loopback": - break - } - } - for _, i := range criuOpts.VethPairs { - veth := new(criurpc.CriuVethPair) - veth.IfOut = proto.String(i.HostInterfaceName) - veth.IfIn = proto.String(i.ContainerInterfaceName) - req.Opts.Veths = append(req.Opts.Veths, veth) + + if criuOpts.EmptyNs&syscall.CLONE_NEWNET == 0 { + c.restoreNetwork(req, criuOpts) } // append optional manage cgroups mode @@ -693,7 +789,8 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { if err := c.checkCriuVersion("1.7"); err != nil { return err } - req.Opts.ManageCgroupsMode = proto.Uint32(uint32(criuOpts.ManageCgroupsMode)) + mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode) + req.Opts.ManageCgroupsMode = &mode } var ( @@ -850,7 +947,7 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts * if err != nil { return err } - n, err = criuClient.Write(data) + _, err = criuClient.Write(data) if err != nil { return err } @@ -925,6 +1022,20 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc if err := lockNetwork(c.config); err != nil { return err } + case notify.GetScript() == "setup-namespaces": + if c.config.Hooks != nil { + s := configs.HookState{ + Version: c.config.Version, + ID: c.id, + Pid: int(notify.GetPid()), + Root: c.config.Rootfs, + } + for i, hook := range c.config.Hooks.Prestart { + if err := hook.Run(s); err != nil { + return newSystemErrorWithCausef(err, "running prestart hook %d", i) + } + } + } case notify.GetScript() == "post-restore": pid := notify.GetPid() r, err := newRestoredProcess(int(pid), fds) @@ -938,7 +1049,7 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc }); err != nil { return err } - if err := c.updateState(r); err != nil { + if _, err := c.updateState(r); err != nil { return err } if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil { @@ -950,13 +1061,17 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc return nil } -func (c *linuxContainer) updateState(process parentProcess) error { +func (c *linuxContainer) updateState(process parentProcess) (*State, error) { c.initProcess = process state, err := c.currentState() if err != nil { - return err + return nil, err } - return c.saveState(state) + err = c.saveState(state) + if err != nil { + return nil, err + } + return state, nil } func (c *linuxContainer) saveState(s *State) error { @@ -991,37 +1106,75 @@ func (c *linuxContainer) refreshState() error { if paused { return c.state.transition(&pausedState{c: c}) } - running, err := c.isRunning() + t, err := c.runType() if err != nil { return err } - if running { + switch t { + case Created: + return c.state.transition(&createdState{c: c}) + case Running: return c.state.transition(&runningState{c: c}) } return c.state.transition(&stoppedState{c: c}) } -func (c *linuxContainer) isRunning() (bool, error) { - if c.initProcess == nil { +// doesInitProcessExist checks if the init process is still the same process +// as the initial one, it could happen that the original process has exited +// and a new process has been created with the same pid, in this case, the +// container would already be stopped. +func (c *linuxContainer) doesInitProcessExist(initPid int) (bool, error) { + startTime, err := system.GetProcessStartTime(initPid) + if err != nil { + return false, newSystemErrorWithCausef(err, "getting init process %d start time", initPid) + } + if c.initProcessStartTime != startTime { return false, nil } - // return Running if the init process is alive - if err := syscall.Kill(c.initProcess.pid(), 0); err != nil { - if err == syscall.ESRCH { - return false, nil - } - return false, newSystemError(err) - } return true, nil } +func (c *linuxContainer) runType() (Status, error) { + if c.initProcess == nil { + return Stopped, nil + } + pid := c.initProcess.pid() + // return Running if the init process is alive + if err := syscall.Kill(pid, 0); err != nil { + if err == syscall.ESRCH { + // It means the process does not exist anymore, could happen when the + // process exited just when we call the function, we should not return + // error in this case. + return Stopped, nil + } + return Stopped, newSystemErrorWithCausef(err, "sending signal 0 to pid %d", pid) + } + // check if the process is still the original init process. + exist, err := c.doesInitProcessExist(pid) + if !exist || err != nil { + return Stopped, err + } + // check if the process that is running is the init process or the user's process. + // this is the difference between the container Running and Created. + environ, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/environ", pid)) + if err != nil { + return Stopped, newSystemErrorWithCausef(err, "reading /proc/%d/environ", pid) + } + check := []byte("_LIBCONTAINER") + if bytes.Contains(environ, check) { + return Created, nil + } + return Running, nil +} + func (c *linuxContainer) isPaused() (bool, error) { data, err := ioutil.ReadFile(filepath.Join(c.cgroupManager.GetPaths()["freezer"], "freezer.state")) if err != nil { + // If freezer cgroup is not mounted, the container would just be not paused. if os.IsNotExist(err) { return false, nil } - return false, newSystemError(err) + return false, newSystemErrorWithCause(err, "checking if container is paused") } return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil } @@ -1043,6 +1196,7 @@ func (c *linuxContainer) currentState() (*State, error) { Config: *c.config, InitProcessPid: pid, InitProcessStartTime: startTime, + Created: c.created, }, CgroupPaths: c.cgroupManager.GetPaths(), NamespacePaths: make(map[configs.NamespaceType]string), @@ -1053,6 +1207,9 @@ func (c *linuxContainer) currentState() (*State, error) { state.NamespacePaths[ns.Type] = ns.GetPath(pid) } for _, nsType := range configs.NamespaceTypes() { + if !configs.IsNamespaceSupported(nsType) { + continue + } if _, ok := state.NamespacePaths[nsType]; !ok { ns := configs.Namespace{Type: nsType} state.NamespacePaths[ns.Type] = ns.GetPath(pid) @@ -1062,18 +1219,69 @@ func (c *linuxContainer) currentState() (*State, error) { return state, nil } -// bootstrapData encodes the necessary data in netlink binary format as a io.Reader. -// Consumer can write the data to a bootstrap program such as one that uses -// nsenter package to bootstrap the container's init process correctly, i.e. with -// correct namespaces, uid/gid mapping etc. -func (c *linuxContainer) bootstrapData(cloneFlags uintptr, pid int, consolePath string) (io.Reader, error) { +// orderNamespacePaths sorts namespace paths into a list of paths that we +// can setns in order. +func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) { + paths := []string{} + nsTypes := []configs.NamespaceType{ + configs.NEWIPC, + configs.NEWUTS, + configs.NEWNET, + configs.NEWPID, + configs.NEWNS, + } + // join userns if the init process explicitly requires NEWUSER + if c.config.Namespaces.Contains(configs.NEWUSER) { + nsTypes = append(nsTypes, configs.NEWUSER) + } + for _, nsType := range nsTypes { + if p, ok := namespaces[nsType]; ok && p != "" { + // check if the requested namespace is supported + if !configs.IsNamespaceSupported(nsType) { + return nil, newSystemError(fmt.Errorf("namespace %s is not supported", nsType)) + } + // only set to join this namespace if it exists + if _, err := os.Lstat(p); err != nil { + return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p) + } + // do not allow namespace path with comma as we use it to separate + // the namespace paths + if strings.ContainsRune(p, ',') { + return nil, newSystemError(fmt.Errorf("invalid path %s", p)) + } + paths = append(paths, p) + } + } + return paths, nil +} + +func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) { + data := bytes.NewBuffer(nil) + for _, im := range idMap { + line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size) + if _, err := data.WriteString(line); err != nil { + return nil, err + } + } + return data.Bytes(), nil +} + +// bootstrapData encodes the necessary data in netlink binary format +// as a io.Reader. +// Consumer can write the data to a bootstrap program +// such as one that uses nsenter package to bootstrap the container's +// init process correctly, i.e. with correct namespaces, uid/gid +// mapping etc. +func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, consolePath string) (io.Reader, error) { // create the netlink message r := nl.NewNetlinkRequest(int(InitMsg), 0) - // write pid + + // write cloneFlags r.AddData(&Int32msg{ - Type: PidAttr, - Value: uint32(pid), + Type: CloneFlagsAttr, + Value: uint32(cloneFlags), }) + // write console path if consolePath != "" { r.AddData(&Bytemsg{ @@ -1081,5 +1289,57 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, pid int, consolePath Value: []byte(consolePath), }) } + + // write custom namespace paths + if len(nsMaps) > 0 { + nsPaths, err := c.orderNamespacePaths(nsMaps) + if err != nil { + return nil, err + } + r.AddData(&Bytemsg{ + Type: NsPathsAttr, + Value: []byte(strings.Join(nsPaths, ",")), + }) + } + + // write namespace paths only when we are not joining an existing user ns + _, joinExistingUser := nsMaps[configs.NEWUSER] + if !joinExistingUser { + // write uid mappings + if len(c.config.UidMappings) > 0 { + b, err := encodeIDMapping(c.config.UidMappings) + if err != nil { + return nil, err + } + r.AddData(&Bytemsg{ + Type: UidmapAttr, + Value: b, + }) + } + + // write gid mappings + if len(c.config.GidMappings) > 0 { + b, err := encodeIDMapping(c.config.GidMappings) + if err != nil { + return nil, err + } + r.AddData(&Bytemsg{ + Type: GidmapAttr, + Value: b, + }) + // check if we have CAP_SETGID to setgroup properly + pid, err := capability.NewPid(os.Getpid()) + if err != nil { + return nil, err + } + if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) { + r.AddData(&Boolmsg{ + Type: SetgroupAttr, + Value: true, + }) + } + } + } + return bytes.NewReader(r.Serialize()), nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container_nouserns_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/container_nouserns_linux.go deleted file mode 100644 index 3b75d593..00000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/container_nouserns_linux.go +++ /dev/null @@ -1,13 +0,0 @@ -// +build !go1.4 - -package libcontainer - -import ( - "fmt" - "syscall" -) - -// not available before go 1.4 -func (c *linuxContainer) addUidGidMappings(sys *syscall.SysProcAttr) error { - return fmt.Errorf("User namespace is not supported in golang < 1.4") -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container_solaris.go b/vendor/github.com/opencontainers/runc/libcontainer/container_solaris.go new file mode 100644 index 00000000..bb84ff74 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/container_solaris.go @@ -0,0 +1,20 @@ +package libcontainer + +// State represents a running container's state +type State struct { + BaseState + + // Platform specific fields below here +} + +// A libcontainer container object. +// +// Each container is thread-safe within the same process. Since a container can +// be destroyed by a separate process, any function may return that the container +// was not found. +type Container interface { + BaseContainer + + // Methods below here are platform specific + +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container_userns_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/container_userns_linux.go deleted file mode 100644 index 5f4cf3c9..00000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/container_userns_linux.go +++ /dev/null @@ -1,26 +0,0 @@ -// +build go1.4 - -package libcontainer - -import "syscall" - -// Converts IDMap to SysProcIDMap array and adds it to SysProcAttr. -func (c *linuxContainer) addUidGidMappings(sys *syscall.SysProcAttr) error { - if c.config.UidMappings != nil { - sys.UidMappings = make([]syscall.SysProcIDMap, len(c.config.UidMappings)) - for i, um := range c.config.UidMappings { - sys.UidMappings[i].ContainerID = um.ContainerID - sys.UidMappings[i].HostID = um.HostID - sys.UidMappings[i].Size = um.Size - } - } - if c.config.GidMappings != nil { - sys.GidMappings = make([]syscall.SysProcIDMap, len(c.config.GidMappings)) - for i, gm := range c.config.GidMappings { - sys.GidMappings[i].ContainerID = gm.ContainerID - sys.GidMappings[i].HostID = gm.HostID - sys.GidMappings[i].Size = gm.Size - } - } - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_unix.go index a2a816bd..b163fbbb 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_unix.go @@ -3,13 +3,13 @@ package libcontainer // cgroup restoring strategy provided by criu -type cg_mode uint32 +type cgMode uint32 const ( - CRIU_CG_MODE_SOFT cg_mode = 3 + iota // restore cgroup properties if only dir created by criu - CRIU_CG_MODE_FULL // always restore all cgroups and their properties - CRIU_CG_MODE_STRICT // restore all, requiring them to not present in the system - CRIU_CG_MODE_DEFAULT // the same as CRIU_CG_MODE_SOFT + CRIU_CG_MODE_SOFT cgMode = 3 + iota // restore cgroup properties if only dir created by criu + CRIU_CG_MODE_FULL // always restore all cgroups and their properties + CRIU_CG_MODE_STRICT // restore all, requiring them to not present in the system + CRIU_CG_MODE_DEFAULT // the same as CRIU_CG_MODE_SOFT ) type CriuPageServerInfo struct { @@ -32,5 +32,6 @@ type CriuOpts struct { FileLocks bool // handle file locks, for safety PageServer CriuPageServerInfo // allow to dump to criu page server VethPairs []VethPairName // pass the veth to criu when restore - ManageCgroupsMode cg_mode // dump or restore cgroup mode + ManageCgroupsMode cgMode // dump or restore cgroup mode + EmptyNs uint32 // don't c/r properties for namespace from this mask } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.pb.go b/vendor/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.pb.go index 193b6df1..3c4fb773 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.pb.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.pb.go @@ -19,6 +19,7 @@ It has these top-level messages: CriuDumpResp CriuRestoreResp CriuNotify + CriuFeatures CriuReq CriuResp */ @@ -31,6 +32,54 @@ import math "math" var _ = proto.Marshal var _ = math.Inf +type CriuCgMode int32 + +const ( + CriuCgMode_IGNORE CriuCgMode = 0 + CriuCgMode_NONE CriuCgMode = 1 + CriuCgMode_PROPS CriuCgMode = 2 + CriuCgMode_SOFT CriuCgMode = 3 + CriuCgMode_FULL CriuCgMode = 4 + CriuCgMode_STRICT CriuCgMode = 5 + CriuCgMode_DEFAULT CriuCgMode = 6 +) + +var CriuCgMode_name = map[int32]string{ + 0: "IGNORE", + 1: "NONE", + 2: "PROPS", + 3: "SOFT", + 4: "FULL", + 5: "STRICT", + 6: "DEFAULT", +} +var CriuCgMode_value = map[string]int32{ + "IGNORE": 0, + "NONE": 1, + "PROPS": 2, + "SOFT": 3, + "FULL": 4, + "STRICT": 5, + "DEFAULT": 6, +} + +func (x CriuCgMode) Enum() *CriuCgMode { + p := new(CriuCgMode) + *p = x + return p +} +func (x CriuCgMode) String() string { + return proto.EnumName(CriuCgMode_name, int32(x)) +} +func (x *CriuCgMode) UnmarshalJSON(data []byte) error { + value, err := proto.UnmarshalJSONEnum(CriuCgMode_value, data, "CriuCgMode") + if err != nil { + return err + } + *x = CriuCgMode(value) + return nil +} + type CriuReqType int32 const ( @@ -43,6 +92,7 @@ const ( CriuReqType_NOTIFY CriuReqType = 6 CriuReqType_CPUINFO_DUMP CriuReqType = 7 CriuReqType_CPUINFO_CHECK CriuReqType = 8 + CriuReqType_FEATURE_CHECK CriuReqType = 9 ) var CriuReqType_name = map[int32]string{ @@ -55,6 +105,7 @@ var CriuReqType_name = map[int32]string{ 6: "NOTIFY", 7: "CPUINFO_DUMP", 8: "CPUINFO_CHECK", + 9: "FEATURE_CHECK", } var CriuReqType_value = map[string]int32{ "EMPTY": 0, @@ -66,6 +117,7 @@ var CriuReqType_value = map[string]int32{ "NOTIFY": 6, "CPUINFO_DUMP": 7, "CPUINFO_CHECK": 8, + "FEATURE_CHECK": 9, } func (x CriuReqType) Enum() *CriuReqType { @@ -271,7 +323,12 @@ type CriuOpts struct { SkipMnt []string `protobuf:"bytes,31,rep,name=skip_mnt" json:"skip_mnt,omitempty"` EnableFs []string `protobuf:"bytes,32,rep,name=enable_fs" json:"enable_fs,omitempty"` UnixSkIno []*UnixSk `protobuf:"bytes,33,rep,name=unix_sk_ino" json:"unix_sk_ino,omitempty"` - ManageCgroupsMode *uint32 `protobuf:"varint,34,opt,name=manage_cgroups_mode" json:"manage_cgroups_mode,omitempty"` + ManageCgroupsMode *CriuCgMode `protobuf:"varint,34,opt,name=manage_cgroups_mode,enum=CriuCgMode" json:"manage_cgroups_mode,omitempty"` + GhostLimit *uint32 `protobuf:"varint,35,opt,name=ghost_limit,def=1048576" json:"ghost_limit,omitempty"` + IrmapScanPaths []string `protobuf:"bytes,36,rep,name=irmap_scan_paths" json:"irmap_scan_paths,omitempty"` + External []string `protobuf:"bytes,37,rep,name=external" json:"external,omitempty"` + EmptyNs *uint32 `protobuf:"varint,38,opt,name=empty_ns" json:"empty_ns,omitempty"` + NoSeccomp *bool `protobuf:"varint,39,opt,name=no_seccomp" json:"no_seccomp,omitempty"` XXX_unrecognized []byte `json:"-"` } @@ -281,6 +338,7 @@ func (*CriuOpts) ProtoMessage() {} const Default_CriuOpts_LogLevel int32 = 2 const Default_CriuOpts_CpuCap uint32 = 4294967295 +const Default_CriuOpts_GhostLimit uint32 = 1048576 func (m *CriuOpts) GetImagesDirFd() int32 { if m != nil && m.ImagesDirFd != nil { @@ -513,13 +571,48 @@ func (m *CriuOpts) GetUnixSkIno() []*UnixSk { return nil } -func (m *CriuOpts) GetManageCgroupsMode() uint32 { +func (m *CriuOpts) GetManageCgroupsMode() CriuCgMode { if m != nil && m.ManageCgroupsMode != nil { return *m.ManageCgroupsMode } + return CriuCgMode_IGNORE +} + +func (m *CriuOpts) GetGhostLimit() uint32 { + if m != nil && m.GhostLimit != nil { + return *m.GhostLimit + } + return Default_CriuOpts_GhostLimit +} + +func (m *CriuOpts) GetIrmapScanPaths() []string { + if m != nil { + return m.IrmapScanPaths + } + return nil +} + +func (m *CriuOpts) GetExternal() []string { + if m != nil { + return m.External + } + return nil +} + +func (m *CriuOpts) GetEmptyNs() uint32 { + if m != nil && m.EmptyNs != nil { + return *m.EmptyNs + } return 0 } +func (m *CriuOpts) GetNoSeccomp() bool { + if m != nil && m.NoSeccomp != nil { + return *m.NoSeccomp + } + return false +} + type CriuDumpResp struct { Restored *bool `protobuf:"varint,1,opt,name=restored" json:"restored,omitempty"` XXX_unrecognized []byte `json:"-"` @@ -576,6 +669,25 @@ func (m *CriuNotify) GetPid() int32 { return 0 } +// +// List of features which can queried via +// CRIU_REQ_TYPE__FEATURE_CHECK +type CriuFeatures struct { + MemTrack *bool `protobuf:"varint,1,opt,name=mem_track" json:"mem_track,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *CriuFeatures) Reset() { *m = CriuFeatures{} } +func (m *CriuFeatures) String() string { return proto.CompactTextString(m) } +func (*CriuFeatures) ProtoMessage() {} + +func (m *CriuFeatures) GetMemTrack() bool { + if m != nil && m.MemTrack != nil { + return *m.MemTrack + } + return false +} + type CriuReq struct { Type *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"` Opts *CriuOpts `protobuf:"bytes,2,opt,name=opts" json:"opts,omitempty"` @@ -584,8 +696,13 @@ type CriuReq struct { // When set service won't close the connection but // will wait for more req-s to appear. Works not // for all request types. - KeepOpen *bool `protobuf:"varint,4,opt,name=keep_open" json:"keep_open,omitempty"` - XXX_unrecognized []byte `json:"-"` + KeepOpen *bool `protobuf:"varint,4,opt,name=keep_open" json:"keep_open,omitempty"` + // + // 'features' can be used to query which features + // are supported by the installed criu/kernel + // via RPC. + Features *CriuFeatures `protobuf:"bytes,5,opt,name=features" json:"features,omitempty"` + XXX_unrecognized []byte `json:"-"` } func (m *CriuReq) Reset() { *m = CriuReq{} } @@ -620,6 +737,13 @@ func (m *CriuReq) GetKeepOpen() bool { return false } +func (m *CriuReq) GetFeatures() *CriuFeatures { + if m != nil { + return m.Features + } + return nil +} + type CriuResp struct { Type *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"` Success *bool `protobuf:"varint,2,req,name=success" json:"success,omitempty"` @@ -628,6 +752,7 @@ type CriuResp struct { Notify *CriuNotify `protobuf:"bytes,5,opt,name=notify" json:"notify,omitempty"` Ps *CriuPageServerInfo `protobuf:"bytes,6,opt,name=ps" json:"ps,omitempty"` CrErrno *int32 `protobuf:"varint,7,opt,name=cr_errno" json:"cr_errno,omitempty"` + Features *CriuFeatures `protobuf:"bytes,8,opt,name=features" json:"features,omitempty"` XXX_unrecognized []byte `json:"-"` } @@ -684,6 +809,14 @@ func (m *CriuResp) GetCrErrno() int32 { return 0 } +func (m *CriuResp) GetFeatures() *CriuFeatures { + if m != nil { + return m.Features + } + return nil +} + func init() { + proto.RegisterEnum("CriuCgMode", CriuCgMode_name, CriuCgMode_value) proto.RegisterEnum("CriuReqType", CriuReqType_name, CriuReqType_value) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.proto b/vendor/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.proto index f49325e2..34fa9884 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.proto +++ b/vendor/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.proto @@ -29,6 +29,16 @@ message unix_sk { required uint32 inode = 1; }; +enum criu_cg_mode { + IGNORE = 0; + NONE = 1; + PROPS = 2; + SOFT = 3; + FULL = 4; + STRICT = 5; + DEFAULT = 6; +}; + message criu_opts { required int32 images_dir_fd = 1; optional int32 pid = 2; /* if not set on dump, will dump requesting process */ @@ -75,7 +85,12 @@ message criu_opts { repeated unix_sk unix_sk_ino = 33; - optional uint32 manage_cgroups_mode = 34; + optional criu_cg_mode manage_cgroups_mode = 34; + optional uint32 ghost_limit = 35 [default = 0x100000]; + repeated string irmap_scan_paths = 36; + repeated string external = 37; + optional uint32 empty_ns = 38; + optional bool no_seccomp = 39; } message criu_dump_resp { @@ -103,6 +118,16 @@ enum criu_req_type { CPUINFO_DUMP = 7; CPUINFO_CHECK = 8; + + FEATURE_CHECK = 9; +} + +/* + * List of features which can queried via + * CRIU_REQ_TYPE__FEATURE_CHECK + */ +message criu_features { + optional bool mem_track = 1; } /* @@ -122,11 +147,17 @@ message criu_req { * for all request types. */ optional bool keep_open = 4; + /* + * 'features' can be used to query which features + * are supported by the installed criu/kernel + * via RPC. + */ + optional criu_features features = 5; } /* - * Responce -- it states whether the request was served - * and additional request-specific informarion + * Response -- it states whether the request was served + * and additional request-specific information */ message criu_resp { @@ -139,4 +170,5 @@ message criu_resp { optional criu_page_server_info ps = 6; optional int32 cr_errno = 7; + optional criu_features features = 8; } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/error.go b/vendor/github.com/opencontainers/runc/libcontainer/error.go index 378ef469..b0639270 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/error.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/error.go @@ -2,7 +2,7 @@ package libcontainer import "io" -// API error code type. +// ErrorCode is the API error code type. type ErrorCode int // API error codes. @@ -19,7 +19,7 @@ const ( ContainerNotPaused // Process errors - ProcessNotExecuted + NoProcessOps // Common errors ConfigInvalid @@ -49,12 +49,14 @@ func (c ErrorCode) String() string { return "Console exists for process" case ContainerNotPaused: return "Container is not paused" + case NoProcessOps: + return "No process operations" default: return "Unknown error" } } -// API Error type. +// Error is the API error type. type Error interface { error diff --git a/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go index 0e4e9dfd..6cce46e0 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go @@ -9,6 +9,7 @@ import ( "os/exec" "path/filepath" "regexp" + "runtime/debug" "strconv" "syscall" @@ -22,11 +23,12 @@ import ( ) const ( - stateFilename = "state.json" + stateFilename = "state.json" + execFifoFilename = "exec.fifo" ) var ( - idRegex = regexp.MustCompile(`^[\w_-]+$`) + idRegex = regexp.MustCompile(`^[\w+-\.]+$`) maxIdLen = 1024 ) @@ -101,6 +103,15 @@ func TmpfsRoot(l *LinuxFactory) error { return nil } +// CriuPath returns an option func to configure a LinuxFactory with the +// provided criupath +func CriuPath(criupath string) func(*LinuxFactory) error { + return func(l *LinuxFactory) error { + l.CriuPath = criupath + return nil + } +} + // New returns a linux based container factory based in the root directory and // configures the factory with the provided option funcs. func New(root string, options ...func(*LinuxFactory) error) (Factory, error) { @@ -157,13 +168,34 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err if err := l.Validator.Validate(config); err != nil { return nil, newGenericError(err, ConfigInvalid) } + uid, err := config.HostUID() + if err != nil { + return nil, newGenericError(err, SystemError) + } + gid, err := config.HostGID() + if err != nil { + return nil, newGenericError(err, SystemError) + } containerRoot := filepath.Join(l.Root, id) if _, err := os.Stat(containerRoot); err == nil { return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse) } else if !os.IsNotExist(err) { return nil, newGenericError(err, SystemError) } - if err := os.MkdirAll(containerRoot, 0700); err != nil { + if err := os.MkdirAll(containerRoot, 0711); err != nil { + return nil, newGenericError(err, SystemError) + } + if err := os.Chown(containerRoot, uid, gid); err != nil { + return nil, newGenericError(err, SystemError) + } + fifoName := filepath.Join(containerRoot, execFifoFilename) + oldMask := syscall.Umask(0000) + if err := syscall.Mkfifo(fifoName, 0622); err != nil { + syscall.Umask(oldMask) + return nil, newGenericError(err, SystemError) + } + syscall.Umask(oldMask) + if err := os.Chown(fifoName, uid, gid); err != nil { return nil, newGenericError(err, SystemError) } c := &linuxContainer{ @@ -194,16 +226,18 @@ func (l *LinuxFactory) Load(id string) (Container, error) { fds: state.ExternalDescriptors, } c := &linuxContainer{ - initProcess: r, - id: id, - config: &state.Config, - initPath: l.InitPath, - initArgs: l.InitArgs, - criuPath: l.CriuPath, - cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths), - root: containerRoot, + initProcess: r, + initProcessStartTime: state.InitProcessStartTime, + id: id, + config: &state.Config, + initPath: l.InitPath, + initArgs: l.InitArgs, + criuPath: l.CriuPath, + cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths), + root: containerRoot, + created: state.Created, } - c.state = &createdState{c: c, s: Created} + c.state = &loadedState{c: c} if err := c.refreshState(); err != nil { return nil, err } @@ -217,10 +251,18 @@ func (l *LinuxFactory) Type() string { // StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state // This is a low level implementation detail of the reexec and should not be consumed externally func (l *LinuxFactory) StartInitialization() (err error) { - fdStr := os.Getenv("_LIBCONTAINER_INITPIPE") - pipefd, err := strconv.Atoi(fdStr) - if err != nil { - return fmt.Errorf("error converting env var _LIBCONTAINER_INITPIPE(%q) to an int: %s", fdStr, err) + var pipefd, rootfd int + for k, v := range map[string]*int{ + "_LIBCONTAINER_INITPIPE": &pipefd, + "_LIBCONTAINER_STATEDIR": &rootfd, + } { + s := os.Getenv(k) + + i, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("unable to convert %s=%s to int", k, s) + } + *v = i } var ( pipe = os.NewFile(uintptr(pipefd), "pipe") @@ -229,29 +271,31 @@ func (l *LinuxFactory) StartInitialization() (err error) { // clear the current process's environment to clean any libcontainer // specific env vars. os.Clearenv() + var i initer defer func() { - // if we have an error during the initialization of the container's init then send it back to the - // parent process in the form of an initError. - if err != nil { - if _, ok := i.(*linuxStandardInit); ok { - // Synchronisation only necessary for standard init. - if err := utils.WriteJSON(pipe, syncT{procError}); err != nil { - panic(err) - } - } - if err := utils.WriteJSON(pipe, newSystemError(err)); err != nil { - panic(err) - } - } else { - if err := utils.WriteJSON(pipe, syncT{procStart}); err != nil { + // We have an error during the initialization of the container's init, + // send it back to the parent process in the form of an initError. + // If container's init successed, syscall.Exec will not return, hence + // this defer function will never be called. + if _, ok := i.(*linuxStandardInit); ok { + // Synchronisation only necessary for standard init. + if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil { panic(err) } } + if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil { + panic(err) + } // ensure that this pipe is always closed pipe.Close() }() - i, err = newContainerInit(it, pipe) + defer func() { + if e := recover(); e != nil { + err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack())) + } + }() + i, err = newContainerInit(it, pipe, rootfd) if err != nil { return err } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/generic_error.go b/vendor/github.com/opencontainers/runc/libcontainer/generic_error.go index 924d637b..9c3d3249 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/generic_error.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/generic_error.go @@ -14,8 +14,9 @@ type syncType uint8 const ( procReady syncType = iota procError - procStart procRun + procHooks + procResume ) type syncT struct { @@ -51,6 +52,21 @@ func newGenericError(err error, c ErrorCode) Error { } func newSystemError(err error) Error { + return createSystemError(err, "") +} + +func newSystemErrorWithCausef(err error, cause string, v ...interface{}) Error { + return createSystemError(err, fmt.Sprintf(cause, v...)) +} + +func newSystemErrorWithCause(err error, cause string) Error { + return createSystemError(err, cause) +} + +// createSystemError creates the specified error with the correct number of +// stack frames skipped. This is only to be called by the other functions for +// formatting the error. +func createSystemError(err error, cause string) Error { if le, ok := err.(Error); ok { return le } @@ -58,7 +74,8 @@ func newSystemError(err error) Error { Timestamp: time.Now(), Err: err, ECode: SystemError, - Stack: stacktrace.Capture(1), + Cause: cause, + Stack: stacktrace.Capture(2), } if err != nil { gerr.Message = err.Error() @@ -70,12 +87,17 @@ type genericError struct { Timestamp time.Time ECode ErrorCode Err error `json:"-"` + Cause string Message string Stack stacktrace.Stacktrace } func (e *genericError) Error() string { - return fmt.Sprintf("[%d] %s: %s", e.ECode, e.ECode, e.Message) + if e.Cause == "" { + return e.Message + } + frame := e.Stack.Frames[0] + return fmt.Sprintf("%s:%d: %s caused %q", frame.File, frame.Line, e.Cause, e.Message) } func (e *genericError) Code() ErrorCode { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go index 918f1030..01ff0d13 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go @@ -44,22 +44,29 @@ type network struct { // initConfig is used for transferring parameters from Exec() to Init() type initConfig struct { - Args []string `json:"args"` - Env []string `json:"env"` - Cwd string `json:"cwd"` - Capabilities []string `json:"capabilities"` - User string `json:"user"` - Config *configs.Config `json:"config"` - Console string `json:"console"` - Networks []*network `json:"network"` - PassedFilesCount int `json:"passed_files_count"` + Args []string `json:"args"` + Env []string `json:"env"` + Cwd string `json:"cwd"` + Capabilities []string `json:"capabilities"` + ProcessLabel string `json:"process_label"` + AppArmorProfile string `json:"apparmor_profile"` + NoNewPrivileges bool `json:"no_new_privileges"` + User string `json:"user"` + AdditionalGroups []string `json:"additional_groups"` + Config *configs.Config `json:"config"` + Console string `json:"console"` + Networks []*network `json:"network"` + PassedFilesCount int `json:"passed_files_count"` + ContainerId string `json:"containerid"` + Rlimits []configs.Rlimit `json:"rlimits"` + ExecFifoPath string `json:"start_pipe_path"` } type initer interface { Init() error } -func newContainerInit(t initType, pipe *os.File) (initer, error) { +func newContainerInit(t initType, pipe *os.File, stateDirFD int) (initer, error) { var config *initConfig if err := json.NewDecoder(pipe).Decode(&config); err != nil { return nil, err @@ -74,9 +81,10 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) { }, nil case initStandard: return &linuxStandardInit{ - pipe: pipe, - parentPid: syscall.Getppid(), - config: config, + pipe: pipe, + parentPid: syscall.Getppid(), + config: config, + stateDirFD: stateDirFD, }, nil } return nil, fmt.Errorf("unknown init type %q", t) @@ -163,20 +171,22 @@ func syncParentReady(pipe io.ReadWriter) error { return nil } -// joinExistingNamespaces gets all the namespace paths specified for the container and -// does a setns on the namespace fd so that the current process joins the namespace. -func joinExistingNamespaces(namespaces []configs.Namespace) error { - for _, ns := range namespaces { - if ns.Path != "" { - f, err := os.OpenFile(ns.Path, os.O_RDONLY, 0) - if err != nil { - return err - } - err = system.Setns(f.Fd(), uintptr(ns.Syscall())) - f.Close() - if err != nil { - return err - } +// syncParentHooks sends to the given pipe a JSON payload which indicates that +// the parent should execute pre-start hooks. It then waits for the parent to +// indicate that it is cleared to resume. +func syncParentHooks(pipe io.ReadWriter) error { + // Tell parent. + if err := utils.WriteJSON(pipe, syncT{procHooks}); err != nil { + return err + } + // Wait for parent to give the all-clear. + var procSync syncT + if err := json.NewDecoder(pipe).Decode(&procSync); err != nil { + if err == io.EOF { + return fmt.Errorf("parent closed synchronisation channel") + } + if procSync.Type != procResume { + return fmt.Errorf("invalid synchronisation flag from parent") } } return nil @@ -204,8 +214,8 @@ func setupUser(config *initConfig) error { } var addGroups []int - if len(config.Config.AdditionalGroups) > 0 { - addGroups, err = user.GetAdditionalGroupsPath(config.Config.AdditionalGroups, groupPath) + if len(config.AdditionalGroups) > 0 { + addGroups, err = user.GetAdditionalGroupsPath(config.AdditionalGroups, groupPath) if err != nil { return err } @@ -309,19 +319,19 @@ func setupRoute(config *configs.Config) error { return nil } -func setupRlimits(config *configs.Config) error { - for _, rlimit := range config.Rlimits { - l := &syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft} - if err := syscall.Setrlimit(rlimit.Type, l); err != nil { +func setupRlimits(limits []configs.Rlimit, pid int) error { + for _, rlimit := range limits { + if err := system.Prlimit(pid, rlimit.Type, syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}); err != nil { return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err) } } return nil } -func setOomScoreAdj(oomScoreAdj int) error { - path := "/proc/self/oom_score_adj" - return ioutil.WriteFile(path, []byte(strconv.Itoa(oomScoreAdj)), 0700) +func setOomScoreAdj(oomScoreAdj int, pid int) error { + path := fmt.Sprintf("/proc/%d/oom_score_adj", pid) + + return ioutil.WriteFile(path, []byte(strconv.Itoa(oomScoreAdj)), 0600) } // killCgroupProcesses freezes then iterates over all the processes inside the @@ -338,11 +348,14 @@ func killCgroupProcesses(m cgroups.Manager) error { return err } for _, pid := range pids { - if p, err := os.FindProcess(pid); err == nil { - procs = append(procs, p) - if err := p.Kill(); err != nil { - logrus.Warn(err) - } + p, err := os.FindProcess(pid) + if err != nil { + logrus.Warn(err) + continue + } + procs = append(procs, p) + if err := p.Kill(); err != nil { + logrus.Warn(err) } } if err := m.Freeze(configs.Thawed); err != nil { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go b/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go new file mode 100644 index 00000000..c67fd15b --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go @@ -0,0 +1,66 @@ +// +build linux + +package keyctl + +import ( + "fmt" + "strconv" + "strings" + "syscall" + "unsafe" +) + +const KEYCTL_JOIN_SESSION_KEYRING = 1 +const KEYCTL_SETPERM = 5 +const KEYCTL_DESCRIBE = 6 + +type KeySerial uint32 + +func JoinSessionKeyring(name string) (KeySerial, error) { + var _name *byte + var err error + + if len(name) > 0 { + _name, err = syscall.BytePtrFromString(name) + if err != nil { + return KeySerial(0), err + } + } + + sessKeyId, _, errn := syscall.Syscall(syscall.SYS_KEYCTL, KEYCTL_JOIN_SESSION_KEYRING, uintptr(unsafe.Pointer(_name)), 0) + if errn != 0 { + return 0, fmt.Errorf("could not create session key: %v", errn) + } + return KeySerial(sessKeyId), nil +} + +// ModKeyringPerm modifies permissions on a keyring by reading the current permissions, +// anding the bits with the given mask (clearing permissions) and setting +// additional permission bits +func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error { + dest := make([]byte, 1024) + destBytes := unsafe.Pointer(&dest[0]) + + if _, _, err := syscall.Syscall6(syscall.SYS_KEYCTL, uintptr(KEYCTL_DESCRIBE), uintptr(ringId), uintptr(destBytes), uintptr(len(dest)), 0, 0); err != 0 { + return err + } + + res := strings.Split(string(dest), ";") + if len(res) < 5 { + return fmt.Errorf("Destination buffer for key description is too small") + } + + // parse permissions + perm64, err := strconv.ParseUint(res[3], 16, 32) + if err != nil { + return err + } + + perm := (uint32(perm64) & mask) | setbits + + if _, _, err := syscall.Syscall(syscall.SYS_KEYCTL, uintptr(KEYCTL_SETPERM), uintptr(ringId), uintptr(perm)); err != 0 { + return err + } + + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/label/label.go b/vendor/github.com/opencontainers/runc/libcontainer/label/label.go index 97dc6bae..684bb41b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/label/label.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/label/label.go @@ -21,6 +21,10 @@ func SetProcessLabel(processLabel string) error { return nil } +func GetFileLabel(path string) (string, error) { + return "", nil +} + func SetFileLabel(path string, fileLabel string) error { return nil } @@ -48,7 +52,7 @@ func UnreserveLabel(label string) error { return nil } -// DupSecOpt takes an process label and returns security options that +// DupSecOpt takes a process label and returns security options that // can be used to set duplicate labels on future container processes func DupSecOpt(src string) []string { return nil diff --git a/vendor/github.com/opencontainers/runc/libcontainer/label/label_selinux.go b/vendor/github.com/opencontainers/runc/libcontainer/label/label_selinux.go index e561cbfe..4493bda7 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/label/label_selinux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/label/label_selinux.go @@ -94,6 +94,11 @@ func GetProcessLabel() (string, error) { return selinux.Getexeccon() } +// GetFileLabel returns the label for specified path +func GetFileLabel(path string) (string, error) { + return selinux.Getfilecon(path) +} + // SetFileLabel modifies the "path" label to the specified file label func SetFileLabel(path string, fileLabel string) error { if selinux.SelinuxEnabled() && fileLabel != "" { @@ -102,7 +107,7 @@ func SetFileLabel(path string, fileLabel string) error { return nil } -// Tell the kernel the label for all files to be created +// SetFileCreateLabel tells the kernel the label for all files to be created func SetFileCreateLabel(fileLabel string) error { if selinux.SelinuxEnabled() { return selinux.Setfscreatecon(fileLabel) @@ -110,7 +115,7 @@ func SetFileCreateLabel(fileLabel string) error { return nil } -// Change the label of path to the filelabel string. +// Relabel changes the label of path to the filelabel string. // It changes the MCS label to s0 if shared is true. // This will allow all containers to share the content. func Relabel(path string, fileLabel string, shared bool) error { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go index 0c3301f2..400bd362 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go @@ -12,8 +12,12 @@ import ( // The number is randomly chosen to not conflict with known netlink types const ( InitMsg uint16 = 62000 - PidAttr uint16 = 27281 + CloneFlagsAttr uint16 = 27281 ConsolePathAttr uint16 = 27282 + NsPathsAttr uint16 = 27283 + UidmapAttr uint16 = 27284 + GidmapAttr uint16 = 27285 + SetgroupAttr uint16 = 27286 // When syscall.NLA_HDRLEN is in gccgo, take this out. syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1) ) @@ -23,7 +27,8 @@ type Int32msg struct { Value uint32 } -// int32msg has the following representation +// Serialize serializes the message. +// Int32msg has the following representation // | nlattr len | nlattr type | // | uint32 value | func (msg *Int32msg) Serialize() []byte { @@ -39,7 +44,7 @@ func (msg *Int32msg) Len() int { return syscall_NLA_HDRLEN + 4 } -// bytemsg has the following representation +// Bytemsg has the following representation // | nlattr len | nlattr type | // | value | pad | type Bytemsg struct { @@ -60,3 +65,25 @@ func (msg *Bytemsg) Serialize() []byte { func (msg *Bytemsg) Len() int { return syscall_NLA_HDRLEN + len(msg.Value) + 1 // null-terminated } + +type Boolmsg struct { + Type uint16 + Value bool +} + +func (msg *Boolmsg) Serialize() []byte { + buf := make([]byte, msg.Len()) + native := nl.NativeEndian() + native.PutUint16(buf[0:2], uint16(msg.Len())) + native.PutUint16(buf[2:4], msg.Type) + if msg.Value { + buf[4] = 1 + } else { + buf[4] = 0 + } + return buf +} + +func (msg *Boolmsg) Len() int { + return syscall_NLA_HDRLEN + 1 +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/process.go b/vendor/github.com/opencontainers/runc/libcontainer/process.go index 9661df80..334add57 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/process.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/process.go @@ -5,6 +5,8 @@ import ( "io" "math" "os" + + "github.com/opencontainers/runc/libcontainer/configs" ) type processOperations interface { @@ -26,6 +28,10 @@ type Process struct { // local to the container's user and group configuration. User string + // AdditionalGroups specifies the gids that should be added to supplementary groups + // in addition to those that the user belongs to. + AdditionalGroups []string + // Cwd will change the processes current working directory inside the container's rootfs. Cwd string @@ -48,6 +54,20 @@ type Process struct { // All capabilities not specified will be dropped from the processes capability mask Capabilities []string + // AppArmorProfile specifies the profile to apply to the process and is + // changed at the time the process is execed + AppArmorProfile string + + // Label specifies the label to apply to the process. It is commonly used by selinux + Label string + + // NoNewPrivileges controls whether processes can gain additional privileges. + NoNewPrivileges *bool + + // Rlimits specifies the resource limits, such as max open files, to set in the container + // If Rlimits are not set, the container will inherit rlimits from the parent process + Rlimits []configs.Rlimit + ops processOperations } @@ -55,7 +75,7 @@ type Process struct { // Wait releases any resources associated with the Process func (p Process) Wait() (*os.ProcessState, error) { if p.ops == nil { - return nil, newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted) + return nil, newGenericError(fmt.Errorf("invalid process"), NoProcessOps) } return p.ops.wait() } @@ -65,7 +85,7 @@ func (p Process) Pid() (int, error) { // math.MinInt32 is returned here, because it's invalid value // for the kill() system call. if p.ops == nil { - return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted) + return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), NoProcessOps) } return p.ops.pid(), nil } @@ -73,7 +93,7 @@ func (p Process) Pid() (int, error) { // Signal sends a signal to the Process. func (p Process) Signal(sig os.Signal) error { if p.ops == nil { - return newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted) + return newGenericError(fmt.Errorf("invalid process"), NoProcessOps) } return p.ops.signal(sig) } @@ -86,8 +106,8 @@ type IO struct { } // NewConsole creates new console for process and returns it -func (p *Process) NewConsole(rootuid int) (Console, error) { - console, err := NewConsole(rootuid, rootuid) +func (p *Process) NewConsole(rootuid, rootgid int) (Console, error) { + console, err := NewConsole(rootuid, rootgid) if err != nil { return nil, err } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go index ac457c26..33db3923 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go @@ -51,6 +51,7 @@ type setnsProcess struct { fds []string process *Process bootstrapData io.Reader + rootDir *os.File } func (p *setnsProcess) startTime() (string, error) { @@ -69,39 +70,49 @@ func (p *setnsProcess) start() (err error) { defer p.parentPipe.Close() err = p.cmd.Start() p.childPipe.Close() + p.rootDir.Close() if err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "starting setns process") } if p.bootstrapData != nil { if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "copying bootstrap data to pipe") } } if err = p.execSetns(); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "executing setns process") } if len(p.cgroupPaths) > 0 { if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil { - return newSystemError(err) + return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid()) } } + // set oom_score_adj + if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil { + return newSystemErrorWithCause(err, "setting oom score") + } + // set rlimits, this has to be done here because we lose permissions + // to raise the limits once we enter a user-namespace + if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { + return newSystemErrorWithCause(err, "setting rlimits for process") + } if err := utils.WriteJSON(p.parentPipe, p.config); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "writing config to pipe") } if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "calling shutdown on init pipe") } // wait for the child process to fully complete and receive an error message // if one was encoutered var ierr *genericError if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { - return newSystemError(err) + return newSystemErrorWithCause(err, "decoding init error from pipe") } // Must be done after Shutdown so the child will exit and we can wait for it. if ierr != nil { p.wait() - return newSystemError(ierr) + return ierr } return nil } @@ -114,7 +125,7 @@ func (p *setnsProcess) execSetns() error { status, err := p.cmd.Process.Wait() if err != nil { p.cmd.Wait() - return newSystemError(err) + return newSystemErrorWithCause(err, "waiting on setns process to finish") } if !status.Success() { p.cmd.Wait() @@ -123,7 +134,7 @@ func (p *setnsProcess) execSetns() error { var pid *pid if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil { p.cmd.Wait() - return newSystemError(err) + return newSystemErrorWithCause(err, "reading pid from init pipe") } process, err := os.FindProcess(pid.Pid) if err != nil { @@ -167,14 +178,17 @@ func (p *setnsProcess) setExternalDescriptors(newFds []string) { } type initProcess struct { - cmd *exec.Cmd - parentPipe *os.File - childPipe *os.File - config *initConfig - manager cgroups.Manager - container *linuxContainer - fds []string - process *Process + cmd *exec.Cmd + parentPipe *os.File + childPipe *os.File + config *initConfig + manager cgroups.Manager + container *linuxContainer + fds []string + process *Process + bootstrapData io.Reader + sharePidns bool + rootDir *os.File } func (p *initProcess) pid() int { @@ -185,27 +199,63 @@ func (p *initProcess) externalDescriptors() []string { return p.fds } -func (p *initProcess) start() (err error) { +// execSetns runs the process that executes C code to perform the setns calls +// because setns support requires the C process to fork off a child and perform the setns +// before the go runtime boots, we wait on the process to die and receive the child's pid +// over the provided pipe. +// This is called by initProcess.start function +func (p *initProcess) execSetns() error { + status, err := p.cmd.Process.Wait() + if err != nil { + p.cmd.Wait() + return err + } + if !status.Success() { + p.cmd.Wait() + return &exec.ExitError{ProcessState: status} + } + var pid *pid + if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil { + p.cmd.Wait() + return err + } + process, err := os.FindProcess(pid.Pid) + if err != nil { + return err + } + p.cmd.Process = process + p.process.ops = p + return nil +} + +func (p *initProcess) start() error { defer p.parentPipe.Close() - err = p.cmd.Start() + err := p.cmd.Start() p.process.ops = p p.childPipe.Close() + p.rootDir.Close() if err != nil { p.process.ops = nil - return newSystemError(err) + return newSystemErrorWithCause(err, "starting init process command") + } + if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { + return err + } + if err := p.execSetns(); err != nil { + return newSystemErrorWithCause(err, "running exec setns process for init") } // Save the standard descriptor names before the container process // can potentially move them (e.g., via dup2()). If we don't do this now, // we won't know at checkpoint time which file descriptor to look up. fds, err := getPipeFds(p.pid()) if err != nil { - return newSystemError(err) + return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid()) } p.setExternalDescriptors(fds) // Do this before syncing with child so that no children // can escape the cgroup if err := p.manager.Apply(p.pid()); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "applying cgroup configuration for process") } defer func() { if err != nil { @@ -213,56 +263,88 @@ func (p *initProcess) start() (err error) { p.manager.Destroy() } }() - if p.config.Config.Hooks != nil { - s := configs.HookState{ - Version: p.container.config.Version, - ID: p.container.id, - Pid: p.pid(), - Root: p.config.Config.Rootfs, - } - for _, hook := range p.config.Config.Hooks.Prestart { - if err := hook.Run(s); err != nil { - return newSystemError(err) - } - } - } if err := p.createNetworkInterfaces(); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "creating nework interfaces") } if err := p.sendConfig(); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "sending config to init process") } var ( - procSync syncT - sentRun bool - ierr *genericError + procSync syncT + sentRun bool + sentResume bool + ierr *genericError ) + dec := json.NewDecoder(p.parentPipe) loop: for { - if err := json.NewDecoder(p.parentPipe).Decode(&procSync); err != nil { + if err := dec.Decode(&procSync); err != nil { if err == io.EOF { break loop } - return newSystemError(err) + return newSystemErrorWithCause(err, "decoding sync type from init pipe") } switch procSync.Type { - case procStart: - break loop case procReady: if err := p.manager.Set(p.config.Config); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "setting cgroup config for ready process") + } + // set oom_score_adj + if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil { + return newSystemErrorWithCause(err, "setting oom score for ready process") + } + // set rlimits, this has to be done here because we lose permissions + // to raise the limits once we enter a user-namespace + if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { + return newSystemErrorWithCause(err, "setting rlimits for ready process") + } + // call prestart hooks + if !p.config.Config.Namespaces.Contains(configs.NEWNS) { + if p.config.Config.Hooks != nil { + s := configs.HookState{ + Version: p.container.config.Version, + ID: p.container.id, + Pid: p.pid(), + Root: p.config.Config.Rootfs, + } + for i, hook := range p.config.Config.Hooks.Prestart { + if err := hook.Run(s); err != nil { + return newSystemErrorWithCausef(err, "running prestart hook %d", i) + } + } + } } // Sync with child. if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "reading syncT run type") } sentRun = true + case procHooks: + if p.config.Config.Hooks != nil { + s := configs.HookState{ + Version: p.container.config.Version, + ID: p.container.id, + Pid: p.pid(), + Root: p.config.Config.Rootfs, + BundlePath: utils.SearchLabels(p.config.Config.Labels, "bundle"), + } + for i, hook := range p.config.Config.Hooks.Prestart { + if err := hook.Run(s); err != nil { + return newSystemErrorWithCausef(err, "running prestart hook %d", i) + } + } + } + // Sync with child. + if err := utils.WriteJSON(p.parentPipe, syncT{procResume}); err != nil { + return newSystemErrorWithCause(err, "reading syncT resume type") + } + sentResume = true case procError: // wait for the child process to fully complete and receive an error message // if one was encoutered - if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { - return newSystemError(err) + if err := dec.Decode(&ierr); err != nil && err != io.EOF { + return newSystemErrorWithCause(err, "decoding proc error from init") } if ierr != nil { break loop @@ -270,19 +352,22 @@ loop: // Programmer error. panic("No error following JSON procError payload.") default: - return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child")) + return newSystemError(fmt.Errorf("invalid JSON payload from child")) } } if !sentRun { - return newSystemError(fmt.Errorf("could not synchronise with container process")) + return newSystemErrorWithCause(ierr, "container init failed") + } + if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume { + return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process")) } if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "shutting down init pipe") } // Must be done after Shutdown so the child will exit and we can wait for it. if ierr != nil { p.wait() - return newSystemError(ierr) + return ierr } return nil } @@ -293,7 +378,7 @@ func (p *initProcess) wait() (*os.ProcessState, error) { return p.cmd.ProcessState, err } // we should kill all processes in cgroup when init is died if we use host PID namespace - if p.cmd.SysProcAttr.Cloneflags&syscall.CLONE_NEWPID == 0 { + if p.sharePidns { killCgroupProcesses(p.manager) } return p.cmd.ProcessState, nil @@ -315,7 +400,9 @@ func (p *initProcess) startTime() (string, error) { } func (p *initProcess) sendConfig() error { - // send the state to the container's init process then shutdown writes for the parent + // send the config to the container's init process, we don't use JSON Encode + // here because there might be a problem in JSON decoder in some cases, see: + // https://github.com/docker/docker/issues/14203#issuecomment-174177790 return utils.WriteJSON(p.parentPipe, p.config) } @@ -365,7 +452,7 @@ func getPipeFds(pid int) ([]string, error) { // InitializeIO creates pipes for use with the process's STDIO // and returns the opposite side for each -func (p *Process) InitializeIO(rootuid int) (i *IO, err error) { +func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) { var fds []uintptr i = &IO{} // cleanup in case of an error @@ -397,7 +484,7 @@ func (p *Process) InitializeIO(rootuid int) (i *IO, err error) { p.Stderr, i.Stderr = w, r // change ownership of the pipes incase we are in a user namespace for _, fd := range fds { - if err := syscall.Fchown(int(fd), rootuid, rootuid); err != nil { + if err := syscall.Fchown(int(fd), rootuid, rootgid); err != nil { return nil, err } } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go index aa061ab0..943b2fc0 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go @@ -4,6 +4,7 @@ package libcontainer import ( "fmt" + "io" "io/ioutil" "os" "os/exec" @@ -19,47 +20,65 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/label" "github.com/opencontainers/runc/libcontainer/system" + libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" ) const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV +// needsSetupDev returns true if /dev needs to be set up. +func needsSetupDev(config *configs.Config) bool { + for _, m := range config.Mounts { + if m.Device == "bind" && libcontainerUtils.CleanPath(m.Destination) == "/dev" { + return false + } + } + return true +} + // setupRootfs sets up the devices, mount points, and filesystems for use inside a // new mount namespace. -func setupRootfs(config *configs.Config, console *linuxConsole) (err error) { +func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWriter) (err error) { if err := prepareRoot(config); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "preparing rootfs") } - setupDev := len(config.Devices) != 0 + setupDev := needsSetupDev(config) for _, m := range config.Mounts { for _, precmd := range m.PremountCmds { if err := mountCmd(precmd); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "running premount command") } } if err := mountToRootfs(m, config.Rootfs, config.MountLabel); err != nil { - return newSystemError(err) + return newSystemErrorWithCausef(err, "mounting %q to rootfs %q", m.Destination, config.Rootfs) } for _, postcmd := range m.PostmountCmds { if err := mountCmd(postcmd); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "running postmount command") } } } if setupDev { if err := createDevices(config); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "creating device nodes") } if err := setupPtmx(config, console); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "setting up ptmx") } if err := setupDevSymlinks(config.Rootfs); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "setting up /dev symlinks") } } + // Signal the parent to run the pre-start hooks. + // The hooks are run after the mounts are setup, but before we switch to the new + // root, so that the old root is still available in the hooks for any mount + // manipulations. + if err := syncParentHooks(pipe); err != nil { + return err + } if err := syscall.Chdir(config.Rootfs); err != nil { - return newSystemError(err) + return newSystemErrorWithCausef(err, "changing dir to %q", config.Rootfs) } if config.NoPivotRoot { err = msMoveRoot(config.Rootfs) @@ -67,16 +86,28 @@ func setupRootfs(config *configs.Config, console *linuxConsole) (err error) { err = pivotRoot(config.Rootfs, config.PivotDir) } if err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "jailing process inside rootfs") } if setupDev { if err := reOpenDevNull(); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "reopening /dev/null inside container") } } + // remount dev as ro if specifed + for _, m := range config.Mounts { + if libcontainerUtils.CleanPath(m.Destination) == "/dev" { + if m.Flags&syscall.MS_RDONLY != 0 { + if err := remountReadonly(m.Destination); err != nil { + return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination) + } + } + break + } + } + // set rootfs ( / ) as readonly if config.Readonlyfs { if err := setReadonly(); err != nil { - return newSystemError(err) + return newSystemErrorWithCause(err, "setting rootfs as readonly") } } syscall.Umask(0022) @@ -84,14 +115,12 @@ func setupRootfs(config *configs.Config, console *linuxConsole) (err error) { } func mountCmd(cmd configs.Command) error { - command := exec.Command(cmd.Path, cmd.Args[:]...) command.Env = cmd.Env command.Dir = cmd.Dir if out, err := command.CombinedOutput(); err != nil { return fmt.Errorf("%#v failed: %s: %v", cmd, string(out), err) } - return nil } @@ -119,8 +148,9 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error { if err := mountPropagate(m, rootfs, ""); err != nil { return err } + return label.SetFileLabel(dest, mountLabel) } - return label.SetFileLabel(dest, mountLabel) + return nil case "tmpfs": stat, err := os.Stat(dest) if err != nil { @@ -137,16 +167,6 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error { } } return nil - case "devpts": - if err := os.MkdirAll(dest, 0755); err != nil { - return err - } - return mountPropagate(m, rootfs, mountLabel) - case "securityfs": - if err := os.MkdirAll(dest, 0755); err != nil { - return err - } - return mountPropagate(m, rootfs, mountLabel) case "bind": stat, err := os.Stat(m.Source) if err != nil { @@ -218,41 +238,33 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error { return err } } - // create symlinks for merged cgroups - cwd, err := os.Getwd() - if err != nil { - return err - } - if err := os.Chdir(filepath.Join(rootfs, m.Destination)); err != nil { - return err - } for _, mc := range merged { for _, ss := range strings.Split(mc, ",") { - if err := os.Symlink(mc, ss); err != nil { - // if cgroup already exists, then okay(it could have been created before) - if os.IsExist(err) { - continue - } - os.Chdir(cwd) + // symlink(2) is very dumb, it will just shove the path into + // the link and doesn't do any checks or relative path + // conversion. Also, don't error out if the cgroup already exists. + if err := os.Symlink(mc, filepath.Join(rootfs, m.Destination, ss)); err != nil && !os.IsExist(err) { return err } } } - if err := os.Chdir(cwd); err != nil { - return err - } if m.Flags&syscall.MS_RDONLY != 0 { // remount cgroup root as readonly mcgrouproot := &configs.Mount{ + Source: m.Destination, + Device: "bind", Destination: m.Destination, - Flags: defaultMountFlags | syscall.MS_RDONLY, + Flags: defaultMountFlags | syscall.MS_RDONLY | syscall.MS_BIND, } if err := remount(mcgrouproot, rootfs); err != nil { return err } } default: - return fmt.Errorf("unknown mount device %q to %q", m.Device, m.Destination) + if err := os.MkdirAll(dest, 0755); err != nil { + return err + } + return mountPropagate(m, rootfs, mountLabel) } return nil } @@ -294,7 +306,7 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) { // checkMountDestination checks to ensure that the mount destination is not over the top of /proc. // dest is required to be an abs path and have any symlinks resolved before calling this function. func checkMountDestination(rootfs, dest string) error { - if filepath.Clean(rootfs) == filepath.Clean(dest) { + if libcontainerUtils.CleanPath(rootfs) == libcontainerUtils.CleanPath(dest) { return fmt.Errorf("mounting into / is prohibited") } invalidDestinations := []string{ @@ -307,7 +319,8 @@ func checkMountDestination(rootfs, dest string) error { "/proc/cpuinfo", "/proc/diskstats", "/proc/meminfo", - "/proc/stats", + "/proc/stat", + "/proc/net/dev", } for _, valid := range validDestinations { path, err := filepath.Rel(filepath.Join(rootfs, valid), dest) @@ -340,7 +353,7 @@ func setupDevSymlinks(rootfs string) error { // kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink // in /dev if it exists in /proc. if _, err := os.Stat("/proc/kcore"); err == nil { - links = append(links, [2]string{"/proc/kcore", "/dev/kcore"}) + links = append(links, [2]string{"/proc/kcore", "/dev/core"}) } for _, link := range links { var ( @@ -489,10 +502,10 @@ func getParentMount(rootfs string) (string, string, error) { } // Make parent mount private if it was shared -func rootfsParentMountPrivate(config *configs.Config) error { +func rootfsParentMountPrivate(rootfs string) error { sharedMount := false - parentMount, optionalOpts, err := getParentMount(config.Rootfs) + parentMount, optionalOpts, err := getParentMount(rootfs) if err != nil { return err } @@ -524,9 +537,10 @@ func prepareRoot(config *configs.Config) error { if err := syscall.Mount("", "/", "", uintptr(flag), ""); err != nil { return err } - - if err := rootfsParentMountPrivate(config); err != nil { - return err + if config.NoPivotRoot { + if err := rootfsParentMountPrivate(config.Rootfs); err != nil { + return err + } } return syscall.Mount(config.Rootfs, config.Rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, "") @@ -550,7 +564,7 @@ func setupPtmx(config *configs.Config, console *linuxConsole) error { return nil } -func pivotRoot(rootfs, pivotBaseDir string) error { +func pivotRoot(rootfs, pivotBaseDir string) (err error) { if pivotBaseDir == "" { pivotBaseDir = "/" } @@ -562,8 +576,21 @@ func pivotRoot(rootfs, pivotBaseDir string) error { if err != nil { return fmt.Errorf("can't create pivot_root dir %s, error %v", pivotDir, err) } + defer func() { + errVal := os.Remove(pivotDir) + if err == nil { + err = errVal + } + }() if err := syscall.PivotRoot(rootfs, pivotDir); err != nil { - return fmt.Errorf("pivot_root %s", err) + // Make the parent mount private + if err := rootfsParentMountPrivate(rootfs); err != nil { + return err + } + // Try again + if err := syscall.PivotRoot(rootfs, pivotDir); err != nil { + return fmt.Errorf("pivot_root %s", err) + } } if err := syscall.Chdir("/"); err != nil { return fmt.Errorf("chdir / %s", err) @@ -580,7 +607,7 @@ func pivotRoot(rootfs, pivotBaseDir string) error { if err := syscall.Unmount(pivotDir, syscall.MNT_DETACH); err != nil { return fmt.Errorf("unmount pivot_root dir %s", err) } - return os.Remove(pivotDir) + return nil } func msMoveRoot(rootfs string) error { @@ -669,14 +696,18 @@ func remount(m *configs.Mount, rootfs string) error { // of propagation flags. func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error { var ( - dest = m.Destination - data = label.FormatMountLabel(m.Data, mountLabel) + dest = m.Destination + data = label.FormatMountLabel(m.Data, mountLabel) + flags = m.Flags ) + if libcontainerUtils.CleanPath(dest) == "/dev" { + flags &= ^syscall.MS_RDONLY + } if !strings.HasPrefix(dest, rootfs) { dest = filepath.Join(rootfs, dest) } - if err := syscall.Mount(m.Source, dest, m.Device, uintptr(m.Flags), data); err != nil { + if err := syscall.Mount(m.Source, dest, m.Device, uintptr(flags), data); err != nil { return err } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go index 3b9a7595..ded5a6bb 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go @@ -36,6 +36,11 @@ var archs = map[string]string{ "SCMP_ARCH_MIPSEL": "mipsel", "SCMP_ARCH_MIPSEL64": "mipsel64", "SCMP_ARCH_MIPSEL64N32": "mipsel64n32", + "SCMP_ARCH_PPC": "ppc", + "SCMP_ARCH_PPC64": "ppc64", + "SCMP_ARCH_PPC64LE": "ppc64le", + "SCMP_ARCH_S390": "s390", + "SCMP_ARCH_S390X": "s390x", } // ConvertStringToOperator converts a string into a Seccomp comparison operator. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go index 623e2277..ec0ac1c0 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go @@ -5,7 +5,6 @@ package seccomp import ( "bufio" "fmt" - "log" "os" "strings" "syscall" @@ -167,7 +166,6 @@ func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error { // Ignore it, don't error out callNum, err := libseccomp.GetSyscallFromName(call.Name) if err != nil { - log.Printf("Error resolving syscall name %s: %s - ignoring syscall.", call.Name, err) return nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go index 888483e7..44df1ad4 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go @@ -10,7 +10,7 @@ import ( var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported") -// Seccomp not supported, do nothing +// InitSeccomp does nothing because seccomp is not supported. func InitSeccomp(config *configs.Seccomp) error { if config != nil { return ErrSeccompNotEnabled diff --git a/vendor/github.com/opencontainers/runc/libcontainer/selinux/selinux.go b/vendor/github.com/opencontainers/runc/libcontainer/selinux/selinux.go index 88d612ca..2a18e2ad 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/selinux/selinux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/selinux/selinux.go @@ -13,9 +13,9 @@ import ( "regexp" "strconv" "strings" + "sync" "syscall" - "github.com/docker/docker/pkg/mount" "github.com/opencontainers/runc/libcontainer/system" ) @@ -35,6 +35,7 @@ const ( var ( assignRegex = regexp.MustCompile(`^([^=]+)=(.*)$`) mcsList = make(map[string]bool) + mcsLock sync.Mutex selinuxfs = "unknown" selinuxEnabled = false // Stores whether selinux is currently enabled selinuxEnabledChecked = false // Stores whether selinux enablement has been checked or established yet @@ -58,16 +59,31 @@ func getSelinuxMountPoint() string { } selinuxfs = "" - mounts, err := mount.GetMounts() + f, err := os.Open("/proc/self/mountinfo") if err != nil { return selinuxfs } - for _, mount := range mounts { - if mount.Fstype == "selinuxfs" { - selinuxfs = mount.Mountpoint - break + defer f.Close() + + scanner := bufio.NewScanner(f) + for scanner.Scan() { + txt := scanner.Text() + // Safe as mountinfo encodes mountpoints with spaces as \040. + sepIdx := strings.Index(txt, " - ") + if sepIdx == -1 { + continue } + if !strings.Contains(txt[sepIdx:], "selinuxfs") { + continue + } + fields := strings.Split(txt, " ") + if len(fields) < 5 { + continue + } + selinuxfs = fields[4] + break } + if selinuxfs != "" { var buf syscall.Statfs_t syscall.Statfs(selinuxfs, &buf) @@ -158,12 +174,14 @@ func Setfilecon(path string, scon string) error { // Getfilecon returns the SELinux label for this path or returns an error. func Getfilecon(path string) (string, error) { con, err := system.Lgetxattr(path, xattrNameSelinux) - + if err != nil { + return "", err + } // Trim the NUL byte at the end of the byte buffer, if present. - if con[len(con)-1] == '\x00' { + if len(con) > 0 && con[len(con)-1] == '\x00' { con = con[:len(con)-1] } - return string(con), err + return string(con), nil } func Setfscreatecon(scon string) error { @@ -265,6 +283,8 @@ func SelinuxGetEnforceMode() int { } func mcsAdd(mcs string) error { + mcsLock.Lock() + defer mcsLock.Unlock() if mcsList[mcs] { return fmt.Errorf("MCS Label already exists") } @@ -273,7 +293,9 @@ func mcsAdd(mcs string) error { } func mcsDelete(mcs string) { + mcsLock.Lock() mcsList[mcs] = false + mcsLock.Unlock() } func IntToMcs(id int, catRange uint32) string { @@ -289,7 +311,7 @@ func IntToMcs(id int, catRange uint32) string { for ORD > TIER { ORD = ORD - TIER - TIER -= 1 + TIER-- } TIER = SETSIZE - TIER ORD = ORD + TIER @@ -430,7 +452,7 @@ func badPrefix(fpath string) error { return nil } -// Change the fpath file object to the SELinux label scon. +// Chcon changes the fpath file object to the SELinux label scon. // If the fpath is a directory and recurse is true Chcon will walk the // directory tree setting the label func Chcon(fpath string, scon string, recurse bool) error { @@ -464,14 +486,14 @@ func DupSecOpt(src string) []string { con["level"] == "" { return nil } - return []string{"label:user:" + con["user"], - "label:role:" + con["role"], - "label:type:" + con["type"], - "label:level:" + con["level"]} + return []string{"label=user:" + con["user"], + "label=role:" + con["role"], + "label=type:" + con["type"], + "label=level:" + con["level"]} } // DisableSecOpt returns a security opt that can be used to disabling SELinux // labeling support for future container processes func DisableSecOpt() []string { - return []string{"label:disable"} + return []string{"label=disable"} } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go index 2bde44ff..4b78ae80 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go @@ -3,9 +3,11 @@ package libcontainer import ( + "fmt" "os" "github.com/opencontainers/runc/libcontainer/apparmor" + "github.com/opencontainers/runc/libcontainer/keys" "github.com/opencontainers/runc/libcontainer/label" "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" @@ -17,12 +19,21 @@ type linuxSetnsInit struct { config *initConfig } +func (l *linuxSetnsInit) getSessionRingName() string { + return fmt.Sprintf("_ses.%s", l.config.ContainerId) +} + func (l *linuxSetnsInit) Init() error { - if err := setupRlimits(l.config.Config); err != nil { - return err + if !l.config.Config.NoNewKeyring { + // do not inherit the parent's session keyring + if _, err := keyctl.JoinSessionKeyring(l.getSessionRingName()); err != nil { + return err + } } - if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil { - return err + if l.config.NoNewPrivileges { + if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { + return err + } } if l.config.Config.Seccomp != nil { if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { @@ -32,13 +43,11 @@ func (l *linuxSetnsInit) Init() error { if err := finalizeNamespace(l.config); err != nil { return err } - if err := apparmor.ApplyProfile(l.config.Config.AppArmorProfile); err != nil { + if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { return err } - if l.config.Config.ProcessLabel != "" { - if err := label.SetProcessLabel(l.config.Config.ProcessLabel); err != nil { - return err - } + if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { + return err } return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/stacktrace/capture.go b/vendor/github.com/opencontainers/runc/libcontainer/stacktrace/capture.go index 5ee6e37a..0bbe1495 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/stacktrace/capture.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/stacktrace/capture.go @@ -2,14 +2,14 @@ package stacktrace import "runtime" -// Caputure captures a stacktrace for the current calling go program +// Capture captures a stacktrace for the current calling go program // // skip is the number of frames to skip func Capture(userSkip int) Stacktrace { var ( skip = userSkip + 1 // add one for our own function frames []Frame - prevPc uintptr = 0 + prevPc uintptr ) for i := skip; ; i++ { pc, file, line, ok := runtime.Caller(i) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go index d3b50863..4a497e2b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go @@ -3,28 +3,62 @@ package libcontainer import ( + "fmt" "io" "os" + "os/exec" "syscall" "github.com/opencontainers/runc/libcontainer/apparmor" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/keys" "github.com/opencontainers/runc/libcontainer/label" "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" ) type linuxStandardInit struct { - pipe io.ReadWriter - parentPid int - config *initConfig + pipe io.ReadWriteCloser + parentPid int + stateDirFD int + config *initConfig } -func (l *linuxStandardInit) Init() error { - // join any namespaces via a path to the namespace fd if provided - if err := joinExistingNamespaces(l.config.Config.Namespaces); err != nil { - return err +func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) { + var newperms uint32 + + if l.config.Config.Namespaces.Contains(configs.NEWUSER) { + // with user ns we need 'other' search permissions + newperms = 0x8 + } else { + // without user ns we need 'UID' search permissions + newperms = 0x80000 } + + // create a unique per session container name that we can + // join in setns; however, other containers can also join it + return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms +} + +// PR_SET_NO_NEW_PRIVS isn't exposed in Golang so we define it ourselves copying the value +// the kernel +const PR_SET_NO_NEW_PRIVS = 0x26 + +func (l *linuxStandardInit) Init() error { + if !l.config.Config.NoNewKeyring { + ringname, keepperms, newperms := l.getSessionRingParams() + + // do not inherit the parent's session keyring + sessKeyId, err := keyctl.JoinSessionKeyring(ringname) + if err != nil { + return err + } + // make session keyring searcheable + if err := keyctl.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil { + return err + } + } + var console *linuxConsole if l.config.Console != "" { console = newConsoleFromPath(l.config.Console) @@ -32,9 +66,6 @@ func (l *linuxStandardInit) Init() error { return err } } - if _, err := syscall.Setsid(); err != nil { - return err - } if console != nil { if err := system.Setctty(); err != nil { return err @@ -46,16 +77,11 @@ func (l *linuxStandardInit) Init() error { if err := setupRoute(l.config.Config); err != nil { return err } - if err := setupRlimits(l.config.Config); err != nil { - return err - } - if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil { - return err - } + label.Init() // InitializeMountNamespace() can be executed only for a new mount namespace if l.config.Config.Namespaces.Contains(configs.NEWNS) { - if err := setupRootfs(l.config.Config, console); err != nil { + if err := setupRootfs(l.config.Config, console, l.pipe); err != nil { return err } } @@ -64,10 +90,10 @@ func (l *linuxStandardInit) Init() error { return err } } - if err := apparmor.ApplyProfile(l.config.Config.AppArmorProfile); err != nil { + if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { return err } - if err := label.SetProcessLabel(l.config.Config.ProcessLabel); err != nil { + if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { return err } @@ -90,13 +116,21 @@ func (l *linuxStandardInit) Init() error { if err != nil { return err } + if l.config.NoNewPrivileges { + if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { + return err + } + } // Tell our parent that we're ready to Execv. This must be done before the // Seccomp rules have been applied, because we need to be able to read and // write to a socket. if err := syncParentReady(l.pipe); err != nil { return err } - if l.config.Config.Seccomp != nil { + // Without NoNewPrivileges seccomp is a privileged operation, so we need to + // do this before dropping capabilities; otherwise do it as late as possible + // just before execve so as few syscalls take place after it as possible. + if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges { if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { return err } @@ -110,10 +144,35 @@ func (l *linuxStandardInit) Init() error { return err } // compare the parent from the inital start of the init process and make sure that it did not change. - // if the parent changes that means it died and we were reparened to something else so we should + // if the parent changes that means it died and we were reparented to something else so we should // just kill ourself and not cause problems for someone else. if syscall.Getppid() != l.parentPid { return syscall.Kill(syscall.Getpid(), syscall.SIGKILL) } - return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) + // check for the arg before waiting to make sure it exists and it is returned + // as a create time error. + name, err := exec.LookPath(l.config.Args[0]) + if err != nil { + return err + } + // close the pipe to signal that we have completed our init. + l.pipe.Close() + // wait for the fifo to be opened on the other side before + // exec'ing the users process. + fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0) + if err != nil { + return newSystemErrorWithCause(err, "openat exec fifo") + } + if _, err := syscall.Write(fd, []byte("0")); err != nil { + return newSystemErrorWithCause(err, "write 0 exec fifo") + } + if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges { + if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { + return newSystemErrorWithCause(err, "init seccomp") + } + } + if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil { + return newSystemErrorWithCause(err, "exec user process") + } + return nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go index fb71ef97..26628240 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go @@ -6,9 +6,11 @@ import ( "fmt" "os" "path/filepath" + "syscall" "github.com/Sirupsen/logrus" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/utils" ) func newStateTransitionError(from, to containerState) error { @@ -56,9 +58,10 @@ func destroy(c *linuxContainer) error { func runPoststopHooks(c *linuxContainer) error { if c.config.Hooks != nil { s := configs.HookState{ - Version: c.config.Version, - ID: c.id, - Root: c.config.Rootfs, + Version: c.config.Version, + ID: c.id, + Root: c.config.Rootfs, + BundlePath: utils.SearchLabels(c.config.Labels, "bundle"), } for _, hook := range c.config.Hooks.Poststop { if err := hook.Run(s); err != nil { @@ -75,7 +78,7 @@ type stoppedState struct { } func (b *stoppedState) status() Status { - return Destroyed + return Stopped } func (b *stoppedState) transition(s containerState) error { @@ -108,11 +111,11 @@ func (r *runningState) status() Status { func (r *runningState) transition(s containerState) error { switch s.(type) { case *stoppedState: - running, err := r.c.isRunning() + t, err := r.c.runType() if err != nil { return err } - if running { + if t == Running { return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped) } r.c.state = s @@ -127,16 +130,40 @@ func (r *runningState) transition(s containerState) error { } func (r *runningState) destroy() error { - running, err := r.c.isRunning() + t, err := r.c.runType() if err != nil { return err } - if running { + if t == Running { return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped) } return destroy(r.c) } +type createdState struct { + c *linuxContainer +} + +func (i *createdState) status() Status { + return Created +} + +func (i *createdState) transition(s containerState) error { + switch s.(type) { + case *runningState, *pausedState, *stoppedState: + i.c.state = s + return nil + case *createdState: + return nil + } + return newStateTransitionError(i, s) +} + +func (i *createdState) destroy() error { + i.c.initProcess.signal(syscall.SIGKILL) + return destroy(i.c) +} + // pausedState represents a container that is currently pause. It cannot be destroyed in a // paused state and must transition back to running first. type pausedState struct { @@ -159,11 +186,11 @@ func (p *pausedState) transition(s containerState) error { } func (p *pausedState) destroy() error { - isRunning, err := p.c.isRunning() + t, err := p.c.runType() if err != nil { return err } - if !isRunning { + if t != Running && t != Created { if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil { return err } @@ -173,7 +200,7 @@ func (p *pausedState) destroy() error { } // restoredState is the same as the running state but also has accociated checkpoint -// information that maybe need destroyed when the container is stopped and destory is called. +// information that maybe need destroyed when the container is stopped and destroy is called. type restoredState struct { imageDir string c *linuxContainer @@ -202,22 +229,25 @@ func (r *restoredState) destroy() error { return destroy(r.c) } -// createdState is used whenever a container is restored, loaded, or setting additional +// loadedState is used whenever a container is restored, loaded, or setting additional // processes inside and it should not be destroyed when it is exiting. -type createdState struct { +type loadedState struct { c *linuxContainer s Status } -func (n *createdState) status() Status { +func (n *loadedState) status() Status { return n.s } -func (n *createdState) transition(s containerState) error { +func (n *loadedState) transition(s containerState) error { n.c.state = s return nil } -func (n *createdState) destroy() error { - return nil +func (n *loadedState) destroy() error { + if err := n.c.refreshState(); err != nil { + return err + } + return n.c.state.destroy() } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/stats_solaris.go b/vendor/github.com/opencontainers/runc/libcontainer/stats_solaris.go new file mode 100644 index 00000000..da78c1c2 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/stats_solaris.go @@ -0,0 +1,7 @@ +package libcontainer + +// Solaris - TODO + +type Stats struct { + Interfaces []*NetworkInterface +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go index 6c835e68..1afc52b4 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go @@ -11,6 +11,19 @@ import ( "unsafe" ) +// If arg2 is nonzero, set the "child subreaper" attribute of the +// calling process; if arg2 is zero, unset the attribute. When a +// process is marked as a child subreaper, all of the children +// that it creates, and their descendants, will be marked as +// having a subreaper. In effect, a subreaper fulfills the role +// of init(1) for its descendant processes. Upon termination of +// a process that is orphaned (i.e., its immediate parent has +// already terminated) and marked as having a subreaper, the +// nearest still living ancestor subreaper will receive a SIGCHLD +// signal and be able to wait(2) on the process to discover its +// termination status. +const PR_SET_CHILD_SUBREAPER = 36 + type ParentDeathSignal int func (p ParentDeathSignal) Restore() error { @@ -40,6 +53,14 @@ func Execv(cmd string, args []string, env []string) error { return syscall.Exec(name, args, env) } +func Prlimit(pid, resource int, limit syscall.Rlimit) error { + _, _, err := syscall.RawSyscall6(syscall.SYS_PRLIMIT64, uintptr(pid), uintptr(resource), uintptr(unsafe.Pointer(&limit)), uintptr(unsafe.Pointer(&limit)), 0, 0) + if err != 0 { + return err + } + return nil +} + func SetParentDeathSignal(sig uintptr) error { if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, sig, 0); err != 0 { return err @@ -79,17 +100,12 @@ func Setctty() error { return nil } -/* - * Detect whether we are currently running in a user namespace. - * Copied from github.com/lxc/lxd/shared/util.go - */ +// RunningInUserNS detects whether we are currently running in a user namespace. +// Copied from github.com/lxc/lxd/shared/util.go func RunningInUserNS() bool { file, err := os.Open("/proc/self/uid_map") if err != nil { - /* - * This kernel-provided file only exists if user namespaces are - * supported - */ + // This kernel-provided file only exists if user namespaces are supported return false } defer file.Close() @@ -112,3 +128,16 @@ func RunningInUserNS() bool { } return true } + +// SetSubreaper sets the value i as the subreaper setting for the calling process +func SetSubreaper(i int) error { + return Prctl(PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0) +} + +func Prctl(option int, arg2, arg3, arg4, arg5 uintptr) (err error) { + _, _, e1 := syscall.Syscall6(syscall.SYS_PRCTL, uintptr(option), arg2, arg3, arg4, arg5, 0) + if e1 != 0 { + err = e1 + } + return +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go new file mode 100644 index 00000000..e7cfd62b --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go @@ -0,0 +1,9 @@ +// +build !linux + +package system + +// RunningInUserNS is a stub for non-Linux systems +// Always returns false +func RunningInUserNS() bool { + return false +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup.go b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup.go index 6f8a982f..ab1439f3 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup.go @@ -2,13 +2,15 @@ package user import ( "errors" - "fmt" "syscall" ) var ( // The current operating system does not provide the required data for user lookups. ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data") + // No matching entries found in file. + ErrNoPasswdEntries = errors.New("no matching entries in passwd file") + ErrNoGroupEntries = errors.New("no matching entries in group file") ) func lookupUser(filter func(u User) bool) (User, error) { @@ -27,7 +29,7 @@ func lookupUser(filter func(u User) bool) (User, error) { // No user entries found. if len(users) == 0 { - return User{}, fmt.Errorf("no matching entries in passwd file") + return User{}, ErrNoPasswdEntries } // Assume the first entry is the "correct" one. @@ -75,7 +77,7 @@ func lookupGroup(filter func(g Group) bool) (Group, error) { // No user entries found. if len(groups) == 0 { - return Group{}, fmt.Errorf("no matching entries in group file") + return Group{}, ErrNoGroupEntries } // Assume the first entry is the "correct" one. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go index e6375ea4..43fd39ef 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go @@ -15,7 +15,7 @@ const ( ) var ( - ErrRange = fmt.Errorf("Uids and gids must be in range %d-%d", minId, maxId) + ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minId, maxId) ) type User struct { @@ -42,29 +42,30 @@ func parseLine(line string, v ...interface{}) { parts := strings.Split(line, ":") for i, p := range parts { + // Ignore cases where we don't have enough fields to populate the arguments. + // Some configuration files like to misbehave. if len(v) <= i { - // if we have more "parts" than we have places to put them, bail for great "tolerance" of naughty configuration files break } + // Use the type of the argument to figure out how to parse it, scanf() style. + // This is legit. switch e := v[i].(type) { case *string: - // "root", "adm", "/bin/bash" *e = p case *int: - // "0", "4", "1000" - // ignore string to int conversion errors, for great "tolerance" of naughty configuration files + // "numbers", with conversion errors ignored because of some misbehaving configuration files. *e, _ = strconv.Atoi(p) case *[]string: - // "", "root", "root,adm,daemon" + // Comma-separated lists. if p != "" { *e = strings.Split(p, ",") } else { *e = []string{} } default: - // panic, because this is a programming/logic error, not a runtime one - panic("parseLine expects only pointers! argument " + strconv.Itoa(i) + " is not a pointer!") + // Someone goof'd when writing code using this function. Scream so they can hear us. + panic(fmt.Sprintf("parseLine only accepts {*string, *int, *[]string} as arguments! %#v is not a pointer!", e)) } } } @@ -106,8 +107,8 @@ func ParsePasswdFilter(r io.Reader, filter func(User) bool) ([]User, error) { return nil, err } - text := strings.TrimSpace(s.Text()) - if text == "" { + line := strings.TrimSpace(s.Text()) + if line == "" { continue } @@ -117,10 +118,7 @@ func ParsePasswdFilter(r io.Reader, filter func(User) bool) ([]User, error) { // root:x:0:0:root:/root:/bin/bash // adm:x:3:4:adm:/var/adm:/bin/false p := User{} - parseLine( - text, - &p.Name, &p.Pass, &p.Uid, &p.Gid, &p.Gecos, &p.Home, &p.Shell, - ) + parseLine(line, &p.Name, &p.Pass, &p.Uid, &p.Gid, &p.Gecos, &p.Home, &p.Shell) if filter == nil || filter(p) { out = append(out, p) @@ -135,6 +133,7 @@ func ParseGroupFile(path string) ([]Group, error) { if err != nil { return nil, err } + defer group.Close() return ParseGroup(group) } @@ -178,10 +177,7 @@ func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) { // root:x:0:root // adm:x:4:root,adm,daemon p := Group{} - parseLine( - text, - &p.Name, &p.Pass, &p.Gid, &p.List, - ) + parseLine(text, &p.Name, &p.Pass, &p.Gid, &p.List) if filter == nil || filter(p) { out = append(out, p) @@ -192,9 +188,10 @@ func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) { } type ExecUser struct { - Uid, Gid int - Sgids []int - Home string + Uid int + Gid int + Sgids []int + Home string } // GetExecUserPath is a wrapper for GetExecUser. It reads data from each of the @@ -235,12 +232,12 @@ func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath // * "uid:gid // * "user:gid" // * "uid:group" +// +// It should be noted that if you specify a numeric user or group id, they will +// not be evaluated as usernames (only the metadata will be filled). So attempting +// to parse a user with user.Name = "1337" will produce the user with a UID of +// 1337. func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (*ExecUser, error) { - var ( - userArg, groupArg string - name string - ) - if defaults == nil { defaults = new(ExecUser) } @@ -258,87 +255,113 @@ func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) ( user.Sgids = []int{} } - // allow for userArg to have either "user" syntax, or optionally "user:group" syntax + // Allow for userArg to have either "user" syntax, or optionally "user:group" syntax + var userArg, groupArg string parseLine(userSpec, &userArg, &groupArg) + // Convert userArg and groupArg to be numeric, so we don't have to execute + // Atoi *twice* for each iteration over lines. + uidArg, uidErr := strconv.Atoi(userArg) + gidArg, gidErr := strconv.Atoi(groupArg) + + // Find the matching user. users, err := ParsePasswdFilter(passwd, func(u User) bool { if userArg == "" { + // Default to current state of the user. return u.Uid == user.Uid } - return u.Name == userArg || strconv.Itoa(u.Uid) == userArg + + if uidErr == nil { + // If the userArg is numeric, always treat it as a UID. + return uidArg == u.Uid + } + + return u.Name == userArg }) + + // If we can't find the user, we have to bail. if err != nil && passwd != nil { if userArg == "" { userArg = strconv.Itoa(user.Uid) } - return nil, fmt.Errorf("Unable to find user %v: %v", userArg, err) + return nil, fmt.Errorf("unable to find user %s: %v", userArg, err) } - haveUser := users != nil && len(users) > 0 - if haveUser { - // if we found any user entries that matched our filter, let's take the first one as "correct" - name = users[0].Name + var matchedUserName string + if len(users) > 0 { + // First match wins, even if there's more than one matching entry. + matchedUserName = users[0].Name user.Uid = users[0].Uid user.Gid = users[0].Gid user.Home = users[0].Home } else if userArg != "" { - // we asked for a user but didn't find them... let's check to see if we wanted a numeric user - user.Uid, err = strconv.Atoi(userArg) - if err != nil { - // not numeric - we have to bail - return nil, fmt.Errorf("Unable to find user %v", userArg) + // If we can't find a user with the given username, the only other valid + // option is if it's a numeric username with no associated entry in passwd. + + if uidErr != nil { + // Not numeric. + return nil, fmt.Errorf("unable to find user %s: %v", userArg, ErrNoPasswdEntries) } + user.Uid = uidArg // Must be inside valid uid range. if user.Uid < minId || user.Uid > maxId { return nil, ErrRange } - // if userArg couldn't be found in /etc/passwd but is numeric, just roll with it - this is legit + // Okay, so it's numeric. We can just roll with this. } - if groupArg != "" || name != "" { + // On to the groups. If we matched a username, we need to do this because of + // the supplementary group IDs. + if groupArg != "" || matchedUserName != "" { groups, err := ParseGroupFilter(group, func(g Group) bool { - // Explicit group format takes precedence. - if groupArg != "" { - return g.Name == groupArg || strconv.Itoa(g.Gid) == groupArg - } - - // Check if user is a member. - for _, u := range g.List { - if u == name { - return true + // If the group argument isn't explicit, we'll just search for it. + if groupArg == "" { + // Check if user is a member of this group. + for _, u := range g.List { + if u == matchedUserName { + return true + } } + return false } - return false + if gidErr == nil { + // If the groupArg is numeric, always treat it as a GID. + return gidArg == g.Gid + } + + return g.Name == groupArg }) if err != nil && group != nil { - return nil, fmt.Errorf("Unable to find groups for user %v: %v", users[0].Name, err) + return nil, fmt.Errorf("unable to find groups for spec %v: %v", matchedUserName, err) } - haveGroup := groups != nil && len(groups) > 0 + // Only start modifying user.Gid if it is in explicit form. if groupArg != "" { - if haveGroup { - // if we found any group entries that matched our filter, let's take the first one as "correct" + if len(groups) > 0 { + // First match wins, even if there's more than one matching entry. user.Gid = groups[0].Gid - } else { - // we asked for a group but didn't find id... let's check to see if we wanted a numeric group - user.Gid, err = strconv.Atoi(groupArg) - if err != nil { - // not numeric - we have to bail - return nil, fmt.Errorf("Unable to find group %v", groupArg) - } + } else if groupArg != "" { + // If we can't find a group with the given name, the only other valid + // option is if it's a numeric group name with no associated entry in group. - // Ensure gid is inside gid range. + if gidErr != nil { + // Not numeric. + return nil, fmt.Errorf("unable to find group %s: %v", groupArg, ErrNoGroupEntries) + } + user.Gid = gidArg + + // Must be inside valid gid range. if user.Gid < minId || user.Gid > maxId { return nil, ErrRange } - // if groupArg couldn't be found in /etc/group but is numeric, just roll with it - this is legit + // Okay, so it's numeric. We can just roll with this. } - } else if haveGroup { - // If implicit group format, fill supplementary gids. + } else if len(groups) > 0 { + // Supplementary group ids only make sense if in the implicit form. user.Sgids = make([]int, len(groups)) for i, group := range groups { user.Sgids[i] = group.Gid diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go index 1378006b..3466bfce 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go @@ -5,7 +5,9 @@ import ( "encoding/hex" "encoding/json" "io" + "os" "path/filepath" + "strings" "syscall" ) @@ -54,3 +56,66 @@ func WriteJSON(w io.Writer, v interface{}) error { _, err = w.Write(data) return err } + +// CleanPath makes a path safe for use with filepath.Join. This is done by not +// only cleaning the path, but also (if the path is relative) adding a leading +// '/' and cleaning it (then removing the leading '/'). This ensures that a +// path resulting from prepending another path will always resolve to lexically +// be a subdirectory of the prefixed path. This is all done lexically, so paths +// that include symlinks won't be safe as a result of using CleanPath. +func CleanPath(path string) string { + // Deal with empty strings nicely. + if path == "" { + return "" + } + + // Ensure that all paths are cleaned (especially problematic ones like + // "/../../../../../" which can cause lots of issues). + path = filepath.Clean(path) + + // If the path isn't absolute, we need to do more processing to fix paths + // such as "../../../..//some/path". We also shouldn't convert absolute + // paths to relative ones. + if !filepath.IsAbs(path) { + path = filepath.Clean(string(os.PathSeparator) + path) + // This can't fail, as (by definition) all paths are relative to root. + path, _ = filepath.Rel(string(os.PathSeparator), path) + } + + // Clean the path again for good measure. + return filepath.Clean(path) +} + +// SearchLabels searches a list of key-value pairs for the provided key and +// returns the corresponding value. The pairs must be separated with '='. +func SearchLabels(labels []string, query string) string { + for _, l := range labels { + parts := strings.SplitN(l, "=", 2) + if len(parts) < 2 { + continue + } + if parts[0] == query { + return parts[1] + } + } + return "" +} + +// Annotations returns the bundle path and user defined annotations from the +// libcontianer state. We need to remove the bundle because that is a label +// added by libcontainer. +func Annotations(labels []string) (bundle string, userAnnotations map[string]string) { + userAnnotations = make(map[string]string) + for _, l := range labels { + parts := strings.SplitN(l, "=", 2) + if len(parts) < 2 { + continue + } + if parts[0] == "bundle" { + bundle = parts[1] + } else { + userAnnotations[parts[0]] = parts[1] + } + } + return +} diff --git a/vendor/google.golang.org/grpc/stream.go b/vendor/google.golang.org/grpc/stream.go index 775b9ee0..4974d8a8 100644 --- a/vendor/google.golang.org/grpc/stream.go +++ b/vendor/google.golang.org/grpc/stream.go @@ -109,7 +109,7 @@ func NewClientStream(ctx context.Context, desc *StreamDesc, cc *ClientConn, meth callHdr := &transport.CallHdr{ Host: cc.authority, Method: method, - Flush: desc.ServerStreams && desc.ClientStreams, + Flush: desc.ServerStreams&&desc.ClientStreams, } if cc.dopts.cp != nil { callHdr.SendCompress = cc.dopts.cp.Type()