Merge pull request #1387 from vishh/runc-bump

Update libcontainer deps
This commit is contained in:
Tim St. Clair 2016-07-19 12:45:32 -07:00 committed by GitHub
commit 734db4c437
64 changed files with 2246 additions and 965 deletions

65
Godeps/Godeps.json generated
View File

@ -327,78 +327,83 @@
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer", "ImportPath": "github.com/opencontainers/runc/libcontainer",
"Comment": "v0.0.7", "Comment": "v1.0.0-rc1-71-g4dedd09",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/apparmor", "ImportPath": "github.com/opencontainers/runc/libcontainer/apparmor",
"Comment": "v0.0.7", "Comment": "v1.0.0-rc1-71-g4dedd09",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups", "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups",
"Comment": "v0.0.7", "Comment": "v1.0.0-rc1-71-g4dedd09",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/fs", "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/fs",
"Comment": "v0.0.7", "Comment": "v1.0.0-rc1-71-g4dedd09",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/systemd", "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/systemd",
"Comment": "v0.0.7", "Comment": "v1.0.0-rc1-71-g4dedd09",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/configs", "ImportPath": "github.com/opencontainers/runc/libcontainer/configs",
"Comment": "v0.0.7", "Comment": "v1.0.0-rc1-71-g4dedd09",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/configs/validate", "ImportPath": "github.com/opencontainers/runc/libcontainer/configs/validate",
"Comment": "v0.0.7", "Comment": "v1.0.0-rc1-71-g4dedd09",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/criurpc", "ImportPath": "github.com/opencontainers/runc/libcontainer/criurpc",
"Comment": "v0.0.7", "Comment": "v1.0.0-rc1-71-g4dedd09",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/keys",
"Comment": "v1.0.0-rc1-71-g4dedd09",
"Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/label", "ImportPath": "github.com/opencontainers/runc/libcontainer/label",
"Comment": "v0.0.7", "Comment": "v1.0.0-rc1-71-g4dedd09",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/seccomp", "ImportPath": "github.com/opencontainers/runc/libcontainer/seccomp",
"Comment": "v0.0.7", "Comment": "v1.0.0-rc1-71-g4dedd09",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/selinux", "ImportPath": "github.com/opencontainers/runc/libcontainer/selinux",
"Comment": "v0.0.7", "Comment": "v1.0.0-rc1-71-g4dedd09",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/stacktrace", "ImportPath": "github.com/opencontainers/runc/libcontainer/stacktrace",
"Comment": "v0.0.7", "Comment": "v1.0.0-rc1-71-g4dedd09",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/system", "ImportPath": "github.com/opencontainers/runc/libcontainer/system",
"Comment": "v0.0.7", "Comment": "v1.0.0-rc1-71-g4dedd09",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/user", "ImportPath": "github.com/opencontainers/runc/libcontainer/user",
"Comment": "v0.0.7", "Comment": "v1.0.0-rc1-71-g4dedd09",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/utils", "ImportPath": "github.com/opencontainers/runc/libcontainer/utils",
"Comment": "v0.0.7", "Comment": "v1.0.0-rc1-71-g4dedd09",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" "Rev": "4dedd0939638fc27a609de1cb37e0666b3cf2079"
}, },
{ {
"ImportPath": "github.com/pborman/uuid", "ImportPath": "github.com/pborman/uuid",

View File

@ -76,7 +76,7 @@ config := &configs.Config{
Name: "test-container", Name: "test-container",
Parent: "system", Parent: "system",
Resources: &configs.Resources{ Resources: &configs.Resources{
MemorySwappiness: -1, MemorySwappiness: nil,
AllowAllDevices: false, AllowAllDevices: false,
AllowedDevices: configs.DefaultAllowedDevices, AllowedDevices: configs.DefaultAllowedDevices,
}, },
@ -133,15 +133,15 @@ config := &configs.Config{
UidMappings: []configs.IDMap{ UidMappings: []configs.IDMap{
{ {
ContainerID: 0, ContainerID: 0,
Host: 1000, HostID: 1000,
size: 65536, Size: 65536,
}, },
}, },
GidMappings: []configs.IDMap{ GidMappings: []configs.IDMap{
{ {
ContainerID: 0, ContainerID: 0,
Host: 1000, HostID: 1000,
size: 65536, Size: 65536,
}, },
}, },
Networks: []*configs.Network{ Networks: []*configs.Network{
@ -186,8 +186,8 @@ process := &libcontainer.Process{
err := container.Start(process) err := container.Start(process)
if err != nil { if err != nil {
logrus.Fatal(err)
container.Destroy() container.Destroy()
logrus.Fatal(err)
return return
} }
@ -216,6 +216,12 @@ container.Pause()
// resume all paused processes. // resume all paused processes.
container.Resume() container.Resume()
// send signal to container's init process.
container.Signal(signal)
// update container resource constraints.
container.Set(config)
``` ```

View File

@ -90,7 +90,7 @@ in tmpfs.
After `/dev/null` has been setup we check for any external links between After `/dev/null` has been setup we check for any external links between
the container's io, STDIN, STDOUT, STDERR. If the container's io is pointing the container's io, STDIN, STDOUT, STDERR. If the container's io is pointing
to `/dev/null` outside the container we close and `dup2` the the `/dev/null` to `/dev/null` outside the container we close and `dup2` the `/dev/null`
that is local to the container's rootfs. that is local to the container's rootfs.
@ -142,6 +142,7 @@ system resources like cpu, memory, and device access.
| perf_event | 1 | | perf_event | 1 |
| freezer | 1 | | freezer | 1 |
| hugetlb | 1 | | hugetlb | 1 |
| pids | 1 |
All cgroup subsystems are joined so that statistics can be collected from All cgroup subsystems are joined so that statistics can be collected from
@ -199,7 +200,7 @@ provide a good default for security and flexibility for the applications.
| CAP_SYS_BOOT | 0 | | CAP_SYS_BOOT | 0 |
| CAP_LEASE | 0 | | CAP_LEASE | 0 |
| CAP_WAKE_ALARM | 0 | | CAP_WAKE_ALARM | 0 |
| CAP_BLOCK_SUSPE | 0 | | CAP_BLOCK_SUSPEND | 0 |
Additional security layers like [apparmor](https://wiki.ubuntu.com/AppArmor) Additional security layers like [apparmor](https://wiki.ubuntu.com/AppArmor)
@ -296,7 +297,7 @@ a container.
| -------------- | ------------------------------------------------------------------ | | -------------- | ------------------------------------------------------------------ |
| Get processes | Return all the pids for processes running inside a container | | Get processes | Return all the pids for processes running inside a container |
| Get Stats | Return resource statistics for the container as a whole | | Get Stats | Return resource statistics for the container as a whole |
| Wait | Wait waits on the container's init process ( pid 1 ) | | Wait | Waits on the container's init process ( pid 1 ) |
| Wait Process | Wait on any of the container's processes returning the exit status | | Wait Process | Wait on any of the container's processes returning the exit status |
| Destroy | Kill the container's init process and remove any filesystem state | | Destroy | Kill the container's init process and remove any filesystem state |
| Signal | Send a signal to the container's init process | | Signal | Send a signal to the container's init process |

View File

@ -7,6 +7,7 @@ package apparmor
// #include <stdlib.h> // #include <stdlib.h>
import "C" import "C"
import ( import (
"fmt"
"io/ioutil" "io/ioutil"
"os" "os"
"unsafe" "unsafe"
@ -32,7 +33,7 @@ func ApplyProfile(name string) error {
cName := C.CString(name) cName := C.CString(name)
defer C.free(unsafe.Pointer(cName)) defer C.free(unsafe.Pointer(cName))
if _, err := C.aa_change_onexec(cName); err != nil { if _, err := C.aa_change_onexec(cName); err != nil {
return err return fmt.Errorf("apparmor failed to apply profile: %s", err)
} }
return nil return nil
} }

View File

@ -9,7 +9,7 @@ import (
) )
type Manager interface { type Manager interface {
// Apply cgroup configuration to the process with the specified pid // Applies cgroup configuration to the process with the specified pid
Apply(pid int) error Apply(pid int) error
// Returns the PIDs inside the cgroup set // Returns the PIDs inside the cgroup set

View File

@ -14,6 +14,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
) )
var ( var (
@ -30,6 +31,7 @@ var (
&NetPrioGroup{}, &NetPrioGroup{},
&PerfEventGroup{}, &PerfEventGroup{},
&FreezerGroup{}, &FreezerGroup{},
&NameGroup{GroupName: "name=systemd", Join: true},
} }
CgroupProcesses = "cgroup.procs" CgroupProcesses = "cgroup.procs"
HugePageSizes, _ = cgroups.GetHugePageSize() HugePageSizes, _ = cgroups.GetHugePageSize()
@ -95,8 +97,7 @@ func getCgroupRoot() (string, error) {
type cgroupData struct { type cgroupData struct {
root string root string
parent string innerPath string
name string
config *configs.Cgroup config *configs.Cgroup
pid int pid int
} }
@ -129,12 +130,9 @@ func (m *Manager) Apply(pid int) (err error) {
return cgroups.EnterPid(m.Paths, pid) return cgroups.EnterPid(m.Paths, pid)
} }
m.mu.Lock()
defer m.mu.Unlock()
paths := make(map[string]string) paths := make(map[string]string)
defer func() {
if err != nil {
cgroups.RemovePaths(paths)
}
}()
for _, sys := range subsystems { for _, sys := range subsystems {
if err := sys.Apply(d); err != nil { if err := sys.Apply(d); err != nil {
return err return err
@ -144,7 +142,9 @@ func (m *Manager) Apply(pid int) (err error) {
// created then join consists of writing the process pids to cgroup.procs // created then join consists of writing the process pids to cgroup.procs
p, err := d.path(sys.Name()) p, err := d.path(sys.Name())
if err != nil { if err != nil {
if cgroups.IsNotFound(err) { // The non-presence of the devices subsystem is
// considered fatal for security reasons.
if cgroups.IsNotFound(err) && sys.Name() != "devices" {
continue continue
} }
return err return err
@ -267,43 +267,29 @@ func getCgroupPath(c *configs.Cgroup) (string, error) {
return d.path("devices") return d.path("devices")
} }
// pathClean lexically sanitises path so that it can be handed to
// filepath.Join without escaping whatever prefix it is joined onto.
//
// Absolute paths are cleaned and stay absolute. Relative paths are first
// anchored at the filesystem root and cleaned there (which discards any
// leading ".." components) and are then made relative again, so the result
// always names a location lexically below the prefix it is appended to.
//
// Only the textual form of the path is considered: symlinks are never
// resolved, so a path that traverses a symlink is not made safe by this
// function.
func pathClean(path string) string {
	// Normalise first; inputs such as "/../../../../../" are otherwise
	// a frequent source of trouble.
	cleaned := filepath.Clean(path)

	// Absolute paths must not be turned into relative ones.
	if filepath.IsAbs(cleaned) {
		return filepath.Clean(cleaned)
	}

	// Relative paths such as "../../../../<etc>/some/path": pin them to the
	// root, clean, then strip the root off again.
	root := string(os.PathSeparator)
	anchored := filepath.Clean(root + cleaned)
	// The error is ignored on purpose: every anchored path is, by
	// construction, relative to root.
	relative, _ := filepath.Rel(root, anchored)

	// One last clean for good measure.
	return filepath.Clean(relative)
}
func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) { func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
root, err := getCgroupRoot() root, err := getCgroupRoot()
if err != nil { if err != nil {
return nil, err return nil, err
} }
// Clean the parent slice path. if (c.Name != "" || c.Parent != "") && c.Path != "" {
c.Parent = pathClean(c.Parent) return nil, fmt.Errorf("cgroup: either Path or Name and Parent should be used")
}
// XXX: Do not remove this code. Path safety is important! -- cyphar
cgPath := libcontainerUtils.CleanPath(c.Path)
cgParent := libcontainerUtils.CleanPath(c.Parent)
cgName := libcontainerUtils.CleanPath(c.Name)
innerPath := cgPath
if innerPath == "" {
innerPath = filepath.Join(cgParent, cgName)
}
return &cgroupData{ return &cgroupData{
root: root, root: root,
parent: c.Parent, innerPath: innerPath,
name: c.Name,
config: c, config: c,
pid: pid, pid: pid,
}, nil }, nil
@ -333,11 +319,10 @@ func (raw *cgroupData) path(subsystem string) (string, error) {
return "", err return "", err
} }
cgPath := filepath.Join(raw.parent, raw.name)
// If the cgroup name/path is absolute do not look relative to the cgroup of the init process. // If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
if filepath.IsAbs(cgPath) { if filepath.IsAbs(raw.innerPath) {
// Sometimes subsystems can be mounted together as 'cpu,cpuacct'. // Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
return filepath.Join(raw.root, filepath.Base(mnt), cgPath), nil return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil
} }
parentPath, err := raw.parentPath(subsystem, mnt, root) parentPath, err := raw.parentPath(subsystem, mnt, root)
@ -345,7 +330,7 @@ func (raw *cgroupData) path(subsystem string) (string, error) {
return "", err return "", err
} }
return filepath.Join(parentPath, cgPath), nil return filepath.Join(parentPath, raw.innerPath), nil
} }
func (raw *cgroupData) join(subsystem string) (string, error) { func (raw *cgroupData) join(subsystem string) (string, error) {
@ -366,9 +351,12 @@ func writeFile(dir, file, data string) error {
// Normally dir should not be empty, one case is that cgroup subsystem // Normally dir should not be empty, one case is that cgroup subsystem
// is not mounted, we will get empty dir, and we want it to fail here. // is not mounted, we will get empty dir, and we want it to fail here.
if dir == "" { if dir == "" {
return fmt.Errorf("no such directory for %s.", file) return fmt.Errorf("no such directory for %s", file)
} }
return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) if err := ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700); err != nil {
return fmt.Errorf("failed to write %v to %v: %v", data, file, err)
}
return nil
} }
func readFile(dir, file string) (string, error) { func readFile(dir, file string) (string, error) {

View File

@ -12,6 +12,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
) )
type CpusetGroup struct { type CpusetGroup struct {
@ -88,7 +89,7 @@ func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []b
// its parent. // its parent.
func (s *CpusetGroup) ensureParent(current, root string) error { func (s *CpusetGroup) ensureParent(current, root string) error {
parent := filepath.Dir(current) parent := filepath.Dir(current)
if filepath.Clean(parent) == root { if libcontainerUtils.CleanPath(parent) == root {
return nil return nil
} }
// Avoid infinite recursion. // Avoid infinite recursion.

View File

@ -5,6 +5,7 @@ package fs
import ( import (
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
) )
type DevicesGroup struct { type DevicesGroup struct {
@ -25,6 +26,23 @@ func (s *DevicesGroup) Apply(d *cgroupData) error {
} }
func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error { func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
if system.RunningInUserNS() {
return nil
}
devices := cgroup.Resources.Devices
if len(devices) > 0 {
for _, dev := range devices {
file := "devices.deny"
if dev.Allow {
file = "devices.allow"
}
if err := writeFile(path, file, dev.CgroupString()); err != nil {
return err
}
}
return nil
}
if !cgroup.Resources.AllowAllDevices { if !cgroup.Resources.AllowAllDevices {
if err := writeFile(path, "devices.deny", "a"); err != nil { if err := writeFile(path, "devices.deny", "a"); err != nil {
return err return err

View File

@ -9,6 +9,7 @@ import (
"path/filepath" "path/filepath"
"strconv" "strconv"
"strings" "strings"
"time"
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
@ -26,38 +27,75 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
if memoryAssigned(d.config) { // reset error.
if path != "" { err = nil
if err := os.MkdirAll(path, 0755); err != nil { if path == "" {
return err // Invalid input.
return fmt.Errorf("invalid path for memory cgroups: %+v", d)
} }
}
// We have to set kernel memory here, as we can't change it once
// processes have been attached.
if err := s.SetKernelMemory(path, d.config); err != nil {
return err
}
}
defer func() { defer func() {
if err != nil { if err != nil {
os.RemoveAll(path) os.RemoveAll(path)
} }
}() }()
if !cgroups.PathExists(path) {
if err = os.MkdirAll(path, 0755); err != nil {
return err
}
}
if memoryAssigned(d.config) {
// We have to set kernel memory here, as we can't change it once
// processes have been attached to the cgroup.
if err = s.SetKernelMemory(path, d.config); err != nil {
return err
}
}
// We need to join memory cgroup after set memory limits, because // We need to join memory cgroup after set memory limits, because
// kmem.limit_in_bytes can only be set when the cgroup is empty. // kmem.limit_in_bytes can only be set when the cgroup is empty.
_, err = d.join("memory") if _, jerr := d.join("memory"); jerr != nil && !cgroups.IsNotFound(jerr) {
if err != nil && !cgroups.IsNotFound(err) { err = jerr
return err return err
} }
return nil return nil
} }
// getModifyTime returns the modification time that os.Stat reports for the
// file at path.
//
// If the file cannot be stat'ed, the zero time.Time is returned together
// with an error that embeds the underlying failure.
func getModifyTime(path string) (time.Time, error) {
	info, statErr := os.Stat(path)
	if statErr == nil {
		return info.ModTime(), nil
	}
	return time.Time{}, fmt.Errorf("failed to get memory cgroups creation time: %v", statErr)
}
func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error { func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error {
// This has to be done separately because it has special constraints (it // This has to be done separately because it has special
// can't be done after there are processes attached to the cgroup). // constraints (it can only be initialized before setting up a
if cgroup.Resources.KernelMemory > 0 { // hierarchy or adding a task to the cgroups. However, if
// successfully initialized, it can be updated anytime afterwards)
if cgroup.Resources.KernelMemory != 0 {
// Is kmem.limit_in_bytes already set?
// memory.kmem.max_usage_in_bytes is a read-only file. Use it to get cgroups creation time.
kmemCreationTime, err := getModifyTime(filepath.Join(path, "memory.kmem.max_usage_in_bytes"))
if err != nil {
return err
}
kmemLimitsUpdateTime, err := getModifyTime(filepath.Join(path, "memory.kmem.limit_in_bytes"))
if err != nil {
return err
}
// kmem.limit_in_bytes has already been set if its update time is after that of creation time.
// We use `!=` op instead of `>` because updates are losing precision compared to creation.
kmemInitialized := !kmemLimitsUpdateTime.Equal(kmemCreationTime)
if !kmemInitialized {
// If there's already tasks in the cgroup, we can't change the limit either
tasks, err := getCgroupParamString(path, "tasks")
if err != nil {
return err
}
if tasks != "" {
return fmt.Errorf("cannot set kmem.limit_in_bytes after task have joined this cgroup")
}
}
if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil { if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil {
return err return err
} }
@ -65,14 +103,36 @@ func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error
return nil return nil
} }
func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error { func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.Memory != 0 { // When memory and swap memory are both set, we need to handle the cases
// for updating container.
if cgroup.Resources.Memory != 0 && cgroup.Resources.MemorySwap > 0 {
memoryUsage, err := getMemoryData(path, "")
if err != nil {
return err
}
// When update memory limit, we should adapt the write sequence
// for memory and swap memory, so it won't fail because the new
// value and the old value don't fit kernel's validation.
if memoryUsage.Limit < uint64(cgroup.Resources.MemorySwap) {
if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
return err
}
if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err return err
} }
} else {
if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err
} }
if cgroup.Resources.MemoryReservation != 0 { if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil { return err
}
}
} else {
if cgroup.Resources.Memory != 0 {
if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err return err
} }
} }
@ -81,19 +141,43 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
return err return err
} }
} }
}
return nil
}
func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
if err := setMemoryAndSwap(path, cgroup); err != nil {
return err
}
if err := s.SetKernelMemory(path, cgroup); err != nil {
return err
}
if cgroup.Resources.MemoryReservation != 0 {
if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil {
return err
}
}
if cgroup.Resources.KernelMemoryTCP != 0 {
if err := writeFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemoryTCP, 10)); err != nil {
return err
}
}
if cgroup.Resources.OomKillDisable { if cgroup.Resources.OomKillDisable {
if err := writeFile(path, "memory.oom_control", "1"); err != nil { if err := writeFile(path, "memory.oom_control", "1"); err != nil {
return err return err
} }
} }
if cgroup.Resources.MemorySwappiness >= 0 && cgroup.Resources.MemorySwappiness <= 100 { if cgroup.Resources.MemorySwappiness == nil || int64(*cgroup.Resources.MemorySwappiness) == -1 {
if err := writeFile(path, "memory.swappiness", strconv.FormatInt(cgroup.Resources.MemorySwappiness, 10)); err != nil { return nil
} else if int64(*cgroup.Resources.MemorySwappiness) >= 0 && int64(*cgroup.Resources.MemorySwappiness) <= 100 {
if err := writeFile(path, "memory.swappiness", strconv.FormatInt(*cgroup.Resources.MemorySwappiness, 10)); err != nil {
return err return err
} }
} else if cgroup.Resources.MemorySwappiness == -1 {
return nil
} else { } else {
return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", cgroup.Resources.MemorySwappiness) return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", int64(*cgroup.Resources.MemorySwappiness))
} }
return nil return nil
@ -139,6 +223,11 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
return err return err
} }
stats.MemoryStats.KernelUsage = kernelUsage stats.MemoryStats.KernelUsage = kernelUsage
kernelTCPUsage, err := getMemoryData(path, "kmem.tcp")
if err != nil {
return err
}
stats.MemoryStats.KernelTCPUsage = kernelTCPUsage
return nil return nil
} }
@ -148,8 +237,9 @@ func memoryAssigned(cgroup *configs.Cgroup) bool {
cgroup.Resources.MemoryReservation != 0 || cgroup.Resources.MemoryReservation != 0 ||
cgroup.Resources.MemorySwap > 0 || cgroup.Resources.MemorySwap > 0 ||
cgroup.Resources.KernelMemory > 0 || cgroup.Resources.KernelMemory > 0 ||
cgroup.Resources.KernelMemoryTCP > 0 ||
cgroup.Resources.OomKillDisable || cgroup.Resources.OomKillDisable ||
cgroup.Resources.MemorySwappiness != -1 (cgroup.Resources.MemorySwappiness != nil && *cgroup.Resources.MemorySwappiness != -1)
} }
func getMemoryData(path, name string) (cgroups.MemoryData, error) { func getMemoryData(path, name string) (cgroups.MemoryData, error) {
@ -162,6 +252,7 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
usage := strings.Join([]string{moduleName, "usage_in_bytes"}, ".") usage := strings.Join([]string{moduleName, "usage_in_bytes"}, ".")
maxUsage := strings.Join([]string{moduleName, "max_usage_in_bytes"}, ".") maxUsage := strings.Join([]string{moduleName, "max_usage_in_bytes"}, ".")
failcnt := strings.Join([]string{moduleName, "failcnt"}, ".") failcnt := strings.Join([]string{moduleName, "failcnt"}, ".")
limit := strings.Join([]string{moduleName, "limit_in_bytes"}, ".")
value, err := getCgroupParamUint(path, usage) value, err := getCgroupParamUint(path, usage)
if err != nil { if err != nil {
@ -187,6 +278,14 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err) return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err)
} }
memoryData.Failcnt = value memoryData.Failcnt = value
value, err = getCgroupParamUint(path, limit)
if err != nil {
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err)
}
memoryData.Limit = value
return memoryData, nil return memoryData, nil
} }

View File

@ -9,6 +9,7 @@ import (
type NameGroup struct { type NameGroup struct {
GroupName string GroupName string
Join bool
} }
func (s *NameGroup) Name() string { func (s *NameGroup) Name() string {
@ -16,6 +17,10 @@ func (s *NameGroup) Name() string {
} }
func (s *NameGroup) Apply(d *cgroupData) error { func (s *NameGroup) Apply(d *cgroupData) error {
if s.Join {
// ignore errors if the named cgroup does not exist
d.join(s.GroupName)
}
return nil return nil
} }
@ -24,6 +29,9 @@ func (s *NameGroup) Set(path string, cgroup *configs.Cgroup) error {
} }
func (s *NameGroup) Remove(d *cgroupData) error { func (s *NameGroup) Remove(d *cgroupData) error {
if s.Join {
removePath(d.path(s.GroupName))
}
return nil return nil
} }

View File

@ -3,6 +3,8 @@
package fs package fs
import ( import (
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
) )
@ -23,8 +25,8 @@ func (s *NetClsGroup) Apply(d *cgroupData) error {
} }
func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error { func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.NetClsClassid != "" { if cgroup.Resources.NetClsClassid != 0 {
if err := writeFile(path, "net_cls.classid", cgroup.Resources.NetClsClassid); err != nil { if err := writeFile(path, "net_cls.classid", strconv.FormatUint(uint64(cgroup.Resources.NetClsClassid), 10)); err != nil {
return err return err
} }
} }

View File

@ -4,6 +4,7 @@ package fs
import ( import (
"fmt" "fmt"
"path/filepath"
"strconv" "strconv"
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
@ -47,11 +48,26 @@ func (s *PidsGroup) Remove(d *cgroupData) error {
} }
func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error { func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error {
value, err := getCgroupParamUint(path, "pids.current") current, err := getCgroupParamUint(path, "pids.current")
if err != nil { if err != nil {
return fmt.Errorf("failed to parse pids.current - %s", err) return fmt.Errorf("failed to parse pids.current - %s", err)
} }
stats.PidsStats.Current = value maxString, err := getCgroupParamString(path, "pids.max")
if err != nil {
return fmt.Errorf("failed to parse pids.max - %s", err)
}
// Default if pids.max == "max" is 0 -- which represents "no limit".
var max uint64
if maxString != "max" {
max, err = parseUint(maxString, 10, 64)
if err != nil {
return fmt.Errorf("failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q", maxString, filepath.Join(path, "pids.max"))
}
}
stats.PidsStats.Current = current
stats.PidsStats.Limit = max
return nil return nil
} }

View File

@ -12,7 +12,6 @@ import (
) )
var ( var (
ErrNotSupportStat = errors.New("stats are not supported for subsystem")
ErrNotValidFormat = errors.New("line is not a valid key value format") ErrNotValidFormat = errors.New("line is not a valid key value format")
) )

View File

@ -11,6 +11,7 @@ type ThrottlingData struct {
ThrottledTime uint64 `json:"throttled_time,omitempty"` ThrottledTime uint64 `json:"throttled_time,omitempty"`
} }
// CpuUsage denotes the usage of a CPU.
// All CPU stats are aggregate since container inception. // All CPU stats are aggregate since container inception.
type CpuUsage struct { type CpuUsage struct {
// Total CPU time consumed. // Total CPU time consumed.
@ -36,7 +37,9 @@ type MemoryData struct {
Usage uint64 `json:"usage,omitempty"` Usage uint64 `json:"usage,omitempty"`
MaxUsage uint64 `json:"max_usage,omitempty"` MaxUsage uint64 `json:"max_usage,omitempty"`
Failcnt uint64 `json:"failcnt"` Failcnt uint64 `json:"failcnt"`
Limit uint64 `json:"limit"`
} }
type MemoryStats struct { type MemoryStats struct {
// memory used for cache // memory used for cache
Cache uint64 `json:"cache,omitempty"` Cache uint64 `json:"cache,omitempty"`
@ -44,14 +47,19 @@ type MemoryStats struct {
Usage MemoryData `json:"usage,omitempty"` Usage MemoryData `json:"usage,omitempty"`
// usage of memory + swap // usage of memory + swap
SwapUsage MemoryData `json:"swap_usage,omitempty"` SwapUsage MemoryData `json:"swap_usage,omitempty"`
// usafe of kernel memory // usage of kernel memory
KernelUsage MemoryData `json:"kernel_usage,omitempty"` KernelUsage MemoryData `json:"kernel_usage,omitempty"`
// usage of kernel TCP memory
KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"`
Stats map[string]uint64 `json:"stats,omitempty"` Stats map[string]uint64 `json:"stats,omitempty"`
} }
type PidsStats struct { type PidsStats struct {
// number of pids in the cgroup // number of pids in the cgroup
Current uint64 `json:"current,omitempty"` Current uint64 `json:"current,omitempty"`
// active pids hard limit
Limit uint64 `json:"limit,omitempty"`
} }
type BlkioStatEntry struct { type BlkioStatEntry struct {
@ -78,7 +86,7 @@ type HugetlbStats struct {
Usage uint64 `json:"usage,omitempty"` Usage uint64 `json:"usage,omitempty"`
// maximum usage ever recorded. // maximum usage ever recorded.
MaxUsage uint64 `json:"max_usage,omitempty"` MaxUsage uint64 `json:"max_usage,omitempty"`
// number of times htgetlb usage allocation failure. // number of times hugetlb usage allocation failure.
Failcnt uint64 `json:"failcnt"` Failcnt uint64 `json:"failcnt"`
} }

View File

@ -74,6 +74,7 @@ var (
theConn *systemdDbus.Conn theConn *systemdDbus.Conn
hasStartTransientUnit bool hasStartTransientUnit bool
hasTransientDefaultDependencies bool hasTransientDefaultDependencies bool
hasDelegate bool
) )
func newProp(name string, units interface{}) systemdDbus.Property { func newProp(name string, units interface{}) systemdDbus.Property {
@ -146,20 +147,24 @@ func UseSystemd() bool {
// Not critical because of the stop unit logic above. // Not critical because of the stop unit logic above.
theConn.StopUnit(scope, "replace", nil) theConn.StopUnit(scope, "replace", nil)
// Assume StartTransientUnit on a scope allows Delegate
hasDelegate = true
dl := newProp("Delegate", true)
if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{dl}, nil); err != nil {
if dbusError, ok := err.(dbus.Error); ok {
if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") {
hasDelegate = false
}
}
}
// Not critical because of the stop unit logic above.
theConn.StopUnit(scope, "replace", nil)
} }
return hasStartTransientUnit return hasStartTransientUnit
} }
func getIfaceForUnit(unitName string) string {
if strings.HasSuffix(unitName, ".scope") {
return "Scope"
}
if strings.HasSuffix(unitName, ".service") {
return "Service"
}
return "Unit"
}
func (m *Manager) Apply(pid int) error { func (m *Manager) Apply(pid int) error {
var ( var (
c = m.Cgroups c = m.Cgroups
@ -195,6 +200,11 @@ func (m *Manager) Apply(pid int) error {
newProp("PIDs", []uint32{uint32(pid)}), newProp("PIDs", []uint32{uint32(pid)}),
) )
if hasDelegate {
// This is only supported on systemd versions 218 and above.
properties = append(properties, newProp("Delegate", true))
}
// Always enable accounting, this gets us the same behaviour as the fs implementation, // Always enable accounting, this gets us the same behaviour as the fs implementation,
// plus the kernel has some problems with joining the memory cgroup at a later time. // plus the kernel has some problems with joining the memory cgroup at a later time.
properties = append(properties, properties = append(properties,
@ -222,11 +232,9 @@ func (m *Manager) Apply(pid int) error {
newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight))) newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight)))
} }
// We need to set kernel memory before processes join cgroup because // We have to set kernel memory here, as we can't change it once
// kmem.limit_in_bytes can only be set when the cgroup is empty. // processes have been attached to the cgroup.
// And swap memory limit needs to be set after memory limit, only if c.Resources.KernelMemory != 0 {
// memory limit is handled by systemd, so it's kind of ugly here.
if c.Resources.KernelMemory > 0 {
if err := setKernelMemory(c); err != nil { if err := setKernelMemory(c); err != nil {
return err return err
} }
@ -236,53 +244,7 @@ func (m *Manager) Apply(pid int) error {
return err return err
} }
if err := joinDevices(c, pid); err != nil { if err := joinCgroups(c, pid); err != nil {
return err
}
// TODO: CpuQuota and CpuPeriod not available in systemd
// we need to manually join the cpu.cfs_quota_us and cpu.cfs_period_us
if err := joinCpu(c, pid); err != nil {
return err
}
// TODO: MemoryReservation and MemorySwap not available in systemd
if err := joinMemory(c, pid); err != nil {
return err
}
// we need to manually join the freezer, net_cls, net_prio, pids and cpuset cgroup in systemd
// because it does not currently support it via the dbus api.
if err := joinFreezer(c, pid); err != nil {
return err
}
if err := joinNetPrio(c, pid); err != nil {
return err
}
if err := joinNetCls(c, pid); err != nil {
return err
}
if err := joinPids(c, pid); err != nil {
return err
}
if err := joinCpuset(c, pid); err != nil {
return err
}
if err := joinHugetlb(c, pid); err != nil {
return err
}
if err := joinPerfEvent(c, pid); err != nil {
return err
}
// FIXME: Systemd does have `BlockIODeviceWeight` property, but we got problem
// using that (at least on systemd 208, see https://github.com/opencontainers/runc/libcontainer/pull/354),
// so use fs work around for now.
if err := joinBlkio(c, pid); err != nil {
return err return err
} }
@ -327,7 +289,7 @@ func writeFile(dir, file, data string) error {
// Normally dir should not be empty, one case is that cgroup subsystem // Normally dir should not be empty, one case is that cgroup subsystem
// is not mounted, we will get empty dir, and we want it fail here. // is not mounted, we will get empty dir, and we want it fail here.
if dir == "" { if dir == "" {
return fmt.Errorf("no such directory for %s.", file) return fmt.Errorf("no such directory for %s", file)
} }
return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700)
} }
@ -347,43 +309,41 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
return path, nil return path, nil
} }
func joinCpu(c *configs.Cgroup, pid int) error { func joinCgroups(c *configs.Cgroup, pid int) error {
_, err := join(c, "cpu", pid) for _, sys := range subsystems {
name := sys.Name()
switch name {
case "name=systemd":
// let systemd handle this
break
case "cpuset":
path, err := getSubsystemPath(c, name)
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
return nil s := &fs.CpusetGroup{}
} if err := s.ApplyDir(path, c, pid); err != nil {
return err
}
break
default:
_, err := join(c, name, pid)
if err != nil {
// Even if it's `not found` error, we'll return err
// because devices cgroup is hard requirement for
// container security.
if name == "devices" {
return err
}
// For other subsystems, omit the `not found` error
// because they are optional.
if !cgroups.IsNotFound(err) {
return err
}
}
}
}
func joinFreezer(c *configs.Cgroup, pid int) error {
_, err := join(c, "freezer", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func joinNetPrio(c *configs.Cgroup, pid int) error {
_, err := join(c, "net_prio", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func joinNetCls(c *configs.Cgroup, pid int) error {
_, err := join(c, "net_cls", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func joinPids(c *configs.Cgroup, pid int) error {
_, err := join(c, "pids", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil return nil
} }
@ -392,9 +352,18 @@ func joinPids(c *configs.Cgroup, pid int) error {
// test.slice/test-a.slice/test-a-b.slice. // test.slice/test-a.slice/test-a-b.slice.
func expandSlice(slice string) (string, error) { func expandSlice(slice string) (string, error) {
suffix := ".slice" suffix := ".slice"
sliceName := strings.TrimSuffix(slice, suffix) // Name has to end with ".slice", but can't be just ".slice".
if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) {
return "", fmt.Errorf("invalid slice name: %s", slice)
}
// Path-separators are not allowed.
if strings.Contains(slice, "/") {
return "", fmt.Errorf("invalid slice name: %s", slice)
}
var path, prefix string var path, prefix string
sliceName := strings.TrimSuffix(slice, suffix)
for _, component := range strings.Split(sliceName, "-") { for _, component := range strings.Split(sliceName, "-") {
// test--a.slice isn't permitted, nor is -test.slice. // test--a.slice isn't permitted, nor is -test.slice.
if component == "" { if component == "" {
@ -510,87 +479,11 @@ func getUnitName(c *configs.Cgroup) string {
return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name) return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
} }
// Atm we can't use the systemd device support because of two missing things:
// * Support for wildcards to allow mknod on any device
// * Support for wildcards to allow /dev/pts support
//
// The second is available in more recent systemd as "char-pts", but not in e.g. v208 which is
// in wide use. When both these are available we will be able to switch, but need to keep the old
// implementation for backwards compat.
//
// Note: we can't use systemd to set up the initial limits, and then change the cgroup
// because systemd will re-write the device settings if it needs to re-apply the cgroup context.
// This happens at least for v208 when any sibling unit is started.
func joinDevices(c *configs.Cgroup, pid int) error {
_, err := join(c, "devices", pid)
// Even if it's `not found` error, we'll return err because devices cgroup
// is hard requirement for container security.
if err != nil {
return err
}
return nil
}
func setKernelMemory(c *configs.Cgroup) error { func setKernelMemory(c *configs.Cgroup) error {
path, err := getSubsystemPath(c, "memory") path, err := getSubsystemPath(c, "memory")
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
if err := os.MkdirAll(path, 0755); err != nil { return os.MkdirAll(path, 0755)
return err
}
// This doesn't get called by manager.Set, so we need to do it here.
s := &fs.MemoryGroup{}
return s.SetKernelMemory(path, c)
}
func joinMemory(c *configs.Cgroup, pid int) error {
_, err := join(c, "memory", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
// systemd does not atm set up the cpuset controller, so we must manually
// join it. Additionally that is a very finicky controller where each
// level must have a full setup as the default for a new directory is "no cpus"
func joinCpuset(c *configs.Cgroup, pid int) error {
path, err := getSubsystemPath(c, "cpuset")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
s := &fs.CpusetGroup{}
return s.ApplyDir(path, c, pid)
}
// `BlockIODeviceWeight` property of systemd does not work properly, and systemd
// expects device path instead of major minor numbers, which is also confusing
// for users. So we use fs work around for now.
func joinBlkio(c *configs.Cgroup, pid int) error {
_, err := join(c, "blkio", pid)
if err != nil {
return err
}
return nil
}
func joinHugetlb(c *configs.Cgroup, pid int) error {
_, err := join(c, "hugetlb", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func joinPerfEvent(c *configs.Cgroup, pid int) error {
_, err := join(c, "perf_event", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
} }

View File

@ -5,6 +5,7 @@ package cgroups
import ( import (
"bufio" "bufio"
"fmt" "fmt"
"io"
"io/ioutil" "io/ioutil"
"os" "os"
"path/filepath" "path/filepath"
@ -12,17 +13,19 @@ import (
"strings" "strings"
"time" "time"
"github.com/docker/docker/pkg/mount"
"github.com/docker/go-units" "github.com/docker/go-units"
) )
const cgroupNamePrefix = "name=" const cgroupNamePrefix = "name="
// https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt // https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
func FindCgroupMountpoint(subsystem string) (string, error) { func FindCgroupMountpoint(subsystem string) (string, error) {
// We are not using mount.GetMounts() because it's super-inefficient, // We are not using mount.GetMounts() because it's super-inefficient,
// parsing it directly sped up x10 times because of not using Sscanf. // parsing it directly sped up x10 times because of not using Sscanf.
// It was one of two major performance drawbacks in container start. // It was one of two major performance drawbacks in container start.
if !isSubsystemAvailable(subsystem) {
return "", NewNotFoundError(subsystem)
}
f, err := os.Open("/proc/self/mountinfo") f, err := os.Open("/proc/self/mountinfo")
if err != nil { if err != nil {
return "", err return "", err
@ -47,6 +50,9 @@ func FindCgroupMountpoint(subsystem string) (string, error) {
} }
func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) { func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) {
if !isSubsystemAvailable(subsystem) {
return "", "", NewNotFoundError(subsystem)
}
f, err := os.Open("/proc/self/mountinfo") f, err := os.Open("/proc/self/mountinfo")
if err != nil { if err != nil {
return "", "", err return "", "", err
@ -70,6 +76,15 @@ func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) {
return "", "", NewNotFoundError(subsystem) return "", "", NewNotFoundError(subsystem)
} }
// isSubsystemAvailable reports whether subsystem is a key of the map that
// ParseCgroupFile produces for /proc/self/cgroup. If that file cannot be
// parsed the subsystem is reported as unavailable; the parse error itself
// is not surfaced to the caller.
func isSubsystemAvailable(subsystem string) bool {
	joined, err := ParseCgroupFile("/proc/self/cgroup")
	if err == nil {
		if _, present := joined[subsystem]; present {
			return true
		}
	}
	return false
}
func FindCgroupMountpointDir() (string, error) { func FindCgroupMountpointDir() (string, error) {
f, err := os.Open("/proc/self/mountinfo") f, err := os.Open("/proc/self/mountinfo")
if err != nil { if err != nil {
@ -121,42 +136,63 @@ func (m Mount) GetThisCgroupDir(cgroups map[string]string) (string, error) {
return getControllerPath(m.Subsystems[0], cgroups) return getControllerPath(m.Subsystems[0], cgroups)
} }
// getCgroupMountsHelper reads mountinfo-formatted lines from mi and returns a
// Mount for every line whose filesystem type (the text right after the " - "
// separator) begins with "cgroup".
//
// For each such line, Root is taken from the fourth space-separated field and
// Mountpoint from the fifth. The last space-separated field is split on
// commas, and every entry that is set to true in ss is recorded in
// Mount.Subsystems (with the cgroupNamePrefix stripped when present).
//
// Scanning stops once len(ss) subsystem entries have been recorded, so an
// empty ss yields an empty result without reading any input. A matching line
// is appended to the result even when none of its options are in ss.
//
// An error is returned when a line has no " - " separator, when a cgroup line
// has fewer than five space-separated fields, or when the scanner fails.
func getCgroupMountsHelper(ss map[string]bool, mi io.Reader) ([]Mount, error) {
	res := make([]Mount, 0, len(ss))
	scanner := bufio.NewScanner(mi)
	numFound := 0
	for scanner.Scan() && numFound < len(ss) {
		txt := scanner.Text()
		sepIdx := strings.Index(txt, " - ")
		if sepIdx == -1 {
			return nil, fmt.Errorf("invalid mountinfo format")
		}
		// Prefix test instead of a fixed-width slice: slicing
		// txt[sepIdx+3:sepIdx+9] panics when fewer than six bytes follow
		// the separator, while HasPrefix simply reports false.
		if !strings.HasPrefix(txt[sepIdx+3:], "cgroup") {
			continue
		}
		fields := strings.Split(txt, " ")
		// fields[3] and fields[4] are read below; reject truncated lines
		// with an error rather than an index-out-of-range panic.
		if len(fields) < 5 {
			return nil, fmt.Errorf("invalid mountinfo format")
		}
		m := Mount{
			Mountpoint: fields[4],
			Root:       fields[3],
		}
		for _, opt := range strings.Split(fields[len(fields)-1], ",") {
			if !ss[opt] {
				continue
			}
			if strings.HasPrefix(opt, cgroupNamePrefix) {
				m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):])
			} else {
				m.Subsystems = append(m.Subsystems, opt)
			}
			numFound++
		}
		res = append(res, m)
	}
	if err := scanner.Err(); err != nil {
		return nil, err
	}
	return res, nil
}
func GetCgroupMounts() ([]Mount, error) { func GetCgroupMounts() ([]Mount, error) {
mounts, err := mount.GetMounts() f, err := os.Open("/proc/self/mountinfo")
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer f.Close()
all, err := GetAllSubsystems() all, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil { if err != nil {
return nil, err return nil, err
} }
allMap := make(map[string]bool) allMap := make(map[string]bool)
for _, s := range all { for s := range all {
allMap[s] = true allMap[s] = true
} }
return getCgroupMountsHelper(allMap, f)
res := []Mount{}
for _, mount := range mounts {
if mount.Fstype == "cgroup" {
m := Mount{Mountpoint: mount.Mountpoint, Root: mount.Root}
for _, opt := range strings.Split(mount.VfsOpts, ",") {
if strings.HasPrefix(opt, cgroupNamePrefix) {
m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):])
}
if allMap[opt] {
m.Subsystems = append(m.Subsystems, opt)
}
}
res = append(res, m)
}
}
return res, nil
} }
// Returns all the cgroup subsystems supported by the kernel // GetAllSubsystems returns all the cgroup subsystems supported by the kernel
func GetAllSubsystems() ([]string, error) { func GetAllSubsystems() ([]string, error) {
f, err := os.Open("/proc/cgroups") f, err := os.Open("/proc/cgroups")
if err != nil { if err != nil {
@ -182,7 +218,7 @@ func GetAllSubsystems() ([]string, error) {
return subsystems, nil return subsystems, nil
} }
// Returns the relative path to the cgroup docker is running in. // GetThisCgroupDir returns the relative path to the cgroup docker is running in.
func GetThisCgroupDir(subsystem string) (string, error) { func GetThisCgroupDir(subsystem string) (string, error) {
cgroups, err := ParseCgroupFile("/proc/self/cgroup") cgroups, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil { if err != nil {
@ -226,6 +262,8 @@ func readProcsFile(dir string) ([]int, error) {
return out, nil return out, nil
} }
// ParseCgroupFile parses the given cgroup file, typically from
// /proc/<pid>/cgroup, into a map of subgroups to cgroup names.
func ParseCgroupFile(path string) (map[string]string, error) { func ParseCgroupFile(path string) (map[string]string, error) {
f, err := os.Open(path) f, err := os.Open(path)
if err != nil { if err != nil {
@ -233,7 +271,12 @@ func ParseCgroupFile(path string) (map[string]string, error) {
} }
defer f.Close() defer f.Close()
s := bufio.NewScanner(f) return parseCgroupFromReader(f)
}
// helper function for ParseCgroupFile to make testing easier
func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
s := bufio.NewScanner(r)
cgroups := make(map[string]string) cgroups := make(map[string]string)
for s.Scan() { for s.Scan() {
@ -242,7 +285,16 @@ func ParseCgroupFile(path string) (map[string]string, error) {
} }
text := s.Text() text := s.Text()
parts := strings.Split(text, ":") // from cgroups(7):
// /proc/[pid]/cgroup
// ...
// For each cgroup hierarchy ... there is one entry
// containing three colon-separated fields of the form:
// hierarchy-ID:subsystem-list:cgroup-path
parts := strings.SplitN(text, ":", 3)
if len(parts) < 3 {
return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text)
}
for _, subs := range strings.Split(parts[1], ",") { for _, subs := range strings.Split(parts[1], ",") {
cgroups[subs] = parts[2] cgroups[subs] = parts[2]
@ -309,7 +361,7 @@ func RemovePaths(paths map[string]string) (err error) {
return nil return nil
} }
} }
return fmt.Errorf("Failed to remove paths: %s", paths) return fmt.Errorf("Failed to remove paths: %v", paths)
} }
func GetHugePageSize() ([]string, error) { func GetHugePageSize() ([]string, error) {

View File

@ -11,15 +11,22 @@ const (
) )
type Cgroup struct { type Cgroup struct {
Name string `json:"name"` // Deprecated, use Path instead
Name string `json:"name,omitempty"`
// name of parent cgroup or slice // name of parent of cgroup or slice
Parent string `json:"parent"` // Deprecated, use Path instead
Parent string `json:"parent,omitempty"`
// Path specifies the path to cgroups that are created and/or joined by the container.
// The path is assumed to be relative to the host system cgroup mountpoint.
Path string `json:"path"`
// ScopePrefix decribes prefix for the scope name // ScopePrefix decribes prefix for the scope name
ScopePrefix string `json:"scope_prefix"` ScopePrefix string `json:"scope_prefix"`
// Paths represent the cgroups paths to join // Paths represent the absolute cgroups paths to join.
// This takes precedence over Path.
Paths map[string]string Paths map[string]string
// Resources contains various cgroups settings to apply // Resources contains various cgroups settings to apply
@ -28,11 +35,14 @@ type Cgroup struct {
type Resources struct { type Resources struct {
// If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list. // If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list.
AllowAllDevices bool `json:"allow_all_devices"` // Deprecated
AllowAllDevices bool `json:"allow_all_devices,omitempty"`
// Deprecated
AllowedDevices []*Device `json:"allowed_devices,omitempty"`
// Deprecated
DeniedDevices []*Device `json:"denied_devices,omitempty"`
AllowedDevices []*Device `json:"allowed_devices"` Devices []*Device `json:"devices"`
DeniedDevices []*Device `json:"denied_devices"`
// Memory limit (in bytes) // Memory limit (in bytes)
Memory int64 `json:"memory"` Memory int64 `json:"memory"`
@ -46,6 +56,9 @@ type Resources struct {
// Kernel memory limit (in bytes) // Kernel memory limit (in bytes)
KernelMemory int64 `json:"kernel_memory"` KernelMemory int64 `json:"kernel_memory"`
// Kernel memory limit for TCP use (in bytes)
KernelMemoryTCP int64 `json:"kernel_memory_tcp"`
// CPU shares (relative weight vs. other containers) // CPU shares (relative weight vs. other containers)
CpuShares int64 `json:"cpu_shares"` CpuShares int64 `json:"cpu_shares"`
@ -56,10 +69,10 @@ type Resources struct {
CpuPeriod int64 `json:"cpu_period"` CpuPeriod int64 `json:"cpu_period"`
// How many time CPU will use in realtime scheduling (in usecs). // How many time CPU will use in realtime scheduling (in usecs).
CpuRtRuntime int64 `json:"cpu_quota"` CpuRtRuntime int64 `json:"cpu_rt_quota"`
// CPU period to be used for realtime scheduling (in usecs). // CPU period to be used for realtime scheduling (in usecs).
CpuRtPeriod int64 `json:"cpu_period"` CpuRtPeriod int64 `json:"cpu_rt_period"`
// CPU to use // CPU to use
CpusetCpus string `json:"cpuset_cpus"` CpusetCpus string `json:"cpuset_cpus"`
@ -101,11 +114,11 @@ type Resources struct {
OomKillDisable bool `json:"oom_kill_disable"` OomKillDisable bool `json:"oom_kill_disable"`
// Tuning swappiness behaviour per cgroup // Tuning swappiness behaviour per cgroup
MemorySwappiness int64 `json:"memory_swappiness"` MemorySwappiness *int64 `json:"memory_swappiness"`
// Set priority of network traffic for container // Set priority of network traffic for container
NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"` NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"`
// Set class identifier for container's network packets // Set class identifier for container's network packets
NetClsClassid string `json:"net_cls_classid"` NetClsClassid uint32 `json:"net_cls_classid"`
} }

View File

@ -3,7 +3,11 @@ package configs
import ( import (
"bytes" "bytes"
"encoding/json" "encoding/json"
"fmt"
"os/exec" "os/exec"
"time"
"github.com/Sirupsen/logrus"
) )
type Rlimit struct { type Rlimit struct {
@ -29,7 +33,7 @@ type Seccomp struct {
Syscalls []*Syscall `json:"syscalls"` Syscalls []*Syscall `json:"syscalls"`
} }
// An action to be taken upon rule match in Seccomp // Action is taken upon rule match in Seccomp
type Action int type Action int
const ( const (
@ -40,7 +44,7 @@ const (
Trace Trace
) )
// A comparison operator to be used when matching syscall arguments in Seccomp // Operator is a comparison operator to be used when matching syscall arguments in Seccomp
type Operator int type Operator int
const ( const (
@ -53,7 +57,7 @@ const (
MaskEqualTo MaskEqualTo
) )
// A rule to match a specific syscall argument in Seccomp // Arg is a rule to match a specific syscall argument in Seccomp
type Arg struct { type Arg struct {
Index uint `json:"index"` Index uint `json:"index"`
Value uint64 `json:"value"` Value uint64 `json:"value"`
@ -61,7 +65,7 @@ type Arg struct {
Op Operator `json:"op"` Op Operator `json:"op"`
} }
// An rule to match a syscall in Seccomp // Syscall is a rule to match a syscall in Seccomp
type Syscall struct { type Syscall struct {
Name string `json:"name"` Name string `json:"name"`
Action Action `json:"action"` Action Action `json:"action"`
@ -128,15 +132,15 @@ type Config struct {
// AppArmorProfile specifies the profile to apply to the process running in the container and is // AppArmorProfile specifies the profile to apply to the process running in the container and is
// change at the time the process is execed // change at the time the process is execed
AppArmorProfile string `json:"apparmor_profile"` AppArmorProfile string `json:"apparmor_profile,omitempty"`
// ProcessLabel specifies the label to apply to the process running in the container. It is // ProcessLabel specifies the label to apply to the process running in the container. It is
// commonly used by selinux // commonly used by selinux
ProcessLabel string `json:"process_label"` ProcessLabel string `json:"process_label,omitempty"`
// Rlimits specifies the resource limits, such as max open files, to set in the container // Rlimits specifies the resource limits, such as max open files, to set in the container
// If Rlimits are not set, the container will inherit rlimits from the parent process // If Rlimits are not set, the container will inherit rlimits from the parent process
Rlimits []Rlimit `json:"rlimits"` Rlimits []Rlimit `json:"rlimits,omitempty"`
// OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores // OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
// for a process. Valid values are between the range [-1000, '1000'], where processes with // for a process. Valid values are between the range [-1000, '1000'], where processes with
@ -144,10 +148,6 @@ type Config struct {
// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/ // More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
OomScoreAdj int `json:"oom_score_adj"` OomScoreAdj int `json:"oom_score_adj"`
// AdditionalGroups specifies the gids that should be added to supplementary groups
// in addition to those that the user belongs to.
AdditionalGroups []string `json:"additional_groups"`
// UidMappings is an array of User ID mappings for User Namespaces // UidMappings is an array of User ID mappings for User Namespaces
UidMappings []IDMap `json:"uid_mappings"` UidMappings []IDMap `json:"uid_mappings"`
@ -171,12 +171,22 @@ type Config struct {
// A default action to be taken if no rules match is also given. // A default action to be taken if no rules match is also given.
Seccomp *Seccomp `json:"seccomp"` Seccomp *Seccomp `json:"seccomp"`
// NoNewPrivileges controls whether processes in the container can gain additional privileges.
NoNewPrivileges bool `json:"no_new_privileges,omitempty"`
// Hooks are a collection of actions to perform at various container lifecycle events. // Hooks are a collection of actions to perform at various container lifecycle events.
// Hooks are not able to be marshaled to json but they are also not needed to. // CommandHooks are serialized to JSON, but other hooks are not.
Hooks *Hooks `json:"-"` Hooks *Hooks
// Version is the version of opencontainer specification that is supported. // Version is the version of opencontainer specification that is supported.
Version string `json:"version"` Version string `json:"version"`
// Labels are user defined metadata that is stored in the config and populated on the state
Labels []string `json:"labels"`
// NoNewKeyring will not allocate a new session keyring for the container. It will use the
// caller's keyring in this case.
NoNewKeyring bool `json:"no_new_keyring"`
} }
type Hooks struct { type Hooks struct {
@ -191,12 +201,59 @@ type Hooks struct {
Poststop []Hook Poststop []Hook
} }
// UnmarshalJSON decodes b into three lists of CommandHook (Prestart,
// Poststart and Poststop) and stores each list on hooks as a []Hook.
// Any error from json.Unmarshal is returned unchanged, in which case hooks
// is left unmodified.
func (hooks *Hooks) UnmarshalJSON(b []byte) error {
	var decoded struct {
		Prestart  []CommandHook
		Poststart []CommandHook
		Poststop  []CommandHook
	}
	if err := json.Unmarshal(b, &decoded); err != nil {
		return err
	}

	// toHooks widens every CommandHook to the Hook interface. An empty
	// input produces a nil slice, not an allocated empty one.
	toHooks := func(cmds []CommandHook) []Hook {
		var out []Hook
		for i := range cmds {
			out = append(out, cmds[i])
		}
		return out
	}

	hooks.Prestart = toHooks(decoded.Prestart)
	hooks.Poststart = toHooks(decoded.Poststart)
	hooks.Poststop = toHooks(decoded.Poststop)
	return nil
}
// MarshalJSON encodes hooks as a JSON object with the keys "prestart",
// "poststart" and "poststop". Only hooks whose dynamic type is CommandHook
// are written; every other hook is skipped with a logged warning.
func (hooks Hooks) MarshalJSON() ([]byte, error) {
	// commandsOnly keeps the CommandHook entries of all, in order. The
	// result stays nil when nothing is kept.
	commandsOnly := func(all []Hook) []CommandHook {
		var kept []CommandHook
		for _, h := range all {
			cmd, ok := h.(CommandHook)
			if !ok {
				logrus.Warnf("cannot serialize hook of type %T, skipping", h)
				continue
			}
			kept = append(kept, cmd)
		}
		return kept
	}

	payload := map[string]interface{}{
		"prestart":  commandsOnly(hooks.Prestart),
		"poststart": commandsOnly(hooks.Poststart),
		"poststop":  commandsOnly(hooks.Poststop),
	}
	return json.Marshal(payload)
}
// HookState is the payload provided to a hook on execution. // HookState is the payload provided to a hook on execution.
type HookState struct { type HookState struct {
Version string `json:"version"` Version string `json:"ociVersion"`
ID string `json:"id"` ID string `json:"id"`
Pid int `json:"pid"` Pid int `json:"pid"`
Root string `json:"root"` Root string `json:"root"`
BundlePath string `json:"bundlePath"`
} }
type Hook interface { type Hook interface {
@ -204,7 +261,7 @@ type Hook interface {
Run(HookState) error Run(HookState) error
} }
// NewFunctionHooks will call the provided function when the hook is run. // NewFunctionHook will call the provided function when the hook is run.
func NewFunctionHook(f func(HookState) error) FuncHook { func NewFunctionHook(f func(HookState) error) FuncHook {
return FuncHook{ return FuncHook{
run: f, run: f,
@ -224,9 +281,10 @@ type Command struct {
Args []string `json:"args"` Args []string `json:"args"`
Env []string `json:"env"` Env []string `json:"env"`
Dir string `json:"dir"` Dir string `json:"dir"`
Timeout *time.Duration `json:"timeout"`
} }
// NewCommandHooks will execute the provided command when the hook is run. // NewCommandHook will execute the provided command when the hook is run.
func NewCommandHook(cmd Command) CommandHook { func NewCommandHook(cmd Command) CommandHook {
return CommandHook{ return CommandHook{
Command: cmd, Command: cmd,
@ -248,5 +306,23 @@ func (c Command) Run(s HookState) error {
Env: c.Env, Env: c.Env,
Stdin: bytes.NewReader(b), Stdin: bytes.NewReader(b),
} }
return cmd.Run() errC := make(chan error, 1)
go func() {
out, err := cmd.CombinedOutput()
if err != nil {
err = fmt.Errorf("%s: %s", err, out)
}
errC <- err
}()
if c.Timeout != nil {
select {
case err := <-errC:
return err
case <-time.After(*c.Timeout):
cmd.Process.Kill()
cmd.Wait()
return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
}
}
return <-errC
} }

View File

@ -4,7 +4,7 @@ package configs
import "fmt" import "fmt"
// Gets the root uid for the process on host which could be non-zero // HostUID gets the root uid for the process on host which could be non-zero
// when user namespaces are enabled. // when user namespaces are enabled.
func (c Config) HostUID() (int, error) { func (c Config) HostUID() (int, error) {
if c.Namespaces.Contains(NEWUSER) { if c.Namespaces.Contains(NEWUSER) {
@ -21,7 +21,7 @@ func (c Config) HostUID() (int, error) {
return 0, nil return 0, nil
} }
// Gets the root gid for the process on host which could be non-zero // HostGID gets the root gid for the process on host which could be non-zero
// when user namespaces are enabled. // when user namespaces are enabled.
func (c Config) HostGID() (int, error) { func (c Config) HostGID() (int, error) {
if c.Namespaces.Contains(NEWUSER) { if c.Namespaces.Contains(NEWUSER) {

View File

@ -35,6 +35,9 @@ type Device struct {
// Gid of the device. // Gid of the device.
Gid uint32 `json:"gid"` Gid uint32 `json:"gid"`
// Write the file to the allowed list
Allow bool `json:"allow"`
} }
func (d *Device) CgroupString() string { func (d *Device) CgroupString() string {

View File

@ -3,7 +3,7 @@
package configs package configs
var ( var (
// These are devices that are to be both allowed and created. // DefaultSimpleDevices are devices that are to be both allowed and created.
DefaultSimpleDevices = []*Device{ DefaultSimpleDevices = []*Device{
// /dev/null and zero // /dev/null and zero
{ {

View File

@ -18,7 +18,7 @@ var namespaceInfo = map[NamespaceType]int{
} }
// CloneFlags parses the container's Namespaces options to set the correct // CloneFlags parses the container's Namespaces options to set the correct
// flags on clone, unshare. This functions returns flags only for new namespaces. // flags on clone, unshare. This function returns flags only for new namespaces.
func (n *Namespaces) CloneFlags() uintptr { func (n *Namespaces) CloneFlags() uintptr {
var flag int var flag int
for _, v := range *n { for _, v := range *n {

View File

@ -8,7 +8,7 @@ func (n *Namespace) Syscall() int {
} }
// CloneFlags parses the container's Namespaces options to set the correct // CloneFlags parses the container's Namespaces options to set the correct
// flags on clone, unshare. This functions returns flags only for new namespaces. // flags on clone, unshare. This function returns flags only for new namespaces.
func (n *Namespaces) CloneFlags() uintptr { func (n *Namespaces) CloneFlags() uintptr {
panic("No namespace syscall support") panic("No namespace syscall support")
return uintptr(0) return uintptr(0)

View File

@ -2,7 +2,11 @@
package configs package configs
import "fmt" import (
"fmt"
"os"
"sync"
)
const ( const (
NEWNET NamespaceType = "NEWNET" NEWNET NamespaceType = "NEWNET"
@ -13,6 +17,51 @@ const (
NEWUSER NamespaceType = "NEWUSER" NEWUSER NamespaceType = "NEWUSER"
) )
var (
// nsLock serializes access to supportedNamespaces.
nsLock sync.Mutex
// supportedNamespaces caches, per namespace type, the answer computed by
// IsNamespaceSupported so the probe only has to run once per type.
supportedNamespaces = make(map[NamespaceType]bool)
)
// nsToFile returns the file name used for the given namespace type inside a
// process's /proc ns directory (for example "net" for NEWNET). A namespace
// type that is not recognized yields the empty string.
func nsToFile(ns NamespaceType) string {
	name := ""
	switch ns {
	case NEWNET:
		name = "net"
	case NEWNS:
		name = "mnt"
	case NEWPID:
		name = "pid"
	case NEWIPC:
		name = "ipc"
	case NEWUSER:
		name = "user"
	case NEWUTS:
		name = "uts"
	}
	return name
}
// IsNamespaceSupported reports whether the given namespace type is available
// to the current process. The answer for a known type is determined by
// stat'ing /proc/self/ns/<name> and is then remembered in supportedNamespaces,
// which is guarded by nsLock. Unknown namespace types are reported as
// unsupported and are not cached.
func IsNamespaceSupported(ns NamespaceType) bool {
	nsLock.Lock()
	defer nsLock.Unlock()

	if cached, found := supportedNamespaces[ns]; found {
		return cached
	}

	name := nsToFile(ns)
	if name == "" {
		// unknown namespace type: there is nothing to probe
		return false
	}

	// The namespace counts as supported only when its entry can be stat'ed,
	// i.e. it exists and we have permission to read it.
	_, statErr := os.Stat(fmt.Sprintf("/proc/self/ns/%s", name))
	result := statErr == nil
	supportedNamespaces[ns] = result
	return result
}
func NamespaceTypes() []NamespaceType { func NamespaceTypes() []NamespaceType {
return []NamespaceType{ return []NamespaceType{
NEWNET, NEWNET,
@ -35,26 +84,7 @@ func (n *Namespace) GetPath(pid int) string {
if n.Path != "" { if n.Path != "" {
return n.Path return n.Path
} }
return fmt.Sprintf("/proc/%d/ns/%s", pid, n.file()) return fmt.Sprintf("/proc/%d/ns/%s", pid, nsToFile(n.Type))
}
func (n *Namespace) file() string {
file := ""
switch n.Type {
case NEWNET:
file = "net"
case NEWNS:
file = "mnt"
case NEWPID:
file = "pid"
case NEWIPC:
file = "ipc"
case NEWUSER:
file = "user"
case NEWUTS:
file = "uts"
}
return file
} }
func (n *Namespaces) Remove(t NamespaceType) bool { func (n *Namespaces) Remove(t NamespaceType) bool {
@ -87,3 +117,11 @@ func (n *Namespaces) index(t NamespaceType) int {
func (n *Namespaces) Contains(t NamespaceType) bool { func (n *Namespaces) Contains(t NamespaceType) bool {
return n.index(t) != -1 return n.index(t) != -1
} }
// PathOf returns the Path of the namespace entry that n.index locates for
// type t, or the empty string when no such entry is present.
func (n *Namespaces) PathOf(t NamespaceType) string {
	if pos := n.index(t); pos != -1 {
		return (*n)[pos].Path
	}
	return ""
}

View File

@ -4,8 +4,10 @@ import (
"fmt" "fmt"
"os" "os"
"path/filepath" "path/filepath"
"strings"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/selinux"
) )
type Validator interface { type Validator interface {
@ -35,10 +37,13 @@ func (v *ConfigValidator) Validate(config *configs.Config) error {
if err := v.usernamespace(config); err != nil { if err := v.usernamespace(config); err != nil {
return err return err
} }
if err := v.sysctl(config); err != nil {
return err
}
return nil return nil
} }
// rootfs validates the the rootfs is an absolute path and is not a symlink // rootfs validates if the rootfs is an absolute path and is not a symlink
// to the container's root filesystem. // to the container's root filesystem.
func (v *ConfigValidator) rootfs(config *configs.Config) error { func (v *ConfigValidator) rootfs(config *configs.Config) error {
cleaned, err := filepath.Abs(config.Rootfs) cleaned, err := filepath.Abs(config.Rootfs)
@ -48,7 +53,7 @@ func (v *ConfigValidator) rootfs(config *configs.Config) error {
if cleaned, err = filepath.EvalSymlinks(cleaned); err != nil { if cleaned, err = filepath.EvalSymlinks(cleaned); err != nil {
return err return err
} }
if config.Rootfs != cleaned { if filepath.Clean(config.Rootfs) != cleaned {
return fmt.Errorf("%s is not an absolute path or is a symlink", config.Rootfs) return fmt.Errorf("%s is not an absolute path or is a symlink", config.Rootfs)
} }
return nil return nil
@ -76,6 +81,10 @@ func (v *ConfigValidator) security(config *configs.Config) error {
!config.Namespaces.Contains(configs.NEWNS) { !config.Namespaces.Contains(configs.NEWNS) {
return fmt.Errorf("unable to restrict sys entries without a private MNT namespace") return fmt.Errorf("unable to restrict sys entries without a private MNT namespace")
} }
if config.ProcessLabel != "" && !selinux.SelinuxEnabled() {
return fmt.Errorf("selinux label is specified in config, but selinux is disabled or not supported")
}
return nil return nil
} }
@ -91,3 +100,39 @@ func (v *ConfigValidator) usernamespace(config *configs.Config) error {
} }
return nil return nil
} }
// sysctl checks every key in config.Sysctl against the namespaces the
// container is configured with. /proc/sys isn't completely namespaced, so
// only a subset of sysctls is permitted:
//
//   - the listed kernel.* IPC keys and anything under "fs.mqueue." require a
//     NEWIPC namespace in the config;
//   - anything under "net." requires a NEWNET namespace in the config;
//   - every other key is rejected.
//
// The first offending key encountered produces the returned error.
func (v *ConfigValidator) sysctl(config *configs.Config) error {
	ipcSysctls := map[string]bool{
		"kernel.msgmax":          true,
		"kernel.msgmnb":          true,
		"kernel.msgmni":          true,
		"kernel.sem":             true,
		"kernel.shmall":          true,
		"kernel.shmmax":          true,
		"kernel.shmmni":          true,
		"kernel.shm_rmid_forced": true,
	}

	for key := range config.Sysctl {
		switch {
		case ipcSysctls[key] || strings.HasPrefix(key, "fs.mqueue."):
			if !config.Namespaces.Contains(configs.NEWIPC) {
				return fmt.Errorf("sysctl %q is not allowed in the hosts ipc namespace", key)
			}
		case strings.HasPrefix(key, "net."):
			if !config.Namespaces.Contains(configs.NEWNET) {
				return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", key)
			}
		default:
			return fmt.Errorf("sysctl %q is not in a separate kernel namespace", key)
		}
	}
	return nil
}

View File

@ -0,0 +1,11 @@
package libcontainer
import (
"errors"
)
// NewConsole is the Solaris implementation of the console constructor. It
// ignores uid and gid and always returns a nil Console together with an error
// stating that the libcontainer console is not supported on this platform.
func NewConsole(uid, gid int) (Console, error) {
	unsupported := errors.New("libcontainer console is not supported on Solaris")
	return nil, unsupported
}

View File

@ -1,4 +1,4 @@
// Libcontainer provides a native Go implementation for creating containers // Package libcontainer provides a native Go implementation for creating containers
// with namespaces, cgroups, capabilities, and filesystem access controls. // with namespaces, cgroups, capabilities, and filesystem access controls.
// It allows you to manage the lifecycle of the container performing additional operations // It allows you to manage the lifecycle of the container performing additional operations
// after the container is created. // after the container is created.
@ -6,31 +6,25 @@ package libcontainer
import ( import (
"os" "os"
"time"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
) )
// The status of a container. // Status is the status of a container.
type Status int type Status int
const ( const (
// The container exists but has not been run yet // Created is the status that denotes the container exists but has not been run yet.
Created Status = iota Created Status = iota
// Running is the status that denotes the container exists and is running.
// The container exists and is running.
Running Running
// Pausing is the status that denotes the container exists, it is in the process of being paused.
// The container exists, it is in the process of being paused.
Pausing Pausing
// Paused is the status that denotes the container exists, but all its processes are paused.
// The container exists, but all its processes are paused.
Paused Paused
// Stopped is the status that denotes the container does not have a created or running process.
// The container exists, but its state is saved on disk Stopped
Checkpointed
// The container does not exist.
Destroyed
) )
func (s Status) String() string { func (s Status) String() string {
@ -43,10 +37,8 @@ func (s Status) String() string {
return "pausing" return "pausing"
case Paused: case Paused:
return "paused" return "paused"
case Checkpointed: case Stopped:
return "checkpointed" return "stopped"
case Destroyed:
return "destroyed"
default: default:
return "unknown" return "unknown"
} }
@ -61,14 +53,17 @@ type BaseState struct {
// InitProcessPid is the init process id in the parent namespace. // InitProcessPid is the init process id in the parent namespace.
InitProcessPid int `json:"init_process_pid"` InitProcessPid int `json:"init_process_pid"`
// InitProcessStartTime is the init process start time. // InitProcessStartTime is the init process start time in clock cycles since boot time.
InitProcessStartTime string `json:"init_process_start"` InitProcessStartTime string `json:"init_process_start"`
// Created is the unix timestamp for the creation time of the container in UTC
Created time.Time `json:"created"`
// Config is the container's configuration. // Config is the container's configuration.
Config configs.Config `json:"config"` Config configs.Config `json:"config"`
} }
// A libcontainer container object. // BaseContainer is a libcontainer container object.
// //
// Each container is thread-safe within the same process. Since a container can // Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container // be destroyed by a separate process, any function may return that the container
@ -81,13 +76,13 @@ type BaseContainer interface {
// //
// errors: // errors:
// ContainerDestroyed - Container no longer exists, // ContainerDestroyed - Container no longer exists,
// Systemerror - System error. // SystemError - System error.
Status() (Status, error) Status() (Status, error)
// State returns the current container's state information. // State returns the current container's state information.
// //
// errors: // errors:
// Systemerror - System error. // SystemError - System error.
State() (*State, error) State() (*State, error)
// Returns the current config of the container. // Returns the current config of the container.
@ -97,7 +92,7 @@ type BaseContainer interface {
// //
// errors: // errors:
// ContainerDestroyed - Container no longer exists, // ContainerDestroyed - Container no longer exists,
// Systemerror - System error. // SystemError - System error.
// //
// Some of the returned PIDs may no longer refer to processes in the Container, unless // Some of the returned PIDs may no longer refer to processes in the Container, unless
// the Container state is PAUSED in which case every PID in the slice is valid. // the Container state is PAUSED in which case every PID in the slice is valid.
@ -107,7 +102,7 @@ type BaseContainer interface {
// //
// errors: // errors:
// ContainerDestroyed - Container no longer exists, // ContainerDestroyed - Container no longer exists,
// Systemerror - System error. // SystemError - System error.
Stats() (*Stats, error) Stats() (*Stats, error)
// Set resources of container as configured // Set resources of container as configured
@ -115,7 +110,7 @@ type BaseContainer interface {
// We can use this to change resources when containers are running. // We can use this to change resources when containers are running.
// //
// errors: // errors:
// Systemerror - System error. // SystemError - System error.
Set(config configs.Config) error Set(config configs.Config) error
// Start a process inside the container. Returns error if process fails to // Start a process inside the container. Returns error if process fails to
@ -125,21 +120,38 @@ type BaseContainer interface {
// ContainerDestroyed - Container no longer exists, // ContainerDestroyed - Container no longer exists,
// ConfigInvalid - config is invalid, // ConfigInvalid - config is invalid,
// ContainerPaused - Container is paused, // ContainerPaused - Container is paused,
// Systemerror - System error. // SystemError - System error.
Start(process *Process) (err error) Start(process *Process) (err error)
// Run immediately starts the process inside the container. Returns error if process
// fails to start. It does not block waiting for the exec fifo after start returns but
// opens the fifo after start returns.
//
// errors:
// ContainerDestroyed - Container no longer exists,
// ConfigInvalid - config is invalid,
// ContainerPaused - Container is paused,
// SystemError - System error.
Run(process *Process) (err error)
// Destroys the container after killing all running processes. // Destroys the container after killing all running processes.
// //
// Any event registrations are removed before the container is destroyed. // Any event registrations are removed before the container is destroyed.
// No error is returned if the container is already destroyed. // No error is returned if the container is already destroyed.
// //
// errors: // errors:
// Systemerror - System error. // SystemError - System error.
Destroy() error Destroy() error
// Signal sends the provided signal code to the container's initial process. // Signal sends the provided signal code to the container's initial process.
// //
// errors: // errors:
// Systemerror - System error. // SystemError - System error.
Signal(s os.Signal) error Signal(s os.Signal) error
// Exec signals the container to exec the user's process at the end of the init.
//
// errors:
// SystemError - System error.
Exec() error
} }

View File

@ -15,13 +15,16 @@ import (
"strings" "strings"
"sync" "sync"
"syscall" "syscall"
"time"
"github.com/Sirupsen/logrus" "github.com/Sirupsen/logrus"
"github.com/golang/protobuf/proto" "github.com/golang/protobuf/proto"
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/criurpc" "github.com/opencontainers/runc/libcontainer/criurpc"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils" "github.com/opencontainers/runc/libcontainer/utils"
"github.com/syndtr/gocapability/capability"
"github.com/vishvananda/netlink/nl" "github.com/vishvananda/netlink/nl"
) )
@ -35,10 +38,12 @@ type linuxContainer struct {
initPath string initPath string
initArgs []string initArgs []string
initProcess parentProcess initProcess parentProcess
initProcessStartTime string
criuPath string criuPath string
m sync.Mutex m sync.Mutex
criuVersion int criuVersion int
state containerState state containerState
created time.Time
} }
// State represents a running container's state // State represents a running container's state
@ -59,7 +64,7 @@ type State struct {
ExternalDescriptors []string `json:"external_descriptors,omitempty"` ExternalDescriptors []string `json:"external_descriptors,omitempty"`
} }
// A libcontainer container object. // Container is a libcontainer container object.
// //
// Each container is thread-safe within the same process. Since a container can // Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container // be destroyed by a separate process, any function may return that the container
@ -75,13 +80,13 @@ type Container interface {
// Systemerror - System error. // Systemerror - System error.
Checkpoint(criuOpts *CriuOpts) error Checkpoint(criuOpts *CriuOpts) error
// Restore restores the checkpointed container to a running state using the criu(8) utiity. // Restore restores the checkpointed container to a running state using the criu(8) utility.
// //
// errors: // errors:
// Systemerror - System error. // Systemerror - System error.
Restore(process *Process, criuOpts *CriuOpts) error Restore(process *Process, criuOpts *CriuOpts) error
// If the Container state is RUNNING or PAUSING, sets the Container state to PAUSING and pauses // If the Container state is RUNNING, sets the Container state to PAUSING and pauses
// the execution of any user processes. Asynchronously, when the container finished being paused the // the execution of any user processes. Asynchronously, when the container finished being paused the
// state is changed to PAUSED. // state is changed to PAUSED.
// If the Container state is PAUSED, do nothing. // If the Container state is PAUSED, do nothing.
@ -138,7 +143,7 @@ func (c *linuxContainer) State() (*State, error) {
func (c *linuxContainer) Processes() ([]int, error) { func (c *linuxContainer) Processes() ([]int, error) {
pids, err := c.cgroupManager.GetAllPids() pids, err := c.cgroupManager.GetAllPids()
if err != nil { if err != nil {
return nil, newSystemError(err) return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups")
} }
return pids, nil return pids, nil
} }
@ -149,14 +154,14 @@ func (c *linuxContainer) Stats() (*Stats, error) {
stats = &Stats{} stats = &Stats{}
) )
if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil { if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
return stats, newSystemError(err) return stats, newSystemErrorWithCause(err, "getting container stats from cgroups")
} }
for _, iface := range c.config.Networks { for _, iface := range c.config.Networks {
switch iface.Type { switch iface.Type {
case "veth": case "veth":
istats, err := getNetworkInterfaceStats(iface.HostInterfaceName) istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
if err != nil { if err != nil {
return stats, newSystemError(err) return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName)
} }
stats.Interfaces = append(stats.Interfaces, istats) stats.Interfaces = append(stats.Interfaces, istats)
} }
@ -167,6 +172,13 @@ func (c *linuxContainer) Stats() (*Stats, error) {
func (c *linuxContainer) Set(config configs.Config) error { func (c *linuxContainer) Set(config configs.Config) error {
c.m.Lock() c.m.Lock()
defer c.m.Unlock() defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return err
}
if status == Stopped {
return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
}
c.config = &config c.config = &config
return c.cgroupManager.Set(c.config) return c.cgroupManager.Set(c.config)
} }
@ -178,38 +190,90 @@ func (c *linuxContainer) Start(process *Process) error {
if err != nil { if err != nil {
return err return err
} }
doInit := status == Destroyed return c.start(process, status == Stopped)
parent, err := c.newParentProcess(process, doInit) }
func (c *linuxContainer) Run(process *Process) error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil { if err != nil {
return newSystemError(err) return err
}
if err := c.start(process, status == Stopped); err != nil {
return err
}
if status == Stopped {
return c.exec()
}
return nil
}
// Exec takes the container lock for the duration of the call and returns the
// result of c.exec.
func (c *linuxContainer) Exec() error {
c.m.Lock()
defer c.m.Unlock()
return c.exec()
}
// exec opens the exec fifo (execFifoFilename under c.root) read-only and
// reads it to the end. When any data was read, the fifo is removed and nil is
// returned. When the read yields no data, the container is treated as already
// running and an error is returned. A failure to open the fifo is wrapped as
// a system error; a read failure is returned as-is.
func (c *linuxContainer) exec() error {
	fifoPath := filepath.Join(c.root, execFifoFilename)
	fifo, err := os.OpenFile(fifoPath, os.O_RDONLY, 0)
	if err != nil {
		return newSystemErrorWithCause(err, "open exec fifo for reading")
	}
	defer fifo.Close()

	contents, err := ioutil.ReadAll(fifo)
	if err != nil {
		return err
	}
	if len(contents) == 0 {
		return fmt.Errorf("cannot start an already running container")
	}
	// Best-effort cleanup: a failure to remove the fifo is not reported.
	os.Remove(fifoPath)
	return nil
}
func (c *linuxContainer) start(process *Process, isInit bool) error {
parent, err := c.newParentProcess(process, isInit)
if err != nil {
return newSystemErrorWithCause(err, "creating new parent process")
} }
if err := parent.start(); err != nil { if err := parent.start(); err != nil {
// terminate the process to ensure that it properly is reaped. // terminate the process to ensure that it properly is reaped.
if err := parent.terminate(); err != nil { if err := parent.terminate(); err != nil {
logrus.Warn(err) logrus.Warn(err)
} }
return newSystemError(err) return newSystemErrorWithCause(err, "starting container process")
} }
// generate a timestamp indicating when the container was started
c.created = time.Now().UTC()
c.state = &runningState{ c.state = &runningState{
c: c, c: c,
} }
if doInit { if isInit {
if err := c.updateState(parent); err != nil { c.state = &createdState{
c: c,
}
state, err := c.updateState(parent)
if err != nil {
return err return err
} }
c.initProcessStartTime = state.InitProcessStartTime
if c.config.Hooks != nil { if c.config.Hooks != nil {
s := configs.HookState{ s := configs.HookState{
Version: c.config.Version, Version: c.config.Version,
ID: c.id, ID: c.id,
Pid: parent.pid(), Pid: parent.pid(),
Root: c.config.Rootfs, Root: c.config.Rootfs,
BundlePath: utils.SearchLabels(c.config.Labels, "bundle"),
} }
for _, hook := range c.config.Hooks.Poststart { for i, hook := range c.config.Hooks.Poststart {
if err := hook.Run(s); err != nil { if err := hook.Run(s); err != nil {
if err := parent.terminate(); err != nil { if err := parent.terminate(); err != nil {
logrus.Warn(err) logrus.Warn(err)
} }
return newSystemError(err) return newSystemErrorWithCausef(err, "running poststart hook %d", i)
} }
} }
} }
@ -219,7 +283,7 @@ func (c *linuxContainer) Start(process *Process) error {
func (c *linuxContainer) Signal(s os.Signal) error { func (c *linuxContainer) Signal(s os.Signal) error {
if err := c.initProcess.signal(s); err != nil { if err := c.initProcess.signal(s); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "signaling init process")
} }
return nil return nil
} }
@ -227,19 +291,23 @@ func (c *linuxContainer) Signal(s os.Signal) error {
func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) { func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) {
parentPipe, childPipe, err := newPipe() parentPipe, childPipe, err := newPipe()
if err != nil { if err != nil {
return nil, newSystemError(err) return nil, newSystemErrorWithCause(err, "creating new init pipe")
} }
cmd, err := c.commandTemplate(p, childPipe) rootDir, err := os.Open(c.root)
if err != nil { if err != nil {
return nil, newSystemError(err) return nil, err
}
cmd, err := c.commandTemplate(p, childPipe, rootDir)
if err != nil {
return nil, newSystemErrorWithCause(err, "creating new command template")
} }
if !doInit { if !doInit {
return c.newSetnsProcess(p, cmd, parentPipe, childPipe) return c.newSetnsProcess(p, cmd, parentPipe, childPipe, rootDir)
} }
return c.newInitProcess(p, cmd, parentPipe, childPipe) return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)
} }
func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) { func (c *linuxContainer) commandTemplate(p *Process, childPipe, rootDir *os.File) (*exec.Cmd, error) {
cmd := &exec.Cmd{ cmd := &exec.Cmd{
Path: c.initPath, Path: c.initPath,
Args: c.initArgs, Args: c.initArgs,
@ -251,8 +319,10 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
if cmd.SysProcAttr == nil { if cmd.SysProcAttr == nil {
cmd.SysProcAttr = &syscall.SysProcAttr{} cmd.SysProcAttr = &syscall.SysProcAttr{}
} }
cmd.ExtraFiles = append(p.ExtraFiles, childPipe) cmd.ExtraFiles = append(p.ExtraFiles, childPipe, rootDir)
cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) cmd.Env = append(cmd.Env,
fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-2),
fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
// NOTE: when running a container with no PID namespace and the parent process spawning the container is // NOTE: when running a container with no PID namespace and the parent process spawning the container is
// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
// even with the parent still running. // even with the parent still running.
@ -262,22 +332,19 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
return cmd, nil return cmd, nil
} }
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) { func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) {
t := "_LIBCONTAINER_INITTYPE=" + string(initStandard) cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
cloneFlags := c.config.Namespaces.CloneFlags() nsMaps := make(map[configs.NamespaceType]string)
if cloneFlags&syscall.CLONE_NEWUSER != 0 { for _, ns := range c.config.Namespaces {
if err := c.addUidGidMappings(cmd.SysProcAttr); err != nil { if ns.Path != "" {
// user mappings are not supported nsMaps[ns.Type] = ns.Path
}
}
_, sharePidns := nsMaps[configs.NEWPID]
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, "")
if err != nil {
return nil, err return nil, err
} }
enableSetgroups(cmd.SysProcAttr)
// Default to root user when user namespaces are enabled.
if cmd.SysProcAttr.Credential == nil {
cmd.SysProcAttr.Credential = &syscall.Credential{}
}
}
cmd.Env = append(cmd.Env, t)
cmd.SysProcAttr.Cloneflags = cloneFlags
return &initProcess{ return &initProcess{
cmd: cmd, cmd: cmd,
childPipe: childPipe, childPipe: childPipe,
@ -286,14 +353,21 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c
config: c.newInitConfig(p), config: c.newInitConfig(p),
container: c, container: c,
process: p, process: p,
bootstrapData: data,
sharePidns: sharePidns,
rootDir: rootDir,
}, nil }, nil
} }
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) { func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*setnsProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
state, err := c.currentState()
if err != nil {
return nil, newSystemErrorWithCause(err, "getting container's current state")
}
// for setns process, we dont have to set cloneflags as the process namespaces // for setns process, we dont have to set cloneflags as the process namespaces
// will only be set via setns syscall // will only be set via setns syscall
data, err := c.bootstrapData(0, c.initProcess.pid(), p.consolePath) data, err := c.bootstrapData(0, state.NamespacePaths, p.consolePath)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -306,20 +380,41 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe,
config: c.newInitConfig(p), config: c.newInitConfig(p),
process: p, process: p,
bootstrapData: data, bootstrapData: data,
rootDir: rootDir,
}, nil }, nil
} }
func (c *linuxContainer) newInitConfig(process *Process) *initConfig { func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
return &initConfig{ cfg := &initConfig{
Config: c.config, Config: c.config,
Args: process.Args, Args: process.Args,
Env: process.Env, Env: process.Env,
User: process.User, User: process.User,
AdditionalGroups: process.AdditionalGroups,
Cwd: process.Cwd, Cwd: process.Cwd,
Console: process.consolePath, Console: process.consolePath,
Capabilities: process.Capabilities, Capabilities: process.Capabilities,
PassedFilesCount: len(process.ExtraFiles), PassedFilesCount: len(process.ExtraFiles),
ContainerId: c.ID(),
NoNewPrivileges: c.config.NoNewPrivileges,
AppArmorProfile: c.config.AppArmorProfile,
ProcessLabel: c.config.ProcessLabel,
Rlimits: c.config.Rlimits,
ExecFifoPath: filepath.Join(c.root, execFifoFilename),
} }
if process.NoNewPrivileges != nil {
cfg.NoNewPrivileges = *process.NoNewPrivileges
}
if process.AppArmorProfile != "" {
cfg.AppArmorProfile = process.AppArmorProfile
}
if process.Label != "" {
cfg.ProcessLabel = process.Label
}
if len(process.Rlimits) > 0 {
cfg.Rlimits = process.Rlimits
}
return cfg
} }
func newPipe() (parent *os.File, child *os.File, err error) { func newPipe() (parent *os.File, child *os.File, err error) {
@ -343,15 +438,16 @@ func (c *linuxContainer) Pause() error {
if err != nil { if err != nil {
return err return err
} }
if status != Running { switch status {
return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning) case Running, Created:
}
if err := c.cgroupManager.Freeze(configs.Frozen); err != nil { if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
return err return err
} }
return c.state.transition(&pausedState{ return c.state.transition(&pausedState{
c: c, c: c,
}) })
}
return newGenericError(fmt.Errorf("container not running: %s", status), ContainerNotRunning)
} }
func (c *linuxContainer) Resume() error { func (c *linuxContainer) Resume() error {
@ -380,23 +476,13 @@ func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struc
return notifyMemoryPressure(c.cgroupManager.GetPaths(), level) return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
} }
// XXX debug support, remove when debugging done. // checkCriuVersion checks Criu version greater than or equal to minVersion
func addArgsFromEnv(evar string, args *[]string) { func (c *linuxContainer) checkCriuVersion(minVersion string) error {
if e := os.Getenv(evar); e != "" {
for _, f := range strings.Fields(e) {
*args = append(*args, f)
}
}
fmt.Printf(">>> criu %v\n", *args)
}
// check Criu version greater than or equal to min_version
func (c *linuxContainer) checkCriuVersion(min_version string) error {
var x, y, z, versionReq int var x, y, z, versionReq int
_, err := fmt.Sscanf(min_version, "%d.%d.%d\n", &x, &y, &z) // 1.5.2 _, err := fmt.Sscanf(minVersion, "%d.%d.%d\n", &x, &y, &z) // 1.5.2
if err != nil { if err != nil {
_, err = fmt.Sscanf(min_version, "Version: %d.%d\n", &x, &y) // 1.6 _, err = fmt.Sscanf(minVersion, "Version: %d.%d\n", &x, &y) // 1.6
} }
versionReq = x*10000 + y*100 + z versionReq = x*10000 + y*100 + z
@ -441,7 +527,7 @@ func (c *linuxContainer) checkCriuVersion(min_version string) error {
c.criuVersion = x*10000 + y*100 + z c.criuVersion = x*10000 + y*100 + z
if c.criuVersion < versionReq { if c.criuVersion < versionReq {
return fmt.Errorf("CRIU version must be %s or higher", min_version) return fmt.Errorf("CRIU version must be %s or higher", minVersion)
} }
return nil return nil
@ -514,6 +600,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
TcpEstablished: proto.Bool(criuOpts.TcpEstablished), TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
FileLocks: proto.Bool(criuOpts.FileLocks), FileLocks: proto.Bool(criuOpts.FileLocks),
EmptyNs: proto.Uint32(criuOpts.EmptyNs),
} }
// append optional criu opts, e.g., page-server and port // append optional criu opts, e.g., page-server and port
@ -529,7 +616,8 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
if err := c.checkCriuVersion("1.7"); err != nil { if err := c.checkCriuVersion("1.7"); err != nil {
return err return err
} }
rpcOpts.ManageCgroupsMode = proto.Uint32(uint32(criuOpts.ManageCgroupsMode)) mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
rpcOpts.ManageCgroupsMode = &mode
} }
t := criurpc.CriuReqType_DUMP t := criurpc.CriuReqType_DUMP
@ -587,6 +675,27 @@ func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mo
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
} }
// restoreNetwork appends the veth pairs CRIU should know about to
// req.Opts.Veths. Pairs come from two places, in this order:
//
//  1. every entry of the container configuration's Networks whose Type is
//     "veth" (host side: HostInterfaceName, container side: Name); entries
//     of any other type, including "loopback", contribute nothing;
//  2. every entry of criuOpts.VethPairs.
func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
	// addPair records one host/container interface pair on the request.
	addPair := func(hostSide, containerSide string) {
		pair := &criurpc.CriuVethPair{
			IfOut: proto.String(hostSide),
			IfIn:  proto.String(containerSide),
		}
		req.Opts.Veths = append(req.Opts.Veths, pair)
	}

	for _, network := range c.config.Networks {
		if network.Type == "veth" {
			addPair(network.HostInterfaceName, network.Name)
		}
	}
	for _, named := range criuOpts.VethPairs {
		addPair(named.HostInterfaceName, named.ContainerInterfaceName)
	}
}
func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
c.m.Lock() c.m.Lock()
defer c.m.Unlock() defer c.m.Unlock()
@ -650,6 +759,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
TcpEstablished: proto.Bool(criuOpts.TcpEstablished), TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
FileLocks: proto.Bool(criuOpts.FileLocks), FileLocks: proto.Bool(criuOpts.FileLocks),
EmptyNs: proto.Uint32(criuOpts.EmptyNs),
}, },
} }
@ -669,23 +779,9 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
break break
} }
} }
for _, iface := range c.config.Networks {
switch iface.Type { if criuOpts.EmptyNs&syscall.CLONE_NEWNET == 0 {
case "veth": c.restoreNetwork(req, criuOpts)
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(iface.HostInterfaceName)
veth.IfIn = proto.String(iface.Name)
req.Opts.Veths = append(req.Opts.Veths, veth)
break
case "loopback":
break
}
}
for _, i := range criuOpts.VethPairs {
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(i.HostInterfaceName)
veth.IfIn = proto.String(i.ContainerInterfaceName)
req.Opts.Veths = append(req.Opts.Veths, veth)
} }
// append optional manage cgroups mode // append optional manage cgroups mode
@ -693,7 +789,8 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
if err := c.checkCriuVersion("1.7"); err != nil { if err := c.checkCriuVersion("1.7"); err != nil {
return err return err
} }
req.Opts.ManageCgroupsMode = proto.Uint32(uint32(criuOpts.ManageCgroupsMode)) mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
req.Opts.ManageCgroupsMode = &mode
} }
var ( var (
@ -850,7 +947,7 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *
if err != nil { if err != nil {
return err return err
} }
n, err = criuClient.Write(data) _, err = criuClient.Write(data)
if err != nil { if err != nil {
return err return err
} }
@ -925,6 +1022,20 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
if err := lockNetwork(c.config); err != nil { if err := lockNetwork(c.config); err != nil {
return err return err
} }
case notify.GetScript() == "setup-namespaces":
if c.config.Hooks != nil {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Pid: int(notify.GetPid()),
Root: c.config.Rootfs,
}
for i, hook := range c.config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
return newSystemErrorWithCausef(err, "running prestart hook %d", i)
}
}
}
case notify.GetScript() == "post-restore": case notify.GetScript() == "post-restore":
pid := notify.GetPid() pid := notify.GetPid()
r, err := newRestoredProcess(int(pid), fds) r, err := newRestoredProcess(int(pid), fds)
@ -938,7 +1049,7 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
}); err != nil { }); err != nil {
return err return err
} }
if err := c.updateState(r); err != nil { if _, err := c.updateState(r); err != nil {
return err return err
} }
if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil { if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
@ -950,13 +1061,17 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
return nil return nil
} }
func (c *linuxContainer) updateState(process parentProcess) error { func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
c.initProcess = process c.initProcess = process
state, err := c.currentState() state, err := c.currentState()
if err != nil { if err != nil {
return err return nil, err
} }
return c.saveState(state) err = c.saveState(state)
if err != nil {
return nil, err
}
return state, nil
} }
func (c *linuxContainer) saveState(s *State) error { func (c *linuxContainer) saveState(s *State) error {
@ -991,37 +1106,75 @@ func (c *linuxContainer) refreshState() error {
if paused { if paused {
return c.state.transition(&pausedState{c: c}) return c.state.transition(&pausedState{c: c})
} }
running, err := c.isRunning() t, err := c.runType()
if err != nil { if err != nil {
return err return err
} }
if running { switch t {
case Created:
return c.state.transition(&createdState{c: c})
case Running:
return c.state.transition(&runningState{c: c}) return c.state.transition(&runningState{c: c})
} }
return c.state.transition(&stoppedState{c: c}) return c.state.transition(&stoppedState{c: c})
} }
func (c *linuxContainer) isRunning() (bool, error) { // doesInitProcessExist checks if the init process is still the same process
if c.initProcess == nil { // as the initial one, it could happen that the original process has exited
return false, nil // and a new process has been created with the same pid, in this case, the
// container would already be stopped.
func (c *linuxContainer) doesInitProcessExist(initPid int) (bool, error) {
startTime, err := system.GetProcessStartTime(initPid)
if err != nil {
return false, newSystemErrorWithCausef(err, "getting init process %d start time", initPid)
} }
// return Running if the init process is alive if c.initProcessStartTime != startTime {
if err := syscall.Kill(c.initProcess.pid(), 0); err != nil {
if err == syscall.ESRCH {
return false, nil return false, nil
} }
return false, newSystemError(err)
}
return true, nil return true, nil
} }
// runType derives the container status from its init process.
//
// It returns:
//   - Stopped when there is no init process, when the pid no longer exists,
//     or when the pid now belongs to a different process than the original
//     init (as reported by doesInitProcessExist);
//   - Created when the process environment still contains a "_LIBCONTAINER"
//     entry, i.e. the process is still the container's init and has not yet
//     become the user's process;
//   - Running otherwise.
//
// The process may exit at any point while it is being inspected. Both ways
// that disappearance is detected here (ESRCH from the signal probe, and a
// missing /proc/<pid>/environ file) are reported as Stopped with a nil error.
// Any other failure is returned as a system error together with Stopped.
func (c *linuxContainer) runType() (Status, error) {
	if c.initProcess == nil {
		return Stopped, nil
	}
	pid := c.initProcess.pid()

	// Probe the pid with signal 0: nothing is delivered, only the error
	// result is of interest.
	if err := syscall.Kill(pid, 0); err != nil {
		if err == syscall.ESRCH {
			// The process does not exist anymore; it may have exited just as
			// we were called, which is not an error.
			return Stopped, nil
		}
		return Stopped, newSystemErrorWithCausef(err, "sending signal 0 to pid %d", pid)
	}

	// Check that the process is still the original init process.
	exist, err := c.doesInitProcessExist(pid)
	if !exist || err != nil {
		return Stopped, err
	}

	// Tell the container's init apart from the user's process; this is the
	// difference between Created and Running.
	environ, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/environ", pid))
	if err != nil {
		if os.IsNotExist(err) {
			// Same race as the ESRCH case above: the process exited between
			// the checks and its /proc entry is gone.
			return Stopped, nil
		}
		return Stopped, newSystemErrorWithCausef(err, "reading /proc/%d/environ", pid)
	}
	if bytes.Contains(environ, []byte("_LIBCONTAINER")) {
		return Created, nil
	}
	return Running, nil
}
func (c *linuxContainer) isPaused() (bool, error) { func (c *linuxContainer) isPaused() (bool, error) {
data, err := ioutil.ReadFile(filepath.Join(c.cgroupManager.GetPaths()["freezer"], "freezer.state")) data, err := ioutil.ReadFile(filepath.Join(c.cgroupManager.GetPaths()["freezer"], "freezer.state"))
if err != nil { if err != nil {
// If freezer cgroup is not mounted, the container would just be not paused.
if os.IsNotExist(err) { if os.IsNotExist(err) {
return false, nil return false, nil
} }
return false, newSystemError(err) return false, newSystemErrorWithCause(err, "checking if container is paused")
} }
return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil
} }
@ -1043,6 +1196,7 @@ func (c *linuxContainer) currentState() (*State, error) {
Config: *c.config, Config: *c.config,
InitProcessPid: pid, InitProcessPid: pid,
InitProcessStartTime: startTime, InitProcessStartTime: startTime,
Created: c.created,
}, },
CgroupPaths: c.cgroupManager.GetPaths(), CgroupPaths: c.cgroupManager.GetPaths(),
NamespacePaths: make(map[configs.NamespaceType]string), NamespacePaths: make(map[configs.NamespaceType]string),
@ -1053,6 +1207,9 @@ func (c *linuxContainer) currentState() (*State, error) {
state.NamespacePaths[ns.Type] = ns.GetPath(pid) state.NamespacePaths[ns.Type] = ns.GetPath(pid)
} }
for _, nsType := range configs.NamespaceTypes() { for _, nsType := range configs.NamespaceTypes() {
if !configs.IsNamespaceSupported(nsType) {
continue
}
if _, ok := state.NamespacePaths[nsType]; !ok { if _, ok := state.NamespacePaths[nsType]; !ok {
ns := configs.Namespace{Type: nsType} ns := configs.Namespace{Type: nsType}
state.NamespacePaths[ns.Type] = ns.GetPath(pid) state.NamespacePaths[ns.Type] = ns.GetPath(pid)
@ -1062,18 +1219,69 @@ func (c *linuxContainer) currentState() (*State, error) {
return state, nil return state, nil
} }
// bootstrapData encodes the necessary data in netlink binary format as a io.Reader. // orderNamespacePaths sorts namespace paths into a list of paths that we
// Consumer can write the data to a bootstrap program such as one that uses // can setns in order.
// nsenter package to bootstrap the container's init process correctly, i.e. with func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
// correct namespaces, uid/gid mapping etc. paths := []string{}
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, pid int, consolePath string) (io.Reader, error) { nsTypes := []configs.NamespaceType{
configs.NEWIPC,
configs.NEWUTS,
configs.NEWNET,
configs.NEWPID,
configs.NEWNS,
}
// join userns if the init process explicitly requires NEWUSER
if c.config.Namespaces.Contains(configs.NEWUSER) {
nsTypes = append(nsTypes, configs.NEWUSER)
}
for _, nsType := range nsTypes {
if p, ok := namespaces[nsType]; ok && p != "" {
// check if the requested namespace is supported
if !configs.IsNamespaceSupported(nsType) {
return nil, newSystemError(fmt.Errorf("namespace %s is not supported", nsType))
}
// only set to join this namespace if it exists
if _, err := os.Lstat(p); err != nil {
return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p)
}
// do not allow namespace path with comma as we use it to separate
// the namespace paths
if strings.ContainsRune(p, ',') {
return nil, newSystemError(fmt.Errorf("invalid path %s", p))
}
paths = append(paths, p)
}
}
return paths, nil
}
// encodeIDMapping serialises idMap as text, one mapping per line, each line
// being the decimal ContainerID, HostID and Size separated by single spaces
// and terminated by a newline. The first write error, if any, is returned
// with a nil slice.
func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
	var buf bytes.Buffer
	for i := range idMap {
		entry := idMap[i]
		if _, err := fmt.Fprintf(&buf, "%d %d %d\n", entry.ContainerID, entry.HostID, entry.Size); err != nil {
			return nil, err
		}
	}
	return buf.Bytes(), nil
}
// bootstrapData encodes the necessary data in netlink binary format
// as a io.Reader.
// Consumer can write the data to a bootstrap program
// such as one that uses nsenter package to bootstrap the container's
// init process correctly, i.e. with correct namespaces, uid/gid
// mapping etc.
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, consolePath string) (io.Reader, error) {
// create the netlink message // create the netlink message
r := nl.NewNetlinkRequest(int(InitMsg), 0) r := nl.NewNetlinkRequest(int(InitMsg), 0)
// write pid
// write cloneFlags
r.AddData(&Int32msg{ r.AddData(&Int32msg{
Type: PidAttr, Type: CloneFlagsAttr,
Value: uint32(pid), Value: uint32(cloneFlags),
}) })
// write console path // write console path
if consolePath != "" { if consolePath != "" {
r.AddData(&Bytemsg{ r.AddData(&Bytemsg{
@ -1081,5 +1289,57 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, pid int, consolePath
Value: []byte(consolePath), Value: []byte(consolePath),
}) })
} }
// write custom namespace paths
if len(nsMaps) > 0 {
nsPaths, err := c.orderNamespacePaths(nsMaps)
if err != nil {
return nil, err
}
r.AddData(&Bytemsg{
Type: NsPathsAttr,
Value: []byte(strings.Join(nsPaths, ",")),
})
}
// write namespace paths only when we are not joining an existing user ns
_, joinExistingUser := nsMaps[configs.NEWUSER]
if !joinExistingUser {
// write uid mappings
if len(c.config.UidMappings) > 0 {
b, err := encodeIDMapping(c.config.UidMappings)
if err != nil {
return nil, err
}
r.AddData(&Bytemsg{
Type: UidmapAttr,
Value: b,
})
}
// write gid mappings
if len(c.config.GidMappings) > 0 {
b, err := encodeIDMapping(c.config.GidMappings)
if err != nil {
return nil, err
}
r.AddData(&Bytemsg{
Type: GidmapAttr,
Value: b,
})
// check if we have CAP_SETGID to setgroup properly
pid, err := capability.NewPid(os.Getpid())
if err != nil {
return nil, err
}
if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) {
r.AddData(&Boolmsg{
Type: SetgroupAttr,
Value: true,
})
}
}
}
return bytes.NewReader(r.Serialize()), nil return bytes.NewReader(r.Serialize()), nil
} }

View File

@ -1,13 +0,0 @@
// +build !go1.4
package libcontainer
import (
"fmt"
"syscall"
)
// addUidGidMappings is the variant compiled under the "!go1.4" build
// constraint at the top of this file. It leaves sys untouched and always
// returns an error saying that user namespaces are not supported.
func (c *linuxContainer) addUidGidMappings(sys *syscall.SysProcAttr) error {
return fmt.Errorf("User namespace is not supported in golang < 1.4")
}

View File

@ -0,0 +1,20 @@
package libcontainer
// State represents a running container's state.
// It embeds BaseState and, in this file, declares no additional fields.
type State struct {
BaseState
// Platform specific fields below here
}
// Container is a libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
// was not found.
//
// It embeds BaseContainer and, in this file, declares no additional methods.
type Container interface {
BaseContainer
// Methods below here are platform specific
}

View File

@ -1,26 +0,0 @@
// +build go1.4
package libcontainer
import "syscall"
// addUidGidMappings translates the container configuration's UidMappings and
// GidMappings into syscall.SysProcIDMap slices and stores them on sys.
// A mapping list that is nil in the configuration leaves the corresponding
// field of sys unchanged. It always returns nil.
func (c *linuxContainer) addUidGidMappings(sys *syscall.SysProcAttr) error {
	if uidMaps := c.config.UidMappings; uidMaps != nil {
		converted := make([]syscall.SysProcIDMap, 0, len(uidMaps))
		for _, m := range uidMaps {
			converted = append(converted, syscall.SysProcIDMap{
				ContainerID: m.ContainerID,
				HostID:      m.HostID,
				Size:        m.Size,
			})
		}
		sys.UidMappings = converted
	}
	if gidMaps := c.config.GidMappings; gidMaps != nil {
		converted := make([]syscall.SysProcIDMap, 0, len(gidMaps))
		for _, m := range gidMaps {
			converted = append(converted, syscall.SysProcIDMap{
				ContainerID: m.ContainerID,
				HostID:      m.HostID,
				Size:        m.Size,
			})
		}
		sys.GidMappings = converted
	}
	return nil
}

View File

@ -3,10 +3,10 @@
package libcontainer package libcontainer
// cgroup restoring strategy provided by criu // cgroup restoring strategy provided by criu
type cg_mode uint32 type cgMode uint32
const ( const (
CRIU_CG_MODE_SOFT cg_mode = 3 + iota // restore cgroup properties if only dir created by criu CRIU_CG_MODE_SOFT cgMode = 3 + iota // restore cgroup properties if only dir created by criu
CRIU_CG_MODE_FULL // always restore all cgroups and their properties CRIU_CG_MODE_FULL // always restore all cgroups and their properties
CRIU_CG_MODE_STRICT // restore all, requiring them to not present in the system CRIU_CG_MODE_STRICT // restore all, requiring them to not present in the system
CRIU_CG_MODE_DEFAULT // the same as CRIU_CG_MODE_SOFT CRIU_CG_MODE_DEFAULT // the same as CRIU_CG_MODE_SOFT
@ -32,5 +32,6 @@ type CriuOpts struct {
FileLocks bool // handle file locks, for safety FileLocks bool // handle file locks, for safety
PageServer CriuPageServerInfo // allow to dump to criu page server PageServer CriuPageServerInfo // allow to dump to criu page server
VethPairs []VethPairName // pass the veth to criu when restore VethPairs []VethPairName // pass the veth to criu when restore
ManageCgroupsMode cg_mode // dump or restore cgroup mode ManageCgroupsMode cgMode // dump or restore cgroup mode
EmptyNs uint32 // don't c/r properties for namespace from this mask
} }

View File

@ -19,6 +19,7 @@ It has these top-level messages:
CriuDumpResp CriuDumpResp
CriuRestoreResp CriuRestoreResp
CriuNotify CriuNotify
CriuFeatures
CriuReq CriuReq
CriuResp CriuResp
*/ */
@ -31,6 +32,54 @@ import math "math"
var _ = proto.Marshal var _ = proto.Marshal
var _ = math.Inf var _ = math.Inf
// CriuCgMode is the Go form of the criu_cg_mode protobuf enum; it is the type
// of the CriuOpts.ManageCgroupsMode field.
type CriuCgMode int32
// Numeric values of CriuCgMode, mirroring the criu_cg_mode enum entries.
const (
CriuCgMode_IGNORE CriuCgMode = 0
CriuCgMode_NONE CriuCgMode = 1
CriuCgMode_PROPS CriuCgMode = 2
CriuCgMode_SOFT CriuCgMode = 3
CriuCgMode_FULL CriuCgMode = 4
CriuCgMode_STRICT CriuCgMode = 5
CriuCgMode_DEFAULT CriuCgMode = 6
)
// CriuCgMode_name maps each CriuCgMode numeric value to its enum name.
// It is the table CriuCgMode.String passes to proto.EnumName.
var CriuCgMode_name = map[int32]string{
0: "IGNORE",
1: "NONE",
2: "PROPS",
3: "SOFT",
4: "FULL",
5: "STRICT",
6: "DEFAULT",
}
// CriuCgMode_value maps each enum name to its CriuCgMode numeric value.
// It is the table CriuCgMode.UnmarshalJSON passes to proto.UnmarshalJSONEnum.
var CriuCgMode_value = map[string]int32{
"IGNORE": 0,
"NONE": 1,
"PROPS": 2,
"SOFT": 3,
"FULL": 4,
"STRICT": 5,
"DEFAULT": 6,
}
// Enum returns a pointer to a new CriuCgMode holding the value of x, which is
// convenient for populating optional (pointer-typed) message fields.
func (x CriuCgMode) Enum() *CriuCgMode {
	v := x
	return &v
}
// String returns the name proto.EnumName produces for x when looked up in
// CriuCgMode_name.
func (x CriuCgMode) String() string {
	name := proto.EnumName(CriuCgMode_name, int32(x))
	return name
}
// UnmarshalJSON decodes data through proto.UnmarshalJSONEnum using the
// CriuCgMode_value table and stores the result in *x. When decoding fails the
// error is returned and *x is left unchanged.
func (x *CriuCgMode) UnmarshalJSON(data []byte) error {
	parsed, err := proto.UnmarshalJSONEnum(CriuCgMode_value, data, "CriuCgMode")
	if err == nil {
		*x = CriuCgMode(parsed)
	}
	return err
}
type CriuReqType int32 type CriuReqType int32
const ( const (
@ -43,6 +92,7 @@ const (
CriuReqType_NOTIFY CriuReqType = 6 CriuReqType_NOTIFY CriuReqType = 6
CriuReqType_CPUINFO_DUMP CriuReqType = 7 CriuReqType_CPUINFO_DUMP CriuReqType = 7
CriuReqType_CPUINFO_CHECK CriuReqType = 8 CriuReqType_CPUINFO_CHECK CriuReqType = 8
CriuReqType_FEATURE_CHECK CriuReqType = 9
) )
var CriuReqType_name = map[int32]string{ var CriuReqType_name = map[int32]string{
@ -55,6 +105,7 @@ var CriuReqType_name = map[int32]string{
6: "NOTIFY", 6: "NOTIFY",
7: "CPUINFO_DUMP", 7: "CPUINFO_DUMP",
8: "CPUINFO_CHECK", 8: "CPUINFO_CHECK",
9: "FEATURE_CHECK",
} }
var CriuReqType_value = map[string]int32{ var CriuReqType_value = map[string]int32{
"EMPTY": 0, "EMPTY": 0,
@ -66,6 +117,7 @@ var CriuReqType_value = map[string]int32{
"NOTIFY": 6, "NOTIFY": 6,
"CPUINFO_DUMP": 7, "CPUINFO_DUMP": 7,
"CPUINFO_CHECK": 8, "CPUINFO_CHECK": 8,
"FEATURE_CHECK": 9,
} }
func (x CriuReqType) Enum() *CriuReqType { func (x CriuReqType) Enum() *CriuReqType {
@ -271,7 +323,12 @@ type CriuOpts struct {
SkipMnt []string `protobuf:"bytes,31,rep,name=skip_mnt" json:"skip_mnt,omitempty"` SkipMnt []string `protobuf:"bytes,31,rep,name=skip_mnt" json:"skip_mnt,omitempty"`
EnableFs []string `protobuf:"bytes,32,rep,name=enable_fs" json:"enable_fs,omitempty"` EnableFs []string `protobuf:"bytes,32,rep,name=enable_fs" json:"enable_fs,omitempty"`
UnixSkIno []*UnixSk `protobuf:"bytes,33,rep,name=unix_sk_ino" json:"unix_sk_ino,omitempty"` UnixSkIno []*UnixSk `protobuf:"bytes,33,rep,name=unix_sk_ino" json:"unix_sk_ino,omitempty"`
ManageCgroupsMode *uint32 `protobuf:"varint,34,opt,name=manage_cgroups_mode" json:"manage_cgroups_mode,omitempty"` ManageCgroupsMode *CriuCgMode `protobuf:"varint,34,opt,name=manage_cgroups_mode,enum=CriuCgMode" json:"manage_cgroups_mode,omitempty"`
GhostLimit *uint32 `protobuf:"varint,35,opt,name=ghost_limit,def=1048576" json:"ghost_limit,omitempty"`
IrmapScanPaths []string `protobuf:"bytes,36,rep,name=irmap_scan_paths" json:"irmap_scan_paths,omitempty"`
External []string `protobuf:"bytes,37,rep,name=external" json:"external,omitempty"`
EmptyNs *uint32 `protobuf:"varint,38,opt,name=empty_ns" json:"empty_ns,omitempty"`
NoSeccomp *bool `protobuf:"varint,39,opt,name=no_seccomp" json:"no_seccomp,omitempty"`
XXX_unrecognized []byte `json:"-"` XXX_unrecognized []byte `json:"-"`
} }
@ -281,6 +338,7 @@ func (*CriuOpts) ProtoMessage() {}
const Default_CriuOpts_LogLevel int32 = 2 const Default_CriuOpts_LogLevel int32 = 2
const Default_CriuOpts_CpuCap uint32 = 4294967295 const Default_CriuOpts_CpuCap uint32 = 4294967295
const Default_CriuOpts_GhostLimit uint32 = 1048576
func (m *CriuOpts) GetImagesDirFd() int32 { func (m *CriuOpts) GetImagesDirFd() int32 {
if m != nil && m.ImagesDirFd != nil { if m != nil && m.ImagesDirFd != nil {
@ -513,13 +571,48 @@ func (m *CriuOpts) GetUnixSkIno() []*UnixSk {
return nil return nil
} }
func (m *CriuOpts) GetManageCgroupsMode() uint32 { func (m *CriuOpts) GetManageCgroupsMode() CriuCgMode {
if m != nil && m.ManageCgroupsMode != nil { if m != nil && m.ManageCgroupsMode != nil {
return *m.ManageCgroupsMode return *m.ManageCgroupsMode
} }
return CriuCgMode_IGNORE
}
// GetGhostLimit returns the GhostLimit field, or Default_CriuOpts_GhostLimit
// when m is nil or the field is unset.
func (m *CriuOpts) GetGhostLimit() uint32 {
	if m == nil || m.GhostLimit == nil {
		return Default_CriuOpts_GhostLimit
	}
	return *m.GhostLimit
}
// GetIrmapScanPaths returns the IrmapScanPaths field, or nil when m is nil.
func (m *CriuOpts) GetIrmapScanPaths() []string {
	if m == nil {
		return nil
	}
	return m.IrmapScanPaths
}
// GetExternal returns the External field, or nil when m is nil.
func (m *CriuOpts) GetExternal() []string {
	if m == nil {
		return nil
	}
	return m.External
}
func (m *CriuOpts) GetEmptyNs() uint32 {
if m != nil && m.EmptyNs != nil {
return *m.EmptyNs
}
return 0 return 0
} }
// GetNoSeccomp returns the NoSeccomp field, or false when m is nil or the
// field is unset.
func (m *CriuOpts) GetNoSeccomp() bool {
	if m == nil || m.NoSeccomp == nil {
		return false
	}
	return *m.NoSeccomp
}
type CriuDumpResp struct { type CriuDumpResp struct {
Restored *bool `protobuf:"varint,1,opt,name=restored" json:"restored,omitempty"` Restored *bool `protobuf:"varint,1,opt,name=restored" json:"restored,omitempty"`
XXX_unrecognized []byte `json:"-"` XXX_unrecognized []byte `json:"-"`
@ -576,6 +669,25 @@ func (m *CriuNotify) GetPid() int32 {
return 0 return 0
} }
//
// List of features which can be queried via
// CRIU_REQ_TYPE__FEATURE_CHECK
type CriuFeatures struct {
// MemTrack is the optional mem_track flag (protobuf field 1).
MemTrack *bool `protobuf:"varint,1,opt,name=mem_track" json:"mem_track,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
// Reset sets *m back to the zero CriuFeatures value.
func (m *CriuFeatures) Reset() { *m = CriuFeatures{} }
// String returns the result of proto.CompactTextString for m.
func (m *CriuFeatures) String() string { return proto.CompactTextString(m) }
// ProtoMessage has an empty body; presumably it only marks the type as a
// protobuf message.
func (*CriuFeatures) ProtoMessage() {}
// GetMemTrack returns the MemTrack field, or false when m is nil or the field
// is unset.
func (m *CriuFeatures) GetMemTrack() bool {
	if m == nil || m.MemTrack == nil {
		return false
	}
	return *m.MemTrack
}
type CriuReq struct { type CriuReq struct {
Type *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"` Type *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"`
Opts *CriuOpts `protobuf:"bytes,2,opt,name=opts" json:"opts,omitempty"` Opts *CriuOpts `protobuf:"bytes,2,opt,name=opts" json:"opts,omitempty"`
@ -585,6 +697,11 @@ type CriuReq struct {
// will wait for more req-s to appear. Works not // will wait for more req-s to appear. Works not
// for all request types. // for all request types.
KeepOpen *bool `protobuf:"varint,4,opt,name=keep_open" json:"keep_open,omitempty"` KeepOpen *bool `protobuf:"varint,4,opt,name=keep_open" json:"keep_open,omitempty"`
//
// 'features' can be used to query which features
// are supported by the installed criu/kernel
// via RPC.
Features *CriuFeatures `protobuf:"bytes,5,opt,name=features" json:"features,omitempty"`
XXX_unrecognized []byte `json:"-"` XXX_unrecognized []byte `json:"-"`
} }
@ -620,6 +737,13 @@ func (m *CriuReq) GetKeepOpen() bool {
return false return false
} }
// GetFeatures returns the request's Features message, or nil when m is nil.
func (m *CriuReq) GetFeatures() *CriuFeatures {
	if m == nil {
		return nil
	}
	return m.Features
}
type CriuResp struct { type CriuResp struct {
Type *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"` Type *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"`
Success *bool `protobuf:"varint,2,req,name=success" json:"success,omitempty"` Success *bool `protobuf:"varint,2,req,name=success" json:"success,omitempty"`
@ -628,6 +752,7 @@ type CriuResp struct {
Notify *CriuNotify `protobuf:"bytes,5,opt,name=notify" json:"notify,omitempty"` Notify *CriuNotify `protobuf:"bytes,5,opt,name=notify" json:"notify,omitempty"`
Ps *CriuPageServerInfo `protobuf:"bytes,6,opt,name=ps" json:"ps,omitempty"` Ps *CriuPageServerInfo `protobuf:"bytes,6,opt,name=ps" json:"ps,omitempty"`
CrErrno *int32 `protobuf:"varint,7,opt,name=cr_errno" json:"cr_errno,omitempty"` CrErrno *int32 `protobuf:"varint,7,opt,name=cr_errno" json:"cr_errno,omitempty"`
Features *CriuFeatures `protobuf:"bytes,8,opt,name=features" json:"features,omitempty"`
XXX_unrecognized []byte `json:"-"` XXX_unrecognized []byte `json:"-"`
} }
@ -684,6 +809,14 @@ func (m *CriuResp) GetCrErrno() int32 {
return 0 return 0
} }
// GetFeatures returns the response's Features message, or nil when m is nil.
func (m *CriuResp) GetFeatures() *CriuFeatures {
	if m == nil {
		return nil
	}
	return m.Features
}
func init() { func init() {
proto.RegisterEnum("CriuCgMode", CriuCgMode_name, CriuCgMode_value)
proto.RegisterEnum("CriuReqType", CriuReqType_name, CriuReqType_value) proto.RegisterEnum("CriuReqType", CriuReqType_name, CriuReqType_value)
} }

View File

@ -29,6 +29,16 @@ message unix_sk {
required uint32 inode = 1; required uint32 inode = 1;
}; };
/* Values accepted by criu_opts.manage_cgroups_mode. */
enum criu_cg_mode {
IGNORE = 0;
NONE = 1;
PROPS = 2;
SOFT = 3;
FULL = 4;
STRICT = 5;
DEFAULT = 6;
};
message criu_opts { message criu_opts {
required int32 images_dir_fd = 1; required int32 images_dir_fd = 1;
optional int32 pid = 2; /* if not set on dump, will dump requesting process */ optional int32 pid = 2; /* if not set on dump, will dump requesting process */
@ -75,7 +85,12 @@ message criu_opts {
repeated unix_sk unix_sk_ino = 33; repeated unix_sk unix_sk_ino = 33;
optional uint32 manage_cgroups_mode = 34; optional criu_cg_mode manage_cgroups_mode = 34;
optional uint32 ghost_limit = 35 [default = 0x100000];
repeated string irmap_scan_paths = 36;
repeated string external = 37;
optional uint32 empty_ns = 38;
optional bool no_seccomp = 39;
} }
message criu_dump_resp { message criu_dump_resp {
@ -103,6 +118,16 @@ enum criu_req_type {
CPUINFO_DUMP = 7; CPUINFO_DUMP = 7;
CPUINFO_CHECK = 8; CPUINFO_CHECK = 8;
FEATURE_CHECK = 9;
}
/*
* List of features which can be queried via
* CRIU_REQ_TYPE__FEATURE_CHECK
*/
message criu_features {
optional bool mem_track = 1;
} }
/* /*
@ -122,11 +147,17 @@ message criu_req {
* for all request types. * for all request types.
*/ */
optional bool keep_open = 4; optional bool keep_open = 4;
/*
* 'features' can be used to query which features
* are supported by the installed criu/kernel
* via RPC.
*/
optional criu_features features = 5;
} }
/* /*
* Responce -- it states whether the request was served * Response -- it states whether the request was served
* and additional request-specific informarion * and additional request-specific information
*/ */
message criu_resp { message criu_resp {
@ -139,4 +170,5 @@ message criu_resp {
optional criu_page_server_info ps = 6; optional criu_page_server_info ps = 6;
optional int32 cr_errno = 7; optional int32 cr_errno = 7;
optional criu_features features = 8;
} }

View File

@ -2,7 +2,7 @@ package libcontainer
import "io" import "io"
// API error code type. // ErrorCode is the API error code type.
type ErrorCode int type ErrorCode int
// API error codes. // API error codes.
@ -19,7 +19,7 @@ const (
ContainerNotPaused ContainerNotPaused
// Process errors // Process errors
ProcessNotExecuted NoProcessOps
// Common errors // Common errors
ConfigInvalid ConfigInvalid
@ -49,12 +49,14 @@ func (c ErrorCode) String() string {
return "Console exists for process" return "Console exists for process"
case ContainerNotPaused: case ContainerNotPaused:
return "Container is not paused" return "Container is not paused"
case NoProcessOps:
return "No process operations"
default: default:
return "Unknown error" return "Unknown error"
} }
} }
// API Error type. // Error is the API error type.
type Error interface { type Error interface {
error error

View File

@ -9,6 +9,7 @@ import (
"os/exec" "os/exec"
"path/filepath" "path/filepath"
"regexp" "regexp"
"runtime/debug"
"strconv" "strconv"
"syscall" "syscall"
@ -23,10 +24,11 @@ import (
const ( const (
stateFilename = "state.json" stateFilename = "state.json"
execFifoFilename = "exec.fifo"
) )
var ( var (
idRegex = regexp.MustCompile(`^[\w_-]+$`) idRegex = regexp.MustCompile(`^[\w+-\.]+$`)
maxIdLen = 1024 maxIdLen = 1024
) )
@ -101,6 +103,15 @@ func TmpfsRoot(l *LinuxFactory) error {
return nil return nil
} }
// CriuPath builds a LinuxFactory option that stores criupath in the factory's
// CriuPath field. The returned option always reports a nil error.
func CriuPath(criupath string) func(*LinuxFactory) error {
	setCriuPath := func(factory *LinuxFactory) error {
		factory.CriuPath = criupath
		return nil
	}
	return setCriuPath
}
// New returns a linux based container factory based in the root directory and // New returns a linux based container factory based in the root directory and
// configures the factory with the provided option funcs. // configures the factory with the provided option funcs.
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) { func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
@ -157,13 +168,34 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
if err := l.Validator.Validate(config); err != nil { if err := l.Validator.Validate(config); err != nil {
return nil, newGenericError(err, ConfigInvalid) return nil, newGenericError(err, ConfigInvalid)
} }
uid, err := config.HostUID()
if err != nil {
return nil, newGenericError(err, SystemError)
}
gid, err := config.HostGID()
if err != nil {
return nil, newGenericError(err, SystemError)
}
containerRoot := filepath.Join(l.Root, id) containerRoot := filepath.Join(l.Root, id)
if _, err := os.Stat(containerRoot); err == nil { if _, err := os.Stat(containerRoot); err == nil {
return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse) return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
} else if !os.IsNotExist(err) { } else if !os.IsNotExist(err) {
return nil, newGenericError(err, SystemError) return nil, newGenericError(err, SystemError)
} }
if err := os.MkdirAll(containerRoot, 0700); err != nil { if err := os.MkdirAll(containerRoot, 0711); err != nil {
return nil, newGenericError(err, SystemError)
}
if err := os.Chown(containerRoot, uid, gid); err != nil {
return nil, newGenericError(err, SystemError)
}
fifoName := filepath.Join(containerRoot, execFifoFilename)
oldMask := syscall.Umask(0000)
if err := syscall.Mkfifo(fifoName, 0622); err != nil {
syscall.Umask(oldMask)
return nil, newGenericError(err, SystemError)
}
syscall.Umask(oldMask)
if err := os.Chown(fifoName, uid, gid); err != nil {
return nil, newGenericError(err, SystemError) return nil, newGenericError(err, SystemError)
} }
c := &linuxContainer{ c := &linuxContainer{
@ -195,6 +227,7 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
} }
c := &linuxContainer{ c := &linuxContainer{
initProcess: r, initProcess: r,
initProcessStartTime: state.InitProcessStartTime,
id: id, id: id,
config: &state.Config, config: &state.Config,
initPath: l.InitPath, initPath: l.InitPath,
@ -202,8 +235,9 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
criuPath: l.CriuPath, criuPath: l.CriuPath,
cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths), cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths),
root: containerRoot, root: containerRoot,
created: state.Created,
} }
c.state = &createdState{c: c, s: Created} c.state = &loadedState{c: c}
if err := c.refreshState(); err != nil { if err := c.refreshState(); err != nil {
return nil, err return nil, err
} }
@ -217,10 +251,18 @@ func (l *LinuxFactory) Type() string {
// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state // StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
// This is a low level implementation detail of the reexec and should not be consumed externally // This is a low level implementation detail of the reexec and should not be consumed externally
func (l *LinuxFactory) StartInitialization() (err error) { func (l *LinuxFactory) StartInitialization() (err error) {
fdStr := os.Getenv("_LIBCONTAINER_INITPIPE") var pipefd, rootfd int
pipefd, err := strconv.Atoi(fdStr) for k, v := range map[string]*int{
"_LIBCONTAINER_INITPIPE": &pipefd,
"_LIBCONTAINER_STATEDIR": &rootfd,
} {
s := os.Getenv(k)
i, err := strconv.Atoi(s)
if err != nil { if err != nil {
return fmt.Errorf("error converting env var _LIBCONTAINER_INITPIPE(%q) to an int: %s", fdStr, err) return fmt.Errorf("unable to convert %s=%s to int", k, s)
}
*v = i
} }
var ( var (
pipe = os.NewFile(uintptr(pipefd), "pipe") pipe = os.NewFile(uintptr(pipefd), "pipe")
@ -229,29 +271,31 @@ func (l *LinuxFactory) StartInitialization() (err error) {
// clear the current process's environment to clean any libcontainer // clear the current process's environment to clean any libcontainer
// specific env vars. // specific env vars.
os.Clearenv() os.Clearenv()
var i initer var i initer
defer func() { defer func() {
// if we have an error during the initialization of the container's init then send it back to the // We have an error during the initialization of the container's init,
// parent process in the form of an initError. // send it back to the parent process in the form of an initError.
if err != nil { // If container's init successed, syscall.Exec will not return, hence
// this defer function will never be called.
if _, ok := i.(*linuxStandardInit); ok { if _, ok := i.(*linuxStandardInit); ok {
// Synchronisation only necessary for standard init. // Synchronisation only necessary for standard init.
if err := utils.WriteJSON(pipe, syncT{procError}); err != nil { if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil {
panic(err) panic(err)
} }
} }
if err := utils.WriteJSON(pipe, newSystemError(err)); err != nil { if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil {
panic(err) panic(err)
} }
} else {
if err := utils.WriteJSON(pipe, syncT{procStart}); err != nil {
panic(err)
}
}
// ensure that this pipe is always closed // ensure that this pipe is always closed
pipe.Close() pipe.Close()
}() }()
i, err = newContainerInit(it, pipe) defer func() {
if e := recover(); e != nil {
err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack()))
}
}()
i, err = newContainerInit(it, pipe, rootfd)
if err != nil { if err != nil {
return err return err
} }

View File

@ -14,8 +14,9 @@ type syncType uint8
const ( const (
procReady syncType = iota procReady syncType = iota
procError procError
procStart
procRun procRun
procHooks
procResume
) )
type syncT struct { type syncT struct {
@ -51,6 +52,21 @@ func newGenericError(err error, c ErrorCode) Error {
} }
func newSystemError(err error) Error { func newSystemError(err error) Error {
return createSystemError(err, "")
}
// newSystemErrorWithCausef builds a system error for err whose cause text is
// produced by formatting cause with the arguments v through fmt.Sprintf.
// It calls createSystemError directly; that function captures the stack with a
// fixed number of skipped frames, so no extra call layer should be added here.
func newSystemErrorWithCausef(err error, cause string, v ...interface{}) Error {
return createSystemError(err, fmt.Sprintf(cause, v...))
}
// newSystemErrorWithCause builds a system error for err annotated with the
// literal cause string (no formatting is applied to cause).
// It calls createSystemError directly; that function captures the stack with a
// fixed number of skipped frames, so no extra call layer should be added here.
func newSystemErrorWithCause(err error, cause string) Error {
return createSystemError(err, cause)
}
// createSystemError creates the specified error with the correct number of
// stack frames skipped. This is only to be called by the other functions for
// formatting the error.
func createSystemError(err error, cause string) Error {
if le, ok := err.(Error); ok { if le, ok := err.(Error); ok {
return le return le
} }
@ -58,7 +74,8 @@ func newSystemError(err error) Error {
Timestamp: time.Now(), Timestamp: time.Now(),
Err: err, Err: err,
ECode: SystemError, ECode: SystemError,
Stack: stacktrace.Capture(1), Cause: cause,
Stack: stacktrace.Capture(2),
} }
if err != nil { if err != nil {
gerr.Message = err.Error() gerr.Message = err.Error()
@ -70,12 +87,17 @@ type genericError struct {
Timestamp time.Time Timestamp time.Time
ECode ErrorCode ECode ErrorCode
Err error `json:"-"` Err error `json:"-"`
Cause string
Message string Message string
Stack stacktrace.Stacktrace Stack stacktrace.Stacktrace
} }
func (e *genericError) Error() string { func (e *genericError) Error() string {
return fmt.Sprintf("[%d] %s: %s", e.ECode, e.ECode, e.Message) if e.Cause == "" {
return e.Message
}
frame := e.Stack.Frames[0]
return fmt.Sprintf("%s:%d: %s caused %q", frame.File, frame.Line, e.Cause, e.Message)
} }
func (e *genericError) Code() ErrorCode { func (e *genericError) Code() ErrorCode {

View File

@ -48,18 +48,25 @@ type initConfig struct {
Env []string `json:"env"` Env []string `json:"env"`
Cwd string `json:"cwd"` Cwd string `json:"cwd"`
Capabilities []string `json:"capabilities"` Capabilities []string `json:"capabilities"`
ProcessLabel string `json:"process_label"`
AppArmorProfile string `json:"apparmor_profile"`
NoNewPrivileges bool `json:"no_new_privileges"`
User string `json:"user"` User string `json:"user"`
AdditionalGroups []string `json:"additional_groups"`
Config *configs.Config `json:"config"` Config *configs.Config `json:"config"`
Console string `json:"console"` Console string `json:"console"`
Networks []*network `json:"network"` Networks []*network `json:"network"`
PassedFilesCount int `json:"passed_files_count"` PassedFilesCount int `json:"passed_files_count"`
ContainerId string `json:"containerid"`
Rlimits []configs.Rlimit `json:"rlimits"`
ExecFifoPath string `json:"start_pipe_path"`
} }
type initer interface { type initer interface {
Init() error Init() error
} }
func newContainerInit(t initType, pipe *os.File) (initer, error) { func newContainerInit(t initType, pipe *os.File, stateDirFD int) (initer, error) {
var config *initConfig var config *initConfig
if err := json.NewDecoder(pipe).Decode(&config); err != nil { if err := json.NewDecoder(pipe).Decode(&config); err != nil {
return nil, err return nil, err
@ -77,6 +84,7 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) {
pipe: pipe, pipe: pipe,
parentPid: syscall.Getppid(), parentPid: syscall.Getppid(),
config: config, config: config,
stateDirFD: stateDirFD,
}, nil }, nil
} }
return nil, fmt.Errorf("unknown init type %q", t) return nil, fmt.Errorf("unknown init type %q", t)
@ -163,20 +171,22 @@ func syncParentReady(pipe io.ReadWriter) error {
return nil return nil
} }
// joinExistingNamespaces gets all the namespace paths specified for the container and // syncParentHooks sends to the given pipe a JSON payload which indicates that
// does a setns on the namespace fd so that the current process joins the namespace. // the parent should execute pre-start hooks. It then waits for the parent to
func joinExistingNamespaces(namespaces []configs.Namespace) error { // indicate that it is cleared to resume.
for _, ns := range namespaces { func syncParentHooks(pipe io.ReadWriter) error {
if ns.Path != "" { // Tell parent.
f, err := os.OpenFile(ns.Path, os.O_RDONLY, 0) if err := utils.WriteJSON(pipe, syncT{procHooks}); err != nil {
if err != nil {
return err return err
} }
err = system.Setns(f.Fd(), uintptr(ns.Syscall())) // Wait for parent to give the all-clear.
f.Close() var procSync syncT
if err != nil { if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
return err if err == io.EOF {
return fmt.Errorf("parent closed synchronisation channel")
} }
if procSync.Type != procResume {
return fmt.Errorf("invalid synchronisation flag from parent")
} }
} }
return nil return nil
@ -204,8 +214,8 @@ func setupUser(config *initConfig) error {
} }
var addGroups []int var addGroups []int
if len(config.Config.AdditionalGroups) > 0 { if len(config.AdditionalGroups) > 0 {
addGroups, err = user.GetAdditionalGroupsPath(config.Config.AdditionalGroups, groupPath) addGroups, err = user.GetAdditionalGroupsPath(config.AdditionalGroups, groupPath)
if err != nil { if err != nil {
return err return err
} }
@ -309,19 +319,19 @@ func setupRoute(config *configs.Config) error {
return nil return nil
} }
func setupRlimits(config *configs.Config) error { func setupRlimits(limits []configs.Rlimit, pid int) error {
for _, rlimit := range config.Rlimits { for _, rlimit := range limits {
l := &syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft} if err := system.Prlimit(pid, rlimit.Type, syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}); err != nil {
if err := syscall.Setrlimit(rlimit.Type, l); err != nil {
return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err) return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err)
} }
} }
return nil return nil
} }
func setOomScoreAdj(oomScoreAdj int) error { func setOomScoreAdj(oomScoreAdj int, pid int) error {
path := "/proc/self/oom_score_adj" path := fmt.Sprintf("/proc/%d/oom_score_adj", pid)
return ioutil.WriteFile(path, []byte(strconv.Itoa(oomScoreAdj)), 0700)
return ioutil.WriteFile(path, []byte(strconv.Itoa(oomScoreAdj)), 0600)
} }
// killCgroupProcesses freezes then iterates over all the processes inside the // killCgroupProcesses freezes then iterates over all the processes inside the
@ -338,13 +348,16 @@ func killCgroupProcesses(m cgroups.Manager) error {
return err return err
} }
for _, pid := range pids { for _, pid := range pids {
if p, err := os.FindProcess(pid); err == nil { p, err := os.FindProcess(pid)
if err != nil {
logrus.Warn(err)
continue
}
procs = append(procs, p) procs = append(procs, p)
if err := p.Kill(); err != nil { if err := p.Kill(); err != nil {
logrus.Warn(err) logrus.Warn(err)
} }
} }
}
if err := m.Freeze(configs.Thawed); err != nil { if err := m.Freeze(configs.Thawed); err != nil {
logrus.Warn(err) logrus.Warn(err)
} }

View File

@ -0,0 +1,66 @@
// +build linux
package keyctl
import (
"fmt"
"strconv"
"strings"
"syscall"
"unsafe"
)
// Operation codes passed as the first argument of the SYS_KEYCTL syscall by
// the functions in this file.

// KEYCTL_JOIN_SESSION_KEYRING is the operation issued by JoinSessionKeyring.
const KEYCTL_JOIN_SESSION_KEYRING = 1
// KEYCTL_SETPERM is the operation ModKeyringPerm issues to store the updated
// permission mask.
const KEYCTL_SETPERM = 5
// KEYCTL_DESCRIBE is the operation ModKeyringPerm issues to read the textual
// description of a key, from which the current permissions are parsed.
const KEYCTL_DESCRIBE = 6
// KeySerial is the identifier of a key or keyring: JoinSessionKeyring returns
// one and ModKeyringPerm takes one as its target.
type KeySerial uint32
// JoinSessionKeyring issues the KEYCTL_JOIN_SESSION_KEYRING keyctl operation
// and returns the syscall's result converted to a KeySerial.
//
// A non-empty name is handed to the kernel as a NUL-terminated byte string;
// for an empty name a nil pointer is passed instead. An error is returned if
// name cannot be converted (syscall.BytePtrFromString fails) or if the
// syscall reports a non-zero errno.
func JoinSessionKeyring(name string) (KeySerial, error) {
	var namePtr *byte
	if name != "" {
		converted, convErr := syscall.BytePtrFromString(name)
		if convErr != nil {
			return KeySerial(0), convErr
		}
		namePtr = converted
	}

	// The unsafe.Pointer -> uintptr conversion is kept inside the call
	// expression itself, which is the form the unsafe package permits for
	// passing pointers to syscall.Syscall.
	serial, _, errno := syscall.Syscall(
		syscall.SYS_KEYCTL,
		KEYCTL_JOIN_SESSION_KEYRING,
		uintptr(unsafe.Pointer(namePtr)),
		0,
	)
	if errno == 0 {
		return KeySerial(serial), nil
	}
	return 0, fmt.Errorf("could not create session key: %v", errno)
}
// ModKeyringPerm modifies permissions on a keyring by reading the current
// permissions, ANDing them with mask (clearing permissions) and then ORing in
// setbits (setting additional permission bits).
//
// The current permissions are taken from the fourth ';'-separated field of
// the description returned by KEYCTL_DESCRIBE, parsed as a 32-bit hexadecimal
// number. The result is written back with KEYCTL_SETPERM.
//
// An error is returned if either keyctl call fails, if the description does
// not fit in the internal buffer, or if the description cannot be parsed.
func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error {
	dest := make([]byte, 1024)
	destBytes := unsafe.Pointer(&dest[0])

	n, _, errno := syscall.Syscall6(syscall.SYS_KEYCTL, uintptr(KEYCTL_DESCRIBE), uintptr(ringId), uintptr(destBytes), uintptr(len(dest)), 0, 0)
	if errno != 0 {
		return errno
	}
	// KEYCTL_DESCRIBE reports the size the description needs. When that
	// exceeds the buffer we supplied, the buffer contents are not a valid
	// description and must not be parsed.
	if n > uintptr(len(dest)) {
		return fmt.Errorf("Destination buffer for key description is too small")
	}

	// Only look at the bytes the kernel reported, and drop the trailing NUL
	// terminator so it does not end up in the last field.
	desc := strings.TrimRight(string(dest[:n]), "\x00")
	res := strings.Split(desc, ";")
	if len(res) < 5 {
		return fmt.Errorf("malformed key description %q", desc)
	}

	// parse permissions
	perm64, err := strconv.ParseUint(res[3], 16, 32)
	if err != nil {
		return err
	}

	perm := (uint32(perm64) & mask) | setbits

	if _, _, errno := syscall.Syscall(syscall.SYS_KEYCTL, uintptr(KEYCTL_SETPERM), uintptr(ringId), uintptr(perm)); errno != 0 {
		return errno
	}

	return nil
}

View File

@ -21,6 +21,10 @@ func SetProcessLabel(processLabel string) error {
return nil return nil
} }
// GetFileLabel returns the label for the specified path.
// This implementation ignores path and always returns an empty label with a
// nil error.
func GetFileLabel(path string) (string, error) {
return "", nil
}
func SetFileLabel(path string, fileLabel string) error { func SetFileLabel(path string, fileLabel string) error {
return nil return nil
} }
@ -48,7 +52,7 @@ func UnreserveLabel(label string) error {
return nil return nil
} }
// DupSecOpt takes an process label and returns security options that // DupSecOpt takes a process label and returns security options that
// can be used to set duplicate labels on future container processes // can be used to set duplicate labels on future container processes
func DupSecOpt(src string) []string { func DupSecOpt(src string) []string {
return nil return nil

View File

@ -94,6 +94,11 @@ func GetProcessLabel() (string, error) {
return selinux.Getexeccon() return selinux.Getexeccon()
} }
// GetFileLabel returns the label for the specified path, as reported by
// selinux.Getfilecon. The result and error of that call are returned as-is;
// no selinux.SelinuxEnabled() check is made first.
func GetFileLabel(path string) (string, error) {
return selinux.Getfilecon(path)
}
// SetFileLabel modifies the "path" label to the specified file label // SetFileLabel modifies the "path" label to the specified file label
func SetFileLabel(path string, fileLabel string) error { func SetFileLabel(path string, fileLabel string) error {
if selinux.SelinuxEnabled() && fileLabel != "" { if selinux.SelinuxEnabled() && fileLabel != "" {
@ -102,7 +107,7 @@ func SetFileLabel(path string, fileLabel string) error {
return nil return nil
} }
// Tell the kernel the label for all files to be created // SetFileCreateLabel tells the kernel the label for all files to be created
func SetFileCreateLabel(fileLabel string) error { func SetFileCreateLabel(fileLabel string) error {
if selinux.SelinuxEnabled() { if selinux.SelinuxEnabled() {
return selinux.Setfscreatecon(fileLabel) return selinux.Setfscreatecon(fileLabel)
@ -110,7 +115,7 @@ func SetFileCreateLabel(fileLabel string) error {
return nil return nil
} }
// Change the label of path to the filelabel string. // Relabel changes the label of path to the filelabel string.
// It changes the MCS label to s0 if shared is true. // It changes the MCS label to s0 if shared is true.
// This will allow all containers to share the content. // This will allow all containers to share the content.
func Relabel(path string, fileLabel string, shared bool) error { func Relabel(path string, fileLabel string, shared bool) error {

View File

@ -12,8 +12,12 @@ import (
// The number is randomly chosen to not conflict with known netlink types // The number is randomly chosen to not conflict with known netlink types
const ( const (
InitMsg uint16 = 62000 InitMsg uint16 = 62000
PidAttr uint16 = 27281 CloneFlagsAttr uint16 = 27281
ConsolePathAttr uint16 = 27282 ConsolePathAttr uint16 = 27282
NsPathsAttr uint16 = 27283
UidmapAttr uint16 = 27284
GidmapAttr uint16 = 27285
SetgroupAttr uint16 = 27286
// When syscall.NLA_HDRLEN is in gccgo, take this out. // When syscall.NLA_HDRLEN is in gccgo, take this out.
syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1) syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1)
) )
@ -23,7 +27,8 @@ type Int32msg struct {
Value uint32 Value uint32
} }
// int32msg has the following representation // Serialize serializes the message.
// Int32msg has the following representation
// | nlattr len | nlattr type | // | nlattr len | nlattr type |
// | uint32 value | // | uint32 value |
func (msg *Int32msg) Serialize() []byte { func (msg *Int32msg) Serialize() []byte {
@ -39,7 +44,7 @@ func (msg *Int32msg) Len() int {
return syscall_NLA_HDRLEN + 4 return syscall_NLA_HDRLEN + 4
} }
// bytemsg has the following representation // Bytemsg has the following representation
// | nlattr len | nlattr type | // | nlattr len | nlattr type |
// | value | pad | // | value | pad |
type Bytemsg struct { type Bytemsg struct {
@ -60,3 +65,25 @@ func (msg *Bytemsg) Serialize() []byte {
func (msg *Bytemsg) Len() int { func (msg *Bytemsg) Len() int {
return syscall_NLA_HDRLEN + len(msg.Value) + 1 // null-terminated return syscall_NLA_HDRLEN + len(msg.Value) + 1 // null-terminated
} }
// Boolmsg is a message whose payload is a single boolean.
// Its Serialize method produces the following representation
// | nlattr len | nlattr type |
// | value (1 byte) |
type Boolmsg struct {
// Type is written into the nlattr type field of the serialized header.
Type uint16
// Value is serialized as one byte: 1 for true, 0 for false.
Value bool
}
// Serialize serializes the message.
// The returned slice is exactly msg.Len() bytes long: the length and the
// attribute type are written as native-endian uint16 values at offsets 0 and
// 2, and the boolean is stored in the byte at offset 4 (1 for true, 0 for
// false).
//
// NOTE(review): the buffer is not padded up to syscall.NLA_ALIGNTO; confirm
// the reader of these attributes tolerates an unpadded one-byte payload.
func (msg *Boolmsg) Serialize() []byte {
	size := msg.Len()
	out := make([]byte, size)

	order := nl.NativeEndian()
	order.PutUint16(out[0:2], uint16(size))
	order.PutUint16(out[2:4], msg.Type)

	// make() zero-fills the slice, so the false case is already encoded and
	// only true needs an explicit write.
	if msg.Value {
		out[4] = 1
	}
	return out
}
// Len returns the size in bytes of the serialized message: the attribute
// header (syscall_NLA_HDRLEN) plus one byte for the boolean value.
func (msg *Boolmsg) Len() int {
return syscall_NLA_HDRLEN + 1
}

View File

@ -5,6 +5,8 @@ import (
"io" "io"
"math" "math"
"os" "os"
"github.com/opencontainers/runc/libcontainer/configs"
) )
type processOperations interface { type processOperations interface {
@ -26,6 +28,10 @@ type Process struct {
// local to the container's user and group configuration. // local to the container's user and group configuration.
User string User string
// AdditionalGroups specifies the gids that should be added to supplementary groups
// in addition to those that the user belongs to.
AdditionalGroups []string
// Cwd will change the processes current working directory inside the container's rootfs. // Cwd will change the processes current working directory inside the container's rootfs.
Cwd string Cwd string
@ -48,6 +54,20 @@ type Process struct {
// All capabilities not specified will be dropped from the processes capability mask // All capabilities not specified will be dropped from the processes capability mask
Capabilities []string Capabilities []string
// AppArmorProfile specifies the profile to apply to the process and is
// changed at the time the process is execed
AppArmorProfile string
// Label specifies the label to apply to the process. It is commonly used by selinux
Label string
// NoNewPrivileges controls whether processes can gain additional privileges.
NoNewPrivileges *bool
// Rlimits specifies the resource limits, such as max open files, to set in the container
// If Rlimits are not set, the container will inherit rlimits from the parent process
Rlimits []configs.Rlimit
ops processOperations ops processOperations
} }
@ -55,7 +75,7 @@ type Process struct {
// Wait releases any resources associated with the Process // Wait releases any resources associated with the Process
func (p Process) Wait() (*os.ProcessState, error) { func (p Process) Wait() (*os.ProcessState, error) {
if p.ops == nil { if p.ops == nil {
return nil, newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted) return nil, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
} }
return p.ops.wait() return p.ops.wait()
} }
@ -65,7 +85,7 @@ func (p Process) Pid() (int, error) {
// math.MinInt32 is returned here, because it's invalid value // math.MinInt32 is returned here, because it's invalid value
// for the kill() system call. // for the kill() system call.
if p.ops == nil { if p.ops == nil {
return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted) return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
} }
return p.ops.pid(), nil return p.ops.pid(), nil
} }
@ -73,7 +93,7 @@ func (p Process) Pid() (int, error) {
// Signal sends a signal to the Process. // Signal sends a signal to the Process.
func (p Process) Signal(sig os.Signal) error { func (p Process) Signal(sig os.Signal) error {
if p.ops == nil { if p.ops == nil {
return newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted) return newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
} }
return p.ops.signal(sig) return p.ops.signal(sig)
} }
@ -86,8 +106,8 @@ type IO struct {
} }
// NewConsole creates new console for process and returns it // NewConsole creates new console for process and returns it
func (p *Process) NewConsole(rootuid int) (Console, error) { func (p *Process) NewConsole(rootuid, rootgid int) (Console, error) {
console, err := NewConsole(rootuid, rootuid) console, err := NewConsole(rootuid, rootgid)
if err != nil { if err != nil {
return nil, err return nil, err
} }

View File

@ -51,6 +51,7 @@ type setnsProcess struct {
fds []string fds []string
process *Process process *Process
bootstrapData io.Reader bootstrapData io.Reader
rootDir *os.File
} }
func (p *setnsProcess) startTime() (string, error) { func (p *setnsProcess) startTime() (string, error) {
@ -69,39 +70,49 @@ func (p *setnsProcess) start() (err error) {
defer p.parentPipe.Close() defer p.parentPipe.Close()
err = p.cmd.Start() err = p.cmd.Start()
p.childPipe.Close() p.childPipe.Close()
p.rootDir.Close()
if err != nil { if err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "starting setns process")
} }
if p.bootstrapData != nil { if p.bootstrapData != nil {
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
} }
} }
if err = p.execSetns(); err != nil { if err = p.execSetns(); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "executing setns process")
} }
if len(p.cgroupPaths) > 0 { if len(p.cgroupPaths) > 0 {
if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil { if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil {
return newSystemError(err) return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
} }
} }
// set oom_score_adj
if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting oom score")
}
// set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting rlimits for process")
}
if err := utils.WriteJSON(p.parentPipe, p.config); err != nil { if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "writing config to pipe")
} }
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "calling shutdown on init pipe")
} }
// wait for the child process to fully complete and receive an error message // wait for the child process to fully complete and receive an error message
// if one was encoutered // if one was encoutered
var ierr *genericError var ierr *genericError
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
return newSystemError(err) return newSystemErrorWithCause(err, "decoding init error from pipe")
} }
// Must be done after Shutdown so the child will exit and we can wait for it. // Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil { if ierr != nil {
p.wait() p.wait()
return newSystemError(ierr) return ierr
} }
return nil return nil
} }
@ -114,7 +125,7 @@ func (p *setnsProcess) execSetns() error {
status, err := p.cmd.Process.Wait() status, err := p.cmd.Process.Wait()
if err != nil { if err != nil {
p.cmd.Wait() p.cmd.Wait()
return newSystemError(err) return newSystemErrorWithCause(err, "waiting on setns process to finish")
} }
if !status.Success() { if !status.Success() {
p.cmd.Wait() p.cmd.Wait()
@ -123,7 +134,7 @@ func (p *setnsProcess) execSetns() error {
var pid *pid var pid *pid
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil { if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
p.cmd.Wait() p.cmd.Wait()
return newSystemError(err) return newSystemErrorWithCause(err, "reading pid from init pipe")
} }
process, err := os.FindProcess(pid.Pid) process, err := os.FindProcess(pid.Pid)
if err != nil { if err != nil {
@ -175,6 +186,9 @@ type initProcess struct {
container *linuxContainer container *linuxContainer
fds []string fds []string
process *Process process *Process
bootstrapData io.Reader
sharePidns bool
rootDir *os.File
} }
func (p *initProcess) pid() int { func (p *initProcess) pid() int {
@ -185,27 +199,63 @@ func (p *initProcess) externalDescriptors() []string {
return p.fds return p.fds
} }
func (p *initProcess) start() (err error) { // execSetns runs the process that executes C code to perform the setns calls
// because setns support requires the C process to fork off a child and perform the setns
// before the go runtime boots, we wait on the process to die and receive the child's pid
// over the provided pipe.
// This is called by initProcess.start function
func (p *initProcess) execSetns() error {
	// Wait for the directly started command to finish; on any failure the
	// exec.Cmd is waited on as well before the error is reported.
	state, waitErr := p.cmd.Process.Wait()
	switch {
	case waitErr != nil:
		p.cmd.Wait()
		return waitErr
	case !state.Success():
		p.cmd.Wait()
		return &exec.ExitError{ProcessState: state}
	}

	// The pid of the child is delivered as JSON over the parent pipe.
	var child *pid
	if decodeErr := json.NewDecoder(p.parentPipe).Decode(&child); decodeErr != nil {
		p.cmd.Wait()
		return decodeErr
	}

	proc, findErr := os.FindProcess(child.Pid)
	if findErr != nil {
		return findErr
	}

	// From here on the command tracks the child process rather than the
	// process that was originally started.
	p.cmd.Process = proc
	p.process.ops = p
	return nil
}
func (p *initProcess) start() error {
defer p.parentPipe.Close() defer p.parentPipe.Close()
err = p.cmd.Start() err := p.cmd.Start()
p.process.ops = p p.process.ops = p
p.childPipe.Close() p.childPipe.Close()
p.rootDir.Close()
if err != nil { if err != nil {
p.process.ops = nil p.process.ops = nil
return newSystemError(err) return newSystemErrorWithCause(err, "starting init process command")
}
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return err
}
if err := p.execSetns(); err != nil {
return newSystemErrorWithCause(err, "running exec setns process for init")
} }
// Save the standard descriptor names before the container process // Save the standard descriptor names before the container process
// can potentially move them (e.g., via dup2()). If we don't do this now, // can potentially move them (e.g., via dup2()). If we don't do this now,
// we won't know at checkpoint time which file descriptor to look up. // we won't know at checkpoint time which file descriptor to look up.
fds, err := getPipeFds(p.pid()) fds, err := getPipeFds(p.pid())
if err != nil { if err != nil {
return newSystemError(err) return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
} }
p.setExternalDescriptors(fds) p.setExternalDescriptors(fds)
// Do this before syncing with child so that no children // Do this before syncing with child so that no children
// can escape the cgroup // can escape the cgroup
if err := p.manager.Apply(p.pid()); err != nil { if err := p.manager.Apply(p.pid()); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "applying cgroup configuration for process")
} }
defer func() { defer func() {
if err != nil { if err != nil {
@ -213,6 +263,44 @@ func (p *initProcess) start() (err error) {
p.manager.Destroy() p.manager.Destroy()
} }
}() }()
if err := p.createNetworkInterfaces(); err != nil {
return newSystemErrorWithCause(err, "creating nework interfaces")
}
if err := p.sendConfig(); err != nil {
return newSystemErrorWithCause(err, "sending config to init process")
}
var (
procSync syncT
sentRun bool
sentResume bool
ierr *genericError
)
dec := json.NewDecoder(p.parentPipe)
loop:
for {
if err := dec.Decode(&procSync); err != nil {
if err == io.EOF {
break loop
}
return newSystemErrorWithCause(err, "decoding sync type from init pipe")
}
switch procSync.Type {
case procReady:
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemErrorWithCause(err, "setting cgroup config for ready process")
}
// set oom_score_adj
if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting oom score for ready process")
}
// set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting rlimits for ready process")
}
// call prestart hooks
if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
if p.config.Config.Hooks != nil { if p.config.Config.Hooks != nil {
s := configs.HookState{ s := configs.HookState{
Version: p.container.config.Version, Version: p.container.config.Version,
@ -220,49 +308,43 @@ func (p *initProcess) start() (err error) {
Pid: p.pid(), Pid: p.pid(),
Root: p.config.Config.Rootfs, Root: p.config.Config.Rootfs,
} }
for _, hook := range p.config.Config.Hooks.Prestart { for i, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil { if err := hook.Run(s); err != nil {
return newSystemError(err) return newSystemErrorWithCausef(err, "running prestart hook %d", i)
} }
} }
} }
if err := p.createNetworkInterfaces(); err != nil {
return newSystemError(err)
}
if err := p.sendConfig(); err != nil {
return newSystemError(err)
}
var (
procSync syncT
sentRun bool
ierr *genericError
)
loop:
for {
if err := json.NewDecoder(p.parentPipe).Decode(&procSync); err != nil {
if err == io.EOF {
break loop
}
return newSystemError(err)
}
switch procSync.Type {
case procStart:
break loop
case procReady:
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemError(err)
} }
// Sync with child. // Sync with child.
if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil { if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "reading syncT run type")
} }
sentRun = true sentRun = true
case procHooks:
if p.config.Config.Hooks != nil {
s := configs.HookState{
Version: p.container.config.Version,
ID: p.container.id,
Pid: p.pid(),
Root: p.config.Config.Rootfs,
BundlePath: utils.SearchLabels(p.config.Config.Labels, "bundle"),
}
for i, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
return newSystemErrorWithCausef(err, "running prestart hook %d", i)
}
}
}
// Sync with child.
if err := utils.WriteJSON(p.parentPipe, syncT{procResume}); err != nil {
return newSystemErrorWithCause(err, "reading syncT resume type")
}
sentResume = true
case procError: case procError:
// wait for the child process to fully complete and receive an error message // wait for the child process to fully complete and receive an error message
// if one was encoutered // if one was encoutered
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { if err := dec.Decode(&ierr); err != nil && err != io.EOF {
return newSystemError(err) return newSystemErrorWithCause(err, "decoding proc error from init")
} }
if ierr != nil { if ierr != nil {
break loop break loop
@ -270,19 +352,22 @@ loop:
// Programmer error. // Programmer error.
panic("No error following JSON procError payload.") panic("No error following JSON procError payload.")
default: default:
return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child")) return newSystemError(fmt.Errorf("invalid JSON payload from child"))
} }
} }
if !sentRun { if !sentRun {
return newSystemError(fmt.Errorf("could not synchronise with container process")) return newSystemErrorWithCause(ierr, "container init failed")
}
if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process"))
} }
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "shutting down init pipe")
} }
// Must be done after Shutdown so the child will exit and we can wait for it. // Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil { if ierr != nil {
p.wait() p.wait()
return newSystemError(ierr) return ierr
} }
return nil return nil
} }
@ -293,7 +378,7 @@ func (p *initProcess) wait() (*os.ProcessState, error) {
return p.cmd.ProcessState, err return p.cmd.ProcessState, err
} }
// we should kill all processes in cgroup when init is died if we use host PID namespace // we should kill all processes in cgroup when init is died if we use host PID namespace
if p.cmd.SysProcAttr.Cloneflags&syscall.CLONE_NEWPID == 0 { if p.sharePidns {
killCgroupProcesses(p.manager) killCgroupProcesses(p.manager)
} }
return p.cmd.ProcessState, nil return p.cmd.ProcessState, nil
@ -315,7 +400,9 @@ func (p *initProcess) startTime() (string, error) {
} }
func (p *initProcess) sendConfig() error { func (p *initProcess) sendConfig() error {
// send the state to the container's init process then shutdown writes for the parent // send the config to the container's init process, we don't use JSON Encode
// here because there might be a problem in JSON decoder in some cases, see:
// https://github.com/docker/docker/issues/14203#issuecomment-174177790
return utils.WriteJSON(p.parentPipe, p.config) return utils.WriteJSON(p.parentPipe, p.config)
} }
@ -365,7 +452,7 @@ func getPipeFds(pid int) ([]string, error) {
// InitializeIO creates pipes for use with the process's STDIO // InitializeIO creates pipes for use with the process's STDIO
// and returns the opposite side for each // and returns the opposite side for each
func (p *Process) InitializeIO(rootuid int) (i *IO, err error) { func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
var fds []uintptr var fds []uintptr
i = &IO{} i = &IO{}
// cleanup in case of an error // cleanup in case of an error
@ -397,7 +484,7 @@ func (p *Process) InitializeIO(rootuid int) (i *IO, err error) {
p.Stderr, i.Stderr = w, r p.Stderr, i.Stderr = w, r
// change ownership of the pipes incase we are in a user namespace // change ownership of the pipes incase we are in a user namespace
for _, fd := range fds { for _, fd := range fds {
if err := syscall.Fchown(int(fd), rootuid, rootuid); err != nil { if err := syscall.Fchown(int(fd), rootuid, rootgid); err != nil {
return nil, err return nil, err
} }
} }

View File

@ -4,6 +4,7 @@ package libcontainer
import ( import (
"fmt" "fmt"
"io"
"io/ioutil" "io/ioutil"
"os" "os"
"os/exec" "os/exec"
@ -19,47 +20,65 @@ import (
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/label" "github.com/opencontainers/runc/libcontainer/label"
"github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/system"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
) )
const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
// needsSetupDev returns true if /dev needs to be set up.
func needsSetupDev(config *configs.Config) bool {
for _, m := range config.Mounts {
if m.Device == "bind" && libcontainerUtils.CleanPath(m.Destination) == "/dev" {
return false
}
}
return true
}
// setupRootfs sets up the devices, mount points, and filesystems for use inside a // setupRootfs sets up the devices, mount points, and filesystems for use inside a
// new mount namespace. // new mount namespace.
func setupRootfs(config *configs.Config, console *linuxConsole) (err error) { func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWriter) (err error) {
if err := prepareRoot(config); err != nil { if err := prepareRoot(config); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "preparing rootfs")
} }
setupDev := len(config.Devices) != 0 setupDev := needsSetupDev(config)
for _, m := range config.Mounts { for _, m := range config.Mounts {
for _, precmd := range m.PremountCmds { for _, precmd := range m.PremountCmds {
if err := mountCmd(precmd); err != nil { if err := mountCmd(precmd); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "running premount command")
} }
} }
if err := mountToRootfs(m, config.Rootfs, config.MountLabel); err != nil { if err := mountToRootfs(m, config.Rootfs, config.MountLabel); err != nil {
return newSystemError(err) return newSystemErrorWithCausef(err, "mounting %q to rootfs %q", m.Destination, config.Rootfs)
} }
for _, postcmd := range m.PostmountCmds { for _, postcmd := range m.PostmountCmds {
if err := mountCmd(postcmd); err != nil { if err := mountCmd(postcmd); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "running postmount command")
} }
} }
} }
if setupDev { if setupDev {
if err := createDevices(config); err != nil { if err := createDevices(config); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "creating device nodes")
} }
if err := setupPtmx(config, console); err != nil { if err := setupPtmx(config, console); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "setting up ptmx")
} }
if err := setupDevSymlinks(config.Rootfs); err != nil { if err := setupDevSymlinks(config.Rootfs); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "setting up /dev symlinks")
} }
} }
// Signal the parent to run the pre-start hooks.
// The hooks are run after the mounts are setup, but before we switch to the new
// root, so that the old root is still available in the hooks for any mount
// manipulations.
if err := syncParentHooks(pipe); err != nil {
return err
}
if err := syscall.Chdir(config.Rootfs); err != nil { if err := syscall.Chdir(config.Rootfs); err != nil {
return newSystemError(err) return newSystemErrorWithCausef(err, "changing dir to %q", config.Rootfs)
} }
if config.NoPivotRoot { if config.NoPivotRoot {
err = msMoveRoot(config.Rootfs) err = msMoveRoot(config.Rootfs)
@ -67,16 +86,28 @@ func setupRootfs(config *configs.Config, console *linuxConsole) (err error) {
err = pivotRoot(config.Rootfs, config.PivotDir) err = pivotRoot(config.Rootfs, config.PivotDir)
} }
if err != nil { if err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "jailing process inside rootfs")
} }
if setupDev { if setupDev {
if err := reOpenDevNull(); err != nil { if err := reOpenDevNull(); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "reopening /dev/null inside container")
} }
} }
// remount dev as ro if specifed
for _, m := range config.Mounts {
if libcontainerUtils.CleanPath(m.Destination) == "/dev" {
if m.Flags&syscall.MS_RDONLY != 0 {
if err := remountReadonly(m.Destination); err != nil {
return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination)
}
}
break
}
}
// set rootfs ( / ) as readonly
if config.Readonlyfs { if config.Readonlyfs {
if err := setReadonly(); err != nil { if err := setReadonly(); err != nil {
return newSystemError(err) return newSystemErrorWithCause(err, "setting rootfs as readonly")
} }
} }
syscall.Umask(0022) syscall.Umask(0022)
@ -84,14 +115,12 @@ func setupRootfs(config *configs.Config, console *linuxConsole) (err error) {
} }
func mountCmd(cmd configs.Command) error { func mountCmd(cmd configs.Command) error {
command := exec.Command(cmd.Path, cmd.Args[:]...) command := exec.Command(cmd.Path, cmd.Args[:]...)
command.Env = cmd.Env command.Env = cmd.Env
command.Dir = cmd.Dir command.Dir = cmd.Dir
if out, err := command.CombinedOutput(); err != nil { if out, err := command.CombinedOutput(); err != nil {
return fmt.Errorf("%#v failed: %s: %v", cmd, string(out), err) return fmt.Errorf("%#v failed: %s: %v", cmd, string(out), err)
} }
return nil return nil
} }
@ -119,8 +148,9 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
if err := mountPropagate(m, rootfs, ""); err != nil { if err := mountPropagate(m, rootfs, ""); err != nil {
return err return err
} }
}
return label.SetFileLabel(dest, mountLabel) return label.SetFileLabel(dest, mountLabel)
}
return nil
case "tmpfs": case "tmpfs":
stat, err := os.Stat(dest) stat, err := os.Stat(dest)
if err != nil { if err != nil {
@ -137,16 +167,6 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
} }
} }
return nil return nil
case "devpts":
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
return mountPropagate(m, rootfs, mountLabel)
case "securityfs":
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
return mountPropagate(m, rootfs, mountLabel)
case "bind": case "bind":
stat, err := os.Stat(m.Source) stat, err := os.Stat(m.Source)
if err != nil { if err != nil {
@ -218,41 +238,33 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
return err return err
} }
} }
// create symlinks for merged cgroups
cwd, err := os.Getwd()
if err != nil {
return err
}
if err := os.Chdir(filepath.Join(rootfs, m.Destination)); err != nil {
return err
}
for _, mc := range merged { for _, mc := range merged {
for _, ss := range strings.Split(mc, ",") { for _, ss := range strings.Split(mc, ",") {
if err := os.Symlink(mc, ss); err != nil { // symlink(2) is very dumb, it will just shove the path into
// if cgroup already exists, then okay(it could have been created before) // the link and doesn't do any checks or relative path
if os.IsExist(err) { // conversion. Also, don't error out if the cgroup already exists.
continue if err := os.Symlink(mc, filepath.Join(rootfs, m.Destination, ss)); err != nil && !os.IsExist(err) {
}
os.Chdir(cwd)
return err return err
} }
} }
} }
if err := os.Chdir(cwd); err != nil {
return err
}
if m.Flags&syscall.MS_RDONLY != 0 { if m.Flags&syscall.MS_RDONLY != 0 {
// remount cgroup root as readonly // remount cgroup root as readonly
mcgrouproot := &configs.Mount{ mcgrouproot := &configs.Mount{
Source: m.Destination,
Device: "bind",
Destination: m.Destination, Destination: m.Destination,
Flags: defaultMountFlags | syscall.MS_RDONLY, Flags: defaultMountFlags | syscall.MS_RDONLY | syscall.MS_BIND,
} }
if err := remount(mcgrouproot, rootfs); err != nil { if err := remount(mcgrouproot, rootfs); err != nil {
return err return err
} }
} }
default: default:
return fmt.Errorf("unknown mount device %q to %q", m.Device, m.Destination) if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
return mountPropagate(m, rootfs, mountLabel)
} }
return nil return nil
} }
@ -294,7 +306,7 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
// checkMountDestination checks to ensure that the mount destination is not over the top of /proc. // checkMountDestination checks to ensure that the mount destination is not over the top of /proc.
// dest is required to be an abs path and have any symlinks resolved before calling this function. // dest is required to be an abs path and have any symlinks resolved before calling this function.
func checkMountDestination(rootfs, dest string) error { func checkMountDestination(rootfs, dest string) error {
if filepath.Clean(rootfs) == filepath.Clean(dest) { if libcontainerUtils.CleanPath(rootfs) == libcontainerUtils.CleanPath(dest) {
return fmt.Errorf("mounting into / is prohibited") return fmt.Errorf("mounting into / is prohibited")
} }
invalidDestinations := []string{ invalidDestinations := []string{
@ -307,7 +319,8 @@ func checkMountDestination(rootfs, dest string) error {
"/proc/cpuinfo", "/proc/cpuinfo",
"/proc/diskstats", "/proc/diskstats",
"/proc/meminfo", "/proc/meminfo",
"/proc/stats", "/proc/stat",
"/proc/net/dev",
} }
for _, valid := range validDestinations { for _, valid := range validDestinations {
path, err := filepath.Rel(filepath.Join(rootfs, valid), dest) path, err := filepath.Rel(filepath.Join(rootfs, valid), dest)
@ -340,7 +353,7 @@ func setupDevSymlinks(rootfs string) error {
// kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink // kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink
// in /dev if it exists in /proc. // in /dev if it exists in /proc.
if _, err := os.Stat("/proc/kcore"); err == nil { if _, err := os.Stat("/proc/kcore"); err == nil {
links = append(links, [2]string{"/proc/kcore", "/dev/kcore"}) links = append(links, [2]string{"/proc/kcore", "/dev/core"})
} }
for _, link := range links { for _, link := range links {
var ( var (
@ -489,10 +502,10 @@ func getParentMount(rootfs string) (string, string, error) {
} }
// Make parent mount private if it was shared // Make parent mount private if it was shared
func rootfsParentMountPrivate(config *configs.Config) error { func rootfsParentMountPrivate(rootfs string) error {
sharedMount := false sharedMount := false
parentMount, optionalOpts, err := getParentMount(config.Rootfs) parentMount, optionalOpts, err := getParentMount(rootfs)
if err != nil { if err != nil {
return err return err
} }
@ -524,10 +537,11 @@ func prepareRoot(config *configs.Config) error {
if err := syscall.Mount("", "/", "", uintptr(flag), ""); err != nil { if err := syscall.Mount("", "/", "", uintptr(flag), ""); err != nil {
return err return err
} }
if config.NoPivotRoot {
if err := rootfsParentMountPrivate(config); err != nil { if err := rootfsParentMountPrivate(config.Rootfs); err != nil {
return err return err
} }
}
return syscall.Mount(config.Rootfs, config.Rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, "") return syscall.Mount(config.Rootfs, config.Rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, "")
} }
@ -550,7 +564,7 @@ func setupPtmx(config *configs.Config, console *linuxConsole) error {
return nil return nil
} }
func pivotRoot(rootfs, pivotBaseDir string) error { func pivotRoot(rootfs, pivotBaseDir string) (err error) {
if pivotBaseDir == "" { if pivotBaseDir == "" {
pivotBaseDir = "/" pivotBaseDir = "/"
} }
@ -562,9 +576,22 @@ func pivotRoot(rootfs, pivotBaseDir string) error {
if err != nil { if err != nil {
return fmt.Errorf("can't create pivot_root dir %s, error %v", pivotDir, err) return fmt.Errorf("can't create pivot_root dir %s, error %v", pivotDir, err)
} }
defer func() {
errVal := os.Remove(pivotDir)
if err == nil {
err = errVal
}
}()
if err := syscall.PivotRoot(rootfs, pivotDir); err != nil {
// Make the parent mount private
if err := rootfsParentMountPrivate(rootfs); err != nil {
return err
}
// Try again
if err := syscall.PivotRoot(rootfs, pivotDir); err != nil { if err := syscall.PivotRoot(rootfs, pivotDir); err != nil {
return fmt.Errorf("pivot_root %s", err) return fmt.Errorf("pivot_root %s", err)
} }
}
if err := syscall.Chdir("/"); err != nil { if err := syscall.Chdir("/"); err != nil {
return fmt.Errorf("chdir / %s", err) return fmt.Errorf("chdir / %s", err)
} }
@ -580,7 +607,7 @@ func pivotRoot(rootfs, pivotBaseDir string) error {
if err := syscall.Unmount(pivotDir, syscall.MNT_DETACH); err != nil { if err := syscall.Unmount(pivotDir, syscall.MNT_DETACH); err != nil {
return fmt.Errorf("unmount pivot_root dir %s", err) return fmt.Errorf("unmount pivot_root dir %s", err)
} }
return os.Remove(pivotDir) return nil
} }
func msMoveRoot(rootfs string) error { func msMoveRoot(rootfs string) error {
@ -671,12 +698,16 @@ func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
var ( var (
dest = m.Destination dest = m.Destination
data = label.FormatMountLabel(m.Data, mountLabel) data = label.FormatMountLabel(m.Data, mountLabel)
flags = m.Flags
) )
if libcontainerUtils.CleanPath(dest) == "/dev" {
flags &= ^syscall.MS_RDONLY
}
if !strings.HasPrefix(dest, rootfs) { if !strings.HasPrefix(dest, rootfs) {
dest = filepath.Join(rootfs, dest) dest = filepath.Join(rootfs, dest)
} }
if err := syscall.Mount(m.Source, dest, m.Device, uintptr(m.Flags), data); err != nil { if err := syscall.Mount(m.Source, dest, m.Device, uintptr(flags), data); err != nil {
return err return err
} }

View File

@ -36,6 +36,11 @@ var archs = map[string]string{
"SCMP_ARCH_MIPSEL": "mipsel", "SCMP_ARCH_MIPSEL": "mipsel",
"SCMP_ARCH_MIPSEL64": "mipsel64", "SCMP_ARCH_MIPSEL64": "mipsel64",
"SCMP_ARCH_MIPSEL64N32": "mipsel64n32", "SCMP_ARCH_MIPSEL64N32": "mipsel64n32",
"SCMP_ARCH_PPC": "ppc",
"SCMP_ARCH_PPC64": "ppc64",
"SCMP_ARCH_PPC64LE": "ppc64le",
"SCMP_ARCH_S390": "s390",
"SCMP_ARCH_S390X": "s390x",
} }
// ConvertStringToOperator converts a string into a Seccomp comparison operator. // ConvertStringToOperator converts a string into a Seccomp comparison operator.

View File

@ -5,7 +5,6 @@ package seccomp
import ( import (
"bufio" "bufio"
"fmt" "fmt"
"log"
"os" "os"
"strings" "strings"
"syscall" "syscall"
@ -167,7 +166,6 @@ func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error {
// Ignore it, don't error out // Ignore it, don't error out
callNum, err := libseccomp.GetSyscallFromName(call.Name) callNum, err := libseccomp.GetSyscallFromName(call.Name)
if err != nil { if err != nil {
log.Printf("Error resolving syscall name %s: %s - ignoring syscall.", call.Name, err)
return nil return nil
} }

View File

@ -10,7 +10,7 @@ import (
var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported") var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported")
// Seccomp not supported, do nothing // InitSeccomp does nothing because seccomp is not supported.
func InitSeccomp(config *configs.Seccomp) error { func InitSeccomp(config *configs.Seccomp) error {
if config != nil { if config != nil {
return ErrSeccompNotEnabled return ErrSeccompNotEnabled

View File

@ -13,9 +13,9 @@ import (
"regexp" "regexp"
"strconv" "strconv"
"strings" "strings"
"sync"
"syscall" "syscall"
"github.com/docker/docker/pkg/mount"
"github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/system"
) )
@ -35,6 +35,7 @@ const (
var ( var (
assignRegex = regexp.MustCompile(`^([^=]+)=(.*)$`) assignRegex = regexp.MustCompile(`^([^=]+)=(.*)$`)
mcsList = make(map[string]bool) mcsList = make(map[string]bool)
mcsLock sync.Mutex
selinuxfs = "unknown" selinuxfs = "unknown"
selinuxEnabled = false // Stores whether selinux is currently enabled selinuxEnabled = false // Stores whether selinux is currently enabled
selinuxEnabledChecked = false // Stores whether selinux enablement has been checked or established yet selinuxEnabledChecked = false // Stores whether selinux enablement has been checked or established yet
@ -58,16 +59,31 @@ func getSelinuxMountPoint() string {
} }
selinuxfs = "" selinuxfs = ""
mounts, err := mount.GetMounts() f, err := os.Open("/proc/self/mountinfo")
if err != nil { if err != nil {
return selinuxfs return selinuxfs
} }
for _, mount := range mounts { defer f.Close()
if mount.Fstype == "selinuxfs" {
selinuxfs = mount.Mountpoint scanner := bufio.NewScanner(f)
for scanner.Scan() {
txt := scanner.Text()
// Safe as mountinfo encodes mountpoints with spaces as \040.
sepIdx := strings.Index(txt, " - ")
if sepIdx == -1 {
continue
}
if !strings.Contains(txt[sepIdx:], "selinuxfs") {
continue
}
fields := strings.Split(txt, " ")
if len(fields) < 5 {
continue
}
selinuxfs = fields[4]
break break
} }
}
if selinuxfs != "" { if selinuxfs != "" {
var buf syscall.Statfs_t var buf syscall.Statfs_t
syscall.Statfs(selinuxfs, &buf) syscall.Statfs(selinuxfs, &buf)
@ -158,12 +174,14 @@ func Setfilecon(path string, scon string) error {
// Getfilecon returns the SELinux label for this path or returns an error. // Getfilecon returns the SELinux label for this path or returns an error.
func Getfilecon(path string) (string, error) { func Getfilecon(path string) (string, error) {
con, err := system.Lgetxattr(path, xattrNameSelinux) con, err := system.Lgetxattr(path, xattrNameSelinux)
if err != nil {
return "", err
}
// Trim the NUL byte at the end of the byte buffer, if present. // Trim the NUL byte at the end of the byte buffer, if present.
if con[len(con)-1] == '\x00' { if len(con) > 0 && con[len(con)-1] == '\x00' {
con = con[:len(con)-1] con = con[:len(con)-1]
} }
return string(con), err return string(con), nil
} }
func Setfscreatecon(scon string) error { func Setfscreatecon(scon string) error {
@ -265,6 +283,8 @@ func SelinuxGetEnforceMode() int {
} }
func mcsAdd(mcs string) error { func mcsAdd(mcs string) error {
mcsLock.Lock()
defer mcsLock.Unlock()
if mcsList[mcs] { if mcsList[mcs] {
return fmt.Errorf("MCS Label already exists") return fmt.Errorf("MCS Label already exists")
} }
@ -273,7 +293,9 @@ func mcsAdd(mcs string) error {
} }
func mcsDelete(mcs string) { func mcsDelete(mcs string) {
mcsLock.Lock()
mcsList[mcs] = false mcsList[mcs] = false
mcsLock.Unlock()
} }
func IntToMcs(id int, catRange uint32) string { func IntToMcs(id int, catRange uint32) string {
@ -289,7 +311,7 @@ func IntToMcs(id int, catRange uint32) string {
for ORD > TIER { for ORD > TIER {
ORD = ORD - TIER ORD = ORD - TIER
TIER -= 1 TIER--
} }
TIER = SETSIZE - TIER TIER = SETSIZE - TIER
ORD = ORD + TIER ORD = ORD + TIER
@ -430,7 +452,7 @@ func badPrefix(fpath string) error {
return nil return nil
} }
// Change the fpath file object to the SELinux label scon. // Chcon changes the fpath file object to the SELinux label scon.
// If the fpath is a directory and recurse is true Chcon will walk the // If the fpath is a directory and recurse is true Chcon will walk the
// directory tree setting the label // directory tree setting the label
func Chcon(fpath string, scon string, recurse bool) error { func Chcon(fpath string, scon string, recurse bool) error {
@ -464,14 +486,14 @@ func DupSecOpt(src string) []string {
con["level"] == "" { con["level"] == "" {
return nil return nil
} }
return []string{"label:user:" + con["user"], return []string{"label=user:" + con["user"],
"label:role:" + con["role"], "label=role:" + con["role"],
"label:type:" + con["type"], "label=type:" + con["type"],
"label:level:" + con["level"]} "label=level:" + con["level"]}
} }
// DisableSecOpt returns a security opt that can be used to disabling SELinux // DisableSecOpt returns a security opt that can be used to disabling SELinux
// labeling support for future container processes // labeling support for future container processes
func DisableSecOpt() []string { func DisableSecOpt() []string {
return []string{"label:disable"} return []string{"label=disable"}
} }

View File

@ -3,9 +3,11 @@
package libcontainer package libcontainer
import ( import (
"fmt"
"os" "os"
"github.com/opencontainers/runc/libcontainer/apparmor" "github.com/opencontainers/runc/libcontainer/apparmor"
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/label" "github.com/opencontainers/runc/libcontainer/label"
"github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/system"
@ -17,13 +19,22 @@ type linuxSetnsInit struct {
config *initConfig config *initConfig
} }
func (l *linuxSetnsInit) getSessionRingName() string {
return fmt.Sprintf("_ses.%s", l.config.ContainerId)
}
func (l *linuxSetnsInit) Init() error { func (l *linuxSetnsInit) Init() error {
if err := setupRlimits(l.config.Config); err != nil { if !l.config.Config.NoNewKeyring {
// do not inherit the parent's session keyring
if _, err := keyctl.JoinSessionKeyring(l.getSessionRingName()); err != nil {
return err return err
} }
if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil { }
if l.config.NoNewPrivileges {
if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err return err
} }
}
if l.config.Config.Seccomp != nil { if l.config.Config.Seccomp != nil {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return err return err
@ -32,13 +43,11 @@ func (l *linuxSetnsInit) Init() error {
if err := finalizeNamespace(l.config); err != nil { if err := finalizeNamespace(l.config); err != nil {
return err return err
} }
if err := apparmor.ApplyProfile(l.config.Config.AppArmorProfile); err != nil { if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err return err
} }
if l.config.Config.ProcessLabel != "" { if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
if err := label.SetProcessLabel(l.config.Config.ProcessLabel); err != nil {
return err return err
} }
}
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
} }

View File

@ -2,14 +2,14 @@ package stacktrace
import "runtime" import "runtime"
// Caputure captures a stacktrace for the current calling go program // Capture captures a stacktrace for the current calling go program
// //
// skip is the number of frames to skip // skip is the number of frames to skip
func Capture(userSkip int) Stacktrace { func Capture(userSkip int) Stacktrace {
var ( var (
skip = userSkip + 1 // add one for our own function skip = userSkip + 1 // add one for our own function
frames []Frame frames []Frame
prevPc uintptr = 0 prevPc uintptr
) )
for i := skip; ; i++ { for i := skip; ; i++ {
pc, file, line, ok := runtime.Caller(i) pc, file, line, ok := runtime.Caller(i)

View File

@ -3,28 +3,62 @@
package libcontainer package libcontainer
import ( import (
"fmt"
"io" "io"
"os" "os"
"os/exec"
"syscall" "syscall"
"github.com/opencontainers/runc/libcontainer/apparmor" "github.com/opencontainers/runc/libcontainer/apparmor"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/label" "github.com/opencontainers/runc/libcontainer/label"
"github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/system"
) )
type linuxStandardInit struct { type linuxStandardInit struct {
pipe io.ReadWriter pipe io.ReadWriteCloser
parentPid int parentPid int
stateDirFD int
config *initConfig config *initConfig
} }
func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
var newperms uint32
if l.config.Config.Namespaces.Contains(configs.NEWUSER) {
// with user ns we need 'other' search permissions
newperms = 0x8
} else {
// without user ns we need 'UID' search permissions
newperms = 0x80000
}
// create a unique per session container name that we can
// join in setns; however, other containers can also join it
return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms
}
// PR_SET_NO_NEW_PRIVS isn't exposed in Golang so we define it ourselves copying the value
// the kernel
const PR_SET_NO_NEW_PRIVS = 0x26
func (l *linuxStandardInit) Init() error { func (l *linuxStandardInit) Init() error {
// join any namespaces via a path to the namespace fd if provided if !l.config.Config.NoNewKeyring {
if err := joinExistingNamespaces(l.config.Config.Namespaces); err != nil { ringname, keepperms, newperms := l.getSessionRingParams()
// do not inherit the parent's session keyring
sessKeyId, err := keyctl.JoinSessionKeyring(ringname)
if err != nil {
return err return err
} }
// make session keyring searcheable
if err := keyctl.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
return err
}
}
var console *linuxConsole var console *linuxConsole
if l.config.Console != "" { if l.config.Console != "" {
console = newConsoleFromPath(l.config.Console) console = newConsoleFromPath(l.config.Console)
@ -32,9 +66,6 @@ func (l *linuxStandardInit) Init() error {
return err return err
} }
} }
if _, err := syscall.Setsid(); err != nil {
return err
}
if console != nil { if console != nil {
if err := system.Setctty(); err != nil { if err := system.Setctty(); err != nil {
return err return err
@ -46,16 +77,11 @@ func (l *linuxStandardInit) Init() error {
if err := setupRoute(l.config.Config); err != nil { if err := setupRoute(l.config.Config); err != nil {
return err return err
} }
if err := setupRlimits(l.config.Config); err != nil {
return err
}
if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
return err
}
label.Init() label.Init()
// InitializeMountNamespace() can be executed only for a new mount namespace // InitializeMountNamespace() can be executed only for a new mount namespace
if l.config.Config.Namespaces.Contains(configs.NEWNS) { if l.config.Config.Namespaces.Contains(configs.NEWNS) {
if err := setupRootfs(l.config.Config, console); err != nil { if err := setupRootfs(l.config.Config, console, l.pipe); err != nil {
return err return err
} }
} }
@ -64,10 +90,10 @@ func (l *linuxStandardInit) Init() error {
return err return err
} }
} }
if err := apparmor.ApplyProfile(l.config.Config.AppArmorProfile); err != nil { if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err return err
} }
if err := label.SetProcessLabel(l.config.Config.ProcessLabel); err != nil { if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return err return err
} }
@ -90,13 +116,21 @@ func (l *linuxStandardInit) Init() error {
if err != nil { if err != nil {
return err return err
} }
if l.config.NoNewPrivileges {
if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err
}
}
// Tell our parent that we're ready to Execv. This must be done before the // Tell our parent that we're ready to Execv. This must be done before the
// Seccomp rules have been applied, because we need to be able to read and // Seccomp rules have been applied, because we need to be able to read and
// write to a socket. // write to a socket.
if err := syncParentReady(l.pipe); err != nil { if err := syncParentReady(l.pipe); err != nil {
return err return err
} }
if l.config.Config.Seccomp != nil { // Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible.
if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return err return err
} }
@ -110,10 +144,35 @@ func (l *linuxStandardInit) Init() error {
return err return err
} }
// compare the parent from the inital start of the init process and make sure that it did not change. // compare the parent from the inital start of the init process and make sure that it did not change.
// if the parent changes that means it died and we were reparened to something else so we should // if the parent changes that means it died and we were reparented to something else so we should
// just kill ourself and not cause problems for someone else. // just kill ourself and not cause problems for someone else.
if syscall.Getppid() != l.parentPid { if syscall.Getppid() != l.parentPid {
return syscall.Kill(syscall.Getpid(), syscall.SIGKILL) return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
} }
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) // check for the arg before waiting to make sure it exists and it is returned
// as a create time error.
name, err := exec.LookPath(l.config.Args[0])
if err != nil {
return err
}
// close the pipe to signal that we have completed our init.
l.pipe.Close()
// wait for the fifo to be opened on the other side before
// exec'ing the users process.
fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0)
if err != nil {
return newSystemErrorWithCause(err, "openat exec fifo")
}
if _, err := syscall.Write(fd, []byte("0")); err != nil {
return newSystemErrorWithCause(err, "write 0 exec fifo")
}
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return newSystemErrorWithCause(err, "init seccomp")
}
}
if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
return newSystemErrorWithCause(err, "exec user process")
}
return nil
} }

View File

@ -6,9 +6,11 @@ import (
"fmt" "fmt"
"os" "os"
"path/filepath" "path/filepath"
"syscall"
"github.com/Sirupsen/logrus" "github.com/Sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/utils"
) )
func newStateTransitionError(from, to containerState) error { func newStateTransitionError(from, to containerState) error {
@ -59,6 +61,7 @@ func runPoststopHooks(c *linuxContainer) error {
Version: c.config.Version, Version: c.config.Version,
ID: c.id, ID: c.id,
Root: c.config.Rootfs, Root: c.config.Rootfs,
BundlePath: utils.SearchLabels(c.config.Labels, "bundle"),
} }
for _, hook := range c.config.Hooks.Poststop { for _, hook := range c.config.Hooks.Poststop {
if err := hook.Run(s); err != nil { if err := hook.Run(s); err != nil {
@ -75,7 +78,7 @@ type stoppedState struct {
} }
func (b *stoppedState) status() Status { func (b *stoppedState) status() Status {
return Destroyed return Stopped
} }
func (b *stoppedState) transition(s containerState) error { func (b *stoppedState) transition(s containerState) error {
@ -108,11 +111,11 @@ func (r *runningState) status() Status {
func (r *runningState) transition(s containerState) error { func (r *runningState) transition(s containerState) error {
switch s.(type) { switch s.(type) {
case *stoppedState: case *stoppedState:
running, err := r.c.isRunning() t, err := r.c.runType()
if err != nil { if err != nil {
return err return err
} }
if running { if t == Running {
return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped) return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped)
} }
r.c.state = s r.c.state = s
@ -127,16 +130,40 @@ func (r *runningState) transition(s containerState) error {
} }
func (r *runningState) destroy() error { func (r *runningState) destroy() error {
running, err := r.c.isRunning() t, err := r.c.runType()
if err != nil { if err != nil {
return err return err
} }
if running { if t == Running {
return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped) return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
} }
return destroy(r.c) return destroy(r.c)
} }
// createdState is the containerState whose status() reports Created.
// It keeps a reference to the linuxContainer whose state it represents.
type createdState struct {
// c is the container this state belongs to.
c *linuxContainer
}
// status reports the Created status for a container in this state.
func (i *createdState) status() Status {
return Created
}
// transition moves the container out of the created state.
//
// A request to move to the created state again is accepted and changes
// nothing. A request to move to the running, paused or stopped state is
// accepted and stores s as the container's current state. Every other target
// state is rejected with a state transition error.
func (i *createdState) transition(s containerState) error {
	switch s.(type) {
	case *createdState:
		// Already in the requested state; nothing to update.
		return nil
	case *runningState, *pausedState, *stoppedState:
		i.c.state = s
		return nil
	default:
		return newStateTransitionError(i, s)
	}
}
// destroy sends SIGKILL to the container's init process and then runs the
// package-level destroy on the container, returning that call's error.
func (i *createdState) destroy() error {
// NOTE(review): whatever signal returns is not inspected here; this looks
// like a deliberate best-effort kill before tearing down — confirm.
i.c.initProcess.signal(syscall.SIGKILL)
return destroy(i.c)
}
// pausedState represents a container that is currently pause. It cannot be destroyed in a // pausedState represents a container that is currently pause. It cannot be destroyed in a
// paused state and must transition back to running first. // paused state and must transition back to running first.
type pausedState struct { type pausedState struct {
@ -159,11 +186,11 @@ func (p *pausedState) transition(s containerState) error {
} }
func (p *pausedState) destroy() error { func (p *pausedState) destroy() error {
isRunning, err := p.c.isRunning() t, err := p.c.runType()
if err != nil { if err != nil {
return err return err
} }
if !isRunning { if t != Running && t != Created {
if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil { if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil {
return err return err
} }
@ -173,7 +200,7 @@ func (p *pausedState) destroy() error {
} }
// restoredState is the same as the running state but also has accociated checkpoint // restoredState is the same as the running state but also has accociated checkpoint
// information that maybe need destroyed when the container is stopped and destory is called. // information that maybe need destroyed when the container is stopped and destroy is called.
type restoredState struct { type restoredState struct {
imageDir string imageDir string
c *linuxContainer c *linuxContainer
@ -202,22 +229,25 @@ func (r *restoredState) destroy() error {
return destroy(r.c) return destroy(r.c)
} }
// createdState is used whenever a container is restored, loaded, or setting additional // loadedState is used whenever a container is restored, loaded, or setting additional
// processes inside and it should not be destroyed when it is exiting. // processes inside and it should not be destroyed when it is exiting.
type createdState struct { type loadedState struct {
c *linuxContainer c *linuxContainer
s Status s Status
} }
func (n *createdState) status() Status { func (n *loadedState) status() Status {
return n.s return n.s
} }
func (n *createdState) transition(s containerState) error { func (n *loadedState) transition(s containerState) error {
n.c.state = s n.c.state = s
return nil return nil
} }
func (n *createdState) destroy() error { func (n *loadedState) destroy() error {
return nil if err := n.c.refreshState(); err != nil {
return err
}
return n.c.state.destroy()
} }

View File

@ -0,0 +1,7 @@
package libcontainer
// Stats is the container statistics structure for this build; at present it
// only carries a list of network interfaces.
// Solaris - TODO
type Stats struct {
// Interfaces holds one entry per network interface.
Interfaces []*NetworkInterface
}

View File

@ -11,6 +11,19 @@ import (
"unsafe" "unsafe"
) )
// PR_SET_CHILD_SUBREAPER is the prctl option number (36) that controls the
// "child subreaper" attribute of the calling process. The option behaves as
// follows:
//
// If arg2 is nonzero, set the "child subreaper" attribute of the
// calling process; if arg2 is zero, unset the attribute. When a
// process is marked as a child subreaper, all of the children
// that it creates, and their descendants, will be marked as
// having a subreaper. In effect, a subreaper fulfills the role
// of init(1) for its descendant processes. Upon termination of
// a process that is orphaned (i.e., its immediate parent has
// already terminated) and marked as having a subreaper, the
// nearest still living ancestor subreaper will receive a SIGCHLD
// signal and be able to wait(2) on the process to discover its
// termination status.
const PR_SET_CHILD_SUBREAPER = 36
type ParentDeathSignal int type ParentDeathSignal int
func (p ParentDeathSignal) Restore() error { func (p ParentDeathSignal) Restore() error {
@ -40,6 +53,14 @@ func Execv(cmd string, args []string, env []string) error {
return syscall.Exec(name, args, env) return syscall.Exec(name, args, env)
} }
// Prlimit invokes the prlimit64 system call for the process pid and the given
// resource, supplying limit as the limit value.
//
// It returns nil when the system call reports no error, and the errno value
// as an error otherwise.
func Prlimit(pid, resource int, limit syscall.Rlimit) error {
	// The address of the by-value copy of limit is passed in both pointer
	// argument slots of the system call, so nothing written through either
	// pointer is visible to the caller. The pointer conversions stay inside
	// the call expression, as required for unsafe.Pointer -> uintptr.
	if _, _, errno := syscall.RawSyscall6(
		syscall.SYS_PRLIMIT64,
		uintptr(pid),
		uintptr(resource),
		uintptr(unsafe.Pointer(&limit)),
		uintptr(unsafe.Pointer(&limit)),
		0,
		0,
	); errno != 0 {
		return errno
	}
	return nil
}
func SetParentDeathSignal(sig uintptr) error { func SetParentDeathSignal(sig uintptr) error {
if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, sig, 0); err != 0 { if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, sig, 0); err != 0 {
return err return err
@ -79,17 +100,12 @@ func Setctty() error {
return nil return nil
} }
/* // RunningInUserNS detects whether we are currently running in a user namespace.
* Detect whether we are currently running in a user namespace. // Copied from github.com/lxc/lxd/shared/util.go
* Copied from github.com/lxc/lxd/shared/util.go
*/
func RunningInUserNS() bool { func RunningInUserNS() bool {
file, err := os.Open("/proc/self/uid_map") file, err := os.Open("/proc/self/uid_map")
if err != nil { if err != nil {
/* // This kernel-provided file only exists if user namespaces are supported
* This kernel-provided file only exists if user namespaces are
* supported
*/
return false return false
} }
defer file.Close() defer file.Close()
@ -112,3 +128,16 @@ func RunningInUserNS() bool {
} }
return true return true
} }
// SetSubreaper applies i as the child-subreaper setting of the calling
// process by issuing Prctl with the PR_SET_CHILD_SUBREAPER option. The error
// from Prctl is returned unchanged.
func SetSubreaper(i int) error {
	setting := uintptr(i)
	return Prctl(PR_SET_CHILD_SUBREAPER, setting, 0, 0, 0)
}
// Prctl issues the prctl system call with the given option and the four
// option-specific arguments.
//
// Only the error indication is surfaced: the values returned by the system
// call are discarded. It returns nil on success and the errno value as an
// error otherwise.
func Prctl(option int, arg2, arg3, arg4, arg5 uintptr) (err error) {
	if _, _, errno := syscall.Syscall6(syscall.SYS_PRCTL, uintptr(option), arg2, arg3, arg4, arg5, 0); errno != 0 {
		return errno
	}
	// Return a literal nil so callers never see a non-nil interface wrapping
	// a zero errno.
	return nil
}

View File

@ -0,0 +1,9 @@
// +build !linux
package system
// RunningInUserNS is a stub for non-Linux systems
// Always returns false
func RunningInUserNS() bool {
return false
}

View File

@ -2,13 +2,15 @@ package user
import ( import (
"errors" "errors"
"fmt"
"syscall" "syscall"
) )
var ( var (
// The current operating system does not provide the required data for user lookups. // The current operating system does not provide the required data for user lookups.
ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data") ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data")
// No matching entries found in file.
ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
ErrNoGroupEntries = errors.New("no matching entries in group file")
) )
func lookupUser(filter func(u User) bool) (User, error) { func lookupUser(filter func(u User) bool) (User, error) {
@ -27,7 +29,7 @@ func lookupUser(filter func(u User) bool) (User, error) {
// No user entries found. // No user entries found.
if len(users) == 0 { if len(users) == 0 {
return User{}, fmt.Errorf("no matching entries in passwd file") return User{}, ErrNoPasswdEntries
} }
// Assume the first entry is the "correct" one. // Assume the first entry is the "correct" one.
@ -75,7 +77,7 @@ func lookupGroup(filter func(g Group) bool) (Group, error) {
// No user entries found. // No user entries found.
if len(groups) == 0 { if len(groups) == 0 {
return Group{}, fmt.Errorf("no matching entries in group file") return Group{}, ErrNoGroupEntries
} }
// Assume the first entry is the "correct" one. // Assume the first entry is the "correct" one.

View File

@ -15,7 +15,7 @@ const (
) )
var ( var (
ErrRange = fmt.Errorf("Uids and gids must be in range %d-%d", minId, maxId) ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minId, maxId)
) )
type User struct { type User struct {
@ -42,29 +42,30 @@ func parseLine(line string, v ...interface{}) {
parts := strings.Split(line, ":") parts := strings.Split(line, ":")
for i, p := range parts { for i, p := range parts {
// Ignore cases where we don't have enough fields to populate the arguments.
// Some configuration files like to misbehave.
if len(v) <= i { if len(v) <= i {
// if we have more "parts" than we have places to put them, bail for great "tolerance" of naughty configuration files
break break
} }
// Use the type of the argument to figure out how to parse it, scanf() style.
// This is legit.
switch e := v[i].(type) { switch e := v[i].(type) {
case *string: case *string:
// "root", "adm", "/bin/bash"
*e = p *e = p
case *int: case *int:
// "0", "4", "1000" // "numbers", with conversion errors ignored because of some misbehaving configuration files.
// ignore string to int conversion errors, for great "tolerance" of naughty configuration files
*e, _ = strconv.Atoi(p) *e, _ = strconv.Atoi(p)
case *[]string: case *[]string:
// "", "root", "root,adm,daemon" // Comma-separated lists.
if p != "" { if p != "" {
*e = strings.Split(p, ",") *e = strings.Split(p, ",")
} else { } else {
*e = []string{} *e = []string{}
} }
default: default:
// panic, because this is a programming/logic error, not a runtime one // Someone goof'd when writing code using this function. Scream so they can hear us.
panic("parseLine expects only pointers! argument " + strconv.Itoa(i) + " is not a pointer!") panic(fmt.Sprintf("parseLine only accepts {*string, *int, *[]string} as arguments! %#v is not a pointer!", e))
} }
} }
} }
@ -106,8 +107,8 @@ func ParsePasswdFilter(r io.Reader, filter func(User) bool) ([]User, error) {
return nil, err return nil, err
} }
text := strings.TrimSpace(s.Text()) line := strings.TrimSpace(s.Text())
if text == "" { if line == "" {
continue continue
} }
@ -117,10 +118,7 @@ func ParsePasswdFilter(r io.Reader, filter func(User) bool) ([]User, error) {
// root:x:0:0:root:/root:/bin/bash // root:x:0:0:root:/root:/bin/bash
// adm:x:3:4:adm:/var/adm:/bin/false // adm:x:3:4:adm:/var/adm:/bin/false
p := User{} p := User{}
parseLine( parseLine(line, &p.Name, &p.Pass, &p.Uid, &p.Gid, &p.Gecos, &p.Home, &p.Shell)
text,
&p.Name, &p.Pass, &p.Uid, &p.Gid, &p.Gecos, &p.Home, &p.Shell,
)
if filter == nil || filter(p) { if filter == nil || filter(p) {
out = append(out, p) out = append(out, p)
@ -135,6 +133,7 @@ func ParseGroupFile(path string) ([]Group, error) {
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer group.Close() defer group.Close()
return ParseGroup(group) return ParseGroup(group)
} }
@ -178,10 +177,7 @@ func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) {
// root:x:0:root // root:x:0:root
// adm:x:4:root,adm,daemon // adm:x:4:root,adm,daemon
p := Group{} p := Group{}
parseLine( parseLine(text, &p.Name, &p.Pass, &p.Gid, &p.List)
text,
&p.Name, &p.Pass, &p.Gid, &p.List,
)
if filter == nil || filter(p) { if filter == nil || filter(p) {
out = append(out, p) out = append(out, p)
@ -192,7 +188,8 @@ func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) {
} }
type ExecUser struct { type ExecUser struct {
Uid, Gid int Uid int
Gid int
Sgids []int Sgids []int
Home string Home string
} }
@ -235,12 +232,12 @@ func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath
// * "uid:gid // * "uid:gid
// * "user:gid" // * "user:gid"
// * "uid:group" // * "uid:group"
//
// It should be noted that if you specify a numeric user or group id, they will
// not be evaluated as usernames (only the metadata will be filled). So attempting
// to parse a user with user.Name = "1337" will produce the user with a UID of
// 1337.
func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (*ExecUser, error) { func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (*ExecUser, error) {
var (
userArg, groupArg string
name string
)
if defaults == nil { if defaults == nil {
defaults = new(ExecUser) defaults = new(ExecUser)
} }
@ -258,87 +255,113 @@ func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (
user.Sgids = []int{} user.Sgids = []int{}
} }
// allow for userArg to have either "user" syntax, or optionally "user:group" syntax // Allow for userArg to have either "user" syntax, or optionally "user:group" syntax
var userArg, groupArg string
parseLine(userSpec, &userArg, &groupArg) parseLine(userSpec, &userArg, &groupArg)
// Convert userArg and groupArg to be numeric, so we don't have to execute
// Atoi *twice* for each iteration over lines.
uidArg, uidErr := strconv.Atoi(userArg)
gidArg, gidErr := strconv.Atoi(groupArg)
// Find the matching user.
users, err := ParsePasswdFilter(passwd, func(u User) bool { users, err := ParsePasswdFilter(passwd, func(u User) bool {
if userArg == "" { if userArg == "" {
// Default to current state of the user.
return u.Uid == user.Uid return u.Uid == user.Uid
} }
return u.Name == userArg || strconv.Itoa(u.Uid) == userArg
if uidErr == nil {
// If the userArg is numeric, always treat it as a UID.
return uidArg == u.Uid
}
return u.Name == userArg
}) })
// If we can't find the user, we have to bail.
if err != nil && passwd != nil { if err != nil && passwd != nil {
if userArg == "" { if userArg == "" {
userArg = strconv.Itoa(user.Uid) userArg = strconv.Itoa(user.Uid)
} }
return nil, fmt.Errorf("Unable to find user %v: %v", userArg, err) return nil, fmt.Errorf("unable to find user %s: %v", userArg, err)
} }
haveUser := users != nil && len(users) > 0 var matchedUserName string
if haveUser { if len(users) > 0 {
// if we found any user entries that matched our filter, let's take the first one as "correct" // First match wins, even if there's more than one matching entry.
name = users[0].Name matchedUserName = users[0].Name
user.Uid = users[0].Uid user.Uid = users[0].Uid
user.Gid = users[0].Gid user.Gid = users[0].Gid
user.Home = users[0].Home user.Home = users[0].Home
} else if userArg != "" { } else if userArg != "" {
// we asked for a user but didn't find them... let's check to see if we wanted a numeric user // If we can't find a user with the given username, the only other valid
user.Uid, err = strconv.Atoi(userArg) // option is if it's a numeric username with no associated entry in passwd.
if err != nil {
// not numeric - we have to bail if uidErr != nil {
return nil, fmt.Errorf("Unable to find user %v", userArg) // Not numeric.
return nil, fmt.Errorf("unable to find user %s: %v", userArg, ErrNoPasswdEntries)
} }
user.Uid = uidArg
// Must be inside valid uid range. // Must be inside valid uid range.
if user.Uid < minId || user.Uid > maxId { if user.Uid < minId || user.Uid > maxId {
return nil, ErrRange return nil, ErrRange
} }
// if userArg couldn't be found in /etc/passwd but is numeric, just roll with it - this is legit // Okay, so it's numeric. We can just roll with this.
} }
if groupArg != "" || name != "" { // On to the groups. If we matched a username, we need to do this because of
// the supplementary group IDs.
if groupArg != "" || matchedUserName != "" {
groups, err := ParseGroupFilter(group, func(g Group) bool { groups, err := ParseGroupFilter(group, func(g Group) bool {
// Explicit group format takes precedence. // If the group argument isn't explicit, we'll just search for it.
if groupArg != "" { if groupArg == "" {
return g.Name == groupArg || strconv.Itoa(g.Gid) == groupArg // Check if user is a member of this group.
}
// Check if user is a member.
for _, u := range g.List { for _, u := range g.List {
if u == name { if u == matchedUserName {
return true return true
} }
} }
return false return false
}
if gidErr == nil {
// If the groupArg is numeric, always treat it as a GID.
return gidArg == g.Gid
}
return g.Name == groupArg
}) })
if err != nil && group != nil { if err != nil && group != nil {
return nil, fmt.Errorf("Unable to find groups for user %v: %v", users[0].Name, err) return nil, fmt.Errorf("unable to find groups for spec %v: %v", matchedUserName, err)
} }
haveGroup := groups != nil && len(groups) > 0 // Only start modifying user.Gid if it is in explicit form.
if groupArg != "" { if groupArg != "" {
if haveGroup { if len(groups) > 0 {
// if we found any group entries that matched our filter, let's take the first one as "correct" // First match wins, even if there's more than one matching entry.
user.Gid = groups[0].Gid user.Gid = groups[0].Gid
} else { } else if groupArg != "" {
// we asked for a group but didn't find id... let's check to see if we wanted a numeric group // If we can't find a group with the given name, the only other valid
user.Gid, err = strconv.Atoi(groupArg) // option is if it's a numeric group name with no associated entry in group.
if err != nil {
// not numeric - we have to bail
return nil, fmt.Errorf("Unable to find group %v", groupArg)
}
// Ensure gid is inside gid range. if gidErr != nil {
// Not numeric.
return nil, fmt.Errorf("unable to find group %s: %v", groupArg, ErrNoGroupEntries)
}
user.Gid = gidArg
// Must be inside valid gid range.
if user.Gid < minId || user.Gid > maxId { if user.Gid < minId || user.Gid > maxId {
return nil, ErrRange return nil, ErrRange
} }
// if groupArg couldn't be found in /etc/group but is numeric, just roll with it - this is legit // Okay, so it's numeric. We can just roll with this.
} }
} else if haveGroup { } else if len(groups) > 0 {
// If implicit group format, fill supplementary gids. // Supplementary group ids only make sense if in the implicit form.
user.Sgids = make([]int, len(groups)) user.Sgids = make([]int, len(groups))
for i, group := range groups { for i, group := range groups {
user.Sgids[i] = group.Gid user.Sgids[i] = group.Gid

View File

@ -5,7 +5,9 @@ import (
"encoding/hex" "encoding/hex"
"encoding/json" "encoding/json"
"io" "io"
"os"
"path/filepath" "path/filepath"
"strings"
"syscall" "syscall"
) )
@ -54,3 +56,66 @@ func WriteJSON(w io.Writer, v interface{}) error {
_, err = w.Write(data) _, err = w.Write(data)
return err return err
} }
// CleanPath normalises path so that it can be safely appended to another
// path with filepath.Join.
//
// The empty string is returned unchanged. An absolute path is simply cleaned.
// A relative path is cleaned, then anchored at the root and cleaned again so
// that any leading ".." components are absorbed, and finally made relative to
// the root once more. The result therefore never climbs out of whatever
// directory it is later joined onto.
//
// The processing is purely lexical: symlinks are not resolved, so a path that
// traverses a symlink is not made safe by CleanPath.
func CleanPath(path string) string {
	if path == "" {
		return ""
	}

	// First pass collapses things like "/../../../" and "a//b/./c".
	cleaned := filepath.Clean(path)
	if filepath.IsAbs(cleaned) {
		// Absolute paths stay absolute; one more Clean mirrors the final
		// normalisation applied to relative paths.
		return filepath.Clean(cleaned)
	}

	// Relative path: pin it under the root so "../.." cannot escape, then
	// strip the root off again.
	root := string(os.PathSeparator)
	anchored := filepath.Clean(root + cleaned)
	// Rel cannot fail here because anchored is, by construction, below root.
	relative, _ := filepath.Rel(root, anchored)
	return filepath.Clean(relative)
}
// SearchLabels looks up query among labels, where each label is expected to
// be a "key=value" pair, and returns the value of the first label whose key
// equals query.
//
// Only the first '=' in a label separates key from value, so values may
// themselves contain '='. Labels without any '=' are ignored. The empty
// string is returned when no label matches.
func SearchLabels(labels []string, query string) string {
	for _, label := range labels {
		sep := strings.Index(label, "=")
		if sep < 0 {
			// Not a key=value pair.
			continue
		}
		if label[:sep] == query {
			return label[sep+1:]
		}
	}
	return ""
}
// Annotations splits a list of "key=value" labels into the bundle path and
// the remaining user-defined annotations.
//
// The value of the "bundle" label is returned as bundle rather than being
// placed in the map, because that label is added by libcontainer itself and
// is not a user annotation. Every other label is stored in userAnnotations
// under its key; when a key repeats, the last occurrence wins. Labels
// without an '=' are skipped. The returned map is never nil.
func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
	userAnnotations = map[string]string{}
	for _, label := range labels {
		sep := strings.Index(label, "=")
		if sep < 0 {
			// Not a key=value pair.
			continue
		}
		key, value := label[:sep], label[sep+1:]
		switch key {
		case "bundle":
			bundle = value
		default:
			userAnnotations[key] = value
		}
	}
	return bundle, userAnnotations
}

View File

@ -109,7 +109,7 @@ func NewClientStream(ctx context.Context, desc *StreamDesc, cc *ClientConn, meth
callHdr := &transport.CallHdr{ callHdr := &transport.CallHdr{
Host: cc.authority, Host: cc.authority,
Method: method, Method: method,
Flush: desc.ServerStreams && desc.ClientStreams, Flush: desc.ServerStreams&&desc.ClientStreams,
} }
if cc.dopts.cp != nil { if cc.dopts.cp != nil {
callHdr.SendCompress = cc.dopts.cp.Type() callHdr.SendCompress = cc.dopts.cp.Type()