bump(github.com/opencontainers/runc/libcontainer): 4d6225ae

This commit is contained in:
Seth Jennings 2017-08-22 16:06:01 -05:00
parent a1e1ede0b2
commit 749f73148b
65 changed files with 3370 additions and 762 deletions

80
Godeps/Godeps.json generated
View File

@ -380,85 +380,99 @@
"Comment": "v2.1.1-5-g1b4ae6f",
"Rev": "1b4ae6fb4e77b095934d4430860ff202060169f8"
},
{
"ImportPath": "github.com/mrunalp/fileutils",
"Rev": "4ee1cc9a80582a0c75febdd5cfa779ee4361cbca"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer",
"Comment": "v1.0.0-rc1-224-g5653ced",
"Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
"Comment": "v1.0.0-rc3-15-g4d6225ae",
"Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/apparmor",
"Comment": "v1.0.0-rc1-224-g5653ced",
"Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
"Comment": "v1.0.0-rc3-15-g4d6225ae",
"Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups",
"Comment": "v1.0.0-rc1-224-g5653ced",
"Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
"Comment": "v1.0.0-rc3-15-g4d6225ae",
"Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/fs",
"Comment": "v1.0.0-rc1-224-g5653ced",
"Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
"Comment": "v1.0.0-rc3-15-g4d6225ae",
"Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/rootless",
"Comment": "v1.0.0-rc3-15-g4d6225ae",
"Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/systemd",
"Comment": "v1.0.0-rc1-224-g5653ced",
"Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
"Comment": "v1.0.0-rc3-15-g4d6225ae",
"Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/configs",
"Comment": "v1.0.0-rc1-224-g5653ced",
"Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
"Comment": "v1.0.0-rc3-15-g4d6225ae",
"Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/configs/validate",
"Comment": "v1.0.0-rc1-224-g5653ced",
"Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
"Comment": "v1.0.0-rc3-15-g4d6225ae",
"Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/criurpc",
"Comment": "v1.0.0-rc1-224-g5653ced",
"Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
"Comment": "v1.0.0-rc3-15-g4d6225ae",
"Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/keys",
"Comment": "v1.0.0-rc1-224-g5653ced",
"Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
"Comment": "v1.0.0-rc3-15-g4d6225ae",
"Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/label",
"Comment": "v1.0.0-rc1-224-g5653ced",
"Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
"Comment": "v1.0.0-rc3-15-g4d6225ae",
"Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/seccomp",
"Comment": "v1.0.0-rc1-224-g5653ced",
"Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
"Comment": "v1.0.0-rc3-15-g4d6225ae",
"Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/selinux",
"Comment": "v1.0.0-rc1-224-g5653ced",
"Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
"Comment": "v1.0.0-rc3-15-g4d6225ae",
"Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/stacktrace",
"Comment": "v1.0.0-rc1-224-g5653ced",
"Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
"Comment": "v1.0.0-rc3-15-g4d6225ae",
"Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/system",
"Comment": "v1.0.0-rc1-224-g5653ced",
"Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
"Comment": "v1.0.0-rc3-15-g4d6225ae",
"Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/user",
"Comment": "v1.0.0-rc1-224-g5653ced",
"Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
"Comment": "v1.0.0-rc3-15-g4d6225ae",
"Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/utils",
"Comment": "v1.0.0-rc1-224-g5653ced",
"Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
"Comment": "v1.0.0-rc3-15-g4d6225ae",
"Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
},
{
"ImportPath": "github.com/opencontainers/runtime-spec/specs-go",
"Comment": "v1.0.0-rc5-67-gf227620",
"Rev": "f2276206b32ad0c2478bfc6440ceb7d51d815cf8"
},
{
"ImportPath": "github.com/pborman/uuid",
@ -513,7 +527,7 @@
},
{
"ImportPath": "github.com/syndtr/gocapability/capability",
"Rev": "2c00daeb6c3b45114c80ac44119e7b8801fdd852"
"Rev": "e7cb7fa329f456b3855136a2642b197bad7366ba"
},
{
"ImportPath": "github.com/vishvananda/netlink",

1
vendor/github.com/mrunalp/fileutils/.gitignore generated vendored Normal file
View File

@ -0,0 +1 @@
/gocp

191
vendor/github.com/mrunalp/fileutils/LICENSE generated vendored Normal file
View File

@ -0,0 +1,191 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
Copyright 2014 Docker, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

1
vendor/github.com/mrunalp/fileutils/MAINTAINERS generated vendored Normal file
View File

@ -0,0 +1 @@
Mrunal Patel <mrunalp@gmail.com> (@mrunalp)

5
vendor/github.com/mrunalp/fileutils/README.md generated vendored Normal file
View File

@ -0,0 +1,5 @@
# fileutils
Collection of utilities for file manipulation in golang
The library is based on docker pkg/archive pkg/idtools but does copies instead of handling archive formats.

161
vendor/github.com/mrunalp/fileutils/fileutils.go generated vendored Normal file
View File

@ -0,0 +1,161 @@
package fileutils
import (
"fmt"
"io"
"os"
"path/filepath"
"syscall"
)
// CopyFile copies the file at source to dest
func CopyFile(source string, dest string) error {
si, err := os.Lstat(source)
if err != nil {
return err
}
st, ok := si.Sys().(*syscall.Stat_t)
if !ok {
return fmt.Errorf("could not convert to syscall.Stat_t")
}
uid := int(st.Uid)
gid := int(st.Gid)
// Handle symlinks
if si.Mode()&os.ModeSymlink != 0 {
target, err := os.Readlink(source)
if err != nil {
return err
}
if err := os.Symlink(target, dest); err != nil {
return err
}
}
// Handle device files
if st.Mode&syscall.S_IFMT == syscall.S_IFBLK || st.Mode&syscall.S_IFMT == syscall.S_IFCHR {
devMajor := int64(major(uint64(st.Rdev)))
devMinor := int64(minor(uint64(st.Rdev)))
mode := uint32(si.Mode() & 07777)
if st.Mode&syscall.S_IFMT == syscall.S_IFBLK {
mode |= syscall.S_IFBLK
}
if st.Mode&syscall.S_IFMT == syscall.S_IFCHR {
mode |= syscall.S_IFCHR
}
if err := syscall.Mknod(dest, mode, int(mkdev(devMajor, devMinor))); err != nil {
return err
}
}
// Handle regular files
if si.Mode().IsRegular() {
sf, err := os.Open(source)
if err != nil {
return err
}
defer sf.Close()
df, err := os.Create(dest)
if err != nil {
return err
}
defer df.Close()
_, err = io.Copy(df, sf)
if err != nil {
return err
}
}
// Chown the file
if err := os.Lchown(dest, uid, gid); err != nil {
return err
}
// Chmod the file
if !(si.Mode()&os.ModeSymlink == os.ModeSymlink) {
if err := os.Chmod(dest, si.Mode()); err != nil {
return err
}
}
return nil
}
// CopyDirectory copies the files under the source directory
// to dest directory. The dest directory is created if it
// does not exist.
func CopyDirectory(source string, dest string) error {
fi, err := os.Stat(source)
if err != nil {
return err
}
// Get owner.
st, ok := fi.Sys().(*syscall.Stat_t)
if !ok {
return fmt.Errorf("could not convert to syscall.Stat_t")
}
// We have to pick an owner here anyway.
if err := MkdirAllNewAs(dest, fi.Mode(), int(st.Uid), int(st.Gid)); err != nil {
return err
}
return filepath.Walk(source, func(path string, info os.FileInfo, err error) error {
if err != nil {
return err
}
// Get the relative path
relPath, err := filepath.Rel(source, path)
if err != nil {
return nil
}
if info.IsDir() {
// Skip the source directory.
if path != source {
// Get the owner.
st, ok := info.Sys().(*syscall.Stat_t)
if !ok {
return fmt.Errorf("could not convert to syscall.Stat_t")
}
uid := int(st.Uid)
gid := int(st.Gid)
if err := os.Mkdir(filepath.Join(dest, relPath), info.Mode()); err != nil {
return err
}
if err := os.Lchown(filepath.Join(dest, relPath), uid, gid); err != nil {
return err
}
}
return nil
}
// Copy the file.
if err := CopyFile(path, filepath.Join(dest, relPath)); err != nil {
return err
}
return nil
})
}
func major(device uint64) uint64 {
return (device >> 8) & 0xfff
}
func minor(device uint64) uint64 {
return (device & 0xff) | ((device >> 12) & 0xfff00)
}
func mkdev(major int64, minor int64) uint32 {
return uint32(((minor & 0xfff00) << 12) | ((major & 0xfff) << 8) | (minor & 0xff))
}

49
vendor/github.com/mrunalp/fileutils/idtools.go generated vendored Normal file
View File

@ -0,0 +1,49 @@
package fileutils
import (
"os"
"path/filepath"
)
// MkdirAllNewAs creates a directory (include any along the path) and then modifies
// ownership ONLY of newly created directories to the requested uid/gid. If the
// directories along the path exist, no change of ownership will be performed
func MkdirAllNewAs(path string, mode os.FileMode, ownerUID, ownerGID int) error {
// make an array containing the original path asked for, plus (for mkAll == true)
// all path components leading up to the complete path that don't exist before we MkdirAll
// so that we can chown all of them properly at the end. If chownExisting is false, we won't
// chown the full directory path if it exists
var paths []string
if _, err := os.Stat(path); err != nil && os.IsNotExist(err) {
paths = []string{path}
} else if err == nil {
// nothing to do; directory path fully exists already
return nil
}
// walk back to "/" looking for directories which do not exist
// and add them to the paths array for chown after creation
dirPath := path
for {
dirPath = filepath.Dir(dirPath)
if dirPath == "/" {
break
}
if _, err := os.Stat(dirPath); err != nil && os.IsNotExist(err) {
paths = append(paths, dirPath)
}
}
if err := os.MkdirAll(path, mode); err != nil && !os.IsExist(err) {
return err
}
// even if it existed, we will chown the requested path + any subpaths that
// didn't exist when we called MkdirAll
for _, pathComponent := range paths {
if err := os.Chown(pathComponent, ownerUID, ownerGID); err != nil {
return err
}
}
return nil
}

View File

@ -1,3 +1,7 @@
# libcontainer
[![GoDoc](https://godoc.org/github.com/opencontainers/runc/libcontainer?status.svg)](https://godoc.org/github.com/opencontainers/runc/libcontainer)
Libcontainer provides a native Go implementation for creating containers
with namespaces, cgroups, capabilities, and filesystem access controls.
It allows you to manage the lifecycle of the container performing additional operations
@ -16,7 +20,14 @@ the current binary (/proc/self/exe) to be executed as the init process, and use
arg "init", we call the first step process "bootstrap", so you always need a "init"
function as the entry of "bootstrap".
In addition to the go init function the early stage bootstrap is handled by importing
[nsenter](https://github.com/opencontainers/runc/blob/master/libcontainer/nsenter/README.md).
```go
import (
_ "github.com/opencontainers/runc/libcontainer/nsenter"
)
func init() {
if len(os.Args) > 1 && os.Args[1] == "init" {
runtime.GOMAXPROCS(1)
@ -83,6 +94,7 @@ config := &configs.Config{
},
MaskPaths: []string{
"/proc/kcore",
"/sys/firmware",
},
ReadonlyPaths: []string{
"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
@ -184,7 +196,7 @@ process := &libcontainer.Process{
Stderr: os.Stderr,
}
err := container.Start(process)
err := container.Run(process)
if err != nil {
container.Destroy()
logrus.Fatal(err)
@ -222,6 +234,12 @@ container.Signal(signal)
// update container resource constraints.
container.Set(config)
// get current status of the container.
status, err := container.Status()
// get current container's state information.
state, err := container.State()
```

View File

@ -7,10 +7,11 @@ import (
"os"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/syndtr/gocapability/capability"
)
const allCapabilityTypes = capability.CAPS | capability.BOUNDS
const allCapabilityTypes = capability.CAPS | capability.BOUNDS | capability.AMBS
var capabilityMap map[string]capability.Cap
@ -30,40 +31,84 @@ func init() {
}
}
func newCapWhitelist(caps []string) (*whitelist, error) {
l := []capability.Cap{}
for _, c := range caps {
func newContainerCapList(capConfig *configs.Capabilities) (*containerCapabilities, error) {
bounding := []capability.Cap{}
for _, c := range capConfig.Bounding {
v, ok := capabilityMap[c]
if !ok {
return nil, fmt.Errorf("unknown capability %q", c)
}
l = append(l, v)
bounding = append(bounding, v)
}
effective := []capability.Cap{}
for _, c := range capConfig.Effective {
v, ok := capabilityMap[c]
if !ok {
return nil, fmt.Errorf("unknown capability %q", c)
}
effective = append(effective, v)
}
inheritable := []capability.Cap{}
for _, c := range capConfig.Inheritable {
v, ok := capabilityMap[c]
if !ok {
return nil, fmt.Errorf("unknown capability %q", c)
}
inheritable = append(inheritable, v)
}
permitted := []capability.Cap{}
for _, c := range capConfig.Permitted {
v, ok := capabilityMap[c]
if !ok {
return nil, fmt.Errorf("unknown capability %q", c)
}
permitted = append(permitted, v)
}
ambient := []capability.Cap{}
for _, c := range capConfig.Ambient {
v, ok := capabilityMap[c]
if !ok {
return nil, fmt.Errorf("unknown capability %q", c)
}
ambient = append(ambient, v)
}
pid, err := capability.NewPid(os.Getpid())
if err != nil {
return nil, err
}
return &whitelist{
keep: l,
return &containerCapabilities{
bounding: bounding,
effective: effective,
inheritable: inheritable,
permitted: permitted,
ambient: ambient,
pid: pid,
}, nil
}
type whitelist struct {
type containerCapabilities struct {
pid capability.Capabilities
keep []capability.Cap
bounding []capability.Cap
effective []capability.Cap
inheritable []capability.Cap
permitted []capability.Cap
ambient []capability.Cap
}
// dropBoundingSet drops the capability bounding set to those specified in the whitelist.
func (w *whitelist) dropBoundingSet() error {
w.pid.Clear(capability.BOUNDS)
w.pid.Set(capability.BOUNDS, w.keep...)
return w.pid.Apply(capability.BOUNDS)
// ApplyBoundingSet sets the capability bounding set to those specified in the whitelist.
func (c *containerCapabilities) ApplyBoundingSet() error {
c.pid.Clear(capability.BOUNDS)
c.pid.Set(capability.BOUNDS, c.bounding...)
return c.pid.Apply(capability.BOUNDS)
}
// drop drops all capabilities for the current process except those specified in the whitelist.
func (w *whitelist) drop() error {
w.pid.Clear(allCapabilityTypes)
w.pid.Set(allCapabilityTypes, w.keep...)
return w.pid.Apply(allCapabilityTypes)
// Apply sets all the capabilities for the current process in the config.
func (c *containerCapabilities) ApplyCaps() error {
c.pid.Clear(allCapabilityTypes)
c.pid.Set(capability.BOUNDS, c.bounding...)
c.pid.Set(capability.PERMITTED, c.permitted...)
c.pid.Set(capability.INHERITABLE, c.inheritable...)
c.pid.Set(capability.EFFECTIVE, c.effective...)
c.pid.Set(capability.AMBIENT, c.ambient...)
return c.pid.Apply(allCapabilityTypes)
}

View File

@ -27,9 +27,9 @@ type Manager interface {
// Destroys the cgroup set
Destroy() error
// NewCgroupManager() and LoadCgroupManager() require following attributes:
// The option func SystemdCgroups() and Cgroupfs() require following attributes:
// Paths map[string]string
// Cgroups *cgroups.Cgroup
// Cgroups *configs.Cgroup
// Paths maps cgroup subsystem to path at which it is mounted.
// Cgroups specifies specific cgroup settings for the various subsystems

View File

@ -114,8 +114,8 @@ func (m *Manager) Apply(pid int) (err error) {
return err
}
m.Paths = make(map[string]string)
if c.Paths != nil {
paths := make(map[string]string)
for name, path := range c.Paths {
_, err := d.path(name)
if err != nil {
@ -124,17 +124,12 @@ func (m *Manager) Apply(pid int) (err error) {
}
return err
}
paths[name] = path
m.Paths[name] = path
}
m.Paths = paths
return cgroups.EnterPid(m.Paths, pid)
}
paths := make(map[string]string)
for _, sys := range subsystems {
if err := sys.Apply(d); err != nil {
return err
}
// TODO: Apply should, ideally, be reentrant or be broken up into a separate
// create and join phase so that the cgroup hierarchy for a container can be
// created then join consists of writing the process pids to cgroup.procs
@ -147,9 +142,12 @@ func (m *Manager) Apply(pid int) (err error) {
}
return err
}
paths[sys.Name()] = p
m.Paths[sys.Name()] = p
if err := sys.Apply(d); err != nil {
return err
}
}
m.Paths = paths
return nil
}
@ -269,25 +267,8 @@ func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
}, nil
}
func (raw *cgroupData) parentPath(subsystem, mountpoint, root string) (string, error) {
// Use GetThisCgroupDir instead of GetInitCgroupDir, because the creating
// process could in container and shared pid namespace with host, and
// /proc/1/cgroup could point to whole other world of cgroups.
initPath, err := cgroups.GetThisCgroupDir(subsystem)
if err != nil {
return "", err
}
// This is needed for nested containers, because in /proc/self/cgroup we
// see pathes from host, which don't exist in container.
relDir, err := filepath.Rel(root, initPath)
if err != nil {
return "", err
}
return filepath.Join(mountpoint, relDir), nil
}
func (raw *cgroupData) path(subsystem string) (string, error) {
mnt, root, err := cgroups.FindCgroupMountpointAndRoot(subsystem)
mnt, err := cgroups.FindCgroupMountpoint(subsystem)
// If we didn't mount the subsystem, there is no point we make the path.
if err != nil {
return "", err
@ -295,11 +276,14 @@ func (raw *cgroupData) path(subsystem string) (string, error) {
// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
if filepath.IsAbs(raw.innerPath) {
// Sometimes subsystems can be mounted togethger as 'cpu,cpuacct'.
// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil
}
parentPath, err := raw.parentPath(subsystem, mnt, root)
// Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating
// process could in container and shared pid namespace with host, and
// /proc/1/cgroup could point to whole other world of cgroups.
parentPath, err := cgroups.GetOwnCgroupPath(subsystem)
if err != nil {
return "", err
}
@ -348,8 +332,8 @@ func removePath(p string, err error) error {
return nil
}
func CheckCpushares(path string, c int64) error {
var cpuShares int64
func CheckCpushares(path string, c uint64) error {
var cpuShares uint64
if c == 0 {
return nil

View File

@ -55,7 +55,7 @@ func (s *CpuGroup) ApplyDir(path string, cgroup *configs.Cgroup, pid int) error
func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.CpuRtPeriod != 0 {
if err := writeFile(path, "cpu.rt_period_us", strconv.FormatInt(cgroup.Resources.CpuRtPeriod, 10)); err != nil {
if err := writeFile(path, "cpu.rt_period_us", strconv.FormatUint(cgroup.Resources.CpuRtPeriod, 10)); err != nil {
return err
}
}
@ -69,12 +69,12 @@ func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error {
func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.CpuShares != 0 {
if err := writeFile(path, "cpu.shares", strconv.FormatInt(cgroup.Resources.CpuShares, 10)); err != nil {
if err := writeFile(path, "cpu.shares", strconv.FormatUint(cgroup.Resources.CpuShares, 10)); err != nil {
return err
}
}
if cgroup.Resources.CpuPeriod != 0 {
if err := writeFile(path, "cpu.cfs_period_us", strconv.FormatInt(cgroup.Resources.CpuPeriod, 10)); err != nil {
if err := writeFile(path, "cpu.cfs_period_us", strconv.FormatUint(cgroup.Resources.CpuPeriod, 10)); err != nil {
return err
}
}

View File

@ -61,9 +61,26 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro
if err != nil {
return err
}
if err := s.ensureParent(dir, root); err != nil {
// 'ensureParent' start with parent because we don't want to
// explicitly inherit from parent, it could conflict with
// 'cpuset.cpu_exclusive'.
if err := s.ensureParent(filepath.Dir(dir), root); err != nil {
return err
}
if err := os.MkdirAll(dir, 0755); err != nil {
return err
}
// We didn't inherit cpuset configs from parent, but we have
// to ensure cpuset configs are set before moving task into the
// cgroup.
// The logic is, if user specified cpuset configs, use these
// specified configs, otherwise, inherit from parent. This makes
// cpuset configs work correctly with 'cpuset.cpu_exclusive', and
// keep backward compatbility.
if err := s.ensureCpusAndMems(dir, cgroup); err != nil {
return err
}
// because we are not using d.join we need to place the pid into the procs file
// unlike the other subsystems
if err := cgroups.WriteCgroupProc(dir, pid); err != nil {
@ -136,3 +153,10 @@ func (s *CpusetGroup) copyIfNeeded(current, parent string) error {
func (s *CpusetGroup) isEmpty(b []byte) bool {
return len(bytes.Trim(b, "\n")) == 0
}
func (s *CpusetGroup) ensureCpusAndMems(path string, cgroup *configs.Cgroup) error {
if err := s.Set(path, cgroup); err != nil {
return err
}
return s.copyIfNeeded(path, filepath.Dir(path))
}

View File

@ -18,6 +18,8 @@ import (
const (
cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes"
cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes"
cgroupMemoryLimit = "memory.limit_in_bytes"
)
type MemoryGroup struct {
@ -31,17 +33,23 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
path, err := d.path("memory")
if err != nil && !cgroups.IsNotFound(err) {
return err
} else if path == "" {
return nil
}
if memoryAssigned(d.config) {
if path != "" {
if _, err := os.Stat(path); os.IsNotExist(err) {
if err := os.MkdirAll(path, 0755); err != nil {
return err
}
}
// Only enable kernel memory accouting when this cgroup
// is created by libcontainer, otherwise we might get
// error when people use `cgroupsPath` to join an existed
// cgroup whose kernel memory is not initialized.
if err := EnableKernelMemoryAccounting(path); err != nil {
return err
}
}
}
defer func() {
if err != nil {
os.RemoveAll(path)
@ -62,18 +70,15 @@ func EnableKernelMemoryAccounting(path string) error {
// We have to limit the kernel memory here as it won't be accounted at all
// until a limit is set on the cgroup and limit cannot be set once the
// cgroup has children, or if there are already tasks in the cgroup.
kernelMemoryLimit := int64(1)
if err := setKernelMemory(path, kernelMemoryLimit); err != nil {
for _, i := range []int64{1, -1} {
if err := setKernelMemory(path, uint64(i)); err != nil {
return err
}
kernelMemoryLimit = int64(-1)
if err := setKernelMemory(path, kernelMemoryLimit); err != nil {
return err
}
return nil
}
func setKernelMemory(path string, kernelMemoryLimit int64) error {
func setKernelMemory(path string, kernelMemoryLimit uint64) error {
if path == "" {
return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit)
}
@ -81,7 +86,7 @@ func setKernelMemory(path string, kernelMemoryLimit int64) error {
// kernel memory is not enabled on the system so we should do nothing
return nil
}
if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil {
if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatUint(kernelMemoryLimit, 10)), 0700); err != nil {
// Check if the error number returned by the syscall is "EBUSY"
// The EBUSY signal is returned on attempts to write to the
// memory.kmem.limit_in_bytes file if the cgroup has children or
@ -99,9 +104,20 @@ func setKernelMemory(path string, kernelMemoryLimit int64) error {
}
func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error {
ulimited := -1
// If the memory update is set to uint64(-1) we should also
// set swap to uint64(-1), it means unlimited memory.
if cgroup.Resources.Memory == uint64(ulimited) {
// Only set swap if it's enbled in kernel
if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) {
cgroup.Resources.MemorySwap = uint64(ulimited)
}
}
// When memory and swap memory are both set, we need to handle the cases
// for updating container.
if cgroup.Resources.Memory != 0 && cgroup.Resources.MemorySwap > 0 {
if cgroup.Resources.Memory != 0 && cgroup.Resources.MemorySwap != 0 {
memoryUsage, err := getMemoryData(path, "")
if err != nil {
return err
@ -110,29 +126,29 @@ func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error {
// When update memory limit, we should adapt the write sequence
// for memory and swap memory, so it won't fail because the new
// value and the old value don't fit kernel's validation.
if memoryUsage.Limit < uint64(cgroup.Resources.MemorySwap) {
if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
if cgroup.Resources.MemorySwap == uint64(ulimited) || memoryUsage.Limit < cgroup.Resources.MemorySwap {
if err := writeFile(path, cgroupMemorySwapLimit, strconv.FormatUint(cgroup.Resources.MemorySwap, 10)); err != nil {
return err
}
if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
if err := writeFile(path, cgroupMemoryLimit, strconv.FormatUint(cgroup.Resources.Memory, 10)); err != nil {
return err
}
} else {
if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
if err := writeFile(path, cgroupMemoryLimit, strconv.FormatUint(cgroup.Resources.Memory, 10)); err != nil {
return err
}
if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
if err := writeFile(path, cgroupMemorySwapLimit, strconv.FormatUint(cgroup.Resources.MemorySwap, 10)); err != nil {
return err
}
}
} else {
if cgroup.Resources.Memory != 0 {
if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
if err := writeFile(path, cgroupMemoryLimit, strconv.FormatUint(cgroup.Resources.Memory, 10)); err != nil {
return err
}
}
if cgroup.Resources.MemorySwap > 0 {
if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
if cgroup.Resources.MemorySwap != 0 {
if err := writeFile(path, cgroupMemorySwapLimit, strconv.FormatUint(cgroup.Resources.MemorySwap, 10)); err != nil {
return err
}
}
@ -153,13 +169,13 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
}
if cgroup.Resources.MemoryReservation != 0 {
if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil {
if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatUint(cgroup.Resources.MemoryReservation, 10)); err != nil {
return err
}
}
if cgroup.Resources.KernelMemoryTCP != 0 {
if err := writeFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemoryTCP, 10)); err != nil {
if err := writeFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatUint(cgroup.Resources.KernelMemoryTCP, 10)); err != nil {
return err
}
}
@ -170,12 +186,12 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
}
if cgroup.Resources.MemorySwappiness == nil || int64(*cgroup.Resources.MemorySwappiness) == -1 {
return nil
} else if int64(*cgroup.Resources.MemorySwappiness) >= 0 && int64(*cgroup.Resources.MemorySwappiness) <= 100 {
if err := writeFile(path, "memory.swappiness", strconv.FormatInt(*cgroup.Resources.MemorySwappiness, 10)); err != nil {
} else if *cgroup.Resources.MemorySwappiness <= 100 {
if err := writeFile(path, "memory.swappiness", strconv.FormatUint(*cgroup.Resources.MemorySwappiness, 10)); err != nil {
return err
}
} else {
return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", int64(*cgroup.Resources.MemorySwappiness))
return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", *cgroup.Resources.MemorySwappiness)
}
return nil
@ -227,6 +243,14 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
}
stats.MemoryStats.KernelTCPUsage = kernelTCPUsage
useHierarchy := strings.Join([]string{"memory", "use_hierarchy"}, ".")
value, err := getCgroupParamUint(path, useHierarchy)
if err != nil {
return err
}
if value == 1 {
stats.MemoryStats.UseHierarchy = true
}
return nil
}
@ -237,7 +261,7 @@ func memoryAssigned(cgroup *configs.Cgroup) bool {
cgroup.Resources.KernelMemory > 0 ||
cgroup.Resources.KernelMemoryTCP > 0 ||
cgroup.Resources.OomKillDisable ||
(cgroup.Resources.MemorySwappiness != nil && *cgroup.Resources.MemorySwappiness != -1)
(cgroup.Resources.MemorySwappiness != nil && int64(*cgroup.Resources.MemorySwappiness) != -1)
}
func getMemoryData(path, name string) (cgroups.MemoryData, error) {

View File

@ -0,0 +1,128 @@
// +build linux
package rootless
import (
"fmt"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/configs/validate"
)
// TODO: This is copied from libcontainer/cgroups/fs, which duplicates this code
// needlessly. We should probably export this list.
var subsystems = []subsystem{
&fs.CpusetGroup{},
&fs.DevicesGroup{},
&fs.MemoryGroup{},
&fs.CpuGroup{},
&fs.CpuacctGroup{},
&fs.PidsGroup{},
&fs.BlkioGroup{},
&fs.HugetlbGroup{},
&fs.NetClsGroup{},
&fs.NetPrioGroup{},
&fs.PerfEventGroup{},
&fs.FreezerGroup{},
&fs.NameGroup{GroupName: "name=systemd"},
}
type subsystem interface {
// Name returns the name of the subsystem.
Name() string
// Returns the stats, as 'stats', corresponding to the cgroup under 'path'.
GetStats(path string, stats *cgroups.Stats) error
}
// The noop cgroup manager is used for rootless containers, because we currently
// cannot manage cgroups if we are in a rootless setup. This manager is chosen
// by factory if we are in rootless mode. We error out if any cgroup options are
// set in the config -- this may change in the future with upcoming kernel features
// like the cgroup namespace.
type Manager struct {
Cgroups *configs.Cgroup
Paths map[string]string
}
func (m *Manager) Apply(pid int) error {
// If there are no cgroup settings, there's nothing to do.
if m.Cgroups == nil {
return nil
}
// We can't set paths.
// TODO(cyphar): Implement the case where the runner of a rootless container
// owns their own cgroup, which would allow us to set up a
// cgroup for each path.
if m.Cgroups.Paths != nil {
return fmt.Errorf("cannot change cgroup path in rootless container")
}
// We load the paths into the manager.
paths := make(map[string]string)
for _, sys := range subsystems {
name := sys.Name()
path, err := cgroups.GetOwnCgroupPath(name)
if err != nil {
// Ignore paths we couldn't resolve.
continue
}
paths[name] = path
}
m.Paths = paths
return nil
}
func (m *Manager) GetPaths() map[string]string {
return m.Paths
}
func (m *Manager) Set(container *configs.Config) error {
// We have to re-do the validation here, since someone might decide to
// update a rootless container.
return validate.New().Validate(container)
}
func (m *Manager) GetPids() ([]int, error) {
dir, err := cgroups.GetOwnCgroupPath("devices")
if err != nil {
return nil, err
}
return cgroups.GetPids(dir)
}
func (m *Manager) GetAllPids() ([]int, error) {
dir, err := cgroups.GetOwnCgroupPath("devices")
if err != nil {
return nil, err
}
return cgroups.GetAllPids(dir)
}
func (m *Manager) GetStats() (*cgroups.Stats, error) {
// TODO(cyphar): We can make this work if we figure out a way to allow usage
// of cgroups with a rootless container. While this doesn't
// actually require write access to a cgroup directory, the
// statistics are not useful if they can be affected by
// non-container processes.
return nil, fmt.Errorf("cannot get cgroup stats in rootless container")
}
func (m *Manager) Freeze(state configs.FreezerState) error {
// TODO(cyphar): We can make this work if we figure out a way to allow usage
// of cgroups with a rootless container.
return fmt.Errorf("cannot use freezer cgroup in rootless container")
}
func (m *Manager) Destroy() error {
// We don't have to do anything here because we didn't do any setup.
return nil
}

View File

@ -51,6 +51,8 @@ type MemoryStats struct {
KernelUsage MemoryData `json:"kernel_usage,omitempty"`
// usage of kernel TCP memory
KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"`
// if true, memory usage is accounted for throughout a hierarchy of cgroups.
UseHierarchy bool `json:"use_hierarchy"`
Stats map[string]uint64 `json:"stats,omitempty"`
}

View File

@ -5,10 +5,8 @@ package systemd
import (
"errors"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
@ -67,12 +65,14 @@ var subsystems = subsystemSet{
const (
testScopeWait = 4
testSliceWait = 4
)
var (
connLock sync.Mutex
theConn *systemdDbus.Conn
hasStartTransientUnit bool
hasStartTransientSliceUnit bool
hasTransientDefaultDependencies bool
hasDelegate bool
)
@ -159,8 +159,36 @@ func UseSystemd() bool {
}
}
// Assume we have the ability to start a transient unit as a slice
// This was broken until systemd v229, but has been back-ported on RHEL environments >= 219
// For details, see: https://bugzilla.redhat.com/show_bug.cgi?id=1370299
hasStartTransientSliceUnit = true
// To ensure simple clean-up, we create a slice off the root with no hierarchy
slice := fmt.Sprintf("libcontainer_%d_systemd_test_default.slice", os.Getpid())
if _, err := theConn.StartTransientUnit(slice, "replace", nil, nil); err != nil {
if _, ok := err.(dbus.Error); ok {
hasStartTransientSliceUnit = false
}
}
for i := 0; i <= testSliceWait; i++ {
if _, err := theConn.StopUnit(slice, "replace", nil); err != nil {
if dbusError, ok := err.(dbus.Error); ok {
if strings.Contains(dbusError.Name, "org.freedesktop.systemd1.NoSuchUnit") {
hasStartTransientSliceUnit = false
break
}
}
} else {
break
}
time.Sleep(time.Millisecond)
}
// Not critical because of the stop unit logic above.
theConn.StopUnit(scope, "replace", nil)
theConn.StopUnit(slice, "replace", nil)
}
return hasStartTransientUnit
}
@ -194,11 +222,24 @@ func (m *Manager) Apply(pid int) error {
slice = c.Parent
}
properties = append(properties,
systemdDbus.PropSlice(slice),
systemdDbus.PropDescription("docker container "+c.Name),
newProp("PIDs", []uint32{uint32(pid)}),
)
properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))
// if we create a slice, the parent is defined via a Wants=
if strings.HasSuffix(unitName, ".slice") {
// This was broken until systemd v229, but has been back-ported on RHEL environments >= 219
if !hasStartTransientSliceUnit {
return fmt.Errorf("systemd version does not support ability to start a slice as transient unit")
}
properties = append(properties, systemdDbus.PropWants(slice))
} else {
// otherwise, we use Slice=
properties = append(properties, systemdDbus.PropSlice(slice))
}
// only add pid if its valid, -1 is used w/ general slice creation.
if pid != -1 {
properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
}
if hasDelegate {
// This is only supported on systemd versions 218 and above.
@ -219,12 +260,19 @@ func (m *Manager) Apply(pid int) error {
if c.Resources.Memory != 0 {
properties = append(properties,
newProp("MemoryLimit", uint64(c.Resources.Memory)))
newProp("MemoryLimit", c.Resources.Memory))
}
if c.Resources.CpuShares != 0 {
properties = append(properties,
newProp("CPUShares", uint64(c.Resources.CpuShares)))
newProp("CPUShares", c.Resources.CpuShares))
}
// cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd.
if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 {
cpuQuotaPerSecUSec := uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod
properties = append(properties,
newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
}
if c.Resources.BlkioWeight != 0 {
@ -240,7 +288,7 @@ func (m *Manager) Apply(pid int) error {
}
}
if _, err := theConn.StartTransientUnit(unitName, "replace", properties, nil); err != nil {
if _, err := theConn.StartTransientUnit(unitName, "replace", properties, nil); err != nil && !isUnitExists(err) {
return err
}
@ -285,15 +333,6 @@ func (m *Manager) GetPaths() map[string]string {
return paths
}
func writeFile(dir, file, data string) error {
// Normally dir should not be empty, one case is that cgroup subsystem
// is not mounted, we will get empty dir, and we want it fail here.
if dir == "" {
return fmt.Errorf("no such directory for %s", file)
}
return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700)
}
func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
path, err := getSubsystemPath(c, subsystem)
if err != nil {
@ -302,10 +341,9 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
if err := os.MkdirAll(path, 0755); err != nil {
return "", err
}
if err := writeFile(path, "cgroup.procs", strconv.Itoa(pid)); err != nil {
if err := cgroups.WriteCgroupProc(path, pid); err != nil {
return "", err
}
return path, nil
}
@ -347,10 +385,10 @@ func joinCgroups(c *configs.Cgroup, pid int) error {
return nil
}
// systemd represents slice heirarchy using `-`, so we need to follow suit when
// systemd represents slice hierarchy using `-`, so we need to follow suit when
// generating the path of slice. Essentially, test-a-b.slice becomes
// test.slice/test-a.slice/test-a-b.slice.
func expandSlice(slice string) (string, error) {
func ExpandSlice(slice string) (string, error) {
suffix := ".slice"
// Name has to end with ".slice", but can't be just ".slice".
if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) {
@ -364,6 +402,10 @@ func expandSlice(slice string) (string, error) {
var path, prefix string
sliceName := strings.TrimSuffix(slice, suffix)
// if input was -.slice, we should just return root now
if sliceName == "-" {
return "/", nil
}
for _, component := range strings.Split(sliceName, "-") {
// test--a.slice isn't permitted, nor is -test.slice.
if component == "" {
@ -384,7 +426,7 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
return "", err
}
initPath, err := cgroups.GetInitCgroupDir(subsystem)
initPath, err := cgroups.GetInitCgroup(subsystem)
if err != nil {
return "", err
}
@ -396,7 +438,7 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
slice = c.Parent
}
slice, err = expandSlice(slice)
slice, err = ExpandSlice(slice)
if err != nil {
return "", err
}
@ -483,7 +525,11 @@ func (m *Manager) Set(container *configs.Config) error {
}
func getUnitName(c *configs.Cgroup) string {
// by default, we create a scope unless the user explicitly asks for a slice.
if !strings.HasSuffix(c.Name, ".slice") {
return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
}
return c.Name
}
func setKernelMemory(c *configs.Cgroup) error {
@ -497,3 +543,13 @@ func setKernelMemory(c *configs.Cgroup) error {
}
return fs.EnableKernelMemoryAccounting(path)
}
// isUnitExists returns true if the error is that a systemd unit already exists.
func isUnitExists(err error) bool {
if err != nil {
if dbusError, ok := err.(dbus.Error); ok {
return strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists")
}
}
return false
}

View File

@ -23,36 +23,14 @@ const (
// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
func FindCgroupMountpoint(subsystem string) (string, error) {
// We are not using mount.GetMounts() because it's super-inefficient,
// parsing it directly sped up x10 times because of not using Sscanf.
// It was one of two major performance drawbacks in container start.
if !isSubsystemAvailable(subsystem) {
return "", NewNotFoundError(subsystem)
}
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return "", err
}
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
txt := scanner.Text()
fields := strings.Split(txt, " ")
for _, opt := range strings.Split(fields[len(fields)-1], ",") {
if opt == subsystem {
return fields[4], nil
}
}
}
if err := scanner.Err(); err != nil {
return "", err
}
return "", NewNotFoundError(subsystem)
mnt, _, err := FindCgroupMountpointAndRoot(subsystem)
return mnt, err
}
func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) {
// We are not using mount.GetMounts() because it's super-inefficient,
// parsing it directly sped up x10 times because of not using Sscanf.
// It was one of two major performance drawbacks in container start.
if !isSubsystemAvailable(subsystem) {
return "", "", NewNotFoundError(subsystem)
}
@ -131,7 +109,7 @@ type Mount struct {
Subsystems []string
}
func (m Mount) GetThisCgroupDir(cgroups map[string]string) (string, error) {
func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
if len(m.Subsystems) == 0 {
return "", fmt.Errorf("no subsystem for mount")
}
@ -149,7 +127,7 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount,
if sepIdx == -1 {
return nil, fmt.Errorf("invalid mountinfo format")
}
if txt[sepIdx+3:sepIdx+9] != "cgroup" {
if txt[sepIdx+3:sepIdx+10] == "cgroup2" || txt[sepIdx+3:sepIdx+9] != "cgroup" {
continue
}
fields := strings.Split(txt, " ")
@ -211,9 +189,6 @@ func GetAllSubsystems() ([]string, error) {
s := bufio.NewScanner(f)
for s.Scan() {
if err := s.Err(); err != nil {
return nil, err
}
text := s.Text()
if text[0] != '#' {
parts := strings.Fields(text)
@ -222,11 +197,14 @@ func GetAllSubsystems() ([]string, error) {
}
}
}
if err := s.Err(); err != nil {
return nil, err
}
return subsystems, nil
}
// GetThisCgroupDir returns the relative path to the cgroup docker is running in.
func GetThisCgroupDir(subsystem string) (string, error) {
// GetOwnCgroup returns the relative path to the cgroup docker is running in.
func GetOwnCgroup(subsystem string) (string, error) {
cgroups, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return "", err
@ -235,8 +213,16 @@ func GetThisCgroupDir(subsystem string) (string, error) {
return getControllerPath(subsystem, cgroups)
}
func GetInitCgroupDir(subsystem string) (string, error) {
func GetOwnCgroupPath(subsystem string) (string, error) {
cgroup, err := GetOwnCgroup(subsystem)
if err != nil {
return "", err
}
return getCgroupPathHelper(subsystem, cgroup)
}
func GetInitCgroup(subsystem string) (string, error) {
cgroups, err := ParseCgroupFile("/proc/1/cgroup")
if err != nil {
return "", err
@ -245,6 +231,31 @@ func GetInitCgroupDir(subsystem string) (string, error) {
return getControllerPath(subsystem, cgroups)
}
func GetInitCgroupPath(subsystem string) (string, error) {
cgroup, err := GetInitCgroup(subsystem)
if err != nil {
return "", err
}
return getCgroupPathHelper(subsystem, cgroup)
}
func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
mnt, root, err := FindCgroupMountpointAndRoot(subsystem)
if err != nil {
return "", err
}
// This is needed for nested containers, because in /proc/self/cgroup we
// see pathes from host, which don't exist in container.
relCgroup, err := filepath.Rel(root, cgroup)
if err != nil {
return "", err
}
return filepath.Join(mnt, relCgroup), nil
}
func readProcsFile(dir string) ([]int, error) {
f, err := os.Open(filepath.Join(dir, CgroupProcesses))
if err != nil {
@ -287,10 +298,6 @@ func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
cgroups := make(map[string]string)
for s.Scan() {
if err := s.Err(); err != nil {
return nil, err
}
text := s.Text()
// from cgroups(7):
// /proc/[pid]/cgroup
@ -307,6 +314,10 @@ func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
cgroups[subs] = parts[2]
}
}
if err := s.Err(); err != nil {
return nil, err
}
return cgroups, nil
}

View File

@ -22,7 +22,7 @@ type Cgroup struct {
// The path is assumed to be relative to the host system cgroup mountpoint.
Path string `json:"path"`
// ScopePrefix decribes prefix for the scope name
// ScopePrefix describes prefix for the scope name
ScopePrefix string `json:"scope_prefix"`
// Paths represent the absolute cgroups paths to join.
@ -45,34 +45,34 @@ type Resources struct {
Devices []*Device `json:"devices"`
// Memory limit (in bytes)
Memory int64 `json:"memory"`
Memory uint64 `json:"memory"`
// Memory reservation or soft_limit (in bytes)
MemoryReservation int64 `json:"memory_reservation"`
MemoryReservation uint64 `json:"memory_reservation"`
// Total memory usage (memory + swap); set `-1` to enable unlimited swap
MemorySwap int64 `json:"memory_swap"`
MemorySwap uint64 `json:"memory_swap"`
// Kernel memory limit (in bytes)
KernelMemory int64 `json:"kernel_memory"`
KernelMemory uint64 `json:"kernel_memory"`
// Kernel memory limit for TCP use (in bytes)
KernelMemoryTCP int64 `json:"kernel_memory_tcp"`
KernelMemoryTCP uint64 `json:"kernel_memory_tcp"`
// CPU shares (relative weight vs. other containers)
CpuShares int64 `json:"cpu_shares"`
CpuShares uint64 `json:"cpu_shares"`
// CPU hardcap limit (in usecs). Allowed cpu time in a given period.
CpuQuota int64 `json:"cpu_quota"`
// CPU period to be used for hardcapping (in usecs). 0 to use system default.
CpuPeriod int64 `json:"cpu_period"`
CpuPeriod uint64 `json:"cpu_period"`
// How many time CPU will use in realtime scheduling (in usecs).
CpuRtRuntime int64 `json:"cpu_rt_quota"`
// CPU period to be used for realtime scheduling (in usecs).
CpuRtPeriod int64 `json:"cpu_rt_period"`
CpuRtPeriod uint64 `json:"cpu_rt_period"`
// CPU to use
CpusetCpus string `json:"cpuset_cpus"`
@ -95,7 +95,7 @@ type Resources struct {
// IO read rate limit per cgroup per device, bytes per second.
BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"`
// IO write rate limit per cgroup per divice, bytes per second.
// IO write rate limit per cgroup per device, bytes per second.
BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"`
// IO read rate limit per cgroup per device, IO per second.
@ -114,7 +114,7 @@ type Resources struct {
OomKillDisable bool `json:"oom_kill_disable"`
// Tuning swappiness behaviour per cgroup
MemorySwappiness *int64 `json:"memory_swappiness"`
MemorySwappiness *uint64 `json:"memory_swappiness"`
// Set priority of network traffic for container
NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"`

View File

@ -8,6 +8,7 @@ import (
"time"
"github.com/Sirupsen/logrus"
"github.com/opencontainers/runtime-spec/specs-go"
)
type Rlimit struct {
@ -85,11 +86,6 @@ type Config struct {
// that the parent process dies.
ParentDeathSignal int `json:"parent_death_signal"`
// PivotDir allows a custom directory inside the container's root filesystem to be used as pivot, when NoPivotRoot is not set.
// When a custom PivotDir not set, a temporary dir inside the root filesystem will be used. The pivot dir needs to be writeable.
// This is required when using read only root filesystems. In these cases, a read/writeable path can be (bind) mounted somewhere inside the root filesystem to act as pivot.
PivotDir string `json:"pivot_dir"`
// Path to a directory containing the container's root filesystem.
Rootfs string `json:"rootfs"`
@ -117,8 +113,8 @@ type Config struct {
Namespaces Namespaces `json:"namespaces"`
// Capabilities specify the capabilities to keep when executing the process inside the container
// All capbilities not specified will be dropped from the processes capability mask
Capabilities []string `json:"capabilities"`
// All capabilities not specified will be dropped from the processes capability mask
Capabilities *Capabilities `json:"capabilities"`
// Networks specifies the container's network setup to be created
Networks []*Network `json:"networks"`
@ -187,6 +183,9 @@ type Config struct {
// NoNewKeyring will not allocated a new session keyring for the container. It will use the
// callers keyring in this case.
NoNewKeyring bool `json:"no_new_keyring"`
// Rootless specifies whether the container is a rootless container.
Rootless bool `json:"rootless"`
}
type Hooks struct {
@ -201,6 +200,19 @@ type Hooks struct {
Poststop []Hook
}
type Capabilities struct {
// Bounding is the set of capabilities checked by the kernel.
Bounding []string
// Effective is the set of capabilities checked by the kernel.
Effective []string
// Inheritable is the capabilities preserved across execve.
Inheritable []string
// Permitted is the limiting superset for effective capabilities.
Permitted []string
// Ambient is the ambient set of capabilities that are kept.
Ambient []string
}
func (hooks *Hooks) UnmarshalJSON(b []byte) error {
var state struct {
Prestart []CommandHook
@ -248,13 +260,7 @@ func (hooks Hooks) MarshalJSON() ([]byte, error) {
}
// HookState is the payload provided to a hook on execution.
type HookState struct {
Version string `json:"ociVersion"`
ID string `json:"id"`
Pid int `json:"pid"`
Root string `json:"root"`
BundlePath string `json:"bundlePath"`
}
type HookState specs.State
type Hook interface {
// Run executes the hook with the provided state.

View File

@ -4,38 +4,50 @@ package configs
import "fmt"
// HostUID gets the root uid for the process on host which could be non-zero
// when user namespaces are enabled.
func (c Config) HostUID() (int, error) {
// HostUID gets the translated uid for the process on host which could be
// different when user namespaces are enabled.
func (c Config) HostUID(containerId int) (int, error) {
if c.Namespaces.Contains(NEWUSER) {
if c.UidMappings == nil {
return -1, fmt.Errorf("User namespaces enabled, but no user mappings found.")
return -1, fmt.Errorf("User namespaces enabled, but no uid mappings found.")
}
id, found := c.hostIDFromMapping(0, c.UidMappings)
id, found := c.hostIDFromMapping(containerId, c.UidMappings)
if !found {
return -1, fmt.Errorf("User namespaces enabled, but no root user mapping found.")
return -1, fmt.Errorf("User namespaces enabled, but no user mapping found.")
}
return id, nil
}
// Return default root uid 0
return 0, nil
// Return unchanged id.
return containerId, nil
}
// HostGID gets the root gid for the process on host which could be non-zero
// HostRootUID gets the root uid for the process on host which could be non-zero
// when user namespaces are enabled.
func (c Config) HostGID() (int, error) {
func (c Config) HostRootUID() (int, error) {
return c.HostUID(0)
}
// HostGID gets the translated gid for the process on host which could be
// different when user namespaces are enabled.
func (c Config) HostGID(containerId int) (int, error) {
if c.Namespaces.Contains(NEWUSER) {
if c.GidMappings == nil {
return -1, fmt.Errorf("User namespaces enabled, but no gid mappings found.")
}
id, found := c.hostIDFromMapping(0, c.GidMappings)
id, found := c.hostIDFromMapping(containerId, c.GidMappings)
if !found {
return -1, fmt.Errorf("User namespaces enabled, but no root group mapping found.")
return -1, fmt.Errorf("User namespaces enabled, but no group mapping found.")
}
return id, nil
}
// Return default root gid 0
return 0, nil
// Return unchanged id.
return containerId, nil
}
// HostRootGID gets the root gid for the process on host which could be non-zero
// when user namespaces are enabled.
func (c Config) HostRootGID() (int, error) {
return c.HostGID(0)
}
// Utility function that gets a host ID for a container ID from user namespace map

View File

@ -1,5 +1,11 @@
package configs
const (
// EXT_COPYUP is a directive to copy up the contents of a directory when
// a tmpfs is mounted over it.
EXT_COPYUP = 1 << iota
)
type Mount struct {
// Source path for the mount.
Source string `json:"source"`
@ -22,6 +28,9 @@ type Mount struct {
// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
Relabel string `json:"relabel"`
// Extensions are additional flags that are specific to runc.
Extensions int `json:"extensions"`
// Optional Command to be run before Source is mounted.
PremountCmds []Command `json:"premount_cmds"`

View File

@ -4,12 +4,10 @@ package configs
func (n *Namespace) Syscall() int {
panic("No namespace syscall support")
return 0
}
// CloneFlags parses the container's Namespaces options to set the correct
// flags on clone, unshare. This function returns flags only for new namespaces.
func (n *Namespaces) CloneFlags() uintptr {
panic("No namespace syscall support")
return uintptr(0)
}

View File

@ -22,8 +22,8 @@ var (
supportedNamespaces = make(map[NamespaceType]bool)
)
// nsToFile converts the namespace type to its filename
func nsToFile(ns NamespaceType) string {
// NsName converts the namespace type to its filename
func NsName(ns NamespaceType) string {
switch ns {
case NEWNET:
return "net"
@ -50,7 +50,7 @@ func IsNamespaceSupported(ns NamespaceType) bool {
if ok {
return supported
}
nsFile := nsToFile(ns)
nsFile := NsName(ns)
// if the namespace type is unknown, just return false
if nsFile == "" {
return false
@ -84,7 +84,7 @@ func (n *Namespace) GetPath(pid int) string {
if n.Path != "" {
return n.Path
}
return fmt.Sprintf("/proc/%d/ns/%s", pid, nsToFile(n.Type))
return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type))
}
func (n *Namespaces) Remove(t NamespaceType) bool {

View File

@ -0,0 +1,117 @@
package validate
import (
"fmt"
"os"
"reflect"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
)
var (
geteuid = os.Geteuid
getegid = os.Getegid
)
func (v *ConfigValidator) rootless(config *configs.Config) error {
if err := rootlessMappings(config); err != nil {
return err
}
if err := rootlessMount(config); err != nil {
return err
}
// Currently, cgroups cannot effectively be used in rootless containers.
// The new cgroup namespace doesn't really help us either because it doesn't
// have nice interactions with the user namespace (we're working with upstream
// to fix this).
if err := rootlessCgroup(config); err != nil {
return err
}
// XXX: We currently can't verify the user config at all, because
// configs.Config doesn't store the user-related configs. So this
// has to be verified by setupUser() in init_linux.go.
return nil
}
func rootlessMappings(config *configs.Config) error {
rootuid, err := config.HostRootUID()
if err != nil {
return fmt.Errorf("failed to get root uid from uidMappings: %v", err)
}
if euid := geteuid(); euid != 0 {
if !config.Namespaces.Contains(configs.NEWUSER) {
return fmt.Errorf("rootless containers require user namespaces")
}
if rootuid != euid {
return fmt.Errorf("rootless containers cannot map container root to a different host user")
}
}
rootgid, err := config.HostRootGID()
if err != nil {
return fmt.Errorf("failed to get root gid from gidMappings: %v", err)
}
// Similar to the above test, we need to make sure that we aren't trying to
// map to a group ID that we don't have the right to be.
if rootgid != getegid() {
return fmt.Errorf("rootless containers cannot map container root to a different host group")
}
// We can only map one user and group inside a container (our own).
if len(config.UidMappings) != 1 || config.UidMappings[0].Size != 1 {
return fmt.Errorf("rootless containers cannot map more than one user")
}
if len(config.GidMappings) != 1 || config.GidMappings[0].Size != 1 {
return fmt.Errorf("rootless containers cannot map more than one group")
}
return nil
}
// cgroup verifies that the user isn't trying to set any cgroup limits or paths.
func rootlessCgroup(config *configs.Config) error {
// Nothing set at all.
if config.Cgroups == nil || config.Cgroups.Resources == nil {
return nil
}
// Used for comparing to the zero value.
left := reflect.ValueOf(*config.Cgroups.Resources)
right := reflect.Zero(left.Type())
// This is all we need to do, since specconv won't add cgroup options in
// rootless mode.
if !reflect.DeepEqual(left.Interface(), right.Interface()) {
return fmt.Errorf("cannot specify resource limits in rootless container")
}
return nil
}
// mount verifies that the user isn't trying to set up any mounts they don't have
// the rights to do. In addition, it makes sure that no mount has a `uid=` or
// `gid=` option that doesn't resolve to root.
func rootlessMount(config *configs.Config) error {
// XXX: We could whitelist allowed devices at this point, but I'm not
// convinced that's a good idea. The kernel is the best arbiter of
// access control.
for _, mount := range config.Mounts {
// Check that the options list doesn't contain any uid= or gid= entries
// that don't resolve to root.
for _, opt := range strings.Split(mount.Data, ",") {
if strings.HasPrefix(opt, "uid=") && opt != "uid=0" {
return fmt.Errorf("cannot specify uid= mount options in rootless containers where argument isn't 0")
}
if strings.HasPrefix(opt, "gid=") && opt != "gid=0" {
return fmt.Errorf("cannot specify gid= mount options in rootless containers where argument isn't 0")
}
}
}
return nil
}

View File

@ -40,12 +40,23 @@ func (v *ConfigValidator) Validate(config *configs.Config) error {
if err := v.sysctl(config); err != nil {
return err
}
if config.Rootless {
if err := v.rootless(config); err != nil {
return err
}
}
return nil
}
// rootfs validates if the rootfs is an absolute path and is not a symlink
// to the container's root filesystem.
func (v *ConfigValidator) rootfs(config *configs.Config) error {
if _, err := os.Stat(config.Rootfs); err != nil {
if os.IsNotExist(err) {
return fmt.Errorf("rootfs (%s) does not exist", config.Rootfs)
}
return err
}
cleaned, err := filepath.Abs(config.Rootfs)
if err != nil {
return err
@ -126,6 +137,11 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error {
}
if strings.HasPrefix(s, "net.") {
if config.Namespaces.Contains(configs.NEWNET) {
if path := config.Namespaces.PathOf(configs.NEWNET); path != "" {
if err := checkHostNs(s, path); err != nil {
return err
}
}
continue
} else {
return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s)
@ -136,3 +152,44 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error {
return nil
}
func isSymbolicLink(path string) (bool, error) {
fi, err := os.Lstat(path)
if err != nil {
return false, err
}
return fi.Mode()&os.ModeSymlink == os.ModeSymlink, nil
}
// checkHostNs checks whether network sysctl is used in host namespace.
func checkHostNs(sysctlConfig string, path string) error {
var currentProcessNetns = "/proc/self/ns/net"
// readlink on the current processes network namespace
destOfCurrentProcess, err := os.Readlink(currentProcessNetns)
if err != nil {
return fmt.Errorf("read soft link %q error", currentProcessNetns)
}
// First check if the provided path is a symbolic link
symLink, err := isSymbolicLink(path)
if err != nil {
return fmt.Errorf("could not check that %q is a symlink: %v", path, err)
}
if symLink == false {
// The provided namespace is not a symbolic link,
// it is not the host namespace.
return nil
}
// readlink on the path provided in the struct
destOfContainer, err := os.Readlink(path)
if err != nil {
return fmt.Errorf("read soft link %q error", path)
}
if destOfContainer == destOfCurrentProcess {
return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", sysctlConfig)
}
return nil
}

View File

@ -1,15 +1,17 @@
package libcontainer
import "io"
import (
"io"
"os"
)
// Console represents a pseudo TTY.
type Console interface {
io.ReadWriter
io.Closer
io.ReadWriteCloser
// Path returns the filesystem path to the slave side of the pty.
Path() string
// Fd returns the fd for the master of the pty.
Fd() uintptr
File() *os.File
}

View File

@ -6,8 +6,8 @@ import (
"errors"
)
// NewConsole returns an initalized console that can be used within a container by copying bytes
// newConsole returns an initialized console that can be used within a container by copying bytes
// from the master side to the slave that is attached as the tty for the container's init process.
func NewConsole(uid, gid int) (Console, error) {
func newConsole() (Console, error) {
return nil, errors.New("libcontainer console is not supported on FreeBSD")
}

View File

@ -3,20 +3,26 @@ package libcontainer
import (
"fmt"
"os"
"path/filepath"
"syscall"
"unsafe"
"github.com/opencontainers/runc/libcontainer/label"
)
// NewConsole returns an initalized console that can be used within a container by copying bytes
func ConsoleFromFile(f *os.File) Console {
return &linuxConsole{
master: f,
}
}
// newConsole returns an initialized console that can be used within a container by copying bytes
// from the master side to the slave that is attached as the tty for the container's init process.
func NewConsole(uid, gid int) (Console, error) {
func newConsole() (Console, error) {
master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0)
if err != nil {
return nil, err
}
if err := saneTerminal(master); err != nil {
return nil, err
}
console, err := ptsname(master)
if err != nil {
return nil, err
@ -24,34 +30,20 @@ func NewConsole(uid, gid int) (Console, error) {
if err := unlockpt(master); err != nil {
return nil, err
}
if err := os.Chmod(console, 0600); err != nil {
return nil, err
}
if err := os.Chown(console, uid, gid); err != nil {
return nil, err
}
return &linuxConsole{
slavePath: console,
master: master,
}, nil
}
// newConsoleFromPath is an internal function returning an initialized console for use inside
// a container's MNT namespace.
func newConsoleFromPath(slavePath string) *linuxConsole {
return &linuxConsole{
slavePath: slavePath,
}
}
// linuxConsole is a linux psuedo TTY for use within a container.
// linuxConsole is a linux pseudo TTY for use within a container.
type linuxConsole struct {
master *os.File
slavePath string
}
func (c *linuxConsole) Fd() uintptr {
return c.master.Fd()
func (c *linuxConsole) File() *os.File {
return c.master
}
func (c *linuxConsole) Path() string {
@ -75,21 +67,17 @@ func (c *linuxConsole) Close() error {
// mount initializes the console inside the rootfs mounting with the specified mount label
// and applying the correct ownership of the console.
func (c *linuxConsole) mount(rootfs, mountLabel string) error {
func (c *linuxConsole) mount() error {
oldMask := syscall.Umask(0000)
defer syscall.Umask(oldMask)
if err := label.SetFileLabel(c.slavePath, mountLabel); err != nil {
return err
}
dest := filepath.Join(rootfs, "/dev/console")
f, err := os.Create(dest)
f, err := os.Create("/dev/console")
if err != nil && !os.IsExist(err) {
return err
}
if f != nil {
f.Close()
}
return syscall.Mount(c.slavePath, dest, "bind", syscall.MS_BIND, "")
return syscall.Mount(c.slavePath, "/dev/console", "bind", syscall.MS_BIND, "")
}
// dupStdio opens the slavePath for the console and dups the fds to the current
@ -143,3 +131,26 @@ func ptsname(f *os.File) (string, error) {
}
return fmt.Sprintf("/dev/pts/%d", n), nil
}
// saneTerminal sets the necessary tty_ioctl(4)s to ensure that a pty pair
// created by us acts normally. In particular, a not-very-well-known default of
// Linux unix98 ptys is that they have +onlcr by default. While this isn't a
// problem for terminal emulators, because we relay data from the terminal we
// also relay that funky line discipline.
func saneTerminal(terminal *os.File) error {
// Go doesn't have a wrapper for any of the termios ioctls.
var termios syscall.Termios
if err := ioctl(terminal.Fd(), syscall.TCGETS, uintptr(unsafe.Pointer(&termios))); err != nil {
return fmt.Errorf("ioctl(tty, tcgets): %s", err.Error())
}
// Set -onlcr so we don't have to deal with \r.
termios.Oflag &^= syscall.ONLCR
if err := ioctl(terminal.Fd(), syscall.TCSETS, uintptr(unsafe.Pointer(&termios))); err != nil {
return fmt.Errorf("ioctl(tty, tcsets): %s", err.Error())
}
return nil
}

View File

@ -4,8 +4,8 @@ import (
"errors"
)
// NewConsole returns an initalized console that can be used within a container by copying bytes
// newConsole returns an initialized console that can be used within a container by copying bytes
// from the master side to the slave that is attached as the tty for the container's init process.
func NewConsole(uid, gid int) (Console, error) {
func newConsole() (Console, error) {
return nil, errors.New("libcontainer console is not supported on Solaris")
}

View File

@ -1,11 +1,11 @@
package libcontainer
// NewConsole returns an initalized console that can be used within a container
func NewConsole(uid, gid int) (Console, error) {
// newConsole returns an initialized console that can be used within a container
func newConsole() (Console, error) {
return &windowsConsole{}, nil
}
// windowsConsole is a Windows psuedo TTY for use within a container.
// windowsConsole is a Windows pseudo TTY for use within a container.
type windowsConsole struct {
}

View File

@ -123,7 +123,7 @@ type BaseContainer interface {
// SystemError - System error.
Start(process *Process) (err error)
// Run immediatly starts the process inside the conatiner. Returns error if process
// Run immediately starts the process inside the container. Returns error if process
// fails to start. It does not block waiting for the exec fifo after start returns but
// opens the fifo after start returns.
//
@ -134,20 +134,29 @@ type BaseContainer interface {
// SystemError - System error.
Run(process *Process) (err error)
// Destroys the container after killing all running processes.
// Destroys the container, if its in a valid state, after killing any
// remaining running processes.
//
// Any event registrations are removed before the container is destroyed.
// No error is returned if the container is already destroyed.
//
// Running containers must first be stopped using Signal(..).
// Paused containers must first be resumed using Resume(..).
//
// errors:
// ContainerNotStopped - Container is still running,
// ContainerPaused - Container is paused,
// SystemError - System error.
Destroy() error
// Signal sends the provided signal code to the container's initial process.
//
// If all is specified the signal is sent to all processes in the container
// including the initial process.
//
// errors:
// SystemError - System error.
Signal(s os.Signal) error
Signal(s os.Signal, all bool) error
// Exec signals the container to exec the users process at the end of the init.
//

View File

@ -51,6 +51,9 @@ type State struct {
// Platform specific fields below here
// Specifies if the container was started under the rootless mode.
Rootless bool `json:"rootless"`
// Path to all the cgroups setup for a container. Key is cgroup subsystem name
// with the value as the path.
CgroupPaths map[string]string `json:"cgroup_paths"`
@ -191,17 +194,29 @@ func (c *linuxContainer) Start(process *Process) error {
if err != nil {
return err
}
return c.start(process, status == Stopped)
if status == Stopped {
if err := c.createExecFifo(); err != nil {
return err
}
}
if err := c.start(process, status == Stopped); err != nil {
if status == Stopped {
c.deleteExecFifo()
}
return err
}
return nil
}
func (c *linuxContainer) Run(process *Process) error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
c.m.Unlock()
return err
}
if err := c.start(process, status == Stopped); err != nil {
c.m.Unlock()
if err := c.Start(process); err != nil {
return err
}
if status == Stopped {
@ -266,8 +281,7 @@ func (c *linuxContainer) start(process *Process, isInit bool) error {
Version: c.config.Version,
ID: c.id,
Pid: parent.pid(),
Root: c.config.Rootfs,
BundlePath: utils.SearchLabels(c.config.Labels, "bundle"),
Bundle: utils.SearchLabels(c.config.Labels, "bundle"),
}
for i, hook := range c.config.Hooks.Poststart {
if err := hook.Run(s); err != nil {
@ -282,33 +296,75 @@ func (c *linuxContainer) start(process *Process, isInit bool) error {
return nil
}
func (c *linuxContainer) Signal(s os.Signal) error {
func (c *linuxContainer) Signal(s os.Signal, all bool) error {
if all {
return signalAllProcesses(c.cgroupManager, s)
}
if err := c.initProcess.signal(s); err != nil {
return newSystemErrorWithCause(err, "signaling init process")
}
return nil
}
func (c *linuxContainer) createExecFifo() error {
rootuid, err := c.Config().HostRootUID()
if err != nil {
return err
}
rootgid, err := c.Config().HostRootGID()
if err != nil {
return err
}
fifoName := filepath.Join(c.root, execFifoFilename)
if _, err := os.Stat(fifoName); err == nil {
return fmt.Errorf("exec fifo %s already exists", fifoName)
}
oldMask := syscall.Umask(0000)
if err := syscall.Mkfifo(fifoName, 0622); err != nil {
syscall.Umask(oldMask)
return err
}
syscall.Umask(oldMask)
if err := os.Chown(fifoName, rootuid, rootgid); err != nil {
return err
}
return nil
}
func (c *linuxContainer) deleteExecFifo() {
fifoName := filepath.Join(c.root, execFifoFilename)
os.Remove(fifoName)
}
func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) {
parentPipe, childPipe, err := newPipe()
parentPipe, childPipe, err := utils.NewSockPair("init")
if err != nil {
return nil, newSystemErrorWithCause(err, "creating new init pipe")
}
rootDir, err := os.Open(c.root)
if err != nil {
return nil, err
}
cmd, err := c.commandTemplate(p, childPipe, rootDir)
cmd, err := c.commandTemplate(p, childPipe)
if err != nil {
return nil, newSystemErrorWithCause(err, "creating new command template")
}
if !doInit {
return c.newSetnsProcess(p, cmd, parentPipe, childPipe, rootDir)
return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
}
// We only set up rootDir if we're not doing a `runc exec`. The reason for
// this is to avoid cases where a racing, unprivileged process inside the
// container can get access to the statedir file descriptor (which would
// allow for container rootfs escape).
rootDir, err := os.Open(c.root)
if err != nil {
return nil, err
}
cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir)
cmd.Env = append(cmd.Env,
fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)
}
func (c *linuxContainer) commandTemplate(p *Process, childPipe, rootDir *os.File) (*exec.Cmd, error) {
func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
cmd := exec.Command(c.initArgs[0], c.initArgs[1:]...)
cmd.Stdin = p.Stdin
cmd.Stdout = p.Stdout
@ -317,10 +373,17 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe, rootDir *os.File
if cmd.SysProcAttr == nil {
cmd.SysProcAttr = &syscall.SysProcAttr{}
}
cmd.ExtraFiles = append(p.ExtraFiles, childPipe, rootDir)
cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
if p.ConsoleSocket != nil {
cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
cmd.Env = append(cmd.Env,
fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-2),
fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
fmt.Sprintf("_LIBCONTAINER_CONSOLE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
)
}
cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe)
cmd.Env = append(cmd.Env,
fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
)
// NOTE: when running a container with no PID namespace and the parent process spawning the container is
// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
// even with the parent still running.
@ -339,7 +402,7 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c
}
}
_, sharePidns := nsMaps[configs.NEWPID]
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, "")
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
if err != nil {
return nil, err
}
@ -357,19 +420,18 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c
}, nil
}
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*setnsProcess, error) {
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
state, err := c.currentState()
if err != nil {
return nil, newSystemErrorWithCause(err, "getting container's current state")
}
// for setns process, we dont have to set cloneflags as the process namespaces
// for setns process, we don't have to set cloneflags as the process namespaces
// will only be set via setns syscall
data, err := c.bootstrapData(0, state.NamespacePaths, p.consolePath)
data, err := c.bootstrapData(0, state.NamespacePaths)
if err != nil {
return nil, err
}
// TODO: set on container for process management
return &setnsProcess{
cmd: cmd,
cgroupPaths: c.cgroupManager.GetPaths(),
@ -378,7 +440,6 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe,
config: c.newInitConfig(p),
process: p,
bootstrapData: data,
rootDir: rootDir,
}, nil
}
@ -390,15 +451,14 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
User: process.User,
AdditionalGroups: process.AdditionalGroups,
Cwd: process.Cwd,
Console: process.consolePath,
Capabilities: process.Capabilities,
PassedFilesCount: len(process.ExtraFiles),
ContainerId: c.ID(),
NoNewPrivileges: c.config.NoNewPrivileges,
Rootless: c.config.Rootless,
AppArmorProfile: c.config.AppArmorProfile,
ProcessLabel: c.config.ProcessLabel,
Rlimits: c.config.Rlimits,
ExecFifoPath: filepath.Join(c.root, execFifoFilename),
}
if process.NoNewPrivileges != nil {
cfg.NoNewPrivileges = *process.NoNewPrivileges
@ -412,17 +472,10 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
if len(process.Rlimits) > 0 {
cfg.Rlimits = process.Rlimits
}
cfg.CreateConsole = process.ConsoleSocket != nil
return cfg
}
func newPipe() (parent *os.File, child *os.File, err error) {
fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0)
if err != nil {
return nil, nil, err
}
return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil
}
func (c *linuxContainer) Destroy() error {
c.m.Lock()
defer c.m.Unlock()
@ -467,10 +520,18 @@ func (c *linuxContainer) Resume() error {
}
func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
// XXX(cyphar): This requires cgroups.
if c.config.Rootless {
return nil, fmt.Errorf("cannot get OOM notifications from rootless container")
}
return notifyOnOOM(c.cgroupManager.GetPaths())
}
func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
// XXX(cyphar): This requires cgroups.
if c.config.Rootless {
return nil, fmt.Errorf("cannot get memory pressure notifications from rootless container")
}
return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
}
@ -546,10 +607,40 @@ func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
}
func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error {
for _, path := range c.config.MaskPaths {
fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path))
if err != nil {
if os.IsNotExist(err) {
continue
}
return err
}
if fi.IsDir() {
continue
}
extMnt := &criurpc.ExtMountMap{
Key: proto.String(path),
Val: proto.String("/dev/null"),
}
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
}
return nil
}
func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
c.m.Lock()
defer c.m.Unlock()
// TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has
// support for doing unprivileged dumps, but the setup of
// rootless containers might make this complicated.
if c.config.Rootless {
return fmt.Errorf("cannot checkpoint a rootless container")
}
if err := c.checkCriuVersion("1.5.2"); err != nil {
return err
}
@ -609,6 +700,12 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
}
}
//pre-dump may need parentImage param to complete iterative migration
if criuOpts.ParentImage != "" {
rpcOpts.ParentImg = proto.String(criuOpts.ParentImage)
rpcOpts.TrackMem = proto.Bool(true)
}
// append optional manage cgroups mode
if criuOpts.ManageCgroupsMode != 0 {
if err := c.checkCriuVersion("1.7"); err != nil {
@ -618,12 +715,19 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
rpcOpts.ManageCgroupsMode = &mode
}
t := criurpc.CriuReqType_DUMP
var t criurpc.CriuReqType
if criuOpts.PreDump {
t = criurpc.CriuReqType_PRE_DUMP
} else {
t = criurpc.CriuReqType_DUMP
}
req := &criurpc.CriuReq{
Type: &t,
Opts: &rpcOpts,
}
//no need to dump these information in pre-dump
if !criuOpts.PreDump {
for _, m := range c.config.Mounts {
switch m.Device {
case "bind":
@ -641,8 +745,16 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
}
}
// Write the FD info to a file in the image directory
if err := c.addMaskPaths(req); err != nil {
return err
}
for _, node := range c.config.Devices {
m := &configs.Mount{Destination: node.Path, Source: node.Path}
c.addCriuDumpMount(req, m)
}
// Write the FD info to a file in the image directory
fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors())
if err != nil {
return err
@ -652,6 +764,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
if err != nil {
return err
}
}
err = c.criuSwrk(nil, req, criuOpts, false)
if err != nil {
@ -697,6 +810,13 @@ func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts
func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
c.m.Lock()
defer c.m.Unlock()
// TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have
// support for unprivileged restore at the moment.
if c.config.Rootless {
return fmt.Errorf("cannot restore a rootless container")
}
if err := c.checkCriuVersion("1.5.2"); err != nil {
return err
}
@ -778,6 +898,16 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
}
}
if len(c.config.MaskPaths) > 0 {
m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"}
c.addCriuRestoreMount(req, m)
}
for _, node := range c.config.Devices {
m := &configs.Mount{Destination: node.Path, Source: node.Path}
c.addCriuRestoreMount(req, m)
}
if criuOpts.EmptyNs&syscall.CLONE_NEWNET == 0 {
c.restoreNetwork(req, criuOpts)
}
@ -814,6 +944,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
}
func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
// XXX: Do we need to deal with this case? AFAIK criu still requires root.
if err := c.cgroupManager.Apply(pid); err != nil {
return err
}
@ -953,6 +1084,23 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *
case t == criurpc.CriuReqType_RESTORE:
case t == criurpc.CriuReqType_DUMP:
break
case t == criurpc.CriuReqType_PRE_DUMP:
// In pre-dump mode CRIU is in a loop and waits for
// the final DUMP command.
// The current runc pre-dump approach, however, is
// start criu in PRE_DUMP once for a single pre-dump
// and not the whole series of pre-dump, pre-dump, ...m, dump
// If we got the message CriuReqType_PRE_DUMP it means
// CRIU was successful and we need to forcefully stop CRIU
logrus.Debugf("PRE_DUMP finished. Send close signal to CRIU service")
criuClient.Close()
// Process status won't be success, because one end of sockets is closed
_, err := cmd.Process.Wait()
if err != nil {
logrus.Debugf("After PRE_DUMP CRIU exiting failed")
return err
}
return nil
default:
return fmt.Errorf("unable to parse the response %s", resp.String())
}
@ -1026,7 +1174,7 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
Version: c.config.Version,
ID: c.id,
Pid: int(notify.GetPid()),
Root: c.config.Rootfs,
Bundle: utils.SearchLabels(c.config.Labels, "bundle"),
}
for i, hook := range c.config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
@ -1154,14 +1302,9 @@ func (c *linuxContainer) runType() (Status, error) {
if !exist || err != nil {
return Stopped, err
}
// check if the process that is running is the init process or the user's process.
// this is the difference between the container Running and Created.
environ, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/environ", pid))
if err != nil {
return Stopped, newSystemErrorWithCausef(err, "reading /proc/%d/environ", pid)
}
check := []byte("_LIBCONTAINER")
if bytes.Contains(environ, check) {
// We'll create exec fifo and blocking on it after container is created,
// and delete it after start container.
if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil {
return Created, nil
}
return Running, nil
@ -1198,6 +1341,7 @@ func (c *linuxContainer) currentState() (*State, error) {
InitProcessStartTime: startTime,
Created: c.created,
},
Rootless: c.config.Rootless,
CgroupPaths: c.cgroupManager.GetPaths(),
NamespacePaths: make(map[configs.NamespaceType]string),
ExternalDescriptors: externalDescriptors,
@ -1223,16 +1367,22 @@ func (c *linuxContainer) currentState() (*State, error) {
// can setns in order.
func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
paths := []string{}
nsTypes := []configs.NamespaceType{
order := []configs.NamespaceType{
// The user namespace *must* be done first.
configs.NEWUSER,
configs.NEWIPC,
configs.NEWUTS,
configs.NEWNET,
configs.NEWPID,
configs.NEWNS,
}
// join userns if the init process explicitly requires NEWUSER
if c.config.Namespaces.Contains(configs.NEWUSER) {
nsTypes = append(nsTypes, configs.NEWUSER)
// Remove namespaces that we don't need to join.
var nsTypes []configs.NamespaceType
for _, ns := range order {
if c.config.Namespaces.Contains(ns) {
nsTypes = append(nsTypes, ns)
}
}
for _, nsType := range nsTypes {
if p, ok := namespaces[nsType]; ok && p != "" {
@ -1249,7 +1399,7 @@ func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceTyp
if strings.ContainsRune(p, ',') {
return nil, newSystemError(fmt.Errorf("invalid path %s", p))
}
paths = append(paths, p)
paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(nsType), p))
}
}
return paths, nil
@ -1272,7 +1422,7 @@ func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
// such as one that uses nsenter package to bootstrap the container's
// init process correctly, i.e. with correct namespaces, uid/gid
// mapping etc.
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, consolePath string) (io.Reader, error) {
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) {
// create the netlink message
r := nl.NewNetlinkRequest(int(InitMsg), 0)
@ -1282,14 +1432,6 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
Value: uint32(cloneFlags),
})
// write console path
if consolePath != "" {
r.AddData(&Bytemsg{
Type: ConsolePathAttr,
Value: []byte(consolePath),
})
}
// write custom namespace paths
if len(nsMaps) > 0 {
nsPaths, err := c.orderNamespacePaths(nsMaps)
@ -1327,6 +1469,8 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
Type: GidmapAttr,
Value: b,
})
// The following only applies if we are root.
if !c.config.Rootless {
// check if we have CAP_SETGID to setgroup properly
pid, err := capability.NewPid(os.Getpid())
if err != nil {
@ -1340,6 +1484,19 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
}
}
}
}
// write oom_score_adj
r.AddData(&Bytemsg{
Type: OomScoreAdjAttr,
Value: []byte(fmt.Sprintf("%d", c.config.OomScoreAdj)),
})
// write rootless
r.AddData(&Boolmsg{
Type: RootlessAttr,
Value: c.config.Rootless,
})
return bytes.NewReader(r.Serialize()), nil
}

View File

@ -25,11 +25,13 @@ type VethPairName struct {
type CriuOpts struct {
ImagesDirectory string // directory for storing image files
WorkDirectory string // directory to cd and write logs/pidfiles/stats to
ParentImage string // direcotry for storing parent image files in pre-dump and dump
LeaveRunning bool // leave container in running state after checkpoint
TcpEstablished bool // checkpoint/restore established TCP connections
ExternalUnixConnections bool // allow external unix connections
ShellJob bool // allow to dump and restore shell jobs
FileLocks bool // handle file locks, for safety
PreDump bool // call criu predump to perform iterative checkpoint
PageServer CriuPageServerInfo // allow to dump to criu page server
VethPairs []VethPairName // pass the veth to criu when restore
ManageCgroupsMode cgMode // dump or restore cgroup mode

View File

@ -60,9 +60,9 @@ func (c ErrorCode) String() string {
type Error interface {
error
// Returns a verbose string including the error message
// and a representation of the stack trace suitable for
// printing.
// Returns an error if it failed to write the detail of the Error to w.
// The detail of the Error may include the error message and a
// representation of the stack trace.
Detail(w io.Writer) error
// Returns the error code for this error.

View File

@ -15,6 +15,7 @@ import (
"github.com/docker/docker/pkg/mount"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/cgroups/rootless"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/configs/validate"
@ -34,7 +35,15 @@ var (
// InitArgs returns an options func to configure a LinuxFactory with the
// provided init binary path and arguments.
func InitArgs(args ...string) func(*LinuxFactory) error {
return func(l *LinuxFactory) error {
return func(l *LinuxFactory) (err error) {
if len(args) > 0 {
// Resolve relative paths to ensure that its available
// after directory changes.
if args[0], err = filepath.Abs(args[0]); err != nil {
return newGenericError(err, ConfigInvalid)
}
}
l.InitArgs = args
return nil
}
@ -65,6 +74,20 @@ func Cgroupfs(l *LinuxFactory) error {
return nil
}
// RootlessCgroups is an options func to configure a LinuxFactory to
// return containers that use the "rootless" cgroup manager, which will
// fail to do any operations not possible to do with an unprivileged user.
// It should only be used in conjunction with rootless containers.
func RootlessCgroups(l *LinuxFactory) error {
l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
return &rootless.Manager{
Cgroups: config,
Paths: paths,
}
}
return nil
}
// TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs.
func TmpfsRoot(l *LinuxFactory) error {
mounted, err := mount.Mounted(l.Root)
@ -141,11 +164,11 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
if err := l.Validator.Validate(config); err != nil {
return nil, newGenericError(err, ConfigInvalid)
}
uid, err := config.HostUID()
uid, err := config.HostRootUID()
if err != nil {
return nil, newGenericError(err, SystemError)
}
gid, err := config.HostGID()
gid, err := config.HostRootGID()
if err != nil {
return nil, newGenericError(err, SystemError)
}
@ -161,15 +184,8 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
if err := os.Chown(containerRoot, uid, gid); err != nil {
return nil, newGenericError(err, SystemError)
}
fifoName := filepath.Join(containerRoot, execFifoFilename)
oldMask := syscall.Umask(0000)
if err := syscall.Mkfifo(fifoName, 0622); err != nil {
syscall.Umask(oldMask)
return nil, newGenericError(err, SystemError)
}
syscall.Umask(oldMask)
if err := os.Chown(fifoName, uid, gid); err != nil {
return nil, newGenericError(err, SystemError)
if config.Rootless {
RootlessCgroups(l)
}
c := &linuxContainer{
id: id,
@ -197,6 +213,10 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
processStartTime: state.InitProcessStartTime,
fds: state.ExternalDescriptors,
}
// We have to use the RootlessManager.
if state.Rootless {
RootlessCgroups(l)
}
c := &linuxContainer{
initProcess: r,
initProcessStartTime: state.InitProcessStartTime,
@ -222,54 +242,71 @@ func (l *LinuxFactory) Type() string {
// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
// This is a low level implementation detail of the reexec and should not be consumed externally
func (l *LinuxFactory) StartInitialization() (err error) {
var pipefd, rootfd int
for k, v := range map[string]*int{
"_LIBCONTAINER_INITPIPE": &pipefd,
"_LIBCONTAINER_STATEDIR": &rootfd,
} {
s := os.Getenv(k)
var (
pipefd, rootfd int
consoleSocket *os.File
envInitPipe = os.Getenv("_LIBCONTAINER_INITPIPE")
envStateDir = os.Getenv("_LIBCONTAINER_STATEDIR")
envConsole = os.Getenv("_LIBCONTAINER_CONSOLE")
)
i, err := strconv.Atoi(s)
// Get the INITPIPE.
pipefd, err = strconv.Atoi(envInitPipe)
if err != nil {
return fmt.Errorf("unable to convert %s=%s to int", k, s)
}
*v = i
return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err)
}
var (
pipe = os.NewFile(uintptr(pipefd), "pipe")
it = initType(os.Getenv("_LIBCONTAINER_INITTYPE"))
)
defer pipe.Close()
// Only init processes have STATEDIR.
rootfd = -1
if it == initStandard {
if rootfd, err = strconv.Atoi(envStateDir); err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_STATEDIR=%s to int: %s", envStateDir, err)
}
}
if envConsole != "" {
console, err := strconv.Atoi(envConsole)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE=%s to int: %s", envConsole, err)
}
consoleSocket = os.NewFile(uintptr(console), "console-socket")
defer consoleSocket.Close()
}
// clear the current process's environment to clean any libcontainer
// specific env vars.
os.Clearenv()
var i initer
defer func() {
// We have an error during the initialization of the container's init,
// send it back to the parent process in the form of an initError.
// If container's init successed, syscall.Exec will not return, hence
// this defer function will never be called.
if _, ok := i.(*linuxStandardInit); ok {
// Synchronisation only necessary for standard init.
if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil {
panic(err)
}
fmt.Fprintln(os.Stderr, err)
return
}
if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil {
panic(err)
fmt.Fprintln(os.Stderr, err)
return
}
// ensure that this pipe is always closed
pipe.Close()
}()
defer func() {
if e := recover(); e != nil {
err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack()))
}
}()
i, err = newContainerInit(it, pipe, rootfd)
i, err := newContainerInit(it, pipe, consoleSocket, rootfd)
if err != nil {
return err
}
// If Init succeeds, syscall.Exec will not return, hence none of the defers will be called.
return i.Init()
}
@ -277,7 +314,7 @@ func (l *LinuxFactory) loadState(root, id string) (*State, error) {
f, err := os.Open(filepath.Join(root, stateFilename))
if err != nil {
if os.IsNotExist(err) {
return nil, newGenericError(fmt.Errorf("container %q does not exists", id), ContainerNotExists)
return nil, newGenericError(fmt.Errorf("container %q does not exist", id), ContainerNotExists)
}
return nil, newGenericError(err, SystemError)
}

View File

@ -9,20 +9,6 @@ import (
"github.com/opencontainers/runc/libcontainer/stacktrace"
)
type syncType uint8
const (
procReady syncType = iota
procError
procRun
procHooks
procResume
)
type syncT struct {
Type syncType `json:"type"`
}
var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}}
Code: {{.ECode}}
{{if .Message }}

View File

@ -6,12 +6,11 @@ import (
"encoding/json"
"fmt"
"io"
"io/ioutil"
"net"
"os"
"strconv"
"strings"
"syscall"
"unsafe"
"github.com/Sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/cgroups"
@ -47,26 +46,26 @@ type initConfig struct {
Args []string `json:"args"`
Env []string `json:"env"`
Cwd string `json:"cwd"`
Capabilities []string `json:"capabilities"`
Capabilities *configs.Capabilities `json:"capabilities"`
ProcessLabel string `json:"process_label"`
AppArmorProfile string `json:"apparmor_profile"`
NoNewPrivileges bool `json:"no_new_privileges"`
User string `json:"user"`
AdditionalGroups []string `json:"additional_groups"`
Config *configs.Config `json:"config"`
Console string `json:"console"`
Networks []*network `json:"network"`
PassedFilesCount int `json:"passed_files_count"`
ContainerId string `json:"containerid"`
Rlimits []configs.Rlimit `json:"rlimits"`
ExecFifoPath string `json:"start_pipe_path"`
CreateConsole bool `json:"create_console"`
Rootless bool `json:"rootless"`
}
type initer interface {
Init() error
}
func newContainerInit(t initType, pipe *os.File, stateDirFD int) (initer, error) {
func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, stateDirFD int) (initer, error) {
var config *initConfig
if err := json.NewDecoder(pipe).Decode(&config); err != nil {
return nil, err
@ -77,11 +76,14 @@ func newContainerInit(t initType, pipe *os.File, stateDirFD int) (initer, error)
switch t {
case initSetns:
return &linuxSetnsInit{
pipe: pipe,
consoleSocket: consoleSocket,
config: config,
}, nil
case initStandard:
return &linuxStandardInit{
pipe: pipe,
consoleSocket: consoleSocket,
parentPid: syscall.Getppid(),
config: config,
stateDirFD: stateDirFD,
@ -116,16 +118,18 @@ func finalizeNamespace(config *initConfig) error {
return err
}
capabilities := config.Config.Capabilities
capabilities := &configs.Capabilities{}
if config.Capabilities != nil {
capabilities = config.Capabilities
} else if config.Config.Capabilities != nil {
capabilities = config.Config.Capabilities
}
w, err := newCapWhitelist(capabilities)
w, err := newContainerCapList(capabilities)
if err != nil {
return err
}
// drop capabilities in bounding set before changing user
if err := w.dropBoundingSet(); err != nil {
if err := w.ApplyBoundingSet(); err != nil {
return err
}
// preserve existing capabilities while we change users
@ -138,8 +142,7 @@ func finalizeNamespace(config *initConfig) error {
if err := system.ClearKeepCaps(); err != nil {
return err
}
// drop all other capabilities
if err := w.drop(); err != nil {
if err := w.ApplyCaps(); err != nil {
return err
}
if config.Cwd != "" {
@ -150,24 +153,59 @@ func finalizeNamespace(config *initConfig) error {
return nil
}
// setupConsole sets up the console from inside the container, and sends the
// master pty fd to the config.Pipe (using cmsg). This is done to ensure that
// consoles are scoped to a container properly (see runc#814 and the many
// issues related to that). This has to be run *after* we've pivoted to the new
// rootfs (and the users' configuration is entirely set up).
func setupConsole(socket *os.File, config *initConfig, mount bool) error {
defer socket.Close()
// At this point, /dev/ptmx points to something that we would expect. We
// used to change the owner of the slave path, but since the /dev/pts mount
// can have gid=X set (at the users' option). So touching the owner of the
// slave PTY is not necessary, as the kernel will handle that for us. Note
// however, that setupUser (specifically fixStdioPermissions) *will* change
// the UID owner of the console to be the user the process will run as (so
// they can actually control their console).
console, err := newConsole()
if err != nil {
return err
}
// After we return from here, we don't need the console anymore.
defer console.Close()
linuxConsole, ok := console.(*linuxConsole)
if !ok {
return fmt.Errorf("failed to cast console to *linuxConsole")
}
// Mount the console inside our rootfs.
if mount {
if err := linuxConsole.mount(); err != nil {
return err
}
}
// While we can access console.master, using the API is a good idea.
if err := utils.SendFd(socket, linuxConsole.File()); err != nil {
return err
}
// Now, dup over all the things.
return linuxConsole.dupStdio()
}
// syncParentReady sends to the given pipe a JSON payload which indicates that
// the init is ready to Exec the child process. It then waits for the parent to
// indicate that it is cleared to Exec.
func syncParentReady(pipe io.ReadWriter) error {
// Tell parent.
if err := utils.WriteJSON(pipe, syncT{procReady}); err != nil {
if err := writeSync(pipe, procReady); err != nil {
return err
}
// Wait for parent to give the all-clear.
var procSync syncT
if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
if err == io.EOF {
return fmt.Errorf("parent closed synchronisation channel")
}
if procSync.Type != procRun {
return fmt.Errorf("invalid synchronisation flag from parent")
}
if err := readSync(pipe, procRun); err != nil {
return err
}
return nil
}
@ -176,19 +214,15 @@ func syncParentReady(pipe io.ReadWriter) error {
// indicate that it is cleared to resume.
func syncParentHooks(pipe io.ReadWriter) error {
// Tell parent.
if err := utils.WriteJSON(pipe, syncT{procHooks}); err != nil {
if err := writeSync(pipe, procHooks); err != nil {
return err
}
// Wait for parent to give the all-clear.
var procSync syncT
if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
if err == io.EOF {
return fmt.Errorf("parent closed synchronisation channel")
}
if procSync.Type != procResume {
return fmt.Errorf("invalid synchronisation flag from parent")
}
if err := readSync(pipe, procResume); err != nil {
return err
}
return nil
}
@ -196,18 +230,21 @@ func syncParentHooks(pipe io.ReadWriter) error {
func setupUser(config *initConfig) error {
// Set up defaults.
defaultExecUser := user.ExecUser{
Uid: syscall.Getuid(),
Gid: syscall.Getgid(),
Uid: 0,
Gid: 0,
Home: "/",
}
passwdPath, err := user.GetPasswdPath()
if err != nil {
return err
}
groupPath, err := user.GetGroupPath()
if err != nil {
return err
}
execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath)
if err != nil {
return err
@ -220,22 +257,49 @@ func setupUser(config *initConfig) error {
return err
}
}
if config.Rootless {
if execUser.Uid != 0 {
return fmt.Errorf("cannot run as a non-root user in a rootless container")
}
if execUser.Gid != 0 {
return fmt.Errorf("cannot run as a non-root group in a rootless container")
}
// We cannot set any additional groups in a rootless container and thus we
// bail if the user asked us to do so. TODO: We currently can't do this
// earlier, but if libcontainer.Process.User was typesafe this might work.
if len(addGroups) > 0 {
return fmt.Errorf("cannot set any additional groups in a rootless container")
}
}
// before we change to the container's user make sure that the processes STDIO
// is correctly owned by the user that we are switching to.
if err := fixStdioPermissions(execUser); err != nil {
if err := fixStdioPermissions(config, execUser); err != nil {
return err
}
// This isn't allowed in an unprivileged user namespace since Linux 3.19.
// There's nothing we can do about /etc/group entries, so we silently
// ignore setting groups here (since the user didn't explicitly ask us to
// set the group).
if !config.Rootless {
suppGroups := append(execUser.Sgids, addGroups...)
if err := syscall.Setgroups(suppGroups); err != nil {
return err
}
}
if err := system.Setgid(execUser.Gid); err != nil {
return err
}
if err := system.Setuid(execUser.Uid); err != nil {
return err
}
// if we didn't get HOME already, set it based on the user's HOME
if envHome := os.Getenv("HOME"); envHome == "" {
if err := os.Setenv("HOME", execUser.Home); err != nil {
@ -248,7 +312,7 @@ func setupUser(config *initConfig) error {
// fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user.
// The ownership needs to match because it is created outside of the container and needs to be
// localized.
func fixStdioPermissions(u *user.ExecUser) error {
func fixStdioPermissions(config *initConfig, u *user.ExecUser) error {
var null syscall.Stat_t
if err := syscall.Stat("/dev/null", &null); err != nil {
return err
@ -262,11 +326,27 @@ func fixStdioPermissions(u *user.ExecUser) error {
if err := syscall.Fstat(int(fd), &s); err != nil {
return err
}
// skip chown of /dev/null if it was used as one of the STDIO fds.
// Skip chown of /dev/null if it was used as one of the STDIO fds.
if s.Rdev == null.Rdev {
continue
}
if err := syscall.Fchown(int(fd), u.Uid, u.Gid); err != nil {
// Skip chown if s.Gid is actually an unmapped gid in the host. While
// this is a bit dodgy if it just so happens that the console _is_
// owned by overflow_gid, there's no way for us to disambiguate this as
// a userspace program.
if _, err := config.Config.HostGID(int(s.Gid)); err != nil {
continue
}
// We only change the uid owner (as it is possible for the mount to
// prefer a different gid, and there's no reason for us to change it).
// The reason why we don't just leave the default uid=X mount setup is
// that users expect to be able to actually use their console. Without
// this code, you couldn't effectively run as a non-root user inside a
// container and also have a console set up.
if err := syscall.Fchown(int(fd), u.Uid, int(s.Gid)); err != nil {
return err
}
}
@ -328,16 +408,51 @@ func setupRlimits(limits []configs.Rlimit, pid int) error {
return nil
}
func setOomScoreAdj(oomScoreAdj int, pid int) error {
path := fmt.Sprintf("/proc/%d/oom_score_adj", pid)
const _P_PID = 1
return ioutil.WriteFile(path, []byte(strconv.Itoa(oomScoreAdj)), 0600)
type siginfo struct {
si_signo int32
si_errno int32
si_code int32
// below here is a union; si_pid is the only field we use
si_pid int32
// Pad to 128 bytes as detailed in blockUntilWaitable
pad [96]byte
}
// killCgroupProcesses freezes then iterates over all the processes inside the
// manager's cgroups sending a SIGKILL to each process then waiting for them to
// exit.
func killCgroupProcesses(m cgroups.Manager) error {
// isWaitable returns true if the process has exited false otherwise.
// Its based off blockUntilWaitable in src/os/wait_waitid.go
func isWaitable(pid int) (bool, error) {
si := &siginfo{}
_, _, e := syscall.Syscall6(syscall.SYS_WAITID, _P_PID, uintptr(pid), uintptr(unsafe.Pointer(si)), syscall.WEXITED|syscall.WNOWAIT|syscall.WNOHANG, 0, 0)
if e != 0 {
return false, os.NewSyscallError("waitid", e)
}
return si.si_pid != 0, nil
}
// isNoChildren returns true if err represents a syscall.ECHILD false otherwise
func isNoChildren(err error) bool {
switch err := err.(type) {
case syscall.Errno:
if err == syscall.ECHILD {
return true
}
case *os.SyscallError:
if err.Err == syscall.ECHILD {
return true
}
}
return false
}
// signalAllProcesses freezes then iterates over all the processes inside the
// manager's cgroups sending the signal s to them.
// If s is SIGKILL then it will wait for each process to exit.
// For all other signals it will check if the process is ready to report its
// exit status and only if it is will a wait be performed.
func signalAllProcesses(m cgroups.Manager, s os.Signal) error {
var procs []*os.Process
if err := m.Freeze(configs.Frozen); err != nil {
logrus.Warn(err)
@ -354,16 +469,31 @@ func killCgroupProcesses(m cgroups.Manager) error {
continue
}
procs = append(procs, p)
if err := p.Kill(); err != nil {
if err := p.Signal(s); err != nil {
logrus.Warn(err)
}
}
if err := m.Freeze(configs.Thawed); err != nil {
logrus.Warn(err)
}
for _, p := range procs {
if s != syscall.SIGKILL {
if ok, err := isWaitable(p.Pid); err != nil {
if !isNoChildren(err) {
logrus.Warn("signalAllProcesses: ", p.Pid, err)
}
continue
} else if !ok {
// Not ready to report so don't wait
continue
}
}
if _, err := p.Wait(); err != nil {
logrus.Warn(err)
if !isNoChildren(err) {
logrus.Warn("wait: ", err)
}
}
}
return nil

View File

@ -9,6 +9,10 @@ func InitLabels(options []string) (string, string, error) {
return "", "", nil
}
func GetROMountLabel() string {
return ""
}
func GenLabels(options string) (string, string, error) {
return "", "", nil
}

View File

@ -55,6 +55,10 @@ func InitLabels(options []string) (string, string, error) {
return processLabel, mountLabel, nil
}
func GetROMountLabel() string {
return selinux.GetROFileLabel()
}
// DEPRECATED: The GenLabels function is only to be used during the transition to the official API.
func GenLabels(options string) (string, string, error) {
return InitLabels(strings.Fields(options))
@ -138,7 +142,7 @@ func Relabel(path string, fileLabel string, shared bool) error {
fileLabel = c.Get()
}
if err := selinux.Chcon(path, fileLabel, true); err != nil {
return fmt.Errorf("SELinux relabeling of %s is not allowed: %q", path, err)
return err
}
return nil
}
@ -169,7 +173,7 @@ func UnreserveLabel(label string) error {
return nil
}
// DupSecOpt takes an process label and returns security options that
// DupSecOpt takes a process label and returns security options that
// can be used to set duplicate labels on future container processes
func DupSecOpt(src string) []string {
return selinux.DupSecOpt(src)

View File

@ -13,11 +13,13 @@ import (
const (
InitMsg uint16 = 62000
CloneFlagsAttr uint16 = 27281
ConsolePathAttr uint16 = 27282
NsPathsAttr uint16 = 27283
UidmapAttr uint16 = 27284
GidmapAttr uint16 = 27285
SetgroupAttr uint16 = 27286
NsPathsAttr uint16 = 27282
UidmapAttr uint16 = 27283
GidmapAttr uint16 = 27284
SetgroupAttr uint16 = 27285
OomScoreAdjAttr uint16 = 27286
RootlessAttr uint16 = 27287
// When syscall.NLA_HDRLEN is in gccgo, take this out.
syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1)
)

View File

@ -47,12 +47,9 @@ type Process struct {
// ExtraFiles specifies additional open files to be inherited by the container
ExtraFiles []*os.File
// consolePath is the path to the console allocated to the container.
consolePath string
// Capabilities specify the capabilities to keep when executing the process inside the container
// All capabilities not specified will be dropped from the processes capability mask
Capabilities []string
Capabilities *configs.Capabilities
// AppArmorProfile specifies the profile to apply to the process and is
// changed at the time the process is execed
@ -68,6 +65,9 @@ type Process struct {
// If Rlimits are not set, the container will inherit rlimits from the parent process
Rlimits []configs.Rlimit
// ConsoleSocket provides the masterfd console.
ConsoleSocket *os.File
ops processOperations
}
@ -104,22 +104,3 @@ type IO struct {
Stdout io.ReadCloser
Stderr io.ReadCloser
}
// NewConsole creates new console for process and returns it
func (p *Process) NewConsole(rootuid, rootgid int) (Console, error) {
console, err := NewConsole(rootuid, rootgid)
if err != nil {
return nil, err
}
p.consolePath = console.Path()
return console, nil
}
// ConsoleFromPath sets the process's console with the path provided
func (p *Process) ConsoleFromPath(path string) error {
if p.consolePath != "" {
return newGenericError(fmt.Errorf("console path already exists for process"), ConsoleExists)
}
p.consolePath = path
return nil
}

View File

@ -51,7 +51,6 @@ type setnsProcess struct {
fds []string
process *Process
bootstrapData io.Reader
rootDir *os.File
}
func (p *setnsProcess) startTime() (string, error) {
@ -70,7 +69,6 @@ func (p *setnsProcess) start() (err error) {
defer p.parentPipe.Close()
err = p.cmd.Start()
p.childPipe.Close()
p.rootDir.Close()
if err != nil {
return newSystemErrorWithCause(err, "starting setns process")
}
@ -82,15 +80,12 @@ func (p *setnsProcess) start() (err error) {
if err = p.execSetns(); err != nil {
return newSystemErrorWithCause(err, "executing setns process")
}
if len(p.cgroupPaths) > 0 {
// We can't join cgroups if we're in a rootless container.
if !p.config.Rootless && len(p.cgroupPaths) > 0 {
if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil {
return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
}
}
// set oom_score_adj
if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting oom score")
}
// set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
@ -100,15 +95,22 @@ func (p *setnsProcess) start() (err error) {
return newSystemErrorWithCause(err, "writing config to pipe")
}
ierr := parseSync(p.parentPipe, func(sync *syncT) error {
switch sync.Type {
case procReady:
// This shouldn't happen.
panic("unexpected procReady in setns")
case procHooks:
// This shouldn't happen.
panic("unexpected procHooks in setns")
default:
return newSystemError(fmt.Errorf("invalid JSON payload from child"))
}
})
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemErrorWithCause(err, "calling shutdown on init pipe")
}
// wait for the child process to fully complete and receive an error message
// if one was encoutered
var ierr *genericError
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
return newSystemErrorWithCause(err, "decoding init error from pipe")
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
p.wait()
@ -146,7 +148,7 @@ func (p *setnsProcess) execSetns() error {
}
// terminate sends a SIGKILL to the forked process for the setns routine then waits to
// avoid the process becomming a zombie.
// avoid the process becoming a zombie.
func (p *setnsProcess) terminate() error {
if p.cmd.Process == nil {
return nil
@ -239,7 +241,7 @@ func (p *initProcess) start() error {
return newSystemErrorWithCause(err, "starting init process command")
}
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return err
return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
}
if err := p.execSetns(); err != nil {
return newSystemErrorWithCause(err, "running exec setns process for init")
@ -252,8 +254,9 @@ func (p *initProcess) start() error {
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
}
p.setExternalDescriptors(fds)
// Do this before syncing with child so that no children
// can escape the cgroup
// Do this before syncing with child so that no children can escape the
// cgroup. We don't need to worry about not doing this and not being root
// because we'd be using the rootless cgroup manager in that case.
if err := p.manager.Apply(p.pid()); err != nil {
return newSystemErrorWithCause(err, "applying cgroup configuration for process")
}
@ -264,36 +267,22 @@ func (p *initProcess) start() error {
}
}()
if err := p.createNetworkInterfaces(); err != nil {
return newSystemErrorWithCause(err, "creating nework interfaces")
return newSystemErrorWithCause(err, "creating network interfaces")
}
if err := p.sendConfig(); err != nil {
return newSystemErrorWithCause(err, "sending config to init process")
}
var (
procSync syncT
sentRun bool
sentResume bool
ierr *genericError
)
dec := json.NewDecoder(p.parentPipe)
loop:
for {
if err := dec.Decode(&procSync); err != nil {
if err == io.EOF {
break loop
}
return newSystemErrorWithCause(err, "decoding sync type from init pipe")
}
switch procSync.Type {
ierr := parseSync(p.parentPipe, func(sync *syncT) error {
switch sync.Type {
case procReady:
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemErrorWithCause(err, "setting cgroup config for ready process")
}
// set oom_score_adj
if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting oom score for ready process")
}
// set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
@ -306,7 +295,7 @@ loop:
Version: p.container.config.Version,
ID: p.container.id,
Pid: p.pid(),
Root: p.config.Config.Rootfs,
Bundle: utils.SearchLabels(p.config.Config.Labels, "bundle"),
}
for i, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
@ -316,8 +305,8 @@ loop:
}
}
// Sync with child.
if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil {
return newSystemErrorWithCause(err, "reading syncT run type")
if err := writeSync(p.parentPipe, procRun); err != nil {
return newSystemErrorWithCause(err, "writing syncT 'run'")
}
sentRun = true
case procHooks:
@ -326,8 +315,7 @@ loop:
Version: p.container.config.Version,
ID: p.container.id,
Pid: p.pid(),
Root: p.config.Config.Rootfs,
BundlePath: utils.SearchLabels(p.config.Config.Labels, "bundle"),
Bundle: utils.SearchLabels(p.config.Config.Labels, "bundle"),
}
for i, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
@ -336,25 +324,17 @@ loop:
}
}
// Sync with child.
if err := utils.WriteJSON(p.parentPipe, syncT{procResume}); err != nil {
return newSystemErrorWithCause(err, "reading syncT resume type")
if err := writeSync(p.parentPipe, procResume); err != nil {
return newSystemErrorWithCause(err, "writing syncT 'resume'")
}
sentResume = true
case procError:
// wait for the child process to fully complete and receive an error message
// if one was encoutered
if err := dec.Decode(&ierr); err != nil && err != io.EOF {
return newSystemErrorWithCause(err, "decoding proc error from init")
}
if ierr != nil {
break loop
}
// Programmer error.
panic("No error following JSON procError payload.")
default:
return newSystemError(fmt.Errorf("invalid JSON payload from child"))
}
}
return nil
})
if !sentRun {
return newSystemErrorWithCause(ierr, "container init")
}
@ -364,6 +344,7 @@ loop:
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemErrorWithCause(err, "shutting down init pipe")
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
p.wait()
@ -379,7 +360,7 @@ func (p *initProcess) wait() (*os.ProcessState, error) {
}
// we should kill all processes in cgroup when init is died if we use host PID namespace
if p.sharePidns {
killCgroupProcesses(p.manager)
signalAllProcesses(p.manager, syscall.SIGKILL)
}
return p.cmd.ProcessState, nil
}
@ -440,9 +421,17 @@ func getPipeFds(pid int) ([]string, error) {
dirPath := filepath.Join("/proc", strconv.Itoa(pid), "/fd")
for i := 0; i < 3; i++ {
// XXX: This breaks if the path is not a valid symlink (which can
// happen in certain particularly unlucky mount namespace setups).
f := filepath.Join(dirPath, strconv.Itoa(i))
target, err := os.Readlink(f)
if err != nil {
// Ignore permission errors, for rootless containers and other
// non-dumpable processes. if we can't get the fd for a particular
// file, there's not much we can do.
if os.IsPermission(err) {
continue
}
return fds, err
}
fds[i] = target
@ -450,8 +439,10 @@ func getPipeFds(pid int) ([]string, error) {
return fds, nil
}
// InitializeIO creates pipes for use with the process's STDIO
// and returns the opposite side for each
// InitializeIO creates pipes for use with the process's stdio and returns the
// opposite side for each. Do not use this if you want to have a pseudoterminal
// set up for you by libcontainer (TODO: fix that too).
// TODO: This is mostly unnecessary, and should be handled by clients.
func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
var fds []uintptr
i = &IO{}

View File

@ -16,6 +16,7 @@ import (
"github.com/docker/docker/pkg/mount"
"github.com/docker/docker/pkg/symlink"
"github.com/mrunalp/fileutils"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/label"
@ -35,9 +36,11 @@ func needsSetupDev(config *configs.Config) bool {
return true
}
// setupRootfs sets up the devices, mount points, and filesystems for use inside a
// new mount namespace.
func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWriter) (err error) {
// prepareRootfs sets up the devices, mount points, and filesystems for use
// inside a new mount namespace. It doesn't set anything as ro or pivot_root,
// because console setup happens inside the caller. You must call
// finalizeRootfs in order to finish the rootfs setup.
func prepareRootfs(pipe io.ReadWriter, config *configs.Config) (err error) {
if err := prepareRoot(config); err != nil {
return newSystemErrorWithCause(err, "preparing rootfs")
}
@ -49,6 +52,7 @@ func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWrit
return newSystemErrorWithCause(err, "running premount command")
}
}
if err := mountToRootfs(m, config.Rootfs, config.MountLabel); err != nil {
return newSystemErrorWithCausef(err, "mounting %q to rootfs %q at %q", m.Source, config.Rootfs, m.Destination)
}
@ -59,17 +63,19 @@ func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWrit
}
}
}
if setupDev {
if err := createDevices(config); err != nil {
return newSystemErrorWithCause(err, "creating device nodes")
}
if err := setupPtmx(config, console); err != nil {
if err := setupPtmx(config); err != nil {
return newSystemErrorWithCause(err, "setting up ptmx")
}
if err := setupDevSymlinks(config.Rootfs); err != nil {
return newSystemErrorWithCause(err, "setting up /dev symlinks")
}
}
// Signal the parent to run the pre-start hooks.
// The hooks are run after the mounts are setup, but before we switch to the new
// root, so that the old root is still available in the hooks for any mount
@ -77,39 +83,59 @@ func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWrit
if err := syncParentHooks(pipe); err != nil {
return err
}
// The reason these operations are done here rather than in finalizeRootfs
// is because the console-handling code gets quite sticky if we have to set
// up the console before doing the pivot_root(2). This is because the
// Console API has to also work with the ExecIn case, which means that the
// API must be able to deal with being inside as well as outside the
// container. It's just cleaner to do this here (at the expense of the
// operation not being perfectly split).
if err := syscall.Chdir(config.Rootfs); err != nil {
return newSystemErrorWithCausef(err, "changing dir to %q", config.Rootfs)
}
if config.NoPivotRoot {
err = msMoveRoot(config.Rootfs)
} else {
err = pivotRoot(config.Rootfs, config.PivotDir)
err = pivotRoot(config.Rootfs)
}
if err != nil {
return newSystemErrorWithCause(err, "jailing process inside rootfs")
}
if setupDev {
if err := reOpenDevNull(); err != nil {
return newSystemErrorWithCause(err, "reopening /dev/null inside container")
}
}
// remount dev as ro if specifed
return nil
}
// finalizeRootfs actually switches the root of the process and sets anything
// to ro if necessary. You must call prepareRootfs first.
func finalizeRootfs(config *configs.Config) (err error) {
// remount dev as ro if specified
for _, m := range config.Mounts {
if libcontainerUtils.CleanPath(m.Destination) == "/dev" {
if m.Flags&syscall.MS_RDONLY != 0 {
if err := remountReadonly(m.Destination); err != nil {
if m.Flags&syscall.MS_RDONLY == syscall.MS_RDONLY {
if err := remountReadonly(m); err != nil {
return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination)
}
}
break
}
}
// set rootfs ( / ) as readonly
if config.Readonlyfs {
if err := setReadonly(); err != nil {
return newSystemErrorWithCause(err, "setting rootfs as readonly")
}
}
syscall.Umask(0022)
return nil
}
@ -152,15 +178,41 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
}
return nil
case "tmpfs":
copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
tmpDir := ""
stat, err := os.Stat(dest)
if err != nil {
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
}
if copyUp {
tmpDir, err = ioutil.TempDir("/tmp", "runctmpdir")
if err != nil {
return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
}
defer os.RemoveAll(tmpDir)
m.Destination = tmpDir
}
if err := mountPropagate(m, rootfs, mountLabel); err != nil {
return err
}
if copyUp {
if err := fileutils.CopyDirectory(dest, tmpDir); err != nil {
errMsg := fmt.Errorf("tmpcopyup: failed to copy %s to %s: %v", dest, tmpDir, err)
if err1 := syscall.Unmount(tmpDir, syscall.MNT_DETACH); err1 != nil {
return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
}
return errMsg
}
if err := syscall.Mount(tmpDir, dest, "", syscall.MS_MOVE, ""); err != nil {
errMsg := fmt.Errorf("tmpcopyup: failed to move mount %s to %s: %v", tmpDir, dest, err)
if err1 := syscall.Unmount(tmpDir, syscall.MNT_DETACH); err1 != nil {
return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
}
return errMsg
}
}
if stat != nil {
if err = os.Chmod(dest, stat.Mode()); err != nil {
return err
@ -178,7 +230,7 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
// any previous mounts can invalidate the next mount's destination.
// this can happen when a user specifies mounts within other mounts to cause breakouts or other
// evil stuff to try to escape the container's rootfs.
if dest, err = symlink.FollowSymlinkInScope(filepath.Join(rootfs, m.Destination), rootfs); err != nil {
if dest, err = symlink.FollowSymlinkInScope(dest, rootfs); err != nil {
return err
}
if err := checkMountDestination(rootfs, dest); err != nil {
@ -261,6 +313,19 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
}
}
default:
// ensure that the destination of the mount is resolved of symlinks at mount time because
// any previous mounts can invalidate the next mount's destination.
// this can happen when a user specifies mounts within other mounts to cause breakouts or other
// evil stuff to try to escape the container's rootfs.
var err error
if dest, err = symlink.FollowSymlinkInScope(dest, rootfs); err != nil {
return err
}
if err := checkMountDestination(rootfs, dest); err != nil {
return err
}
// update the mount with the correct dest after symlinks are resolved.
m.Destination = dest
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
@ -283,7 +348,7 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
var binds []*configs.Mount
for _, mm := range mounts {
dir, err := mm.GetThisCgroupDir(cgroupPaths)
dir, err := mm.GetOwnCgroup(cgroupPaths)
if err != nil {
return nil, err
}
@ -294,7 +359,7 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
binds = append(binds, &configs.Mount{
Device: "bind",
Source: filepath.Join(mm.Mountpoint, relDir),
Destination: filepath.Join(m.Destination, strings.Join(mm.Subsystems, ",")),
Destination: filepath.Join(m.Destination, filepath.Base(mm.Mountpoint)),
Flags: syscall.MS_BIND | syscall.MS_REC | m.Flags,
PropagationFlags: m.PropagationFlags,
})
@ -306,9 +371,6 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
// checkMountDestination checks to ensure that the mount destination is not over the top of /proc.
// dest is required to be an abs path and have any symlinks resolved before calling this function.
func checkMountDestination(rootfs, dest string) error {
if libcontainerUtils.CleanPath(rootfs) == libcontainerUtils.CleanPath(dest) {
return fmt.Errorf("mounting into / is prohibited")
}
invalidDestinations := []string{
"/proc",
}
@ -448,10 +510,12 @@ func createDeviceNode(rootfs string, node *configs.Device, bind bool) error {
func mknodDevice(dest string, node *configs.Device) error {
fileMode := node.FileMode
switch node.Type {
case 'c':
case 'c', 'u':
fileMode |= syscall.S_IFCHR
case 'b':
fileMode |= syscall.S_IFBLK
case 'p':
fileMode |= syscall.S_IFIFO
default:
return fmt.Errorf("%c is not a valid device type for device %s", node.Type, node.Path)
}
@ -539,11 +603,13 @@ func prepareRoot(config *configs.Config) error {
if err := syscall.Mount("", "/", "", uintptr(flag), ""); err != nil {
return err
}
if config.NoPivotRoot {
// Make parent mount private to make sure following bind mount does
// not propagate in other namespaces. Also it will help with kernel
// check pass in pivot_root. (IS_SHARED(new_mnt->mnt_parent))
if err := rootfsParentMountPrivate(config.Rootfs); err != nil {
return err
}
}
return syscall.Mount(config.Rootfs, config.Rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, "")
}
@ -552,7 +618,7 @@ func setReadonly() error {
return syscall.Mount("/", "/", "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, "")
}
func setupPtmx(config *configs.Config, console *linuxConsole) error {
func setupPtmx(config *configs.Config) error {
ptmx := filepath.Join(config.Rootfs, "dev/ptmx")
if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) {
return err
@ -560,54 +626,61 @@ func setupPtmx(config *configs.Config, console *linuxConsole) error {
if err := os.Symlink("pts/ptmx", ptmx); err != nil {
return fmt.Errorf("symlink dev ptmx %s", err)
}
if console != nil {
return console.mount(config.Rootfs, config.MountLabel)
}
return nil
}
func pivotRoot(rootfs, pivotBaseDir string) (err error) {
if pivotBaseDir == "" {
pivotBaseDir = "/"
}
tmpDir := filepath.Join(rootfs, pivotBaseDir)
if err := os.MkdirAll(tmpDir, 0755); err != nil {
return fmt.Errorf("can't create tmp dir %s, error %v", tmpDir, err)
}
pivotDir, err := ioutil.TempDir(tmpDir, ".pivot_root")
// pivotRoot will call pivot_root such that rootfs becomes the new root
// filesystem, and everything else is cleaned up.
func pivotRoot(rootfs string) error {
// While the documentation may claim otherwise, pivot_root(".", ".") is
// actually valid. What this results in is / being the new root but
// /proc/self/cwd being the old root. Since we can play around with the cwd
// with pivot_root this allows us to pivot without creating directories in
// the rootfs. Shout-outs to the LXC developers for giving us this idea.
oldroot, err := syscall.Open("/", syscall.O_DIRECTORY|syscall.O_RDONLY, 0)
if err != nil {
return fmt.Errorf("can't create pivot_root dir %s, error %v", pivotDir, err)
}
defer func() {
errVal := os.Remove(pivotDir)
if err == nil {
err = errVal
}
}()
if err := syscall.PivotRoot(rootfs, pivotDir); err != nil {
// Make the parent mount private
if err := rootfsParentMountPrivate(rootfs); err != nil {
return err
}
// Try again
if err := syscall.PivotRoot(rootfs, pivotDir); err != nil {
defer syscall.Close(oldroot)
newroot, err := syscall.Open(rootfs, syscall.O_DIRECTORY|syscall.O_RDONLY, 0)
if err != nil {
return err
}
defer syscall.Close(newroot)
// Change to the new root so that the pivot_root actually acts on it.
if err := syscall.Fchdir(newroot); err != nil {
return err
}
if err := syscall.PivotRoot(".", "."); err != nil {
return fmt.Errorf("pivot_root %s", err)
}
}
if err := syscall.Chdir("/"); err != nil {
return fmt.Errorf("chdir / %s", err)
}
// path to pivot dir now changed, update
pivotDir = filepath.Join(pivotBaseDir, filepath.Base(pivotDir))
// Make pivotDir rprivate to make sure any of the unmounts don't
// propagate to parent.
if err := syscall.Mount("", pivotDir, "", syscall.MS_PRIVATE|syscall.MS_REC, ""); err != nil {
// Currently our "." is oldroot (according to the current kernel code).
// However, purely for safety, we will fchdir(oldroot) since there isn't
// really any guarantee from the kernel what /proc/self/cwd will be after a
// pivot_root(2).
if err := syscall.Fchdir(oldroot); err != nil {
return err
}
if err := syscall.Unmount(pivotDir, syscall.MNT_DETACH); err != nil {
return fmt.Errorf("unmount pivot_root dir %s", err)
// Make oldroot rprivate to make sure our unmounts don't propagate to the
// host (and thus bork the machine).
if err := syscall.Mount("", ".", "", syscall.MS_PRIVATE|syscall.MS_REC, ""); err != nil {
return err
}
// Preform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd.
if err := syscall.Unmount(".", syscall.MNT_DETACH); err != nil {
return err
}
// Switch back to our shiny new root.
if err := syscall.Chdir("/"); err != nil {
return fmt.Errorf("chdir / %s", err)
}
return nil
}
@ -642,17 +715,26 @@ func createIfNotExists(path string, isDir bool) error {
return nil
}
// remountReadonly will bind over the top of an existing path and ensure that it is read-only.
func remountReadonly(path string) error {
for i := 0; i < 5; i++ {
if err := syscall.Mount("", path, "", syscall.MS_REMOUNT|syscall.MS_RDONLY, ""); err != nil && !os.IsNotExist(err) {
switch err {
case syscall.EINVAL:
// Probably not a mountpoint, use bind-mount
if err := syscall.Mount(path, path, "", syscall.MS_BIND, ""); err != nil {
// readonlyPath will make a path read only.
func readonlyPath(path string) error {
if err := syscall.Mount(path, path, "", syscall.MS_BIND|syscall.MS_REC, ""); err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
return syscall.Mount(path, path, "", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC|defaultMountFlags, "")
return syscall.Mount(path, path, "", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, "")
}
// remountReadonly will remount an existing mount point and ensure that it is read-only.
func remountReadonly(m *configs.Mount) error {
var (
dest = m.Destination
flags = m.Flags
)
for i := 0; i < 5; i++ {
if err := syscall.Mount("", dest, "", uintptr(flags|syscall.MS_REMOUNT|syscall.MS_RDONLY), ""); err != nil {
switch err {
case syscall.EBUSY:
time.Sleep(100 * time.Millisecond)
continue
@ -662,13 +744,19 @@ func remountReadonly(path string) error {
}
return nil
}
return fmt.Errorf("unable to mount %s as readonly max retries reached", path)
return fmt.Errorf("unable to mount %s as readonly max retries reached", dest)
}
// maskFile bind mounts /dev/null over the top of the specified path inside a container
// to avoid security issues from processes reading information from non-namespace aware mounts ( proc/kcore ).
func maskFile(path string) error {
// maskPath masks the top of the specified path inside a container to avoid
// security issues from processes reading information from non-namespace aware
// mounts ( proc/kcore ).
// For files, maskPath bind mounts /dev/null over the top of the specified path.
// For directories, maskPath mounts read-only tmpfs over the top of the specified path.
func maskPath(path string) error {
if err := syscall.Mount("/dev/null", path, "", syscall.MS_BIND, ""); err != nil && !os.IsNotExist(err) {
if err == syscall.ENOTDIR {
return syscall.Mount("tmpfs", path, "tmpfs", syscall.MS_RDONLY, "")
}
return err
}
return nil
@ -705,7 +793,9 @@ func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
if libcontainerUtils.CleanPath(dest) == "/dev" {
flags &= ^syscall.MS_RDONLY
}
if !strings.HasPrefix(dest, rootfs) {
copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
if !(copyUp || strings.HasPrefix(dest, rootfs)) {
dest = filepath.Join(rootfs, dest)
}

View File

@ -212,10 +212,6 @@ func parseStatusFile(path string) (map[string]string, error) {
status := make(map[string]string)
for s.Scan() {
if err := s.Err(); err != nil {
return nil, err
}
text := s.Text()
parts := strings.Split(text, ":")
@ -225,5 +221,9 @@ func parseStatusFile(path string) (map[string]string, error) {
status[parts[0]] = parts[1]
}
if err := s.Err(); err != nil {
return nil, err
}
return status, nil
}

View File

@ -32,33 +32,73 @@ const (
stRdOnly = 0x01
)
type selinuxState struct {
enabledSet bool
enabled bool
selinuxfsSet bool
selinuxfs string
mcsList map[string]bool
sync.Mutex
}
var (
assignRegex = regexp.MustCompile(`^([^=]+)=(.*)$`)
mcsList = make(map[string]bool)
mcsLock sync.Mutex
selinuxfs = "unknown"
selinuxEnabled = false // Stores whether selinux is currently enabled
selinuxEnabledChecked = false // Stores whether selinux enablement has been checked or established yet
state = selinuxState{
mcsList: make(map[string]bool),
}
)
type SELinuxContext map[string]string
// SetDisabled disables selinux support for the package
func SetDisabled() {
selinuxEnabled, selinuxEnabledChecked = false, true
func (s *selinuxState) setEnable(enabled bool) bool {
s.Lock()
defer s.Unlock()
s.enabledSet = true
s.enabled = enabled
return s.enabled
}
// getSelinuxMountPoint returns the path to the mountpoint of an selinuxfs
// filesystem or an empty string if no mountpoint is found. Selinuxfs is
// a proc-like pseudo-filesystem that exposes the selinux policy API to
// processes. The existence of an selinuxfs mount is used to determine
// whether selinux is currently enabled or not.
func getSelinuxMountPoint() string {
if selinuxfs != "unknown" {
func (s *selinuxState) getEnabled() bool {
s.Lock()
enabled := s.enabled
enabledSet := s.enabledSet
s.Unlock()
if enabledSet {
return enabled
}
enabled = false
if fs := getSelinuxMountPoint(); fs != "" {
if con, _ := Getcon(); con != "kernel" {
enabled = true
}
}
return s.setEnable(enabled)
}
// SetDisabled disables selinux support for the package
func SetDisabled() {
state.setEnable(false)
}
func (s *selinuxState) setSELinuxfs(selinuxfs string) string {
s.Lock()
defer s.Unlock()
s.selinuxfsSet = true
s.selinuxfs = selinuxfs
return s.selinuxfs
}
func (s *selinuxState) getSELinuxfs() string {
s.Lock()
selinuxfs := s.selinuxfs
selinuxfsSet := s.selinuxfsSet
s.Unlock()
if selinuxfsSet {
return selinuxfs
}
selinuxfs = ""
selinuxfs = ""
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return selinuxfs
@ -91,21 +131,21 @@ func getSelinuxMountPoint() string {
selinuxfs = ""
}
}
return selinuxfs
return s.setSELinuxfs(selinuxfs)
}
// getSelinuxMountPoint returns the path to the mountpoint of an selinuxfs
// filesystem or an empty string if no mountpoint is found. Selinuxfs is
// a proc-like pseudo-filesystem that exposes the selinux policy API to
// processes. The existence of an selinuxfs mount is used to determine
// whether selinux is currently enabled or not.
func getSelinuxMountPoint() string {
return state.getSELinuxfs()
}
// SelinuxEnabled returns whether selinux is currently enabled.
func SelinuxEnabled() bool {
if selinuxEnabledChecked {
return selinuxEnabled
}
selinuxEnabledChecked = true
if fs := getSelinuxMountPoint(); fs != "" {
if con, _ := Getcon(); con != "kernel" {
selinuxEnabled = true
}
}
return selinuxEnabled
return state.getEnabled()
}
func readConfig(target string) (value string) {
@ -283,19 +323,19 @@ func SelinuxGetEnforceMode() int {
}
func mcsAdd(mcs string) error {
mcsLock.Lock()
defer mcsLock.Unlock()
if mcsList[mcs] {
state.Lock()
defer state.Unlock()
if state.mcsList[mcs] {
return fmt.Errorf("MCS Label already exists")
}
mcsList[mcs] = true
state.mcsList[mcs] = true
return nil
}
func mcsDelete(mcs string) {
mcsLock.Lock()
mcsList[mcs] = false
mcsLock.Unlock()
state.Lock()
defer state.Unlock()
state.mcsList[mcs] = false
}
func IntToMcs(id int, catRange uint32) string {
@ -334,9 +374,7 @@ func uniqMcs(catRange uint32) string {
continue
} else {
if c1 > c2 {
t := c1
c1 = c2
c2 = t
c1, c2 = c2, c1
}
}
mcs = fmt.Sprintf("s0:c%d,c%d", c1, c2)
@ -355,6 +393,12 @@ func FreeLxcContexts(scon string) {
}
}
var roFileLabel string
func GetROFileLabel() (fileLabel string) {
return roFileLabel
}
func GetLxcContexts() (processLabel string, fileLabel string) {
var (
val, key string
@ -399,6 +443,9 @@ func GetLxcContexts() (processLabel string, fileLabel string) {
if key == "file" {
fileLabel = strings.Trim(val, "\"")
}
if key == "ro_file" {
roFileLabel = strings.Trim(val, "\"")
}
}
}
@ -406,6 +453,9 @@ func GetLxcContexts() (processLabel string, fileLabel string) {
return "", ""
}
if roFileLabel == "" {
roFileLabel = fileLabel
}
exit:
// mcs := IntToMcs(os.Getpid(), 1024)
mcs := uniqMcs(1024)
@ -446,7 +496,7 @@ func badPrefix(fpath string) error {
for _, prefix := range badprefixes {
if fpath == prefix || strings.HasPrefix(fpath, fmt.Sprintf("%s/", prefix)) {
return fmt.Errorf("Relabeling content in %s is not allowed.", prefix)
return fmt.Errorf("relabeling content in %s is not allowed", prefix)
}
}
return nil
@ -486,14 +536,14 @@ func DupSecOpt(src string) []string {
con["level"] == "" {
return nil
}
return []string{"label=user:" + con["user"],
"label=role:" + con["role"],
"label=type:" + con["type"],
"label=level:" + con["level"]}
return []string{"user:" + con["user"],
"role:" + con["role"],
"type:" + con["type"],
"level:" + con["level"]}
}
// DisableSecOpt returns a security opt that can be used to disabling SELinux
// labeling support for future container processes
func DisableSecOpt() []string {
return []string{"label=disable"}
return []string{"disable"}
}

View File

@ -16,6 +16,8 @@ import (
// linuxSetnsInit performs the container's initialization for running a new process
// inside an existing container.
type linuxSetnsInit struct {
pipe *os.File
consoleSocket *os.File
config *initConfig
}
@ -30,6 +32,14 @@ func (l *linuxSetnsInit) Init() error {
return err
}
}
if l.config.CreateConsole {
if err := setupConsole(l.consoleSocket, l.config, false); err != nil {
return err
}
if err := system.Setctty(); err != nil {
return err
}
}
if l.config.NoNewPrivileges {
if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err

View File

@ -4,7 +4,6 @@ package libcontainer
import (
"fmt"
"io"
"os"
"os/exec"
"syscall"
@ -18,7 +17,8 @@ import (
)
type linuxStandardInit struct {
pipe io.ReadWriteCloser
pipe *os.File
consoleSocket *os.File
parentPid int
stateDirFD int
config *initConfig
@ -59,18 +59,6 @@ func (l *linuxStandardInit) Init() error {
}
}
var console *linuxConsole
if l.config.Console != "" {
console = newConsoleFromPath(l.config.Console)
if err := console.dupStdio(); err != nil {
return err
}
}
if console != nil {
if err := system.Setctty(); err != nil {
return err
}
}
if err := setupNetwork(l.config); err != nil {
return err
}
@ -79,12 +67,33 @@ func (l *linuxStandardInit) Init() error {
}
label.Init()
// InitializeMountNamespace() can be executed only for a new mount namespace
// prepareRootfs() can be executed only for a new mount namespace.
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
if err := setupRootfs(l.config.Config, console, l.pipe); err != nil {
if err := prepareRootfs(l.pipe, l.config.Config); err != nil {
return err
}
}
// Set up the console. This has to be done *before* we finalize the rootfs,
// but *after* we've given the user the chance to set up all of the mounts
// they wanted.
if l.config.CreateConsole {
if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
return err
}
if err := system.Setctty(); err != nil {
return err
}
}
// Finish the rootfs setup.
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
if err := finalizeRootfs(l.config.Config); err != nil {
return err
}
}
if hostname := l.config.Config.Hostname; hostname != "" {
if err := syscall.Sethostname([]byte(hostname)); err != nil {
return err
@ -103,12 +112,12 @@ func (l *linuxStandardInit) Init() error {
}
}
for _, path := range l.config.Config.ReadonlyPaths {
if err := remountReadonly(path); err != nil {
if err := readonlyPath(path); err != nil {
return err
}
}
for _, path := range l.config.Config.MaskPaths {
if err := maskFile(path); err != nil {
if err := maskPath(path); err != nil {
return err
}
}
@ -143,7 +152,7 @@ func (l *linuxStandardInit) Init() error {
if err := pdeath.Restore(); err != nil {
return err
}
// compare the parent from the inital start of the init process and make sure that it did not change.
// compare the parent from the initial start of the init process and make sure that it did not change.
// if the parent changes that means it died and we were reparented to something else so we should
// just kill ourself and not cause problems for someone else.
if syscall.Getppid() != l.parentPid {
@ -171,6 +180,9 @@ func (l *linuxStandardInit) Init() error {
return newSystemErrorWithCause(err, "init seccomp")
}
}
// close the statedir fd before exec because the kernel resets dumpable in the wrong order
// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
syscall.Close(l.stateDirFD)
if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
return newSystemErrorWithCause(err, "exec user process")
}

View File

@ -39,7 +39,7 @@ type containerState interface {
func destroy(c *linuxContainer) error {
if !c.config.Namespaces.Contains(configs.NEWPID) {
if err := killCgroupProcesses(c.cgroupManager); err != nil {
if err := signalAllProcesses(c.cgroupManager, syscall.SIGKILL); err != nil {
logrus.Warn(err)
}
}
@ -60,8 +60,7 @@ func runPoststopHooks(c *linuxContainer) error {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Root: c.config.Rootfs,
BundlePath: utils.SearchLabels(c.config.Labels, "bundle"),
Bundle: utils.SearchLabels(c.config.Labels, "bundle"),
}
for _, hook := range c.config.Hooks.Poststop {
if err := hook.Run(s); err != nil {
@ -83,10 +82,7 @@ func (b *stoppedState) status() Status {
func (b *stoppedState) transition(s containerState) error {
switch s.(type) {
case *runningState:
b.c.state = s
return nil
case *restoredState:
case *runningState, *restoredState:
b.c.state = s
return nil
case *stoppedState:
@ -199,7 +195,7 @@ func (p *pausedState) destroy() error {
return newGenericError(fmt.Errorf("container is paused"), ContainerPaused)
}
// restoredState is the same as the running state but also has accociated checkpoint
// restoredState is the same as the running state but also has associated checkpoint
// information that maybe need destroyed when the container is stopped and destroy is called.
type restoredState struct {
imageDir string
@ -212,9 +208,7 @@ func (r *restoredState) status() Status {
func (r *restoredState) transition(s containerState) error {
switch s.(type) {
case *stoppedState:
return nil
case *runningState:
case *stoppedState, *runningState:
return nil
}
return newStateTransitionError(r, s)

View File

@ -0,0 +1,107 @@
package libcontainer
import (
"encoding/json"
"fmt"
"io"
"github.com/opencontainers/runc/libcontainer/utils"
)
type syncType string
// Constants that are used for synchronisation between the parent and child
// during container setup. They come in pairs (with procError being a generic
// response which is followed by a &genericError).
//
// [ child ] <-> [ parent ]
//
// procHooks --> [run hooks]
// <-- procResume
//
// procConsole -->
// <-- procConsoleReq
// [send(fd)] --> [recv(fd)]
// <-- procConsoleAck
//
// procReady --> [final setup]
// <-- procRun
const (
procError syncType = "procError"
procReady syncType = "procReady"
procRun syncType = "procRun"
procHooks syncType = "procHooks"
procResume syncType = "procResume"
)
type syncT struct {
Type syncType `json:"type"`
}
// writeSync is used to write to a synchronisation pipe. An error is returned
// if there was a problem writing the payload.
func writeSync(pipe io.Writer, sync syncType) error {
if err := utils.WriteJSON(pipe, syncT{sync}); err != nil {
return err
}
return nil
}
// readSync is used to read from a synchronisation pipe. An error is returned
// if we got a genericError, the pipe was closed, or we got an unexpected flag.
func readSync(pipe io.Reader, expected syncType) error {
var procSync syncT
if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
if err == io.EOF {
return fmt.Errorf("parent closed synchronisation channel")
}
if procSync.Type == procError {
var ierr genericError
if err := json.NewDecoder(pipe).Decode(&ierr); err != nil {
return fmt.Errorf("failed reading error from parent: %v", err)
}
return &ierr
}
if procSync.Type != expected {
return fmt.Errorf("invalid synchronisation flag from parent")
}
}
return nil
}
// parseSync runs the given callback function on each syncT received from the
// child. It will return once io.EOF is returned from the given pipe.
func parseSync(pipe io.Reader, fn func(*syncT) error) error {
dec := json.NewDecoder(pipe)
for {
var sync syncT
if err := dec.Decode(&sync); err != nil {
if err == io.EOF {
break
}
return err
}
// We handle this case outside fn for cleanliness reasons.
var ierr *genericError
if sync.Type == procError {
if err := dec.Decode(&ierr); err != nil && err != io.EOF {
return newSystemErrorWithCause(err, "decoding proc error from init")
}
if ierr != nil {
return ierr
}
// Programmer error.
panic("No error following JSON procError payload.")
}
if err := fn(&sync); err != nil {
return err
}
}
return nil
}

View File

@ -14,8 +14,10 @@ func GetProcessStartTime(pid int) (string, error) {
if err != nil {
return "", err
}
return parseStartTime(string(data))
}
parts := strings.Split(string(data), " ")
func parseStartTime(stat string) (string, error) {
// the starttime is located at pos 22
// from the man page
//
@ -23,5 +25,19 @@ func GetProcessStartTime(pid int) (string, error) {
// (22) The time the process started after system boot. In kernels before Linux 2.6, this
// value was expressed in jiffies. Since Linux 2.6, the value is expressed in clock ticks
// (divide by sysconf(_SC_CLK_TCK)).
return parts[22-1], nil // starts at 1
//
// NOTE:
// pos 2 could contain space and is inside `(` and `)`:
// (2) comm %s
// The filename of the executable, in parentheses.
// This is visible whether or not the executable is
// swapped out.
//
// the following is an example:
// 89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 29689 28896 0 3 146 32 76 19 20 0 1 0 2971844 52965376 3920 18446744073709551615 1 1 0 0 0 0 0 16781312 137447943 0 0 0 17 1 0 0 0 0 0 0 0 0 0 0 0 0 0
// get parts after last `)`:
s := strings.Split(stat, ")")
parts := strings.Split(strings.TrimSpace(s[len(s)-1]), " ")
return parts[22-3], nil // starts at 3 (after the filename pos `2`)
}

View File

@ -199,18 +199,16 @@ type ExecUser struct {
// files cannot be opened for any reason, the error is ignored and a nil
// io.Reader is passed instead.
func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath string) (*ExecUser, error) {
passwd, err := os.Open(passwdPath)
if err != nil {
passwd = nil
} else {
defer passwd.Close()
var passwd, group io.Reader
if passwdFile, err := os.Open(passwdPath); err == nil {
passwd = passwdFile
defer passwdFile.Close()
}
group, err := os.Open(groupPath)
if err != nil {
group = nil
} else {
defer group.Close()
if groupFile, err := os.Open(groupPath); err == nil {
group = groupFile
defer groupFile.Close()
}
return GetExecUser(userSpec, defaults, passwd, group)
@ -343,7 +341,7 @@ func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (
if len(groups) > 0 {
// First match wins, even if there's more than one matching entry.
user.Gid = groups[0].Gid
} else if groupArg != "" {
} else {
// If we can't find a group with the given name, the only other valid
// option is if it's a numeric group name with no associated entry in group.
@ -433,9 +431,11 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err
// that opens the groupPath given and gives it as an argument to
// GetAdditionalGroups.
func GetAdditionalGroupsPath(additionalGroups []string, groupPath string) ([]int, error) {
group, err := os.Open(groupPath)
if err == nil {
defer group.Close()
var group io.Reader
if groupFile, err := os.Open(groupPath); err == nil {
group = groupFile
defer groupFile.Close()
}
return GetAdditionalGroups(additionalGroups, group)
}

View File

@ -0,0 +1,148 @@
/*
* Copyright 2016 SUSE LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>
#include "cmsg.h"
#define error(fmt, ...) \
({ \
fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \
errno = ECOMM; \
goto err; /* return value */ \
})
/*
* Sends a file descriptor along the sockfd provided. Returns the return
* value of sendmsg(2). Any synchronisation and preparation of state
* should be done external to this (we expect the other side to be in
* recvfd() in the code).
*/
ssize_t sendfd(int sockfd, struct file_t file)
{
struct msghdr msg = {0};
struct iovec iov[1] = {0};
struct cmsghdr *cmsg;
int *fdptr;
int ret;
union {
char buf[CMSG_SPACE(sizeof(file.fd))];
struct cmsghdr align;
} u;
/*
* We need to send some other data along with the ancillary data,
* otherwise the other side won't recieve any data. This is very
* well-hidden in the documentation (and only applies to
* SOCK_STREAM). See the bottom part of unix(7).
*/
iov[0].iov_base = file.name;
iov[0].iov_len = strlen(file.name) + 1;
msg.msg_name = NULL;
msg.msg_namelen = 0;
msg.msg_iov = iov;
msg.msg_iovlen = 1;
msg.msg_control = u.buf;
msg.msg_controllen = sizeof(u.buf);
cmsg = CMSG_FIRSTHDR(&msg);
cmsg->cmsg_level = SOL_SOCKET;
cmsg->cmsg_type = SCM_RIGHTS;
cmsg->cmsg_len = CMSG_LEN(sizeof(int));
fdptr = (int *) CMSG_DATA(cmsg);
memcpy(fdptr, &file.fd, sizeof(int));
return sendmsg(sockfd, &msg, 0);
}
/*
* Receives a file descriptor from the sockfd provided. Returns the file
* descriptor as sent from sendfd(). It will return the file descriptor
* or die (literally) trying. Any synchronisation and preparation of
* state should be done external to this (we expect the other side to be
* in sendfd() in the code).
*/
struct file_t recvfd(int sockfd)
{
struct msghdr msg = {0};
struct iovec iov[1] = {0};
struct cmsghdr *cmsg;
struct file_t file = {0};
int *fdptr;
int olderrno;
union {
char buf[CMSG_SPACE(sizeof(file.fd))];
struct cmsghdr align;
} u;
/* Allocate a buffer. */
/* TODO: Make this dynamic with MSG_PEEK. */
file.name = malloc(TAG_BUFFER);
if (!file.name)
error("recvfd: failed to allocate file.tag buffer\n");
/*
* We need to "recieve" the non-ancillary data even though we don't
* plan to use it at all. Otherwise, things won't work as expected.
* See unix(7) and other well-hidden documentation.
*/
iov[0].iov_base = file.name;
iov[0].iov_len = TAG_BUFFER;
msg.msg_name = NULL;
msg.msg_namelen = 0;
msg.msg_iov = iov;
msg.msg_iovlen = 1;
msg.msg_control = u.buf;
msg.msg_controllen = sizeof(u.buf);
ssize_t ret = recvmsg(sockfd, &msg, 0);
if (ret < 0)
goto err;
cmsg = CMSG_FIRSTHDR(&msg);
if (!cmsg)
error("recvfd: got NULL from CMSG_FIRSTHDR");
if (cmsg->cmsg_level != SOL_SOCKET)
error("recvfd: expected SOL_SOCKET in cmsg: %d", cmsg->cmsg_level);
if (cmsg->cmsg_type != SCM_RIGHTS)
error("recvfd: expected SCM_RIGHTS in cmsg: %d", cmsg->cmsg_type);
if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
error("recvfd: expected correct CMSG_LEN in cmsg: %lu", (unsigned long)cmsg->cmsg_len);
fdptr = (int *) CMSG_DATA(cmsg);
if (!fdptr || *fdptr < 0)
error("recvfd: recieved invalid pointer");
file.fd = *fdptr;
return file;
err:
olderrno = errno;
free(file.name);
errno = olderrno;
return (struct file_t){0};
}

View File

@ -0,0 +1,57 @@
// +build linux
package utils
/*
* Copyright 2016 SUSE LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
#include <errno.h>
#include <stdlib.h>
#include "cmsg.h"
*/
import "C"
import (
"os"
"unsafe"
)
// RecvFd waits for a file descriptor to be sent over the given AF_UNIX
// socket. The file name of the remote file descriptor will be recreated
// locally (it is sent as non-auxiliary data in the same payload).
func RecvFd(socket *os.File) (*os.File, error) {
file, err := C.recvfd(C.int(socket.Fd()))
if err != nil {
return nil, err
}
defer C.free(unsafe.Pointer(file.name))
return os.NewFile(uintptr(file.fd), C.GoString(file.name)), nil
}
// SendFd sends a file descriptor over the given AF_UNIX socket. In
// addition, the file.Name() of the given file will also be sent as
// non-auxiliary data in the same payload (allowing to send contextual
// information for a file descriptor).
func SendFd(socket, file *os.File) error {
var cfile C.struct_file_t
cfile.fd = C.int(file.Fd())
cfile.name = C.CString(file.Name())
defer C.free(unsafe.Pointer(cfile.name))
_, err := C.sendfd(C.int(socket.Fd()), cfile)
return err
}

View File

@ -0,0 +1,36 @@
/*
* Copyright 2016 SUSE LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#if !defined(CMSG_H)
#define CMSG_H
#include <sys/types.h>
/* TODO: Implement this properly with MSG_PEEK. */
#define TAG_BUFFER 4096
/* This mirrors Go's (*os.File). */
struct file_t {
char *name;
int fd;
};
struct file_t recvfd(int sockfd);
ssize_t sendfd(int sockfd, struct file_t file);
#endif /* !defined(CMSG_H) */

View File

@ -9,6 +9,7 @@ import (
"path/filepath"
"strings"
"syscall"
"unsafe"
)
const (
@ -102,7 +103,7 @@ func SearchLabels(labels []string, query string) string {
}
// Annotations returns the bundle path and user defined annotations from the
// libcontianer state. We need to remove the bundle because that is a label
// libcontainer state. We need to remove the bundle because that is a label
// added by libcontainer.
func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
userAnnotations = make(map[string]string)
@ -119,3 +120,7 @@ func Annotations(labels []string) (bundle string, userAnnotations map[string]str
}
return
}
func GetIntSize() int {
return int(unsafe.Sizeof(1))
}

View File

@ -4,6 +4,7 @@ package utils
import (
"io/ioutil"
"os"
"strconv"
"syscall"
)
@ -31,3 +32,12 @@ func CloseExecFrom(minFd int) error {
}
return nil
}
// NewSockPair returns a new unix socket pair
func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0)
if err != nil {
return nil, nil, err
}
return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
}

191
vendor/github.com/opencontainers/runtime-spec/LICENSE generated vendored Normal file
View File

@ -0,0 +1,191 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
Copyright 2015 The Linux Foundation.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -0,0 +1,563 @@
package specs
import "os"
// Spec is the base configuration for the container.
type Spec struct {
// Version of the Open Container Runtime Specification with which the bundle complies.
Version string `json:"ociVersion"`
// Platform specifies the configuration's target platform.
Platform Platform `json:"platform"`
// Process configures the container process.
Process Process `json:"process"`
// Root configures the container's root filesystem.
Root Root `json:"root"`
// Hostname configures the container's hostname.
Hostname string `json:"hostname,omitempty"`
// Mounts configures additional mounts (on top of Root).
Mounts []Mount `json:"mounts,omitempty"`
// Hooks configures callbacks for container lifecycle events.
Hooks *Hooks `json:"hooks,omitempty"`
// Annotations contains arbitrary metadata for the container.
Annotations map[string]string `json:"annotations,omitempty"`
// Linux is platform specific configuration for Linux based containers.
Linux *Linux `json:"linux,omitempty" platform:"linux"`
// Solaris is platform specific configuration for Solaris containers.
Solaris *Solaris `json:"solaris,omitempty" platform:"solaris"`
// Windows is platform specific configuration for Windows based containers, including Hyper-V containers.
Windows *Windows `json:"windows,omitempty" platform:"windows"`
}
// Process contains information to start a specific application inside the container.
type Process struct {
// Terminal creates an interactive terminal for the container.
Terminal bool `json:"terminal,omitempty"`
// ConsoleSize specifies the size of the console.
ConsoleSize Box `json:"consoleSize,omitempty"`
// User specifies user information for the process.
User User `json:"user"`
// Args specifies the binary and arguments for the application to execute.
Args []string `json:"args"`
// Env populates the process environment for the process.
Env []string `json:"env,omitempty"`
// Cwd is the current working directory for the process and must be
// relative to the container's root.
Cwd string `json:"cwd"`
// Capabilities are Linux capabilities that are kept for the process.
Capabilities *LinuxCapabilities `json:"capabilities,omitempty" platform:"linux"`
// Rlimits specifies rlimit options to apply to the process.
Rlimits []LinuxRlimit `json:"rlimits,omitempty" platform:"linux"`
// NoNewPrivileges controls whether additional privileges could be gained by processes in the container.
NoNewPrivileges bool `json:"noNewPrivileges,omitempty" platform:"linux"`
// ApparmorProfile specifies the apparmor profile for the container.
ApparmorProfile string `json:"apparmorProfile,omitempty" platform:"linux"`
// SelinuxLabel specifies the selinux context that the container process is run as.
SelinuxLabel string `json:"selinuxLabel,omitempty" platform:"linux"`
}
// LinuxCapabilities specifies the whitelist of capabilities that are kept for a process.
// http://man7.org/linux/man-pages/man7/capabilities.7.html
type LinuxCapabilities struct {
// Bounding is the set of capabilities checked by the kernel.
Bounding []string `json:"bounding,omitempty" platform:"linux"`
// Effective is the set of capabilities checked by the kernel.
Effective []string `json:"effective,omitempty" platform:"linux"`
// Inheritable is the capabilities preserved across execve.
Inheritable []string `json:"inheritable,omitempty" platform:"linux"`
// Permitted is the limiting superset for effective capabilities.
Permitted []string `json:"permitted,omitempty" platform:"linux"`
// Ambient is the ambient set of capabilities that are kept.
Ambient []string `json:"ambient,omitempty" platform:"linux"`
}
// Box specifies dimensions of a rectangle. Used for specifying the size of a console.
type Box struct {
// Height is the vertical dimension of a box.
Height uint `json:"height"`
// Width is the horizontal dimension of a box.
Width uint `json:"width"`
}
// User specifies specific user (and group) information for the container process.
type User struct {
// UID is the user id.
UID uint32 `json:"uid" platform:"linux,solaris"`
// GID is the group id.
GID uint32 `json:"gid" platform:"linux,solaris"`
// AdditionalGids are additional group ids set for the container's process.
AdditionalGids []uint32 `json:"additionalGids,omitempty" platform:"linux,solaris"`
// Username is the user name.
Username string `json:"username,omitempty" platform:"windows"`
}
// Root contains information about the container's root filesystem on the host.
type Root struct {
// Path is the absolute path to the container's root filesystem.
Path string `json:"path"`
// Readonly makes the root filesystem for the container readonly before the process is executed.
Readonly bool `json:"readonly,omitempty"`
}
// Platform specifies OS and arch information for the host system that the container
// is created for.
type Platform struct {
// OS is the operating system.
OS string `json:"os"`
// Arch is the architecture
Arch string `json:"arch"`
}
// Mount specifies a mount for a container.
type Mount struct {
// Destination is the path where the mount will be placed relative to the container's root. The path and child directories MUST exist, a runtime MUST NOT create directories automatically to a mount point.
Destination string `json:"destination"`
// Type specifies the mount kind.
Type string `json:"type,omitempty"`
// Source specifies the source path of the mount. In the case of bind mounts on
// Linux based systems this would be the file on the host.
Source string `json:"source,omitempty"`
// Options are fstab style mount options.
Options []string `json:"options,omitempty"`
}
// Hook specifies a command that is run at a particular event in the lifecycle of a container
type Hook struct {
Path string `json:"path"`
Args []string `json:"args,omitempty"`
Env []string `json:"env,omitempty"`
Timeout *int `json:"timeout,omitempty"`
}
// Hooks for container setup and teardown
type Hooks struct {
// Prestart is a list of hooks to be run before the container process is executed.
// On Linux, they are run after the container namespaces are created.
Prestart []Hook `json:"prestart,omitempty"`
// Poststart is a list of hooks to be run after the container process is started.
Poststart []Hook `json:"poststart,omitempty"`
// Poststop is a list of hooks to be run after the container process exits.
Poststop []Hook `json:"poststop,omitempty"`
}
// Linux contains platform specific configuration for Linux based containers.
type Linux struct {
// UIDMapping specifies user mappings for supporting user namespaces on Linux.
UIDMappings []LinuxIDMapping `json:"uidMappings,omitempty"`
// GIDMapping specifies group mappings for supporting user namespaces on Linux.
GIDMappings []LinuxIDMapping `json:"gidMappings,omitempty"`
// Sysctl are a set of key value pairs that are set for the container on start
Sysctl map[string]string `json:"sysctl,omitempty"`
// Resources contain cgroup information for handling resource constraints
// for the container
Resources *LinuxResources `json:"resources,omitempty"`
// CgroupsPath specifies the path to cgroups that are created and/or joined by the container.
// The path is expected to be relative to the cgroups mountpoint.
// If resources are specified, the cgroups at CgroupsPath will be updated based on resources.
CgroupsPath string `json:"cgroupsPath,omitempty"`
// Namespaces contains the namespaces that are created and/or joined by the container
Namespaces []LinuxNamespace `json:"namespaces,omitempty"`
// Devices are a list of device nodes that are created for the container
Devices []LinuxDevice `json:"devices,omitempty"`
// Seccomp specifies the seccomp security settings for the container.
Seccomp *LinuxSeccomp `json:"seccomp,omitempty"`
// RootfsPropagation is the rootfs mount propagation mode for the container.
RootfsPropagation string `json:"rootfsPropagation,omitempty"`
// MaskedPaths masks over the provided paths inside the container.
MaskedPaths []string `json:"maskedPaths,omitempty"`
// ReadonlyPaths sets the provided paths as RO inside the container.
ReadonlyPaths []string `json:"readonlyPaths,omitempty"`
// MountLabel specifies the selinux context for the mounts in the container.
MountLabel string `json:"mountLabel,omitempty"`
// IntelRdt contains Intel Resource Director Technology (RDT) information
// for handling resource constraints (e.g., L3 cache) for the container
IntelRdt *LinuxIntelRdt `json:"intelRdt,omitempty"`
}
// LinuxNamespace is the configuration for a Linux namespace
type LinuxNamespace struct {
// Type is the type of Linux namespace
Type LinuxNamespaceType `json:"type"`
// Path is a path to an existing namespace persisted on disk that can be joined
// and is of the same type
Path string `json:"path,omitempty"`
}
// LinuxNamespaceType is one of the Linux namespaces
type LinuxNamespaceType string
const (
// PIDNamespace for isolating process IDs
PIDNamespace LinuxNamespaceType = "pid"
// NetworkNamespace for isolating network devices, stacks, ports, etc
NetworkNamespace = "network"
// MountNamespace for isolating mount points
MountNamespace = "mount"
// IPCNamespace for isolating System V IPC, POSIX message queues
IPCNamespace = "ipc"
// UTSNamespace for isolating hostname and NIS domain name
UTSNamespace = "uts"
// UserNamespace for isolating user and group IDs
UserNamespace = "user"
// CgroupNamespace for isolating cgroup hierarchies
CgroupNamespace = "cgroup"
)
// LinuxIDMapping specifies UID/GID mappings
type LinuxIDMapping struct {
// HostID is the starting UID/GID on the host to be mapped to 'ContainerID'
HostID uint32 `json:"hostID"`
// ContainerID is the starting UID/GID in the container
ContainerID uint32 `json:"containerID"`
// Size is the number of IDs to be mapped
Size uint32 `json:"size"`
}
// LinuxRlimit type and restrictions
type LinuxRlimit struct {
// Type of the rlimit to set
Type string `json:"type"`
// Hard is the hard limit for the specified type
Hard uint64 `json:"hard"`
// Soft is the soft limit for the specified type
Soft uint64 `json:"soft"`
}
// LinuxHugepageLimit structure corresponds to limiting kernel hugepages
type LinuxHugepageLimit struct {
// Pagesize is the hugepage size
Pagesize string `json:"pageSize"`
// Limit is the limit of "hugepagesize" hugetlb usage
Limit uint64 `json:"limit"`
}
// LinuxInterfacePriority for network interfaces
type LinuxInterfacePriority struct {
// Name is the name of the network interface
Name string `json:"name"`
// Priority for the interface
Priority uint32 `json:"priority"`
}
// linuxBlockIODevice holds major:minor format supported in blkio cgroup
type linuxBlockIODevice struct {
// Major is the device's major number.
Major int64 `json:"major"`
// Minor is the device's minor number.
Minor int64 `json:"minor"`
}
// LinuxWeightDevice struct holds a `major:minor weight` pair for blkioWeightDevice
type LinuxWeightDevice struct {
linuxBlockIODevice
// Weight is the bandwidth rate for the device, range is from 10 to 1000
Weight *uint16 `json:"weight,omitempty"`
// LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, CFQ scheduler only
LeafWeight *uint16 `json:"leafWeight,omitempty"`
}
// LinuxThrottleDevice struct holds a `major:minor rate_per_second` pair
type LinuxThrottleDevice struct {
linuxBlockIODevice
// Rate is the IO rate limit per cgroup per device
Rate uint64 `json:"rate"`
}
// LinuxBlockIO for Linux cgroup 'blkio' resource management
type LinuxBlockIO struct {
// Specifies per cgroup weight, range is from 10 to 1000
Weight *uint16 `json:"blkioWeight,omitempty"`
// Specifies tasks' weight in the given cgroup while competing with the cgroup's child cgroups, range is from 10 to 1000, CFQ scheduler only
LeafWeight *uint16 `json:"blkioLeafWeight,omitempty"`
// Weight per cgroup per device, can override BlkioWeight
WeightDevice []LinuxWeightDevice `json:"blkioWeightDevice,omitempty"`
// IO read rate limit per cgroup per device, bytes per second
ThrottleReadBpsDevice []LinuxThrottleDevice `json:"blkioThrottleReadBpsDevice,omitempty"`
// IO write rate limit per cgroup per device, bytes per second
ThrottleWriteBpsDevice []LinuxThrottleDevice `json:"blkioThrottleWriteBpsDevice,omitempty"`
// IO read rate limit per cgroup per device, IO per second
ThrottleReadIOPSDevice []LinuxThrottleDevice `json:"blkioThrottleReadIOPSDevice,omitempty"`
// IO write rate limit per cgroup per device, IO per second
ThrottleWriteIOPSDevice []LinuxThrottleDevice `json:"blkioThrottleWriteIOPSDevice,omitempty"`
}
// LinuxMemory for Linux cgroup 'memory' resource management
type LinuxMemory struct {
// Memory limit (in bytes).
Limit *uint64 `json:"limit,omitempty"`
// Memory reservation or soft_limit (in bytes).
Reservation *uint64 `json:"reservation,omitempty"`
// Total memory limit (memory + swap).
Swap *uint64 `json:"swap,omitempty"`
// Kernel memory limit (in bytes).
Kernel *uint64 `json:"kernel,omitempty"`
// Kernel memory limit for tcp (in bytes)
KernelTCP *uint64 `json:"kernelTCP,omitempty"`
// How aggressive the kernel will swap memory pages. Range from 0 to 100.
Swappiness *uint64 `json:"swappiness,omitempty"`
}
// LinuxCPU for Linux cgroup 'cpu' resource management
type LinuxCPU struct {
// CPU shares (relative weight (ratio) vs. other cgroups with cpu shares).
Shares *uint64 `json:"shares,omitempty"`
// CPU hardcap limit (in usecs). Allowed cpu time in a given period.
Quota *int64 `json:"quota,omitempty"`
// CPU period to be used for hardcapping (in usecs).
Period *uint64 `json:"period,omitempty"`
// How much time realtime scheduling may use (in usecs).
RealtimeRuntime *int64 `json:"realtimeRuntime,omitempty"`
// CPU period to be used for realtime scheduling (in usecs).
RealtimePeriod *uint64 `json:"realtimePeriod,omitempty"`
// CPUs to use within the cpuset. Default is to use any CPU available.
Cpus string `json:"cpus,omitempty"`
// List of memory nodes in the cpuset. Default is to use any available memory node.
Mems string `json:"mems,omitempty"`
}
// LinuxPids for Linux cgroup 'pids' resource management (Linux 4.3)
type LinuxPids struct {
// Maximum number of PIDs. Default is "no limit".
Limit int64 `json:"limit"`
}
// LinuxNetwork identification and priority configuration
type LinuxNetwork struct {
// Set class identifier for container's network packets
ClassID *uint32 `json:"classID,omitempty"`
// Set priority of network traffic for container
Priorities []LinuxInterfacePriority `json:"priorities,omitempty"`
}
// LinuxResources has container runtime resource constraints
type LinuxResources struct {
// Devices configures the device whitelist.
Devices []LinuxDeviceCgroup `json:"devices,omitempty"`
// DisableOOMKiller disables the OOM killer for out of memory conditions
DisableOOMKiller *bool `json:"disableOOMKiller,omitempty"`
// Specify an oom_score_adj for the container.
OOMScoreAdj *int `json:"oomScoreAdj,omitempty"`
// Memory restriction configuration
Memory *LinuxMemory `json:"memory,omitempty"`
// CPU resource restriction configuration
CPU *LinuxCPU `json:"cpu,omitempty"`
// Task resource restriction configuration.
Pids *LinuxPids `json:"pids,omitempty"`
// BlockIO restriction configuration
BlockIO *LinuxBlockIO `json:"blockIO,omitempty"`
// Hugetlb limit (in bytes)
HugepageLimits []LinuxHugepageLimit `json:"hugepageLimits,omitempty"`
// Network restriction configuration
Network *LinuxNetwork `json:"network,omitempty"`
}
// LinuxDevice represents the mknod information for a Linux special device file
type LinuxDevice struct {
// Path to the device.
Path string `json:"path"`
// Device type, block, char, etc.
Type string `json:"type"`
// Major is the device's major number.
Major int64 `json:"major"`
// Minor is the device's minor number.
Minor int64 `json:"minor"`
// FileMode permission bits for the device.
FileMode *os.FileMode `json:"fileMode,omitempty"`
// UID of the device.
UID *uint32 `json:"uid,omitempty"`
// Gid of the device.
GID *uint32 `json:"gid,omitempty"`
}
// LinuxDeviceCgroup represents a device rule for the whitelist controller
type LinuxDeviceCgroup struct {
// Allow or deny
Allow bool `json:"allow"`
// Device type, block, char, etc.
Type string `json:"type,omitempty"`
// Major is the device's major number.
Major *int64 `json:"major,omitempty"`
// Minor is the device's minor number.
Minor *int64 `json:"minor,omitempty"`
// Cgroup access permissions format, rwm.
Access string `json:"access,omitempty"`
}
// Solaris contains platform specific configuration for Solaris application containers.
type Solaris struct {
// SMF FMRI which should go "online" before we start the container process.
Milestone string `json:"milestone,omitempty"`
// Maximum set of privileges any process in this container can obtain.
LimitPriv string `json:"limitpriv,omitempty"`
// The maximum amount of shared memory allowed for this container.
MaxShmMemory string `json:"maxShmMemory,omitempty"`
// Specification for automatic creation of network resources for this container.
Anet []SolarisAnet `json:"anet,omitempty"`
// Set limit on the amount of CPU time that can be used by container.
CappedCPU *SolarisCappedCPU `json:"cappedCPU,omitempty"`
// The physical and swap caps on the memory that can be used by this container.
CappedMemory *SolarisCappedMemory `json:"cappedMemory,omitempty"`
}
// SolarisCappedCPU allows users to set limit on the amount of CPU time that can be used by container.
type SolarisCappedCPU struct {
Ncpus string `json:"ncpus,omitempty"`
}
// SolarisCappedMemory allows users to set the physical and swap caps on the memory that can be used by this container.
type SolarisCappedMemory struct {
Physical string `json:"physical,omitempty"`
Swap string `json:"swap,omitempty"`
}
// SolarisAnet provides the specification for automatic creation of network resources for this container.
type SolarisAnet struct {
// Specify a name for the automatically created VNIC datalink.
Linkname string `json:"linkname,omitempty"`
// Specify the link over which the VNIC will be created.
Lowerlink string `json:"lowerLink,omitempty"`
// The set of IP addresses that the container can use.
Allowedaddr string `json:"allowedAddress,omitempty"`
// Specifies whether allowedAddress limitation is to be applied to the VNIC.
Configallowedaddr string `json:"configureAllowedAddress,omitempty"`
// The value of the optional default router.
Defrouter string `json:"defrouter,omitempty"`
// Enable one or more types of link protection.
Linkprotection string `json:"linkProtection,omitempty"`
// Set the VNIC's macAddress
Macaddress string `json:"macAddress,omitempty"`
}
// Windows defines the runtime configuration for Windows based containers, including Hyper-V containers.
type Windows struct {
// Resources contains information for handling resource constraints for the container.
Resources *WindowsResources `json:"resources,omitempty"`
}
// WindowsResources has container runtime resource constraints for containers running on Windows.
type WindowsResources struct {
// Memory restriction configuration.
Memory *WindowsMemoryResources `json:"memory,omitempty"`
// CPU resource restriction configuration.
CPU *WindowsCPUResources `json:"cpu,omitempty"`
// Storage restriction configuration.
Storage *WindowsStorageResources `json:"storage,omitempty"`
// Network restriction configuration.
Network *WindowsNetworkResources `json:"network,omitempty"`
}
// WindowsMemoryResources contains memory resource management settings.
type WindowsMemoryResources struct {
// Memory limit in bytes.
Limit *uint64 `json:"limit,omitempty"`
// Memory reservation in bytes.
Reservation *uint64 `json:"reservation,omitempty"`
}
// WindowsCPUResources contains CPU resource management settings.
type WindowsCPUResources struct {
// Number of CPUs available to the container.
Count *uint64 `json:"count,omitempty"`
// CPU shares (relative weight to other containers with cpu shares). Range is from 1 to 10000.
Shares *uint16 `json:"shares,omitempty"`
// Percent of available CPUs usable by the container.
Percent *uint8 `json:"percent,omitempty"`
}
// WindowsStorageResources contains storage resource management settings.
type WindowsStorageResources struct {
// Specifies maximum Iops for the system drive.
Iops *uint64 `json:"iops,omitempty"`
// Specifies maximum bytes per second for the system drive.
Bps *uint64 `json:"bps,omitempty"`
// Sandbox size specifies the minimum size of the system drive in bytes.
SandboxSize *uint64 `json:"sandboxSize,omitempty"`
}
// WindowsNetworkResources contains network resource management settings.
type WindowsNetworkResources struct {
// EgressBandwidth is the maximum egress bandwidth in bytes per second.
EgressBandwidth *uint64 `json:"egressBandwidth,omitempty"`
}
// LinuxSeccomp represents syscall restrictions
type LinuxSeccomp struct {
DefaultAction LinuxSeccompAction `json:"defaultAction"`
Architectures []Arch `json:"architectures,omitempty"`
Syscalls []LinuxSyscall `json:"syscalls"`
}
// Arch used for additional architectures
type Arch string
// Additional architectures permitted to be used for system calls
// By default only the native architecture of the kernel is permitted
const (
ArchX86 Arch = "SCMP_ARCH_X86"
ArchX86_64 Arch = "SCMP_ARCH_X86_64"
ArchX32 Arch = "SCMP_ARCH_X32"
ArchARM Arch = "SCMP_ARCH_ARM"
ArchAARCH64 Arch = "SCMP_ARCH_AARCH64"
ArchMIPS Arch = "SCMP_ARCH_MIPS"
ArchMIPS64 Arch = "SCMP_ARCH_MIPS64"
ArchMIPS64N32 Arch = "SCMP_ARCH_MIPS64N32"
ArchMIPSEL Arch = "SCMP_ARCH_MIPSEL"
ArchMIPSEL64 Arch = "SCMP_ARCH_MIPSEL64"
ArchMIPSEL64N32 Arch = "SCMP_ARCH_MIPSEL64N32"
ArchPPC Arch = "SCMP_ARCH_PPC"
ArchPPC64 Arch = "SCMP_ARCH_PPC64"
ArchPPC64LE Arch = "SCMP_ARCH_PPC64LE"
ArchS390 Arch = "SCMP_ARCH_S390"
ArchS390X Arch = "SCMP_ARCH_S390X"
ArchPARISC Arch = "SCMP_ARCH_PARISC"
ArchPARISC64 Arch = "SCMP_ARCH_PARISC64"
)
// LinuxSeccompAction taken upon Seccomp rule match
type LinuxSeccompAction string
// Define actions for Seccomp rules
const (
ActKill LinuxSeccompAction = "SCMP_ACT_KILL"
ActTrap LinuxSeccompAction = "SCMP_ACT_TRAP"
ActErrno LinuxSeccompAction = "SCMP_ACT_ERRNO"
ActTrace LinuxSeccompAction = "SCMP_ACT_TRACE"
ActAllow LinuxSeccompAction = "SCMP_ACT_ALLOW"
)
// LinuxSeccompOperator used to match syscall arguments in Seccomp
type LinuxSeccompOperator string
// Define operators for syscall arguments in Seccomp
const (
OpNotEqual LinuxSeccompOperator = "SCMP_CMP_NE"
OpLessThan LinuxSeccompOperator = "SCMP_CMP_LT"
OpLessEqual LinuxSeccompOperator = "SCMP_CMP_LE"
OpEqualTo LinuxSeccompOperator = "SCMP_CMP_EQ"
OpGreaterEqual LinuxSeccompOperator = "SCMP_CMP_GE"
OpGreaterThan LinuxSeccompOperator = "SCMP_CMP_GT"
OpMaskedEqual LinuxSeccompOperator = "SCMP_CMP_MASKED_EQ"
)
// LinuxSeccompArg used for matching specific syscall arguments in Seccomp
type LinuxSeccompArg struct {
Index uint `json:"index"`
Value uint64 `json:"value"`
ValueTwo uint64 `json:"valueTwo"`
Op LinuxSeccompOperator `json:"op"`
}
// LinuxSyscall is used to match a syscall in Seccomp
type LinuxSyscall struct {
Names []string `json:"names"`
Action LinuxSeccompAction `json:"action"`
Args []LinuxSeccompArg `json:"args,omitempty"`
}
// LinuxIntelRdt has container runtime resource constraints
// for Intel RDT/CAT which introduced in Linux 4.10 kernel
type LinuxIntelRdt struct {
// The schema for L3 cache id and capacity bitmask (CBM)
// Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
L3CacheSchema string `json:"l3CacheSchema,omitempty"`
}

View File

@ -0,0 +1,17 @@
package specs
// State holds information about the runtime state of the container.
type State struct {
// Version is the version of the specification that is supported.
Version string `json:"ociVersion"`
// ID is the container ID
ID string `json:"id"`
// Status is the runtime status of the container.
Status string `json:"status"`
// Pid is the process ID for the container process.
Pid int `json:"pid"`
// Bundle is the path to the container's bundle directory.
Bundle string `json:"bundle"`
// Annotations are key values associated with the container.
Annotations map[string]string `json:"annotations,omitempty"`
}

View File

@ -0,0 +1,18 @@
package specs
import "fmt"
const (
// VersionMajor is for an API incompatible changes
VersionMajor = 1
// VersionMinor is for functionality in a backwards-compatible manner
VersionMinor = 0
// VersionPatch is for backwards-compatible bug fixes
VersionPatch = 0
// VersionDev indicates development branch. Releases will be empty string.
VersionDev = "-rc5-dev"
)
// Version is the specification version that the package types support.
var Version = fmt.Sprintf("%d.%d.%d%s", VersionMajor, VersionMinor, VersionPatch, VersionDev)

View File

@ -10,42 +10,42 @@ package capability
type Capabilities interface {
// Get check whether a capability present in the given
// capabilities set. The 'which' value should be one of EFFECTIVE,
// PERMITTED, INHERITABLE or BOUNDING.
// PERMITTED, INHERITABLE, BOUNDING or AMBIENT.
Get(which CapType, what Cap) bool
// Empty check whether all capability bits of the given capabilities
// set are zero. The 'which' value should be one of EFFECTIVE,
// PERMITTED, INHERITABLE or BOUNDING.
// PERMITTED, INHERITABLE, BOUNDING or AMBIENT.
Empty(which CapType) bool
// Full check whether all capability bits of the given capabilities
// set are one. The 'which' value should be one of EFFECTIVE,
// PERMITTED, INHERITABLE or BOUNDING.
// PERMITTED, INHERITABLE, BOUNDING or AMBIENT.
Full(which CapType) bool
// Set sets capabilities of the given capabilities sets. The
// 'which' value should be one or combination (OR'ed) of EFFECTIVE,
// PERMITTED, INHERITABLE or BOUNDING.
// PERMITTED, INHERITABLE, BOUNDING or AMBIENT.
Set(which CapType, caps ...Cap)
// Unset unsets capabilities of the given capabilities sets. The
// 'which' value should be one or combination (OR'ed) of EFFECTIVE,
// PERMITTED, INHERITABLE or BOUNDING.
// PERMITTED, INHERITABLE, BOUNDING or AMBIENT.
Unset(which CapType, caps ...Cap)
// Fill sets all bits of the given capabilities kind to one. The
// 'kind' value should be one or combination (OR'ed) of CAPS or
// BOUNDS.
// 'kind' value should be one or combination (OR'ed) of CAPS,
// BOUNDS or AMBS.
Fill(kind CapType)
// Clear sets all bits of the given capabilities kind to zero. The
// 'kind' value should be one or combination (OR'ed) of CAPS or
// BOUNDS.
// 'kind' value should be one or combination (OR'ed) of CAPS,
// BOUNDS or AMBS.
Clear(kind CapType)
// String return current capabilities state of the given capabilities
// set as string. The 'which' value should be one of EFFECTIVE,
// PERMITTED, INHERITABLE or BOUNDING.
// PERMITTED, INHERITABLE BOUNDING or AMBIENT
StringCap(which CapType) string
// String return current capabilities state as string.

View File

@ -238,6 +238,7 @@ type capsV3 struct {
hdr capHeader
data [2]capData
bounds [2]uint32
ambient [2]uint32
}
func (c *capsV3) Get(which CapType, what Cap) bool {
@ -256,6 +257,8 @@ func (c *capsV3) Get(which CapType, what Cap) bool {
return (1<<uint(what))&c.data[i].inheritable != 0
case BOUNDING:
return (1<<uint(what))&c.bounds[i] != 0
case AMBIENT:
return (1<<uint(what))&c.ambient[i] != 0
}
return false
@ -275,6 +278,9 @@ func (c *capsV3) getData(which CapType, dest []uint32) {
case BOUNDING:
dest[0] = c.bounds[0]
dest[1] = c.bounds[1]
case AMBIENT:
dest[0] = c.ambient[0]
dest[1] = c.ambient[1]
}
}
@ -313,6 +319,9 @@ func (c *capsV3) Set(which CapType, caps ...Cap) {
if which&BOUNDING != 0 {
c.bounds[i] |= 1 << uint(what)
}
if which&AMBIENT != 0 {
c.ambient[i] |= 1 << uint(what)
}
}
}
@ -336,6 +345,9 @@ func (c *capsV3) Unset(which CapType, caps ...Cap) {
if which&BOUNDING != 0 {
c.bounds[i] &= ^(1 << uint(what))
}
if which&AMBIENT != 0 {
c.ambient[i] &= ^(1 << uint(what))
}
}
}
@ -353,6 +365,10 @@ func (c *capsV3) Fill(kind CapType) {
c.bounds[0] = 0xffffffff
c.bounds[1] = 0xffffffff
}
if kind&AMBS == AMBS {
c.ambient[0] = 0xffffffff
c.ambient[1] = 0xffffffff
}
}
func (c *capsV3) Clear(kind CapType) {
@ -369,6 +385,10 @@ func (c *capsV3) Clear(kind CapType) {
c.bounds[0] = 0
c.bounds[1] = 0
}
if kind&AMBS == AMBS {
c.ambient[0] = 0
c.ambient[1] = 0
}
}
func (c *capsV3) StringCap(which CapType) (ret string) {
@ -410,6 +430,10 @@ func (c *capsV3) Load() (err error) {
fmt.Sscanf(line[4:], "nd: %08x%08x", &c.bounds[1], &c.bounds[0])
break
}
if strings.HasPrefix(line, "CapA") {
fmt.Sscanf(line[4:], "mb: %08x%08x", &c.ambient[1], &c.ambient[0])
break
}
}
f.Close()
@ -442,7 +466,25 @@ func (c *capsV3) Apply(kind CapType) (err error) {
}
if kind&CAPS == CAPS {
return capset(&c.hdr, &c.data[0])
err = capset(&c.hdr, &c.data[0])
if err != nil {
return
}
}
if kind&AMBS == AMBS {
for i := Cap(0); i <= CAP_LAST_CAP; i++ {
action := pr_CAP_AMBIENT_LOWER
if c.Get(AMBIENT, i) {
action = pr_CAP_AMBIENT_RAISE
}
err := prctl(pr_CAP_AMBIENT, action, uintptr(i), 0, 0)
// Ignore EINVAL as not supported on kernels before 4.3
if errno, ok := err.(syscall.Errno); ok && errno == syscall.EINVAL {
err = nil
continue
}
}
}
return

View File

@ -20,6 +20,8 @@ func (c CapType) String() string {
return "bounding"
case CAPS:
return "caps"
case AMBIENT:
return "ambient"
}
return "unknown"
}
@ -29,9 +31,11 @@ const (
PERMITTED
INHERITABLE
BOUNDING
AMBIENT
CAPS = EFFECTIVE | PERMITTED | INHERITABLE
BOUNDS = BOUNDING
AMBS = AMBIENT
)
//go:generate go run enumgen/gen.go

View File

@ -38,6 +38,15 @@ func capset(hdr *capHeader, data *capData) (err error) {
return
}
// not yet in syscall
const (
pr_CAP_AMBIENT = 47
pr_CAP_AMBIENT_IS_SET = uintptr(1)
pr_CAP_AMBIENT_RAISE = uintptr(2)
pr_CAP_AMBIENT_LOWER = uintptr(3)
pr_CAP_AMBIENT_CLEAR_ALL = uintptr(4)
)
func prctl(option int, arg2, arg3, arg4, arg5 uintptr) (err error) {
_, _, e1 := syscall.Syscall6(syscall.SYS_PRCTL, uintptr(option), arg2, arg3, arg4, arg5, 0)
if e1 != 0 {