Merge pull request #1726 from sjenning/bump-runc

Bump runc
Authored by Derek Carr on 2017-08-22 18:05:50 -04:00; committed via GitHub.
commit 48f6b9981a
66 changed files with 3371 additions and 763 deletions

Godeps/Godeps.json (generated)

@@ -380,85 +380,99 @@
   "Comment": "v2.1.1-5-g1b4ae6f",
   "Rev": "1b4ae6fb4e77b095934d4430860ff202060169f8"
  },
+ {
+  "ImportPath": "github.com/mrunalp/fileutils",
+  "Rev": "4ee1cc9a80582a0c75febdd5cfa779ee4361cbca"
+ },
 {
  "ImportPath": "github.com/opencontainers/runc/libcontainer",
- "Comment": "v1.0.0-rc1-224-g5653ced",
- "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
+ "Comment": "v1.0.0-rc3-15-g4d6225ae",
+ "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
 },
 {
  "ImportPath": "github.com/opencontainers/runc/libcontainer/apparmor",
- "Comment": "v1.0.0-rc1-224-g5653ced",
- "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
+ "Comment": "v1.0.0-rc3-15-g4d6225ae",
+ "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
 },
 {
  "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups",
- "Comment": "v1.0.0-rc1-224-g5653ced",
- "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
+ "Comment": "v1.0.0-rc3-15-g4d6225ae",
+ "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
 },
 {
  "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/fs",
- "Comment": "v1.0.0-rc1-224-g5653ced",
- "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
+ "Comment": "v1.0.0-rc3-15-g4d6225ae",
+ "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
 },
+ {
+  "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/rootless",
+  "Comment": "v1.0.0-rc3-15-g4d6225ae",
+  "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
+ },
 {
  "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/systemd",
- "Comment": "v1.0.0-rc1-224-g5653ced",
- "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
+ "Comment": "v1.0.0-rc3-15-g4d6225ae",
+ "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
 },
 {
  "ImportPath": "github.com/opencontainers/runc/libcontainer/configs",
- "Comment": "v1.0.0-rc1-224-g5653ced",
- "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
+ "Comment": "v1.0.0-rc3-15-g4d6225ae",
+ "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
 },
 {
  "ImportPath": "github.com/opencontainers/runc/libcontainer/configs/validate",
- "Comment": "v1.0.0-rc1-224-g5653ced",
- "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
+ "Comment": "v1.0.0-rc3-15-g4d6225ae",
+ "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
 },
 {
  "ImportPath": "github.com/opencontainers/runc/libcontainer/criurpc",
- "Comment": "v1.0.0-rc1-224-g5653ced",
- "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
+ "Comment": "v1.0.0-rc3-15-g4d6225ae",
+ "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
 },
 {
  "ImportPath": "github.com/opencontainers/runc/libcontainer/keys",
- "Comment": "v1.0.0-rc1-224-g5653ced",
- "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
+ "Comment": "v1.0.0-rc3-15-g4d6225ae",
+ "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
 },
 {
  "ImportPath": "github.com/opencontainers/runc/libcontainer/label",
- "Comment": "v1.0.0-rc1-224-g5653ced",
- "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
+ "Comment": "v1.0.0-rc3-15-g4d6225ae",
+ "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
 },
 {
  "ImportPath": "github.com/opencontainers/runc/libcontainer/seccomp",
- "Comment": "v1.0.0-rc1-224-g5653ced",
- "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
+ "Comment": "v1.0.0-rc3-15-g4d6225ae",
+ "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
 },
 {
  "ImportPath": "github.com/opencontainers/runc/libcontainer/selinux",
- "Comment": "v1.0.0-rc1-224-g5653ced",
- "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
+ "Comment": "v1.0.0-rc3-15-g4d6225ae",
+ "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
 },
 {
  "ImportPath": "github.com/opencontainers/runc/libcontainer/stacktrace",
- "Comment": "v1.0.0-rc1-224-g5653ced",
- "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
+ "Comment": "v1.0.0-rc3-15-g4d6225ae",
+ "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
 },
 {
  "ImportPath": "github.com/opencontainers/runc/libcontainer/system",
- "Comment": "v1.0.0-rc1-224-g5653ced",
- "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
+ "Comment": "v1.0.0-rc3-15-g4d6225ae",
+ "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
 },
 {
  "ImportPath": "github.com/opencontainers/runc/libcontainer/user",
- "Comment": "v1.0.0-rc1-224-g5653ced",
- "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
+ "Comment": "v1.0.0-rc3-15-g4d6225ae",
+ "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
 },
 {
  "ImportPath": "github.com/opencontainers/runc/libcontainer/utils",
- "Comment": "v1.0.0-rc1-224-g5653ced",
- "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013"
+ "Comment": "v1.0.0-rc3-15-g4d6225ae",
+ "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f"
 },
+ {
+  "ImportPath": "github.com/opencontainers/runtime-spec/specs-go",
+  "Comment": "v1.0.0-rc5-67-gf227620",
+  "Rev": "f2276206b32ad0c2478bfc6440ceb7d51d815cf8"
+ },
 {
  "ImportPath": "github.com/pborman/uuid",
@@ -513,7 +527,7 @@
 },
 {
  "ImportPath": "github.com/syndtr/gocapability/capability",
- "Rev": "2c00daeb6c3b45114c80ac44119e7b8801fdd852"
+ "Rev": "e7cb7fa329f456b3855136a2642b197bad7366ba"
 },
 {
  "ImportPath": "github.com/vishvananda/netlink",

@@ -136,7 +136,7 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, maxHousekeepingIn
     }

     // Detect the container we are running on.
-    selfContainer, err := cgroups.GetThisCgroupDir("cpu")
+    selfContainer, err := cgroups.GetOwnCgroupPath("cpu")
     if err != nil {
         return nil, err
     }
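For orientation, the call above switches from the removed GetThisCgroupDir helper to its renamed replacement in the vendored cgroups package. A minimal sketch of calling the new helper directly, with the import path taken from the vendored tree above and error handling kept trivial:

```go
package main

import (
	"fmt"
	"log"

	"github.com/opencontainers/runc/libcontainer/cgroups"
)

func main() {
	// Resolve the cgroup path of the current process for the "cpu"
	// subsystem, mirroring the updated call site above.
	selfContainer, err := cgroups.GetOwnCgroupPath("cpu")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("running in cgroup:", selfContainer)
}
```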

vendor/github.com/mrunalp/fileutils/.gitignore (new file, generated, vendored)

@@ -0,0 +1 @@
/gocp

vendor/github.com/mrunalp/fileutils/LICENSE (new file, generated, vendored)

@@ -0,0 +1,191 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
Copyright 2014 Docker, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

vendor/github.com/mrunalp/fileutils/MAINTAINERS (new file, generated, vendored)

@@ -0,0 +1 @@
Mrunal Patel <mrunalp@gmail.com> (@mrunalp)

vendor/github.com/mrunalp/fileutils/README.md (new file, generated, vendored)

@@ -0,0 +1,5 @@
# fileutils
Collection of utilities for file manipulation in golang
The library is based on docker pkg/archive pkg/idtools but does copies instead of handling archive formats.

vendor/github.com/mrunalp/fileutils/fileutils.go (new file, generated, vendored)

@@ -0,0 +1,161 @@
package fileutils
import (
"fmt"
"io"
"os"
"path/filepath"
"syscall"
)
// CopyFile copies the file at source to dest
func CopyFile(source string, dest string) error {
si, err := os.Lstat(source)
if err != nil {
return err
}
st, ok := si.Sys().(*syscall.Stat_t)
if !ok {
return fmt.Errorf("could not convert to syscall.Stat_t")
}
uid := int(st.Uid)
gid := int(st.Gid)
// Handle symlinks
if si.Mode()&os.ModeSymlink != 0 {
target, err := os.Readlink(source)
if err != nil {
return err
}
if err := os.Symlink(target, dest); err != nil {
return err
}
}
// Handle device files
if st.Mode&syscall.S_IFMT == syscall.S_IFBLK || st.Mode&syscall.S_IFMT == syscall.S_IFCHR {
devMajor := int64(major(uint64(st.Rdev)))
devMinor := int64(minor(uint64(st.Rdev)))
mode := uint32(si.Mode() & 07777)
if st.Mode&syscall.S_IFMT == syscall.S_IFBLK {
mode |= syscall.S_IFBLK
}
if st.Mode&syscall.S_IFMT == syscall.S_IFCHR {
mode |= syscall.S_IFCHR
}
if err := syscall.Mknod(dest, mode, int(mkdev(devMajor, devMinor))); err != nil {
return err
}
}
// Handle regular files
if si.Mode().IsRegular() {
sf, err := os.Open(source)
if err != nil {
return err
}
defer sf.Close()
df, err := os.Create(dest)
if err != nil {
return err
}
defer df.Close()
_, err = io.Copy(df, sf)
if err != nil {
return err
}
}
// Chown the file
if err := os.Lchown(dest, uid, gid); err != nil {
return err
}
// Chmod the file
if !(si.Mode()&os.ModeSymlink == os.ModeSymlink) {
if err := os.Chmod(dest, si.Mode()); err != nil {
return err
}
}
return nil
}
// CopyDirectory copies the files under the source directory
// to dest directory. The dest directory is created if it
// does not exist.
func CopyDirectory(source string, dest string) error {
fi, err := os.Stat(source)
if err != nil {
return err
}
// Get owner.
st, ok := fi.Sys().(*syscall.Stat_t)
if !ok {
return fmt.Errorf("could not convert to syscall.Stat_t")
}
// We have to pick an owner here anyway.
if err := MkdirAllNewAs(dest, fi.Mode(), int(st.Uid), int(st.Gid)); err != nil {
return err
}
return filepath.Walk(source, func(path string, info os.FileInfo, err error) error {
if err != nil {
return err
}
// Get the relative path
relPath, err := filepath.Rel(source, path)
if err != nil {
return nil
}
if info.IsDir() {
// Skip the source directory.
if path != source {
// Get the owner.
st, ok := info.Sys().(*syscall.Stat_t)
if !ok {
return fmt.Errorf("could not convert to syscall.Stat_t")
}
uid := int(st.Uid)
gid := int(st.Gid)
if err := os.Mkdir(filepath.Join(dest, relPath), info.Mode()); err != nil {
return err
}
if err := os.Lchown(filepath.Join(dest, relPath), uid, gid); err != nil {
return err
}
}
return nil
}
// Copy the file.
if err := CopyFile(path, filepath.Join(dest, relPath)); err != nil {
return err
}
return nil
})
}
func major(device uint64) uint64 {
return (device >> 8) & 0xfff
}
func minor(device uint64) uint64 {
return (device & 0xff) | ((device >> 12) & 0xfff00)
}
func mkdev(major int64, minor int64) uint32 {
return uint32(((minor & 0xfff00) << 12) | ((major & 0xfff) << 8) | (minor & 0xff))
}
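A minimal usage sketch for the two exported helpers added in this file. The source and destination paths are hypothetical, and because CopyFile copies ownership with Lchown, it generally needs sufficient privileges for the files being copied:

```go
package main

import (
	"log"

	"github.com/mrunalp/fileutils"
)

func main() {
	// Copy a single file, preserving mode and ownership as implemented above.
	if err := fileutils.CopyFile("/etc/hosts", "/tmp/hosts.copy"); err != nil {
		log.Fatal(err)
	}

	// Recursively copy a directory tree; the destination directory is
	// created (and chowned for newly created components) if it does not exist.
	if err := fileutils.CopyDirectory("/etc/cni", "/tmp/cni.copy"); err != nil {
		log.Fatal(err)
	}
}
```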

vendor/github.com/mrunalp/fileutils/idtools.go (new file, generated, vendored)

@@ -0,0 +1,49 @@
package fileutils
import (
"os"
"path/filepath"
)
// MkdirAllNewAs creates a directory (include any along the path) and then modifies
// ownership ONLY of newly created directories to the requested uid/gid. If the
// directories along the path exist, no change of ownership will be performed
func MkdirAllNewAs(path string, mode os.FileMode, ownerUID, ownerGID int) error {
// make an array containing the original path asked for, plus (for mkAll == true)
// all path components leading up to the complete path that don't exist before we MkdirAll
// so that we can chown all of them properly at the end. If chownExisting is false, we won't
// chown the full directory path if it exists
var paths []string
if _, err := os.Stat(path); err != nil && os.IsNotExist(err) {
paths = []string{path}
} else if err == nil {
// nothing to do; directory path fully exists already
return nil
}
// walk back to "/" looking for directories which do not exist
// and add them to the paths array for chown after creation
dirPath := path
for {
dirPath = filepath.Dir(dirPath)
if dirPath == "/" {
break
}
if _, err := os.Stat(dirPath); err != nil && os.IsNotExist(err) {
paths = append(paths, dirPath)
}
}
if err := os.MkdirAll(path, mode); err != nil && !os.IsExist(err) {
return err
}
// even if it existed, we will chown the requested path + any subpaths that
// didn't exist when we called MkdirAll
for _, pathComponent := range paths {
if err := os.Chown(pathComponent, ownerUID, ownerGID); err != nil {
return err
}
}
return nil
}
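A short sketch of MkdirAllNewAs as added above: directories that already exist are left untouched, while any components created by the call are chowned to the requested uid/gid. The path and ids here are hypothetical:

```go
package main

import (
	"log"

	"github.com/mrunalp/fileutils"
)

func main() {
	// Assuming /data already exists: it keeps its current owner, while
	// /data/sub and /data/sub/dir are created and chowned to 1000:1000.
	if err := fileutils.MkdirAllNewAs("/data/sub/dir", 0755, 1000, 1000); err != nil {
		log.Fatal(err)
	}
}
```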

@@ -1,3 +1,7 @@
+# libcontainer
+
+[![GoDoc](https://godoc.org/github.com/opencontainers/runc/libcontainer?status.svg)](https://godoc.org/github.com/opencontainers/runc/libcontainer)
+
 Libcontainer provides a native Go implementation for creating containers
 with namespaces, cgroups, capabilities, and filesystem access controls.
 It allows you to manage the lifecycle of the container performing additional operations
@@ -16,7 +20,14 @@ the current binary (/proc/self/exe) to be executed as the init process, and use
 arg "init", we call the first step process "bootstrap", so you always need a "init"
 function as the entry of "bootstrap".

+In addition to the go init function the early stage bootstrap is handled by importing
+[nsenter](https://github.com/opencontainers/runc/blob/master/libcontainer/nsenter/README.md).
+
 ```go
+import (
+    _ "github.com/opencontainers/runc/libcontainer/nsenter"
+)
+
 func init() {
     if len(os.Args) > 1 && os.Args[1] == "init" {
         runtime.GOMAXPROCS(1)
@@ -83,6 +94,7 @@ config := &configs.Config{
     },
     MaskPaths: []string{
         "/proc/kcore",
+        "/sys/firmware",
     },
     ReadonlyPaths: []string{
         "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
@@ -184,7 +196,7 @@ process := &libcontainer.Process{
     Stderr: os.Stderr,
 }

-err := container.Start(process)
+err := container.Run(process)
 if err != nil {
     container.Destroy()
     logrus.Fatal(err)
@@ -222,6 +234,12 @@ container.Signal(signal)
 // update container resource constraints.
 container.Set(config)

+// get current status of the container.
+status, err := container.Status()
+
+// get current container's state information.
+state, err := container.State()
 ```

@@ -7,10 +7,11 @@ import (
     "os"
     "strings"

+    "github.com/opencontainers/runc/libcontainer/configs"
     "github.com/syndtr/gocapability/capability"
 )

-const allCapabilityTypes = capability.CAPS | capability.BOUNDS
+const allCapabilityTypes = capability.CAPS | capability.BOUNDS | capability.AMBS

 var capabilityMap map[string]capability.Cap
@@ -30,40 +31,84 @@ func init() {
     }
 }

-func newCapWhitelist(caps []string) (*whitelist, error) {
-    l := []capability.Cap{}
-    for _, c := range caps {
+func newContainerCapList(capConfig *configs.Capabilities) (*containerCapabilities, error) {
+    bounding := []capability.Cap{}
+    for _, c := range capConfig.Bounding {
         v, ok := capabilityMap[c]
         if !ok {
             return nil, fmt.Errorf("unknown capability %q", c)
         }
-        l = append(l, v)
+        bounding = append(bounding, v)
+    }
+    effective := []capability.Cap{}
+    for _, c := range capConfig.Effective {
+        v, ok := capabilityMap[c]
+        if !ok {
+            return nil, fmt.Errorf("unknown capability %q", c)
+        }
+        effective = append(effective, v)
+    }
+    inheritable := []capability.Cap{}
+    for _, c := range capConfig.Inheritable {
+        v, ok := capabilityMap[c]
+        if !ok {
+            return nil, fmt.Errorf("unknown capability %q", c)
+        }
+        inheritable = append(inheritable, v)
+    }
+    permitted := []capability.Cap{}
+    for _, c := range capConfig.Permitted {
+        v, ok := capabilityMap[c]
+        if !ok {
+            return nil, fmt.Errorf("unknown capability %q", c)
+        }
+        permitted = append(permitted, v)
+    }
+    ambient := []capability.Cap{}
+    for _, c := range capConfig.Ambient {
+        v, ok := capabilityMap[c]
+        if !ok {
+            return nil, fmt.Errorf("unknown capability %q", c)
+        }
+        ambient = append(ambient, v)
     }
     pid, err := capability.NewPid(os.Getpid())
     if err != nil {
         return nil, err
     }
-    return &whitelist{
-        keep: l,
+    return &containerCapabilities{
+        bounding:    bounding,
+        effective:   effective,
+        inheritable: inheritable,
+        permitted:   permitted,
+        ambient:     ambient,
         pid: pid,
     }, nil
 }

-type whitelist struct {
+type containerCapabilities struct {
     pid capability.Capabilities
-    keep []capability.Cap
+    bounding []capability.Cap
+    effective []capability.Cap
+    inheritable []capability.Cap
+    permitted []capability.Cap
+    ambient []capability.Cap
 }

-// dropBoundingSet drops the capability bounding set to those specified in the whitelist.
-func (w *whitelist) dropBoundingSet() error {
-    w.pid.Clear(capability.BOUNDS)
-    w.pid.Set(capability.BOUNDS, w.keep...)
-    return w.pid.Apply(capability.BOUNDS)
+// ApplyBoundingSet sets the capability bounding set to those specified in the whitelist.
+func (c *containerCapabilities) ApplyBoundingSet() error {
+    c.pid.Clear(capability.BOUNDS)
+    c.pid.Set(capability.BOUNDS, c.bounding...)
+    return c.pid.Apply(capability.BOUNDS)
 }

-// drop drops all capabilities for the current process except those specified in the whitelist.
-func (w *whitelist) drop() error {
-    w.pid.Clear(allCapabilityTypes)
-    w.pid.Set(allCapabilityTypes, w.keep...)
-    return w.pid.Apply(allCapabilityTypes)
+// Apply sets all the capabilities for the current process in the config.
+func (c *containerCapabilities) ApplyCaps() error {
+    c.pid.Clear(allCapabilityTypes)
+    c.pid.Set(capability.BOUNDS, c.bounding...)
+    c.pid.Set(capability.PERMITTED, c.permitted...)
+    c.pid.Set(capability.INHERITABLE, c.inheritable...)
+    c.pid.Set(capability.EFFECTIVE, c.effective...)
+    c.pid.Set(capability.AMBIENT, c.ambient...)
+    return c.pid.Apply(allCapabilityTypes)
 }
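The new newContainerCapList consumes a configs.Capabilities value with the capability sets split out per type (bounding, effective, inheritable, permitted, ambient) instead of the old single whitelist slice. A hedged sketch of such a config; the field names mirror the capConfig accesses in the diff above, and the capability values are illustrative only:

```go
package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/configs"
)

func main() {
	// Illustrative five-set capability configuration; keys use the same
	// "CAP_*" naming that capabilityMap above is built from.
	caps := &configs.Capabilities{
		Bounding:    []string{"CAP_CHOWN", "CAP_NET_BIND_SERVICE"},
		Effective:   []string{"CAP_CHOWN", "CAP_NET_BIND_SERVICE"},
		Inheritable: []string{"CAP_CHOWN"},
		Permitted:   []string{"CAP_CHOWN", "CAP_NET_BIND_SERVICE"},
		// Ambient capabilities are the newly handled set here
		// (note the added capability.AMBS in allCapabilityTypes).
		Ambient: []string{"CAP_NET_BIND_SERVICE"},
	}
	fmt.Printf("%+v\n", caps)
}
```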

@@ -27,9 +27,9 @@ type Manager interface {
     // Destroys the cgroup set
     Destroy() error

-    // NewCgroupManager() and LoadCgroupManager() require following attributes:
+    // The option func SystemdCgroups() and Cgroupfs() require following attributes:
     //  Paths   map[string]string
-    //  Cgroups *cgroups.Cgroup
+    //  Cgroups *configs.Cgroup
     // Paths maps cgroup subsystem to path at which it is mounted.
     // Cgroups specifies specific cgroup settings for the various subsystems

@@ -114,8 +114,8 @@ func (m *Manager) Apply(pid int) (err error) {
         return err
     }

+    m.Paths = make(map[string]string)
     if c.Paths != nil {
-        paths := make(map[string]string)
         for name, path := range c.Paths {
             _, err := d.path(name)
             if err != nil {
@@ -124,17 +124,12 @@ func (m *Manager) Apply(pid int) (err error) {
                 }
                 return err
             }
-            paths[name] = path
+            m.Paths[name] = path
         }
-        m.Paths = paths
         return cgroups.EnterPid(m.Paths, pid)
     }

-    paths := make(map[string]string)
     for _, sys := range subsystems {
-        if err := sys.Apply(d); err != nil {
-            return err
-        }
         // TODO: Apply should, ideally, be reentrant or be broken up into a separate
         // create and join phase so that the cgroup hierarchy for a container can be
         // created then join consists of writing the process pids to cgroup.procs
@@ -147,9 +142,12 @@ func (m *Manager) Apply(pid int) (err error) {
             }
             return err
         }
-        paths[sys.Name()] = p
+        m.Paths[sys.Name()] = p
+
+        if err := sys.Apply(d); err != nil {
+            return err
+        }
     }
-    m.Paths = paths

     return nil
 }
@@ -269,25 +267,8 @@ func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
     }, nil
 }

-func (raw *cgroupData) parentPath(subsystem, mountpoint, root string) (string, error) {
-    // Use GetThisCgroupDir instead of GetInitCgroupDir, because the creating
-    // process could in container and shared pid namespace with host, and
-    // /proc/1/cgroup could point to whole other world of cgroups.
-    initPath, err := cgroups.GetThisCgroupDir(subsystem)
-    if err != nil {
-        return "", err
-    }
-    // This is needed for nested containers, because in /proc/self/cgroup we
-    // see pathes from host, which don't exist in container.
-    relDir, err := filepath.Rel(root, initPath)
-    if err != nil {
-        return "", err
-    }
-    return filepath.Join(mountpoint, relDir), nil
-}
-
 func (raw *cgroupData) path(subsystem string) (string, error) {
-    mnt, root, err := cgroups.FindCgroupMountpointAndRoot(subsystem)
+    mnt, err := cgroups.FindCgroupMountpoint(subsystem)
     // If we didn't mount the subsystem, there is no point we make the path.
     if err != nil {
         return "", err
@@ -295,11 +276,14 @@ func (raw *cgroupData) path(subsystem string) (string, error) {
     // If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
     if filepath.IsAbs(raw.innerPath) {
-        // Sometimes subsystems can be mounted togethger as 'cpu,cpuacct'.
+        // Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
         return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil
     }

-    parentPath, err := raw.parentPath(subsystem, mnt, root)
+    // Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating
+    // process could in container and shared pid namespace with host, and
+    // /proc/1/cgroup could point to whole other world of cgroups.
+    parentPath, err := cgroups.GetOwnCgroupPath(subsystem)
     if err != nil {
         return "", err
     }
@@ -348,8 +332,8 @@ func removePath(p string, err error) error {
     return nil
 }

-func CheckCpushares(path string, c int64) error {
-    var cpuShares int64
+func CheckCpushares(path string, c uint64) error {
+    var cpuShares uint64

     if c == 0 {
         return nil

@@ -55,7 +55,7 @@ func (s *CpuGroup) ApplyDir(path string, cgroup *configs.Cgroup, pid int) error

 func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error {
     if cgroup.Resources.CpuRtPeriod != 0 {
-        if err := writeFile(path, "cpu.rt_period_us", strconv.FormatInt(cgroup.Resources.CpuRtPeriod, 10)); err != nil {
+        if err := writeFile(path, "cpu.rt_period_us", strconv.FormatUint(cgroup.Resources.CpuRtPeriod, 10)); err != nil {
             return err
         }
     }
@@ -69,12 +69,12 @@ func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error {

 func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error {
     if cgroup.Resources.CpuShares != 0 {
-        if err := writeFile(path, "cpu.shares", strconv.FormatInt(cgroup.Resources.CpuShares, 10)); err != nil {
+        if err := writeFile(path, "cpu.shares", strconv.FormatUint(cgroup.Resources.CpuShares, 10)); err != nil {
             return err
         }
     }
     if cgroup.Resources.CpuPeriod != 0 {
-        if err := writeFile(path, "cpu.cfs_period_us", strconv.FormatInt(cgroup.Resources.CpuPeriod, 10)); err != nil {
+        if err := writeFile(path, "cpu.cfs_period_us", strconv.FormatUint(cgroup.Resources.CpuPeriod, 10)); err != nil {
             return err
         }
     }

@@ -61,9 +61,26 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro
     if err != nil {
         return err
     }
-    if err := s.ensureParent(dir, root); err != nil {
+    // 'ensureParent' start with parent because we don't want to
+    // explicitly inherit from parent, it could conflict with
+    // 'cpuset.cpu_exclusive'.
+    if err := s.ensureParent(filepath.Dir(dir), root); err != nil {
         return err
     }
+    if err := os.MkdirAll(dir, 0755); err != nil {
+        return err
+    }
+    // We didn't inherit cpuset configs from parent, but we have
+    // to ensure cpuset configs are set before moving task into the
+    // cgroup.
+    // The logic is, if user specified cpuset configs, use these
+    // specified configs, otherwise, inherit from parent. This makes
+    // cpuset configs work correctly with 'cpuset.cpu_exclusive', and
+    // keep backward compatbility.
+    if err := s.ensureCpusAndMems(dir, cgroup); err != nil {
+        return err
+    }
     // because we are not using d.join we need to place the pid into the procs file
     // unlike the other subsystems
     if err := cgroups.WriteCgroupProc(dir, pid); err != nil {
@@ -136,3 +153,10 @@ func (s *CpusetGroup) copyIfNeeded(current, parent string) error {
 func (s *CpusetGroup) isEmpty(b []byte) bool {
     return len(bytes.Trim(b, "\n")) == 0
 }
+
+func (s *CpusetGroup) ensureCpusAndMems(path string, cgroup *configs.Cgroup) error {
+    if err := s.Set(path, cgroup); err != nil {
+        return err
+    }
+    return s.copyIfNeeded(path, filepath.Dir(path))
+}

@@ -18,6 +18,8 @@ import (
 const (
     cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes"
+    cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes"
+    cgroupMemoryLimit = "memory.limit_in_bytes"
 )

 type MemoryGroup struct {
@@ -31,17 +33,23 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
     path, err := d.path("memory")
     if err != nil && !cgroups.IsNotFound(err) {
         return err
+    } else if path == "" {
+        return nil
     }
     if memoryAssigned(d.config) {
-        if path != "" {
-            if err := os.MkdirAll(path, 0755); err != nil {
-                return err
-            }
-        }
-        if err := EnableKernelMemoryAccounting(path); err != nil {
-            return err
-        }
-    }
+        if _, err := os.Stat(path); os.IsNotExist(err) {
+            if err := os.MkdirAll(path, 0755); err != nil {
+                return err
+            }
+            // Only enable kernel memory accouting when this cgroup
+            // is created by libcontainer, otherwise we might get
+            // error when people use `cgroupsPath` to join an existed
+            // cgroup whose kernel memory is not initialized.
+            if err := EnableKernelMemoryAccounting(path); err != nil {
+                return err
+            }
+        }
+    }
     defer func() {
         if err != nil {
             os.RemoveAll(path)
@@ -62,18 +70,15 @@ func EnableKernelMemoryAccounting(path string) error {
     // We have to limit the kernel memory here as it won't be accounted at all
     // until a limit is set on the cgroup and limit cannot be set once the
     // cgroup has children, or if there are already tasks in the cgroup.
-    kernelMemoryLimit := int64(1)
-    if err := setKernelMemory(path, kernelMemoryLimit); err != nil {
-        return err
-    }
-    kernelMemoryLimit = int64(-1)
-    if err := setKernelMemory(path, kernelMemoryLimit); err != nil {
-        return err
+    for _, i := range []int64{1, -1} {
+        if err := setKernelMemory(path, uint64(i)); err != nil {
+            return err
+        }
     }
     return nil
 }

-func setKernelMemory(path string, kernelMemoryLimit int64) error {
+func setKernelMemory(path string, kernelMemoryLimit uint64) error {
     if path == "" {
         return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit)
     }
@@ -81,7 +86,7 @@ func setKernelMemory(path string, kernelMemoryLimit int64) error {
         // kernel memory is not enabled on the system so we should do nothing
         return nil
     }
-    if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil {
+    if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatUint(kernelMemoryLimit, 10)), 0700); err != nil {
         // Check if the error number returned by the syscall is "EBUSY"
         // The EBUSY signal is returned on attempts to write to the
        // memory.kmem.limit_in_bytes file if the cgroup has children or
@@ -99,9 +104,20 @@ func setKernelMemory(path string, kernelMemoryLimit int64) error {
 }

 func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error {
+    ulimited := -1
+
+    // If the memory update is set to uint64(-1) we should also
+    // set swap to uint64(-1), it means unlimited memory.
+    if cgroup.Resources.Memory == uint64(ulimited) {
+        // Only set swap if it's enbled in kernel
+        if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) {
+            cgroup.Resources.MemorySwap = uint64(ulimited)
+        }
+    }
+
     // When memory and swap memory are both set, we need to handle the cases
     // for updating container.
-    if cgroup.Resources.Memory != 0 && cgroup.Resources.MemorySwap > 0 {
+    if cgroup.Resources.Memory != 0 && cgroup.Resources.MemorySwap != 0 {
         memoryUsage, err := getMemoryData(path, "")
         if err != nil {
             return err
@@ -110,29 +126,29 @@ func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error {
         // When update memory limit, we should adapt the write sequence
         // for memory and swap memory, so it won't fail because the new
         // value and the old value don't fit kernel's validation.
-        if memoryUsage.Limit < uint64(cgroup.Resources.MemorySwap) {
-            if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
+        if cgroup.Resources.MemorySwap == uint64(ulimited) || memoryUsage.Limit < cgroup.Resources.MemorySwap {
+            if err := writeFile(path, cgroupMemorySwapLimit, strconv.FormatUint(cgroup.Resources.MemorySwap, 10)); err != nil {
                 return err
             }
-            if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
+            if err := writeFile(path, cgroupMemoryLimit, strconv.FormatUint(cgroup.Resources.Memory, 10)); err != nil {
                 return err
             }
         } else {
-            if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
+            if err := writeFile(path, cgroupMemoryLimit, strconv.FormatUint(cgroup.Resources.Memory, 10)); err != nil {
                 return err
             }
-            if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
+            if err := writeFile(path, cgroupMemorySwapLimit, strconv.FormatUint(cgroup.Resources.MemorySwap, 10)); err != nil {
                 return err
             }
         }
     } else {
         if cgroup.Resources.Memory != 0 {
-            if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
+            if err := writeFile(path, cgroupMemoryLimit, strconv.FormatUint(cgroup.Resources.Memory, 10)); err != nil {
                 return err
             }
         }
-        if cgroup.Resources.MemorySwap > 0 {
-            if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
+        if cgroup.Resources.MemorySwap != 0 {
+            if err := writeFile(path, cgroupMemorySwapLimit, strconv.FormatUint(cgroup.Resources.MemorySwap, 10)); err != nil {
                 return err
             }
         }
@@ -153,13 +169,13 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
     }

     if cgroup.Resources.MemoryReservation != 0 {
-        if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil {
+        if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatUint(cgroup.Resources.MemoryReservation, 10)); err != nil {
             return err
         }
     }

     if cgroup.Resources.KernelMemoryTCP != 0 {
-        if err := writeFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemoryTCP, 10)); err != nil {
+        if err := writeFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatUint(cgroup.Resources.KernelMemoryTCP, 10)); err != nil {
             return err
         }
     }
@@ -170,12 +186,12 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
     }

     if cgroup.Resources.MemorySwappiness == nil || int64(*cgroup.Resources.MemorySwappiness) == -1 {
         return nil
-    } else if int64(*cgroup.Resources.MemorySwappiness) >= 0 && int64(*cgroup.Resources.MemorySwappiness) <= 100 {
-        if err := writeFile(path, "memory.swappiness", strconv.FormatInt(*cgroup.Resources.MemorySwappiness, 10)); err != nil {
+    } else if *cgroup.Resources.MemorySwappiness <= 100 {
+        if err := writeFile(path, "memory.swappiness", strconv.FormatUint(*cgroup.Resources.MemorySwappiness, 10)); err != nil {
             return err
         }
     } else {
-        return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", int64(*cgroup.Resources.MemorySwappiness))
+        return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", *cgroup.Resources.MemorySwappiness)
     }

     return nil
@@ -227,6 +243,14 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
     }
     stats.MemoryStats.KernelTCPUsage = kernelTCPUsage

+    useHierarchy := strings.Join([]string{"memory", "use_hierarchy"}, ".")
+    value, err := getCgroupParamUint(path, useHierarchy)
+    if err != nil {
+        return err
+    }
+    if value == 1 {
+        stats.MemoryStats.UseHierarchy = true
+    }
     return nil
 }
@@ -237,7 +261,7 @@ func memoryAssigned(cgroup *configs.Cgroup) bool {
         cgroup.Resources.KernelMemory > 0 ||
         cgroup.Resources.KernelMemoryTCP > 0 ||
         cgroup.Resources.OomKillDisable ||
-        (cgroup.Resources.MemorySwappiness != nil && *cgroup.Resources.MemorySwappiness != -1)
+        (cgroup.Resources.MemorySwappiness != nil && int64(*cgroup.Resources.MemorySwappiness) != -1)
 }

 func getMemoryData(path, name string) (cgroups.MemoryData, error) {
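The reworked setMemoryAndSwap above orders its two writes so the kernel's requirement that memory.memsw.limit_in_bytes stays greater than or equal to memory.limit_in_bytes is never violated mid-update: swap is written first when the new swap limit is unlimited (-1) or above the currently applied memory limit, and memory is written first otherwise. A small illustrative sketch of just that ordering rule (not part of the vendored code):

```go
package main

import "fmt"

// swapFirst mirrors the ordering decision used by setMemoryAndSwap above:
// write memory.memsw.limit_in_bytes before memory.limit_in_bytes when the
// requested swap limit is unlimited or larger than the currently applied
// memory limit; otherwise write memory.limit_in_bytes first.
func swapFirst(currentMemLimit, newMemSwap uint64) bool {
	const unlimited = ^uint64(0) // same bit pattern as uint64(-1)
	return newMemSwap == unlimited || currentMemLimit < newMemSwap
}

func main() {
	fmt.Println(swapFirst(512<<20, 1<<30)) // raising limits: memsw first -> true
	fmt.Println(swapFirst(1<<30, 512<<20)) // lowering limits: memory first -> false
}
```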

@@ -0,0 +1,128 @@
// +build linux
package rootless
import (
"fmt"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/configs/validate"
)
// TODO: This is copied from libcontainer/cgroups/fs, which duplicates this code
// needlessly. We should probably export this list.
var subsystems = []subsystem{
&fs.CpusetGroup{},
&fs.DevicesGroup{},
&fs.MemoryGroup{},
&fs.CpuGroup{},
&fs.CpuacctGroup{},
&fs.PidsGroup{},
&fs.BlkioGroup{},
&fs.HugetlbGroup{},
&fs.NetClsGroup{},
&fs.NetPrioGroup{},
&fs.PerfEventGroup{},
&fs.FreezerGroup{},
&fs.NameGroup{GroupName: "name=systemd"},
}
type subsystem interface {
// Name returns the name of the subsystem.
Name() string
// Returns the stats, as 'stats', corresponding to the cgroup under 'path'.
GetStats(path string, stats *cgroups.Stats) error
}
// The noop cgroup manager is used for rootless containers, because we currently
// cannot manage cgroups if we are in a rootless setup. This manager is chosen
// by factory if we are in rootless mode. We error out if any cgroup options are
// set in the config -- this may change in the future with upcoming kernel features
// like the cgroup namespace.
type Manager struct {
Cgroups *configs.Cgroup
Paths map[string]string
}
func (m *Manager) Apply(pid int) error {
// If there are no cgroup settings, there's nothing to do.
if m.Cgroups == nil {
return nil
}
// We can't set paths.
// TODO(cyphar): Implement the case where the runner of a rootless container
// owns their own cgroup, which would allow us to set up a
// cgroup for each path.
if m.Cgroups.Paths != nil {
return fmt.Errorf("cannot change cgroup path in rootless container")
}
// We load the paths into the manager.
paths := make(map[string]string)
for _, sys := range subsystems {
name := sys.Name()
path, err := cgroups.GetOwnCgroupPath(name)
if err != nil {
// Ignore paths we couldn't resolve.
continue
}
paths[name] = path
}
m.Paths = paths
return nil
}
func (m *Manager) GetPaths() map[string]string {
return m.Paths
}
func (m *Manager) Set(container *configs.Config) error {
// We have to re-do the validation here, since someone might decide to
// update a rootless container.
return validate.New().Validate(container)
}
func (m *Manager) GetPids() ([]int, error) {
dir, err := cgroups.GetOwnCgroupPath("devices")
if err != nil {
return nil, err
}
return cgroups.GetPids(dir)
}
func (m *Manager) GetAllPids() ([]int, error) {
dir, err := cgroups.GetOwnCgroupPath("devices")
if err != nil {
return nil, err
}
return cgroups.GetAllPids(dir)
}
func (m *Manager) GetStats() (*cgroups.Stats, error) {
// TODO(cyphar): We can make this work if we figure out a way to allow usage
// of cgroups with a rootless container. While this doesn't
// actually require write access to a cgroup directory, the
// statistics are not useful if they can be affected by
// non-container processes.
return nil, fmt.Errorf("cannot get cgroup stats in rootless container")
}
func (m *Manager) Freeze(state configs.FreezerState) error {
// TODO(cyphar): We can make this work if we figure out a way to allow usage
// of cgroups with a rootless container.
return fmt.Errorf("cannot use freezer cgroup in rootless container")
}
func (m *Manager) Destroy() error {
// We don't have to do anything here because we didn't do any setup.
return nil
}
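A hedged sketch of how a caller might exercise the new rootless manager; the wiring shown here is hypothetical (the real selection happens in runc's factory code, which is not part of this diff), and it only uses the exported fields and methods defined above:

```go
package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/cgroups/rootless"
	"github.com/opencontainers/runc/libcontainer/configs"
)

func main() {
	// Hypothetical wiring: hand the no-op rootless manager an (empty)
	// cgroup config and let Apply record the cgroup paths the current
	// process already lives in.
	m := &rootless.Manager{Cgroups: &configs.Cgroup{}}
	if err := m.Apply(-1); err != nil {
		fmt.Println("apply failed:", err)
		return
	}
	for subsystem, path := range m.GetPaths() {
		fmt.Printf("%s -> %s\n", subsystem, path)
	}
}
```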

@@ -51,6 +51,8 @@ type MemoryStats struct {
     KernelUsage MemoryData `json:"kernel_usage,omitempty"`
     // usage of kernel TCP memory
     KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"`
+    // if true, memory usage is accounted for throughout a hierarchy of cgroups.
+    UseHierarchy bool `json:"use_hierarchy"`

     Stats map[string]uint64 `json:"stats,omitempty"`
 }

View File

@ -5,10 +5,8 @@ package systemd
import ( import (
"errors" "errors"
"fmt" "fmt"
"io/ioutil"
"os" "os"
"path/filepath" "path/filepath"
"strconv"
"strings" "strings"
"sync" "sync"
"time" "time"
@ -67,12 +65,14 @@ var subsystems = subsystemSet{
const ( const (
testScopeWait = 4 testScopeWait = 4
testSliceWait = 4
) )
var ( var (
connLock sync.Mutex connLock sync.Mutex
theConn *systemdDbus.Conn theConn *systemdDbus.Conn
hasStartTransientUnit bool hasStartTransientUnit bool
hasStartTransientSliceUnit bool
hasTransientDefaultDependencies bool hasTransientDefaultDependencies bool
hasDelegate bool hasDelegate bool
) )
@ -159,8 +159,36 @@ func UseSystemd() bool {
} }
} }
// Assume we have the ability to start a transient unit as a slice
// This was broken until systemd v229, but has been back-ported on RHEL environments >= 219
// For details, see: https://bugzilla.redhat.com/show_bug.cgi?id=1370299
hasStartTransientSliceUnit = true
// To ensure simple clean-up, we create a slice off the root with no hierarchy
slice := fmt.Sprintf("libcontainer_%d_systemd_test_default.slice", os.Getpid())
if _, err := theConn.StartTransientUnit(slice, "replace", nil, nil); err != nil {
if _, ok := err.(dbus.Error); ok {
hasStartTransientSliceUnit = false
}
}
for i := 0; i <= testSliceWait; i++ {
if _, err := theConn.StopUnit(slice, "replace", nil); err != nil {
if dbusError, ok := err.(dbus.Error); ok {
if strings.Contains(dbusError.Name, "org.freedesktop.systemd1.NoSuchUnit") {
hasStartTransientSliceUnit = false
break
}
}
} else {
break
}
time.Sleep(time.Millisecond)
}
// Not critical because of the stop unit logic above. // Not critical because of the stop unit logic above.
theConn.StopUnit(scope, "replace", nil) theConn.StopUnit(scope, "replace", nil)
theConn.StopUnit(slice, "replace", nil)
} }
return hasStartTransientUnit return hasStartTransientUnit
} }
@ -194,11 +222,24 @@ func (m *Manager) Apply(pid int) error {
slice = c.Parent slice = c.Parent
} }
properties = append(properties, properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))
systemdDbus.PropSlice(slice),
systemdDbus.PropDescription("docker container "+c.Name), // if we create a slice, the parent is defined via a Wants=
newProp("PIDs", []uint32{uint32(pid)}), if strings.HasSuffix(unitName, ".slice") {
) // This was broken until systemd v229, but has been back-ported on RHEL environments >= 219
if !hasStartTransientSliceUnit {
return fmt.Errorf("systemd version does not support ability to start a slice as transient unit")
}
properties = append(properties, systemdDbus.PropWants(slice))
} else {
// otherwise, we use Slice=
properties = append(properties, systemdDbus.PropSlice(slice))
}
// only add pid if its valid, -1 is used w/ general slice creation.
if pid != -1 {
properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
}
if hasDelegate { if hasDelegate {
// This is only supported on systemd versions 218 and above. // This is only supported on systemd versions 218 and above.
@ -219,12 +260,19 @@ func (m *Manager) Apply(pid int) error {
if c.Resources.Memory != 0 { if c.Resources.Memory != 0 {
properties = append(properties, properties = append(properties,
newProp("MemoryLimit", uint64(c.Resources.Memory))) newProp("MemoryLimit", c.Resources.Memory))
} }
if c.Resources.CpuShares != 0 { if c.Resources.CpuShares != 0 {
properties = append(properties, properties = append(properties,
newProp("CPUShares", uint64(c.Resources.CpuShares))) newProp("CPUShares", c.Resources.CpuShares))
}
// cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd.
if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 {
cpuQuotaPerSecUSec := uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod
properties = append(properties,
newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
} }
if c.Resources.BlkioWeight != 0 { if c.Resources.BlkioWeight != 0 {
@ -240,7 +288,7 @@ func (m *Manager) Apply(pid int) error {
} }
} }
if _, err := theConn.StartTransientUnit(unitName, "replace", properties, nil); err != nil { if _, err := theConn.StartTransientUnit(unitName, "replace", properties, nil); err != nil && !isUnitExists(err) {
return err return err
} }
@ -285,15 +333,6 @@ func (m *Manager) GetPaths() map[string]string {
return paths return paths
} }
func writeFile(dir, file, data string) error {
// Normally dir should not be empty, one case is that cgroup subsystem
// is not mounted, we will get empty dir, and we want it fail here.
if dir == "" {
return fmt.Errorf("no such directory for %s", file)
}
return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700)
}
func join(c *configs.Cgroup, subsystem string, pid int) (string, error) { func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
path, err := getSubsystemPath(c, subsystem) path, err := getSubsystemPath(c, subsystem)
if err != nil { if err != nil {
@ -302,10 +341,9 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
if err := os.MkdirAll(path, 0755); err != nil { if err := os.MkdirAll(path, 0755); err != nil {
return "", err return "", err
} }
if err := writeFile(path, "cgroup.procs", strconv.Itoa(pid)); err != nil { if err := cgroups.WriteCgroupProc(path, pid); err != nil {
return "", err return "", err
} }
return path, nil return path, nil
} }
@ -347,10 +385,10 @@ func joinCgroups(c *configs.Cgroup, pid int) error {
return nil return nil
} }
// systemd represents slice heirarchy using `-`, so we need to follow suit when // systemd represents slice hierarchy using `-`, so we need to follow suit when
// generating the path of slice. Essentially, test-a-b.slice becomes // generating the path of slice. Essentially, test-a-b.slice becomes
// test.slice/test-a.slice/test-a-b.slice. // test.slice/test-a.slice/test-a-b.slice.
func expandSlice(slice string) (string, error) { func ExpandSlice(slice string) (string, error) {
suffix := ".slice" suffix := ".slice"
// Name has to end with ".slice", but can't be just ".slice". // Name has to end with ".slice", but can't be just ".slice".
if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) { if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) {
@ -364,6 +402,10 @@ func expandSlice(slice string) (string, error) {
var path, prefix string var path, prefix string
sliceName := strings.TrimSuffix(slice, suffix) sliceName := strings.TrimSuffix(slice, suffix)
// if input was -.slice, we should just return root now
if sliceName == "-" {
return "/", nil
}
for _, component := range strings.Split(sliceName, "-") { for _, component := range strings.Split(sliceName, "-") {
// test--a.slice isn't permitted, nor is -test.slice. // test--a.slice isn't permitted, nor is -test.slice.
if component == "" { if component == "" {
@ -384,7 +426,7 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
return "", err return "", err
} }
initPath, err := cgroups.GetInitCgroupDir(subsystem) initPath, err := cgroups.GetInitCgroup(subsystem)
if err != nil { if err != nil {
return "", err return "", err
} }
@ -396,7 +438,7 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
slice = c.Parent slice = c.Parent
} }
slice, err = expandSlice(slice) slice, err = ExpandSlice(slice)
if err != nil { if err != nil {
return "", err return "", err
} }
@ -483,8 +525,12 @@ func (m *Manager) Set(container *configs.Config) error {
} }
func getUnitName(c *configs.Cgroup) string { func getUnitName(c *configs.Cgroup) string {
// by default, we create a scope unless the user explicitly asks for a slice.
if !strings.HasSuffix(c.Name, ".slice") {
return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name) return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
} }
return c.Name
}
func setKernelMemory(c *configs.Cgroup) error { func setKernelMemory(c *configs.Cgroup) error {
path, err := getSubsystemPath(c, "memory") path, err := getSubsystemPath(c, "memory")
@ -497,3 +543,13 @@ func setKernelMemory(c *configs.Cgroup) error {
} }
return fs.EnableKernelMemoryAccounting(path) return fs.EnableKernelMemoryAccounting(path)
} }
// isUnitExists returns true if the error is that a systemd unit already exists.
func isUnitExists(err error) bool {
if err != nil {
if dbusError, ok := err.(dbus.Error); ok {
return strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists")
}
}
return false
}
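
The slice-expansion rule documented above (test-a-b.slice expands to test.slice/test-a.slice/test-a-b.slice, and -.slice now maps to the root) is easy to see in isolation. Below is a minimal standalone sketch of that rule, not the vendored ExpandSlice itself:

package main

import (
	"errors"
	"fmt"
	"path/filepath"
	"strings"
)

// expandSlice mirrors the rule described in the diff above: each dash-separated
// component adds one level of nesting, and "-.slice" is the root slice.
func expandSlice(slice string) (string, error) {
	const suffix = ".slice"
	if len(slice) <= len(suffix) || !strings.HasSuffix(slice, suffix) {
		return "", errors.New("invalid slice name: " + slice)
	}
	name := strings.TrimSuffix(slice, suffix)
	if name == "-" {
		return "/", nil
	}
	var path, prefix string
	for _, component := range strings.Split(name, "-") {
		// "test--a.slice" and "-test.slice" are rejected, as in the code above.
		if component == "" {
			return "", errors.New("invalid slice name: " + slice)
		}
		path = filepath.Join(path, prefix+component+suffix)
		prefix += component + "-"
	}
	return path, nil
}

func main() {
	p, err := expandSlice("test-a-b.slice")
	fmt.Println(p, err) // test.slice/test-a.slice/test-a-b.slice <nil>
}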

View File

@ -23,36 +23,14 @@ const (
// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt // https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
func FindCgroupMountpoint(subsystem string) (string, error) { func FindCgroupMountpoint(subsystem string) (string, error) {
// We are not using mount.GetMounts() because it's super-inefficient, mnt, _, err := FindCgroupMountpointAndRoot(subsystem)
// parsing it directly sped up x10 times because of not using Sscanf. return mnt, err
// It was one of two major performance drawbacks in container start.
if !isSubsystemAvailable(subsystem) {
return "", NewNotFoundError(subsystem)
}
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return "", err
}
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
txt := scanner.Text()
fields := strings.Split(txt, " ")
for _, opt := range strings.Split(fields[len(fields)-1], ",") {
if opt == subsystem {
return fields[4], nil
}
}
}
if err := scanner.Err(); err != nil {
return "", err
}
return "", NewNotFoundError(subsystem)
} }
func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) { func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) {
// We are not using mount.GetMounts() because it's super-inefficient,
// parsing it directly sped up x10 times because of not using Sscanf.
// It was one of two major performance drawbacks in container start.
if !isSubsystemAvailable(subsystem) { if !isSubsystemAvailable(subsystem) {
return "", "", NewNotFoundError(subsystem) return "", "", NewNotFoundError(subsystem)
} }
@ -131,7 +109,7 @@ type Mount struct {
Subsystems []string Subsystems []string
} }
func (m Mount) GetThisCgroupDir(cgroups map[string]string) (string, error) { func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
if len(m.Subsystems) == 0 { if len(m.Subsystems) == 0 {
return "", fmt.Errorf("no subsystem for mount") return "", fmt.Errorf("no subsystem for mount")
} }
@ -149,7 +127,7 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount,
if sepIdx == -1 { if sepIdx == -1 {
return nil, fmt.Errorf("invalid mountinfo format") return nil, fmt.Errorf("invalid mountinfo format")
} }
if txt[sepIdx+3:sepIdx+9] != "cgroup" { if txt[sepIdx+3:sepIdx+10] == "cgroup2" || txt[sepIdx+3:sepIdx+9] != "cgroup" {
continue continue
} }
fields := strings.Split(txt, " ") fields := strings.Split(txt, " ")
@ -211,9 +189,6 @@ func GetAllSubsystems() ([]string, error) {
s := bufio.NewScanner(f) s := bufio.NewScanner(f)
for s.Scan() { for s.Scan() {
if err := s.Err(); err != nil {
return nil, err
}
text := s.Text() text := s.Text()
if text[0] != '#' { if text[0] != '#' {
parts := strings.Fields(text) parts := strings.Fields(text)
@ -222,11 +197,14 @@ func GetAllSubsystems() ([]string, error) {
} }
} }
} }
if err := s.Err(); err != nil {
return nil, err
}
return subsystems, nil return subsystems, nil
} }
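
The GetAllSubsystems hunk above moves the s.Err() check out of the scan loop. bufio.Scanner only records a read error after Scan has returned false, so a check inside the loop can never fire; checking once after the loop is the correct pattern. A small stdlib illustration:

package main

import (
	"bufio"
	"fmt"
	"strings"
)

// scanLines collects lines and, crucially, checks s.Err() only after the loop,
// which is when bufio.Scanner actually reports a read failure.
func scanLines(r string) ([]string, error) {
	var out []string
	s := bufio.NewScanner(strings.NewReader(r))
	for s.Scan() {
		out = append(out, s.Text())
	}
	if err := s.Err(); err != nil {
		return nil, err
	}
	return out, nil
}

func main() {
	lines, err := scanLines("cpuset\ncpu\nmemory\n")
	fmt.Println(lines, err) // [cpuset cpu memory] <nil>
}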
// GetThisCgroupDir returns the relative path to the cgroup docker is running in. // GetOwnCgroup returns the relative path to the cgroup docker is running in.
func GetThisCgroupDir(subsystem string) (string, error) { func GetOwnCgroup(subsystem string) (string, error) {
cgroups, err := ParseCgroupFile("/proc/self/cgroup") cgroups, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil { if err != nil {
return "", err return "", err
@ -235,8 +213,16 @@ func GetThisCgroupDir(subsystem string) (string, error) {
return getControllerPath(subsystem, cgroups) return getControllerPath(subsystem, cgroups)
} }
func GetInitCgroupDir(subsystem string) (string, error) { func GetOwnCgroupPath(subsystem string) (string, error) {
cgroup, err := GetOwnCgroup(subsystem)
if err != nil {
return "", err
}
return getCgroupPathHelper(subsystem, cgroup)
}
func GetInitCgroup(subsystem string) (string, error) {
cgroups, err := ParseCgroupFile("/proc/1/cgroup") cgroups, err := ParseCgroupFile("/proc/1/cgroup")
if err != nil { if err != nil {
return "", err return "", err
@ -245,6 +231,31 @@ func GetInitCgroupDir(subsystem string) (string, error) {
return getControllerPath(subsystem, cgroups) return getControllerPath(subsystem, cgroups)
} }
func GetInitCgroupPath(subsystem string) (string, error) {
cgroup, err := GetInitCgroup(subsystem)
if err != nil {
return "", err
}
return getCgroupPathHelper(subsystem, cgroup)
}
func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
mnt, root, err := FindCgroupMountpointAndRoot(subsystem)
if err != nil {
return "", err
}
// This is needed for nested containers, because in /proc/self/cgroup we
// see paths from host, which don't exist in container.
relCgroup, err := filepath.Rel(root, cgroup)
if err != nil {
return "", err
}
return filepath.Join(mnt, relCgroup), nil
}
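
The comment in getCgroupPathHelper above is the key detail: the path reported in /proc/self/cgroup is relative to the mount root recorded in mountinfo, so it has to be re-based onto the mountpoint before it can be used on the host. A small sketch of that path arithmetic with hypothetical values, not the vendored helper:

package main

import (
	"fmt"
	"path/filepath"
)

// cgroupPath re-bases a cgroup path (as seen in /proc/self/cgroup) from the
// mount root onto the actual mountpoint, the same arithmetic as above.
func cgroupPath(mountpoint, root, cgroup string) (string, error) {
	rel, err := filepath.Rel(root, cgroup)
	if err != nil {
		return "", err
	}
	return filepath.Join(mountpoint, rel), nil
}

func main() {
	// Hypothetical nested-container values.
	p, err := cgroupPath("/sys/fs/cgroup/memory", "/machine.slice", "/machine.slice/docker/abc123")
	fmt.Println(p, err) // /sys/fs/cgroup/memory/docker/abc123 <nil>
}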
func readProcsFile(dir string) ([]int, error) { func readProcsFile(dir string) ([]int, error) {
f, err := os.Open(filepath.Join(dir, CgroupProcesses)) f, err := os.Open(filepath.Join(dir, CgroupProcesses))
if err != nil { if err != nil {
@ -287,10 +298,6 @@ func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
cgroups := make(map[string]string) cgroups := make(map[string]string)
for s.Scan() { for s.Scan() {
if err := s.Err(); err != nil {
return nil, err
}
text := s.Text() text := s.Text()
// from cgroups(7): // from cgroups(7):
// /proc/[pid]/cgroup // /proc/[pid]/cgroup
@ -307,6 +314,10 @@ func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
cgroups[subs] = parts[2] cgroups[subs] = parts[2]
} }
} }
if err := s.Err(); err != nil {
return nil, err
}
return cgroups, nil return cgroups, nil
} }

View File

@ -22,7 +22,7 @@ type Cgroup struct {
// The path is assumed to be relative to the host system cgroup mountpoint. // The path is assumed to be relative to the host system cgroup mountpoint.
Path string `json:"path"` Path string `json:"path"`
// ScopePrefix decribes prefix for the scope name // ScopePrefix describes prefix for the scope name
ScopePrefix string `json:"scope_prefix"` ScopePrefix string `json:"scope_prefix"`
// Paths represent the absolute cgroups paths to join. // Paths represent the absolute cgroups paths to join.
@ -45,34 +45,34 @@ type Resources struct {
Devices []*Device `json:"devices"` Devices []*Device `json:"devices"`
// Memory limit (in bytes) // Memory limit (in bytes)
Memory int64 `json:"memory"` Memory uint64 `json:"memory"`
// Memory reservation or soft_limit (in bytes) // Memory reservation or soft_limit (in bytes)
MemoryReservation int64 `json:"memory_reservation"` MemoryReservation uint64 `json:"memory_reservation"`
// Total memory usage (memory + swap); set `-1` to enable unlimited swap // Total memory usage (memory + swap); set `-1` to enable unlimited swap
MemorySwap int64 `json:"memory_swap"` MemorySwap uint64 `json:"memory_swap"`
// Kernel memory limit (in bytes) // Kernel memory limit (in bytes)
KernelMemory int64 `json:"kernel_memory"` KernelMemory uint64 `json:"kernel_memory"`
// Kernel memory limit for TCP use (in bytes) // Kernel memory limit for TCP use (in bytes)
KernelMemoryTCP int64 `json:"kernel_memory_tcp"` KernelMemoryTCP uint64 `json:"kernel_memory_tcp"`
// CPU shares (relative weight vs. other containers) // CPU shares (relative weight vs. other containers)
CpuShares int64 `json:"cpu_shares"` CpuShares uint64 `json:"cpu_shares"`
// CPU hardcap limit (in usecs). Allowed cpu time in a given period. // CPU hardcap limit (in usecs). Allowed cpu time in a given period.
CpuQuota int64 `json:"cpu_quota"` CpuQuota int64 `json:"cpu_quota"`
// CPU period to be used for hardcapping (in usecs). 0 to use system default. // CPU period to be used for hardcapping (in usecs). 0 to use system default.
CpuPeriod int64 `json:"cpu_period"` CpuPeriod uint64 `json:"cpu_period"`
// How many time CPU will use in realtime scheduling (in usecs). // How many time CPU will use in realtime scheduling (in usecs).
CpuRtRuntime int64 `json:"cpu_rt_quota"` CpuRtRuntime int64 `json:"cpu_rt_quota"`
// CPU period to be used for realtime scheduling (in usecs). // CPU period to be used for realtime scheduling (in usecs).
CpuRtPeriod int64 `json:"cpu_rt_period"` CpuRtPeriod uint64 `json:"cpu_rt_period"`
// CPU to use // CPU to use
CpusetCpus string `json:"cpuset_cpus"` CpusetCpus string `json:"cpuset_cpus"`
@ -95,7 +95,7 @@ type Resources struct {
// IO read rate limit per cgroup per device, bytes per second. // IO read rate limit per cgroup per device, bytes per second.
BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"` BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"`
// IO write rate limit per cgroup per divice, bytes per second. // IO write rate limit per cgroup per device, bytes per second.
BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"` BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"`
// IO read rate limit per cgroup per device, IO per second. // IO read rate limit per cgroup per device, IO per second.
@ -114,7 +114,7 @@ type Resources struct {
OomKillDisable bool `json:"oom_kill_disable"` OomKillDisable bool `json:"oom_kill_disable"`
// Tuning swappiness behaviour per cgroup // Tuning swappiness behaviour per cgroup
MemorySwappiness *int64 `json:"memory_swappiness"` MemorySwappiness *uint64 `json:"memory_swappiness"`
// Set priority of network traffic for container // Set priority of network traffic for container
NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"` NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"`
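
The hunk above flips most resource limits from int64 to uint64 and keeps MemorySwappiness as a pointer so that "unset" stays distinguishable from zero. A pared-down stand-in struct (field names and tags copied from the diff, the type itself is only illustrative) shows how that serializes:

package main

import (
	"encoding/json"
	"fmt"
)

// resources is a cut-down illustration of the Resources struct above.
type resources struct {
	Memory            uint64  `json:"memory"`
	MemoryReservation uint64  `json:"memory_reservation"`
	CpuShares         uint64  `json:"cpu_shares"`
	MemorySwappiness  *uint64 `json:"memory_swappiness"`
}

func main() {
	swappiness := uint64(60)
	b, err := json.Marshal(resources{
		Memory:           512 << 20, // 512 MiB
		CpuShares:        1024,
		MemorySwappiness: &swappiness,
	})
	fmt.Println(string(b), err)
}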

View File

@ -8,6 +8,7 @@ import (
"time" "time"
"github.com/Sirupsen/logrus" "github.com/Sirupsen/logrus"
"github.com/opencontainers/runtime-spec/specs-go"
) )
type Rlimit struct { type Rlimit struct {
@ -85,11 +86,6 @@ type Config struct {
// that the parent process dies. // that the parent process dies.
ParentDeathSignal int `json:"parent_death_signal"` ParentDeathSignal int `json:"parent_death_signal"`
// PivotDir allows a custom directory inside the container's root filesystem to be used as pivot, when NoPivotRoot is not set.
// When a custom PivotDir not set, a temporary dir inside the root filesystem will be used. The pivot dir needs to be writeable.
// This is required when using read only root filesystems. In these cases, a read/writeable path can be (bind) mounted somewhere inside the root filesystem to act as pivot.
PivotDir string `json:"pivot_dir"`
// Path to a directory containing the container's root filesystem. // Path to a directory containing the container's root filesystem.
Rootfs string `json:"rootfs"` Rootfs string `json:"rootfs"`
@ -117,8 +113,8 @@ type Config struct {
Namespaces Namespaces `json:"namespaces"` Namespaces Namespaces `json:"namespaces"`
// Capabilities specify the capabilities to keep when executing the process inside the container // Capabilities specify the capabilities to keep when executing the process inside the container
// All capbilities not specified will be dropped from the processes capability mask // All capabilities not specified will be dropped from the processes capability mask
Capabilities []string `json:"capabilities"` Capabilities *Capabilities `json:"capabilities"`
// Networks specifies the container's network setup to be created // Networks specifies the container's network setup to be created
Networks []*Network `json:"networks"` Networks []*Network `json:"networks"`
@ -187,6 +183,9 @@ type Config struct {
// NoNewKeyring will not allocated a new session keyring for the container. It will use the // NoNewKeyring will not allocated a new session keyring for the container. It will use the
// callers keyring in this case. // callers keyring in this case.
NoNewKeyring bool `json:"no_new_keyring"` NoNewKeyring bool `json:"no_new_keyring"`
// Rootless specifies whether the container is a rootless container.
Rootless bool `json:"rootless"`
} }
type Hooks struct { type Hooks struct {
@ -201,6 +200,19 @@ type Hooks struct {
Poststop []Hook Poststop []Hook
} }
type Capabilities struct {
// Bounding is the set of capabilities checked by the kernel.
Bounding []string
// Effective is the set of capabilities checked by the kernel.
Effective []string
// Inheritable is the capabilities preserved across execve.
Inheritable []string
// Permitted is the limiting superset for effective capabilities.
Permitted []string
// Ambient is the ambient set of capabilities that are kept.
Ambient []string
}
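
Capabilities changes from a flat []string to a struct with five distinct sets, so callers now have to populate each set explicitly. An illustrative construction (the set contents are arbitrary examples, and the type is copied here only for the sketch):

package main

import "fmt"

// Capabilities mirrors the struct introduced above.
type Capabilities struct {
	Bounding    []string
	Effective   []string
	Inheritable []string
	Permitted   []string
	Ambient     []string
}

func main() {
	caps := []string{"CAP_KILL", "CAP_NET_BIND_SERVICE"}
	c := &Capabilities{
		Bounding:    caps,
		Effective:   caps,
		Inheritable: caps,
		Permitted:   caps,
		Ambient:     caps,
	}
	fmt.Printf("%+v\n", c)
}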
func (hooks *Hooks) UnmarshalJSON(b []byte) error { func (hooks *Hooks) UnmarshalJSON(b []byte) error {
var state struct { var state struct {
Prestart []CommandHook Prestart []CommandHook
@ -248,13 +260,7 @@ func (hooks Hooks) MarshalJSON() ([]byte, error) {
} }
// HookState is the payload provided to a hook on execution. // HookState is the payload provided to a hook on execution.
type HookState struct { type HookState specs.State
Version string `json:"ociVersion"`
ID string `json:"id"`
Pid int `json:"pid"`
Root string `json:"root"`
BundlePath string `json:"bundlePath"`
}
type Hook interface { type Hook interface {
// Run executes the hook with the provided state. // Run executes the hook with the provided state.

View File

@ -4,38 +4,50 @@ package configs
import "fmt" import "fmt"
// HostUID gets the root uid for the process on host which could be non-zero // HostUID gets the translated uid for the process on host which could be
// when user namespaces are enabled. // different when user namespaces are enabled.
func (c Config) HostUID() (int, error) { func (c Config) HostUID(containerId int) (int, error) {
if c.Namespaces.Contains(NEWUSER) { if c.Namespaces.Contains(NEWUSER) {
if c.UidMappings == nil { if c.UidMappings == nil {
return -1, fmt.Errorf("User namespaces enabled, but no user mappings found.") return -1, fmt.Errorf("User namespaces enabled, but no uid mappings found.")
} }
id, found := c.hostIDFromMapping(0, c.UidMappings) id, found := c.hostIDFromMapping(containerId, c.UidMappings)
if !found { if !found {
return -1, fmt.Errorf("User namespaces enabled, but no root user mapping found.") return -1, fmt.Errorf("User namespaces enabled, but no user mapping found.")
} }
return id, nil return id, nil
} }
// Return default root uid 0 // Return unchanged id.
return 0, nil return containerId, nil
} }
// HostGID gets the root gid for the process on host which could be non-zero // HostRootUID gets the root uid for the process on host which could be non-zero
// when user namespaces are enabled. // when user namespaces are enabled.
func (c Config) HostGID() (int, error) { func (c Config) HostRootUID() (int, error) {
return c.HostUID(0)
}
// HostGID gets the translated gid for the process on host which could be
// different when user namespaces are enabled.
func (c Config) HostGID(containerId int) (int, error) {
if c.Namespaces.Contains(NEWUSER) { if c.Namespaces.Contains(NEWUSER) {
if c.GidMappings == nil { if c.GidMappings == nil {
return -1, fmt.Errorf("User namespaces enabled, but no gid mappings found.") return -1, fmt.Errorf("User namespaces enabled, but no gid mappings found.")
} }
id, found := c.hostIDFromMapping(0, c.GidMappings) id, found := c.hostIDFromMapping(containerId, c.GidMappings)
if !found { if !found {
return -1, fmt.Errorf("User namespaces enabled, but no root group mapping found.") return -1, fmt.Errorf("User namespaces enabled, but no group mapping found.")
} }
return id, nil return id, nil
} }
// Return default root gid 0 // Return unchanged id.
return 0, nil return containerId, nil
}
// HostRootGID gets the root gid for the process on host which could be non-zero
// when user namespaces are enabled.
func (c Config) HostRootGID() (int, error) {
return c.HostGID(0)
} }
// Utility function that gets a host ID for a container ID from user namespace map // Utility function that gets a host ID for a container ID from user namespace map
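
HostUID and HostGID above now translate an arbitrary container ID through the uid/gid mappings instead of hard-coding root, with HostRootUID/HostRootGID as thin wrappers for ID 0. A standalone sketch of the underlying range arithmetic; the IDMap type here is a stand-in for configs.IDMap:

package main

import (
	"errors"
	"fmt"
)

// IDMap is a contiguous range mapping container IDs onto host IDs.
type IDMap struct {
	ContainerID, HostID, Size int
}

// hostID translates a container uid/gid through a mapping table.
func hostID(containerID int, mappings []IDMap) (int, error) {
	for _, m := range mappings {
		if containerID >= m.ContainerID && containerID < m.ContainerID+m.Size {
			return m.HostID + (containerID - m.ContainerID), nil
		}
	}
	return -1, errors.New("no mapping found")
}

func main() {
	mappings := []IDMap{{ContainerID: 0, HostID: 100000, Size: 65536}}
	id, err := hostID(0, mappings) // what HostRootUID() resolves to here
	fmt.Println(id, err)           // 100000 <nil>
}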

View File

@ -1,5 +1,11 @@
package configs package configs
const (
// EXT_COPYUP is a directive to copy up the contents of a directory when
// a tmpfs is mounted over it.
EXT_COPYUP = 1 << iota
)
type Mount struct { type Mount struct {
// Source path for the mount. // Source path for the mount.
Source string `json:"source"` Source string `json:"source"`
@ -22,6 +28,9 @@ type Mount struct {
// Relabel source if set, "z" indicates shared, "Z" indicates unshared. // Relabel source if set, "z" indicates shared, "Z" indicates unshared.
Relabel string `json:"relabel"` Relabel string `json:"relabel"`
// Extensions are additional flags that are specific to runc.
Extensions int `json:"extensions"`
// Optional Command to be run before Source is mounted. // Optional Command to be run before Source is mounted.
PremountCmds []Command `json:"premount_cmds"` PremountCmds []Command `json:"premount_cmds"`

View File

@ -4,12 +4,10 @@ package configs
func (n *Namespace) Syscall() int { func (n *Namespace) Syscall() int {
panic("No namespace syscall support") panic("No namespace syscall support")
return 0
} }
// CloneFlags parses the container's Namespaces options to set the correct // CloneFlags parses the container's Namespaces options to set the correct
// flags on clone, unshare. This function returns flags only for new namespaces. // flags on clone, unshare. This function returns flags only for new namespaces.
func (n *Namespaces) CloneFlags() uintptr { func (n *Namespaces) CloneFlags() uintptr {
panic("No namespace syscall support") panic("No namespace syscall support")
return uintptr(0)
} }

View File

@ -22,8 +22,8 @@ var (
supportedNamespaces = make(map[NamespaceType]bool) supportedNamespaces = make(map[NamespaceType]bool)
) )
// nsToFile converts the namespace type to its filename // NsName converts the namespace type to its filename
func nsToFile(ns NamespaceType) string { func NsName(ns NamespaceType) string {
switch ns { switch ns {
case NEWNET: case NEWNET:
return "net" return "net"
@ -50,7 +50,7 @@ func IsNamespaceSupported(ns NamespaceType) bool {
if ok { if ok {
return supported return supported
} }
nsFile := nsToFile(ns) nsFile := NsName(ns)
// if the namespace type is unknown, just return false // if the namespace type is unknown, just return false
if nsFile == "" { if nsFile == "" {
return false return false
@ -84,7 +84,7 @@ func (n *Namespace) GetPath(pid int) string {
if n.Path != "" { if n.Path != "" {
return n.Path return n.Path
} }
return fmt.Sprintf("/proc/%d/ns/%s", pid, nsToFile(n.Type)) return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type))
} }
func (n *Namespaces) Remove(t NamespaceType) bool { func (n *Namespaces) Remove(t NamespaceType) bool {

View File

@ -0,0 +1,117 @@
package validate
import (
"fmt"
"os"
"reflect"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
)
var (
geteuid = os.Geteuid
getegid = os.Getegid
)
func (v *ConfigValidator) rootless(config *configs.Config) error {
if err := rootlessMappings(config); err != nil {
return err
}
if err := rootlessMount(config); err != nil {
return err
}
// Currently, cgroups cannot effectively be used in rootless containers.
// The new cgroup namespace doesn't really help us either because it doesn't
// have nice interactions with the user namespace (we're working with upstream
// to fix this).
if err := rootlessCgroup(config); err != nil {
return err
}
// XXX: We currently can't verify the user config at all, because
// configs.Config doesn't store the user-related configs. So this
// has to be verified by setupUser() in init_linux.go.
return nil
}
func rootlessMappings(config *configs.Config) error {
rootuid, err := config.HostRootUID()
if err != nil {
return fmt.Errorf("failed to get root uid from uidMappings: %v", err)
}
if euid := geteuid(); euid != 0 {
if !config.Namespaces.Contains(configs.NEWUSER) {
return fmt.Errorf("rootless containers require user namespaces")
}
if rootuid != euid {
return fmt.Errorf("rootless containers cannot map container root to a different host user")
}
}
rootgid, err := config.HostRootGID()
if err != nil {
return fmt.Errorf("failed to get root gid from gidMappings: %v", err)
}
// Similar to the above test, we need to make sure that we aren't trying to
// map to a group ID that we don't have the right to be.
if rootgid != getegid() {
return fmt.Errorf("rootless containers cannot map container root to a different host group")
}
// We can only map one user and group inside a container (our own).
if len(config.UidMappings) != 1 || config.UidMappings[0].Size != 1 {
return fmt.Errorf("rootless containers cannot map more than one user")
}
if len(config.GidMappings) != 1 || config.GidMappings[0].Size != 1 {
return fmt.Errorf("rootless containers cannot map more than one group")
}
return nil
}
// cgroup verifies that the user isn't trying to set any cgroup limits or paths.
func rootlessCgroup(config *configs.Config) error {
// Nothing set at all.
if config.Cgroups == nil || config.Cgroups.Resources == nil {
return nil
}
// Used for comparing to the zero value.
left := reflect.ValueOf(*config.Cgroups.Resources)
right := reflect.Zero(left.Type())
// This is all we need to do, since specconv won't add cgroup options in
// rootless mode.
if !reflect.DeepEqual(left.Interface(), right.Interface()) {
return fmt.Errorf("cannot specify resource limits in rootless container")
}
return nil
}
// mount verifies that the user isn't trying to set up any mounts they don't have
// the rights to do. In addition, it makes sure that no mount has a `uid=` or
// `gid=` option that doesn't resolve to root.
func rootlessMount(config *configs.Config) error {
// XXX: We could whitelist allowed devices at this point, but I'm not
// convinced that's a good idea. The kernel is the best arbiter of
// access control.
for _, mount := range config.Mounts {
// Check that the options list doesn't contain any uid= or gid= entries
// that don't resolve to root.
for _, opt := range strings.Split(mount.Data, ",") {
if strings.HasPrefix(opt, "uid=") && opt != "uid=0" {
return fmt.Errorf("cannot specify uid= mount options in rootless containers where argument isn't 0")
}
if strings.HasPrefix(opt, "gid=") && opt != "gid=0" {
return fmt.Errorf("cannot specify gid= mount options in rootless containers where argument isn't 0")
}
}
}
return nil
}
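
The rootlessCgroup check above compares the whole Resources struct against its zero value via reflection, so any populated limit is rejected. A minimal demonstration of that zero-value comparison with a stand-in struct:

package main

import (
	"fmt"
	"reflect"
)

// resources stands in for configs.Resources; only the comparison matters here.
type resources struct {
	Memory    uint64
	CpuShares uint64
}

// isZero reports whether v equals the zero value of its type.
func isZero(v interface{}) bool {
	left := reflect.ValueOf(v)
	right := reflect.Zero(left.Type())
	return reflect.DeepEqual(left.Interface(), right.Interface())
}

func main() {
	fmt.Println(isZero(resources{}))                // true: no limits requested
	fmt.Println(isZero(resources{CpuShares: 1024})) // false: would be rejected
}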

View File

@ -40,12 +40,23 @@ func (v *ConfigValidator) Validate(config *configs.Config) error {
if err := v.sysctl(config); err != nil { if err := v.sysctl(config); err != nil {
return err return err
} }
if config.Rootless {
if err := v.rootless(config); err != nil {
return err
}
}
return nil return nil
} }
// rootfs validates if the rootfs is an absolute path and is not a symlink // rootfs validates if the rootfs is an absolute path and is not a symlink
// to the container's root filesystem. // to the container's root filesystem.
func (v *ConfigValidator) rootfs(config *configs.Config) error { func (v *ConfigValidator) rootfs(config *configs.Config) error {
if _, err := os.Stat(config.Rootfs); err != nil {
if os.IsNotExist(err) {
return fmt.Errorf("rootfs (%s) does not exist", config.Rootfs)
}
return err
}
cleaned, err := filepath.Abs(config.Rootfs) cleaned, err := filepath.Abs(config.Rootfs)
if err != nil { if err != nil {
return err return err
@ -126,6 +137,11 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error {
} }
if strings.HasPrefix(s, "net.") { if strings.HasPrefix(s, "net.") {
if config.Namespaces.Contains(configs.NEWNET) { if config.Namespaces.Contains(configs.NEWNET) {
if path := config.Namespaces.PathOf(configs.NEWNET); path != "" {
if err := checkHostNs(s, path); err != nil {
return err
}
}
continue continue
} else { } else {
return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s) return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s)
@ -136,3 +152,44 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error {
return nil return nil
} }
func isSymbolicLink(path string) (bool, error) {
fi, err := os.Lstat(path)
if err != nil {
return false, err
}
return fi.Mode()&os.ModeSymlink == os.ModeSymlink, nil
}
// checkHostNs checks whether network sysctl is used in host namespace.
func checkHostNs(sysctlConfig string, path string) error {
var currentProcessNetns = "/proc/self/ns/net"
// readlink on the current processes network namespace
destOfCurrentProcess, err := os.Readlink(currentProcessNetns)
if err != nil {
return fmt.Errorf("read soft link %q error", currentProcessNetns)
}
// First check if the provided path is a symbolic link
symLink, err := isSymbolicLink(path)
if err != nil {
return fmt.Errorf("could not check that %q is a symlink: %v", path, err)
}
if symLink == false {
// The provided namespace is not a symbolic link,
// it is not the host namespace.
return nil
}
// readlink on the path provided in the struct
destOfContainer, err := os.Readlink(path)
if err != nil {
return fmt.Errorf("read soft link %q error", path)
}
if destOfContainer == destOfCurrentProcess {
return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", sysctlConfig)
}
return nil
}
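
checkHostNs above decides whether a configured network namespace path is really the host's by comparing readlink results. A Linux-only sketch of that comparison, without the symlink pre-check and error wrapping of the vendored code:

package main

import (
	"fmt"
	"os"
)

// sameNetNS reports whether path refers to the calling process's own network
// namespace, i.e. both symlinks resolve to the same "net:[inode]" target.
func sameNetNS(path string) (bool, error) {
	self, err := os.Readlink("/proc/self/ns/net")
	if err != nil {
		return false, err
	}
	other, err := os.Readlink(path)
	if err != nil {
		return false, err
	}
	return self == other, nil
}

func main() {
	ok, err := sameNetNS("/proc/1/ns/net") // may need privileges to read
	fmt.Println(ok, err)
}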

View File

@ -1,15 +1,17 @@
package libcontainer package libcontainer
import "io" import (
"io"
"os"
)
// Console represents a pseudo TTY. // Console represents a pseudo TTY.
type Console interface { type Console interface {
io.ReadWriter io.ReadWriteCloser
io.Closer
// Path returns the filesystem path to the slave side of the pty. // Path returns the filesystem path to the slave side of the pty.
Path() string Path() string
// Fd returns the fd for the master of the pty. // Fd returns the fd for the master of the pty.
Fd() uintptr File() *os.File
} }

View File

@ -6,8 +6,8 @@ import (
"errors" "errors"
) )
// NewConsole returns an initalized console that can be used within a container by copying bytes // newConsole returns an initialized console that can be used within a container by copying bytes
// from the master side to the slave that is attached as the tty for the container's init process. // from the master side to the slave that is attached as the tty for the container's init process.
func NewConsole(uid, gid int) (Console, error) { func newConsole() (Console, error) {
return nil, errors.New("libcontainer console is not supported on FreeBSD") return nil, errors.New("libcontainer console is not supported on FreeBSD")
} }

View File

@ -3,20 +3,26 @@ package libcontainer
import ( import (
"fmt" "fmt"
"os" "os"
"path/filepath"
"syscall" "syscall"
"unsafe" "unsafe"
"github.com/opencontainers/runc/libcontainer/label"
) )
// NewConsole returns an initalized console that can be used within a container by copying bytes func ConsoleFromFile(f *os.File) Console {
return &linuxConsole{
master: f,
}
}
// newConsole returns an initialized console that can be used within a container by copying bytes
// from the master side to the slave that is attached as the tty for the container's init process. // from the master side to the slave that is attached as the tty for the container's init process.
func NewConsole(uid, gid int) (Console, error) { func newConsole() (Console, error) {
master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0)
if err != nil { if err != nil {
return nil, err return nil, err
} }
if err := saneTerminal(master); err != nil {
return nil, err
}
console, err := ptsname(master) console, err := ptsname(master)
if err != nil { if err != nil {
return nil, err return nil, err
@ -24,34 +30,20 @@ func NewConsole(uid, gid int) (Console, error) {
if err := unlockpt(master); err != nil { if err := unlockpt(master); err != nil {
return nil, err return nil, err
} }
if err := os.Chmod(console, 0600); err != nil {
return nil, err
}
if err := os.Chown(console, uid, gid); err != nil {
return nil, err
}
return &linuxConsole{ return &linuxConsole{
slavePath: console, slavePath: console,
master: master, master: master,
}, nil }, nil
} }
// newConsoleFromPath is an internal function returning an initialized console for use inside // linuxConsole is a linux pseudo TTY for use within a container.
// a container's MNT namespace.
func newConsoleFromPath(slavePath string) *linuxConsole {
return &linuxConsole{
slavePath: slavePath,
}
}
// linuxConsole is a linux psuedo TTY for use within a container.
type linuxConsole struct { type linuxConsole struct {
master *os.File master *os.File
slavePath string slavePath string
} }
func (c *linuxConsole) Fd() uintptr { func (c *linuxConsole) File() *os.File {
return c.master.Fd() return c.master
} }
func (c *linuxConsole) Path() string { func (c *linuxConsole) Path() string {
@ -75,21 +67,17 @@ func (c *linuxConsole) Close() error {
// mount initializes the console inside the rootfs mounting with the specified mount label // mount initializes the console inside the rootfs mounting with the specified mount label
// and applying the correct ownership of the console. // and applying the correct ownership of the console.
func (c *linuxConsole) mount(rootfs, mountLabel string) error { func (c *linuxConsole) mount() error {
oldMask := syscall.Umask(0000) oldMask := syscall.Umask(0000)
defer syscall.Umask(oldMask) defer syscall.Umask(oldMask)
if err := label.SetFileLabel(c.slavePath, mountLabel); err != nil { f, err := os.Create("/dev/console")
return err
}
dest := filepath.Join(rootfs, "/dev/console")
f, err := os.Create(dest)
if err != nil && !os.IsExist(err) { if err != nil && !os.IsExist(err) {
return err return err
} }
if f != nil { if f != nil {
f.Close() f.Close()
} }
return syscall.Mount(c.slavePath, dest, "bind", syscall.MS_BIND, "") return syscall.Mount(c.slavePath, "/dev/console", "bind", syscall.MS_BIND, "")
} }
// dupStdio opens the slavePath for the console and dups the fds to the current // dupStdio opens the slavePath for the console and dups the fds to the current
@ -143,3 +131,26 @@ func ptsname(f *os.File) (string, error) {
} }
return fmt.Sprintf("/dev/pts/%d", n), nil return fmt.Sprintf("/dev/pts/%d", n), nil
} }
// saneTerminal sets the necessary tty_ioctl(4)s to ensure that a pty pair
// created by us acts normally. In particular, a not-very-well-known default of
// Linux unix98 ptys is that they have +onlcr by default. While this isn't a
// problem for terminal emulators, because we relay data from the terminal we
// also relay that funky line discipline.
func saneTerminal(terminal *os.File) error {
// Go doesn't have a wrapper for any of the termios ioctls.
var termios syscall.Termios
if err := ioctl(terminal.Fd(), syscall.TCGETS, uintptr(unsafe.Pointer(&termios))); err != nil {
return fmt.Errorf("ioctl(tty, tcgets): %s", err.Error())
}
// Set -onlcr so we don't have to deal with \r.
termios.Oflag &^= syscall.ONLCR
if err := ioctl(terminal.Fd(), syscall.TCSETS, uintptr(unsafe.Pointer(&termios))); err != nil {
return fmt.Errorf("ioctl(tty, tcsets): %s", err.Error())
}
return nil
}
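
saneTerminal above clears the +onlcr default of Linux unix98 ptys so relayed output does not pick up carriage returns. The same effect can be sketched with golang.org/x/sys/unix instead of the raw ioctl wrapper used in the vendored code; this is an assumption-laden sketch, not the runc implementation:

//go:build linux

package main

import (
	"fmt"
	"os"

	"golang.org/x/sys/unix"
)

// disableONLCR clears the ONLCR output flag on a pty master.
func disableONLCR(f *os.File) error {
	termios, err := unix.IoctlGetTermios(int(f.Fd()), unix.TCGETS)
	if err != nil {
		return fmt.Errorf("tcgets: %w", err)
	}
	termios.Oflag &^= unix.ONLCR
	return unix.IoctlSetTermios(int(f.Fd()), unix.TCSETS, termios)
}

func main() {
	master, err := os.OpenFile("/dev/ptmx", os.O_RDWR|unix.O_NOCTTY|unix.O_CLOEXEC, 0)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}
	defer master.Close()
	fmt.Println(disableONLCR(master))
}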

View File

@ -4,8 +4,8 @@ import (
"errors" "errors"
) )
// NewConsole returns an initalized console that can be used within a container by copying bytes // newConsole returns an initialized console that can be used within a container by copying bytes
// from the master side to the slave that is attached as the tty for the container's init process. // from the master side to the slave that is attached as the tty for the container's init process.
func NewConsole(uid, gid int) (Console, error) { func newConsole() (Console, error) {
return nil, errors.New("libcontainer console is not supported on Solaris") return nil, errors.New("libcontainer console is not supported on Solaris")
} }

View File

@ -1,11 +1,11 @@
package libcontainer package libcontainer
// NewConsole returns an initalized console that can be used within a container // newConsole returns an initialized console that can be used within a container
func NewConsole(uid, gid int) (Console, error) { func newConsole() (Console, error) {
return &windowsConsole{}, nil return &windowsConsole{}, nil
} }
// windowsConsole is a Windows psuedo TTY for use within a container. // windowsConsole is a Windows pseudo TTY for use within a container.
type windowsConsole struct { type windowsConsole struct {
} }

View File

@ -123,7 +123,7 @@ type BaseContainer interface {
// SystemError - System error. // SystemError - System error.
Start(process *Process) (err error) Start(process *Process) (err error)
// Run immediatly starts the process inside the conatiner. Returns error if process // Run immediately starts the process inside the container. Returns error if process
// fails to start. It does not block waiting for the exec fifo after start returns but // fails to start. It does not block waiting for the exec fifo after start returns but
// opens the fifo after start returns. // opens the fifo after start returns.
// //
@ -134,20 +134,29 @@ type BaseContainer interface {
// SystemError - System error. // SystemError - System error.
Run(process *Process) (err error) Run(process *Process) (err error)
// Destroys the container after killing all running processes. // Destroys the container, if its in a valid state, after killing any
// remaining running processes.
// //
// Any event registrations are removed before the container is destroyed. // Any event registrations are removed before the container is destroyed.
// No error is returned if the container is already destroyed. // No error is returned if the container is already destroyed.
// //
// Running containers must first be stopped using Signal(..).
// Paused containers must first be resumed using Resume(..).
//
// errors: // errors:
// ContainerNotStopped - Container is still running,
// ContainerPaused - Container is paused,
// SystemError - System error. // SystemError - System error.
Destroy() error Destroy() error
// Signal sends the provided signal code to the container's initial process. // Signal sends the provided signal code to the container's initial process.
// //
// If all is specified the signal is sent to all processes in the container
// including the initial process.
//
// errors: // errors:
// SystemError - System error. // SystemError - System error.
Signal(s os.Signal) error Signal(s os.Signal, all bool) error
// Exec signals the container to exec the users process at the end of the init. // Exec signals the container to exec the users process at the end of the init.
// //

View File

@ -51,6 +51,9 @@ type State struct {
// Platform specific fields below here // Platform specific fields below here
// Specifies if the container was started under the rootless mode.
Rootless bool `json:"rootless"`
// Path to all the cgroups setup for a container. Key is cgroup subsystem name // Path to all the cgroups setup for a container. Key is cgroup subsystem name
// with the value as the path. // with the value as the path.
CgroupPaths map[string]string `json:"cgroup_paths"` CgroupPaths map[string]string `json:"cgroup_paths"`
@ -191,17 +194,29 @@ func (c *linuxContainer) Start(process *Process) error {
if err != nil { if err != nil {
return err return err
} }
return c.start(process, status == Stopped) if status == Stopped {
if err := c.createExecFifo(); err != nil {
return err
}
}
if err := c.start(process, status == Stopped); err != nil {
if status == Stopped {
c.deleteExecFifo()
}
return err
}
return nil
} }
func (c *linuxContainer) Run(process *Process) error { func (c *linuxContainer) Run(process *Process) error {
c.m.Lock() c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus() status, err := c.currentStatus()
if err != nil { if err != nil {
c.m.Unlock()
return err return err
} }
if err := c.start(process, status == Stopped); err != nil { c.m.Unlock()
if err := c.Start(process); err != nil {
return err return err
} }
if status == Stopped { if status == Stopped {
@ -266,8 +281,7 @@ func (c *linuxContainer) start(process *Process, isInit bool) error {
Version: c.config.Version, Version: c.config.Version,
ID: c.id, ID: c.id,
Pid: parent.pid(), Pid: parent.pid(),
Root: c.config.Rootfs, Bundle: utils.SearchLabels(c.config.Labels, "bundle"),
BundlePath: utils.SearchLabels(c.config.Labels, "bundle"),
} }
for i, hook := range c.config.Hooks.Poststart { for i, hook := range c.config.Hooks.Poststart {
if err := hook.Run(s); err != nil { if err := hook.Run(s); err != nil {
@ -282,33 +296,75 @@ func (c *linuxContainer) start(process *Process, isInit bool) error {
return nil return nil
} }
func (c *linuxContainer) Signal(s os.Signal) error { func (c *linuxContainer) Signal(s os.Signal, all bool) error {
if all {
return signalAllProcesses(c.cgroupManager, s)
}
if err := c.initProcess.signal(s); err != nil { if err := c.initProcess.signal(s); err != nil {
return newSystemErrorWithCause(err, "signaling init process") return newSystemErrorWithCause(err, "signaling init process")
} }
return nil return nil
} }
func (c *linuxContainer) createExecFifo() error {
rootuid, err := c.Config().HostRootUID()
if err != nil {
return err
}
rootgid, err := c.Config().HostRootGID()
if err != nil {
return err
}
fifoName := filepath.Join(c.root, execFifoFilename)
if _, err := os.Stat(fifoName); err == nil {
return fmt.Errorf("exec fifo %s already exists", fifoName)
}
oldMask := syscall.Umask(0000)
if err := syscall.Mkfifo(fifoName, 0622); err != nil {
syscall.Umask(oldMask)
return err
}
syscall.Umask(oldMask)
if err := os.Chown(fifoName, rootuid, rootgid); err != nil {
return err
}
return nil
}
func (c *linuxContainer) deleteExecFifo() {
fifoName := filepath.Join(c.root, execFifoFilename)
os.Remove(fifoName)
}
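
createExecFifo above is what makes the new Created state observable: the fifo is created with the umask temporarily zeroed so the 0622 mode survives, then chowned to the (possibly user-namespace-mapped) container root. A Linux-only sketch of that sequence with illustrative paths and ownership, not the vendored function:

//go:build linux

package main

import (
	"fmt"
	"os"
	"path/filepath"
	"syscall"
)

// createFifo creates an exec.fifo-style pipe in stateDir and hands it to uid/gid.
func createFifo(stateDir string, uid, gid int) (string, error) {
	fifo := filepath.Join(stateDir, "exec.fifo")
	if _, err := os.Stat(fifo); err == nil {
		return "", fmt.Errorf("exec fifo %s already exists", fifo)
	}
	oldMask := syscall.Umask(0000) // keep the requested mode intact
	err := syscall.Mkfifo(fifo, 0622)
	syscall.Umask(oldMask)
	if err != nil {
		return "", err
	}
	if err := os.Chown(fifo, uid, gid); err != nil {
		return "", err
	}
	return fifo, nil
}

func main() {
	dir, err := os.MkdirTemp("", "state")
	if err != nil {
		panic(err)
	}
	defer os.RemoveAll(dir)
	fmt.Println(createFifo(dir, os.Getuid(), os.Getgid()))
}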
func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) { func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) {
parentPipe, childPipe, err := newPipe() parentPipe, childPipe, err := utils.NewSockPair("init")
if err != nil { if err != nil {
return nil, newSystemErrorWithCause(err, "creating new init pipe") return nil, newSystemErrorWithCause(err, "creating new init pipe")
} }
rootDir, err := os.Open(c.root) cmd, err := c.commandTemplate(p, childPipe)
if err != nil {
return nil, err
}
cmd, err := c.commandTemplate(p, childPipe, rootDir)
if err != nil { if err != nil {
return nil, newSystemErrorWithCause(err, "creating new command template") return nil, newSystemErrorWithCause(err, "creating new command template")
} }
if !doInit { if !doInit {
return c.newSetnsProcess(p, cmd, parentPipe, childPipe, rootDir) return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
} }
// We only set up rootDir if we're not doing a `runc exec`. The reason for
// this is to avoid cases where a racing, unprivileged process inside the
// container can get access to the statedir file descriptor (which would
// allow for container rootfs escape).
rootDir, err := os.Open(c.root)
if err != nil {
return nil, err
}
cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir)
cmd.Env = append(cmd.Env,
fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir) return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)
} }
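
The fd bookkeeping in newParentProcess and commandTemplate above follows one rule: stdio occupies fds 0-2, so the Nth entry of cmd.ExtraFiles shows up in the child as fd 3+N, and its number is advertised to the child through an environment variable. A standalone sketch of that pattern; the _EXAMPLE_STATEDIR name is made up and /bin/sh stands in for the re-executed init:

package main

import (
	"fmt"
	"os"
	"os/exec"
)

const stdioFdCount = 3 // stdin, stdout, stderr

func main() {
	dir, err := os.Open(os.TempDir())
	if err != nil {
		panic(err)
	}
	defer dir.Close()

	cmd := exec.Command("/bin/sh", "-c", `echo "statedir fd: $_EXAMPLE_STATEDIR"`)
	// The appended file becomes fd 3 in the child; tell the child which fd it is.
	cmd.ExtraFiles = append(cmd.ExtraFiles, dir)
	cmd.Env = append(os.Environ(),
		fmt.Sprintf("_EXAMPLE_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	if err := cmd.Run(); err != nil {
		panic(err)
	}
}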
func (c *linuxContainer) commandTemplate(p *Process, childPipe, rootDir *os.File) (*exec.Cmd, error) { func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
cmd := exec.Command(c.initArgs[0], c.initArgs[1:]...) cmd := exec.Command(c.initArgs[0], c.initArgs[1:]...)
cmd.Stdin = p.Stdin cmd.Stdin = p.Stdin
cmd.Stdout = p.Stdout cmd.Stdout = p.Stdout
@ -317,10 +373,17 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe, rootDir *os.File
if cmd.SysProcAttr == nil { if cmd.SysProcAttr == nil {
cmd.SysProcAttr = &syscall.SysProcAttr{} cmd.SysProcAttr = &syscall.SysProcAttr{}
} }
cmd.ExtraFiles = append(p.ExtraFiles, childPipe, rootDir) cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
if p.ConsoleSocket != nil {
cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
cmd.Env = append(cmd.Env, cmd.Env = append(cmd.Env,
fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-2), fmt.Sprintf("_LIBCONTAINER_CONSOLE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) )
}
cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe)
cmd.Env = append(cmd.Env,
fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
)
// NOTE: when running a container with no PID namespace and the parent process spawning the container is // NOTE: when running a container with no PID namespace and the parent process spawning the container is
// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
// even with the parent still running. // even with the parent still running.
@ -339,7 +402,7 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c
} }
} }
_, sharePidns := nsMaps[configs.NEWPID] _, sharePidns := nsMaps[configs.NEWPID]
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, "") data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -357,19 +420,18 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c
}, nil }, nil
} }
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*setnsProcess, error) { func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
state, err := c.currentState() state, err := c.currentState()
if err != nil { if err != nil {
return nil, newSystemErrorWithCause(err, "getting container's current state") return nil, newSystemErrorWithCause(err, "getting container's current state")
} }
// for setns process, we dont have to set cloneflags as the process namespaces // for setns process, we don't have to set cloneflags as the process namespaces
// will only be set via setns syscall // will only be set via setns syscall
data, err := c.bootstrapData(0, state.NamespacePaths, p.consolePath) data, err := c.bootstrapData(0, state.NamespacePaths)
if err != nil { if err != nil {
return nil, err return nil, err
} }
// TODO: set on container for process management
return &setnsProcess{ return &setnsProcess{
cmd: cmd, cmd: cmd,
cgroupPaths: c.cgroupManager.GetPaths(), cgroupPaths: c.cgroupManager.GetPaths(),
@ -378,7 +440,6 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe,
config: c.newInitConfig(p), config: c.newInitConfig(p),
process: p, process: p,
bootstrapData: data, bootstrapData: data,
rootDir: rootDir,
}, nil }, nil
} }
@ -390,15 +451,14 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
User: process.User, User: process.User,
AdditionalGroups: process.AdditionalGroups, AdditionalGroups: process.AdditionalGroups,
Cwd: process.Cwd, Cwd: process.Cwd,
Console: process.consolePath,
Capabilities: process.Capabilities, Capabilities: process.Capabilities,
PassedFilesCount: len(process.ExtraFiles), PassedFilesCount: len(process.ExtraFiles),
ContainerId: c.ID(), ContainerId: c.ID(),
NoNewPrivileges: c.config.NoNewPrivileges, NoNewPrivileges: c.config.NoNewPrivileges,
Rootless: c.config.Rootless,
AppArmorProfile: c.config.AppArmorProfile, AppArmorProfile: c.config.AppArmorProfile,
ProcessLabel: c.config.ProcessLabel, ProcessLabel: c.config.ProcessLabel,
Rlimits: c.config.Rlimits, Rlimits: c.config.Rlimits,
ExecFifoPath: filepath.Join(c.root, execFifoFilename),
} }
if process.NoNewPrivileges != nil { if process.NoNewPrivileges != nil {
cfg.NoNewPrivileges = *process.NoNewPrivileges cfg.NoNewPrivileges = *process.NoNewPrivileges
@ -412,17 +472,10 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
if len(process.Rlimits) > 0 { if len(process.Rlimits) > 0 {
cfg.Rlimits = process.Rlimits cfg.Rlimits = process.Rlimits
} }
cfg.CreateConsole = process.ConsoleSocket != nil
return cfg return cfg
} }
func newPipe() (parent *os.File, child *os.File, err error) {
fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0)
if err != nil {
return nil, nil, err
}
return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil
}
func (c *linuxContainer) Destroy() error { func (c *linuxContainer) Destroy() error {
c.m.Lock() c.m.Lock()
defer c.m.Unlock() defer c.m.Unlock()
@ -467,10 +520,18 @@ func (c *linuxContainer) Resume() error {
} }
func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) { func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
// XXX(cyphar): This requires cgroups.
if c.config.Rootless {
return nil, fmt.Errorf("cannot get OOM notifications from rootless container")
}
return notifyOnOOM(c.cgroupManager.GetPaths()) return notifyOnOOM(c.cgroupManager.GetPaths())
} }
func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) { func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
// XXX(cyphar): This requires cgroups.
if c.config.Rootless {
return nil, fmt.Errorf("cannot get memory pressure notifications from rootless container")
}
return notifyMemoryPressure(c.cgroupManager.GetPaths(), level) return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
} }
@ -546,10 +607,40 @@ func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
} }
func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error {
for _, path := range c.config.MaskPaths {
fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path))
if err != nil {
if os.IsNotExist(err) {
continue
}
return err
}
if fi.IsDir() {
continue
}
extMnt := &criurpc.ExtMountMap{
Key: proto.String(path),
Val: proto.String("/dev/null"),
}
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
}
return nil
}
func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
c.m.Lock() c.m.Lock()
defer c.m.Unlock() defer c.m.Unlock()
// TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has
// support for doing unprivileged dumps, but the setup of
// rootless containers might make this complicated.
if c.config.Rootless {
return fmt.Errorf("cannot checkpoint a rootless container")
}
if err := c.checkCriuVersion("1.5.2"); err != nil { if err := c.checkCriuVersion("1.5.2"); err != nil {
return err return err
} }
@ -609,6 +700,12 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
} }
} }
//pre-dump may need parentImage param to complete iterative migration
if criuOpts.ParentImage != "" {
rpcOpts.ParentImg = proto.String(criuOpts.ParentImage)
rpcOpts.TrackMem = proto.Bool(true)
}
// append optional manage cgroups mode // append optional manage cgroups mode
if criuOpts.ManageCgroupsMode != 0 { if criuOpts.ManageCgroupsMode != 0 {
if err := c.checkCriuVersion("1.7"); err != nil { if err := c.checkCriuVersion("1.7"); err != nil {
@ -618,12 +715,19 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
rpcOpts.ManageCgroupsMode = &mode rpcOpts.ManageCgroupsMode = &mode
} }
t := criurpc.CriuReqType_DUMP var t criurpc.CriuReqType
if criuOpts.PreDump {
t = criurpc.CriuReqType_PRE_DUMP
} else {
t = criurpc.CriuReqType_DUMP
}
req := &criurpc.CriuReq{ req := &criurpc.CriuReq{
Type: &t, Type: &t,
Opts: &rpcOpts, Opts: &rpcOpts,
} }
//no need to dump these information in pre-dump
if !criuOpts.PreDump {
for _, m := range c.config.Mounts { for _, m := range c.config.Mounts {
switch m.Device { switch m.Device {
case "bind": case "bind":
@ -641,8 +745,16 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
} }
} }
// Write the FD info to a file in the image directory if err := c.addMaskPaths(req); err != nil {
return err
}
for _, node := range c.config.Devices {
m := &configs.Mount{Destination: node.Path, Source: node.Path}
c.addCriuDumpMount(req, m)
}
// Write the FD info to a file in the image directory
fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors()) fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors())
if err != nil { if err != nil {
return err return err
@ -652,6 +764,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
if err != nil { if err != nil {
return err return err
} }
}
err = c.criuSwrk(nil, req, criuOpts, false) err = c.criuSwrk(nil, req, criuOpts, false)
if err != nil { if err != nil {
@ -697,6 +810,13 @@ func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts
func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
c.m.Lock() c.m.Lock()
defer c.m.Unlock() defer c.m.Unlock()
// TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have
// support for unprivileged restore at the moment.
if c.config.Rootless {
return fmt.Errorf("cannot restore a rootless container")
}
if err := c.checkCriuVersion("1.5.2"); err != nil { if err := c.checkCriuVersion("1.5.2"); err != nil {
return err return err
} }
@ -778,6 +898,16 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
} }
} }
if len(c.config.MaskPaths) > 0 {
m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"}
c.addCriuRestoreMount(req, m)
}
for _, node := range c.config.Devices {
m := &configs.Mount{Destination: node.Path, Source: node.Path}
c.addCriuRestoreMount(req, m)
}
if criuOpts.EmptyNs&syscall.CLONE_NEWNET == 0 { if criuOpts.EmptyNs&syscall.CLONE_NEWNET == 0 {
c.restoreNetwork(req, criuOpts) c.restoreNetwork(req, criuOpts)
} }
@ -814,6 +944,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
} }
func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
// XXX: Do we need to deal with this case? AFAIK criu still requires root.
if err := c.cgroupManager.Apply(pid); err != nil { if err := c.cgroupManager.Apply(pid); err != nil {
return err return err
} }
@ -953,6 +1084,23 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *
case t == criurpc.CriuReqType_RESTORE: case t == criurpc.CriuReqType_RESTORE:
case t == criurpc.CriuReqType_DUMP: case t == criurpc.CriuReqType_DUMP:
break break
case t == criurpc.CriuReqType_PRE_DUMP:
// In pre-dump mode CRIU is in a loop and waits for
// the final DUMP command.
// The current runc pre-dump approach, however, is to
// start criu in PRE_DUMP once for a single pre-dump
// and not for the whole series of pre-dump, pre-dump, ..., dump.
// If we got the message CriuReqType_PRE_DUMP it means
// CRIU was successful and we need to forcefully stop CRIU
logrus.Debugf("PRE_DUMP finished. Send close signal to CRIU service")
criuClient.Close()
// Process status won't be success, because one end of sockets is closed
_, err := cmd.Process.Wait()
if err != nil {
logrus.Debugf("After PRE_DUMP CRIU exiting failed")
return err
}
return nil
default: default:
return fmt.Errorf("unable to parse the response %s", resp.String()) return fmt.Errorf("unable to parse the response %s", resp.String())
} }
@ -1026,7 +1174,7 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
Version: c.config.Version, Version: c.config.Version,
ID: c.id, ID: c.id,
Pid: int(notify.GetPid()), Pid: int(notify.GetPid()),
Root: c.config.Rootfs, Bundle: utils.SearchLabels(c.config.Labels, "bundle"),
} }
for i, hook := range c.config.Hooks.Prestart { for i, hook := range c.config.Hooks.Prestart {
if err := hook.Run(s); err != nil { if err := hook.Run(s); err != nil {
@ -1154,14 +1302,9 @@ func (c *linuxContainer) runType() (Status, error) {
if !exist || err != nil { if !exist || err != nil {
return Stopped, err return Stopped, err
} }
// check if the process that is running is the init process or the user's process. // We'll create exec fifo and blocking on it after container is created,
// this is the difference between the container Running and Created. // and delete it after start container.
environ, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/environ", pid)) if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil {
if err != nil {
return Stopped, newSystemErrorWithCausef(err, "reading /proc/%d/environ", pid)
}
check := []byte("_LIBCONTAINER")
if bytes.Contains(environ, check) {
return Created, nil return Created, nil
} }
return Running, nil return Running, nil
@ -1198,6 +1341,7 @@ func (c *linuxContainer) currentState() (*State, error) {
InitProcessStartTime: startTime, InitProcessStartTime: startTime,
Created: c.created, Created: c.created,
}, },
Rootless: c.config.Rootless,
CgroupPaths: c.cgroupManager.GetPaths(), CgroupPaths: c.cgroupManager.GetPaths(),
NamespacePaths: make(map[configs.NamespaceType]string), NamespacePaths: make(map[configs.NamespaceType]string),
ExternalDescriptors: externalDescriptors, ExternalDescriptors: externalDescriptors,
@ -1223,16 +1367,22 @@ func (c *linuxContainer) currentState() (*State, error) {
// can setns in order. // can setns in order.
func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) { func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
paths := []string{} paths := []string{}
nsTypes := []configs.NamespaceType{ order := []configs.NamespaceType{
// The user namespace *must* be done first.
configs.NEWUSER,
configs.NEWIPC, configs.NEWIPC,
configs.NEWUTS, configs.NEWUTS,
configs.NEWNET, configs.NEWNET,
configs.NEWPID, configs.NEWPID,
configs.NEWNS, configs.NEWNS,
} }
// join userns if the init process explicitly requires NEWUSER
if c.config.Namespaces.Contains(configs.NEWUSER) { // Remove namespaces that we don't need to join.
nsTypes = append(nsTypes, configs.NEWUSER) var nsTypes []configs.NamespaceType
for _, ns := range order {
if c.config.Namespaces.Contains(ns) {
nsTypes = append(nsTypes, ns)
}
} }
for _, nsType := range nsTypes { for _, nsType := range nsTypes {
if p, ok := namespaces[nsType]; ok && p != "" { if p, ok := namespaces[nsType]; ok && p != "" {
@ -1249,7 +1399,7 @@ func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceTyp
if strings.ContainsRune(p, ',') { if strings.ContainsRune(p, ',') {
return nil, newSystemError(fmt.Errorf("invalid path %s", p)) return nil, newSystemError(fmt.Errorf("invalid path %s", p))
} }
paths = append(paths, p) paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(nsType), p))
} }
} }
return paths, nil return paths, nil
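
orderNamespacePaths above now joins the user namespace first and serializes each entry as name:path so the nsenter bootstrap can tell which namespace a given path belongs to. A toy version of that ordering over plain strings, not the vendored method:

package main

import "fmt"

// orderPaths keeps only configured namespaces, user first, as "name:path".
func orderPaths(paths map[string]string) []string {
	order := []string{"user", "ipc", "uts", "net", "pid", "mnt"}
	var out []string
	for _, ns := range order {
		if p, ok := paths[ns]; ok && p != "" {
			out = append(out, fmt.Sprintf("%s:%s", ns, p))
		}
	}
	return out
}

func main() {
	fmt.Println(orderPaths(map[string]string{
		"net":  "/proc/2310/ns/net",
		"user": "/proc/2310/ns/user",
	}))
	// [user:/proc/2310/ns/user net:/proc/2310/ns/net]
}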
@ -1272,7 +1422,7 @@ func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
// such as one that uses nsenter package to bootstrap the container's // such as one that uses nsenter package to bootstrap the container's
// init process correctly, i.e. with correct namespaces, uid/gid // init process correctly, i.e. with correct namespaces, uid/gid
// mapping etc. // mapping etc.
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, consolePath string) (io.Reader, error) { func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) {
// create the netlink message // create the netlink message
r := nl.NewNetlinkRequest(int(InitMsg), 0) r := nl.NewNetlinkRequest(int(InitMsg), 0)
@ -1282,14 +1432,6 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
Value: uint32(cloneFlags), Value: uint32(cloneFlags),
}) })
// write console path
if consolePath != "" {
r.AddData(&Bytemsg{
Type: ConsolePathAttr,
Value: []byte(consolePath),
})
}
// write custom namespace paths // write custom namespace paths
if len(nsMaps) > 0 { if len(nsMaps) > 0 {
nsPaths, err := c.orderNamespacePaths(nsMaps) nsPaths, err := c.orderNamespacePaths(nsMaps)
@ -1327,6 +1469,8 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
Type: GidmapAttr, Type: GidmapAttr,
Value: b, Value: b,
}) })
// The following only applies if we are root.
if !c.config.Rootless {
// check if we have CAP_SETGID to setgroup properly // check if we have CAP_SETGID to setgroup properly
pid, err := capability.NewPid(os.Getpid()) pid, err := capability.NewPid(os.Getpid())
if err != nil { if err != nil {
@ -1340,6 +1484,19 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
} }
} }
} }
}
// write oom_score_adj
r.AddData(&Bytemsg{
Type: OomScoreAdjAttr,
Value: []byte(fmt.Sprintf("%d", c.config.OomScoreAdj)),
})
// write rootless
r.AddData(&Boolmsg{
Type: RootlessAttr,
Value: c.config.Rootless,
})
return bytes.NewReader(r.Serialize()), nil return bytes.NewReader(r.Serialize()), nil
} }
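
The netlink bootstrap message now drops the console-path attribute and always carries oom_score_adj and the rootless flag. A rough sketch of the attribute list that bootstrapData assembles, with a hypothetical attr struct standing in for the real Bytemsg/Boolmsg types; the numeric constants are the ones from the message_linux.go hunk further down.

package example

import "fmt"

// attr is a hypothetical stand-in for the netlink messages (Bytemsg/Boolmsg)
// that bootstrapData appends to the InitMsg request.
type attr struct {
	typ   uint16
	value interface{}
}

// Attribute type constants copied from the message_linux.go hunk below.
const (
	cloneFlagsAttr  uint16 = 27281
	oomScoreAdjAttr uint16 = 27286
	rootlessAttr    uint16 = 27287
)

// buildBootstrapAttrs mirrors the shape of the new bootstrapData: the console
// path attribute is gone, and oom_score_adj plus the rootless flag are always
// sent so nsexec can apply them before the Go runtime starts.
func buildBootstrapAttrs(cloneFlags uintptr, oomScoreAdj int, rootless bool) []attr {
	return []attr{
		{cloneFlagsAttr, uint32(cloneFlags)},
		{oomScoreAdjAttr, []byte(fmt.Sprintf("%d", oomScoreAdj))},
		{rootlessAttr, rootless},
	}
}
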

View File

@ -25,11 +25,13 @@ type VethPairName struct {
type CriuOpts struct { type CriuOpts struct {
ImagesDirectory string // directory for storing image files ImagesDirectory string // directory for storing image files
WorkDirectory string // directory to cd and write logs/pidfiles/stats to WorkDirectory string // directory to cd and write logs/pidfiles/stats to
ParentImage string // directory for storing parent image files in pre-dump and dump
LeaveRunning bool // leave container in running state after checkpoint LeaveRunning bool // leave container in running state after checkpoint
TcpEstablished bool // checkpoint/restore established TCP connections TcpEstablished bool // checkpoint/restore established TCP connections
ExternalUnixConnections bool // allow external unix connections ExternalUnixConnections bool // allow external unix connections
ShellJob bool // allow to dump and restore shell jobs ShellJob bool // allow to dump and restore shell jobs
FileLocks bool // handle file locks, for safety FileLocks bool // handle file locks, for safety
PreDump bool // call criu predump to perform iterative checkpoint
PageServer CriuPageServerInfo // allow to dump to criu page server PageServer CriuPageServerInfo // allow to dump to criu page server
VethPairs []VethPairName // pass the veth to criu when restore VethPairs []VethPairName // pass the veth to criu when restore
ManageCgroupsMode cgMode // dump or restore cgroup mode ManageCgroupsMode cgMode // dump or restore cgroup mode

View File

@ -60,9 +60,9 @@ func (c ErrorCode) String() string {
type Error interface { type Error interface {
error error
// Returns a verbose string including the error message // Returns an error if it failed to write the detail of the Error to w.
// and a representation of the stack trace suitable for // The detail of the Error may include the error message and a
// printing. // representation of the stack trace.
Detail(w io.Writer) error Detail(w io.Writer) error
// Returns the error code for this error. // Returns the error code for this error.

View File

@ -15,6 +15,7 @@ import (
"github.com/docker/docker/pkg/mount" "github.com/docker/docker/pkg/mount"
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs" "github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/cgroups/rootless"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd" "github.com/opencontainers/runc/libcontainer/cgroups/systemd"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/configs/validate" "github.com/opencontainers/runc/libcontainer/configs/validate"
@ -34,7 +35,15 @@ var (
// InitArgs returns an options func to configure a LinuxFactory with the // InitArgs returns an options func to configure a LinuxFactory with the
// provided init binary path and arguments. // provided init binary path and arguments.
func InitArgs(args ...string) func(*LinuxFactory) error { func InitArgs(args ...string) func(*LinuxFactory) error {
return func(l *LinuxFactory) error { return func(l *LinuxFactory) (err error) {
if len(args) > 0 {
// Resolve relative paths to ensure that it's available
// after directory changes.
if args[0], err = filepath.Abs(args[0]); err != nil {
return newGenericError(err, ConfigInvalid)
}
}
l.InitArgs = args l.InitArgs = args
return nil return nil
} }
@ -65,6 +74,20 @@ func Cgroupfs(l *LinuxFactory) error {
return nil return nil
} }
// RootlessCgroups is an options func to configure a LinuxFactory to
// return containers that use the "rootless" cgroup manager, which will
// fail on any operation that is not possible for an unprivileged user.
// It should only be used in conjunction with rootless containers.
func RootlessCgroups(l *LinuxFactory) error {
l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
return &rootless.Manager{
Cgroups: config,
Paths: paths,
}
}
return nil
}
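
RootlessCgroups follows the same functional-options pattern as InitArgs and Cgroupfs, and Create/Load now select it automatically whenever config.Rootless or state.Rootless is set. A minimal sketch of that pattern with stand-in types (the real LinuxFactory has many more fields):

package example

// factory is a stand-in for libcontainer's LinuxFactory, reduced to the one
// field the option below touches.
type factory struct {
	newCgroupsManager func(paths map[string]string) interface{}
}

type option func(*factory) error

// newFactory applies each option in order, the same way libcontainer.New
// applies InitArgs, Cgroupfs, RootlessCgroups and friends.
func newFactory(opts ...option) (*factory, error) {
	f := &factory{}
	for _, opt := range opts {
		if err := opt(f); err != nil {
			return nil, err
		}
	}
	return f, nil
}

// rootlessCgroups mirrors the shape of the option above: it swaps the
// cgroup-manager constructor for one that tolerates missing privileges.
func rootlessCgroups(f *factory) error {
	f.newCgroupsManager = func(paths map[string]string) interface{} {
		return struct{}{} // stand-in for &rootless.Manager{...}
	}
	return nil
}
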
// TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs. // TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs.
func TmpfsRoot(l *LinuxFactory) error { func TmpfsRoot(l *LinuxFactory) error {
mounted, err := mount.Mounted(l.Root) mounted, err := mount.Mounted(l.Root)
@ -141,11 +164,11 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
if err := l.Validator.Validate(config); err != nil { if err := l.Validator.Validate(config); err != nil {
return nil, newGenericError(err, ConfigInvalid) return nil, newGenericError(err, ConfigInvalid)
} }
uid, err := config.HostUID() uid, err := config.HostRootUID()
if err != nil { if err != nil {
return nil, newGenericError(err, SystemError) return nil, newGenericError(err, SystemError)
} }
gid, err := config.HostGID() gid, err := config.HostRootGID()
if err != nil { if err != nil {
return nil, newGenericError(err, SystemError) return nil, newGenericError(err, SystemError)
} }
@ -161,15 +184,8 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
if err := os.Chown(containerRoot, uid, gid); err != nil { if err := os.Chown(containerRoot, uid, gid); err != nil {
return nil, newGenericError(err, SystemError) return nil, newGenericError(err, SystemError)
} }
fifoName := filepath.Join(containerRoot, execFifoFilename) if config.Rootless {
oldMask := syscall.Umask(0000) RootlessCgroups(l)
if err := syscall.Mkfifo(fifoName, 0622); err != nil {
syscall.Umask(oldMask)
return nil, newGenericError(err, SystemError)
}
syscall.Umask(oldMask)
if err := os.Chown(fifoName, uid, gid); err != nil {
return nil, newGenericError(err, SystemError)
} }
c := &linuxContainer{ c := &linuxContainer{
id: id, id: id,
@ -197,6 +213,10 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
processStartTime: state.InitProcessStartTime, processStartTime: state.InitProcessStartTime,
fds: state.ExternalDescriptors, fds: state.ExternalDescriptors,
} }
// We have to use the RootlessManager.
if state.Rootless {
RootlessCgroups(l)
}
c := &linuxContainer{ c := &linuxContainer{
initProcess: r, initProcess: r,
initProcessStartTime: state.InitProcessStartTime, initProcessStartTime: state.InitProcessStartTime,
@ -222,54 +242,71 @@ func (l *LinuxFactory) Type() string {
// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state // StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
// This is a low level implementation detail of the reexec and should not be consumed externally // This is a low level implementation detail of the reexec and should not be consumed externally
func (l *LinuxFactory) StartInitialization() (err error) { func (l *LinuxFactory) StartInitialization() (err error) {
var pipefd, rootfd int var (
for k, v := range map[string]*int{ pipefd, rootfd int
"_LIBCONTAINER_INITPIPE": &pipefd, consoleSocket *os.File
"_LIBCONTAINER_STATEDIR": &rootfd, envInitPipe = os.Getenv("_LIBCONTAINER_INITPIPE")
} { envStateDir = os.Getenv("_LIBCONTAINER_STATEDIR")
s := os.Getenv(k) envConsole = os.Getenv("_LIBCONTAINER_CONSOLE")
)
i, err := strconv.Atoi(s) // Get the INITPIPE.
pipefd, err = strconv.Atoi(envInitPipe)
if err != nil { if err != nil {
return fmt.Errorf("unable to convert %s=%s to int", k, s) return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err)
}
*v = i
} }
var ( var (
pipe = os.NewFile(uintptr(pipefd), "pipe") pipe = os.NewFile(uintptr(pipefd), "pipe")
it = initType(os.Getenv("_LIBCONTAINER_INITTYPE")) it = initType(os.Getenv("_LIBCONTAINER_INITTYPE"))
) )
defer pipe.Close()
// Only init processes have STATEDIR.
rootfd = -1
if it == initStandard {
if rootfd, err = strconv.Atoi(envStateDir); err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_STATEDIR=%s to int: %s", envStateDir, err)
}
}
if envConsole != "" {
console, err := strconv.Atoi(envConsole)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE=%s to int: %s", envConsole, err)
}
consoleSocket = os.NewFile(uintptr(console), "console-socket")
defer consoleSocket.Close()
}
// clear the current process's environment to clean any libcontainer // clear the current process's environment to clean any libcontainer
// specific env vars. // specific env vars.
os.Clearenv() os.Clearenv()
var i initer
defer func() { defer func() {
// We have an error during the initialization of the container's init, // We have an error during the initialization of the container's init,
// send it back to the parent process in the form of an initError. // send it back to the parent process in the form of an initError.
// If container's init successed, syscall.Exec will not return, hence
// this defer function will never be called.
if _, ok := i.(*linuxStandardInit); ok {
// Synchronisation only necessary for standard init.
if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil { if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil {
panic(err) fmt.Fprintln(os.Stderr, err)
} return
} }
if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil { if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil {
panic(err) fmt.Fprintln(os.Stderr, err)
return
} }
// ensure that this pipe is always closed
pipe.Close()
}() }()
defer func() { defer func() {
if e := recover(); e != nil { if e := recover(); e != nil {
err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack())) err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack()))
} }
}() }()
i, err = newContainerInit(it, pipe, rootfd)
i, err := newContainerInit(it, pipe, consoleSocket, rootfd)
if err != nil { if err != nil {
return err return err
} }
// If Init succeeds, syscall.Exec will not return, hence none of the defers will be called.
return i.Init() return i.Init()
} }
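
The child-side parsing above has a parent-side counterpart: the init pipe and (optionally) the console socket are handed down via ExtraFiles, and the matching _LIBCONTAINER_* variables carry the resulting fd numbers. A simplified sketch of that wiring, relying on the rule that ExtraFiles entry i becomes fd 3+i in the child:

package example

import (
	"fmt"
	"os"
	"os/exec"
)

// wireInitPipe shows, in simplified form, how the parent side of the
// handshake passes descriptors to the re-exec'd init: each *os.File placed in
// ExtraFiles becomes fd 3+i in the child, and the matching _LIBCONTAINER_*
// environment variable tells StartInitialization which number to parse.
// The console socket is optional, mirroring the envConsole == "" check above.
func wireInitPipe(cmd *exec.Cmd, childInitPipe, consoleSocket *os.File) {
	const stdioFdCount = 3 // stdin, stdout, stderr occupy fds 0-2
	cmd.ExtraFiles = append(cmd.ExtraFiles, childInitPipe)
	cmd.Env = append(cmd.Env,
		fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
	if consoleSocket != nil {
		cmd.ExtraFiles = append(cmd.ExtraFiles, consoleSocket)
		cmd.Env = append(cmd.Env,
			fmt.Sprintf("_LIBCONTAINER_CONSOLE=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
	}
}
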
@ -277,7 +314,7 @@ func (l *LinuxFactory) loadState(root, id string) (*State, error) {
f, err := os.Open(filepath.Join(root, stateFilename)) f, err := os.Open(filepath.Join(root, stateFilename))
if err != nil { if err != nil {
if os.IsNotExist(err) { if os.IsNotExist(err) {
return nil, newGenericError(fmt.Errorf("container %q does not exists", id), ContainerNotExists) return nil, newGenericError(fmt.Errorf("container %q does not exist", id), ContainerNotExists)
} }
return nil, newGenericError(err, SystemError) return nil, newGenericError(err, SystemError)
} }

View File

@ -9,20 +9,6 @@ import (
"github.com/opencontainers/runc/libcontainer/stacktrace" "github.com/opencontainers/runc/libcontainer/stacktrace"
) )
type syncType uint8
const (
procReady syncType = iota
procError
procRun
procHooks
procResume
)
type syncT struct {
Type syncType `json:"type"`
}
var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}} var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}}
Code: {{.ECode}} Code: {{.ECode}}
{{if .Message }} {{if .Message }}

View File

@ -6,12 +6,11 @@ import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"io" "io"
"io/ioutil"
"net" "net"
"os" "os"
"strconv"
"strings" "strings"
"syscall" "syscall"
"unsafe"
"github.com/Sirupsen/logrus" "github.com/Sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
@ -47,26 +46,26 @@ type initConfig struct {
Args []string `json:"args"` Args []string `json:"args"`
Env []string `json:"env"` Env []string `json:"env"`
Cwd string `json:"cwd"` Cwd string `json:"cwd"`
Capabilities []string `json:"capabilities"` Capabilities *configs.Capabilities `json:"capabilities"`
ProcessLabel string `json:"process_label"` ProcessLabel string `json:"process_label"`
AppArmorProfile string `json:"apparmor_profile"` AppArmorProfile string `json:"apparmor_profile"`
NoNewPrivileges bool `json:"no_new_privileges"` NoNewPrivileges bool `json:"no_new_privileges"`
User string `json:"user"` User string `json:"user"`
AdditionalGroups []string `json:"additional_groups"` AdditionalGroups []string `json:"additional_groups"`
Config *configs.Config `json:"config"` Config *configs.Config `json:"config"`
Console string `json:"console"`
Networks []*network `json:"network"` Networks []*network `json:"network"`
PassedFilesCount int `json:"passed_files_count"` PassedFilesCount int `json:"passed_files_count"`
ContainerId string `json:"containerid"` ContainerId string `json:"containerid"`
Rlimits []configs.Rlimit `json:"rlimits"` Rlimits []configs.Rlimit `json:"rlimits"`
ExecFifoPath string `json:"start_pipe_path"` CreateConsole bool `json:"create_console"`
Rootless bool `json:"rootless"`
} }
type initer interface { type initer interface {
Init() error Init() error
} }
func newContainerInit(t initType, pipe *os.File, stateDirFD int) (initer, error) { func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, stateDirFD int) (initer, error) {
var config *initConfig var config *initConfig
if err := json.NewDecoder(pipe).Decode(&config); err != nil { if err := json.NewDecoder(pipe).Decode(&config); err != nil {
return nil, err return nil, err
@ -77,11 +76,14 @@ func newContainerInit(t initType, pipe *os.File, stateDirFD int) (initer, error)
switch t { switch t {
case initSetns: case initSetns:
return &linuxSetnsInit{ return &linuxSetnsInit{
pipe: pipe,
consoleSocket: consoleSocket,
config: config, config: config,
}, nil }, nil
case initStandard: case initStandard:
return &linuxStandardInit{ return &linuxStandardInit{
pipe: pipe, pipe: pipe,
consoleSocket: consoleSocket,
parentPid: syscall.Getppid(), parentPid: syscall.Getppid(),
config: config, config: config,
stateDirFD: stateDirFD, stateDirFD: stateDirFD,
@ -116,16 +118,18 @@ func finalizeNamespace(config *initConfig) error {
return err return err
} }
capabilities := config.Config.Capabilities capabilities := &configs.Capabilities{}
if config.Capabilities != nil { if config.Capabilities != nil {
capabilities = config.Capabilities capabilities = config.Capabilities
} else if config.Config.Capabilities != nil {
capabilities = config.Config.Capabilities
} }
w, err := newCapWhitelist(capabilities) w, err := newContainerCapList(capabilities)
if err != nil { if err != nil {
return err return err
} }
// drop capabilities in bounding set before changing user // drop capabilities in bounding set before changing user
if err := w.dropBoundingSet(); err != nil { if err := w.ApplyBoundingSet(); err != nil {
return err return err
} }
// preserve existing capabilities while we change users // preserve existing capabilities while we change users
@ -138,8 +142,7 @@ func finalizeNamespace(config *initConfig) error {
if err := system.ClearKeepCaps(); err != nil { if err := system.ClearKeepCaps(); err != nil {
return err return err
} }
// drop all other capabilities if err := w.ApplyCaps(); err != nil {
if err := w.drop(); err != nil {
return err return err
} }
if config.Cwd != "" { if config.Cwd != "" {
@ -150,24 +153,59 @@ func finalizeNamespace(config *initConfig) error {
return nil return nil
} }
// setupConsole sets up the console from inside the container, and sends the
// master pty fd to the config.Pipe (using cmsg). This is done to ensure that
// consoles are scoped to a container properly (see runc#814 and the many
// issues related to that). This has to be run *after* we've pivoted to the new
// rootfs (and the users' configuration is entirely set up).
func setupConsole(socket *os.File, config *initConfig, mount bool) error {
defer socket.Close()
// At this point, /dev/ptmx points to something that we would expect. We
// used to change the owner of the slave path, but the /dev/pts mount can
// have gid=X set (at the user's option), so touching the owner of the
// slave PTY is not necessary, as the kernel will handle that for us. Note
// however, that setupUser (specifically fixStdioPermissions) *will* change
// the UID owner of the console to be the user the process will run as (so
// they can actually control their console).
console, err := newConsole()
if err != nil {
return err
}
// After we return from here, we don't need the console anymore.
defer console.Close()
linuxConsole, ok := console.(*linuxConsole)
if !ok {
return fmt.Errorf("failed to cast console to *linuxConsole")
}
// Mount the console inside our rootfs.
if mount {
if err := linuxConsole.mount(); err != nil {
return err
}
}
// While we can access console.master, using the API is a good idea.
if err := utils.SendFd(socket, linuxConsole.File()); err != nil {
return err
}
// Now, dup over all the things.
return linuxConsole.dupStdio()
}
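
setupConsole sends the pty master over the socket with utils.SendFd; the other end of that socket has to pick it up again with an SCM_RIGHTS receive. A rough sketch of that receiving side using the standard syscall package (buffer sizes and error handling are simplified assumptions, not the library's exact code):

package example

import (
	"fmt"
	"os"
	"syscall"
)

// recvConsoleFd reads one SCM_RIGHTS control message from the console socket
// and turns the carried descriptor back into an *os.File.
func recvConsoleFd(socket *os.File) (*os.File, error) {
	buf := make([]byte, 1)
	oob := make([]byte, syscall.CmsgSpace(4)) // room for one int-sized fd
	_, oobn, _, _, err := syscall.Recvmsg(int(socket.Fd()), buf, oob, 0)
	if err != nil {
		return nil, err
	}
	msgs, err := syscall.ParseSocketControlMessage(oob[:oobn])
	if err != nil || len(msgs) != 1 {
		return nil, fmt.Errorf("expected one control message, got %d (%v)", len(msgs), err)
	}
	fds, err := syscall.ParseUnixRights(&msgs[0])
	if err != nil || len(fds) != 1 {
		return nil, fmt.Errorf("expected one fd, got %d (%v)", len(fds), err)
	}
	return os.NewFile(uintptr(fds[0]), "console-master"), nil
}
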
// syncParentReady sends to the given pipe a JSON payload which indicates that // syncParentReady sends to the given pipe a JSON payload which indicates that
// the init is ready to Exec the child process. It then waits for the parent to // the init is ready to Exec the child process. It then waits for the parent to
// indicate that it is cleared to Exec. // indicate that it is cleared to Exec.
func syncParentReady(pipe io.ReadWriter) error { func syncParentReady(pipe io.ReadWriter) error {
// Tell parent. // Tell parent.
if err := utils.WriteJSON(pipe, syncT{procReady}); err != nil { if err := writeSync(pipe, procReady); err != nil {
return err return err
} }
// Wait for parent to give the all-clear. // Wait for parent to give the all-clear.
var procSync syncT if err := readSync(pipe, procRun); err != nil {
if err := json.NewDecoder(pipe).Decode(&procSync); err != nil { return err
if err == io.EOF {
return fmt.Errorf("parent closed synchronisation channel")
}
if procSync.Type != procRun {
return fmt.Errorf("invalid synchronisation flag from parent")
}
} }
return nil return nil
} }
@ -176,19 +214,15 @@ func syncParentReady(pipe io.ReadWriter) error {
// indicate that it is cleared to resume. // indicate that it is cleared to resume.
func syncParentHooks(pipe io.ReadWriter) error { func syncParentHooks(pipe io.ReadWriter) error {
// Tell parent. // Tell parent.
if err := utils.WriteJSON(pipe, syncT{procHooks}); err != nil { if err := writeSync(pipe, procHooks); err != nil {
return err return err
} }
// Wait for parent to give the all-clear. // Wait for parent to give the all-clear.
var procSync syncT if err := readSync(pipe, procResume); err != nil {
if err := json.NewDecoder(pipe).Decode(&procSync); err != nil { return err
if err == io.EOF {
return fmt.Errorf("parent closed synchronisation channel")
}
if procSync.Type != procResume {
return fmt.Errorf("invalid synchronisation flag from parent")
}
} }
return nil return nil
} }
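
The syncType constants and syncT struct removed from generic_error.go earlier in this diff now live in a dedicated sync file, and the new writeSync/readSync/parseSync helpers wrap the JSON handshake. A sketch of what the two simplest helpers plausibly look like (the real implementations may differ in detail):

package example

import (
	"encoding/json"
	"fmt"
	"io"
)

// syncType and syncT mirror the definitions removed from generic_error.go.
type syncType uint8

const (
	procReady syncType = iota
	procError
	procRun
	procHooks
	procResume
)

type syncT struct {
	Type syncType `json:"type"`
}

// writeSync sends one synchronisation step to the peer as a JSON syncT.
func writeSync(pipe io.Writer, sync syncType) error {
	return json.NewEncoder(pipe).Encode(syncT{Type: sync})
}

// readSync waits for the peer's reply and insists that it matches expected.
func readSync(pipe io.Reader, expected syncType) error {
	var procSync syncT
	if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
		if err == io.EOF {
			return fmt.Errorf("parent closed synchronisation channel")
		}
		return err
	}
	if procSync.Type != expected {
		return fmt.Errorf("invalid synchronisation flag from parent")
	}
	return nil
}
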
@ -196,18 +230,21 @@ func syncParentHooks(pipe io.ReadWriter) error {
func setupUser(config *initConfig) error { func setupUser(config *initConfig) error {
// Set up defaults. // Set up defaults.
defaultExecUser := user.ExecUser{ defaultExecUser := user.ExecUser{
Uid: syscall.Getuid(), Uid: 0,
Gid: syscall.Getgid(), Gid: 0,
Home: "/", Home: "/",
} }
passwdPath, err := user.GetPasswdPath() passwdPath, err := user.GetPasswdPath()
if err != nil { if err != nil {
return err return err
} }
groupPath, err := user.GetGroupPath() groupPath, err := user.GetGroupPath()
if err != nil { if err != nil {
return err return err
} }
execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath) execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath)
if err != nil { if err != nil {
return err return err
@ -220,22 +257,49 @@ func setupUser(config *initConfig) error {
return err return err
} }
} }
if config.Rootless {
if execUser.Uid != 0 {
return fmt.Errorf("cannot run as a non-root user in a rootless container")
}
if execUser.Gid != 0 {
return fmt.Errorf("cannot run as a non-root group in a rootless container")
}
// We cannot set any additional groups in a rootless container and thus we
// bail if the user asked us to do so. TODO: We currently can't do this
// earlier, but if libcontainer.Process.User was typesafe this might work.
if len(addGroups) > 0 {
return fmt.Errorf("cannot set any additional groups in a rootless container")
}
}
// before we change to the container's user make sure that the processes STDIO // before we change to the container's user make sure that the processes STDIO
// is correctly owned by the user that we are switching to. // is correctly owned by the user that we are switching to.
if err := fixStdioPermissions(execUser); err != nil { if err := fixStdioPermissions(config, execUser); err != nil {
return err return err
} }
// This isn't allowed in an unprivileged user namespace since Linux 3.19.
// There's nothing we can do about /etc/group entries, so we silently
// ignore setting groups here (since the user didn't explicitly ask us to
// set the group).
if !config.Rootless {
suppGroups := append(execUser.Sgids, addGroups...) suppGroups := append(execUser.Sgids, addGroups...)
if err := syscall.Setgroups(suppGroups); err != nil { if err := syscall.Setgroups(suppGroups); err != nil {
return err return err
} }
}
if err := system.Setgid(execUser.Gid); err != nil { if err := system.Setgid(execUser.Gid); err != nil {
return err return err
} }
if err := system.Setuid(execUser.Uid); err != nil { if err := system.Setuid(execUser.Uid); err != nil {
return err return err
} }
// if we didn't get HOME already, set it based on the user's HOME // if we didn't get HOME already, set it based on the user's HOME
if envHome := os.Getenv("HOME"); envHome == "" { if envHome := os.Getenv("HOME"); envHome == "" {
if err := os.Setenv("HOME", execUser.Home); err != nil { if err := os.Setenv("HOME", execUser.Home); err != nil {
@ -248,7 +312,7 @@ func setupUser(config *initConfig) error {
// fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user. // fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user.
// The ownership needs to match because it is created outside of the container and needs to be // The ownership needs to match because it is created outside of the container and needs to be
// localized. // localized.
func fixStdioPermissions(u *user.ExecUser) error { func fixStdioPermissions(config *initConfig, u *user.ExecUser) error {
var null syscall.Stat_t var null syscall.Stat_t
if err := syscall.Stat("/dev/null", &null); err != nil { if err := syscall.Stat("/dev/null", &null); err != nil {
return err return err
@ -262,11 +326,27 @@ func fixStdioPermissions(u *user.ExecUser) error {
if err := syscall.Fstat(int(fd), &s); err != nil { if err := syscall.Fstat(int(fd), &s); err != nil {
return err return err
} }
// skip chown of /dev/null if it was used as one of the STDIO fds.
// Skip chown of /dev/null if it was used as one of the STDIO fds.
if s.Rdev == null.Rdev { if s.Rdev == null.Rdev {
continue continue
} }
if err := syscall.Fchown(int(fd), u.Uid, u.Gid); err != nil {
// Skip chown if s.Gid is actually an unmapped gid in the host. While
// this is a bit dodgy if it just so happens that the console _is_
// owned by overflow_gid, there's no way for us to disambiguate this as
// a userspace program.
if _, err := config.Config.HostGID(int(s.Gid)); err != nil {
continue
}
// We only change the uid owner (as it is possible for the mount to
// prefer a different gid, and there's no reason for us to change it).
// The reason why we don't just leave the default uid=X mount setup is
// that users expect to be able to actually use their console. Without
// this code, you couldn't effectively run as a non-root user inside a
// container and also have a console set up.
if err := syscall.Fchown(int(fd), u.Uid, int(s.Gid)); err != nil {
return err return err
} }
} }
@ -328,16 +408,51 @@ func setupRlimits(limits []configs.Rlimit, pid int) error {
return nil return nil
} }
func setOomScoreAdj(oomScoreAdj int, pid int) error { const _P_PID = 1
path := fmt.Sprintf("/proc/%d/oom_score_adj", pid)
return ioutil.WriteFile(path, []byte(strconv.Itoa(oomScoreAdj)), 0600) type siginfo struct {
si_signo int32
si_errno int32
si_code int32
// below here is a union; si_pid is the only field we use
si_pid int32
// Pad to 128 bytes as detailed in blockUntilWaitable
pad [96]byte
} }
// killCgroupProcesses freezes then iterates over all the processes inside the // isWaitable returns true if the process has exited, false otherwise.
// manager's cgroups sending a SIGKILL to each process then waiting for them to // It's based on blockUntilWaitable in src/os/wait_waitid.go
// exit. func isWaitable(pid int) (bool, error) {
func killCgroupProcesses(m cgroups.Manager) error { si := &siginfo{}
_, _, e := syscall.Syscall6(syscall.SYS_WAITID, _P_PID, uintptr(pid), uintptr(unsafe.Pointer(si)), syscall.WEXITED|syscall.WNOWAIT|syscall.WNOHANG, 0, 0)
if e != 0 {
return false, os.NewSyscallError("waitid", e)
}
return si.si_pid != 0, nil
}
// isNoChildren returns true if err represents a syscall.ECHILD error, false otherwise
func isNoChildren(err error) bool {
switch err := err.(type) {
case syscall.Errno:
if err == syscall.ECHILD {
return true
}
case *os.SyscallError:
if err.Err == syscall.ECHILD {
return true
}
}
return false
}
// signalAllProcesses freezes then iterates over all the processes inside the
// manager's cgroups sending the signal s to them.
// If s is SIGKILL then it will wait for each process to exit.
// For all other signals it will check if the process is ready to report its
// exit status and only if it is will a wait be performed.
func signalAllProcesses(m cgroups.Manager, s os.Signal) error {
var procs []*os.Process var procs []*os.Process
if err := m.Freeze(configs.Frozen); err != nil { if err := m.Freeze(configs.Frozen); err != nil {
logrus.Warn(err) logrus.Warn(err)
@ -354,16 +469,31 @@ func killCgroupProcesses(m cgroups.Manager) error {
continue continue
} }
procs = append(procs, p) procs = append(procs, p)
if err := p.Kill(); err != nil { if err := p.Signal(s); err != nil {
logrus.Warn(err) logrus.Warn(err)
} }
} }
if err := m.Freeze(configs.Thawed); err != nil { if err := m.Freeze(configs.Thawed); err != nil {
logrus.Warn(err) logrus.Warn(err)
} }
for _, p := range procs { for _, p := range procs {
if s != syscall.SIGKILL {
if ok, err := isWaitable(p.Pid); err != nil {
if !isNoChildren(err) {
logrus.Warn("signalAllProcesses: ", p.Pid, err)
}
continue
} else if !ok {
// Not ready to report so don't wait
continue
}
}
if _, err := p.Wait(); err != nil { if _, err := p.Wait(); err != nil {
logrus.Warn(err) if !isNoChildren(err) {
logrus.Warn("wait: ", err)
}
} }
} }
return nil return nil
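
signalAllProcesses generalises the old killCgroupProcesses: it takes an arbitrary signal, and for anything other than SIGKILL it only reaps a process once waitid(WNOWAIT) reports that an exit status is actually pending. A hypothetical caller-side sketch of why that matters, a graceful stop that escalates from SIGTERM to SIGKILL (signalAll and checkEmpty are assumed helpers, not part of this diff):

package example

import (
	"errors"
	"os"
	"syscall"
	"time"
)

// signaler is a stand-in for the package-internal signalAllProcesses shown
// above; the real function takes a cgroups.Manager as its first argument.
type signaler func(s os.Signal) error

// gracefulStop asks nicely first, then escalates if the cgroup is still
// populated after the grace period. checkEmpty is assumed to report whether
// any processes remain in the cgroup.
func gracefulStop(signalAll signaler, checkEmpty func() bool, grace time.Duration) error {
	if err := signalAll(syscall.SIGTERM); err != nil {
		return err
	}
	deadline := time.Now().Add(grace)
	for time.Now().Before(deadline) {
		if checkEmpty() {
			return nil
		}
		time.Sleep(100 * time.Millisecond)
	}
	if err := signalAll(syscall.SIGKILL); err != nil {
		return err
	}
	if !checkEmpty() {
		return errors.New("processes survived SIGKILL")
	}
	return nil
}
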

View File

@ -9,6 +9,10 @@ func InitLabels(options []string) (string, string, error) {
return "", "", nil return "", "", nil
} }
func GetROMountLabel() string {
return ""
}
func GenLabels(options string) (string, string, error) { func GenLabels(options string) (string, string, error) {
return "", "", nil return "", "", nil
} }

View File

@ -55,6 +55,10 @@ func InitLabels(options []string) (string, string, error) {
return processLabel, mountLabel, nil return processLabel, mountLabel, nil
} }
func GetROMountLabel() string {
return selinux.GetROFileLabel()
}
// DEPRECATED: The GenLabels function is only to be used during the transition to the official API. // DEPRECATED: The GenLabels function is only to be used during the transition to the official API.
func GenLabels(options string) (string, string, error) { func GenLabels(options string) (string, string, error) {
return InitLabels(strings.Fields(options)) return InitLabels(strings.Fields(options))
@ -138,7 +142,7 @@ func Relabel(path string, fileLabel string, shared bool) error {
fileLabel = c.Get() fileLabel = c.Get()
} }
if err := selinux.Chcon(path, fileLabel, true); err != nil { if err := selinux.Chcon(path, fileLabel, true); err != nil {
return fmt.Errorf("SELinux relabeling of %s is not allowed: %q", path, err) return err
} }
return nil return nil
} }
@ -169,7 +173,7 @@ func UnreserveLabel(label string) error {
return nil return nil
} }
// DupSecOpt takes an process label and returns security options that // DupSecOpt takes a process label and returns security options that
// can be used to set duplicate labels on future container processes // can be used to set duplicate labels on future container processes
func DupSecOpt(src string) []string { func DupSecOpt(src string) []string {
return selinux.DupSecOpt(src) return selinux.DupSecOpt(src)

View File

@ -13,11 +13,13 @@ import (
const ( const (
InitMsg uint16 = 62000 InitMsg uint16 = 62000
CloneFlagsAttr uint16 = 27281 CloneFlagsAttr uint16 = 27281
ConsolePathAttr uint16 = 27282 NsPathsAttr uint16 = 27282
NsPathsAttr uint16 = 27283 UidmapAttr uint16 = 27283
UidmapAttr uint16 = 27284 GidmapAttr uint16 = 27284
GidmapAttr uint16 = 27285 SetgroupAttr uint16 = 27285
SetgroupAttr uint16 = 27286 OomScoreAdjAttr uint16 = 27286
RootlessAttr uint16 = 27287
// When syscall.NLA_HDRLEN is in gccgo, take this out. // When syscall.NLA_HDRLEN is in gccgo, take this out.
syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1) syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1)
) )

View File

@ -47,12 +47,9 @@ type Process struct {
// ExtraFiles specifies additional open files to be inherited by the container // ExtraFiles specifies additional open files to be inherited by the container
ExtraFiles []*os.File ExtraFiles []*os.File
// consolePath is the path to the console allocated to the container.
consolePath string
// Capabilities specify the capabilities to keep when executing the process inside the container // Capabilities specify the capabilities to keep when executing the process inside the container
// All capabilities not specified will be dropped from the processes capability mask // All capabilities not specified will be dropped from the processes capability mask
Capabilities []string Capabilities *configs.Capabilities
// AppArmorProfile specifies the profile to apply to the process and is // AppArmorProfile specifies the profile to apply to the process and is
// changed at the time the process is execed // changed at the time the process is execed
@ -68,6 +65,9 @@ type Process struct {
// If Rlimits are not set, the container will inherit rlimits from the parent process // If Rlimits are not set, the container will inherit rlimits from the parent process
Rlimits []configs.Rlimit Rlimits []configs.Rlimit
// ConsoleSocket provides the masterfd console.
ConsoleSocket *os.File
ops processOperations ops processOperations
} }
@ -104,22 +104,3 @@ type IO struct {
Stdout io.ReadCloser Stdout io.ReadCloser
Stderr io.ReadCloser Stderr io.ReadCloser
} }
// NewConsole creates new console for process and returns it
func (p *Process) NewConsole(rootuid, rootgid int) (Console, error) {
console, err := NewConsole(rootuid, rootgid)
if err != nil {
return nil, err
}
p.consolePath = console.Path()
return console, nil
}
// ConsoleFromPath sets the process's console with the path provided
func (p *Process) ConsoleFromPath(path string) error {
if p.consolePath != "" {
return newGenericError(fmt.Errorf("console path already exists for process"), ConsoleExists)
}
p.consolePath = path
return nil
}

View File

@ -51,7 +51,6 @@ type setnsProcess struct {
fds []string fds []string
process *Process process *Process
bootstrapData io.Reader bootstrapData io.Reader
rootDir *os.File
} }
func (p *setnsProcess) startTime() (string, error) { func (p *setnsProcess) startTime() (string, error) {
@ -70,7 +69,6 @@ func (p *setnsProcess) start() (err error) {
defer p.parentPipe.Close() defer p.parentPipe.Close()
err = p.cmd.Start() err = p.cmd.Start()
p.childPipe.Close() p.childPipe.Close()
p.rootDir.Close()
if err != nil { if err != nil {
return newSystemErrorWithCause(err, "starting setns process") return newSystemErrorWithCause(err, "starting setns process")
} }
@ -82,15 +80,12 @@ func (p *setnsProcess) start() (err error) {
if err = p.execSetns(); err != nil { if err = p.execSetns(); err != nil {
return newSystemErrorWithCause(err, "executing setns process") return newSystemErrorWithCause(err, "executing setns process")
} }
if len(p.cgroupPaths) > 0 { // We can't join cgroups if we're in a rootless container.
if !p.config.Rootless && len(p.cgroupPaths) > 0 {
if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil { if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil {
return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid()) return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
} }
} }
// set oom_score_adj
if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting oom score")
}
// set rlimits, this has to be done here because we lose permissions // set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace // to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
@ -100,15 +95,22 @@ func (p *setnsProcess) start() (err error) {
return newSystemErrorWithCause(err, "writing config to pipe") return newSystemErrorWithCause(err, "writing config to pipe")
} }
ierr := parseSync(p.parentPipe, func(sync *syncT) error {
switch sync.Type {
case procReady:
// This shouldn't happen.
panic("unexpected procReady in setns")
case procHooks:
// This shouldn't happen.
panic("unexpected procHooks in setns")
default:
return newSystemError(fmt.Errorf("invalid JSON payload from child"))
}
})
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemErrorWithCause(err, "calling shutdown on init pipe") return newSystemErrorWithCause(err, "calling shutdown on init pipe")
} }
// wait for the child process to fully complete and receive an error message
// if one was encoutered
var ierr *genericError
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
return newSystemErrorWithCause(err, "decoding init error from pipe")
}
// Must be done after Shutdown so the child will exit and we can wait for it. // Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil { if ierr != nil {
p.wait() p.wait()
@ -146,7 +148,7 @@ func (p *setnsProcess) execSetns() error {
} }
// terminate sends a SIGKILL to the forked process for the setns routine then waits to // terminate sends a SIGKILL to the forked process for the setns routine then waits to
// avoid the process becomming a zombie. // avoid the process becoming a zombie.
func (p *setnsProcess) terminate() error { func (p *setnsProcess) terminate() error {
if p.cmd.Process == nil { if p.cmd.Process == nil {
return nil return nil
@ -239,7 +241,7 @@ func (p *initProcess) start() error {
return newSystemErrorWithCause(err, "starting init process command") return newSystemErrorWithCause(err, "starting init process command")
} }
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return err return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
} }
if err := p.execSetns(); err != nil { if err := p.execSetns(); err != nil {
return newSystemErrorWithCause(err, "running exec setns process for init") return newSystemErrorWithCause(err, "running exec setns process for init")
@ -252,8 +254,9 @@ func (p *initProcess) start() error {
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid()) return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
} }
p.setExternalDescriptors(fds) p.setExternalDescriptors(fds)
// Do this before syncing with child so that no children // Do this before syncing with child so that no children can escape the
// can escape the cgroup // cgroup. We don't need to worry about not doing this and not being root
// because we'd be using the rootless cgroup manager in that case.
if err := p.manager.Apply(p.pid()); err != nil { if err := p.manager.Apply(p.pid()); err != nil {
return newSystemErrorWithCause(err, "applying cgroup configuration for process") return newSystemErrorWithCause(err, "applying cgroup configuration for process")
} }
@ -264,36 +267,22 @@ func (p *initProcess) start() error {
} }
}() }()
if err := p.createNetworkInterfaces(); err != nil { if err := p.createNetworkInterfaces(); err != nil {
return newSystemErrorWithCause(err, "creating nework interfaces") return newSystemErrorWithCause(err, "creating network interfaces")
} }
if err := p.sendConfig(); err != nil { if err := p.sendConfig(); err != nil {
return newSystemErrorWithCause(err, "sending config to init process") return newSystemErrorWithCause(err, "sending config to init process")
} }
var ( var (
procSync syncT
sentRun bool sentRun bool
sentResume bool sentResume bool
ierr *genericError
) )
dec := json.NewDecoder(p.parentPipe) ierr := parseSync(p.parentPipe, func(sync *syncT) error {
loop: switch sync.Type {
for {
if err := dec.Decode(&procSync); err != nil {
if err == io.EOF {
break loop
}
return newSystemErrorWithCause(err, "decoding sync type from init pipe")
}
switch procSync.Type {
case procReady: case procReady:
if err := p.manager.Set(p.config.Config); err != nil { if err := p.manager.Set(p.config.Config); err != nil {
return newSystemErrorWithCause(err, "setting cgroup config for ready process") return newSystemErrorWithCause(err, "setting cgroup config for ready process")
} }
// set oom_score_adj
if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting oom score for ready process")
}
// set rlimits, this has to be done here because we lose permissions // set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace // to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
@ -306,7 +295,7 @@ loop:
Version: p.container.config.Version, Version: p.container.config.Version,
ID: p.container.id, ID: p.container.id,
Pid: p.pid(), Pid: p.pid(),
Root: p.config.Config.Rootfs, Bundle: utils.SearchLabels(p.config.Config.Labels, "bundle"),
} }
for i, hook := range p.config.Config.Hooks.Prestart { for i, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil { if err := hook.Run(s); err != nil {
@ -316,8 +305,8 @@ loop:
} }
} }
// Sync with child. // Sync with child.
if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil { if err := writeSync(p.parentPipe, procRun); err != nil {
return newSystemErrorWithCause(err, "reading syncT run type") return newSystemErrorWithCause(err, "writing syncT 'run'")
} }
sentRun = true sentRun = true
case procHooks: case procHooks:
@ -326,8 +315,7 @@ loop:
Version: p.container.config.Version, Version: p.container.config.Version,
ID: p.container.id, ID: p.container.id,
Pid: p.pid(), Pid: p.pid(),
Root: p.config.Config.Rootfs, Bundle: utils.SearchLabels(p.config.Config.Labels, "bundle"),
BundlePath: utils.SearchLabels(p.config.Config.Labels, "bundle"),
} }
for i, hook := range p.config.Config.Hooks.Prestart { for i, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil { if err := hook.Run(s); err != nil {
@ -336,25 +324,17 @@ loop:
} }
} }
// Sync with child. // Sync with child.
if err := utils.WriteJSON(p.parentPipe, syncT{procResume}); err != nil { if err := writeSync(p.parentPipe, procResume); err != nil {
return newSystemErrorWithCause(err, "reading syncT resume type") return newSystemErrorWithCause(err, "writing syncT 'resume'")
} }
sentResume = true sentResume = true
case procError:
// wait for the child process to fully complete and receive an error message
// if one was encoutered
if err := dec.Decode(&ierr); err != nil && err != io.EOF {
return newSystemErrorWithCause(err, "decoding proc error from init")
}
if ierr != nil {
break loop
}
// Programmer error.
panic("No error following JSON procError payload.")
default: default:
return newSystemError(fmt.Errorf("invalid JSON payload from child")) return newSystemError(fmt.Errorf("invalid JSON payload from child"))
} }
}
return nil
})
if !sentRun { if !sentRun {
return newSystemErrorWithCause(ierr, "container init") return newSystemErrorWithCause(ierr, "container init")
} }
@ -364,6 +344,7 @@ loop:
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemErrorWithCause(err, "shutting down init pipe") return newSystemErrorWithCause(err, "shutting down init pipe")
} }
// Must be done after Shutdown so the child will exit and we can wait for it. // Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil { if ierr != nil {
p.wait() p.wait()
@ -379,7 +360,7 @@ func (p *initProcess) wait() (*os.ProcessState, error) {
} }
// we should kill all processes in cgroup when init is died if we use host PID namespace // we should kill all processes in cgroup when init is died if we use host PID namespace
if p.sharePidns { if p.sharePidns {
killCgroupProcesses(p.manager) signalAllProcesses(p.manager, syscall.SIGKILL)
} }
return p.cmd.ProcessState, nil return p.cmd.ProcessState, nil
} }
@ -440,9 +421,17 @@ func getPipeFds(pid int) ([]string, error) {
dirPath := filepath.Join("/proc", strconv.Itoa(pid), "/fd") dirPath := filepath.Join("/proc", strconv.Itoa(pid), "/fd")
for i := 0; i < 3; i++ { for i := 0; i < 3; i++ {
// XXX: This breaks if the path is not a valid symlink (which can
// happen in certain particularly unlucky mount namespace setups).
f := filepath.Join(dirPath, strconv.Itoa(i)) f := filepath.Join(dirPath, strconv.Itoa(i))
target, err := os.Readlink(f) target, err := os.Readlink(f)
if err != nil { if err != nil {
// Ignore permission errors for rootless containers and other
// non-dumpable processes. If we can't get the fd for a particular
// file, there's not much we can do.
if os.IsPermission(err) {
continue
}
return fds, err return fds, err
} }
fds[i] = target fds[i] = target
@ -450,8 +439,10 @@ func getPipeFds(pid int) ([]string, error) {
return fds, nil return fds, nil
} }
// InitializeIO creates pipes for use with the process's STDIO // InitializeIO creates pipes for use with the process's stdio and returns the
// and returns the opposite side for each // opposite side for each. Do not use this if you want to have a pseudoterminal
// set up for you by libcontainer (TODO: fix that too).
// TODO: This is mostly unnecessary, and should be handled by clients.
func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) { func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
var fds []uintptr var fds []uintptr
i = &IO{} i = &IO{}

View File

@ -16,6 +16,7 @@ import (
"github.com/docker/docker/pkg/mount" "github.com/docker/docker/pkg/mount"
"github.com/docker/docker/pkg/symlink" "github.com/docker/docker/pkg/symlink"
"github.com/mrunalp/fileutils"
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/label" "github.com/opencontainers/runc/libcontainer/label"
@ -35,9 +36,11 @@ func needsSetupDev(config *configs.Config) bool {
return true return true
} }
// setupRootfs sets up the devices, mount points, and filesystems for use inside a // prepareRootfs sets up the devices, mount points, and filesystems for use
// new mount namespace. // inside a new mount namespace. It doesn't set anything as ro or pivot_root,
func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWriter) (err error) { // because console setup happens inside the caller. You must call
// finalizeRootfs in order to finish the rootfs setup.
func prepareRootfs(pipe io.ReadWriter, config *configs.Config) (err error) {
if err := prepareRoot(config); err != nil { if err := prepareRoot(config); err != nil {
return newSystemErrorWithCause(err, "preparing rootfs") return newSystemErrorWithCause(err, "preparing rootfs")
} }
@ -49,6 +52,7 @@ func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWrit
return newSystemErrorWithCause(err, "running premount command") return newSystemErrorWithCause(err, "running premount command")
} }
} }
if err := mountToRootfs(m, config.Rootfs, config.MountLabel); err != nil { if err := mountToRootfs(m, config.Rootfs, config.MountLabel); err != nil {
return newSystemErrorWithCausef(err, "mounting %q to rootfs %q at %q", m.Source, config.Rootfs, m.Destination) return newSystemErrorWithCausef(err, "mounting %q to rootfs %q at %q", m.Source, config.Rootfs, m.Destination)
} }
@ -59,17 +63,19 @@ func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWrit
} }
} }
} }
if setupDev { if setupDev {
if err := createDevices(config); err != nil { if err := createDevices(config); err != nil {
return newSystemErrorWithCause(err, "creating device nodes") return newSystemErrorWithCause(err, "creating device nodes")
} }
if err := setupPtmx(config, console); err != nil { if err := setupPtmx(config); err != nil {
return newSystemErrorWithCause(err, "setting up ptmx") return newSystemErrorWithCause(err, "setting up ptmx")
} }
if err := setupDevSymlinks(config.Rootfs); err != nil { if err := setupDevSymlinks(config.Rootfs); err != nil {
return newSystemErrorWithCause(err, "setting up /dev symlinks") return newSystemErrorWithCause(err, "setting up /dev symlinks")
} }
} }
// Signal the parent to run the pre-start hooks. // Signal the parent to run the pre-start hooks.
// The hooks are run after the mounts are setup, but before we switch to the new // The hooks are run after the mounts are setup, but before we switch to the new
// root, so that the old root is still available in the hooks for any mount // root, so that the old root is still available in the hooks for any mount
@ -77,39 +83,59 @@ func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWrit
if err := syncParentHooks(pipe); err != nil { if err := syncParentHooks(pipe); err != nil {
return err return err
} }
// The reason these operations are done here rather than in finalizeRootfs
// is because the console-handling code gets quite sticky if we have to set
// up the console before doing the pivot_root(2). This is because the
// Console API has to also work with the ExecIn case, which means that the
// API must be able to deal with being inside as well as outside the
// container. It's just cleaner to do this here (at the expense of the
// operation not being perfectly split).
if err := syscall.Chdir(config.Rootfs); err != nil { if err := syscall.Chdir(config.Rootfs); err != nil {
return newSystemErrorWithCausef(err, "changing dir to %q", config.Rootfs) return newSystemErrorWithCausef(err, "changing dir to %q", config.Rootfs)
} }
if config.NoPivotRoot { if config.NoPivotRoot {
err = msMoveRoot(config.Rootfs) err = msMoveRoot(config.Rootfs)
} else { } else {
err = pivotRoot(config.Rootfs, config.PivotDir) err = pivotRoot(config.Rootfs)
} }
if err != nil { if err != nil {
return newSystemErrorWithCause(err, "jailing process inside rootfs") return newSystemErrorWithCause(err, "jailing process inside rootfs")
} }
if setupDev { if setupDev {
if err := reOpenDevNull(); err != nil { if err := reOpenDevNull(); err != nil {
return newSystemErrorWithCause(err, "reopening /dev/null inside container") return newSystemErrorWithCause(err, "reopening /dev/null inside container")
} }
} }
// remount dev as ro if specifed
return nil
}
// finalizeRootfs actually switches the root of the process and sets anything
// to ro if necessary. You must call prepareRootfs first.
func finalizeRootfs(config *configs.Config) (err error) {
// remount dev as ro if specified
for _, m := range config.Mounts { for _, m := range config.Mounts {
if libcontainerUtils.CleanPath(m.Destination) == "/dev" { if libcontainerUtils.CleanPath(m.Destination) == "/dev" {
if m.Flags&syscall.MS_RDONLY != 0 { if m.Flags&syscall.MS_RDONLY == syscall.MS_RDONLY {
if err := remountReadonly(m.Destination); err != nil { if err := remountReadonly(m); err != nil {
return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination) return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination)
} }
} }
break break
} }
} }
// set rootfs ( / ) as readonly // set rootfs ( / ) as readonly
if config.Readonlyfs { if config.Readonlyfs {
if err := setReadonly(); err != nil { if err := setReadonly(); err != nil {
return newSystemErrorWithCause(err, "setting rootfs as readonly") return newSystemErrorWithCause(err, "setting rootfs as readonly")
} }
} }
syscall.Umask(0022) syscall.Umask(0022)
return nil return nil
} }
@ -152,15 +178,41 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
} }
return nil return nil
case "tmpfs": case "tmpfs":
copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
tmpDir := ""
stat, err := os.Stat(dest) stat, err := os.Stat(dest)
if err != nil { if err != nil {
if err := os.MkdirAll(dest, 0755); err != nil { if err := os.MkdirAll(dest, 0755); err != nil {
return err return err
} }
} }
if copyUp {
tmpDir, err = ioutil.TempDir("/tmp", "runctmpdir")
if err != nil {
return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
}
defer os.RemoveAll(tmpDir)
m.Destination = tmpDir
}
if err := mountPropagate(m, rootfs, mountLabel); err != nil { if err := mountPropagate(m, rootfs, mountLabel); err != nil {
return err return err
} }
if copyUp {
if err := fileutils.CopyDirectory(dest, tmpDir); err != nil {
errMsg := fmt.Errorf("tmpcopyup: failed to copy %s to %s: %v", dest, tmpDir, err)
if err1 := syscall.Unmount(tmpDir, syscall.MNT_DETACH); err1 != nil {
return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
}
return errMsg
}
if err := syscall.Mount(tmpDir, dest, "", syscall.MS_MOVE, ""); err != nil {
errMsg := fmt.Errorf("tmpcopyup: failed to move mount %s to %s: %v", tmpDir, dest, err)
if err1 := syscall.Unmount(tmpDir, syscall.MNT_DETACH); err1 != nil {
return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
}
return errMsg
}
}
if stat != nil { if stat != nil {
if err = os.Chmod(dest, stat.Mode()); err != nil { if err = os.Chmod(dest, stat.Mode()); err != nil {
return err return err
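
The EXT_COPYUP branch above works by mounting the tmpfs at a scratch location, copying the destination's existing contents into it, and then moving the mount over the destination. A trimmed sketch of that sequence (mount flags, labels, and the mode restore are omitted here):

package example

import (
	"io/ioutil"
	"os"
	"syscall"

	"github.com/mrunalp/fileutils"
)

// tmpfsCopyUp mounts a tmpfs at a temporary directory, copies dest's current
// contents into it, then moves the mount over dest so the original files
// appear "copied up" into the new tmpfs.
func tmpfsCopyUp(dest string) error {
	tmpDir, err := ioutil.TempDir("/tmp", "runctmpdir")
	if err != nil {
		return err
	}
	defer os.RemoveAll(tmpDir)
	if err := syscall.Mount("tmpfs", tmpDir, "tmpfs", 0, ""); err != nil {
		return err
	}
	if err := fileutils.CopyDirectory(dest, tmpDir); err != nil {
		syscall.Unmount(tmpDir, syscall.MNT_DETACH)
		return err
	}
	return syscall.Mount(tmpDir, dest, "", syscall.MS_MOVE, "")
}
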
@ -178,7 +230,7 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
// any previous mounts can invalidate the next mount's destination. // any previous mounts can invalidate the next mount's destination.
// this can happen when a user specifies mounts within other mounts to cause breakouts or other // this can happen when a user specifies mounts within other mounts to cause breakouts or other
// evil stuff to try to escape the container's rootfs. // evil stuff to try to escape the container's rootfs.
if dest, err = symlink.FollowSymlinkInScope(filepath.Join(rootfs, m.Destination), rootfs); err != nil { if dest, err = symlink.FollowSymlinkInScope(dest, rootfs); err != nil {
return err return err
} }
if err := checkMountDestination(rootfs, dest); err != nil { if err := checkMountDestination(rootfs, dest); err != nil {
@ -261,6 +313,19 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
} }
} }
default: default:
// ensure that the destination of the mount is resolved of symlinks at mount time because
// any previous mounts can invalidate the next mount's destination.
// this can happen when a user specifies mounts within other mounts to cause breakouts or other
// evil stuff to try to escape the container's rootfs.
var err error
if dest, err = symlink.FollowSymlinkInScope(dest, rootfs); err != nil {
return err
}
if err := checkMountDestination(rootfs, dest); err != nil {
return err
}
// update the mount with the correct dest after symlinks are resolved.
m.Destination = dest
if err := os.MkdirAll(dest, 0755); err != nil { if err := os.MkdirAll(dest, 0755); err != nil {
return err return err
} }
@ -283,7 +348,7 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
var binds []*configs.Mount var binds []*configs.Mount
for _, mm := range mounts { for _, mm := range mounts {
dir, err := mm.GetThisCgroupDir(cgroupPaths) dir, err := mm.GetOwnCgroup(cgroupPaths)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -294,7 +359,7 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
binds = append(binds, &configs.Mount{ binds = append(binds, &configs.Mount{
Device: "bind", Device: "bind",
Source: filepath.Join(mm.Mountpoint, relDir), Source: filepath.Join(mm.Mountpoint, relDir),
Destination: filepath.Join(m.Destination, strings.Join(mm.Subsystems, ",")), Destination: filepath.Join(m.Destination, filepath.Base(mm.Mountpoint)),
Flags: syscall.MS_BIND | syscall.MS_REC | m.Flags, Flags: syscall.MS_BIND | syscall.MS_REC | m.Flags,
PropagationFlags: m.PropagationFlags, PropagationFlags: m.PropagationFlags,
}) })
@ -306,9 +371,6 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
// checkMountDestination checks to ensure that the mount destination is not over the top of /proc. // checkMountDestination checks to ensure that the mount destination is not over the top of /proc.
// dest is required to be an abs path and have any symlinks resolved before calling this function. // dest is required to be an abs path and have any symlinks resolved before calling this function.
func checkMountDestination(rootfs, dest string) error { func checkMountDestination(rootfs, dest string) error {
if libcontainerUtils.CleanPath(rootfs) == libcontainerUtils.CleanPath(dest) {
return fmt.Errorf("mounting into / is prohibited")
}
invalidDestinations := []string{ invalidDestinations := []string{
"/proc", "/proc",
} }
@ -448,10 +510,12 @@ func createDeviceNode(rootfs string, node *configs.Device, bind bool) error {
func mknodDevice(dest string, node *configs.Device) error { func mknodDevice(dest string, node *configs.Device) error {
fileMode := node.FileMode fileMode := node.FileMode
switch node.Type { switch node.Type {
case 'c': case 'c', 'u':
fileMode |= syscall.S_IFCHR fileMode |= syscall.S_IFCHR
case 'b': case 'b':
fileMode |= syscall.S_IFBLK fileMode |= syscall.S_IFBLK
case 'p':
fileMode |= syscall.S_IFIFO
default: default:
return fmt.Errorf("%c is not a valid device type for device %s", node.Type, node.Path) return fmt.Errorf("%c is not a valid device type for device %s", node.Type, node.Path)
} }
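
mknodDevice now also accepts 'u' (unbuffered character) and 'p' (FIFO) device types. A trimmed illustration of the mapping plus the mknod call, using the conventional Linux device-number encoding (kept deliberately simple here):

package example

import (
	"fmt"
	"os"
	"syscall"
)

// mknodByType maps the config's device-type character onto the S_IF* bits and
// creates the node: 'c' and 'u' share S_IFCHR, 'b' is a block device, and the
// newly supported 'p' creates a FIFO.
func mknodByType(path string, devType rune, mode os.FileMode, major, minor uint32) error {
	fileMode := uint32(mode.Perm())
	switch devType {
	case 'c', 'u':
		fileMode |= syscall.S_IFCHR
	case 'b':
		fileMode |= syscall.S_IFBLK
	case 'p':
		fileMode |= syscall.S_IFIFO
	default:
		return fmt.Errorf("%c is not a valid device type for %s", devType, path)
	}
	// Pack major/minor into a dev_t the way the kernel expects.
	dev := (major << 8) | (minor & 0xff) | ((minor & 0xfff00) << 12)
	return syscall.Mknod(path, fileMode, int(dev))
}
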
@@ -539,11 +603,13 @@ func prepareRoot(config *configs.Config) error {
if err := syscall.Mount("", "/", "", uintptr(flag), ""); err != nil {
return err
}
if config.NoPivotRoot {
// Make parent mount private to make sure following bind mount does
// not propagate in other namespaces. Also it will help with kernel
// check pass in pivot_root. (IS_SHARED(new_mnt->mnt_parent))
if err := rootfsParentMountPrivate(config.Rootfs); err != nil {
return err
}
}
return syscall.Mount(config.Rootfs, config.Rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, "")
}
@@ -552,7 +618,7 @@ func setReadonly() error {
return syscall.Mount("/", "/", "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, "")
}
-func setupPtmx(config *configs.Config, console *linuxConsole) error {
+func setupPtmx(config *configs.Config) error {
ptmx := filepath.Join(config.Rootfs, "dev/ptmx")
if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) {
return err
@@ -560,54 +626,61 @@ func setupPtmx(config *configs.Config, console *linuxConsole) error {
if err := os.Symlink("pts/ptmx", ptmx); err != nil {
return fmt.Errorf("symlink dev ptmx %s", err)
}
-if console != nil {
-return console.mount(config.Rootfs, config.MountLabel)
-}
return nil
}
-func pivotRoot(rootfs, pivotBaseDir string) (err error) {
-if pivotBaseDir == "" {
-pivotBaseDir = "/"
-}
-tmpDir := filepath.Join(rootfs, pivotBaseDir)
-if err := os.MkdirAll(tmpDir, 0755); err != nil {
-return fmt.Errorf("can't create tmp dir %s, error %v", tmpDir, err)
-}
-pivotDir, err := ioutil.TempDir(tmpDir, ".pivot_root")
-if err != nil {
-return fmt.Errorf("can't create pivot_root dir %s, error %v", pivotDir, err)
-}
-defer func() {
-errVal := os.Remove(pivotDir)
-if err == nil {
-err = errVal
-}
-}()
-if err := syscall.PivotRoot(rootfs, pivotDir); err != nil {
-// Make the parent mount private
-if err := rootfsParentMountPrivate(rootfs); err != nil {
-return err
-}
-// Try again
-if err := syscall.PivotRoot(rootfs, pivotDir); err != nil {
-return fmt.Errorf("pivot_root %s", err)
-}
-}
-if err := syscall.Chdir("/"); err != nil {
-return fmt.Errorf("chdir / %s", err)
-}
-// path to pivot dir now changed, update
-pivotDir = filepath.Join(pivotBaseDir, filepath.Base(pivotDir))
-// Make pivotDir rprivate to make sure any of the unmounts don't
-// propagate to parent.
-if err := syscall.Mount("", pivotDir, "", syscall.MS_PRIVATE|syscall.MS_REC, ""); err != nil {
-return err
-}
-if err := syscall.Unmount(pivotDir, syscall.MNT_DETACH); err != nil {
-return fmt.Errorf("unmount pivot_root dir %s", err)
-}
-return nil
-}
+// pivotRoot will call pivot_root such that rootfs becomes the new root
+// filesystem, and everything else is cleaned up.
+func pivotRoot(rootfs string) error {
+// While the documentation may claim otherwise, pivot_root(".", ".") is
+// actually valid. What this results in is / being the new root but
+// /proc/self/cwd being the old root. Since we can play around with the cwd
+// with pivot_root this allows us to pivot without creating directories in
+// the rootfs. Shout-outs to the LXC developers for giving us this idea.
+oldroot, err := syscall.Open("/", syscall.O_DIRECTORY|syscall.O_RDONLY, 0)
+if err != nil {
+return err
+}
+defer syscall.Close(oldroot)
+newroot, err := syscall.Open(rootfs, syscall.O_DIRECTORY|syscall.O_RDONLY, 0)
+if err != nil {
+return err
+}
+defer syscall.Close(newroot)
+// Change to the new root so that the pivot_root actually acts on it.
+if err := syscall.Fchdir(newroot); err != nil {
+return err
+}
+if err := syscall.PivotRoot(".", "."); err != nil {
+return fmt.Errorf("pivot_root %s", err)
+}
+// Currently our "." is oldroot (according to the current kernel code).
+// However, purely for safety, we will fchdir(oldroot) since there isn't
+// really any guarantee from the kernel what /proc/self/cwd will be after a
+// pivot_root(2).
+if err := syscall.Fchdir(oldroot); err != nil {
+return err
+}
+// Make oldroot rprivate to make sure our unmounts don't propagate to the
+// host (and thus bork the machine).
+if err := syscall.Mount("", ".", "", syscall.MS_PRIVATE|syscall.MS_REC, ""); err != nil {
+return err
+}
+// Preform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd.
+if err := syscall.Unmount(".", syscall.MNT_DETACH); err != nil {
+return err
+}
+// Switch back to our shiny new root.
+if err := syscall.Chdir("/"); err != nil {
+return fmt.Errorf("chdir / %s", err)
+}
+return nil
+}
@@ -642,17 +715,26 @@ func createIfNotExists(path string, isDir bool) error {
return nil
}
-// remountReadonly will bind over the top of an existing path and ensure that it is read-only.
+// readonlyPath will make a path read only.
-func remountReadonly(path string) error {
+func readonlyPath(path string) error {
-for i := 0; i < 5; i++ {
+if err := syscall.Mount(path, path, "", syscall.MS_BIND|syscall.MS_REC, ""); err != nil {
-if err := syscall.Mount("", path, "", syscall.MS_REMOUNT|syscall.MS_RDONLY, ""); err != nil && !os.IsNotExist(err) {
+if os.IsNotExist(err) {
-switch err {
+return nil
-case syscall.EINVAL:
+}
-// Probably not a mountpoint, use bind-mount
-if err := syscall.Mount(path, path, "", syscall.MS_BIND, ""); err != nil {
return err
}
-return syscall.Mount(path, path, "", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC|defaultMountFlags, "")
+return syscall.Mount(path, path, "", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, "")
+}
+// remountReadonly will remount an existing mount point and ensure that it is read-only.
+func remountReadonly(m *configs.Mount) error {
+var (
+dest = m.Destination
+flags = m.Flags
+)
+for i := 0; i < 5; i++ {
+if err := syscall.Mount("", dest, "", uintptr(flags|syscall.MS_REMOUNT|syscall.MS_RDONLY), ""); err != nil {
+switch err {
case syscall.EBUSY:
time.Sleep(100 * time.Millisecond)
continue
@@ -662,13 +744,19 @@ func remountReadonly(path string) error {
}
return nil
}
-return fmt.Errorf("unable to mount %s as readonly max retries reached", path)
+return fmt.Errorf("unable to mount %s as readonly max retries reached", dest)
}
-// maskFile bind mounts /dev/null over the top of the specified path inside a container
+// maskPath masks the top of the specified path inside a container to avoid
-// to avoid security issues from processes reading information from non-namespace aware mounts ( proc/kcore ).
+// security issues from processes reading information from non-namespace aware
-func maskFile(path string) error {
+// mounts ( proc/kcore ).
+// For files, maskPath bind mounts /dev/null over the top of the specified path.
+// For directories, maskPath mounts read-only tmpfs over the top of the specified path.
+func maskPath(path string) error {
if err := syscall.Mount("/dev/null", path, "", syscall.MS_BIND, ""); err != nil && !os.IsNotExist(err) {
+if err == syscall.ENOTDIR {
+return syscall.Mount("tmpfs", path, "tmpfs", syscall.MS_RDONLY, "")
+}
return err
}
return nil
@@ -705,7 +793,9 @@ func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
if libcontainerUtils.CleanPath(dest) == "/dev" {
flags &= ^syscall.MS_RDONLY
}
-if !strings.HasPrefix(dest, rootfs) {
+copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
+if !(copyUp || strings.HasPrefix(dest, rootfs)) {
dest = filepath.Join(rootfs, dest)
}


@@ -212,10 +212,6 @@ func parseStatusFile(path string) (map[string]string, error) {
status := make(map[string]string)
for s.Scan() {
-if err := s.Err(); err != nil {
-return nil, err
-}
text := s.Text()
parts := strings.Split(text, ":")
@@ -225,5 +221,9 @@ func parseStatusFile(path string) (map[string]string, error) {
status[parts[0]] = parts[1]
}
+if err := s.Err(); err != nil {
+return nil, err
+}
return status, nil
}


@@ -32,33 +32,73 @@ const (
stRdOnly = 0x01
)
+type selinuxState struct {
+enabledSet bool
+enabled bool
+selinuxfsSet bool
+selinuxfs string
+mcsList map[string]bool
+sync.Mutex
+}
var (
assignRegex = regexp.MustCompile(`^([^=]+)=(.*)$`)
-mcsList = make(map[string]bool)
+state = selinuxState{
-mcsLock sync.Mutex
+mcsList: make(map[string]bool),
-selinuxfs = "unknown"
+}
-selinuxEnabled = false // Stores whether selinux is currently enabled
-selinuxEnabledChecked = false // Stores whether selinux enablement has been checked or established yet
)
type SELinuxContext map[string]string
func (s *selinuxState) setEnable(enabled bool) bool {
s.Lock()
defer s.Unlock()
s.enabledSet = true
s.enabled = enabled
return s.enabled
}
func (s *selinuxState) getEnabled() bool {
s.Lock()
enabled := s.enabled
enabledSet := s.enabledSet
s.Unlock()
if enabledSet {
return enabled
}
enabled = false
if fs := getSelinuxMountPoint(); fs != "" {
if con, _ := Getcon(); con != "kernel" {
enabled = true
}
}
return s.setEnable(enabled)
}
// SetDisabled disables selinux support for the package
func SetDisabled() {
-selinuxEnabled, selinuxEnabledChecked = false, true
+state.setEnable(false)
}
-// getSelinuxMountPoint returns the path to the mountpoint of an selinuxfs
+func (s *selinuxState) setSELinuxfs(selinuxfs string) string {
-// filesystem or an empty string if no mountpoint is found. Selinuxfs is
+s.Lock()
-// a proc-like pseudo-filesystem that exposes the selinux policy API to
+defer s.Unlock()
-// processes. The existence of an selinuxfs mount is used to determine
+s.selinuxfsSet = true
-// whether selinux is currently enabled or not.
+s.selinuxfs = selinuxfs
-func getSelinuxMountPoint() string {
+return s.selinuxfs
-if selinuxfs != "unknown" {
+}
func (s *selinuxState) getSELinuxfs() string {
s.Lock()
selinuxfs := s.selinuxfs
selinuxfsSet := s.selinuxfsSet
s.Unlock()
if selinuxfsSet {
return selinuxfs
}
selinuxfs = ""
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return selinuxfs
@@ -91,21 +131,21 @@ func getSelinuxMountPoint() string {
selinuxfs = ""
}
}
-return selinuxfs
+return s.setSELinuxfs(selinuxfs)
}
// getSelinuxMountPoint returns the path to the mountpoint of an selinuxfs
// filesystem or an empty string if no mountpoint is found. Selinuxfs is
// a proc-like pseudo-filesystem that exposes the selinux policy API to
// processes. The existence of an selinuxfs mount is used to determine
// whether selinux is currently enabled or not.
func getSelinuxMountPoint() string {
return state.getSELinuxfs()
}
// SelinuxEnabled returns whether selinux is currently enabled.
func SelinuxEnabled() bool {
-if selinuxEnabledChecked {
+return state.getEnabled()
return selinuxEnabled
}
selinuxEnabledChecked = true
if fs := getSelinuxMountPoint(); fs != "" {
if con, _ := Getcon(); con != "kernel" {
selinuxEnabled = true
}
}
return selinuxEnabled
}
func readConfig(target string) (value string) {
@@ -283,19 +323,19 @@ func SelinuxGetEnforceMode() int {
}
func mcsAdd(mcs string) error {
-mcsLock.Lock()
+state.Lock()
-defer mcsLock.Unlock()
+defer state.Unlock()
-if mcsList[mcs] {
+if state.mcsList[mcs] {
return fmt.Errorf("MCS Label already exists")
}
-mcsList[mcs] = true
+state.mcsList[mcs] = true
return nil
}
func mcsDelete(mcs string) {
-mcsLock.Lock()
+state.Lock()
-mcsList[mcs] = false
+defer state.Unlock()
-mcsLock.Unlock()
+state.mcsList[mcs] = false
}
func IntToMcs(id int, catRange uint32) string {
@@ -334,9 +374,7 @@ func uniqMcs(catRange uint32) string {
continue
} else {
if c1 > c2 {
-t := c1
+c1, c2 = c2, c1
-c1 = c2
-c2 = t
}
}
mcs = fmt.Sprintf("s0:c%d,c%d", c1, c2)
@@ -355,6 +393,12 @@ func FreeLxcContexts(scon string) {
}
}
var roFileLabel string
func GetROFileLabel() (fileLabel string) {
return roFileLabel
}
func GetLxcContexts() (processLabel string, fileLabel string) {
var (
val, key string
@@ -399,6 +443,9 @@ func GetLxcContexts() (processLabel string, fileLabel string) {
if key == "file" {
fileLabel = strings.Trim(val, "\"")
}
if key == "ro_file" {
roFileLabel = strings.Trim(val, "\"")
}
}
}
@@ -406,6 +453,9 @@ func GetLxcContexts() (processLabel string, fileLabel string) {
return "", ""
}
if roFileLabel == "" {
roFileLabel = fileLabel
}
exit:
// mcs := IntToMcs(os.Getpid(), 1024)
mcs := uniqMcs(1024)
@@ -446,7 +496,7 @@ func badPrefix(fpath string) error {
for _, prefix := range badprefixes {
if fpath == prefix || strings.HasPrefix(fpath, fmt.Sprintf("%s/", prefix)) {
-return fmt.Errorf("Relabeling content in %s is not allowed.", prefix)
+return fmt.Errorf("relabeling content in %s is not allowed", prefix)
}
}
return nil
@@ -486,14 +536,14 @@ func DupSecOpt(src string) []string {
con["level"] == "" {
return nil
}
-return []string{"label=user:" + con["user"],
+return []string{"user:" + con["user"],
-"label=role:" + con["role"],
+"role:" + con["role"],
-"label=type:" + con["type"],
+"type:" + con["type"],
-"label=level:" + con["level"]}
+"level:" + con["level"]}
}
// DisableSecOpt returns a security opt that can be used to disabling SELinux
// labeling support for future container processes
func DisableSecOpt() []string {
-return []string{"label=disable"}
+return []string{"disable"}
}
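Dropping the "label=" prefix above changes what callers get back from DupSecOpt and DisableSecOpt. A minimal sketch (hypothetical main package, not part of this diff) of the new shapes, assuming an SELinux-style "user:role:type:level" context string and the vendored import path used elsewhere in this change:

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/selinux"
)

func main() {
	// DupSecOpt splits a context into per-key options; after this change the
	// entries come back without the "label=" prefix.
	for _, opt := range selinux.DupSecOpt("system_u:system_r:svirt_lxc_net_t:s0:c1,c2") {
		fmt.Println(opt) // e.g. "user:system_u", "role:system_r", "type:svirt_lxc_net_t", "level:s0:c1,c2"
	}
	// DisableSecOpt likewise now returns just "disable" instead of "label=disable".
	fmt.Println(selinux.DisableSecOpt())
}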


@@ -16,6 +16,8 @@ import (
// linuxSetnsInit performs the container's initialization for running a new process
// inside an existing container.
type linuxSetnsInit struct {
+pipe *os.File
+consoleSocket *os.File
config *initConfig
}
@@ -30,6 +32,14 @@ func (l *linuxSetnsInit) Init() error {
return err
}
}
if l.config.CreateConsole {
if err := setupConsole(l.consoleSocket, l.config, false); err != nil {
return err
}
if err := system.Setctty(); err != nil {
return err
}
}
if l.config.NoNewPrivileges {
if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err


@@ -4,7 +4,6 @@ package libcontainer
import (
"fmt"
-"io"
"os"
"os/exec"
"syscall"
@@ -18,7 +17,8 @@ import (
)
type linuxStandardInit struct {
-pipe io.ReadWriteCloser
+pipe *os.File
+consoleSocket *os.File
parentPid int
stateDirFD int
config *initConfig
@@ -59,18 +59,6 @@ func (l *linuxStandardInit) Init() error {
}
}
var console *linuxConsole
if l.config.Console != "" {
console = newConsoleFromPath(l.config.Console)
if err := console.dupStdio(); err != nil {
return err
}
}
if console != nil {
if err := system.Setctty(); err != nil {
return err
}
}
if err := setupNetwork(l.config); err != nil {
return err
}
@@ -79,12 +67,33 @@ func (l *linuxStandardInit) Init() error {
}
label.Init()
-// InitializeMountNamespace() can be executed only for a new mount namespace
+// prepareRootfs() can be executed only for a new mount namespace.
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
-if err := setupRootfs(l.config.Config, console, l.pipe); err != nil {
+if err := prepareRootfs(l.pipe, l.config.Config); err != nil {
return err
}
}
// Set up the console. This has to be done *before* we finalize the rootfs,
// but *after* we've given the user the chance to set up all of the mounts
// they wanted.
if l.config.CreateConsole {
if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
return err
}
if err := system.Setctty(); err != nil {
return err
}
}
// Finish the rootfs setup.
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
if err := finalizeRootfs(l.config.Config); err != nil {
return err
}
}
if hostname := l.config.Config.Hostname; hostname != "" { if hostname := l.config.Config.Hostname; hostname != "" {
if err := syscall.Sethostname([]byte(hostname)); err != nil { if err := syscall.Sethostname([]byte(hostname)); err != nil {
return err return err
@ -103,12 +112,12 @@ func (l *linuxStandardInit) Init() error {
} }
} }
for _, path := range l.config.Config.ReadonlyPaths { for _, path := range l.config.Config.ReadonlyPaths {
if err := remountReadonly(path); err != nil { if err := readonlyPath(path); err != nil {
return err return err
} }
} }
for _, path := range l.config.Config.MaskPaths { for _, path := range l.config.Config.MaskPaths {
if err := maskFile(path); err != nil { if err := maskPath(path); err != nil {
return err return err
} }
} }
@ -143,7 +152,7 @@ func (l *linuxStandardInit) Init() error {
if err := pdeath.Restore(); err != nil { if err := pdeath.Restore(); err != nil {
return err return err
} }
// compare the parent from the inital start of the init process and make sure that it did not change. // compare the parent from the initial start of the init process and make sure that it did not change.
// if the parent changes that means it died and we were reparented to something else so we should // if the parent changes that means it died and we were reparented to something else so we should
// just kill ourself and not cause problems for someone else. // just kill ourself and not cause problems for someone else.
if syscall.Getppid() != l.parentPid { if syscall.Getppid() != l.parentPid {
@ -171,6 +180,9 @@ func (l *linuxStandardInit) Init() error {
return newSystemErrorWithCause(err, "init seccomp") return newSystemErrorWithCause(err, "init seccomp")
} }
} }
// close the statedir fd before exec because the kernel resets dumpable in the wrong order
// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
syscall.Close(l.stateDirFD)
if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
return newSystemErrorWithCause(err, "exec user process")
}


@@ -39,7 +39,7 @@ type containerState interface {
func destroy(c *linuxContainer) error {
if !c.config.Namespaces.Contains(configs.NEWPID) {
-if err := killCgroupProcesses(c.cgroupManager); err != nil {
+if err := signalAllProcesses(c.cgroupManager, syscall.SIGKILL); err != nil {
logrus.Warn(err)
}
}
@@ -60,8 +60,7 @@ func runPoststopHooks(c *linuxContainer) error {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
-Root: c.config.Rootfs,
+Bundle: utils.SearchLabels(c.config.Labels, "bundle"),
BundlePath: utils.SearchLabels(c.config.Labels, "bundle"),
}
for _, hook := range c.config.Hooks.Poststop {
if err := hook.Run(s); err != nil {
@@ -83,10 +82,7 @@ func (b *stoppedState) status() Status {
func (b *stoppedState) transition(s containerState) error {
switch s.(type) {
-case *runningState:
+case *runningState, *restoredState:
b.c.state = s
return nil
case *restoredState:
b.c.state = s
return nil
case *stoppedState:
@@ -199,7 +195,7 @@ func (p *pausedState) destroy() error {
return newGenericError(fmt.Errorf("container is paused"), ContainerPaused)
}
-// restoredState is the same as the running state but also has accociated checkpoint
+// restoredState is the same as the running state but also has associated checkpoint
// information that maybe need destroyed when the container is stopped and destroy is called.
type restoredState struct {
imageDir string
@@ -212,9 +208,7 @@ func (r *restoredState) status() Status {
func (r *restoredState) transition(s containerState) error {
switch s.(type) {
-case *stoppedState:
+case *stoppedState, *runningState:
return nil
case *runningState:
return nil
}
return newStateTransitionError(r, s)


@@ -0,0 +1,107 @@
package libcontainer
import (
"encoding/json"
"fmt"
"io"
"github.com/opencontainers/runc/libcontainer/utils"
)
type syncType string
// Constants that are used for synchronisation between the parent and child
// during container setup. They come in pairs (with procError being a generic
// response which is followed by a &genericError).
//
// [ child ] <-> [ parent ]
//
// procHooks --> [run hooks]
// <-- procResume
//
// procConsole -->
// <-- procConsoleReq
// [send(fd)] --> [recv(fd)]
// <-- procConsoleAck
//
// procReady --> [final setup]
// <-- procRun
const (
procError syncType = "procError"
procReady syncType = "procReady"
procRun syncType = "procRun"
procHooks syncType = "procHooks"
procResume syncType = "procResume"
)
type syncT struct {
Type syncType `json:"type"`
}
// writeSync is used to write to a synchronisation pipe. An error is returned
// if there was a problem writing the payload.
func writeSync(pipe io.Writer, sync syncType) error {
if err := utils.WriteJSON(pipe, syncT{sync}); err != nil {
return err
}
return nil
}
// readSync is used to read from a synchronisation pipe. An error is returned
// if we got a genericError, the pipe was closed, or we got an unexpected flag.
func readSync(pipe io.Reader, expected syncType) error {
var procSync syncT
if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
if err == io.EOF {
return fmt.Errorf("parent closed synchronisation channel")
}
if procSync.Type == procError {
var ierr genericError
if err := json.NewDecoder(pipe).Decode(&ierr); err != nil {
return fmt.Errorf("failed reading error from parent: %v", err)
}
return &ierr
}
if procSync.Type != expected {
return fmt.Errorf("invalid synchronisation flag from parent")
}
}
return nil
}
// parseSync runs the given callback function on each syncT received from the
// child. It will return once io.EOF is returned from the given pipe.
func parseSync(pipe io.Reader, fn func(*syncT) error) error {
dec := json.NewDecoder(pipe)
for {
var sync syncT
if err := dec.Decode(&sync); err != nil {
if err == io.EOF {
break
}
return err
}
// We handle this case outside fn for cleanliness reasons.
var ierr *genericError
if sync.Type == procError {
if err := dec.Decode(&ierr); err != nil && err != io.EOF {
return newSystemErrorWithCause(err, "decoding proc error from init")
}
if ierr != nil {
return ierr
}
// Programmer error.
panic("No error following JSON procError payload.")
}
if err := fn(&sync); err != nil {
return err
}
}
return nil
}
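To make the handshake described in the constants block concrete, here is a minimal sketch of how the two ends of the pipe could use these helpers. It assumes it sits alongside sync.go in the same libcontainer package (the functions are unexported) with "fmt" and "io" imported; the names childReady and parentLoop are illustrative, not part of the vendored code.

// childReady is roughly what the container init side would do once its setup
// is finished: announce procReady, then block until the parent answers procRun.
func childReady(pipe io.ReadWriter) error {
	if err := writeSync(pipe, procReady); err != nil {
		return err
	}
	return readSync(pipe, procRun)
}

// parentLoop is the matching parent side: consume events with parseSync and
// answer each one on the same pipe until the child closes it (io.EOF).
func parentLoop(pipe io.ReadWriter) error {
	return parseSync(pipe, func(sync *syncT) error {
		switch sync.Type {
		case procReady:
			// parent-side setup (cgroups, uid maps, ...) would happen here
			return writeSync(pipe, procRun)
		case procHooks:
			// run prestart hooks, then let the child continue
			return writeSync(pipe, procResume)
		default:
			return fmt.Errorf("unexpected sync type %q", sync.Type)
		}
	})
}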


@@ -14,8 +14,10 @@ func GetProcessStartTime(pid int) (string, error) {
if err != nil {
return "", err
}
return parseStartTime(string(data))
}
-parts := strings.Split(string(data), " ")
+func parseStartTime(stat string) (string, error) {
// the starttime is located at pos 22
// from the man page
//
@@ -23,5 +25,19 @@ func GetProcessStartTime(pid int) (string, error) {
// (22) The time the process started after system boot. In kernels before Linux 2.6, this
// value was expressed in jiffies. Since Linux 2.6, the value is expressed in clock ticks
// (divide by sysconf(_SC_CLK_TCK)).
-return parts[22-1], nil // starts at 1
+//
// NOTE:
// pos 2 could contain space and is inside `(` and `)`:
// (2) comm %s
// The filename of the executable, in parentheses.
// This is visible whether or not the executable is
// swapped out.
//
// the following is an example:
// 89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 29689 28896 0 3 146 32 76 19 20 0 1 0 2971844 52965376 3920 18446744073709551615 1 1 0 0 0 0 0 16781312 137447943 0 0 0 17 1 0 0 0 0 0 0 0 0 0 0 0 0 0
// get parts after last `)`:
s := strings.Split(stat, ")")
parts := strings.Split(strings.TrimSpace(s[len(s)-1]), " ")
return parts[22-3], nil // starts at 3 (after the filename pos `2`)
}
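The comment block above is the whole trick: everything up to the last ')' is the pid plus comm, so splitting on spaces is only safe after that point. A standalone sketch of the same parsing (a hypothetical helper, not the vendored parseStartTime), using the example line from the comment:

package main

import (
	"fmt"
	"strings"
)

// startTimeFromStat extracts field 22 (starttime) from a /proc/[pid]/stat
// line, tolerating spaces inside the parenthesised comm field.
func startTimeFromStat(stat string) (string, error) {
	i := strings.LastIndex(stat, ")")
	if i < 0 {
		return "", fmt.Errorf("malformed stat line: no comm field")
	}
	// fields[0] is field 3 of stat(5); field 22 is therefore fields[22-3].
	fields := strings.Fields(stat[i+1:])
	if len(fields) < 22-2 {
		return "", fmt.Errorf("malformed stat line: too few fields")
	}
	return fields[22-3], nil
}

func main() {
	stat := "89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 29689 28896 0 3 146 32 76 19 20 0 1 0 2971844 52965376 3920 18446744073709551615 1 1 0 0 0 0 0 16781312 137447943 0 0 0 17 1 0 0 0 0 0 0 0 0 0 0 0 0 0"
	st, err := startTimeFromStat(stat)
	fmt.Println(st, err) // prints "2971844 <nil>"
}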


@@ -199,18 +199,16 @@ type ExecUser struct {
// files cannot be opened for any reason, the error is ignored and a nil
// io.Reader is passed instead.
func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath string) (*ExecUser, error) {
-passwd, err := os.Open(passwdPath)
+var passwd, group io.Reader
-if err != nil {
-passwd = nil
+if passwdFile, err := os.Open(passwdPath); err == nil {
-} else {
+passwd = passwdFile
-defer passwd.Close()
+defer passwdFile.Close()
}
-group, err := os.Open(groupPath)
+if groupFile, err := os.Open(groupPath); err == nil {
-if err != nil {
+group = groupFile
-group = nil
+defer groupFile.Close()
-} else {
+}
-defer group.Close()
}
return GetExecUser(userSpec, defaults, passwd, group)
@@ -343,7 +341,7 @@ func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (
if len(groups) > 0 {
// First match wins, even if there's more than one matching entry.
user.Gid = groups[0].Gid
-} else if groupArg != "" {
+} else {
// If we can't find a group with the given name, the only other valid
// option is if it's a numeric group name with no associated entry in group.
@@ -433,9 +431,11 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err
// that opens the groupPath given and gives it as an argument to
// GetAdditionalGroups.
func GetAdditionalGroupsPath(additionalGroups []string, groupPath string) ([]int, error) {
-group, err := os.Open(groupPath)
+var group io.Reader
-if err == nil {
+if groupFile, err := os.Open(groupPath); err == nil {
-defer group.Close()
+group = groupFile
+defer groupFile.Close()
}
return GetAdditionalGroups(additionalGroups, group)
}
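A minimal usage sketch (hypothetical main package) for the two path-based helpers above, assuming the standard /etc/passwd and /etc/group locations; a missing or unreadable file simply means the defaults win.

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/user"
)

func main() {
	// Fall back to uid/gid 0 with a "/" home if the spec cannot be resolved.
	defaults := &user.ExecUser{Uid: 0, Gid: 0, Home: "/"}

	execUser, err := user.GetExecUserPath("root", defaults, "/etc/passwd", "/etc/group")
	if err != nil {
		fmt.Println("lookup failed:", err)
		return
	}
	fmt.Println(execUser.Uid, execUser.Gid, execUser.Home)

	// Resolve supplementary groups (by name or numeric id) against /etc/group.
	extra, err := user.GetAdditionalGroupsPath([]string{"0"}, "/etc/group")
	fmt.Println(extra, err)
}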


@@ -0,0 +1,148 @@
/*
* Copyright 2016 SUSE LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>
#include "cmsg.h"
#define error(fmt, ...) \
({ \
fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \
errno = ECOMM; \
goto err; /* return value */ \
})
/*
* Sends a file descriptor along the sockfd provided. Returns the return
* value of sendmsg(2). Any synchronisation and preparation of state
* should be done external to this (we expect the other side to be in
* recvfd() in the code).
*/
ssize_t sendfd(int sockfd, struct file_t file)
{
struct msghdr msg = {0};
struct iovec iov[1] = {0};
struct cmsghdr *cmsg;
int *fdptr;
int ret;
union {
char buf[CMSG_SPACE(sizeof(file.fd))];
struct cmsghdr align;
} u;
/*
* We need to send some other data along with the ancillary data,
* otherwise the other side won't recieve any data. This is very
* well-hidden in the documentation (and only applies to
* SOCK_STREAM). See the bottom part of unix(7).
*/
iov[0].iov_base = file.name;
iov[0].iov_len = strlen(file.name) + 1;
msg.msg_name = NULL;
msg.msg_namelen = 0;
msg.msg_iov = iov;
msg.msg_iovlen = 1;
msg.msg_control = u.buf;
msg.msg_controllen = sizeof(u.buf);
cmsg = CMSG_FIRSTHDR(&msg);
cmsg->cmsg_level = SOL_SOCKET;
cmsg->cmsg_type = SCM_RIGHTS;
cmsg->cmsg_len = CMSG_LEN(sizeof(int));
fdptr = (int *) CMSG_DATA(cmsg);
memcpy(fdptr, &file.fd, sizeof(int));
return sendmsg(sockfd, &msg, 0);
}
/*
* Receives a file descriptor from the sockfd provided. Returns the file
* descriptor as sent from sendfd(). It will return the file descriptor
* or die (literally) trying. Any synchronisation and preparation of
* state should be done external to this (we expect the other side to be
* in sendfd() in the code).
*/
struct file_t recvfd(int sockfd)
{
struct msghdr msg = {0};
struct iovec iov[1] = {0};
struct cmsghdr *cmsg;
struct file_t file = {0};
int *fdptr;
int olderrno;
union {
char buf[CMSG_SPACE(sizeof(file.fd))];
struct cmsghdr align;
} u;
/* Allocate a buffer. */
/* TODO: Make this dynamic with MSG_PEEK. */
file.name = malloc(TAG_BUFFER);
if (!file.name)
error("recvfd: failed to allocate file.tag buffer\n");
/*
* We need to "recieve" the non-ancillary data even though we don't
* plan to use it at all. Otherwise, things won't work as expected.
* See unix(7) and other well-hidden documentation.
*/
iov[0].iov_base = file.name;
iov[0].iov_len = TAG_BUFFER;
msg.msg_name = NULL;
msg.msg_namelen = 0;
msg.msg_iov = iov;
msg.msg_iovlen = 1;
msg.msg_control = u.buf;
msg.msg_controllen = sizeof(u.buf);
ssize_t ret = recvmsg(sockfd, &msg, 0);
if (ret < 0)
goto err;
cmsg = CMSG_FIRSTHDR(&msg);
if (!cmsg)
error("recvfd: got NULL from CMSG_FIRSTHDR");
if (cmsg->cmsg_level != SOL_SOCKET)
error("recvfd: expected SOL_SOCKET in cmsg: %d", cmsg->cmsg_level);
if (cmsg->cmsg_type != SCM_RIGHTS)
error("recvfd: expected SCM_RIGHTS in cmsg: %d", cmsg->cmsg_type);
if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
error("recvfd: expected correct CMSG_LEN in cmsg: %lu", (unsigned long)cmsg->cmsg_len);
fdptr = (int *) CMSG_DATA(cmsg);
if (!fdptr || *fdptr < 0)
error("recvfd: recieved invalid pointer");
file.fd = *fdptr;
return file;
err:
olderrno = errno;
free(file.name);
errno = olderrno;
return (struct file_t){0};
}


@@ -0,0 +1,57 @@
// +build linux
package utils
/*
* Copyright 2016 SUSE LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
#include <errno.h>
#include <stdlib.h>
#include "cmsg.h"
*/
import "C"
import (
"os"
"unsafe"
)
// RecvFd waits for a file descriptor to be sent over the given AF_UNIX
// socket. The file name of the remote file descriptor will be recreated
// locally (it is sent as non-auxiliary data in the same payload).
func RecvFd(socket *os.File) (*os.File, error) {
file, err := C.recvfd(C.int(socket.Fd()))
if err != nil {
return nil, err
}
defer C.free(unsafe.Pointer(file.name))
return os.NewFile(uintptr(file.fd), C.GoString(file.name)), nil
}
// SendFd sends a file descriptor over the given AF_UNIX socket. In
// addition, the file.Name() of the given file will also be sent as
// non-auxiliary data in the same payload (allowing to send contextual
// information for a file descriptor).
func SendFd(socket, file *os.File) error {
var cfile C.struct_file_t
cfile.fd = C.int(file.Fd())
cfile.name = C.CString(file.Name())
defer C.free(unsafe.Pointer(cfile.name))
_, err := C.sendfd(C.int(socket.Fd()), cfile)
return err
}


@@ -0,0 +1,36 @@
/*
* Copyright 2016 SUSE LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#if !defined(CMSG_H)
#define CMSG_H
#include <sys/types.h>
/* TODO: Implement this properly with MSG_PEEK. */
#define TAG_BUFFER 4096
/* This mirrors Go's (*os.File). */
struct file_t {
char *name;
int fd;
};
struct file_t recvfd(int sockfd);
ssize_t sendfd(int sockfd, struct file_t file);
#endif /* !defined(CMSG_H) */


@@ -9,6 +9,7 @@ import (
"path/filepath"
"strings"
"syscall"
+"unsafe"
)
const (
@@ -102,7 +103,7 @@ func SearchLabels(labels []string, query string) string {
}
// Annotations returns the bundle path and user defined annotations from the
-// libcontianer state. We need to remove the bundle because that is a label
+// libcontainer state. We need to remove the bundle because that is a label
// added by libcontainer.
func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
userAnnotations = make(map[string]string)
@@ -119,3 +120,7 @@ func Annotations(labels []string) (bundle string, userAnnotations map[string]str
}
return
}
func GetIntSize() int {
return int(unsafe.Sizeof(1))
}


@@ -4,6 +4,7 @@ package utils
import (
"io/ioutil"
+"os"
"strconv"
"syscall"
)
@@ -31,3 +32,12 @@ func CloseExecFrom(minFd int) error {
}
return nil
}
// NewSockPair returns a new unix socket pair
func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0)
if err != nil {
return nil, nil, err
}
return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
}
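A short sketch (hypothetical main package) tying NewSockPair together with the cgo SendFd/RecvFd helpers from the previous file: one end sends an open *os.File, the other reconstructs it, name included. Both ends run in a single process here purely to show the call sequence; in runc they live in different processes.

package main

import (
	"fmt"
	"os"

	"github.com/opencontainers/runc/libcontainer/utils"
)

func main() {
	parent, child, err := utils.NewSockPair("console")
	if err != nil {
		panic(err)
	}
	defer parent.Close()
	defer child.Close()

	go func() {
		f, err := os.Open("/dev/null") // stand-in for a console master fd
		if err != nil {
			return
		}
		defer f.Close()
		if err := utils.SendFd(child, f); err != nil {
			fmt.Println("sendfd:", err)
		}
	}()

	received, err := utils.RecvFd(parent)
	if err != nil {
		panic(err)
	}
	defer received.Close()
	fmt.Println("received fd for:", received.Name()) // "/dev/null"
}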

vendor/github.com/opencontainers/runtime-spec/LICENSE (generated, vendored; 191 lines added)

@@ -0,0 +1,191 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
Copyright 2015 The Linux Foundation.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


@@ -0,0 +1,563 @@
package specs
import "os"
// Spec is the base configuration for the container.
type Spec struct {
// Version of the Open Container Runtime Specification with which the bundle complies.
Version string `json:"ociVersion"`
// Platform specifies the configuration's target platform.
Platform Platform `json:"platform"`
// Process configures the container process.
Process Process `json:"process"`
// Root configures the container's root filesystem.
Root Root `json:"root"`
// Hostname configures the container's hostname.
Hostname string `json:"hostname,omitempty"`
// Mounts configures additional mounts (on top of Root).
Mounts []Mount `json:"mounts,omitempty"`
// Hooks configures callbacks for container lifecycle events.
Hooks *Hooks `json:"hooks,omitempty"`
// Annotations contains arbitrary metadata for the container.
Annotations map[string]string `json:"annotations,omitempty"`
// Linux is platform specific configuration for Linux based containers.
Linux *Linux `json:"linux,omitempty" platform:"linux"`
// Solaris is platform specific configuration for Solaris containers.
Solaris *Solaris `json:"solaris,omitempty" platform:"solaris"`
// Windows is platform specific configuration for Windows based containers, including Hyper-V containers.
Windows *Windows `json:"windows,omitempty" platform:"windows"`
}
// Process contains information to start a specific application inside the container.
type Process struct {
// Terminal creates an interactive terminal for the container.
Terminal bool `json:"terminal,omitempty"`
// ConsoleSize specifies the size of the console.
ConsoleSize Box `json:"consoleSize,omitempty"`
// User specifies user information for the process.
User User `json:"user"`
// Args specifies the binary and arguments for the application to execute.
Args []string `json:"args"`
// Env populates the process environment for the process.
Env []string `json:"env,omitempty"`
// Cwd is the current working directory for the process and must be
// relative to the container's root.
Cwd string `json:"cwd"`
// Capabilities are Linux capabilities that are kept for the process.
Capabilities *LinuxCapabilities `json:"capabilities,omitempty" platform:"linux"`
// Rlimits specifies rlimit options to apply to the process.
Rlimits []LinuxRlimit `json:"rlimits,omitempty" platform:"linux"`
// NoNewPrivileges controls whether additional privileges could be gained by processes in the container.
NoNewPrivileges bool `json:"noNewPrivileges,omitempty" platform:"linux"`
// ApparmorProfile specifies the apparmor profile for the container.
ApparmorProfile string `json:"apparmorProfile,omitempty" platform:"linux"`
// SelinuxLabel specifies the selinux context that the container process is run as.
SelinuxLabel string `json:"selinuxLabel,omitempty" platform:"linux"`
}
// LinuxCapabilities specifies the whitelist of capabilities that are kept for a process.
// http://man7.org/linux/man-pages/man7/capabilities.7.html
type LinuxCapabilities struct {
// Bounding is the set of capabilities checked by the kernel.
Bounding []string `json:"bounding,omitempty" platform:"linux"`
// Effective is the set of capabilities checked by the kernel.
Effective []string `json:"effective,omitempty" platform:"linux"`
// Inheritable is the capabilities preserved across execve.
Inheritable []string `json:"inheritable,omitempty" platform:"linux"`
// Permitted is the limiting superset for effective capabilities.
Permitted []string `json:"permitted,omitempty" platform:"linux"`
// Ambient is the ambient set of capabilities that are kept.
Ambient []string `json:"ambient,omitempty" platform:"linux"`
}
// Box specifies dimensions of a rectangle. Used for specifying the size of a console.
type Box struct {
// Height is the vertical dimension of a box.
Height uint `json:"height"`
// Width is the horizontal dimension of a box.
Width uint `json:"width"`
}
// User specifies specific user (and group) information for the container process.
type User struct {
// UID is the user id.
UID uint32 `json:"uid" platform:"linux,solaris"`
// GID is the group id.
GID uint32 `json:"gid" platform:"linux,solaris"`
// AdditionalGids are additional group ids set for the container's process.
AdditionalGids []uint32 `json:"additionalGids,omitempty" platform:"linux,solaris"`
// Username is the user name.
Username string `json:"username,omitempty" platform:"windows"`
}
// Root contains information about the container's root filesystem on the host.
type Root struct {
// Path is the absolute path to the container's root filesystem.
Path string `json:"path"`
// Readonly makes the root filesystem for the container readonly before the process is executed.
Readonly bool `json:"readonly,omitempty"`
}
// Platform specifies OS and arch information for the host system that the container
// is created for.
type Platform struct {
// OS is the operating system.
OS string `json:"os"`
// Arch is the architecture
Arch string `json:"arch"`
}
// Mount specifies a mount for a container.
type Mount struct {
// Destination is the path where the mount will be placed relative to the container's root. The path and child directories MUST exist, a runtime MUST NOT create directories automatically to a mount point.
Destination string `json:"destination"`
// Type specifies the mount kind.
Type string `json:"type,omitempty"`
// Source specifies the source path of the mount. In the case of bind mounts on
// Linux based systems this would be the file on the host.
Source string `json:"source,omitempty"`
// Options are fstab style mount options.
Options []string `json:"options,omitempty"`
}
// Hook specifies a command that is run at a particular event in the lifecycle of a container
type Hook struct {
Path string `json:"path"`
Args []string `json:"args,omitempty"`
Env []string `json:"env,omitempty"`
Timeout *int `json:"timeout,omitempty"`
}
// Hooks for container setup and teardown
type Hooks struct {
// Prestart is a list of hooks to be run before the container process is executed.
// On Linux, they are run after the container namespaces are created.
Prestart []Hook `json:"prestart,omitempty"`
// Poststart is a list of hooks to be run after the container process is started.
Poststart []Hook `json:"poststart,omitempty"`
// Poststop is a list of hooks to be run after the container process exits.
Poststop []Hook `json:"poststop,omitempty"`
}
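// Illustrative sketch only (not part of this vendored file): a minimal Spec
// literal built from the types above, roughly what a stripped-down
// config.json maps to. The specs.Version constant is assumed to exist in this
// package's version.go; every field used below is defined in this file.
//
//	spec := specs.Spec{
//		Version:  specs.Version,
//		Platform: specs.Platform{OS: "linux", Arch: "amd64"},
//		Process: specs.Process{
//			Terminal: true,
//			User:     specs.User{UID: 0, GID: 0},
//			Args:     []string{"sh"},
//			Cwd:      "/",
//		},
//		Root:     specs.Root{Path: "rootfs", Readonly: true},
//		Hostname: "runc",
//		Mounts: []specs.Mount{{
//			Destination: "/proc",
//			Type:        "proc",
//			Source:      "proc",
//		}},
//	}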
// Linux contains platform specific configuration for Linux based containers.
type Linux struct {
// UIDMapping specifies user mappings for supporting user namespaces on Linux.
UIDMappings []LinuxIDMapping `json:"uidMappings,omitempty"`
// GIDMapping specifies group mappings for supporting user namespaces on Linux.
GIDMappings []LinuxIDMapping `json:"gidMappings,omitempty"`
// Sysctl are a set of key value pairs that are set for the container on start
Sysctl map[string]string `json:"sysctl,omitempty"`
// Resources contain cgroup information for handling resource constraints
// for the container
Resources *LinuxResources `json:"resources,omitempty"`
// CgroupsPath specifies the path to cgroups that are created and/or joined by the container.
// The path is expected to be relative to the cgroups mountpoint.
// If resources are specified, the cgroups at CgroupsPath will be updated based on resources.
CgroupsPath string `json:"cgroupsPath,omitempty"`
// Namespaces contains the namespaces that are created and/or joined by the container
Namespaces []LinuxNamespace `json:"namespaces,omitempty"`
// Devices are a list of device nodes that are created for the container
Devices []LinuxDevice `json:"devices,omitempty"`
// Seccomp specifies the seccomp security settings for the container.
Seccomp *LinuxSeccomp `json:"seccomp,omitempty"`
// RootfsPropagation is the rootfs mount propagation mode for the container.
RootfsPropagation string `json:"rootfsPropagation,omitempty"`
// MaskedPaths masks over the provided paths inside the container.
MaskedPaths []string `json:"maskedPaths,omitempty"`
// ReadonlyPaths sets the provided paths as RO inside the container.
ReadonlyPaths []string `json:"readonlyPaths,omitempty"`
// MountLabel specifies the selinux context for the mounts in the container.
MountLabel string `json:"mountLabel,omitempty"`
// IntelRdt contains Intel Resource Director Technology (RDT) information
// for handling resource constraints (e.g., L3 cache) for the container
IntelRdt *LinuxIntelRdt `json:"intelRdt,omitempty"`
}
// LinuxNamespace is the configuration for a Linux namespace
type LinuxNamespace struct {
// Type is the type of Linux namespace
Type LinuxNamespaceType `json:"type"`
// Path is a path to an existing namespace persisted on disk that can be joined
// and is of the same type
Path string `json:"path,omitempty"`
}
// LinuxNamespaceType is one of the Linux namespaces
type LinuxNamespaceType string
const (
// PIDNamespace for isolating process IDs
PIDNamespace LinuxNamespaceType = "pid"
// NetworkNamespace for isolating network devices, stacks, ports, etc
NetworkNamespace = "network"
// MountNamespace for isolating mount points
MountNamespace = "mount"
// IPCNamespace for isolating System V IPC, POSIX message queues
IPCNamespace = "ipc"
// UTSNamespace for isolating hostname and NIS domain name
UTSNamespace = "uts"
// UserNamespace for isolating user and group IDs
UserNamespace = "user"
// CgroupNamespace for isolating cgroup hierarchies
CgroupNamespace = "cgroup"
)
// LinuxIDMapping specifies UID/GID mappings
type LinuxIDMapping struct {
// HostID is the starting UID/GID on the host to be mapped to 'ContainerID'
HostID uint32 `json:"hostID"`
// ContainerID is the starting UID/GID in the container
ContainerID uint32 `json:"containerID"`
// Size is the number of IDs to be mapped
Size uint32 `json:"size"`
}
// LinuxRlimit type and restrictions
type LinuxRlimit struct {
// Type of the rlimit to set
Type string `json:"type"`
// Hard is the hard limit for the specified type
Hard uint64 `json:"hard"`
// Soft is the soft limit for the specified type
Soft uint64 `json:"soft"`
}
// LinuxHugepageLimit structure corresponds to limiting kernel hugepages
type LinuxHugepageLimit struct {
// Pagesize is the hugepage size
Pagesize string `json:"pageSize"`
// Limit is the limit of "hugepagesize" hugetlb usage
Limit uint64 `json:"limit"`
}
// LinuxInterfacePriority for network interfaces
type LinuxInterfacePriority struct {
// Name is the name of the network interface
Name string `json:"name"`
// Priority for the interface
Priority uint32 `json:"priority"`
}
// linuxBlockIODevice holds major:minor format supported in blkio cgroup
type linuxBlockIODevice struct {
// Major is the device's major number.
Major int64 `json:"major"`
// Minor is the device's minor number.
Minor int64 `json:"minor"`
}
// LinuxWeightDevice struct holds a `major:minor weight` pair for blkioWeightDevice
type LinuxWeightDevice struct {
linuxBlockIODevice
// Weight is the bandwidth rate for the device, range is from 10 to 1000
Weight *uint16 `json:"weight,omitempty"`
// LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, CFQ scheduler only
LeafWeight *uint16 `json:"leafWeight,omitempty"`
}
// LinuxThrottleDevice struct holds a `major:minor rate_per_second` pair
type LinuxThrottleDevice struct {
linuxBlockIODevice
// Rate is the IO rate limit per cgroup per device
Rate uint64 `json:"rate"`
}
// LinuxBlockIO for Linux cgroup 'blkio' resource management
type LinuxBlockIO struct {
// Specifies per cgroup weight, range is from 10 to 1000
Weight *uint16 `json:"blkioWeight,omitempty"`
// Specifies tasks' weight in the given cgroup while competing with the cgroup's child cgroups, range is from 10 to 1000, CFQ scheduler only
LeafWeight *uint16 `json:"blkioLeafWeight,omitempty"`
// Weight per cgroup per device, can override BlkioWeight
WeightDevice []LinuxWeightDevice `json:"blkioWeightDevice,omitempty"`
// IO read rate limit per cgroup per device, bytes per second
ThrottleReadBpsDevice []LinuxThrottleDevice `json:"blkioThrottleReadBpsDevice,omitempty"`
// IO write rate limit per cgroup per device, bytes per second
ThrottleWriteBpsDevice []LinuxThrottleDevice `json:"blkioThrottleWriteBpsDevice,omitempty"`
// IO read rate limit per cgroup per device, IO per second
ThrottleReadIOPSDevice []LinuxThrottleDevice `json:"blkioThrottleReadIOPSDevice,omitempty"`
// IO write rate limit per cgroup per device, IO per second
ThrottleWriteIOPSDevice []LinuxThrottleDevice `json:"blkioThrottleWriteIOPSDevice,omitempty"`
}
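// Editor's illustrative sketch (not part of the vendored spec): throttle
// reads on the hypothetical device 8:0 to 10 MiB/s while giving the cgroup a
// mid-range blkio weight.
func exampleBlockIO() LinuxBlockIO {
	weight := uint16(500)
	return LinuxBlockIO{
		Weight: &weight,
		ThrottleReadBpsDevice: []LinuxThrottleDevice{
			{linuxBlockIODevice: linuxBlockIODevice{Major: 8, Minor: 0}, Rate: 10 * 1024 * 1024},
		},
	}
}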
// LinuxMemory for Linux cgroup 'memory' resource management
type LinuxMemory struct {
// Memory limit (in bytes).
Limit *uint64 `json:"limit,omitempty"`
// Memory reservation or soft_limit (in bytes).
Reservation *uint64 `json:"reservation,omitempty"`
// Total memory limit (memory + swap).
Swap *uint64 `json:"swap,omitempty"`
// Kernel memory limit (in bytes).
Kernel *uint64 `json:"kernel,omitempty"`
// Kernel memory limit for tcp (in bytes)
KernelTCP *uint64 `json:"kernelTCP,omitempty"`
// How aggressive the kernel will swap memory pages. Range from 0 to 100.
Swappiness *uint64 `json:"swappiness,omitempty"`
}
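// Editor's illustrative sketch (not part of the vendored spec): a 512 MiB
// hard limit with the memory+swap total pinned to the same value, which
// leaves no additional swap for the cgroup.
func exampleMemoryLimit() LinuxMemory {
	limit := uint64(512 * 1024 * 1024)
	return LinuxMemory{Limit: &limit, Swap: &limit}
}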
// LinuxCPU for Linux cgroup 'cpu' resource management
type LinuxCPU struct {
// CPU shares (relative weight (ratio) vs. other cgroups with cpu shares).
Shares *uint64 `json:"shares,omitempty"`
// CPU hardcap limit (in usecs). Allowed cpu time in a given period.
Quota *int64 `json:"quota,omitempty"`
// CPU period to be used for hardcapping (in usecs).
Period *uint64 `json:"period,omitempty"`
// How much time realtime scheduling may use (in usecs).
RealtimeRuntime *int64 `json:"realtimeRuntime,omitempty"`
// CPU period to be used for realtime scheduling (in usecs).
RealtimePeriod *uint64 `json:"realtimePeriod,omitempty"`
// CPUs to use within the cpuset. Default is to use any CPU available.
Cpus string `json:"cpus,omitempty"`
// List of memory nodes in the cpuset. Default is to use any available memory node.
Mems string `json:"mems,omitempty"`
}
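// Editor's illustrative sketch (not part of the vendored spec): Quota and
// Period express a hard cap, so 50000µs of runtime per 100000µs period is
// roughly half of one CPU; Cpus further pins the tasks to cores 0-1.
func exampleCPULimit() LinuxCPU {
	quota := int64(50000)
	period := uint64(100000)
	return LinuxCPU{Quota: &quota, Period: &period, Cpus: "0-1"}
}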
// LinuxPids for Linux cgroup 'pids' resource management (Linux 4.3)
type LinuxPids struct {
// Maximum number of PIDs. Default is "no limit".
Limit int64 `json:"limit"`
}
// LinuxNetwork identification and priority configuration
type LinuxNetwork struct {
// Set class identifier for container's network packets
ClassID *uint32 `json:"classID,omitempty"`
// Set priority of network traffic for container
Priorities []LinuxInterfacePriority `json:"priorities,omitempty"`
}
// LinuxResources has container runtime resource constraints
type LinuxResources struct {
// Devices configures the device whitelist.
Devices []LinuxDeviceCgroup `json:"devices,omitempty"`
// DisableOOMKiller disables the OOM killer for out of memory conditions
DisableOOMKiller *bool `json:"disableOOMKiller,omitempty"`
// Specify an oom_score_adj for the container.
OOMScoreAdj *int `json:"oomScoreAdj,omitempty"`
// Memory restriction configuration
Memory *LinuxMemory `json:"memory,omitempty"`
// CPU resource restriction configuration
CPU *LinuxCPU `json:"cpu,omitempty"`
// Task resource restriction configuration.
Pids *LinuxPids `json:"pids,omitempty"`
// BlockIO restriction configuration
BlockIO *LinuxBlockIO `json:"blockIO,omitempty"`
// Hugetlb limit (in bytes)
HugepageLimits []LinuxHugepageLimit `json:"hugepageLimits,omitempty"`
// Network restriction configuration
Network *LinuxNetwork `json:"network,omitempty"`
}
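// Editor's illustrative sketch (not part of the vendored spec): the
// per-controller pieces above assembled into one LinuxResources value with a
// memory cap and a PID limit.
func exampleResources() *LinuxResources {
	limit := uint64(256 * 1024 * 1024)
	return &LinuxResources{
		Memory: &LinuxMemory{Limit: &limit},
		Pids:   &LinuxPids{Limit: 1024},
	}
}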
// LinuxDevice represents the mknod information for a Linux special device file
type LinuxDevice struct {
// Path to the device.
Path string `json:"path"`
// Device type, block, char, etc.
Type string `json:"type"`
// Major is the device's major number.
Major int64 `json:"major"`
// Minor is the device's minor number.
Minor int64 `json:"minor"`
// FileMode permission bits for the device.
FileMode *os.FileMode `json:"fileMode,omitempty"`
// UID of the device.
UID *uint32 `json:"uid,omitempty"`
// GID of the device.
GID *uint32 `json:"gid,omitempty"`
}
// LinuxDeviceCgroup represents a device rule for the whitelist controller
type LinuxDeviceCgroup struct {
// Allow or deny
Allow bool `json:"allow"`
// Device type, block, char, etc.
Type string `json:"type,omitempty"`
// Major is the device's major number.
Major *int64 `json:"major,omitempty"`
// Minor is the device's minor number.
Minor *int64 `json:"minor,omitempty"`
// Cgroup access permissions format, rwm.
Access string `json:"access,omitempty"`
}
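// Editor's illustrative sketch (not part of the vendored spec): the usual
// pattern is a deny-all rule followed by narrow allow rules, here
// read/write/mknod access to the character device 1:3 (/dev/null). The helper
// variables exist only so the example can take addresses.
var (
	exampleNullMajor int64 = 1
	exampleNullMinor int64 = 3
)
var exampleDeviceRules = []LinuxDeviceCgroup{
	{Allow: false, Access: "rwm"}, // deny everything by default
	{Allow: true, Type: "c", Major: &exampleNullMajor, Minor: &exampleNullMinor, Access: "rwm"},
}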
// Solaris contains platform specific configuration for Solaris application containers.
type Solaris struct {
// SMF FMRI which should go "online" before we start the container process.
Milestone string `json:"milestone,omitempty"`
// Maximum set of privileges any process in this container can obtain.
LimitPriv string `json:"limitpriv,omitempty"`
// The maximum amount of shared memory allowed for this container.
MaxShmMemory string `json:"maxShmMemory,omitempty"`
// Specification for automatic creation of network resources for this container.
Anet []SolarisAnet `json:"anet,omitempty"`
// Set limit on the amount of CPU time that can be used by container.
CappedCPU *SolarisCappedCPU `json:"cappedCPU,omitempty"`
// The physical and swap caps on the memory that can be used by this container.
CappedMemory *SolarisCappedMemory `json:"cappedMemory,omitempty"`
}
// SolarisCappedCPU allows users to set limit on the amount of CPU time that can be used by container.
type SolarisCappedCPU struct {
Ncpus string `json:"ncpus,omitempty"`
}
// SolarisCappedMemory allows users to set the physical and swap caps on the memory that can be used by this container.
type SolarisCappedMemory struct {
Physical string `json:"physical,omitempty"`
Swap string `json:"swap,omitempty"`
}
// SolarisAnet provides the specification for automatic creation of network resources for this container.
type SolarisAnet struct {
// Specify a name for the automatically created VNIC datalink.
Linkname string `json:"linkname,omitempty"`
// Specify the link over which the VNIC will be created.
Lowerlink string `json:"lowerLink,omitempty"`
// The set of IP addresses that the container can use.
Allowedaddr string `json:"allowedAddress,omitempty"`
// Specifies whether allowedAddress limitation is to be applied to the VNIC.
Configallowedaddr string `json:"configureAllowedAddress,omitempty"`
// The value of the optional default router.
Defrouter string `json:"defrouter,omitempty"`
// Enable one or more types of link protection.
Linkprotection string `json:"linkProtection,omitempty"`
// Set the VNIC's macAddress
Macaddress string `json:"macAddress,omitempty"`
}
// Windows defines the runtime configuration for Windows based containers, including Hyper-V containers.
type Windows struct {
// Resources contains information for handling resource constraints for the container.
Resources *WindowsResources `json:"resources,omitempty"`
}
// WindowsResources has container runtime resource constraints for containers running on Windows.
type WindowsResources struct {
// Memory restriction configuration.
Memory *WindowsMemoryResources `json:"memory,omitempty"`
// CPU resource restriction configuration.
CPU *WindowsCPUResources `json:"cpu,omitempty"`
// Storage restriction configuration.
Storage *WindowsStorageResources `json:"storage,omitempty"`
// Network restriction configuration.
Network *WindowsNetworkResources `json:"network,omitempty"`
}
// WindowsMemoryResources contains memory resource management settings.
type WindowsMemoryResources struct {
// Memory limit in bytes.
Limit *uint64 `json:"limit,omitempty"`
// Memory reservation in bytes.
Reservation *uint64 `json:"reservation,omitempty"`
}
// WindowsCPUResources contains CPU resource management settings.
type WindowsCPUResources struct {
// Number of CPUs available to the container.
Count *uint64 `json:"count,omitempty"`
// CPU shares (relative weight to other containers with cpu shares). Range is from 1 to 10000.
Shares *uint16 `json:"shares,omitempty"`
// Percent of available CPUs usable by the container.
Percent *uint8 `json:"percent,omitempty"`
}
// WindowsStorageResources contains storage resource management settings.
type WindowsStorageResources struct {
// Specifies maximum Iops for the system drive.
Iops *uint64 `json:"iops,omitempty"`
// Specifies maximum bytes per second for the system drive.
Bps *uint64 `json:"bps,omitempty"`
// Sandbox size specifies the minimum size of the system drive in bytes.
SandboxSize *uint64 `json:"sandboxSize,omitempty"`
}
// WindowsNetworkResources contains network resource management settings.
type WindowsNetworkResources struct {
// EgressBandwidth is the maximum egress bandwidth in bytes per second.
EgressBandwidth *uint64 `json:"egressBandwidth,omitempty"`
}
// LinuxSeccomp represents syscall restrictions
type LinuxSeccomp struct {
DefaultAction LinuxSeccompAction `json:"defaultAction"`
Architectures []Arch `json:"architectures,omitempty"`
Syscalls []LinuxSyscall `json:"syscalls"`
}
// Arch used for additional architectures
type Arch string
// Additional architectures permitted to be used for system calls
// By default only the native architecture of the kernel is permitted
const (
ArchX86 Arch = "SCMP_ARCH_X86"
ArchX86_64 Arch = "SCMP_ARCH_X86_64"
ArchX32 Arch = "SCMP_ARCH_X32"
ArchARM Arch = "SCMP_ARCH_ARM"
ArchAARCH64 Arch = "SCMP_ARCH_AARCH64"
ArchMIPS Arch = "SCMP_ARCH_MIPS"
ArchMIPS64 Arch = "SCMP_ARCH_MIPS64"
ArchMIPS64N32 Arch = "SCMP_ARCH_MIPS64N32"
ArchMIPSEL Arch = "SCMP_ARCH_MIPSEL"
ArchMIPSEL64 Arch = "SCMP_ARCH_MIPSEL64"
ArchMIPSEL64N32 Arch = "SCMP_ARCH_MIPSEL64N32"
ArchPPC Arch = "SCMP_ARCH_PPC"
ArchPPC64 Arch = "SCMP_ARCH_PPC64"
ArchPPC64LE Arch = "SCMP_ARCH_PPC64LE"
ArchS390 Arch = "SCMP_ARCH_S390"
ArchS390X Arch = "SCMP_ARCH_S390X"
ArchPARISC Arch = "SCMP_ARCH_PARISC"
ArchPARISC64 Arch = "SCMP_ARCH_PARISC64"
)
// LinuxSeccompAction taken upon Seccomp rule match
type LinuxSeccompAction string
// Define actions for Seccomp rules
const (
ActKill LinuxSeccompAction = "SCMP_ACT_KILL"
ActTrap LinuxSeccompAction = "SCMP_ACT_TRAP"
ActErrno LinuxSeccompAction = "SCMP_ACT_ERRNO"
ActTrace LinuxSeccompAction = "SCMP_ACT_TRACE"
ActAllow LinuxSeccompAction = "SCMP_ACT_ALLOW"
)
// LinuxSeccompOperator used to match syscall arguments in Seccomp
type LinuxSeccompOperator string
// Define operators for syscall arguments in Seccomp
const (
OpNotEqual LinuxSeccompOperator = "SCMP_CMP_NE"
OpLessThan LinuxSeccompOperator = "SCMP_CMP_LT"
OpLessEqual LinuxSeccompOperator = "SCMP_CMP_LE"
OpEqualTo LinuxSeccompOperator = "SCMP_CMP_EQ"
OpGreaterEqual LinuxSeccompOperator = "SCMP_CMP_GE"
OpGreaterThan LinuxSeccompOperator = "SCMP_CMP_GT"
OpMaskedEqual LinuxSeccompOperator = "SCMP_CMP_MASKED_EQ"
)
// LinuxSeccompArg used for matching specific syscall arguments in Seccomp
type LinuxSeccompArg struct {
Index uint `json:"index"`
Value uint64 `json:"value"`
ValueTwo uint64 `json:"valueTwo"`
Op LinuxSeccompOperator `json:"op"`
}
// LinuxSyscall is used to match a syscall in Seccomp
type LinuxSyscall struct {
Names []string `json:"names"`
Action LinuxSeccompAction `json:"action"`
Args []LinuxSeccompArg `json:"args,omitempty"`
}
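// Editor's illustrative sketch (not part of the vendored spec): an
// allowlist-style profile that returns an errno for every syscall except the
// listed ones on x86-64; the syscall names are arbitrary examples.
var exampleSeccomp = LinuxSeccomp{
	DefaultAction: ActErrno,
	Architectures: []Arch{ArchX86_64},
	Syscalls: []LinuxSyscall{
		{Names: []string{"read", "write", "exit_group", "futex"}, Action: ActAllow},
	},
}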
// LinuxIntelRdt has container runtime resource constraints
// for Intel RDT/CAT, which was introduced in the Linux 4.10 kernel
type LinuxIntelRdt struct {
// The schema for L3 cache id and capacity bitmask (CBM)
// Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
L3CacheSchema string `json:"l3CacheSchema,omitempty"`
}
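// Editor's illustrative sketch (not part of the vendored spec): an L3 cache
// allocation schema in the format documented above, restricting cache id 0 to
// mask f0 and cache id 1 to mask 0f (the masks are arbitrary examples).
var exampleIntelRdt = LinuxIntelRdt{L3CacheSchema: "L3:0=f0;1=0f"}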

View File

@ -0,0 +1,17 @@
package specs
// State holds information about the runtime state of the container.
type State struct {
// Version is the version of the specification that is supported.
Version string `json:"ociVersion"`
// ID is the container ID
ID string `json:"id"`
// Status is the runtime status of the container.
Status string `json:"status"`
// Pid is the process ID for the container process.
Pid int `json:"pid"`
// Bundle is the path to the container's bundle directory.
Bundle string `json:"bundle"`
// Annotations are key values associated with the container.
Annotations map[string]string `json:"annotations,omitempty"`
}
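// Editor's illustrative sketch (not part of the vendored spec): a State value
// as a runtime might report it for a running container; the ID, PID and
// bundle path are made-up examples.
var exampleState = State{
	Version: Version,
	ID:      "example-container",
	Status:  "running",
	Pid:     4242,
	Bundle:  "/run/containers/example-container",
}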

View File

@ -0,0 +1,18 @@
package specs
import "fmt"
const (
// VersionMajor is for API-incompatible changes
VersionMajor = 1
// VersionMinor is for functionality added in a backwards-compatible manner
VersionMinor = 0
// VersionPatch is for backwards-compatible bug fixes
VersionPatch = 0
// VersionDev indicates development branch. Releases will be empty string.
VersionDev = "-rc5-dev"
)
// Version is the specification version that the package types support.
var Version = fmt.Sprintf("%d.%d.%d%s", VersionMajor, VersionMinor, VersionPatch, VersionDev)
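// Editor's note (not in the upstream file): with the constants above this
// evaluates to "1.0.0-rc5-dev"; a tagged release sets VersionDev to "" and
// yields plain "1.0.0".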

View File

@ -10,42 +10,42 @@ package capability
type Capabilities interface {
// Get check whether a capability present in the given
// capabilities set. The 'which' value should be one of EFFECTIVE,
-// PERMITTED, INHERITABLE or BOUNDING.
+// PERMITTED, INHERITABLE, BOUNDING or AMBIENT.
Get(which CapType, what Cap) bool
// Empty check whether all capability bits of the given capabilities
// set are zero. The 'which' value should be one of EFFECTIVE,
-// PERMITTED, INHERITABLE or BOUNDING.
+// PERMITTED, INHERITABLE, BOUNDING or AMBIENT.
Empty(which CapType) bool
// Full check whether all capability bits of the given capabilities
// set are one. The 'which' value should be one of EFFECTIVE,
-// PERMITTED, INHERITABLE or BOUNDING.
+// PERMITTED, INHERITABLE, BOUNDING or AMBIENT.
Full(which CapType) bool
// Set sets capabilities of the given capabilities sets. The
// 'which' value should be one or combination (OR'ed) of EFFECTIVE,
-// PERMITTED, INHERITABLE or BOUNDING.
+// PERMITTED, INHERITABLE, BOUNDING or AMBIENT.
Set(which CapType, caps ...Cap)
// Unset unsets capabilities of the given capabilities sets. The
// 'which' value should be one or combination (OR'ed) of EFFECTIVE,
-// PERMITTED, INHERITABLE or BOUNDING.
+// PERMITTED, INHERITABLE, BOUNDING or AMBIENT.
Unset(which CapType, caps ...Cap)
// Fill sets all bits of the given capabilities kind to one. The
-// 'kind' value should be one or combination (OR'ed) of CAPS or
-// BOUNDS.
+// 'kind' value should be one or combination (OR'ed) of CAPS,
+// BOUNDS or AMBS.
Fill(kind CapType)
// Clear sets all bits of the given capabilities kind to zero. The
-// 'kind' value should be one or combination (OR'ed) of CAPS or
-// BOUNDS.
+// 'kind' value should be one or combination (OR'ed) of CAPS,
+// BOUNDS or AMBS.
Clear(kind CapType)
// String return current capabilities state of the given capabilities
// set as string. The 'which' value should be one of EFFECTIVE,
-// PERMITTED, INHERITABLE or BOUNDING.
+// PERMITTED, INHERITABLE BOUNDING or AMBIENT
StringCap(which CapType) string
// String return current capabilities state as string.

View File

@ -238,6 +238,7 @@ type capsV3 struct {
hdr capHeader
data [2]capData
bounds [2]uint32
ambient [2]uint32
}
func (c *capsV3) Get(which CapType, what Cap) bool {
@ -256,6 +257,8 @@ func (c *capsV3) Get(which CapType, what Cap) bool {
return (1<<uint(what))&c.data[i].inheritable != 0
case BOUNDING:
return (1<<uint(what))&c.bounds[i] != 0
case AMBIENT:
return (1<<uint(what))&c.ambient[i] != 0
}
return false
@ -275,6 +278,9 @@ func (c *capsV3) getData(which CapType, dest []uint32) {
case BOUNDING:
dest[0] = c.bounds[0]
dest[1] = c.bounds[1]
case AMBIENT:
dest[0] = c.ambient[0]
dest[1] = c.ambient[1]
}
}
@ -313,6 +319,9 @@ func (c *capsV3) Set(which CapType, caps ...Cap) {
if which&BOUNDING != 0 {
c.bounds[i] |= 1 << uint(what)
}
if which&AMBIENT != 0 {
c.ambient[i] |= 1 << uint(what)
}
}
}
@ -336,6 +345,9 @@ func (c *capsV3) Unset(which CapType, caps ...Cap) {
if which&BOUNDING != 0 {
c.bounds[i] &= ^(1 << uint(what))
}
if which&AMBIENT != 0 {
c.ambient[i] &= ^(1 << uint(what))
}
}
}
@ -353,6 +365,10 @@ func (c *capsV3) Fill(kind CapType) {
c.bounds[0] = 0xffffffff
c.bounds[1] = 0xffffffff
}
if kind&AMBS == AMBS {
c.ambient[0] = 0xffffffff
c.ambient[1] = 0xffffffff
}
}
func (c *capsV3) Clear(kind CapType) {
@ -369,6 +385,10 @@ func (c *capsV3) Clear(kind CapType) {
c.bounds[0] = 0
c.bounds[1] = 0
}
if kind&AMBS == AMBS {
c.ambient[0] = 0
c.ambient[1] = 0
}
}
func (c *capsV3) StringCap(which CapType) (ret string) {
@ -410,6 +430,10 @@ func (c *capsV3) Load() (err error) {
fmt.Sscanf(line[4:], "nd: %08x%08x", &c.bounds[1], &c.bounds[0])
break
}
if strings.HasPrefix(line, "CapA") {
fmt.Sscanf(line[4:], "mb: %08x%08x", &c.ambient[1], &c.ambient[0])
break
}
}
f.Close()
@ -442,7 +466,25 @@ func (c *capsV3) Apply(kind CapType) (err error) {
}
if kind&CAPS == CAPS {
-return capset(&c.hdr, &c.data[0])
+err = capset(&c.hdr, &c.data[0])
if err != nil {
return
}
}
if kind&AMBS == AMBS {
for i := Cap(0); i <= CAP_LAST_CAP; i++ {
action := pr_CAP_AMBIENT_LOWER
if c.Get(AMBIENT, i) {
action = pr_CAP_AMBIENT_RAISE
}
err := prctl(pr_CAP_AMBIENT, action, uintptr(i), 0, 0)
// Ignore EINVAL as not supported on kernels before 4.3
if errno, ok := err.(syscall.Errno); ok && errno == syscall.EINVAL {
err = nil
continue
}
}
}
return

View File

@ -20,6 +20,8 @@ func (c CapType) String() string {
return "bounding"
case CAPS:
return "caps"
case AMBIENT:
return "ambient"
}
return "unknown"
}
@ -29,9 +31,11 @@ const (
PERMITTED
INHERITABLE
BOUNDING
AMBIENT
CAPS = EFFECTIVE | PERMITTED | INHERITABLE
BOUNDS = BOUNDING
AMBS = AMBIENT
)
//go:generate go run enumgen/gen.go

View File

@ -38,6 +38,15 @@ func capset(hdr *capHeader, data *capData) (err error) {
return
}
// not yet in syscall
const (
pr_CAP_AMBIENT = 47
pr_CAP_AMBIENT_IS_SET = uintptr(1)
pr_CAP_AMBIENT_RAISE = uintptr(2)
pr_CAP_AMBIENT_LOWER = uintptr(3)
pr_CAP_AMBIENT_CLEAR_ALL = uintptr(4)
)
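// Editor's illustrative sketch (not part of the vendored package): the
// constants above map onto prctl(2). Raising a single ambient capability
// looks roughly like this; the kernel accepts it only if the capability is
// already in the process's permitted and inheritable sets, and only on
// kernels >= 4.3.
func exampleRaiseAmbient(c Cap) error {
	return prctl(pr_CAP_AMBIENT, pr_CAP_AMBIENT_RAISE, uintptr(c), 0, 0)
}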
func prctl(option int, arg2, arg3, arg4, arg5 uintptr) (err error) {
_, _, e1 := syscall.Syscall6(syscall.SYS_PRCTL, uintptr(option), arg2, arg3, arg4, arg5, 0)
if e1 != 0 {