From 749f73148b4ea812ee4ed112b413eea3b530a864 Mon Sep 17 00:00:00 2001 From: Seth Jennings Date: Tue, 22 Aug 2017 16:06:01 -0500 Subject: [PATCH 1/2] bump(github.com/opencontainers/runc/libcontainer): 4d6225ae --- Godeps/Godeps.json | 80 ++- .../github.com/mrunalp/fileutils/.gitignore | 1 + vendor/github.com/mrunalp/fileutils/LICENSE | 191 ++++++ .../github.com/mrunalp/fileutils/MAINTAINERS | 1 + vendor/github.com/mrunalp/fileutils/README.md | 5 + .../github.com/mrunalp/fileutils/fileutils.go | 161 +++++ .../github.com/mrunalp/fileutils/idtools.go | 49 ++ .../runc/libcontainer/README.md | 20 +- .../runc/libcontainer/capabilities_linux.go | 87 ++- .../runc/libcontainer/cgroups/cgroups.go | 4 +- .../runc/libcontainer/cgroups/fs/apply_raw.go | 46 +- .../runc/libcontainer/cgroups/fs/cpu.go | 6 +- .../runc/libcontainer/cgroups/fs/cpuset.go | 26 +- .../runc/libcontainer/cgroups/fs/memory.go | 80 ++- .../libcontainer/cgroups/rootless/rootless.go | 128 ++++ .../runc/libcontainer/cgroups/stats.go | 2 + .../cgroups/systemd/apply_systemd.go | 108 +++- .../runc/libcontainer/cgroups/utils.go | 89 +-- .../runc/libcontainer/configs/cgroup_unix.go | 22 +- .../runc/libcontainer/configs/config.go | 34 +- .../runc/libcontainer/configs/config_unix.go | 40 +- .../runc/libcontainer/configs/mount.go | 9 + .../configs/namespaces_syscall_unsupported.go | 2 - .../libcontainer/configs/namespaces_unix.go | 8 +- .../libcontainer/configs/validate/rootless.go | 117 ++++ .../configs/validate/validator.go | 57 ++ .../runc/libcontainer/console.go | 10 +- .../runc/libcontainer/console_freebsd.go | 4 +- .../runc/libcontainer/console_linux.go | 69 ++- .../runc/libcontainer/console_solaris.go | 4 +- .../runc/libcontainer/console_windows.go | 6 +- .../runc/libcontainer/container.go | 15 +- .../runc/libcontainer/container_linux.go | 341 ++++++++--- .../runc/libcontainer/criu_opts_unix.go | 2 + .../opencontainers/runc/libcontainer/error.go | 6 +- .../runc/libcontainer/factory_linux.go | 109 ++-- .../runc/libcontainer/generic_error.go | 14 - .../runc/libcontainer/init_linux.go | 260 ++++++-- .../runc/libcontainer/label/label.go | 4 + .../runc/libcontainer/label/label_selinux.go | 8 +- .../runc/libcontainer/message_linux.go | 12 +- .../runc/libcontainer/process.go | 27 +- .../runc/libcontainer/process_linux.go | 109 ++-- .../runc/libcontainer/rootfs_linux.go | 240 +++++--- .../libcontainer/seccomp/seccomp_linux.go | 8 +- .../runc/libcontainer/selinux/selinux.go | 138 +++-- .../runc/libcontainer/setns_init_linux.go | 12 +- .../runc/libcontainer/standard_init_linux.go | 56 +- .../runc/libcontainer/state_linux.go | 20 +- .../opencontainers/runc/libcontainer/sync.go | 107 ++++ .../runc/libcontainer/system/proc.go | 20 +- .../runc/libcontainer/user/user.go | 28 +- .../runc/libcontainer/utils/cmsg.c | 148 +++++ .../runc/libcontainer/utils/cmsg.go | 57 ++ .../runc/libcontainer/utils/cmsg.h | 36 ++ .../runc/libcontainer/utils/utils.go | 7 +- .../runc/libcontainer/utils/utils_unix.go | 10 + .../opencontainers/runtime-spec/LICENSE | 191 ++++++ .../runtime-spec/specs-go/config.go | 563 ++++++++++++++++++ .../runtime-spec/specs-go/state.go | 17 + .../runtime-spec/specs-go/version.go | 18 + .../gocapability/capability/capability.go | 20 +- .../capability/capability_linux.go | 50 +- .../syndtr/gocapability/capability/enum.go | 4 + .../gocapability/capability/syscall_linux.go | 9 + 65 files changed, 3370 insertions(+), 762 deletions(-) create mode 100644 vendor/github.com/mrunalp/fileutils/.gitignore create mode 100644 vendor/github.com/mrunalp/fileutils/LICENSE create mode 100644 vendor/github.com/mrunalp/fileutils/MAINTAINERS create mode 100644 vendor/github.com/mrunalp/fileutils/README.md create mode 100644 vendor/github.com/mrunalp/fileutils/fileutils.go create mode 100644 vendor/github.com/mrunalp/fileutils/idtools.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/cgroups/rootless/rootless.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/sync.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.c create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.h create mode 100644 vendor/github.com/opencontainers/runtime-spec/LICENSE create mode 100644 vendor/github.com/opencontainers/runtime-spec/specs-go/config.go create mode 100644 vendor/github.com/opencontainers/runtime-spec/specs-go/state.go create mode 100644 vendor/github.com/opencontainers/runtime-spec/specs-go/version.go diff --git a/Godeps/Godeps.json b/Godeps/Godeps.json index 6920c966..c9958791 100644 --- a/Godeps/Godeps.json +++ b/Godeps/Godeps.json @@ -380,85 +380,99 @@ "Comment": "v2.1.1-5-g1b4ae6f", "Rev": "1b4ae6fb4e77b095934d4430860ff202060169f8" }, + { + "ImportPath": "github.com/mrunalp/fileutils", + "Rev": "4ee1cc9a80582a0c75febdd5cfa779ee4361cbca" + }, { "ImportPath": "github.com/opencontainers/runc/libcontainer", - "Comment": "v1.0.0-rc1-224-g5653ced", - "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013" + "Comment": "v1.0.0-rc3-15-g4d6225ae", + "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/apparmor", - "Comment": "v1.0.0-rc1-224-g5653ced", - "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013" + "Comment": "v1.0.0-rc3-15-g4d6225ae", + "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups", - "Comment": "v1.0.0-rc1-224-g5653ced", - "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013" + "Comment": "v1.0.0-rc3-15-g4d6225ae", + "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/fs", - "Comment": "v1.0.0-rc1-224-g5653ced", - "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013" + "Comment": "v1.0.0-rc3-15-g4d6225ae", + "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f" + }, + { + "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/rootless", + "Comment": "v1.0.0-rc3-15-g4d6225ae", + "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/systemd", - "Comment": "v1.0.0-rc1-224-g5653ced", - "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013" + "Comment": "v1.0.0-rc3-15-g4d6225ae", + "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/configs", - "Comment": "v1.0.0-rc1-224-g5653ced", - "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013" + "Comment": "v1.0.0-rc3-15-g4d6225ae", + "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/configs/validate", - "Comment": "v1.0.0-rc1-224-g5653ced", - "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013" + "Comment": "v1.0.0-rc3-15-g4d6225ae", + "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/criurpc", - "Comment": "v1.0.0-rc1-224-g5653ced", - "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013" + "Comment": "v1.0.0-rc3-15-g4d6225ae", + "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/keys", - "Comment": "v1.0.0-rc1-224-g5653ced", - "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013" + "Comment": "v1.0.0-rc3-15-g4d6225ae", + "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/label", - "Comment": "v1.0.0-rc1-224-g5653ced", - "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013" + "Comment": "v1.0.0-rc3-15-g4d6225ae", + "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/seccomp", - "Comment": "v1.0.0-rc1-224-g5653ced", - "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013" + "Comment": "v1.0.0-rc3-15-g4d6225ae", + "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/selinux", - "Comment": "v1.0.0-rc1-224-g5653ced", - "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013" + "Comment": "v1.0.0-rc3-15-g4d6225ae", + "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/stacktrace", - "Comment": "v1.0.0-rc1-224-g5653ced", - "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013" + "Comment": "v1.0.0-rc3-15-g4d6225ae", + "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/system", - "Comment": "v1.0.0-rc1-224-g5653ced", - "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013" + "Comment": "v1.0.0-rc3-15-g4d6225ae", + "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/user", - "Comment": "v1.0.0-rc1-224-g5653ced", - "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013" + "Comment": "v1.0.0-rc3-15-g4d6225ae", + "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f" }, { "ImportPath": "github.com/opencontainers/runc/libcontainer/utils", - "Comment": "v1.0.0-rc1-224-g5653ced", - "Rev": "5653ced544b1fa9d9623f12aaf9601bf1aefc013" + "Comment": "v1.0.0-rc3-15-g4d6225ae", + "Rev": "4d6225aec25991f62fb37e5fec0ec5853c660f9f" + }, + { + "ImportPath": "github.com/opencontainers/runtime-spec/specs-go", + "Comment": "v1.0.0-rc5-67-gf227620", + "Rev": "f2276206b32ad0c2478bfc6440ceb7d51d815cf8" }, { "ImportPath": "github.com/pborman/uuid", @@ -513,7 +527,7 @@ }, { "ImportPath": "github.com/syndtr/gocapability/capability", - "Rev": "2c00daeb6c3b45114c80ac44119e7b8801fdd852" + "Rev": "e7cb7fa329f456b3855136a2642b197bad7366ba" }, { "ImportPath": "github.com/vishvananda/netlink", diff --git a/vendor/github.com/mrunalp/fileutils/.gitignore b/vendor/github.com/mrunalp/fileutils/.gitignore new file mode 100644 index 00000000..aac977bc --- /dev/null +++ b/vendor/github.com/mrunalp/fileutils/.gitignore @@ -0,0 +1 @@ +/gocp diff --git a/vendor/github.com/mrunalp/fileutils/LICENSE b/vendor/github.com/mrunalp/fileutils/LICENSE new file mode 100644 index 00000000..27448585 --- /dev/null +++ b/vendor/github.com/mrunalp/fileutils/LICENSE @@ -0,0 +1,191 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + Copyright 2014 Docker, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/vendor/github.com/mrunalp/fileutils/MAINTAINERS b/vendor/github.com/mrunalp/fileutils/MAINTAINERS new file mode 100644 index 00000000..4a2cafa5 --- /dev/null +++ b/vendor/github.com/mrunalp/fileutils/MAINTAINERS @@ -0,0 +1 @@ +Mrunal Patel (@mrunalp) diff --git a/vendor/github.com/mrunalp/fileutils/README.md b/vendor/github.com/mrunalp/fileutils/README.md new file mode 100644 index 00000000..6cb4140e --- /dev/null +++ b/vendor/github.com/mrunalp/fileutils/README.md @@ -0,0 +1,5 @@ +# fileutils + +Collection of utilities for file manipulation in golang + +The library is based on docker pkg/archive pkg/idtools but does copies instead of handling archive formats. diff --git a/vendor/github.com/mrunalp/fileutils/fileutils.go b/vendor/github.com/mrunalp/fileutils/fileutils.go new file mode 100644 index 00000000..b60cb909 --- /dev/null +++ b/vendor/github.com/mrunalp/fileutils/fileutils.go @@ -0,0 +1,161 @@ +package fileutils + +import ( + "fmt" + "io" + "os" + "path/filepath" + "syscall" +) + +// CopyFile copies the file at source to dest +func CopyFile(source string, dest string) error { + si, err := os.Lstat(source) + if err != nil { + return err + } + + st, ok := si.Sys().(*syscall.Stat_t) + if !ok { + return fmt.Errorf("could not convert to syscall.Stat_t") + } + + uid := int(st.Uid) + gid := int(st.Gid) + + // Handle symlinks + if si.Mode()&os.ModeSymlink != 0 { + target, err := os.Readlink(source) + if err != nil { + return err + } + if err := os.Symlink(target, dest); err != nil { + return err + } + } + + // Handle device files + if st.Mode&syscall.S_IFMT == syscall.S_IFBLK || st.Mode&syscall.S_IFMT == syscall.S_IFCHR { + devMajor := int64(major(uint64(st.Rdev))) + devMinor := int64(minor(uint64(st.Rdev))) + mode := uint32(si.Mode() & 07777) + if st.Mode&syscall.S_IFMT == syscall.S_IFBLK { + mode |= syscall.S_IFBLK + } + if st.Mode&syscall.S_IFMT == syscall.S_IFCHR { + mode |= syscall.S_IFCHR + } + if err := syscall.Mknod(dest, mode, int(mkdev(devMajor, devMinor))); err != nil { + return err + } + } + + // Handle regular files + if si.Mode().IsRegular() { + sf, err := os.Open(source) + if err != nil { + return err + } + defer sf.Close() + + df, err := os.Create(dest) + if err != nil { + return err + } + defer df.Close() + + _, err = io.Copy(df, sf) + if err != nil { + return err + } + } + + // Chown the file + if err := os.Lchown(dest, uid, gid); err != nil { + return err + } + + // Chmod the file + if !(si.Mode()&os.ModeSymlink == os.ModeSymlink) { + if err := os.Chmod(dest, si.Mode()); err != nil { + return err + } + } + + return nil +} + +// CopyDirectory copies the files under the source directory +// to dest directory. The dest directory is created if it +// does not exist. +func CopyDirectory(source string, dest string) error { + fi, err := os.Stat(source) + if err != nil { + return err + } + + // Get owner. + st, ok := fi.Sys().(*syscall.Stat_t) + if !ok { + return fmt.Errorf("could not convert to syscall.Stat_t") + } + + // We have to pick an owner here anyway. + if err := MkdirAllNewAs(dest, fi.Mode(), int(st.Uid), int(st.Gid)); err != nil { + return err + } + + return filepath.Walk(source, func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + + // Get the relative path + relPath, err := filepath.Rel(source, path) + if err != nil { + return nil + } + + if info.IsDir() { + // Skip the source directory. + if path != source { + // Get the owner. + st, ok := info.Sys().(*syscall.Stat_t) + if !ok { + return fmt.Errorf("could not convert to syscall.Stat_t") + } + + uid := int(st.Uid) + gid := int(st.Gid) + + if err := os.Mkdir(filepath.Join(dest, relPath), info.Mode()); err != nil { + return err + } + + if err := os.Lchown(filepath.Join(dest, relPath), uid, gid); err != nil { + return err + } + } + return nil + } + + // Copy the file. + if err := CopyFile(path, filepath.Join(dest, relPath)); err != nil { + return err + } + + return nil + }) +} + +func major(device uint64) uint64 { + return (device >> 8) & 0xfff +} + +func minor(device uint64) uint64 { + return (device & 0xff) | ((device >> 12) & 0xfff00) +} + +func mkdev(major int64, minor int64) uint32 { + return uint32(((minor & 0xfff00) << 12) | ((major & 0xfff) << 8) | (minor & 0xff)) +} diff --git a/vendor/github.com/mrunalp/fileutils/idtools.go b/vendor/github.com/mrunalp/fileutils/idtools.go new file mode 100644 index 00000000..161aec8f --- /dev/null +++ b/vendor/github.com/mrunalp/fileutils/idtools.go @@ -0,0 +1,49 @@ +package fileutils + +import ( + "os" + "path/filepath" +) + +// MkdirAllNewAs creates a directory (include any along the path) and then modifies +// ownership ONLY of newly created directories to the requested uid/gid. If the +// directories along the path exist, no change of ownership will be performed +func MkdirAllNewAs(path string, mode os.FileMode, ownerUID, ownerGID int) error { + // make an array containing the original path asked for, plus (for mkAll == true) + // all path components leading up to the complete path that don't exist before we MkdirAll + // so that we can chown all of them properly at the end. If chownExisting is false, we won't + // chown the full directory path if it exists + var paths []string + if _, err := os.Stat(path); err != nil && os.IsNotExist(err) { + paths = []string{path} + } else if err == nil { + // nothing to do; directory path fully exists already + return nil + } + + // walk back to "/" looking for directories which do not exist + // and add them to the paths array for chown after creation + dirPath := path + for { + dirPath = filepath.Dir(dirPath) + if dirPath == "/" { + break + } + if _, err := os.Stat(dirPath); err != nil && os.IsNotExist(err) { + paths = append(paths, dirPath) + } + } + + if err := os.MkdirAll(path, mode); err != nil && !os.IsExist(err) { + return err + } + + // even if it existed, we will chown the requested path + any subpaths that + // didn't exist when we called MkdirAll + for _, pathComponent := range paths { + if err := os.Chown(pathComponent, ownerUID, ownerGID); err != nil { + return err + } + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/README.md b/vendor/github.com/opencontainers/runc/libcontainer/README.md index 457b132e..d2a7d788 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/README.md +++ b/vendor/github.com/opencontainers/runc/libcontainer/README.md @@ -1,3 +1,7 @@ +# libcontainer + +[![GoDoc](https://godoc.org/github.com/opencontainers/runc/libcontainer?status.svg)](https://godoc.org/github.com/opencontainers/runc/libcontainer) + Libcontainer provides a native Go implementation for creating containers with namespaces, cgroups, capabilities, and filesystem access controls. It allows you to manage the lifecycle of the container performing additional operations @@ -16,7 +20,14 @@ the current binary (/proc/self/exe) to be executed as the init process, and use arg "init", we call the first step process "bootstrap", so you always need a "init" function as the entry of "bootstrap". +In addition to the go init function the early stage bootstrap is handled by importing +[nsenter](https://github.com/opencontainers/runc/blob/master/libcontainer/nsenter/README.md). + ```go +import ( + _ "github.com/opencontainers/runc/libcontainer/nsenter" +) + func init() { if len(os.Args) > 1 && os.Args[1] == "init" { runtime.GOMAXPROCS(1) @@ -83,6 +94,7 @@ config := &configs.Config{ }, MaskPaths: []string{ "/proc/kcore", + "/sys/firmware", }, ReadonlyPaths: []string{ "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus", @@ -184,7 +196,7 @@ process := &libcontainer.Process{ Stderr: os.Stderr, } -err := container.Start(process) +err := container.Run(process) if err != nil { container.Destroy() logrus.Fatal(err) @@ -222,6 +234,12 @@ container.Signal(signal) // update container resource constraints. container.Set(config) + +// get current status of the container. +status, err := container.Status() + +// get current container's state information. +state, err := container.State() ``` diff --git a/vendor/github.com/opencontainers/runc/libcontainer/capabilities_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/capabilities_linux.go index 4eda56d1..8981b2a2 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/capabilities_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/capabilities_linux.go @@ -7,10 +7,11 @@ import ( "os" "strings" + "github.com/opencontainers/runc/libcontainer/configs" "github.com/syndtr/gocapability/capability" ) -const allCapabilityTypes = capability.CAPS | capability.BOUNDS +const allCapabilityTypes = capability.CAPS | capability.BOUNDS | capability.AMBS var capabilityMap map[string]capability.Cap @@ -30,40 +31,84 @@ func init() { } } -func newCapWhitelist(caps []string) (*whitelist, error) { - l := []capability.Cap{} - for _, c := range caps { +func newContainerCapList(capConfig *configs.Capabilities) (*containerCapabilities, error) { + bounding := []capability.Cap{} + for _, c := range capConfig.Bounding { v, ok := capabilityMap[c] if !ok { return nil, fmt.Errorf("unknown capability %q", c) } - l = append(l, v) + bounding = append(bounding, v) + } + effective := []capability.Cap{} + for _, c := range capConfig.Effective { + v, ok := capabilityMap[c] + if !ok { + return nil, fmt.Errorf("unknown capability %q", c) + } + effective = append(effective, v) + } + inheritable := []capability.Cap{} + for _, c := range capConfig.Inheritable { + v, ok := capabilityMap[c] + if !ok { + return nil, fmt.Errorf("unknown capability %q", c) + } + inheritable = append(inheritable, v) + } + permitted := []capability.Cap{} + for _, c := range capConfig.Permitted { + v, ok := capabilityMap[c] + if !ok { + return nil, fmt.Errorf("unknown capability %q", c) + } + permitted = append(permitted, v) + } + ambient := []capability.Cap{} + for _, c := range capConfig.Ambient { + v, ok := capabilityMap[c] + if !ok { + return nil, fmt.Errorf("unknown capability %q", c) + } + ambient = append(ambient, v) } pid, err := capability.NewPid(os.Getpid()) if err != nil { return nil, err } - return &whitelist{ - keep: l, - pid: pid, + return &containerCapabilities{ + bounding: bounding, + effective: effective, + inheritable: inheritable, + permitted: permitted, + ambient: ambient, + pid: pid, }, nil } -type whitelist struct { - pid capability.Capabilities - keep []capability.Cap +type containerCapabilities struct { + pid capability.Capabilities + bounding []capability.Cap + effective []capability.Cap + inheritable []capability.Cap + permitted []capability.Cap + ambient []capability.Cap } -// dropBoundingSet drops the capability bounding set to those specified in the whitelist. -func (w *whitelist) dropBoundingSet() error { - w.pid.Clear(capability.BOUNDS) - w.pid.Set(capability.BOUNDS, w.keep...) - return w.pid.Apply(capability.BOUNDS) +// ApplyBoundingSet sets the capability bounding set to those specified in the whitelist. +func (c *containerCapabilities) ApplyBoundingSet() error { + c.pid.Clear(capability.BOUNDS) + c.pid.Set(capability.BOUNDS, c.bounding...) + return c.pid.Apply(capability.BOUNDS) } -// drop drops all capabilities for the current process except those specified in the whitelist. -func (w *whitelist) drop() error { - w.pid.Clear(allCapabilityTypes) - w.pid.Set(allCapabilityTypes, w.keep...) - return w.pid.Apply(allCapabilityTypes) +// Apply sets all the capabilities for the current process in the config. +func (c *containerCapabilities) ApplyCaps() error { + c.pid.Clear(allCapabilityTypes) + c.pid.Set(capability.BOUNDS, c.bounding...) + c.pid.Set(capability.PERMITTED, c.permitted...) + c.pid.Set(capability.INHERITABLE, c.inheritable...) + c.pid.Set(capability.EFFECTIVE, c.effective...) + c.pid.Set(capability.AMBIENT, c.ambient...) + return c.pid.Apply(allCapabilityTypes) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go index 35fc8eb9..25ff5158 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go @@ -27,9 +27,9 @@ type Manager interface { // Destroys the cgroup set Destroy() error - // NewCgroupManager() and LoadCgroupManager() require following attributes: + // The option func SystemdCgroups() and Cgroupfs() require following attributes: // Paths map[string]string - // Cgroups *cgroups.Cgroup + // Cgroups *configs.Cgroup // Paths maps cgroup subsystem to path at which it is mounted. // Cgroups specifies specific cgroup settings for the various subsystems diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go index 9692e4fb..22d82acb 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go @@ -114,8 +114,8 @@ func (m *Manager) Apply(pid int) (err error) { return err } + m.Paths = make(map[string]string) if c.Paths != nil { - paths := make(map[string]string) for name, path := range c.Paths { _, err := d.path(name) if err != nil { @@ -124,17 +124,12 @@ func (m *Manager) Apply(pid int) (err error) { } return err } - paths[name] = path + m.Paths[name] = path } - m.Paths = paths return cgroups.EnterPid(m.Paths, pid) } - paths := make(map[string]string) for _, sys := range subsystems { - if err := sys.Apply(d); err != nil { - return err - } // TODO: Apply should, ideally, be reentrant or be broken up into a separate // create and join phase so that the cgroup hierarchy for a container can be // created then join consists of writing the process pids to cgroup.procs @@ -147,9 +142,12 @@ func (m *Manager) Apply(pid int) (err error) { } return err } - paths[sys.Name()] = p + m.Paths[sys.Name()] = p + + if err := sys.Apply(d); err != nil { + return err + } } - m.Paths = paths return nil } @@ -269,25 +267,8 @@ func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) { }, nil } -func (raw *cgroupData) parentPath(subsystem, mountpoint, root string) (string, error) { - // Use GetThisCgroupDir instead of GetInitCgroupDir, because the creating - // process could in container and shared pid namespace with host, and - // /proc/1/cgroup could point to whole other world of cgroups. - initPath, err := cgroups.GetThisCgroupDir(subsystem) - if err != nil { - return "", err - } - // This is needed for nested containers, because in /proc/self/cgroup we - // see pathes from host, which don't exist in container. - relDir, err := filepath.Rel(root, initPath) - if err != nil { - return "", err - } - return filepath.Join(mountpoint, relDir), nil -} - func (raw *cgroupData) path(subsystem string) (string, error) { - mnt, root, err := cgroups.FindCgroupMountpointAndRoot(subsystem) + mnt, err := cgroups.FindCgroupMountpoint(subsystem) // If we didn't mount the subsystem, there is no point we make the path. if err != nil { return "", err @@ -295,11 +276,14 @@ func (raw *cgroupData) path(subsystem string) (string, error) { // If the cgroup name/path is absolute do not look relative to the cgroup of the init process. if filepath.IsAbs(raw.innerPath) { - // Sometimes subsystems can be mounted togethger as 'cpu,cpuacct'. + // Sometimes subsystems can be mounted together as 'cpu,cpuacct'. return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil } - parentPath, err := raw.parentPath(subsystem, mnt, root) + // Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating + // process could in container and shared pid namespace with host, and + // /proc/1/cgroup could point to whole other world of cgroups. + parentPath, err := cgroups.GetOwnCgroupPath(subsystem) if err != nil { return "", err } @@ -348,8 +332,8 @@ func removePath(p string, err error) error { return nil } -func CheckCpushares(path string, c int64) error { - var cpuShares int64 +func CheckCpushares(path string, c uint64) error { + var cpuShares uint64 if c == 0 { return nil diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go index 7cd506a8..b712bd0b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go @@ -55,7 +55,7 @@ func (s *CpuGroup) ApplyDir(path string, cgroup *configs.Cgroup, pid int) error func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error { if cgroup.Resources.CpuRtPeriod != 0 { - if err := writeFile(path, "cpu.rt_period_us", strconv.FormatInt(cgroup.Resources.CpuRtPeriod, 10)); err != nil { + if err := writeFile(path, "cpu.rt_period_us", strconv.FormatUint(cgroup.Resources.CpuRtPeriod, 10)); err != nil { return err } } @@ -69,12 +69,12 @@ func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error { func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error { if cgroup.Resources.CpuShares != 0 { - if err := writeFile(path, "cpu.shares", strconv.FormatInt(cgroup.Resources.CpuShares, 10)); err != nil { + if err := writeFile(path, "cpu.shares", strconv.FormatUint(cgroup.Resources.CpuShares, 10)); err != nil { return err } } if cgroup.Resources.CpuPeriod != 0 { - if err := writeFile(path, "cpu.cfs_period_us", strconv.FormatInt(cgroup.Resources.CpuPeriod, 10)); err != nil { + if err := writeFile(path, "cpu.cfs_period_us", strconv.FormatUint(cgroup.Resources.CpuPeriod, 10)); err != nil { return err } } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go index 29265c70..918b9a30 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go @@ -61,9 +61,26 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro if err != nil { return err } - if err := s.ensureParent(dir, root); err != nil { + // 'ensureParent' start with parent because we don't want to + // explicitly inherit from parent, it could conflict with + // 'cpuset.cpu_exclusive'. + if err := s.ensureParent(filepath.Dir(dir), root); err != nil { return err } + if err := os.MkdirAll(dir, 0755); err != nil { + return err + } + // We didn't inherit cpuset configs from parent, but we have + // to ensure cpuset configs are set before moving task into the + // cgroup. + // The logic is, if user specified cpuset configs, use these + // specified configs, otherwise, inherit from parent. This makes + // cpuset configs work correctly with 'cpuset.cpu_exclusive', and + // keep backward compatbility. + if err := s.ensureCpusAndMems(dir, cgroup); err != nil { + return err + } + // because we are not using d.join we need to place the pid into the procs file // unlike the other subsystems if err := cgroups.WriteCgroupProc(dir, pid); err != nil { @@ -136,3 +153,10 @@ func (s *CpusetGroup) copyIfNeeded(current, parent string) error { func (s *CpusetGroup) isEmpty(b []byte) bool { return len(bytes.Trim(b, "\n")) == 0 } + +func (s *CpusetGroup) ensureCpusAndMems(path string, cgroup *configs.Cgroup) error { + if err := s.Set(path, cgroup); err != nil { + return err + } + return s.copyIfNeeded(path, filepath.Dir(path)) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go index 6c0fd022..1ecd204a 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go @@ -18,6 +18,8 @@ import ( const ( cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes" + cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes" + cgroupMemoryLimit = "memory.limit_in_bytes" ) type MemoryGroup struct { @@ -31,15 +33,21 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) { path, err := d.path("memory") if err != nil && !cgroups.IsNotFound(err) { return err + } else if path == "" { + return nil } if memoryAssigned(d.config) { - if path != "" { + if _, err := os.Stat(path); os.IsNotExist(err) { if err := os.MkdirAll(path, 0755); err != nil { return err } - } - if err := EnableKernelMemoryAccounting(path); err != nil { - return err + // Only enable kernel memory accouting when this cgroup + // is created by libcontainer, otherwise we might get + // error when people use `cgroupsPath` to join an existed + // cgroup whose kernel memory is not initialized. + if err := EnableKernelMemoryAccounting(path); err != nil { + return err + } } } defer func() { @@ -62,18 +70,15 @@ func EnableKernelMemoryAccounting(path string) error { // We have to limit the kernel memory here as it won't be accounted at all // until a limit is set on the cgroup and limit cannot be set once the // cgroup has children, or if there are already tasks in the cgroup. - kernelMemoryLimit := int64(1) - if err := setKernelMemory(path, kernelMemoryLimit); err != nil { - return err - } - kernelMemoryLimit = int64(-1) - if err := setKernelMemory(path, kernelMemoryLimit); err != nil { - return err + for _, i := range []int64{1, -1} { + if err := setKernelMemory(path, uint64(i)); err != nil { + return err + } } return nil } -func setKernelMemory(path string, kernelMemoryLimit int64) error { +func setKernelMemory(path string, kernelMemoryLimit uint64) error { if path == "" { return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit) } @@ -81,7 +86,7 @@ func setKernelMemory(path string, kernelMemoryLimit int64) error { // kernel memory is not enabled on the system so we should do nothing return nil } - if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil { + if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatUint(kernelMemoryLimit, 10)), 0700); err != nil { // Check if the error number returned by the syscall is "EBUSY" // The EBUSY signal is returned on attempts to write to the // memory.kmem.limit_in_bytes file if the cgroup has children or @@ -99,9 +104,20 @@ func setKernelMemory(path string, kernelMemoryLimit int64) error { } func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error { + ulimited := -1 + + // If the memory update is set to uint64(-1) we should also + // set swap to uint64(-1), it means unlimited memory. + if cgroup.Resources.Memory == uint64(ulimited) { + // Only set swap if it's enbled in kernel + if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) { + cgroup.Resources.MemorySwap = uint64(ulimited) + } + } + // When memory and swap memory are both set, we need to handle the cases // for updating container. - if cgroup.Resources.Memory != 0 && cgroup.Resources.MemorySwap > 0 { + if cgroup.Resources.Memory != 0 && cgroup.Resources.MemorySwap != 0 { memoryUsage, err := getMemoryData(path, "") if err != nil { return err @@ -110,29 +126,29 @@ func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error { // When update memory limit, we should adapt the write sequence // for memory and swap memory, so it won't fail because the new // value and the old value don't fit kernel's validation. - if memoryUsage.Limit < uint64(cgroup.Resources.MemorySwap) { - if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { + if cgroup.Resources.MemorySwap == uint64(ulimited) || memoryUsage.Limit < cgroup.Resources.MemorySwap { + if err := writeFile(path, cgroupMemorySwapLimit, strconv.FormatUint(cgroup.Resources.MemorySwap, 10)); err != nil { return err } - if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { + if err := writeFile(path, cgroupMemoryLimit, strconv.FormatUint(cgroup.Resources.Memory, 10)); err != nil { return err } } else { - if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { + if err := writeFile(path, cgroupMemoryLimit, strconv.FormatUint(cgroup.Resources.Memory, 10)); err != nil { return err } - if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { + if err := writeFile(path, cgroupMemorySwapLimit, strconv.FormatUint(cgroup.Resources.MemorySwap, 10)); err != nil { return err } } } else { if cgroup.Resources.Memory != 0 { - if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { + if err := writeFile(path, cgroupMemoryLimit, strconv.FormatUint(cgroup.Resources.Memory, 10)); err != nil { return err } } - if cgroup.Resources.MemorySwap > 0 { - if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { + if cgroup.Resources.MemorySwap != 0 { + if err := writeFile(path, cgroupMemorySwapLimit, strconv.FormatUint(cgroup.Resources.MemorySwap, 10)); err != nil { return err } } @@ -153,13 +169,13 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error { } if cgroup.Resources.MemoryReservation != 0 { - if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil { + if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatUint(cgroup.Resources.MemoryReservation, 10)); err != nil { return err } } if cgroup.Resources.KernelMemoryTCP != 0 { - if err := writeFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemoryTCP, 10)); err != nil { + if err := writeFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatUint(cgroup.Resources.KernelMemoryTCP, 10)); err != nil { return err } } @@ -170,12 +186,12 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error { } if cgroup.Resources.MemorySwappiness == nil || int64(*cgroup.Resources.MemorySwappiness) == -1 { return nil - } else if int64(*cgroup.Resources.MemorySwappiness) >= 0 && int64(*cgroup.Resources.MemorySwappiness) <= 100 { - if err := writeFile(path, "memory.swappiness", strconv.FormatInt(*cgroup.Resources.MemorySwappiness, 10)); err != nil { + } else if *cgroup.Resources.MemorySwappiness <= 100 { + if err := writeFile(path, "memory.swappiness", strconv.FormatUint(*cgroup.Resources.MemorySwappiness, 10)); err != nil { return err } } else { - return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", int64(*cgroup.Resources.MemorySwappiness)) + return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", *cgroup.Resources.MemorySwappiness) } return nil @@ -227,6 +243,14 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error { } stats.MemoryStats.KernelTCPUsage = kernelTCPUsage + useHierarchy := strings.Join([]string{"memory", "use_hierarchy"}, ".") + value, err := getCgroupParamUint(path, useHierarchy) + if err != nil { + return err + } + if value == 1 { + stats.MemoryStats.UseHierarchy = true + } return nil } @@ -237,7 +261,7 @@ func memoryAssigned(cgroup *configs.Cgroup) bool { cgroup.Resources.KernelMemory > 0 || cgroup.Resources.KernelMemoryTCP > 0 || cgroup.Resources.OomKillDisable || - (cgroup.Resources.MemorySwappiness != nil && *cgroup.Resources.MemorySwappiness != -1) + (cgroup.Resources.MemorySwappiness != nil && int64(*cgroup.Resources.MemorySwappiness) != -1) } func getMemoryData(path, name string) (cgroups.MemoryData, error) { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/rootless/rootless.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/rootless/rootless.go new file mode 100644 index 00000000..b1efbfd9 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/rootless/rootless.go @@ -0,0 +1,128 @@ +// +build linux + +package rootless + +import ( + "fmt" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/configs/validate" +) + +// TODO: This is copied from libcontainer/cgroups/fs, which duplicates this code +// needlessly. We should probably export this list. + +var subsystems = []subsystem{ + &fs.CpusetGroup{}, + &fs.DevicesGroup{}, + &fs.MemoryGroup{}, + &fs.CpuGroup{}, + &fs.CpuacctGroup{}, + &fs.PidsGroup{}, + &fs.BlkioGroup{}, + &fs.HugetlbGroup{}, + &fs.NetClsGroup{}, + &fs.NetPrioGroup{}, + &fs.PerfEventGroup{}, + &fs.FreezerGroup{}, + &fs.NameGroup{GroupName: "name=systemd"}, +} + +type subsystem interface { + // Name returns the name of the subsystem. + Name() string + + // Returns the stats, as 'stats', corresponding to the cgroup under 'path'. + GetStats(path string, stats *cgroups.Stats) error +} + +// The noop cgroup manager is used for rootless containers, because we currently +// cannot manage cgroups if we are in a rootless setup. This manager is chosen +// by factory if we are in rootless mode. We error out if any cgroup options are +// set in the config -- this may change in the future with upcoming kernel features +// like the cgroup namespace. + +type Manager struct { + Cgroups *configs.Cgroup + Paths map[string]string +} + +func (m *Manager) Apply(pid int) error { + // If there are no cgroup settings, there's nothing to do. + if m.Cgroups == nil { + return nil + } + + // We can't set paths. + // TODO(cyphar): Implement the case where the runner of a rootless container + // owns their own cgroup, which would allow us to set up a + // cgroup for each path. + if m.Cgroups.Paths != nil { + return fmt.Errorf("cannot change cgroup path in rootless container") + } + + // We load the paths into the manager. + paths := make(map[string]string) + for _, sys := range subsystems { + name := sys.Name() + + path, err := cgroups.GetOwnCgroupPath(name) + if err != nil { + // Ignore paths we couldn't resolve. + continue + } + + paths[name] = path + } + + m.Paths = paths + return nil +} + +func (m *Manager) GetPaths() map[string]string { + return m.Paths +} + +func (m *Manager) Set(container *configs.Config) error { + // We have to re-do the validation here, since someone might decide to + // update a rootless container. + return validate.New().Validate(container) +} + +func (m *Manager) GetPids() ([]int, error) { + dir, err := cgroups.GetOwnCgroupPath("devices") + if err != nil { + return nil, err + } + return cgroups.GetPids(dir) +} + +func (m *Manager) GetAllPids() ([]int, error) { + dir, err := cgroups.GetOwnCgroupPath("devices") + if err != nil { + return nil, err + } + return cgroups.GetAllPids(dir) +} + +func (m *Manager) GetStats() (*cgroups.Stats, error) { + // TODO(cyphar): We can make this work if we figure out a way to allow usage + // of cgroups with a rootless container. While this doesn't + // actually require write access to a cgroup directory, the + // statistics are not useful if they can be affected by + // non-container processes. + return nil, fmt.Errorf("cannot get cgroup stats in rootless container") +} + +func (m *Manager) Freeze(state configs.FreezerState) error { + // TODO(cyphar): We can make this work if we figure out a way to allow usage + // of cgroups with a rootless container. + return fmt.Errorf("cannot use freezer cgroup in rootless container") +} + +func (m *Manager) Destroy() error { + // We don't have to do anything here because we didn't do any setup. + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go index b483f1bf..8eeedc55 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go @@ -51,6 +51,8 @@ type MemoryStats struct { KernelUsage MemoryData `json:"kernel_usage,omitempty"` // usage of kernel TCP memory KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"` + // if true, memory usage is accounted for throughout a hierarchy of cgroups. + UseHierarchy bool `json:"use_hierarchy"` Stats map[string]uint64 `json:"stats,omitempty"` } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go index 22ff7117..456c57d9 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go @@ -5,10 +5,8 @@ package systemd import ( "errors" "fmt" - "io/ioutil" "os" "path/filepath" - "strconv" "strings" "sync" "time" @@ -67,12 +65,14 @@ var subsystems = subsystemSet{ const ( testScopeWait = 4 + testSliceWait = 4 ) var ( connLock sync.Mutex theConn *systemdDbus.Conn hasStartTransientUnit bool + hasStartTransientSliceUnit bool hasTransientDefaultDependencies bool hasDelegate bool ) @@ -159,8 +159,36 @@ func UseSystemd() bool { } } + // Assume we have the ability to start a transient unit as a slice + // This was broken until systemd v229, but has been back-ported on RHEL environments >= 219 + // For details, see: https://bugzilla.redhat.com/show_bug.cgi?id=1370299 + hasStartTransientSliceUnit = true + + // To ensure simple clean-up, we create a slice off the root with no hierarchy + slice := fmt.Sprintf("libcontainer_%d_systemd_test_default.slice", os.Getpid()) + if _, err := theConn.StartTransientUnit(slice, "replace", nil, nil); err != nil { + if _, ok := err.(dbus.Error); ok { + hasStartTransientSliceUnit = false + } + } + + for i := 0; i <= testSliceWait; i++ { + if _, err := theConn.StopUnit(slice, "replace", nil); err != nil { + if dbusError, ok := err.(dbus.Error); ok { + if strings.Contains(dbusError.Name, "org.freedesktop.systemd1.NoSuchUnit") { + hasStartTransientSliceUnit = false + break + } + } + } else { + break + } + time.Sleep(time.Millisecond) + } + // Not critical because of the stop unit logic above. theConn.StopUnit(scope, "replace", nil) + theConn.StopUnit(slice, "replace", nil) } return hasStartTransientUnit } @@ -194,11 +222,24 @@ func (m *Manager) Apply(pid int) error { slice = c.Parent } - properties = append(properties, - systemdDbus.PropSlice(slice), - systemdDbus.PropDescription("docker container "+c.Name), - newProp("PIDs", []uint32{uint32(pid)}), - ) + properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name)) + + // if we create a slice, the parent is defined via a Wants= + if strings.HasSuffix(unitName, ".slice") { + // This was broken until systemd v229, but has been back-ported on RHEL environments >= 219 + if !hasStartTransientSliceUnit { + return fmt.Errorf("systemd version does not support ability to start a slice as transient unit") + } + properties = append(properties, systemdDbus.PropWants(slice)) + } else { + // otherwise, we use Slice= + properties = append(properties, systemdDbus.PropSlice(slice)) + } + + // only add pid if its valid, -1 is used w/ general slice creation. + if pid != -1 { + properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) + } if hasDelegate { // This is only supported on systemd versions 218 and above. @@ -219,12 +260,19 @@ func (m *Manager) Apply(pid int) error { if c.Resources.Memory != 0 { properties = append(properties, - newProp("MemoryLimit", uint64(c.Resources.Memory))) + newProp("MemoryLimit", c.Resources.Memory)) } if c.Resources.CpuShares != 0 { properties = append(properties, - newProp("CPUShares", uint64(c.Resources.CpuShares))) + newProp("CPUShares", c.Resources.CpuShares)) + } + + // cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd. + if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 { + cpuQuotaPerSecUSec := uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod + properties = append(properties, + newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) } if c.Resources.BlkioWeight != 0 { @@ -240,7 +288,7 @@ func (m *Manager) Apply(pid int) error { } } - if _, err := theConn.StartTransientUnit(unitName, "replace", properties, nil); err != nil { + if _, err := theConn.StartTransientUnit(unitName, "replace", properties, nil); err != nil && !isUnitExists(err) { return err } @@ -285,15 +333,6 @@ func (m *Manager) GetPaths() map[string]string { return paths } -func writeFile(dir, file, data string) error { - // Normally dir should not be empty, one case is that cgroup subsystem - // is not mounted, we will get empty dir, and we want it fail here. - if dir == "" { - return fmt.Errorf("no such directory for %s", file) - } - return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) -} - func join(c *configs.Cgroup, subsystem string, pid int) (string, error) { path, err := getSubsystemPath(c, subsystem) if err != nil { @@ -302,10 +341,9 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) { if err := os.MkdirAll(path, 0755); err != nil { return "", err } - if err := writeFile(path, "cgroup.procs", strconv.Itoa(pid)); err != nil { + if err := cgroups.WriteCgroupProc(path, pid); err != nil { return "", err } - return path, nil } @@ -347,10 +385,10 @@ func joinCgroups(c *configs.Cgroup, pid int) error { return nil } -// systemd represents slice heirarchy using `-`, so we need to follow suit when +// systemd represents slice hierarchy using `-`, so we need to follow suit when // generating the path of slice. Essentially, test-a-b.slice becomes // test.slice/test-a.slice/test-a-b.slice. -func expandSlice(slice string) (string, error) { +func ExpandSlice(slice string) (string, error) { suffix := ".slice" // Name has to end with ".slice", but can't be just ".slice". if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) { @@ -364,6 +402,10 @@ func expandSlice(slice string) (string, error) { var path, prefix string sliceName := strings.TrimSuffix(slice, suffix) + // if input was -.slice, we should just return root now + if sliceName == "-" { + return "/", nil + } for _, component := range strings.Split(sliceName, "-") { // test--a.slice isn't permitted, nor is -test.slice. if component == "" { @@ -384,7 +426,7 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) { return "", err } - initPath, err := cgroups.GetInitCgroupDir(subsystem) + initPath, err := cgroups.GetInitCgroup(subsystem) if err != nil { return "", err } @@ -396,7 +438,7 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) { slice = c.Parent } - slice, err = expandSlice(slice) + slice, err = ExpandSlice(slice) if err != nil { return "", err } @@ -483,7 +525,11 @@ func (m *Manager) Set(container *configs.Config) error { } func getUnitName(c *configs.Cgroup) string { - return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name) + // by default, we create a scope unless the user explicitly asks for a slice. + if !strings.HasSuffix(c.Name, ".slice") { + return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name) + } + return c.Name } func setKernelMemory(c *configs.Cgroup) error { @@ -497,3 +543,13 @@ func setKernelMemory(c *configs.Cgroup) error { } return fs.EnableKernelMemoryAccounting(path) } + +// isUnitExists returns true if the error is that a systemd unit already exists. +func isUnitExists(err error) bool { + if err != nil { + if dbusError, ok := err.(dbus.Error); ok { + return strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists") + } + } + return false +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go index 8946dd59..5db37344 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go @@ -23,36 +23,14 @@ const ( // https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt func FindCgroupMountpoint(subsystem string) (string, error) { - // We are not using mount.GetMounts() because it's super-inefficient, - // parsing it directly sped up x10 times because of not using Sscanf. - // It was one of two major performance drawbacks in container start. - if !isSubsystemAvailable(subsystem) { - return "", NewNotFoundError(subsystem) - } - f, err := os.Open("/proc/self/mountinfo") - if err != nil { - return "", err - } - defer f.Close() - - scanner := bufio.NewScanner(f) - for scanner.Scan() { - txt := scanner.Text() - fields := strings.Split(txt, " ") - for _, opt := range strings.Split(fields[len(fields)-1], ",") { - if opt == subsystem { - return fields[4], nil - } - } - } - if err := scanner.Err(); err != nil { - return "", err - } - - return "", NewNotFoundError(subsystem) + mnt, _, err := FindCgroupMountpointAndRoot(subsystem) + return mnt, err } func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) { + // We are not using mount.GetMounts() because it's super-inefficient, + // parsing it directly sped up x10 times because of not using Sscanf. + // It was one of two major performance drawbacks in container start. if !isSubsystemAvailable(subsystem) { return "", "", NewNotFoundError(subsystem) } @@ -131,7 +109,7 @@ type Mount struct { Subsystems []string } -func (m Mount) GetThisCgroupDir(cgroups map[string]string) (string, error) { +func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) { if len(m.Subsystems) == 0 { return "", fmt.Errorf("no subsystem for mount") } @@ -149,7 +127,7 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, if sepIdx == -1 { return nil, fmt.Errorf("invalid mountinfo format") } - if txt[sepIdx+3:sepIdx+9] != "cgroup" { + if txt[sepIdx+3:sepIdx+10] == "cgroup2" || txt[sepIdx+3:sepIdx+9] != "cgroup" { continue } fields := strings.Split(txt, " ") @@ -211,9 +189,6 @@ func GetAllSubsystems() ([]string, error) { s := bufio.NewScanner(f) for s.Scan() { - if err := s.Err(); err != nil { - return nil, err - } text := s.Text() if text[0] != '#' { parts := strings.Fields(text) @@ -222,11 +197,14 @@ func GetAllSubsystems() ([]string, error) { } } } + if err := s.Err(); err != nil { + return nil, err + } return subsystems, nil } -// GetThisCgroupDir returns the relative path to the cgroup docker is running in. -func GetThisCgroupDir(subsystem string) (string, error) { +// GetOwnCgroup returns the relative path to the cgroup docker is running in. +func GetOwnCgroup(subsystem string) (string, error) { cgroups, err := ParseCgroupFile("/proc/self/cgroup") if err != nil { return "", err @@ -235,8 +213,16 @@ func GetThisCgroupDir(subsystem string) (string, error) { return getControllerPath(subsystem, cgroups) } -func GetInitCgroupDir(subsystem string) (string, error) { +func GetOwnCgroupPath(subsystem string) (string, error) { + cgroup, err := GetOwnCgroup(subsystem) + if err != nil { + return "", err + } + return getCgroupPathHelper(subsystem, cgroup) +} + +func GetInitCgroup(subsystem string) (string, error) { cgroups, err := ParseCgroupFile("/proc/1/cgroup") if err != nil { return "", err @@ -245,6 +231,31 @@ func GetInitCgroupDir(subsystem string) (string, error) { return getControllerPath(subsystem, cgroups) } +func GetInitCgroupPath(subsystem string) (string, error) { + cgroup, err := GetInitCgroup(subsystem) + if err != nil { + return "", err + } + + return getCgroupPathHelper(subsystem, cgroup) +} + +func getCgroupPathHelper(subsystem, cgroup string) (string, error) { + mnt, root, err := FindCgroupMountpointAndRoot(subsystem) + if err != nil { + return "", err + } + + // This is needed for nested containers, because in /proc/self/cgroup we + // see pathes from host, which don't exist in container. + relCgroup, err := filepath.Rel(root, cgroup) + if err != nil { + return "", err + } + + return filepath.Join(mnt, relCgroup), nil +} + func readProcsFile(dir string) ([]int, error) { f, err := os.Open(filepath.Join(dir, CgroupProcesses)) if err != nil { @@ -287,10 +298,6 @@ func parseCgroupFromReader(r io.Reader) (map[string]string, error) { cgroups := make(map[string]string) for s.Scan() { - if err := s.Err(); err != nil { - return nil, err - } - text := s.Text() // from cgroups(7): // /proc/[pid]/cgroup @@ -307,6 +314,10 @@ func parseCgroupFromReader(r io.Reader) (map[string]string, error) { cgroups[subs] = parts[2] } } + if err := s.Err(); err != nil { + return nil, err + } + return cgroups, nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go index 94b38879..75722890 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go @@ -22,7 +22,7 @@ type Cgroup struct { // The path is assumed to be relative to the host system cgroup mountpoint. Path string `json:"path"` - // ScopePrefix decribes prefix for the scope name + // ScopePrefix describes prefix for the scope name ScopePrefix string `json:"scope_prefix"` // Paths represent the absolute cgroups paths to join. @@ -45,34 +45,34 @@ type Resources struct { Devices []*Device `json:"devices"` // Memory limit (in bytes) - Memory int64 `json:"memory"` + Memory uint64 `json:"memory"` // Memory reservation or soft_limit (in bytes) - MemoryReservation int64 `json:"memory_reservation"` + MemoryReservation uint64 `json:"memory_reservation"` // Total memory usage (memory + swap); set `-1` to enable unlimited swap - MemorySwap int64 `json:"memory_swap"` + MemorySwap uint64 `json:"memory_swap"` // Kernel memory limit (in bytes) - KernelMemory int64 `json:"kernel_memory"` + KernelMemory uint64 `json:"kernel_memory"` // Kernel memory limit for TCP use (in bytes) - KernelMemoryTCP int64 `json:"kernel_memory_tcp"` + KernelMemoryTCP uint64 `json:"kernel_memory_tcp"` // CPU shares (relative weight vs. other containers) - CpuShares int64 `json:"cpu_shares"` + CpuShares uint64 `json:"cpu_shares"` // CPU hardcap limit (in usecs). Allowed cpu time in a given period. CpuQuota int64 `json:"cpu_quota"` // CPU period to be used for hardcapping (in usecs). 0 to use system default. - CpuPeriod int64 `json:"cpu_period"` + CpuPeriod uint64 `json:"cpu_period"` // How many time CPU will use in realtime scheduling (in usecs). CpuRtRuntime int64 `json:"cpu_rt_quota"` // CPU period to be used for realtime scheduling (in usecs). - CpuRtPeriod int64 `json:"cpu_rt_period"` + CpuRtPeriod uint64 `json:"cpu_rt_period"` // CPU to use CpusetCpus string `json:"cpuset_cpus"` @@ -95,7 +95,7 @@ type Resources struct { // IO read rate limit per cgroup per device, bytes per second. BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"` - // IO write rate limit per cgroup per divice, bytes per second. + // IO write rate limit per cgroup per device, bytes per second. BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"` // IO read rate limit per cgroup per device, IO per second. @@ -114,7 +114,7 @@ type Resources struct { OomKillDisable bool `json:"oom_kill_disable"` // Tuning swappiness behaviour per cgroup - MemorySwappiness *int64 `json:"memory_swappiness"` + MemorySwappiness *uint64 `json:"memory_swappiness"` // Set priority of network traffic for container NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"` diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go index a56d12bd..98f4b858 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go @@ -8,6 +8,7 @@ import ( "time" "github.com/Sirupsen/logrus" + "github.com/opencontainers/runtime-spec/specs-go" ) type Rlimit struct { @@ -85,11 +86,6 @@ type Config struct { // that the parent process dies. ParentDeathSignal int `json:"parent_death_signal"` - // PivotDir allows a custom directory inside the container's root filesystem to be used as pivot, when NoPivotRoot is not set. - // When a custom PivotDir not set, a temporary dir inside the root filesystem will be used. The pivot dir needs to be writeable. - // This is required when using read only root filesystems. In these cases, a read/writeable path can be (bind) mounted somewhere inside the root filesystem to act as pivot. - PivotDir string `json:"pivot_dir"` - // Path to a directory containing the container's root filesystem. Rootfs string `json:"rootfs"` @@ -117,8 +113,8 @@ type Config struct { Namespaces Namespaces `json:"namespaces"` // Capabilities specify the capabilities to keep when executing the process inside the container - // All capbilities not specified will be dropped from the processes capability mask - Capabilities []string `json:"capabilities"` + // All capabilities not specified will be dropped from the processes capability mask + Capabilities *Capabilities `json:"capabilities"` // Networks specifies the container's network setup to be created Networks []*Network `json:"networks"` @@ -187,6 +183,9 @@ type Config struct { // NoNewKeyring will not allocated a new session keyring for the container. It will use the // callers keyring in this case. NoNewKeyring bool `json:"no_new_keyring"` + + // Rootless specifies whether the container is a rootless container. + Rootless bool `json:"rootless"` } type Hooks struct { @@ -201,6 +200,19 @@ type Hooks struct { Poststop []Hook } +type Capabilities struct { + // Bounding is the set of capabilities checked by the kernel. + Bounding []string + // Effective is the set of capabilities checked by the kernel. + Effective []string + // Inheritable is the capabilities preserved across execve. + Inheritable []string + // Permitted is the limiting superset for effective capabilities. + Permitted []string + // Ambient is the ambient set of capabilities that are kept. + Ambient []string +} + func (hooks *Hooks) UnmarshalJSON(b []byte) error { var state struct { Prestart []CommandHook @@ -248,13 +260,7 @@ func (hooks Hooks) MarshalJSON() ([]byte, error) { } // HookState is the payload provided to a hook on execution. -type HookState struct { - Version string `json:"ociVersion"` - ID string `json:"id"` - Pid int `json:"pid"` - Root string `json:"root"` - BundlePath string `json:"bundlePath"` -} +type HookState specs.State type Hook interface { // Run executes the hook with the provided state. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config_unix.go index a60554a7..84463995 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config_unix.go @@ -4,38 +4,50 @@ package configs import "fmt" -// HostUID gets the root uid for the process on host which could be non-zero -// when user namespaces are enabled. -func (c Config) HostUID() (int, error) { +// HostUID gets the translated uid for the process on host which could be +// different when user namespaces are enabled. +func (c Config) HostUID(containerId int) (int, error) { if c.Namespaces.Contains(NEWUSER) { if c.UidMappings == nil { - return -1, fmt.Errorf("User namespaces enabled, but no user mappings found.") + return -1, fmt.Errorf("User namespaces enabled, but no uid mappings found.") } - id, found := c.hostIDFromMapping(0, c.UidMappings) + id, found := c.hostIDFromMapping(containerId, c.UidMappings) if !found { - return -1, fmt.Errorf("User namespaces enabled, but no root user mapping found.") + return -1, fmt.Errorf("User namespaces enabled, but no user mapping found.") } return id, nil } - // Return default root uid 0 - return 0, nil + // Return unchanged id. + return containerId, nil } -// HostGID gets the root gid for the process on host which could be non-zero +// HostRootUID gets the root uid for the process on host which could be non-zero // when user namespaces are enabled. -func (c Config) HostGID() (int, error) { +func (c Config) HostRootUID() (int, error) { + return c.HostUID(0) +} + +// HostGID gets the translated gid for the process on host which could be +// different when user namespaces are enabled. +func (c Config) HostGID(containerId int) (int, error) { if c.Namespaces.Contains(NEWUSER) { if c.GidMappings == nil { return -1, fmt.Errorf("User namespaces enabled, but no gid mappings found.") } - id, found := c.hostIDFromMapping(0, c.GidMappings) + id, found := c.hostIDFromMapping(containerId, c.GidMappings) if !found { - return -1, fmt.Errorf("User namespaces enabled, but no root group mapping found.") + return -1, fmt.Errorf("User namespaces enabled, but no group mapping found.") } return id, nil } - // Return default root gid 0 - return 0, nil + // Return unchanged id. + return containerId, nil +} + +// HostRootGID gets the root gid for the process on host which could be non-zero +// when user namespaces are enabled. +func (c Config) HostRootGID() (int, error) { + return c.HostGID(0) } // Utility function that gets a host ID for a container ID from user namespace map diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go index cc770c91..670757dd 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go @@ -1,5 +1,11 @@ package configs +const ( + // EXT_COPYUP is a directive to copy up the contents of a directory when + // a tmpfs is mounted over it. + EXT_COPYUP = 1 << iota +) + type Mount struct { // Source path for the mount. Source string `json:"source"` @@ -22,6 +28,9 @@ type Mount struct { // Relabel source if set, "z" indicates shared, "Z" indicates unshared. Relabel string `json:"relabel"` + // Extensions are additional flags that are specific to runc. + Extensions int `json:"extensions"` + // Optional Command to be run before Source is mounted. PremountCmds []Command `json:"premount_cmds"` diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go index 0547223a..5d9a5c81 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go @@ -4,12 +4,10 @@ package configs func (n *Namespace) Syscall() int { panic("No namespace syscall support") - return 0 } // CloneFlags parses the container's Namespaces options to set the correct // flags on clone, unshare. This function returns flags only for new namespaces. func (n *Namespaces) CloneFlags() uintptr { panic("No namespace syscall support") - return uintptr(0) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unix.go index b9c820d0..8beba9d3 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unix.go @@ -22,8 +22,8 @@ var ( supportedNamespaces = make(map[NamespaceType]bool) ) -// nsToFile converts the namespace type to its filename -func nsToFile(ns NamespaceType) string { +// NsName converts the namespace type to its filename +func NsName(ns NamespaceType) string { switch ns { case NEWNET: return "net" @@ -50,7 +50,7 @@ func IsNamespaceSupported(ns NamespaceType) bool { if ok { return supported } - nsFile := nsToFile(ns) + nsFile := NsName(ns) // if the namespace type is unknown, just return false if nsFile == "" { return false @@ -84,7 +84,7 @@ func (n *Namespace) GetPath(pid int) string { if n.Path != "" { return n.Path } - return fmt.Sprintf("/proc/%d/ns/%s", pid, nsToFile(n.Type)) + return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type)) } func (n *Namespaces) Remove(t NamespaceType) bool { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go new file mode 100644 index 00000000..0cebfaf8 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go @@ -0,0 +1,117 @@ +package validate + +import ( + "fmt" + "os" + "reflect" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +var ( + geteuid = os.Geteuid + getegid = os.Getegid +) + +func (v *ConfigValidator) rootless(config *configs.Config) error { + if err := rootlessMappings(config); err != nil { + return err + } + if err := rootlessMount(config); err != nil { + return err + } + // Currently, cgroups cannot effectively be used in rootless containers. + // The new cgroup namespace doesn't really help us either because it doesn't + // have nice interactions with the user namespace (we're working with upstream + // to fix this). + if err := rootlessCgroup(config); err != nil { + return err + } + + // XXX: We currently can't verify the user config at all, because + // configs.Config doesn't store the user-related configs. So this + // has to be verified by setupUser() in init_linux.go. + + return nil +} + +func rootlessMappings(config *configs.Config) error { + rootuid, err := config.HostRootUID() + if err != nil { + return fmt.Errorf("failed to get root uid from uidMappings: %v", err) + } + if euid := geteuid(); euid != 0 { + if !config.Namespaces.Contains(configs.NEWUSER) { + return fmt.Errorf("rootless containers require user namespaces") + } + if rootuid != euid { + return fmt.Errorf("rootless containers cannot map container root to a different host user") + } + } + + rootgid, err := config.HostRootGID() + if err != nil { + return fmt.Errorf("failed to get root gid from gidMappings: %v", err) + } + + // Similar to the above test, we need to make sure that we aren't trying to + // map to a group ID that we don't have the right to be. + if rootgid != getegid() { + return fmt.Errorf("rootless containers cannot map container root to a different host group") + } + + // We can only map one user and group inside a container (our own). + if len(config.UidMappings) != 1 || config.UidMappings[0].Size != 1 { + return fmt.Errorf("rootless containers cannot map more than one user") + } + if len(config.GidMappings) != 1 || config.GidMappings[0].Size != 1 { + return fmt.Errorf("rootless containers cannot map more than one group") + } + + return nil +} + +// cgroup verifies that the user isn't trying to set any cgroup limits or paths. +func rootlessCgroup(config *configs.Config) error { + // Nothing set at all. + if config.Cgroups == nil || config.Cgroups.Resources == nil { + return nil + } + + // Used for comparing to the zero value. + left := reflect.ValueOf(*config.Cgroups.Resources) + right := reflect.Zero(left.Type()) + + // This is all we need to do, since specconv won't add cgroup options in + // rootless mode. + if !reflect.DeepEqual(left.Interface(), right.Interface()) { + return fmt.Errorf("cannot specify resource limits in rootless container") + } + + return nil +} + +// mount verifies that the user isn't trying to set up any mounts they don't have +// the rights to do. In addition, it makes sure that no mount has a `uid=` or +// `gid=` option that doesn't resolve to root. +func rootlessMount(config *configs.Config) error { + // XXX: We could whitelist allowed devices at this point, but I'm not + // convinced that's a good idea. The kernel is the best arbiter of + // access control. + + for _, mount := range config.Mounts { + // Check that the options list doesn't contain any uid= or gid= entries + // that don't resolve to root. + for _, opt := range strings.Split(mount.Data, ",") { + if strings.HasPrefix(opt, "uid=") && opt != "uid=0" { + return fmt.Errorf("cannot specify uid= mount options in rootless containers where argument isn't 0") + } + if strings.HasPrefix(opt, "gid=") && opt != "gid=0" { + return fmt.Errorf("cannot specify gid= mount options in rootless containers where argument isn't 0") + } + } + } + + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go index 448cde27..0dd580ac 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go @@ -40,12 +40,23 @@ func (v *ConfigValidator) Validate(config *configs.Config) error { if err := v.sysctl(config); err != nil { return err } + if config.Rootless { + if err := v.rootless(config); err != nil { + return err + } + } return nil } // rootfs validates if the rootfs is an absolute path and is not a symlink // to the container's root filesystem. func (v *ConfigValidator) rootfs(config *configs.Config) error { + if _, err := os.Stat(config.Rootfs); err != nil { + if os.IsNotExist(err) { + return fmt.Errorf("rootfs (%s) does not exist", config.Rootfs) + } + return err + } cleaned, err := filepath.Abs(config.Rootfs) if err != nil { return err @@ -126,6 +137,11 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error { } if strings.HasPrefix(s, "net.") { if config.Namespaces.Contains(configs.NEWNET) { + if path := config.Namespaces.PathOf(configs.NEWNET); path != "" { + if err := checkHostNs(s, path); err != nil { + return err + } + } continue } else { return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s) @@ -136,3 +152,44 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error { return nil } + +func isSymbolicLink(path string) (bool, error) { + fi, err := os.Lstat(path) + if err != nil { + return false, err + } + + return fi.Mode()&os.ModeSymlink == os.ModeSymlink, nil +} + +// checkHostNs checks whether network sysctl is used in host namespace. +func checkHostNs(sysctlConfig string, path string) error { + var currentProcessNetns = "/proc/self/ns/net" + // readlink on the current processes network namespace + destOfCurrentProcess, err := os.Readlink(currentProcessNetns) + if err != nil { + return fmt.Errorf("read soft link %q error", currentProcessNetns) + } + + // First check if the provided path is a symbolic link + symLink, err := isSymbolicLink(path) + if err != nil { + return fmt.Errorf("could not check that %q is a symlink: %v", path, err) + } + + if symLink == false { + // The provided namespace is not a symbolic link, + // it is not the host namespace. + return nil + } + + // readlink on the path provided in the struct + destOfContainer, err := os.Readlink(path) + if err != nil { + return fmt.Errorf("read soft link %q error", path) + } + if destOfContainer == destOfCurrentProcess { + return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", sysctlConfig) + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/console.go b/vendor/github.com/opencontainers/runc/libcontainer/console.go index 042a2a2e..917acc70 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/console.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/console.go @@ -1,15 +1,17 @@ package libcontainer -import "io" +import ( + "io" + "os" +) // Console represents a pseudo TTY. type Console interface { - io.ReadWriter - io.Closer + io.ReadWriteCloser // Path returns the filesystem path to the slave side of the pty. Path() string // Fd returns the fd for the master of the pty. - Fd() uintptr + File() *os.File } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/console_freebsd.go b/vendor/github.com/opencontainers/runc/libcontainer/console_freebsd.go index 3c89eda0..b7166a31 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/console_freebsd.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/console_freebsd.go @@ -6,8 +6,8 @@ import ( "errors" ) -// NewConsole returns an initalized console that can be used within a container by copying bytes +// newConsole returns an initialized console that can be used within a container by copying bytes // from the master side to the slave that is attached as the tty for the container's init process. -func NewConsole(uid, gid int) (Console, error) { +func newConsole() (Console, error) { return nil, errors.New("libcontainer console is not supported on FreeBSD") } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/console_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/console_linux.go index 7af771b6..e431766b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/console_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/console_linux.go @@ -3,20 +3,26 @@ package libcontainer import ( "fmt" "os" - "path/filepath" "syscall" "unsafe" - - "github.com/opencontainers/runc/libcontainer/label" ) -// NewConsole returns an initalized console that can be used within a container by copying bytes +func ConsoleFromFile(f *os.File) Console { + return &linuxConsole{ + master: f, + } +} + +// newConsole returns an initialized console that can be used within a container by copying bytes // from the master side to the slave that is attached as the tty for the container's init process. -func NewConsole(uid, gid int) (Console, error) { +func newConsole() (Console, error) { master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) if err != nil { return nil, err } + if err := saneTerminal(master); err != nil { + return nil, err + } console, err := ptsname(master) if err != nil { return nil, err @@ -24,34 +30,20 @@ func NewConsole(uid, gid int) (Console, error) { if err := unlockpt(master); err != nil { return nil, err } - if err := os.Chmod(console, 0600); err != nil { - return nil, err - } - if err := os.Chown(console, uid, gid); err != nil { - return nil, err - } return &linuxConsole{ slavePath: console, master: master, }, nil } -// newConsoleFromPath is an internal function returning an initialized console for use inside -// a container's MNT namespace. -func newConsoleFromPath(slavePath string) *linuxConsole { - return &linuxConsole{ - slavePath: slavePath, - } -} - -// linuxConsole is a linux psuedo TTY for use within a container. +// linuxConsole is a linux pseudo TTY for use within a container. type linuxConsole struct { master *os.File slavePath string } -func (c *linuxConsole) Fd() uintptr { - return c.master.Fd() +func (c *linuxConsole) File() *os.File { + return c.master } func (c *linuxConsole) Path() string { @@ -75,21 +67,17 @@ func (c *linuxConsole) Close() error { // mount initializes the console inside the rootfs mounting with the specified mount label // and applying the correct ownership of the console. -func (c *linuxConsole) mount(rootfs, mountLabel string) error { +func (c *linuxConsole) mount() error { oldMask := syscall.Umask(0000) defer syscall.Umask(oldMask) - if err := label.SetFileLabel(c.slavePath, mountLabel); err != nil { - return err - } - dest := filepath.Join(rootfs, "/dev/console") - f, err := os.Create(dest) + f, err := os.Create("/dev/console") if err != nil && !os.IsExist(err) { return err } if f != nil { f.Close() } - return syscall.Mount(c.slavePath, dest, "bind", syscall.MS_BIND, "") + return syscall.Mount(c.slavePath, "/dev/console", "bind", syscall.MS_BIND, "") } // dupStdio opens the slavePath for the console and dups the fds to the current @@ -143,3 +131,26 @@ func ptsname(f *os.File) (string, error) { } return fmt.Sprintf("/dev/pts/%d", n), nil } + +// saneTerminal sets the necessary tty_ioctl(4)s to ensure that a pty pair +// created by us acts normally. In particular, a not-very-well-known default of +// Linux unix98 ptys is that they have +onlcr by default. While this isn't a +// problem for terminal emulators, because we relay data from the terminal we +// also relay that funky line discipline. +func saneTerminal(terminal *os.File) error { + // Go doesn't have a wrapper for any of the termios ioctls. + var termios syscall.Termios + + if err := ioctl(terminal.Fd(), syscall.TCGETS, uintptr(unsafe.Pointer(&termios))); err != nil { + return fmt.Errorf("ioctl(tty, tcgets): %s", err.Error()) + } + + // Set -onlcr so we don't have to deal with \r. + termios.Oflag &^= syscall.ONLCR + + if err := ioctl(terminal.Fd(), syscall.TCSETS, uintptr(unsafe.Pointer(&termios))); err != nil { + return fmt.Errorf("ioctl(tty, tcsets): %s", err.Error()) + } + + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/console_solaris.go b/vendor/github.com/opencontainers/runc/libcontainer/console_solaris.go index 9e89f505..e5ca5459 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/console_solaris.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/console_solaris.go @@ -4,8 +4,8 @@ import ( "errors" ) -// NewConsole returns an initalized console that can be used within a container by copying bytes +// newConsole returns an initialized console that can be used within a container by copying bytes // from the master side to the slave that is attached as the tty for the container's init process. -func NewConsole(uid, gid int) (Console, error) { +func newConsole() (Console, error) { return nil, errors.New("libcontainer console is not supported on Solaris") } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/console_windows.go b/vendor/github.com/opencontainers/runc/libcontainer/console_windows.go index a68c02f6..c61e866a 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/console_windows.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/console_windows.go @@ -1,11 +1,11 @@ package libcontainer -// NewConsole returns an initalized console that can be used within a container -func NewConsole(uid, gid int) (Console, error) { +// newConsole returns an initialized console that can be used within a container +func newConsole() (Console, error) { return &windowsConsole{}, nil } -// windowsConsole is a Windows psuedo TTY for use within a container. +// windowsConsole is a Windows pseudo TTY for use within a container. type windowsConsole struct { } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container.go b/vendor/github.com/opencontainers/runc/libcontainer/container.go index 6844fbc7..3ddb5ec6 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/container.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/container.go @@ -123,7 +123,7 @@ type BaseContainer interface { // SystemError - System error. Start(process *Process) (err error) - // Run immediatly starts the process inside the conatiner. Returns error if process + // Run immediately starts the process inside the container. Returns error if process // fails to start. It does not block waiting for the exec fifo after start returns but // opens the fifo after start returns. // @@ -134,20 +134,29 @@ type BaseContainer interface { // SystemError - System error. Run(process *Process) (err error) - // Destroys the container after killing all running processes. + // Destroys the container, if its in a valid state, after killing any + // remaining running processes. // // Any event registrations are removed before the container is destroyed. // No error is returned if the container is already destroyed. // + // Running containers must first be stopped using Signal(..). + // Paused containers must first be resumed using Resume(..). + // // errors: + // ContainerNotStopped - Container is still running, + // ContainerPaused - Container is paused, // SystemError - System error. Destroy() error // Signal sends the provided signal code to the container's initial process. // + // If all is specified the signal is sent to all processes in the container + // including the initial process. + // // errors: // SystemError - System error. - Signal(s os.Signal) error + Signal(s os.Signal, all bool) error // Exec signals the container to exec the users process at the end of the init. // diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go index 29c8b343..faecc468 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go @@ -51,6 +51,9 @@ type State struct { // Platform specific fields below here + // Specifies if the container was started under the rootless mode. + Rootless bool `json:"rootless"` + // Path to all the cgroups setup for a container. Key is cgroup subsystem name // with the value as the path. CgroupPaths map[string]string `json:"cgroup_paths"` @@ -191,17 +194,29 @@ func (c *linuxContainer) Start(process *Process) error { if err != nil { return err } - return c.start(process, status == Stopped) + if status == Stopped { + if err := c.createExecFifo(); err != nil { + return err + } + } + if err := c.start(process, status == Stopped); err != nil { + if status == Stopped { + c.deleteExecFifo() + } + return err + } + return nil } func (c *linuxContainer) Run(process *Process) error { c.m.Lock() - defer c.m.Unlock() status, err := c.currentStatus() if err != nil { + c.m.Unlock() return err } - if err := c.start(process, status == Stopped); err != nil { + c.m.Unlock() + if err := c.Start(process); err != nil { return err } if status == Stopped { @@ -263,11 +278,10 @@ func (c *linuxContainer) start(process *Process, isInit bool) error { if c.config.Hooks != nil { s := configs.HookState{ - Version: c.config.Version, - ID: c.id, - Pid: parent.pid(), - Root: c.config.Rootfs, - BundlePath: utils.SearchLabels(c.config.Labels, "bundle"), + Version: c.config.Version, + ID: c.id, + Pid: parent.pid(), + Bundle: utils.SearchLabels(c.config.Labels, "bundle"), } for i, hook := range c.config.Hooks.Poststart { if err := hook.Run(s); err != nil { @@ -282,33 +296,75 @@ func (c *linuxContainer) start(process *Process, isInit bool) error { return nil } -func (c *linuxContainer) Signal(s os.Signal) error { +func (c *linuxContainer) Signal(s os.Signal, all bool) error { + if all { + return signalAllProcesses(c.cgroupManager, s) + } if err := c.initProcess.signal(s); err != nil { return newSystemErrorWithCause(err, "signaling init process") } return nil } +func (c *linuxContainer) createExecFifo() error { + rootuid, err := c.Config().HostRootUID() + if err != nil { + return err + } + rootgid, err := c.Config().HostRootGID() + if err != nil { + return err + } + + fifoName := filepath.Join(c.root, execFifoFilename) + if _, err := os.Stat(fifoName); err == nil { + return fmt.Errorf("exec fifo %s already exists", fifoName) + } + oldMask := syscall.Umask(0000) + if err := syscall.Mkfifo(fifoName, 0622); err != nil { + syscall.Umask(oldMask) + return err + } + syscall.Umask(oldMask) + if err := os.Chown(fifoName, rootuid, rootgid); err != nil { + return err + } + return nil +} + +func (c *linuxContainer) deleteExecFifo() { + fifoName := filepath.Join(c.root, execFifoFilename) + os.Remove(fifoName) +} + func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) { - parentPipe, childPipe, err := newPipe() + parentPipe, childPipe, err := utils.NewSockPair("init") if err != nil { return nil, newSystemErrorWithCause(err, "creating new init pipe") } - rootDir, err := os.Open(c.root) - if err != nil { - return nil, err - } - cmd, err := c.commandTemplate(p, childPipe, rootDir) + cmd, err := c.commandTemplate(p, childPipe) if err != nil { return nil, newSystemErrorWithCause(err, "creating new command template") } if !doInit { - return c.newSetnsProcess(p, cmd, parentPipe, childPipe, rootDir) + return c.newSetnsProcess(p, cmd, parentPipe, childPipe) } + + // We only set up rootDir if we're not doing a `runc exec`. The reason for + // this is to avoid cases where a racing, unprivileged process inside the + // container can get access to the statedir file descriptor (which would + // allow for container rootfs escape). + rootDir, err := os.Open(c.root) + if err != nil { + return nil, err + } + cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir) + cmd.Env = append(cmd.Env, + fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir) } -func (c *linuxContainer) commandTemplate(p *Process, childPipe, rootDir *os.File) (*exec.Cmd, error) { +func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) { cmd := exec.Command(c.initArgs[0], c.initArgs[1:]...) cmd.Stdin = p.Stdin cmd.Stdout = p.Stdout @@ -317,10 +373,17 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe, rootDir *os.File if cmd.SysProcAttr == nil { cmd.SysProcAttr = &syscall.SysProcAttr{} } - cmd.ExtraFiles = append(p.ExtraFiles, childPipe, rootDir) + cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...) + if p.ConsoleSocket != nil { + cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket) + cmd.Env = append(cmd.Env, + fmt.Sprintf("_LIBCONTAINER_CONSOLE=%d", stdioFdCount+len(cmd.ExtraFiles)-1), + ) + } + cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe) cmd.Env = append(cmd.Env, - fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-2), - fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) + fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1), + ) // NOTE: when running a container with no PID namespace and the parent process spawning the container is // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason // even with the parent still running. @@ -339,7 +402,7 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c } } _, sharePidns := nsMaps[configs.NEWPID] - data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, "") + data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps) if err != nil { return nil, err } @@ -357,19 +420,18 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c }, nil } -func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*setnsProcess, error) { +func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) { cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) state, err := c.currentState() if err != nil { return nil, newSystemErrorWithCause(err, "getting container's current state") } - // for setns process, we dont have to set cloneflags as the process namespaces + // for setns process, we don't have to set cloneflags as the process namespaces // will only be set via setns syscall - data, err := c.bootstrapData(0, state.NamespacePaths, p.consolePath) + data, err := c.bootstrapData(0, state.NamespacePaths) if err != nil { return nil, err } - // TODO: set on container for process management return &setnsProcess{ cmd: cmd, cgroupPaths: c.cgroupManager.GetPaths(), @@ -378,7 +440,6 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, config: c.newInitConfig(p), process: p, bootstrapData: data, - rootDir: rootDir, }, nil } @@ -390,15 +451,14 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig { User: process.User, AdditionalGroups: process.AdditionalGroups, Cwd: process.Cwd, - Console: process.consolePath, Capabilities: process.Capabilities, PassedFilesCount: len(process.ExtraFiles), ContainerId: c.ID(), NoNewPrivileges: c.config.NoNewPrivileges, + Rootless: c.config.Rootless, AppArmorProfile: c.config.AppArmorProfile, ProcessLabel: c.config.ProcessLabel, Rlimits: c.config.Rlimits, - ExecFifoPath: filepath.Join(c.root, execFifoFilename), } if process.NoNewPrivileges != nil { cfg.NoNewPrivileges = *process.NoNewPrivileges @@ -412,17 +472,10 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig { if len(process.Rlimits) > 0 { cfg.Rlimits = process.Rlimits } + cfg.CreateConsole = process.ConsoleSocket != nil return cfg } -func newPipe() (parent *os.File, child *os.File, err error) { - fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) - if err != nil { - return nil, nil, err - } - return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil -} - func (c *linuxContainer) Destroy() error { c.m.Lock() defer c.m.Unlock() @@ -467,10 +520,18 @@ func (c *linuxContainer) Resume() error { } func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) { + // XXX(cyphar): This requires cgroups. + if c.config.Rootless { + return nil, fmt.Errorf("cannot get OOM notifications from rootless container") + } return notifyOnOOM(c.cgroupManager.GetPaths()) } func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) { + // XXX(cyphar): This requires cgroups. + if c.config.Rootless { + return nil, fmt.Errorf("cannot get memory pressure notifications from rootless container") + } return notifyMemoryPressure(c.cgroupManager.GetPaths(), level) } @@ -546,10 +607,40 @@ func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) } +func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error { + for _, path := range c.config.MaskPaths { + fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path)) + if err != nil { + if os.IsNotExist(err) { + continue + } + return err + } + if fi.IsDir() { + continue + } + + extMnt := &criurpc.ExtMountMap{ + Key: proto.String(path), + Val: proto.String("/dev/null"), + } + req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) + } + + return nil +} + func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { c.m.Lock() defer c.m.Unlock() + // TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has + // support for doing unprivileged dumps, but the setup of + // rootless containers might make this complicated. + if c.config.Rootless { + return fmt.Errorf("cannot checkpoint a rootless container") + } + if err := c.checkCriuVersion("1.5.2"); err != nil { return err } @@ -609,6 +700,12 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { } } + //pre-dump may need parentImage param to complete iterative migration + if criuOpts.ParentImage != "" { + rpcOpts.ParentImg = proto.String(criuOpts.ParentImage) + rpcOpts.TrackMem = proto.Bool(true) + } + // append optional manage cgroups mode if criuOpts.ManageCgroupsMode != 0 { if err := c.checkCriuVersion("1.7"); err != nil { @@ -618,39 +715,55 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { rpcOpts.ManageCgroupsMode = &mode } - t := criurpc.CriuReqType_DUMP + var t criurpc.CriuReqType + if criuOpts.PreDump { + t = criurpc.CriuReqType_PRE_DUMP + } else { + t = criurpc.CriuReqType_DUMP + } req := &criurpc.CriuReq{ Type: &t, Opts: &rpcOpts, } - for _, m := range c.config.Mounts { - switch m.Device { - case "bind": - c.addCriuDumpMount(req, m) - break - case "cgroup": - binds, err := getCgroupMounts(m) - if err != nil { - return err + //no need to dump these information in pre-dump + if !criuOpts.PreDump { + for _, m := range c.config.Mounts { + switch m.Device { + case "bind": + c.addCriuDumpMount(req, m) + break + case "cgroup": + binds, err := getCgroupMounts(m) + if err != nil { + return err + } + for _, b := range binds { + c.addCriuDumpMount(req, b) + } + break } - for _, b := range binds { - c.addCriuDumpMount(req, b) - } - break } - } - // Write the FD info to a file in the image directory + if err := c.addMaskPaths(req); err != nil { + return err + } - fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors()) - if err != nil { - return err - } + for _, node := range c.config.Devices { + m := &configs.Mount{Destination: node.Path, Source: node.Path} + c.addCriuDumpMount(req, m) + } - err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0655) - if err != nil { - return err + // Write the FD info to a file in the image directory + fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors()) + if err != nil { + return err + } + + err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0655) + if err != nil { + return err + } } err = c.criuSwrk(nil, req, criuOpts, false) @@ -697,6 +810,13 @@ func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { c.m.Lock() defer c.m.Unlock() + + // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have + // support for unprivileged restore at the moment. + if c.config.Rootless { + return fmt.Errorf("cannot restore a rootless container") + } + if err := c.checkCriuVersion("1.5.2"); err != nil { return err } @@ -778,6 +898,16 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { } } + if len(c.config.MaskPaths) > 0 { + m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"} + c.addCriuRestoreMount(req, m) + } + + for _, node := range c.config.Devices { + m := &configs.Mount{Destination: node.Path, Source: node.Path} + c.addCriuRestoreMount(req, m) + } + if criuOpts.EmptyNs&syscall.CLONE_NEWNET == 0 { c.restoreNetwork(req, criuOpts) } @@ -814,6 +944,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { } func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { + // XXX: Do we need to deal with this case? AFAIK criu still requires root. if err := c.cgroupManager.Apply(pid); err != nil { return err } @@ -953,6 +1084,23 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts * case t == criurpc.CriuReqType_RESTORE: case t == criurpc.CriuReqType_DUMP: break + case t == criurpc.CriuReqType_PRE_DUMP: + // In pre-dump mode CRIU is in a loop and waits for + // the final DUMP command. + // The current runc pre-dump approach, however, is + // start criu in PRE_DUMP once for a single pre-dump + // and not the whole series of pre-dump, pre-dump, ...m, dump + // If we got the message CriuReqType_PRE_DUMP it means + // CRIU was successful and we need to forcefully stop CRIU + logrus.Debugf("PRE_DUMP finished. Send close signal to CRIU service") + criuClient.Close() + // Process status won't be success, because one end of sockets is closed + _, err := cmd.Process.Wait() + if err != nil { + logrus.Debugf("After PRE_DUMP CRIU exiting failed") + return err + } + return nil default: return fmt.Errorf("unable to parse the response %s", resp.String()) } @@ -1026,7 +1174,7 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc Version: c.config.Version, ID: c.id, Pid: int(notify.GetPid()), - Root: c.config.Rootfs, + Bundle: utils.SearchLabels(c.config.Labels, "bundle"), } for i, hook := range c.config.Hooks.Prestart { if err := hook.Run(s); err != nil { @@ -1154,14 +1302,9 @@ func (c *linuxContainer) runType() (Status, error) { if !exist || err != nil { return Stopped, err } - // check if the process that is running is the init process or the user's process. - // this is the difference between the container Running and Created. - environ, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/environ", pid)) - if err != nil { - return Stopped, newSystemErrorWithCausef(err, "reading /proc/%d/environ", pid) - } - check := []byte("_LIBCONTAINER") - if bytes.Contains(environ, check) { + // We'll create exec fifo and blocking on it after container is created, + // and delete it after start container. + if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil { return Created, nil } return Running, nil @@ -1198,6 +1341,7 @@ func (c *linuxContainer) currentState() (*State, error) { InitProcessStartTime: startTime, Created: c.created, }, + Rootless: c.config.Rootless, CgroupPaths: c.cgroupManager.GetPaths(), NamespacePaths: make(map[configs.NamespaceType]string), ExternalDescriptors: externalDescriptors, @@ -1223,16 +1367,22 @@ func (c *linuxContainer) currentState() (*State, error) { // can setns in order. func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) { paths := []string{} - nsTypes := []configs.NamespaceType{ + order := []configs.NamespaceType{ + // The user namespace *must* be done first. + configs.NEWUSER, configs.NEWIPC, configs.NEWUTS, configs.NEWNET, configs.NEWPID, configs.NEWNS, } - // join userns if the init process explicitly requires NEWUSER - if c.config.Namespaces.Contains(configs.NEWUSER) { - nsTypes = append(nsTypes, configs.NEWUSER) + + // Remove namespaces that we don't need to join. + var nsTypes []configs.NamespaceType + for _, ns := range order { + if c.config.Namespaces.Contains(ns) { + nsTypes = append(nsTypes, ns) + } } for _, nsType := range nsTypes { if p, ok := namespaces[nsType]; ok && p != "" { @@ -1249,7 +1399,7 @@ func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceTyp if strings.ContainsRune(p, ',') { return nil, newSystemError(fmt.Errorf("invalid path %s", p)) } - paths = append(paths, p) + paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(nsType), p)) } } return paths, nil @@ -1272,7 +1422,7 @@ func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) { // such as one that uses nsenter package to bootstrap the container's // init process correctly, i.e. with correct namespaces, uid/gid // mapping etc. -func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, consolePath string) (io.Reader, error) { +func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) { // create the netlink message r := nl.NewNetlinkRequest(int(InitMsg), 0) @@ -1282,14 +1432,6 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na Value: uint32(cloneFlags), }) - // write console path - if consolePath != "" { - r.AddData(&Bytemsg{ - Type: ConsolePathAttr, - Value: []byte(consolePath), - }) - } - // write custom namespace paths if len(nsMaps) > 0 { nsPaths, err := c.orderNamespacePaths(nsMaps) @@ -1327,19 +1469,34 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na Type: GidmapAttr, Value: b, }) - // check if we have CAP_SETGID to setgroup properly - pid, err := capability.NewPid(os.Getpid()) - if err != nil { - return nil, err - } - if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) { - r.AddData(&Boolmsg{ - Type: SetgroupAttr, - Value: true, - }) + // The following only applies if we are root. + if !c.config.Rootless { + // check if we have CAP_SETGID to setgroup properly + pid, err := capability.NewPid(os.Getpid()) + if err != nil { + return nil, err + } + if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) { + r.AddData(&Boolmsg{ + Type: SetgroupAttr, + Value: true, + }) + } } } } + // write oom_score_adj + r.AddData(&Bytemsg{ + Type: OomScoreAdjAttr, + Value: []byte(fmt.Sprintf("%d", c.config.OomScoreAdj)), + }) + + // write rootless + r.AddData(&Boolmsg{ + Type: RootlessAttr, + Value: c.config.Rootless, + }) + return bytes.NewReader(r.Serialize()), nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_unix.go index b163fbbb..9d7d4dc8 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_unix.go @@ -25,11 +25,13 @@ type VethPairName struct { type CriuOpts struct { ImagesDirectory string // directory for storing image files WorkDirectory string // directory to cd and write logs/pidfiles/stats to + ParentImage string // direcotry for storing parent image files in pre-dump and dump LeaveRunning bool // leave container in running state after checkpoint TcpEstablished bool // checkpoint/restore established TCP connections ExternalUnixConnections bool // allow external unix connections ShellJob bool // allow to dump and restore shell jobs FileLocks bool // handle file locks, for safety + PreDump bool // call criu predump to perform iterative checkpoint PageServer CriuPageServerInfo // allow to dump to criu page server VethPairs []VethPairName // pass the veth to criu when restore ManageCgroupsMode cgMode // dump or restore cgroup mode diff --git a/vendor/github.com/opencontainers/runc/libcontainer/error.go b/vendor/github.com/opencontainers/runc/libcontainer/error.go index b0639270..21a3789b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/error.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/error.go @@ -60,9 +60,9 @@ func (c ErrorCode) String() string { type Error interface { error - // Returns a verbose string including the error message - // and a representation of the stack trace suitable for - // printing. + // Returns an error if it failed to write the detail of the Error to w. + // The detail of the Error may include the error message and a + // representation of the stack trace. Detail(w io.Writer) error // Returns the error code for this error. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go index 0abc2c5a..6a0f8558 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go @@ -15,6 +15,7 @@ import ( "github.com/docker/docker/pkg/mount" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/cgroups/rootless" "github.com/opencontainers/runc/libcontainer/cgroups/systemd" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs/validate" @@ -34,7 +35,15 @@ var ( // InitArgs returns an options func to configure a LinuxFactory with the // provided init binary path and arguments. func InitArgs(args ...string) func(*LinuxFactory) error { - return func(l *LinuxFactory) error { + return func(l *LinuxFactory) (err error) { + if len(args) > 0 { + // Resolve relative paths to ensure that its available + // after directory changes. + if args[0], err = filepath.Abs(args[0]); err != nil { + return newGenericError(err, ConfigInvalid) + } + } + l.InitArgs = args return nil } @@ -65,6 +74,20 @@ func Cgroupfs(l *LinuxFactory) error { return nil } +// RootlessCgroups is an options func to configure a LinuxFactory to +// return containers that use the "rootless" cgroup manager, which will +// fail to do any operations not possible to do with an unprivileged user. +// It should only be used in conjunction with rootless containers. +func RootlessCgroups(l *LinuxFactory) error { + l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + return &rootless.Manager{ + Cgroups: config, + Paths: paths, + } + } + return nil +} + // TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs. func TmpfsRoot(l *LinuxFactory) error { mounted, err := mount.Mounted(l.Root) @@ -141,11 +164,11 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err if err := l.Validator.Validate(config); err != nil { return nil, newGenericError(err, ConfigInvalid) } - uid, err := config.HostUID() + uid, err := config.HostRootUID() if err != nil { return nil, newGenericError(err, SystemError) } - gid, err := config.HostGID() + gid, err := config.HostRootGID() if err != nil { return nil, newGenericError(err, SystemError) } @@ -161,15 +184,8 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err if err := os.Chown(containerRoot, uid, gid); err != nil { return nil, newGenericError(err, SystemError) } - fifoName := filepath.Join(containerRoot, execFifoFilename) - oldMask := syscall.Umask(0000) - if err := syscall.Mkfifo(fifoName, 0622); err != nil { - syscall.Umask(oldMask) - return nil, newGenericError(err, SystemError) - } - syscall.Umask(oldMask) - if err := os.Chown(fifoName, uid, gid); err != nil { - return nil, newGenericError(err, SystemError) + if config.Rootless { + RootlessCgroups(l) } c := &linuxContainer{ id: id, @@ -197,6 +213,10 @@ func (l *LinuxFactory) Load(id string) (Container, error) { processStartTime: state.InitProcessStartTime, fds: state.ExternalDescriptors, } + // We have to use the RootlessManager. + if state.Rootless { + RootlessCgroups(l) + } c := &linuxContainer{ initProcess: r, initProcessStartTime: state.InitProcessStartTime, @@ -222,54 +242,71 @@ func (l *LinuxFactory) Type() string { // StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state // This is a low level implementation detail of the reexec and should not be consumed externally func (l *LinuxFactory) StartInitialization() (err error) { - var pipefd, rootfd int - for k, v := range map[string]*int{ - "_LIBCONTAINER_INITPIPE": &pipefd, - "_LIBCONTAINER_STATEDIR": &rootfd, - } { - s := os.Getenv(k) + var ( + pipefd, rootfd int + consoleSocket *os.File + envInitPipe = os.Getenv("_LIBCONTAINER_INITPIPE") + envStateDir = os.Getenv("_LIBCONTAINER_STATEDIR") + envConsole = os.Getenv("_LIBCONTAINER_CONSOLE") + ) - i, err := strconv.Atoi(s) - if err != nil { - return fmt.Errorf("unable to convert %s=%s to int", k, s) - } - *v = i + // Get the INITPIPE. + pipefd, err = strconv.Atoi(envInitPipe) + if err != nil { + return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err) } + var ( pipe = os.NewFile(uintptr(pipefd), "pipe") it = initType(os.Getenv("_LIBCONTAINER_INITTYPE")) ) + defer pipe.Close() + + // Only init processes have STATEDIR. + rootfd = -1 + if it == initStandard { + if rootfd, err = strconv.Atoi(envStateDir); err != nil { + return fmt.Errorf("unable to convert _LIBCONTAINER_STATEDIR=%s to int: %s", envStateDir, err) + } + } + + if envConsole != "" { + console, err := strconv.Atoi(envConsole) + if err != nil { + return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE=%s to int: %s", envConsole, err) + } + consoleSocket = os.NewFile(uintptr(console), "console-socket") + defer consoleSocket.Close() + } + // clear the current process's environment to clean any libcontainer // specific env vars. os.Clearenv() - var i initer defer func() { // We have an error during the initialization of the container's init, // send it back to the parent process in the form of an initError. - // If container's init successed, syscall.Exec will not return, hence - // this defer function will never be called. - if _, ok := i.(*linuxStandardInit); ok { - // Synchronisation only necessary for standard init. - if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil { - panic(err) - } + if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil { + fmt.Fprintln(os.Stderr, err) + return } if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil { - panic(err) + fmt.Fprintln(os.Stderr, err) + return } - // ensure that this pipe is always closed - pipe.Close() }() defer func() { if e := recover(); e != nil { err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack())) } }() - i, err = newContainerInit(it, pipe, rootfd) + + i, err := newContainerInit(it, pipe, consoleSocket, rootfd) if err != nil { return err } + + // If Init succeeds, syscall.Exec will not return, hence none of the defers will be called. return i.Init() } @@ -277,7 +314,7 @@ func (l *LinuxFactory) loadState(root, id string) (*State, error) { f, err := os.Open(filepath.Join(root, stateFilename)) if err != nil { if os.IsNotExist(err) { - return nil, newGenericError(fmt.Errorf("container %q does not exists", id), ContainerNotExists) + return nil, newGenericError(fmt.Errorf("container %q does not exist", id), ContainerNotExists) } return nil, newGenericError(err, SystemError) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/generic_error.go b/vendor/github.com/opencontainers/runc/libcontainer/generic_error.go index de37715c..6e7de2fe 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/generic_error.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/generic_error.go @@ -9,20 +9,6 @@ import ( "github.com/opencontainers/runc/libcontainer/stacktrace" ) -type syncType uint8 - -const ( - procReady syncType = iota - procError - procRun - procHooks - procResume -) - -type syncT struct { - Type syncType `json:"type"` -} - var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}} Code: {{.ECode}} {{if .Message }} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go index b1e6762e..99cc02cb 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go @@ -6,12 +6,11 @@ import ( "encoding/json" "fmt" "io" - "io/ioutil" "net" "os" - "strconv" "strings" "syscall" + "unsafe" "github.com/Sirupsen/logrus" "github.com/opencontainers/runc/libcontainer/cgroups" @@ -44,29 +43,29 @@ type network struct { // initConfig is used for transferring parameters from Exec() to Init() type initConfig struct { - Args []string `json:"args"` - Env []string `json:"env"` - Cwd string `json:"cwd"` - Capabilities []string `json:"capabilities"` - ProcessLabel string `json:"process_label"` - AppArmorProfile string `json:"apparmor_profile"` - NoNewPrivileges bool `json:"no_new_privileges"` - User string `json:"user"` - AdditionalGroups []string `json:"additional_groups"` - Config *configs.Config `json:"config"` - Console string `json:"console"` - Networks []*network `json:"network"` - PassedFilesCount int `json:"passed_files_count"` - ContainerId string `json:"containerid"` - Rlimits []configs.Rlimit `json:"rlimits"` - ExecFifoPath string `json:"start_pipe_path"` + Args []string `json:"args"` + Env []string `json:"env"` + Cwd string `json:"cwd"` + Capabilities *configs.Capabilities `json:"capabilities"` + ProcessLabel string `json:"process_label"` + AppArmorProfile string `json:"apparmor_profile"` + NoNewPrivileges bool `json:"no_new_privileges"` + User string `json:"user"` + AdditionalGroups []string `json:"additional_groups"` + Config *configs.Config `json:"config"` + Networks []*network `json:"network"` + PassedFilesCount int `json:"passed_files_count"` + ContainerId string `json:"containerid"` + Rlimits []configs.Rlimit `json:"rlimits"` + CreateConsole bool `json:"create_console"` + Rootless bool `json:"rootless"` } type initer interface { Init() error } -func newContainerInit(t initType, pipe *os.File, stateDirFD int) (initer, error) { +func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, stateDirFD int) (initer, error) { var config *initConfig if err := json.NewDecoder(pipe).Decode(&config); err != nil { return nil, err @@ -77,14 +76,17 @@ func newContainerInit(t initType, pipe *os.File, stateDirFD int) (initer, error) switch t { case initSetns: return &linuxSetnsInit{ - config: config, + pipe: pipe, + consoleSocket: consoleSocket, + config: config, }, nil case initStandard: return &linuxStandardInit{ - pipe: pipe, - parentPid: syscall.Getppid(), - config: config, - stateDirFD: stateDirFD, + pipe: pipe, + consoleSocket: consoleSocket, + parentPid: syscall.Getppid(), + config: config, + stateDirFD: stateDirFD, }, nil } return nil, fmt.Errorf("unknown init type %q", t) @@ -116,16 +118,18 @@ func finalizeNamespace(config *initConfig) error { return err } - capabilities := config.Config.Capabilities + capabilities := &configs.Capabilities{} if config.Capabilities != nil { capabilities = config.Capabilities + } else if config.Config.Capabilities != nil { + capabilities = config.Config.Capabilities } - w, err := newCapWhitelist(capabilities) + w, err := newContainerCapList(capabilities) if err != nil { return err } // drop capabilities in bounding set before changing user - if err := w.dropBoundingSet(); err != nil { + if err := w.ApplyBoundingSet(); err != nil { return err } // preserve existing capabilities while we change users @@ -138,8 +142,7 @@ func finalizeNamespace(config *initConfig) error { if err := system.ClearKeepCaps(); err != nil { return err } - // drop all other capabilities - if err := w.drop(); err != nil { + if err := w.ApplyCaps(); err != nil { return err } if config.Cwd != "" { @@ -150,24 +153,59 @@ func finalizeNamespace(config *initConfig) error { return nil } +// setupConsole sets up the console from inside the container, and sends the +// master pty fd to the config.Pipe (using cmsg). This is done to ensure that +// consoles are scoped to a container properly (see runc#814 and the many +// issues related to that). This has to be run *after* we've pivoted to the new +// rootfs (and the users' configuration is entirely set up). +func setupConsole(socket *os.File, config *initConfig, mount bool) error { + defer socket.Close() + // At this point, /dev/ptmx points to something that we would expect. We + // used to change the owner of the slave path, but since the /dev/pts mount + // can have gid=X set (at the users' option). So touching the owner of the + // slave PTY is not necessary, as the kernel will handle that for us. Note + // however, that setupUser (specifically fixStdioPermissions) *will* change + // the UID owner of the console to be the user the process will run as (so + // they can actually control their console). + console, err := newConsole() + if err != nil { + return err + } + // After we return from here, we don't need the console anymore. + defer console.Close() + + linuxConsole, ok := console.(*linuxConsole) + if !ok { + return fmt.Errorf("failed to cast console to *linuxConsole") + } + // Mount the console inside our rootfs. + if mount { + if err := linuxConsole.mount(); err != nil { + return err + } + } + // While we can access console.master, using the API is a good idea. + if err := utils.SendFd(socket, linuxConsole.File()); err != nil { + return err + } + // Now, dup over all the things. + return linuxConsole.dupStdio() +} + // syncParentReady sends to the given pipe a JSON payload which indicates that // the init is ready to Exec the child process. It then waits for the parent to // indicate that it is cleared to Exec. func syncParentReady(pipe io.ReadWriter) error { // Tell parent. - if err := utils.WriteJSON(pipe, syncT{procReady}); err != nil { + if err := writeSync(pipe, procReady); err != nil { return err } + // Wait for parent to give the all-clear. - var procSync syncT - if err := json.NewDecoder(pipe).Decode(&procSync); err != nil { - if err == io.EOF { - return fmt.Errorf("parent closed synchronisation channel") - } - if procSync.Type != procRun { - return fmt.Errorf("invalid synchronisation flag from parent") - } + if err := readSync(pipe, procRun); err != nil { + return err } + return nil } @@ -176,19 +214,15 @@ func syncParentReady(pipe io.ReadWriter) error { // indicate that it is cleared to resume. func syncParentHooks(pipe io.ReadWriter) error { // Tell parent. - if err := utils.WriteJSON(pipe, syncT{procHooks}); err != nil { + if err := writeSync(pipe, procHooks); err != nil { return err } + // Wait for parent to give the all-clear. - var procSync syncT - if err := json.NewDecoder(pipe).Decode(&procSync); err != nil { - if err == io.EOF { - return fmt.Errorf("parent closed synchronisation channel") - } - if procSync.Type != procResume { - return fmt.Errorf("invalid synchronisation flag from parent") - } + if err := readSync(pipe, procResume); err != nil { + return err } + return nil } @@ -196,18 +230,21 @@ func syncParentHooks(pipe io.ReadWriter) error { func setupUser(config *initConfig) error { // Set up defaults. defaultExecUser := user.ExecUser{ - Uid: syscall.Getuid(), - Gid: syscall.Getgid(), + Uid: 0, + Gid: 0, Home: "/", } + passwdPath, err := user.GetPasswdPath() if err != nil { return err } + groupPath, err := user.GetGroupPath() if err != nil { return err } + execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath) if err != nil { return err @@ -220,22 +257,49 @@ func setupUser(config *initConfig) error { return err } } + + if config.Rootless { + if execUser.Uid != 0 { + return fmt.Errorf("cannot run as a non-root user in a rootless container") + } + + if execUser.Gid != 0 { + return fmt.Errorf("cannot run as a non-root group in a rootless container") + } + + // We cannot set any additional groups in a rootless container and thus we + // bail if the user asked us to do so. TODO: We currently can't do this + // earlier, but if libcontainer.Process.User was typesafe this might work. + if len(addGroups) > 0 { + return fmt.Errorf("cannot set any additional groups in a rootless container") + } + } + // before we change to the container's user make sure that the processes STDIO // is correctly owned by the user that we are switching to. - if err := fixStdioPermissions(execUser); err != nil { + if err := fixStdioPermissions(config, execUser); err != nil { return err } - suppGroups := append(execUser.Sgids, addGroups...) - if err := syscall.Setgroups(suppGroups); err != nil { - return err + + // This isn't allowed in an unprivileged user namespace since Linux 3.19. + // There's nothing we can do about /etc/group entries, so we silently + // ignore setting groups here (since the user didn't explicitly ask us to + // set the group). + if !config.Rootless { + suppGroups := append(execUser.Sgids, addGroups...) + if err := syscall.Setgroups(suppGroups); err != nil { + return err + } } if err := system.Setgid(execUser.Gid); err != nil { return err } + if err := system.Setuid(execUser.Uid); err != nil { return err } + // if we didn't get HOME already, set it based on the user's HOME if envHome := os.Getenv("HOME"); envHome == "" { if err := os.Setenv("HOME", execUser.Home); err != nil { @@ -248,7 +312,7 @@ func setupUser(config *initConfig) error { // fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user. // The ownership needs to match because it is created outside of the container and needs to be // localized. -func fixStdioPermissions(u *user.ExecUser) error { +func fixStdioPermissions(config *initConfig, u *user.ExecUser) error { var null syscall.Stat_t if err := syscall.Stat("/dev/null", &null); err != nil { return err @@ -262,11 +326,27 @@ func fixStdioPermissions(u *user.ExecUser) error { if err := syscall.Fstat(int(fd), &s); err != nil { return err } - // skip chown of /dev/null if it was used as one of the STDIO fds. + + // Skip chown of /dev/null if it was used as one of the STDIO fds. if s.Rdev == null.Rdev { continue } - if err := syscall.Fchown(int(fd), u.Uid, u.Gid); err != nil { + + // Skip chown if s.Gid is actually an unmapped gid in the host. While + // this is a bit dodgy if it just so happens that the console _is_ + // owned by overflow_gid, there's no way for us to disambiguate this as + // a userspace program. + if _, err := config.Config.HostGID(int(s.Gid)); err != nil { + continue + } + + // We only change the uid owner (as it is possible for the mount to + // prefer a different gid, and there's no reason for us to change it). + // The reason why we don't just leave the default uid=X mount setup is + // that users expect to be able to actually use their console. Without + // this code, you couldn't effectively run as a non-root user inside a + // container and also have a console set up. + if err := syscall.Fchown(int(fd), u.Uid, int(s.Gid)); err != nil { return err } } @@ -328,16 +408,51 @@ func setupRlimits(limits []configs.Rlimit, pid int) error { return nil } -func setOomScoreAdj(oomScoreAdj int, pid int) error { - path := fmt.Sprintf("/proc/%d/oom_score_adj", pid) +const _P_PID = 1 - return ioutil.WriteFile(path, []byte(strconv.Itoa(oomScoreAdj)), 0600) +type siginfo struct { + si_signo int32 + si_errno int32 + si_code int32 + // below here is a union; si_pid is the only field we use + si_pid int32 + // Pad to 128 bytes as detailed in blockUntilWaitable + pad [96]byte } -// killCgroupProcesses freezes then iterates over all the processes inside the -// manager's cgroups sending a SIGKILL to each process then waiting for them to -// exit. -func killCgroupProcesses(m cgroups.Manager) error { +// isWaitable returns true if the process has exited false otherwise. +// Its based off blockUntilWaitable in src/os/wait_waitid.go +func isWaitable(pid int) (bool, error) { + si := &siginfo{} + _, _, e := syscall.Syscall6(syscall.SYS_WAITID, _P_PID, uintptr(pid), uintptr(unsafe.Pointer(si)), syscall.WEXITED|syscall.WNOWAIT|syscall.WNOHANG, 0, 0) + if e != 0 { + return false, os.NewSyscallError("waitid", e) + } + + return si.si_pid != 0, nil +} + +// isNoChildren returns true if err represents a syscall.ECHILD false otherwise +func isNoChildren(err error) bool { + switch err := err.(type) { + case syscall.Errno: + if err == syscall.ECHILD { + return true + } + case *os.SyscallError: + if err.Err == syscall.ECHILD { + return true + } + } + return false +} + +// signalAllProcesses freezes then iterates over all the processes inside the +// manager's cgroups sending the signal s to them. +// If s is SIGKILL then it will wait for each process to exit. +// For all other signals it will check if the process is ready to report its +// exit status and only if it is will a wait be performed. +func signalAllProcesses(m cgroups.Manager, s os.Signal) error { var procs []*os.Process if err := m.Freeze(configs.Frozen); err != nil { logrus.Warn(err) @@ -354,16 +469,31 @@ func killCgroupProcesses(m cgroups.Manager) error { continue } procs = append(procs, p) - if err := p.Kill(); err != nil { + if err := p.Signal(s); err != nil { logrus.Warn(err) } } if err := m.Freeze(configs.Thawed); err != nil { logrus.Warn(err) } + for _, p := range procs { + if s != syscall.SIGKILL { + if ok, err := isWaitable(p.Pid); err != nil { + if !isNoChildren(err) { + logrus.Warn("signalAllProcesses: ", p.Pid, err) + } + continue + } else if !ok { + // Not ready to report so don't wait + continue + } + } + if _, err := p.Wait(); err != nil { - logrus.Warn(err) + if !isNoChildren(err) { + logrus.Warn("wait: ", err) + } } } return nil diff --git a/vendor/github.com/opencontainers/runc/libcontainer/label/label.go b/vendor/github.com/opencontainers/runc/libcontainer/label/label.go index 684bb41b..fddec463 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/label/label.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/label/label.go @@ -9,6 +9,10 @@ func InitLabels(options []string) (string, string, error) { return "", "", nil } +func GetROMountLabel() string { + return "" +} + func GenLabels(options string) (string, string, error) { return "", "", nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/label/label_selinux.go b/vendor/github.com/opencontainers/runc/libcontainer/label/label_selinux.go index 1d9d78a3..058c9229 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/label/label_selinux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/label/label_selinux.go @@ -55,6 +55,10 @@ func InitLabels(options []string) (string, string, error) { return processLabel, mountLabel, nil } +func GetROMountLabel() string { + return selinux.GetROFileLabel() +} + // DEPRECATED: The GenLabels function is only to be used during the transition to the official API. func GenLabels(options string) (string, string, error) { return InitLabels(strings.Fields(options)) @@ -138,7 +142,7 @@ func Relabel(path string, fileLabel string, shared bool) error { fileLabel = c.Get() } if err := selinux.Chcon(path, fileLabel, true); err != nil { - return fmt.Errorf("SELinux relabeling of %s is not allowed: %q", path, err) + return err } return nil } @@ -169,7 +173,7 @@ func UnreserveLabel(label string) error { return nil } -// DupSecOpt takes an process label and returns security options that +// DupSecOpt takes a process label and returns security options that // can be used to set duplicate labels on future container processes func DupSecOpt(src string) []string { return selinux.DupSecOpt(src) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go index 400bd362..bc725a22 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go @@ -13,11 +13,13 @@ import ( const ( InitMsg uint16 = 62000 CloneFlagsAttr uint16 = 27281 - ConsolePathAttr uint16 = 27282 - NsPathsAttr uint16 = 27283 - UidmapAttr uint16 = 27284 - GidmapAttr uint16 = 27285 - SetgroupAttr uint16 = 27286 + NsPathsAttr uint16 = 27282 + UidmapAttr uint16 = 27283 + GidmapAttr uint16 = 27284 + SetgroupAttr uint16 = 27285 + OomScoreAdjAttr uint16 = 27286 + RootlessAttr uint16 = 27287 + // When syscall.NLA_HDRLEN is in gccgo, take this out. syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1) ) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/process.go b/vendor/github.com/opencontainers/runc/libcontainer/process.go index 334add57..f1ad0814 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/process.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/process.go @@ -47,12 +47,9 @@ type Process struct { // ExtraFiles specifies additional open files to be inherited by the container ExtraFiles []*os.File - // consolePath is the path to the console allocated to the container. - consolePath string - // Capabilities specify the capabilities to keep when executing the process inside the container // All capabilities not specified will be dropped from the processes capability mask - Capabilities []string + Capabilities *configs.Capabilities // AppArmorProfile specifies the profile to apply to the process and is // changed at the time the process is execed @@ -68,6 +65,9 @@ type Process struct { // If Rlimits are not set, the container will inherit rlimits from the parent process Rlimits []configs.Rlimit + // ConsoleSocket provides the masterfd console. + ConsoleSocket *os.File + ops processOperations } @@ -104,22 +104,3 @@ type IO struct { Stdout io.ReadCloser Stderr io.ReadCloser } - -// NewConsole creates new console for process and returns it -func (p *Process) NewConsole(rootuid, rootgid int) (Console, error) { - console, err := NewConsole(rootuid, rootgid) - if err != nil { - return nil, err - } - p.consolePath = console.Path() - return console, nil -} - -// ConsoleFromPath sets the process's console with the path provided -func (p *Process) ConsoleFromPath(path string) error { - if p.consolePath != "" { - return newGenericError(fmt.Errorf("console path already exists for process"), ConsoleExists) - } - p.consolePath = path - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go index 5b81317f..bfe99551 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go @@ -51,7 +51,6 @@ type setnsProcess struct { fds []string process *Process bootstrapData io.Reader - rootDir *os.File } func (p *setnsProcess) startTime() (string, error) { @@ -70,7 +69,6 @@ func (p *setnsProcess) start() (err error) { defer p.parentPipe.Close() err = p.cmd.Start() p.childPipe.Close() - p.rootDir.Close() if err != nil { return newSystemErrorWithCause(err, "starting setns process") } @@ -82,15 +80,12 @@ func (p *setnsProcess) start() (err error) { if err = p.execSetns(); err != nil { return newSystemErrorWithCause(err, "executing setns process") } - if len(p.cgroupPaths) > 0 { + // We can't join cgroups if we're in a rootless container. + if !p.config.Rootless && len(p.cgroupPaths) > 0 { if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil { return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid()) } } - // set oom_score_adj - if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil { - return newSystemErrorWithCause(err, "setting oom score") - } // set rlimits, this has to be done here because we lose permissions // to raise the limits once we enter a user-namespace if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { @@ -100,15 +95,22 @@ func (p *setnsProcess) start() (err error) { return newSystemErrorWithCause(err, "writing config to pipe") } + ierr := parseSync(p.parentPipe, func(sync *syncT) error { + switch sync.Type { + case procReady: + // This shouldn't happen. + panic("unexpected procReady in setns") + case procHooks: + // This shouldn't happen. + panic("unexpected procHooks in setns") + default: + return newSystemError(fmt.Errorf("invalid JSON payload from child")) + } + }) + if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { return newSystemErrorWithCause(err, "calling shutdown on init pipe") } - // wait for the child process to fully complete and receive an error message - // if one was encoutered - var ierr *genericError - if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { - return newSystemErrorWithCause(err, "decoding init error from pipe") - } // Must be done after Shutdown so the child will exit and we can wait for it. if ierr != nil { p.wait() @@ -146,7 +148,7 @@ func (p *setnsProcess) execSetns() error { } // terminate sends a SIGKILL to the forked process for the setns routine then waits to -// avoid the process becomming a zombie. +// avoid the process becoming a zombie. func (p *setnsProcess) terminate() error { if p.cmd.Process == nil { return nil @@ -239,7 +241,7 @@ func (p *initProcess) start() error { return newSystemErrorWithCause(err, "starting init process command") } if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { - return err + return newSystemErrorWithCause(err, "copying bootstrap data to pipe") } if err := p.execSetns(); err != nil { return newSystemErrorWithCause(err, "running exec setns process for init") @@ -252,8 +254,9 @@ func (p *initProcess) start() error { return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid()) } p.setExternalDescriptors(fds) - // Do this before syncing with child so that no children - // can escape the cgroup + // Do this before syncing with child so that no children can escape the + // cgroup. We don't need to worry about not doing this and not being root + // because we'd be using the rootless cgroup manager in that case. if err := p.manager.Apply(p.pid()); err != nil { return newSystemErrorWithCause(err, "applying cgroup configuration for process") } @@ -264,36 +267,22 @@ func (p *initProcess) start() error { } }() if err := p.createNetworkInterfaces(); err != nil { - return newSystemErrorWithCause(err, "creating nework interfaces") + return newSystemErrorWithCause(err, "creating network interfaces") } if err := p.sendConfig(); err != nil { return newSystemErrorWithCause(err, "sending config to init process") } var ( - procSync syncT sentRun bool sentResume bool - ierr *genericError ) - dec := json.NewDecoder(p.parentPipe) -loop: - for { - if err := dec.Decode(&procSync); err != nil { - if err == io.EOF { - break loop - } - return newSystemErrorWithCause(err, "decoding sync type from init pipe") - } - switch procSync.Type { + ierr := parseSync(p.parentPipe, func(sync *syncT) error { + switch sync.Type { case procReady: if err := p.manager.Set(p.config.Config); err != nil { return newSystemErrorWithCause(err, "setting cgroup config for ready process") } - // set oom_score_adj - if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil { - return newSystemErrorWithCause(err, "setting oom score for ready process") - } // set rlimits, this has to be done here because we lose permissions // to raise the limits once we enter a user-namespace if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { @@ -306,7 +295,7 @@ loop: Version: p.container.config.Version, ID: p.container.id, Pid: p.pid(), - Root: p.config.Config.Rootfs, + Bundle: utils.SearchLabels(p.config.Config.Labels, "bundle"), } for i, hook := range p.config.Config.Hooks.Prestart { if err := hook.Run(s); err != nil { @@ -316,18 +305,17 @@ loop: } } // Sync with child. - if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil { - return newSystemErrorWithCause(err, "reading syncT run type") + if err := writeSync(p.parentPipe, procRun); err != nil { + return newSystemErrorWithCause(err, "writing syncT 'run'") } sentRun = true case procHooks: if p.config.Config.Hooks != nil { s := configs.HookState{ - Version: p.container.config.Version, - ID: p.container.id, - Pid: p.pid(), - Root: p.config.Config.Rootfs, - BundlePath: utils.SearchLabels(p.config.Config.Labels, "bundle"), + Version: p.container.config.Version, + ID: p.container.id, + Pid: p.pid(), + Bundle: utils.SearchLabels(p.config.Config.Labels, "bundle"), } for i, hook := range p.config.Config.Hooks.Prestart { if err := hook.Run(s); err != nil { @@ -336,25 +324,17 @@ loop: } } // Sync with child. - if err := utils.WriteJSON(p.parentPipe, syncT{procResume}); err != nil { - return newSystemErrorWithCause(err, "reading syncT resume type") + if err := writeSync(p.parentPipe, procResume); err != nil { + return newSystemErrorWithCause(err, "writing syncT 'resume'") } sentResume = true - case procError: - // wait for the child process to fully complete and receive an error message - // if one was encoutered - if err := dec.Decode(&ierr); err != nil && err != io.EOF { - return newSystemErrorWithCause(err, "decoding proc error from init") - } - if ierr != nil { - break loop - } - // Programmer error. - panic("No error following JSON procError payload.") default: return newSystemError(fmt.Errorf("invalid JSON payload from child")) } - } + + return nil + }) + if !sentRun { return newSystemErrorWithCause(ierr, "container init") } @@ -364,6 +344,7 @@ loop: if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { return newSystemErrorWithCause(err, "shutting down init pipe") } + // Must be done after Shutdown so the child will exit and we can wait for it. if ierr != nil { p.wait() @@ -379,7 +360,7 @@ func (p *initProcess) wait() (*os.ProcessState, error) { } // we should kill all processes in cgroup when init is died if we use host PID namespace if p.sharePidns { - killCgroupProcesses(p.manager) + signalAllProcesses(p.manager, syscall.SIGKILL) } return p.cmd.ProcessState, nil } @@ -440,9 +421,17 @@ func getPipeFds(pid int) ([]string, error) { dirPath := filepath.Join("/proc", strconv.Itoa(pid), "/fd") for i := 0; i < 3; i++ { + // XXX: This breaks if the path is not a valid symlink (which can + // happen in certain particularly unlucky mount namespace setups). f := filepath.Join(dirPath, strconv.Itoa(i)) target, err := os.Readlink(f) if err != nil { + // Ignore permission errors, for rootless containers and other + // non-dumpable processes. if we can't get the fd for a particular + // file, there's not much we can do. + if os.IsPermission(err) { + continue + } return fds, err } fds[i] = target @@ -450,8 +439,10 @@ func getPipeFds(pid int) ([]string, error) { return fds, nil } -// InitializeIO creates pipes for use with the process's STDIO -// and returns the opposite side for each +// InitializeIO creates pipes for use with the process's stdio and returns the +// opposite side for each. Do not use this if you want to have a pseudoterminal +// set up for you by libcontainer (TODO: fix that too). +// TODO: This is mostly unnecessary, and should be handled by clients. func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) { var fds []uintptr i = &IO{} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go index 67b7a275..b4948687 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go @@ -16,6 +16,7 @@ import ( "github.com/docker/docker/pkg/mount" "github.com/docker/docker/pkg/symlink" + "github.com/mrunalp/fileutils" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/label" @@ -35,9 +36,11 @@ func needsSetupDev(config *configs.Config) bool { return true } -// setupRootfs sets up the devices, mount points, and filesystems for use inside a -// new mount namespace. -func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWriter) (err error) { +// prepareRootfs sets up the devices, mount points, and filesystems for use +// inside a new mount namespace. It doesn't set anything as ro or pivot_root, +// because console setup happens inside the caller. You must call +// finalizeRootfs in order to finish the rootfs setup. +func prepareRootfs(pipe io.ReadWriter, config *configs.Config) (err error) { if err := prepareRoot(config); err != nil { return newSystemErrorWithCause(err, "preparing rootfs") } @@ -49,6 +52,7 @@ func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWrit return newSystemErrorWithCause(err, "running premount command") } } + if err := mountToRootfs(m, config.Rootfs, config.MountLabel); err != nil { return newSystemErrorWithCausef(err, "mounting %q to rootfs %q at %q", m.Source, config.Rootfs, m.Destination) } @@ -59,17 +63,19 @@ func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWrit } } } + if setupDev { if err := createDevices(config); err != nil { return newSystemErrorWithCause(err, "creating device nodes") } - if err := setupPtmx(config, console); err != nil { + if err := setupPtmx(config); err != nil { return newSystemErrorWithCause(err, "setting up ptmx") } if err := setupDevSymlinks(config.Rootfs); err != nil { return newSystemErrorWithCause(err, "setting up /dev symlinks") } } + // Signal the parent to run the pre-start hooks. // The hooks are run after the mounts are setup, but before we switch to the new // root, so that the old root is still available in the hooks for any mount @@ -77,39 +83,59 @@ func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWrit if err := syncParentHooks(pipe); err != nil { return err } + + // The reason these operations are done here rather than in finalizeRootfs + // is because the console-handling code gets quite sticky if we have to set + // up the console before doing the pivot_root(2). This is because the + // Console API has to also work with the ExecIn case, which means that the + // API must be able to deal with being inside as well as outside the + // container. It's just cleaner to do this here (at the expense of the + // operation not being perfectly split). + if err := syscall.Chdir(config.Rootfs); err != nil { return newSystemErrorWithCausef(err, "changing dir to %q", config.Rootfs) } + if config.NoPivotRoot { err = msMoveRoot(config.Rootfs) } else { - err = pivotRoot(config.Rootfs, config.PivotDir) + err = pivotRoot(config.Rootfs) } if err != nil { return newSystemErrorWithCause(err, "jailing process inside rootfs") } + if setupDev { if err := reOpenDevNull(); err != nil { return newSystemErrorWithCause(err, "reopening /dev/null inside container") } } - // remount dev as ro if specifed + + return nil +} + +// finalizeRootfs actually switches the root of the process and sets anything +// to ro if necessary. You must call prepareRootfs first. +func finalizeRootfs(config *configs.Config) (err error) { + // remount dev as ro if specified for _, m := range config.Mounts { if libcontainerUtils.CleanPath(m.Destination) == "/dev" { - if m.Flags&syscall.MS_RDONLY != 0 { - if err := remountReadonly(m.Destination); err != nil { + if m.Flags&syscall.MS_RDONLY == syscall.MS_RDONLY { + if err := remountReadonly(m); err != nil { return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination) } } break } } + // set rootfs ( / ) as readonly if config.Readonlyfs { if err := setReadonly(); err != nil { return newSystemErrorWithCause(err, "setting rootfs as readonly") } } + syscall.Umask(0022) return nil } @@ -152,15 +178,41 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error { } return nil case "tmpfs": + copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP + tmpDir := "" stat, err := os.Stat(dest) if err != nil { if err := os.MkdirAll(dest, 0755); err != nil { return err } } + if copyUp { + tmpDir, err = ioutil.TempDir("/tmp", "runctmpdir") + if err != nil { + return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir") + } + defer os.RemoveAll(tmpDir) + m.Destination = tmpDir + } if err := mountPropagate(m, rootfs, mountLabel); err != nil { return err } + if copyUp { + if err := fileutils.CopyDirectory(dest, tmpDir); err != nil { + errMsg := fmt.Errorf("tmpcopyup: failed to copy %s to %s: %v", dest, tmpDir, err) + if err1 := syscall.Unmount(tmpDir, syscall.MNT_DETACH); err1 != nil { + return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg) + } + return errMsg + } + if err := syscall.Mount(tmpDir, dest, "", syscall.MS_MOVE, ""); err != nil { + errMsg := fmt.Errorf("tmpcopyup: failed to move mount %s to %s: %v", tmpDir, dest, err) + if err1 := syscall.Unmount(tmpDir, syscall.MNT_DETACH); err1 != nil { + return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg) + } + return errMsg + } + } if stat != nil { if err = os.Chmod(dest, stat.Mode()); err != nil { return err @@ -178,7 +230,7 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error { // any previous mounts can invalidate the next mount's destination. // this can happen when a user specifies mounts within other mounts to cause breakouts or other // evil stuff to try to escape the container's rootfs. - if dest, err = symlink.FollowSymlinkInScope(filepath.Join(rootfs, m.Destination), rootfs); err != nil { + if dest, err = symlink.FollowSymlinkInScope(dest, rootfs); err != nil { return err } if err := checkMountDestination(rootfs, dest); err != nil { @@ -261,6 +313,19 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error { } } default: + // ensure that the destination of the mount is resolved of symlinks at mount time because + // any previous mounts can invalidate the next mount's destination. + // this can happen when a user specifies mounts within other mounts to cause breakouts or other + // evil stuff to try to escape the container's rootfs. + var err error + if dest, err = symlink.FollowSymlinkInScope(dest, rootfs); err != nil { + return err + } + if err := checkMountDestination(rootfs, dest); err != nil { + return err + } + // update the mount with the correct dest after symlinks are resolved. + m.Destination = dest if err := os.MkdirAll(dest, 0755); err != nil { return err } @@ -283,7 +348,7 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) { var binds []*configs.Mount for _, mm := range mounts { - dir, err := mm.GetThisCgroupDir(cgroupPaths) + dir, err := mm.GetOwnCgroup(cgroupPaths) if err != nil { return nil, err } @@ -294,7 +359,7 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) { binds = append(binds, &configs.Mount{ Device: "bind", Source: filepath.Join(mm.Mountpoint, relDir), - Destination: filepath.Join(m.Destination, strings.Join(mm.Subsystems, ",")), + Destination: filepath.Join(m.Destination, filepath.Base(mm.Mountpoint)), Flags: syscall.MS_BIND | syscall.MS_REC | m.Flags, PropagationFlags: m.PropagationFlags, }) @@ -306,9 +371,6 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) { // checkMountDestination checks to ensure that the mount destination is not over the top of /proc. // dest is required to be an abs path and have any symlinks resolved before calling this function. func checkMountDestination(rootfs, dest string) error { - if libcontainerUtils.CleanPath(rootfs) == libcontainerUtils.CleanPath(dest) { - return fmt.Errorf("mounting into / is prohibited") - } invalidDestinations := []string{ "/proc", } @@ -448,10 +510,12 @@ func createDeviceNode(rootfs string, node *configs.Device, bind bool) error { func mknodDevice(dest string, node *configs.Device) error { fileMode := node.FileMode switch node.Type { - case 'c': + case 'c', 'u': fileMode |= syscall.S_IFCHR case 'b': fileMode |= syscall.S_IFBLK + case 'p': + fileMode |= syscall.S_IFIFO default: return fmt.Errorf("%c is not a valid device type for device %s", node.Type, node.Path) } @@ -539,10 +603,12 @@ func prepareRoot(config *configs.Config) error { if err := syscall.Mount("", "/", "", uintptr(flag), ""); err != nil { return err } - if config.NoPivotRoot { - if err := rootfsParentMountPrivate(config.Rootfs); err != nil { - return err - } + + // Make parent mount private to make sure following bind mount does + // not propagate in other namespaces. Also it will help with kernel + // check pass in pivot_root. (IS_SHARED(new_mnt->mnt_parent)) + if err := rootfsParentMountPrivate(config.Rootfs); err != nil { + return err } return syscall.Mount(config.Rootfs, config.Rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, "") @@ -552,7 +618,7 @@ func setReadonly() error { return syscall.Mount("/", "/", "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, "") } -func setupPtmx(config *configs.Config, console *linuxConsole) error { +func setupPtmx(config *configs.Config) error { ptmx := filepath.Join(config.Rootfs, "dev/ptmx") if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { return err @@ -560,54 +626,61 @@ func setupPtmx(config *configs.Config, console *linuxConsole) error { if err := os.Symlink("pts/ptmx", ptmx); err != nil { return fmt.Errorf("symlink dev ptmx %s", err) } - if console != nil { - return console.mount(config.Rootfs, config.MountLabel) - } return nil } -func pivotRoot(rootfs, pivotBaseDir string) (err error) { - if pivotBaseDir == "" { - pivotBaseDir = "/" - } - tmpDir := filepath.Join(rootfs, pivotBaseDir) - if err := os.MkdirAll(tmpDir, 0755); err != nil { - return fmt.Errorf("can't create tmp dir %s, error %v", tmpDir, err) - } - pivotDir, err := ioutil.TempDir(tmpDir, ".pivot_root") - if err != nil { - return fmt.Errorf("can't create pivot_root dir %s, error %v", pivotDir, err) - } - defer func() { - errVal := os.Remove(pivotDir) - if err == nil { - err = errVal - } - }() - if err := syscall.PivotRoot(rootfs, pivotDir); err != nil { - // Make the parent mount private - if err := rootfsParentMountPrivate(rootfs); err != nil { - return err - } - // Try again - if err := syscall.PivotRoot(rootfs, pivotDir); err != nil { - return fmt.Errorf("pivot_root %s", err) - } - } - if err := syscall.Chdir("/"); err != nil { - return fmt.Errorf("chdir / %s", err) - } - // path to pivot dir now changed, update - pivotDir = filepath.Join(pivotBaseDir, filepath.Base(pivotDir)) +// pivotRoot will call pivot_root such that rootfs becomes the new root +// filesystem, and everything else is cleaned up. +func pivotRoot(rootfs string) error { + // While the documentation may claim otherwise, pivot_root(".", ".") is + // actually valid. What this results in is / being the new root but + // /proc/self/cwd being the old root. Since we can play around with the cwd + // with pivot_root this allows us to pivot without creating directories in + // the rootfs. Shout-outs to the LXC developers for giving us this idea. - // Make pivotDir rprivate to make sure any of the unmounts don't - // propagate to parent. - if err := syscall.Mount("", pivotDir, "", syscall.MS_PRIVATE|syscall.MS_REC, ""); err != nil { + oldroot, err := syscall.Open("/", syscall.O_DIRECTORY|syscall.O_RDONLY, 0) + if err != nil { + return err + } + defer syscall.Close(oldroot) + + newroot, err := syscall.Open(rootfs, syscall.O_DIRECTORY|syscall.O_RDONLY, 0) + if err != nil { + return err + } + defer syscall.Close(newroot) + + // Change to the new root so that the pivot_root actually acts on it. + if err := syscall.Fchdir(newroot); err != nil { return err } - if err := syscall.Unmount(pivotDir, syscall.MNT_DETACH); err != nil { - return fmt.Errorf("unmount pivot_root dir %s", err) + if err := syscall.PivotRoot(".", "."); err != nil { + return fmt.Errorf("pivot_root %s", err) + } + + // Currently our "." is oldroot (according to the current kernel code). + // However, purely for safety, we will fchdir(oldroot) since there isn't + // really any guarantee from the kernel what /proc/self/cwd will be after a + // pivot_root(2). + + if err := syscall.Fchdir(oldroot); err != nil { + return err + } + + // Make oldroot rprivate to make sure our unmounts don't propagate to the + // host (and thus bork the machine). + if err := syscall.Mount("", ".", "", syscall.MS_PRIVATE|syscall.MS_REC, ""); err != nil { + return err + } + // Preform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd. + if err := syscall.Unmount(".", syscall.MNT_DETACH); err != nil { + return err + } + + // Switch back to our shiny new root. + if err := syscall.Chdir("/"); err != nil { + return fmt.Errorf("chdir / %s", err) } return nil } @@ -642,17 +715,26 @@ func createIfNotExists(path string, isDir bool) error { return nil } -// remountReadonly will bind over the top of an existing path and ensure that it is read-only. -func remountReadonly(path string) error { +// readonlyPath will make a path read only. +func readonlyPath(path string) error { + if err := syscall.Mount(path, path, "", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + return syscall.Mount(path, path, "", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, "") +} + +// remountReadonly will remount an existing mount point and ensure that it is read-only. +func remountReadonly(m *configs.Mount) error { + var ( + dest = m.Destination + flags = m.Flags + ) for i := 0; i < 5; i++ { - if err := syscall.Mount("", path, "", syscall.MS_REMOUNT|syscall.MS_RDONLY, ""); err != nil && !os.IsNotExist(err) { + if err := syscall.Mount("", dest, "", uintptr(flags|syscall.MS_REMOUNT|syscall.MS_RDONLY), ""); err != nil { switch err { - case syscall.EINVAL: - // Probably not a mountpoint, use bind-mount - if err := syscall.Mount(path, path, "", syscall.MS_BIND, ""); err != nil { - return err - } - return syscall.Mount(path, path, "", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC|defaultMountFlags, "") case syscall.EBUSY: time.Sleep(100 * time.Millisecond) continue @@ -662,13 +744,19 @@ func remountReadonly(path string) error { } return nil } - return fmt.Errorf("unable to mount %s as readonly max retries reached", path) + return fmt.Errorf("unable to mount %s as readonly max retries reached", dest) } -// maskFile bind mounts /dev/null over the top of the specified path inside a container -// to avoid security issues from processes reading information from non-namespace aware mounts ( proc/kcore ). -func maskFile(path string) error { +// maskPath masks the top of the specified path inside a container to avoid +// security issues from processes reading information from non-namespace aware +// mounts ( proc/kcore ). +// For files, maskPath bind mounts /dev/null over the top of the specified path. +// For directories, maskPath mounts read-only tmpfs over the top of the specified path. +func maskPath(path string) error { if err := syscall.Mount("/dev/null", path, "", syscall.MS_BIND, ""); err != nil && !os.IsNotExist(err) { + if err == syscall.ENOTDIR { + return syscall.Mount("tmpfs", path, "tmpfs", syscall.MS_RDONLY, "") + } return err } return nil @@ -705,7 +793,9 @@ func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error { if libcontainerUtils.CleanPath(dest) == "/dev" { flags &= ^syscall.MS_RDONLY } - if !strings.HasPrefix(dest, rootfs) { + + copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP + if !(copyUp || strings.HasPrefix(dest, rootfs)) { dest = filepath.Join(rootfs, dest) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go index ec0ac1c0..518d2c38 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go @@ -212,10 +212,6 @@ func parseStatusFile(path string) (map[string]string, error) { status := make(map[string]string) for s.Scan() { - if err := s.Err(); err != nil { - return nil, err - } - text := s.Text() parts := strings.Split(text, ":") @@ -225,5 +221,9 @@ func parseStatusFile(path string) (map[string]string, error) { status[parts[0]] = parts[1] } + if err := s.Err(); err != nil { + return nil, err + } + return status, nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/selinux/selinux.go b/vendor/github.com/opencontainers/runc/libcontainer/selinux/selinux.go index 2a18e2ad..5bd028bf 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/selinux/selinux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/selinux/selinux.go @@ -32,33 +32,73 @@ const ( stRdOnly = 0x01 ) +type selinuxState struct { + enabledSet bool + enabled bool + selinuxfsSet bool + selinuxfs string + mcsList map[string]bool + sync.Mutex +} + var ( - assignRegex = regexp.MustCompile(`^([^=]+)=(.*)$`) - mcsList = make(map[string]bool) - mcsLock sync.Mutex - selinuxfs = "unknown" - selinuxEnabled = false // Stores whether selinux is currently enabled - selinuxEnabledChecked = false // Stores whether selinux enablement has been checked or established yet + assignRegex = regexp.MustCompile(`^([^=]+)=(.*)$`) + state = selinuxState{ + mcsList: make(map[string]bool), + } ) type SELinuxContext map[string]string -// SetDisabled disables selinux support for the package -func SetDisabled() { - selinuxEnabled, selinuxEnabledChecked = false, true +func (s *selinuxState) setEnable(enabled bool) bool { + s.Lock() + defer s.Unlock() + s.enabledSet = true + s.enabled = enabled + return s.enabled } -// getSelinuxMountPoint returns the path to the mountpoint of an selinuxfs -// filesystem or an empty string if no mountpoint is found. Selinuxfs is -// a proc-like pseudo-filesystem that exposes the selinux policy API to -// processes. The existence of an selinuxfs mount is used to determine -// whether selinux is currently enabled or not. -func getSelinuxMountPoint() string { - if selinuxfs != "unknown" { +func (s *selinuxState) getEnabled() bool { + s.Lock() + enabled := s.enabled + enabledSet := s.enabledSet + s.Unlock() + if enabledSet { + return enabled + } + + enabled = false + if fs := getSelinuxMountPoint(); fs != "" { + if con, _ := Getcon(); con != "kernel" { + enabled = true + } + } + return s.setEnable(enabled) +} + +// SetDisabled disables selinux support for the package +func SetDisabled() { + state.setEnable(false) +} + +func (s *selinuxState) setSELinuxfs(selinuxfs string) string { + s.Lock() + defer s.Unlock() + s.selinuxfsSet = true + s.selinuxfs = selinuxfs + return s.selinuxfs +} + +func (s *selinuxState) getSELinuxfs() string { + s.Lock() + selinuxfs := s.selinuxfs + selinuxfsSet := s.selinuxfsSet + s.Unlock() + if selinuxfsSet { return selinuxfs } - selinuxfs = "" + selinuxfs = "" f, err := os.Open("/proc/self/mountinfo") if err != nil { return selinuxfs @@ -91,21 +131,21 @@ func getSelinuxMountPoint() string { selinuxfs = "" } } - return selinuxfs + return s.setSELinuxfs(selinuxfs) +} + +// getSelinuxMountPoint returns the path to the mountpoint of an selinuxfs +// filesystem or an empty string if no mountpoint is found. Selinuxfs is +// a proc-like pseudo-filesystem that exposes the selinux policy API to +// processes. The existence of an selinuxfs mount is used to determine +// whether selinux is currently enabled or not. +func getSelinuxMountPoint() string { + return state.getSELinuxfs() } // SelinuxEnabled returns whether selinux is currently enabled. func SelinuxEnabled() bool { - if selinuxEnabledChecked { - return selinuxEnabled - } - selinuxEnabledChecked = true - if fs := getSelinuxMountPoint(); fs != "" { - if con, _ := Getcon(); con != "kernel" { - selinuxEnabled = true - } - } - return selinuxEnabled + return state.getEnabled() } func readConfig(target string) (value string) { @@ -283,19 +323,19 @@ func SelinuxGetEnforceMode() int { } func mcsAdd(mcs string) error { - mcsLock.Lock() - defer mcsLock.Unlock() - if mcsList[mcs] { + state.Lock() + defer state.Unlock() + if state.mcsList[mcs] { return fmt.Errorf("MCS Label already exists") } - mcsList[mcs] = true + state.mcsList[mcs] = true return nil } func mcsDelete(mcs string) { - mcsLock.Lock() - mcsList[mcs] = false - mcsLock.Unlock() + state.Lock() + defer state.Unlock() + state.mcsList[mcs] = false } func IntToMcs(id int, catRange uint32) string { @@ -334,9 +374,7 @@ func uniqMcs(catRange uint32) string { continue } else { if c1 > c2 { - t := c1 - c1 = c2 - c2 = t + c1, c2 = c2, c1 } } mcs = fmt.Sprintf("s0:c%d,c%d", c1, c2) @@ -355,6 +393,12 @@ func FreeLxcContexts(scon string) { } } +var roFileLabel string + +func GetROFileLabel() (fileLabel string) { + return roFileLabel +} + func GetLxcContexts() (processLabel string, fileLabel string) { var ( val, key string @@ -399,6 +443,9 @@ func GetLxcContexts() (processLabel string, fileLabel string) { if key == "file" { fileLabel = strings.Trim(val, "\"") } + if key == "ro_file" { + roFileLabel = strings.Trim(val, "\"") + } } } @@ -406,6 +453,9 @@ func GetLxcContexts() (processLabel string, fileLabel string) { return "", "" } + if roFileLabel == "" { + roFileLabel = fileLabel + } exit: // mcs := IntToMcs(os.Getpid(), 1024) mcs := uniqMcs(1024) @@ -446,7 +496,7 @@ func badPrefix(fpath string) error { for _, prefix := range badprefixes { if fpath == prefix || strings.HasPrefix(fpath, fmt.Sprintf("%s/", prefix)) { - return fmt.Errorf("Relabeling content in %s is not allowed.", prefix) + return fmt.Errorf("relabeling content in %s is not allowed", prefix) } } return nil @@ -486,14 +536,14 @@ func DupSecOpt(src string) []string { con["level"] == "" { return nil } - return []string{"label=user:" + con["user"], - "label=role:" + con["role"], - "label=type:" + con["type"], - "label=level:" + con["level"]} + return []string{"user:" + con["user"], + "role:" + con["role"], + "type:" + con["type"], + "level:" + con["level"]} } // DisableSecOpt returns a security opt that can be used to disabling SELinux // labeling support for future container processes func DisableSecOpt() []string { - return []string{"label=disable"} + return []string{"disable"} } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go index 2a8f3452..f6e8998b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go @@ -16,7 +16,9 @@ import ( // linuxSetnsInit performs the container's initialization for running a new process // inside an existing container. type linuxSetnsInit struct { - config *initConfig + pipe *os.File + consoleSocket *os.File + config *initConfig } func (l *linuxSetnsInit) getSessionRingName() string { @@ -30,6 +32,14 @@ func (l *linuxSetnsInit) Init() error { return err } } + if l.config.CreateConsole { + if err := setupConsole(l.consoleSocket, l.config, false); err != nil { + return err + } + if err := system.Setctty(); err != nil { + return err + } + } if l.config.NoNewPrivileges { if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { return err diff --git a/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go index 87515e1e..d9cc5e50 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go @@ -4,7 +4,6 @@ package libcontainer import ( "fmt" - "io" "os" "os/exec" "syscall" @@ -18,10 +17,11 @@ import ( ) type linuxStandardInit struct { - pipe io.ReadWriteCloser - parentPid int - stateDirFD int - config *initConfig + pipe *os.File + consoleSocket *os.File + parentPid int + stateDirFD int + config *initConfig } func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) { @@ -59,18 +59,6 @@ func (l *linuxStandardInit) Init() error { } } - var console *linuxConsole - if l.config.Console != "" { - console = newConsoleFromPath(l.config.Console) - if err := console.dupStdio(); err != nil { - return err - } - } - if console != nil { - if err := system.Setctty(); err != nil { - return err - } - } if err := setupNetwork(l.config); err != nil { return err } @@ -79,12 +67,33 @@ func (l *linuxStandardInit) Init() error { } label.Init() - // InitializeMountNamespace() can be executed only for a new mount namespace + + // prepareRootfs() can be executed only for a new mount namespace. if l.config.Config.Namespaces.Contains(configs.NEWNS) { - if err := setupRootfs(l.config.Config, console, l.pipe); err != nil { + if err := prepareRootfs(l.pipe, l.config.Config); err != nil { return err } } + + // Set up the console. This has to be done *before* we finalize the rootfs, + // but *after* we've given the user the chance to set up all of the mounts + // they wanted. + if l.config.CreateConsole { + if err := setupConsole(l.consoleSocket, l.config, true); err != nil { + return err + } + if err := system.Setctty(); err != nil { + return err + } + } + + // Finish the rootfs setup. + if l.config.Config.Namespaces.Contains(configs.NEWNS) { + if err := finalizeRootfs(l.config.Config); err != nil { + return err + } + } + if hostname := l.config.Config.Hostname; hostname != "" { if err := syscall.Sethostname([]byte(hostname)); err != nil { return err @@ -103,12 +112,12 @@ func (l *linuxStandardInit) Init() error { } } for _, path := range l.config.Config.ReadonlyPaths { - if err := remountReadonly(path); err != nil { + if err := readonlyPath(path); err != nil { return err } } for _, path := range l.config.Config.MaskPaths { - if err := maskFile(path); err != nil { + if err := maskPath(path); err != nil { return err } } @@ -143,7 +152,7 @@ func (l *linuxStandardInit) Init() error { if err := pdeath.Restore(); err != nil { return err } - // compare the parent from the inital start of the init process and make sure that it did not change. + // compare the parent from the initial start of the init process and make sure that it did not change. // if the parent changes that means it died and we were reparented to something else so we should // just kill ourself and not cause problems for someone else. if syscall.Getppid() != l.parentPid { @@ -171,6 +180,9 @@ func (l *linuxStandardInit) Init() error { return newSystemErrorWithCause(err, "init seccomp") } } + // close the statedir fd before exec because the kernel resets dumpable in the wrong order + // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318 + syscall.Close(l.stateDirFD) if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil { return newSystemErrorWithCause(err, "exec user process") } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go index 26628240..62878acf 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go @@ -39,7 +39,7 @@ type containerState interface { func destroy(c *linuxContainer) error { if !c.config.Namespaces.Contains(configs.NEWPID) { - if err := killCgroupProcesses(c.cgroupManager); err != nil { + if err := signalAllProcesses(c.cgroupManager, syscall.SIGKILL); err != nil { logrus.Warn(err) } } @@ -58,10 +58,9 @@ func destroy(c *linuxContainer) error { func runPoststopHooks(c *linuxContainer) error { if c.config.Hooks != nil { s := configs.HookState{ - Version: c.config.Version, - ID: c.id, - Root: c.config.Rootfs, - BundlePath: utils.SearchLabels(c.config.Labels, "bundle"), + Version: c.config.Version, + ID: c.id, + Bundle: utils.SearchLabels(c.config.Labels, "bundle"), } for _, hook := range c.config.Hooks.Poststop { if err := hook.Run(s); err != nil { @@ -83,10 +82,7 @@ func (b *stoppedState) status() Status { func (b *stoppedState) transition(s containerState) error { switch s.(type) { - case *runningState: - b.c.state = s - return nil - case *restoredState: + case *runningState, *restoredState: b.c.state = s return nil case *stoppedState: @@ -199,7 +195,7 @@ func (p *pausedState) destroy() error { return newGenericError(fmt.Errorf("container is paused"), ContainerPaused) } -// restoredState is the same as the running state but also has accociated checkpoint +// restoredState is the same as the running state but also has associated checkpoint // information that maybe need destroyed when the container is stopped and destroy is called. type restoredState struct { imageDir string @@ -212,9 +208,7 @@ func (r *restoredState) status() Status { func (r *restoredState) transition(s containerState) error { switch s.(type) { - case *stoppedState: - return nil - case *runningState: + case *stoppedState, *runningState: return nil } return newStateTransitionError(r, s) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/sync.go b/vendor/github.com/opencontainers/runc/libcontainer/sync.go new file mode 100644 index 00000000..cf7b45bc --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/sync.go @@ -0,0 +1,107 @@ +package libcontainer + +import ( + "encoding/json" + "fmt" + "io" + + "github.com/opencontainers/runc/libcontainer/utils" +) + +type syncType string + +// Constants that are used for synchronisation between the parent and child +// during container setup. They come in pairs (with procError being a generic +// response which is followed by a &genericError). +// +// [ child ] <-> [ parent ] +// +// procHooks --> [run hooks] +// <-- procResume +// +// procConsole --> +// <-- procConsoleReq +// [send(fd)] --> [recv(fd)] +// <-- procConsoleAck +// +// procReady --> [final setup] +// <-- procRun +const ( + procError syncType = "procError" + procReady syncType = "procReady" + procRun syncType = "procRun" + procHooks syncType = "procHooks" + procResume syncType = "procResume" +) + +type syncT struct { + Type syncType `json:"type"` +} + +// writeSync is used to write to a synchronisation pipe. An error is returned +// if there was a problem writing the payload. +func writeSync(pipe io.Writer, sync syncType) error { + if err := utils.WriteJSON(pipe, syncT{sync}); err != nil { + return err + } + return nil +} + +// readSync is used to read from a synchronisation pipe. An error is returned +// if we got a genericError, the pipe was closed, or we got an unexpected flag. +func readSync(pipe io.Reader, expected syncType) error { + var procSync syncT + if err := json.NewDecoder(pipe).Decode(&procSync); err != nil { + if err == io.EOF { + return fmt.Errorf("parent closed synchronisation channel") + } + + if procSync.Type == procError { + var ierr genericError + + if err := json.NewDecoder(pipe).Decode(&ierr); err != nil { + return fmt.Errorf("failed reading error from parent: %v", err) + } + + return &ierr + } + + if procSync.Type != expected { + return fmt.Errorf("invalid synchronisation flag from parent") + } + } + return nil +} + +// parseSync runs the given callback function on each syncT received from the +// child. It will return once io.EOF is returned from the given pipe. +func parseSync(pipe io.Reader, fn func(*syncT) error) error { + dec := json.NewDecoder(pipe) + for { + var sync syncT + if err := dec.Decode(&sync); err != nil { + if err == io.EOF { + break + } + return err + } + + // We handle this case outside fn for cleanliness reasons. + var ierr *genericError + if sync.Type == procError { + if err := dec.Decode(&ierr); err != nil && err != io.EOF { + return newSystemErrorWithCause(err, "decoding proc error from init") + } + if ierr != nil { + return ierr + } + // Programmer error. + panic("No error following JSON procError payload.") + } + + if err := fn(&sync); err != nil { + return err + } + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go b/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go index 37808a29..a0e96371 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go @@ -14,8 +14,10 @@ func GetProcessStartTime(pid int) (string, error) { if err != nil { return "", err } + return parseStartTime(string(data)) +} - parts := strings.Split(string(data), " ") +func parseStartTime(stat string) (string, error) { // the starttime is located at pos 22 // from the man page // @@ -23,5 +25,19 @@ func GetProcessStartTime(pid int) (string, error) { // (22) The time the process started after system boot. In kernels before Linux 2.6, this // value was expressed in jiffies. Since Linux 2.6, the value is expressed in clock ticks // (divide by sysconf(_SC_CLK_TCK)). - return parts[22-1], nil // starts at 1 + // + // NOTE: + // pos 2 could contain space and is inside `(` and `)`: + // (2) comm %s + // The filename of the executable, in parentheses. + // This is visible whether or not the executable is + // swapped out. + // + // the following is an example: + // 89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 29689 28896 0 3 146 32 76 19 20 0 1 0 2971844 52965376 3920 18446744073709551615 1 1 0 0 0 0 0 16781312 137447943 0 0 0 17 1 0 0 0 0 0 0 0 0 0 0 0 0 0 + + // get parts after last `)`: + s := strings.Split(stat, ")") + parts := strings.Split(strings.TrimSpace(s[len(s)-1]), " ") + return parts[22-3], nil // starts at 3 (after the filename pos `2`) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go index 43fd39ef..8962cab3 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go @@ -199,18 +199,16 @@ type ExecUser struct { // files cannot be opened for any reason, the error is ignored and a nil // io.Reader is passed instead. func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath string) (*ExecUser, error) { - passwd, err := os.Open(passwdPath) - if err != nil { - passwd = nil - } else { - defer passwd.Close() + var passwd, group io.Reader + + if passwdFile, err := os.Open(passwdPath); err == nil { + passwd = passwdFile + defer passwdFile.Close() } - group, err := os.Open(groupPath) - if err != nil { - group = nil - } else { - defer group.Close() + if groupFile, err := os.Open(groupPath); err == nil { + group = groupFile + defer groupFile.Close() } return GetExecUser(userSpec, defaults, passwd, group) @@ -343,7 +341,7 @@ func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) ( if len(groups) > 0 { // First match wins, even if there's more than one matching entry. user.Gid = groups[0].Gid - } else if groupArg != "" { + } else { // If we can't find a group with the given name, the only other valid // option is if it's a numeric group name with no associated entry in group. @@ -433,9 +431,11 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err // that opens the groupPath given and gives it as an argument to // GetAdditionalGroups. func GetAdditionalGroupsPath(additionalGroups []string, groupPath string) ([]int, error) { - group, err := os.Open(groupPath) - if err == nil { - defer group.Close() + var group io.Reader + + if groupFile, err := os.Open(groupPath); err == nil { + group = groupFile + defer groupFile.Close() } return GetAdditionalGroups(additionalGroups, group) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.c b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.c new file mode 100644 index 00000000..0ded4944 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.c @@ -0,0 +1,148 @@ +/* + * Copyright 2016 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "cmsg.h" + +#define error(fmt, ...) \ + ({ \ + fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \ + errno = ECOMM; \ + goto err; /* return value */ \ + }) + +/* + * Sends a file descriptor along the sockfd provided. Returns the return + * value of sendmsg(2). Any synchronisation and preparation of state + * should be done external to this (we expect the other side to be in + * recvfd() in the code). + */ +ssize_t sendfd(int sockfd, struct file_t file) +{ + struct msghdr msg = {0}; + struct iovec iov[1] = {0}; + struct cmsghdr *cmsg; + int *fdptr; + int ret; + + union { + char buf[CMSG_SPACE(sizeof(file.fd))]; + struct cmsghdr align; + } u; + + /* + * We need to send some other data along with the ancillary data, + * otherwise the other side won't recieve any data. This is very + * well-hidden in the documentation (and only applies to + * SOCK_STREAM). See the bottom part of unix(7). + */ + iov[0].iov_base = file.name; + iov[0].iov_len = strlen(file.name) + 1; + + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_iov = iov; + msg.msg_iovlen = 1; + msg.msg_control = u.buf; + msg.msg_controllen = sizeof(u.buf); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + + fdptr = (int *) CMSG_DATA(cmsg); + memcpy(fdptr, &file.fd, sizeof(int)); + + return sendmsg(sockfd, &msg, 0); +} + +/* + * Receives a file descriptor from the sockfd provided. Returns the file + * descriptor as sent from sendfd(). It will return the file descriptor + * or die (literally) trying. Any synchronisation and preparation of + * state should be done external to this (we expect the other side to be + * in sendfd() in the code). + */ +struct file_t recvfd(int sockfd) +{ + struct msghdr msg = {0}; + struct iovec iov[1] = {0}; + struct cmsghdr *cmsg; + struct file_t file = {0}; + int *fdptr; + int olderrno; + + union { + char buf[CMSG_SPACE(sizeof(file.fd))]; + struct cmsghdr align; + } u; + + /* Allocate a buffer. */ + /* TODO: Make this dynamic with MSG_PEEK. */ + file.name = malloc(TAG_BUFFER); + if (!file.name) + error("recvfd: failed to allocate file.tag buffer\n"); + + /* + * We need to "recieve" the non-ancillary data even though we don't + * plan to use it at all. Otherwise, things won't work as expected. + * See unix(7) and other well-hidden documentation. + */ + iov[0].iov_base = file.name; + iov[0].iov_len = TAG_BUFFER; + + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_iov = iov; + msg.msg_iovlen = 1; + msg.msg_control = u.buf; + msg.msg_controllen = sizeof(u.buf); + + ssize_t ret = recvmsg(sockfd, &msg, 0); + if (ret < 0) + goto err; + + cmsg = CMSG_FIRSTHDR(&msg); + if (!cmsg) + error("recvfd: got NULL from CMSG_FIRSTHDR"); + if (cmsg->cmsg_level != SOL_SOCKET) + error("recvfd: expected SOL_SOCKET in cmsg: %d", cmsg->cmsg_level); + if (cmsg->cmsg_type != SCM_RIGHTS) + error("recvfd: expected SCM_RIGHTS in cmsg: %d", cmsg->cmsg_type); + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) + error("recvfd: expected correct CMSG_LEN in cmsg: %lu", (unsigned long)cmsg->cmsg_len); + + fdptr = (int *) CMSG_DATA(cmsg); + if (!fdptr || *fdptr < 0) + error("recvfd: recieved invalid pointer"); + + file.fd = *fdptr; + return file; + +err: + olderrno = errno; + free(file.name); + errno = olderrno; + return (struct file_t){0}; +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go new file mode 100644 index 00000000..ee893741 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go @@ -0,0 +1,57 @@ +// +build linux + +package utils + +/* + * Copyright 2016 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* +#include +#include +#include "cmsg.h" +*/ +import "C" + +import ( + "os" + "unsafe" +) + +// RecvFd waits for a file descriptor to be sent over the given AF_UNIX +// socket. The file name of the remote file descriptor will be recreated +// locally (it is sent as non-auxiliary data in the same payload). +func RecvFd(socket *os.File) (*os.File, error) { + file, err := C.recvfd(C.int(socket.Fd())) + if err != nil { + return nil, err + } + defer C.free(unsafe.Pointer(file.name)) + return os.NewFile(uintptr(file.fd), C.GoString(file.name)), nil +} + +// SendFd sends a file descriptor over the given AF_UNIX socket. In +// addition, the file.Name() of the given file will also be sent as +// non-auxiliary data in the same payload (allowing to send contextual +// information for a file descriptor). +func SendFd(socket, file *os.File) error { + var cfile C.struct_file_t + cfile.fd = C.int(file.Fd()) + cfile.name = C.CString(file.Name()) + defer C.free(unsafe.Pointer(cfile.name)) + + _, err := C.sendfd(C.int(socket.Fd()), cfile) + return err +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.h b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.h new file mode 100644 index 00000000..3fe76425 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.h @@ -0,0 +1,36 @@ +/* + * Copyright 2016 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#if !defined(CMSG_H) +#define CMSG_H + +#include + +/* TODO: Implement this properly with MSG_PEEK. */ +#define TAG_BUFFER 4096 + +/* This mirrors Go's (*os.File). */ +struct file_t { + char *name; + int fd; +}; + +struct file_t recvfd(int sockfd); +ssize_t sendfd(int sockfd, struct file_t file); + +#endif /* !defined(CMSG_H) */ diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go index 3466bfce..2b35b9a7 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go @@ -9,6 +9,7 @@ import ( "path/filepath" "strings" "syscall" + "unsafe" ) const ( @@ -102,7 +103,7 @@ func SearchLabels(labels []string, query string) string { } // Annotations returns the bundle path and user defined annotations from the -// libcontianer state. We need to remove the bundle because that is a label +// libcontainer state. We need to remove the bundle because that is a label // added by libcontainer. func Annotations(labels []string) (bundle string, userAnnotations map[string]string) { userAnnotations = make(map[string]string) @@ -119,3 +120,7 @@ func Annotations(labels []string) (bundle string, userAnnotations map[string]str } return } + +func GetIntSize() int { + return int(unsafe.Sizeof(1)) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go index 408918f2..7b798cc7 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go @@ -4,6 +4,7 @@ package utils import ( "io/ioutil" + "os" "strconv" "syscall" ) @@ -31,3 +32,12 @@ func CloseExecFrom(minFd int) error { } return nil } + +// NewSockPair returns a new unix socket pair +func NewSockPair(name string) (parent *os.File, child *os.File, err error) { + fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil +} diff --git a/vendor/github.com/opencontainers/runtime-spec/LICENSE b/vendor/github.com/opencontainers/runtime-spec/LICENSE new file mode 100644 index 00000000..bdc40365 --- /dev/null +++ b/vendor/github.com/opencontainers/runtime-spec/LICENSE @@ -0,0 +1,191 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + Copyright 2015 The Linux Foundation. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go b/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go new file mode 100644 index 00000000..70d708d2 --- /dev/null +++ b/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go @@ -0,0 +1,563 @@ +package specs + +import "os" + +// Spec is the base configuration for the container. +type Spec struct { + // Version of the Open Container Runtime Specification with which the bundle complies. + Version string `json:"ociVersion"` + // Platform specifies the configuration's target platform. + Platform Platform `json:"platform"` + // Process configures the container process. + Process Process `json:"process"` + // Root configures the container's root filesystem. + Root Root `json:"root"` + // Hostname configures the container's hostname. + Hostname string `json:"hostname,omitempty"` + // Mounts configures additional mounts (on top of Root). + Mounts []Mount `json:"mounts,omitempty"` + // Hooks configures callbacks for container lifecycle events. + Hooks *Hooks `json:"hooks,omitempty"` + // Annotations contains arbitrary metadata for the container. + Annotations map[string]string `json:"annotations,omitempty"` + + // Linux is platform specific configuration for Linux based containers. + Linux *Linux `json:"linux,omitempty" platform:"linux"` + // Solaris is platform specific configuration for Solaris containers. + Solaris *Solaris `json:"solaris,omitempty" platform:"solaris"` + // Windows is platform specific configuration for Windows based containers, including Hyper-V containers. + Windows *Windows `json:"windows,omitempty" platform:"windows"` +} + +// Process contains information to start a specific application inside the container. +type Process struct { + // Terminal creates an interactive terminal for the container. + Terminal bool `json:"terminal,omitempty"` + // ConsoleSize specifies the size of the console. + ConsoleSize Box `json:"consoleSize,omitempty"` + // User specifies user information for the process. + User User `json:"user"` + // Args specifies the binary and arguments for the application to execute. + Args []string `json:"args"` + // Env populates the process environment for the process. + Env []string `json:"env,omitempty"` + // Cwd is the current working directory for the process and must be + // relative to the container's root. + Cwd string `json:"cwd"` + // Capabilities are Linux capabilities that are kept for the process. + Capabilities *LinuxCapabilities `json:"capabilities,omitempty" platform:"linux"` + // Rlimits specifies rlimit options to apply to the process. + Rlimits []LinuxRlimit `json:"rlimits,omitempty" platform:"linux"` + // NoNewPrivileges controls whether additional privileges could be gained by processes in the container. + NoNewPrivileges bool `json:"noNewPrivileges,omitempty" platform:"linux"` + // ApparmorProfile specifies the apparmor profile for the container. + ApparmorProfile string `json:"apparmorProfile,omitempty" platform:"linux"` + // SelinuxLabel specifies the selinux context that the container process is run as. + SelinuxLabel string `json:"selinuxLabel,omitempty" platform:"linux"` +} + +// LinuxCapabilities specifies the whitelist of capabilities that are kept for a process. +// http://man7.org/linux/man-pages/man7/capabilities.7.html +type LinuxCapabilities struct { + // Bounding is the set of capabilities checked by the kernel. + Bounding []string `json:"bounding,omitempty" platform:"linux"` + // Effective is the set of capabilities checked by the kernel. + Effective []string `json:"effective,omitempty" platform:"linux"` + // Inheritable is the capabilities preserved across execve. + Inheritable []string `json:"inheritable,omitempty" platform:"linux"` + // Permitted is the limiting superset for effective capabilities. + Permitted []string `json:"permitted,omitempty" platform:"linux"` + // Ambient is the ambient set of capabilities that are kept. + Ambient []string `json:"ambient,omitempty" platform:"linux"` +} + +// Box specifies dimensions of a rectangle. Used for specifying the size of a console. +type Box struct { + // Height is the vertical dimension of a box. + Height uint `json:"height"` + // Width is the horizontal dimension of a box. + Width uint `json:"width"` +} + +// User specifies specific user (and group) information for the container process. +type User struct { + // UID is the user id. + UID uint32 `json:"uid" platform:"linux,solaris"` + // GID is the group id. + GID uint32 `json:"gid" platform:"linux,solaris"` + // AdditionalGids are additional group ids set for the container's process. + AdditionalGids []uint32 `json:"additionalGids,omitempty" platform:"linux,solaris"` + // Username is the user name. + Username string `json:"username,omitempty" platform:"windows"` +} + +// Root contains information about the container's root filesystem on the host. +type Root struct { + // Path is the absolute path to the container's root filesystem. + Path string `json:"path"` + // Readonly makes the root filesystem for the container readonly before the process is executed. + Readonly bool `json:"readonly,omitempty"` +} + +// Platform specifies OS and arch information for the host system that the container +// is created for. +type Platform struct { + // OS is the operating system. + OS string `json:"os"` + // Arch is the architecture + Arch string `json:"arch"` +} + +// Mount specifies a mount for a container. +type Mount struct { + // Destination is the path where the mount will be placed relative to the container's root. The path and child directories MUST exist, a runtime MUST NOT create directories automatically to a mount point. + Destination string `json:"destination"` + // Type specifies the mount kind. + Type string `json:"type,omitempty"` + // Source specifies the source path of the mount. In the case of bind mounts on + // Linux based systems this would be the file on the host. + Source string `json:"source,omitempty"` + // Options are fstab style mount options. + Options []string `json:"options,omitempty"` +} + +// Hook specifies a command that is run at a particular event in the lifecycle of a container +type Hook struct { + Path string `json:"path"` + Args []string `json:"args,omitempty"` + Env []string `json:"env,omitempty"` + Timeout *int `json:"timeout,omitempty"` +} + +// Hooks for container setup and teardown +type Hooks struct { + // Prestart is a list of hooks to be run before the container process is executed. + // On Linux, they are run after the container namespaces are created. + Prestart []Hook `json:"prestart,omitempty"` + // Poststart is a list of hooks to be run after the container process is started. + Poststart []Hook `json:"poststart,omitempty"` + // Poststop is a list of hooks to be run after the container process exits. + Poststop []Hook `json:"poststop,omitempty"` +} + +// Linux contains platform specific configuration for Linux based containers. +type Linux struct { + // UIDMapping specifies user mappings for supporting user namespaces on Linux. + UIDMappings []LinuxIDMapping `json:"uidMappings,omitempty"` + // GIDMapping specifies group mappings for supporting user namespaces on Linux. + GIDMappings []LinuxIDMapping `json:"gidMappings,omitempty"` + // Sysctl are a set of key value pairs that are set for the container on start + Sysctl map[string]string `json:"sysctl,omitempty"` + // Resources contain cgroup information for handling resource constraints + // for the container + Resources *LinuxResources `json:"resources,omitempty"` + // CgroupsPath specifies the path to cgroups that are created and/or joined by the container. + // The path is expected to be relative to the cgroups mountpoint. + // If resources are specified, the cgroups at CgroupsPath will be updated based on resources. + CgroupsPath string `json:"cgroupsPath,omitempty"` + // Namespaces contains the namespaces that are created and/or joined by the container + Namespaces []LinuxNamespace `json:"namespaces,omitempty"` + // Devices are a list of device nodes that are created for the container + Devices []LinuxDevice `json:"devices,omitempty"` + // Seccomp specifies the seccomp security settings for the container. + Seccomp *LinuxSeccomp `json:"seccomp,omitempty"` + // RootfsPropagation is the rootfs mount propagation mode for the container. + RootfsPropagation string `json:"rootfsPropagation,omitempty"` + // MaskedPaths masks over the provided paths inside the container. + MaskedPaths []string `json:"maskedPaths,omitempty"` + // ReadonlyPaths sets the provided paths as RO inside the container. + ReadonlyPaths []string `json:"readonlyPaths,omitempty"` + // MountLabel specifies the selinux context for the mounts in the container. + MountLabel string `json:"mountLabel,omitempty"` + // IntelRdt contains Intel Resource Director Technology (RDT) information + // for handling resource constraints (e.g., L3 cache) for the container + IntelRdt *LinuxIntelRdt `json:"intelRdt,omitempty"` +} + +// LinuxNamespace is the configuration for a Linux namespace +type LinuxNamespace struct { + // Type is the type of Linux namespace + Type LinuxNamespaceType `json:"type"` + // Path is a path to an existing namespace persisted on disk that can be joined + // and is of the same type + Path string `json:"path,omitempty"` +} + +// LinuxNamespaceType is one of the Linux namespaces +type LinuxNamespaceType string + +const ( + // PIDNamespace for isolating process IDs + PIDNamespace LinuxNamespaceType = "pid" + // NetworkNamespace for isolating network devices, stacks, ports, etc + NetworkNamespace = "network" + // MountNamespace for isolating mount points + MountNamespace = "mount" + // IPCNamespace for isolating System V IPC, POSIX message queues + IPCNamespace = "ipc" + // UTSNamespace for isolating hostname and NIS domain name + UTSNamespace = "uts" + // UserNamespace for isolating user and group IDs + UserNamespace = "user" + // CgroupNamespace for isolating cgroup hierarchies + CgroupNamespace = "cgroup" +) + +// LinuxIDMapping specifies UID/GID mappings +type LinuxIDMapping struct { + // HostID is the starting UID/GID on the host to be mapped to 'ContainerID' + HostID uint32 `json:"hostID"` + // ContainerID is the starting UID/GID in the container + ContainerID uint32 `json:"containerID"` + // Size is the number of IDs to be mapped + Size uint32 `json:"size"` +} + +// LinuxRlimit type and restrictions +type LinuxRlimit struct { + // Type of the rlimit to set + Type string `json:"type"` + // Hard is the hard limit for the specified type + Hard uint64 `json:"hard"` + // Soft is the soft limit for the specified type + Soft uint64 `json:"soft"` +} + +// LinuxHugepageLimit structure corresponds to limiting kernel hugepages +type LinuxHugepageLimit struct { + // Pagesize is the hugepage size + Pagesize string `json:"pageSize"` + // Limit is the limit of "hugepagesize" hugetlb usage + Limit uint64 `json:"limit"` +} + +// LinuxInterfacePriority for network interfaces +type LinuxInterfacePriority struct { + // Name is the name of the network interface + Name string `json:"name"` + // Priority for the interface + Priority uint32 `json:"priority"` +} + +// linuxBlockIODevice holds major:minor format supported in blkio cgroup +type linuxBlockIODevice struct { + // Major is the device's major number. + Major int64 `json:"major"` + // Minor is the device's minor number. + Minor int64 `json:"minor"` +} + +// LinuxWeightDevice struct holds a `major:minor weight` pair for blkioWeightDevice +type LinuxWeightDevice struct { + linuxBlockIODevice + // Weight is the bandwidth rate for the device, range is from 10 to 1000 + Weight *uint16 `json:"weight,omitempty"` + // LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, CFQ scheduler only + LeafWeight *uint16 `json:"leafWeight,omitempty"` +} + +// LinuxThrottleDevice struct holds a `major:minor rate_per_second` pair +type LinuxThrottleDevice struct { + linuxBlockIODevice + // Rate is the IO rate limit per cgroup per device + Rate uint64 `json:"rate"` +} + +// LinuxBlockIO for Linux cgroup 'blkio' resource management +type LinuxBlockIO struct { + // Specifies per cgroup weight, range is from 10 to 1000 + Weight *uint16 `json:"blkioWeight,omitempty"` + // Specifies tasks' weight in the given cgroup while competing with the cgroup's child cgroups, range is from 10 to 1000, CFQ scheduler only + LeafWeight *uint16 `json:"blkioLeafWeight,omitempty"` + // Weight per cgroup per device, can override BlkioWeight + WeightDevice []LinuxWeightDevice `json:"blkioWeightDevice,omitempty"` + // IO read rate limit per cgroup per device, bytes per second + ThrottleReadBpsDevice []LinuxThrottleDevice `json:"blkioThrottleReadBpsDevice,omitempty"` + // IO write rate limit per cgroup per device, bytes per second + ThrottleWriteBpsDevice []LinuxThrottleDevice `json:"blkioThrottleWriteBpsDevice,omitempty"` + // IO read rate limit per cgroup per device, IO per second + ThrottleReadIOPSDevice []LinuxThrottleDevice `json:"blkioThrottleReadIOPSDevice,omitempty"` + // IO write rate limit per cgroup per device, IO per second + ThrottleWriteIOPSDevice []LinuxThrottleDevice `json:"blkioThrottleWriteIOPSDevice,omitempty"` +} + +// LinuxMemory for Linux cgroup 'memory' resource management +type LinuxMemory struct { + // Memory limit (in bytes). + Limit *uint64 `json:"limit,omitempty"` + // Memory reservation or soft_limit (in bytes). + Reservation *uint64 `json:"reservation,omitempty"` + // Total memory limit (memory + swap). + Swap *uint64 `json:"swap,omitempty"` + // Kernel memory limit (in bytes). + Kernel *uint64 `json:"kernel,omitempty"` + // Kernel memory limit for tcp (in bytes) + KernelTCP *uint64 `json:"kernelTCP,omitempty"` + // How aggressive the kernel will swap memory pages. Range from 0 to 100. + Swappiness *uint64 `json:"swappiness,omitempty"` +} + +// LinuxCPU for Linux cgroup 'cpu' resource management +type LinuxCPU struct { + // CPU shares (relative weight (ratio) vs. other cgroups with cpu shares). + Shares *uint64 `json:"shares,omitempty"` + // CPU hardcap limit (in usecs). Allowed cpu time in a given period. + Quota *int64 `json:"quota,omitempty"` + // CPU period to be used for hardcapping (in usecs). + Period *uint64 `json:"period,omitempty"` + // How much time realtime scheduling may use (in usecs). + RealtimeRuntime *int64 `json:"realtimeRuntime,omitempty"` + // CPU period to be used for realtime scheduling (in usecs). + RealtimePeriod *uint64 `json:"realtimePeriod,omitempty"` + // CPUs to use within the cpuset. Default is to use any CPU available. + Cpus string `json:"cpus,omitempty"` + // List of memory nodes in the cpuset. Default is to use any available memory node. + Mems string `json:"mems,omitempty"` +} + +// LinuxPids for Linux cgroup 'pids' resource management (Linux 4.3) +type LinuxPids struct { + // Maximum number of PIDs. Default is "no limit". + Limit int64 `json:"limit"` +} + +// LinuxNetwork identification and priority configuration +type LinuxNetwork struct { + // Set class identifier for container's network packets + ClassID *uint32 `json:"classID,omitempty"` + // Set priority of network traffic for container + Priorities []LinuxInterfacePriority `json:"priorities,omitempty"` +} + +// LinuxResources has container runtime resource constraints +type LinuxResources struct { + // Devices configures the device whitelist. + Devices []LinuxDeviceCgroup `json:"devices,omitempty"` + // DisableOOMKiller disables the OOM killer for out of memory conditions + DisableOOMKiller *bool `json:"disableOOMKiller,omitempty"` + // Specify an oom_score_adj for the container. + OOMScoreAdj *int `json:"oomScoreAdj,omitempty"` + // Memory restriction configuration + Memory *LinuxMemory `json:"memory,omitempty"` + // CPU resource restriction configuration + CPU *LinuxCPU `json:"cpu,omitempty"` + // Task resource restriction configuration. + Pids *LinuxPids `json:"pids,omitempty"` + // BlockIO restriction configuration + BlockIO *LinuxBlockIO `json:"blockIO,omitempty"` + // Hugetlb limit (in bytes) + HugepageLimits []LinuxHugepageLimit `json:"hugepageLimits,omitempty"` + // Network restriction configuration + Network *LinuxNetwork `json:"network,omitempty"` +} + +// LinuxDevice represents the mknod information for a Linux special device file +type LinuxDevice struct { + // Path to the device. + Path string `json:"path"` + // Device type, block, char, etc. + Type string `json:"type"` + // Major is the device's major number. + Major int64 `json:"major"` + // Minor is the device's minor number. + Minor int64 `json:"minor"` + // FileMode permission bits for the device. + FileMode *os.FileMode `json:"fileMode,omitempty"` + // UID of the device. + UID *uint32 `json:"uid,omitempty"` + // Gid of the device. + GID *uint32 `json:"gid,omitempty"` +} + +// LinuxDeviceCgroup represents a device rule for the whitelist controller +type LinuxDeviceCgroup struct { + // Allow or deny + Allow bool `json:"allow"` + // Device type, block, char, etc. + Type string `json:"type,omitempty"` + // Major is the device's major number. + Major *int64 `json:"major,omitempty"` + // Minor is the device's minor number. + Minor *int64 `json:"minor,omitempty"` + // Cgroup access permissions format, rwm. + Access string `json:"access,omitempty"` +} + +// Solaris contains platform specific configuration for Solaris application containers. +type Solaris struct { + // SMF FMRI which should go "online" before we start the container process. + Milestone string `json:"milestone,omitempty"` + // Maximum set of privileges any process in this container can obtain. + LimitPriv string `json:"limitpriv,omitempty"` + // The maximum amount of shared memory allowed for this container. + MaxShmMemory string `json:"maxShmMemory,omitempty"` + // Specification for automatic creation of network resources for this container. + Anet []SolarisAnet `json:"anet,omitempty"` + // Set limit on the amount of CPU time that can be used by container. + CappedCPU *SolarisCappedCPU `json:"cappedCPU,omitempty"` + // The physical and swap caps on the memory that can be used by this container. + CappedMemory *SolarisCappedMemory `json:"cappedMemory,omitempty"` +} + +// SolarisCappedCPU allows users to set limit on the amount of CPU time that can be used by container. +type SolarisCappedCPU struct { + Ncpus string `json:"ncpus,omitempty"` +} + +// SolarisCappedMemory allows users to set the physical and swap caps on the memory that can be used by this container. +type SolarisCappedMemory struct { + Physical string `json:"physical,omitempty"` + Swap string `json:"swap,omitempty"` +} + +// SolarisAnet provides the specification for automatic creation of network resources for this container. +type SolarisAnet struct { + // Specify a name for the automatically created VNIC datalink. + Linkname string `json:"linkname,omitempty"` + // Specify the link over which the VNIC will be created. + Lowerlink string `json:"lowerLink,omitempty"` + // The set of IP addresses that the container can use. + Allowedaddr string `json:"allowedAddress,omitempty"` + // Specifies whether allowedAddress limitation is to be applied to the VNIC. + Configallowedaddr string `json:"configureAllowedAddress,omitempty"` + // The value of the optional default router. + Defrouter string `json:"defrouter,omitempty"` + // Enable one or more types of link protection. + Linkprotection string `json:"linkProtection,omitempty"` + // Set the VNIC's macAddress + Macaddress string `json:"macAddress,omitempty"` +} + +// Windows defines the runtime configuration for Windows based containers, including Hyper-V containers. +type Windows struct { + // Resources contains information for handling resource constraints for the container. + Resources *WindowsResources `json:"resources,omitempty"` +} + +// WindowsResources has container runtime resource constraints for containers running on Windows. +type WindowsResources struct { + // Memory restriction configuration. + Memory *WindowsMemoryResources `json:"memory,omitempty"` + // CPU resource restriction configuration. + CPU *WindowsCPUResources `json:"cpu,omitempty"` + // Storage restriction configuration. + Storage *WindowsStorageResources `json:"storage,omitempty"` + // Network restriction configuration. + Network *WindowsNetworkResources `json:"network,omitempty"` +} + +// WindowsMemoryResources contains memory resource management settings. +type WindowsMemoryResources struct { + // Memory limit in bytes. + Limit *uint64 `json:"limit,omitempty"` + // Memory reservation in bytes. + Reservation *uint64 `json:"reservation,omitempty"` +} + +// WindowsCPUResources contains CPU resource management settings. +type WindowsCPUResources struct { + // Number of CPUs available to the container. + Count *uint64 `json:"count,omitempty"` + // CPU shares (relative weight to other containers with cpu shares). Range is from 1 to 10000. + Shares *uint16 `json:"shares,omitempty"` + // Percent of available CPUs usable by the container. + Percent *uint8 `json:"percent,omitempty"` +} + +// WindowsStorageResources contains storage resource management settings. +type WindowsStorageResources struct { + // Specifies maximum Iops for the system drive. + Iops *uint64 `json:"iops,omitempty"` + // Specifies maximum bytes per second for the system drive. + Bps *uint64 `json:"bps,omitempty"` + // Sandbox size specifies the minimum size of the system drive in bytes. + SandboxSize *uint64 `json:"sandboxSize,omitempty"` +} + +// WindowsNetworkResources contains network resource management settings. +type WindowsNetworkResources struct { + // EgressBandwidth is the maximum egress bandwidth in bytes per second. + EgressBandwidth *uint64 `json:"egressBandwidth,omitempty"` +} + +// LinuxSeccomp represents syscall restrictions +type LinuxSeccomp struct { + DefaultAction LinuxSeccompAction `json:"defaultAction"` + Architectures []Arch `json:"architectures,omitempty"` + Syscalls []LinuxSyscall `json:"syscalls"` +} + +// Arch used for additional architectures +type Arch string + +// Additional architectures permitted to be used for system calls +// By default only the native architecture of the kernel is permitted +const ( + ArchX86 Arch = "SCMP_ARCH_X86" + ArchX86_64 Arch = "SCMP_ARCH_X86_64" + ArchX32 Arch = "SCMP_ARCH_X32" + ArchARM Arch = "SCMP_ARCH_ARM" + ArchAARCH64 Arch = "SCMP_ARCH_AARCH64" + ArchMIPS Arch = "SCMP_ARCH_MIPS" + ArchMIPS64 Arch = "SCMP_ARCH_MIPS64" + ArchMIPS64N32 Arch = "SCMP_ARCH_MIPS64N32" + ArchMIPSEL Arch = "SCMP_ARCH_MIPSEL" + ArchMIPSEL64 Arch = "SCMP_ARCH_MIPSEL64" + ArchMIPSEL64N32 Arch = "SCMP_ARCH_MIPSEL64N32" + ArchPPC Arch = "SCMP_ARCH_PPC" + ArchPPC64 Arch = "SCMP_ARCH_PPC64" + ArchPPC64LE Arch = "SCMP_ARCH_PPC64LE" + ArchS390 Arch = "SCMP_ARCH_S390" + ArchS390X Arch = "SCMP_ARCH_S390X" + ArchPARISC Arch = "SCMP_ARCH_PARISC" + ArchPARISC64 Arch = "SCMP_ARCH_PARISC64" +) + +// LinuxSeccompAction taken upon Seccomp rule match +type LinuxSeccompAction string + +// Define actions for Seccomp rules +const ( + ActKill LinuxSeccompAction = "SCMP_ACT_KILL" + ActTrap LinuxSeccompAction = "SCMP_ACT_TRAP" + ActErrno LinuxSeccompAction = "SCMP_ACT_ERRNO" + ActTrace LinuxSeccompAction = "SCMP_ACT_TRACE" + ActAllow LinuxSeccompAction = "SCMP_ACT_ALLOW" +) + +// LinuxSeccompOperator used to match syscall arguments in Seccomp +type LinuxSeccompOperator string + +// Define operators for syscall arguments in Seccomp +const ( + OpNotEqual LinuxSeccompOperator = "SCMP_CMP_NE" + OpLessThan LinuxSeccompOperator = "SCMP_CMP_LT" + OpLessEqual LinuxSeccompOperator = "SCMP_CMP_LE" + OpEqualTo LinuxSeccompOperator = "SCMP_CMP_EQ" + OpGreaterEqual LinuxSeccompOperator = "SCMP_CMP_GE" + OpGreaterThan LinuxSeccompOperator = "SCMP_CMP_GT" + OpMaskedEqual LinuxSeccompOperator = "SCMP_CMP_MASKED_EQ" +) + +// LinuxSeccompArg used for matching specific syscall arguments in Seccomp +type LinuxSeccompArg struct { + Index uint `json:"index"` + Value uint64 `json:"value"` + ValueTwo uint64 `json:"valueTwo"` + Op LinuxSeccompOperator `json:"op"` +} + +// LinuxSyscall is used to match a syscall in Seccomp +type LinuxSyscall struct { + Names []string `json:"names"` + Action LinuxSeccompAction `json:"action"` + Args []LinuxSeccompArg `json:"args,omitempty"` +} + +// LinuxIntelRdt has container runtime resource constraints +// for Intel RDT/CAT which introduced in Linux 4.10 kernel +type LinuxIntelRdt struct { + // The schema for L3 cache id and capacity bitmask (CBM) + // Format: "L3:=;=;..." + L3CacheSchema string `json:"l3CacheSchema,omitempty"` +} diff --git a/vendor/github.com/opencontainers/runtime-spec/specs-go/state.go b/vendor/github.com/opencontainers/runtime-spec/specs-go/state.go new file mode 100644 index 00000000..b5dd3bee --- /dev/null +++ b/vendor/github.com/opencontainers/runtime-spec/specs-go/state.go @@ -0,0 +1,17 @@ +package specs + +// State holds information about the runtime state of the container. +type State struct { + // Version is the version of the specification that is supported. + Version string `json:"ociVersion"` + // ID is the container ID + ID string `json:"id"` + // Status is the runtime status of the container. + Status string `json:"status"` + // Pid is the process ID for the container process. + Pid int `json:"pid"` + // Bundle is the path to the container's bundle directory. + Bundle string `json:"bundle"` + // Annotations are key values associated with the container. + Annotations map[string]string `json:"annotations,omitempty"` +} diff --git a/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go b/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go new file mode 100644 index 00000000..dfcf0090 --- /dev/null +++ b/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go @@ -0,0 +1,18 @@ +package specs + +import "fmt" + +const ( + // VersionMajor is for an API incompatible changes + VersionMajor = 1 + // VersionMinor is for functionality in a backwards-compatible manner + VersionMinor = 0 + // VersionPatch is for backwards-compatible bug fixes + VersionPatch = 0 + + // VersionDev indicates development branch. Releases will be empty string. + VersionDev = "-rc5-dev" +) + +// Version is the specification version that the package types support. +var Version = fmt.Sprintf("%d.%d.%d%s", VersionMajor, VersionMinor, VersionPatch, VersionDev) diff --git a/vendor/github.com/syndtr/gocapability/capability/capability.go b/vendor/github.com/syndtr/gocapability/capability/capability.go index c13f4e52..c07c5579 100644 --- a/vendor/github.com/syndtr/gocapability/capability/capability.go +++ b/vendor/github.com/syndtr/gocapability/capability/capability.go @@ -10,42 +10,42 @@ package capability type Capabilities interface { // Get check whether a capability present in the given // capabilities set. The 'which' value should be one of EFFECTIVE, - // PERMITTED, INHERITABLE or BOUNDING. + // PERMITTED, INHERITABLE, BOUNDING or AMBIENT. Get(which CapType, what Cap) bool // Empty check whether all capability bits of the given capabilities // set are zero. The 'which' value should be one of EFFECTIVE, - // PERMITTED, INHERITABLE or BOUNDING. + // PERMITTED, INHERITABLE, BOUNDING or AMBIENT. Empty(which CapType) bool // Full check whether all capability bits of the given capabilities // set are one. The 'which' value should be one of EFFECTIVE, - // PERMITTED, INHERITABLE or BOUNDING. + // PERMITTED, INHERITABLE, BOUNDING or AMBIENT. Full(which CapType) bool // Set sets capabilities of the given capabilities sets. The // 'which' value should be one or combination (OR'ed) of EFFECTIVE, - // PERMITTED, INHERITABLE or BOUNDING. + // PERMITTED, INHERITABLE, BOUNDING or AMBIENT. Set(which CapType, caps ...Cap) // Unset unsets capabilities of the given capabilities sets. The // 'which' value should be one or combination (OR'ed) of EFFECTIVE, - // PERMITTED, INHERITABLE or BOUNDING. + // PERMITTED, INHERITABLE, BOUNDING or AMBIENT. Unset(which CapType, caps ...Cap) // Fill sets all bits of the given capabilities kind to one. The - // 'kind' value should be one or combination (OR'ed) of CAPS or - // BOUNDS. + // 'kind' value should be one or combination (OR'ed) of CAPS, + // BOUNDS or AMBS. Fill(kind CapType) // Clear sets all bits of the given capabilities kind to zero. The - // 'kind' value should be one or combination (OR'ed) of CAPS or - // BOUNDS. + // 'kind' value should be one or combination (OR'ed) of CAPS, + // BOUNDS or AMBS. Clear(kind CapType) // String return current capabilities state of the given capabilities // set as string. The 'which' value should be one of EFFECTIVE, - // PERMITTED, INHERITABLE or BOUNDING. + // PERMITTED, INHERITABLE BOUNDING or AMBIENT StringCap(which CapType) string // String return current capabilities state as string. diff --git a/vendor/github.com/syndtr/gocapability/capability/capability_linux.go b/vendor/github.com/syndtr/gocapability/capability/capability_linux.go index 3dfcd398..6d2135ac 100644 --- a/vendor/github.com/syndtr/gocapability/capability/capability_linux.go +++ b/vendor/github.com/syndtr/gocapability/capability/capability_linux.go @@ -235,9 +235,10 @@ func (c *capsV1) Apply(kind CapType) error { } type capsV3 struct { - hdr capHeader - data [2]capData - bounds [2]uint32 + hdr capHeader + data [2]capData + bounds [2]uint32 + ambient [2]uint32 } func (c *capsV3) Get(which CapType, what Cap) bool { @@ -256,6 +257,8 @@ func (c *capsV3) Get(which CapType, what Cap) bool { return (1< Date: Tue, 22 Aug 2017 16:33:10 -0500 Subject: [PATCH 2/2] fixup manager after runc bump --- manager/manager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manager/manager.go b/manager/manager.go index 39fc85b7..e9415fa8 100644 --- a/manager/manager.go +++ b/manager/manager.go @@ -136,7 +136,7 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, maxHousekeepingIn } // Detect the container we are running on. - selfContainer, err := cgroups.GetThisCgroupDir("cpu") + selfContainer, err := cgroups.GetOwnCgroupPath("cpu") if err != nil { return nil, err }