12-Go实现完整容器
学习目标
- 实现功能完整的容器运行时
- 掌握网络自动配置技术
- 理解容器生命周期管理
- 能够处理错误和资源清理
- 掌握容器的高级功能
前置知识
- Go 语言进阶
- 容器网络原理
- 系统编程基础
- 并发编程基础
一、项目架构
1.1 项目结构
container/
├── go.mod
├── main.go
├── container/
│ ├── container.go
│ ├── namespace.go
│ ├── cgroup.go
│ ├── network.go
│ ├── filesystem.go
│ └── lifecycle.go
├── network/
│ ├── bridge.go
│ ├── veth.go
│ └── nat.go
├── storage/
│ ├── rootfs.go
│ └── overlay.go
└── utils/
    ├── logging.go
    └── errors.go
1.2 依赖管理
module container
go 1.21
require (
github.com/vishvananda/netlink v1.3.1
github.com/vishvananda/netns v0.0.4
golang.org/x/sys v0.15.0
github.com/sirupsen/logrus v1.9.3
)
二、核心实现
2.1 容器主结构
package container
import (
"context"
"fmt"
"os"
"os/exec"
"sync"
"syscall"
"time"
"golang.org/x/sys/unix"
)
// Container holds everything known about one container: its identity and
// command, its resource/network/storage configuration, and its runtime state.
//
// It contains a sync.RWMutex by value, so it must be handled through a
// pointer (NewContainer returns one) and never copied.
type Container struct {
	ID         string   // container identifier
	Name       string   // human-readable name
	Rootfs     string   // path of the root filesystem; validate() checks that it exists
	Cmd        string   // command to run
	Args       []string // arguments passed to Cmd
	Env        []string // environment entries; NewContainer seeds this from os.Environ()
	WorkingDir string
	User       string
	Hostname   string
	// Resource limits.
	MemoryLimit string // NewContainer default: "128M"
	CPULimit    string // NewContainer default: "50000 100000"; presumably "<quota> <period>" — TODO confirm
	PidsLimit   int
	// Network configuration.
	NetworkMode  string
	IPAddress    string
	PortMappings []PortMapping
	// Storage configuration.
	Mounts   []Mount
	ReadOnly bool
	// Runtime state.
	Status    ContainerStatus
	PID       int // PID of the started child process, set by Run
	CreatedAt time.Time
	StartedAt time.Time // updated by SetStatus when the status becomes StatusRunning
	// Internal state.
	mutex  sync.RWMutex       // guards the fields changed by the Set*/Get* methods
	cancel context.CancelFunc // cancels the context created by Run
}
// ContainerStatus is the lifecycle state of a container.
type ContainerStatus int

// Lifecycle states. The numeric values come from iota, so the order of the
// declarations below is significant.
const (
	StatusCreated ContainerStatus = iota
	StatusRunning
	StatusPaused
	StatusStopped
	StatusRemoved
)
// PortMapping maps a port on the host to a port inside the container.
type PortMapping struct {
	HostPort      int
	ContainerPort int
	Protocol      string // e.g. "tcp"
}
// Mount describes one mount point requested for the container.
type Mount struct {
	Source      string   // what to mount
	Destination string   // where to mount it
	Type        string   // mount type, e.g. "bind"
	Options     []string // mount options such as "ro", "noexec", "nosuid", "nodev"
}
// NewContainer builds a Container in the StatusCreated state for the given
// identity, root filesystem and command, filling every other setting with a
// default (128M memory, "50000 100000" CPU, 100 pids, bridge networking).
// The environment is copied from the calling process.
func NewContainer(id, name, rootfs, cmd string, args []string) *Container {
	c := new(Container)

	// Identity and command.
	c.ID, c.Name = id, name
	c.Rootfs = rootfs
	c.Cmd, c.Args = cmd, args

	// Process defaults.
	c.Env = os.Environ()
	c.WorkingDir = "/"
	c.User = "root"
	c.Hostname = "container"

	// Resource and network defaults.
	c.MemoryLimit = "128M"
	c.CPULimit = "50000 100000"
	c.PidsLimit = 100
	c.NetworkMode = "bridge"

	// Initial runtime state.
	c.Status = StatusCreated
	c.CreatedAt = time.Now()
	return c
}
// SetResourceLimits replaces the memory, CPU and pids limits while holding
// the container's write lock.
func (c *Container) SetResourceLimits(memory, cpu string, pids int) {
	c.mutex.Lock()
	c.MemoryLimit, c.CPULimit, c.PidsLimit = memory, cpu, pids
	c.mutex.Unlock()
}
// SetNetworkConfig replaces the network mode, IP address and port mappings
// while holding the container's write lock.
func (c *Container) SetNetworkConfig(mode, ip string, ports []PortMapping) {
	c.mutex.Lock()
	c.NetworkMode, c.IPAddress, c.PortMappings = mode, ip, ports
	c.mutex.Unlock()
}
// SetMounts replaces the container's mount list while holding the write lock.
// The slice is stored as given, not copied.
func (c *Container) SetMounts(mounts []Mount) {
	c.mutex.Lock()
	c.Mounts = mounts
	c.mutex.Unlock()
}
// GetStatus returns the current lifecycle state, read under the read lock.
func (c *Container) GetStatus() ContainerStatus {
	c.mutex.RLock()
	current := c.Status
	c.mutex.RUnlock()
	return current
}
// SetStatus records a new lifecycle state under the write lock. Entering
// StatusRunning also stamps StartedAt with the current time.
func (c *Container) SetStatus(status ContainerStatus) {
	c.mutex.Lock()
	defer c.mutex.Unlock()

	if c.Status = status; status != StatusRunning {
		return
	}
	c.StartedAt = time.Now()
}
2.2 容器运行时
// Run validates the configuration, starts the container's init process in
// new namespaces, performs the host-side network and cgroup setup, and then
// blocks until that process exits.
//
// The child is this same binary re-executed through /proc/self/exe with the
// arguments "child <ID> <Rootfs> <Cmd> <Args...>".
//
// If host-side setup fails after the child has started, the child is killed
// and reaped and the status is set to StatusStopped before the error is
// returned, so a failed Run does not leave an orphaned process behind.
func (c *Container) Run() error {
	if err := c.validate(); err != nil {
		return fmt.Errorf("validation failed: %w", err)
	}

	// The context lets us kill the child on setup failure; it is always
	// released when Run returns.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	c.mutex.Lock()
	c.cancel = cancel
	c.mutex.Unlock()

	// One flag per namespace the container gets.
	// NOTE(review): CLONE_NEWUSER is requested without UID/GID mappings —
	// confirm the child is expected to run unmapped.
	flags := syscall.CLONE_NEWUTS |
		syscall.CLONE_NEWPID |
		syscall.CLONE_NEWNS |
		syscall.CLONE_NEWNET |
		syscall.CLONE_NEWIPC |
		syscall.CLONE_NEWUSER |
		syscall.CLONE_NEWCGROUP

	cmd := exec.CommandContext(ctx, "/proc/self/exe", "child", c.ID, c.Rootfs, c.Cmd)
	cmd.Args = append(cmd.Args, c.Args...)
	cmd.SysProcAttr = &syscall.SysProcAttr{
		Cloneflags:   uintptr(flags),
		Unshareflags: syscall.CLONE_NEWNS,
	}
	cmd.Stdin = os.Stdin
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr

	if err := cmd.Start(); err != nil {
		return fmt.Errorf("failed to start child process: %w", err)
	}

	// Stop reads PID under the mutex, so publish it under the mutex too.
	c.mutex.Lock()
	c.PID = cmd.Process.Pid
	c.mutex.Unlock()
	c.SetStatus(StatusRunning)

	// abort tears the child down when host-side setup fails.
	abort := func(stage string, cause error) error {
		cancel()       // CommandContext kills the child once the context is done
		_ = cmd.Wait() // reap it; a non-zero exit is expected here
		c.SetStatus(StatusStopped)
		return fmt.Errorf("failed to setup %s: %w", stage, cause)
	}

	if err := c.setupNetwork(); err != nil {
		return abort("network", err)
	}
	if err := c.setupCGroup(); err != nil {
		return abort("cgroup", err)
	}

	// Block until the container's init process exits.
	waitErr := cmd.Wait()
	c.SetStatus(StatusStopped)
	if waitErr != nil {
		return fmt.Errorf("container exited with error: %w", waitErr)
	}
	return nil
}
// validate checks the configuration before anything is started: Rootfs must
// be an accessible directory and Cmd must be resolvable.
//
// NOTE(review): Cmd is looked up with exec.LookPath, i.e. against the host's
// filesystem and PATH, not inside Rootfs — confirm this is intended.
func (c *Container) validate() error {
	info, err := os.Stat(c.Rootfs)
	switch {
	case os.IsNotExist(err):
		return fmt.Errorf("rootfs does not exist: %s", c.Rootfs)
	case err != nil:
		// Permission errors and the like used to be silently accepted.
		return fmt.Errorf("cannot access rootfs %s: %w", c.Rootfs, err)
	case !info.IsDir():
		return fmt.Errorf("rootfs is not a directory: %s", c.Rootfs)
	}

	if _, err := exec.LookPath(c.Cmd); err != nil {
		return fmt.Errorf("command not found: %s", c.Cmd)
	}
	return nil
}
// Stop terminates a running container: it sends SIGTERM, polls for up to
// about one second (10 x 100ms) for the process to disappear, and sends
// SIGKILL only if the process is still there after that.
//
// It returns an error if the container is not running or if a signal cannot
// be delivered for a reason other than the process already being gone.
// The container's write lock is held for the whole call.
func (c *Container) Stop() error {
	c.mutex.Lock()
	defer c.mutex.Unlock()

	if c.Status != StatusRunning {
		return fmt.Errorf("container is not running")
	}

	if c.PID > 0 {
		// ESRCH means the process has already exited, which is success here.
		if err := syscall.Kill(c.PID, syscall.SIGTERM); err != nil && err != syscall.ESRCH {
			return fmt.Errorf("failed to send SIGTERM: %w", err)
		}

		// Signal 0 only probes for existence.
		exited := false
		for i := 0; i < 10; i++ {
			if err := syscall.Kill(c.PID, 0); err != nil {
				exited = true
				break
			}
			time.Sleep(100 * time.Millisecond)
		}

		// Escalate only when the graceful shutdown did not work; previously
		// SIGKILL was always sent and failed for cleanly exited containers.
		if !exited {
			if err := syscall.Kill(c.PID, syscall.SIGKILL); err != nil && err != syscall.ESRCH {
				return fmt.Errorf("failed to send SIGKILL: %w", err)
			}
		}
	}

	c.Status = StatusStopped
	return nil
}
// Remove releases the container's network and cgroup resources and marks it
// StatusRemoved. A running container cannot be removed.
//
// Both cleanups are always attempted, so a network cleanup failure no longer
// leaves the cgroup behind. The status only becomes StatusRemoved when both
// succeed; otherwise the error(s) are returned.
func (c *Container) Remove() error {
	c.mutex.Lock()
	defer c.mutex.Unlock()

	if c.Status == StatusRunning {
		return fmt.Errorf("cannot remove running container")
	}

	netErr := c.cleanupNetwork()
	cgErr := c.cleanupCGroup()

	switch {
	case netErr != nil && cgErr != nil:
		return fmt.Errorf("failed to cleanup network: %w; failed to cleanup cgroup: %w", netErr, cgErr)
	case netErr != nil:
		return fmt.Errorf("failed to cleanup network: %w", netErr)
	case cgErr != nil:
		return fmt.Errorf("failed to cleanup cgroup: %w", cgErr)
	}

	c.Status = StatusRemoved
	return nil
}
2.3 网络管理
package network
import (
	"errors"
	"fmt"
	"net"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
	"strconv"
	"strings"

	"github.com/vishvananda/netlink"
	"github.com/vishvananda/netns"
)
// NetworkManager sets up container networking on top of one Linux bridge.
type NetworkManager struct {
	bridgeName string // name of the bridge device
	bridgeIP   string // address given to the bridge; also used as the containers' gateway
	subnet     string // CIDR used as the source match of the NAT rule
}
// NewNetworkManager returns a manager preconfigured for bridge "cni0" with
// address 10.22.0.1 on subnet 10.22.0.0/24.
func NewNetworkManager() *NetworkManager {
	nm := new(NetworkManager)
	nm.bridgeName = "cni0"
	nm.bridgeIP = "10.22.0.1"
	nm.subnet = "10.22.0.0/24"
	return nm
}
// SetupNetwork wires a container into the bridge network: it makes sure the
// bridge exists, creates a veth pair, configures the container end inside the
// container's network namespace, and installs the NAT rule.
//
// On success it returns the resulting NetworkConfig. If a step after the veth
// pair was created fails, the pair is deleted again (best effort) so that a
// failed setup does not leak interfaces.
func (nm *NetworkManager) SetupNetwork(containerID string) (*NetworkConfig, error) {
	if err := nm.createBridge(); err != nil {
		return nil, fmt.Errorf("failed to create bridge: %w", err)
	}

	vethHost, vethCont, err := nm.createVethPair(containerID)
	if err != nil {
		return nil, fmt.Errorf("failed to create veth pair: %w", err)
	}
	// Deleting the host end removes its peer as well.
	dropVeth := func() {
		if link, lookupErr := netlink.LinkByName(vethHost); lookupErr == nil {
			_ = netlink.LinkDel(link)
		}
	}

	containerIP, err := nm.configureContainerNetwork(containerID, vethCont)
	if err != nil {
		dropVeth()
		return nil, fmt.Errorf("failed to configure container network: %w", err)
	}

	if err := nm.configureNAT(); err != nil {
		dropVeth()
		return nil, fmt.Errorf("failed to configure NAT: %w", err)
	}

	return &NetworkConfig{
		BridgeName:  nm.bridgeName,
		VethHost:    vethHost,
		VethCont:    vethCont,
		ContainerIP: containerIP,
		GatewayIP:   nm.bridgeIP,
	}, nil
}
// createBridge makes sure the bridge device exists, carries the bridge
// address (with a /24 prefix) and is up.
//
// The function is idempotent: an existing bridge is not assumed to be fully
// configured. Address and link state are (re)applied every time, so a bridge
// left half-configured by an earlier failed run is repaired instead of being
// silently accepted.
func (nm *NetworkManager) createBridge() error {
	link, err := netlink.LinkByName(nm.bridgeName)
	if err != nil {
		// Treat a failed lookup as "does not exist yet" and create the bridge.
		bridge := &netlink.Bridge{
			LinkAttrs: netlink.LinkAttrs{Name: nm.bridgeName},
		}
		if err := netlink.LinkAdd(bridge); err != nil {
			return fmt.Errorf("failed to add bridge: %w", err)
		}
		if link, err = netlink.LinkByName(nm.bridgeName); err != nil {
			return fmt.Errorf("failed to get bridge: %w", err)
		}
	}

	addr, err := netlink.ParseAddr(nm.bridgeIP + "/24")
	if err != nil {
		return fmt.Errorf("failed to parse bridge IP: %w", err)
	}
	// An address that is already present is fine.
	if err := netlink.AddrAdd(link, addr); err != nil && !errors.Is(err, os.ErrExist) {
		return fmt.Errorf("failed to add bridge address: %w", err)
	}

	if err := netlink.LinkSetUp(link); err != nil {
		return fmt.Errorf("failed to set bridge up: %w", err)
	}
	return nil
}
// createVethPair creates a veth pair for the container, attaches the host end
// to the bridge and brings it up. It returns the host-side and
// container-side interface names.
//
// The host-side name is "veth" plus the LAST (up to) 8 characters of the
// container ID. Using the tail instead of the head avoids a panic on IDs
// shorter than 8 characters and avoids every container getting the same name
// when IDs share a prefix (e.g. "container-<timestamp>"). Callers should use
// the returned name rather than deriving it from the ID themselves.
//
// If a step after creating the pair fails, the pair is deleted again.
//
// NOTE(review): the peer is created under the name "eth0" in the current
// namespace, which will conflict if the host already has an "eth0".
func (nm *NetworkManager) createVethPair(containerID string) (string, string, error) {
	if containerID == "" {
		return "", "", fmt.Errorf("failed to add veth: empty container ID")
	}
	suffix := containerID
	if len(suffix) > 8 {
		suffix = suffix[len(suffix)-8:]
	}
	vethHost := "veth" + suffix
	vethCont := "eth0"

	veth := &netlink.Veth{
		LinkAttrs: netlink.LinkAttrs{Name: vethHost},
		PeerName:  vethCont,
	}
	if err := netlink.LinkAdd(veth); err != nil {
		return "", "", fmt.Errorf("failed to add veth: %w", err)
	}
	// fail removes the freshly created pair before reporting the error.
	fail := func(format string, cause error) (string, string, error) {
		_ = netlink.LinkDel(veth)
		return "", "", fmt.Errorf(format, cause)
	}

	bridge, err := netlink.LinkByName(nm.bridgeName)
	if err != nil {
		return fail("failed to get bridge: %w", err)
	}
	if err := netlink.LinkSetMaster(veth, bridge); err != nil {
		return fail("failed to set veth master: %w", err)
	}
	if err := netlink.LinkSetUp(veth); err != nil {
		return fail("failed to set veth up: %w", err)
	}
	return vethHost, vethCont, nil
}
// configureContainerNetwork creates the container's named network namespace,
// moves the container end of the veth pair into it and configures the
// interface there. It returns the IP address assigned to the container.
//
// NOTE(review): the namespace used here is the named one under
// /var/run/netns/<containerID>; confirm how it relates to the network
// namespace of the container process itself.
func (nm *NetworkManager) configureContainerNetwork(containerID, vethCont string) (string, error) {
	if err := nm.createNetworkNamespace(containerID); err != nil {
		return "", fmt.Errorf("failed to create network namespace: %w", err)
	}

	ns, err := netns.GetFromPath(filepath.Join("/var/run/netns", containerID))
	if err != nil {
		return "", fmt.Errorf("failed to get network namespace: %w", err)
	}
	defer ns.Close()

	veth, err := netlink.LinkByName(vethCont)
	if err != nil {
		return "", fmt.Errorf("failed to get veth: %w", err)
	}
	// Move the interface into the namespace we just opened. The previous
	// LinkSetNsPid(veth, 0) did not refer to that namespace at all.
	if err := netlink.LinkSetNsFd(veth, int(ns)); err != nil {
		return "", fmt.Errorf("failed to set veth namespace: %w", err)
	}

	containerIP, err := nm.configureContainerInterface(ns, vethCont)
	if err != nil {
		return "", fmt.Errorf("failed to configure container interface: %w", err)
	}
	return containerIP, nil
}
// createNetworkNamespace creates a named network namespace for the container
// (visible under the netns directory as <containerID>).
//
// netns.NewNamed returns (handle, error) and switches the calling OS thread
// into the new namespace. This function therefore pins the goroutine to its
// thread, remembers the original namespace and switches back before
// returning, so the caller keeps running in the host namespace.
func (nm *NetworkManager) createNetworkNamespace(containerID string) error {
	nsPath := filepath.Join("/var/run/netns", containerID)
	if err := os.MkdirAll(filepath.Dir(nsPath), 0755); err != nil {
		return fmt.Errorf("failed to create netns directory: %w", err)
	}

	// Namespace membership is per OS thread.
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	hostNS, err := netns.Get()
	if err != nil {
		return fmt.Errorf("failed to get current namespace: %w", err)
	}
	defer hostNS.Close()

	newNS, err := netns.NewNamed(containerID)
	if err != nil {
		// The thread may already have been switched; go back best effort.
		_ = netns.Set(hostNS)
		return fmt.Errorf("failed to create named namespace: %w", err)
	}
	defer newNS.Close()

	if err := netns.Set(hostNS); err != nil {
		return fmt.Errorf("failed to restore host namespace: %w", err)
	}
	return nil
}
// configureContainerInterface enters the given network namespace, brings up
// "lo" and the named interface, assigns the interface an address from
// allocateIP (with a /24 prefix) and adds a default route via the bridge
// address. It returns the assigned IP.
//
// Namespace membership is per OS thread, so the goroutine is locked to its
// thread for the duration, and the namespace that was active on entry is
// restored on return. The earlier netns.Set(netns.None()) did not restore it.
func (nm *NetworkManager) configureContainerInterface(ns netns.NsHandle, ifName string) (string, error) {
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	hostNS, err := netns.Get()
	if err != nil {
		return "", fmt.Errorf("failed to get current namespace: %w", err)
	}
	defer hostNS.Close()

	if err := netns.Set(ns); err != nil {
		return "", fmt.Errorf("failed to set namespace: %w", err)
	}
	// Deferred calls run last-in-first-out: switch back first, then close
	// the handle, then unlock the thread.
	defer netns.Set(hostNS)

	// Bringing up loopback is best effort, as before.
	if lo, err := netlink.LinkByName("lo"); err == nil {
		_ = netlink.LinkSetUp(lo)
	}

	iface, err := netlink.LinkByName(ifName)
	if err != nil {
		return "", fmt.Errorf("failed to get container interface: %w", err)
	}
	if err := netlink.LinkSetUp(iface); err != nil {
		return "", fmt.Errorf("failed to set container interface up: %w", err)
	}

	containerIP := nm.allocateIP()
	addr, err := netlink.ParseAddr(containerIP + "/24")
	if err != nil {
		return "", fmt.Errorf("failed to parse container IP: %w", err)
	}
	if err := netlink.AddrAdd(iface, addr); err != nil {
		return "", fmt.Errorf("failed to add container address: %w", err)
	}

	// A nil Dst makes this the default route.
	route := &netlink.Route{
		LinkIndex: iface.Attrs().Index,
		Dst:       nil,
		Gw:        net.ParseIP(nm.bridgeIP),
	}
	if err := netlink.RouteAdd(route); err != nil {
		return "", fmt.Errorf("failed to add default route: %w", err)
	}
	return containerIP, nil
}
// allocateIP returns the address to give to a container.
//
// This is a placeholder strategy: it always yields the same fixed address.
// A real implementation needs proper IP address management (IPAM).
func (nm *NetworkManager) allocateIP() string {
	const fixedContainerIP = "10.22.0.2"
	return fixedContainerIP
}
// configureNAT appends an iptables MASQUERADE rule for traffic from the
// manager's subnet to the nat/POSTROUTING chain, unless natRuleExists reports
// the rule is already there.
//
// When iptables fails, whatever it printed is included in the returned error
// so the cause (missing binary, permissions, bad subnet) is visible.
func (nm *NetworkManager) configureNAT() error {
	if nm.natRuleExists() {
		return nil
	}

	cmd := exec.Command("iptables", "-t", "nat", "-A", "POSTROUTING",
		"-s", nm.subnet, "-j", "MASQUERADE")
	out, err := cmd.CombinedOutput()
	if err != nil {
		if msg := strings.TrimSpace(string(out)); msg != "" {
			return fmt.Errorf("failed to add NAT rule: %w: %s", err, msg)
		}
		return fmt.Errorf("failed to add NAT rule: %w", err)
	}
	return nil
}
// natRuleExists reports whether `iptables -C` for the subnet's MASQUERADE
// rule in nat/POSTROUTING exits successfully. Any failure to run the command
// counts as "rule not present".
func (nm *NetworkManager) natRuleExists() bool {
	check := exec.Command("iptables", "-t", "nat", "-C", "POSTROUTING",
		"-s", nm.subnet, "-j", "MASQUERADE")
	if err := check.Run(); err != nil {
		return false
	}
	return true
}
// NetworkConfig is the result of SetupNetwork: the devices and addresses that
// were set up for one container.
type NetworkConfig struct {
	BridgeName  string // bridge the host-side veth is attached to
	VethHost    string // host-side veth interface name
	VethCont    string // container-side veth interface name
	ContainerIP string // address assigned to the container interface
	GatewayIP   string // the bridge address, used as the default gateway
}
2.4 控制组管理
package container
import (
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
)
// CGroupManager manages one container's control group directory.
type CGroupManager struct {
	containerID string
	path        string        // cgroup directory; NewCGroupManager sets it to /sys/fs/cgroup/<containerID>
	config      *CGroupConfig // limits written by Create
}
// CGroupConfig carries the limits that are written into the cgroup's
// control files.
type CGroupConfig struct {
	MemoryMax string // written verbatim to memory.max
	CPUMax    string // written verbatim to cpu.max
	PidsMax   int    // written to pids.max
	IOWeight  int    // written to io.weight
}
// NewCGroupManager returns a manager for the cgroup directory
// /sys/fs/cgroup/<containerID> that will apply the given config.
func NewCGroupManager(containerID string, config *CGroupConfig) *CGroupManager {
	cgroupDir := filepath.Join("/sys/fs/cgroup", containerID)

	mgr := new(CGroupManager)
	mgr.containerID = containerID
	mgr.path = cgroupDir
	mgr.config = config
	return mgr
}
// Create makes the cgroup directory and applies the configured memory, CPU,
// pids and IO limits, in that order.
//
// It fails early on a nil config instead of panicking in a setter. If any
// limit cannot be applied, the directory is removed again (best effort) so
// that a failed Create does not leave an unconfigured cgroup behind.
//
// NOTE(review): the control files only exist when the corresponding
// controllers are enabled for this level of the hierarchy — presumably done
// by the environment; verify.
func (cgm *CGroupManager) Create() error {
	if cgm.config == nil {
		return fmt.Errorf("failed to create cgroup: config is nil")
	}
	if err := os.MkdirAll(cgm.path, 0755); err != nil {
		return fmt.Errorf("failed to create cgroup directory: %w", err)
	}

	steps := []struct {
		what  string
		apply func() error
	}{
		{"memory limit", cgm.setMemoryLimit},
		{"CPU limit", cgm.setCPULimit},
		{"pids limit", cgm.setPidsLimit},
		{"IO weight", cgm.setIOWeight},
	}
	for _, step := range steps {
		if err := step.apply(); err != nil {
			_ = os.Remove(cgm.path) // removes the empty cgroup; harmless if it is in use
			return fmt.Errorf("failed to set %s: %w", step.what, err)
		}
	}
	return nil
}
// AddProcess places the process with the given PID into this cgroup by
// writing the PID to the cgroup's cgroup.procs file.
func (cgm *CGroupManager) AddProcess(pid int) error {
	pidText := strconv.Itoa(pid)
	procsFile := filepath.Join(cgm.path, "cgroup.procs")
	return os.WriteFile(procsFile, []byte(pidText), 0644)
}
// RemoveProcess takes the process with the given PID out of this cgroup.
//
// A cgroup's cgroup.procs file cannot be edited by rewriting it with a
// shorter PID list: each write adds a single PID to that cgroup. A process
// leaves a cgroup only by being written into another one, so the PID is
// moved to the parent cgroup's cgroup.procs.
//
// If the PID is not currently listed in this cgroup the call does nothing
// and returns nil.
func (cgm *CGroupManager) RemoveProcess(pid int) error {
	data, err := os.ReadFile(filepath.Join(cgm.path, "cgroup.procs"))
	if err != nil {
		return err
	}

	pidStr := strconv.Itoa(pid)
	member := false
	for _, line := range strings.Fields(string(data)) {
		if line == pidStr {
			member = true
			break
		}
	}
	if !member {
		return nil
	}

	parentProcs := filepath.Join(filepath.Dir(cgm.path), "cgroup.procs")
	if err := os.WriteFile(parentProcs, []byte(pidStr), 0644); err != nil {
		return fmt.Errorf("failed to move pid %d out of cgroup %s: %w", pid, cgm.path, err)
	}
	return nil
}
// Destroy removes the cgroup directory tree via os.RemoveAll and returns
// whatever error that reports.
func (cgm *CGroupManager) Destroy() error {
	err := os.RemoveAll(cgm.path)
	return err
}
// GetStats collects the cgroup's memory.current, cpu.stat and pids.current
// contents. A file that cannot be read simply leaves its field empty; the
// returned error is always nil.
func (cgm *CGroupManager) GetStats() (*CGroupStats, error) {
	// read returns the file's content, or "" when it cannot be read.
	read := func(name string) string {
		raw, err := os.ReadFile(filepath.Join(cgm.path, name))
		if err != nil {
			return ""
		}
		return string(raw)
	}

	result := new(CGroupStats)
	result.MemoryCurrent = strings.TrimSpace(read("memory.current"))
	result.CPUStat = read("cpu.stat") // kept verbatim, multi-line
	result.PidsCurrent = strings.TrimSpace(read("pids.current"))
	return result, nil
}
// setMemoryLimit writes config.MemoryMax verbatim into the cgroup's
// memory.max file.
func (cgm *CGroupManager) setMemoryLimit() error {
	value := []byte(cgm.config.MemoryMax)
	target := filepath.Join(cgm.path, "memory.max")
	return os.WriteFile(target, value, 0644)
}
// setCPULimit writes config.CPUMax verbatim into the cgroup's cpu.max file.
func (cgm *CGroupManager) setCPULimit() error {
	value := []byte(cgm.config.CPUMax)
	target := filepath.Join(cgm.path, "cpu.max")
	return os.WriteFile(target, value, 0644)
}
// setPidsLimit writes the process-count limit into the cgroup's pids.max.
//
// A non-positive PidsMax is treated as "no limit" and written as "max":
// a negative number is not a valid value for the file, and 0 would forbid
// every fork, which is never what an unset (zero-value) config means.
func (cgm *CGroupManager) setPidsLimit() error {
	limit := "max"
	if cgm.config.PidsMax > 0 {
		limit = strconv.Itoa(cgm.config.PidsMax)
	}
	pidsMaxPath := filepath.Join(cgm.path, "pids.max")
	return os.WriteFile(pidsMaxPath, []byte(limit), 0644)
}
// setIOWeight writes config.IOWeight into the cgroup's io.weight file.
//
// A zero IOWeight means "not configured" and leaves the kernel default in
// place. Writing 0 is outside the valid weight range (1-10000 per the cgroup
// v2 documentation) and would make Create fail for every config that does
// not set the field.
func (cgm *CGroupManager) setIOWeight() error {
	if cgm.config.IOWeight == 0 {
		return nil
	}
	ioWeightPath := filepath.Join(cgm.path, "io.weight")
	return os.WriteFile(ioWeightPath, []byte(strconv.Itoa(cgm.config.IOWeight)), 0644)
}
// CGroupStats holds the raw text read from the cgroup's statistics files.
type CGroupStats struct {
	MemoryCurrent string // trimmed content of memory.current
	CPUStat       string // full, untrimmed content of cpu.stat
	PidsCurrent   string // trimmed content of pids.current
}
2.5 文件系统管理
package container
import (
"fmt"
"os"
"path/filepath"
"syscall"
"golang.org/x/sys/unix"
)
// FileSystemManager prepares the container's filesystem view: the new root
// and the mounts inside it.
type FileSystemManager struct {
	rootfs string  // directory that becomes the container's root
	mounts []Mount // user-requested mounts
}
// NewFileSystemManager returns a manager for the given root filesystem and
// user mounts. The mounts slice is stored as given, not copied.
func NewFileSystemManager(rootfs string, mounts []Mount) *FileSystemManager {
	fsm := new(FileSystemManager)
	fsm.rootfs, fsm.mounts = rootfs, mounts
	return fsm
}
// SetupFileSystem builds the container's filesystem view. It is meant to run
// inside the container's mount namespace.
//
// Order of operations:
//  1. pivot into the new root,
//  2. mount /proc, /sys, /dev, ... inside the new root,
//  3. create the basic device nodes,
//  4. apply the user-requested mounts.
//
// Earlier versions mounted the special filesystems and created device nodes
// BEFORE pivoting, i.e. on the host's paths (/proc, /dev, ...). That work was
// thrown away by the pivot and touched the host's /dev, so it has been
// removed; everything now happens relative to the new root.
//
// NOTE(review): user mounts run after the old root has been detached, so a
// Source that refers to a host path is no longer reachable at that point —
// verify how bind mounts from the host are expected to work.
func (fsm *FileSystemManager) SetupFileSystem() error {
	if err := fsm.pivotRoot(); err != nil {
		return fmt.Errorf("failed to pivot root: %w", err)
	}
	if err := fsm.mountSpecialFilesystems(); err != nil {
		return fmt.Errorf("failed to mount special filesystems: %w", err)
	}
	if err := fsm.createDevices(); err != nil {
		return fmt.Errorf("failed to create devices: %w", err)
	}
	if err := fsm.mountUserMounts(); err != nil {
		return fmt.Errorf("failed to mount user mounts: %w", err)
	}
	return nil
}
// mountSpecialFilesystems mounts proc, sysfs, devtmpfs, devpts and a tmpfs
// for /dev/shm at their usual locations, in that order.
//
// Every mount point is created first if it is missing: a minimal root
// filesystem does not necessarily ship /proc, /sys or /dev, and the mount
// would otherwise fail with ENOENT.
//
// NOTE(review): devtmpfs exposes the host's device nodes inside the
// container; confirm that this is acceptable.
func (fsm *FileSystemManager) mountSpecialFilesystems() error {
	specials := []struct {
		source, target, fstype string
	}{
		{"proc", "/proc", "proc"},
		{"sysfs", "/sys", "sysfs"},
		{"devtmpfs", "/dev", "devtmpfs"},
		{"devpts", "/dev/pts", "devpts"}, // must come after /dev
		{"tmpfs", "/dev/shm", "tmpfs"},   // must come after /dev
	}

	for _, m := range specials {
		if err := os.MkdirAll(m.target, 0755); err != nil {
			return fmt.Errorf("failed to create %s: %w", m.target, err)
		}
		if err := unix.Mount(m.source, m.target, m.fstype, 0, ""); err != nil {
			return fmt.Errorf("failed to mount %s: %w", m.target, err)
		}
	}
	return nil
}
// createDevices makes sure the basic character devices exist under /dev:
// null, zero, random, urandom, tty and console.
//
// A node that already exists is left alone. When /dev is a devtmpfs these
// nodes are provided by the kernel, and treating EEXIST as fatal made this
// function fail every time.
func (fsm *FileSystemManager) createDevices() error {
	devices := []struct {
		name         string
		major, minor uint32
	}{
		{"/dev/null", 1, 3},
		{"/dev/zero", 1, 5},
		{"/dev/random", 1, 8},
		{"/dev/urandom", 1, 9},
		{"/dev/tty", 5, 0},
		{"/dev/console", 5, 1},
	}

	const mode = syscall.S_IFCHR | 0666
	for _, device := range devices {
		dev := int(unix.Mkdev(device.major, device.minor))
		if err := unix.Mknod(device.name, mode, dev); err != nil && !os.IsExist(err) {
			return fmt.Errorf("failed to create device %s: %w", device.name, err)
		}
	}
	return nil
}
// pivotRoot makes fsm.rootfs the root of the current mount namespace and
// detaches the previous root.
//
// Fixes over the earlier version:
//   - Mount propagation is switched to private first. pivot_root(2) refuses
//     to operate when the relevant parent mounts are shared, and private
//     propagation also keeps the container's mounts from leaking to the host.
//   - put_old is created INSIDE the new root; pivot_root(2) requires put_old
//     to be at or underneath new_root, so "/.oldroot" on the old root could
//     not work.
//   - The emptied put_old directory is removed with os.Remove rather than
//     os.RemoveAll, so a mount that is unexpectedly still attached can never
//     be deleted recursively.
func (fsm *FileSystemManager) pivotRoot() error {
	if err := unix.Mount("", "/", "", unix.MS_REC|unix.MS_PRIVATE, ""); err != nil {
		return fmt.Errorf("failed to make mounts private: %w", err)
	}

	// new_root must be a mount point; bind-mounting it onto itself makes it one.
	if err := unix.Mount(fsm.rootfs, fsm.rootfs, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
		return fmt.Errorf("failed to bind mount rootfs: %w", err)
	}

	putold := filepath.Join(fsm.rootfs, ".oldroot")
	if err := os.MkdirAll(putold, 0700); err != nil {
		return fmt.Errorf("failed to create put_old directory: %w", err)
	}

	if err := unix.PivotRoot(fsm.rootfs, putold); err != nil {
		return fmt.Errorf("failed to pivot_root: %w", err)
	}
	if err := os.Chdir("/"); err != nil {
		return fmt.Errorf("failed to change working directory: %w", err)
	}

	// After the pivot the old root is visible at /.oldroot in the new root.
	const oldRoot = "/.oldroot"
	if err := unix.Unmount(oldRoot, unix.MNT_DETACH); err != nil {
		return fmt.Errorf("failed to unmount old root: %w", err)
	}
	if err := os.Remove(oldRoot); err != nil {
		return fmt.Errorf("failed to remove put_old directory: %w", err)
	}
	return nil
}
// mountUserMounts applies every user-requested mount in order and stops at
// the first one that fails, reporting its destination.
func (fsm *FileSystemManager) mountUserMounts() error {
	for i := range fsm.mounts {
		requested := fsm.mounts[i]
		err := fsm.mountUserMount(requested)
		if err == nil {
			continue
		}
		return fmt.Errorf("failed to mount %s: %v", requested.Destination, err)
	}
	return nil
}
// mountUserMount performs one user-requested mount, creating the destination
// directory first.
//
// Options "ro", "noexec", "nosuid" and "nodev" become mount flags; "rw" and
// empty strings are accepted as no-ops; any other option is passed through to
// the filesystem as mount data (comma-joined) instead of being dropped.
//
// Type "bind" is handled as a real bind mount (MS_BIND|MS_REC). Passing
// "bind" as a filesystem type, as before, does not create a bind mount.
// Because the kernel ignores most flags on the initial bind, restrictive
// flags such as "ro" are applied with a follow-up remount.
func (fsm *FileSystemManager) mountUserMount(mount Mount) error {
	if mount.Destination == "" {
		return fmt.Errorf("failed to create mount point: empty destination")
	}
	if err := os.MkdirAll(mount.Destination, 0755); err != nil {
		return fmt.Errorf("failed to create mount point: %w", err)
	}

	var (
		flags uintptr
		data  string
	)
	for _, option := range mount.Options {
		switch option {
		case "", "rw":
			// nothing to do
		case "ro":
			flags |= unix.MS_RDONLY
		case "noexec":
			flags |= unix.MS_NOEXEC
		case "nosuid":
			flags |= unix.MS_NOSUID
		case "nodev":
			flags |= unix.MS_NODEV
		default:
			if data != "" {
				data += ","
			}
			data += option
		}
	}

	if mount.Type == "bind" {
		if err := unix.Mount(mount.Source, mount.Destination, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
			return fmt.Errorf("failed to mount: %w", err)
		}
		if flags != 0 {
			remount := unix.MS_REMOUNT | unix.MS_BIND | flags
			if err := unix.Mount("", mount.Destination, "", remount, ""); err != nil {
				return fmt.Errorf("failed to mount: %w", err)
			}
		}
		return nil
	}

	if err := unix.Mount(mount.Source, mount.Destination, mount.Type, flags, data); err != nil {
		return fmt.Errorf("failed to mount: %w", err)
	}
	return nil
}
三、主程序
3.1 完整的 main.go
package main
import (
"flag"
"fmt"
"log"
"os"
"os/exec"
"syscall"
"time"
"container/container"
"container/network"
)
// main parses the command line, builds a container from it and runs it.
//
// The command's arguments can be given in two ways, which are combined in
// this order:
//   - -args passes its value as ONE argument (unchanged behavior);
//   - any positional arguments after the flags are appended one by one, which
//     makes multi-argument commands possible, e.g.
//     toyctr -rootfs ./rootfs -cmd /bin/sh -- -c 'sleep 60'
//
// With -detach the container is started from a goroutine and main returns
// after a fixed one-second grace period.
// NOTE(review): nothing waits for or cleans up after a detached container
// once this process has exited — confirm the intended lifecycle.
func main() {
	var (
		rootfs      = flag.String("rootfs", "", "Root filesystem path")
		cmd         = flag.String("cmd", "/bin/sh", "Command to run in container")
		args        = flag.String("args", "", "Command arguments")
		name        = flag.String("name", "", "Container name")
		memory      = flag.String("memory", "128M", "Memory limit")
		cpu         = flag.String("cpu", "50000 100000", "CPU limit")
		pids        = flag.Int("pids", 100, "Process limit")
		networkMode = flag.String("network", "bridge", "Network mode")
		ip          = flag.String("ip", "", "Container IP address")
		ports       = flag.String("ports", "", "Port mappings (host:container)")
		mounts      = flag.String("mounts", "", "Mount points (source:destination:type:options)")
		readOnly    = flag.Bool("readonly", false, "Read-only root filesystem")
		detach      = flag.Bool("detach", false, "Run container in background")
	)
	flag.Parse()

	if *rootfs == "" {
		log.Fatal("rootfs is required")
	}

	// The container ID doubles as the default name.
	containerID := generateContainerID()
	if *name == "" {
		*name = containerID
	}

	// Command line of the contained process.
	command := *cmd
	var commandArgs []string
	if *args != "" {
		commandArgs = append(commandArgs, *args)
	}
	commandArgs = append(commandArgs, flag.Args()...)

	portMappings := parsePortMappings(*ports)
	mountPoints := parseMounts(*mounts)

	c := container.NewContainer(containerID, *name, *rootfs, command, commandArgs)
	c.SetResourceLimits(*memory, *cpu, *pids)
	c.SetNetworkConfig(*networkMode, *ip, portMappings)
	c.SetMounts(mountPoints)
	if *readOnly {
		c.ReadOnly = true
	}

	if !*detach {
		if err := c.Run(); err != nil {
			log.Fatalf("Failed to run container: %v", err)
		}
		return
	}

	go func() {
		if err := c.Run(); err != nil {
			log.Printf("Container %s failed: %v", containerID, err)
		}
	}()
	// Crude grace period so that startup can finish before main returns.
	time.Sleep(1 * time.Second)
	fmt.Printf("Container %s started in background\n", containerID)
	// The earlier hint pointed at "docker attach", which cannot attach to
	// containers started by this program.
	fmt.Printf("Attaching to container %s is not supported yet\n", containerID)
}
// generateContainerID returns "container-" followed by the current Unix time
// in nanoseconds, written in decimal.
func generateContainerID() string {
	nanos := time.Now().UnixNano()
	return fmt.Sprint("container-", nanos)
}
// parsePortMappings parses a comma-separated list of port mappings.
//
// Each entry has the form "hostPort:containerPort" with an optional
// "/tcp" or "/udp" suffix (default "tcp"), e.g. "8080:80,5353:53/udp".
// Surrounding whitespace is ignored and ports must be in 1-65535.
//
// The signature has no error return, so a malformed entry is skipped as
// before — but it is now reported through the log instead of vanishing
// silently. An empty input yields a nil slice.
func parsePortMappings(ports string) []container.PortMapping {
	var mappings []container.PortMapping
	if ports == "" {
		return mappings
	}

	for _, entry := range strings.Split(ports, ",") {
		entry = strings.TrimSpace(entry)
		if entry == "" {
			continue
		}

		spec, proto, hasProto := strings.Cut(entry, "/")
		if !hasProto {
			proto = "tcp"
		}
		proto = strings.ToLower(proto)
		if proto != "tcp" && proto != "udp" {
			log.Printf("ignoring port mapping %q: unsupported protocol", entry)
			continue
		}

		hostStr, contStr, ok := strings.Cut(spec, ":")
		if !ok {
			log.Printf("ignoring port mapping %q: expected host:container", entry)
			continue
		}
		hostPort, hostErr := strconv.Atoi(strings.TrimSpace(hostStr))
		containerPort, contErr := strconv.Atoi(strings.TrimSpace(contStr))
		if hostErr != nil || contErr != nil {
			log.Printf("ignoring port mapping %q: ports must be numbers", entry)
			continue
		}
		if hostPort < 1 || hostPort > 65535 || containerPort < 1 || containerPort > 65535 {
			log.Printf("ignoring port mapping %q: port out of range 1-65535", entry)
			continue
		}

		mappings = append(mappings, container.PortMapping{
			HostPort:      hostPort,
			ContainerPort: containerPort,
			Protocol:      proto,
		})
	}
	return mappings
}
// parseMounts parses the -mounts flag value into mount descriptions.
//
// Entries are separated by "," and have the form
// "source:destination[:type[:options]]". Type defaults to "bind" and options
// to ["rw"].
//
// Options are comma-separated too, which collides with the entry separator:
// "a:b:bind:ro,noexec" splits into "a:b:bind:ro" and "noexec". A piece that
// contains no ":" cannot be a mount on its own, so it is treated as a
// further option of the preceding mount (such pieces used to be dropped,
// silently losing every option after the first). An empty type field keeps
// the "bind" default instead of overriding it with "".
func parseMounts(mounts string) []container.Mount {
	var mountPoints []container.Mount
	if mounts == "" {
		return mountPoints
	}

	for _, piece := range strings.Split(mounts, ",") {
		piece = strings.TrimSpace(piece)
		parts := strings.Split(piece, ":")
		if len(parts) < 2 {
			if piece != "" && len(mountPoints) > 0 {
				last := &mountPoints[len(mountPoints)-1]
				last.Options = append(last.Options, piece)
			}
			continue
		}

		mountPoint := container.Mount{
			Source:      parts[0],
			Destination: parts[1],
			Type:        "bind",
			Options:     []string{"rw"},
		}
		if len(parts) > 2 && parts[2] != "" {
			mountPoint.Type = parts[2]
		}
		if len(parts) > 3 {
			mountPoint.Options = []string{parts[3]}
		}
		mountPoints = append(mountPoints, mountPoint)
	}
	return mountPoints
}
四、调试和测试
4.1 调试模式
// Debug mode: the -debug flag makes Run log the container's configuration
// before it starts.
var debug = flag.Bool("debug", false, "Enable debug mode")

// Run (fragment) shows only the debug logging added at the top of the
// method; the remainder of the body is elided in this snippet.
func (c *Container) Run() error {
	if *debug {
		log.Printf("Starting container %s with rootfs: %s", c.ID, c.Rootfs)
		log.Printf("Command: %s %v", c.Cmd, c.Args)
		log.Printf("Memory limit: %s", c.MemoryLimit)
		log.Printf("CPU limit: %s", c.CPULimit)
		log.Printf("Network mode: %s", c.NetworkMode)
	}
	// ... rest of the method elided
}
4.2 日志记录
import "github.com/sirupsen/logrus"
// logger is the package-wide structured logger.
var logger = logrus.New()

// init configures the logger: Info level, text output with full timestamps.
func init() {
	logger.SetLevel(logrus.InfoLevel)
	logger.SetFormatter(&logrus.TextFormatter{
		FullTimestamp: true,
	})
}

// Run (fragment) shows only the structured log entry emitted at the top of
// the method; the remainder of the body is elided in this snippet.
func (c *Container) Run() error {
	logger.WithFields(logrus.Fields{
		"container_id": c.ID,
		"rootfs":       c.Rootfs,
		"command":      c.Cmd,
	}).Info("Starting container")
	// ... rest of the method elided
}
4.3 错误处理
// Run (fragment) illustrates the error handling around the container's
// lifetime; the code that builds and starts cmd is elided in this snippet.
func (c *Container) Run() error {
	// Validate the container configuration.
	if err := c.validate(); err != nil {
		return fmt.Errorf("validation failed: %v", err)
	}
	// Create the context; it is released when Run returns.
	ctx, cancel := context.WithCancel(context.Background())
	c.cancel = cancel
	defer cancel()
	// ... rest of the method elided
	// Wait for the child process to finish.
	if err := cmd.Wait(); err != nil {
		// An ExitError carries the child's exit code, which is logged and
		// included in the returned error.
		if exitError, ok := err.(*exec.ExitError); ok {
			logger.WithFields(logrus.Fields{
				"container_id": c.ID,
				"exit_code":    exitError.ExitCode(),
			}).Error("Container exited with error")
			return fmt.Errorf("container exited with code %d: %v", exitError.ExitCode(), err)
		}
		// Any other error means waiting itself failed.
		return fmt.Errorf("failed to wait for child process: %v", err)
	}
	return nil
}
五、验证检查清单
基础功能
- [ ] 能够编译完整程序
- [ ] 能够运行容器
- [ ] 能够配置网络
- [ ] 能够配置资源限制
高级功能
- [ ] 能够处理端口映射
- [ ] 能够处理挂载点
- [ ] 能够处理只读文件系统
- [ ] 能够后台运行容器
调试技能
- [ ] 能够使用调试模式
- [ ] 能够查看日志
- [ ] 能够处理错误
- [ ] 能够进行性能测试
阶段化演进路线
根据实际开发经验,容器运行时的开发通常遵循以下阶段:
阶段 1:完善基础容器机制
1.1 增加 Capabilities 控制
// setupCapabilities restricts what the container can do by dropping
// dangerous capabilities from the calling thread's capability bounding set:
// CAP_SYS_ADMIN, CAP_SYS_PTRACE and CAP_SYS_MODULE.
//
// The capabilities the container is meant to keep are CAP_NET_BIND_SERVICE,
// CAP_CHOWN, CAP_SETUID and CAP_SETGID. That list used to be declared as a
// local variable that was never used, which does not compile in Go, so it
// now lives in this comment. This function is a deny-list; a stricter
// variant would drop every capability that is not on the keep list.
//
// The capability numbers come straight from the unix package constants, so
// no name-to-number helper is needed.
func setupCapabilities() error {
	dangerousCaps := []uintptr{
		unix.CAP_SYS_ADMIN,
		unix.CAP_SYS_PTRACE,
		unix.CAP_SYS_MODULE,
	}

	for _, capability := range dangerousCaps {
		if err := unix.Prctl(unix.PR_CAPBSET_DROP, capability, 0, 0, 0); err != nil {
			return err
		}
	}
	return nil
}
1.2 增加 Seccomp 安全策略
// setupSeccomp installs a seccomp filter that allows everything except a
// short list of dangerous system calls, which kill the process.
//
// Because "mount" is on the list, this must run only after the container's
// own filesystem setup has finished.
//
// Fixes over the earlier version:
//   - GetSyscallFromName returns (syscall, error) and cannot be passed
//     directly as an argument to AddRule; the result is now checked first.
//   - "umount2" is blocked too; it is the unmount call actually used on
//     common architectures, where plain "umount" may not exist.
//   - The loop variable no longer shadows the syscall package name.
//   - The filter's userspace resources are released on return.
func setupSeccomp() error {
	filter, err := seccomp.NewFilter(seccomp.ActAllow)
	if err != nil {
		return err
	}
	defer filter.Release()

	blocked := []string{
		"mount", "umount", "umount2", "ptrace", "reboot",
		"swapon", "swapoff", "syslog",
	}
	for _, name := range blocked {
		call, err := seccomp.GetSyscallFromName(name)
		if err != nil {
			// Not a system call on this architecture: nothing to block.
			continue
		}
		if err := filter.AddRule(call, seccomp.ActKillProcess); err != nil {
			return err
		}
	}
	return filter.Load()
}
阶段 2:强化网络子系统
2.1 IPAM 模块实现
// IPAM hands out container IP addresses from one subnet and remembers which
// ones are taken.
type IPAM struct {
	Subnet    *net.IPNet      // range addresses are allocated from
	Gateway   net.IP          // allocation starts right after this address
	Allocated map[string]bool // taken addresses, keyed by their string form
	file      string          // presumably where save() persists the state — verify
	mu        sync.Mutex      // serializes Allocate
}
// Allocate returns the first free address after the gateway that lies inside
// the subnet, marks it as taken and saves the state.
//
// It returns an error when the subnet is not configured or has no free
// address left.
//
// Fixes over the earlier version:
//   - A nil Allocated map is initialized instead of panicking on the first
//     write.
//   - The IPv4 broadcast address (the last address of the subnet) is never
//     handed out.
//
// NOTE(review): the result of save() is not inspected here because its
// signature is not visible in this snippet — confirm whether it can fail.
func (i *IPAM) Allocate() (net.IP, error) {
	i.mu.Lock()
	defer i.mu.Unlock()

	if i.Subnet == nil {
		return nil, fmt.Errorf("no subnet configured")
	}
	if i.Allocated == nil {
		i.Allocated = make(map[string]bool)
	}

	// Work out the broadcast address for IPv4 subnets that have one
	// (/31 and /32 do not).
	var broadcast net.IP
	if base := i.Subnet.IP.To4(); base != nil && len(i.Subnet.Mask) == net.IPv4len {
		if ones, bits := i.Subnet.Mask.Size(); bits-ones >= 2 {
			broadcast = make(net.IP, net.IPv4len)
			for k := range base {
				broadcast[k] = base[k] | ^i.Subnet.Mask[k]
			}
		}
	}

	for ip := binaryInc(i.Gateway); i.Subnet.Contains(ip); ip = binaryInc(ip) {
		if broadcast != nil && ip.Equal(broadcast) {
			continue
		}
		key := ip.String()
		if i.Allocated[key] {
			continue
		}
		i.Allocated[key] = true
		i.save()
		return ip, nil
	}
	return nil, fmt.Errorf("no available IP in %s", i.Subnet.String())
}
2.2 多容器网络互通
// setupMultiContainerNetwork attaches one more container to the shared
// bridge "toybr0" so that several containers can talk to each other.
//
// Steps: ensure the bridge, take a unique address from the IPAM state file,
// create the veth pair, then configure the container's network. The first
// failing step aborts the setup and its error is returned unchanged.
func setupMultiContainerNetwork(containerID string) error {
	// Step 1: the shared bridge.
	bridge, bridgeErr := ensureBridge("toybr0", "10.22.0.1/24")
	if bridgeErr != nil {
		return bridgeErr
	}

	// Step 2: a unique address for this container.
	allocator, ipamErr := NewIPAM("/var/lib/toyctr/ipam.json", "10.22.0.0/24")
	if ipamErr != nil {
		return ipamErr
	}
	addr, allocErr := allocator.Allocate()
	if allocErr != nil {
		return allocErr
	}

	// Step 3: the veth pair.
	hostEnd, containerEnd, vethErr := createVethPair(containerID)
	if vethErr != nil {
		return vethErr
	}

	// Step 4: wire everything together.
	return configureContainerNetwork(hostEnd, containerEnd, addr, bridge)
}
阶段 3:OverlayFS 镜像分层
3.1 OverlayFS 实现
// OverlayPaths lists the directories that make up one container's OverlayFS
// mount.
type OverlayPaths struct {
	Base     string // the container's directory holding Upper, Work and Merged
	Lower    string // read-only image layer
	Upper    string // writable layer
	Work     string // OverlayFS work directory
	Merged   string // merged view, used as the container's root
	ImageRef string // image name the lower layer belongs to
}
// PrepareOverlay creates the per-container upper/work/merged directories
// under /var/lib/toyctr/containers/<containerID> and mounts an OverlayFS at
// "merged", using /var/lib/toyctr/images/<imageName>/lower as the read-only
// layer. It returns the paths involved.
//
// The image layer is checked before anything is created, so a missing or
// mistyped image produces a clear error and leaves no empty container
// directories behind. Empty IDs or image names are rejected because they
// would silently resolve to the parent directories.
func PrepareOverlay(containerID, imageName string) (*OverlayPaths, error) {
	if containerID == "" || imageName == "" {
		return nil, fmt.Errorf("prepare overlay: container ID and image name are required")
	}

	base := filepath.Join("/var/lib/toyctr/containers", containerID)
	imgLower := filepath.Join("/var/lib/toyctr/images", imageName, "lower")

	info, err := os.Stat(imgLower)
	if err != nil {
		return nil, fmt.Errorf("image layer %s: %w", imgLower, err)
	}
	if !info.IsDir() {
		return nil, fmt.Errorf("image layer %s is not a directory", imgLower)
	}

	// Directory layout of the container.
	upper := filepath.Join(base, "upper")
	work := filepath.Join(base, "work")
	merged := filepath.Join(base, "merged")
	for _, d := range []string{base, upper, work, merged} {
		if err := os.MkdirAll(d, 0755); err != nil {
			return nil, err
		}
	}

	opts := fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s",
		imgLower, upper, work)
	if err := unix.Mount("overlay", merged, "overlay", 0, opts); err != nil {
		return nil, fmt.Errorf("mount overlay: %w", err)
	}

	return &OverlayPaths{
		Base:     base,
		Lower:    imgLower,
		Upper:    upper,
		Work:     work,
		Merged:   merged,
		ImageRef: imageName,
	}, nil
}
阶段 4:Volume 挂载支持
4.1 挂载解析
// MountSpec is one parsed mount request.
type MountSpec struct {
	Type string // "bind" | "tmpfs"
	Src  string // source path (bind mounts)
	Dst  string // mount point
	RO   bool   // mount read-only
	Opts string // filesystem-specific options
}

// parseMountSpec parses a comma-separated mount description such as
// "type=bind,src=/data,dst=/mnt,ro".
//
// Recognized tokens:
//   - "ro" / "readonly" and "rw" set or clear the read-only flag;
//   - "type=", "src=" (or "source="), "dst=" (or "target=") and "opts=".
//
// Whitespace around tokens and empty tokens (e.g. a trailing comma) are
// ignored. An error is returned for a token without "=", for an unknown key,
// for a missing destination, and for a bind mount without a source. On error
// the returned MountSpec is the zero value.
func parseMountSpec(s string) (MountSpec, error) {
	var ms MountSpec
	for _, token := range strings.Split(s, ",") {
		token = strings.TrimSpace(token)
		switch token {
		case "":
			continue
		case "ro", "readonly":
			ms.RO = true
			continue
		case "rw":
			ms.RO = false
			continue
		}

		key, value, ok := strings.Cut(token, "=")
		if !ok {
			return MountSpec{}, fmt.Errorf("bad mount token: %q", token)
		}
		switch key {
		case "type":
			ms.Type = value
		case "src", "source":
			ms.Src = value
		case "dst", "target":
			ms.Dst = value
		case "opts":
			ms.Opts = value
		default:
			return MountSpec{}, fmt.Errorf("unknown mount key %q in token %q", key, token)
		}
	}

	if ms.Dst == "" {
		return MountSpec{}, fmt.Errorf("mount spec %q has no destination", s)
	}
	if ms.Type == "bind" && ms.Src == "" {
		return MountSpec{}, fmt.Errorf("bind mount %q has no source", s)
	}
	return ms, nil
}
4.2 挂载执行
// setupMounts performs the given mounts in order, creating each mount point
// first. It stops at the first failure.
//
// Supported types are "bind" (recursive bind mount of Src onto Dst) and
// "tmpfs" (fresh tmpfs with Opts as mount data). Read-only is honored for
// both. A bind mount needs a follow-up remount for that, because the kernel
// ignores MS_RDONLY on the initial bind.
//
// An unknown type is reported as an error. It used to be skipped silently,
// leaving only an empty directory where the mount was expected.
func setupMounts(mounts []MountSpec) error {
	for _, m := range mounts {
		if m.Dst == "" {
			return fmt.Errorf("mount of type %q has no destination", m.Type)
		}
		if err := os.MkdirAll(m.Dst, 0755); err != nil {
			return err
		}

		switch m.Type {
		case "bind":
			if err := unix.Mount(m.Src, m.Dst, "",
				unix.MS_BIND|unix.MS_REC, ""); err != nil {
				return err
			}
			if m.RO {
				if err := unix.Mount("", m.Dst, "",
					unix.MS_REMOUNT|unix.MS_BIND|unix.MS_RDONLY, ""); err != nil {
					return err
				}
			}
		case "tmpfs":
			var flags uintptr
			if m.RO {
				flags |= unix.MS_RDONLY
			}
			if err := unix.Mount("tmpfs", m.Dst, "tmpfs", flags, m.Opts); err != nil {
				return err
			}
		default:
			return fmt.Errorf("unsupported mount type %q for %s", m.Type, m.Dst)
		}
	}
	return nil
}
实战练习
练习 1:基础容器实现
- 实现基本的命名空间隔离
- 添加 Cgroup 资源限制
- 配置简单的网络连接
验证步骤:
# 1. 编译程序
go build -o toyctr main.go
# 2. 运行容器
sudo ./toyctr -rootfs /path/to/rootfs -cmd /bin/sh -args "-c 'sleep 60'"
# 3. 验证隔离
# 在容器内查看进程(例如以 -cmd /bin/sh 交互式运行容器后执行)
ps aux
# 应该只看到容器内的进程;在宿主机的另一个终端执行 ps 则仍会看到宿主机的全部进程
# 4. 验证网络
# 在容器内
ip addr show
# 应该看到独立的网络接口
练习 2:多容器网络
- 实现 IPAM 模块
- 支持多容器互通
- 验证容器间通信
验证步骤:
# 1. 启动第一个容器
sudo ./toyctr -rootfs /path/to/rootfs -cmd /bin/sh -args "-c 'sleep 600'" &
CONTAINER1_PID=$!
# 2. 启动第二个容器
sudo ./toyctr -rootfs /path/to/rootfs -cmd /bin/sh -args "-c 'sleep 600'" &
CONTAINER2_PID=$!
# 3. 进入第一个容器测试网络
sudo nsenter -t $CONTAINER1_PID -n ping -c 1 10.22.0.3
# 4. 验证 IP 分配
cat /var/lib/toyctr/ipam.json
练习 3:文件系统隔离
- 实现 OverlayFS 支持
- 添加 Volume 挂载
- 测试只读根文件系统
验证步骤:
# 1. 准备镜像层
mkdir -p /var/lib/toyctr/images/busybox/lower
docker export $(docker create busybox) | sudo tar -C /var/lib/toyctr/images/busybox/lower -xf -
# 2. 运行容器
sudo ./toyctr -image busybox -cmd /bin/sh
# 3. 在容器内测试
# 创建文件
touch /test_file
ls -la /test_file
# 4. 退出容器后检查
# 文件应该在 upper 层
ls -la /var/lib/toyctr/containers/*/upper/test_file
练习 4:安全加固
- 实现 Capabilities 控制
- 添加 Seccomp 策略
- 测试安全限制效果
验证步骤:
# 1. 运行带安全限制的容器
sudo ./toyctr -rootfs /path/to/rootfs -cmd /bin/sh -args "-c 'mount -t tmpfs tmpfs /tmp'"
# 2. 应该看到权限被拒绝的错误
# mount: /tmp: Operation not permitted
# 3. 检查 capabilities
cat /proc/self/status | grep Cap
# 应该看到受限的 capabilities
验证检查清单
基础功能验证
- [ ] 容器进程隔离正常
- [ ] 网络命名空间独立
- [ ] 文件系统隔离有效
- [ ] 资源限制生效
网络功能验证
- [ ] 容器间可以通信
- [ ] 外网访问正常
- [ ] IP 分配不冲突
- [ ] 端口映射正确
文件系统验证
- [ ] OverlayFS 挂载成功
- [ ] Volume 挂载正常
- [ ] 只读限制生效
- [ ] 数据持久化正确
安全功能验证
- [ ] Capabilities 限制生效
- [ ] Seccomp 策略有效
- [ ] 危险系统调用被阻止
- [ ] 权限提升被限制
相关链接
- 11-Go实现最小容器 - 最小实现
- 13-容器生命周期管理 - 生命周期管理
- 14-调试技术与工具 - 调试技术详解
下一步:让我们学习容器生命周期管理,这是容器运行时的核心!