HiHuo
首页
博客
手册
工具
首页
博客
手册
工具
  • 手撸容器系统

    • 完整手撸容器技术文档系列
    • 01-容器本质与基础概念
    • 02-Namespace隔离机制
    • 03-CGroup资源控制
    • 04-Capabilities与安全机制
    • 05-容器网络原理
    • 06-网络模式与实现
    • 07-CNI插件开发
    • 08-RootFS与文件系统隔离
    • 09-OverlayFS镜像分层
    • 10-命令行手撸容器
    • 11-Go实现最小容器
    • 12-Go实现完整容器
    • 13-容器生命周期管理
    • 14-调试技术与工具
    • 15-OCI规范与标准化
    • 16-进阶场景与优化
    • 常见问题与故障排查
    • 参考资料与延伸阅读

12-Go实现完整容器

学习目标

  • 实现功能完整的容器运行时
  • 掌握网络自动配置技术
  • 理解容器生命周期管理
  • 能够处理错误和资源清理
  • 掌握容器的高级功能

前置知识

  • Go 语言进阶
  • 容器网络原理
  • 系统编程基础
  • 并发编程基础

一、项目架构

1.1 项目结构

container/
├── go.mod
├── main.go
├── container/
│   ├── container.go
│   ├── namespace.go
│   ├── cgroup.go
│   ├── network.go
│   ├── filesystem.go
│   └── lifecycle.go
├── network/
│   ├── bridge.go
│   ├── veth.go
│   └── nat.go
├── storage/
│   ├── rootfs.go
│   └── overlay.go
└── utils/
    ├── logging.go
    └── errors.go

1.2 依赖管理

module container

go 1.21

require (
    github.com/vishvananda/netlink v1.3.1
    github.com/vishvananda/netns v0.0.4
    golang.org/x/sys v0.15.0
    github.com/sirupsen/logrus v1.9.3
)

二、核心实现

2.1 容器主结构

package container

import (
    "context"
    "fmt"
    "os"
    "os/exec"
    "sync"
    "syscall"
    "time"
    "golang.org/x/sys/unix"
)

// Container holds the configuration and runtime state of a single container.
type Container struct {
    ID          string
    Name        string
    Rootfs      string
    Cmd         string
    Args        []string
    Env         []string
    WorkingDir  string
    User        string
    Hostname    string
    
    // Resource limits.
    MemoryLimit string
    CPULimit    string
    PidsLimit   int
    
    // Network configuration.
    NetworkMode string
    IPAddress   string
    PortMappings []PortMapping
    
    // Storage configuration.
    Mounts      []Mount
    ReadOnly    bool
    
    // Runtime state.
    Status      ContainerStatus
    PID         int
    CreatedAt   time.Time
    StartedAt   time.Time
    
    // Internal state: mutex is taken by the setter/getter methods on
    // Container; cancel is the cancel function of the context created in Run.
    mutex       sync.RWMutex
    cancel      context.CancelFunc
}

// ContainerStatus enumerates the lifecycle states a Container can be in.
type ContainerStatus int

// Lifecycle states, numbered from zero in declaration order.
const (
    StatusCreated ContainerStatus = iota
    StatusRunning
    StatusPaused
    StatusStopped
    StatusRemoved
)

// PortMapping pairs a host port with a container port for one protocol.
type PortMapping struct {
    HostPort      int
    ContainerPort int
    Protocol      string
}

// Mount describes one mount point: what to mount (Source), where
// (Destination), the mount type, and a list of option strings.
type Mount struct {
    Source      string
    Destination string
    Type        string
    Options     []string
}

// NewContainer returns a Container in the StatusCreated state.
//
// The identity, root filesystem, command and arguments come from the caller;
// every other setting is given a default: the current process environment,
// working directory "/", user "root", hostname "container", 128M of memory,
// a CPU limit of "50000 100000", 100 pids and bridge networking.
func NewContainer(id, name, rootfs, cmd string, args []string) *Container {
    c := new(Container)

    // Caller-supplied identity and command.
    c.ID, c.Name = id, name
    c.Rootfs = rootfs
    c.Cmd, c.Args = cmd, args

    // Process defaults.
    c.Env = os.Environ()
    c.WorkingDir = "/"
    c.User = "root"
    c.Hostname = "container"

    // Resource and network defaults.
    c.MemoryLimit = "128M"
    c.CPULimit = "50000 100000"
    c.PidsLimit = 100
    c.NetworkMode = "bridge"

    // Initial runtime state.
    c.Status = StatusCreated
    c.CreatedAt = time.Now()
    return c
}

// SetResourceLimits replaces the memory, CPU and pids limits while holding
// the container's write lock.
func (c *Container) SetResourceLimits(memory, cpu string, pids int) {
    c.mutex.Lock()
    c.MemoryLimit, c.CPULimit, c.PidsLimit = memory, cpu, pids
    c.mutex.Unlock()
}

// SetNetworkConfig replaces the network mode, IP address and port mappings
// while holding the container's write lock.
func (c *Container) SetNetworkConfig(mode, ip string, ports []PortMapping) {
    c.mutex.Lock()
    c.NetworkMode, c.IPAddress, c.PortMappings = mode, ip, ports
    c.mutex.Unlock()
}

// SetMounts replaces the container's mount list while holding the write lock.
// The slice is stored as given, not copied.
func (c *Container) SetMounts(mounts []Mount) {
    c.mutex.Lock()
    c.Mounts = mounts
    c.mutex.Unlock()
}

// GetStatus returns the container's current status, read under the read lock.
func (c *Container) GetStatus() ContainerStatus {
    c.mutex.RLock()
    current := c.Status
    c.mutex.RUnlock()
    return current
}

// SetStatus stores the new status under the write lock. Entering
// StatusRunning also stamps StartedAt with the current time.
func (c *Container) SetStatus(status ContainerStatus) {
    c.mutex.Lock()
    defer c.mutex.Unlock()

    if c.Status = status; status != StatusRunning {
        return
    }
    c.StartedAt = time.Now()
}

2.2 容器运行时

// Run starts the container and blocks until its init process exits.
//
// It validates the configuration, re-executes the current binary
// (/proc/self/exe) with the "child" sub-command inside new UTS, PID, mount,
// network, IPC, user and cgroup namespaces, then configures networking and
// cgroups from the parent side and waits for the child.
//
// It returns an error when validation fails, the child cannot be started,
// network or cgroup setup fails (the child is then killed and reaped), or the
// child exits unsuccessfully.
func (c *Container) Run() error {
    if err := c.validate(); err != nil {
        return fmt.Errorf("validation failed: %w", err)
    }

    // The context is what exec.CommandContext uses to kill the child; cancel
    // it on every return path so it is never leaked.
    ctx, cancel := context.WithCancel(context.Background())
    defer cancel()
    c.mutex.Lock()
    c.cancel = cancel
    c.mutex.Unlock()

    // Declared as an untyped constant so it converts to the uintptr that
    // SysProcAttr.Cloneflags expects.
    const cloneFlags = syscall.CLONE_NEWUTS |
        syscall.CLONE_NEWPID |
        syscall.CLONE_NEWNS |
        syscall.CLONE_NEWNET |
        syscall.CLONE_NEWIPC |
        syscall.CLONE_NEWUSER |
        syscall.CLONE_NEWCGROUP

    // Child command line: child <id> <rootfs> <cmd> [args...]
    cmd := exec.CommandContext(ctx, "/proc/self/exe", "child", c.ID, c.Rootfs, c.Cmd)
    cmd.Args = append(cmd.Args, c.Args...)

    cmd.SysProcAttr = &syscall.SysProcAttr{
        Cloneflags: cloneFlags,
        // With CLONE_NEWNS in Unshareflags the Go runtime remounts "/" as
        // private in the child, so its mounts do not propagate to the host.
        Unshareflags: syscall.CLONE_NEWNS,
        // Without a mapping the child runs as the overflow uid inside the new
        // user namespace and loses every capability on exec. Map the invoking
        // user and group to root inside the container.
        UidMappings: []syscall.SysProcIDMap{
            {ContainerID: 0, HostID: os.Getuid(), Size: 1},
        },
        GidMappings: []syscall.SysProcIDMap{
            {ContainerID: 0, HostID: os.Getgid(), Size: 1},
        },
    }

    cmd.Stdin = os.Stdin
    cmd.Stdout = os.Stdout
    cmd.Stderr = os.Stderr

    if err := cmd.Start(); err != nil {
        return fmt.Errorf("failed to start child process: %w", err)
    }

    c.mutex.Lock()
    c.PID = cmd.Process.Pid
    c.mutex.Unlock()
    c.SetStatus(StatusRunning)

    // abort tears the child down when parent-side setup fails, so that no
    // half-configured container is left running.
    abort := func(what string, cause error) error {
        _ = cmd.Process.Kill() // best effort: the child may already be gone
        _ = cmd.Wait()         // reap the child so it does not stay a zombie
        c.SetStatus(StatusStopped)
        return fmt.Errorf("%s: %w", what, cause)
    }

    if err := c.setupNetwork(); err != nil {
        return abort("failed to setup network", err)
    }

    if err := c.setupCGroup(); err != nil {
        return abort("failed to setup cgroup", err)
    }

    err := cmd.Wait()
    c.SetStatus(StatusStopped)
    if err != nil {
        return fmt.Errorf("container exited with error: %w", err)
    }
    return nil
}

// validate checks the configuration before anything is started.
//
// It fails when the root filesystem cannot be stat'ed or is not a directory,
// and when the command cannot be resolved with exec.LookPath.
//
// NOTE(review): exec.LookPath resolves against the host's PATH and
// filesystem, not against Rootfs — confirm whether the command should be
// looked up inside the container image instead.
func (c *Container) validate() error {
    info, err := os.Stat(c.Rootfs)
    switch {
    case os.IsNotExist(err):
        return fmt.Errorf("rootfs does not exist: %s", c.Rootfs)
    case err != nil:
        // Permission problems, I/O errors, etc. used to slip through.
        return fmt.Errorf("cannot access rootfs %s: %w", c.Rootfs, err)
    case !info.IsDir():
        return fmt.Errorf("rootfs is not a directory: %s", c.Rootfs)
    }

    if _, err := exec.LookPath(c.Cmd); err != nil {
        return fmt.Errorf("command not found: %s", c.Cmd)
    }

    return nil
}

// Stop terminates a running container.
//
// It sends SIGTERM to the container's init process, polls for up to one
// second (10 x 100ms) for it to disappear, and sends SIGKILL only if the
// process is still there. A process that has already exited (ESRCH) is not
// treated as an error. On success the status becomes StatusStopped.
//
// It returns an error if the container is not running or a signal cannot be
// delivered. The write lock is held for the whole call.
func (c *Container) Stop() error {
    c.mutex.Lock()
    defer c.mutex.Unlock()

    if c.Status != StatusRunning {
        return fmt.Errorf("container is not running")
    }

    if c.PID > 0 {
        if err := syscall.Kill(c.PID, syscall.SIGTERM); err != nil && err != syscall.ESRCH {
            return fmt.Errorf("failed to send SIGTERM: %w", err)
        }

        // Signal 0 delivers nothing; it only probes whether the PID exists.
        exited := false
        for i := 0; i < 10; i++ {
            if err := syscall.Kill(c.PID, 0); err != nil {
                exited = true
                break
            }
            time.Sleep(100 * time.Millisecond)
        }

        // Escalate only when the grace period ran out. Signalling a process
        // that is already gone would fail with ESRCH.
        if !exited {
            if err := syscall.Kill(c.PID, syscall.SIGKILL); err != nil && err != syscall.ESRCH {
                return fmt.Errorf("failed to send SIGKILL: %w", err)
            }
        }
    }

    c.Status = StatusStopped
    return nil
}

// Remove releases the container's network and cgroup resources and marks it
// StatusRemoved. A running container is rejected. Cleanup stops at the first
// failing step, leaving the status unchanged.
func (c *Container) Remove() error {
    c.mutex.Lock()
    defer c.mutex.Unlock()

    if c.Status == StatusRunning {
        return fmt.Errorf("cannot remove running container")
    }

    netErr := c.cleanupNetwork()
    if netErr != nil {
        return fmt.Errorf("failed to cleanup network: %v", netErr)
    }

    cgroupErr := c.cleanupCGroup()
    if cgroupErr != nil {
        return fmt.Errorf("failed to cleanup cgroup: %v", cgroupErr)
    }

    c.Status = StatusRemoved
    return nil
}

2.3 网络管理

package network

import (
    "fmt"
    "net"
    "os"
    "os/exec"
    "path/filepath"
    "runtime"
    "strconv"
    "strings"
    
    "github.com/vishvananda/netlink"
    "github.com/vishvananda/netns"
)

// NetworkManager holds the bridge name, the bridge (gateway) IP and the
// subnet used when wiring containers to the host network.
type NetworkManager struct {
    bridgeName string
    bridgeIP   string
    subnet     string
}

// NewNetworkManager returns a manager preconfigured with bridge "cni0",
// gateway 10.22.0.1 and subnet 10.22.0.0/24.
func NewNetworkManager() *NetworkManager {
    nm := new(NetworkManager)
    nm.bridgeName = "cni0"
    nm.bridgeIP = "10.22.0.1"
    nm.subnet = "10.22.0.0/24"
    return nm
}

// SetupNetwork wires one container to the bridge network.
//
// Steps, in order: ensure the bridge exists, create the veth pair, configure
// the container side, and install the NAT rule. The first failing step aborts
// the call. On success the resulting names and addresses are returned.
func (nm *NetworkManager) SetupNetwork(containerID string) (*NetworkConfig, error) {
    if bridgeErr := nm.createBridge(); bridgeErr != nil {
        return nil, fmt.Errorf("failed to create bridge: %v", bridgeErr)
    }

    hostIf, contIf, vethErr := nm.createVethPair(containerID)
    if vethErr != nil {
        return nil, fmt.Errorf("failed to create veth pair: %v", vethErr)
    }

    ip, cfgErr := nm.configureContainerNetwork(containerID, contIf)
    if cfgErr != nil {
        return nil, fmt.Errorf("failed to configure container network: %v", cfgErr)
    }

    if natErr := nm.configureNAT(); natErr != nil {
        return nil, fmt.Errorf("failed to configure NAT: %v", natErr)
    }

    result := new(NetworkConfig)
    result.BridgeName = nm.bridgeName
    result.VethHost = hostIf
    result.VethCont = contIf
    result.ContainerIP = ip
    result.GatewayIP = nm.bridgeIP
    return result, nil
}

// createBridge makes sure the bridge device exists.
//
// If a link with the bridge name is already present nothing is changed.
// Otherwise the bridge is created, given bridgeIP with a /24 prefix, and
// brought up.
func (nm *NetworkManager) createBridge() error {
    _, lookupErr := netlink.LinkByName(nm.bridgeName)
    if lookupErr == nil {
        // Already there: leave it as it is.
        return nil
    }

    attrs := netlink.LinkAttrs{Name: nm.bridgeName}
    br := &netlink.Bridge{LinkAttrs: attrs}
    if addErr := netlink.LinkAdd(br); addErr != nil {
        return fmt.Errorf("failed to add bridge: %v", addErr)
    }

    cidr := nm.bridgeIP + "/24"
    bridgeAddr, parseErr := netlink.ParseAddr(cidr)
    if parseErr != nil {
        return fmt.Errorf("failed to parse bridge IP: %v", parseErr)
    }

    if addrErr := netlink.AddrAdd(br, bridgeAddr); addrErr != nil {
        return fmt.Errorf("failed to add bridge address: %v", addrErr)
    }

    if upErr := netlink.LinkSetUp(br); upErr != nil {
        return fmt.Errorf("failed to set bridge up: %v", upErr)
    }

    return nil
}

// createVethPair creates a veth pair for the container, attaches the host end
// to the bridge and brings it up.
//
// The host end is named "veth" plus at most the first 8 characters of
// containerID; the peer is named "eth0". It returns the host and peer names.
// If any step after the pair was created fails, the pair is deleted again so
// that no stale interface is left behind.
//
// NOTE(review): the peer is created under the name "eth0" in the current
// namespace, which fails if an interface of that name already exists there —
// confirm whether a temporary peer name should be used.
func (nm *NetworkManager) createVethPair(containerID string) (string, string, error) {
    if containerID == "" {
        return "", "", fmt.Errorf("failed to add veth: empty container ID")
    }

    // Slicing [:8] unconditionally panics on IDs shorter than 8 characters.
    suffix := containerID
    if len(suffix) > 8 {
        suffix = suffix[:8]
    }
    vethHost := "veth" + suffix
    vethCont := "eth0"

    veth := &netlink.Veth{
        LinkAttrs: netlink.LinkAttrs{
            Name: vethHost,
        },
        PeerName: vethCont,
    }

    if err := netlink.LinkAdd(veth); err != nil {
        return "", "", fmt.Errorf("failed to add veth: %w", err)
    }

    // fail removes the freshly created pair before reporting the error;
    // deleting one end of a veth pair removes its peer too.
    fail := func(what string, cause error) (string, string, error) {
        _ = netlink.LinkDel(veth) // best effort cleanup
        return "", "", fmt.Errorf("%s: %w", what, cause)
    }

    bridge, err := netlink.LinkByName(nm.bridgeName)
    if err != nil {
        return fail("failed to get bridge", err)
    }

    // Attach the host end to the bridge.
    if err := netlink.LinkSetMaster(veth, bridge); err != nil {
        return fail("failed to set veth master", err)
    }

    // Bring the host end up.
    if err := netlink.LinkSetUp(veth); err != nil {
        return fail("failed to set veth up", err)
    }

    return vethHost, vethCont, nil
}

// configureContainerNetwork creates the container's named network namespace,
// moves the container end of the veth pair into it and configures it there.
//
// containerID names the namespace under /var/run/netns; vethCont is the name
// of the interface to move. It returns the IP address assigned to the
// container interface.
func (nm *NetworkManager) configureContainerNetwork(containerID, vethCont string) (string, error) {
    if err := nm.createNetworkNamespace(containerID); err != nil {
        return "", fmt.Errorf("failed to create network namespace: %w", err)
    }

    ns, err := netns.GetFromPath(filepath.Join("/var/run/netns", containerID))
    if err != nil {
        return "", fmt.Errorf("failed to get network namespace: %w", err)
    }
    defer ns.Close()

    veth, err := netlink.LinkByName(vethCont)
    if err != nil {
        return "", fmt.Errorf("failed to get veth: %w", err)
    }

    // Move the link into the namespace that was just opened. Passing pid 0
    // to LinkSetNsPid does not refer to that namespace at all.
    if err := netlink.LinkSetNsFd(veth, int(ns)); err != nil {
        return "", fmt.Errorf("failed to set veth namespace: %w", err)
    }

    containerIP, err := nm.configureContainerInterface(ns, vethCont)
    if err != nil {
        return "", fmt.Errorf("failed to configure container interface: %w", err)
    }

    return containerIP, nil
}

// createNetworkNamespace creates a named network namespace called
// containerID (bind-mounted under the netns directory) and leaves the caller
// in the namespace it started in.
//
// netns.NewNamed switches the calling OS thread into the new namespace, so
// the goroutine is pinned to its thread for the duration of the call and the
// original namespace is restored before returning.
func (nm *NetworkManager) createNetworkNamespace(containerID string) error {
    // Make sure the directory that holds the namespace files exists.
    nsPath := filepath.Join("/var/run/netns", containerID)
    if err := os.MkdirAll(filepath.Dir(nsPath), 0755); err != nil {
        return fmt.Errorf("failed to create netns directory: %w", err)
    }

    // Namespace membership is per OS thread; without the lock the Go
    // scheduler could move this goroutine to another thread mid-way.
    runtime.LockOSThread()
    defer runtime.UnlockOSThread()

    origin, err := netns.Get()
    if err != nil {
        return fmt.Errorf("failed to get current namespace: %w", err)
    }
    defer origin.Close()

    created, err := netns.NewNamed(containerID)
    if err != nil {
        return fmt.Errorf("failed to create named namespace: %w", err)
    }
    // The bind mount keeps the namespace alive; this handle is not needed.
    defer created.Close()

    if err := netns.Set(origin); err != nil {
        return fmt.Errorf("failed to restore original namespace: %w", err)
    }

    return nil
}

// configureContainerInterface enters the namespace ns and configures the
// interface ifName inside it: loopback up, interface up, an address from
// allocateIP with a /24 prefix, and a default route via the bridge IP.
//
// The goroutine is locked to its OS thread while inside ns, and the namespace
// the thread started in is restored before returning. It returns the IP
// address assigned to the interface.
func (nm *NetworkManager) configureContainerInterface(ns netns.NsHandle, ifName string) (string, error) {
    // Namespace membership is per OS thread, so pin the goroutine first.
    runtime.LockOSThread()
    defer runtime.UnlockOSThread()

    // Remember where we came from. netns.None() is an invalid handle and
    // cannot be used to switch back.
    origin, err := netns.Get()
    if err != nil {
        return "", fmt.Errorf("failed to get current namespace: %w", err)
    }
    defer origin.Close()

    if err := netns.Set(ns); err != nil {
        return "", fmt.Errorf("failed to set namespace: %w", err)
    }
    // Deferred last, so it runs first: switch back before origin is closed
    // and before the thread is unlocked.
    defer netns.Set(origin)

    // Bring loopback up. A missing "lo" is tolerated, a failing bring-up is not.
    if lo, err := netlink.LinkByName("lo"); err == nil {
        if err := netlink.LinkSetUp(lo); err != nil {
            return "", fmt.Errorf("failed to set loopback up: %w", err)
        }
    }

    iface, err := netlink.LinkByName(ifName)
    if err != nil {
        return "", fmt.Errorf("failed to get container interface: %w", err)
    }

    if err := netlink.LinkSetUp(iface); err != nil {
        return "", fmt.Errorf("failed to set container interface up: %w", err)
    }

    containerIP := nm.allocateIP()
    addr, err := netlink.ParseAddr(containerIP + "/24")
    if err != nil {
        return "", fmt.Errorf("failed to parse container IP: %w", err)
    }

    if err := netlink.AddrAdd(iface, addr); err != nil {
        return "", fmt.Errorf("failed to add container address: %w", err)
    }

    // A nil Dst makes this the default route.
    route := &netlink.Route{
        LinkIndex: iface.Attrs().Index,
        Dst:       nil,
        Gw:        net.ParseIP(nm.bridgeIP),
    }

    if err := netlink.RouteAdd(route); err != nil {
        return "", fmt.Errorf("failed to add default route: %w", err)
    }

    return containerIP, nil
}

// allocateIP returns the address to give to the container.
//
// This is a placeholder strategy: every call returns the same fixed address.
// A real deployment should use a proper IPAM component instead.
func (nm *NetworkManager) allocateIP() string {
    const fixedContainerIP = "10.22.0.2"
    return fixedContainerIP
}

// configureNAT appends an iptables MASQUERADE rule for the manager's subnet
// to the nat table's POSTROUTING chain, unless natRuleExists reports that it
// is already present.
func (nm *NetworkManager) configureNAT() error {
    if !nm.natRuleExists() {
        appendRule := []string{
            "-t", "nat", "-A", "POSTROUTING",
            "-s", nm.subnet, "-j", "MASQUERADE",
        }
        if err := exec.Command("iptables", appendRule...).Run(); err != nil {
            return fmt.Errorf("failed to add NAT rule: %v", err)
        }
    }

    return nil
}

// natRuleExists reports whether `iptables -C` for the subnet's MASQUERADE
// rule in nat/POSTROUTING exits successfully.
func (nm *NetworkManager) natRuleExists() bool {
    checkRule := []string{
        "-t", "nat", "-C", "POSTROUTING",
        "-s", nm.subnet, "-j", "MASQUERADE",
    }
    err := exec.Command("iptables", checkRule...).Run()
    return err == nil
}

// NetworkConfig is the result of SetupNetwork: the bridge name, both veth
// interface names, the container's IP address and the gateway IP.
type NetworkConfig struct {
    BridgeName  string
    VethHost    string
    VethCont    string
    ContainerIP string
    GatewayIP   string
}

2.4 控制组管理

package container

import (
    "fmt"
    "os"
    "path/filepath"
    "strconv"
    "strings"
)

// CGroupManager manages one container's control group: the container ID, the
// cgroup directory path and the limits to apply.
type CGroupManager struct {
    containerID string
    path        string
    config      *CGroupConfig
}

// CGroupConfig holds the values written to the cgroup's memory.max, cpu.max,
// pids.max and io.weight files.
type CGroupConfig struct {
    MemoryMax string
    CPUMax    string
    PidsMax   int
    IOWeight  int
}

// NewCGroupManager returns a manager for the cgroup directory
// /sys/fs/cgroup/<containerID> using the given limits.
func NewCGroupManager(containerID string, config *CGroupConfig) *CGroupManager {
    cgm := new(CGroupManager)
    cgm.containerID = containerID
    cgm.path = filepath.Join("/sys/fs/cgroup", containerID)
    cgm.config = config
    return cgm
}

// Create makes the cgroup directory and then applies the memory, CPU, pids
// and IO-weight settings in that order, stopping at the first failure.
func (cgm *CGroupManager) Create() error {
    if err := os.MkdirAll(cgm.path, 0755); err != nil {
        return fmt.Errorf("failed to create cgroup directory: %v", err)
    }

    // Each step pairs the setter with the label used in its error message.
    steps := []struct {
        label string
        apply func() error
    }{
        {"memory limit", cgm.setMemoryLimit},
        {"CPU limit", cgm.setCPULimit},
        {"pids limit", cgm.setPidsLimit},
        {"IO weight", cgm.setIOWeight},
    }

    for _, step := range steps {
        if err := step.apply(); err != nil {
            return fmt.Errorf("failed to set %s: %v", step.label, err)
        }
    }

    return nil
}

// AddProcess writes pid to the cgroup's cgroup.procs file.
func (cgm *CGroupManager) AddProcess(pid int) error {
    procsFile := filepath.Join(cgm.path, "cgroup.procs")
    payload := []byte(strconv.Itoa(pid))
    return os.WriteFile(procsFile, payload, 0644)
}

// RemoveProcess takes pid out of this cgroup by moving it into the parent
// cgroup.
//
// cgroup.procs is not an editable list: writing a PID to it moves that
// process INTO the cgroup, and rewriting the file without a PID does not
// remove it. A process leaves a cgroup only by being written into another
// one, so the pid is written to the parent directory's cgroup.procs.
//
// If pid is not currently listed in this cgroup the call is a no-op.
func (cgm *CGroupManager) RemoveProcess(pid int) error {
    data, err := os.ReadFile(filepath.Join(cgm.path, "cgroup.procs"))
    if err != nil {
        return err
    }

    pidStr := strconv.Itoa(pid)
    member := false
    for _, line := range strings.Fields(string(data)) {
        if line == pidStr {
            member = true
            break
        }
    }
    if !member {
        // Nothing to remove; do not touch a process that lives elsewhere.
        return nil
    }

    parentProcsPath := filepath.Join(filepath.Dir(cgm.path), "cgroup.procs")
    return os.WriteFile(parentProcsPath, []byte(pidStr), 0644)
}

// Destroy removes the cgroup directory tree with os.RemoveAll.
func (cgm *CGroupManager) Destroy() error {
    err := os.RemoveAll(cgm.path)
    return err
}

// GetStats reads memory.current, cpu.stat and pids.current from the cgroup
// directory. A file that cannot be read leaves its field empty. The returned
// error is always nil.
func (cgm *CGroupManager) GetStats() (*CGroupStats, error) {
    // read returns the file's content and whether the read succeeded.
    read := func(name string) (string, bool) {
        raw, err := os.ReadFile(filepath.Join(cgm.path, name))
        return string(raw), err == nil
    }

    result := new(CGroupStats)

    if text, ok := read("memory.current"); ok {
        result.MemoryCurrent = strings.TrimSpace(text)
    }

    // cpu.stat is multi-line and is kept verbatim.
    if text, ok := read("cpu.stat"); ok {
        result.CPUStat = text
    }

    if text, ok := read("pids.current"); ok {
        result.PidsCurrent = strings.TrimSpace(text)
    }

    return result, nil
}

// setMemoryLimit writes config.MemoryMax to the cgroup's memory.max file.
func (cgm *CGroupManager) setMemoryLimit() error {
    limit := []byte(cgm.config.MemoryMax)
    return os.WriteFile(filepath.Join(cgm.path, "memory.max"), limit, 0644)
}

// setCPULimit writes config.CPUMax to the cgroup's cpu.max file.
func (cgm *CGroupManager) setCPULimit() error {
    limit := []byte(cgm.config.CPUMax)
    return os.WriteFile(filepath.Join(cgm.path, "cpu.max"), limit, 0644)
}

// setPidsLimit writes config.PidsMax, in decimal, to the cgroup's pids.max
// file.
func (cgm *CGroupManager) setPidsLimit() error {
    limit := []byte(strconv.Itoa(cgm.config.PidsMax))
    return os.WriteFile(filepath.Join(cgm.path, "pids.max"), limit, 0644)
}

// setIOWeight writes config.IOWeight to the cgroup's io.weight file.
//
// A zero weight means "not configured" and leaves the kernel default in
// place: io.weight only accepts values in [1, 10000], so writing the zero
// value of an unset field would make Create fail. Any other value outside
// that range is rejected with a descriptive error before touching the file.
func (cgm *CGroupManager) setIOWeight() error {
    weight := cgm.config.IOWeight
    if weight == 0 {
        return nil
    }
    if weight < 1 || weight > 10000 {
        return fmt.Errorf("io weight %d out of range [1, 10000]", weight)
    }

    ioWeightPath := filepath.Join(cgm.path, "io.weight")
    return os.WriteFile(ioWeightPath, []byte(strconv.Itoa(weight)), 0644)
}

// CGroupStats carries the raw text read by GetStats from memory.current,
// cpu.stat and pids.current.
type CGroupStats struct {
    MemoryCurrent string
    CPUStat       string
    PidsCurrent   string
}

2.5 文件系统管理

package container

import (
    "fmt"
    "os"
    "path/filepath"
    "syscall"
    "golang.org/x/sys/unix"
)

// FileSystemManager prepares a container's filesystem view from a root
// filesystem path and a list of additional mounts.
type FileSystemManager struct {
    rootfs string
    mounts []Mount
}

// NewFileSystemManager returns a manager for the given root filesystem and
// user mounts. The mounts slice is stored as given, not copied.
func NewFileSystemManager(rootfs string, mounts []Mount) *FileSystemManager {
    fsm := new(FileSystemManager)
    fsm.rootfs = rootfs
    fsm.mounts = mounts
    return fsm
}

// SetupFileSystem builds the container's filesystem view.
//
// Order matters: the root is switched first, and only then are the special
// filesystems mounted, device nodes created and user mounts applied, so that
// every path refers to the container's tree. Doing the special mounts and
// device creation before pivot_root would operate on the tree inherited from
// the host, where the device nodes already exist.
//
// The first failing step aborts the call.
func (fsm *FileSystemManager) SetupFileSystem() error {
    // Switch to the container's root filesystem.
    if err := fsm.pivotRoot(); err != nil {
        return fmt.Errorf("failed to pivot root: %w", err)
    }

    // Mount /proc, /sys, /dev and friends inside the new root.
    if err := fsm.mountSpecialFilesystems(); err != nil {
        return fmt.Errorf("failed to mount special filesystems: %w", err)
    }

    // Create the basic device nodes inside the new /dev.
    if err := fsm.createDevices(); err != nil {
        return fmt.Errorf("failed to create devices: %w", err)
    }

    // Apply the user-requested mounts last.
    if err := fsm.mountUserMounts(); err != nil {
        return fmt.Errorf("failed to mount user mounts: %w", err)
    }

    return nil
}

// mountSpecialFilesystems mounts, in order, proc on /proc, sysfs on /sys,
// devtmpfs on /dev, devpts on /dev/pts and tmpfs on /dev/shm. The last two
// mount points are created first if missing. The first failure aborts.
func (fsm *FileSystemManager) mountSpecialFilesystems() error {
    specials := []struct {
        source string
        target string
        fstype string
        mkdir  bool // create the mount point before mounting
    }{
        {"proc", "/proc", "proc", false},
        {"sysfs", "/sys", "sysfs", false},
        {"devtmpfs", "/dev", "devtmpfs", false},
        {"devpts", "/dev/pts", "devpts", true},
        {"tmpfs", "/dev/shm", "tmpfs", true},
    }

    for _, fs := range specials {
        if fs.mkdir {
            if err := os.MkdirAll(fs.target, 0755); err != nil {
                return fmt.Errorf("failed to create %s: %v", fs.target, err)
            }
        }
        if err := unix.Mount(fs.source, fs.target, fs.fstype, 0, ""); err != nil {
            return fmt.Errorf("failed to mount %s: %v", fs.target, err)
        }
    }

    return nil
}

// createDevices creates the character devices /dev/null, /dev/zero,
// /dev/random, /dev/urandom, /dev/tty and /dev/console with mode 0666.
//
// A node that already exists is left alone: when /dev is a devtmpfs mount
// these nodes are already present, and failing with EEXIST would abort
// container setup for no reason.
func (fsm *FileSystemManager) createDevices() error {
    devices := []struct {
        name  string
        major uint32
        minor uint32
    }{
        {"/dev/null", 1, 3},
        {"/dev/zero", 1, 5},
        {"/dev/random", 1, 8},
        {"/dev/urandom", 1, 9},
        {"/dev/tty", 5, 0},
        {"/dev/console", 5, 1},
    }

    const mode = syscall.S_IFCHR | 0666

    for _, device := range devices {
        dev := int(unix.Mkdev(device.major, device.minor))
        err := unix.Mknod(device.name, mode, dev)
        if err != nil && !os.IsExist(err) {
            return fmt.Errorf("failed to create device %s: %w", device.name, err)
        }
    }

    return nil
}

// pivotRoot makes fsm.rootfs the root of the current mount namespace.
//
// The rootfs is bind-mounted onto itself so that it is a mount point, the old
// root is parked on <rootfs>/.oldroot, and after pivot_root the old root is
// lazily unmounted and its now-empty mount point removed.
//
// pivot_root(2) requires put_old to be at or underneath new_root, so the
// directory must be created inside the rootfs, not at the host's "/".
func (fsm *FileSystemManager) pivotRoot() error {
    // pivot_root needs new_root to be a mount point.
    if err := unix.Mount(fsm.rootfs, fsm.rootfs, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
        return fmt.Errorf("failed to bind mount rootfs: %w", err)
    }

    const oldRootName = ".oldroot"
    putold := filepath.Join(fsm.rootfs, oldRootName)
    if err := os.MkdirAll(putold, 0700); err != nil {
        return fmt.Errorf("failed to create put_old directory: %w", err)
    }

    if err := unix.PivotRoot(fsm.rootfs, putold); err != nil {
        return fmt.Errorf("failed to pivot_root: %w", err)
    }

    if err := os.Chdir("/"); err != nil {
        return fmt.Errorf("failed to change working directory: %w", err)
    }

    // Seen from the new root, the old root now lives at /.oldroot.
    oldRoot := "/" + oldRootName
    if err := unix.Unmount(oldRoot, unix.MNT_DETACH); err != nil {
        return fmt.Errorf("failed to unmount old root: %w", err)
    }

    // Only the empty mount point is left. A non-recursive remove cannot
    // delete anything else by accident.
    if err := os.Remove(oldRoot); err != nil {
        return fmt.Errorf("failed to remove put_old directory: %w", err)
    }

    return nil
}

// mountUserMounts applies every configured user mount in order and stops at
// the first one that fails, naming its destination in the error.
func (fsm *FileSystemManager) mountUserMounts() error {
    for idx := range fsm.mounts {
        entry := fsm.mounts[idx]
        err := fsm.mountUserMount(entry)
        if err != nil {
            return fmt.Errorf("failed to mount %s: %v", entry.Destination, err)
        }
    }

    return nil
}

// mountUserMount performs a single user-requested mount.
//
// The destination directory is created if needed. The options "ro",
// "noexec", "nosuid" and "nodev" are translated to mount flags; other
// options (including "rw") are ignored.
//
// Type "bind" is handled as a recursive bind mount: "bind" is not a
// filesystem type, so it must be expressed with MS_BIND. Because the kernel
// ignores the other flags on the initial bind, requested restrictions are
// applied with a follow-up remount.
func (fsm *FileSystemManager) mountUserMount(mount Mount) error {
    if err := os.MkdirAll(mount.Destination, 0755); err != nil {
        return fmt.Errorf("failed to create mount point: %w", err)
    }

    var flags uintptr
    for _, option := range mount.Options {
        switch option {
        case "ro":
            flags |= unix.MS_RDONLY
        case "noexec":
            flags |= unix.MS_NOEXEC
        case "nosuid":
            flags |= unix.MS_NOSUID
        case "nodev":
            flags |= unix.MS_NODEV
        }
    }

    if mount.Type == "bind" {
        if err := unix.Mount(mount.Source, mount.Destination, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
            return fmt.Errorf("failed to mount: %w", err)
        }
        if flags != 0 {
            remount := flags | unix.MS_BIND | unix.MS_REMOUNT
            if err := unix.Mount("", mount.Destination, "", remount, ""); err != nil {
                return fmt.Errorf("failed to remount bind mount with options: %w", err)
            }
        }
        return nil
    }

    if err := unix.Mount(mount.Source, mount.Destination, mount.Type, flags, ""); err != nil {
        return fmt.Errorf("failed to mount: %w", err)
    }

    return nil
}

三、主程序

3.1 完整的 main.go

package main

import (
    "flag"
    "fmt"
    "log"
    "os"
    "os/exec"
    "syscall"
    "time"
    
    "container/container"
    "container/network"
)

// main is the entry point for both roles of this binary.
//
// Container.Run re-executes the binary as
// "<exe> child <id> <rootfs> <cmd> [args...]" inside the new namespaces. That
// invocation is dispatched to runChild before any flag parsing; otherwise
// the child would parse no flags and exit with "rootfs is required".
//
// Every other invocation is the parent role: parse the flags, build the
// Container and run it in the foreground or, with -detach, in a goroutine.
func main() {
    if len(os.Args) > 1 && os.Args[1] == "child" {
        if err := runChild(os.Args[2:]); err != nil {
            log.Fatalf("Failed to initialize container: %v", err)
        }
        return
    }

    var (
        rootfs      = flag.String("rootfs", "", "Root filesystem path")
        cmd         = flag.String("cmd", "/bin/sh", "Command to run in container")
        args        = flag.String("args", "", "Command arguments")
        name        = flag.String("name", "", "Container name")
        memory      = flag.String("memory", "128M", "Memory limit")
        cpu         = flag.String("cpu", "50000 100000", "CPU limit")
        pids        = flag.Int("pids", 100, "Process limit")
        networkMode = flag.String("network", "bridge", "Network mode")
        ip          = flag.String("ip", "", "Container IP address")
        ports       = flag.String("ports", "", "Port mappings (host:container)")
        mounts      = flag.String("mounts", "", "Mount points (source:destination:type:options)")
        readOnly    = flag.Bool("readonly", false, "Read-only root filesystem")
        detach      = flag.Bool("detach", false, "Run container in background")
    )
    flag.Parse()

    if *rootfs == "" {
        log.Fatal("rootfs is required")
    }

    // The generated ID doubles as the name when none was given.
    containerID := generateContainerID()
    if *name == "" {
        *name = containerID
    }

    // NOTE(review): the whole -args value is passed as ONE argument; it is
    // not split on whitespace — confirm whether that is intended.
    command := *cmd
    var commandArgs []string
    if *args != "" {
        commandArgs = append(commandArgs, *args)
    }

    portMappings := parsePortMappings(*ports)
    mountPoints := parseMounts(*mounts)

    c := container.NewContainer(containerID, *name, *rootfs, command, commandArgs)
    c.SetResourceLimits(*memory, *cpu, *pids)
    c.SetNetworkConfig(*networkMode, *ip, portMappings)
    c.SetMounts(mountPoints)

    if *readOnly {
        c.ReadOnly = true
    }

    if *detach {
        go func() {
            if err := c.Run(); err != nil {
                log.Printf("Container %s failed: %v", containerID, err)
            }
        }()

        // Give the container a moment to start.
        // NOTE(review): main returns right after the messages below, which
        // ends this process and the goroutine supervising the container —
        // confirm how detached containers are meant to be kept alive.
        time.Sleep(1 * time.Second)

        fmt.Printf("Container %s started in background\n", containerID)
        fmt.Printf("Use 'docker attach %s' to attach to container\n", containerID)
    } else {
        if err := c.Run(); err != nil {
            log.Fatalf("Failed to run container: %v", err)
        }
    }
}

// runChild is the container-side half of Container.Run. args is the command
// line after the "child" keyword: <id> <rootfs> <cmd> [args...]. It sets up
// the filesystem for rootfs and then replaces the current process with the
// requested command, so it only returns on failure.
func runChild(args []string) error {
    if len(args) < 3 {
        return fmt.Errorf("child expects <id> <rootfs> <cmd> [args...], got %d argument(s)", len(args))
    }
    rootfs, command := args[1], args[2]

    // User mounts are not part of the child command line, so none are applied.
    if err := container.NewFileSystemManager(rootfs, nil).SetupFileSystem(); err != nil {
        return fmt.Errorf("setting up filesystem: %w", err)
    }

    // Resolved after the root switch, i.e. inside the container's filesystem.
    path, err := exec.LookPath(command)
    if err != nil {
        return fmt.Errorf("resolving command %q: %w", command, err)
    }

    // argv[0] is the command itself, followed by its arguments.
    if err := syscall.Exec(path, args[2:], os.Environ()); err != nil {
        return fmt.Errorf("executing %q: %w", path, err)
    }
    return nil
}

// generateContainerID returns "container-" followed by the current Unix time
// in nanoseconds.
func generateContainerID() string {
    nanos := time.Now().UnixNano()
    id := fmt.Sprintf("container-%d", nanos)
    return id
}

// parsePortMappings parses a comma-separated list of "host:container" port
// pairs, e.g. "8080:80,9090:90", into TCP port mappings.
//
// Surrounding whitespace is ignored and empty entries are skipped. An entry
// that is malformed or names a port outside 1-65535 is skipped as well, but
// a warning is logged so that a typo does not silently drop a mapping.
// An empty input yields a nil slice.
func parsePortMappings(ports string) []container.PortMapping {
    var mappings []container.PortMapping

    if ports == "" {
        return mappings
    }

    // parsePort converts one side of a pair and enforces the valid range.
    parsePort := func(text string) (int, bool) {
        value, err := strconv.Atoi(strings.TrimSpace(text))
        if err != nil || value < 1 || value > 65535 {
            return 0, false
        }
        return value, true
    }

    for _, entry := range strings.Split(ports, ",") {
        entry = strings.TrimSpace(entry)
        if entry == "" {
            continue
        }

        parts := strings.Split(entry, ":")
        if len(parts) != 2 {
            log.Printf("ignoring malformed port mapping %q (want host:container)", entry)
            continue
        }

        hostPort, hostOK := parsePort(parts[0])
        containerPort, contOK := parsePort(parts[1])
        if !hostOK || !contOK {
            log.Printf("ignoring port mapping %q: ports must be integers in 1-65535", entry)
            continue
        }

        mappings = append(mappings, container.PortMapping{
            HostPort:      hostPort,
            ContainerPort: containerPort,
            Protocol:      "tcp",
        })
    }

    return mappings
}

// parseMounts parses a comma-separated list of mount specifications of the
// form "source:destination[:type[:options]]".
//
// Type defaults to "bind" and options default to ["rw"]. Because the comma
// separates both mount entries and options, a segment without any ':' that
// follows an entry with an explicit options field is treated as a further
// option of that entry. So "/a:/b:bind:ro,nosuid" yields one mount with
// options ["ro", "nosuid"] instead of losing "nosuid". Other segments with
// fewer than two fields are skipped. An empty input yields a nil slice.
func parseMounts(mounts string) []container.Mount {
    var mountPoints []container.Mount

    if mounts == "" {
        return mountPoints
    }

    // Whether the most recently parsed entry carried an options field, i.e.
    // whether a following bare segment can be one of its options.
    lastHasOptions := false

    for _, mount := range strings.Split(mounts, ",") {
        parts := strings.Split(mount, ":")
        if len(parts) < 2 {
            option := strings.TrimSpace(mount)
            if lastHasOptions && option != "" {
                last := &mountPoints[len(mountPoints)-1]
                last.Options = append(last.Options, option)
            }
            continue
        }

        mountPoint := container.Mount{
            Source:      parts[0],
            Destination: parts[1],
            Type:        "bind",
            Options:     []string{"rw"},
        }

        if len(parts) > 2 {
            mountPoint.Type = parts[2]
        }

        lastHasOptions = len(parts) > 3
        if lastHasOptions {
            // The entry was already split on ',', so this field holds
            // exactly one option; more may follow as bare segments.
            mountPoint.Options = []string{parts[3]}
        }

        mountPoints = append(mountPoints, mountPoint)
    }

    return mountPoints
}

四、调试和测试

4.1 调试模式

// debug enables verbose start-up diagnostics when the -debug flag is set.
var debug = flag.Bool("debug", false, "Enable debug mode")

func (c *Container) Run() error {
    if *debug {
        log.Printf("Starting container %s with rootfs: %s", c.ID, c.Rootfs)
        log.Printf("Command: %s %v", c.Cmd, c.Args)
        log.Printf("Memory limit: %s", c.MemoryLimit)
        log.Printf("CPU limit: %s", c.CPULimit)
        log.Printf("Network mode: %s", c.NetworkMode)
    }
    
    // ... rest of Run omitted in this excerpt
}

4.2 日志记录

import "github.com/sirupsen/logrus"

// logger is the shared logrus logger used by the container code.
var logger = logrus.New()

// init configures logger: text output with full timestamps at Info level.
func init() {
    formatter := &logrus.TextFormatter{FullTimestamp: true}
    logger.SetFormatter(formatter)
    logger.SetLevel(logrus.InfoLevel)
}

// Run (excerpt): logs a structured "Starting container" record carrying the
// container ID, rootfs and command before doing the actual work.
func (c *Container) Run() error {
    logger.WithFields(logrus.Fields{
        "container_id": c.ID,
        "rootfs":       c.Rootfs,
        "command":      c.Cmd,
    }).Info("Starting container")
    
    // ... rest of Run omitted in this excerpt
}

4.3 错误处理

// Run (excerpt): shows the error-handling pattern — cancel the context on
// return, and distinguish a non-zero exit of the child (exec.ExitError) from
// a failure to wait for it.
func (c *Container) Run() error {
    // Validate the container configuration.
    if err := c.validate(); err != nil {
        return fmt.Errorf("validation failed: %v", err)
    }
    
    // Create the context; it is cancelled when Run returns.
    ctx, cancel := context.WithCancel(context.Background())
    c.cancel = cancel
    defer cancel()
    
    // ... code that builds and starts cmd omitted in this excerpt
    
    // Wait for the child process to finish.
    if err := cmd.Wait(); err != nil {
        if exitError, ok := err.(*exec.ExitError); ok {
            logger.WithFields(logrus.Fields{
                "container_id": c.ID,
                "exit_code":    exitError.ExitCode(),
            }).Error("Container exited with error")
            return fmt.Errorf("container exited with code %d: %v", exitError.ExitCode(), err)
        }
        return fmt.Errorf("failed to wait for child process: %v", err)
    }
    
    return nil
}

五、验证检查清单

基础功能

  • [ ] 能够编译完整程序
  • [ ] 能够运行容器
  • [ ] 能够配置网络
  • [ ] 能够配置资源限制

高级功能

  • [ ] 能够处理端口映射
  • [ ] 能够处理挂载点
  • [ ] 能够处理只读文件系统
  • [ ] 能够后台运行容器

调试技能

  • [ ] 能够使用调试模式
  • [ ] 能够查看日志
  • [ ] 能够处理错误
  • [ ] 能够进行性能测试

阶段化演进路线

根据实际开发经验,容器运行时的开发通常遵循以下阶段:

阶段 1:完善基础容器机制

1.1 增加 Capabilities 控制

// setupCapabilities restricts what the container may do by dropping
// dangerous capabilities from the process's capability bounding set.
//
// Capabilities the container is meant to keep (not enforced here; listed for
// reference): CAP_NET_BIND_SERVICE, CAP_CHOWN, CAP_SETUID, CAP_SETGID.
//
// It returns the first prctl error encountered.
func setupCapabilities() error {
    // Capabilities removed from the bounding set.
    dangerousCaps := []string{
        "CAP_SYS_ADMIN",
        "CAP_SYS_PTRACE",
        "CAP_SYS_MODULE",
    }

    // The loop variable is not called "cap" to avoid shadowing the builtin.
    for _, capName := range dangerousCaps {
        if err := unix.Prctl(unix.PR_CAPBSET_DROP,
            getCapabilityValue(capName), 0, 0, 0); err != nil {
            return err
        }
    }
    return nil
}

1.2 增加 Seccomp 安全策略

// setupSeccomp installs a seccomp filter that allows everything by default
// and kills the process when it invokes one of a small set of dangerous
// system calls.
//
// It returns an error if the filter cannot be created, a syscall name cannot
// be resolved, a rule cannot be added, or the filter cannot be loaded.
func setupSeccomp() error {
    filter, err := seccomp.NewFilter(seccomp.ActAllow)
    if err != nil {
        return err
    }
    // The kernel keeps its own copy once loaded; free the userspace filter.
    defer filter.Release()

    // "umount2" is listed next to "umount" because it is the unmount system
    // call on x86-64 and arm64.
    dangerousSyscalls := []string{
        "mount", "umount", "umount2", "ptrace", "reboot",
        "swapon", "swapoff", "syslog",
    }

    // The loop variable is not called "syscall" to avoid shadowing the package.
    for _, name := range dangerousSyscalls {
        // GetSyscallFromName returns (number, error); fail closed on an
        // unknown name rather than loading an incomplete filter.
        call, err := seccomp.GetSyscallFromName(name)
        if err != nil {
            return err
        }
        if err := filter.AddRule(call, seccomp.ActKillProcess); err != nil {
            return err
        }
    }

    return filter.Load()
}

阶段 2:强化网络子系统

2.1 IPAM 模块实现

// IPAM tracks which addresses of Subnet have been handed out. Allocated is
// keyed by the address's string form; mu is taken by Allocate.
// NOTE(review): file is presumably where save() persists the state — confirm.
type IPAM struct {
    Subnet    *net.IPNet
    Gateway   net.IP
    Allocated map[string]bool
    file      string
    mu        sync.Mutex
}

// Allocate returns the first address after Gateway that lies inside Subnet
// and has not been handed out yet, marks it as allocated and calls save.
//
// It returns an error when no subnet is configured or every candidate
// address is already taken. A nil Allocated map is initialised on first use
// instead of panicking on the write.
//
// NOTE(review): any failure inside save() is not observable here — confirm
// whether save reports errors that should be propagated.
func (i *IPAM) Allocate() (net.IP, error) {
    i.mu.Lock()
    defer i.mu.Unlock()

    if i.Subnet == nil {
        return nil, fmt.Errorf("ipam: no subnet configured")
    }
    if i.Allocated == nil {
        i.Allocated = make(map[string]bool)
    }

    start := binaryInc(i.Gateway)
    for ip := start; i.Subnet.Contains(ip); ip = binaryInc(ip) {
        key := ip.String()
        if !i.Allocated[key] {
            i.Allocated[key] = true
            i.save()
            return ip, nil
        }
    }
    return nil, fmt.Errorf("no available IP in %s", i.Subnet.String())
}

2.2 多容器网络互通

// setupMultiContainerNetwork wires the container identified by containerID
// into the bridge "toybr0" so that several containers can share it.
//
// The steps run in order and the first failure is returned unchanged:
// ensure the bridge, allocate an address from the 10.22.0.0/24 pool, create a
// veth pair, then configure the link.
func setupMultiContainerNetwork(containerID string) error {
    // Step 1: the shared bridge has to exist before anything is attached.
    bridge, err := ensureBridge("toybr0", "10.22.0.1/24")
    if err != nil {
        return err
    }

    // Step 2: obtain a unique address from the allocator.
    allocator, err := NewIPAM("/var/lib/toyctr/ipam.json", "10.22.0.0/24")
    if err != nil {
        return err
    }
    addr, err := allocator.Allocate()
    if err != nil {
        return err
    }

    // Step 3: create the veth pair for this container.
    hostEnd, ctrEnd, err := createVethPair(containerID)
    if err != nil {
        return err
    }

    // Step 4: hand everything to the network configuration helper.
    return configureContainerNetwork(hostEnd, ctrEnd, addr, bridge)
}

阶段 3:OverlayFS 镜像分层

3.1 OverlayFS 实现

// OverlayPaths groups the directory paths and the image reference of one
// container's overlay filesystem, as filled in by PrepareOverlay.
type OverlayPaths struct {
    Base     string  // per-container directory that holds Upper, Work and Merged
    Lower    string  // read-only image layer
    Upper    string  // writable layer
    Work     string  // OverlayFS work directory
    Merged   string  // merged root directory
    ImageRef string  // image name the overlay was prepared from
}

// PrepareOverlay creates the per-container overlay directories under
// /var/lib/toyctr/containers/<containerID> and mounts an OverlayFS at the
// "merged" directory, using /var/lib/toyctr/images/<imageName>/lower as the
// read-only lower layer.
//
// containerID must be a single path element and imageName a local relative
// path. Anything that is empty or would escape the toyctr state directories
// (for example via "..") is rejected before the filesystem is touched. The
// image layer must already exist as a directory.
//
// It returns the paths that were set up, or an error if validation, directory
// creation or the mount fails.
func PrepareOverlay(containerID, imageName string) (*OverlayPaths, error) {
    const (
        containersRoot = "/var/lib/toyctr/containers"
        imagesRoot     = "/var/lib/toyctr/images"
    )

    // Joining cleans the path, so an empty ID, ".", "..", or an ID with extra
    // path elements ends up somewhere other than directly below the root.
    base := filepath.Join(containersRoot, containerID)
    if filepath.Dir(base) != containersRoot {
        return nil, fmt.Errorf("invalid container id %q", containerID)
    }
    if !filepath.IsLocal(imageName) || filepath.Clean(imageName) == "." {
        return nil, fmt.Errorf("invalid image name %q", imageName)
    }

    // Check the lower layer first so that a missing image does not leave
    // freshly created container directories behind.
    imgLower := filepath.Join(imagesRoot, imageName, "lower")
    info, err := os.Stat(imgLower)
    if err != nil {
        return nil, fmt.Errorf("image layer for %q: %w", imageName, err)
    }
    if !info.IsDir() {
        return nil, fmt.Errorf("image layer %s is not a directory", imgLower)
    }

    // Create the directory structure.
    upper := filepath.Join(base, "upper")
    work := filepath.Join(base, "work")
    merged := filepath.Join(base, "merged")

    for _, d := range []string{base, upper, work, merged} {
        if err := os.MkdirAll(d, 0755); err != nil {
            return nil, err
        }
    }

    // Mount the OverlayFS.
    opts := fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s",
        imgLower, upper, work)
    if err := unix.Mount("overlay", merged, "overlay", 0, opts); err != nil {
        return nil, fmt.Errorf("mount overlay: %w", err)
    }

    return &OverlayPaths{
        Base:     base,
        Lower:    imgLower,
        Upper:    upper,
        Work:     work,
        Merged:   merged,
        ImageRef: imageName,
    }, nil
}

阶段 4:Volume 挂载支持

4.1 挂载解析

// MountSpec describes one mount requested for a container.
type MountSpec struct {
    Type string // "bind" | "tmpfs"
    Src  string // source path; required for bind mounts
    Dst  string // mount point; always required
    RO   bool   // set by a bare "ro" token
    Opts string // raw value of the "opts" key
}

// parseMountSpec parses a comma-separated mount description such as
// "type=bind,src=/data,dst=/mnt,ro" into a MountSpec.
//
// Tokens are trimmed of surrounding whitespace. A bare "ro" marks the mount
// read-only; every other token must have the form key=value, where key is one
// of type, src/source, dst/target or opts.
//
// An error is returned, together with a zero MountSpec, when a token is
// malformed, a key is unknown, the type is neither "bind" nor "tmpfs", the
// destination is missing, or a bind mount has no source.
func parseMountSpec(s string) (MountSpec, error) {
    var ms MountSpec

    for _, tok := range strings.Split(s, ",") {
        tok = strings.TrimSpace(tok)
        if tok == "ro" {
            ms.RO = true
            continue
        }

        k, v, ok := strings.Cut(tok, "=")
        if !ok {
            return MountSpec{}, fmt.Errorf("bad mount token: %q", tok)
        }

        switch k {
        case "type":
            ms.Type = v
        case "src", "source":
            ms.Src = v
        case "dst", "target":
            ms.Dst = v
        case "opts":
            ms.Opts = v
        default:
            // A misspelled key would otherwise vanish and surface later as an
            // obscure mount failure.
            return MountSpec{}, fmt.Errorf("unknown mount key %q in token %q", k, tok)
        }
    }

    switch ms.Type {
    case "bind":
        if ms.Src == "" {
            return MountSpec{}, fmt.Errorf("bind mount %q: missing src", s)
        }
    case "tmpfs":
        // tmpfs needs no source.
    default:
        return MountSpec{}, fmt.Errorf("unsupported mount type %q", ms.Type)
    }
    if ms.Dst == "" {
        return MountSpec{}, fmt.Errorf("mount %q: missing dst", s)
    }

    return ms, nil
}

4.2 挂载执行

// setupMounts performs the given mounts in order, creating each mount point
// directory first.
//
// Supported types are "bind" (recursive bind mount, optionally remounted
// read-only) and "tmpfs" (mounted with m.Opts as data, read-only when m.RO is
// set). Any other type is rejected with EINVAL before the filesystem is
// touched, rather than being skipped silently.
//
// Mount failures are returned as *os.PathError values that wrap the
// underlying errno, so errors.Is still matches it. Mounts made before a
// failure are left in place.
func setupMounts(mounts []MountSpec) error {
    for _, m := range mounts {
        if m.Type != "bind" && m.Type != "tmpfs" {
            return &os.PathError{
                Op:   "mount (unsupported type \"" + m.Type + "\")",
                Path: m.Dst,
                Err:  unix.EINVAL,
            }
        }

        if err := os.MkdirAll(m.Dst, 0755); err != nil {
            return err
        }

        switch m.Type {
        case "bind":
            // Bind mount first.
            if err := unix.Mount(m.Src, m.Dst, "",
                unix.MS_BIND|unix.MS_REC, ""); err != nil {
                return &os.PathError{Op: "bind mount", Path: m.Dst, Err: err}
            }

            // The initial bind call does not apply MS_RDONLY, so read-only is
            // set by a second remount call.
            if m.RO {
                if err := unix.Mount("", m.Dst, "",
                    unix.MS_REMOUNT|unix.MS_BIND|unix.MS_RDONLY, ""); err != nil {
                    return &os.PathError{Op: "read-only remount", Path: m.Dst, Err: err}
                }
            }

        case "tmpfs":
            // Honour the read-only request for tmpfs as well.
            var flags uintptr
            if m.RO {
                flags |= unix.MS_RDONLY
            }
            if err := unix.Mount("tmpfs", m.Dst, "tmpfs", flags, m.Opts); err != nil {
                return &os.PathError{Op: "tmpfs mount", Path: m.Dst, Err: err}
            }
        }
    }
    return nil
}

实战练习

练习 1:基础容器实现

  1. 实现基本的命名空间隔离
  2. 添加 Cgroup 资源限制
  3. 配置简单的网络连接

验证步骤:

# 1. 编译程序
go build -o toyctr main.go

# 2. 运行容器
sudo ./toyctr -cmd /bin/sh -args "-c 'sleep 60'"

# 3. 验证隔离
# 在另一个终端(宿主机)查看进程
ps aux | grep sh
# 宿主机上可以看到容器进程(显示的是宿主机 PID);在容器内执行 ps aux 时才应该只看到容器内的进程

# 4. 验证网络
# 在容器内
ip addr show
# 应该看到独立的网络接口

练习 2:多容器网络

  1. 实现 IPAM 模块
  2. 支持多容器互通
  3. 验证容器间通信

验证步骤:

# 1. 启动第一个容器
sudo ./toyctr -cmd /bin/sh -args "-c 'sleep 600'" &
CONTAINER1_PID=$!

# 2. 启动第二个容器
sudo ./toyctr -cmd /bin/sh -args "-c 'sleep 600'" &
CONTAINER2_PID=$!

# 3. 进入第一个容器测试网络(注意:$! 得到的是后台 sudo 进程的 PID;nsenter -t 需要使用容器内进程的 PID,否则进入的仍是宿主机的网络命名空间)
sudo nsenter -t $CONTAINER1_PID -n ping -c 1 10.22.0.3

# 4. 验证 IP 分配
cat /var/lib/toyctr/ipam.json

练习 3:文件系统隔离

  1. 实现 OverlayFS 支持
  2. 添加 Volume 挂载
  3. 测试只读根文件系统

验证步骤:

# 1. 准备镜像层
sudo mkdir -p /var/lib/toyctr/images/busybox/lower
docker export $(docker create busybox) | sudo tar -C /var/lib/toyctr/images/busybox/lower -xf -

# 2. 运行容器
sudo ./toyctr -image busybox -cmd /bin/sh

# 3. 在容器内测试
# 创建文件
touch /test_file
ls -la /test_file

# 4. 退出容器后检查
# 文件应该在 upper 层
ls -la /var/lib/toyctr/containers/*/upper/test_file

练习 4:安全加固

  1. 实现 Capabilities 控制
  2. 添加 Seccomp 策略
  3. 测试安全限制效果

验证步骤:

# 1. 运行带安全限制的容器
sudo ./toyctr -cmd /bin/sh -args "-c 'mount -t tmpfs tmpfs /tmp'"

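# 2. 应该看到权限被拒绝的错误(仅移除 CAP_SYS_ADMIN 时)
# mount: /tmp: Operation not permitted
# 如果同时加载了 1.2 节的 Seccomp 策略(对 mount 使用 ActKillProcess),进程会被 SIGSYS 直接终止,而不是返回该错误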

# 3. 检查 capabilities
cat /proc/self/status | grep Cap
# 应该看到受限的 capabilities

验证检查清单

基础功能验证

  • [ ] 容器进程隔离正常
  • [ ] 网络命名空间独立
  • [ ] 文件系统隔离有效
  • [ ] 资源限制生效

网络功能验证

  • [ ] 容器间可以通信
  • [ ] 外网访问正常
  • [ ] IP 分配不冲突
  • [ ] 端口映射正确

文件系统验证

  • [ ] OverlayFS 挂载成功
  • [ ] Volume 挂载正常
  • [ ] 只读限制生效
  • [ ] 数据持久化正确

安全功能验证

  • [ ] Capabilities 限制生效
  • [ ] Seccomp 策略有效
  • [ ] 危险系统调用被阻止
  • [ ] 权限提升被限制

相关链接

  • 11-Go实现最小容器 - 最小实现
  • 13-容器生命周期管理 - 生命周期管理
  • 14-调试技术与工具 - 调试技术详解

下一步:让我们学习容器生命周期管理,这是容器运行时的核心!

Prev
11-Go实现最小容器
Next
13-容器生命周期管理