CFS调度器原理与源码

章节概述

CFS（Completely Fair Scheduler，完全公平调度器）是Linux 2.6.23版本引入的默认进程调度器。本章将深入探讨CFS的设计哲学、核心数据结构和源码实现，帮助读者理解Linux如何实现"公平"调度。

学习目标：

深入理解CFS的公平性实现原理
掌握红黑树在调度中的应用
学会分析vruntime的计算和调整机制
能够追踪和调试调度行为

核心概念

1. CFS的设计哲学

核心思想：

让每个进程获得相同的CPU时间份额
通过"虚拟运行时间"实现公平

公平性定义：

在一个调度周期内，每个进程都应该运行一次
每个进程获得的CPU时间与其权重成正比
优先选择运行时间最少的进程

2. 关键数据结构

调度实体（sched_entity）：

struct sched_entity {
    struct load_weight      load;           // 权重
    struct rb_node          run_node;       // 红黑树节点
    struct list_head        group_node;     // 组调度链表
    unsigned int            on_rq;          // 是否在运行队列上
    
    u64                     exec_start;     // 开始执行时间
    u64                     sum_exec_runtime; // 累计运行时间
    u64                     vruntime;       // 虚拟运行时间（核心！）
    u64                     prev_sum_exec_runtime;
    
    u64                     nr_migrations;  // 迁移次数
    
    struct sched_statistics statistics;    // 统计信息
};

CFS运行队列（cfs_rq）：

struct cfs_rq {
    struct load_weight  load;              // 总权重
    unsigned int        nr_running;        // 运行中的任务数
    
    u64                 exec_clock;        // 执行时钟
    u64                 min_vruntime;      // 最小vruntime
    
    struct rb_root_cached   tasks_timeline; // 红黑树根节点
    
    struct sched_entity *curr;             // 当前运行的实体
    struct sched_entity *next;             // 下一个运行的实体
    struct sched_entity *last;             // 最后运行的实体
    struct sched_entity *skip;             // 跳过的实体
};

3. 虚拟运行时间（vruntime）

计算公式：

Δvruntime = Δt × (NICE_0_LOAD / weight)

其中：
- Δt: 实际运行时间
- NICE_0_LOAD: nice值为0的标准权重（1024）
- weight: 进程的权重

权重表（根据nice值）：

const int sched_prio_to_weight[40] = {
 /* -20 */     88761,     71755,     56483,     46273,     36291,
 /* -15 */     29154,     23254,     18705,     14949,     11916,
 /* -10 */      9548,      7620,      6100,      4904,      3906,
 /*  -5 */      3121,      2501,      1991,      1586,      1277,
 /*   0 */      1024,       820,       655,       526,       423,
 /*   5 */       335,       272,       215,       172,       137,
 /*  10 */       110,        87,        70,        56,        45,
 /*  15 */        36,        29,        23,        18,        15,
};

特点：

nice值每降低1，权重增加约1.25倍
nice值每升高1，权重减少约1.25倍
vruntime越小，说明进程运行时间越短，越应该被调度

4. 红黑树调度结构

                    min_vruntime
                         ↓
              ┌──────────┴──────────┐
              │    CFS Red-Black    │
              │        Tree         │
              └─────────┬───────────┘
                        │
            ┌───────────┼───────────┐
            │           │           │
        ┌───┴───┐   ┌───┴───┐   ┌───┴───┐
        │Task A │   │Task B │   │Task C │
        │vr=100 │   │vr=150 │   │vr=200 │
        └───────┘   └───────┘   └───────┘
            ↑
         选择最小

红黑树特性：

始终保持平衡
查找、插入、删除时间复杂度O(log n)
最左节点是vruntime最小的进程

源码解析

1. 调度器入口

关键文件： kernel/sched/fair.c

// 选择下一个要运行的任务
static struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
    struct cfs_rq *cfs_rq = &rq->cfs;
    struct sched_entity *se;
    struct task_struct *p;
    
again:
    if (!cfs_rq->nr_running)
        return NULL;
    
    // 将当前任务放回红黑树
    put_prev_task(rq, prev);
    
    do {
        // 从红黑树选择vruntime最小的实体
        se = pick_next_entity(cfs_rq, NULL);
        set_next_entity(cfs_rq, se);
        cfs_rq = group_cfs_rq(se);
    } while (cfs_rq);
    
    p = task_of(se);
    
    return p;
}

// 从红黑树选择实体
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq,
                                             struct sched_entity *curr)
{
    struct sched_entity *left = __pick_first_entity(cfs_rq);
    struct sched_entity *se;
    
    // 红黑树的最左节点就是vruntime最小的
    if (!left || (curr && entity_before(curr, left)))
        left = curr;
    
    se = left;
    
    return se;
}

// 获取红黑树最左节点
static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
    struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
    
    if (!left)
        return NULL;
    
    return rb_entry(left, struct sched_entity, run_node);
}

2. vruntime更新

// 更新当前任务的运行时间
static void update_curr(struct cfs_rq *cfs_rq)
{
    struct sched_entity *curr = cfs_rq->curr;
    u64 now = rq_clock_task(rq_of(cfs_rq));
    u64 delta_exec;
    
    if (unlikely(!curr))
        return;
    
    // 计算实际运行时间
    delta_exec = now - curr->exec_start;
    if (unlikely((s64)delta_exec <= 0))
        return;
    
    curr->exec_start = now;
    
    // 累加实际运行时间
    curr->sum_exec_runtime += delta_exec;
    
    // 更新组调度的运行时间
    schedstat_add(cfs_rq->exec_clock, delta_exec);
    
    // 更新vruntime（核心！）
    curr->vruntime += calc_delta_fair(delta_exec, curr);
    update_min_vruntime(cfs_rq);
}

// 计算vruntime增量
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
    // 如果权重不是NICE_0_LOAD，需要调整
    if (unlikely(se->load.weight != NICE_0_LOAD))
        delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
    
    return delta;
}

// 通用的delta计算
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
    u64 fact = scale_load_down(weight);
    int shift = WMULT_SHIFT;
    
    __update_inv_weight(lw);
    
    if (unlikely(fact >> 32)) {
        while (fact >> 32) {
            fact >>= 1;
            shift--;
        }
    }
    
    // delta * weight / lw->weight
    fact = (u64)(u32)fact * lw->inv_weight;
    
    while (fact >> 32) {
        fact >>= 1;
        shift--;
    }
    
    return mul_u64_u32_shr(delta_exec, fact, shift);
}

3. 进程入队和出队

// 将任务加入红黑树
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
    bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
    bool curr = cfs_rq->curr == se;
    
    // 更新当前任务的运行时间
    if (renorm && curr)
        se->vruntime += cfs_rq->min_vruntime;
    
    update_curr(cfs_rq);
    
    // 如果不是当前正在运行的任务，加入红黑树
    if (!curr)
        __enqueue_entity(cfs_rq, se);
    
    se->on_rq = 1;
    
    // 更新cfs_rq统计信息
    account_entity_enqueue(cfs_rq, se);
    
    if (flags & ENQUEUE_WAKEUP)
        place_entity(cfs_rq, se, 0);
    
    update_cfs_group(se);
}

// 将实体插入红黑树
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
    struct rb_node *parent = NULL;
    struct sched_entity *entry;
    bool leftmost = true;
    
    // 查找插入位置
    while (*link) {
        parent = *link;
        entry = rb_entry(parent, struct sched_entity, run_node);
        
        // 按vruntime排序
        if (entity_before(se, entry)) {
            link = &parent->rb_left;
        } else {
            link = &parent->rb_right;
            leftmost = false;
        }
    }
    
    // 插入节点
    rb_link_node(&se->run_node, parent, link);
    rb_insert_color_cached(&se->run_node,
                          &cfs_rq->tasks_timeline, leftmost);
}

// 从红黑树移除任务
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
    update_curr(cfs_rq);
    
    // 从红黑树移除
    if (se->on_rq) {
        update_stats_dequeue(cfs_rq, se, flags);
        
        if (se != cfs_rq->curr)
            __dequeue_entity(cfs_rq, se);
        
        se->on_rq = 0;
        account_entity_dequeue(cfs_rq, se);
        update_cfs_group(se);
    }
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
}

4. 负载均衡

// 负载均衡主函数
static int load_balance(int this_cpu, struct rq *this_rq,
                       struct sched_domain *sd, enum cpu_idle_type idle,
                       int *continue_balancing)
{
    int ld_moved, cur_ld_moved, active_balance = 0;
    struct sd_lb_stats sds;
    struct lb_env env = {
        .sd             = sd,
        .dst_cpu        = this_cpu,
        .dst_rq         = this_rq,
        .idle           = idle,
    };
    
    // 计算负载统计
    update_sd_lb_stats(env.sd, &sds);
    
    // 查找最繁忙的组
    struct sched_group *busiest = find_busiest_group(&env, &sds);
    
    if (!busiest)
        goto out_balanced;
    
    // 查找最繁忙的CPU
    env.src_cpu = find_busiest_queue(&env, busiest);
    
    if (!should_we_balance(&env))
        goto out_balanced;
    
    // 迁移任务
    ld_moved = move_tasks(&env);
    
    return ld_moved;
    
out_balanced:
    return 0;
}

// 迁移任务
static int move_tasks(struct lb_env *env)
{
    struct list_head *tasks = &env->src_rq->cfs_tasks;
    struct task_struct *p;
    unsigned long load;
    int pulled = 0;
    
    if (env->imbalance <= 0)
        return 0;
    
    while (!list_empty(tasks)) {
        p = list_first_entry(tasks, struct task_struct, se.group_node);
        
        // 检查是否可以迁移
        if (!can_migrate_task(p, env))
            goto next;
        
        // 迁移任务
        detach_task(p, env);
        attach_task(env->dst_rq, p);
        
        pulled++;
        
        // 检查是否已达到平衡
        if (env->imbalance <= 0)
            break;
        
next:
        list_move(&p->se.group_node, tasks);
    }
    
    return pulled;
}

️ 实用命令

1. 调度器调试信息

查看调度器详细信息：

# 查看完整的调度调试信息
cat /proc/sched_debug

# 查看特定CPU的运行队列
cat /proc/sched_debug | grep -A 20 "cpu#0"

# 查看调度统计
cat /proc/schedstat

输出示例：

cfs_rq[0]:/
  .exec_clock                    : 123456.789000
  .MIN_vruntime                  : 0.000001
  .min_vruntime                  : 234567.890123
  .max_vruntime                  : 0.000001
  .spread                        : 0.000000
  .spread0                       : 0.000000
  .nr_spread_over                : 0
  .nr_running                    : 2
  .load                          : 2048
  .runnable_weight               : 2048

2. 进程调度信息

查看进程调度统计：

# 查看进程的调度信息
cat /proc/<pid>/sched

# 查看所有进程的调度信息
ps -eo pid,ni,pri,psr,comm,wchan

# 实时监控进程调度
top -H -p <pid>

3. 调整调度参数

CFS调度参数：

# 查看调度延迟（默认6ms）
cat /proc/sys/kernel/sched_latency_ns

# 查看最小粒度（默认0.75ms）
cat /proc/sys/kernel/sched_min_granularity_ns

# 查看唤醒粒度
cat /proc/sys/kernel/sched_wakeup_granularity_ns

# 调整参数（需要root权限）
sudo sysctl -w kernel.sched_latency_ns=12000000
sudo sysctl -w kernel.sched_min_granularity_ns=1500000

代码示例

1. 调度行为观测程序

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/resource.h>
#include <pthread.h>
#include <sched.h>

// 获取线程的调度信息
void print_sched_info(const char *name) {
    int policy;
    struct sched_param param;
    
    policy = sched_getscheduler(0);
    sched_getparam(0, &param);
    
    printf("%s 调度信息:\n", name);
    printf("  策略: %s\n", 
           policy == SCHED_OTHER ? "SCHED_OTHER" :
           policy == SCHED_FIFO ? "SCHED_FIFO" :
           policy == SCHED_RR ? "SCHED_RR" : "UNKNOWN");
    printf("  优先级: %d\n", param.sched_priority);
    printf("  Nice值: %d\n", getpriority(PRIO_PROCESS, 0));
}

// 计算密集型任务
void* cpu_intensive_task(void *arg) {
    int nice_val = *(int*)arg;
    volatile long i, sum = 0;
    
    // 设置nice值
    if (nice(nice_val) == -1) {
        perror("nice");
    }
    
    print_sched_info("CPU密集型任务");
    
    // 执行计算
    for (i = 0; i < 1000000000L; i++) {
        sum += i;
        if (i % 100000000 == 0) {
            printf("Nice %d: 完成 %ld%%\n", 
                   nice_val, i / 10000000);
        }
    }
    
    return NULL;
}

int main() {
    pthread_t threads[3];
    int nice_values[] = {-10, 0, 10};
    
    printf("启动3个不同nice值的线程\n\n");
    
    // 创建3个不同优先级的线程
    for (int i = 0; i < 3; i++) {
        pthread_create(&threads[i], NULL, 
                      cpu_intensive_task, &nice_values[i]);
    }
    
    // 等待所有线程完成
    for (int i = 0; i < 3; i++) {
        pthread_join(threads[i], NULL);
    }
    
    return 0;
}

2. vruntime模拟器

#include <stdio.h>
#include <stdlib.h>

#define NICE_0_LOAD 1024

// nice值对应的权重表（部分）
int weight_table[] = {
    88761,  // nice -20
    71755,  // nice -19
    // ...
    1024,   // nice 0
    // ...
    15      // nice 19
};

// 计算vruntime增量
unsigned long long calc_delta_vruntime(unsigned long long delta_exec,
                                       int nice_value) {
    int weight = weight_table[nice_value + 20];  // 调整索引
    unsigned long long vruntime_delta;
    
    vruntime_delta = delta_exec * NICE_0_LOAD / weight;
    
    return vruntime_delta;
}

int main() {
    unsigned long long delta_exec = 1000000;  // 1ms的纳秒数
    
    printf("实际运行时间: %llu ns (1ms)\n\n", delta_exec);
    printf("不同nice值的vruntime增量:\n");
    printf("%-10s %-10s %-15s\n", "Nice值", "权重", "vruntime增量(ns)");
    printf("----------------------------------------\n");
    
    for (int nice = -20; nice <= 19; nice += 5) {
        int weight = weight_table[nice + 20];
        unsigned long long vr_delta = calc_delta_vruntime(delta_exec, nice);
        
        printf("%-10d %-10d %-15llu\n", nice, weight, vr_delta);
    }
    
    printf("\n结论:\n");
    printf("- nice值越低（优先级越高），vruntime增长越慢\n");
    printf("- nice值越高（优先级越低），vruntime增长越快\n");
    printf("- 这保证了高优先级任务获得更多CPU时间\n");
    
    return 0;
}

3. Go语言调度追踪

package main

import (
    "fmt"
    "runtime"
    "sync"
    "time"
)

func cpuIntensiveTask(id int, wg *sync.WaitGroup) {
    defer wg.Done()
    
    fmt.Printf("Goroutine %d 开始执行\n", id)
    
    // CPU密集型任务
    sum := 0
    for i := 0; i < 100000000; i++ {
        sum += i
        if i%10000000 == 0 {
            // 打印当前所在的P
            fmt.Printf("Goroutine %d: 在P %d上执行, 进度 %d%%\n",
                      id, runtime.GOMAXPROCS(0), i/1000000)
        }
    }
    
    fmt.Printf("Goroutine %d 完成，sum=%d\n", id, sum)
}

func main() {
    // 设置GOMAXPROCS
    numCPU := runtime.NumCPU()
    runtime.GOMAXPROCS(numCPU)
    
    fmt.Printf("系统CPU核心数: %d\n", numCPU)
    fmt.Printf("GOMAXPROCS设置为: %d\n\n", runtime.GOMAXPROCS(0))
    
    var wg sync.WaitGroup
    numGoroutines := 5
    
    // 启动多个goroutine
    for i := 0; i < numGoroutines; i++ {
        wg.Add(1)
        go cpuIntensiveTask(i, &wg)
    }
    
    // 等待所有goroutine完成
    wg.Wait()
    
    fmt.Println("\n所有任务完成")
}

动手实验

实验1：观测vruntime变化

目标： 理解vruntime如何影响调度

步骤：

编写观测脚本：

#!/bin/bash
# watch_vruntime.sh

PID=$1
if [ -z "$PID" ]; then
    echo "用法: $0 <pid>"
    exit 1
fi

while true; do
    clear
    echo "=== 进程 $PID 的调度信息 ==="
    cat /proc/$PID/sched | grep -E "se.vruntime|se.sum_exec_runtime|nr_switches"
    echo ""
    echo "=== CFS运行队列信息 ==="
    cat /proc/sched_debug | grep -A 5 "cfs_rq\[0\]"
    sleep 1
done

运行测试程序：

# 编译并运行
gcc -o sched_test sched_test.c -lpthread
./sched_test &

# 获取PID
PID=$(pgrep sched_test)

# 观测vruntime
chmod +x watch_vruntime.sh
./watch_vruntime.sh $PID

分析结果：

观察vruntime的增长速度
比较不同nice值进程的vruntime
理解vruntime与实际运行时间的关系

实验2：负载均衡实验

目标： 观测CFS的负载均衡行为

步骤：

启动多个CPU密集型任务：

# 启动比CPU核心数多的任务
for i in {1..16}; do
    ./sched_test &
done

观测CPU分布：

# 实时观测CPU使用
top -H

# 或使用htop
htop

# 观测负载均衡统计
cat /proc/schedstat

分析任务迁移：

# 观测任务迁移次数
for pid in $(pgrep sched_test); do
    echo "PID $pid:"
    cat /proc/$pid/sched | grep nr_migrations
done

实验3：使用perf追踪调度事件

目标： 使用perf工具深入分析调度行为

步骤：

记录调度事件：

# 记录所有调度事件（5秒）
sudo perf sched record -- sleep 5

# 查看调度延迟
sudo perf sched latency

# 查看调度映射
sudo perf sched map

分析输出：

# 延迟分析示例输出
Task                  Runtime ms  Switches  Avg delay ms  Max delay ms
sched_test            1000.00     500       0.50          10.00

生成调度火焰图：

sudo perf sched record -a -- sleep 10
sudo perf script | stackcollapse-perf.pl | flamegraph.pl > sched.svg

常见问题

Q1: CFS如何实现"完全公平"？

A: CFS通过以下机制实现公平：

使用vruntime追踪进程的虚拟运行时间
总是选择vruntime最小的进程运行
通过权重调整不同优先级进程的vruntime增速
在长期来看，所有进程的vruntime趋于相同

Q2: 为什么使用红黑树而不是其他数据结构？

A: 红黑树的优势：

保持平衡，最坏情况也是O(log n)
插入、删除、查找都很高效
最左节点天然是vruntime最小的
适合频繁插入删除的场景

Q3: nice值如何影响调度？

A: 影响机制：

nice值通过权重表映射到权重
权重影响vruntime的计算
vruntime影响被调度的顺序
最终效果是CPU时间的分配比例

Q4: 如何优化CFS调度性能？

A: 优化策略：

减少任务数量（避免过度并发）
合理设置nice值
调整调度参数（sched_latency_ns等）
使用CPU亲和性绑定

复习题

选择题

CFS使用什么数据结构管理可运行进程？
- A. 链表
- B. 红黑树
- C. 哈希表
- D. 数组
vruntime的计算考虑了哪个因素？
- A. 进程的PID
- B. 进程的权重
- C. 进程的内存使用
- D. 进程的IO使用
nice值为0时，对应的权重是：
- A. 512
- B. 1024
- C. 2048
- D. 4096
CFS选择下一个运行进程的依据是：
- A. 优先级最高
- B. PID最小
- C. vruntime最小
- D. 等待时间最长
sched_latency_ns参数表示：
- A. 最小时间片
- B. 最大延迟
- C. 调度周期
- D. 唤醒延迟

简答题

解释vruntime的计算公式及其含义。
为什么CFS需要min_vruntime字段？
描述CFS的任务选择流程。
解释CFS如何处理新创建的进程。
比较CFS和O(1)调度器的优缺点。

实战题

源码分析题： 阅读pick_next_entity函数的源码，解释它如何选择下一个要运行的任务。
性能调优题： 系统有大量短期任务和少量长期任务，如何调整CFS参数以优化响应时间？
调试题： 通过/proc/sched_debug输出，分析某个CPU的运行队列状态，判断是否存在调度问题。

扩展阅读

深入方向

组调度（Group Scheduling）
实时调度器（RT Scheduler）
NUMA调度优化
CFS bandwidth control

下一章预告： 我们将深入探讨PageCache与内存回收机制，包括脏页回写策略、kswapd工作原理，以及如何优化内存使用。