之前介绍了eevdf的patchset-v1的前3个补丁,现在继续深入解析patch 4-6的内容。这三笔提交引入了EEVDF调度器的核心算法:基于虚拟截止时间的任务选择机制。
这笔提交虽然是一个辅助函数的添加,但为EEVDF调度器的核心算法奠定了基础。
在EEVDF调度器中,我们需要维护一个增强型红黑树(augmented RB-tree),这个树不仅要按虚拟运行时间(vruntime)排序,还要维护每个子树的最小截止时间(min_deadline)。这样我们才能高效地找到具有最早截止时间的可运行任务。
添加了一个通用的辅助函数rb_add_augmented_cached,用于向增强型缓存红黑树中插入节点:
/*
 * Insert @node into the augmented, leftmost-cached rbtree @tree.
 *
 * @less:    ordering predicate (returns true when @node sorts before parent)
 * @augment: callbacks keeping per-subtree metadata (e.g. min_deadline)
 *           consistent across insertion and rebalancing
 *
 * Returns @node when it became the new leftmost entry, NULL otherwise.
 */
static __always_inline struct rb_node *
rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree,
			bool (*less)(struct rb_node *, const struct rb_node *),
			const struct rb_augment_callbacks *augment)
{
	struct rb_node **link = &tree->rb_root.rb_node;
	struct rb_node *parent = NULL;
	bool leftmost = true;

	/* Standard ordered descent to find the insertion point. */
	while (*link) {
		parent = *link;
		if (less(node, parent)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = false;
		}
	}

	rb_link_node(node, parent, link);
	/*
	 * Propagate the augmented data from the parent up to the root.
	 * Suboptimal: this could in theory be folded into the descent
	 * above, but the current augment interface does not allow it.
	 */
	augment->propagate(parent, NULL); /* suboptimal */
	rb_insert_augmented_cached(node, tree, leftmost, augment);

	return leftmost ? node : NULL;
}
augment->propagate(parent, NULL)是suboptimal的:理论上可以在向下遍历树的同时更新增强数据,但当前的增强接口不支持这样做。
在EEVDF中,每个任务实体(sched_entity)需要维护:
- deadline:虚拟截止时间
- min_deadline:以该节点为根的子树中的最小截止时间

当我们插入新任务时,需要确保从该节点到根节点路径上的所有节点的min_deadline都被正确更新。这个辅助函数简化了这个过程。
这笔提交是EEVDF调度器的核心实现,引入了基于**最早可运行虚拟截止时间优先(Earliest Eligible Virtual Deadline First)**的调度算法。
传统CFS是一个基于**加权公平队列(WFQ)**的调度器,只有单一的权重(weight)参数,映射到nice值。
EEVDF引入了第二个参数——延迟导向的参数,使得类似WF2Q或EEVDF的调度算法更适合:
"Where CFS is currently a WFQ based scheduler with only a single knob, the weight. The addition of a second, latency oriented parameter, makes something like WF2Q or EEVDF based a much better fit."
EEVDF有两个关键参数:权重w_i(对应nice值)和请求大小r_i(即时间片slice,用于表达延迟需求):
虚拟截止时间的计算公式:
vd_i = ve_i + r_i / w_i
其中:
- vd_i:任务i的虚拟截止时间
- ve_i:任务i的虚拟运行时间
- r_i:请求大小(时间片)
- w_i:任务权重

"Basically, by setting a smaller slice, the deadline will be earlier and the task will be more eligible and ran earlier."
更小的时间片意味着更早的截止时间,任务会更早被调度,这提供了控制延迟的手段。
cstruct sched_entity {
struct load_weight load;
struct rb_node run_node;
u64 deadline; // 新增:虚拟截止时间
u64 min_deadline; // 新增:子树最小截止时间
// ... 现有字段 ...
u64 vruntime;
s64 vlag;
u64 slice; // 新增:时间片长度
};
/*
 * Add @se to the runqueue's timeline tree, keeping the augmented
 * min_deadline data consistent along the insertion path.
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	avg_vruntime_add(cfs_rq, se);
	/* A node with no children: subtree minimum is its own deadline. */
	se->min_deadline = se->deadline;
	rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
				__entity_less, &min_deadline_cb);
}
/*
 * Recompute @se's subtree-minimum deadline from its own deadline and
 * both children. Returns true when the value did NOT change, allowing
 * the rbtree augment machinery to stop propagating toward the root.
 */
static inline bool min_deadline_update(struct sched_entity *se, bool exit)
{
	u64 old_min_deadline = se->min_deadline;
	struct rb_node *node = &se->run_node;

	se->min_deadline = se->deadline;
	__update_min_deadline(se, node->rb_right);
	__update_min_deadline(se, node->rb_left);

	return se->min_deadline == old_min_deadline;
}

RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity,
		     run_node, min_deadline, min_deadline_update);
这个回调函数在树结构变化时自动更新每个节点的min_deadline,保证:
se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline)
/*
 * Pick the eligible entity with the earliest virtual deadline (EEVDF).
 *
 * Walks the timeline tree, pruning subtrees via the augmented
 * min_deadline, so selection completes in O(log n).
 */
static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
{
	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
	struct sched_entity *curr = cfs_rq->curr;
	struct sched_entity *best = NULL;

	/* The running entity only competes while queued and eligible. */
	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
		curr = NULL;

	while (node) {
		struct sched_entity *se = __node_2_se(node);

		/* If this entity is not eligible, try the left subtree. */
		if (!entity_eligible(cfs_rq, se)) {
			node = node->rb_left;
			continue;
		}

		/*
		 * If this entity has an earlier deadline than the best so
		 * far, pick it. If it also holds its subtree's minimum
		 * deadline, the search is over.
		 */
		if (!best || deadline_gt(deadline, best, se)) {
			best = se;
			if (best->deadline == best->min_deadline)
				break;
		}

		/*
		 * If the earliest deadline of this subtree lies in the
		 * fully-eligible left half, go search there ...
		 */
		if (node->rb_left &&
		    __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
			node = node->rb_left;
			continue;
		}

		/* ... otherwise it must be in the right subtree. */
		node = node->rb_right;
	}

	/* Let curr win when its deadline beats the tree's best. */
	if (!best || (curr && deadline_gt(deadline, best, curr)))
		best = curr;

	/* Should never happen; fall back to the leftmost entity. */
	if (unlikely(!best)) {
		struct sched_entity *left = __pick_first_entity(cfs_rq);
		if (left) {
			pr_err("EEVDF scheduling fail, picking leftmost\n");
			return left;
		}
	}

	return best;
}
通过增强型红黑树,这个算法可以在**O(log n)**时间复杂度内完成任务选择。
/*
 * An entity is eligible when its lag is non-negative, i.e. it has
 * received no more than its fair share of service:
 *
 *   lag_i = w_i * (V - v_i) >= 0  <=>  V >= v_i
 *
 * Evaluated without division by comparing load-weighted sums kept
 * in cfs_rq (avg_vruntime / avg_load).
 */
int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct sched_entity *curr = cfs_rq->curr;
	s64 avg = cfs_rq->avg_vruntime;
	long load = cfs_rq->avg_load;

	/* curr is not kept in the tree; fold it into the average here. */
	if (curr && curr->on_rq) {
		unsigned long weight = scale_load_down(curr->load.weight);

		avg += entity_key(cfs_rq, curr) * weight;
		load += weight;
	}

	return avg >= entity_key(cfs_rq, se) * load;
}
资格条件:任务必须被欠服务(lag >= 0)才可运行。
数学推导:
lag_i = S - s_i = w_i * (V - v_i)

lag_i >= 0  ->  V >= v_i

       Σ (v_j - v) * w_j
V  =  ------------------- + v
            Σ w_j

lag_i >= 0  ->  Σ (v_j - v) * w_j >= (v_i - v) * Σ w_j
/*
 * Refresh @se's slice and virtual deadline once the previous request
 * has been fully consumed (vruntime has caught up with the deadline).
 */
static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/* Previous request not yet used up: nothing to do. */
	if ((s64)(se->vruntime - se->deadline) < 0)
		return;

	if (sched_feat(EEVDF)) {
		/*
		 * For EEVDF the virtual-time slope is set by w_i (the
		 * nice value); the request size r_i comes from
		 * sysctl_sched_min_granularity.
		 */
		se->slice = sysctl_sched_min_granularity;

		/* The entity consumed its request; compete again. */
		if (cfs_rq->nr_running > 1) {
			resched_curr(rq_of(cfs_rq));
			clear_buddies(cfs_rq, se);
		}
	} else {
		/* Legacy CFS slice computation. */
		se->slice = min_t(u64, sched_slice(cfs_rq, se), sysctl_sched_latency);
	}

	/* EEVDF: vd_i = ve_i + r_i / w_i */
	se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
}
/*
 * Select the next entity to run. Under EEVDF, optionally honour the
 * NEXT_BUDDY hint (latency impact only, fairness is preserved since
 * the buddy must still be eligible), otherwise defer to pick_eevdf().
 */
static struct sched_entity *
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	struct sched_entity *left, *se;

	if (sched_feat(EEVDF)) {
		/* Enabling NEXT_BUDDY affects latency but not fairness. */
		if (sched_feat(NEXT_BUDDY) &&
		    cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
			return cfs_rq->next;

		return pick_eevdf(cfs_rq);
	}

	se = left = pick_cfs(cfs_rq, curr);

	/* ... CFS buddy handling ... */

	return se;
}
由请求/时间片完成驱动:
/*
 * Tick-driven preemption (excerpt): reschedule once the running
 * entity has executed for longer than its requested slice.
 */
static void check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long delta_exec;
	/* ... */
	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
	if (delta_exec > curr->slice) {
		resched_curr(rq_of(cfs_rq));
		/* ... */
	}
}
由截止时间驱动:
/*
 * Wakeup-driven preemption (excerpt): under EEVDF, preempt only when
 * the scheduler would actually pick the waking entity next.
 */
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
	/* ... */
	if (sched_feat(EEVDF)) {
		if (pick_eevdf(cfs_rq) == pse)
			goto preempt;
		return;
	}
	/* ... */
}
/* Debug output: per-task EEVDF state (eligibility, deadline, slice). */
SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
	p->comm, task_pid_nr(p),
	SPLIT_NS(p->se.vruntime),
	entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N', /* E = eligible, N = not */
	SPLIT_NS(p->se.deadline),		/* virtual deadline */
	SPLIT_NS(p->se.slice),			/* requested slice */
	SPLIT_NS(p->se.sum_exec_runtime),
	(long long)(p->nvcsw + p->nivcsw),
	p->prio);
这笔提交删除了旧的FAIR_SLEEPERS机制,完全采用新的基于lag的放置策略。
FAIR_SLEEPERS是一个非常粗略的近似方案,用来弥补缺乏基于lag的放置机制的问题,特别是"欠服务(service owed)"部分。这对于starve和hackbench这类测试很重要。
FAIR_SLEEPERS导致了某种程度的不公平:
FAIR_SLEEPERS会产生50%/50%的时间分配。

// 删除了整个FAIR_SLEEPERS逻辑块
if (sched_feat(FAIR_SLEEPERS)) {
	/* ... 40+ lines of FAIR_SLEEPERS logic removed by this commit ... */
}
/*
 * Place a newly enqueued entity: position its vruntime relative to the
 * weighted average V, compensating the entity's stored lag, then set
 * its first virtual deadline.
 *
 * Note: the original excerpt used 'curr' and 'load' without declaring
 * them; they are locals of the PLACE_LAG branch upstream.
 */
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
	u64 vslice = calc_delta_fair(se->slice, se);
	u64 vruntime = avg_vruntime(cfs_rq);
	s64 lag = 0;

	/*
	 * Due to how V is constructed as the weighted average of entities,
	 * adding tasks with positive lag, or removing tasks with negative lag
	 * will move 'time' backwards, this can screw around with the lag of
	 * other tasks.
	 *
	 * EEVDF: placement strategy #1 / #2
	 */
	if (sched_feat(PLACE_LAG) && cfs_rq->nr_running > 1) {
		struct sched_entity *curr = cfs_rq->curr;
		unsigned long load;

		lag = se->vlag;

		/*
		 * Compensate for the effect of adding this entity on
		 * the weighted average.
		 */
		load = cfs_rq->avg_load;
		if (curr && curr->on_rq)
			load += scale_load_down(curr->load.weight);

		lag *= load + scale_load_down(se->load.weight);
		if (WARN_ON_ONCE(!load))
			load = 1;
		lag = div_s64(lag, load);
	}

	vruntime -= lag;

	/* Halve the first request of a freshly created task. */
	if (sched_feat(PLACE_DEADLINE_INITIAL) && initial)
		vslice /= 2;

	/* EEVDF: vd_i = ve_i + r_i/w_i */
	se->vruntime = vruntime;
	se->deadline = vruntime + vslice;
}
在kernel/sched/features.h中新增:
SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
/* ... */
SCHED_FEAT(EEVDF, true)
同时删除了entity_is_long_sleeper函数:不再需要对长睡眠任务做特殊处理。sysctl_sched_min_granularity则在EEVDF模式下充当默认的请求大小r_i。本文介绍的三笔提交的邮件列表链接如下:
https://lore.kernel.org/lkml/20230531124603.862983648@infradead.org/ https://lore.kernel.org/lkml/20230531124603.931005524@infradead.org/ https://lore.kernel.org/lkml/20230531124604.000198861@infradead.org/