继续分析eevdf的提交patch。
首先,enqueue_entity 不再需要通过加回 min_vruntime 来归一化 vruntime,而是直接调用之前补丁实现的 place_entity:
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
bool curr = cfs_rq->curr == se;
/*
* If we're the current task, we must renormalise before calling
* update_curr().
*/
- if (renorm && curr)
- se->vruntime += cfs_rq->min_vruntime;
+ if (curr)
+ place_entity(cfs_rq, se, 0);
dequeue这部分,也是删除了之前cfs的自减min_vruntime,这个之前文章提过。
也因此,enqueue 时不再只在 ENQUEUE_WAKEUP 的情况下才调用 place_entity;与之对应,dequeue 时也要无条件地执行 update_entity_lag,记录实体此刻的 lag:
@@ -5335,23 +5306,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
clear_buddies(cfs_rq, se);
- if (flags & DEQUEUE_SLEEP)
- update_entity_lag(cfs_rq, se);
-
+ update_entity_lag(cfs_rq, se);
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
account_entity_dequeue(cfs_rq, se);
- /*
- * Normalize after update_curr(); which will also have moved
- * min_vruntime if @se is the one holding it back. But before doing
- * update_min_vruntime() again, which will discount @se's position and
- * can move min_vruntime forward still more.
- */
- if (!(flags & DEQUEUE_SLEEP))
- se->vruntime -= cfs_rq->min_vruntime;
-
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
migrate_task_rq_fair 在 CFS 中需要在迁移时对 vruntime 做减去旧队列 min_vruntime 的调整(加回新队列的由 enqueue 完成);改用 eevdf 后这段逻辑直接删掉:
@@ -8102,18 +8062,6 @@ static void migrate_task_rq_fair(struct
{
struct sched_entity *se = &p->se;
- /*
- * As blocked tasks retain absolute vruntime the migration needs to
- * deal with this by subtracting the old and adding the new
- * min_vruntime -- the latter is done by enqueue_entity() when placing
- * the task on the new runqueue.
- */
- if (READ_ONCE(p->__state) == TASK_WAKING) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
- se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
- }
-
if (!task_on_rq_migrating(p)) {
remove_entity_load_avg(se);
task_fork_fair 也不需要调整vruntime,新版的place_entity 可以直接计算 se 的vruntime,所以删掉
值得注意的是,sysctl_sched_child_runs_first 本身是让 child 先于 parent 获得 CPU 的功能,eevdf 在这里将其直接删掉。
@@ -12492,22 +12440,9 @@ static void task_fork_fair(struct task_s
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
- if (curr) {
+ if (curr)
update_curr(cfs_rq);
- se->vruntime = curr->vruntime;
- }
place_entity(cfs_rq, se, 1);
-
- if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
- /*
- * Upon rescheduling, sched_class::put_prev_task() will place
- * 'current' within the tree based on its new key value.
- */
- swap(curr->vruntime, se->vruntime);
- resched_curr(rq);
- }
-
- se->vruntime -= cfs_rq->min_vruntime;
rq_unlock(rq, &rf);
}
函数 vruntime_normalized 直接删掉,其本意是判断一个任务的 vruntime 是否已经相对 min_vruntime 归一化;相应地,detach_task_cfs_rq 和 attach_task_cfs_rq 中依赖它的调整逻辑也一并删掉:
@@ -12634,16 +12541,6 @@ static void attach_entity_cfs_rq(struct static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - - if (!vruntime_normalized(p)) { - /* - * Fix up our vruntime so that the current sleep doesn't - * cause 'unlimited' sleep bonus. - */ - place_entity(cfs_rq, se, 0); - se->vruntime -= cfs_rq->min_vruntime; - } detach_entity_cfs_rq(se); } @@ -12651,12 +12548,8 @@ static void detach_task_cfs_rq(struct ta static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); attach_entity_cfs_rq(se); - - if (!vruntime_normalized(p)) - se->vruntime += cfs_rq->min_vruntime; }
这里直接删掉原cfs的 sysctl_sched_latency 和 sysctl_sched_wakeup_granularity
-unsigned int sysctl_sched_latency = 6000000ULL;
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int sysctl_sched_base_slice = 750000ULL;
删掉 LAST_BUDDY、SKIP_BUDDY 等 buddy 机制:
--- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -576,8 +576,6 @@ struct cfs_rq { */ struct sched_entity *curr; struct sched_entity *next; - struct sched_entity *last; - struct sched_entity *skip;
在CFS的时候,需要做特殊处理如下
但是这在eevdf都是不需要的,因为它只需要一个关键 deadline
@@ -1098,35 +961,25 @@ static void update_deadline(struct cfs_r if ((s64)(se->vruntime - se->deadline) < 0) return; - if (sched_feat(EEVDF)) { - /* - * For EEVDF the virtual time slope is determined by w_i (iow. - * nice) while the request time r_i is determined by - * sysctl_sched_min_granularity. - */ - se->slice = sysctl_sched_min_granularity; - - /* - * The task has consumed its request, reschedule. - */ - if (cfs_rq->nr_running > 1) { - resched_curr(rq_of(cfs_rq)); - clear_buddies(cfs_rq, se); - } - } else { - /* - * When many tasks blow up the sched_period; it is possible - * that sched_slice() reports unusually large results (when - * many tasks are very light for example). Therefore impose a - * maximum. - */ - se->slice = min_t(u64, sched_slice(cfs_rq, se), sysctl_sched_latency); - } + /* + * For EEVDF the virtual time slope is determined by w_i (iow. + * nice) while the request time r_i is determined by + * sysctl_sched_min_granularity. + */ + se->slice = sysctl_sched_min_granularity; /* * EEVDF: vd_i = ve_i + r_i / w_i */ se->deadline = se->vruntime + calc_delta_fair(se->slice, se); + + /* + * The task has consumed its request, reschedule. + */ + if (cfs_rq->nr_running > 1) { + resched_curr(rq_of(cfs_rq)); + clear_buddies(cfs_rq, se); + } }
根据上面的代码,可以看到,直接给slice为sysctl_sched_min_granularity,把之前一些兼容cfs的东西删掉
-/* - * Preempt the current task with a newly woken task if needed: - */ -static void -check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) -{ - unsigned long delta_exec; - struct sched_entity *se; - s64 delta; - - delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - if (delta_exec > curr->slice) { - resched_curr(rq_of(cfs_rq)); - /* - * The current task ran long enough, ensure it doesn't get - * re-elected due to buddy favours. - */ - clear_buddies(cfs_rq, curr); - return; - } - - /* - * Ensure that a task that missed wakeup preemption by a - * narrow margin doesn't have to wait for a full slice. - * This also mitigates buddy induced latencies under load. - */ - if (delta_exec < sysctl_sched_min_granularity) - return; - - se = __pick_first_entity(cfs_rq); - delta = curr->vruntime - se->vruntime; - - if (delta < 0) - return; - - if (delta > curr->slice) - resched_curr(rq_of(cfs_rq)); -} -
删掉cfs独有的check_preempt_tick
/* * Pick the next process, keeping these things in mind, in this order: * 1) keep things fair between processes/task groups @@ -5420,53 +5189,14 @@ wakeup_preempt_entity(struct sched_entit static struct sched_entity * pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - struct sched_entity *left, *se; - - if (sched_feat(EEVDF)) { - /* - * Enabling NEXT_BUDDY will affect latency but not fairness. - */ - if (sched_feat(NEXT_BUDDY) && - cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) - return cfs_rq->next; - - return pick_eevdf(cfs_rq); - } - - se = left = pick_cfs(cfs_rq, curr); - /* - * Avoid running the skip buddy, if running something else can - * be done without getting too unfair. + * Enabling NEXT_BUDDY will affect latency but not fairness. */ - if (cfs_rq->skip && cfs_rq->skip == se) { - struct sched_entity *second; + if (sched_feat(NEXT_BUDDY) && + cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) + return cfs_rq->next; - if (se == curr) { - second = __pick_first_entity(cfs_rq); - } else { - second = __pick_next_entity(se); - if (!second || (curr && entity_before(curr, second))) - second = curr; - } - - if (second && wakeup_preempt_entity(second, left) < 1) - se = second; - } - - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) { - /* - * Someone really wants this to run. If it's not unfair, run it. - */ - se = cfs_rq->next; - } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) { - /* - * Prefer last buddy, try to return the CPU to a preempted task. - */ - se = cfs_rq->last; - } - - return se; + return pick_eevdf(cfs_rq); }
pick_next_entity 全面删除cfs的逻辑,只留下eevdf的内容
其他一些删除的东西,这里略过一下。详细可以查看后面的参考链接
-unsigned int sysctl_sched_min_granularity = 750000ULL;
+unsigned int sysctl_sched_base_slice = 750000ULL;
重命名,其目的为:
min_granularity 暗示了 CFS 中时间片的最小保证;base_slice 则明确表示这是 EEVDF 中任务的基本请求时间。在 EEVDF 中,slice 不再是最小保证,而是计算虚拟 deadline 的基础参数:
se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
新增了 ENQUEUE_INITIAL 的flag
-static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
+static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
u64 vslice = calc_delta_fair(se->slice, se);
u64 vruntime = avg_vruntime(cfs_rq);
/*
* on average, halfway through their slice, as such start tasks
* off with half a slice to ease into the competition.
*/
- if (sched_feat(PLACE_DEADLINE_INITIAL) && initial)
+ if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
vslice /= 2;
这样 ENQUEUE_INITIAL 就和 ENQUEUE_WAKEUP、ENQUEUE_MIGRATED 平级了:
ENQUEUE_INITIAL 标志用于新 fork 的任务;ENQUEUE_WAKEUP 标志用于唤醒的任务;ENQUEUE_MIGRATED 标志用于迁移的任务。接下来引入 avg_slice 的概念:它和 avg_vruntime 类似,是各实体 slice 按负载权重的加权累加和。
--- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -642,6 +642,7 @@ avg_vruntime_add(struct cfs_rq *cfs_rq, s64 key = entity_key(cfs_rq, se); cfs_rq->avg_vruntime += key * weight; + cfs_rq->avg_slice += se->slice * weight; cfs_rq->avg_load += weight; } @@ -652,6 +653,7 @@ avg_vruntime_sub(struct cfs_rq *cfs_rq, s64 key = entity_key(cfs_rq, se); cfs_rq->avg_vruntime -= key * weight; + cfs_rq->avg_slice -= se->slice * weight; cfs_rq->avg_load -= weight; }
entity_has_slept 判断任务是否刚从睡眠中被唤醒:非 ENQUEUE_WAKEUP 的入队直接返回 false;带 ENQUEUE_MIGRATED 的唤醒(跨 CPU 迁移而来)直接视为已睡眠;其余情况则比较睡眠时长是否达到一个 slice。满足条件时,后面的 PLACE_FUDGE 逻辑会把其负的 lag 补足(上调至不超过 0):
static inline bool
entity_has_slept(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
u64 now;
if (!(flags & ENQUEUE_WAKEUP))
return false;
if (flags & ENQUEUE_MIGRATED)
return true;
now = rq_clock_task(rq_of(cfs_rq));
return (s64)(se->exec_start - now) >= se->slice;
}
添加一个启发式特性 PLACE_FUDGE:
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
@@ -4930,6 +4947,19 @@ place_entity(struct cfs_rq *cfs_rq, stru
lag = se->vlag;
/*
+ * For latency sensitive tasks; those that have a shorter than
+ * average slice and do not fully consume the slice, transition
+ * to EEVDF placement strategy #2.
+ */
+ if (sched_feat(PLACE_FUDGE) &&
+ (cfs_rq->avg_slice > se->slice * cfs_rq->avg_load) &&
+ entity_has_slept(cfs_rq, se, flags)) {
+ lag += vslice;
+ if (lag > 0)
+ lag = 0;
+ }
+
+ /*
* If we want to place a task and preserve lag, we have to
* consider the effect of the new entity on the weighted
* average and compensate for this, otherwise lag can quickly
这个算法的核心思想是:对于延迟敏感任务——其 slice 短于运行队列的加权平均 slice($slice \cdot avg\_load < avg\_slice$),且确实经历了睡眠——在放置时将其 lag 上调:

$$lag \leftarrow \min(lag + vslice,\ 0)$$

其中 $r_i$ 是任务的请求时间(slice),$vslice$ 是其虚拟化后的值。当任务睡眠时间超过 $r_i$ 时,我们把负的 lag 补足到不超过 0,确保它被唤醒后能够及时获得 CPU 时间——这对应 EEVDF 论文中的放置策略 #2。
总结一下这些补丁的核心内容。
https://lore.kernel.org/lkml/20230531124604.068911180@infradead.org/ https://lore.kernel.org/lkml/20230531124604.137187212@infradead.org/ https://lore.kernel.org/lkml/20230531124604.205287511@infradead.org/ https://lore.kernel.org/lkml/20230531124604.274010996@infradead.org/ https://lore.kernel.org/lkml/20230531124604.341527144@infradead.org/