之前介绍了eevdf的patchset-v1，现在继续介绍

[PATCH 02/15] sched/fair: Remove START_DEBIT

引入 avg_vruntime() 就不需要使用更差的了近似值。以0滞后点为插入起点新任务。

在cfs中，为了解决任务fork时cfs的min_vruntime过小导致其他任务starvation，所以提供了一个cfs的转述feature：START_DEBIT，在eevdf中se->vruntime不再是原cfs的min_vruntime了，所以这个feature天然消失，消失就删掉


@@ -906,16 +906,6 @@ static u64 sched_slice(struct cfs_rq *cf
 	return slice;
 }
 
-/*
- * We calculate the vruntime slice of a to-be-inserted task.
- *
- * vs = s/w
- */
-static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-	return calc_delta_fair(sched_slice(cfs_rq, se), se);
-}
-
 #include "pelt.h"
 #ifdef CONFIG_SMP
 
@@ -4862,16 +4852,7 @@ static inline bool entity_is_long_sleepe
 static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
-	u64 vruntime = cfs_rq->min_vruntime;
-
-	/*
-	 * The 'current' period is already promised to the current tasks,
-	 * however the extra weight of the new task will slow them down a
-	 * little, place the new task so that it fits in the slot that
-	 * stays open at the end.
-	 */
-	if (initial && sched_feat(START_DEBIT))
-		vruntime += sched_vslice(cfs_rq, se);
+	u64 vruntime = avg_vruntime(cfs_rq);

可以看到，eevdf的vruntime是获取了avg_vruntime，avg_vruntime的计算上一篇介绍过了。

[PATCH 03/15] sched/fair: Add lag based placement

这笔提交给se添加了lag概念


@@ -555,8 +555,9 @@ struct sched_entity {
 
 	u64				exec_start;
 	u64				sum_exec_runtime;
-	u64				vruntime;
 	u64				prev_sum_exec_runtime;
+	u64				vruntime;
+	s64				vlag;
 
 	u64				nr_migrations;

reweight的时候，lag需要重新计算，


@@ -3492,6 +3501,8 @@ dequeue_load_avg(struct cfs_rq *cfs_rq,
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			    unsigned long weight)
 {
+	unsigned long old_weight = se->load.weight;
+
 	if (se->on_rq) {
 		/* commit outstanding execution time */
 		if (cfs_rq->curr == se)
@@ -3504,6 +3515,14 @@ static void reweight_entity(struct cfs_r
 
 	update_load_set(&se->load, weight);
 
+	if (!se->on_rq) {
+		/*
+		 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
+		 * we need to scale se->vlag when w_i changes.
+		 */
+		se->vlag = div_s64(se->vlag * old_weight, weight);
+	}
+

这里vlag = lag_i / w_i，所以reweight的时候，vlag需要更新。

我们在更新lag的时候，之前提到的公式是S-s，所以如下


/*
 * lag_i = S - s_i = w_i * (V - v_i)
 */
void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	SCHED_WARN_ON(!se->on_rq);
	se->vlag = avg_vruntime(cfs_rq) - se->vruntime;
}

因为lag是 w_i * (V - v_i)，而vlag是vlag = lag_i / w_i，所以计算vlag就是直接 V-v_i

更新lag的实际是dequeue的时候，所以如下


@@ -5066,6 +5155,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
 
 	clear_buddies(cfs_rq, se);
 
+	if (flags & DEQUEUE_SLEEP)
+		update_entity_lag(cfs_rq, se);
+
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	se->on_rq = 0;

eevdf在client requeue的时候有三个策略，所以place_entity实现这些策略，这里只实现了策略1和策略2，我们先看看代码


 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
 	u64 vruntime = avg_vruntime(cfs_rq);
+	s64 lag = 0;
 
-	/* sleeps up to a single latency don't count. */
-	if (!initial) {
-		unsigned long thresh;
+	/*
+	 * Due to how V is constructed as the weighted average of entities,
+	 * adding tasks with positive lag, or removing tasks with negative lag
+	 * will move 'time' backwards, this can screw around with the lag of
+	 * other tasks.
+	 *
+	 * EEVDF: placement strategy #1 / #2
+	 */
+	if (sched_feat(PLACE_LAG) && cfs_rq->nr_running > 1) {

策略1

client 的场景:

client 离开

V(t) = V(t) + \frac{lag_j(t)}{\sum_{i\in{A(t^+)}}w_i}

client 加入

V(t) = V(t) - \frac{lag_j(t)}{\sum_{i\in{A(t^+)}}w_i}

client reweight

V(t) = V(t) + \frac{lag_j(t)}{\sum_{i\in{A(t^+)}}w_i - w_j} + \frac{lag_j(t)}{\sum_{i\in{A(t^+)}}w_i + w_j^{'}}

来更新

策略2

和策略1一致，但是client reweight不做任何操作，也就是

V(t) = V(t)

我们看一下代码


 		/*
		 * If we want to place a task and preserve lag, we have to
		 * consider the effect of the new entity on the weighted
		 * average and compensate for this, otherwise lag can quickly
		 * evaporate.
		 *
		 * Lag is defined as:
		 *
		 *   lag_i = S - s_i = w_i * (V - v_i)
		 *
		 * To avoid the 'w_i' term all over the place, we only track
		 * the virtual lag:
		 *
		 *   vl_i = V - v_i <=> v_i = V - vl_i
		 *
		 * And we take V to be the weighted average of all v:
		 *
		 *   V = (\Sum w_j*v_j) / W
		 *
		 * Where W is: \Sum w_j
		 *
		 * Then, the weighted average after adding an entity with lag
		 * vl_i is given by:
		 *
		 *   V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
		 *      = (W*V + w_i*(V - vl_i)) / (W + w_i)
		 *      = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
		 *      = (V*(W + w_i) - w_i*l) / (W + w_i)
		 *      = V - w_i*vl_i / (W + w_i)
		 *
		 * And the actual lag after adding an entity with vl_i is:
		 *
		 *   vl'_i = V' - v_i
		 *         = V - w_i*vl_i / (W + w_i) - (V - vl_i)
		 *         = vl_i - w_i*vl_i / (W + w_i)
		 *
		 * Which is strictly less than vl_i. So in order to preserve lag
		 * we should inflate the lag before placement such that the
		 * effective lag after placement comes out right.
		 *
		 * As such, invert the above relation for vl'_i to get the vl_i
		 * we need to use such that the lag after placement is the lag
		 * we computed before dequeue.
		 *
		 *   vl'_i = vl_i - w_i*vl_i / (W + w_i)
		 *         = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
		 *
		 *   (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
		 *                   = W*vl_i
		 *
		 *   vl_i = (W + w_i)*vl'_i / W
 		 */
		load = cfs_rq->avg_load;
		if (curr && curr->on_rq)
			load += curr->load.weight;

		lag *= load + se->load.weight;
		if (WARN_ON_ONCE(!load))
			load = 1;
		lag = div_s64(lag, load);

		vruntime -= lag;
 	}

假设V为任务i加入前的virtual time，那么V此时的公式为

V = \frac{\sum_j w_j*v_j}{W} W = \sum_j w_j

此时任务i加入了，那么我们要计算的新的V应该是

V(t) = V(t) - \frac{lag_j(t)}{\sum_{i\in{A(t^+)}}w_i}

而lag是


lag_i = S - s_i = w_i * (V - v_i)

所以新的V公式为

V' = (\sum_j w_j*v_j + w_i*v_i) / (W + w_i) \\ = (W*V + w_i*(V - vl_i)) / (W + w_i) \\ = (W*V + w_i*V - w_i*vl_i) / (W + w_i) \\ = (V*(W + w_i) - w_i*l) / (W + w_i) \\ = V - w_i*vl_i / (W + w_i)

这样可以计算i加入后的lag v_i，如下


vl'_i = V' - v_i \\
	  = V - w_i*vl_i / (W + w_i) - (V - vl_i) \\
      = vl_i - w_i*vl_i / (W + w_i)

两边乘以W + w_i

(W + w_i) vl'_i = (W + w_i) vl_i - w_i*vl_i (W + w_i) vl'_i = W * vl_i vl_i = \frac{(W + w_i) vl'_i}{W}

这样求出来就是lag值，又因为

v_i = V - vl_i

所以可以通过vruntime -= lag求出vruntime值

总结

这笔patch突出了lag的计算，我们需要反复的结合论文的公式和内核注释来进行分析，论文中提到了策略1和策略2，而内核默认是策略2，也可以通过feature启用策略1。

参考

https://lore.kernel.org/lkml/20230531124603.722361178@infradead.org/ https://lore.kernel.org/lkml/20230531124603.794929315@infradead.org/

目录

[PATCH 02/15] sched/fair: Remove START_DEBIT

[PATCH 03/15] sched/fair: Add lag based placement

策略1

策略2

总结

参考