现在基于EEVDF的社区patch进行逐一解析,尽可能抓住重点来介绍eevdf
添加两个参数avg_vruntime 和 avg_load
@@ -554,6 +554,9 @@ struct cfs_rq { unsigned int idle_nr_running; /* SCHED_IDLE */ unsigned int idle_h_nr_running; /* SCHED_IDLE */ + s64 avg_vruntime; + u64 avg_load; + u64 exec_clock; u64 min_vruntime;
核心说明
/* * Compute virtual time from the per-task service numbers: * * Fair schedulers conserve lag: * + * \Sum lag_i = 0 * * Where lag_i is given by: * * lag_i = S - s_i = w_i * (V - v_i) * * Where S is the ideal service time and V is its virtual time counterpart. */
这个注释很容易理解,上篇论文提过,lag_i = S - s_i;因为引入了虚拟时间 V,所以可以写成 lag_i = w_i * (V - v_i)。又因为公平调度器要求 \Sum lag_i = 0,所以可以转换公式如下
/* Therefore: * * \Sum lag_i = 0 * \Sum w_i * (V - v_i) = 0 * \Sum w_i * V - w_i * v_i = 0 * * From which we can solve an expression for V in v_i (which we have in * se->vruntime): * * \Sum v_i * w_i \Sum v_i * w_i * V = -------------- = -------------- * \Sum w_i W * * Specifically, this is the weighted average of all entity virtual runtimes. */
因为 \Sum lag_i = 0,所以解出的 V(所有实体虚拟运行时间的加权平均)公式是
/* * [[ NOTE: this is only equal to the ideal scheduler under the condition * that join/leave operations happen at lag_i = 0, otherwise the * virtual time has non-contiguous motion equivalent to: * * V +-= lag_i / W * * Also see the comment in place_entity() that deals with this. ]] * * However, since v_i is u64, and the multiplication could easily overflow * transform it into a relative form that uses smaller quantities: * * Substitute: v_i == (v_i - v0) + v0 * * \Sum ((v_i - v0) + v0) * w_i \Sum (v_i - v0) * w_i * V = ---------------------------- = --------------------- + v0 * W W * * Which we track using: * * v0 := cfs_rq->min_vruntime * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime * \Sum w_i := cfs_rq->avg_load * * Since min_vruntime is a monotonic increasing variable that closely tracks * the per-task service, these deltas: (v_i - v), will be in the order of the * maximal (virtual) lag induced in the system due to quantisation. * * Also, we use scale_load_down() to reduce the size. * * As measured, the max (key * weight) value was ~44 bits for a kernel build. */
为了避免u64的overflow,所以引用一个替代v_i == (v_i - v0) + v0,代入原公式如下
这里将公式对接到代码上
v0 = cfs_rq->min_vruntime sum(v_i - v0) * w_i = cfs_rq->avg_vruntime sum(w_i) = cfs_rq->avg_load
对于v_i - v0的函数如下
/*
 * Return this entity's vruntime relative to the runqueue's min_vruntime,
 * i.e. the (v_i - v0) term used by the avg_vruntime bookkeeping.
 * The unsigned subtraction cast to s64 keeps the result correct even if
 * the monotonically increasing vruntime counters wrap around.
 */
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return (s64)(se->vruntime - cfs_rq->min_vruntime);
}
那么cfs_rq->avg_vruntime和cfs_rq->avg_load的计算函数就是
/*
 * Fold @se into the runqueue's weighted-average bookkeeping:
 *
 *	avg_vruntime += (v_i - v0) * w_i
 *	avg_load     += w_i
 */
static void avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	unsigned long w = scale_load_down(se->load.weight);

	cfs_rq->avg_load += w;
	cfs_rq->avg_vruntime += entity_key(cfs_rq, se) * w;
}

/*
 * Remove @se's contribution from the weighted-average bookkeeping;
 * the exact inverse of avg_vruntime_add().
 */
static void avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	unsigned long w = scale_load_down(se->load.weight);

	cfs_rq->avg_load -= w;
	cfs_rq->avg_vruntime -= entity_key(cfs_rq, se) * w;
}
随着时间的推移,min_vruntime会不断更新,所以函数如下
/*
 * Advance min_vruntime to @vruntime if that moves it forward, rebasing
 * avg_vruntime by the same amount so the relative keys stay consistent.
 * Returns the (possibly updated) min_vruntime.
 */
static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
{
	u64 cur = cfs_rq->min_vruntime;
	/* open coded max_vruntime() so avg_vruntime can be rebased */
	s64 fwd = (s64)(vruntime - cur);

	if (fwd <= 0)
		return cur;

	avg_vruntime_update(cfs_rq, fwd);
	return vruntime;
}
同样的avg_vruntime 也会随着时间更新,所以函数如下
/*
 * Rebase avg_vruntime when the reference point v0 (min_vruntime) moves
 * forward by @delta:
 *
 *	v0' = v0 + d  ==>  avg_vruntime' = avg_vruntime - d * avg_load
 *
 * since every stored (v_i - v0) * w_i term shrinks by d * w_i.
 */
static inline void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
{
	cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
}
还记得计算V的公式吗,如下
其实就是
V = cfs_rq->avg_vruntime / cfs_rq->avg_load + cfs_rq->min_vruntime
所以对于V的函数实现如下
/*
 * Compute V, the load-weighted average vruntime of all runnable entities:
 *
 *	V = avg_vruntime / avg_load + min_vruntime
 *
 * The currently running entity is kept outside the rbtree bookkeeping,
 * so its contribution is folded in on the fly here.
 */
u64 avg_vruntime(struct cfs_rq *cfs_rq)
{
	struct sched_entity *running = cfs_rq->curr;
	s64 sum = cfs_rq->avg_vruntime;
	long total = cfs_rq->avg_load;

	if (running && running->on_rq) {
		unsigned long w = scale_load_down(running->load.weight);

		sum += entity_key(cfs_rq, running) * w;
		total += w;
	}

	if (total)
		sum = div_s64(sum, total);

	return cfs_rq->min_vruntime + sum;
}
和之前 CFS 维护 min_vruntime 的思路一致,这两个统计量必须在实体进出就绪队列时同步维护:__enqueue_entity 中调用 avg_vruntime_add,__dequeue_entity 中调用 avg_vruntime_sub;reweight_entity 改变权重时也要先 sub 再以新权重 add 回去,如下
@@ -642,12 +767,14 @@ static inline bool __entity_less(struct */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { + avg_vruntime_add(cfs_rq, se); rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less); } static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); + avg_vruntime_sub(cfs_rq, se); } struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) @@ -3379,6 +3506,8 @@ static void reweight_entity(struct cfs_r /* commit outstanding execution time */ if (cfs_rq->curr == se) update_curr(cfs_rq); + else + avg_vruntime_sub(cfs_rq, se); update_load_sub(&cfs_rq->load, se->load.weight); } dequeue_load_avg(cfs_rq, se); @@ -3394,9 +3523,11 @@ static void reweight_entity(struct cfs_r #endif enqueue_load_avg(cfs_rq, se); - if (se->on_rq) + if (se->on_rq) { update_load_add(&cfs_rq->load, se->load.weight); - + if (cfs_rq->curr != se) + avg_vruntime_add(cfs_rq, se); + } }
这里介绍了 eevdf 的第一个 patch,其作用是添加了 avg_vruntime 和 avg_load 两个字段,并结合论文给出了加权平均虚拟时间 V 的计算函数
https://lore.kernel.org/lkml/20230531115839.089944915@infradead.org/ https://lore.kernel.org/lkml/20230531124603.654144274@infradead.org/