之前介绍了eevdf的patchset-v1,现在继续介绍后续的提交。
引入 avg_vruntime() 之后,就不需要再使用更差的近似值了:新任务以0滞后点(0-lag point)作为插入起点。
在cfs中,新fork的任务如果直接以cfs_rq的min_vruntime作为vruntime,会因为vruntime过小而挤占当前周期内其他任务的运行时间,导致其他任务starvation,所以cfs提供了一个专属feature:START_DEBIT(给新任务的vruntime额外加上一个vslice)。在eevdf中,新任务放置的基准不再是原cfs的min_vruntime,而是avg_vruntime,所以这个feature自然失去了意义,相关代码也就被删掉了。
@@ -906,16 +906,6 @@ static u64 sched_slice(struct cfs_rq *cf return slice; } -/* - * We calculate the vruntime slice of a to-be-inserted task. - * - * vs = s/w - */ -static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - return calc_delta_fair(sched_slice(cfs_rq, se), se); -} - #include "pelt.h" #ifdef CONFIG_SMP @@ -4862,16 +4852,7 @@ static inline bool entity_is_long_sleepe static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { - u64 vruntime = cfs_rq->min_vruntime; - - /* - * The 'current' period is already promised to the current tasks, - * however the extra weight of the new task will slow them down a - * little, place the new task so that it fits in the slot that - * stays open at the end. - */ - if (initial && sched_feat(START_DEBIT)) - vruntime += sched_vslice(cfs_rq, se); + u64 vruntime = avg_vruntime(cfs_rq);
可以看到,eevdf的vruntime是获取了avg_vruntime,avg_vruntime的计算上一篇介绍过了。
这笔提交给se添加了lag概念
@@ -555,8 +555,9 @@ struct sched_entity { u64 exec_start; u64 sum_exec_runtime; - u64 vruntime; u64 prev_sum_exec_runtime; + u64 vruntime; + s64 vlag; u64 nr_migrations;
reweight的时候,vlag需要按新旧权重的比例重新计算,如下:
@@ -3492,6 +3501,8 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { + unsigned long old_weight = se->load.weight; + if (se->on_rq) { /* commit outstanding execution time */ if (cfs_rq->curr == se) @@ -3504,6 +3515,14 @@ static void reweight_entity(struct cfs_r update_load_set(&se->load, weight); + if (!se->on_rq) { + /* + * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), + * we need to scale se->vlag when w_i changes. + */ + se->vlag = div_s64(se->vlag * old_weight, weight); + } +
这里vlag = lag_i / w_i,所以reweight的时候,vlag需要更新。
我们在更新lag的时候,之前提到的公式是S-s,所以如下
/* * lag_i = S - s_i = w_i * (V - v_i) */ void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) { SCHED_WARN_ON(!se->on_rq); se->vlag = avg_vruntime(cfs_rq) - se->vruntime; }
因为lag是 w_i * (V - v_i),而vlag是vlag = lag_i / w_i,所以计算vlag就是直接 V-v_i
更新lag的时机是dequeue的时候(且带有DEQUEUE_SLEEP标志,即任务因睡眠而出队),所以如下
@@ -5066,6 +5155,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, st clear_buddies(cfs_rq, se); + if (flags & DEQUEUE_SLEEP) + update_entity_lag(cfs_rq, se); + if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0;
eevdf在client requeue的时候有三个策略,所以place_entity实现这些策略,这里只实现了策略1和策略2,我们先看看代码
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { u64 vruntime = avg_vruntime(cfs_rq); + s64 lag = 0; - /* sleeps up to a single latency don't count. */ - if (!initial) { - unsigned long thresh; + /* + * Due to how V is constructed as the weighted average of entities, + * adding tasks with positive lag, or removing tasks with negative lag + * will move 'time' backwards, this can screw around with the lag of + * other tasks. + * + * EEVDF: placement strategy #1 / #2 + */ + if (sched_feat(PLACE_LAG) && cfs_rq->nr_running > 1) {
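策略1,client 的场景:
client 离开
$$V(t) = V(t) + \frac{lag_j(t)}{\sum_{i \in A(t^+)} w_i}$$
client 加入
$$V(t) = V(t) - \frac{lag_j(t)}{\sum_{i \in A(t^+)} w_i}$$
client reweight(权重由 $w_j$ 变为 $w'_j$)
$$V(t) = V(t) + \frac{lag_j(t)}{\sum_{i \in A(t)} w_i - w_j} - \frac{lag_j(t)}{\sum_{i \in A(t)} w_i - w_j + w'_j}$$
其中 $A(t^+)$ 表示事件发生之后的活跃client集合,virtual time $V(t)$ 按以上公式来更新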
策略2:和策略1一致,但是client reweight不做任何操作,也就是reweight时不对virtual time做修正
我们看一下代码
/* * If we want to place a task and preserve lag, we have to * consider the effect of the new entity on the weighted * average and compensate for this, otherwise lag can quickly * evaporate. * * Lag is defined as: * * lag_i = S - s_i = w_i * (V - v_i) * * To avoid the 'w_i' term all over the place, we only track * the virtual lag: * * vl_i = V - v_i <=> v_i = V - vl_i * * And we take V to be the weighted average of all v: * * V = (\Sum w_j*v_j) / W * * Where W is: \Sum w_j * * Then, the weighted average after adding an entity with lag * vl_i is given by: * * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i) * = (W*V + w_i*(V - vl_i)) / (W + w_i) * = (W*V + w_i*V - w_i*vl_i) / (W + w_i) * = (V*(W + w_i) - w_i*l) / (W + w_i) * = V - w_i*vl_i / (W + w_i) * * And the actual lag after adding an entity with vl_i is: * * vl'_i = V' - v_i * = V - w_i*vl_i / (W + w_i) - (V - vl_i) * = vl_i - w_i*vl_i / (W + w_i) * * Which is strictly less than vl_i. So in order to preserve lag * we should inflate the lag before placement such that the * effective lag after placement comes out right. * * As such, invert the above relation for vl'_i to get the vl_i * we need to use such that the lag after placement is the lag * we computed before dequeue. * * vl'_i = vl_i - w_i*vl_i / (W + w_i) * = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i) * * (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i * = W*vl_i * * vl_i = (W + w_i)*vl'_i / W */ load = cfs_rq->avg_load; if (curr && curr->on_rq) load += curr->load.weight; lag *= load + se->load.weight; if (WARN_ON_ONCE(!load)) load = 1; lag = div_s64(lag, load); vruntime -= lag; }
假设V为任务i加入前的virtual time,那么V此时的公式为
$$V = \frac{\sum_j w_j \cdot v_j}{W}, \quad W = \sum_j w_j$$
此时任务i加入了,那么我们要计算的新的V应该是
$$V' = \frac{\sum_j w_j \cdot v_j + w_i \cdot v_i}{W + w_i}$$
而lag是
lag_i = S - s_i = w_i * (V - v_i)
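令 $vl_i = V - v_i$,即 $v_i = V - vl_i$,所以新的V公式为
$$\begin{aligned} V' &= \frac{W \cdot V + w_i \cdot (V - vl_i)}{W + w_i} \\ &= \frac{V \cdot (W + w_i) - w_i \cdot vl_i}{W + w_i} \\ &= V - \frac{w_i \cdot vl_i}{W + w_i} \end{aligned}$$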
这样可以计算任务i加入后的实际vlag,即 $vl'_i$,如下
$$\begin{aligned} vl'_i &= V' - v_i \\ &= V - \frac{w_i \cdot vl_i}{W + w_i} - (V - vl_i) \\ &= vl_i - \frac{w_i \cdot vl_i}{W + w_i} \end{aligned}$$
两边乘以 $W + w_i$,得到
$$(W + w_i) \cdot vl'_i = (W + w_i) \cdot vl_i - w_i \cdot vl_i = W \cdot vl_i$$
这样求出来的就是放置时需要使用的lag值:
$$vl_i = \frac{(W + w_i) \cdot vl'_i}{W}$$
又因为
$$v_i = V - vl_i$$
所以可以通过vruntime -= lag求出vruntime值
这笔patch的重点是lag的计算,我们需要反复地结合论文的公式和内核注释来进行分析。论文中提到了策略1和策略2:内核默认开启PLACE_LAG feature,对应策略1(跨sleep/wakeup保留lag);关闭PLACE_LAG后则是策略2(以0 lag放置)。
https://lore.kernel.org/lkml/20230531124603.722361178@infradead.org/ https://lore.kernel.org/lkml/20230531124603.794929315@infradead.org/