继续分析eevdf的提交patch。
首先,enqueue_entity 不再需要通过加回 min_vruntime 来归一化 vruntime,而是直接调用之前补丁实现的 place_entity:
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
bool curr = cfs_rq->curr == se;
/*
* If we're the current task, we must renormalise before calling
* update_curr().
*/
- if (renorm && curr)
- se->vruntime += cfs_rq->min_vruntime;
+ if (curr)
+ place_entity(cfs_rq, se, 0);
dequeue这部分,也是删除了之前cfs的自减min_vruntime,这个之前文章提过。
也因此,enqueue 时不再只在 ENQUEUE_WAKEUP 的情况下才调用 place_entity;与之对应,dequeue 时也要无条件地执行 update_entity_lag,记录实体此刻的 lag:
@@ -5335,23 +5306,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
clear_buddies(cfs_rq, se);
- if (flags & DEQUEUE_SLEEP)
- update_entity_lag(cfs_rq, se);
-
+ update_entity_lag(cfs_rq, se);
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
account_entity_dequeue(cfs_rq, se);
- /*
- * Normalize after update_curr(); which will also have moved
- * min_vruntime if @se is the one holding it back. But before doing
- * update_min_vruntime() again, which will discount @se's position and
- * can move min_vruntime forward still more.
- */
- if (!(flags & DEQUEUE_SLEEP))
- se->vruntime -= cfs_rq->min_vruntime;
-
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
migrate_task_rq_fair 在 CFS 中需要在迁移时对 vruntime 做减去旧队列 min_vruntime 的调整(加回新队列的由 enqueue 完成);改用 eevdf 后这段逻辑直接删掉:
@@ -8102,18 +8062,6 @@ static void migrate_task_rq_fair(struct
{
struct sched_entity *se = &p->se;
- /*
- * As blocked tasks retain absolute vruntime the migration needs to
- * deal with this by subtracting the old and adding the new
- * min_vruntime -- the latter is done by enqueue_entity() when placing
- * the task on the new runqueue.
- */
- if (READ_ONCE(p->__state) == TASK_WAKING) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
- se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
- }
-
if (!task_on_rq_migrating(p)) {
remove_entity_load_avg(se);
task_fork_fair 也不需要调整vruntime,新版的place_entity 可以直接计算 se 的vruntime,所以删掉
值得注意的是,sysctl_sched_child_runs_first 本身是让 child 先于 parent 获得 CPU 的功能,eevdf 在这里将其直接删掉。
@@ -12492,22 +12440,9 @@ static void task_fork_fair(struct task_s
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
- if (curr) {
+ if (curr)
update_curr(cfs_rq);
- se->vruntime = curr->vruntime;
- }
place_entity(cfs_rq, se, 1);
-
- if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
- /*
- * Upon rescheduling, sched_class::put_prev_task() will place
- * 'current' within the tree based on its new key value.
- */
- swap(curr->vruntime, se->vruntime);
- resched_curr(rq);
- }
-
- se->vruntime -= cfs_rq->min_vruntime;
rq_unlock(rq, &rf);
}
函数 vruntime_normalized 直接删掉,其本意是判断一个任务的 vruntime 是否已经相对 min_vruntime 归一化;相应地,detach_task_cfs_rq 和 attach_task_cfs_rq 中依赖它的调整逻辑也一并删掉:
@@ -12634,16 +12541,6 @@ static void attach_entity_cfs_rq(struct static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - - if (!vruntime_normalized(p)) { - /* - * Fix up our vruntime so that the current sleep doesn't - * cause 'unlimited' sleep bonus. - */ - place_entity(cfs_rq, se, 0); - se->vruntime -= cfs_rq->min_vruntime; - } detach_entity_cfs_rq(se); } @@ -12651,12 +12548,8 @@ static void detach_task_cfs_rq(struct ta static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); attach_entity_cfs_rq(se); - - if (!vruntime_normalized(p)) - se->vruntime += cfs_rq->min_vruntime; }
这里直接删掉原cfs的 sysctl_sched_latency 和 sysctl_sched_wakeup_granularity
-unsigned int sysctl_sched_latency = 6000000ULL;
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int sysctl_sched_base_slice = 750000ULL;
删掉 LAST_BUDDY、SKIP_BUDDY 等 buddy 机制:
--- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -576,8 +576,6 @@ struct cfs_rq { */ struct sched_entity *curr; struct sched_entity *next; - struct sched_entity *last; - struct sched_entity *skip;
在CFS的时候,需要做特殊处理如下
但是这在eevdf都是不需要的,因为它只需要一个关键 deadline
@@ -1098,35 +961,25 @@ static void update_deadline(struct cfs_r if ((s64)(se->vruntime - se->deadline) < 0) return; - if (sched_feat(EEVDF)) { - /* - * For EEVDF the virtual time slope is determined by w_i (iow. - * nice) while the request time r_i is determined by - * sysctl_sched_min_granularity. - */ - se->slice = sysctl_sched_min_granularity; - - /* - * The task has consumed its request, reschedule. - */ - if (cfs_rq->nr_running > 1) { - resched_curr(rq_of(cfs_rq)); - clear_buddies(cfs_rq, se); - } - } else { - /* - * When many tasks blow up the sched_period; it is possible - * that sched_slice() reports unusually large results (when - * many tasks are very light for example). Therefore impose a - * maximum. - */ - se->slice = min_t(u64, sched_slice(cfs_rq, se), sysctl_sched_latency); - } + /* + * For EEVDF the virtual time slope is determined by w_i (iow. + * nice) while the request time r_i is determined by + * sysctl_sched_min_granularity. + */ + se->slice = sysctl_sched_min_granularity; /* * EEVDF: vd_i = ve_i + r_i / w_i */ se->deadline = se->vruntime + calc_delta_fair(se->slice, se); + + /* + * The task has consumed its request, reschedule. + */ + if (cfs_rq->nr_running > 1) { + resched_curr(rq_of(cfs_rq)); + clear_buddies(cfs_rq, se); + } }
根据上面的代码,可以看到,直接给slice为sysctl_sched_min_granularity,把之前一些兼容cfs的东西删掉
-/* - * Preempt the current task with a newly woken task if needed: - */ -static void -check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) -{ - unsigned long delta_exec; - struct sched_entity *se; - s64 delta; - - delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - if (delta_exec > curr->slice) { - resched_curr(rq_of(cfs_rq)); - /* - * The current task ran long enough, ensure it doesn't get - * re-elected due to buddy favours. - */ - clear_buddies(cfs_rq, curr); - return; - } - - /* - * Ensure that a task that missed wakeup preemption by a - * narrow margin doesn't have to wait for a full slice. - * This also mitigates buddy induced latencies under load. - */ - if (delta_exec < sysctl_sched_min_granularity) - return; - - se = __pick_first_entity(cfs_rq); - delta = curr->vruntime - se->vruntime; - - if (delta < 0) - return; - - if (delta > curr->slice) - resched_curr(rq_of(cfs_rq)); -} -
删掉cfs独有的check_preempt_tick
/* * Pick the next process, keeping these things in mind, in this order: * 1) keep things fair between processes/task groups @@ -5420,53 +5189,14 @@ wakeup_preempt_entity(struct sched_entit static struct sched_entity * pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - struct sched_entity *left, *se; - - if (sched_feat(EEVDF)) { - /* - * Enabling NEXT_BUDDY will affect latency but not fairness. - */ - if (sched_feat(NEXT_BUDDY) && - cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) - return cfs_rq->next; - - return pick_eevdf(cfs_rq); - } - - se = left = pick_cfs(cfs_rq, curr); - /* - * Avoid running the skip buddy, if running something else can - * be done without getting too unfair. + * Enabling NEXT_BUDDY will affect latency but not fairness. */ - if (cfs_rq->skip && cfs_rq->skip == se) { - struct sched_entity *second; + if (sched_feat(NEXT_BUDDY) && + cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) + return cfs_rq->next; - if (se == curr) { - second = __pick_first_entity(cfs_rq); - } else { - second = __pick_next_entity(se); - if (!second || (curr && entity_before(curr, second))) - second = curr; - } - - if (second && wakeup_preempt_entity(second, left) < 1) - se = second; - } - - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) { - /* - * Someone really wants this to run. If it's not unfair, run it. - */ - se = cfs_rq->next; - } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) { - /* - * Prefer last buddy, try to return the CPU to a preempted task. - */ - se = cfs_rq->last; - } - - return se; + return pick_eevdf(cfs_rq); }
pick_next_entity 全面删除cfs的逻辑,只留下eevdf的内容
其他一些删除的东西,这里略过一下。详细可以查看后面的参考链接
-unsigned int sysctl_sched_min_granularity = 750000ULL;
+unsigned int sysctl_sched_base_slice = 750000ULL;
重命名,其目的为:
min_granularity 暗示了 CFS 中时间片的最小保证;base_slice 则明确表示这是 EEVDF 中任务的基本请求时间。在 EEVDF 中,slice 不再是最小保证,而是计算虚拟 deadline 的基础参数:
se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
新增了 ENQUEUE_INITIAL 的flag
-static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
+static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
u64 vslice = calc_delta_fair(se->slice, se);
u64 vruntime = avg_vruntime(cfs_rq);
/*
* on average, halfway through their slice, as such start tasks
* off with half a slice to ease into the competition.
*/
- if (sched_feat(PLACE_DEADLINE_INITIAL) && initial)
+ if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
vslice /= 2;
这样 ENQUEUE_INITIAL 就和 ENQUEUE_WAKEUP、ENQUEUE_MIGRATED 平级了:
ENQUEUE_INITIAL 标志用于新 fork 的任务;ENQUEUE_WAKEUP 标志用于唤醒的任务;ENQUEUE_MIGRATED 标志用于迁移的任务。接下来引入 avg_slice 的概念:它和 avg_vruntime 类似,是各实体 slice 按负载权重的加权累加和。
--- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -642,6 +642,7 @@ avg_vruntime_add(struct cfs_rq *cfs_rq, s64 key = entity_key(cfs_rq, se); cfs_rq->avg_vruntime += key * weight; + cfs_rq->avg_slice += se->slice * weight; cfs_rq->avg_load += weight; } @@ -652,6 +653,7 @@ avg_vruntime_sub(struct cfs_rq *cfs_rq, s64 key = entity_key(cfs_rq, se); cfs_rq->avg_vruntime -= key * weight; + cfs_rq->avg_slice -= se->slice * weight; cfs_rq->avg_load -= weight; }
entity_has_slept 判断任务是否刚从睡眠中被唤醒:非 ENQUEUE_WAKEUP 的入队直接返回 false;带 ENQUEUE_MIGRATED 的唤醒(跨 CPU 迁移而来)直接视为已睡眠;其余情况则比较睡眠时长是否达到一个 slice。满足条件时,后面的 PLACE_FUDGE 逻辑会把其负的 lag 补足(上调至不超过 0):
static inline bool
entity_has_slept(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
u64 now;
if (!(flags & ENQUEUE_WAKEUP))
return false;
if (flags & ENQUEUE_MIGRATED)
return true;
now = rq_clock_task(rq_of(cfs_rq));
return (s64)(se->exec_start - now) >= se->slice;
}
添加一个启发式特性 PLACE_FUDGE:
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
@@ -4930,6 +4947,19 @@ place_entity(struct cfs_rq *cfs_rq, stru
lag = se->vlag;
/*
+ * For latency sensitive tasks; those that have a shorter than
+ * average slice and do not fully consume the slice, transition
+ * to EEVDF placement strategy #2.
+ */
+ if (sched_feat(PLACE_FUDGE) &&
+ (cfs_rq->avg_slice > se->slice * cfs_rq->avg_load) &&
+ entity_has_slept(cfs_rq, se, flags)) {
+ lag += vslice;
+ if (lag > 0)
+ lag = 0;
+ }
+
+ /*
* If we want to place a task and preserve lag, we have to
* consider the effect of the new entity on the weighted
* average and compensate for this, otherwise lag can quickly
这个算法的核心思想是:对于延迟敏感任务——其 slice 短于运行队列的加权平均 slice($slice \cdot avg\_load < avg\_slice$),且确实经历了睡眠——在放置时将其 lag 上调:

$$lag \leftarrow \min(lag + vslice,\ 0)$$

其中 $r_i$ 是任务的请求时间(slice),$vslice$ 是其虚拟化后的值。当任务睡眠时间超过 $r_i$ 时,我们把负的 lag 补足到不超过 0,确保它被唤醒后能够及时获得 CPU 时间——这对应 EEVDF 论文中的放置策略 #2。
总结一下这些补丁的核心内容。
https://lore.kernel.org/lkml/20230531124604.068911180@infradead.org/ https://lore.kernel.org/lkml/20230531124604.137187212@infradead.org/ https://lore.kernel.org/lkml/20230531124604.205287511@infradead.org/ https://lore.kernel.org/lkml/20230531124604.274010996@infradead.org/ https://lore.kernel.org/lkml/20230531124604.341527144@infradead.org/