下面继续分析其他的patch
这里为 task_struct 新增了 latency_prio 字段,它与决定 weight 的 nice value 类似:latency_prio 越低,表示任务对时延的要求越高。
--- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -791,6 +791,7 @@ struct task_struct { int static_prio; int normal_prio; unsigned int rt_priority; + int latency_prio; struct sched_entity se; struct sched_rt_entity rt;
同时新增一个 SCHED_FLAG_LATENCY_NICE 标志,表示用户态可以通过 sched_setattr 来设置 sched_latency_nice
--- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -132,6 +132,7 @@ struct clone_args { #define SCHED_FLAG_KEEP_PARAMS 0x10 #define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 #define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 +#define SCHED_FLAG_LATENCY_NICE 0x80 #define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ SCHED_FLAG_KEEP_PARAMS) @@ -143,6 +144,7 @@ struct clone_args { SCHED_FLAG_RECLAIM | \ SCHED_FLAG_DL_OVERRUN | \ SCHED_FLAG_KEEP_ALL | \ - SCHED_FLAG_UTIL_CLAMP) + SCHED_FLAG_UTIL_CLAMP | \ + SCHED_FLAG_LATENCY_NICE) #endif /* _UAPI_LINUX_SCHED_H */
sched_latency_nice的范围是 -20 至 19
/* * Extended scheduling parameters data structure. @@ -98,6 +99,22 @@ struct sched_param { * scheduled on a CPU with no more capacity than the specified value. * * A task utilization boundary can be reset by setting the attribute to -1. + * + * Latency Tolerance Attributes + * =========================== + * + * A subset of sched_attr attributes allows to specify the relative latency + * requirements of a task with respect to the other tasks running/queued in the + * system. + * + * @ sched_latency_nice task's latency_nice value + * + * The latency_nice of a task can have any value in a range of + * [MIN_LATENCY_NICE..MAX_LATENCY_NICE]. + * + * A task with latency_nice with the value of LATENCY_NICE_MIN can be + * taken for a task requiring a lower latency as opposed to the task with + * higher latency_nice. */ struct sched_attr { __u32 size; @@ -120,6 +137,8 @@ struct sched_attr { __u32 sched_util_min; __u32 sched_util_max; + /* latency requirement hints */ + __s32 sched_latency_nice; }; #endif /* _UAPI_LINUX_SCHED_TYPES_H */
其初始值是 DEFAULT_PRIO
--- a/init/init_task.c +++ b/init/init_task.c @@ -78,6 +78,7 @@ struct task_struct init_task .prio = MAX_PRIO - 20, .static_prio = MAX_PRIO - 20, .normal_prio = MAX_PRIO - 20, + .latency_prio = DEFAULT_PRIO, .policy = SCHED_NORMAL, .cpus_ptr = &init_task.cpus_mask, .user_cpus_ptr = NULL, @@ -89,7 +90,7 @@ struct task_struct init_task .fn = do_no_restart_syscall, },
fork 的时候,子任务的 latency_prio 会被重置为 NICE_TO_PRIO(0),即 latency nice 为 0 对应的优先级(与 init_task 使用的 DEFAULT_PRIO 相同)
--- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4719,6 +4719,8 @@ int sched_fork(unsigned long clone_flags p->prio = p->normal_prio = p->static_prio; set_load_weight(p, false); + p->latency_prio = NICE_TO_PRIO(0); + /* * We don't need the reset flag anymore after the fork. It has * fulfilled its duty:
__sched_setscheduler 的具体实现
+static void __setscheduler_latency(struct task_struct *p, + const struct sched_attr *attr) +{ + if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) + p->latency_prio = NICE_TO_PRIO(attr->sched_latency_nice); +} + /* * Check the target process has a UID that matches the current process's: */ @@ -7641,6 +7650,13 @@ static int __sched_setscheduler(struct t return retval; } + if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) { + if (attr->sched_latency_nice > MAX_NICE) + return -EINVAL; + if (attr->sched_latency_nice < MIN_NICE) + return -EINVAL; + } + if (pi) cpuset_read_lock(); @@ -7675,6 +7691,9 @@ static int __sched_setscheduler(struct t goto change; if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) goto change; + if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE && + attr->sched_latency_nice != PRIO_TO_NICE(p->latency_prio)) + goto change; p->sched_reset_on_fork = reset_on_fork; retval = 0; @@ -7763,6 +7782,7 @@ static int __sched_setscheduler(struct t __setscheduler_params(p, attr); __setscheduler_prio(p, newprio); } + __setscheduler_latency(p, attr); __setscheduler_uclamp(p, attr);
新增设置 latency_prio 函数
--- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1305,6 +1305,12 @@ static void set_load_weight(struct task_ } } +static inline void set_latency_prio(struct task_struct *p, int prio) +{ + p->latency_prio = prio; + set_latency_fair(&p->se, prio - MAX_RT_PRIO); +} +
然后通过 set_latency_fair 接口来设置 latency prio,从而把 latency nice 映射为调度实体的 slice,实现如下
/* --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -952,6 +952,21 @@ int sched_update_scaling(void) } #endif +void set_latency_fair(struct sched_entity *se, int prio) +{ + u32 weight = sched_prio_to_weight[prio]; + u64 base = sysctl_sched_base_slice; + + /* + * For EEVDF the virtual time slope is determined by w_i (iow. + * nice) while the request time r_i is determined by + * latency-nice. + * + * Smaller request gets better latency. + */ + se->slice = div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight); +} +
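set_latency_fair 由 set_latency_prio 调用:这里先把 sysctl_sched_base_slice 左移 SCHED_FIXEDPOINT_SHIFT 位(即乘以 1024),再除以 latency prio 对应的 weight,得到期望的 slice,即 slice = base × 1024 / weight。weight 越大(latency nice 越小),slice 越小,延迟也就越低。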
这笔patch针对 cgroup 的 task_group 来完善set latency
--- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -378,6 +378,8 @@ struct task_group { /* A positive value indicates that this is a SCHED_IDLE group. */ int idle; + /* latency priority of the group. */ + int latency_prio; #ifdef CONFIG_SMP /* @@ -488,6 +490,8 @@ extern int sched_group_set_shares(struct extern int sched_group_set_idle(struct task_group *tg, long idle); +extern int sched_group_set_latency(struct task_group *tg, int prio); +
+static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css, + struct cftype *cft, s64 nice) +{ + int prio; + + if (nice < MIN_NICE || nice > MAX_NICE) + return -ERANGE; + + prio = NICE_TO_PRIO(nice); + + return sched_group_set_latency(css_tg(css), prio); +} #endif
@@ -11408,6 +11432,12 @@ static struct cftype cpu_files[] = { .read_s64 = cpu_idle_read_s64, .write_s64 = cpu_idle_write_s64, }, + { + .name = "latency.nice", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = cpu_latency_nice_read_s64, + .write_s64 = cpu_latency_nice_write_s64, + }, #endif #ifdef CONFIG_CFS_BANDWIDTH
这样在cgroup v2上的cpu.latency.nice,直接作用到cpu_latency_nice_read_s64和cpu_latency_nice_write_s64上
同样,新增的 sched_group_set_latency 可以更新 task_group 的 latency prio;latency prio 一旦更新,group 下所有 se 的 slice 值也会同步更新
--- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7494,10 +7494,18 @@ static void __setscheduler_params(struct p->policy = policy; - if (dl_policy(policy)) + if (dl_policy(policy)) { __setparam_dl(p, attr); - else if (fair_policy(policy)) + } else if (fair_policy(policy)) { p->static_prio = NICE_TO_PRIO(attr->sched_nice); + if (attr->sched_runtime) { + p->se.slice = clamp_t(u64, attr->sched_runtime, + NSEC_PER_MSEC/10, /* HZ=1000 * 10 */ + NSEC_PER_MSEC*100); /* HZ=100 / 10 */ + } else { + p->se.slice = sysctl_sched_base_slice; + } + } /* * __sched_setscheduler() ensures attr->sched_priority == 0 when @@ -7689,7 +7697,9 @@ static int __sched_setscheduler(struct t * but store a possible modification of reset_on_fork. */ if (unlikely(policy == p->policy)) { - if (fair_policy(policy) && attr->sched_nice != task_nice(p)) + if (fair_policy(policy) && + (attr->sched_nice != task_nice(p) || + (attr->sched_runtime && attr->sched_runtime != p->se.slice))) goto change; if (rt_policy(policy) && attr->sched_priority != p->rt_priority) goto change; @@ -8017,12 +8027,14 @@ static int sched_copy_attr(struct sched_ static void get_params(struct task_struct *p, struct sched_attr *attr) { - if (task_has_dl_policy(p)) + if (task_has_dl_policy(p)) { __getparam_dl(p, attr); - else if (task_has_rt_policy(p)) + } else if (task_has_rt_policy(p)) { attr->sched_priority = p->rt_priority; - else + } else { attr->sched_nice = task_nice(p); + attr->sched_runtime = p->se.slice; + } } /**
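鉴于某些应用有精确的延迟要求,这里提供了更精确的设置办法:用户态可以通过 sched_attr 的 sched_runtime 直接指定 se.slice,其值会被限制在 0.1ms 到 100ms 之间;若 sched_runtime 为 0,则使用默认的 sysctl_sched_base_slice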
EEVDF 的 patch 基本上都分析完了,调度相关的原理部分也基本完结;但就调度而言,个人认为还有如下未尽事宜。
鉴于现在AI特别的火,系统性的学习AI也至关重要,上面未尽事宜等什么时候有空再更新。转战AI。
https://lore.kernel.org/lkml/20230531124604.410388887@infradead.org/ https://lore.kernel.org/lkml/20230531124604.477939524@infradead.org/ https://lore.kernel.org/lkml/20230531124604.546980086@infradead.org/ https://lore.kernel.org/lkml/20230531124604.615053451@infradead.org/