下面继续分析其他的patch

[PATCH 12/15] sched: Introduce latency-nice as a per-task attribute

这里为 task_struct 提供了 latency_prio 字段，类似于 weight 的 nice value，所以latency_prio越低，则时延要求越高。


--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -791,6 +791,7 @@ struct task_struct {
 	int				static_prio;
 	int				normal_prio;
 	unsigned int			rt_priority;
+	int				latency_prio;
 
 	struct sched_entity		se;
 	struct sched_rt_entity		rt;

所以新增一个 SCHED_FLAG_LATENCY_NICE 代表用户态可以通过 sched_setattr 来定制化 sched_latency_nice


--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -132,6 +132,7 @@ struct clone_args {
 #define SCHED_FLAG_KEEP_PARAMS		0x10
 #define SCHED_FLAG_UTIL_CLAMP_MIN	0x20
 #define SCHED_FLAG_UTIL_CLAMP_MAX	0x40
+#define SCHED_FLAG_LATENCY_NICE		0x80
 
 #define SCHED_FLAG_KEEP_ALL	(SCHED_FLAG_KEEP_POLICY | \
 				 SCHED_FLAG_KEEP_PARAMS)
@@ -143,6 +144,7 @@ struct clone_args {
 			 SCHED_FLAG_RECLAIM		| \
 			 SCHED_FLAG_DL_OVERRUN		| \
 			 SCHED_FLAG_KEEP_ALL		| \
-			 SCHED_FLAG_UTIL_CLAMP)
+			 SCHED_FLAG_UTIL_CLAMP		| \
+			 SCHED_FLAG_LATENCY_NICE)
 
 #endif /* _UAPI_LINUX_SCHED_H */

sched_latency_nice的范围是 -20 至 19


 /*
  * Extended scheduling parameters data structure.
@@ -98,6 +99,22 @@ struct sched_param {
  * scheduled on a CPU with no more capacity than the specified value.
  *
  * A task utilization boundary can be reset by setting the attribute to -1.
+ *
+ * Latency Tolerance Attributes
+ * ===========================
+ *
+ * A subset of sched_attr attributes allows to specify the relative latency
+ * requirements of a task with respect to the other tasks running/queued in the
+ * system.
+ *
+ * @ sched_latency_nice	task's latency_nice value
+ *
+ * The latency_nice of a task can have any value in a range of
+ * [MIN_LATENCY_NICE..MAX_LATENCY_NICE].
+ *
+ * A task with latency_nice with the value of LATENCY_NICE_MIN can be
+ * taken for a task requiring a lower latency as opposed to the task with
+ * higher latency_nice.
  */
 struct sched_attr {
 	__u32 size;
@@ -120,6 +137,8 @@ struct sched_attr {
 	__u32 sched_util_min;
 	__u32 sched_util_max;
 
+	/* latency requirement hints */
+	__s32 sched_latency_nice;
 };
 
 #endif /* _UAPI_LINUX_SCHED_TYPES_H */

其初始值是 DEFAULT_PRIO


--- a/init/init_task.c
+++ b/init/init_task.c
@@ -78,6 +78,7 @@ struct task_struct init_task
 	.prio		= MAX_PRIO - 20,
 	.static_prio	= MAX_PRIO - 20,
 	.normal_prio	= MAX_PRIO - 20,
+	.latency_prio	= DEFAULT_PRIO,
 	.policy		= SCHED_NORMAL,
 	.cpus_ptr	= &init_task.cpus_mask,
 	.user_cpus_ptr	= NULL,
@@ -89,7 +90,7 @@ struct task_struct init_task
 		.fn = do_no_restart_syscall,
 	},

fork的时候，prio是 nice 0


--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4719,6 +4719,8 @@ int sched_fork(unsigned long clone_flags
 		p->prio = p->normal_prio = p->static_prio;
 		set_load_weight(p, false);
 
+		p->latency_prio = NICE_TO_PRIO(0);
+
 		/*
 		 * We don't need the reset flag anymore after the fork. It has
 		 * fulfilled its duty:

__sched_setscheduler 的具体实现


+static void __setscheduler_latency(struct task_struct *p,
+				   const struct sched_attr *attr)
+{
+	if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE)
+		p->latency_prio = NICE_TO_PRIO(attr->sched_latency_nice);
+}
+
 /*
  * Check the target process has a UID that matches the current process's:
  */
@@ -7641,6 +7650,13 @@ static int __sched_setscheduler(struct t
 			return retval;
 	}
 
+	if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {
+		if (attr->sched_latency_nice > MAX_NICE)
+			return -EINVAL;
+		if (attr->sched_latency_nice < MIN_NICE)
+			return -EINVAL;
+	}
+
 	if (pi)
 		cpuset_read_lock();
 
@@ -7675,6 +7691,9 @@ static int __sched_setscheduler(struct t
 			goto change;
 		if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
 			goto change;
+		if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE &&
+		    attr->sched_latency_nice != PRIO_TO_NICE(p->latency_prio))
+			goto change;
 
 		p->sched_reset_on_fork = reset_on_fork;
 		retval = 0;
@@ -7763,6 +7782,7 @@ static int __sched_setscheduler(struct t
 		__setscheduler_params(p, attr);
 		__setscheduler_prio(p, newprio);
 	}
+	__setscheduler_latency(p, attr);
 	__setscheduler_uclamp(p, attr);

[PATCH 13/15] sched/fair: Implement latency-nice

新增设置 latency_prio 函数


--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1305,6 +1305,12 @@ static void set_load_weight(struct task_
 	}
 }
 
+static inline void set_latency_prio(struct task_struct *p, int prio)
+{
+	p->latency_prio = prio;
+	set_latency_fair(&p->se, prio - MAX_RT_PRIO);
+}
+

然后使用接口来实现设置latency prio，这样可以获得如下映射

latency-nice 19: ~204.8ms (尽力而为)
latency-nice -20: ~0.034ms (极低延迟)


 /*
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -952,6 +952,21 @@ int sched_update_scaling(void)
 }
 #endif
 
+void set_latency_fair(struct sched_entity *se, int prio)
+{
+	u32 weight = sched_prio_to_weight[prio];
+	u64 base = sysctl_sched_base_slice;
+
+	/*
+	 * For EEVDF the virtual time slope is determined by w_i (iow.
+	 * nice) while the request time r_i is determined by
+	 * latency-nice.
+	 *
+	 * Smaller request gets better latency.
+	 */
+	se->slice = div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight);
+}
+

set_latency_fair 在 set_latency_prio 下使用，这里乘以1024，然后除上weight来计算期望的slice

[PATCH 14/15] sched/fair: Add sched group latency support

这笔patch针对 cgroup 的 task_group 来完善set latency


--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -378,6 +378,8 @@ struct task_group {
 
 	/* A positive value indicates that this is a SCHED_IDLE group. */
 	int			idle;
+	/* latency priority of the group. */
+	int			latency_prio;
 
 #ifdef	CONFIG_SMP
 	/*
@@ -488,6 +490,8 @@ extern int sched_group_set_shares(struct
 
 extern int sched_group_set_idle(struct task_group *tg, long idle);
 
+extern int sched_group_set_latency(struct task_group *tg, int prio);
+


+static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
+				     struct cftype *cft, s64 nice)
+{
+	int prio;
+
+	if (nice < MIN_NICE || nice > MAX_NICE)
+		return -ERANGE;
+
+	prio = NICE_TO_PRIO(nice);
+
+	return sched_group_set_latency(css_tg(css), prio);
+}
 #endif


@@ -11408,6 +11432,12 @@ static struct cftype cpu_files[] = {
 		.read_s64 = cpu_idle_read_s64,
 		.write_s64 = cpu_idle_write_s64,
 	},
+	{
+		.name = "latency.nice",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_s64 = cpu_latency_nice_read_s64,
+		.write_s64 = cpu_latency_nice_write_s64,
+	},
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH

这样在cgroup v2上的cpu.latency.nice，直接作用到cpu_latency_nice_read_s64和cpu_latency_nice_write_s64上

同样

新增的 sched_group_set_latency 可以对 task_group 更新 latency prio，这里lantency prio 一旦更新，group 下所有 se 的 slice 值也同步更新

[PATCH 15/15] sched/eevdf: Use sched_attr::sched_runtime to set request/slice


--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7494,10 +7494,18 @@ static void __setscheduler_params(struct
 
 	p->policy = policy;
 
-	if (dl_policy(policy))
+	if (dl_policy(policy)) {
 		__setparam_dl(p, attr);
-	else if (fair_policy(policy))
+	} else if (fair_policy(policy)) {
 		p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+		if (attr->sched_runtime) {
+			p->se.slice = clamp_t(u64, attr->sched_runtime,
+					      NSEC_PER_MSEC/10,   /* HZ=1000 * 10 */
+					      NSEC_PER_MSEC*100); /* HZ=100  / 10 */
+		} else {
+			p->se.slice = sysctl_sched_base_slice;
+		}
+	}
 
 	/*
 	 * __sched_setscheduler() ensures attr->sched_priority == 0 when
@@ -7689,7 +7697,9 @@ static int __sched_setscheduler(struct t
 	 * but store a possible modification of reset_on_fork.
 	 */
 	if (unlikely(policy == p->policy)) {
-		if (fair_policy(policy) && attr->sched_nice != task_nice(p))
+		if (fair_policy(policy) &&
+		    (attr->sched_nice != task_nice(p) ||
+		     (attr->sched_runtime && attr->sched_runtime != p->se.slice)))
 			goto change;
 		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
 			goto change;
@@ -8017,12 +8027,14 @@ static int sched_copy_attr(struct sched_
 
 static void get_params(struct task_struct *p, struct sched_attr *attr)
 {
-	if (task_has_dl_policy(p))
+	if (task_has_dl_policy(p)) {
 		__getparam_dl(p, attr);
-	else if (task_has_rt_policy(p))
+	} else if (task_has_rt_policy(p)) {
 		attr->sched_priority = p->rt_priority;
-	else
+	} else {
 		attr->sched_nice = task_nice(p);
+		attr->sched_runtime = p->se.slice;
+	}
 }
 
 /**

鉴于某些应用有精确的延迟要求，这里提供更精确的设置办法

总结

EEVDF的patch基本上都分析完成了，关于调度相关的原理的东西基本完结，但是针对调度而言，个人认为还有如下未尽事宜。

sched_ext 原理
eas 调度
PREEMPT_RT 补丁
RTLA分析：https://retis.sssup.it/~d.casini/papers/2025/TC2025/Timerlat_TC.pdf
Deadline 调度
bench工具：Schbench/Hackbench/Cyclictest
维测工具：perfetto

鉴于现在AI特别的火，系统性的学习AI也至关重要，上面未尽事宜等什么时候有空再更新。转战AI。

参考

https://lore.kernel.org/lkml/20230531124604.410388887@infradead.org/ https://lore.kernel.org/lkml/20230531124604.477939524@infradead.org/ https://lore.kernel.org/lkml/20230531124604.546980086@infradead.org/ https://lore.kernel.org/lkml/20230531124604.615053451@infradead.org/

目录

[PATCH 12/15] sched: Introduce latency-nice as a per-task attribute

[PATCH 13/15] sched/fair: Implement latency-nice

[PATCH 14/15] sched/fair: Add sched group latency support

[PATCH 15/15] sched/eevdf: Use sched_attr::sched_runtime to set request/slice

总结

参考