diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -91,13 +91,14 @@
 struct td_sched {
        short           ts_flags;       /* TSF_* flags. */
        int             ts_cpu;         /* CPU we are on, or were last on. */
-       int             ts_rltick;      /* Real last tick, for affinity. */
-       int             ts_slice;       /* Ticks of slice remaining. */
+       u_int           ts_rltick;      /* Real last tick, for affinity. */
+       u_int           ts_slice;       /* Ticks of slice remaining. */
+       u_int           ts_ftick;       /* %CPU window's first tick */
+       u_int           ts_ltick;       /* %CPU window's last tick */
+       /* All tick counts below are stored shifted by SCHED_TICK_SHIFT. */
        u_int           ts_slptime;     /* Number of ticks we vol. slept */
        u_int           ts_runtime;     /* Number of ticks we were running */
-       int             ts_ltick;       /* Last tick that we were running on */
-       int             ts_ftick;       /* First tick that we were running on */
-       int             ts_ticks;       /* Tick count */
+       u_int           ts_ticks;       /* pctcpu window's running tick count */
 #ifdef KTR
        char            ts_name[TS_NAME_LEN];
 #endif
@@ -137,20 +138,14 @@
  * and end of the MIN to MAX timeshare range are only reachable with negative
  * or positive nice respectively.
  *
- * PRI_NRESV:   Number of priority levels reserved to account for nice values.
+ * CPU_RANGE:   Length of range for priorities computed from CPU use.
+ * NICE:        Priority offset due to the nice value.
  *              5/4 is to preserve historical nice effect on computation ratios.
- * PRI_RANGE:   Length of range for priorities computed from CPU use.
- * PRI_TICKS:   Compute a priority in PRI_RANGE from the ticks count and total.
- * PRI_NICE:    Determines the part of the priority inherited from nice.
+ * NRESV:       Number of priority levels reserved to account for nice values.
  */
-#define SCHED_PRI_NRESV         ((PRIO_MAX - PRIO_MIN) * 5 / 4)
-#define SCHED_PRI_MIN           (PRI_MIN_BATCH + (SCHED_PRI_NRESV / 2))
-#define SCHED_PRI_MAX           (PRI_MAX_BATCH - (SCHED_PRI_NRESV / 2))
-#define SCHED_PRI_RANGE         (SCHED_PRI_MAX - SCHED_PRI_MIN + 1)
-#define SCHED_PRI_TICKS(ts)                                             \
-    (SCHED_TICK_HZ((ts)) /                                              \
-    (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
-#define SCHED_PRI_NICE(nice)    ((nice) * 5 / 4)
+#define SCHED_PRI_CPU_RANGE     (PRI_BATCH_RANGE - SCHED_PRI_NRESV)
+#define SCHED_PRI_NICE(nice)    (((nice) - PRIO_MIN) * 5 / 4)
+#define SCHED_PRI_NRESV         SCHED_PRI_NICE(PRIO_MAX)
 
 /*
  * Runqueue indices for the implemented scheduling policies' priority bounds.
@@ -180,19 +175,26 @@
 /*
  * Cpu percentage computation macros and defines.
  *
- * SCHED_TICK_SECS:     Number of seconds to average the cpu usage across.
- * SCHED_TICK_TARG:     Number of hz ticks to average the cpu usage across.
- * SCHED_TICK_MAX:      Maximum number of ticks before scaling back.
+ * SCHED_TICK_SECS:     Max number of seconds to average the cpu usage across.
+ *      Must be at most 20 to avoid overflow in sched_pctcpu()'s current formula.
+ * SCHED_TICK_MAX:      Max number of hz ticks matching SCHED_TICK_SECS.
  * SCHED_TICK_SHIFT:    Shift factor to avoid rounding away results.
- * SCHED_TICK_HZ:       Compute the number of hz ticks for a given ticks count.
- * SCHED_TICK_TOTAL:    Gives the amount of time we've been recording ticks.
- */
-#define SCHED_TICK_SECS         10
-#define SCHED_TICK_TARG         (hz * SCHED_TICK_SECS)
-#define SCHED_TICK_MAX          (SCHED_TICK_TARG + hz)
-#define SCHED_TICK_SHIFT        10
-#define SCHED_TICK_HZ(ts)       ((ts)->ts_ticks >> SCHED_TICK_SHIFT)
-#define SCHED_TICK_TOTAL(ts)    (max((ts)->ts_ltick - (ts)->ts_ftick, hz))
+ * SCHED_TICK_RUN_SHIFTED:      Number of shifted ticks running in last window.
+ * SCHED_TICK_LENGTH:   Length of last window in ticks or 1 if empty.
+ * SCHED_CPU_DECAY_NUMER:       Numerator of %CPU decay factor.
+ * SCHED_CPU_DECAY_DENOM:       Denominator of %CPU decay factor.
+ */
+#define SCHED_TICK_SECS         11
+#define SCHED_TICK_MAX(hz)      ((hz) * SCHED_TICK_SECS)
+#define SCHED_TICK_SHIFT        10
+#define SCHED_TICK_RUN_SHIFTED(ts)      ((ts)->ts_ticks)
+#define SCHED_TICK_LENGTH(ts)   (max((ts)->ts_ltick - (ts)->ts_ftick, 1))
+#define SCHED_CPU_DECAY_NUMER   10
+#define SCHED_CPU_DECAY_DENOM   11
+_Static_assert(SCHED_CPU_DECAY_NUMER >= 0 && SCHED_CPU_DECAY_DENOM > 0 &&
+    SCHED_CPU_DECAY_NUMER <= SCHED_CPU_DECAY_DENOM,
+    "Inconsistent values for SCHED_CPU_DECAY_NUMER and/or "
+    "SCHED_CPU_DECAY_DENOM");
 
 /*
  * These determine the interactivity of a process.  Interactivity differs from
@@ -308,7 +310,11 @@
 struct cpu_group __read_mostly *cpu_top;        /* CPU topology */
 
 #define SCHED_AFFINITY_DEFAULT  (max(1, hz / 1000))
-#define SCHED_AFFINITY(ts, t)   ((ts)->ts_rltick > ticks - ((t) * affinity))
+/*
+ * This inequality has to be written with a positive difference of ticks to
+ * correctly handle wraparound.
+ */
+#define SCHED_AFFINITY(ts, t)   ((u_int)ticks - (ts)->ts_rltick < (t) * affinity)
 
 /*
  * Run-time tunables.
@@ -665,7 +671,7 @@
  * aside from curthread and target no more than sched_slice latency but
  * no less than sched_slice_min runtime.
  */
-static inline int
+static inline u_int
 tdq_slice(struct tdq *tdq)
 {
        int load;
@@ -1757,9 +1763,12 @@
 sched_priority(struct thread *td)
 {
        u_int pri, score;
+       int nice;
 
        if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE)
                return;
+
+       nice = td->td_proc->p_nice;
        /*
         * If the score is interactive we place the thread in the realtime
         * queue with a priority that is less than kernel and interrupt
@@ -1773,7 +1782,7 @@
         * score.  Negative nice values make it easier for a thread to be
         * considered interactive.
         */
-       score = imax(0, sched_interact_score(td) + td->td_proc->p_nice);
+       score = imax(0, sched_interact_score(td) + nice);
        if (score < sched_interact) {
                pri = PRI_MIN_INTERACT;
                pri += (PRI_MAX_INTERACT - PRI_MIN_INTERACT + 1) * score /
@@ -1782,17 +1791,26 @@
                    ("sched_priority: invalid interactive priority %u score %u",
                    pri, score));
        } else {
-               pri = SCHED_PRI_MIN;
-               if (td_get_sched(td)->ts_ticks)
-                       pri += min(SCHED_PRI_TICKS(td_get_sched(td)),
-                           SCHED_PRI_RANGE - 1);
-               pri += SCHED_PRI_NICE(td->td_proc->p_nice);
+               const struct td_sched *const ts = td_get_sched(td);
+               const u_int run = SCHED_TICK_RUN_SHIFTED(ts);
+               const u_int run_unshifted __unused = (run +
+                   (1 << SCHED_TICK_SHIFT) / 2) >> SCHED_TICK_SHIFT;
+               const u_int len = SCHED_TICK_LENGTH(ts);
+               const u_int nice_pri_off = SCHED_PRI_NICE(nice);
+               const u_int cpu_pri_off = (((SCHED_PRI_CPU_RANGE - 1) *
+                   run + len / 2) / len + (1 << SCHED_TICK_SHIFT) / 2) >>
+                   SCHED_TICK_SHIFT;
+
+               MPASS(cpu_pri_off < SCHED_PRI_CPU_RANGE);
+               pri = PRI_MIN_BATCH + cpu_pri_off + nice_pri_off;
                KASSERT(pri >= PRI_MIN_BATCH && pri <= PRI_MAX_BATCH,
-                   ("sched_priority: invalid priority %u: nice %d, "
-                   "ticks %d ftick %d ltick %d tick pri %d",
-                   pri, td->td_proc->p_nice, td_get_sched(td)->ts_ticks,
-                   td_get_sched(td)->ts_ftick, td_get_sched(td)->ts_ltick,
-                   SCHED_PRI_TICKS(td_get_sched(td))));
+                   ("sched_priority: Invalid computed priority %u: "
+                   "Should be between %u and %u (PRI_MIN_BATCH: %u; "
+                   "Window size (ticks): %u, runtime (shifted ticks): %u, "
+                   "(unshifted ticks): %u => CPU pri off: %u; "
+                   "Nice: %d => nice pri off: %u)",
+                   pri, PRI_MIN_BATCH, PRI_MAX_BATCH, PRI_MIN_BATCH,
+                   len, run, run_unshifted, cpu_pri_off, nice, nice_pri_off));
        }
        sched_user_prio(td, pri);
@@ -1877,8 +1895,8 @@
         * Set up the scheduler specific parts of thread0.
         */
        ts0 = td_get_sched(&thread0);
-       ts0->ts_ltick = ticks;
-       ts0->ts_ftick = ticks;
+       ts0->ts_ftick = (u_int)ticks;
+       ts0->ts_ltick = ts0->ts_ftick;
        ts0->ts_slice = 0;
        ts0->ts_cpu = curcpu;   /* set valid CPU number */
 }
@@ -1917,30 +1935,56 @@
 }
 
 /*
- * Update the percent cpu tracking information when it is requested or
- * the total history exceeds the maximum.  We keep a sliding history of
- * tick counts that slowly decays.  This is less precise than the 4BSD
- * mechanism since it happens with less regular and frequent events.
+ * Update the percent cpu tracking information when it is requested or the
+ * total history exceeds the maximum.  We keep a sliding history of tick
+ * counts for running threads that slowly decays (see the comments below for
+ * more details).  This is less precise than the 4BSD mechanism since it
+ * happens with less regular and frequent events.
  */
 static void
 sched_pctcpu_update(struct td_sched *ts, int run)
 {
-       int t = ticks;
+       const u_int t = (u_int)ticks;
+       u_int t_max = SCHED_TICK_MAX((u_int)hz);
+       u_int t_tgt = ((t_max << SCHED_TICK_SHIFT) * SCHED_CPU_DECAY_NUMER /
+           SCHED_CPU_DECAY_DENOM) >> SCHED_TICK_SHIFT;
+       const u_int lu_span = t - ts->ts_ltick;
 
-       /*
-        * The signed difference may be negative if the thread hasn't run for
-        * over half of the ticks rollover period.
-        */
-       if ((u_int)(t - ts->ts_ltick) >= SCHED_TICK_TARG) {
-               ts->ts_ticks = 0;
-               ts->ts_ftick = t - SCHED_TICK_TARG;
-       } else if (t - ts->ts_ftick >= SCHED_TICK_MAX) {
-               ts->ts_ticks = (ts->ts_ticks / (ts->ts_ltick - ts->ts_ftick)) *
-                   (ts->ts_ltick - (t - SCHED_TICK_TARG));
-               ts->ts_ftick = t - SCHED_TICK_TARG;
+       if (lu_span >= t_tgt) {
+               /*
+                * Forget all previous ticks if we are more than 't_tgt'
+                * (currently, 10s) apart from the last update.  Don't account
+                * for more than 't_tgt' ticks when running.
+                */
+               ts->ts_ticks = run ? (t_tgt << SCHED_TICK_SHIFT) : 0;
+               ts->ts_ftick = t - t_tgt;
+               ts->ts_ltick = t;
+               return;
+       }
+
+       if (t - ts->ts_ftick >= t_max) {
+               /*
+                * First reduce the existing ticks to proportionally occupy only
+                * what's left of the target window, given that 'lu_span' will
+                * occupy the rest.  Since sched_clock() is called frequently on
+                * running threads, these threads have a small 'lu_span', and
+                * the next formula basically becomes an exponential decay with
+                * ratio r = SCHED_CPU_DECAY_NUMER / SCHED_CPU_DECAY_DENOM
+                * (currently, 10/11) and a period of 1s.  However, a sleeping
+                * thread will see its accounted ticks drop linearly with a high
+                * slope with respect to 'lu_span', approaching 0 as 'lu_span'
+                * approaches 't_tgt' (so, continuously with respect to the
+                * previous case).  This rescaling is completely dependent on
+                * the frequency of calls and the span since the last update
+                * passed at each call.
+                */
+               ts->ts_ticks = SCHED_TICK_RUN_SHIFTED(ts) /
+                   SCHED_TICK_LENGTH(ts) * (t_tgt - lu_span);
+               ts->ts_ftick = t - t_tgt;
        }
+
        if (run)
-               ts->ts_ticks += (t - ts->ts_ltick) << SCHED_TICK_SHIFT;
+               ts->ts_ticks += lu_span << SCHED_TICK_SHIFT;
        ts->ts_ltick = t;
 }
@@ -2300,9 +2344,9 @@
 #ifdef SMP
        pickcpu = (td->td_flags & TDF_PICKCPU) != 0;
        if (pickcpu)
-               ts->ts_rltick = ticks - affinity * MAX_CACHE_LEVELS;
+               ts->ts_rltick = (u_int)ticks - affinity * MAX_CACHE_LEVELS;
        else
-               ts->ts_rltick = ticks;
+               ts->ts_rltick = (u_int)ticks;
 #endif
        td->td_lastcpu = td->td_oncpu;
        preempted = (td->td_flags & TDF_SLICEEND) == 0 &&
@@ -2655,7 +2699,7 @@
  * Return time slice for a given thread.  For ithreads this is
  * sched_slice.  For other threads it is tdq_slice(tdq).
  */
-static inline int
+static inline u_int
 td_slice(struct thread *td, struct tdq *tdq)
 {
        if (PRI_BASE(td->td_pri_class) == PRI_ITHD)
@@ -2940,22 +2984,18 @@
 fixpt_t
 sched_pctcpu(struct thread *td)
 {
-       fixpt_t pctcpu;
        struct td_sched *ts;
-
-       pctcpu = 0;
-       ts = td_get_sched(td);
+       u_int len;
+       fixpt_t pctcpu;
 
        THREAD_LOCK_ASSERT(td, MA_OWNED);
+       ts = td_get_sched(td);
        sched_pctcpu_update(ts, TD_IS_RUNNING(td));
-       if (ts->ts_ticks) {
-               int rtick;
-
-               /* How many rtick per second ? */
-               rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
-               pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
-       }
-
+       len = SCHED_TICK_LENGTH(ts);
+       pctcpu = ((FSHIFT >= SCHED_TICK_SHIFT ? /* Resolved at compile time. */
+           (SCHED_TICK_RUN_SHIFTED(ts) << (FSHIFT - SCHED_TICK_SHIFT)) :
+           (SCHED_TICK_RUN_SHIFTED(ts) >> (SCHED_TICK_SHIFT - FSHIFT))) +
+           len / 2) / len;
        return (pctcpu);
 }
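
A few worked examples follow as userland sketches; everything in them that is
not in the patch, including all concrete values, is an illustrative assumption.
First, the reworked priority macros: assuming the stock PRIO_MIN = -20 and
PRIO_MAX = 20, SCHED_PRI_NICE() now yields a non-negative offset (0 for nice
-20, 25 for nice 0, 50 for nice 20), so SCHED_PRI_NRESV = 50.  The sketch below
mirrors the patched macros and the cpu_pri_off computation from
sched_priority(); the PRI_BATCH_RANGE value is an assumption, not the one from
<sys/priority.h>.

#include <stdio.h>

#define PRIO_MIN                (-20)   /* stock values, <sys/resource.h> */
#define PRIO_MAX                20
#define PRI_BATCH_RANGE         136     /* illustrative assumption */

#define SCHED_TICK_SHIFT        10
#define SCHED_PRI_NICE(nice)    (((nice) - PRIO_MIN) * 5 / 4)
#define SCHED_PRI_NRESV         SCHED_PRI_NICE(PRIO_MAX)
#define SCHED_PRI_CPU_RANGE     (PRI_BATCH_RANGE - SCHED_PRI_NRESV)

/*
 * Mirror of the cpu_pri_off computation in the patched sched_priority():
 * 'run' is in shifted ticks, 'len' in plain ticks, and both divisions round
 * to nearest.  The result always lands in [0, SCHED_PRI_CPU_RANGE).
 */
static unsigned
cpu_pri_off(unsigned run, unsigned len)
{
	return ((((SCHED_PRI_CPU_RANGE - 1) * run + len / 2) / len +
	    (1 << SCHED_TICK_SHIFT) / 2) >> SCHED_TICK_SHIFT);
}

int
main(void)
{
	const unsigned len = 11000;	/* 11s window at hz = 1000 */

	for (int nice = PRIO_MIN; nice <= PRIO_MAX; nice += 20)
		printf("nice %3d -> offset %d\n", nice, SCHED_PRI_NICE(nice));
	printf("idle: %u, half busy: %u, fully busy: %u (max %d)\n",
	    cpu_pri_off(0, len),
	    cpu_pri_off((len / 2) << SCHED_TICK_SHIFT, len),
	    cpu_pri_off(len << SCHED_TICK_SHIFT, len),
	    SCHED_PRI_CPU_RANGE - 1);
	return (0);
}

With these numbers, the CPU and nice offsets together are bounded by
(SCHED_PRI_CPU_RANGE - 1) + SCHED_PRI_NRESV = PRI_BATCH_RANGE - 1, which is
exactly the invariant the new KASSERT in sched_priority() checks.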
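The SCHED_AFFINITY() change is easiest to see at the tick counter's wraparound.
In the hypothetical scenario below, both 'ticks' and ts_rltick sit just past
the 32-bit wrap and the thread last ran 20 ticks ago, well inside a 100-tick
window; the old signed comparison is emulated with explicit casts, since the
signed overflow it relied on is undefined behavior in standard C.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const int32_t ticks = INT32_MIN + 50;	/* 50 ticks past the wrap */
	const uint32_t ts_rltick = (uint32_t)(INT32_MIN + 30); /* 20 ago */
	const uint32_t window = 100;		/* plays the role of (t) * affinity */

	/*
	 * Old form: (ts)->ts_rltick > ticks - ((t) * affinity), with the
	 * wrapping subtraction emulated in unsigned arithmetic; the
	 * value-changing casts back to int32_t are implementation-defined
	 * but two's complement on every platform FreeBSD supports.
	 */
	int old_form = (int32_t)ts_rltick > (int32_t)((uint32_t)ticks - window);

	/* New form: positive difference of ticks, as in the patch. */
	int new_form = (uint32_t)ticks - ts_rltick < window;

	printf("old form: %d (affinity wrongly expired)\n", old_form);
	printf("new form: %d (affinity correctly preserved)\n", new_form);
	return (0);
}

The new form works because the unsigned difference '(u_int)ticks - ts_rltick'
is the true number of elapsed ticks modulo 2^32, no matter where the wrap
falls inside the window.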
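The decay behavior described in the new sched_pctcpu_update() comments can be
checked with a userland simulation.  The sketch below copies the patched
update logic (t_tgt is written without the shift pair, which nested integer
division makes arithmetically identical) and drives it from a synthetic ticks
counter; hz = 1000 and the simplified percent conversion are assumptions of
the sketch, not kernel code.

#include <stdio.h>

#define max(a, b)		((a) > (b) ? (a) : (b))
#define SCHED_TICK_SECS		11
#define SCHED_TICK_MAX(hz)	((hz) * SCHED_TICK_SECS)
#define SCHED_TICK_SHIFT	10
#define SCHED_CPU_DECAY_NUMER	10
#define SCHED_CPU_DECAY_DENOM	11

static const unsigned hz = 1000;	/* assumed clock rate */
static unsigned ticks;			/* stand-in for the kernel's */

struct td_sched {
	unsigned ts_ftick, ts_ltick, ts_ticks;
};

#define SCHED_TICK_RUN_SHIFTED(ts)	((ts)->ts_ticks)
#define SCHED_TICK_LENGTH(ts)	(max((ts)->ts_ltick - (ts)->ts_ftick, 1u))

static void
sched_pctcpu_update(struct td_sched *ts, int run)
{
	const unsigned t = ticks;
	const unsigned t_max = SCHED_TICK_MAX(hz);
	const unsigned t_tgt = t_max * SCHED_CPU_DECAY_NUMER /
	    SCHED_CPU_DECAY_DENOM;
	const unsigned lu_span = t - ts->ts_ltick;

	if (lu_span >= t_tgt) {
		/* Too old: forget everything (keep t_tgt ticks if running). */
		ts->ts_ticks = run ? (t_tgt << SCHED_TICK_SHIFT) : 0;
		ts->ts_ftick = t - t_tgt;
		ts->ts_ltick = t;
		return;
	}
	if (t - ts->ts_ftick >= t_max) {
		/* Window full: rescale so old ticks fill t_tgt - lu_span. */
		ts->ts_ticks = SCHED_TICK_RUN_SHIFTED(ts) /
		    SCHED_TICK_LENGTH(ts) * (t_tgt - lu_span);
		ts->ts_ftick = t - t_tgt;
	}
	if (run)
		ts->ts_ticks += lu_span << SCHED_TICK_SHIFT;
	ts->ts_ltick = t;
}

/* Window utilization in percent (FSHIFT handling elided for brevity). */
static unsigned
pct(const struct td_sched *ts)
{
	return (100 * SCHED_TICK_RUN_SHIFTED(ts) / SCHED_TICK_LENGTH(ts) +
	    (1 << SCHED_TICK_SHIFT) / 2) >> SCHED_TICK_SHIFT;
}

int
main(void)
{
	struct td_sched ts = { 0, 0, 0 };

	/* Run flat out for 30s, one update per tick, as sched_clock() would. */
	for (unsigned i = 0; i < 30 * hz; i++) {
		ticks++;
		sched_pctcpu_update(&ts, 1);
	}
	printf("running: %u%%\n", pct(&ts));	/* ~100% */

	/*
	 * Sleep with one update per second: lu_span = hz, so each step
	 * multiplies the history by (t_tgt - hz) / t_tgt = 9/10 here, while
	 * a frequently updated thread (tiny lu_span) decays by ~10/11.
	 */
	for (int s = 1; s <= 5; s++) {
		ticks += hz;
		sched_pctcpu_update(&ts, 0);
		printf("asleep %ds: %u%%\n", s, pct(&ts));
	}
	return (0);
}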
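Finally, the new sched_pctcpu() body is a fixed-point base conversion with
round-to-nearest: the ratio of shifted running ticks to window length is
rescaled from SCHED_TICK_SHIFT to FSHIFT fixed point.  A minimal mirror of
just that conversion, assuming the stock FSHIFT = 11 from <sys/param.h>; the
preprocessor conditional plays the role of the patch's resolved-at-compile-time
ternary.

#include <stdio.h>

#define FSHIFT			11	/* assumed, as in <sys/param.h> */
#define FSCALE			(1 << FSHIFT)
#define SCHED_TICK_SHIFT	10

/* run/len in FSCALE fixed point, rounded to nearest; 'run' is shifted. */
static unsigned
to_fixpt(unsigned run, unsigned len)
{
#if FSHIFT >= SCHED_TICK_SHIFT
	return (((run << (FSHIFT - SCHED_TICK_SHIFT)) + len / 2) / len);
#else
	return (((run >> (SCHED_TICK_SHIFT - FSHIFT)) + len / 2) / len);
#endif
}

int
main(void)
{
	const unsigned len = 11000;	/* 11s window at hz = 1000 */

	/* Half the window spent running: expect FSCALE / 2 = 1024. */
	printf("half: %u of %d\n",
	    to_fixpt((len / 2) << SCHED_TICK_SHIFT, len), FSCALE);
	/* The whole window spent running: expect FSCALE = 2048. */
	printf("full: %u of %d\n",
	    to_fixpt(len << SCHED_TICK_SHIFT, len), FSCALE);
	return (0);
}

Note the overflow headroom this relies on: 'run' can reach SCHED_TICK_MAX(hz)
shifted ticks, and the additional left shift must still fit in a u_int, which
is why the updated comment caps SCHED_TICK_SECS (at most 20 for the current
formula).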