Commit 9bd42183 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'sched-core-for-linus' of git://

Pull scheduler updates from Ingo Molnar:
 "The main changes in this cycle were:

   - Add the SYSTEM_SCHEDULING bootup state to move various scheduler
     debug checks earlier into the bootup. This turns silent and
     sporadically deadly bugs into nice, deterministic splats. Fix some
     of the splats that triggered. (Thomas Gleixner)

   - A round of restructuring and refactoring of the load-balancing and
     topology code (Peter Zijlstra)

   - Another round of consolidating ~20 of incremental scheduler code
     history: this time in terms of wait-queue nomenclature. (I didn't
     get much feedback on these renaming patches, and we can still
     easily change any names I might have misplaced, so if anyone hates
     a new name, please holler and I'll fix it.) (Ingo Molnar)

   - sched/numa improvements, fixes and updates (Rik van Riel)

   - Another round of x86/tsc scheduler clock code improvements, in hope
     of making it more robust (Peter Zijlstra)

   - Improve NOHZ behavior (Frederic Weisbecker)

   - Deadline scheduler improvements and fixes (Luca Abeni, Daniel
     Bristot de Oliveira)

   - Simplify and optimize the topology setup code (Lauro Ramos

   - Debloat and decouple scheduler code some more (Nicolas Pitre)

   - Simplify code by making better use of llist primitives (Byungchul

   - ... plus other fixes and improvements"

* 'sched-core-for-linus' of git:// (103 commits)
  sched/cputime: Refactor the cputime_adjust() code
  sched/debug: Expose the number of RT/DL tasks that can migrate
  sched/numa: Hide numa_wake_affine() from UP build
  sched/fair: Remove effective_load()
  sched/numa: Implement NUMA node level wake_affine()
  sched/fair: Simplify wake_affine() for the single socket case
  sched/numa: Override part of migrate_degrades_locality() when idle balancing
  sched/rt: Move RT related code from sched/core.c to sched/rt.c
  sched/deadline: Move DL related code from sched/core.c to sched/deadline.c
  sched/cpuset: Only offer CONFIG_CPUSETS if SMP is enabled
  sched/fair: Spare idle load balancing on nohz_full CPUs
  nohz: Move idle balancer registration to the idle path
  sched/loadavg: Generalize "_idle" naming to "_nohz"
  sched/core: Drop the unused try_get_task_struct() helper function
  sched/fair: WARN() and refuse to set buddy when !se->on_rq
  sched/debug: Fix SCHED_WARN_ON() to return a value on !CONFIG_SCHED_DEBUG as well
  sched/wait: Disambiguate wq_entry->task_list and wq_head->task_list naming
  sched/wait: Move bit_wait_table[] and related functionality from sched/core.c to sched/wait_bit.c
  sched/wait: Split out the wait_bit*() APIs from <linux/wait.h> into <linux/wait_bit.h>
  sched/wait: Re-adjust macro line continuation backslashes in <linux/wait.h>
parents 7447d562 72298e5c
......@@ -819,7 +819,7 @@ printk(KERN_INFO "my ip: %pI4\n", &amp;ipaddress);
certain condition is true. They must be used carefully to ensure
there is no race condition. You declare a
<type>wait_queue_head_t</type>, and then processes which want to
wait for that condition declare a <type>wait_queue_t</type>
wait for that condition declare a <type>wait_queue_entry_t</type>
referring to themselves, and place that in the queue.
......@@ -316,7 +316,7 @@ For version 5, the format of the message is:
struct autofs_v5_packet {
int proto_version; /* Protocol version */
int type; /* Type of packet */
autofs_wqt_t wait_queue_token;
autofs_wqt_t wait_queue_entry_token;
__u32 dev;
__u64 ino;
__u32 uid;
......@@ -341,12 +341,12 @@ The pipe will be set to "packet mode" (equivalent to passing
`O_DIRECT`) to _pipe2(2)_ so that a read from the pipe will return at
most one packet, and any unread portion of a packet will be discarded.
The `wait_queue_token` is a unique number which can identify a
The `wait_queue_entry_token` is a unique number which can identify a
particular request to be acknowledged. When a message is sent over
the pipe the affected dentry is marked as either "active" or
"expiring" and other accesses to it block until the message is
acknowledged using one of the ioctls below and the relevant
Communicating with autofs: root directory ioctls
......@@ -358,7 +358,7 @@ capability, or must be the automount daemon.
The available ioctl commands are:
- **AUTOFS_IOC_READY**: a notification has been handled. The argument
to the ioctl command is the "wait_queue_token" number
to the ioctl command is the "wait_queue_entry_token" number
corresponding to the notification being acknowledged.
- **AUTOFS_IOC_FAIL**: similar to above, but indicates failure with
the error code `ENOENT`.
......@@ -382,14 +382,14 @@ The available ioctl commands are:
struct autofs_packet_expire_multi {
int proto_version; /* Protocol version */
int type; /* Type of packet */
autofs_wqt_t wait_queue_token;
autofs_wqt_t wait_queue_entry_token;
int len;
char name[NAME_MAX+1];
is required. This is filled in with the name of something
that can be unmounted or removed. If nothing can be expired,
`errno` is set to `EAGAIN`. Even though a `wait_queue_token`
`errno` is set to `EAGAIN`. Even though a `wait_queue_entry_token`
is present in the structure, no "wait queue" is established
and no acknowledgment is needed.
- **AUTOFS_IOC_EXPIRE_MULTI**: This is similar to
......@@ -7,6 +7,8 @@ CONTENTS
1. Overview
2. Scheduling algorithm
2.1 Main algorithm
2.2 Bandwidth reclaiming
3. Scheduling Real-Time Tasks
3.1 Definitions
3.2 Schedulability Analysis for Uniprocessor Systems
......@@ -44,6 +46,9 @@ CONTENTS
2. Scheduling algorithm
2.1 Main algorithm
SCHED_DEADLINE uses three parameters, named "runtime", "period", and
"deadline", to schedule tasks. A SCHED_DEADLINE task should receive
"runtime" microseconds of execution time every "period" microseconds, and
......@@ -113,6 +118,160 @@ CONTENTS
remaining runtime = remaining runtime + runtime
2.2 Bandwidth reclaiming
Bandwidth reclaiming for deadline tasks is based on the GRUB (Greedy
Reclamation of Unused Bandwidth) algorithm [15, 16, 17] and it is enabled
when flag SCHED_FLAG_RECLAIM is set.
The following diagram illustrates the state names for tasks handled by GRUB:
(d) | Active |
------------->| |
| | Contending |
| ------------
| A |
---------- | |
| | | |
| Inactive | |(b) | (a)
| | | |
---------- | |
A | V
| ------------
| | Active |
--------------| Non |
(c) | Contending |
A task can be in one of the following states:
- ActiveContending: if it is ready for execution (or executing);
- ActiveNonContending: if it just blocked and has not yet surpassed the 0-lag
- Inactive: if it is blocked and has surpassed the 0-lag time.
State transitions:
(a) When a task blocks, it does not become immediately inactive since its
bandwidth cannot be immediately reclaimed without breaking the
real-time guarantees. It therefore enters a transitional state called
ActiveNonContending. The scheduler arms the "inactive timer" to fire at
the 0-lag time, when the task's bandwidth can be reclaimed without
breaking the real-time guarantees.
The 0-lag time for a task entering the ActiveNonContending state is
computed as
(runtime * dl_period)
deadline - ---------------------
where runtime is the remaining runtime, while dl_runtime and dl_period
are the reservation parameters.
(b) If the task wakes up before the inactive timer fires, the task re-enters
the ActiveContending state and the "inactive timer" is canceled.
In addition, if the task wakes up on a different runqueue, then
the task's utilization must be removed from the previous runqueue's active
utilization and must be added to the new runqueue's active utilization.
In order to avoid races between a task waking up on a runqueue while the
"inactive timer" is running on a different CPU, the "dl_non_contending"
flag is used to indicate that a task is not on a runqueue but is active
(so, the flag is set when the task blocks and is cleared when the
"inactive timer" fires or when the task wakes up).
(c) When the "inactive timer" fires, the task enters the Inactive state and
its utilization is removed from the runqueue's active utilization.
(d) When an inactive task wakes up, it enters the ActiveContending state and
its utilization is added to the active utilization of the runqueue where
it has been enqueued.
For each runqueue, the algorithm GRUB keeps track of two different bandwidths:
- Active bandwidth (running_bw): this is the sum of the bandwidths of all
tasks in active state (i.e., ActiveContending or ActiveNonContending);
- Total bandwidth (this_bw): this is the sum of all tasks "belonging" to the
runqueue, including the tasks in Inactive state.
The algorithm reclaims the bandwidth of the tasks in Inactive state.
It does so by decrementing the runtime of the executing task Ti at a pace equal
dq = -max{ Ui, (1 - Uinact) } dt
where Uinact is the inactive utilization, computed as (this_bq - running_bw),
and Ui is the bandwidth of task Ti.
Let's now see a trivial example of two deadline tasks with runtime equal
to 4 and period equal to 8 (i.e., bandwidth equal to 0.5):
A Task T1
| |
| |
|-------- |----
| | V
0 1 2 3 4 5 6 7 8
A Task T2
| |
| |
| ------------------------|
| | V
0 1 2 3 4 5 6 7 8
A running_bw
1 ----------------- ------
| | |
0.5- -----------------
| |
0 1 2 3 4 5 6 7 8
- Time t = 0:
Both tasks are ready for execution and therefore in ActiveContending state.
Suppose Task T1 is the first task to start execution.
Since there are no inactive tasks, its runtime is decreased as dq = -1 dt.
- Time t = 2:
Suppose that task T1 blocks
Task T1 therefore enters the ActiveNonContending state. Since its remaining
runtime is equal to 2, its 0-lag time is equal to t = 4.
Task T2 start execution, with runtime still decreased as dq = -1 dt since
there are no inactive tasks.
- Time t = 4:
This is the 0-lag time for Task T1. Since it didn't woken up in the
meantime, it enters the Inactive state. Its bandwidth is removed from
Task T2 continues its execution. However, its runtime is now decreased as
dq = - 0.5 dt because Uinact = 0.5.
Task T2 therefore reclaims the bandwidth unused by Task T1.
- Time t = 8:
Task T1 wakes up. It enters the ActiveContending state again, and the
running_bw is incremented.
3. Scheduling Real-Time Tasks
......@@ -330,6 +489,15 @@ CONTENTS
14 - J. Erickson, U. Devi and S. Baruah. Improved tardiness bounds for
Global EDF. Proceedings of the 22nd Euromicro Conference on
Real-Time Systems, 2010.
15 - G. Lipari, S. Baruah, Greedy reclamation of unused bandwidth in
constant-bandwidth servers, 12th IEEE Euromicro Conference on Real-Time
Systems, 2000.
16 - L. Abeni, J. Lelli, C. Scordino, L. Palopoli, Greedy CPU reclaiming for
SCHED DEADLINE. In Proceedings of the Real-Time Linux Workshop (RTLWS),
Dusseldorf, Germany, 2014.
17 - L. Abeni, G. Lipari, A. Parri, Y. Sun, Multicore CPU reclaiming: parallel
or sequential?. In Proceedings of the 31st Annual ACM Symposium on Applied
Computing, 2016.
4. Bandwidth management
......@@ -1609,7 +1609,7 @@ Doing the same with chrt -r 5 and function-trace set.
<idle>-0 3dN.2 14us : sched_avg_update <-__cpu_load_update
<idle>-0 3dN.2 14us : _raw_spin_unlock <-cpu_load_update_nohz
<idle>-0 3dN.2 14us : sub_preempt_count <-_raw_spin_unlock
<idle>-0 3dN.1 15us : calc_load_exit_idle <-tick_nohz_idle_exit
<idle>-0 3dN.1 15us : calc_load_nohz_stop <-tick_nohz_idle_exit
<idle>-0 3dN.1 15us : touch_softlockup_watchdog <-tick_nohz_idle_exit
<idle>-0 3dN.1 15us : hrtimer_cancel <-tick_nohz_idle_exit
<idle>-0 3dN.1 15us : hrtimer_try_to_cancel <-hrtimer_cancel
......@@ -555,8 +555,7 @@ static DEFINE_RAW_SPINLOCK(stop_lock);
static void ipi_cpu_stop(unsigned int cpu)
if (system_state == SYSTEM_BOOTING ||
system_state == SYSTEM_RUNNING) {
if (system_state <= SYSTEM_RUNNING) {
pr_crit("CPU%u: stopping\n", cpu);
......@@ -961,8 +961,7 @@ void smp_send_stop(void)
cpumask_copy(&mask, cpu_online_mask);
cpumask_clear_cpu(smp_processor_id(), &mask);
if (system_state == SYSTEM_BOOTING ||
system_state == SYSTEM_RUNNING)
if (system_state <= SYSTEM_RUNNING)
pr_crit("SMP: stopping secondary CPUs\n");
smp_cross_call(&mask, IPI_CPU_STOP);
......@@ -567,8 +567,7 @@ static void stop_this_cpu(void *data)
unsigned int cpu = smp_processor_id();
if (system_state == SYSTEM_BOOTING ||
system_state == SYSTEM_RUNNING) {
if (system_state <= SYSTEM_RUNNING) {
pr_crit("CPU%u: stopping\n", cpu);
......@@ -97,7 +97,7 @@ int smp_generic_cpu_bootable(unsigned int nr)
/* Special case - we inhibit secondary thread startup
* during boot if the user requests it.
if (system_state == SYSTEM_BOOTING && cpu_has_feature(CPU_FTR_SMT)) {
if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) {
if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0)
return 0;
if (smt_enabled_at_boot
......@@ -2265,7 +2265,7 @@ static struct pmu pmu = {
void arch_perf_update_userpage(struct perf_event *event,
struct perf_event_mmap_page *userpg, u64 now)
struct cyc2ns_data *data;
struct cyc2ns_data data;
u64 offset;
userpg->cap_user_time = 0;
......@@ -2277,17 +2277,17 @@ void arch_perf_update_userpage(struct perf_event *event,
if (!using_native_sched_clock() || !sched_clock_stable())
data = cyc2ns_read_begin();
offset = data->cyc2ns_offset + __sched_clock_offset;
offset = data.cyc2ns_offset + __sched_clock_offset;
* Internal timekeeping for enabled/running/stopped times
* is always in the local_clock domain.
userpg->cap_user_time = 1;
userpg->time_mult = data->cyc2ns_mul;
userpg->time_shift = data->cyc2ns_shift;
userpg->time_mult = data.cyc2ns_mul;
userpg->time_shift = data.cyc2ns_shift;
userpg->time_offset = offset - now;
......@@ -2299,7 +2299,7 @@ void arch_perf_update_userpage(struct perf_event *event,
userpg->time_zero = offset;
......@@ -29,11 +29,9 @@ struct cyc2ns_data {
u32 cyc2ns_mul;
u32 cyc2ns_shift;
u64 cyc2ns_offset;
u32 __count;
/* u32 hole */
}; /* 24 bytes -- do not grow */
}; /* 16 bytes */
extern struct cyc2ns_data *cyc2ns_read_begin(void);
extern void cyc2ns_read_end(struct cyc2ns_data *);
extern void cyc2ns_read_begin(struct cyc2ns_data *);
extern void cyc2ns_read_end(void);
#endif /* _ASM_X86_TIMER_H */
......@@ -863,7 +863,7 @@ static void announce_cpu(int cpu, int apicid)
if (cpu == 1)
printk(KERN_INFO "x86: Booting SMP configuration:\n");
if (system_state == SYSTEM_BOOTING) {
if (system_state < SYSTEM_RUNNING) {
if (node != current_node) {
if (current_node > (-1))
......@@ -51,115 +51,34 @@ static u32 art_to_tsc_denominator;
static u64 art_to_tsc_offset;
struct clocksource *art_related_clocksource;
* Use a ring-buffer like data structure, where a writer advances the head by
* writing a new data entry and a reader advances the tail when it observes a
* new entry.
* Writers are made to wait on readers until there's space to write a new
* entry.
* This means that we can always use an {offset, mul} pair to compute a ns
* value that is 'roughly' in the right direction, even if we're writing a new
* {offset, mul} pair during the clock read.
* The down-side is that we can no longer guarantee strict monotonicity anymore
* (assuming the TSC was that to begin with), because while we compute the
* intersection point of the two clock slopes and make sure the time is
* continuous at the point of switching; we can no longer guarantee a reader is
* strictly before or after the switch point.
* It does mean a reader no longer needs to disable IRQs in order to avoid
* CPU-Freq updates messing with his times, and similarly an NMI reader will
* no longer run the risk of hitting half-written state.
struct cyc2ns {
struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */
struct cyc2ns_data *head; /* 48 + 8 = 56 */
struct cyc2ns_data *tail; /* 56 + 8 = 64 */
}; /* exactly fits one cacheline */
static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
struct cyc2ns_data *cyc2ns_read_begin(void)
struct cyc2ns_data *head;
head = this_cpu_read(cyc2ns.head);
* Ensure we observe the entry when we observe the pointer to it.
* matches the wmb from cyc2ns_write_end().
struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */
seqcount_t seq; /* 32 + 4 = 36 */
return head;
}; /* fits one cacheline */
void cyc2ns_read_end(struct cyc2ns_data *head)
* If we're the outer most nested read; update the tail pointer
* when we're done. This notifies possible pending writers
* that we've observed the head pointer and that the other
* entry is now free.
if (!--head->__count) {
* x86-TSO does not reorder writes with older reads;
* therefore once this write becomes visible to another
* cpu, we must be finished reading the cyc2ns_data.
* matches with cyc2ns_write_begin().
this_cpu_write(cyc2ns.tail, head);
static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
* Begin writing a new @data entry for @cpu.
* Assumes some sort of write side lock; currently 'provided' by the assumption
* that cpufreq will call its notifiers sequentially.
static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
void cyc2ns_read_begin(struct cyc2ns_data *data)
struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
struct cyc2ns_data *data = c2n->data;
int seq, idx;
if (data == c2n->head)
/* XXX send an IPI to @cpu in order to guarantee a read? */
do {
seq = this_cpu_read(cyc2ns.seq.sequence);
idx = seq & 1;
* When we observe the tail write from cyc2ns_read_end(),
* the cpu must be done with that entry and its safe
* to start writing to it.
while (c2n->tail == data)
data->cyc2ns_offset = this_cpu_read([idx].cyc2ns_offset);
data->cyc2ns_mul = this_cpu_read([idx].cyc2ns_mul);
data->cyc2ns_shift = this_cpu_read([idx].cyc2ns_shift);
return data;
} while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));
static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
void cyc2ns_read_end(void)
struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
* Ensure the @data writes are visible before we publish the
* entry. Matches the data-depencency in cyc2ns_read_begin().
ACCESS_ONCE(c2n->head) = data;
......@@ -191,7 +110,6 @@ static void cyc2ns_data_init(struct cyc2ns_data *data)
data->cyc2ns_mul = 0;
data->cyc2ns_shift = 0;
data->cyc2ns_offset = 0;
data->__count = 0;
static void cyc2ns_init(int cpu)
......@@ -201,51 +119,29 @@ static void cyc2ns_init(int cpu)
c2n->head = c2n->data;
c2n->tail = c2n->data;
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
struct cyc2ns_data *data, *tail;
struct cyc2ns_data data;
unsigned long long ns;
* See cyc2ns_read_*() for details; replicated in order to avoid
* an extra few instructions that came with the abstraction.
* Notable, it allows us to only do the __count and tail update
* dance when its actually needed.
data = this_cpu_read(cyc2ns.head);
tail = this_cpu_read(cyc2ns.tail);
if (likely(data == tail)) {
ns = data->cyc2ns_offset;
ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
} else {
ns = data->cyc2ns_offset;
ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
ns = data.cyc2ns_offset;
ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
if (!--data->__count)
this_cpu_write(cyc2ns.tail, data);
return ns;
static void set_cyc2ns_scale(unsigned long khz, int cpu)
static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
unsigned long long tsc_now, ns_now;
struct cyc2ns_data *data;
unsigned long long ns_now;
struct cyc2ns_data data;
struct cyc2ns *c2n;
unsigned long flags;
......@@ -254,9 +150,6 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
if (!khz)
goto done;
data = cyc2ns_write_begin(cpu);
tsc_now = rdtsc();
ns_now = cycles_2_ns(tsc_now);
......@@ -264,7 +157,7 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
* time function is continuous; see the comment near struct
* cyc2ns_data.
clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, khz,
clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz,
......@@ -273,20 +166,26 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
* conversion algorithm shifting a 32-bit value (now specifies a 64-bit
* value) - refer perf_event_mmap_page documentation in perf_event.h.
if (data->cyc2ns_shift == 32) {
data->cyc2ns_shift = 31;
data->cyc2ns_mul >>= 1;
if (data.cyc2ns_shift == 32) {
data.cyc2ns_shift = 31;
data.cyc2ns_mul >>= 1;
data->cyc2ns_offset = ns_now -
mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
data.cyc2ns_offset = ns_now -
mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift);
c2n = per_cpu_ptr(&cyc2ns, cpu);
cyc2ns_write_end(cpu, data);
c2n->data[0] = data;
c2n->data[1] = data;
* Scheduler clock - returns current time in nanosec units.
......@@ -374,6 +273,8 @@ static int __init tsc_setup(char *str)
tsc_clocksource_reliable = 1;
if (!strncmp(str, "noirqtime", 9))
no_sched_irq_time = 1;
if (!strcmp(str, "unstable"))
mark_tsc_unstable("boot parameter");
return 1;