/*
 *  kernel/sched/core.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 *
 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *		make semaphores SMP safe
 *  1998-11-19	Implemented schedule_timeout() and related stuff
 *		by Andrea Arcangeli
 *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
 *		hybrid priority-list and round-robin design with
 *		an array-switch method of distributing timeslices
 *		and per-CPU runqueues.  Cleanups and useful suggestions
 *		by Davide Libenzi, preemptible kernel bits by Robert Love.
 *  2003-09-03	Interactivity tuning by Con Kolivas.
 *  2004-04-02	Scheduler domains code by Nick Piggin
 *  2007-04-15  Work begun on replacing all interactivity tuning with a
 *              fair scheduling design by Con Kolivas.
 *  2007-05-05  Load balancing (smp-nice) and other improvements
 *              by Peter Williams
 *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
 *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
 *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
 *              Thomas Gleixner, Mike Kravetz
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/init_task.h>
#include <linux/binfmts.h>
#include <linux/context_tracking.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include <asm/mutex.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif

#include "sched.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

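/*
 * Arm @period_timer if it is not already active: forward it past the
 * current time by whole periods and restart it (pinned, absolute mode),
 * preserving its soft/hard expiry range.
 */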
void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
	unsigned long delta;
	ktime_t soft, hard, now;

	for (;;) {
		if (hrtimer_active(period_timer))
			break;

		now = hrtimer_cb_get_time(period_timer);
		hrtimer_forward(period_timer, now, period);

		soft = hrtimer_get_softexpires(period_timer);
		hard = hrtimer_get_expires(period_timer);
		delta = ktime_to_ns(ktime_sub(hard, soft));
		__hrtimer_start_range_ns(period_timer, soft, delta,
					 HRTIMER_MODE_ABS_PINNED, 0);
	}
}

DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static void update_rq_clock_task(struct rq *rq, s64 delta);

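/*
 * Advance rq->clock by the time that passed since the last update and
 * propagate the delta to rq->clock_task, unless clock updates are
 * currently being skipped (rq->skip_clock_update > 0).
 */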
void update_rq_clock(struct rq *rq)
{
	s64 delta;

	if (rq->skip_clock_update > 0)
		return;

	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
	rq->clock += delta;
	update_rq_clock_task(rq, delta);
}

/*
 * Debugging: various feature bits
 */

#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "features.h"
	0;

#undef SCHED_FEAT

#ifdef CONFIG_SCHED_DEBUG
#define SCHED_FEAT(name, enabled)	\
	#name ,

static const char * const sched_feat_names[] = {
#include "features.h"
};

#undef SCHED_FEAT

static int sched_feat_show(struct seq_file *m, void *v)
{
	int i;

	for (i = 0; i < __SCHED_FEAT_NR; i++) {
		if (!(sysctl_sched_features & (1UL << i)))
			seq_puts(m, "NO_");
		seq_printf(m, "%s ", sched_feat_names[i]);
	}
	seq_puts(m, "\n");

	return 0;
}

#ifdef HAVE_JUMP_LABEL

#define jump_label_key__true  STATIC_KEY_INIT_TRUE
#define jump_label_key__false STATIC_KEY_INIT_FALSE

#define SCHED_FEAT(name, enabled)	\
	jump_label_key__##enabled ,

struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
#include "features.h"
};

#undef SCHED_FEAT

static void sched_feat_disable(int i)
{
	if (static_key_enabled(&sched_feat_keys[i]))
		static_key_slow_dec(&sched_feat_keys[i]);
}

static void sched_feat_enable(int i)
{
	if (!static_key_enabled(&sched_feat_keys[i]))
		static_key_slow_inc(&sched_feat_keys[i]);
}
#else
static void sched_feat_disable(int i) { };
static void sched_feat_enable(int i) { };
#endif /* HAVE_JUMP_LABEL */

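/*
 * Enable or disable a single scheduler feature by name; a "NO_" prefix
 * disables it.  Returns the index of the matched feature, or
 * __SCHED_FEAT_NR if no feature of that name exists.
 */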
static int sched_feat_set(char *cmp)
{
	int i;
	int neg = 0;

	if (strncmp(cmp, "NO_", 3) == 0) {
		neg = 1;
		cmp += 3;
	}

	for (i = 0; i < __SCHED_FEAT_NR; i++) {
		if (strcmp(cmp, sched_feat_names[i]) == 0) {
			if (neg) {
				sysctl_sched_features &= ~(1UL << i);
				sched_feat_disable(i);
			} else {
				sysctl_sched_features |= (1UL << i);
				sched_feat_enable(i);
			}
			break;
		}
	}

	return i;
}

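/*
 * Write handler for the "sched_features" debugfs file: accepts one
 * feature name per write, optionally prefixed with "NO_".
 */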
static ssize_t
sched_feat_write(struct file *filp, const char __user *ubuf,
		size_t cnt, loff_t *ppos)
{
	char buf[64];
	char *cmp;
	int i;

	if (cnt > 63)
		cnt = 63;

	if (copy_from_user(&buf, ubuf, cnt))
		return -EFAULT;

	buf[cnt] = 0;
	cmp = strstrip(buf);

	i = sched_feat_set(cmp);
	if (i == __SCHED_FEAT_NR)
		return -EINVAL;

	*ppos += cnt;

	return cnt;
}

static int sched_feat_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, sched_feat_show, NULL);
}

static const struct file_operations sched_feat_fops = {
	.open		= sched_feat_open,
	.write		= sched_feat_write,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static __init int sched_init_debug(void)
{
	debugfs_create_file("sched_features", 0644, NULL, NULL,
			&sched_feat_fops);

	return 0;
}
late_initcall(sched_init_debug);
#endif /* CONFIG_SCHED_DEBUG */

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

/*
 * period over which we measure -rt task cpu usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

__read_mostly int scheduler_running;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;



/*
 * __task_rq_lock - lock the rq @p resides on.
 */
static inline struct rq *__task_rq_lock(struct task_struct *p)
	__acquires(rq->lock)
{
	struct rq *rq;

	lockdep_assert_held(&p->pi_lock);

	for (;;) {
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p)))
			return rq;
		raw_spin_unlock(&rq->lock);
	}
}

/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
	__acquires(p->pi_lock)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		raw_spin_lock_irqsave(&p->pi_lock, *flags);
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p)))
			return rq;
		raw_spin_unlock(&rq->lock);
		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
	}
}

static void __task_rq_unlock(struct rq *rq)
	__releases(rq->lock)
{
	raw_spin_unlock(&rq->lock);
}

static inline void
task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
	__releases(rq->lock)
	__releases(p->pi_lock)
{
	raw_spin_unlock(&rq->lock);
	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}

/*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */
static struct rq *this_rq_lock(void)
	__acquires(rq->lock)
{
	struct rq *rq;

	local_irq_disable();
	rq = this_rq();
	raw_spin_lock(&rq->lock);

	return rq;
}

#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 *
 * It's all a bit involved since we cannot program an hrt while holding the
 * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
 * reschedule event.
 *
 * When we get rescheduled we reprogram the hrtick_timer outside of the
 * rq->lock.
 */

static void hrtick_clear(struct rq *rq)
{
	if (hrtimer_active(&rq->hrtick_timer))
		hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
	struct rq *rq = container_of(timer, struct rq, hrtick_timer);

	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

	raw_spin_lock(&rq->lock);
	update_rq_clock(rq);
	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
	raw_spin_unlock(&rq->lock);

	return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP
/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
	struct rq *rq = arg;

	raw_spin_lock(&rq->lock);
	hrtimer_restart(&rq->hrtick_timer);
	rq->hrtick_csd_pending = 0;
	raw_spin_unlock(&rq->lock);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	ktime_t time = ktime_add_ns(timer->base->get_time(), delay);

	hrtimer_set_expires(timer, time);

	if (rq == this_rq()) {
		hrtimer_restart(timer);
	} else if (!rq->hrtick_csd_pending) {
		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
		rq->hrtick_csd_pending = 1;
	}
}

static int
hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int cpu = (int)(long)hcpu;

	switch (action) {
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		hrtick_clear(cpu_rq(cpu));
		return NOTIFY_OK;
	}

	return NOTIFY_DONE;
}

static __init void init_hrtick(void)
{
	hotcpu_notifier(hotplug_hrtick, 0);
}
#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
			HRTIMER_MODE_REL_PINNED, 0);
}

static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SMP */

static void init_rq_hrtick(struct rq *rq)
{
#ifdef CONFIG_SMP
	rq->hrtick_csd_pending = 0;

	rq->hrtick_csd.flags = 0;
	rq->hrtick_csd.func = __hrtick_start;
	rq->hrtick_csd.info = rq;
#endif

	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rq->hrtick_timer.function = hrtick;
}
#else	/* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void init_rq_hrtick(struct rq *rq)
{
}

static inline void init_hrtick(void)
{
}
#endif	/* CONFIG_SCHED_HRTICK */

/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
#ifdef CONFIG_SMP

#ifndef tsk_is_polling
#define tsk_is_polling(t) 0
#endif

void resched_task(struct task_struct *p)
{
	int cpu;

	assert_raw_spin_locked(&task_rq(p)->lock);

	if (test_tsk_need_resched(p))
		return;

	set_tsk_need_resched(p);

	cpu = task_cpu(p);
	if (cpu == smp_processor_id())
		return;

	/* NEED_RESCHED must be visible before we test polling */
	smp_mb();
	if (!tsk_is_polling(p))
		smp_send_reschedule(cpu);
}

void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	if (!raw_spin_trylock_irqsave(&rq->lock, flags))
		return;
	resched_task(cpu_curr(cpu));
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

#ifdef CONFIG_NO_HZ
/*
 * In the semi idle case, use the nearest busy cpu for migrating timers
 * from an idle cpu.  This is good for power-savings.
 *
 * We don't do similar optimization for completely idle system, as
 * selecting an idle cpu will add more delays to the timers than intended
 * (as that cpu's timer base may not be uptodate wrt jiffies etc).
 */
int get_nohz_timer_target(void)
{
	int cpu = smp_processor_id();
	int i;
	struct sched_domain *sd;

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		for_each_cpu(i, sched_domain_span(sd)) {
			if (!idle_cpu(i)) {
				cpu = i;
				goto unlock;
			}
		}
	}
unlock:
	rcu_read_unlock();
	return cpu;
}
/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */
void wake_up_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (cpu == smp_processor_id())
		return;

	/*
	 * This is safe, as this function is called with the timer
	 * wheel base lock of (cpu) held. When the CPU is on the way
	 * to idle and has not yet set rq->curr to idle then it will
	 * be serialized on the timer wheel base lock and take the new
	 * timer into account automatically.
	 */
	if (rq->curr != rq->idle)
		return;

	/*
	 * We can set TIF_RESCHED on the idle task of the other CPU
	 * lockless. The worst case is that the other CPU runs the
	 * idle task through an additional NOOP schedule()
	 */
	set_tsk_need_resched(rq->idle);

	/* NEED_RESCHED must be visible before we test polling */
	smp_mb();
	if (!tsk_is_polling(rq->idle))
		smp_send_reschedule(cpu);
}

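/*
 * Has this (idle) CPU been asked to run the NOHZ idle load balancer on
 * behalf of the other idle CPUs?
 */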
static inline bool got_nohz_idle_kick(void)
{
	int cpu = smp_processor_id();
	return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
}

#else /* CONFIG_NO_HZ */

static inline bool got_nohz_idle_kick(void)
{
	return false;
}

#endif /* CONFIG_NO_HZ */

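/*
 * Age the RT average: for every sched_avg_period() that has elapsed
 * since rq->age_stamp, halve rq->rt_avg.
 */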
void sched_avg_update(struct rq *rq)
{
	s64 period = sched_avg_period();

	while ((s64)(rq->clock - rq->age_stamp) > period) {
		/*
		 * Inline assembly required to prevent the compiler
		 * optimising this loop into a divmod call.
		 * See __iter_div_u64_rem() for another example of this.
		 */
		asm("" : "+rm" (rq->age_stamp));
		rq->age_stamp += period;
		rq->rt_avg /= 2;
	}
}

#else /* !CONFIG_SMP */
void resched_task(struct task_struct *p)
{
	assert_raw_spin_locked(&task_rq(p)->lock);
	set_tsk_need_resched(p);
}
#endif /* CONFIG_SMP */

#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
/*
 * Iterate task_group tree rooted at *from, calling @down when first entering a
 * node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 */
int walk_tg_tree_from(struct task_group *from,
			     tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *parent, *child;
	int ret;

	parent = from;

down:
	ret = (*down)(parent, data);
	if (ret)
		goto out;
	list_for_each_entry_rcu(child, &parent->children, siblings) {
		parent = child;
		goto down;

up:
		continue;
	}
	ret = (*up)(parent, data);
	if (ret || parent == from)
		goto out;

	child = parent;
	parent = parent->parent;
	if (parent)
		goto up;
out:
	return ret;
}

int tg_nop(struct task_group *tg, void *data)
{
	return 0;
}
#endif

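/*
 * Derive p->se.load (weight and inverse weight) from the task's static
 * priority; SCHED_IDLE tasks always get the minimal idle weight.
 */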
static void set_load_weight(struct task_struct *p)
{
	int prio = p->static_prio - MAX_RT_PRIO;
	struct load_weight *load = &p->se.load;

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (p->policy == SCHED_IDLE) {
		load->weight = scale_load(WEIGHT_IDLEPRIO);
		load->inv_weight = WMULT_IDLEPRIO;
		return;
	}

	load->weight = scale_load(prio_to_weight[prio]);
	load->inv_weight = prio_to_wmult[prio];
}

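/*
 * enqueue_task()/dequeue_task(): update the rq clock and sched_info
 * accounting, then hand the task to its scheduling class.
 */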
static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	sched_info_queued(p);
	p->sched_class->enqueue_task(rq, p, flags);
}

static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	sched_info_dequeued(p);
	p->sched_class->dequeue_task(rq, p, flags);
}

void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, flags);
}

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, flags);
}

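/*
 * Update rq->clock_task with the part of @delta that was spent running
 * task work, i.e. with hardirq/softirq time and paravirt steal time
 * (where configured) taken back out.
 */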
static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compiler should just see 0 here, and optimize out the call
 * to sched_rt_avg_update. But I don't trust it...
 */
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	s64 steal = 0, irq_delta = 0;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight misattribution of {soft,}irq
	 * time, a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		u64 st;

		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		if (unlikely(steal > delta))
			steal = delta;

		st = steal_ticks(steal);
		steal = st * TICK_NSEC;

		rq->prev_steal_time_rq += steal;

		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
		sched_rt_avg_update(rq, irq_delta + steal);
#endif
}

void sched_set_stop_task(int cpu, struct task_struct *stop)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	struct task_struct *old_stop = cpu_rq(cpu)->stop;

	if (stop) {
		/*
		 * Make it appear like a SCHED_FIFO task, it's something
		 * userspace knows about and won't get confused about.
		 *
		 * Also, it will make PI more or less work without too
		 * much confusion -- but then, stop work should not
		 * rely on PI working anyway.
		 */
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);

		stop->sched_class = &stop_sched_class;
	}

	cpu_rq(cpu)->stop = stop;

	if (old_stop) {
		/*
		 * Reset it back to a normal scheduling class so that
		 * it can die in pieces.
		 */
		old_stop->sched_class = &rt_sched_class;
	}
}

/*
 * __normal_prio - return the priority that is based on the static prio
 */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
	int prio;

	if (task_has_rt_policy(p))
		prio = MAX_RT_PRIO-1 - p->rt_priority;
	else
		prio = __normal_prio(p);
	return prio;
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);
	/*
	 * If we are RT tasks or we were boosted to RT priority,
	 * keep the priority unchanged. Otherwise, update priority
	 * to the normal priority:
	 */
	if (!rt_prio(p->prio))
		return p->normal_prio;
	return p->prio;
}

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}

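/*
 * Notify the scheduling class(es) that @p changed class or priority, so
 * they can fix up their internal state via the switched_from(),
 * switched_to() and prio_changed() callbacks.
 */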
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
				       const struct sched_class *prev_class,
				       int oldprio)
{
	if (prev_class != p->sched_class) {
		if (prev_class->switched_from)
			prev_class->switched_from(rq, p);
		p->sched_class->switched_to(rq, p);
	} else if (oldprio != p->prio)
		p->sched_class->prio_changed(rq, p, oldprio);
}

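/*
 * Should @p preempt the task currently running on @rq?  Within a class
 * the class itself decides; a task of a higher scheduling class always
 * preempts one of a lower class.
 */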
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	const struct sched_class *class;

	if (p->sched_class == rq->curr->sched_class) {
		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
	} else {
		for_each_class(class) {
			if (class == rq->curr->sched_class)
				break;
			if (class == p->sched_class) {
				resched_task(rq->curr);
				break;
			}
		}
	}

	/*
	 * A queue event has occurred, and we're going to schedule.  In
	 * this case, we can save a useless back to back clock update.
	 */
	if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
		rq->skip_clock_update = 1;
}

static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);

void register_task_migration_notifier(struct notifier_block *n)
{
	atomic_notifier_chain_register(&task_migration_notifier, n);
}

#ifdef CONFIG_SMP
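/*
 * Record that @p now belongs to @new_cpu: let the scheduling class
 * react, account the migration and call the task-migration notifier
 * chain, then update the task's CPU.
 */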
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
	/*
	 * We should never call set_task_cpu() on a blocked task,
	 * ttwu() will sort out the placement.
	 */
	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));

#ifdef CONFIG_LOCKDEP
	/*
	 * The caller should hold either p->pi_lock or rq->lock, when changing
	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
	 *
	 * sched_move_task() holds both and thus holding either pins the cgroup,
	 * see task_group().
	 *
	 * Furthermore, all task_rq users should acquire both locks, see
	 * task_rq_lock().
	 */
	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
				      lockdep_is_held(&task_rq(p)->lock)));
#endif
#endif

	trace_sched_migrate_task(p, new_cpu);

	if (task_cpu(p) != new_cpu) {
		struct task_migration_notifier tmn;

		if (p->sched_class->migrate_task_rq)
			p->sched_class->migrate_task_rq(p, new_cpu);
		p->se.nr_migrations++;
		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);

		tmn.task = p;
		tmn.from_cpu = task_cpu(p);
		tmn.to_cpu = new_cpu;

		atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
	}

	__set_task_cpu(p, new_cpu);
}

struct migration_arg {
	struct task_struct *task;
	int dest_cpu;
};

static int migration_cpu_stop(void *data);

/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero, it's the @p->state value just checked and
 * not expected to change.  If it changes, i.e. @p might have woken up,
 * then return zero.  When we succeed in waiting for @p to be off its CPU,
 * we return a positive number (its total switch count).  If a second call
 * a short while later returns the same number, the caller can be sure that
 * @p has remained unscheduled the whole time.
 *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a *long* time. This function can't
 * be called with interrupts off, or it may introduce deadlock with
 * smp_call_function() if an IPI is sent by the same process we are
 * waiting to become inactive.
 */
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
	unsigned long flags;
	int running, on_rq;
	unsigned long ncsw;
	struct rq *rq;

	for (;;) {
		/*
		 * We do the initial early heuristics without holding
		 * any task-queue locks at all. We'll only try to get
		 * the runqueue lock when things look like they will
		 * work out!
		 */
		rq = task_rq(p);

		/*
		 * If the task is actively running on another CPU
		 * still, just relax and busy-wait without holding
		 * any locks.
		 *
		 * NOTE! Since we don't hold any locks, it's not
		 * even sure that "rq" stays as the right runqueue!
		 * But we don't care, since "task_running()" will
		 * return false if the runqueue has changed and p
		 * is actually now running somewhere else!
		 */
		while (task_running(rq, p)) {
			if (match_state && unlikely(p->state != match_state))
				return 0;
			cpu_relax();
		}

		/*
		 * Ok, time to look more closely! We need the rq
		 * lock now, to be *sure*. If we're wrong, we'll
		 * just go back and repeat.
		 */
		rq = task_rq_lock(p, &flags);
		trace_sched_wait_task(p);
		running = task_running(rq, p);
		on_rq = p->on_rq;
		ncsw = 0;
		if (!match_state || p->state == match_state)
			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
		task_rq_unlock(rq, p, &flags);

		/*
		 * If it changed from the expected state, bail out now.
		 */
		if (unlikely(!ncsw))
			break;

		/*
		 * Was it really running after all now that we
		 * checked with the proper locks actually held?
		 *
		 * Oops. Go back and try again..
		 */
		if (unlikely(running)) {
			cpu_relax();
			continue;
		}

		/*
		 * It's not enough that it's not actively running,
		 * it must be off the runqueue _entirely_, and not
		 * preempted!
		 *
		 * So if it was still runnable (but just not actively
		 * running right now), it's preempted, and we should
		 * yield - it could be a while.
		 */
		if (unlikely(on_rq)) {
			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);

			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
			continue;
		}

		/*
		 * Ahh, all good. It wasn't running, and it wasn't
		 * runnable, which means that it will never become
		 * running in the future either. We're all done!
		 */
		break;
	}

	return ncsw;
}