/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 *
 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 *  Interactivity improvements by Mike Galbraith
 *  (C) 2007 Mike Galbraith <efault@gmx.de>
 *
 *  Various enhancements by Dmitry Adamushko.
 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
 *
 *  Group scheduling enhancements by Srivatsa Vaddagiri
 *  Copyright IBM Corporation, 2007
 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 *
 *  Scaled math optimizations by Thomas Gleixner
 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
 *
 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 */

#include <linux/latencytop.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>

#include <trace/events/sched.h>

#include "sched.h"

/*
 * Targeted preemption latency for CPU-bound tasks:
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches (cs) field)
 */
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;

/*
 * The initial- and re-scaling of tunables is configurable
 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
 *
 * Options are:
 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
 * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
 */
enum sched_tunable_scaling sysctl_sched_tunable_scaling
	= SCHED_TUNABLESCALING_LOG;

/*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;

/*
 * sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity
 */
static unsigned int sched_nr_latency = 8;

/*
 * After fork, child runs first. If set to 0 (default) then
 * parent will (try to) run first.
 */
unsigned int sysctl_sched_child_runs_first __read_mostly;

/*
 * SCHED_OTHER wake-up granularity.
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

/*
 * The exponential sliding window over which load is averaged for shares
 * distribution.
 * (default: 10msec)
 */
unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;

#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 * each time a cfs_rq requests quota.
 *
 * Note: in the case that the slice exceeds the runtime remaining (either due
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
 * default: 5 msec, units: microseconds
 */
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
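/*
 * Illustrative example, assuming the 5ms default above: a task group
 * given a 20ms quota per period hands it out to busy per-cpu cfs_rqs in
 * 5ms slices, so roughly four CPUs can each draw a full slice before the
 * group must wait for the quota to be refilled; a cfs_rq that asks when
 * only 2ms of quota remain receives just those 2ms.
 */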

static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
	lw->weight += inc;
	lw->inv_weight = 0;
}

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
	lw->weight -= dec;
	lw->inv_weight = 0;
}

static inline void update_load_set(struct load_weight *lw, unsigned long w)
{
	lw->weight = w;
	lw->inv_weight = 0;
}

/*
 * Increase the granularity value when there are more CPUs,
 * because with more CPUs the 'effective latency' as visible
 * to users decreases. But the relationship is not linear,
 * so pick a second-best guess by going with the log2 of the
 * number of CPUs.
 *
 * This idea comes from the SD scheduler of Con Kolivas:
 */
static int get_update_sysctl_factor(void)
{
	unsigned int cpus = min_t(int, num_online_cpus(), 8);
	unsigned int factor;

	switch (sysctl_sched_tunable_scaling) {
	case SCHED_TUNABLESCALING_NONE:
		factor = 1;
		break;
	case SCHED_TUNABLESCALING_LINEAR:
		factor = cpus;
		break;
	case SCHED_TUNABLESCALING_LOG:
	default:
		factor = 1 + ilog2(cpus);
		break;
	}

	return factor;
}

static void update_sysctl(void)
{
	unsigned int factor = get_update_sysctl_factor();

#define SET_SYSCTL(name) \
	(sysctl_##name = (factor) * normalized_sysctl_##name)
	SET_SYSCTL(sched_min_granularity);
	SET_SYSCTL(sched_latency);
	SET_SYSCTL(sched_wakeup_granularity);
#undef SET_SYSCTL
}

void sched_init_granularity(void)
{
	update_sysctl();
}
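/*
 * Illustrative example, assuming the default SCHED_TUNABLESCALING_LOG
 * policy and the normalized values defined above: on an 8-CPU machine,
 * get_update_sysctl_factor() returns 1 + ilog2(min(8, 8)) = 4, so
 * update_sysctl() yields roughly
 *   sysctl_sched_latency            = 4 * 6ms    = 24ms
 *   sysctl_sched_min_granularity    = 4 * 0.75ms = 3ms
 *   sysctl_sched_wakeup_granularity = 4 * 1ms    = 4ms
 * Machines with more than 8 online CPUs are clamped to the same factor.
 */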

#define WMULT_CONST	(~0U)
#define WMULT_SHIFT	32

static void __update_inv_weight(struct load_weight *lw)
{
	unsigned long w;

	if (likely(lw->inv_weight))
		return;

	w = scale_load_down(lw->weight);

	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
		lw->inv_weight = 1;
	else if (unlikely(!w))
		lw->inv_weight = WMULT_CONST;
	else
		lw->inv_weight = WMULT_CONST / w;
}

/*
 * delta_exec * weight / lw.weight
 *   OR
 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 *
 * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
 * we're guaranteed shift stays positive because inv_weight is guaranteed to
 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
 *
 * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
 * weight/lw.weight <= 1, and therefore our shift will also be positive.
 */
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
	u64 fact = scale_load_down(weight);
	int shift = WMULT_SHIFT;

	__update_inv_weight(lw);

	if (unlikely(fact >> 32)) {
		while (fact >> 32) {
			fact >>= 1;
			shift--;
		}
	}

	/* hint to use a 32x32->64 mul */
	fact = (u64)(u32)fact * lw->inv_weight;

	while (fact >> 32) {
		fact >>= 1;
		shift--;
	}

	return mul_u64_u32_shr(delta_exec, fact, shift);
}
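/*
 * Illustrative arithmetic for __calc_delta(), assuming weight ==
 * scale_load_down(NICE_0_LOAD) == 1024 and lw->weight == 2048:
 *   lw->inv_weight = WMULT_CONST / 2048        ~= 2^21
 *   fact           = 1024 * lw->inv_weight     ~= 2^31  (still fits 32 bits)
 *   result         = (delta_exec * fact) >> 32 ~= delta_exec / 2
 * i.e. delta_exec is scaled by roughly weight / lw->weight == 1/2, with
 * the shift only shrinking when the intermediate product would overflow.
 */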


const struct sched_class fair_sched_class;

/**************************************************************
 * CFS operations on generic schedulable entities:
 */

#ifdef CONFIG_FAIR_GROUP_SCHED

/* cpu runqueue to which this cfs_rq is attached */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return cfs_rq->rq;
}

/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se)	(!se->my_q)

static inline struct task_struct *task_of(struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!entity_is_task(se));
#endif
	return container_of(se, struct task_struct, se);
}

/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return p->se.cfs_rq;
}

/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	return se->cfs_rq;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return grp->my_q;
}

static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
				       int force_update);

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (!cfs_rq->on_list) {
		/*
		 * Ensure we either appear before our parent (if already
		 * enqueued) or force our parent to appear after us when it is
		 * enqueued.  The fact that we always enqueue bottom-up
		 * reduces this to two cases.
		 */
		if (cfs_rq->tg->parent &&
		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
				&rq_of(cfs_rq)->leaf_cfs_rq_list);
		} else {
			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				&rq_of(cfs_rq)->leaf_cfs_rq_list);
		}

		cfs_rq->on_list = 1;
		/* We should have no load, but we need to update last_decay. */
		update_cfs_rq_blocked_load(cfs_rq, 0);
	}
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (cfs_rq->on_list) {
		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
		cfs_rq->on_list = 0;
	}
}

/* Iterate thr' all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)

/* Do the two (enqueued) entities belong to the same group ? */
static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	if (se->cfs_rq == pse->cfs_rq)
		return 1;

	return 0;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return se->parent;
}

/* return depth at which a sched entity is present in the hierarchy */
static inline int depth_se(struct sched_entity *se)
{
	int depth = 0;

	for_each_sched_entity(se)
		depth++;

	return depth;
}

static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
	int se_depth, pse_depth;

	/*
	 * preemption test can be made between sibling entities who are in the
	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
	 * both tasks until we find their ancestors who are siblings of common
	 * parent.
	 */

	/* First walk up until both entities are at same depth */
	se_depth = depth_se(*se);
	pse_depth = depth_se(*pse);

	while (se_depth > pse_depth) {
		se_depth--;
		*se = parent_entity(*se);
	}

	while (pse_depth > se_depth) {
		pse_depth--;
		*pse = parent_entity(*pse);
	}

	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}

#else	/* !CONFIG_FAIR_GROUP_SCHED */

static inline struct task_struct *task_of(struct sched_entity *se)
{
	return container_of(se, struct task_struct, se);
}

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return container_of(cfs_rq, struct rq, cfs);
}

#define entity_is_task(se)	1

#define for_each_sched_entity(se) \
		for (; se; se = NULL)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return &task_rq(p)->cfs;
}

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	struct task_struct *p = task_of(se);
	struct rq *rq = task_rq(p);

	return &rq->cfs;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return NULL;
}

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

#define for_each_leaf_cfs_rq(rq, cfs_rq) \
		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)

static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	return 1;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return NULL;
}

static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}

#endif	/* CONFIG_FAIR_GROUP_SCHED */

static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);

/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */

static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - max_vruntime);
	if (delta > 0)
		max_vruntime = vruntime;

	return max_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}
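/*
 * Note: vruntime is an unsigned 64-bit quantity that may eventually wrap,
 * so the helpers above compare through a signed delta rather than with a
 * plain '<' or '>'.  For example, with min_vruntime == ULLONG_MAX - 10 and
 * vruntime == 5 (just past the wrap), (s64)(5 - (ULLONG_MAX - 10)) == 16 is
 * positive, so 5 is correctly treated as the logically later value.
 */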

static inline int entity_before(struct sched_entity *a,
				struct sched_entity *b)
{
	return (s64)(a->vruntime - b->vruntime) < 0;
}

static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
	u64 vruntime = cfs_rq->min_vruntime;

	if (cfs_rq->curr)
		vruntime = cfs_rq->curr->vruntime;

	if (cfs_rq->rb_leftmost) {
		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
						   struct sched_entity,
						   run_node);

		if (!cfs_rq->curr)
			vruntime = se->vruntime;
		else
			vruntime = min_vruntime(vruntime, se->vruntime);
	}

	/* ensure we never gain time by being placed backwards. */
	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
	smp_wmb();
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
}

/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	int leftmost = 1;

	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We don't care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (entity_before(se, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = 0;
		}
	}

	/*
	 * Maintain a cache of leftmost tree entries (it is frequently
	 * used):
	 */
	if (leftmost)
		cfs_rq->rb_leftmost = &se->run_node;

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->rb_leftmost == &se->run_node) {
		struct rb_node *next_node;

		next_node = rb_next(&se->run_node);
		cfs_rq->rb_leftmost = next_node;
	}

	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *left = cfs_rq->rb_leftmost;

	if (!left)
		return NULL;

	return rb_entry(left, struct sched_entity, run_node);
}

static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
	struct rb_node *next = rb_next(&se->run_node);

	if (!next)
		return NULL;

	return rb_entry(next, struct sched_entity, run_node);
}

#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);

	if (!last)
		return NULL;

	return rb_entry(last, struct sched_entity, run_node);
}

/**************************************************************
 * Scheduling class statistics methods:
 */

int sched_proc_update_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	int factor = get_update_sysctl_factor();

	if (ret || !write)
		return ret;

	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
					sysctl_sched_min_granularity);

#define WRT_SYSCTL(name) \
	(normalized_sysctl_##name = sysctl_##name / (factor))
	WRT_SYSCTL(sched_min_granularity);
	WRT_SYSCTL(sched_latency);
	WRT_SYSCTL(sched_wakeup_granularity);
#undef WRT_SYSCTL

	return 0;
}
#endif

/*
 * delta /= w
 */
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);

	return delta;
}

/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (sched_nr_latency) we have to stretch
 * this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */
static u64 __sched_period(unsigned long nr_running)
{
	u64 period = sysctl_sched_latency;
	unsigned long nr_latency = sched_nr_latency;

	if (unlikely(nr_running > nr_latency)) {
		period = sysctl_sched_min_granularity;
		period *= nr_running;
	}

	return period;
}
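/*
 * Illustrative example, assuming the unscaled defaults above
 * (sysctl_sched_latency = 6ms, sched_nr_latency = 8,
 * sysctl_sched_min_granularity = 0.75ms): 5 runnable tasks share a 6ms
 * period, while 12 runnable tasks stretch it to 12 * 0.75ms = 9ms so no
 * slice drops below the minimum granularity.
 */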

/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*P[w/rw]
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

	for_each_sched_entity(se) {
		struct load_weight *load;
		struct load_weight lw;

		cfs_rq = cfs_rq_of(se);
		load = &cfs_rq->load;

		if (unlikely(!se->on_rq)) {
			lw = cfs_rq->load;

			update_load_add(&lw, se->load.weight);
			load = &lw;
		}
		slice = __calc_delta(slice, se->load.weight, load);
	}
	return slice;
}
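/*
 * Illustrative example, assuming the unscaled 6ms default latency: for
 * two runnable nice-0 tasks in the same cfs_rq (equal weights, runqueue
 * weight 2048), each gets slice = 6ms * 1024 / 2048 = 3ms of wall-clock
 * time per period; a heavier (lower nice) entity receives a
 * proportionally larger share of the same period.
 */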

/*
 * We calculate the vruntime slice of a to-be-inserted task.
 *
 * vs = s/w
 */
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return calc_delta_fair(sched_slice(cfs_rq, se), se);
}

#ifdef CONFIG_SMP
static unsigned long task_h_load(struct task_struct *p);

static inline void __update_task_entity_contrib(struct sched_entity *se);

/* Give a new task initial runnable-average values so its load weighs heavily in its infant time */
void init_task_runnable_average(struct task_struct *p)
{
	u32 slice;

	p->se.avg.decay_count = 0;
	slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
	p->se.avg.runnable_avg_sum = slice;
	p->se.avg.runnable_avg_period = slice;
	__update_task_entity_contrib(&p->se);
}
#else
void init_task_runnable_average(struct task_struct *p)
{
}
#endif

/*
 * Update the current task's runtime statistics.
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_clock_task(rq_of(cfs_rq));
	u64 delta_exec;

	if (unlikely(!curr))
		return;

	delta_exec = now - curr->exec_start;
	if (unlikely((s64)delta_exec <= 0))
		return;

	curr->exec_start = now;

	schedstat_set(curr->statistics.exec_max,
		      max(delta_exec, curr->statistics.exec_max));

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq, exec_clock, delta_exec);

	curr->vruntime += calc_delta_fair(delta_exec, curr);
	update_min_vruntime(cfs_rq);

	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cpuacct_charge(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}

	account_cfs_rq_runtime(cfs_rq, delta_exec);
}

static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
}

/*
 * Task is being enqueued - update stats:
 */
static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Are we enqueueing a waiting task? (for current tasks
	 * a dequeue/enqueue event is a NOP)
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_start(cfs_rq, se);
}

static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
#ifdef CONFIG_SCHEDSTATS
	if (entity_is_task(se)) {
		trace_sched_stat_wait(task_of(se),
			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
	}
#endif
	schedstat_set(se->statistics.wait_start, 0);
}

static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Mark the end of the wait period if dequeueing a
	 * waiting task:
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_end(cfs_rq, se);
}

/*
 * We are picking a new current task - update its stats:
 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * We are starting a new run period:
	 */
	se->exec_start = rq_clock_task(rq_of(cfs_rq));
}

/**************************************************
 * Scheduling class queueing methods:
 */

#ifdef CONFIG_NUMA_BALANCING
/*
 * Approximate time to scan a full NUMA task in ms. The task scan period is
 * calculated based on the task's virtual memory size and
 * numa_balancing_scan_size.
 */
unsigned int sysctl_numa_balancing_scan_period_min = 1000;
unsigned int sysctl_numa_balancing_scan_period_max = 60000;

/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;

/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;

/*
 * After skipping a page migration on a shared page, skip N more numa page
 * migrations unconditionally. This reduces the number of NUMA migrations
 * in shared memory workloads, and has the effect of pulling tasks towards
 * where their memory lives, over pulling the memory towards the task.
 */
unsigned int sysctl_numa_balancing_migrate_deferred = 16;

static unsigned int task_nr_scan_windows(struct task_struct *p)
{
	unsigned long rss = 0;
	unsigned long nr_scan_pages;

	/*
	 * Calculations based on RSS as non-present and empty pages are skipped
	 * by the PTE scanner and NUMA hinting faults should be trapped based
	 * on resident pages
	 */
	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
	rss = get_mm_rss(p->mm);
	if (!rss)
		rss = nr_scan_pages;

	rss = round_up(rss, nr_scan_pages);
	return rss / nr_scan_pages;
}

/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
#define MAX_SCAN_WINDOW 2560

static unsigned int task_scan_min(struct task_struct *p)
{
	unsigned int scan, floor;
	unsigned int windows = 1;

	if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
		windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
	floor = 1000 / windows;

	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
	return max_t(unsigned int, floor, scan);
}

static unsigned int task_scan_max(struct task_struct *p)
{
	unsigned int smin = task_scan_min(p);
	unsigned int smax;

	/* Watch for min being lower than max due to floor calculations */
	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
	return max(smin, smax);
}
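/*
 * Illustrative example, assuming the defaults above (256MB scan size,
 * 1000ms/60000ms min/max base periods): a task with roughly 1GB of
 * resident memory covers task_nr_scan_windows() == 4 windows, so
 * task_scan_min() returns max(1000/10, 1000/4) == 250ms between scans
 * and task_scan_max() lets the period grow up to 60000/4 == 15000ms.
 */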

/*
 * Once a preferred node is selected the scheduler balancer will prefer moving
 * a task to that node for sysctl_numa_balancing_settle_count number of PTE
 * scans. This will give the process the chance to accumulate more faults on
 * the preferred node but still allow the scheduler to move the task again if
 * the nodes CPUs are overloaded.
 */
unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;

static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
	rq->nr_numa_running += (p->numa_preferred_nid != -1);
	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
}

static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}

struct numa_group {
	atomic_t refcount;

	spinlock_t lock; /* nr_tasks, tasks */
	int nr_tasks;
	pid_t gid;
	struct list_head task_list;

	struct rcu_head rcu;
	unsigned long total_faults;
	unsigned long faults[0];
};

pid_t task_numa_group_id(struct task_struct *p)
{
	return p->numa_group ? p->numa_group->gid : 0;
}

static inline int task_faults_idx(int nid, int priv)
{
	return 2 * nid + priv;
}

static inline unsigned long task_faults(struct task_struct *p, int nid)
{
	if (!p->numa_faults)
		return 0;

	return p->numa_faults[task_faults_idx(nid, 0)] +
		p->numa_faults[task_faults_idx(nid, 1)];
}

static inline unsigned long group_faults(struct task_struct *p, int nid)
{
	if (!p->numa_group)
		return 0;

	return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1];
}

/*
 * These return the fraction of accesses done by a particular task, or
 * task group, on a particular numa node.  The group weight is given a
 * larger multiplier, in order to group tasks together that are almost
 * evenly spread out between numa nodes.
 */
static inline unsigned long task_weight(struct task_struct *p, int nid)
{
	unsigned long total_faults;

	if (!p->numa_faults)
		return 0;

	total_faults = p->total_numa_faults;

	if (!total_faults)
		return 0;

	return 1000 * task_faults(p, nid) / total_faults;
}

static inline unsigned long group_weight(struct task_struct *p, int nid)
{
	if (!p->numa_group || !p->numa_group->total_faults)
		return 0;

	return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
}
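/*
 * Illustrative example: a task with 600 of its 1000 recorded faults on
 * node 1 has task_weight(p, 1) == 600 (the scale is per-mille).  If its
 * numa_group has accumulated 3000 of 4000 faults there, then
 * group_weight(p, 1) == 750; task_numa_compare() below generally prefers
 * the group figure whenever the task belongs to a group.
 */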

static unsigned long weighted_cpuload(const int cpu);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long power_of(int cpu);
static long effective_load(struct task_group *tg, int cpu, long wl, long wg);

/* Cached statistics for all CPUs within a node */
struct numa_stats {
	unsigned long nr_running;
	unsigned long load;

	/* Total compute capacity of CPUs on a node */
	unsigned long power;

	/* Approximate capacity in terms of runnable tasks on a node */
	unsigned long capacity;
	int has_capacity;
};

/*
 * XXX borrowed from update_sg_lb_stats
 */
static void update_numa_stats(struct numa_stats *ns, int nid)
{
	int cpu, cpus = 0;

	memset(ns, 0, sizeof(*ns));
	for_each_cpu(cpu, cpumask_of_node(nid)) {
		struct rq *rq = cpu_rq(cpu);

		ns->nr_running += rq->nr_running;
		ns->load += weighted_cpuload(cpu);
		ns->power += power_of(cpu);

		cpus++;
	}

	/*
	 * If we raced with hotplug and there are no CPUs left in our mask
	 * the @ns structure is NULL'ed and task_numa_compare() will
	 * not find this node attractive.
	 *
	 * We'll either bail at !has_capacity, or we'll detect a huge imbalance
	 * and bail there.
	 */
	if (!cpus)
		return;

	ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
	ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
	ns->has_capacity = (ns->nr_running < ns->capacity);
}
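/*
 * Illustrative example: a node with 4 CPUs at full compute power
 * (SCHED_POWER_SCALE each) ends up with ns->power == 4 * SCHED_POWER_SCALE,
 * hence ns->capacity == 4 and has_capacity set while fewer than 4 tasks
 * run there; ns->load is likewise normalized by the node's power so that
 * nodes of different strength can be compared fairly.
 */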

struct task_numa_env {
	struct task_struct *p;

	int src_cpu, src_nid;
	int dst_cpu, dst_nid;

	struct numa_stats src_stats, dst_stats;

	int imbalance_pct, idx;

	struct task_struct *best_task;
	long best_imp;
	int best_cpu;
};

static void task_numa_assign(struct task_numa_env *env,
			     struct task_struct *p, long imp)
{
	if (env->best_task)
		put_task_struct(env->best_task);
	if (p)
		get_task_struct(p);

	env->best_task = p;
	env->best_imp = imp;
	env->best_cpu = env->dst_cpu;
}

/*
 * This checks if the overall compute and NUMA accesses of the system would
 * be improved if the source task was migrated to the target dst_cpu, taking
 * into account that it might be best to exchange the task running on the
 * dst_cpu with the source task.
 */
static void task_numa_compare(struct task_numa_env *env,
			      long taskimp, long groupimp)
{
	struct rq *src_rq = cpu_rq(env->src_cpu);
	struct rq *dst_rq = cpu_rq(env->dst_cpu);
	struct task_struct *cur;
	long dst_load, src_load;
	long load;
	long imp = (groupimp > 0) ? groupimp : taskimp;

	rcu_read_lock();
	cur = ACCESS_ONCE(dst_rq->curr);
	if (cur->pid == 0) /* idle */
		cur = NULL;

	/*
	 * "imp" is the fault differential for the source task between the
	 * source and destination node. Calculate the total differential for
	 * the source task and potential destination task. The more negative
	 * the value is, the more remote accesses that would be expected to
	 * be incurred if the tasks were swapped.
	 */
	if (cur) {
		/* Skip this swap candidate if cannot move to the source cpu */
		if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
			goto unlock;

		/*
		 * If dst and source tasks are in the same NUMA group, or not
		 * in any group then look only at task weights.
		 */
		if (cur->numa_group == env->p->numa_group) {
			imp = taskimp + task_weight(cur, env->src_nid) -
			      task_weight(cur, env->dst_nid);
			/*
			 * Add some hysteresis to prevent swapping the
			 * tasks within a group over tiny differences.
			 */
			if (cur->numa_group)
				imp -= imp/16;
		} else {
			/*
			 * Compare the group weights. If a task is all by
			 * itself (not part of a group), use the task weight
			 * instead.
			 */
			if (env->p->numa_group)
				imp = groupimp;
			else
				imp = taskimp;

			if (cur->numa_group)
				imp += group_weight(cur, env->src_nid) -
				       group_weight(cur, env->dst_nid);
			else
				imp += task_weight(cur, env->src_nid) -
				       task_weight(cur, env->dst_nid);
		}
	}

	if (imp < env->best_imp)
		goto unlock;

	if (!cur) {
		/* Is there capacity at our destination? */
		if (env->src_stats.has_capacity &&
		    !env->dst_stats.has_capacity)
			goto unlock;

		goto balance;
	}

	/* Balance doesn't matter much if we're running a task per cpu */
	if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
		goto assign;

	/*
	 * In the overloaded case, try and keep the load balanced.
	 */
balance:
	dst_load = env->dst_stats.load;
	src_load = env->src_stats.load;

	/* XXX missing power terms */
	load = task_h_load(env->p);
	dst_load += load;
	src_load -= load;

	if (cur) {
		load = task_h_load(cur);
		dst_load -= load;
		src_load += load;
	}

	/* make src_load the smaller */
	if (dst_load < src_load)
		swap(dst_load, src_load);

	if (src_load * env->imbalance_pct < dst_load * 100)
		goto unlock;

assign:
	task_numa_assign(env, cur, imp);
unlock:
	rcu_read_unlock();
}
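/*
 * Illustrative example for the balance check above, assuming the default
 * imbalance_pct of 112 set up in task_numa_migrate(): after accounting
 * for the proposed move or swap, the candidate is rejected when the
 * quieter node's load times 112 is still below the busier node's load
 * times 100, i.e. when the residual imbalance would exceed roughly 12%.
 */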

static void task_numa_find_cpu(struct task_numa_env *env,
				long taskimp, long groupimp)
{
	int cpu;

	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
		/* Skip this CPU if the source task cannot migrate */
		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
			continue;

		env->dst_cpu = cpu;
		task_numa_compare(env, taskimp, groupimp);
	}
}

static int task_numa_migrate(struct task_struct *p)
{
	struct task_numa_env env = {
		.p = p,

		.src_cpu = task_cpu(p),
		.src_nid = task_node(p),

		.imbalance_pct = 112,

		.best_task = NULL,
		.best_imp = 0,
		.best_cpu = -1
	};
	struct sched_domain *sd;
	unsigned long taskweight, groupweight;
	int nid, ret;
	long taskimp, groupimp;

	/*
	 * Pick the lowest SD_NUMA domain, as that would have the smallest
	 * imbalance and would be the first to start moving tasks about.
	 *
	 * And we want to avoid any moving of tasks about, as that would create
	 * random movement of tasks -- counter the numa conditions we're trying
	 * to satisfy here.
	 */
	rcu_read_lock();
	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
	if (sd)
		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
	rcu_read_unlock();

	/*
	 * Cpusets can break the scheduler domain tree into smaller
	 * balance domains, some of which do not cross NUMA boundaries.
	 * Tasks that are "trapped" in such domains cannot be migrated
	 * elsewhere, so there is no point in (re)trying.
	 */
	if (unlikely(!sd)) {
		p->numa_preferred_nid = cpu_to_node(task_cpu(p));
		return -EINVAL;
	}

	taskweight = task_weight(p, env.src_nid);
	groupweight = group_weight(p, env.src_nid);
	update_numa_stats(&env.src_stats, env.src_nid);
	env.dst_nid = p->numa_preferred_nid;
	taskimp = task_weight(p, env.dst_nid) - taskweight;
	groupimp = group_weight(p, env.dst_nid) - groupweight;
	update_numa_stats(&env.dst_stats, env.dst_nid);

	/* If the preferred nid has capacity, try to use it. */
	if (env.dst_stats.has_capacity)
		task_numa_find_cpu(&env, taskimp, groupimp);

	/* No space available on the preferred nid. Look elsewhere. */
	if (env.best_cpu == -1) {
		for_each_online_node(nid) {
			if (nid == env.src_nid || nid == p->numa_preferred_nid)
				continue;

			/* Only consider nodes where both task and groups benefit */
			taskimp = task_weight(p, nid) - taskweight;
			groupimp = group_weight(p, nid) - groupweight;
			if (taskimp < 0 && groupimp < 0)
				continue;

			env.dst_nid = nid;
			update_numa_stats(&env.dst_stats, env.dst_nid);
			task_numa_find_cpu(&env, taskimp, groupimp);
		}
	}

	/* No better CPU than the current one was found. */
	if (env.best_cpu == -1)
		return -EAGAIN;

	sched_setnuma(p, env.dst_nid);

	/*
	 * Reset the scan period if the task is being rescheduled on an
	 * alternative node to recheck if the tasks is now properly placed.
	 */
	p->numa_scan_period = task_scan_min(p);

	if (env.best_task == NULL) {
		int ret = migrate_task_to(p, env.best_cpu);
		return ret;
	}

	ret = migrate_swap(p, env.best_task);
	put_task_struct(env.best_task);
	return ret;
}

/* Attempt to migrate a task to a CPU on the preferred node. */
static void numa_migrate_preferred(struct task_struct *p)
{
	/* This task has no NUMA fault statistics yet */
	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
		return;

	/* Periodically retry migrating the task to the preferred node */
	p->numa_migrate_retry = jiffies + HZ;

	/* Success if task is already running on preferred CPU */
	if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid)
		return;

	/* Otherwise, try migrate to a CPU on the preferred node */
	task_numa_migrate(p);
}

/*
 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
 * increments. The more local the fault statistics are, the higher the scan
 * period will be for the next scan window. If local/remote ratio is below
 * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
 * scan period will decrease
 */
#define NUMA_PERIOD_SLOTS 10
#define NUMA_PERIOD_THRESHOLD 3

/*
 * Increase the scan period (slow down scanning) if the majority of
 * our memory is already on our local node, or if the majority of
 * the page accesses are shared with other processes.
 * Otherwise, decrease the scan period.
 */
static void update_task_scan_period(struct task_struct *p,
			unsigned long shared, unsigned long private)
{
	unsigned int period_slot;
	int ratio;
	int diff;

	unsigned long remote = p->numa_faults_locality[0];
	unsigned long local = p->numa_faults_locality[1];

	/*
	 * If there were no record hinting faults then either the task is
	 * completely idle or all activity is in areas that are not of interest
	 * to automatic numa balancing. Scan slower
	 */
	if (local + shared == 0) {
		p->numa_scan_period = min(p->numa_scan_period_max,
			p->numa_scan_period << 1);

		p->mm->numa_next_scan = jiffies +
			msecs_to_jiffies(p->numa_scan_period);

		return;
	}

	/*
	 * Prepare to scale scan period relative to the current period.
	 *	 == NUMA_PERIOD_THRESHOLD scan period stays the same
	 *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
	 */
	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
	if (ratio >= NUMA_PERIOD_THRESHOLD) {
		int slot = ratio - NUMA_PERIOD_THRESHOLD;
		if (!slot)
			slot = 1;
		diff = slot * period_slot;
	} else {
		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;

		/*
		 * Scale scan rate increases based on sharing. There is an
		 * inverse relationship between the degree of sharing and
		 * the adjustment made to the scanning period. Broadly
		 * speaking the intent is that there is little point
		 * scanning faster if shared accesses dominate as it may
		 * simply bounce migrations uselessly
		 */
		period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
		ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
		diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
	}

	p->numa_scan_period = clamp(p->numa_scan_period + diff,
			task_scan_min(p), task_scan_max(p));
	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
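/*
 * Illustrative example, assuming a current scan period of 1000ms (so
 * period_slot == 100ms): if 90% of the recorded hinting faults were local
 * (ratio 9 of NUMA_PERIOD_SLOTS), the period grows by (9 - 3) * 100ms to
 * 1600ms (scan slower); if only 10% were local, it shrinks by up to
 * (3 - 1) * 100ms, with the decrease damped further when most faults are
 * shared rather than private.
 */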

static void task_numa_placement(struct task_struct *p)
{
	int seq, nid, max_nid = -1, max_group_nid = -1;
	unsigned long max_faults = 0, max_group_faults = 0;
	unsigned long fault_types[2] = { 0, 0 };
	spinlock_t *group_lock = NULL;

	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
	if (p->numa_scan_seq == seq)
		return;
	p->numa_scan_seq = seq;
	p->numa_scan_period_max = task_scan_max(p);

	/* If the task is part of a group prevent parallel updates to group stats */
	if (p->numa_group) {
		group_lock = &p->numa_group