/*
 * kernel/workqueue.c - generic async execution with shared worker pool
 *
 * Copyright (C) 2002		Ingo Molnar
 *
 *   Derived from the taskqueue/keventd code by:
 *     David Woodhouse <dwmw2@infradead.org>
 *     Andrew Morton
 *     Kai Petzke <wpp@marie.physik.tu-berlin.de>
 *     Theodore Ts'o <tytso@mit.edu>
 *
 * Made to use alloc_percpu by Christoph Lameter.
 *
 * Copyright (C) 2010		SUSE Linux Products GmbH
 * Copyright (C) 2010		Tejun Heo <tj@kernel.org>
 *
 * This is the generic async execution mechanism.  Work items are
 * executed in process context.  The worker pool is shared and
 * automatically managed.  There is one worker pool for each CPU and
 * one extra for works which are better served by workers which are
 * not bound to any specific CPU.
 *
 * Please read Documentation/workqueue.txt for details.
 */

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
#include <linux/hardirq.h>
#include <linux/mempolicy.h>
#include <linux/freezer.h>
#include <linux/kallsyms.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
#include <linux/idr.h>

#include "workqueue_sched.h"

enum {
	/*
	 * global_cwq flags
	 *
	 * A bound gcwq is either associated or disassociated with its CPU.
	 * While associated (!DISASSOCIATED), all workers are bound to the
	 * CPU and none has %WORKER_UNBOUND set and concurrency management
	 * is in effect.
	 *
	 * While DISASSOCIATED, the cpu may be offline and all workers have
	 * %WORKER_UNBOUND set and concurrency management disabled, and may
	 * be executing on any CPU.  The gcwq behaves as an unbound one.
	 *
	 * Note that DISASSOCIATED can be flipped only while holding
	 * managership of all pools on the gcwq to avoid changing binding
	 * state while create_worker() is in progress.
	 */
	GCWQ_DISASSOCIATED	= 1 << 0,	/* cpu can't serve workers */
	GCWQ_FREEZING		= 1 << 1,	/* freeze in progress */

	/* pool flags */
	POOL_MANAGE_WORKERS	= 1 << 0,	/* need to manage workers */

	/* worker flags */
	WORKER_STARTED		= 1 << 0,	/* started */
	WORKER_DIE		= 1 << 1,	/* die die die */
	WORKER_IDLE		= 1 << 2,	/* is idle */
	WORKER_PREP		= 1 << 3,	/* preparing to run works */
	WORKER_REBIND		= 1 << 5,	/* mom is home, come back */
	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */
	WORKER_UNBOUND		= 1 << 7,	/* worker is unbound */

	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND |
				  WORKER_CPU_INTENSIVE,

	NR_WORKER_POOLS		= 2,		/* # worker pools per gcwq */

	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */
	BUSY_WORKER_HASH_SIZE	= 1 << BUSY_WORKER_HASH_ORDER,
	BUSY_WORKER_HASH_MASK	= BUSY_WORKER_HASH_SIZE - 1,

	MAX_IDLE_WORKERS_RATIO	= 4,		/* 1/4 of busy can be idle */
	IDLE_WORKER_TIMEOUT	= 300 * HZ,	/* keep idle ones for 5 mins */

	MAYDAY_INITIAL_TIMEOUT  = HZ / 100 >= 2 ? HZ / 100 : 2,
						/* call for help after 10ms
						   (min two ticks) */
	MAYDAY_INTERVAL		= HZ / 10,	/* and then every 100ms */
	CREATE_COOLDOWN		= HZ,		/* time to breathe after fail */

	/*
	 * Rescue workers are used only on emergencies and shared by
	 * all cpus.  Give -20.
	 */
	RESCUER_NICE_LEVEL	= -20,
	HIGHPRI_NICE_LEVEL	= -20,
};

/*
 * Structure fields follow one of the following exclusion rules.
 *
 * I: Modifiable by initialization/destruction paths and read-only for
 *    everyone else.
 *
 * P: Preemption protected.  Disabling preemption is enough and should
 *    only be modified and accessed from the local cpu.
 *
 * L: gcwq->lock protected.  Access with gcwq->lock held.
 *
 * X: During normal operation, modification requires gcwq->lock and
 *    should be done only from local cpu.  Either disabling preemption
 *    on local cpu or grabbing gcwq->lock is enough for read access.
 *    If GCWQ_DISASSOCIATED is set, it's identical to L.
 *
 * F: wq->flush_mutex protected.
 *
 * W: workqueue_lock protected.
 */

struct global_cwq;
struct worker_pool;
struct idle_rebind;

/*
 * The poor guys doing the actual heavy lifting.  All on-duty workers
 * are either serving the manager role, on idle list or on busy hash.
 */
struct worker {
	/* on idle list while idle, on busy hash table while busy */
	union {
		struct list_head	entry;	/* L: while idle */
		struct hlist_node	hentry;	/* L: while busy */
	};

	struct work_struct	*current_work;	/* L: work being processed */
	struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
	struct list_head	scheduled;	/* L: scheduled works */
	struct task_struct	*task;		/* I: worker task */
	struct worker_pool	*pool;		/* I: the associated pool */
	/* 64 bytes boundary on 64bit, 32 on 32bit */
	unsigned long		last_active;	/* L: last active timestamp */
	unsigned int		flags;		/* X: flags */
	int			id;		/* I: worker id */

	/* for rebinding worker to CPU */
	struct idle_rebind	*idle_rebind;	/* L: for idle worker */
	struct work_struct	rebind_work;	/* L: for busy worker */
};

struct worker_pool {
	struct global_cwq	*gcwq;		/* I: the owning gcwq */
	unsigned int		flags;		/* X: flags */

	struct list_head	worklist;	/* L: list of pending works */
	int			nr_workers;	/* L: total number of workers */
	int			nr_idle;	/* L: currently idle ones */

	struct list_head	idle_list;	/* X: list of idle workers */
	struct timer_list	idle_timer;	/* L: worker idle timeout */
	struct timer_list	mayday_timer;	/* L: SOS timer for workers */

	struct mutex		manager_mutex;	/* mutex manager should hold */
	struct ida		worker_ida;	/* L: for worker IDs */
};

/*
 * Global per-cpu workqueue.  There's one and only one for each cpu
 * and all works are queued and processed here regardless of their
 * target workqueues.
 */
struct global_cwq {
	spinlock_t		lock;		/* the gcwq lock */
	unsigned int		cpu;		/* I: the associated cpu */
	unsigned int		flags;		/* L: GCWQ_* flags */

	/* workers are chained either in busy_hash or pool idle_list */
	struct hlist_head	busy_hash[BUSY_WORKER_HASH_SIZE];
						/* L: hash of busy workers */

	struct worker_pool	pools[2];	/* normal and highpri pools */

	wait_queue_head_t	rebind_hold;	/* rebind hold wait */
} ____cacheline_aligned_in_smp;

/*
 * The per-CPU workqueue.  The lower WORK_STRUCT_FLAG_BITS of
 * work_struct->data are used for flags and thus cwqs need to be
 * aligned at two's power of the number of flag bits.
 */
struct cpu_workqueue_struct {
	struct worker_pool	*pool;		/* I: the associated pool */
	struct workqueue_struct *wq;		/* I: the owning workqueue */
	int			work_color;	/* L: current color */
	int			flush_color;	/* L: flushing color */
	int			nr_in_flight[WORK_NR_COLORS];
						/* L: nr of in_flight works */
	int			nr_active;	/* L: nr of active works */
	int			max_active;	/* L: max active works */
	struct list_head	delayed_works;	/* L: delayed works */
};

/*
 * Structure used to wait for workqueue flush.
 */
struct wq_flusher {
	struct list_head	list;		/* F: list of flushers */
	int			flush_color;	/* F: flush color waiting for */
	struct completion	done;		/* flush completion */
};

/*
 * All cpumasks are assumed to be always set on UP and thus can't be
 * used to determine whether there's something to be done.
 */
#ifdef CONFIG_SMP
typedef cpumask_var_t mayday_mask_t;
#define mayday_test_and_set_cpu(cpu, mask)	\
	cpumask_test_and_set_cpu((cpu), (mask))
#define mayday_clear_cpu(cpu, mask)		cpumask_clear_cpu((cpu), (mask))
#define for_each_mayday_cpu(cpu, mask)		for_each_cpu((cpu), (mask))
#define alloc_mayday_mask(maskp, gfp)		zalloc_cpumask_var((maskp), (gfp))
#define free_mayday_mask(mask)			free_cpumask_var((mask))
#else
typedef unsigned long mayday_mask_t;
#define mayday_test_and_set_cpu(cpu, mask)	test_and_set_bit(0, &(mask))
#define mayday_clear_cpu(cpu, mask)		clear_bit(0, &(mask))
#define for_each_mayday_cpu(cpu, mask)		if ((cpu) = 0, (mask))
#define alloc_mayday_mask(maskp, gfp)		true
#define free_mayday_mask(mask)			do { } while (0)
#endif

/*
 * The externally visible workqueue abstraction is an array of
 * per-CPU workqueues:
 */
struct workqueue_struct {
	unsigned int		flags;		/* W: WQ_* flags */
	union {
		struct cpu_workqueue_struct __percpu	*pcpu;
		struct cpu_workqueue_struct		*single;
		unsigned long				v;
	} cpu_wq;				/* I: cwq's */
	struct list_head	list;		/* W: list of all workqueues */

	struct mutex		flush_mutex;	/* protects wq flushing */
	int			work_color;	/* F: current work color */
	int			flush_color;	/* F: current flush color */
	atomic_t		nr_cwqs_to_flush; /* flush in progress */
	struct wq_flusher	*first_flusher;	/* F: first flusher */
	struct list_head	flusher_queue;	/* F: flush waiters */
	struct list_head	flusher_overflow; /* F: flush overflow list */

	mayday_mask_t		mayday_mask;	/* cpus requesting rescue */
	struct worker		*rescuer;	/* I: rescue worker */

	int			nr_drainers;	/* W: drain in progress */
	int			saved_max_active; /* W: saved cwq max_active */
#ifdef CONFIG_LOCKDEP
	struct lockdep_map	lockdep_map;
#endif
	char			name[];		/* I: workqueue name */
};

struct workqueue_struct *system_wq __read_mostly;
struct workqueue_struct *system_long_wq __read_mostly;
struct workqueue_struct *system_nrt_wq __read_mostly;
struct workqueue_struct *system_unbound_wq __read_mostly;
struct workqueue_struct *system_freezable_wq __read_mostly;
struct workqueue_struct *system_nrt_freezable_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_wq);
EXPORT_SYMBOL_GPL(system_long_wq);
EXPORT_SYMBOL_GPL(system_nrt_wq);
EXPORT_SYMBOL_GPL(system_unbound_wq);
EXPORT_SYMBOL_GPL(system_freezable_wq);
EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);

#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>

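/*
 * for_each_worker_pool() iterates over both pools (normal and highpri) of
 * a gcwq; for_each_busy_worker() iterates over every worker currently
 * hashed into gcwq->busy_hash[].
 */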
#define for_each_worker_pool(pool, gcwq)				\
	for ((pool) = &(gcwq)->pools[0];				\
	     (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++)

#define for_each_busy_worker(worker, i, pos, gcwq)			\
	for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)			\
		hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)

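/*
 * __next_gcwq_cpu() returns the next CPU to visit for the gcwq/cwq
 * iterators below.  @sw selects the sweep: bit 0 walks the CPUs in @mask,
 * bit 1 appends WORK_CPU_UNBOUND once @mask is exhausted; WORK_CPU_NONE
 * terminates the iteration.  __next_wq_cpu() picks the sweep for a
 * workqueue - bound workqueues walk @mask, unbound ones only visit
 * WORK_CPU_UNBOUND.
 */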
static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
				  unsigned int sw)
{
	if (cpu < nr_cpu_ids) {
		if (sw & 1) {
			cpu = cpumask_next(cpu, mask);
			if (cpu < nr_cpu_ids)
				return cpu;
		}
		if (sw & 2)
			return WORK_CPU_UNBOUND;
	}
	return WORK_CPU_NONE;
}

static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
				struct workqueue_struct *wq)
{
	return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
}

/*
 * CPU iterators
 *
 * An extra gcwq is defined for an invalid cpu number
 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
 * specific CPU.  The following iterators are similar to
 * for_each_*_cpu() iterators but also considers the unbound gcwq.
 *
 * for_each_gcwq_cpu()		: possible CPUs + WORK_CPU_UNBOUND
 * for_each_online_gcwq_cpu()	: online CPUs + WORK_CPU_UNBOUND
 * for_each_cwq_cpu()		: possible CPUs for bound workqueues,
 *				  WORK_CPU_UNBOUND for unbound workqueues
 */
#define for_each_gcwq_cpu(cpu)						\
	for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3);		\
	     (cpu) < WORK_CPU_NONE;					\
	     (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3))

#define for_each_online_gcwq_cpu(cpu)					\
	for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3);		\
	     (cpu) < WORK_CPU_NONE;					\
	     (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3))

#define for_each_cwq_cpu(cpu, wq)					\
	for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq));	\
	     (cpu) < WORK_CPU_NONE;					\
	     (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))

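/*
 * Illustrative only: a typical user of these iterators (see
 * is_chained_work() below) walks every gcwq, including the unbound one,
 * with
 *
 *	for_each_gcwq_cpu(cpu) {
 *		struct global_cwq *gcwq = get_gcwq(cpu);
 *		...
 *	}
 */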
#ifdef CONFIG_DEBUG_OBJECTS_WORK

static struct debug_obj_descr work_debug_descr;

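/* debug_hint callback for debugobjects: report the work function */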
static void *work_debug_hint(void *addr)
{
	return ((struct work_struct *) addr)->func;
}

/*
 * fixup_init is called when:
 * - an active object is initialized
 */
static int work_fixup_init(void *addr, enum debug_obj_state state)
{
	struct work_struct *work = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		cancel_work_sync(work);
		debug_object_init(work, &work_debug_descr);
		return 1;
	default:
		return 0;
	}
}

/*
 * fixup_activate is called when:
 * - an active object is activated
 * - an unknown object is activated (might be a statically initialized object)
 */
static int work_fixup_activate(void *addr, enum debug_obj_state state)
{
	struct work_struct *work = addr;

	switch (state) {

	case ODEBUG_STATE_NOTAVAILABLE:
		/*
		 * This is not really a fixup. The work struct was
		 * statically initialized. We just make sure that it
		 * is tracked in the object tracker.
		 */
		if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
			debug_object_init(work, &work_debug_descr);
			debug_object_activate(work, &work_debug_descr);
			return 0;
		}
		WARN_ON_ONCE(1);
		return 0;

	case ODEBUG_STATE_ACTIVE:
		WARN_ON(1);

	default:
		return 0;
	}
}

/*
 * fixup_free is called when:
 * - an active object is freed
 */
static int work_fixup_free(void *addr, enum debug_obj_state state)
{
	struct work_struct *work = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		cancel_work_sync(work);
		debug_object_free(work, &work_debug_descr);
		return 1;
	default:
		return 0;
	}
}

static struct debug_obj_descr work_debug_descr = {
	.name		= "work_struct",
	.debug_hint	= work_debug_hint,
	.fixup_init	= work_fixup_init,
	.fixup_activate	= work_fixup_activate,
	.fixup_free	= work_fixup_free,
};

static inline void debug_work_activate(struct work_struct *work)
{
	debug_object_activate(work, &work_debug_descr);
}

static inline void debug_work_deactivate(struct work_struct *work)
{
	debug_object_deactivate(work, &work_debug_descr);
}

void __init_work(struct work_struct *work, int onstack)
{
	if (onstack)
		debug_object_init_on_stack(work, &work_debug_descr);
	else
		debug_object_init(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(__init_work);

void destroy_work_on_stack(struct work_struct *work)
{
	debug_object_free(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_work_on_stack);

#else
static inline void debug_work_activate(struct work_struct *work) { }
static inline void debug_work_deactivate(struct work_struct *work) { }
#endif

/* Serializes the accesses to the list of workqueues. */
static DEFINE_SPINLOCK(workqueue_lock);
static LIST_HEAD(workqueues);
static bool workqueue_freezing;		/* W: have wqs started freezing? */

/*
 * The almighty global cpu workqueues.  nr_running is the only field
 * which is expected to be used frequently by other cpus via
 * try_to_wake_up().  Put it in a separate cacheline.
 */
static DEFINE_PER_CPU(struct global_cwq, global_cwq);
static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]);

/*
 * Global cpu workqueue and nr_running counter for unbound gcwq.  The
 * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its
 * workers have WORKER_UNBOUND set.
 */
static struct global_cwq unbound_global_cwq;
static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = {
	[0 ... NR_WORKER_POOLS - 1]	= ATOMIC_INIT(0),	/* always 0 */
};

static int worker_thread(void *__worker);

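/* return the pool's index in its gcwq's pools[] - 0 normal, 1 highpri */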
static int worker_pool_pri(struct worker_pool *pool)
{
	return pool - pool->gcwq->pools;
}

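/* return the gcwq for @cpu, or the unbound gcwq for WORK_CPU_UNBOUND */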
static struct global_cwq *get_gcwq(unsigned int cpu)
{
	if (cpu != WORK_CPU_UNBOUND)
		return &per_cpu(global_cwq, cpu);
	else
		return &unbound_global_cwq;
}

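/*
 * Return the nr_running counter for @pool: the per-cpu counter for bound
 * pools, the always-zero unbound_pool_nr_running slot for unbound ones.
 */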
static atomic_t *get_pool_nr_running(struct worker_pool *pool)
{
	int cpu = pool->gcwq->cpu;
	int idx = worker_pool_pri(pool);

	if (cpu != WORK_CPU_UNBOUND)
		return &per_cpu(pool_nr_running, cpu)[idx];
	else
		return &unbound_pool_nr_running[idx];
}

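/*
 * Return the cwq of @wq for @cpu: the per-cpu cwq for bound workqueues,
 * the single cwq (only for WORK_CPU_UNBOUND) for unbound ones.
 */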
static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
					    struct workqueue_struct *wq)
{
	if (!(wq->flags & WQ_UNBOUND)) {
		if (likely(cpu < nr_cpu_ids))
			return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
	} else if (likely(cpu == WORK_CPU_UNBOUND))
		return wq->cpu_wq.single;
	return NULL;
}

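/*
 * Flush colors are carried in work_struct->data: work_color_to_flags()
 * packs a color into the flag bits, get_work_color() extracts it and
 * work_next_color() advances it modulo WORK_NR_COLORS.
 */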
static unsigned int work_color_to_flags(int color)
{
	return color << WORK_STRUCT_COLOR_SHIFT;
}

static int get_work_color(struct work_struct *work)
{
	return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
		((1 << WORK_STRUCT_COLOR_BITS) - 1);
}

static int work_next_color(int color)
{
	return (color + 1) % WORK_NR_COLORS;
}

/*
 * A work's data points to the cwq with WORK_STRUCT_CWQ set while the
 * work is on queue.  Once execution starts, WORK_STRUCT_CWQ is
 * cleared and the work data contains the cpu number it was last on.
 *
 * set_work_cwq(), set_work_cpu_and_clear_pending() and clear_work_data()
 * can be used to set the cwq, cpu or clear work->data.  These functions
 * should only be called while the work is owned - ie. while the PENDING
 * bit is set.
 *
 * get_work_[g]cwq() can be used to obtain the gcwq or cwq
 * corresponding to a work.  gcwq is available once the work has been
 * queued anywhere after initialization.  cwq is available only from
 * queueing until execution starts.
 */
static inline void set_work_data(struct work_struct *work, unsigned long data,
				 unsigned long flags)
{
	BUG_ON(!work_pending(work));
	atomic_long_set(&work->data, data | flags | work_static(work));
}

static void set_work_cwq(struct work_struct *work,
			 struct cpu_workqueue_struct *cwq,
			 unsigned long extra_flags)
{
	set_work_data(work, (unsigned long)cwq,
		      WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
}

static void set_work_cpu_and_clear_pending(struct work_struct *work,
					   unsigned int cpu)
{
	set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, 0);
}

static void clear_work_data(struct work_struct *work)
{
	set_work_data(work, WORK_STRUCT_NO_CPU, 0);
}

static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);

	if (data & WORK_STRUCT_CWQ)
		return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
	else
		return NULL;
}

static struct global_cwq *get_work_gcwq(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);
	unsigned int cpu;

	if (data & WORK_STRUCT_CWQ)
		return ((struct cpu_workqueue_struct *)
			(data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq;

	cpu = data >> WORK_STRUCT_FLAG_BITS;
	if (cpu == WORK_CPU_NONE)
		return NULL;

	BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND);
	return get_gcwq(cpu);
}

/*
 * Policy functions.  These define the policies on how the global worker
 * pools are managed.  Unless noted otherwise, these functions assume that
 * they're being called with gcwq->lock held.
 */

static bool __need_more_worker(struct worker_pool *pool)
{
	return !atomic_read(get_pool_nr_running(pool));
}

/*
 * Need to wake up a worker?  Called from anything but currently
 * running workers.
 *
 * Note that, because unbound workers never contribute to nr_running, this
 * function will always return %true for unbound gcwq as long as the
 * worklist isn't empty.
 */
static bool need_more_worker(struct worker_pool *pool)
{
	return !list_empty(&pool->worklist) && __need_more_worker(pool);
}

/* Can I start working?  Called from busy but !running workers. */
static bool may_start_working(struct worker_pool *pool)
{
	return pool->nr_idle;
}

/* Do I need to keep working?  Called from currently running workers. */
static bool keep_working(struct worker_pool *pool)
{
	atomic_t *nr_running = get_pool_nr_running(pool);

	return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1;
}

/* Do we need a new worker?  Called from manager. */
static bool need_to_create_worker(struct worker_pool *pool)
{
	return need_more_worker(pool) && !may_start_working(pool);
}

/* Do I need to be the manager? */
static bool need_to_manage_workers(struct worker_pool *pool)
{
	return need_to_create_worker(pool) ||
		(pool->flags & POOL_MANAGE_WORKERS);
}

/* Do we have too many workers and should some go away? */
static bool too_many_workers(struct worker_pool *pool)
{
	bool managing = mutex_is_locked(&pool->manager_mutex);
	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
	int nr_busy = pool->nr_workers - nr_idle;

	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}

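/*
 * Worked example (illustrative): with MAX_IDLE_WORKERS_RATIO == 4, a pool
 * with 6 idle workers is considered over-provisioned once no more than
 * (6 - 2) * 4 == 16 workers are busy.
 */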
/*
 * Wake up functions.
 */

/* Return the first worker.  Safe with preemption disabled */
static struct worker *first_worker(struct worker_pool *pool)
{
	if (unlikely(list_empty(&pool->idle_list)))
		return NULL;

	return list_first_entry(&pool->idle_list, struct worker, entry);
}

/**
 * wake_up_worker - wake up an idle worker
 * @pool: worker pool to wake worker from
 *
 * Wake up the first idle worker of @pool.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock).
 */
static void wake_up_worker(struct worker_pool *pool)
{
	struct worker *worker = first_worker(pool);

	if (likely(worker))
		wake_up_process(worker->task);
}

/**
 * wq_worker_waking_up - a worker is waking up
 * @task: task waking up
 * @cpu: CPU @task is waking up to
 *
 * This function is called during try_to_wake_up() when a worker is
 * being awoken.
 *
 * CONTEXT:
 * spin_lock_irq(rq->lock)
 */
void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
{
	struct worker *worker = kthread_data(task);

	if (!(worker->flags & WORKER_NOT_RUNNING))
		atomic_inc(get_pool_nr_running(worker->pool));
}

/**
 * wq_worker_sleeping - a worker is going to sleep
 * @task: task going to sleep
 * @cpu: CPU in question, must be the current CPU number
 *
 * This function is called during schedule() when a busy worker is
 * going to sleep.  Worker on the same cpu can be woken up by
 * returning pointer to its task.
 *
 * CONTEXT:
 * spin_lock_irq(rq->lock)
 *
 * RETURNS:
 * Worker task on @cpu to wake up, %NULL if none.
 */
struct task_struct *wq_worker_sleeping(struct task_struct *task,
				       unsigned int cpu)
{
	struct worker *worker = kthread_data(task), *to_wakeup = NULL;
	struct worker_pool *pool = worker->pool;
	atomic_t *nr_running = get_pool_nr_running(pool);

	if (worker->flags & WORKER_NOT_RUNNING)
		return NULL;

	/* this can only happen on the local cpu */
	BUG_ON(cpu != raw_smp_processor_id());

	/*
	 * The counterpart of the following dec_and_test, implied mb,
	 * worklist not empty test sequence is in insert_work().
	 * Please read comment there.
	 *
	 * NOT_RUNNING is clear.  This means that we're bound to and
	 * running on the local cpu w/ rq lock held and preemption
	 * disabled, which in turn means that none else could be
	 * manipulating idle_list, so dereferencing idle_list without gcwq
	 * lock is safe.
	 */
	if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist))
		to_wakeup = first_worker(pool);
	return to_wakeup ? to_wakeup->task : NULL;
}

/**
 * worker_set_flags - set worker flags and adjust nr_running accordingly
 * @worker: self
 * @flags: flags to set
 * @wakeup: wakeup an idle worker if necessary
 *
 * Set @flags in @worker->flags and adjust nr_running accordingly.  If
 * nr_running becomes zero and @wakeup is %true, an idle worker is
 * woken up.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock)
 */
static inline void worker_set_flags(struct worker *worker, unsigned int flags,
				    bool wakeup)
{
	struct worker_pool *pool = worker->pool;

	WARN_ON_ONCE(worker->task != current);

	/*
	 * If transitioning into NOT_RUNNING, adjust nr_running and
	 * wake up an idle worker as necessary if requested by
	 * @wakeup.
	 */
	if ((flags & WORKER_NOT_RUNNING) &&
	    !(worker->flags & WORKER_NOT_RUNNING)) {
		atomic_t *nr_running = get_pool_nr_running(pool);

		if (wakeup) {
			if (atomic_dec_and_test(nr_running) &&
			    !list_empty(&pool->worklist))
				wake_up_worker(pool);
		} else
			atomic_dec(nr_running);
	}

	worker->flags |= flags;
}

/**
 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
 * @worker: self
 * @flags: flags to clear
 *
 * Clear @flags in @worker->flags and adjust nr_running accordingly.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock)
 */
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
	struct worker_pool *pool = worker->pool;
	unsigned int oflags = worker->flags;

	WARN_ON_ONCE(worker->task != current);

	worker->flags &= ~flags;

	/*
	 * If transitioning out of NOT_RUNNING, increment nr_running.  Note
	 * that the nested NOT_RUNNING is not a noop.  NOT_RUNNING is mask
	 * of multiple flags, not a single flag.
	 */
	if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
		if (!(worker->flags & WORKER_NOT_RUNNING))
			atomic_inc(get_pool_nr_running(pool));
}

/**
 * busy_worker_head - return the busy hash head for a work
 * @gcwq: gcwq of interest
 * @work: work to be hashed
 *
 * Return hash head of @gcwq for @work.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock).
 *
 * RETURNS:
 * Pointer to the hash head.
 */
static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
					   struct work_struct *work)
{
	const int base_shift = ilog2(sizeof(struct work_struct));
	unsigned long v = (unsigned long)work;

	/* simple shift and fold hash, do we need something better? */
	v >>= base_shift;
	v += v >> BUSY_WORKER_HASH_ORDER;
	v &= BUSY_WORKER_HASH_MASK;

	return &gcwq->busy_hash[v];
}

/**
 * __find_worker_executing_work - find worker which is executing a work
 * @gcwq: gcwq of interest
 * @bwh: hash head as returned by busy_worker_head()
 * @work: work to find worker for
 *
 * Find a worker which is executing @work on @gcwq.  @bwh should be
 * the hash head obtained by calling busy_worker_head() with the same
 * work.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock).
 *
 * RETURNS:
 * Pointer to worker which is executing @work if found, NULL
 * otherwise.
 */
static struct worker *__find_worker_executing_work(struct global_cwq *gcwq,
						   struct hlist_head *bwh,
						   struct work_struct *work)
{
	struct worker *worker;
	struct hlist_node *tmp;

	hlist_for_each_entry(worker, tmp, bwh, hentry)
		if (worker->current_work == work)
			return worker;
	return NULL;
}

/**
 * find_worker_executing_work - find worker which is executing a work
 * @gcwq: gcwq of interest
 * @work: work to find worker for
 *
 * Find a worker which is executing @work on @gcwq.  This function is
 * identical to __find_worker_executing_work() except that this
 * function calculates @bwh itself.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock).
 *
 * RETURNS:
 * Pointer to worker which is executing @work if found, NULL
 * otherwise.
 */
static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
						 struct work_struct *work)
{
	return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work),
					    work);
}

/**
 * insert_work - insert a work into gcwq
 * @cwq: cwq @work belongs to
 * @work: work to insert
 * @head: insertion point
 * @extra_flags: extra WORK_STRUCT_* flags to set
 *
 * Insert @work which belongs to @cwq into @gcwq after @head.
 * @extra_flags is or'd to work_struct flags.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock).
 */
static void insert_work(struct cpu_workqueue_struct *cwq,
			struct work_struct *work, struct list_head *head,
			unsigned int extra_flags)
{
	struct worker_pool *pool = cwq->pool;

	/* we own @work, set data and link */
	set_work_cwq(work, cwq, extra_flags);

	/*
	 * Ensure that we get the right work->data if we see the
	 * result of list_add() below, see try_to_grab_pending().
	 */
	smp_wmb();

	list_add_tail(&work->entry, head);

	/*
	 * Ensure either worker_sched_deactivated() sees the above
	 * list_add_tail() or we see zero nr_running to avoid workers
	 * lying around lazily while there are works to be processed.
	 */
	smp_mb();

	if (__need_more_worker(pool))
		wake_up_worker(pool);
}

/*
 * Test whether @work is being queued from another work executing on the
 * same workqueue.  This is rather expensive and should only be used from
 * cold paths.
 */
static bool is_chained_work(struct workqueue_struct *wq)
{
	unsigned long flags;
	unsigned int cpu;

	for_each_gcwq_cpu(cpu) {
		struct global_cwq *gcwq = get_gcwq(cpu);
		struct worker *worker;
		struct hlist_node *pos;
		int i;

		spin_lock_irqsave(&gcwq->lock, flags);
		for_each_busy_worker(worker, i, pos, gcwq) {
			if (worker->task != current)
				continue;
			spin_unlock_irqrestore(&gcwq->lock, flags);
			/*
			 * I'm @worker, no locking necessary.  See if @work
			 * is headed to the same workqueue.
			 */
			return worker->current_cwq->wq == wq;
		}
		spin_unlock_irqrestore(&gcwq->lock, flags);
	}
	return false;
}

static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
			 struct work_struct *work)
{
	struct global_cwq *gcwq;
	struct cpu_workqueue_struct *cwq;
	struct list_head *worklist;
	unsigned int work_flags;

	/*
	 * While a work item is PENDING && off queue, a task trying to
	 * steal the PENDING will busy-loop waiting for it to either get
	 * queued or lose PENDING.  Grabbing PENDING and queueing should
	 * happen with IRQ disabled.
	 */
	WARN_ON_ONCE(!irqs_disabled());

	debug_work_activate(work);

	/* if dying, only works from the same workqueue are allowed */
	if (unlikely(wq->flags & WQ_DRAINING) &&
	    WARN_ON_ONCE(!is_chained_work(wq)))
		return;