/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
 * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
 *	- July 2000
 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
 */

/*
 * This handles all read/write requests to block devices
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/fault-inject.h>
#include <linux/list_sort.h>
#include <linux/delay.h>
#include <linux/ratelimit.h>

#define CREATE_TRACE_POINTS
#include <trace/events/block.h>

#include "blk.h"
#include "blk-cgroup.h"

EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);

DEFINE_IDA(blk_queue_ida);

/*
 * For the allocated request tables
 */
static struct kmem_cache *request_cachep;

/*
 * For queue allocation
 */
struct kmem_cache *blk_requestq_cachep;

/*
 * Controlling structure to kblockd
 */
static struct workqueue_struct *kblockd_workqueue;

static void drive_stat_acct(struct request *rq, int new_io)
{
	struct hd_struct *part;
	int rw = rq_data_dir(rq);
	int cpu;

	if (!blk_do_io_stat(rq))
		return;

	cpu = part_stat_lock();

	if (!new_io) {
		part = rq->part;
		part_stat_inc(cpu, part, merges[rw]);
	} else {
		part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
		if (!hd_struct_try_get(part)) {
			/*
			 * The partition is already being removed,
			 * the request will be accounted on the disk only
			 *
			 * We take a reference on disk->part0 although that
			 * partition will never be deleted, so we can treat
			 * it as any other partition.
			 */
			part = &rq->rq_disk->part0;
			hd_struct_get(part);
		}
		part_round_stats(cpu, part);
		part_inc_in_flight(part, rw);
		rq->part = part;
	}

	part_stat_unlock();
}

void blk_queue_congestion_threshold(struct request_queue *q)
{
	int nr;

	nr = q->nr_requests - (q->nr_requests / 8) + 1;
	if (nr > q->nr_requests)
		nr = q->nr_requests;
	q->nr_congestion_on = nr;

	nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
	if (nr < 1)
		nr = 1;
	q->nr_congestion_off = nr;
}
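
/*
 * Worked example (illustrative only, assuming the default nr_requests of
 * 128): nr_congestion_on = 128 - 128/8 + 1 = 113 and nr_congestion_off =
 * 128 - 128/8 - 128/16 - 1 = 103, so the queue is flagged congested once
 * 113 requests are allocated and the flag is cleared again when the
 * allocated count drops below 103.
 */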

/**
 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
 * @bdev:	device
 *
 * Locates the passed device's request queue and returns the address of its
 * backing_dev_info
 *
 * Will return NULL if the request queue cannot be located.
 */
struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
{
	struct backing_dev_info *ret = NULL;
	struct request_queue *q = bdev_get_queue(bdev);

	if (q)
		ret = &q->backing_dev_info;
	return ret;
}
EXPORT_SYMBOL(blk_get_backing_dev_info);

void blk_rq_init(struct request_queue *q, struct request *rq)
{
	memset(rq, 0, sizeof(*rq));

	INIT_LIST_HEAD(&rq->queuelist);
	INIT_LIST_HEAD(&rq->timeout_list);
	rq->cpu = -1;
	rq->q = q;
	rq->__sector = (sector_t) -1;
	INIT_HLIST_NODE(&rq->hash);
	RB_CLEAR_NODE(&rq->rb_node);
	rq->cmd = rq->__cmd;
	rq->cmd_len = BLK_MAX_CDB;
	rq->tag = -1;
	rq->ref_count = 1;
	rq->start_time = jiffies;
	set_start_time_ns(rq);
	rq->part = NULL;
}
EXPORT_SYMBOL(blk_rq_init);

static void req_bio_endio(struct request *rq, struct bio *bio,
			  unsigned int nbytes, int error)
{
	if (error)
		clear_bit(BIO_UPTODATE, &bio->bi_flags);
	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		error = -EIO;

	if (unlikely(nbytes > bio->bi_size)) {
		printk(KERN_ERR "%s: want %u bytes done, %u left\n",
		       __func__, nbytes, bio->bi_size);
		nbytes = bio->bi_size;
	}

	if (unlikely(rq->cmd_flags & REQ_QUIET))
		set_bit(BIO_QUIET, &bio->bi_flags);

	bio->bi_size -= nbytes;
	bio->bi_sector += (nbytes >> 9);

	if (bio_integrity(bio))
		bio_integrity_advance(bio, nbytes);

	/* don't actually finish bio if it's part of flush sequence */
	if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
		bio_endio(bio, error);
}

void blk_dump_rq_flags(struct request *rq, char *msg)
{
	int bit;

	printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg,
		rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
		rq->cmd_flags);

	printk(KERN_INFO "  sector %llu, nr/cnr %u/%u\n",
	       (unsigned long long)blk_rq_pos(rq),
	       blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
	printk(KERN_INFO "  bio %p, biotail %p, buffer %p, len %u\n",
	       rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq));

	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
		printk(KERN_INFO "  cdb: ");
		for (bit = 0; bit < BLK_MAX_CDB; bit++)
			printk("%02x ", rq->cmd[bit]);
		printk("\n");
	}
}
EXPORT_SYMBOL(blk_dump_rq_flags);

static void blk_delay_work(struct work_struct *work)
{
	struct request_queue *q;

	q = container_of(work, struct request_queue, delay_work.work);
	spin_lock_irq(q->queue_lock);
	__blk_run_queue(q);
	spin_unlock_irq(q->queue_lock);
}

/**
 * blk_delay_queue - restart queueing after defined interval
 * @q:		The &struct request_queue in question
 * @msecs:	Delay in msecs
 *
 * Description:
 *   Sometimes queueing needs to be postponed for a little while, to allow
 *   resources to come back. This function will make sure that queueing is
 *   restarted around the specified time.
 */
void blk_delay_queue(struct request_queue *q, unsigned long msecs)
{
	queue_delayed_work(kblockd_workqueue, &q->delay_work,
				msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_delay_queue);
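
/*
 * Minimal usage sketch (hypothetical driver; the mydev_* helpers are
 * assumed names, not part of this file): a request_fn that runs out of
 * device resources can back off briefly and let kblockd restart dispatch
 * a few milliseconds later:
 *
 *	static void mydev_request_fn(struct request_queue *q)
 *	{
 *		struct request *rq;
 *
 *		while ((rq = blk_fetch_request(q)) != NULL) {
 *			if (!mydev_can_queue(rq)) {
 *				blk_requeue_request(q, rq);
 *				blk_delay_queue(q, 3);
 *				break;
 *			}
 *			mydev_issue(rq);
 *		}
 *	}
 */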

/**
 * blk_start_queue - restart a previously stopped queue
 * @q:    The &struct request_queue in question
 *
 * Description:
 *   blk_start_queue() will clear the stop flag on the queue, and call
 *   the request_fn for the queue if it was in a stopped state when
 *   entered. Also see blk_stop_queue(). Queue lock must be held.
 **/
void blk_start_queue(struct request_queue *q)
{
	WARN_ON(!irqs_disabled());

	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
	__blk_run_queue(q);
}
EXPORT_SYMBOL(blk_start_queue);

/**
 * blk_stop_queue - stop a queue
 * @q:    The &struct request_queue in question
 *
 * Description:
 *   The Linux block layer assumes that a block driver will consume all
 *   entries on the request queue when the request_fn strategy is called.
 *   Often this will not happen, because of hardware limitations (queue
 *   depth settings). If a device driver gets a 'queue full' response,
 *   or if it simply chooses not to queue more I/O at one point, it can
 *   call this function to prevent the request_fn from being called until
 *   the driver has signalled it's ready to go again. This happens by calling
 *   blk_start_queue() to restart queue operations. Queue lock must be held.
 **/
void blk_stop_queue(struct request_queue *q)
{
	__cancel_delayed_work(&q->delay_work);
	queue_flag_set(QUEUE_FLAG_STOPPED, q);
}
EXPORT_SYMBOL(blk_stop_queue);
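
/*
 * Illustrative pairing (hypothetical driver; the mydev_* name is an
 * assumed helper): stop the queue on a 'queue full' condition and restart
 * it from the completion path, in both cases with the queue lock held as
 * required above:
 *
 *	if (mydev_hw_queue_full(dev)) {
 *		blk_requeue_request(q, rq);
 *		blk_stop_queue(q);
 *	}
 *
 *	spin_lock_irqsave(q->queue_lock, flags);
 *	blk_start_queue(q);
 *	spin_unlock_irqrestore(q->queue_lock, flags);
 */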

/**
 * blk_sync_queue - cancel any pending callbacks on a queue
 * @q: the queue
 *
 * Description:
 *     The block layer may perform asynchronous callback activity
 *     on a queue, such as calling the unplug function after a timeout.
 *     A block device may call blk_sync_queue to ensure that any
 *     such activity is cancelled, thus allowing it to release resources
 *     that the callbacks might use. The caller must already have made sure
 *     that its ->make_request_fn will not re-add plugging prior to calling
 *     this function.
 *
 *     This function does not cancel any asynchronous activity arising
 *     out of elevator or throttling code. That would require elevator_exit()
 *     and blkcg_exit_queue() to be called with queue lock initialized.
 *
 */
void blk_sync_queue(struct request_queue *q)
{
	del_timer_sync(&q->timeout);
	cancel_delayed_work_sync(&q->delay_work);
}
EXPORT_SYMBOL(blk_sync_queue);

/**
 * __blk_run_queue - run a single device queue
 * @q:	The queue to run
 *
 * Description:
 *    See @blk_run_queue. This variant must be called with the queue lock
 *    held and interrupts disabled.
 */
void __blk_run_queue(struct request_queue *q)
{
	if (unlikely(blk_queue_stopped(q)))
		return;

	q->request_fn(q);
}
EXPORT_SYMBOL(__blk_run_queue);

/**
 * blk_run_queue_async - run a single device queue in workqueue context
 * @q:	The queue to run
 *
 * Description:
 *    Tells kblockd to perform the equivalent of @blk_run_queue on behalf
 *    of us.
 */
void blk_run_queue_async(struct request_queue *q)
{
	if (likely(!blk_queue_stopped(q))) {
		__cancel_delayed_work(&q->delay_work);
		queue_delayed_work(kblockd_workqueue, &q->delay_work, 0);
	}
}
EXPORT_SYMBOL(blk_run_queue_async);
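
/*
 * Usage sketch (illustrative): from a context that already holds the
 * queue lock and should not recurse into the dispatch path directly,
 * e.g. a completion handler, defer the queue run to kblockd:
 *
 *	spin_lock_irqsave(q->queue_lock, flags);
 *	__blk_end_request_all(rq, 0);
 *	blk_run_queue_async(q);
 *	spin_unlock_irqrestore(q->queue_lock, flags);
 */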

/**
 * blk_run_queue - run a single device queue
 * @q: The queue to run
 *
 * Description:
 *    Invoke request handling on this queue, if it has pending work to do.
 *    May be used to restart queueing when a request has completed.
 */
void blk_run_queue(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	__blk_run_queue(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}
EXPORT_SYMBOL(blk_run_queue);

void blk_put_queue(struct request_queue *q)
{
	kobject_put(&q->kobj);
}
EXPORT_SYMBOL(blk_put_queue);

/**
 * blk_drain_queue - drain requests from request_queue
 * @q: queue to drain
 * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV
 *
 * Drain requests from @q.  If @drain_all is set, all requests are drained.
 * If not, only ELVPRIV requests are drained.  The caller is responsible
 * for ensuring that no new requests which need to be drained are queued.
 */
void blk_drain_queue(struct request_queue *q, bool drain_all)
{
	int i;

	while (true) {
		bool drain = false;

		spin_lock_irq(q->queue_lock);

		/*
		 * The caller might be trying to drain @q before its
		 * elevator is initialized.
		 */
		if (q->elevator)
			elv_drain_elevator(q);

		blkcg_drain_queue(q);

		/*
		 * This function might be called on a queue which failed
		 * driver init after queue creation or is not yet fully
		 * active yet.  Some drivers (e.g. fd and loop) get unhappy
		 * in such cases.  Kick queue iff dispatch queue has
		 * something on it and @q has request_fn set.
		 */
		if (!list_empty(&q->queue_head) && q->request_fn)
			__blk_run_queue(q);

		drain |= q->rq.elvpriv;

		/*
		 * Unfortunately, requests are queued at and tracked from
		 * multiple places and there's no single counter which can
		 * be drained.  Check all the queues and counters.
		 */
		if (drain_all) {
			drain |= !list_empty(&q->queue_head);
			for (i = 0; i < 2; i++) {
				drain |= q->rq.count[i];
				drain |= q->in_flight[i];
				drain |= !list_empty(&q->flush_queue[i]);
			}
		}

		spin_unlock_irq(q->queue_lock);

		if (!drain)
			break;
		msleep(10);
	}

	/*
	 * With queue marked dead, any woken up waiter will fail the
	 * allocation path, so the wakeup chaining is lost and we're
	 * left with hung waiters. We need to wake up those waiters.
	 */
	if (q->request_fn) {
		spin_lock_irq(q->queue_lock);
		for (i = 0; i < ARRAY_SIZE(q->rq.wait); i++)
			wake_up_all(&q->rq.wait[i]);
		spin_unlock_irq(q->queue_lock);
	}
}

/**
 * blk_queue_bypass_start - enter queue bypass mode
 * @q: queue of interest
 *
 * In bypass mode, only the dispatch FIFO queue of @q is used.  This
 * function makes @q enter bypass mode and drains all requests which were
 * throttled or issued before.  On return, it's guaranteed that no request
 * is being throttled or has ELVPRIV set and blk_queue_bypass() %true
 * inside queue or RCU read lock.
 */
void blk_queue_bypass_start(struct request_queue *q)
{
	bool drain;

	spin_lock_irq(q->queue_lock);
	drain = !q->bypass_depth++;
	queue_flag_set(QUEUE_FLAG_BYPASS, q);
	spin_unlock_irq(q->queue_lock);

	if (drain) {
		blk_drain_queue(q, false);
		/* ensure blk_queue_bypass() is %true inside RCU read lock */
		synchronize_rcu();
	}
}
EXPORT_SYMBOL_GPL(blk_queue_bypass_start);

/**
 * blk_queue_bypass_end - leave queue bypass mode
 * @q: queue of interest
 *
 * Leave bypass mode and restore the normal queueing behavior.
 */
void blk_queue_bypass_end(struct request_queue *q)
{
	spin_lock_irq(q->queue_lock);
	if (!--q->bypass_depth)
		queue_flag_clear(QUEUE_FLAG_BYPASS, q);
	WARN_ON_ONCE(q->bypass_depth < 0);
	spin_unlock_irq(q->queue_lock);
}
EXPORT_SYMBOL_GPL(blk_queue_bypass_end);
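
/*
 * Illustrative pairing: callers such as elevator switches or blkcg policy
 * updates bracket the update so that no request with ELVPRIV can race
 * with it:
 *
 *	blk_queue_bypass_start(q);
 *	...	(update elevator or blkcg state)
 *	blk_queue_bypass_end(q);
 */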

/**
 * blk_cleanup_queue - shutdown a request queue
 * @q: request queue to shutdown
 *
 * Mark @q DEAD, drain all pending requests, destroy and put it.  All
 * future requests will be failed immediately with -ENODEV.
 */
void blk_cleanup_queue(struct request_queue *q)
{
	spinlock_t *lock = q->queue_lock;

	/* mark @q DEAD, no new request or merges will be allowed afterwards */
	mutex_lock(&q->sysfs_lock);
	queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);

	spin_lock_irq(lock);

	/*
	 * Dead queue is permanently in bypass mode till released.  Note
	 * that, unlike blk_queue_bypass_start(), we aren't performing
	 * synchronize_rcu() after entering bypass mode to avoid the delay
	 * as some drivers create and destroy a lot of queues while
	 * probing.  This is still safe because blk_release_queue() will be
	 * called only after the queue refcnt drops to zero and nothing,
	 * RCU or not, would be traversing the queue by then.
	 */
	q->bypass_depth++;
	queue_flag_set(QUEUE_FLAG_BYPASS, q);

	queue_flag_set(QUEUE_FLAG_NOMERGES, q);
	queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
	queue_flag_set(QUEUE_FLAG_DEAD, q);

	if (q->queue_lock != &q->__queue_lock)
		q->queue_lock = &q->__queue_lock;

	spin_unlock_irq(lock);
	mutex_unlock(&q->sysfs_lock);

	/* drain all requests queued before DEAD marking */
	blk_drain_queue(q, true);

	/* @q won't process any more request, flush async actions */
	del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
	blk_sync_queue(q);

	/* @q is and will stay empty, shutdown and put */
	blk_put_queue(q);
}
EXPORT_SYMBOL(blk_cleanup_queue);

static int blk_init_free_list(struct request_queue *q)
{
	struct request_list *rl = &q->rq;

	if (unlikely(rl->rq_pool))
		return 0;

	rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
	rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
	rl->elvpriv = 0;
	init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
	init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);

	rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
				mempool_free_slab, request_cachep, q->node);

	if (!rl->rq_pool)
		return -ENOMEM;

	return 0;
}

struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
{
	return blk_alloc_queue_node(gfp_mask, -1);
}
EXPORT_SYMBOL(blk_alloc_queue);

struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
{
	struct request_queue *q;
	int err;

	q = kmem_cache_alloc_node(blk_requestq_cachep,
				gfp_mask | __GFP_ZERO, node_id);
	if (!q)
		return NULL;

	q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
	if (q->id < 0)
		goto fail_q;

	q->backing_dev_info.ra_pages =
			(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
	q->backing_dev_info.state = 0;
	q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
	q->backing_dev_info.name = "block";
	q->node = node_id;

	err = bdi_init(&q->backing_dev_info);
	if (err)
		goto fail_id;

	setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
		    laptop_mode_timer_fn, (unsigned long) q);
	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
	INIT_LIST_HEAD(&q->queue_head);
	INIT_LIST_HEAD(&q->timeout_list);
	INIT_LIST_HEAD(&q->icq_list);
#ifdef CONFIG_BLK_CGROUP
	INIT_LIST_HEAD(&q->blkg_list);
#endif
	INIT_LIST_HEAD(&q->flush_queue[0]);
	INIT_LIST_HEAD(&q->flush_queue[1]);
	INIT_LIST_HEAD(&q->flush_data_in_flight);
	INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);

	kobject_init(&q->kobj, &blk_queue_ktype);

	mutex_init(&q->sysfs_lock);
	spin_lock_init(&q->__queue_lock);

	/*
	 * By default initialize queue_lock to internal lock and driver can
	 * override it later if need be.
	 */
	q->queue_lock = &q->__queue_lock;

	/*
	 * A queue starts its life with bypass turned on to avoid
	 * unnecessary bypass on/off overhead and nasty surprises during
	 * init.  The initial bypass will be finished at the end of
	 * blk_init_allocated_queue().
	 */
	q->bypass_depth = 1;
	__set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);

	if (blkcg_init_queue(q))
		goto fail_id;

	return q;

fail_id:
	ida_simple_remove(&blk_queue_ida, q->id);
fail_q:
	kmem_cache_free(blk_requestq_cachep, q);
	return NULL;
}
EXPORT_SYMBOL(blk_alloc_queue_node);

/**
 * blk_init_queue  - prepare a request queue for use with a block device
 * @rfn:  The function to be called to process requests that have been
 *        placed on the queue.
 * @lock: Request queue spin lock
 *
 * Description:
 *    If a block device wishes to use the standard request handling procedures,
 *    which sorts requests and coalesces adjacent requests, then it must
 *    call blk_init_queue().  The function @rfn will be called when there
 *    are requests on the queue that need to be processed.  If the device
 *    supports plugging, then @rfn may not be called immediately when requests
 *    are available on the queue, but may be called at some time later instead.
 *    Plugged queues are generally unplugged when a buffer belonging to one
 *    of the requests on the queue is needed, or due to memory pressure.
 *
 *    @rfn is not required, or even expected, to remove all requests off the
 *    queue, but only as many as it can handle at a time.  If it does leave
 *    requests on the queue, it is responsible for arranging that the requests
 *    get dealt with eventually.
 *
 *    The queue spin lock must be held while manipulating the requests on the
 *    request queue; this lock will be taken also from interrupt context, so irq
 *    disabling is needed for it.
 *
 *    Function returns a pointer to the initialized request queue, or %NULL if
 *    it didn't succeed.
 *
 * Note:
 *    blk_init_queue() must be paired with a blk_cleanup_queue() call
 *    when the block device is deactivated (such as at module unload).
 **/

struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
{
	return blk_init_queue_node(rfn, lock, -1);
}
EXPORT_SYMBOL(blk_init_queue);
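
/*
 * Typical setup sketch (hypothetical driver; mydev_request_fn and the
 * dev->lock/dev->queue fields are assumed names): allocate the queue
 * around the driver's request_fn and lock, and pair it with
 * blk_cleanup_queue() at teardown:
 *
 *	spin_lock_init(&dev->lock);
 *	dev->queue = blk_init_queue(mydev_request_fn, &dev->lock);
 *	if (!dev->queue)
 *		return -ENOMEM;
 *	blk_queue_logical_block_size(dev->queue, 512);
 *
 *	blk_cleanup_queue(dev->queue);
 */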

struct request_queue *
blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
{
	struct request_queue *uninit_q, *q;

	uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id);
	if (!uninit_q)
		return NULL;

	q = blk_init_allocated_queue(uninit_q, rfn, lock);
	if (!q)
		blk_cleanup_queue(uninit_q);

	return q;
}
EXPORT_SYMBOL(blk_init_queue_node);

struct request_queue *
blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
			 spinlock_t *lock)
{
	if (!q)
		return NULL;

	if (blk_init_free_list(q))
		return NULL;

	q->request_fn		= rfn;
	q->prep_rq_fn		= NULL;
	q->unprep_rq_fn		= NULL;
	q->queue_flags		= QUEUE_FLAG_DEFAULT;

	/* Override internal queue lock with supplied lock pointer */
	if (lock)
		q->queue_lock		= lock;

	/*
	 * This also sets hw/phys segments, boundary and size
	 */
	blk_queue_make_request(q, blk_queue_bio);

	q->sg_reserved_size = INT_MAX;

	/* init elevator */
	if (elevator_init(q, NULL))
		return NULL;

	blk_queue_congestion_threshold(q);

	/* all done, end the initial bypass */
	blk_queue_bypass_end(q);
	return q;
}
EXPORT_SYMBOL(blk_init_allocated_queue);

bool blk_get_queue(struct request_queue *q)
{
	if (likely(!blk_queue_dead(q))) {
		__blk_get_queue(q);
		return true;
	}

	return false;
}
EXPORT_SYMBOL(blk_get_queue);

static inline void blk_free_request(struct request_queue *q, struct request *rq)
{
	if (rq->cmd_flags & REQ_ELVPRIV) {
		elv_put_request(q, rq);
		if (rq->elv.icq)
			put_io_context(rq->elv.icq->ioc);
	}

	mempool_free(rq, q->rq.rq_pool);
}

/*
 * ioc_batching returns true if the ioc is a valid batching request and
 * should be given priority access to a request.
 */
static inline int ioc_batching(struct request_queue *q, struct io_context *ioc)
{
	if (!ioc)
		return 0;

	/*
	 * Make sure the process is able to allocate at least 1 request
	 * even if the batch times out, otherwise we could theoretically
	 * lose wakeups.
	 */
	return ioc->nr_batch_requests == q->nr_batching ||
		(ioc->nr_batch_requests > 0
		&& time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
}

/*
 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
 * will cause the process to be a "batcher" on all queues in the system. This
 * is the behaviour we want though - once it gets a wakeup it should be given
 * a nice run.
 */
static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
{
	if (!ioc || ioc_batching(q, ioc))
		return;

	ioc->nr_batch_requests = q->nr_batching;
	ioc->last_waited = jiffies;
}

static void __freed_request(struct request_queue *q, int sync)
{
	struct request_list *rl = &q->rq;

	if (rl->count[sync] < queue_congestion_off_threshold(q))
		blk_clear_queue_congested(q, sync);

	if (rl->count[sync] + 1 <= q->nr_requests) {
		if (waitqueue_active(&rl->wait[sync]))
			wake_up(&rl->wait[sync]);

		blk_clear_queue_full(q, sync);
	}
}

/*
 * A request has just been released.  Account for it, update the full and
 * congestion status, wake up any waiters.   Called under q->queue_lock.
 */
static void freed_request(struct request_queue *q, unsigned int flags)
{
	struct request_list *rl = &q->rq;
	int sync = rw_is_sync(flags);

	rl->count[sync]--;
	if (flags & REQ_ELVPRIV)
		rl->elvpriv--;

	__freed_request(q, sync);

	if (unlikely(rl->starved[sync ^ 1]))
		__freed_request(q, sync ^ 1);
}

/*
 * Determine if elevator data should be initialized when allocating the
 * request associated with @bio.
 */
static bool blk_rq_should_init_elevator(struct bio *bio)
{
	if (!bio)
		return true;

	/*
	 * Flush requests do not use the elevator so skip initialization.
	 * This allows a request to share the flush and elevator data.
	 */
	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA))
		return false;

	return true;
}

/**
 * rq_ioc - determine io_context for request allocation
 * @bio: request being allocated is for this bio (can be %NULL)
 *
 * Determine io_context to use for request allocation for @bio.  May return
 * %NULL if %current->io_context doesn't exist.
 */
static struct io_context *rq_ioc(struct bio *bio)
{
#ifdef CONFIG_BLK_CGROUP
	if (bio && bio->bi_ioc)
		return bio->bi_ioc;
#endif
	return current->io_context;
}

/**
 * get_request - get a free request
 * @q: request_queue to allocate request from
 * @rw_flags: RW and SYNC flags
 * @bio: bio to allocate request for (can be %NULL)
 * @gfp_mask: allocation mask
 *
 * Get a free request from @q.  This function may fail under memory
 * pressure or if @q is dead.
 *
 * Must be called with @q->queue_lock held and,
 * Returns %NULL on failure, with @q->queue_lock held.
 * Returns !%NULL on success, with @q->queue_lock *not held*.
 */
static struct request *get_request(struct request_queue *q, int rw_flags,
				   struct bio *bio, gfp_t gfp_mask)
{
	struct request *rq;
	struct request_list *rl = &q->rq;
	struct elevator_type *et;
	struct io_context *ioc;
	struct io_cq *icq = NULL;
	const bool is_sync = rw_is_sync(rw_flags) != 0;
	bool retried = false;
	int may_queue;
retry:
	et = q->elevator->type;
	ioc = rq_ioc(bio);

	if (unlikely(blk_queue_dead(q)))
		return NULL;

	may_queue = elv_may_queue(q, rw_flags);
	if (may_queue == ELV_MQUEUE_NO)
		goto rq_starved;

	if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
		if (rl->count[is_sync]+1 >= q->nr_requests) {
			/*
			 * We want ioc to record batching state.  If it's
			 * not already there, creating a new one requires
			 * dropping queue_lock, which in turn requires
			 * retesting conditions to avoid queue hang.
			 */
			if (!ioc && !retried) {
				spin_unlock_irq(q->queue_lock);
				create_io_context(gfp_mask, q->node);
				spin_lock_irq(q->queue_lock);
				retried = true;
				goto retry;
			}

			/*
			 * The queue will fill after this allocation, so set
			 * it as full, and mark this process as "batching".
			 * This process will be allowed to complete a batch of
			 * requests, others will be blocked.
			 */
			if (!blk_queue_full(q, is_sync)) {
				ioc_set_batching(q, ioc);
				blk_set_queue_full(q, is_sync);
			} else {
				if (may_queue != ELV_MQUEUE_MUST
						&& !ioc_batching(q, ioc)) {
					/*
					 * The queue is full and the allocating
					 * process is not a "batcher", and not
					 * exempted by the IO scheduler
					 */
					return NULL;
				}
			}
		}
		blk_set_queue_congested(q, is_sync);
	}

	/*
	 * Only allow batching queuers to allocate up to 50% over the defined
	 * limit of requests, otherwise we could have thousands of requests
	 * allocated with any setting of ->nr_requests
	 */
	if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
		return NULL;

	rl->count[is_sync]++;
	rl->starved[is_sync] = 0;

	/*
	 * Decide whether the new request will be managed by elevator.  If
	 * so, mark @rw_flags and increment elvpriv.  Non-zero elvpriv will
	 * prevent the current elevator from being destroyed until the new
	 * request is freed.  This guarantees icq's won't be destroyed and
	 * makes creating new ones safe.
	 *
	 * Also, lookup icq while holding queue_lock.  If it doesn't exist,
	 * it will be created after releasing queue_lock.
	 */
	if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) {
		rw_flags |= REQ_ELVPRIV;
		rl->elvpriv++;
		if (et->icq_cache && ioc)
			icq = ioc_lookup_icq(ioc, q);
	}

	if (blk_queue_io_stat(q))
		rw_flags |= REQ_IO_STAT;
	spin_unlock_irq(q->queue_lock);

	/* allocate and init request */
	rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
	if (!rq)
		goto fail_alloc;

	blk_rq_init(q, rq);
	rq->cmd_flags = rw_flags | REQ_ALLOCED;

	/* init elvpriv */
	if (rw_flags & REQ_ELVPRIV) {
		if (unlikely(et->icq_cache && !icq)) {
			create_io_context(gfp_mask, q->node);
			ioc = rq_ioc(bio);
			if (!ioc)
				goto fail_elvpriv;

			icq = ioc_create_icq(ioc, q, gfp_mask);
			if (!icq)
				goto fail_elvpriv;
		}

		rq->elv.icq = icq;
		if (unlikely(elv_set_request(q, rq, bio, gfp_mask)))
			goto fail_elvpriv;

		/* @rq->elv.icq holds io_context until @rq is freed */
		if (icq)
			get_io_context(icq->ioc);
	}
out:
	/*
	 * ioc may be NULL here, and ioc_batching will be false. That's
	 * OK, if the queue is under the request limit then requests need
	 * not count toward the nr_batch_requests limit. There will always
	 * be some limit enforced by BLK_BATCH_TIME.
	 */
	if (ioc_batching(q, ioc))
		ioc->nr_batch_requests--;

	trace_block_getrq(q, bio, rw_flags & 1);
	return rq;

fail_elvpriv:
	/*
	 * elvpriv init failed.  ioc, icq and elvpriv aren't mempool backed
	 * and may fail indefinitely under memory pressure and thus
	 * shouldn't stall IO.  Treat this request as !elvpriv.  This will
	 * disturb iosched and blkcg but weird is better than dead.
	 */
	printk_ratelimited(KERN_WARNING "%s: request aux data allocation failed, iosched may be disturbed\n",
			   dev_name(q->backing_dev_info.dev));

	rq->cmd_flags &= ~REQ_ELVPRIV;
	rq->elv.icq = NULL;

	spin_lock_irq(q->queue_lock);
	rl->elvpriv--;
	spin_unlock_irq(q->queue_lock);
	goto out;

fail_alloc:
	/*
	 * Allocation failed presumably due to memory. Undo anything we
	 * might have messed up.
	 *
	 * Allocating task should really be put onto the front of the wait
	 * queue, but this is pretty rare.
	 */
	spin_lock_irq(q->queue_lock);
	freed_request(q, rw_flags);

	/*
	 * in the very unlikely event that allocation failed and no
	 * requests for this direction was pending, mark us starved so that
	 * freeing of a request in the other direction will notice
	 * us. another possible fix would be to split the rq mempool into
	 * READ and WRITE
	 */
rq_starved:
	if (unlikely(rl->count[is_sync] == 0))
		rl->starved[is_sync] = 1;
	return NULL;
}

/**
 * get_request_wait - get a free request with retry
 * @q: request_queue to allocate request from
 * @rw_flags: RW and SYNC flags
 * @bio: bio to allocate request for (can be %NULL)
 *
 * Get a free request from @q.  This function keeps retrying under memory
 * pressure and fails iff @q is dead.
 *
 * Must be called with @q->queue_lock held and,
 * Returns %NULL on failure, with @q->queue_lock held.
 * Returns !%NULL on success, with @q->queue_lock *not held*.
 */
static struct request *get_request_wait(struct request_queue *q, int rw_flags,
					struct bio *bio)
{
	const bool is_sync = rw_is_sync(rw_flags) != 0;
	struct request *rq;

	rq = get_request(q, rw_flags, bio, GFP_NOIO);
	while (!rq) {
		DEFINE_WAIT(wait);
		struct request_list *rl = &q->rq;

		if (unlikely(blk_queue_dead(q)))
			return NULL;

		prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
				TASK_UNINTERRUPTIBLE);

		trace_block_sleeprq(q, bio, rw_flags & 1);

		spin_unlock_irq(q->queue_lock);
		io_schedule();

		/*
		 * After sleeping, we become a "batching" process and
		 * will be able to allocate at least one request, and
		 * up to a big batch of them for a small period time.
		 * See ioc_batching, ioc_set_batching
		 */
		create_io_context(GFP_NOIO, q->node);
		ioc_set_batching(q, current->io_context);

		spin_lock_irq(q->queue_lock);
		finish_wait(&rl->wait[is_sync], &wait);

		rq = get_request(q, rw_flags, bio, GFP_NOIO);
	};

	return rq;
}

struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
{
	struct request *rq;

	BUG_ON(rw != READ && rw != WRITE);