/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
 * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
 *	-  July2000
 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
 */

/*
 * This handles all read/write requests to block devices
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/fault-inject.h>
#include <linux/list_sort.h>

#define CREATE_TRACE_POINTS
#include <trace/events/block.h>

#include "blk.h"

EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);

static int __make_request(struct request_queue *q, struct bio *bio);

/*
 * For the allocated request tables
 */
static struct kmem_cache *request_cachep;

/*
 * For queue allocation
 */
struct kmem_cache *blk_requestq_cachep;

/*
 * Controlling structure to kblockd
 */
static struct workqueue_struct *kblockd_workqueue;

static void drive_stat_acct(struct request *rq, int new_io)
{
	struct hd_struct *part;
	int rw = rq_data_dir(rq);
	int cpu;

	if (!blk_do_io_stat(rq))
		return;

	cpu = part_stat_lock();

	if (!new_io) {
		part = rq->part;
		part_stat_inc(cpu, part, merges[rw]);
	} else {
		part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
		if (!hd_struct_try_get(part)) {
			/*
			 * The partition is already being removed,
			 * the request will be accounted on the disk only
			 *
			 * We take a reference on disk->part0 although that
			 * partition will never be deleted, so we can treat
			 * it as any other partition.
			 */
			part = &rq->rq_disk->part0;
			hd_struct_get(part);
		}
		part_round_stats(cpu, part);
		part_inc_in_flight(part, rw);
		rq->part = part;
	}

	part_stat_unlock();
}

void blk_queue_congestion_threshold(struct request_queue *q)
{
	int nr;

	nr = q->nr_requests - (q->nr_requests / 8) + 1;
	if (nr > q->nr_requests)
		nr = q->nr_requests;
	q->nr_congestion_on = nr;

	nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
	if (nr < 1)
		nr = 1;
	q->nr_congestion_off = nr;
}
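
/*
 * Worked example (illustrative only, not used by the code): assuming the
 * default nr_requests of 128, the formulas above give
 *
 *	nr_congestion_on  = 128 - 128/8 + 1          = 113
 *	nr_congestion_off = 128 - 128/8 - 128/16 - 1 = 103
 *
 * so a queue is flagged congested once roughly 113 requests of one type
 * are allocated, and the flag is cleared again when the count drops
 * below 103, giving a small hysteresis window between the two states.
 */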

/**
 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
 * @bdev:	device
 *
 * Locates the passed device's request queue and returns the address of its
 * backing_dev_info
 *
 * Will return NULL if the request queue cannot be located.
 */
struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
{
	struct backing_dev_info *ret = NULL;
	struct request_queue *q = bdev_get_queue(bdev);

	if (q)
		ret = &q->backing_dev_info;
	return ret;
}
EXPORT_SYMBOL(blk_get_backing_dev_info);

void blk_rq_init(struct request_queue *q, struct request *rq)
{
	memset(rq, 0, sizeof(*rq));

	INIT_LIST_HEAD(&rq->queuelist);
	INIT_LIST_HEAD(&rq->timeout_list);
	rq->cpu = -1;
	rq->q = q;
	rq->__sector = (sector_t) -1;
	INIT_HLIST_NODE(&rq->hash);
	RB_CLEAR_NODE(&rq->rb_node);
	rq->cmd = rq->__cmd;
	rq->cmd_len = BLK_MAX_CDB;
	rq->tag = -1;
	rq->ref_count = 1;
	rq->start_time = jiffies;
	set_start_time_ns(rq);
	rq->part = NULL;
}
EXPORT_SYMBOL(blk_rq_init);

static void req_bio_endio(struct request *rq, struct bio *bio,
			  unsigned int nbytes, int error)
{
	if (error)
		clear_bit(BIO_UPTODATE, &bio->bi_flags);
	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		error = -EIO;

	if (unlikely(nbytes > bio->bi_size)) {
		printk(KERN_ERR "%s: want %u bytes done, %u left\n",
		       __func__, nbytes, bio->bi_size);
		nbytes = bio->bi_size;
	}

	if (unlikely(rq->cmd_flags & REQ_QUIET))
		set_bit(BIO_QUIET, &bio->bi_flags);

	bio->bi_size -= nbytes;
	bio->bi_sector += (nbytes >> 9);

	if (bio_integrity(bio))
		bio_integrity_advance(bio, nbytes);

	/* don't actually finish bio if it's part of flush sequence */
	if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
		bio_endio(bio, error);
}

void blk_dump_rq_flags(struct request *rq, char *msg)
{
	int bit;

	printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg,
		rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
		rq->cmd_flags);

	printk(KERN_INFO "  sector %llu, nr/cnr %u/%u\n",
	       (unsigned long long)blk_rq_pos(rq),
	       blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
	printk(KERN_INFO "  bio %p, biotail %p, buffer %p, len %u\n",
	       rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq));

	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
		printk(KERN_INFO "  cdb: ");
		for (bit = 0; bit < BLK_MAX_CDB; bit++)
			printk("%02x ", rq->cmd[bit]);
		printk("\n");
	}
}
EXPORT_SYMBOL(blk_dump_rq_flags);

static void blk_delay_work(struct work_struct *work)
{
	struct request_queue *q;

	q = container_of(work, struct request_queue, delay_work.work);
	spin_lock_irq(q->queue_lock);
	__blk_run_queue(q);
	spin_unlock_irq(q->queue_lock);
}

/**
 * blk_delay_queue - restart queueing after defined interval
 * @q:		The &struct request_queue in question
 * @msecs:	Delay in msecs
 *
 * Description:
 *   Sometimes queueing needs to be postponed for a little while, to allow
 *   resources to come back. This function will make sure that queueing is
 *   restarted around the specified time.
 */
void blk_delay_queue(struct request_queue *q, unsigned long msecs)
{
	queue_delayed_work(kblockd_workqueue, &q->delay_work,
				msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_delay_queue);
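
/*
 * Example (illustrative sketch only, not part of this file): a driver's
 * request_fn that temporarily runs out of hardware resources can back
 * off briefly instead of busy-looping.  mydev_hw_busy() and
 * mydev_dispatch() are hypothetical driver helpers.
 *
 *	static void mydev_request_fn(struct request_queue *q)
 *	{
 *		struct request *rq;
 *
 *		while ((rq = blk_fetch_request(q)) != NULL) {
 *			if (mydev_hw_busy(q->queuedata)) {
 *				blk_requeue_request(q, rq);
 *				blk_delay_queue(q, 3);	// retry in ~3ms
 *				break;
 *			}
 *			mydev_dispatch(rq);
 *		}
 *	}
 */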

/**
 * blk_start_queue - restart a previously stopped queue
 * @q:    The &struct request_queue in question
 *
 * Description:
 *   blk_start_queue() will clear the stop flag on the queue, and call
 *   the request_fn for the queue if it was in a stopped state when
 *   entered. Also see blk_stop_queue(). Queue lock must be held.
 **/
void blk_start_queue(struct request_queue *q)
{
	WARN_ON(!irqs_disabled());

	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
	__blk_run_queue(q);
}
EXPORT_SYMBOL(blk_start_queue);

/**
 * blk_stop_queue - stop a queue
 * @q:    The &struct request_queue in question
 *
 * Description:
 *   The Linux block layer assumes that a block driver will consume all
 *   entries on the request queue when the request_fn strategy is called.
 *   Often this will not happen, because of hardware limitations (queue
 *   depth settings). If a device driver gets a 'queue full' response,
 *   or if it simply chooses not to queue more I/O at one point, it can
 *   call this function to prevent the request_fn from being called until
 *   the driver has signalled it's ready to go again. This happens by calling
 *   blk_start_queue() to restart queue operations. Queue lock must be held.
 **/
void blk_stop_queue(struct request_queue *q)
{
	__cancel_delayed_work(&q->delay_work);
	queue_flag_set(QUEUE_FLAG_STOPPED, q);
}
EXPORT_SYMBOL(blk_stop_queue);
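
/*
 * Example (illustrative sketch only): the usual pairing is to call
 * blk_stop_queue() when the device reports it cannot take more work and
 * blk_start_queue() from the completion/interrupt path once there is
 * room again, both with q->queue_lock held.  The "mydev" fields below
 * are hypothetical.
 *
 *	spin_lock_irqsave(q->queue_lock, flags);
 *	if (mydev->hw_queue_full)
 *		blk_stop_queue(q);
 *	spin_unlock_irqrestore(q->queue_lock, flags);
 *
 *	// later, from the IRQ handler, again under q->queue_lock:
 *	blk_start_queue(q);
 */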

/**
 * blk_sync_queue - cancel any pending callbacks on a queue
 * @q: the queue
 *
 * Description:
 *     The block layer may perform asynchronous callback activity
 *     on a queue, such as calling the unplug function after a timeout.
 *     A block device may call blk_sync_queue to ensure that any
 *     such activity is cancelled, thus allowing it to release resources
 *     that the callbacks might use. The caller must already have made sure
 *     that its ->make_request_fn will not re-add plugging prior to calling
 *     this function.
 *
 *     This function does not cancel any asynchronous activity arising
 *     out of elevator or throttling code. That would require elevator_exit()
 *     and blk_throtl_exit() to be called with queue lock initialized.
 *
 */
void blk_sync_queue(struct request_queue *q)
{
	del_timer_sync(&q->timeout);
	cancel_delayed_work_sync(&q->delay_work);
}
EXPORT_SYMBOL(blk_sync_queue);

/**
 * __blk_run_queue - run a single device queue
 * @q:	The queue to run
 *
 * Description:
 *    See @blk_run_queue. This variant must be called with the queue lock
 *    held and interrupts disabled.
 */
void __blk_run_queue(struct request_queue *q)
{
	if (unlikely(blk_queue_stopped(q)))
		return;

	q->request_fn(q);
}
EXPORT_SYMBOL(__blk_run_queue);

/**
 * blk_run_queue_async - run a single device queue in workqueue context
 * @q:	The queue to run
 *
 * Description:
 *    Tells kblockd to perform the equivalent of @blk_run_queue on behalf
 *    of us.
 */
void blk_run_queue_async(struct request_queue *q)
{
	if (likely(!blk_queue_stopped(q))) {
		__cancel_delayed_work(&q->delay_work);
		queue_delayed_work(kblockd_workqueue, &q->delay_work, 0);
	}
}
EXPORT_SYMBOL(blk_run_queue_async);

/**
 * blk_run_queue - run a single device queue
 * @q: The queue to run
 *
 * Description:
 *    Invoke request handling on this queue, if it has pending work to do.
 *    May be used to restart queueing when a request has completed.
 */
void blk_run_queue(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	__blk_run_queue(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}
EXPORT_SYMBOL(blk_run_queue);
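
/*
 * Example (illustrative sketch only): a driver that stopped or delayed
 * its queue can kick request processing again from process context once
 * resources are available; blk_run_queue() takes the queue lock itself.
 * mydev_resources_available() is a hypothetical helper.
 *
 *	if (mydev_resources_available(mydev))
 *		blk_run_queue(mydev->queue);
 *
 * With q->queue_lock already held use __blk_run_queue(), or use
 * blk_run_queue_async() to defer the run to kblockd.
 */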

void blk_put_queue(struct request_queue *q)
{
	kobject_put(&q->kobj);
}
EXPORT_SYMBOL(blk_put_queue);

/*
 * Note: If a driver supplied the queue lock, it is disconnected
 * by this function. The actual state of the lock doesn't matter
 * here as the request_queue isn't accessible after this point
 * (QUEUE_FLAG_DEAD is set) and no other requests will be queued.
 */
void blk_cleanup_queue(struct request_queue *q)
{
	/*
	 * We know we have process context here, so we can be a little
	 * cautious and ensure that pending block actions on this device
	 * are done before moving on. Going into this function, we should
	 * not have processes doing IO to this device.
	 */
	blk_sync_queue(q);

	del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
	mutex_lock(&q->sysfs_lock);
	queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
	mutex_unlock(&q->sysfs_lock);

	if (q->queue_lock != &q->__queue_lock)
		q->queue_lock = &q->__queue_lock;

	blk_put_queue(q);
}
EXPORT_SYMBOL(blk_cleanup_queue);

static int blk_init_free_list(struct request_queue *q)
{
	struct request_list *rl = &q->rq;

	if (unlikely(rl->rq_pool))
		return 0;

	rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
	rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
	rl->elvpriv = 0;
	init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
	init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);

	rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
				mempool_free_slab, request_cachep, q->node);

	if (!rl->rq_pool)
		return -ENOMEM;

	return 0;
}

struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
{
	return blk_alloc_queue_node(gfp_mask, -1);
}
EXPORT_SYMBOL(blk_alloc_queue);

struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
{
	struct request_queue *q;
	int err;

	q = kmem_cache_alloc_node(blk_requestq_cachep,
				gfp_mask | __GFP_ZERO, node_id);
	if (!q)
		return NULL;

	q->backing_dev_info.ra_pages =
			(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
	q->backing_dev_info.state = 0;
	q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
	q->backing_dev_info.name = "block";

	err = bdi_init(&q->backing_dev_info);
	if (err) {
		kmem_cache_free(blk_requestq_cachep, q);
		return NULL;
	}

	if (blk_throtl_init(q)) {
		kmem_cache_free(blk_requestq_cachep, q);
		return NULL;
	}

	setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
		    laptop_mode_timer_fn, (unsigned long) q);
	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
	INIT_LIST_HEAD(&q->timeout_list);
	INIT_LIST_HEAD(&q->flush_queue[0]);
	INIT_LIST_HEAD(&q->flush_queue[1]);
	INIT_LIST_HEAD(&q->flush_data_in_flight);
	INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);

	kobject_init(&q->kobj, &blk_queue_ktype);

	mutex_init(&q->sysfs_lock);
	spin_lock_init(&q->__queue_lock);

	/*
	 * By default initialize queue_lock to internal lock and driver can
	 * override it later if need be.
	 */
	q->queue_lock = &q->__queue_lock;

	return q;
}
EXPORT_SYMBOL(blk_alloc_queue_node);

/**
 * blk_init_queue  - prepare a request queue for use with a block device
 * @rfn:  The function to be called to process requests that have been
 *        placed on the queue.
 * @lock: Request queue spin lock
 *
 * Description:
 *    If a block device wishes to use the standard request handling procedures,
 *    which sorts requests and coalesces adjacent requests, then it must
 *    call blk_init_queue().  The function @rfn will be called when there
 *    are requests on the queue that need to be processed.  If the device
 *    supports plugging, then @rfn may not be called immediately when requests
 *    are available on the queue, but may be called at some time later instead.
 *    Plugged queues are generally unplugged when a buffer belonging to one
 *    of the requests on the queue is needed, or due to memory pressure.
 *
 *    @rfn is not required, or even expected, to remove all requests off the
 *    queue, but only as many as it can handle at a time.  If it does leave
 *    requests on the queue, it is responsible for arranging that the requests
 *    get dealt with eventually.
 *
 *    The queue spin lock must be held while manipulating the requests on the
 *    request queue; this lock will be taken also from interrupt context, so irq
 *    disabling is needed for it.
 *
 *    Function returns a pointer to the initialized request queue, or %NULL if
 *    it didn't succeed.
 *
 * Note:
 *    blk_init_queue() must be paired with a blk_cleanup_queue() call
 *    when the block device is deactivated (such as at module unload).
 **/

struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
{
	return blk_init_queue_node(rfn, lock, -1);
}
EXPORT_SYMBOL(blk_init_queue);
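
/*
 * Example (illustrative sketch only): minimal queue setup in a driver's
 * probe routine.  "mydev" and mydev_request_fn() are hypothetical.
 *
 *	spin_lock_init(&mydev->lock);
 *	mydev->queue = blk_init_queue(mydev_request_fn, &mydev->lock);
 *	if (!mydev->queue)
 *		return -ENOMEM;
 *	mydev->queue->queuedata = mydev;
 *
 * The matching teardown at module unload is blk_cleanup_queue().
 */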

struct request_queue *
blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
{
	struct request_queue *uninit_q, *q;

	uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id);
	if (!uninit_q)
		return NULL;

	q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id);
	if (!q)
		blk_cleanup_queue(uninit_q);

	return q;
}
EXPORT_SYMBOL(blk_init_queue_node);

struct request_queue *
blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
			 spinlock_t *lock)
{
	return blk_init_allocated_queue_node(q, rfn, lock, -1);
}
EXPORT_SYMBOL(blk_init_allocated_queue);

struct request_queue *
blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
			      spinlock_t *lock, int node_id)
{
	if (!q)
		return NULL;

	q->node = node_id;
	if (blk_init_free_list(q))
		return NULL;

	q->request_fn		= rfn;
	q->prep_rq_fn		= NULL;
	q->unprep_rq_fn		= NULL;
	q->queue_flags		= QUEUE_FLAG_DEFAULT;

	/* Override internal queue lock with supplied lock pointer */
	if (lock)
		q->queue_lock		= lock;

	/*
	 * This also sets hw/phys segments, boundary and size
	 */
	blk_queue_make_request(q, __make_request);

	q->sg_reserved_size = INT_MAX;

	/*
	 * all done
	 */
	if (!elevator_init(q, NULL)) {
		blk_queue_congestion_threshold(q);
		return q;
	}

	return NULL;
}
EXPORT_SYMBOL(blk_init_allocated_queue_node);

int blk_get_queue(struct request_queue *q)
{
	if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
		kobject_get(&q->kobj);
		return 0;
	}

	return 1;
}
EXPORT_SYMBOL(blk_get_queue);

static inline void blk_free_request(struct request_queue *q, struct request *rq)
{
	if (rq->cmd_flags & REQ_ELVPRIV)
		elv_put_request(q, rq);
	mempool_free(rq, q->rq.rq_pool);
}

static struct request *
blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask)
{
	struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);

	if (!rq)
		return NULL;

	blk_rq_init(q, rq);

	rq->cmd_flags = flags | REQ_ALLOCED;

	if (priv) {
		if (unlikely(elv_set_request(q, rq, gfp_mask))) {
			mempool_free(rq, q->rq.rq_pool);
			return NULL;
		}
		rq->cmd_flags |= REQ_ELVPRIV;
	}

	return rq;
}

/*
 * ioc_batching returns true if the ioc is a valid batching request and
 * should be given priority access to a request.
 */
static inline int ioc_batching(struct request_queue *q, struct io_context *ioc)
{
	if (!ioc)
		return 0;

	/*
	 * Make sure the process is able to allocate at least 1 request
	 * even if the batch times out, otherwise we could theoretically
	 * lose wakeups.
	 */
	return ioc->nr_batch_requests == q->nr_batching ||
		(ioc->nr_batch_requests > 0
		&& time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
}

/*
 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
 * will cause the process to be a "batcher" on all queues in the system. This
 * is the behaviour we want though - once it gets a wakeup it should be given
 * a nice run.
 */
static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
{
	if (!ioc || ioc_batching(q, ioc))
		return;

	ioc->nr_batch_requests = q->nr_batching;
	ioc->last_waited = jiffies;
}

static void __freed_request(struct request_queue *q, int sync)
{
	struct request_list *rl = &q->rq;

	if (rl->count[sync] < queue_congestion_off_threshold(q))
		blk_clear_queue_congested(q, sync);

	if (rl->count[sync] + 1 <= q->nr_requests) {
		if (waitqueue_active(&rl->wait[sync]))
			wake_up(&rl->wait[sync]);

		blk_clear_queue_full(q, sync);
	}
}

/*
 * A request has just been released.  Account for it, update the full and
 * congestion status, wake up any waiters.   Called under q->queue_lock.
 */
static void freed_request(struct request_queue *q, int sync, int priv)
{
	struct request_list *rl = &q->rq;

	rl->count[sync]--;
	if (priv)
		rl->elvpriv--;

	__freed_request(q, sync);

	if (unlikely(rl->starved[sync ^ 1]))
		__freed_request(q, sync ^ 1);
}

/*
 * Determine if elevator data should be initialized when allocating the
 * request associated with @bio.
 */
static bool blk_rq_should_init_elevator(struct bio *bio)
{
	if (!bio)
		return true;

	/*
	 * Flush requests do not use the elevator so skip initialization.
	 * This allows a request to share the flush and elevator data.
	 */
	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA))
		return false;

	return true;
}

/*
 * Get a free request, queue_lock must be held.
 * Returns NULL on failure, with queue_lock held.
 * Returns !NULL on success, with queue_lock *not held*.
 */
static struct request *get_request(struct request_queue *q, int rw_flags,
				   struct bio *bio, gfp_t gfp_mask)
{
	struct request *rq = NULL;
	struct request_list *rl = &q->rq;
	struct io_context *ioc = NULL;
	const bool is_sync = rw_is_sync(rw_flags) != 0;
	int may_queue, priv = 0;

	may_queue = elv_may_queue(q, rw_flags);
	if (may_queue == ELV_MQUEUE_NO)
		goto rq_starved;

	if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
		if (rl->count[is_sync]+1 >= q->nr_requests) {
			ioc = current_io_context(GFP_ATOMIC, q->node);
			/*
			 * The queue will fill after this allocation, so set
			 * it as full, and mark this process as "batching".
			 * This process will be allowed to complete a batch of
			 * requests, others will be blocked.
			 */
			if (!blk_queue_full(q, is_sync)) {
				ioc_set_batching(q, ioc);
				blk_set_queue_full(q, is_sync);
			} else {
				if (may_queue != ELV_MQUEUE_MUST
						&& !ioc_batching(q, ioc)) {
					/*
					 * The queue is full and the allocating
					 * process is not a "batcher", and not
					 * exempted by the IO scheduler
					 */
					goto out;
				}
			}
		}
		blk_set_queue_congested(q, is_sync);
	}

	/*
	 * Only allow batching queuers to allocate up to 50% over the defined
	 * limit of requests, otherwise we could have thousands of requests
	 * allocated with any setting of ->nr_requests
	 */
	if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
		goto out;

	rl->count[is_sync]++;
	rl->starved[is_sync] = 0;

	if (blk_rq_should_init_elevator(bio)) {
		priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
		if (priv)
			rl->elvpriv++;
	}

	if (blk_queue_io_stat(q))
		rw_flags |= REQ_IO_STAT;
	spin_unlock_irq(q->queue_lock);

	rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
	if (unlikely(!rq)) {
		/*
		 * Allocation failed presumably due to memory. Undo anything
		 * we might have messed up.
		 *
		 * Allocating task should really be put onto the front of the
		 * wait queue, but this is pretty rare.
		 */
		spin_lock_irq(q->queue_lock);
		freed_request(q, is_sync, priv);

		/*
		 * in the very unlikely event that allocation failed and no
		 * requests for this direction was pending, mark us starved
		 * so that freeing of a request in the other direction will
		 * notice us. another possible fix would be to split the
		 * rq mempool into READ and WRITE
		 */
rq_starved:
		if (unlikely(rl->count[is_sync] == 0))
			rl->starved[is_sync] = 1;

		goto out;
	}

	/*
	 * ioc may be NULL here, and ioc_batching will be false. That's
	 * OK, if the queue is under the request limit then requests need
	 * not count toward the nr_batch_requests limit. There will always
	 * be some limit enforced by BLK_BATCH_TIME.
	 */
	if (ioc_batching(q, ioc))
		ioc->nr_batch_requests--;

	trace_block_getrq(q, bio, rw_flags & 1);
out:
	return rq;
}

/*
 * No available requests for this queue, wait for some requests to become
 * available.
 *
 * Called with q->queue_lock held, and returns with it unlocked.
 */
static struct request *get_request_wait(struct request_queue *q, int rw_flags,
					struct bio *bio)
{
	const bool is_sync = rw_is_sync(rw_flags) != 0;
	struct request *rq;

	rq = get_request(q, rw_flags, bio, GFP_NOIO);
	while (!rq) {
		DEFINE_WAIT(wait);
		struct io_context *ioc;
		struct request_list *rl = &q->rq;

		prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
				TASK_UNINTERRUPTIBLE);

		trace_block_sleeprq(q, bio, rw_flags & 1);

		spin_unlock_irq(q->queue_lock);
		io_schedule();

		/*
		 * After sleeping, we become a "batching" process and
		 * will be able to allocate at least one request, and
		 * up to a big batch of them for a small period time.
		 * See ioc_batching, ioc_set_batching
		 */
		ioc = current_io_context(GFP_NOIO, q->node);
		ioc_set_batching(q, ioc);

		spin_lock_irq(q->queue_lock);
		finish_wait(&rl->wait[is_sync], &wait);

		rq = get_request(q, rw_flags, bio, GFP_NOIO);
	};

	return rq;
}

struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
{
	struct request *rq;

	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
		return NULL;

	BUG_ON(rw != READ && rw != WRITE);

	spin_lock_irq(q->queue_lock);
	if (gfp_mask & __GFP_WAIT) {
		rq = get_request_wait(q, rw, NULL);
	} else {
		rq = get_request(q, rw, NULL, gfp_mask);
		if (!rq)
			spin_unlock_irq(q->queue_lock);
	}
	/* q->queue_lock is unlocked at this point */

	return rq;
}
EXPORT_SYMBOL(blk_get_request);
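
/*
 * Example (illustrative sketch only): allocating a request from process
 * context for a driver-private command.  "my_cmd" is a hypothetical
 * payload; blk_execute_rq() issues the request and waits for it.
 *
 *	struct request *rq;
 *
 *	rq = blk_get_request(q, READ, GFP_KERNEL);
 *	if (!rq)
 *		return -ENOMEM;
 *	rq->cmd_type = REQ_TYPE_SPECIAL;
 *	rq->special = my_cmd;
 *	blk_execute_rq(q, NULL, rq, 0);
 *	blk_put_request(rq);
 */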

/**
 * blk_make_request - given a bio, allocate a corresponding struct request.
 * @q: target request queue
 * @bio:  The bio describing the memory mappings that will be submitted for IO.
 *        It may be a chained-bio properly constructed by block/bio layer.
 * @gfp_mask: gfp flags to be used for memory allocation
 *
 * blk_make_request is the parallel of generic_make_request for BLOCK_PC
 * type commands. Where the struct request needs to be farther initialized by
 * the caller. It is passed a &struct bio, which describes the memory info of
 * the I/O transfer.
 *
 * The caller of blk_make_request must make sure that bi_io_vec
 * are set to describe the memory buffers. That bio_data_dir() will return
 * the needed direction of the request. (And all bio's in the passed bio-chain
 * are properly set accordingly)
 *
 * If called under non-sleepable conditions, mapped bio buffers must not
 * need bouncing, by calling the appropriate masked or flagged allocator,
 * suitable for the target device. Otherwise the call to blk_queue_bounce will
 * BUG.
 *
 * WARNING: When allocating/cloning a bio-chain, careful consideration should be
 * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for
 * anything but the first bio in the chain. Otherwise you risk waiting for IO
 * completion of a bio that hasn't been submitted yet, thus resulting in a
 * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead
 * of bio_alloc(), as that avoids the mempool deadlock.
 * If possible a big IO should be split into smaller parts when allocation
 * fails. Partial allocation should not be an error, or you risk a live-lock.
 */
struct request *blk_make_request(struct request_queue *q, struct bio *bio,
				 gfp_t gfp_mask)
{
	struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask);

	if (unlikely(!rq))
		return ERR_PTR(-ENOMEM);

	for_each_bio(bio) {
		struct bio *bounce_bio = bio;
		int ret;

		blk_queue_bounce(q, &bounce_bio);
		ret = blk_rq_append_bio(q, rq, bounce_bio);
		if (unlikely(ret)) {
			blk_put_request(rq);
			return ERR_PTR(ret);
		}
	}

	return rq;
}
EXPORT_SYMBOL(blk_make_request);
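
/*
 * Example (illustrative sketch only): turning an already-built bio chain
 * into a BLOCK_PC style request.  "bio" is assumed to have been set up
 * by the caller (e.g. with bio_kmalloc() and bio_add_page()).
 *
 *	struct request *rq;
 *
 *	rq = blk_make_request(q, bio, GFP_KERNEL);
 *	if (IS_ERR(rq))
 *		return PTR_ERR(rq);
 *	rq->cmd_type = REQ_TYPE_BLOCK_PC;
 *	// fill in rq->cmd[] and rq->cmd_len here, then issue it
 *	blk_execute_rq(q, NULL, rq, 0);
 *	blk_put_request(rq);
 */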

/**
 * blk_requeue_request - put a request back on queue
 * @q:		request queue where request should be inserted
 * @rq:		request to be inserted
 *
 * Description:
 *    Drivers often keep queueing requests until the hardware cannot accept
 *    more, when that condition happens we need to put the request back
 *    on the queue. Must be called with queue lock held.
 */
void blk_requeue_request(struct request_queue *q, struct request *rq)
{
	blk_delete_timer(rq);
	blk_clear_rq_complete(rq);
	trace_block_rq_requeue(q, rq);

	if (blk_rq_tagged(rq))
		blk_queue_end_tag(q, rq);

	BUG_ON(blk_queued_rq(rq));

	elv_requeue_request(q, rq);
}
EXPORT_SYMBOL(blk_requeue_request);

static void add_acct_request(struct request_queue *q, struct request *rq,
			     int where)
{
	drive_stat_acct(rq, 1);
	__elv_add_request(q, rq, where);
}

/**
 * blk_insert_request - insert a special request into a request queue
 * @q:		request queue where request should be inserted
 * @rq:		request to be inserted
 * @at_head:	insert request at head or tail of queue
 * @data:	private data
 *
 * Description:
 *    Many block devices need to execute commands asynchronously, so they don't
 *    block the whole kernel from preemption during request execution.  This is
 *    accomplished normally by inserting artificial requests tagged as
 *    REQ_TYPE_SPECIAL into the corresponding request queue, and letting them
 *    be scheduled for actual execution by the request queue.
 *
 *    We have the option of inserting the head or the tail of the queue.
 *    Typically we use the tail for new ioctls and so forth.  We use the head
 *    of the queue for things like a QUEUE_FULL message from a device, or a
 *    host that is unable to accept a particular command.
 */
void blk_insert_request(struct request_queue *q, struct request *rq,
			int at_head, void *data)
{
	int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
	unsigned long flags;

	/*
	 * tell I/O scheduler that this isn't a regular read/write (ie it
	 * must not attempt merges on this) and that it acts as a soft
	 * barrier
	 */
	rq->cmd_type = REQ_TYPE_SPECIAL;

	rq->special = data;

	spin_lock_irqsave(q->queue_lock, flags);

	/*
	 * If command is tagged, release the tag
	 */
	if (blk_rq_tagged(rq))
		blk_queue_end_tag(q, rq);

	add_acct_request(q, rq, where);
	__blk_run_queue(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}
EXPORT_SYMBOL(blk_insert_request);
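
/*
 * Example (illustrative sketch only): queueing a driver-private command
 * at the head of the queue, e.g. in response to a QUEUE_FULL condition.
 * "my_private_cmd" is a hypothetical driver structure; it comes back to
 * the driver through rq->special.
 *
 *	struct request *rq = blk_get_request(q, READ, GFP_ATOMIC);
 *
 *	if (rq)
 *		blk_insert_request(q, rq, 1, my_private_cmd);
 */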

static void part_round_stats_single(int cpu, struct hd_struct *part,
				    unsigned long now)
{
	if (now == part->stamp)
		return;