Commit 5e57dc81 authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.dk/linux-block

Pull block IO fixes from Jens Axboe:
 "Second round of updates and fixes for 3.14-rc2.  Most of this stuff
  has been queued up for a while.  The notable exception is the blk-mq
  changes, which are naturally a bit more in flux still.

  The pull request contains:

   - Two bug fixes for the new immutable vecs, causing crashes with raid
     or swap.  From Kent.

   - Various blk-mq tweaks and fixes from Christoph.  A fix for
     integrity bios from Nic.

   - A few bcache fixes from Kent and Darrick Wong.

   - xen-blk{front,back} fixes from David Vrabel, Matt Rushton, Nicolas
     Swenson, and Roger Pau Monne.

   - Fix for a vec miscount with integrity vectors from Martin.

   - Minor annotations or fixes from Masanari Iida and Rashika Kheria.

   - Tweak to null_blk to do more normal FIFO processing of requests
     from Shlomo Pongratz.

   - Elevator switching bypass fix from Tejun.

   - Softlockup in blkdev_issue_discard() fix when !CONFIG_PREEMPT from
     me"

* 'for-linus' of git://git.kernel.dk/linux-block: (31 commits)
  block: add cond_resched() to potentially long running ioctl discard loop
  xen-blkback: init persistent_purge_work work_struct
  blk-mq: pair blk_mq_start_request / blk_mq_requeue_request
  blk-mq: dont assume rq->errors is set when returning an error from ->queue_rq
  block: Fix cloning of discard/write same bios
  block: Fix type mismatch in ssize_t_blk_mq_tag_sysfs_show
  blk-mq: rework flush sequencing logic
  null_blk: use blk_complete_request and blk_mq_complete_request
  virtio_blk: use blk_mq_complete_request
  blk-mq: rework I/O completions
  fs: Add prototype declaration to appropriate header file include/linux/bio.h
  fs: Mark function as static in fs/bio-integrity.c
  block/null_blk: Fix completion processing from LIFO to FIFO
  block: Explicitly handle discard/write same segments
  block: Fix nr_vecs for inline integrity vectors
  blk-mq: Add bio_integrity setup to blk_mq_make_request
  blk-mq: initialize sg_reserved_size
  blk-mq: handle dma_drain_size
  blk-mq: divert __blk_put_request for MQ ops
  blk-mq: support at_head inserations for blk_execute_rq
  ...
parents 0d25e369 c8123f8c
block/blk-core.c
@@ -693,11 +693,20 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 	if (!uninit_q)
 		return NULL;
 
+	uninit_q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL);
+	if (!uninit_q->flush_rq)
+		goto out_cleanup_queue;
+
 	q = blk_init_allocated_queue(uninit_q, rfn, lock);
 	if (!q)
-		blk_cleanup_queue(uninit_q);
+		goto out_free_flush_rq;
 
 	return q;
+
+out_free_flush_rq:
+	kfree(uninit_q->flush_rq);
+out_cleanup_queue:
+	blk_cleanup_queue(uninit_q);
+	return NULL;
 }
 EXPORT_SYMBOL(blk_init_queue_node);
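
For context: the hunk above moves blk_init_queue_node() to the kernel's standard goto-unwind error handling, where each successful allocation gains a label and a later failure jumps to the label that releases everything acquired so far, in reverse order. A minimal standalone sketch of the idiom (plain userspace C with hypothetical names, not kernel code):

    #include <stdlib.h>

    /* Two-step setup using the goto-unwind idiom from the hunk above. */
    static int setup(void)
    {
            char *a, *b;

            a = malloc(64);
            if (!a)
                    goto out;               /* nothing to undo yet */

            b = malloc(64);
            if (!b)
                    goto out_free_a;        /* undo only what already succeeded */

            /* ... use a and b ... */
            free(b);
            free(a);
            return 0;

    out_free_a:
            free(a);
    out:
            return -1;
    }

    int main(void)
    {
            return setup() ? 1 : 0;
    }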
@@ -1127,7 +1136,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw,
 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
 {
 	if (q->mq_ops)
-		return blk_mq_alloc_request(q, rw, gfp_mask, false);
+		return blk_mq_alloc_request(q, rw, gfp_mask);
 	else
 		return blk_old_get_request(q, rw, gfp_mask);
 }
@@ -1278,6 +1287,11 @@ void __blk_put_request(struct request_queue *q, struct request *req)
 	if (unlikely(!q))
 		return;
 
+	if (q->mq_ops) {
+		blk_mq_free_request(req);
+		return;
+	}
+
 	blk_pm_put_request(req);
 
 	elv_completed_request(q, req);
block/blk-exec.c
@@ -65,7 +65,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 	 * be resued after dying flag is set
 	 */
 	if (q->mq_ops) {
-		blk_mq_insert_request(q, rq, true);
+		blk_mq_insert_request(q, rq, at_head, true);
 		return;
 	}
block/blk-flush.c
@@ -130,20 +130,26 @@ static void blk_flush_restore_request(struct request *rq)
 	blk_clear_rq_complete(rq);
 }
 
-static void mq_flush_data_run(struct work_struct *work)
+static void mq_flush_run(struct work_struct *work)
 {
 	struct request *rq;
 
-	rq = container_of(work, struct request, mq_flush_data);
+	rq = container_of(work, struct request, mq_flush_work);
 
 	memset(&rq->csd, 0, sizeof(rq->csd));
 	blk_mq_run_request(rq, true, false);
 }
 
-static void blk_mq_flush_data_insert(struct request *rq)
+static bool blk_flush_queue_rq(struct request *rq)
 {
-	INIT_WORK(&rq->mq_flush_data, mq_flush_data_run);
-	kblockd_schedule_work(rq->q, &rq->mq_flush_data);
+	if (rq->q->mq_ops) {
+		INIT_WORK(&rq->mq_flush_work, mq_flush_run);
+		kblockd_schedule_work(rq->q, &rq->mq_flush_work);
+		return false;
+	} else {
+		list_add_tail(&rq->queuelist, &rq->q->queue_head);
+		return true;
+	}
 }
 
 /**
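
The new blk_flush_queue_rq() helper hides the mq/legacy split from its callers: on the blk-mq side it defers the next flush step to kblockd through a work item embedded in the request and returns false (the caller need not re-run the queue), while on the legacy side it queues the request directly and returns true. A kernel-context sketch of the embedded-work pattern it relies on (generic names; the real code embeds mq_flush_work in struct request and uses kblockd_schedule_work()):

    #include <linux/workqueue.h>

    /* Embed a work_struct so the worker can recover its container. */
    struct owner {
            struct work_struct work;        /* mq_flush_work in the real struct */
            /* ... request state ... */
    };

    static void owner_worker(struct work_struct *work)
    {
            struct owner *o = container_of(work, struct owner, work);

            /* re-issue o down the submission path, as mq_flush_run() does */
            pr_debug("deferred work for %p\n", o);
    }

    static void owner_defer(struct owner *o)
    {
            INIT_WORK(&o->work, owner_worker);
            schedule_work(&o->work);        /* the patch uses kblockd's queue */
    }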
@@ -187,12 +193,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
 
 	case REQ_FSEQ_DATA:
 		list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
-		if (q->mq_ops)
-			blk_mq_flush_data_insert(rq);
-		else {
-			list_add(&rq->queuelist, &q->queue_head);
-			queued = true;
-		}
+		queued = blk_flush_queue_rq(rq);
 		break;
 
 	case REQ_FSEQ_DONE:
@@ -216,9 +217,6 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
 	}
 
 	kicked = blk_kick_flush(q);
-	/* blk_mq_run_flush will run queue */
-	if (q->mq_ops)
-		return queued;
 	return kicked | queued;
 }
@@ -230,10 +228,9 @@ static void flush_end_io(struct request *flush_rq, int error)
 	struct request *rq, *n;
 	unsigned long flags = 0;
 
-	if (q->mq_ops) {
-		blk_mq_free_request(flush_rq);
+	if (q->mq_ops)
 		spin_lock_irqsave(&q->mq_flush_lock, flags);
-	}
+
 	running = &q->flush_queue[q->flush_running_idx];
 	BUG_ON(q->flush_pending_idx == q->flush_running_idx);
@@ -263,49 +260,14 @@ static void flush_end_io(struct request *flush_rq, int error)
 	 * kblockd.
 	 */
 	if (queued || q->flush_queue_delayed) {
-		if (!q->mq_ops)
-			blk_run_queue_async(q);
-		else
-		/*
-		 * This can be optimized to only run queues with requests
-		 * queued if necessary.
-		 */
-			blk_mq_run_queues(q, true);
+		WARN_ON(q->mq_ops);
+		blk_run_queue_async(q);
 	}
 	q->flush_queue_delayed = 0;
 	if (q->mq_ops)
 		spin_unlock_irqrestore(&q->mq_flush_lock, flags);
 }
 
-static void mq_flush_work(struct work_struct *work)
-{
-	struct request_queue *q;
-	struct request *rq;
-
-	q = container_of(work, struct request_queue, mq_flush_work);
-
-	/* We don't need set REQ_FLUSH_SEQ, it's for consistency */
-	rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ,
-		__GFP_WAIT|GFP_ATOMIC, true);
-	rq->cmd_type = REQ_TYPE_FS;
-	rq->end_io = flush_end_io;
-
-	blk_mq_run_request(rq, true, false);
-}
-
-/*
- * We can't directly use q->flush_rq, because it doesn't have tag and is not in
- * hctx->rqs[]. so we must allocate a new request, since we can't sleep here,
- * so offload the work to workqueue.
- *
- * Note: we assume a flush request finished in any hardware queue will flush
- * the whole disk cache.
- */
-static void mq_run_flush(struct request_queue *q)
-{
-	kblockd_schedule_work(q, &q->mq_flush_work);
-}
-
 /**
  * blk_kick_flush - consider issuing flush request
  * @q: request_queue being kicked
@@ -340,19 +302,31 @@ static bool blk_kick_flush(struct request_queue *q)
 	 * different from running_idx, which means flush is in flight.
 	 */
 	q->flush_pending_idx ^= 1;
+
 	if (q->mq_ops) {
-		mq_run_flush(q);
-		return true;
+		struct blk_mq_ctx *ctx = first_rq->mq_ctx;
+		struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
+
+		blk_mq_rq_init(hctx, q->flush_rq);
+		q->flush_rq->mq_ctx = ctx;
+
+		/*
+		 * Reuse the tag value from the fist waiting request,
+		 * with blk-mq the tag is generated during request
+		 * allocation and drivers can rely on it being inside
+		 * the range they asked for.
+		 */
+		q->flush_rq->tag = first_rq->tag;
+	} else {
+		blk_rq_init(q, q->flush_rq);
 	}
 
-	blk_rq_init(q, &q->flush_rq);
-	q->flush_rq.cmd_type = REQ_TYPE_FS;
-	q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
-	q->flush_rq.rq_disk = first_rq->rq_disk;
-	q->flush_rq.end_io = flush_end_io;
+	q->flush_rq->cmd_type = REQ_TYPE_FS;
+	q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
+	q->flush_rq->rq_disk = first_rq->rq_disk;
+	q->flush_rq->end_io = flush_end_io;
 
-	list_add_tail(&q->flush_rq.queuelist, &q->queue_head);
-	return true;
+	return blk_flush_queue_rq(q->flush_rq);
 }
 
 static void flush_data_end_io(struct request *rq, int error)
@@ -558,5 +532,4 @@ EXPORT_SYMBOL(blkdev_issue_flush);
 void blk_mq_init_flush(struct request_queue *q)
 {
 	spin_lock_init(&q->mq_flush_lock);
-	INIT_WORK(&q->mq_flush_work, mq_flush_work);
 }
block/blk-lib.c
@@ -119,6 +119,14 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 
 		atomic_inc(&bb.done);
 		submit_bio(type, bio);
+
+		/*
+		 * We can loop for a long time in here, if someone does
+		 * full device discards (like mkfs). Be nice and allow
+		 * us to schedule out to avoid softlocking if preempt
+		 * is disabled.
+		 */
+		cond_resched();
 	}
 	blk_finish_plug(&plug);
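
The loop patched above is exactly what a userspace full-device discard drives; on a large device it can run for a very long time, and without CONFIG_PREEMPT nothing else would get a chance to schedule, hence the cond_resched(). A minimal, runnable example of the kind of ioctl that triggers it (destructive; run against a scratch device only):

    /* discard.c - full-device discard via ioctl(BLKDISCARD); DESTROYS DATA. */
    #include <stdio.h>
    #include <stdint.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>           /* BLKDISCARD, BLKGETSIZE64 */

    int main(int argc, char **argv)
    {
            uint64_t range[2], size;
            int fd;

            if (argc != 2) {
                    fprintf(stderr, "usage: %s <block device>\n", argv[0]);
                    return 1;
            }
            fd = open(argv[1], O_WRONLY);
            if (fd < 0 || ioctl(fd, BLKGETSIZE64, &size) != 0) {
                    perror(argv[1]);
                    return 1;
            }
            range[0] = 0;           /* start offset, in bytes */
            range[1] = size;        /* length, in bytes: the whole device */
            if (ioctl(fd, BLKDISCARD, &range) != 0)
                    perror("BLKDISCARD");
            close(fd);
            return 0;
    }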
block/blk-merge.c
@@ -21,6 +21,16 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 	if (!bio)
 		return 0;
 
+	/*
+	 * This should probably be returning 0, but blk_add_request_payload()
+	 * (Christoph!!!!)
+	 */
+	if (bio->bi_rw & REQ_DISCARD)
+		return 1;
+
+	if (bio->bi_rw & REQ_WRITE_SAME)
+		return 1;
+
 	fbio = bio;
 	cluster = blk_queue_cluster(q);
 	seg_size = 0;
@@ -161,30 +171,60 @@ __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
 	*bvprv = *bvec;
 }
 
-/*
- * map a request to scatterlist, return number of sg entries setup. Caller
- * must make sure sg can hold rq->nr_phys_segments entries
- */
-int blk_rq_map_sg(struct request_queue *q, struct request *rq,
-		  struct scatterlist *sglist)
+static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
+			     struct scatterlist *sglist,
+			     struct scatterlist **sg)
 {
 	struct bio_vec bvec, bvprv = { NULL };
-	struct req_iterator iter;
-	struct scatterlist *sg;
+	struct bvec_iter iter;
 	int nsegs, cluster;
 
 	nsegs = 0;
 	cluster = blk_queue_cluster(q);
 
-	/*
-	 * for each bio in rq
-	 */
-	sg = NULL;
-	rq_for_each_segment(bvec, rq, iter) {
-		__blk_segment_map_sg(q, &bvec, sglist, &bvprv, &sg,
-				     &nsegs, &cluster);
-	} /* segments in rq */
+	if (bio->bi_rw & REQ_DISCARD) {
+		/*
+		 * This is a hack - drivers should be neither modifying the
+		 * biovec, nor relying on bi_vcnt - but because of
+		 * blk_add_request_payload(), a discard bio may or may not have
+		 * a payload we need to set up here (thank you Christoph) and
+		 * bi_vcnt is really the only way of telling if we need to.
+		 */
+		if (bio->bi_vcnt)
+			goto single_segment;
+
+		return 0;
+	}
+
+	if (bio->bi_rw & REQ_WRITE_SAME) {
+single_segment:
+		*sg = sglist;
+		bvec = bio_iovec(bio);
+		sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
+		return 1;
+	}
+
+	for_each_bio(bio)
+		bio_for_each_segment(bvec, bio, iter)
+			__blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg,
+					     &nsegs, &cluster);
+
+	return nsegs;
+}
+
+/*
+ * map a request to scatterlist, return number of sg entries setup. Caller
+ * must make sure sg can hold rq->nr_phys_segments entries
+ */
+int blk_rq_map_sg(struct request_queue *q, struct request *rq,
+		  struct scatterlist *sglist)
+{
+	struct scatterlist *sg = NULL;
+	int nsegs = 0;
+
+	if (rq->bio)
+		nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg);
 
 	if (unlikely(rq->cmd_flags & REQ_COPY_USER) &&
 	    (blk_rq_bytes(rq) & q->dma_pad_mask)) {
@@ -230,20 +270,13 @@ EXPORT_SYMBOL(blk_rq_map_sg);
 int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
 		   struct scatterlist *sglist)
 {
-	struct bio_vec bvec, bvprv = { NULL };
-	struct scatterlist *sg;
-	int nsegs, cluster;
-	struct bvec_iter iter;
-
-	nsegs = 0;
-	cluster = blk_queue_cluster(q);
-
-	sg = NULL;
-	bio_for_each_segment(bvec, bio, iter) {
-		__blk_segment_map_sg(q, &bvec, sglist, &bvprv, &sg,
-				     &nsegs, &cluster);
-	} /* segments in bio */
+	struct scatterlist *sg = NULL;
+	int nsegs;
+	struct bio *next = bio->bi_next;
+	bio->bi_next = NULL;
+
+	nsegs = __blk_bios_map_sg(q, bio, sglist, &sg);
+	bio->bi_next = next;
 
 	if (sg)
 		sg_mark_end(sg);
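
blk_bio_map_sg() is now a thin wrapper: because __blk_bios_map_sg() walks a whole bio chain via for_each_bio(), the wrapper temporarily clips bi_next so only the one bio is mapped, then restores it. The same clip-and-restore idiom in a self-contained sketch (generic list with hypothetical names, not kernel code):

    #include <stdio.h>

    struct node { struct node *next; int val; };

    /* Walks an entire chain, like for_each_bio() does. */
    static void visit_chain(const struct node *n)
    {
            for (; n; n = n->next)
                    printf("%d\n", n->val);
    }

    /* Visits exactly one element: clip ->next, walk, restore. */
    static void visit_one(struct node *n)
    {
            struct node *next = n->next;    /* save the rest of the chain */

            n->next = NULL;                 /* walker now stops after n */
            visit_chain(n);
            n->next = next;                 /* restore before returning */
    }

    int main(void)
    {
            struct node b = { NULL, 2 }, a = { &b, 1 };

            visit_one(&a);                  /* prints only 1 */
            return 0;
    }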
block/blk-mq-tag.c
@@ -184,7 +184,7 @@ void blk_mq_free_tags(struct blk_mq_tags *tags)
 ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
 {
 	char *orig_page = page;
-	int cpu;
+	unsigned int cpu;
 
 	if (!tags)
 		return 0;
block/blk-mq.c
@@ -226,15 +226,14 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
 	return rq;
 }
 
-struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
-		gfp_t gfp, bool reserved)
+struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp)
 {
 	struct request *rq;
 
 	if (blk_mq_queue_enter(q))
 		return NULL;
 
-	rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved);
+	rq = blk_mq_alloc_request_pinned(q, rw, gfp, false);
 	if (rq)
 		blk_mq_put_ctx(rq->mq_ctx);
 	return rq;
@@ -258,7 +257,7 @@ EXPORT_SYMBOL(blk_mq_alloc_reserved_request);
 /*
  * Re-init and set pdu, if we have it
  */
-static void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
+void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
 	blk_rq_init(hctx->queue, rq);
@@ -305,7 +304,7 @@ static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error)
 		bio_endio(bio, error);
 }
 
-void blk_mq_complete_request(struct request *rq, int error)
+void blk_mq_end_io(struct request *rq, int error)
 {
 	struct bio *bio = rq->bio;
 	unsigned int bytes = 0;
@@ -330,48 +329,55 @@ void blk_mq_complete_request(struct request *rq, int error)
 	else
 		blk_mq_free_request(rq);
 }
+EXPORT_SYMBOL(blk_mq_end_io);
 
-void __blk_mq_end_io(struct request *rq, int error)
-{
-	if (!blk_mark_rq_complete(rq))
-		blk_mq_complete_request(rq, error);
-}
-
-static void blk_mq_end_io_remote(void *data)
+static void __blk_mq_complete_request_remote(void *data)
 {
 	struct request *rq = data;
 
-	__blk_mq_end_io(rq, rq->errors);
+	rq->q->softirq_done_fn(rq);
 }
 
-/*
- * End IO on this request on a multiqueue enabled driver. We'll either do
- * it directly inline, or punt to a local IPI handler on the matching
- * remote CPU.
- */
-void blk_mq_end_io(struct request *rq, int error)
+void __blk_mq_complete_request(struct request *rq)
 {
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
 	int cpu;
 
-	if (!ctx->ipi_redirect)
-		return __blk_mq_end_io(rq, error);
+	if (!ctx->ipi_redirect) {
+		rq->q->softirq_done_fn(rq);
+		return;
+	}
 
 	cpu = get_cpu();
 	if (cpu != ctx->cpu && cpu_online(ctx->cpu)) {
-		rq->errors = error;
-		rq->csd.func = blk_mq_end_io_remote;
+		rq->csd.func = __blk_mq_complete_request_remote;
 		rq->csd.info = rq;
 		rq->csd.flags = 0;
 		__smp_call_function_single(ctx->cpu, &rq->csd, 0);
 	} else {
-		__blk_mq_end_io(rq, error);
+		rq->q->softirq_done_fn(rq);
 	}
 	put_cpu();
 }
-EXPORT_SYMBOL(blk_mq_end_io);
 
-static void blk_mq_start_request(struct request *rq)
+/**
+ * blk_mq_complete_request - end I/O on a request
+ * @rq:		the request being processed
+ *
+ * Description:
+ *	Ends all I/O on a request. It does not handle partial completions.
+ *	The actual completion happens out-of-order, through a IPI handler.
+ **/
+void blk_mq_complete_request(struct request *rq)
+{
+	if (unlikely(blk_should_fake_timeout(rq->q)))
+		return;
+	if (!blk_mark_rq_complete(rq))
+		__blk_mq_complete_request(rq);
+}
+EXPORT_SYMBOL(blk_mq_complete_request);
+
+static void blk_mq_start_request(struct request *rq, bool last)
 {
 	struct request_queue *q = rq->q;
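
With this split, a driver's interrupt handler only marks the request complete via blk_mq_complete_request(); the block layer then invokes q->softirq_done_fn, on the submitting CPU via IPI when ipi_redirect is set, and that handler finishes the request with blk_mq_end_io(). A hedged sketch of the driver side as of this series (the mydrv_* helpers are hypothetical, and how softirq_done_fn gets registered is assumed, not shown):

    /* Completion handler: runs as q->softirq_done_fn, possibly on the
     * CPU that submitted the request. End-of-IO work belongs here. */
    static void mydrv_request_done(struct request *rq)
    {
            int error = mydrv_result_to_errno(rq);  /* hypothetical decode */

            blk_mq_end_io(rq, error);
    }

    /* Hard-IRQ context: do the minimum and let the block layer pick
     * where the completion handler actually runs. */
    static irqreturn_t mydrv_irq(int irq, void *data)
    {
            struct request *rq;

            while ((rq = mydrv_pop_completed(data))) /* hypothetical ring pop */
                    blk_mq_complete_request(rq);

            return IRQ_HANDLED;
    }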
@@ -384,6 +390,25 @@ static void blk_mq_start_request(struct request *rq)
 	 */
 	rq->deadline = jiffies + q->rq_timeout;
 	set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+
+	if (q->dma_drain_size && blk_rq_bytes(rq)) {
+		/*
+		 * Make sure space for the drain appears. We know we can do
+		 * this because max_hw_segments has been adjusted to be one
+		 * fewer than the device can handle.
+		 */
+		rq->nr_phys_segments++;
+	}
+
+	/*
+	 * Flag the last request in the series so that drivers know when IO
+	 * should be kicked off, if they don't do it on a per-request basis.
+	 *
+	 * Note: the flag isn't the only condition drivers should do kick off.
+	 * If drive is busy, the last request might not have the bit set.
+	 */
+	if (last)
+		rq->cmd_flags |= REQ_END;
 }
 
 static void blk_mq_requeue_request(struct request *rq)
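
REQ_END moves from the dispatch loop into blk_mq_start_request() here, but the driver-side contract is unchanged: batch submissions and kick the hardware when the flag says the series is over. A hedged sketch of a queue_rq implementation of this era consuming the flag (the mydrv_* helpers are hypothetical; virtio-blk follows roughly this shape):

    static int mydrv_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
    {
            struct mydrv_queue *mq = hctx->driver_data;     /* hypothetical */

            if (!mydrv_add_to_ring(mq, rq))                 /* hypothetical */
                    return BLK_MQ_RQ_QUEUE_BUSY;            /* retried later */

            /*
             * Only notify the device on the last request of a series;
             * blk_mq_start_request() sets REQ_END when "last" is true.
             */
            if (rq->cmd_flags & REQ_END)
                    mydrv_kick_hardware(mq);                /* hypothetical */

            return BLK_MQ_RQ_QUEUE_OK;
    }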
@@ -392,6 +417,11 @@ static void blk_mq_requeue_request(struct request *rq)
 	trace_block_rq_requeue(q, rq);
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 
+	rq->cmd_flags &= ~REQ_END;
+
+	if (q->dma_drain_size && blk_rq_bytes(rq))
+		rq->nr_phys_segments--;
 }
 
 struct blk_mq_timeout_data {
@@ -559,19 +589,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 
 		rq = list_first_entry(&rq_list, struct request, queuelist);
 		list_del_init(&rq->queuelist);
-		blk_mq_start_request(rq);
 
-		/*
-		 * Last request in the series. Flag it as such, this
-		 * enables drivers to know when IO should be kicked off,
-		 * if they don't do it on a per-request basis.
-		 *
-		 * Note: the flag isn't the only condition drivers
-		 * should do kick off. If drive is busy, the last
-		 * request might not have the bit set.
-		 */
-		if (list_empty(&rq_list))
-			rq->cmd_flags |= REQ_END;
+		blk_mq_start_request(rq, list_empty(&rq_list));
 
 		ret = q->mq_ops->queue_rq(hctx, rq);
 		switch (ret) {
@@ -589,8 +608,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 			break;
 		default:
 			pr_err("blk-mq: bad return on queue: %d\n", ret);
-			rq->errors = -EIO;
 		case BLK_MQ_RQ_QUEUE_ERROR:
+			rq->errors = -EIO;
 			blk_mq_end_io(rq, rq->errors);
 			break;
 		}
@@ -693,13 +712,16 @@ static void blk_mq_work_fn(struct work_struct *work)
 }
 
 static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
-				    struct request *rq)
+				    struct request *rq, bool at_head)
 {
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
 
 	trace_block_rq_insert(hctx->queue, rq);
 
-	list_add_tail(&rq->queuelist, &ctx->rq_list);
+	if (at_head)
+		list_add(&rq->queuelist, &ctx->rq_list);
+	else
+		list_add_tail(&rq->queuelist, &ctx->rq_list);
+
 	blk_mq_hctx_mark_pending(hctx, ctx);
 
 	/*
@@ -709,7 +731,7 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
 }
 
 void blk_mq_insert_request(struct request_queue *q, struct request *rq,
-			   bool run_queue)
+			   bool at_head, bool run_queue)
 {
 	struct blk_mq_hw_ctx *hctx;
 	struct blk_mq_ctx *ctx, *current_ctx;
@@ -728,7 +750,7 @@ void blk_mq_insert_request(struct request_queue *q, struct request *rq,
 		rq->mq_ctx = ctx;
 	}
 	spin_lock(&ctx->lock);
-	__blk_mq_insert_request(hctx, rq);
+	__blk_mq_insert_request(hctx, rq, at_head);
 	spin_unlock(&ctx->lock);
 
 	blk_mq_put_ctx(current_ctx);
@@ -760,7 +782,7 @@ void blk_mq_run_request(struct request *rq, bool run_queue, bool async)
 
 	/* ctx->cpu might be offline */
 	spin_lock(&ctx->lock);
-	__blk_mq_insert_request(hctx, rq);
+	__blk_mq_insert_request(hctx, rq, false);
 	spin_unlock(&ctx->lock);
 
 	blk_mq_put_ctx(current_ctx);
@@ -798,7 +820,7 @@ static void blk_mq_insert_requests(struct request_queue *q,
 		rq = list_first_entry(list, struct request, queuelist);
 		list_del_init(&rq->queuelist);
 		rq->mq_ctx = ctx;
-		__blk_mq_insert_request(hctx, rq);
+		__blk_mq_insert_request(hctx, rq, false);
 	}
 	spin_unlock(&ctx->lock);
@@ -888,6 +910,11 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
 	blk_queue_bounce(q, &bio);
 
+	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
+		bio_endio(bio, -EIO);
+		return;
+	}
+
 	if (use_plug && blk_attempt_plug_merge(q, bio, &request_count))
 		return;
@@ -950,7 +977,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		__blk_mq_free_request(hctx, ctx, rq);
 	else {
 		blk_mq_bio_to_request(rq, bio);
-		__blk_mq_insert_request(hctx, rq);
+		__blk_mq_insert_request(hctx, rq, false);
 	}
 
 	spin_unlock(&ctx->lock);
@@ -1309,15 +1336,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
 		reg->queue_depth = BLK_MQ_MAX_DEPTH;
 	}
 
-	/*
-	 * Set aside a tag for flush requests.  It will only be used while
-	 * another flush request is in progress but outside the driver.
-	 *
-	 * TODO: only allocate if flushes are supported
-	 */
-	reg->queue_depth++;
-	reg->reserved_tags++;
-
 	if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
 		return ERR_PTR(-EINVAL);
@@ -1360,17 +1378,27 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
 	q->mq_ops = reg->ops;
 	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;