Commit 1d02369d authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe:
 "Final round of fixes for this merge window - some of this has come up
  after the initial pull request, and some of it was put in a post-merge
  branch before the merge window.

  This contains:

   - Fix for a bad check for an error on dma mapping in the mtip32xx
     driver, from Alexey Khoroshilov.

   - A set of fixes for lightnvm, from Javier, Matias, and Wenwei.

   - An NVMe completion record corruption fix from Marta, ensuring that
     we read things in the right order.

   - Two writeback fixes from Tejun, marked for stable@ as well.

   - A blk-mq sw queue iterator fix from Thomas, fixing an oops for
     sparse CPU maps.  They hit this in the hot plug/unplug rework"

* 'for-linus' of git://git.kernel.dk/linux-block:
  nvme: avoid cqe corruption when update at the same time as read
  writeback, cgroup: fix use of the wrong bdi_writeback which mismatches the inode
  writeback, cgroup: fix premature wb_put() in locked_inode_to_wb_and_lock_list()
  blk-mq: Use proper cpumask iterator
  mtip32xx: fix checks for dma mapping errors
  lightnvm: do not load L2P table if not supported
  lightnvm: do not reserve lun on l2p loading
  nvme: lightnvm: return ppa completion status
  lightnvm: add a bitmap of luns
  lightnvm: specify target's logical address area
  null_blk: add lightnvm null_blk device to the nullb_list
parents 8f40842e d783e0bd
@@ -416,12 +416,14 @@ void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
 static void blk_mq_sysfs_init(struct request_queue *q)
 {
         struct blk_mq_ctx *ctx;
-        int i;
+        int cpu;
 
         kobject_init(&q->mq_kobj, &blk_mq_ktype);
 
-        queue_for_each_ctx(q, ctx, i)
+        for_each_possible_cpu(cpu) {
+                ctx = per_cpu_ptr(q->queue_ctx, cpu);
                 kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
+        }
 }
 
 int blk_mq_register_disk(struct gendisk *disk)
...
@@ -1798,11 +1798,12 @@ static void blk_mq_map_swqueue(struct request_queue *q,
         /*
          * Map software to hardware queues
          */
-        queue_for_each_ctx(q, ctx, i) {
+        for_each_possible_cpu(i) {
                 /* If the cpu isn't online, the cpu is mapped to first hctx */
                 if (!cpumask_test_cpu(i, online_mask))
                         continue;
 
+                ctx = per_cpu_ptr(q->queue_ctx, i);
                 hctx = q->mq_ops->map_queue(q, i);
                 cpumask_set_cpu(i, hctx->cpumask);
...
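The two blk-mq hunks above replace queue_for_each_ctx(), which walks a dense 0..nr_queues-1 index, with for_each_possible_cpu(), which walks the actual possible-CPU mask. A minimal userspace sketch of why the dense walk oopses on sparse maps; all names here are illustrative, not kernel API:

    #include <stdio.h>

    #define NR_CPUS 8

    int main(void)
    {
            /* CPUs 0, 2 and 5 are "possible" -- a sparse map, as seen on
             * some hotplug-capable systems. */
            unsigned possible_mask = (1u << 0) | (1u << 2) | (1u << 5);
            int nr_possible = 3;
            int per_cpu_data[NR_CPUS] = { 0 };

            /* Buggy pattern (the old queue_for_each_ctx analogue): a dense
             * loop index is not a CPU id; "cpu" 1 is not in the mask, so
             * the per-cpu area it indexes was never set up. */
            for (int i = 0; i < nr_possible; i++)
                    printf("dense walk visits cpu %d (possible: %s)\n",
                           i, (possible_mask & (1u << i)) ? "yes" : "NO");

            /* Fixed pattern (the for_each_possible_cpu analogue): walk the
             * mask itself, so only valid ids are used as indices. */
            for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                    if (!(possible_mask & (1u << cpu)))
                            continue;
                    per_cpu_data[cpu]++;
                    printf("mask walk visits cpu %d\n", cpu);
            }
            return 0;
    }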
@@ -2051,7 +2051,7 @@ static int exec_drive_taskfile(struct driver_data *dd,
                                            outbuf,
                                            taskout,
                                            DMA_TO_DEVICE);
-                if (outbuf_dma == 0) {
+                if (pci_dma_mapping_error(dd->pdev, outbuf_dma)) {
                         err = -ENOMEM;
                         goto abort;
                 }
@@ -2068,7 +2068,7 @@ static int exec_drive_taskfile(struct driver_data *dd,
                 inbuf_dma = pci_map_single(dd->pdev,
                                            inbuf,
                                            taskin, DMA_FROM_DEVICE);
-                if (inbuf_dma == 0) {
+                if (pci_dma_mapping_error(dd->pdev, inbuf_dma)) {
                         err = -ENOMEM;
                         goto abort;
                 }
...
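The mtip32xx hunks above stop treating a bus address of 0 as the failure indicator: 0 can be a perfectly valid DMA address, and the DMA API reserves a dedicated error cookie instead. A minimal sketch of the generic pattern (the driver uses the pci_* wrappers; this helper and its callers are illustrative, and it only compiles in-tree):

    #include <linux/dma-mapping.h>
    #include <linux/errno.h>

    /* Illustrative helper, not mtip32xx code: map a buffer and check the
     * result the only portable way.  Comparing the handle against 0 can
     * both miss real failures and reject a valid mapping at address 0. */
    static int example_map_buffer(struct device *dev, void *buf, size_t len,
                                  dma_addr_t *handle)
    {
            *handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
            if (dma_mapping_error(dev, *handle))
                    return -ENOMEM;
            return 0;
    }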
@@ -742,10 +742,11 @@ static int null_add_dev(void)
         add_disk(disk);
 
+done:
         mutex_lock(&lock);
         list_add_tail(&nullb->list, &nullb_list);
         mutex_unlock(&lock);
 
-done:
         return 0;
 
 out_cleanup_lightnvm:
...
@@ -464,8 +464,13 @@ static int nvm_core_init(struct nvm_dev *dev)
         dev->nr_luns = dev->luns_per_chnl * dev->nr_chnls;
         dev->total_secs = dev->nr_luns * dev->sec_per_lun;
+        dev->lun_map = kcalloc(BITS_TO_LONGS(dev->nr_luns),
+                                        sizeof(unsigned long), GFP_KERNEL);
+        if (!dev->lun_map)
+                return -ENOMEM;
+
         INIT_LIST_HEAD(&dev->online_targets);
         mutex_init(&dev->mlock);
         spin_lock_init(&dev->lock);
 
         return 0;
 }
@@ -585,6 +590,7 @@ int nvm_register(struct request_queue *q, char *disk_name,
         return 0;
 err_init:
+        kfree(dev->lun_map);
         kfree(dev);
         return ret;
 }
@@ -607,6 +613,7 @@ void nvm_unregister(char *disk_name)
         up_write(&nvm_lock);
 
         nvm_exit(dev);
+        kfree(dev->lun_map);
         kfree(dev);
 }
 EXPORT_SYMBOL(nvm_unregister);
...
@@ -20,6 +20,68 @@
 
 #include "gennvm.h"
 
+static int gennvm_get_area(struct nvm_dev *dev, sector_t *lba, sector_t len)
+{
+        struct gen_nvm *gn = dev->mp;
+        struct gennvm_area *area, *prev, *next;
+        sector_t begin = 0;
+        sector_t max_sectors = (dev->sec_size * dev->total_secs) >> 9;
+
+        if (len > max_sectors)
+                return -EINVAL;
+
+        area = kmalloc(sizeof(struct gennvm_area), GFP_KERNEL);
+        if (!area)
+                return -ENOMEM;
+
+        prev = NULL;
+
+        spin_lock(&dev->lock);
+        list_for_each_entry(next, &gn->area_list, list) {
+                if (begin + len > next->begin) {
+                        begin = next->end;
+                        prev = next;
+                        continue;
+                }
+                break;
+        }
+
+        if ((begin + len) > max_sectors) {
+                spin_unlock(&dev->lock);
+                kfree(area);
+                return -EINVAL;
+        }
+
+        area->begin = *lba = begin;
+        area->end = begin + len;
+
+        if (prev) /* insert into sorted order */
+                list_add(&area->list, &prev->list);
+        else
+                list_add(&area->list, &gn->area_list);
+        spin_unlock(&dev->lock);
+
+        return 0;
+}
+
+static void gennvm_put_area(struct nvm_dev *dev, sector_t begin)
+{
+        struct gen_nvm *gn = dev->mp;
+        struct gennvm_area *area;
+
+        spin_lock(&dev->lock);
+        list_for_each_entry(area, &gn->area_list, list) {
+                if (area->begin != begin)
+                        continue;
+
+                list_del(&area->list);
+                spin_unlock(&dev->lock);
+                kfree(area);
+                return;
+        }
+        spin_unlock(&dev->lock);
+}
+
 static void gennvm_blocks_free(struct nvm_dev *dev)
 {
         struct gen_nvm *gn = dev->mp;
@@ -195,7 +257,7 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn)
                 }
         }
 
-        if (dev->ops->get_l2p_tbl) {
+        if ((dev->identity.dom & NVM_RSP_L2P) && dev->ops->get_l2p_tbl) {
                 ret = dev->ops->get_l2p_tbl(dev, 0, dev->total_secs,
                                                 gennvm_block_map, dev);
                 if (ret) {
@@ -229,6 +291,7 @@ static int gennvm_register(struct nvm_dev *dev)
         gn->dev = dev;
         gn->nr_luns = dev->nr_luns;
+        INIT_LIST_HEAD(&gn->area_list);
         dev->mp = gn;
 
         ret = gennvm_luns_init(dev, gn);
@@ -419,10 +482,23 @@ static int gennvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk,
         return nvm_erase_ppa(dev, &addr, 1);
 }
 
+static int gennvm_reserve_lun(struct nvm_dev *dev, int lunid)
+{
+        return test_and_set_bit(lunid, dev->lun_map);
+}
+
+static void gennvm_release_lun(struct nvm_dev *dev, int lunid)
+{
+        WARN_ON(!test_and_clear_bit(lunid, dev->lun_map));
+}
+
 static struct nvm_lun *gennvm_get_lun(struct nvm_dev *dev, int lunid)
 {
         struct gen_nvm *gn = dev->mp;
 
+        if (unlikely(lunid >= dev->nr_luns))
+                return NULL;
+
         return &gn->luns[lunid].vlun;
 }
@@ -464,7 +540,13 @@ static struct nvmm_type gennvm = {
         .erase_blk              = gennvm_erase_blk,
 
         .get_lun                = gennvm_get_lun,
+        .reserve_lun            = gennvm_reserve_lun,
+        .release_lun            = gennvm_release_lun,
         .lun_info_print         = gennvm_lun_info_print,
+
+        .get_area               = gennvm_get_area,
+        .put_area               = gennvm_put_area,
+
 };
 
 static int __init gennvm_module_init(void)
...
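gennvm_get_area() above is a first-fit allocator over a list of [begin, end) reservations kept in ascending order: it slides begin past each area that would overlap and claims the first gap that fits. A runnable userspace model of the same scan, simplified to a singly linked list with no locking; all names here are illustrative:

    #include <stdio.h>
    #include <stdlib.h>

    struct area { unsigned long begin, end; struct area *next; };

    static struct area *head;

    static int get_area(unsigned long len, unsigned long max, unsigned long *out)
    {
            unsigned long begin = 0;
            struct area **link = &head, *a;

            for (a = head; a; link = &a->next, a = a->next) {
                    if (begin + len > a->begin)     /* would overlap: skip past */
                            begin = a->end;
                    else
                            break;                  /* first fitting gap found */
            }
            if (begin + len > max)
                    return -1;

            a = malloc(sizeof(*a));
            if (!a)
                    return -1;
            a->begin = *out = begin;
            a->end = begin + len;
            a->next = *link;                        /* splice in, keeping order */
            *link = a;
            return 0;
    }

    int main(void)
    {
            unsigned long b;

            /* Three requests of 100 sectors land at 0, 100 and 200. */
            for (int i = 0; i < 3; i++)
                    if (!get_area(100, 1000, &b))
                            printf("area %d starts at %lu\n", i, b);
            return 0;
    }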
@@ -39,8 +39,14 @@ struct gen_nvm {
 
         int nr_luns;
         struct gen_lun *luns;
+        struct list_head area_list;
 };
 
+struct gennvm_area {
+        struct list_head list;
+        sector_t begin;
+        sector_t end;   /* end is excluded */
+};
 #define gennvm_for_each_lun(bm, lun, i) \
                 for ((i) = 0, lun = &(bm)->luns[0]; \
                         (i) < (bm)->nr_luns; (i)++, lun = &(bm)->luns[(i)])
...
@@ -965,25 +965,11 @@ static void rrpc_requeue(struct work_struct *work)
 static void rrpc_gc_free(struct rrpc *rrpc)
 {
-        struct rrpc_lun *rlun;
-        int i;
-
         if (rrpc->krqd_wq)
                 destroy_workqueue(rrpc->krqd_wq);
 
         if (rrpc->kgc_wq)
                 destroy_workqueue(rrpc->kgc_wq);
-
-        if (!rrpc->luns)
-                return;
-
-        for (i = 0; i < rrpc->nr_luns; i++) {
-                rlun = &rrpc->luns[i];
-
-                if (!rlun->blocks)
-                        break;
-                vfree(rlun->blocks);
-        }
 }
 
 static int rrpc_gc_init(struct rrpc *rrpc)
@@ -1053,8 +1039,11 @@ static int rrpc_map_init(struct rrpc *rrpc)
 {
         struct nvm_dev *dev = rrpc->dev;
         sector_t i;
+        u64 slba;
         int ret;
 
+        slba = rrpc->soffset >> (ilog2(dev->sec_size) - 9);
+
         rrpc->trans_map = vzalloc(sizeof(struct rrpc_addr) * rrpc->nr_sects);
         if (!rrpc->trans_map)
                 return -ENOMEM;
@@ -1076,7 +1065,7 @@ static int rrpc_map_init(struct rrpc *rrpc)
                 return 0;
 
         /* Bring up the mapping table from device */
-        ret = dev->ops->get_l2p_tbl(dev, 0, dev->total_secs, rrpc_l2p_update,
+        ret = dev->ops->get_l2p_tbl(dev, slba, rrpc->nr_sects, rrpc_l2p_update,
                                                                         rrpc);
         if (ret) {
                 pr_err("nvm: rrpc: could not read L2P table.\n");
@@ -1086,7 +1075,6 @@ static int rrpc_map_init(struct rrpc *rrpc)
         return 0;
 }
 
-
 /* Minimum pages needed within a lun */
 #define PAGE_POOL_SIZE 16
 #define ADDR_POOL_SIZE 64
@@ -1141,6 +1129,23 @@ static void rrpc_core_free(struct rrpc *rrpc)
 static void rrpc_luns_free(struct rrpc *rrpc)
 {
+        struct nvm_dev *dev = rrpc->dev;
+        struct nvm_lun *lun;
+        struct rrpc_lun *rlun;
+        int i;
+
+        if (!rrpc->luns)
+                return;
+
+        for (i = 0; i < rrpc->nr_luns; i++) {
+                rlun = &rrpc->luns[i];
+                lun = rlun->parent;
+                if (!lun)
+                        break;
+                dev->mt->release_lun(dev, lun->id);
+                vfree(rlun->blocks);
+        }
+
         kfree(rrpc->luns);
 }
@@ -1148,7 +1153,7 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end)
 {
         struct nvm_dev *dev = rrpc->dev;
         struct rrpc_lun *rlun;
-        int i, j;
+        int i, j, ret = -EINVAL;
 
         if (dev->sec_per_blk > MAX_INVALID_PAGES_STORAGE * BITS_PER_LONG) {
                 pr_err("rrpc: number of pages per block too high.");
@@ -1164,25 +1169,26 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end)
         /* 1:1 mapping */
         for (i = 0; i < rrpc->nr_luns; i++) {
-                struct nvm_lun *lun = dev->mt->get_lun(dev, lun_begin + i);
-
-                rlun = &rrpc->luns[i];
-                rlun->rrpc = rrpc;
-                rlun->parent = lun;
-                INIT_LIST_HEAD(&rlun->prio_list);
-                INIT_LIST_HEAD(&rlun->open_list);
-                INIT_LIST_HEAD(&rlun->closed_list);
+                int lunid = lun_begin + i;
+                struct nvm_lun *lun;
 
-                INIT_WORK(&rlun->ws_gc, rrpc_lun_gc);
-                spin_lock_init(&rlun->lock);
+                if (dev->mt->reserve_lun(dev, lunid)) {
+                        pr_err("rrpc: lun %u is already allocated\n", lunid);
+                        goto err;
+                }
 
-                rrpc->total_blocks += dev->blks_per_lun;
-                rrpc->nr_sects += dev->sec_per_lun;
+                lun = dev->mt->get_lun(dev, lunid);
+                if (!lun)
+                        goto err;
 
+                rlun = &rrpc->luns[i];
+                rlun->parent = lun;
                 rlun->blocks = vzalloc(sizeof(struct rrpc_block) *
                                                 rrpc->dev->blks_per_lun);
-                if (!rlun->blocks)
+                if (!rlun->blocks) {
+                        ret = -ENOMEM;
                         goto err;
+                }
 
                 for (j = 0; j < rrpc->dev->blks_per_lun; j++) {
                         struct rrpc_block *rblk = &rlun->blocks[j];
@@ -1193,11 +1199,43 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end)
                         INIT_LIST_HEAD(&rblk->prio);
                         spin_lock_init(&rblk->lock);
                 }
+
+                rlun->rrpc = rrpc;
+                INIT_LIST_HEAD(&rlun->prio_list);
+                INIT_LIST_HEAD(&rlun->open_list);
+                INIT_LIST_HEAD(&rlun->closed_list);
+
+                INIT_WORK(&rlun->ws_gc, rrpc_lun_gc);
+                spin_lock_init(&rlun->lock);
+
+                rrpc->total_blocks += dev->blks_per_lun;
+                rrpc->nr_sects += dev->sec_per_lun;
+
         }
 
         return 0;
 err:
-        return -ENOMEM;
+        return ret;
 }
 
+/* returns 0 on success and stores the beginning address in *begin */
+static int rrpc_area_init(struct rrpc *rrpc, sector_t *begin)
+{
+        struct nvm_dev *dev = rrpc->dev;
+        struct nvmm_type *mt = dev->mt;
+        sector_t size = rrpc->nr_sects * dev->sec_size;
+
+        size >>= 9;
+
+        return mt->get_area(dev, begin, size);
+}
+
+static void rrpc_area_free(struct rrpc *rrpc)
+{
+        struct nvm_dev *dev = rrpc->dev;
+        struct nvmm_type *mt = dev->mt;
+
+        mt->put_area(dev, rrpc->soffset);
+}
+
 static void rrpc_free(struct rrpc *rrpc)
@@ -1206,6 +1244,7 @@ static void rrpc_free(struct rrpc *rrpc)
         rrpc_map_free(rrpc);
         rrpc_core_free(rrpc);
         rrpc_luns_free(rrpc);
+        rrpc_area_free(rrpc);
         kfree(rrpc);
 }
@@ -1327,6 +1366,7 @@ static void *rrpc_init(struct nvm_dev *dev, struct gendisk *tdisk,
         struct request_queue *bqueue = dev->q;
         struct request_queue *tqueue = tdisk->queue;
         struct rrpc *rrpc;
+        sector_t soffset;
         int ret;
 
         if (!(dev->identity.dom & NVM_RSP_L2P)) {
@@ -1352,6 +1392,13 @@ static void *rrpc_init(struct nvm_dev *dev, struct gendisk *tdisk,
         /* simple round-robin strategy */
         atomic_set(&rrpc->next_lun, -1);
 
+        ret = rrpc_area_init(rrpc, &soffset);
+        if (ret < 0) {
+                pr_err("nvm: rrpc: could not initialize area\n");
+                return ERR_PTR(ret);
+        }
+        rrpc->soffset = soffset;
+
         ret = rrpc_luns_init(rrpc, lun_begin, lun_end);
         if (ret) {
                 pr_err("nvm: rrpc: could not initialize luns\n");
...
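rrpc keeps soffset in 512-byte sectors while the device L2P table is indexed in dev->sec_size-byte sectors, hence the shift by ilog2(dev->sec_size) - 9 in rrpc_map_init() and the size >>= 9 (bytes to 512-byte sectors) in rrpc_area_init(). A worked check of the slba conversion, assuming a 4 KiB device sector; this userspace sketch stands a compiler builtin in for the kernel's ilog2():

    #include <stdio.h>

    int main(void)
    {
            unsigned sec_size = 4096;          /* stand-in for dev->sec_size */
            unsigned long long soffset = 1024; /* 1024 * 512 B = 512 KiB in */

            /* ilog2(4096) = 12, minus 9 for the 512 B base unit -> shift by
             * 3, i.e. divide by 8: eight 512 B sectors per 4 KiB sector. */
            int shift = __builtin_ctz(sec_size) - 9;
            unsigned long long slba = soffset >> shift;

            printf("slba = %llu device sectors\n", slba); /* prints 128 */
            return 0;
    }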
@@ -97,6 +97,7 @@ struct rrpc {
         struct nvm_dev *dev;
         struct gendisk *disk;
 
+        sector_t soffset; /* logical sector offset */
         u64 poffset; /* physical page offset */
         int lun_offset;
...
@@ -146,6 +146,14 @@ struct nvme_nvm_command {
         };
 };
 
+struct nvme_nvm_completion {
+        __le64  result;         /* Used by LightNVM to return ppa completions */
+        __le16  sq_head;        /* how much of this queue may be reclaimed */
+        __le16  sq_id;          /* submission queue that generated this entry */
+        __u16   command_id;     /* of the command which completed */
+        __le16  status;         /* did the command fail, and if so, why? */
+};
+
 #define NVME_NVM_LP_MLC_PAIRS 886
 struct nvme_nvm_lp_mlc {
         __u16 num_pairs;
@@ -507,6 +515,10 @@ static inline void nvme_nvm_rqtocmd(struct request *rq, struct nvm_rq *rqd,
 static void nvme_nvm_end_io(struct request *rq, int error)
 {
         struct nvm_rq *rqd = rq->end_io_data;
+        struct nvme_nvm_completion *cqe = rq->special;
+
+        if (cqe)
+                rqd->ppa_status = le64_to_cpu(cqe->result);
 
         nvm_end_io(rqd, error);
@@ -526,7 +538,8 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
         if (IS_ERR(rq))
                 return -ENOMEM;
 
-        cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL);
+        cmd = kzalloc(sizeof(struct nvme_nvm_command) +
+                                sizeof(struct nvme_nvm_completion), GFP_KERNEL);
         if (!cmd) {
                 blk_mq_free_request(rq);
                 return -ENOMEM;
@@ -545,7 +558,7 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
         rq->cmd = (unsigned char *)cmd;
         rq->cmd_len = sizeof(struct nvme_nvm_command);
-        rq->special = (void *)0;
+        rq->special = cmd + 1;
 
         rq->end_io_data = rqd;
...
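The lightnvm hunks above make room for a per-request completion record by sizing one kzalloc() for both structures and pointing rq->special at cmd + 1, the first byte past the command. A runnable userspace model of that co-allocation trick, with stand-in types:

    #include <stdio.h>
    #include <stdlib.h>

    struct cmd { unsigned char opcode; unsigned char pad[63]; }; /* 64 bytes */
    struct cpl { unsigned long long result; };

    int main(void)
    {
            /* One zeroed allocation sized for both structures... */
            struct cmd *cmd = calloc(1, sizeof(struct cmd) + sizeof(struct cpl));
            if (!cmd)
                    return 1;

            /* ..."cmd + 1" is the first byte past the command, i.e. the
             * completion area.  Alignment works out here because struct cpl
             * needs no stricter alignment than the 64-byte command. */
            struct cpl *cpl = (struct cpl *)(cmd + 1);
            cpl->result = 0xdeadbeef;

            printf("cmd=%p cpl=%p delta=%zu bytes\n", (void *)cmd, (void *)cpl,
                   (size_t)((char *)cpl - (char *)cmd));
            free(cmd);
            return 0;
    }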
@@ -723,6 +723,13 @@ static void nvme_complete_rq(struct request *req)
         blk_mq_end_request(req, error);
 }
 
+/* We read the CQE phase first to check if the rest of the entry is valid */
+static inline bool nvme_cqe_valid(struct nvme_queue *nvmeq, u16 head,
+                u16 phase)
+{
+        return (le16_to_cpu(nvmeq->cqes[head].status) & 1) == phase;
+}
+
 static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
 {
         u16 head, phase;
@@ -730,13 +737,10 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
         head = nvmeq->cq_head;
         phase = nvmeq->cq_phase;
 
-        for (;;) {
+        while (nvme_cqe_valid(nvmeq, head, phase)) {
                 struct nvme_completion cqe = nvmeq->cqes[head];
-                u16 status = le16_to_cpu(cqe.status);
                 struct request *req;
 
-                if ((status & 1) != phase)
-                        break;
-
                 if (++head == nvmeq->q_depth) {
                         head = 0;
                         phase = !phase;
@@ -767,7 +771,7 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
                 req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id);
                 if (req->cmd_type == REQ_TYPE_DRV_PRIV && req->special)
                         memcpy(req->special, &cqe, sizeof(cqe));
-                blk_mq_complete_request(req, status >> 1);
+                blk_mq_complete_request(req, le16_to_cpu(cqe.status) >> 1);
         }
@@ -808,18 +812,16 @@ static irqreturn_t nvme_irq(int irq, void *data)
 static irqreturn_t nvme_irq_check(int irq, void *data)
 {
         struct nvme_queue *nvmeq = data;
-        struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
-        if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
-                return IRQ_NONE;
-        return IRQ_WAKE_THREAD;
+
+        if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase))
+                return IRQ_WAKE_THREAD;
+        return IRQ_NONE;
 }
 
 static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
 {
         struct nvme_queue *nvmeq = hctx->driver_data;
 
-        if ((le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) ==
-            nvmeq->cq_phase) {
+        if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
                 spin_lock_irq(&nvmeq->q_lock);
                 __nvme_process_cq(nvmeq, &tag);
                 spin_unlock_irq(&nvmeq->q_lock);
...
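The pci.c hunks above centralise the phase-bit test in nvme_cqe_valid() and read the status word (which carries the phase) before any other field of the entry is trusted, so a CQE that is still being written can no longer be half-consumed. A single-threaded userspace model of the phase protocol; barriers are omitted and the field names mimic but do not reproduce the NVMe structures:

    #include <stdio.h>

    #define QD 4

    struct cqe { unsigned short cid; unsigned short status; /* bit 0 = phase */ };

    static struct cqe ring[QD];

    /* Consumer-side validity test: an entry belongs to the current lap of
     * the ring only if its phase bit matches the expected phase. */
    static int cqe_valid(unsigned head, unsigned phase)
    {
            return (ring[head].status & 1) == phase;
    }

    int main(void)
    {
            unsigned head = 0, phase = 1;

            /* Producer posts two completions with phase = 1; the producer
             * writes the body first and flips the phase bit last. */
            ring[0] = (struct cqe){ .cid = 10, .status = 1 };
            ring[1] = (struct cqe){ .cid = 11, .status = 1 };

            /* Consumer drains only entries whose phase matches, and expects
             * the inverted phase after wrapping around the queue. */
            while (cqe_valid(head, phase)) {
                    printf("completed cid %u\n", ring[head].cid);
                    if (++head == QD) {
                            head = 0;
                            phase = !phase;
                    }
            }
            printf("stopped at head %u\n", head);
            return 0;
    }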
@@ -281,13 +281,15 @@ locked_inode_to_wb_and_lock_list(struct inode *inode)
                 wb_get(wb);
                 spin_unlock(&inode->i_lock);
                 spin_lock(&wb->list_lock);
-                wb_put(wb);             /* not gonna deref it anymore */
 
                 /* i_wb may have changed inbetween, can't use inode_to_wb() */
-                if (likely(wb == inode->i_wb))
-                        return wb;      /* @inode already has ref */
+                if (likely(wb == inode->i_wb)) {
+                        wb_put(wb);     /* @inode already has ref */
+                        return wb;
+                }
 
                 spin_unlock(&wb->list_lock);
+                wb_put(wb);
                 cpu_relax();
                 spin_lock(&inode->i_lock);
         }
@@ -1337,10 +1339,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
  * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode()
  * and does more profound writeback list handling in writeback_sb_inodes().
  */
-static int
-writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
-                       struct writeback_control *wbc)
+static int writeback_single_inode(struct inode *inode,
+                                  struct writeback_control *wbc)
 {
+        struct bdi_writeback *wb;
         int ret = 0;
 
         spin_lock(&inode->i_lock);
@@ -1378,7 +1380,8 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
         ret = __writeback_single_inode(inode, wbc);
 
         wbc_detach_inode(wbc);
-        spin_lock(&wb->list_lock);
+
+        wb = inode_to_wb_and_lock_list(inode);
         spin_lock(&inode->i_lock);
         /*
          * If inode is clean, remove it from writeback lists. Otherwise don't
@@ -1453,6 +1456,7 @@ static long writeback_sb_inodes(struct super_block *sb,
         while (!list_empty(&wb->b_io)) {
                 struct inode *inode = wb_inode(wb->b_io.prev);
+                struct bdi_writeback *tmp_wb;
 
                 if (inode->i_sb != sb) {
                         if (work->sb) {
@@ -1543,15 +1547,23 @@ static long writeback_sb_inodes(struct super_block *sb,
                         cond_resched();
                 }
 
-                spin_lock(&wb->list_lock);
+                /*
+                 * Requeue @inode if still dirty.  Be careful as @inode may
+                 * have been switched to another wb in the meantime.
+                 */
+                tmp_wb = inode_to_wb_and_lock_list(inode);
                 spin_lock(&inode->i_lock);
                 if (!(inode->i_state & I_DIRTY_ALL))
                         wrote++;
-                requeue_inode(inode, wb, &wbc);
+                requeue_inode(inode, tmp_wb, &wbc);
                 inode_sync_complete(inode);
                 spin_unlock(&inode->i_lock);
+
+                if (unlikely(tmp_wb != wb)) {
+                        spin_unlock(&tmp_wb->list_lock);
+                        spin_lock(&wb->list_lock);
+                }
+
                 /*
                  * bail out to wb_writeback() often enough to check
                  * background threshold and other termination conditions.
@@ -2338,7 +2350,6 @@ EXPORT_SYMBOL(sync_inodes_sb);