/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment seq_flush, thus closing the current
 *   batch.
 * When we notice that seq_flush > seq_write, we write out all pending updates
 * to the bitmap, and advance seq_write to where seq_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 * miss any bits.
 */

#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/module.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/nodemask.h>
#include <trace/events/block.h>

#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"

#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE

static struct workqueue_struct *raid5_wq;
/*
 * Stripe cache
 */

#define NR_STRIPES		256
#define STRIPE_SIZE		PAGE_SIZE
#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
#define	IO_THRESHOLD		1
#define BYPASS_THRESHOLD	1
#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK		(NR_HASH - 1)
#define MAX_STRIPE_BATCH	8

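/* Map a stripe's sector to its bucket in conf->stripe_hashtbl. */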
static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
	int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
	return &conf->stripe_hashtbl[hash];
}

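/*
 * The stripe hash table is protected by NR_STRIPE_HASH_LOCKS spinlocks in
 * addition to conf->device_lock.  The helpers below take one hash lock
 * together with the device lock, or take all of the hash locks in index
 * order, so that lock nesting stays consistent.
 */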
static inline int stripe_hash_locks_hash(sector_t sect)
{
	return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
}

static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
{
	spin_lock_irq(conf->hash_locks + hash);
	spin_lock(&conf->device_lock);
}

static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
{
	spin_unlock(&conf->device_lock);
	spin_unlock_irq(conf->hash_locks + hash);
}

static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;
	local_irq_disable();
	spin_lock(conf->hash_locks);
	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
	spin_lock(&conf->device_lock);
}

static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;
	spin_unlock(&conf->device_lock);
	for (i = NR_STRIPE_HASH_LOCKS; i; i--)
		spin_unlock(conf->hash_locks + i - 1);
	local_irq_enable();
}

/* bio's attached to a stripe+device for I/O are linked together in bi_sector
 * order without overlap.  There may be several bio's per stripe+device, and
 * a bio could span several devices.
 * When walking this list for a particular stripe+device, we must never proceed
 * beyond a bio that extends past this device, as the next bio might no longer
 * be valid.
 * This function determines the 'next' bio in the list, given the sector
 * of the current stripe+device
 */
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
{
	int sectors = bio_sectors(bio);
	if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
		return bio->bi_next;
	else
		return NULL;
}

/*
 * We maintain a biased count of active stripes in the bottom 16 bits of
 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
 */
static inline int raid5_bi_processed_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	return (atomic_read(segments) >> 16) & 0xffff;
}

static inline int raid5_dec_bi_active_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	return atomic_sub_return(1, segments) & 0xffff;
}

static inline void raid5_inc_bi_active_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	atomic_inc(segments);
}

static inline void raid5_set_bi_processed_stripes(struct bio *bio,
	unsigned int cnt)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	int old, new;

	do {
		old = atomic_read(segments);
		new = (old & 0xffff) | (cnt << 16);
	} while (atomic_cmpxchg(segments, old, new) != old);
}

static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	atomic_set(segments, cnt);
}

/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always starts from the first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/* When walking through the disks in a raid6 stripe, starting at raid6_d0,
 * we need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1.  This helper does that mapping.
 */
static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
			     int *count, int syndrome_disks)
{
	int slot = *count;

	if (sh->ddf_layout)
		(*count)++;
	if (idx == sh->pd_idx)
		return syndrome_disks;
	if (idx == sh->qd_idx)
		return syndrome_disks + 1;
	if (!sh->ddf_layout)
		(*count)++;
	return slot;
}

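/*
 * Hand a chain of bios back to their owners: walk the bi_next list,
 * detaching each bio and completing it with bio_endio().
 */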
static void return_io(struct bio *return_bi)
{
	struct bio *bi = return_bi;
	while (bi) {

		return_bi = bi->bi_next;
		bi->bi_next = NULL;
		bi->bi_size = 0;
		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
					 bi, 0);
		bio_endio(bi, 0);
		bi = return_bi;
	}
}

static void print_raid5_conf (struct r5conf *conf);

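/*
 * Return true while any asynchronous stripe operation (check, reconstruct,
 * biofill or compute) is still in flight for this stripe.
 */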
static int stripe_operations_active(struct stripe_head *sh)
{
	return sh->check_state || sh->reconstruct_state ||
	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}

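/*
 * Queue a stripe for handling by the worker group of the NUMA node it is
 * associated with (or by the main array thread when no worker groups are
 * configured), waking extra workers as the group's backlog grows.
 */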
static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	struct r5worker_group *group;
	int thread_cnt;
	int i, cpu = sh->cpu;

	if (!cpu_online(cpu)) {
		cpu = cpumask_any(cpu_online_mask);
		sh->cpu = cpu;
	}

	if (list_empty(&sh->lru)) {
		struct r5worker_group *group;
		group = conf->worker_groups + cpu_to_group(cpu);
		list_add_tail(&sh->lru, &group->handle_list);
		group->stripes_cnt++;
		sh->group = group;
	}

	if (conf->worker_cnt_per_group == 0) {
		md_wakeup_thread(conf->mddev->thread);
		return;
	}

	group = conf->worker_groups + cpu_to_group(sh->cpu);

	group->workers[0].working = true;
	/* at least one worker should run to avoid race */
	queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);

	thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
	/* wakeup more workers */
	for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
		if (group->workers[i].working == false) {
			group->workers[i].working = true;
			queue_work_on(sh->cpu, raid5_wq,
				      &group->workers[i].work);
			thread_cnt--;
		}
	}
}

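/*
 * Called once the last reference to a stripe has been dropped, with
 * conf->device_lock held.  A stripe that still needs handling is routed to
 * the delayed, bitmap or handle list (or handed to a worker group);
 * otherwise it is placed on temp_inactive_list, unless it is part of an
 * expansion.
 */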
static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
			      struct list_head *temp_inactive_list)
{
	BUG_ON(!list_empty(&sh->lru));
	BUG_ON(atomic_read(&conf->active_stripes)==0);
	if (test_bit(STRIPE_HANDLE, &sh->state)) {
		if (test_bit(STRIPE_DELAYED, &sh->state) &&
		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			list_add_tail(&sh->lru, &conf->delayed_list);
		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
			   sh->bm_seq - conf->seq_write > 0)
			list_add_tail(&sh->lru, &conf->bitmap_list);
		else {
			clear_bit(STRIPE_DELAYED, &sh->state);
			clear_bit(STRIPE_BIT_DELAY, &sh->state);
			if (conf->worker_cnt_per_group == 0) {
				list_add_tail(&sh->lru, &conf->handle_list);
			} else {
				raid5_wakeup_stripe_thread(sh);
				return;
			}
		}
		md_wakeup_thread(conf->mddev->thread);
	} else {
		BUG_ON(stripe_operations_active(sh));
		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			if (atomic_dec_return(&conf->preread_active_stripes)
			    < IO_THRESHOLD)
				md_wakeup_thread(conf->mddev->thread);
		atomic_dec(&conf->active_stripes);
		if (!test_bit(STRIPE_EXPANDING, &sh->state))
			list_add_tail(&sh->lru, temp_inactive_list);
	}
}

static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
			     struct list_head *temp_inactive_list)
{
	if (atomic_dec_and_test(&sh->count))
		do_release_stripe(conf, sh, temp_inactive_list);
}

/*
 * @hash could be NR_STRIPE_HASH_LOCKS, in which case temp_inactive_list
 * holds one list per hash lock.
 *
 * Be careful: only one task may add/delete stripes from temp_inactive_list
 * at a given time. Adding stripes only takes the device lock, while deleting
 * stripes only takes the hash lock.
 */
static void release_inactive_stripe_list(struct r5conf *conf,
					 struct list_head *temp_inactive_list,
					 int hash)
{
	int size;
	bool do_wakeup = false;
	unsigned long flags;

	if (hash == NR_STRIPE_HASH_LOCKS) {
		size = NR_STRIPE_HASH_LOCKS;
		hash = NR_STRIPE_HASH_LOCKS - 1;
	} else
		size = 1;
	while (size) {
		struct list_head *list = &temp_inactive_list[size - 1];

		/*
		 * We don't hold any lock here yet, get_active_stripe() might
		 * remove stripes from the list
		 */
		if (!list_empty_careful(list)) {
			spin_lock_irqsave(conf->hash_locks + hash, flags);
			if (list_empty(conf->inactive_list + hash) &&
			    !list_empty(list))
				atomic_dec(&conf->empty_inactive_list_nr);
			list_splice_tail_init(list, conf->inactive_list + hash);
			do_wakeup = true;
			spin_unlock_irqrestore(conf->hash_locks + hash, flags);
		}
		size--;
		hash--;
	}

	if (do_wakeup) {
		wake_up(&conf->wait_for_stripe);
		if (conf->retry_read_aligned)
			md_wakeup_thread(conf->mddev->thread);
	}
}

/* should hold conf->device_lock already */
static int release_stripe_list(struct r5conf *conf,
			       struct list_head *temp_inactive_list)
{
	struct stripe_head *sh;
	int count = 0;
	struct llist_node *head;

	head = llist_del_all(&conf->released_stripes);
	head = llist_reverse_order(head);
	while (head) {
		int hash;

		sh = llist_entry(head, struct stripe_head, release_list);
		head = llist_next(head);
		/* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */
		smp_mb();
		clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
		/*
		 * Don't worry if the bit is set here, because if it is set
		 * again, the count is always > 1. The same is true for the
		 * STRIPE_ON_UNPLUG_LIST bit.
		 */
		hash = sh->hash_lock_index;
		__release_stripe(conf, sh, &temp_inactive_list[hash]);
		count++;
	}

	return count;
}

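/*
 * Drop a reference to a stripe.  The fast path pushes the stripe onto the
 * lock-free released_stripes list and lets the array thread finish the
 * release; the slow path takes device_lock and releases it directly.
 */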
static void release_stripe(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	unsigned long flags;
	struct list_head list;
	int hash;
	bool wakeup;

	if (unlikely(!conf->mddev->thread) ||
		test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
		goto slow_path;
	wakeup = llist_add(&sh->release_list, &conf->released_stripes);
	if (wakeup)
		md_wakeup_thread(conf->mddev->thread);
	return;
slow_path:
	local_irq_save(flags);
	/* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
	if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
		INIT_LIST_HEAD(&list);
		hash = sh->hash_lock_index;
		do_release_stripe(conf, sh, &list);
		spin_unlock(&conf->device_lock);
		release_inactive_stripe_list(conf, &list, hash);
	}
	local_irq_restore(flags);
}

static inline void remove_hash(struct stripe_head *sh)
{
	pr_debug("remove_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_del_init(&sh->hash);
}

static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
{
	struct hlist_head *hp = stripe_hash(conf, sh->sector);

	pr_debug("insert_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_add_head(&sh->hash, hp);
}


/* find an idle stripe, make sure it is unhashed, and return it. */
static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
{
	struct stripe_head *sh = NULL;
	struct list_head *first;

	if (list_empty(conf->inactive_list + hash))
		goto out;
	first = (conf->inactive_list + hash)->next;
	sh = list_entry(first, struct stripe_head, lru);
	list_del_init(first);
	remove_hash(sh);
	atomic_inc(&conf->active_stripes);
	BUG_ON(hash != sh->hash_lock_index);
	if (list_empty(conf->inactive_list + hash))
		atomic_inc(&conf->empty_inactive_list_nr);
out:
	return sh;
}

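/* Free the per-device pages attached to a stripe_head. */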
static void shrink_buffers(struct stripe_head *sh)
{
	struct page *p;
	int i;
	int num = sh->raid_conf->pool_size;

	for (i = 0; i < num ; i++) {
		p = sh->dev[i].page;
		if (!p)
			continue;
		sh->dev[i].page = NULL;
		put_page(p);
	}
}

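/*
 * Allocate one page per device for a stripe_head; returns 1 if any
 * allocation fails.
 */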
static int grow_buffers(struct stripe_head *sh)
{
	int i;
	int num = sh->raid_conf->pool_size;

	for (i = 0; i < num; i++) {
		struct page *page;

		if (!(page = alloc_page(GFP_KERNEL))) {
			return 1;
		}
		sh->dev[i].page = page;
	}
	return 0;
}

static void raid5_build_block(struct stripe_head *sh, int i, int previous);
static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
			    struct stripe_head *sh);

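/*
 * (Re)initialise an unused stripe_head for a new sector.  The geometry is
 * sampled under conf->gen_lock and the whole setup is retried if a reshape
 * changed the generation in the meantime.
 */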
static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
{
	struct r5conf *conf = sh->raid_conf;
	int i, seq;

	BUG_ON(atomic_read(&sh->count) != 0);
	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
	BUG_ON(stripe_operations_active(sh));

	pr_debug("init_stripe called, stripe %llu\n",
		(unsigned long long)sh->sector);

	remove_hash(sh);
retry:
	seq = read_seqcount_begin(&conf->gen_lock);
	sh->generation = conf->generation - previous;
	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
	sh->sector = sector;
	stripe_set_idx(sector, conf, previous, sh);
	sh->state = 0;


	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->toread || dev->read || dev->towrite || dev->written ||
		    test_bit(R5_LOCKED, &dev->flags)) {
			printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
			       (unsigned long long)sh->sector, i, dev->toread,
			       dev->read, dev->towrite, dev->written,
			       test_bit(R5_LOCKED, &dev->flags));
			WARN_ON(1);
		}
		dev->flags = 0;
		raid5_build_block(sh, i, previous);
	}
	if (read_seqcount_retry(&conf->gen_lock, seq))
		goto retry;
	insert_hash(conf, sh);
	sh->cpu = smp_processor_id();
}

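/* Look up a stripe in the hash table by sector and generation. */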
static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
					 short generation)
{
	struct stripe_head *sh;

	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
	hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
		if (sh->sector == sector && sh->generation == generation)
			return sh;
	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
	return NULL;
}

/*
 * Need to check if array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be insync in the section most affected by failed devices.
 */
static int calc_degraded(struct r5conf *conf)
{
	int degraded, degraded2;
	int i;

	rcu_read_lock();
	degraded = 0;
	for (i = 0; i < conf->previous_raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = rcu_dereference(conf->disks[i].replacement);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If the reshape increases the number of devices,
			 * this is being recovered by the reshape, so
			 * this 'previous' section is not in_sync.
			 * If the number of devices is being reduced however,
			 * the device can only be part of the array if
			 * we are reverting a reshape, so this section will
			 * be in-sync.
			 */
			if (conf->raid_disks >= conf->previous_raid_disks)
				degraded++;
	}
	rcu_read_unlock();
	if (conf->raid_disks == conf->previous_raid_disks)
		return degraded;
	rcu_read_lock();
	degraded2 = 0;
	for (i = 0; i < conf->raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = rcu_dereference(conf->disks[i].replacement);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded2++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If reshape increases the number of devices, this
			 * section has already been recovered, else it
			 * almost certainly hasn't.
			 */
			if (conf->raid_disks <= conf->previous_raid_disks)
				degraded2++;
	}
	rcu_read_unlock();
	if (degraded2 > degraded)
		return degraded2;
	return degraded;
}

static int has_failed(struct r5conf *conf)
{
	int degraded;

	if (conf->mddev->reshape_position == MaxSector)
		return conf->mddev->degraded > conf->max_degraded;

	degraded = calc_degraded(conf);
	if (degraded > conf->max_degraded)
		return 1;
	return 0;
}

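/*
 * Get an active reference to the stripe covering @sector, allocating or
 * reusing an inactive stripe if needed.  Unless @noquiesce, callers wait
 * while the array is quiesced; unless @noblock, they also wait for an
 * inactive stripe to become available.
 */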
static struct stripe_head *
get_active_stripe(struct r5conf *conf, sector_t sector,
		  int previous, int noblock, int noquiesce)
{
	struct stripe_head *sh;
	int hash = stripe_hash_locks_hash(sector);

	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);

	spin_lock_irq(conf->hash_locks + hash);

	do {
		wait_event_lock_irq(conf->wait_for_stripe,
				    conf->quiesce == 0 || noquiesce,
				    *(conf->hash_locks + hash));
		sh = __find_stripe(conf, sector, conf->generation - previous);
		if (!sh) {
			if (!conf->inactive_blocked)
				sh = get_free_stripe(conf, hash);
			if (noblock && sh == NULL)
				break;
			if (!sh) {
				conf->inactive_blocked = 1;
				wait_event_lock_irq(
					conf->wait_for_stripe,
					!list_empty(conf->inactive_list + hash) &&
					(atomic_read(&conf->active_stripes)
					 < (conf->max_nr_stripes * 3 / 4)
					 || !conf->inactive_blocked),
					*(conf->hash_locks + hash));
				conf->inactive_blocked = 0;
			} else
				init_stripe(sh, sector, previous);
		} else {
			spin_lock(&conf->device_lock);
			if (atomic_read(&sh->count)) {
				BUG_ON(!list_empty(&sh->lru)
				    && !test_bit(STRIPE_EXPANDING, &sh->state)
				    && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)
					);
			} else {
				if (!test_bit(STRIPE_HANDLE, &sh->state))
					atomic_inc(&conf->active_stripes);
				BUG_ON(list_empty(&sh->lru));
				list_del_init(&sh->lru);
				if (sh->group) {
					sh->group->stripes_cnt--;
					sh->group = NULL;
				}
			}
			spin_unlock(&conf->device_lock);
		}
	} while (sh == NULL);

	if (sh)
		atomic_inc(&sh->count);

	spin_unlock_irq(conf->hash_locks + hash);
	return sh;
}

/* Determine if 'data_offset' or 'new_data_offset' should be used
 * in this stripe_head.
 */
static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
{
	sector_t progress = conf->reshape_progress;
	/* Need a memory barrier to make sure we see the value
	 * of conf->generation, or ->data_offset that was set before
	 * reshape_progress was updated.
	 */
	smp_rmb();
	if (progress == MaxSector)
		return 0;
	if (sh->generation == conf->generation - 1)
		return 0;
	/* We are in a reshape, and this is a new-generation stripe,
	 * so use new_data_offset.
	 */
	return 1;
}

static void
raid5_end_read_request(struct bio *bi, int error);
static void
raid5_end_write_request(struct bio *bi, int error);

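/*
 * Issue the per-device I/O that the stripe state machine has requested:
 * for each device flagged R5_Wantwrite, R5_Wantread or R5_WantReplace,
 * build a bio against the rdev (and, for writes, the replacement rrdev)
 * and submit it with generic_make_request().
 */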
static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{
	struct r5conf *conf = sh->raid_conf;
	int i, disks = sh->disks;

	might_sleep();

	for (i = disks; i--; ) {
		int rw;
		int replace_only = 0;
		struct bio *bi, *rbi;
		struct md_rdev *rdev, *rrdev = NULL;
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
				rw = WRITE_FUA;
			else
				rw = WRITE;
			if (test_bit(R5_Discard, &sh->dev[i].flags))
				rw |= REQ_DISCARD;
		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
			rw = READ;
		else if (test_and_clear_bit(R5_WantReplace,
					    &sh->dev[i].flags)) {
			rw = WRITE;
			replace_only = 1;
		} else
			continue;
		if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
			rw |= REQ_SYNC;

		bi = &sh->dev[i].req;
		rbi = &sh->dev[i].rreq; /* For writing to replacement */

		rcu_read_lock();
		rrdev = rcu_dereference(conf->disks[i].replacement);
		smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
		rdev = rcu_dereference(conf->disks[i].rdev);
		if (!rdev) {
			rdev = rrdev;
			rrdev = NULL;
		}
		if (rw & WRITE) {
			if (replace_only)
				rdev = NULL;
			if (rdev == rrdev)
				/* We raced and saw duplicates */
				rrdev = NULL;
		} else {
			if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
				rdev = rrdev;
			rrdev = NULL;
		}

		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = NULL;
		if (rdev)
			atomic_inc(&rdev->nr_pending);
		if (rrdev && test_bit(Faulty, &rrdev->flags))
			rrdev = NULL;
		if (rrdev)
			atomic_inc(&rrdev->nr_pending);
		rcu_read_unlock();

		/* We have already checked bad blocks for reads.  Now
		 * need to check for writes.  We never accept write errors
		 * on the replacement, so we don't need to check rrdev.
		 */
		while ((rw & WRITE) && rdev &&
		       test_bit(WriteErrorSeen, &rdev->flags)) {
			sector_t first_bad;
			int bad_sectors;
			int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
					      &first_bad, &bad_sectors);
			if (!bad)
				break;

			if (bad < 0) {
				set_bit(BlockedBadBlocks, &rdev->flags);
				if (!conf->mddev->external &&
				    conf->mddev->flags) {
					/* It is very unlikely, but we might
					 * still need to write out the
					 * bad block log - better give it
					 * a chance */
					md_check_recovery(conf->mddev);
				}
				/*
				 * Because md_wait_for_blocked_rdev
				 * will dec nr_pending, we must
				 * increment it first.
				 */
				atomic_inc(&rdev->nr_pending);
				md_wait_for_blocked_rdev(rdev, conf->mddev);
			} else {
				/* Acknowledged bad block - skip the write */
				rdev_dec_pending(rdev, conf->mddev);
				rdev = NULL;
			}
		}

		if (rdev) {
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rdev->bdev, STRIPE_SECTORS);

			set_bit(STRIPE_IO_STARTED, &sh->state);

			bio_reset(bi);
			bi->bi_bdev = rdev->bdev;
			bi->bi_rw = rw;
			bi->bi_end_io = (rw & WRITE)
				? raid5_end_write_request
				: raid5_end_read_request;
			bi->bi_private = sh;

			pr_debug("%s: for %llu schedule op %ld on disc %d\n",
				__func__, (unsigned long long)sh->sector,
				bi->bi_rw, i);
			atomic_inc(&sh->count);
			if (use_new_offset(conf, sh))
				bi->bi_sector = (sh->sector
						 + rdev->new_data_offset);
			else
				bi->bi_sector = (sh->sector
						 + rdev->data_offset);
			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
				bi->bi_rw |= REQ_NOMERGE;

			bi->bi_vcnt = 1;
			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			bi->bi_io_vec[0].bv_offset = 0;
			bi->bi_size = STRIPE_SIZE;
			/*
			 * If this is a discard request, set bi_vcnt to 0. We don't
			 * want to confuse SCSI because SCSI will replace the payload
			 */
			if (rw & REQ_DISCARD)
				bi->bi_vcnt = 0;
			if (rrdev)
				set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);

			if (conf->mddev->gendisk)
				trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
						      bi, disk_devt(conf->mddev->gendisk),
						      sh->dev[i].sector);
			generic_make_request(bi);
		}
		if (rrdev) {
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rrdev->bdev, STRIPE_SECTORS);

			set_bit(STRIPE_IO_STARTED, &sh->state);

			bio_reset(rbi);
			rbi->bi_bdev = rrdev->bdev;
			rbi->bi_rw = rw;
			BUG_ON(!(rw & WRITE));
			rbi->bi_end_io = raid5_end_write_request;
			rbi->bi_private = sh;

			pr_debug("%s: for %llu schedule op %ld on "
				 "replacement disc %d\n",
				__func__, (unsigned long long)sh->sector,
				rbi->bi_rw, i);
			atomic_inc(&sh->count);
			if (use_new_offset(conf, sh))
				rbi->bi_sector = (sh->sector
						  + rrdev->new_data_offset);
			else
				rbi->bi_sector = (sh->sector
						  + rrdev->data_offset);
			rbi->bi_vcnt = 1;
			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			rbi->bi_io_vec[0].bv_offset = 0;
			rbi->bi_size = STRIPE_SIZE;
			/*
			 * If this is a discard request, set bi_vcnt to 0. We don't
			 * want to confuse SCSI because SCSI will replace the payload
			 */
			if (rw & REQ_DISCARD)
				rbi->bi_vcnt = 0;
			if (conf->mddev->gendisk)
				trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
						      rbi, disk_devt(conf->mddev->gendisk),
						      sh->dev[i].sector);
			generic_make_request(rbi);
		}
		if (!rdev && !rrdev) {
			if (rw & WRITE)
				set_bit(STRIPE_DEGRADED, &sh->state);
			pr_debug("skip op %ld on disc %d for sector %llu\n",
				bi->bi_rw, i, (unsigned long long)sh->sector);
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
			set_bit(STRIPE_HANDLE, &sh->state);
		}
	}
}

static struct dma_async_tx_descriptor *
async_copy_data(int frombio, struct bio *bio, struct page *page,
	sector_t sector, struct dma_async_tx_descriptor *tx)
{
	struct bio_vec *bvl;
	struct page *bio_page;
	int i;
	int page_offset;
	struct async_submit_ctl submit;
	enum async_tx_flags flags = 0;

	if (bio->bi_sector >= sector)
		page_offset = (signed)(bio->bi_sector - sector) * 512;
	else
		page_offset = (signed)(sector - bio->bi_sector) * -512;

	if (frombio)
		flags |= ASYNC_TX_FENCE;
	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);

	bio_for_each_segment(bvl, bio, i) {
		int len = bvl->bv_len;
		int clen;
		int b_offset = 0;

		if (page_offset < 0) {
			b_offset = -page_offset;
			page_offset += b_offset;
			len -= b_offset;
		}

		if (len > 0 && page_offset + len > STRIPE_SIZE)
			clen = STRIPE_SIZE - page_offset;
		else
			clen = len;

		if (clen > 0) {
			b_offset += bvl->bv_offset;
			bio_page = bvl->bv_page;
			if (frombio)
				tx = async_memcpy(page, bio_page, page_offset,
						  b_offset, clen, &submit);
			else
				tx = async_memcpy(bio_page, page, b_offset,
						  page_offset, clen, &submit);
		}
		/* chain the operations */
		submit.depend_tx = tx;

		if (clen < len) /* hit end of page */
			break;
		page_offset +=  len;
	}

	return tx;
}

static void ops_complete_biofill(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;
	struct bio *return_bi = NULL;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	/* clear completed biofills */
	for (i = sh->disks; i--; ) {