/*
 * Copyright (C) 2010-2011 Neil Brown
 * Copyright (C) 2010-2016 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/slab.h>
#include <linux/module.h>

#include "md.h"
#include "raid1.h"
#include "raid5.h"
#include "raid10.h"
#include "bitmap.h"

#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "raid"
#define	MAX_RAID_DEVICES	253 /* md-raid kernel limit */

/*
 * Minimum sectors of free reshape space per raid device
 */
#define	MIN_FREE_RESHAPE_SPACE to_sector(4*4096)

static bool devices_handle_discard_safely = false;

/*
 * The following flags are used by dm-raid.c to set up the array state.
 * They must be cleared before md_run is called.
 */
#define FirstUse 10		/* rdev flag */

struct raid_dev {
	/*
	 * Two DM devices, one to hold metadata and one to hold the
	 * actual data/parity.  The reason for this is to not confuse
	 * ti->len and give more flexibility in altering size and
	 * characteristics.
	 *
	 * While it is possible for this device to be associated
	 * with a different physical device than the data_dev, it
	 * is intended for it to be the same.
	 *    |--------- Physical Device ---------|
	 *    |- meta_dev -|------ data_dev ------|
	 */
	struct dm_dev *meta_dev;
	struct dm_dev *data_dev;
	struct md_rdev rdev;
};

/*
 * Bits for establishing rs->ctr_flags
 *
 * 1 = no flag value
 * 2 = flag with value
 */
#define __CTR_FLAG_SYNC			0  /* 1 */ /* Not with raid0! */
#define __CTR_FLAG_NOSYNC		1  /* 1 */ /* Not with raid0! */
#define __CTR_FLAG_REBUILD		2  /* 2 */ /* Not with raid0! */
#define __CTR_FLAG_DAEMON_SLEEP		3  /* 2 */ /* Not with raid0! */
#define __CTR_FLAG_MIN_RECOVERY_RATE	4  /* 2 */ /* Not with raid0! */
#define __CTR_FLAG_MAX_RECOVERY_RATE	5  /* 2 */ /* Not with raid0! */
#define __CTR_FLAG_MAX_WRITE_BEHIND	6  /* 2 */ /* Only with raid1! */
#define __CTR_FLAG_WRITE_MOSTLY		7  /* 2 */ /* Only with raid1! */
#define __CTR_FLAG_STRIPE_CACHE		8  /* 2 */ /* Only with raid4/5/6! */
#define __CTR_FLAG_REGION_SIZE		9  /* 2 */ /* Not with raid0! */
#define __CTR_FLAG_RAID10_COPIES	10 /* 2 */ /* Only with raid10 */
#define __CTR_FLAG_RAID10_FORMAT	11 /* 2 */ /* Only with raid10 */
/* New for v1.9.0 */
#define __CTR_FLAG_DELTA_DISKS		12 /* 2 */ /* Only with reshapable raid1/4/5/6/10! */
#define __CTR_FLAG_DATA_OFFSET		13 /* 2 */ /* Only with reshapable raid4/5/6/10! */
#define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */

/*
 * Flags for rs->ctr_flags field.
 */
#define CTR_FLAG_SYNC			(1 << __CTR_FLAG_SYNC)
#define CTR_FLAG_NOSYNC			(1 << __CTR_FLAG_NOSYNC)
#define CTR_FLAG_REBUILD		(1 << __CTR_FLAG_REBUILD)
#define CTR_FLAG_DAEMON_SLEEP		(1 << __CTR_FLAG_DAEMON_SLEEP)
#define CTR_FLAG_MIN_RECOVERY_RATE	(1 << __CTR_FLAG_MIN_RECOVERY_RATE)
#define CTR_FLAG_MAX_RECOVERY_RATE	(1 << __CTR_FLAG_MAX_RECOVERY_RATE)
#define CTR_FLAG_MAX_WRITE_BEHIND	(1 << __CTR_FLAG_MAX_WRITE_BEHIND)
#define CTR_FLAG_WRITE_MOSTLY		(1 << __CTR_FLAG_WRITE_MOSTLY)
#define CTR_FLAG_STRIPE_CACHE		(1 << __CTR_FLAG_STRIPE_CACHE)
#define CTR_FLAG_REGION_SIZE		(1 << __CTR_FLAG_REGION_SIZE)
#define CTR_FLAG_RAID10_COPIES		(1 << __CTR_FLAG_RAID10_COPIES)
#define CTR_FLAG_RAID10_FORMAT		(1 << __CTR_FLAG_RAID10_FORMAT)
#define CTR_FLAG_DELTA_DISKS		(1 << __CTR_FLAG_DELTA_DISKS)
#define CTR_FLAG_DATA_OFFSET		(1 << __CTR_FLAG_DATA_OFFSET)
#define CTR_FLAG_RAID10_USE_NEAR_SETS	(1 << __CTR_FLAG_RAID10_USE_NEAR_SETS)
/*
 * Definitions of various constructor flags to
 * be used in checks of valid / invalid flags
 * per raid level.
 */
/* Define all any sync flags */
#define	CTR_FLAGS_ANY_SYNC		(CTR_FLAG_SYNC | CTR_FLAG_NOSYNC)

/* Define flags for options without argument (e.g. 'nosync') */
#define	CTR_FLAG_OPTIONS_NO_ARGS	(CTR_FLAGS_ANY_SYNC | \
					 CTR_FLAG_RAID10_USE_NEAR_SETS)

/* Define flags for options with one argument (e.g. 'delta_disks +2') */
#define CTR_FLAG_OPTIONS_ONE_ARG (CTR_FLAG_REBUILD | \
				  CTR_FLAG_WRITE_MOSTLY | \
				  CTR_FLAG_DAEMON_SLEEP | \
				  CTR_FLAG_MIN_RECOVERY_RATE | \
				  CTR_FLAG_MAX_RECOVERY_RATE | \
				  CTR_FLAG_MAX_WRITE_BEHIND | \
				  CTR_FLAG_STRIPE_CACHE | \
				  CTR_FLAG_REGION_SIZE | \
				  CTR_FLAG_RAID10_COPIES | \
				  CTR_FLAG_RAID10_FORMAT | \
				  CTR_FLAG_DELTA_DISKS | \
				  CTR_FLAG_DATA_OFFSET)

/* Valid options definitions per raid level... */

/* "raid0" only accepts data offset */
#define RAID0_VALID_FLAGS	(CTR_FLAG_DATA_OFFSET)

/* "raid1" does not accept stripe cache or any raid10 options */
#define RAID1_VALID_FLAGS	(CTR_FLAGS_ANY_SYNC | \
				 CTR_FLAG_REBUILD | \
				 CTR_FLAG_WRITE_MOSTLY | \
				 CTR_FLAG_DAEMON_SLEEP | \
				 CTR_FLAG_MIN_RECOVERY_RATE | \
				 CTR_FLAG_MAX_RECOVERY_RATE | \
				 CTR_FLAG_MAX_WRITE_BEHIND | \
				 CTR_FLAG_REGION_SIZE | \
				 CTR_FLAG_DELTA_DISKS | \
				 CTR_FLAG_DATA_OFFSET)

/* "raid10" does not accept any raid1 or stripe cache options */
#define RAID10_VALID_FLAGS	(CTR_FLAGS_ANY_SYNC | \
				 CTR_FLAG_REBUILD | \
				 CTR_FLAG_DAEMON_SLEEP | \
				 CTR_FLAG_MIN_RECOVERY_RATE | \
				 CTR_FLAG_MAX_RECOVERY_RATE | \
				 CTR_FLAG_REGION_SIZE | \
				 CTR_FLAG_RAID10_COPIES | \
				 CTR_FLAG_RAID10_FORMAT | \
				 CTR_FLAG_DELTA_DISKS | \
				 CTR_FLAG_DATA_OFFSET | \
				 CTR_FLAG_RAID10_USE_NEAR_SETS)

/*
 * "raid4/5/6" do not accept any raid1 or raid10 specific options
 *
 * "raid6" does not accept "nosync", because it is not guaranteed
 * that both parity and q-syndrome are being written properly with
 * any writes
 */
#define RAID45_VALID_FLAGS	(CTR_FLAGS_ANY_SYNC | \
				 CTR_FLAG_REBUILD | \
				 CTR_FLAG_DAEMON_SLEEP | \
				 CTR_FLAG_MIN_RECOVERY_RATE | \
				 CTR_FLAG_MAX_RECOVERY_RATE | \
				 CTR_FLAG_MAX_WRITE_BEHIND | \
				 CTR_FLAG_STRIPE_CACHE | \
				 CTR_FLAG_REGION_SIZE | \
				 CTR_FLAG_DELTA_DISKS | \
				 CTR_FLAG_DATA_OFFSET)

#define RAID6_VALID_FLAGS	(CTR_FLAG_SYNC | \
				 CTR_FLAG_REBUILD | \
				 CTR_FLAG_DAEMON_SLEEP | \
				 CTR_FLAG_MIN_RECOVERY_RATE | \
				 CTR_FLAG_MAX_RECOVERY_RATE | \
				 CTR_FLAG_MAX_WRITE_BEHIND | \
				 CTR_FLAG_STRIPE_CACHE | \
				 CTR_FLAG_REGION_SIZE | \
				 CTR_FLAG_DELTA_DISKS | \
				 CTR_FLAG_DATA_OFFSET)
/* ...valid options definitions per raid level */
/*
 * Flags for rs->runtime_flags field
 * (RT_FLAG prefix meaning "runtime flag")
 *
 * These are all internal and used to define runtime state,
 * e.g. to prevent another resume from preresume processing
 * the raid set all over again.
 */
#define RT_FLAG_RS_PRERESUMED		0
#define RT_FLAG_RS_RESUMED		1
#define RT_FLAG_RS_BITMAP_LOADED	2
#define RT_FLAG_UPDATE_SBS		3
#define RT_FLAG_RESHAPE_RS		4
#define RT_FLAG_KEEP_RS_FROZEN		5

/* Array elements of 64 bit needed for rebuild/failed disk bits */
#define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
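/*
 * With MAX_RAID_DEVICES = 253 and 64-bit array elements this works out
 * to ceil(253 / 64) = 4 words, i.e. one bit per possible raid device.
 */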

/*
 * raid set level, layout and chunk sectors backup/restore
 */
struct rs_layout {
	int new_level;
	int new_layout;
	int new_chunk_sectors;
};

struct raid_set {
	struct dm_target *ti;

	uint32_t bitmap_loaded;
	uint32_t stripe_cache_entries;
	unsigned long ctr_flags;
	unsigned long runtime_flags;

	uint64_t rebuild_disks[DISKS_ARRAY_ELEMS];
	int raid_disks;
	int delta_disks;
	int data_offset;
	int raid10_copies;
	int requested_bitmap_chunk_sectors;

	struct mddev md;
	struct raid_type *raid_type;
	struct dm_target_callbacks callbacks;

	struct raid_dev dev[0];
};

static void rs_config_backup(struct raid_set *rs, struct rs_layout *l)
{
	struct mddev *mddev = &rs->md;

	l->new_level = mddev->new_level;
	l->new_layout = mddev->new_layout;
	l->new_chunk_sectors = mddev->new_chunk_sectors;
}

static void rs_config_restore(struct raid_set *rs, struct rs_layout *l)
{
	struct mddev *mddev = &rs->md;

	mddev->new_level = l->new_level;
	mddev->new_layout = l->new_layout;
	mddev->new_chunk_sectors = l->new_chunk_sectors;
}

/* raid10 algorithms (i.e. formats) */
#define	ALGORITHM_RAID10_DEFAULT	0
#define	ALGORITHM_RAID10_NEAR		1
#define	ALGORITHM_RAID10_OFFSET		2
#define	ALGORITHM_RAID10_FAR		3

/* Supported raid types and properties. */
static struct raid_type {
	const char *name;		/* RAID algorithm. */
	const char *descr;		/* Descriptor text for logging. */
	const unsigned parity_devs;	/* # of parity devices. */
	const unsigned minimal_devs;	/* minimal # of devices in set. */
	const unsigned level;		/* RAID level. */
	const unsigned algorithm;	/* RAID algorithm. */
} raid_types[] = {
	{"raid0",	  "raid0 (striping)",			    0, 2, 0,  0 /* NONE */},
	{"raid1",	  "raid1 (mirroring)",			    0, 2, 1,  0 /* NONE */},
	{"raid10_far",	  "raid10 far (striped mirrors)",	    0, 2, 10, ALGORITHM_RAID10_FAR},
	{"raid10_offset", "raid10 offset (striped mirrors)",	    0, 2, 10, ALGORITHM_RAID10_OFFSET},
	{"raid10_near",	  "raid10 near (striped mirrors)",	    0, 2, 10, ALGORITHM_RAID10_NEAR},
	{"raid10",	  "raid10 (striped mirrors)",		    0, 2, 10, ALGORITHM_RAID10_DEFAULT},
	{"raid4",	  "raid4 (dedicated last parity disk)",	    1, 2, 4,  ALGORITHM_PARITY_N}, /* raid4 layout = raid5_n */
	{"raid5_n",	  "raid5 (dedicated last parity disk)",	    1, 2, 5,  ALGORITHM_PARITY_N},
	{"raid5_ls",	  "raid5 (left symmetric)",		    1, 2, 5,  ALGORITHM_LEFT_SYMMETRIC},
	{"raid5_rs",	  "raid5 (right symmetric)",		    1, 2, 5,  ALGORITHM_RIGHT_SYMMETRIC},
	{"raid5_la",	  "raid5 (left asymmetric)",		    1, 2, 5,  ALGORITHM_LEFT_ASYMMETRIC},
	{"raid5_ra",	  "raid5 (right asymmetric)",		    1, 2, 5,  ALGORITHM_RIGHT_ASYMMETRIC},
	{"raid6_zr",	  "raid6 (zero restart)",		    2, 4, 6,  ALGORITHM_ROTATING_ZERO_RESTART},
	{"raid6_nr",	  "raid6 (N restart)",			    2, 4, 6,  ALGORITHM_ROTATING_N_RESTART},
	{"raid6_nc",	  "raid6 (N continue)",			    2, 4, 6,  ALGORITHM_ROTATING_N_CONTINUE},
	{"raid6_n_6",	  "raid6 (dedicated parity/Q n/6)",	    2, 4, 6,  ALGORITHM_PARITY_N_6},
	{"raid6_ls_6",	  "raid6 (left symmetric dedicated Q 6)",   2, 4, 6,  ALGORITHM_LEFT_SYMMETRIC_6},
	{"raid6_rs_6",	  "raid6 (right symmetric dedicated Q 6)",  2, 4, 6,  ALGORITHM_RIGHT_SYMMETRIC_6},
	{"raid6_la_6",	  "raid6 (left asymmetric dedicated Q 6)",  2, 4, 6,  ALGORITHM_LEFT_ASYMMETRIC_6},
	{"raid6_ra_6",	  "raid6 (right asymmetric dedicated Q 6)", 2, 4, 6,  ALGORITHM_RIGHT_ASYMMETRIC_6}
};

/* True, if @v is in inclusive range [@min, @max] */
static bool __within_range(long v, long min, long max)
{
	return v >= min && v <= max;
}

/* All table line arguments are defined here */
static struct arg_name_flag {
	const unsigned long flag;
	const char *name;
} __arg_name_flags[] = {
	{ CTR_FLAG_SYNC, "sync"},
	{ CTR_FLAG_NOSYNC, "nosync"},
	{ CTR_FLAG_REBUILD, "rebuild"},
	{ CTR_FLAG_DAEMON_SLEEP, "daemon_sleep"},
	{ CTR_FLAG_MIN_RECOVERY_RATE, "min_recovery_rate"},
	{ CTR_FLAG_MAX_RECOVERY_RATE, "max_recovery_rate"},
	{ CTR_FLAG_MAX_WRITE_BEHIND, "max_write_behind"},
	{ CTR_FLAG_WRITE_MOSTLY, "write_mostly"},
	{ CTR_FLAG_STRIPE_CACHE, "stripe_cache"},
	{ CTR_FLAG_REGION_SIZE, "region_size"},
	{ CTR_FLAG_RAID10_COPIES, "raid10_copies"},
	{ CTR_FLAG_RAID10_FORMAT, "raid10_format"},
	{ CTR_FLAG_DATA_OFFSET, "data_offset"},
	{ CTR_FLAG_DELTA_DISKS, "delta_disks"},
	{ CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"},
};

/* Return argument name string for given @flag */
static const char *dm_raid_arg_name_by_flag(const uint32_t flag)
{
	if (hweight32(flag) == 1) {
		struct arg_name_flag *anf = __arg_name_flags + ARRAY_SIZE(__arg_name_flags);

		while (anf-- > __arg_name_flags)
			if (flag & anf->flag)
				return anf->name;

	} else
		DMERR("%s called with more than one flag!", __func__);

	return NULL;
}
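
/*
 * E.g. dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD) returns "rebuild" per
 * the __arg_name_flags table above; a @flag with more than one bit set
 * returns NULL.
 */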

/*
 * Bool helpers to test for various raid levels of a raid set.
 * It's level as reported by the superblock rather than
 * the requested raid_type passed to the constructor.
 */
/* Return true, if raid set in @rs is raid0 */
static bool rs_is_raid0(struct raid_set *rs)
{
	return !rs->md.level;
}

/* Return true, if raid set in @rs is raid1 */
static bool rs_is_raid1(struct raid_set *rs)
{
	return rs->md.level == 1;
}

/* Return true, if raid set in @rs is raid10 */
static bool rs_is_raid10(struct raid_set *rs)
{
	return rs->md.level == 10;
}

/* Return true, if raid set in @rs is level 6 */
static bool rs_is_raid6(struct raid_set *rs)
{
	return rs->md.level == 6;
}

/* Return true, if raid set in @rs is level 4, 5 or 6 */
static bool rs_is_raid456(struct raid_set *rs)
{
	return __within_range(rs->md.level, 4, 6);
}

/* Return true, if raid set in @rs is reshapable */
static bool __is_raid10_far(int layout);
static bool rs_is_reshapable(struct raid_set *rs)
{
	return rs_is_raid456(rs) ||
	       (rs_is_raid10(rs) && !__is_raid10_far(rs->md.new_layout));
}

/* Return true, if raid set in @rs is recovering */
static bool rs_is_recovering(struct raid_set *rs)
{
	return rs->md.recovery_cp < rs->dev[0].rdev.sectors;
}

/* Return true, if raid set in @rs is reshaping */
static bool rs_is_reshaping(struct raid_set *rs)
{
	return rs->md.reshape_position != MaxSector;
}

/*
 * bool helpers to test for various raid levels of a raid type @rt
 */

/* Return true, if raid type in @rt is raid0 */
static bool rt_is_raid0(struct raid_type *rt)
{
	return !rt->level;
}

/* Return true, if raid type in @rt is raid1 */
static bool rt_is_raid1(struct raid_type *rt)
{
	return rt->level == 1;
}

/* Return true, if raid type in @rt is raid10 */
static bool rt_is_raid10(struct raid_type *rt)
{
	return rt->level == 10;
}

/* Return true, if raid type in @rt is raid4/5 */
static bool rt_is_raid45(struct raid_type *rt)
{
	return __within_range(rt->level, 4, 5);
}

/* Return true, if raid type in @rt is raid6 */
static bool rt_is_raid6(struct raid_type *rt)
{
	return rt->level == 6;
}

/* Return true, if raid type in @rt is raid4/5/6 */
static bool rt_is_raid456(struct raid_type *rt)
{
	return __within_range(rt->level, 4, 6);
}
/* END: raid level bools */

/* Return valid ctr flags for the raid level of @rs */
static unsigned long __valid_flags(struct raid_set *rs)
{
	if (rt_is_raid0(rs->raid_type))
		return RAID0_VALID_FLAGS;
	else if (rt_is_raid1(rs->raid_type))
		return RAID1_VALID_FLAGS;
	else if (rt_is_raid10(rs->raid_type))
		return RAID10_VALID_FLAGS;
	else if (rt_is_raid45(rs->raid_type))
		return RAID45_VALID_FLAGS;
	else if (rt_is_raid6(rs->raid_type))
		return RAID6_VALID_FLAGS;

	return 0;
}

/*
 * Check for valid flags set on @rs
 *
 * Has to be called after parsing of the ctr flags!
 */
static int rs_check_for_valid_flags(struct raid_set *rs)
{
	if (rs->ctr_flags & ~__valid_flags(rs)) {
		rs->ti->error = "Invalid flags combination";
		return -EINVAL;
	}

	return 0;
}

/* MD raid10 bit definitions and helpers */
#define RAID10_OFFSET			(1 << 16) /* stripes with data copies are adjacent on devices */
#define RAID10_BROCKEN_USE_FAR_SETS	(1 << 17) /* Broken in raid10.c: use sets instead of whole stripe rotation */
#define RAID10_USE_FAR_SETS		(1 << 18) /* Use sets instead of whole stripe rotation */
#define RAID10_FAR_COPIES_SHIFT		8	  /* raid10 # far copies shift (2nd byte of layout) */
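
/*
 * Encoding example (illustrative): a "near" layout with 2 copies is
 * (1 << RAID10_FAR_COPIES_SHIFT) | 2 = 0x102, i.e. near_copies = 2 in
 * the low byte, far_copies = 1 in the second byte and no offset or
 * far-sets bits set.
 */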

/* Return md raid10 near copies for @layout */
static unsigned int __raid10_near_copies(int layout)
{
	return layout & 0xFF;
}

/* Return md raid10 far copies for @layout */
static unsigned int __raid10_far_copies(int layout)
{
	return __raid10_near_copies(layout >> RAID10_FAR_COPIES_SHIFT);
}

/* Return true if md raid10 offset for @layout */
static bool __is_raid10_offset(int layout)
{
	return !!(layout & RAID10_OFFSET);
}

/* Return true if md raid10 near for @layout */
static bool __is_raid10_near(int layout)
{
	return !__is_raid10_offset(layout) && __raid10_near_copies(layout) > 1;
}

/* Return true if md raid10 far for @layout */
static bool __is_raid10_far(int layout)
{
	return !__is_raid10_offset(layout) && __raid10_far_copies(layout) > 1;
}

/* Return md raid10 layout string for @layout */
static const char *raid10_md_layout_to_format(int layout)
{
	/*
	 * Bit 16 stands for "offset"
	 * (i.e. adjacent stripes hold copies)
	 *
	 * Refer to MD's raid10.c for details
	 */
	if (__is_raid10_offset(layout))
		return "offset";

	if (__raid10_near_copies(layout) > 1)
		return "near";

	WARN_ON(__raid10_far_copies(layout) < 2);

	return "far";
}

/* Return md raid10 algorithm for @name */
static int raid10_name_to_format(const char *name)
{
	if (!strcasecmp(name, "near"))
		return ALGORITHM_RAID10_NEAR;
	else if (!strcasecmp(name, "offset"))
		return ALGORITHM_RAID10_OFFSET;
	else if (!strcasecmp(name, "far"))
		return ALGORITHM_RAID10_FAR;

	return -EINVAL;
}

/* Return md raid10 copies for @layout */
static unsigned int raid10_md_layout_to_copies(int layout)
{
	return max(__raid10_near_copies(layout), __raid10_far_copies(layout));
}

/* Return md raid10 format id for @format string */
static int raid10_format_to_md_layout(struct raid_set *rs,
				      unsigned int algorithm,
				      unsigned int copies)
{
	unsigned int n = 1, f = 1, r = 0;

	/*
	 * MD resilience flaw:
	 *
	 * enabling use_far_sets for far/offset formats causes copies
	 * to be colocated on the same devs together with their origins!
	 *
	 * -> disable it for now in the definition above
	 */
	if (algorithm == ALGORITHM_RAID10_DEFAULT ||
	    algorithm == ALGORITHM_RAID10_NEAR)
		n = copies;

	else if (algorithm == ALGORITHM_RAID10_OFFSET) {
		f = copies;
		r = RAID10_OFFSET;
		if (!test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags))
			r |= RAID10_USE_FAR_SETS;

	} else if (algorithm == ALGORITHM_RAID10_FAR) {
		f = copies;
		r = !RAID10_OFFSET;
		if (!test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags))
			r |= RAID10_USE_FAR_SETS;

	} else
		return -EINVAL;

	return r | (f << RAID10_FAR_COPIES_SHIFT) | n;
}
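
/*
 * For example (illustrative): "far" with 2 copies and without
 * raid10_use_near_sets set yields
 * RAID10_USE_FAR_SETS | (2 << RAID10_FAR_COPIES_SHIFT) | 1 = 0x40201.
 */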
/* END: MD raid10 bit definitions and helpers */

/* Check for any of the raid10 algorithms */
static bool __got_raid10(struct raid_type *rtp, const int layout)
{
	if (rtp->level == 10) {
		switch (rtp->algorithm) {
		case ALGORITHM_RAID10_DEFAULT:
		case ALGORITHM_RAID10_NEAR:
			return __is_raid10_near(layout);
		case ALGORITHM_RAID10_OFFSET:
			return __is_raid10_offset(layout);
		case ALGORITHM_RAID10_FAR:
			return __is_raid10_far(layout);
		default:
			break;
		}
	}
	return false;
}

/* Return raid_type for @name */
static struct raid_type *get_raid_type(const char *name)
{
	struct raid_type *rtp = raid_types + ARRAY_SIZE(raid_types);

	while (rtp-- > raid_types)
		if (!strcasecmp(rtp->name, name))
			return rtp;

	return NULL;
}

/* Return raid_type for @name based derived from @level and @layout */
static struct raid_type *get_raid_type_by_ll(const int level, const int layout)
{
	struct raid_type *rtp = raid_types + ARRAY_SIZE(raid_types);

	while (rtp-- > raid_types) {
		/* RAID10 special checks based on @layout flags/properties */
		if (rtp->level == level &&
		    (__got_raid10(rtp, layout) || rtp->algorithm == layout))
			return rtp;
	}

	return NULL;
}

/*
 * Conditionally change bdev capacity of @rs
 * in case of a disk add/remove reshape
 */
static void rs_set_capacity(struct raid_set *rs)
{
	struct mddev *mddev = &rs->md;
	struct md_rdev *rdev;
	struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table));

	/*
	 * raid10 sets rdev->sector to the device size, which
	 * is unintended in case of out-of-place reshaping
	 */
	rdev_for_each(rdev, mddev)
		rdev->sectors = mddev->dev_sectors;

	set_capacity(gendisk, mddev->array_sectors);
	revalidate_disk(gendisk);
}

/*
 * Set the mddev properties in @rs to the current
 * ones retrieved from the freshest superblock
 */
static void rs_set_cur(struct raid_set *rs)
{
	struct mddev *mddev = &rs->md;

	mddev->new_level = mddev->level;
	mddev->new_layout = mddev->layout;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
}

/*
 * Set the mddev properties in @rs to the new
 * ones requested by the ctr
 */
static void rs_set_new(struct raid_set *rs)
{
	struct mddev *mddev = &rs->md;

	mddev->level = mddev->new_level;
	mddev->layout = mddev->new_layout;
	mddev->chunk_sectors = mddev->new_chunk_sectors;
	mddev->raid_disks = rs->raid_disks;
	mddev->delta_disks = 0;
}

static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *raid_type,
				       unsigned raid_devs)
{
	unsigned i;
	struct raid_set *rs;

	if (raid_devs <= raid_type->parity_devs) {
		ti->error = "Insufficient number of devices";
		return ERR_PTR(-EINVAL);
	}

	rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
	if (!rs) {
		ti->error = "Cannot allocate raid context";
		return ERR_PTR(-ENOMEM);
	}

	mddev_init(&rs->md);

	rs->raid_disks = raid_devs;
	rs->delta_disks = 0;

	rs->ti = ti;
	rs->raid_type = raid_type;
	rs->stripe_cache_entries = 256;
	rs->md.raid_disks = raid_devs;
	rs->md.level = raid_type->level;
	rs->md.new_level = rs->md.level;
	rs->md.layout = raid_type->algorithm;
	rs->md.new_layout = rs->md.layout;
	rs->md.delta_disks = 0;
	rs->md.recovery_cp = MaxSector;

	for (i = 0; i < raid_devs; i++)
		md_rdev_init(&rs->dev[i].rdev);

	/*
	 * Remaining items to be initialized by further RAID params:
	 *  rs->md.persistent
	 *  rs->md.external
	 *  rs->md.chunk_sectors
	 *  rs->md.new_chunk_sectors
	 *  rs->md.dev_sectors
	 */

	return rs;
}

static void raid_set_free(struct raid_set *rs)
{
	int i;

	for (i = 0; i < rs->raid_disks; i++) {
		if (rs->dev[i].meta_dev)
			dm_put_device(rs->ti, rs->dev[i].meta_dev);
		md_rdev_clear(&rs->dev[i].rdev);
		if (rs->dev[i].data_dev)
			dm_put_device(rs->ti, rs->dev[i].data_dev);
	}

	kfree(rs);
}

/*
 * For every device we have two words
 *  <meta_dev>: meta device name or '-' if missing
 *  <data_dev>: data device name or '-' if missing
 *
 * The following are permitted:
 *    - -
 *    - <data_dev>
 *    <meta_dev> <data_dev>
 *
 * The following is not allowed:
 *    <meta_dev> -
 *
 * This code parses those words.  If there is a failure,
 * the caller must use raid_set_free() to unwind the operations.
 */
static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
{
	int i;
	int rebuild = 0;
	int metadata_available = 0;
	int r = 0;
	const char *arg;

	/* Put off the number of raid devices argument to get to dev pairs */
	arg = dm_shift_arg(as);
	if (!arg)
		return -EINVAL;

	for (i = 0; i < rs->raid_disks; i++) {
		rs->dev[i].rdev.raid_disk = i;

		rs->dev[i].meta_dev = NULL;
		rs->dev[i].data_dev = NULL;

		/*
		 * There are no offsets, since there is a separate device
		 * for data and metadata.
		 */
		rs->dev[i].rdev.data_offset = 0;
		rs->dev[i].rdev.mddev = &rs->md;

		arg = dm_shift_arg(as);
		if (!arg)
			return -EINVAL;

		if (strcmp(arg, "-")) {
			r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table),
					  &rs->dev[i].meta_dev);
			if (r) {
				rs->ti->error = "RAID metadata device lookup failure";
				return r;
			}

			rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
			if (!rs->dev[i].rdev.sb_page) {
				rs->ti->error = "Failed to allocate superblock page";
				return -ENOMEM;
			}
		}

		arg = dm_shift_arg(as);
		if (!arg)
			return -EINVAL;

		if (!strcmp(arg, "-")) {
			if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
			    (!rs->dev[i].rdev.recovery_offset)) {
				rs->ti->error = "Drive designated for rebuild not specified";
				return -EINVAL;
			}

			if (rs->dev[i].meta_dev) {
				rs->ti->error = "No data device supplied with metadata device";
				return -EINVAL;
			}
			continue;
		}

		r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table),
				  &rs->dev[i].data_dev);
		if (r) {
			rs->ti->error = "RAID device lookup failure";
			return r;
		}

		if (rs->dev[i].meta_dev) {
			metadata_available = 1;
			rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev;
		}
		rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
		list_add_tail(&rs->dev[i].rdev.same_set, &rs->md.disks);
		if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
			rebuild++;
	}

	if (metadata_available) {
		rs->md.external = 0;
		rs->md.persistent = 1;
		rs->md.major_version = 2;
	} else if (rebuild && !rs->md.recovery_cp) {
		/*
		 * Without metadata, we will not be able to tell if the array
		 * is in-sync or not - we must assume it is not.  Therefore,
		 * it is impossible to rebuild a drive.
		 *
		 * Even if there is metadata, the on-disk information may
		 * indicate that the array is not in-sync and it will then
		 * fail at that time.
		 *
		 * User could specify 'nosync' option if desperate.
		 */
		rs->ti->error = "Unable to rebuild drive while array is not in-sync";
		return -EINVAL;
	}

	return 0;
}

/*
 * validate_region_size
 * @rs
 * @region_size:  region size in sectors.  If 0, pick a size (4MiB default).
 *
 * Set rs->md.bitmap_info.chunksize (which really refers to 'region size').
 * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap.
 *
 * Returns: 0 on success, -EINVAL on failure.
 */
static int validate_region_size(struct raid_set *rs, unsigned long region_size)
{
	unsigned long min_region_size = rs->ti->len / (1 << 21);

	if (!region_size) {
		/*
		 * Choose a reasonable default. All figures in sectors.
		 */
		if (min_region_size > (1 << 13)) {
			/* If not a power of 2, make it the next power of 2 */
			region_size = roundup_pow_of_two(min_region_size);
			DMINFO("Choosing default region size of %lu sectors",
			       region_size);
		} else {
			DMINFO("Choosing default region size of 4MiB");
			region_size = 1 << 13; /* sectors */
		}
	} else {
		/*
		 * Validate user-supplied value.
		 */
		if (region_size > rs->ti->len) {
			rs->ti->error = "Supplied region size is too large";
			return -EINVAL;
		}

		if (region_size < min_region_size) {
			DMERR("Supplied region_size (%lu sectors) below minimum (%lu)",
			      region_size, min_region_size);
			rs->ti->error = "Supplied region size is too small";
			return -EINVAL;
		}

		if (!is_power_of_2(region_size)) {
			rs->ti->error = "Region size is not a power of 2";
			return -EINVAL;
		}

		if (region_size < rs->md.chunk_sectors) {
			rs->ti->error = "Region size is smaller than the chunk size";
			return -EINVAL;
		}
	}

	/*
	 * Convert sectors to bytes.
	 */
	rs->md.bitmap_info.chunksize = (region_size << 9);

	return 0;
}
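
/*
 * Worked example (illustrative): for a 1 TiB target (2^31 sectors),
 * min_region_size = 2^31 / 2^21 = 1024 sectors, which is below 2^13,
 * so the 4 MiB (8192-sector) default region size is chosen.
 */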

/*
 * validate_raid_redundancy
 * @rs
 *
 * Determine if there are enough devices in the array that haven't
 * failed (or are being rebuilt) to form a usable array.
 *
 * Returns: 0 on success, -EINVAL on failure.
 */
static int validate_raid_redundancy(struct raid_set *rs)
{
	unsigned i, rebuild_cnt = 0;
	unsigned rebuilds_per_group = 0, copies;
	unsigned group_size, last_group_start;

	for (i = 0; i < rs->md.raid_disks; i++)
		if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
		    !rs->dev[i].rdev.sb_page)
			rebuild_cnt++;

	switch (rs->raid_type->level) {
	case 1:
		if (rebuild_cnt >= rs->md.raid_disks)
			goto too_many;
		break;
	case 4:
	case 5:
	case 6:
		if (rebuild_cnt > rs->raid_type->parity_devs)
			goto too_many;
		break;
	case 10:
		copies = raid10_md_layout_to_copies(rs->md.new_layout);
		if (rebuild_cnt < copies)
			break;

		/*
		 * It is possible to have a higher rebuild count for RAID10,
		 * as long as the failed devices occur in different mirror
		 * groups (i.e. different stripes).
		 *
		 * When checking "near" format, make sure no adjacent devices
		 * have failed beyond what can be handled.  In addition to the
		 * simple case where the number of devices is a multiple of the
		 * number of copies, we must also handle cases where the number
		 * of devices is not a multiple of the number of copies.
		 * E.g.	   dev1 dev2 dev3 dev4 dev5
		 *	    A	 A    B	   B	C
		 *	    C	 D    D	   E	E
		 */
		if (__is_raid10_near(rs->md.new_layout)) {
			for (i = 0; i < rs->md.raid_disks; i++) {
				if (!(i % copies))
					rebuilds_per_group = 0;
				if ((!rs->dev[i].rdev.sb_page ||
				    !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
				    (++rebuilds_per_group >= copies))
					goto too_many;
			}
			break;
		}

		/*
		 * When checking "far" and "offset" formats, we need to ensure
		 * that the device that holds its copy is not also dead or
		 * being rebuilt.  (Note that "far" and "offset" formats only
		 * support two copies right now.  These formats also only ever
		 * use the 'use_far_sets' variant.)
		 *
		 * This check is somewhat complicated by the need to account
		 * for arrays that are not a multiple of (far) copies.	This
		 * results in the need to treat the last (potentially larger)
		 * set differently.
		 */
		group_size = (rs->md.raid_disks / copies);
		last_group_start = (rs->md.raid_disks / group_size) - 1;
		last_group_start *= group_size;
		for (i = 0; i < rs->md.raid_disks; i++) {
			if (!(i % copies) && !(i > last_group_start))
				rebuilds_per_group = 0;
			if ((!rs->dev[i].rdev.sb_page ||
			     !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
			    (++rebuilds_per_group >= copies))
				goto too_many;
		}
		break;
	default:
		if (rebuild_cnt)
			return -EINVAL;
	}

	return 0;

too_many:
	return -EINVAL;
}
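
/*
 * Note on the "near" check above (illustrative): with copies = 2 the
 * per-group rebuild counter resets at every even device index, so two
 * failed devices falling into the same pair make the check fail.
 */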

/*
 * Possible arguments are...
 *	<chunk_size> [optional_args]
 *
 * Argument definitions
 *    <chunk_size>			The number of sectors per disk that
 *					will form the "stripe"
 *    [[no]sync]			Force or prevent recovery of the
 *					entire array
 *    [rebuild <idx>]			Rebuild the drive indicated by the index
 *    [daemon_sleep <ms>]		Time between bitmap daemon work to
 *					clear bits
 *    [min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
 *    [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
 *    [write_mostly <idx>]		Indicate a write mostly drive via index
 *    [max_write_behind <sectors>]	See '-write-behind=' (man mdadm)
 *    [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
 *    [region_size <sectors>]		Defines granularity of bitmap
 *
 * RAID10-only options:
 *    [raid10_copies <# copies>]	Number of copies.  (Default: 2)
 *    [raid10_format <near|far|offset>] Layout algorithm.  (Default: near)
 */
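/*
 * Illustrative example of a ctr table line using these arguments:
 *   0 409600 raid raid5_ls 3 64 region_size 1024 3 - /dev/sda1 - /dev/sdb1 - /dev/sdc1
 */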
static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
			     unsigned num_raid_params)
{
	int value, raid10_format = ALGORITHM_RAID10_DEFAULT;
	unsigned raid10_copies = 2;
	unsigned i, write_mostly = 0;
	unsigned region_size = 0;
	sector_t max_io_len;
	const char *arg, *key;
	struct raid_dev *rd;
	struct raid_type *rt = rs->raid_type;

	arg = dm_shift_arg(as);
	num_raid_params--; /* Account for chunk_size argument */

	if (kstrtoint(arg, 10, &value) < 0) {
		rs->ti->error = "Bad numerical argument given for chunk_size";
		return -EINVAL;
	}

	/*
	 * First, parse the in-order required arguments
	 * "chunk_size" is the only argument of this type.
	 */
	if (rt_is_raid1(rt)) {
		if (value)
			DMERR("Ignoring chunk size parameter for RAID 1");
		value = 0;
	} else if (!is_power_of_2(value)) {
		rs->ti->error = "Chunk size must be a power of 2";
		return -EINVAL;
	} else if (value < 8) {
		rs->ti->error = "Chunk size value is too small";
		return -EINVAL;
	}

	rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;

	/*
	 * We set each individual device as In_sync with a completed
	 * 'recovery_offset'.  If there has been a device failure or
	 * replacement then one of the following cases applies:
	 *
	 *   1) User specifies 'rebuild'.
	 *	- Device is reset when param is read.
	 *   2) A new device is supplied.
	 *	- No matching superblock found, resets device.
	 *   3) Device failure was transient and returns on reload.
	 *	- Failure noticed, resets device for bitmap replay.
	 *   4) Device hadn't completed recovery after previous failure.
	 *	- Superblock is read and overrides recovery_offset.
	 *
	 * What is found in the superblocks of the devices is always
	 * authoritative, unless 'rebuild' or '[no]sync' was specified.
	 */
	for (i = 0; i < rs->raid_disks; i++) {
		set_bit(In_sync, &rs->dev[i].rdev.flags);
		rs->dev[i].rdev.recovery_offset = MaxSector;
	}

	/*
	 * Second, parse the unordered optional arguments
	 */
	for (i = 0; i < num_raid_params; i++) {
		key = dm_shift_arg(as);
		if (!key) {
			rs->ti->error = "Not enough raid parameters given";
			return -EINVAL;
		}
		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC))) {
			if (test_and_set_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) {
				rs->ti->error = "Only one 'nosync' argument allowed";
				return -EINVAL;
			}
			continue;
		}
		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_SYNC))) {
			if (test_and_set_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) {
				rs->ti->error = "Only one 'sync' argument allowed";
				return -EINVAL;
			}
			continue;
		}
		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_USE_NEAR_SETS))) {
			if (test_and_set_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags)) {
				rs->ti->error = "Only one 'raid10_use_near_sets' argument allowed";
				return -EINVAL;
			}
			continue;
		}

		arg = dm_shift_arg(as);
		i++; /* Account for the argument pairs */
		if (!arg) {
			rs->ti->error = "Wrong number of raid parameters given";
			return -EINVAL;
		}

		/*
		 * Parameters that take a string value are checked here.
		 */

		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT))) {
			if (test_and_set_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) {
				rs->ti->error = "Only one 'raid10_format' argument pair allowed";
				return -EINVAL;
			}
			if (!rt_is_raid10(rt)) {
				rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
				return -EINVAL;
			}
			raid10_format = raid10_name_to_format(arg);
			if (raid10_format < 0) {
				rs->ti->error = "Invalid 'raid10_format' value given";
				return raid10_format;
			}
			continue;
		}

		if (kstrtoint(arg, 10, &value) < 0) {
			rs->ti->error = "Bad numerical argument given in raid params";
			return -EINVAL;
		}
		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD))) {
			/*
			 * "rebuild" is being passed in by userspace to provide
			 * indexes of replaced devices and to set up additional
			 * devices on raid level takeover.
			 */
			if (!__within_range(value, 0, rs->raid_disks - 1)) {
				rs->ti->error = "Invalid rebuild index given";
				return -EINVAL;
			}

			if (test_and_set_bit(value, (void *) rs->rebuild_disks)) {
				rs->ti->error = "rebuild for this index already given";
				return -EINVAL;
			}

			rd = rs->dev + value;
			clear_bit(In_sync, &rd->rdev.flags);
			clear_bit(Faulty, &rd->rdev.flags);
			rd->rdev.recovery_offset = 0;