/*
 * Copyright (C) 2010-2011 Neil Brown
 * Copyright (C) 2010-2016 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/slab.h>
#include <linux/module.h>

#include "md.h"
#include "raid1.h"
#include "raid5.h"
#include "raid10.h"
#include "bitmap.h"

#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "raid"
#define	MAX_RAID_DEVICES	253 /* md-raid kernel limit */

/*
 * Minimum sectors of free reshape space per raid device
 */
#define	MIN_FREE_RESHAPE_SPACE to_sector(4*4096)

static bool devices_handle_discard_safely = false;

/*
 * The following flags are used by dm-raid.c to set up the array state.
 * They must be cleared before md_run is called.
 */
#define FirstUse 10		/* rdev flag */

struct raid_dev {
	/*
	 * Two DM devices, one to hold metadata and one to hold the
	 * actual data/parity.	The reason for this is to not confuse
	 * ti->len and give more flexibility in altering size and
	 * characteristics.
	 *
	 * While it is possible for this device to be associated
	 * with a different physical device than the data_dev, it
	 * is intended for it to be the same.
	 *    |--------- Physical Device ---------|
	 *    |- meta_dev -|------ data_dev ------|
	 */
	struct dm_dev *meta_dev;
	struct dm_dev *data_dev;
	struct md_rdev rdev;
};

/*
 * Bits for establishing rs->ctr_flags
 *
 * 1 = no flag value
 * 2 = flag with value
 */
#define __CTR_FLAG_SYNC			0  /* 1 */ /* Not with raid0! */
#define __CTR_FLAG_NOSYNC		1  /* 1 */ /* Not with raid0! */
#define __CTR_FLAG_REBUILD		2  /* 2 */ /* Not with raid0! */
#define __CTR_FLAG_DAEMON_SLEEP		3  /* 2 */ /* Not with raid0! */
#define __CTR_FLAG_MIN_RECOVERY_RATE	4  /* 2 */ /* Not with raid0! */
#define __CTR_FLAG_MAX_RECOVERY_RATE	5  /* 2 */ /* Not with raid0! */
#define __CTR_FLAG_MAX_WRITE_BEHIND	6  /* 2 */ /* Only with raid1! */
#define __CTR_FLAG_WRITE_MOSTLY	7  /* 2 */ /* Only with raid1! */
#define __CTR_FLAG_STRIPE_CACHE	8  /* 2 */ /* Only with raid4/5/6! */
#define __CTR_FLAG_REGION_SIZE		9  /* 2 */ /* Not with raid0! */
#define __CTR_FLAG_RAID10_COPIES	10 /* 2 */ /* Only with raid10 */
#define __CTR_FLAG_RAID10_FORMAT	11 /* 2 */ /* Only with raid10 */
/* New for v1.9.0 */
#define __CTR_FLAG_DELTA_DISKS		12 /* 2 */ /* Only with reshapable raid4/5/6/10! */
#define __CTR_FLAG_DATA_OFFSET		13 /* 2 */ /* Only with reshapable raid4/5/6/10! */
#define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */

/*
 * Flags for rs->ctr_flags field.
 */
#define CTR_FLAG_SYNC			(1 << __CTR_FLAG_SYNC)
#define CTR_FLAG_NOSYNC			(1 << __CTR_FLAG_NOSYNC)
#define CTR_FLAG_REBUILD		(1 << __CTR_FLAG_REBUILD)
#define CTR_FLAG_DAEMON_SLEEP		(1 << __CTR_FLAG_DAEMON_SLEEP)
#define CTR_FLAG_MIN_RECOVERY_RATE	(1 << __CTR_FLAG_MIN_RECOVERY_RATE)
#define CTR_FLAG_MAX_RECOVERY_RATE	(1 << __CTR_FLAG_MAX_RECOVERY_RATE)
#define CTR_FLAG_MAX_WRITE_BEHIND	(1 << __CTR_FLAG_MAX_WRITE_BEHIND)
#define CTR_FLAG_WRITE_MOSTLY		(1 << __CTR_FLAG_WRITE_MOSTLY)
#define CTR_FLAG_STRIPE_CACHE		(1 << __CTR_FLAG_STRIPE_CACHE)
#define CTR_FLAG_REGION_SIZE		(1 << __CTR_FLAG_REGION_SIZE)
#define CTR_FLAG_RAID10_COPIES		(1 << __CTR_FLAG_RAID10_COPIES)
#define CTR_FLAG_RAID10_FORMAT		(1 << __CTR_FLAG_RAID10_FORMAT)
#define CTR_FLAG_DELTA_DISKS		(1 << __CTR_FLAG_DELTA_DISKS)
#define CTR_FLAG_DATA_OFFSET		(1 << __CTR_FLAG_DATA_OFFSET)
#define CTR_FLAG_RAID10_USE_NEAR_SETS	(1 << __CTR_FLAG_RAID10_USE_NEAR_SETS)
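
/*
 * Note (informational): the __CTR_FLAG_* bit numbers above are used
 * directly with the test_bit()/test_and_set_bit() family on
 * rs->ctr_flags during option parsing (see parse_raid_params() below),
 * while the CTR_FLAG_* masks are combined for the per-level validity
 * checks that follow, e.g.:
 *
 *	if (test_and_set_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
 *		... reject a duplicate 'nosync' argument ...
 */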

/*
 * Definitions of various constructor flags to
 * be used in checks of valid / invalid flags
 * per raid level.
 */
/* Define any/all "sync" flags */
#define	CTR_FLAGS_ANY_SYNC		(CTR_FLAG_SYNC | CTR_FLAG_NOSYNC)

/* Define flags for options without argument (e.g. 'nosync') */
#define	CTR_FLAG_OPTIONS_NO_ARGS	(CTR_FLAGS_ANY_SYNC | \
					 CTR_FLAG_RAID10_USE_NEAR_SETS)

/* Define flags for options with one argument (e.g. 'delta_disks +2') */
#define CTR_FLAG_OPTIONS_ONE_ARG (CTR_FLAG_REBUILD | \
				  CTR_FLAG_WRITE_MOSTLY | \
				  CTR_FLAG_DAEMON_SLEEP | \
				  CTR_FLAG_MIN_RECOVERY_RATE | \
				  CTR_FLAG_MAX_RECOVERY_RATE | \
				  CTR_FLAG_MAX_WRITE_BEHIND | \
				  CTR_FLAG_STRIPE_CACHE | \
				  CTR_FLAG_REGION_SIZE | \
				  CTR_FLAG_RAID10_COPIES | \
				  CTR_FLAG_RAID10_FORMAT | \
				  CTR_FLAG_DELTA_DISKS | \
				  CTR_FLAG_DATA_OFFSET)

/* Valid options definitions per raid level... */

/* "raid0" only accepts a data offset */
#define RAID0_VALID_FLAGS	(CTR_FLAG_DATA_OFFSET)

/* "raid1" does not accept stripe cache, data offset, delta_disks or any raid10 options */
#define RAID1_VALID_FLAGS	(CTR_FLAGS_ANY_SYNC | \
				 CTR_FLAG_REBUILD | \
				 CTR_FLAG_WRITE_MOSTLY | \
				 CTR_FLAG_DAEMON_SLEEP | \
				 CTR_FLAG_MIN_RECOVERY_RATE | \
				 CTR_FLAG_MAX_RECOVERY_RATE | \
				 CTR_FLAG_MAX_WRITE_BEHIND | \
				 CTR_FLAG_REGION_SIZE | \
				 CTR_FLAG_DATA_OFFSET)

/* "raid10" does not accept any raid1 or stripe cache options */
#define RAID10_VALID_FLAGS	(CTR_FLAGS_ANY_SYNC | \
				 CTR_FLAG_REBUILD | \
				 CTR_FLAG_DAEMON_SLEEP | \
				 CTR_FLAG_MIN_RECOVERY_RATE | \
				 CTR_FLAG_MAX_RECOVERY_RATE | \
				 CTR_FLAG_REGION_SIZE | \
				 CTR_FLAG_RAID10_COPIES | \
				 CTR_FLAG_RAID10_FORMAT | \
				 CTR_FLAG_DELTA_DISKS | \
				 CTR_FLAG_DATA_OFFSET | \
				 CTR_FLAG_RAID10_USE_NEAR_SETS)

/*
 * "raid4/5/6" do not accept any raid1 or raid10 specific options
 *
 * "raid6" does not accept "nosync", because it is not guaranteed
 * that both parity and q-syndrome are being written properly with
 * any writes
 */
#define RAID45_VALID_FLAGS	(CTR_FLAGS_ANY_SYNC | \
				 CTR_FLAG_REBUILD | \
				 CTR_FLAG_DAEMON_SLEEP | \
				 CTR_FLAG_MIN_RECOVERY_RATE | \
				 CTR_FLAG_MAX_RECOVERY_RATE | \
				 CTR_FLAG_MAX_WRITE_BEHIND | \
				 CTR_FLAG_STRIPE_CACHE | \
				 CTR_FLAG_REGION_SIZE | \
				 CTR_FLAG_DELTA_DISKS | \
				 CTR_FLAG_DATA_OFFSET)

#define RAID6_VALID_FLAGS	(CTR_FLAG_SYNC | \
				 CTR_FLAG_REBUILD | \
				 CTR_FLAG_DAEMON_SLEEP | \
				 CTR_FLAG_MIN_RECOVERY_RATE | \
				 CTR_FLAG_MAX_RECOVERY_RATE | \
				 CTR_FLAG_MAX_WRITE_BEHIND | \
				 CTR_FLAG_STRIPE_CACHE | \
				 CTR_FLAG_REGION_SIZE | \
				 CTR_FLAG_DELTA_DISKS | \
				 CTR_FLAG_DATA_OFFSET)
/* ...valid options definitions per raid level */

/*
 * Flags for rs->runtime_flags field
 * (RT_FLAG prefix meaning "runtime flag")
 *
 * These are all internal and used to define runtime state,
 * e.g. to prevent another resume from preresume processing
 * the raid set all over again.
 */
#define RT_FLAG_RS_PRERESUMED		0
#define RT_FLAG_RS_RESUMED		1
#define RT_FLAG_RS_BITMAP_LOADED	2
#define RT_FLAG_UPDATE_SBS		3
#define RT_FLAG_RESHAPE_RS		4
#define RT_FLAG_KEEP_RS_FROZEN		5

/* Array elements of 64 bit needed for rebuild/write_mostly bits */
#define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
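
/*
 * Worked example (informational): with MAX_RAID_DEVICES = 253 and 64-bit
 * words, the expression above evaluates to (253 + 63) / 8 / 8 = 4 array
 * elements, i.e. 4 * 64 = 256 bits -- enough for one bit per possible
 * raid device.
 */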

/*
 * raid set level, layout and chunk sectors backup/restore
 */
struct rs_layout {
	int new_level;
	int new_layout;
	int new_chunk_sectors;
};

struct raid_set {
	struct dm_target *ti;

	uint32_t bitmap_loaded;
	uint32_t stripe_cache_entries;
	unsigned long ctr_flags;
	unsigned long runtime_flags;

	uint64_t rebuild_disks[DISKS_ARRAY_ELEMS];

	int raid_disks;
	int delta_disks;
	int data_offset;
	int raid10_copies;
	int requested_bitmap_chunk_sectors;

	struct mddev md;
	struct raid_type *raid_type;
	struct dm_target_callbacks callbacks;

	struct raid_dev dev[0];
};

static void rs_config_backup(struct raid_set *rs, struct rs_layout *l)
{
	struct mddev *mddev = &rs->md;

	l->new_level = mddev->new_level;
	l->new_layout = mddev->new_layout;
	l->new_chunk_sectors = mddev->new_chunk_sectors;
}

static void rs_config_restore(struct raid_set *rs, struct rs_layout *l)
{
	struct mddev *mddev = &rs->md;

	mddev->new_level = l->new_level;
	mddev->new_layout = l->new_layout;
	mddev->new_chunk_sectors = l->new_chunk_sectors;
}

/* raid10 algorithms (i.e. formats) */
#define	ALGORITHM_RAID10_DEFAULT	0
#define	ALGORITHM_RAID10_NEAR		1
#define	ALGORITHM_RAID10_OFFSET		2
#define	ALGORITHM_RAID10_FAR		3

/* Supported raid types and properties. */
static struct raid_type {
	const char *name;		/* RAID algorithm. */
	const char *descr;		/* Descriptor text for logging. */
	const unsigned parity_devs;	/* # of parity devices. */
	const unsigned minimal_devs;	/* minimal # of devices in set. */
	const unsigned level;		/* RAID level. */
	const unsigned algorithm;	/* RAID algorithm. */
} raid_types[] = {
	{"raid0",	  "raid0 (striping)",			    0, 2, 0,  0 /* NONE */},
	{"raid1",	  "raid1 (mirroring)",			    0, 2, 1,  0 /* NONE */},
	{"raid10_far",	  "raid10 far (striped mirrors)",	    0, 2, 10, ALGORITHM_RAID10_FAR},
	{"raid10_offset", "raid10 offset (striped mirrors)",	    0, 2, 10, ALGORITHM_RAID10_OFFSET},
	{"raid10_near",	  "raid10 near (striped mirrors)",	    0, 2, 10, ALGORITHM_RAID10_NEAR},
	{"raid10",	  "raid10 (striped mirrors)",		    0, 2, 10, ALGORITHM_RAID10_DEFAULT},
	{"raid4",	  "raid4 (dedicated last parity disk)",	    1, 2, 4,  ALGORITHM_PARITY_N}, /* raid4 layout = raid5_n */
	{"raid5_n",	  "raid5 (dedicated last parity disk)",	    1, 2, 5,  ALGORITHM_PARITY_N},
	{"raid5_ls",	  "raid5 (left symmetric)",		    1, 2, 5,  ALGORITHM_LEFT_SYMMETRIC},
	{"raid5_rs",	  "raid5 (right symmetric)",		    1, 2, 5,  ALGORITHM_RIGHT_SYMMETRIC},
	{"raid5_la",	  "raid5 (left asymmetric)",		    1, 2, 5,  ALGORITHM_LEFT_ASYMMETRIC},
	{"raid5_ra",	  "raid5 (right asymmetric)",		    1, 2, 5,  ALGORITHM_RIGHT_ASYMMETRIC},
	{"raid6_zr",	  "raid6 (zero restart)",		    2, 4, 6,  ALGORITHM_ROTATING_ZERO_RESTART},
	{"raid6_nr",	  "raid6 (N restart)",			    2, 4, 6,  ALGORITHM_ROTATING_N_RESTART},
	{"raid6_nc",	  "raid6 (N continue)",			    2, 4, 6,  ALGORITHM_ROTATING_N_CONTINUE},
	{"raid6_n_6",	  "raid6 (dedicated parity/Q n/6)",	    2, 4, 6,  ALGORITHM_PARITY_N_6},
	{"raid6_ls_6",	  "raid6 (left symmetric dedicated Q 6)",   2, 4, 6,  ALGORITHM_LEFT_SYMMETRIC_6},
	{"raid6_rs_6",	  "raid6 (right symmetric dedicated Q 6)",  2, 4, 6,  ALGORITHM_RIGHT_SYMMETRIC_6},
	{"raid6_la_6",	  "raid6 (left asymmetric dedicated Q 6)",  2, 4, 6,  ALGORITHM_LEFT_ASYMMETRIC_6},
	{"raid6_ra_6",	  "raid6 (right asymmetric dedicated Q 6)", 2, 4, 6,  ALGORITHM_RIGHT_ASYMMETRIC_6}
};

/* True, if @v is in inclusive range [@min, @max] */
static bool __within_range(long v, long min, long max)
{
	return v >= min && v <= max;
}

/* All table line arguments are defined here */
static struct arg_name_flag {
	const unsigned long flag;
	const char *name;
} __arg_name_flags[] = {
	{ CTR_FLAG_SYNC, "sync"},
	{ CTR_FLAG_NOSYNC, "nosync"},
	{ CTR_FLAG_REBUILD, "rebuild"},
	{ CTR_FLAG_DAEMON_SLEEP, "daemon_sleep"},
	{ CTR_FLAG_MIN_RECOVERY_RATE, "min_recovery_rate"},
	{ CTR_FLAG_MAX_RECOVERY_RATE, "max_recovery_rate"},
	{ CTR_FLAG_MAX_WRITE_BEHIND, "max_write_behind"},
	{ CTR_FLAG_WRITE_MOSTLY, "writemostly"},
	{ CTR_FLAG_STRIPE_CACHE, "stripe_cache"},
	{ CTR_FLAG_REGION_SIZE, "region_size"},
	{ CTR_FLAG_RAID10_COPIES, "raid10_copies"},
	{ CTR_FLAG_RAID10_FORMAT, "raid10_format"},
	{ CTR_FLAG_DATA_OFFSET, "data_offset"},
	{ CTR_FLAG_DELTA_DISKS, "delta_disks"},
	{ CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"},
};

/* Return argument name string for given @flag */
static const char *dm_raid_arg_name_by_flag(const uint32_t flag)
{
	if (hweight32(flag) == 1) {
		struct arg_name_flag *anf = __arg_name_flags + ARRAY_SIZE(__arg_name_flags);

		while (anf-- > __arg_name_flags)
			if (flag & anf->flag)
				return anf->name;

	} else
		DMERR("%s called with more than one flag!", __func__);

	return NULL;
}
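
/*
 * Example (informational): dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD)
 * returns "rebuild"; passing a value with more than one bit set (e.g.
 * CTR_FLAGS_ANY_SYNC) logs an error via DMERR() and returns NULL.
 */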

/*
 * bool helpers to test for various raid levels of a raid set,
 * i.e. its level as reported by the superblock rather than
 * the requested raid_type passed to the constructor.
 */
/* Return true, if raid set in @rs is raid0 */
static bool rs_is_raid0(struct raid_set *rs)
{
	return !rs->md.level;
}

/* Return true, if raid set in @rs is raid1 */
static bool rs_is_raid1(struct raid_set *rs)
{
	return rs->md.level == 1;
}

/* Return true, if raid set in @rs is raid10 */
static bool rs_is_raid10(struct raid_set *rs)
{
	return rs->md.level == 10;
}

/* Return true, if raid set in @rs is level 4, 5 or 6 */
static bool rs_is_raid456(struct raid_set *rs)
{
	return __within_range(rs->md.level, 4, 6);
}

/* Return true, if raid set in @rs is reshapable */
static unsigned int __is_raid10_far(int layout);
static bool rs_is_reshapable(struct raid_set *rs)
{
	return rs_is_raid456(rs) ||
	       (rs_is_raid10(rs) && !__is_raid10_far(rs->md.new_layout));
}

/* Return true, if raid set in @rs is recovering */
static bool rs_is_recovering(struct raid_set *rs)
{
	return rs->md.recovery_cp != MaxSector;
}

/* Return true, if raid set in @rs is reshaping */
static bool rs_is_reshaping(struct raid_set *rs)
{
	return rs->md.reshape_position != MaxSector;
}

/*
 * bool helpers to test for various raid levels of a raid type
 */

/* Return true, if raid type in @rt is raid0 */
static bool rt_is_raid0(struct raid_type *rt)
{
	return !rt->level;
}

/* Return true, if raid type in @rt is raid1 */
static bool rt_is_raid1(struct raid_type *rt)
{
	return rt->level == 1;
}

/* Return true, if raid type in @rt is raid10 */
static bool rt_is_raid10(struct raid_type *rt)
{
	return rt->level == 10;
}

/* Return true, if raid type in @rt is raid4/5 */
static bool rt_is_raid45(struct raid_type *rt)
{
	return __within_range(rt->level, 4, 5);
}

/* Return true, if raid type in @rt is raid6 */
static bool rt_is_raid6(struct raid_type *rt)
{
	return rt->level == 6;
}

/* Return true, if raid type in @rt is raid4/5/6 */
static bool rt_is_raid456(struct raid_type *rt)
{
	return __within_range(rt->level, 4, 6);
}
/* END: raid level bools */

/* Return valid ctr flags for the raid level of @rs */
static unsigned long __valid_flags(struct raid_set *rs)
{
	if (rt_is_raid0(rs->raid_type))
		return RAID0_VALID_FLAGS;
	else if (rt_is_raid1(rs->raid_type))
		return RAID1_VALID_FLAGS;
	else if (rt_is_raid10(rs->raid_type))
		return RAID10_VALID_FLAGS;
	else if (rt_is_raid45(rs->raid_type))
		return RAID45_VALID_FLAGS;
	else if (rt_is_raid6(rs->raid_type))
		return RAID6_VALID_FLAGS;

	return ~0;
}

/*
 * Check for valid flags set on @rs
 *
 * Has to be called after parsing of the ctr flags!
 */
static int rs_check_for_valid_flags(struct raid_set *rs)
{
	if (rs->ctr_flags & ~__valid_flags(rs)) {
		rs->ti->error = "Invalid flags combination";
		return -EINVAL;
	}

	return 0;
}

/* MD raid10 bit definitions and helpers */
#define RAID10_OFFSET			(1 << 16) /* stripes with data copies are adjacent on devices */
#define RAID10_BROCKEN_USE_FAR_SETS	(1 << 17) /* Broken in raid10.c: use sets instead of whole stripe rotation */
#define RAID10_USE_FAR_SETS		(1 << 18) /* Use sets instead of whole stripe rotation */
#define RAID10_FAR_COPIES_SHIFT		8	  /* raid10 # far copies shift (2nd byte of layout) */

/* Return md raid10 near copies for @layout */
static unsigned int __raid10_near_copies(int layout)
{
	return layout & 0xFF;
}

/* Return md raid10 far copies for @layout */
static unsigned int __raid10_far_copies(int layout)
{
	return __raid10_near_copies(layout >> RAID10_FAR_COPIES_SHIFT);
}

/* Return true if md raid10 offset for @layout */
static unsigned int __is_raid10_offset(int layout)
{
	return layout & RAID10_OFFSET;
}

/* Return true if md raid10 near for @layout */
static unsigned int __is_raid10_near(int layout)
{
	return !__is_raid10_offset(layout) && __raid10_near_copies(layout) > 1;
}

/* Return true if md raid10 far for @layout */
static unsigned int __is_raid10_far(int layout)
{
	return !__is_raid10_offset(layout) && __raid10_far_copies(layout) > 1;
}

/* Return md raid10 layout string for @layout */
static const char *raid10_md_layout_to_format(int layout)
{
	/*
	 * Bit 16 stands for "offset"
	 * (i.e. adjacent stripes hold copies)
	 *
	 * Refer to MD's raid10.c for details
	 */
	if (__is_raid10_offset(layout))
		return "offset";

	if (__raid10_near_copies(layout) > 1)
		return "near";

	WARN_ON(__raid10_far_copies(layout) < 2);

	return "far";
}

/* Return md raid10 algorithm for @name */
static int raid10_name_to_format(const char *name)
{
	if (!strcasecmp(name, "near"))
		return ALGORITHM_RAID10_NEAR;
	else if (!strcasecmp(name, "offset"))
		return ALGORITHM_RAID10_OFFSET;
	else if (!strcasecmp(name, "far"))
		return ALGORITHM_RAID10_FAR;

	return -EINVAL;
}

/* Return md raid10 copies for @layout */
static unsigned int raid10_md_layout_to_copies(int layout)
{
	return __raid10_near_copies(layout) > 1 ?
		__raid10_near_copies(layout) : __raid10_far_copies(layout);
}

/* Return md raid10 layout for @algorithm and @copies */
static int raid10_format_to_md_layout(struct raid_set *rs,
				      unsigned int algorithm,
				      unsigned int copies)
{
	unsigned int n = 1, f = 1, r = 0;

	/*
	 * MD resilience flaw:
	 *
	 * enabling use_far_sets for far/offset formats causes copies
	 * to be colocated on the same devs together with their origins!
	 *
	 * -> disable it for now in the definition above
	 */
	if (algorithm == ALGORITHM_RAID10_DEFAULT ||
	    algorithm == ALGORITHM_RAID10_NEAR)
		n = copies;

	else if (algorithm == ALGORITHM_RAID10_OFFSET) {
		f = copies;
		r = RAID10_OFFSET;
		if (!test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags))
			r |= RAID10_USE_FAR_SETS;

	} else if (algorithm == ALGORITHM_RAID10_FAR) {
		f = copies;
		r = !RAID10_OFFSET;
		if (!test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags))
			r |= RAID10_USE_FAR_SETS;

	} else
		return -EINVAL;

	return r | (f << RAID10_FAR_COPIES_SHIFT) | n;
}
/* END: MD raid10 bit definitions and helpers */
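
/*
 * Worked example (informational, derived from the helpers above): a
 * "near" format with 2 copies yields n = 2, f = 1, r = 0, i.e. layout
 * 0x102; "offset" with 2 copies (with use_far_sets in effect) yields
 * RAID10_OFFSET | RAID10_USE_FAR_SETS | (2 << 8) | 1 = 0x50201.
 */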

/* Check for any of the raid10 algorithms */
static int __got_raid10(struct raid_type *rtp, const int layout)
{
	if (rtp->level == 10) {
		switch (rtp->algorithm) {
		case ALGORITHM_RAID10_DEFAULT:
		case ALGORITHM_RAID10_NEAR:
			return __is_raid10_near(layout);
		case ALGORITHM_RAID10_OFFSET:
			return __is_raid10_offset(layout);
		case ALGORITHM_RAID10_FAR:
			return __is_raid10_far(layout);
		default:
			break;
		}
	}

	return 0;
}

/* Return raid_type for @name */
static struct raid_type *get_raid_type(const char *name)
{
	struct raid_type *rtp = raid_types + ARRAY_SIZE(raid_types);

	while (rtp-- > raid_types)
		if (!strcasecmp(rtp->name, name))
			return rtp;

	return NULL;
}

/* Return raid_type derived from @level and @layout */
static struct raid_type *get_raid_type_by_ll(const int level, const int layout)
{
	struct raid_type *rtp = raid_types + ARRAY_SIZE(raid_types);

	while (rtp-- > raid_types) {
		/* RAID10 special checks based on @layout flags/properties */
		if (rtp->level == level &&
		    (__got_raid10(rtp, layout) || rtp->algorithm == layout))
			return rtp;
	}

	return NULL;
}

/*
 * Conditionally change bdev capacity of @rs
 * in case of a disk add/remove reshape
 */
static void rs_set_capacity(struct raid_set *rs)
{
	struct mddev *mddev = &rs->md;

	if (rs->ti->len != mddev->array_sectors) {
		struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table));

		set_capacity(gendisk, mddev->array_sectors);
		revalidate_disk(gendisk);
	}
}

/*
 * Set the mddev properties in @rs to the current
 * ones retrieved from the freshest superblock
 */
static void rs_set_cur(struct raid_set *rs)
{
	struct mddev *mddev = &rs->md;

	mddev->new_level = mddev->level;
	mddev->new_layout = mddev->layout;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
}

/*
 * Set the mddev properties in @rs to the new
 * ones requested by the ctr
 */
static void rs_set_new(struct raid_set *rs)
{
	struct mddev *mddev = &rs->md;

	mddev->level = mddev->new_level;
	mddev->layout = mddev->new_layout;
	mddev->chunk_sectors = mddev->new_chunk_sectors;
	mddev->raid_disks = rs->raid_disks;
	mddev->delta_disks = 0;
}

static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *raid_type,
				       unsigned raid_devs)
{
	unsigned i;
	struct raid_set *rs;

	if (raid_devs <= raid_type->parity_devs) {
		ti->error = "Insufficient number of devices";
		return ERR_PTR(-EINVAL);
	}

	rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
	if (!rs) {
		ti->error = "Cannot allocate raid context";
		return ERR_PTR(-ENOMEM);
	}

	mddev_init(&rs->md);

	rs->raid_disks = raid_devs;
	rs->delta_disks = 0;

	rs->ti = ti;
	rs->raid_type = raid_type;
	rs->stripe_cache_entries = 256;
	rs->md.raid_disks = raid_devs;
	rs->md.level = raid_type->level;
	rs->md.new_level = rs->md.level;
	rs->md.layout = raid_type->algorithm;
	rs->md.new_layout = rs->md.layout;
	rs->md.delta_disks = 0;
	rs->md.recovery_cp = rs_is_raid0(rs) ? MaxSector : 0;

	for (i = 0; i < raid_devs; i++)
		md_rdev_init(&rs->dev[i].rdev);

	/*
	 * Remaining items to be initialized by further RAID params:
	 *  rs->md.persistent
	 *  rs->md.external
	 *  rs->md.chunk_sectors
	 *  rs->md.new_chunk_sectors
	 *  rs->md.dev_sectors
	 */

	return rs;
}
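
/*
 * Note (informational): rs->dev[] is a trailing zero-length array, so
 * the kzalloc() above sizes one contiguous object as
 * sizeof(struct raid_set) plus raid_devs copies of struct raid_dev.
 */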

static void raid_set_free(struct raid_set *rs)
{
	int i;

	for (i = 0; i < rs->md.raid_disks; i++) {
		if (rs->dev[i].meta_dev)
			dm_put_device(rs->ti, rs->dev[i].meta_dev);
		md_rdev_clear(&rs->dev[i].rdev);
		if (rs->dev[i].data_dev)
			dm_put_device(rs->ti, rs->dev[i].data_dev);
	}

	kfree(rs);
}

/*
 * For every device we have two words
 *  <meta_dev>: meta device name or '-' if missing
 *  <data_dev>: data device name or '-' if missing
 *
 * The following are permitted:
 *    - -
 *    - <data_dev>
 *    <meta_dev> <data_dev>
 *
 * The following is not allowed:
 *    <meta_dev> -
 *
 * This code parses those words.  If there is a failure,
 * the caller must use raid_set_free() to unwind the operations.
 */
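
/*
 * Example (informational, device names illustrative): for a two-device
 * set with external metadata devices, the device-pair portion of the
 * table might read "2 /dev/sda1 /dev/sda2 /dev/sdb1 /dev/sdb2"; with
 * the metadata devices omitted it would read "2 - /dev/sda2 - /dev/sdb2".
 */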
static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
{
	int i;
	int rebuild = 0;
	int metadata_available = 0;
	int r = 0;
	const char *arg;

	/* Step over the number of raid devices argument to get to the dev pairs */
	arg = dm_shift_arg(as);
	if (!arg)
		return -EINVAL;

	for (i = 0; i < rs->md.raid_disks; i++) {
		rs->dev[i].rdev.raid_disk = i;

		rs->dev[i].meta_dev = NULL;
		rs->dev[i].data_dev = NULL;

		/*
		 * There are no offsets, since there is a separate device
		 * for data and metadata.
		 */
		rs->dev[i].rdev.data_offset = 0;
		rs->dev[i].rdev.mddev = &rs->md;

		arg = dm_shift_arg(as);
		if (!arg)
			return -EINVAL;

		if (strcmp(arg, "-")) {
			r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table),
					  &rs->dev[i].meta_dev);
			if (r) {
				rs->ti->error = "RAID metadata device lookup failure";
				return r;
			}

			rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
			if (!rs->dev[i].rdev.sb_page) {
				rs->ti->error = "Failed to allocate superblock page";
				return -ENOMEM;
			}
		}

		arg = dm_shift_arg(as);
		if (!arg)
			return -EINVAL;

		if (!strcmp(arg, "-")) {
			if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
			    (!rs->dev[i].rdev.recovery_offset)) {
				rs->ti->error = "Drive designated for rebuild not specified";
				return -EINVAL;
			}

			if (rs->dev[i].meta_dev) {
				rs->ti->error = "No data device supplied with metadata device";
				return -EINVAL;
			}

			continue;
		}

		r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table),
				  &rs->dev[i].data_dev);
		if (r) {
			rs->ti->error = "RAID device lookup failure";
			return r;
		}

		if (rs->dev[i].meta_dev) {
			metadata_available = 1;
			rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev;
		}
		rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
		list_add_tail(&rs->dev[i].rdev.same_set, &rs->md.disks);
		if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
			rebuild++;
	}

	if (metadata_available) {
		rs->md.external = 0;
		rs->md.persistent = 1;
		rs->md.major_version = 2;
	} else if (rebuild && !rs->md.recovery_cp) {
		/*
		 * Without metadata, we will not be able to tell if the array
		 * is in-sync or not - we must assume it is not.  Therefore,
		 * it is impossible to rebuild a drive.
		 *
		 * Even if there is metadata, the on-disk information may
		 * indicate that the array is not in-sync and it will then
		 * fail at that time.
		 *
		 * User could specify 'nosync' option if desperate.
		 */
		rs->ti->error = "Unable to rebuild drive while array is not in-sync";
		return -EINVAL;
	}

	return 0;
}

/*
 * validate_region_size
 * @rs
 * @region_size:  region size in sectors.  If 0, pick a size (4MiB default).
 *
 * Set rs->md.bitmap_info.chunksize (which really refers to 'region size').
 * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap.
 *
 * Returns: 0 on success, -EINVAL on failure.
 */
static int validate_region_size(struct raid_set *rs, unsigned long region_size)
{
	unsigned long min_region_size = rs->ti->len / (1 << 21);

	if (!region_size) {
		/*
		 * Choose a reasonable default.  All figures in sectors.
		 */
		if (min_region_size > (1 << 13)) {
			/* If not a power of 2, make it the next power of 2 */
			region_size = roundup_pow_of_two(min_region_size);
			DMINFO("Choosing default region size of %lu sectors",
			       region_size);
		} else {
			DMINFO("Choosing default region size of 4MiB");
			region_size = 1 << 13; /* sectors */
		}
	} else {
		/*
		 * Validate user-supplied value.
		 */
		if (region_size > rs->ti->len) {
			rs->ti->error = "Supplied region size is too large";
			return -EINVAL;
		}

		if (region_size < min_region_size) {
			DMERR("Supplied region_size (%lu sectors) below minimum (%lu)",
			      region_size, min_region_size);
			rs->ti->error = "Supplied region size is too small";
			return -EINVAL;
		}

		if (!is_power_of_2(region_size)) {
			rs->ti->error = "Region size is not a power of 2";
			return -EINVAL;
		}

		if (region_size < rs->md.chunk_sectors) {
			rs->ti->error = "Region size is smaller than the chunk size";
			return -EINVAL;
		}
	}

	/*
	 * Convert sectors to bytes.
	 */
	rs->md.bitmap_info.chunksize = (region_size << 9);

	return 0;
}
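
/*
 * Worked example (informational): for a 1 TiB array (2^31 sectors),
 * min_region_size = 2^31 / 2^21 = 1024 sectors, so the 4 MiB (2^13
 * sector) default applies, giving 2^31 / 2^13 = 2^18 regions -- well
 * under the 2^21 region limit imposed by the MD bitmap.
 */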

/*
 * validate_raid_redundancy
 * @rs
 *
 * Determine if there are enough devices in the array that haven't
 * failed (or are being rebuilt) to form a usable array.
 *
 * Returns: 0 on success, -EINVAL on failure.
 */
static int validate_raid_redundancy(struct raid_set *rs)
{
	unsigned i, rebuild_cnt = 0;
	unsigned rebuilds_per_group = 0, copies;
	unsigned group_size, last_group_start;

	for (i = 0; i < rs->md.raid_disks; i++)
		if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
		    !rs->dev[i].rdev.sb_page)
			rebuild_cnt++;

	switch (rs->raid_type->level) {
	case 1:
		if (rebuild_cnt >= rs->md.raid_disks)
			goto too_many;
		break;
	case 4:
	case 5:
	case 6:
		if (rebuild_cnt > rs->raid_type->parity_devs)
			goto too_many;
		break;
	case 10:
		copies = raid10_md_layout_to_copies(rs->md.new_layout);
		if (rebuild_cnt < copies)
			break;

		/*
		 * It is possible to have a higher rebuild count for RAID10,
		 * as long as the failed devices occur in different mirror
		 * groups (i.e. different stripes).
		 *
		 * When checking "near" format, make sure no adjacent devices
		 * have failed beyond what can be handled.  In addition to the
		 * simple case where the number of devices is a multiple of the
		 * number of copies, we must also handle cases where the number
		 * of devices is not a multiple of the number of copies.
		 * E.g.	   dev1 dev2 dev3 dev4 dev5
		 *	    A	 A    B	   B	C
		 *	    C	 D    D	   E	E
		 */
		if (__is_raid10_near(rs->md.new_layout)) {
			for (i = 0; i < rs->raid_disks; i++) {
				if (!(i % copies))
					rebuilds_per_group = 0;
				if ((!rs->dev[i].rdev.sb_page ||
				    !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
				    (++rebuilds_per_group >= copies))
					goto too_many;
			}
			break;
		}

		/*
		 * When checking "far" and "offset" formats, we need to ensure
		 * that the device that holds its copy is not also dead or
		 * being rebuilt.  (Note that "far" and "offset" formats only
		 * support two copies right now.  These formats also only ever
		 * use the 'use_far_sets' variant.)
		 *
		 * This check is somewhat complicated by the need to account
		 * for arrays that are not a multiple of (far) copies.	This
		 * results in the need to treat the last (potentially larger)
		 * set differently.
		 */
		group_size = (rs->md.raid_disks / copies);
		last_group_start = (rs->md.raid_disks / group_size) - 1;
		last_group_start *= group_size;
		for (i = 0; i < rs->md.raid_disks; i++) {
			if (!(i % copies) && !(i > last_group_start))
				rebuilds_per_group = 0;
			if ((!rs->dev[i].rdev.sb_page ||
			     !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
			    (++rebuilds_per_group >= copies))
				goto too_many;
		}
		break;
	default:
		if (rebuild_cnt)
			return -EINVAL;
	}

	return 0;

too_many:
	return -EINVAL;
}
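
/*
 * Worked example (informational): raid10 "near" with copies = 2 across
 * 5 devices resets the per-group counter at indexes 0, 2 and 4, i.e.
 * groups (0,1), (2,3), (4,...).  Losing dev1 and dev2 is tolerated
 * (one failure per group), while losing dev2 and dev3 trips the
 * ++rebuilds_per_group >= copies test and fails validation.
 */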

/*
 * Possible arguments are...
 *	<chunk_size> [optional_args]
 *
 * Argument definitions
 *    <chunk_size>			The number of sectors per disk that
 *					will form the "stripe"
 *    [[no]sync]			Force or prevent recovery of the
 *					entire array
 *    [rebuild <idx>]			Rebuild the drive indicated by the index
 *    [daemon_sleep <ms>]		Time between bitmap daemon work to
 *					clear bits
 *    [min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
 *    [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
 *    [write_mostly <idx>]		Indicate a write mostly drive via index
 *    [max_write_behind <sectors>]	See '-write-behind=' (man mdadm)
 *    [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
 *    [region_size <sectors>]		Defines granularity of bitmap
 *
 * RAID10-only options:
 *    [raid10_copies <# copies>]	Number of copies.  (Default: 2)
 *    [raid10_format <near|far|offset>] Layout algorithm.  (Default: near)
 */
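
/*
 * Example (informational, numbers illustrative): a minimal raid5 table
 * line using the arguments above might look like
 *
 *	0 1960893648 raid raid5_ls 1 64 3 - 8:17 - 8:33 - 8:49
 *
 * i.e. one raid parameter (chunk_size of 64 sectors), followed by the
 * three device pairs handed to parse_dev_params().
 */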
static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
			     unsigned num_raid_params)
{
	int value, raid10_format = ALGORITHM_RAID10_DEFAULT;
	unsigned raid10_copies = 2;
	unsigned i, write_mostly = 0;
	unsigned region_size = 0;
	sector_t max_io_len;
	const char *arg, *key;
	struct raid_dev *rd;
	struct raid_type *rt = rs->raid_type;

	arg = dm_shift_arg(as);
	num_raid_params--; /* Account for chunk_size argument */

	if (kstrtoint(arg, 10, &value) < 0) {
		rs->ti->error = "Bad numerical argument given for chunk_size";
		return -EINVAL;
	}

	/*
	 * First, parse the in-order required arguments
	 * "chunk_size" is the only argument of this type.
	 */
	if (rt_is_raid1(rt)) {
		if (value)
			DMERR("Ignoring chunk size parameter for RAID 1");
		value = 0;
	} else if (!is_power_of_2(value)) {
		rs->ti->error = "Chunk size must be a power of 2";
		return -EINVAL;
	} else if (value < 8) {
		rs->ti->error = "Chunk size value is too small";
		return -EINVAL;
	}

	rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;

	/*
	 * We set each individual device as In_sync with a completed
	 * 'recovery_offset'.  If there has been a device failure or
	 * replacement then one of the following cases applies:
	 *
	 *   1) User specifies 'rebuild'.
	 *	- Device is reset when param is read.
	 *   2) A new device is supplied.
	 *	- No matching superblock found, resets device.
	 *   3) Device failure was transient and returns on reload.
	 *	- Failure noticed, resets device for bitmap replay.
	 *   4) Device hadn't completed recovery after previous failure.
	 *	- Superblock is read and overrides recovery_offset.
	 *
	 * What is found in the superblocks of the devices is always
	 * authoritative, unless 'rebuild' or '[no]sync' was specified.
	 */
	for (i = 0; i < rs->md.raid_disks; i++) {
		set_bit(In_sync, &rs->dev[i].rdev.flags);
		rs->dev[i].rdev.recovery_offset = MaxSector;
	}

	/*
	 * Second, parse the unordered optional arguments
	 */
	for (i = 0; i < num_raid_params; i++) {
		key = dm_shift_arg(as);
		if (!key) {
			rs->ti->error = "Not enough raid parameters given";
			return -EINVAL;
		}

		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC))) {
			if (test_and_set_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) {
				rs->ti->error = "Only one 'nosync' argument allowed";
				return -EINVAL;
			}
			rs->md.recovery_cp = MaxSector;
			continue;
		}
		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_SYNC))) {
			if (test_and_set_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) {
				rs->ti->error = "Only one 'sync' argument allowed";
				return -EINVAL;
			}
			rs->md.recovery_cp = 0;
			continue;
		}
		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_USE_NEAR_SETS))) {
			if (test_and_set_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags)) {
				rs->ti->error = "Only one 'raid10_use_near_sets' argument allowed";
				return -EINVAL;
			}
			continue;
		}

		arg = dm_shift_arg(as);
		i++; /* Account for the argument pairs */
		if (!arg) {
			rs->ti->error = "Wrong number of raid parameters given";
			return -EINVAL;
		}

		/*
		 * Parameters that take a string value are checked here.
		 */

		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT))) {
			if (test_and_set_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) {
				rs->ti->error = "Only one 'raid10_format' argument pair allowed";
				return -EINVAL;
			}
			if (!rt_is_raid10(rt)) {
				rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
				return -EINVAL;
			}
			raid10_format = raid10_name_to_format(arg);
			if (raid10_format < 0) {
				rs->ti->error = "Invalid 'raid10_format' value given";
				return raid10_format;
			}
			continue;
		}

		if (kstrtoint(arg, 10, &value) < 0) {
			rs->ti->error = "Bad numerical argument given in raid params";
			return -EINVAL;
		}

		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD))) {
			/*
			 * "rebuild" is being passed in by userspace to provide
			 * indexes of replaced devices and to set up additional
			 * devices on raid level takeover.
			 */
			if (!__within_range(value, 0, rs->raid_disks - 1)) {
				rs->ti->error = "Invalid rebuild index given";
				return -EINVAL;
			}

			if (test_and_set_bit(value, (void *) rs->rebuild_disks)) {
				rs->ti->error = "rebuild for this index already given";
				return -EINVAL;
			}

			rd = rs->dev + value;
			clear_bit(In_sync, &rd->rdev.flags);
			clear_bit(Faulty, &rd->rdev.flags);
			rd->rdev.recovery_offset = 0;
			set_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags);
		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY))) {
			if (!rt_is_raid1(rt)) {
				rs->ti->error = "write_mostly option is only valid for RAID1";
				return -EINVAL;
			}

			if (!__within_range(value, 0, rs->md.raid_disks - 1)) {
				rs->ti->error = "Invalid write_mostly index given";
				return -EINVAL;
			}

			write_mostly++;
			set_bit(WriteMostly, &rs->dev[value].rdev.flags);
			set_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags);
		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND))) {
			if (!rt_is_raid1(rt)) {
				rs->ti->error = "max_write_behind option is only valid for RAID1";
				return -EINVAL;
			}

			if (test_and_set_bit(__CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags)) {
				rs->ti->error = "Only one max_write_behind argument pair allowed";
				return -EINVAL;
			}

			/*
			 * In device-mapper, we specify things in sectors, but
			 * MD records this value in kB
			 */
			value /= 2;
			if (value > COUNTER_MAX) {
				rs->ti->error = "Max write-behind limit out of range";
				return -EINVAL;
			}

			rs->md.bitmap_info.max_write_behind = value;