block_dev.c 44.3 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1
2
3
4
5
6
7
8
9
10
11
12
13
/*
 *  linux/fs/block_dev.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
 */

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
14
#include <linux/device_cgroup.h>
Linus Torvalds's avatar
Linus Torvalds committed
15
16
17
18
#include <linux/highmem.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
19
#include <linux/magic.h>
Linus Torvalds's avatar
Linus Torvalds committed
20
#include <linux/buffer_head.h>
Al Viro's avatar
Al Viro committed
21
#include <linux/swap.h>
22
#include <linux/pagevec.h>
23
#include <linux/writeback.h>
Linus Torvalds's avatar
Linus Torvalds committed
24
25
26
27
#include <linux/mpage.h>
#include <linux/mount.h>
#include <linux/uio.h>
#include <linux/namei.h>
28
#include <linux/log2.h>
Al Viro's avatar
Al Viro committed
29
#include <linux/cleancache.h>
Linus Torvalds's avatar
Linus Torvalds committed
30
#include <asm/uaccess.h>
31
#include "internal.h"
Linus Torvalds's avatar
Linus Torvalds committed
32
33
34
35
36
37

/*
 * A block device and the inode that represents it, laid out as one object.
 * BDEV_I() maps the embedded inode back to this container with
 * container_of(); I_BDEV() then yields the embedded block_device.
 */
struct bdev_inode {
	struct block_device bdev;	/* the block device itself */
	struct inode vfs_inode;		/* inode embedded alongside it */
};

Adrian Bunk's avatar
Adrian Bunk committed
38
39
/* Forward declaration; bdget() installs this as a_ops on new bdev inodes. */
static const struct address_space_operations def_blk_aops;

Linus Torvalds's avatar
Linus Torvalds committed
40
41
42
43
44
45
46
47
48
49
50
/* Map an inode embedded in a struct bdev_inode back to its container. */
static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
	struct bdev_inode *container;

	container = container_of(inode, struct bdev_inode, vfs_inode);
	return container;
}

inline struct block_device *I_BDEV(struct inode *inode)
{
	return &BDEV_I(inode)->bdev;
}
EXPORT_SYMBOL(I_BDEV);

51
/*
 * Re-home @inode from the bdi it currently points at onto @dst.  A dirty
 * inode is also moved to the b_dirty list of @dst so that it always sits
 * on the list of the bdi it belongs to.
 */
static void bdev_inode_switch_bdi(struct inode *inode,
			struct backing_dev_info *dst)
{
	struct backing_dev_info *src = inode->i_data.backing_dev_info;

	/* deadlock avoidance: never lock the same bdi against itself */
	if (unlikely(src == dst))
		return;

	bdi_lock_two(&src->wb, &dst->wb);
	spin_lock(&inode->i_lock);

	inode->i_data.backing_dev_info = dst;
	if (inode->i_state & I_DIRTY)
		list_move(&inode->i_wb_list, &dst->wb.b_dirty);

	spin_unlock(&inode->i_lock);
	spin_unlock(&src->wb.list_lock);
	spin_unlock(&dst->wb.list_lock);
}

73
sector_t blkdev_max_block(struct block_device *bdev)
Linus Torvalds's avatar
Linus Torvalds committed
74
75
76
77
78
79
80
81
82
83
84
85
{
	sector_t retval = ~((sector_t)0);
	loff_t sz = i_size_read(bdev->bd_inode);

	if (sz) {
		unsigned int size = block_size(bdev);
		unsigned int sizebits = blksize_bits(size);
		retval = (sz >> sizebits);
	}
	return retval;
}

Peter Zijlstra's avatar
Peter Zijlstra committed
86
/* Kill _all_ buffers and pagecache , dirty or not.. */
Al Viro's avatar
Al Viro committed
87
void kill_bdev(struct block_device *bdev)
Linus Torvalds's avatar
Linus Torvalds committed
88
{
Al Viro's avatar
Al Viro committed
89
90
91
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping->nrpages == 0)
Peter Zijlstra's avatar
Peter Zijlstra committed
92
		return;
Al Viro's avatar
Al Viro committed
93

Peter Zijlstra's avatar
Peter Zijlstra committed
94
	invalidate_bh_lrus();
Al Viro's avatar
Al Viro committed
95
	truncate_inode_pages(mapping, 0);
Linus Torvalds's avatar
Linus Torvalds committed
96
}	
Al Viro's avatar
Al Viro committed
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
EXPORT_SYMBOL(kill_bdev);

/* Invalidate the clean, unused buffers and pagecache of @bdev. */
void invalidate_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping->nrpages != 0) {
		invalidate_bh_lrus();
		/* make sure all lru add caches are flushed */
		lru_add_drain_all();
		invalidate_mapping_pages(mapping, 0, -1);
	}

	/*
	 * 99% of the time, we don't need to flush the cleancache on the
	 * bdev.  But, for the strange corners, let's be cautious.
	 * Note: only reached when the mapping had pages (see below).
	 */
	if (mapping->nrpages == 0)
		return;
	cleancache_invalidate_inode(mapping);
}
EXPORT_SYMBOL(invalidate_bdev);
Linus Torvalds's avatar
Linus Torvalds committed
116
117
118

int set_blocksize(struct block_device *bdev, int size)
{
119
120
	struct address_space *mapping;

Linus Torvalds's avatar
Linus Torvalds committed
121
	/* Size must be a power of two, and between 512 and PAGE_SIZE */
122
	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
Linus Torvalds's avatar
Linus Torvalds committed
123
124
125
		return -EINVAL;

	/* Size cannot be smaller than the size supported by the device */
126
	if (size < bdev_logical_block_size(bdev))
Linus Torvalds's avatar
Linus Torvalds committed
127
128
		return -EINVAL;

129
	/* Prevent starting I/O or mapping the device */
130
	percpu_down_write(&bdev->bd_block_size_semaphore);
131
132
133
134
135
136
137

	/* Check that the block device is not memory mapped */
	mapping = bdev->bd_inode->i_mapping;
	mutex_lock(&mapping->i_mmap_mutex);
	if (!prio_tree_empty(&mapping->i_mmap) ||
	    !list_empty(&mapping->i_mmap_nonlinear)) {
		mutex_unlock(&mapping->i_mmap_mutex);
138
		percpu_up_write(&bdev->bd_block_size_semaphore);
139
140
141
142
		return -EBUSY;
	}
	mutex_unlock(&mapping->i_mmap_mutex);

Linus Torvalds's avatar
Linus Torvalds committed
143
144
145
146
147
148
149
	/* Don't change the size if it is same as current */
	if (bdev->bd_block_size != size) {
		sync_blockdev(bdev);
		bdev->bd_block_size = size;
		bdev->bd_inode->i_blkbits = blksize_bits(size);
		kill_bdev(bdev);
	}
150

151
	percpu_up_write(&bdev->bd_block_size_semaphore);
152

Linus Torvalds's avatar
Linus Torvalds committed
153
154
155
156
157
158
159
160
161
162
163
164
	return 0;
}

EXPORT_SYMBOL(set_blocksize);

/*
 * Set the block size of the device behind @sb and record it in the
 * superblock.  Returns the new block size, or 0 if set_blocksize() failed.
 */
int sb_set_blocksize(struct super_block *sb, int size)
{
	if (set_blocksize(sb->s_bdev, size) != 0)
		return 0;

	/*
	 * set_blocksize() accepted the value, so size is a power of two
	 * between 512 and PAGE_SIZE.
	 */
	sb->s_blocksize_bits = blksize_bits(size);
	sb->s_blocksize = size;

	return sb->s_blocksize;
}

EXPORT_SYMBOL(sb_set_blocksize);

int sb_min_blocksize(struct super_block *sb, int size)
{
173
	int minsize = bdev_logical_block_size(sb->s_bdev);
Linus Torvalds's avatar
Linus Torvalds committed
174
175
176
177
178
179
180
181
182
183
184
	if (size < minsize)
		size = minsize;
	return sb_set_blocksize(sb, size);
}

EXPORT_SYMBOL(sb_min_blocksize);

/*
 * Map block @iblock of the block device inode 1:1 onto @bh.
 *
 * Returns 0 with @bh mapped when @iblock lies inside the device.  Past the
 * end of the device it returns -EIO for @create and 0 with @bh left
 * unmapped otherwise.
 */
static int
blkdev_get_block(struct inode *inode, sector_t iblock,
		struct buffer_head *bh, int create)
{
	struct block_device *bdev = I_BDEV(inode);

	if (iblock < blkdev_max_block(bdev)) {
		bh->b_bdev = bdev;
		bh->b_blocknr = iblock;
		set_buffer_mapped(bh);
		return 0;
	}

	/*
	 * Beyond the end of the device.  For reads, we're just trying to
	 * fill a partial page: return a hole, they will have to call
	 * get_block again before they can fill it, and they will get -EIO
	 * at that time.
	 */
	return create ? -EIO : 0;
}

203
204
205
206
/*
 * Multi-block variant of blkdev_get_block(): map up to bh->b_size bytes
 * starting at @iblock, trimming the request at the end of the device and
 * storing the mapped length back into bh->b_size.
 */
static int
blkdev_get_blocks(struct inode *inode, sector_t iblock,
		struct buffer_head *bh, int create)
{
	struct block_device *bdev = I_BDEV(inode);
	sector_t end_block = blkdev_max_block(bdev);
	unsigned long nr_blocks = bh->b_size >> inode->i_blkbits;

	if ((iblock + nr_blocks) > end_block) {
		/* Request crosses the end of the device: trim it. */
		nr_blocks = end_block - iblock;
		if ((long)nr_blocks <= 0) {
			/* write fully beyond EOF */
			if (create)
				return -EIO;
			/*
			 * It is a read which is fully beyond EOF.  We return
			 * a !buffer_mapped buffer
			 */
			nr_blocks = 0;
		}
	}

	bh->b_bdev = bdev;
	bh->b_blocknr = iblock;
	bh->b_size = nr_blocks << inode->i_blkbits;
	if (nr_blocks)
		set_buffer_mapped(bh);

	return 0;
}

/*
 * Direct I/O on a block device: hand the request to __blockdev_direct_IO()
 * with blkdev_get_blocks() as the block mapper.
 */
static ssize_t
blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
			loff_t offset, unsigned long nr_segs)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct block_device *bdev = I_BDEV(inode);

	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
				    nr_segs, blkdev_get_blocks, NULL, NULL, 0);
}

242
243
244
245
246
247
248
249
250
/*
 * Write back the page cache of @bdev.  With @wait set the data is written
 * and waited upon; otherwise writeback is only started.  A NULL @bdev is
 * accepted and reported as success.
 */
int __sync_blockdev(struct block_device *bdev, int wait)
{
	struct address_space *mapping;

	if (!bdev)
		return 0;

	mapping = bdev->bd_inode->i_mapping;
	if (wait)
		return filemap_write_and_wait(mapping);
	return filemap_flush(mapping);
}

251
252
253
254
255
256
/*
 * Write out and wait upon all the dirty data associated with a block
 * device via its mapping.  Does not take the superblock lock.
 */
int sync_blockdev(struct block_device *bdev)
{
	const int wait = 1;

	return __sync_blockdev(bdev, wait);
}
EXPORT_SYMBOL(sync_blockdev);

/*
 * Write out and wait upon all dirty data associated with this
 * device: filesystem data as well as the underlying block device.
 * Takes the superblock lock.
 */
int fsync_bdev(struct block_device *bdev)
{
	struct super_block *sb = get_super(bdev);
	int res;

	/* No filesystem on the device: sync the raw block device only. */
	if (!sb)
		return sync_blockdev(bdev);

	res = sync_filesystem(sb);
	drop_super(sb);
	return res;
}
EXPORT_SYMBOL(fsync_bdev);
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295

/**
 * freeze_bdev  --  lock a filesystem and force it into a consistent state
 * @bdev:	blockdevice to lock
 *
 * If a superblock is found on this device, we take the s_umount semaphore
 * on it to make sure nobody unmounts until the snapshot creation is done.
 * The reference counter (bd_fsfreeze_count) guarantees that only the last
 * unfreeze process can unfreeze the frozen filesystem actually when multiple
 * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
 * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
 * actually.
 */
struct super_block *freeze_bdev(struct block_device *bdev)
{
	struct super_block *sb;
	int error = 0;

	mutex_lock(&bdev->bd_fsfreeze_mutex);
296
297
298
299
300
301
	if (++bdev->bd_fsfreeze_count > 1) {
		/*
		 * We don't even need to grab a reference - the first call
		 * to freeze_bdev grab an active reference and only the last
		 * thaw_bdev drops it.
		 */
302
		sb = get_super(bdev);
303
304
305
306
307
308
309
310
		drop_super(sb);
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return sb;
	}

	sb = get_active_super(bdev);
	if (!sb)
		goto out;
311
312
313
314
	error = freeze_super(sb);
	if (error) {
		deactivate_super(sb);
		bdev->bd_fsfreeze_count--;
315
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
316
		return ERR_PTR(error);
317
	}
318
	deactivate_super(sb);
319
 out:
320
321
	sync_blockdev(bdev);
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
322
	return sb;	/* thaw_bdev releases s->s_umount */
323
324
325
326
327
328
329
330
331
332
333
334
}
EXPORT_SYMBOL(freeze_bdev);

/**
 * thaw_bdev  -- unlock filesystem
 * @bdev:	blockdevice to unlock
 * @sb:		associated superblock
 *
 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
 */
int thaw_bdev(struct block_device *bdev, struct super_block *sb)
{
335
	int error = -EINVAL;
336
337

	mutex_lock(&bdev->bd_fsfreeze_mutex);
338
	if (!bdev->bd_fsfreeze_count)
339
		goto out;
340
341
342

	error = 0;
	if (--bdev->bd_fsfreeze_count > 0)
343
		goto out;
344
345

	if (!sb)
346
		goto out;
347

348
349
350
351
352
353
354
	error = thaw_super(sb);
	if (error) {
		bdev->bd_fsfreeze_count++;
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return error;
	}
out:
355
356
357
358
359
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return 0;
}
EXPORT_SYMBOL(thaw_bdev);

Linus Torvalds's avatar
Linus Torvalds committed
360
361
362
363
364
365
366
367
368
369
/*
 * Write @page out through block_write_full_page(), using
 * blkdev_get_block() to map its blocks.
 */
static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, blkdev_get_block, wbc);
}

/*
 * Read @page in through block_read_full_page(), using blkdev_get_block()
 * to map its blocks.  @file is not used.
 */
static int blkdev_readpage(struct file * file, struct page * page)
{
	return block_read_full_page(page, blkdev_get_block);
}

Nick Piggin's avatar
Nick Piggin committed
370
371
372
/*
 * Start a buffered write on a block device: defer to block_write_begin()
 * with blkdev_get_block() as the block mapper.  @file and @fsdata are not
 * used.
 */
static int blkdev_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	int ret;

	ret = block_write_begin(mapping, pos, len, flags, pagep,
				blkdev_get_block);
	return ret;
}

Nick Piggin's avatar
Nick Piggin committed
378
379
380
/*
 * Finish a buffered write on a block device: complete it with
 * block_write_end(), then unlock @page and drop the reference on it.
 * Returns whatever block_write_end() returned.
 */
static int blkdev_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	int ret = block_write_end(file, mapping, pos, len, copied, page,
				  fsdata);

	unlock_page(page);
	page_cache_release(page);

	return ret;
}

/*
 * private llseek:
 * for a block special file file->f_path.dentry->d_inode->i_size is zero,
 * so the size is taken from the inode behind file->f_mapping instead.
 *
 * Returns the new position, or -EINVAL for an unknown @origin or a
 * resulting offset outside [0, size].
 */
static loff_t block_llseek(struct file *file, loff_t offset, int origin)
{
	struct inode *bd_inode = file->f_mapping->host;
	loff_t retval = -EINVAL;
	loff_t size;

	mutex_lock(&bd_inode->i_mutex);
	size = i_size_read(bd_inode);

	/* Turn @offset into an absolute position; SEEK_SET already is one. */
	if (origin == SEEK_END)
		offset += size;
	else if (origin == SEEK_CUR)
		offset += file->f_pos;
	else if (origin != SEEK_SET)
		goto out;

	if (offset >= 0 && offset <= size) {
		if (offset != file->f_pos)
			file->f_pos = offset;
		retval = offset;
	}
out:
	mutex_unlock(&bd_inode->i_mutex);
	return retval;
}
	
428
/*
 * fsync for a block device file: write back and wait on the byte range
 * [@start, @end] of the mapping, then issue a flush to the device.
 * A device that answers the flush with -EOPNOTSUPP is treated as success.
 * @datasync is not used.
 */
int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
	struct block_device *bdev = I_BDEV(filp->f_mapping->host);
	int error;

	error = filemap_write_and_wait_range(filp->f_mapping, start, end);
	if (error)
		return error;

	/*
	 * There is no need to serialise calls to blkdev_issue_flush with
	 * i_mutex and doing so causes performance issues with concurrent
	 * O_SYNC writers to a block device.
	 */
	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);

	return (error == -EOPNOTSUPP) ? 0 : error;
}
EXPORT_SYMBOL(blkdev_fsync);
Linus Torvalds's avatar
Linus Torvalds committed
450
451
452
453
454
455

/*
 * pseudo-fs
 */

/*
 * Taken around updates to all_bdevs, around the inode <-> bdev association
 * (i_bdev, i_devices, bd_inodes) and around the claiming checks below.
 */
static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
456
/* Slab cache for struct bdev_inode; created in bdev_cache_init(). */
static struct kmem_cache * bdev_cachep __read_mostly;
Linus Torvalds's avatar
Linus Torvalds committed
457
458
459

static struct inode *bdev_alloc_inode(struct super_block *sb)
{
460
	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
Linus Torvalds's avatar
Linus Torvalds committed
461
462
	if (!ei)
		return NULL;
463
464
465
466
467
468

	if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) {
		kmem_cache_free(bdev_cachep, ei);
		return NULL;
	}

Linus Torvalds's avatar
Linus Torvalds committed
469
470
471
	return &ei->vfs_inode;
}

Nick Piggin's avatar
Nick Piggin committed
472
static void bdev_i_callback(struct rcu_head *head)
Linus Torvalds's avatar
Linus Torvalds committed
473
{
Nick Piggin's avatar
Nick Piggin committed
474
	struct inode *inode = container_of(head, struct inode, i_rcu);
Linus Torvalds's avatar
Linus Torvalds committed
475
476
	struct bdev_inode *bdi = BDEV_I(inode);

477
478
	percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore);

Linus Torvalds's avatar
Linus Torvalds committed
479
480
481
	kmem_cache_free(bdev_cachep, bdi);
}

Nick Piggin's avatar
Nick Piggin committed
482
483
484
485
486
/* Defer freeing of @inode to bdev_i_callback() via call_rcu(). */
static void bdev_destroy_inode(struct inode *inode)
{
	call_rcu(&inode->i_rcu, bdev_i_callback);
}

487
static void init_once(void *foo)
Linus Torvalds's avatar
Linus Torvalds committed
488
489
490
491
{
	struct bdev_inode *ei = (struct bdev_inode *) foo;
	struct block_device *bdev = &ei->bdev;

492
493
494
495
	memset(bdev, 0, sizeof(*bdev));
	mutex_init(&bdev->bd_mutex);
	INIT_LIST_HEAD(&bdev->bd_inodes);
	INIT_LIST_HEAD(&bdev->bd_list);
496
497
498
#ifdef CONFIG_SYSFS
	INIT_LIST_HEAD(&bdev->bd_holder_disks);
#endif
499
	inode_init_once(&ei->vfs_inode);
500
501
	/* Initialize mutex for freeze. */
	mutex_init(&bdev->bd_fsfreeze_mutex);
Linus Torvalds's avatar
Linus Torvalds committed
502
503
504
505
506
507
508
509
510
}

/*
 * Detach @inode from its block device: unlink it from the i_devices list,
 * clear i_bdev and point i_mapping back at the inode's own i_data.
 * Both callers in this file hold bdev_lock.
 */
static inline void __bd_forget(struct inode *inode)
{
	list_del_init(&inode->i_devices);
	inode->i_bdev = NULL;
	inode->i_mapping = &inode->i_data;
}

511
static void bdev_evict_inode(struct inode *inode)
Linus Torvalds's avatar
Linus Torvalds committed
512
513
514
{
	struct block_device *bdev = &BDEV_I(inode)->bdev;
	struct list_head *p;
515
516
	truncate_inode_pages(&inode->i_data, 0);
	invalidate_inode_buffers(inode); /* is it needed here? */
517
	clear_inode(inode);
Linus Torvalds's avatar
Linus Torvalds committed
518
519
520
521
522
523
524
525
	spin_lock(&bdev_lock);
	while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
		__bd_forget(list_entry(p, struct inode, i_devices));
	}
	list_del_init(&bdev->bd_list);
	spin_unlock(&bdev_lock);
}

526
static const struct super_operations bdev_sops = {
Linus Torvalds's avatar
Linus Torvalds committed
527
528
529
530
	.statfs = simple_statfs,
	.alloc_inode = bdev_alloc_inode,
	.destroy_inode = bdev_destroy_inode,
	.drop_inode = generic_delete_inode,
531
	.evict_inode = bdev_evict_inode,
Linus Torvalds's avatar
Linus Torvalds committed
532
533
};

Al Viro's avatar
Al Viro committed
534
535
static struct dentry *bd_mount(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data)
Linus Torvalds's avatar
Linus Torvalds committed
536
{
537
	return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
Linus Torvalds's avatar
Linus Torvalds committed
538
539
540
541
}

static struct file_system_type bd_type = {
	.name		= "bdev",
Al Viro's avatar
Al Viro committed
542
	.mount		= bd_mount,
Linus Torvalds's avatar
Linus Torvalds committed
543
544
545
	.kill_sb	= kill_anon_super,
};

Al Viro's avatar
Al Viro committed
546
/* Superblock of the internal bdev mount; set once in bdev_cache_init(). */
static struct super_block *blockdev_superblock __read_mostly;
Linus Torvalds's avatar
Linus Torvalds committed
547
548
549
550

/*
 * Boot-time setup of the block device layer's inode machinery: create the
 * bdev_inode slab cache, register and mount the "bdev" pseudo-filesystem
 * and remember its superblock.  Failure to register or mount panics.
 */
void __init bdev_cache_init(void)
{
	static struct vfsmount *bd_mnt;

	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
				SLAB_MEM_SPREAD|SLAB_PANIC),
			init_once);

	if (register_filesystem(&bd_type))
		panic("Cannot register bdev pseudo-fs");

	bd_mnt = kern_mount(&bd_type);
	if (IS_ERR(bd_mnt))
		panic("Cannot create bdev pseudo-fs");

	blockdev_superblock = bd_mnt->mnt_sb;   /* For writeback */
}

/*
 * Hash value handed to iget5_locked() by bdget(): major plus minor number.
 * Most likely a _very_ bad hash - but that is hardly critical for a small
 * /dev and can be fixed when somebody needs a really large one.
 * Keep in mind that the result is fed through the icache hash function too.
 */
static inline unsigned long hash(dev_t dev)
{
	return MAJOR(dev)+MINOR(dev);
}

static int bdev_test(struct inode *inode, void *data)
{
	return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
}

static int bdev_set(struct inode *inode, void *data)
{
	BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
	return 0;
}

/* Every block_device set up by bdget(), linked via bd_list under bdev_lock. */
static LIST_HEAD(all_bdevs);

/*
 * Look up the block_device for @dev, creating and initialising it on
 * first use.  Returns NULL if iget5_locked() could not provide an inode.
 */
struct block_device *bdget(dev_t dev)
{
	struct inode *inode;
	struct block_device *bdev;

	inode = iget5_locked(blockdev_superblock, hash(dev),
			bdev_test, bdev_set, &dev);
	if (!inode)
		return NULL;

	bdev = &BDEV_I(inode)->bdev;

	/* Already set up by an earlier bdget(). */
	if (!(inode->i_state & I_NEW))
		return bdev;

	bdev->bd_contains = NULL;
	bdev->bd_super = NULL;
	bdev->bd_inode = inode;
	bdev->bd_block_size = (1 << inode->i_blkbits);
	bdev->bd_part_count = 0;
	bdev->bd_invalidated = 0;
	inode->i_mode = S_IFBLK;
	inode->i_rdev = dev;
	inode->i_bdev = bdev;
	inode->i_data.a_ops = &def_blk_aops;
	mapping_set_gfp_mask(&inode->i_data, GFP_USER);
	inode->i_data.backing_dev_info = &default_backing_dev_info;

	spin_lock(&bdev_lock);
	list_add(&bdev->bd_list, &all_bdevs);
	spin_unlock(&bdev_lock);

	unlock_new_inode(inode);
	return bdev;
}

EXPORT_SYMBOL(bdget);

625
626
627
628
629
630
/**
 * bdgrab -- Grab a reference to an already referenced block device
 * @bdev:	Block device to grab a reference to.
 */
struct block_device *bdgrab(struct block_device *bdev)
{
Al Viro's avatar
Al Viro committed
631
	ihold(bdev->bd_inode);
632
633
634
	return bdev;
}

Linus Torvalds's avatar
Linus Torvalds committed
635
636
long nr_blockdev_pages(void)
{
637
	struct block_device *bdev;
Linus Torvalds's avatar
Linus Torvalds committed
638
639
	long ret = 0;
	spin_lock(&bdev_lock);
640
	list_for_each_entry(bdev, &all_bdevs, bd_list) {
Linus Torvalds's avatar
Linus Torvalds committed
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
		ret += bdev->bd_inode->i_mapping->nrpages;
	}
	spin_unlock(&bdev_lock);
	return ret;
}

/* Drop a reference to @bdev by releasing its inode with iput(). */
void bdput(struct block_device *bdev)
{
	iput(bdev->bd_inode);
}

EXPORT_SYMBOL(bdput);
 
static struct block_device *bd_acquire(struct inode *inode)
{
	struct block_device *bdev;
657

Linus Torvalds's avatar
Linus Torvalds committed
658
659
	spin_lock(&bdev_lock);
	bdev = inode->i_bdev;
660
	if (bdev) {
Al Viro's avatar
Al Viro committed
661
		ihold(bdev->bd_inode);
Linus Torvalds's avatar
Linus Torvalds committed
662
663
664
665
		spin_unlock(&bdev_lock);
		return bdev;
	}
	spin_unlock(&bdev_lock);
666

Linus Torvalds's avatar
Linus Torvalds committed
667
668
669
	bdev = bdget(inode->i_rdev);
	if (bdev) {
		spin_lock(&bdev_lock);
670
671
		if (!inode->i_bdev) {
			/*
Al Viro's avatar
Al Viro committed
672
			 * We take an additional reference to bd_inode,
673
674
675
676
			 * and it's released in clear_inode() of inode.
			 * So, we can access it via ->i_mapping always
			 * without igrab().
			 */
Al Viro's avatar
Al Viro committed
677
			ihold(bdev->bd_inode);
678
679
680
681
			inode->i_bdev = bdev;
			inode->i_mapping = bdev->bd_inode->i_mapping;
			list_add(&inode->i_devices, &bdev->bd_inodes);
		}
Linus Torvalds's avatar
Linus Torvalds committed
682
683
684
685
686
		spin_unlock(&bdev_lock);
	}
	return bdev;
}

Al Viro's avatar
Al Viro committed
687
688
689
690
691
/* Non-zero if @sb is the superblock of the internal bdev mount. */
static inline int sb_is_blkdev_sb(struct super_block *sb)
{
	return sb == blockdev_superblock;
}

Linus Torvalds's avatar
Linus Torvalds committed
692
693
694
695
/* Call when you free inode */

void bd_forget(struct inode *inode)
{
696
697
	struct block_device *bdev = NULL;

Linus Torvalds's avatar
Linus Torvalds committed
698
	spin_lock(&bdev_lock);
699
	if (inode->i_bdev) {
700
		if (!sb_is_blkdev_sb(inode->i_sb))
701
			bdev = inode->i_bdev;
Linus Torvalds's avatar
Linus Torvalds committed
702
		__bd_forget(inode);
703
	}
Linus Torvalds's avatar
Linus Torvalds committed
704
	spin_unlock(&bdev_lock);
705
706
707

	if (bdev)
		iput(bdev->bd_inode);
Linus Torvalds's avatar
Linus Torvalds committed
708
709
}

Tejun Heo's avatar
Tejun Heo committed
710
711
712
713
714
715
/**
 * bd_may_claim - test whether a block device can be claimed
 * @bdev: block device of interest
 * @whole: whole block device containing @bdev, may equal @bdev
 * @holder: holder trying to claim @bdev
 *
Lucas De Marchi's avatar
Lucas De Marchi committed
716
 * Test whether @bdev can be claimed by @holder.
Tejun Heo's avatar
Tejun Heo committed
717
718
719
720
721
722
723
724
725
 *
 * CONTEXT:
 * spin_lock(&bdev_lock).
 *
 * RETURNS:
 * %true if @bdev can be claimed, %false otherwise.
 */
static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
			 void *holder)
Linus Torvalds's avatar
Linus Torvalds committed
726
727
{
	if (bdev->bd_holder == holder)
Tejun Heo's avatar
Tejun Heo committed
728
		return true;	 /* already a holder */
Linus Torvalds's avatar
Linus Torvalds committed
729
	else if (bdev->bd_holder != NULL)
Tejun Heo's avatar
Tejun Heo committed
730
		return false; 	 /* held by someone else */
Linus Torvalds's avatar
Linus Torvalds committed
731
	else if (bdev->bd_contains == bdev)
Tejun Heo's avatar
Tejun Heo committed
732
		return true;  	 /* is a whole device which isn't held */
Linus Torvalds's avatar
Linus Torvalds committed
733

734
	else if (whole->bd_holder == bd_may_claim)
Tejun Heo's avatar
Tejun Heo committed
735
736
737
		return true; 	 /* is a partition of a device that is being partitioned */
	else if (whole->bd_holder != NULL)
		return false;	 /* is a partition of a held device */
Linus Torvalds's avatar
Linus Torvalds committed
738
	else
Tejun Heo's avatar
Tejun Heo committed
739
740
741
		return true;	 /* is a partition of an un-held device */
}

742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
/**
 * bd_prepare_to_claim - prepare to claim a block device
 * @bdev: block device of interest
 * @whole: the whole device containing @bdev, may equal @bdev
 * @holder: holder trying to claim @bdev
 *
 * Prepare to claim @bdev.  This function fails if @bdev is already
 * claimed by another holder and waits if another claiming is in
 * progress.  This function doesn't actually claim.  On successful
 * return, the caller has ownership of bd_claiming and bd_holder[s].
 *
 * CONTEXT:
 * spin_lock(&bdev_lock).  Might release bdev_lock, sleep and regrab
 * it multiple times.
 *
 * RETURNS:
 * 0 if @bdev can be claimed, -EBUSY otherwise.
 */
static int bd_prepare_to_claim(struct block_device *bdev,
			       struct block_device *whole, void *holder)
{
retry:
	/* if someone else claimed, fail */
	if (!bd_may_claim(bdev, whole, holder))
		return -EBUSY;

768
769
	/* if claiming is already in progress, wait for it to finish */
	if (whole->bd_claiming) {
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
		wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
		DEFINE_WAIT(wait);

		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&bdev_lock);
		schedule();
		finish_wait(wq, &wait);
		spin_lock(&bdev_lock);
		goto retry;
	}

	/* yay, all mine */
	return 0;
}

/**
 * bd_start_claiming - start claiming a block device
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 *
 * @bdev is about to be opened exclusively.  Check @bdev can be opened
 * exclusively and mark that an exclusive open is in progress.  Each
 * successful call to this function must be matched with a call to
Nick Piggin's avatar
Nick Piggin committed
793
794
795
796
797
798
799
 * either bd_finish_claiming() or bd_abort_claiming() (which do not
 * fail).
 *
 * This function is used to gain exclusive access to the block device
 * without actually causing other exclusive open attempts to fail. It
 * should be used when the open sequence itself requires exclusive
 * access but may subsequently fail.
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * Pointer to the block device containing @bdev on success, ERR_PTR()
 * value on failure.
 */
static struct block_device *bd_start_claiming(struct block_device *bdev,
					      void *holder)
{
	struct gendisk *disk;
	struct block_device *whole;
	int partno, err;

	might_sleep();

	/*
	 * @bdev might not have been initialized properly yet, look up
	 * and grab the outer block device the hard way.
	 */
	disk = get_gendisk(bdev->bd_dev, &partno);
	if (!disk)
		return ERR_PTR(-ENXIO);

825
826
827
828
829
830
831
832
833
834
835
836
837
	/*
	 * Normally, @bdev should equal what's returned from bdget_disk()
	 * if partno is 0; however, some drivers (floppy) use multiple
	 * bdev's for the same physical device and @bdev may be one of the
	 * aliases.  Keep @bdev if partno is 0.  This means claimer
	 * tracking is broken for those devices but it has always been that
	 * way.
	 */
	if (partno)
		whole = bdget_disk(disk, 0);
	else
		whole = bdgrab(bdev);

838
	module_put(disk->fops->owner);
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
	put_disk(disk);
	if (!whole)
		return ERR_PTR(-ENOMEM);

	/* prepare to claim, if successful, mark claiming in progress */
	spin_lock(&bdev_lock);

	err = bd_prepare_to_claim(bdev, whole, holder);
	if (err == 0) {
		whole->bd_claiming = holder;
		spin_unlock(&bdev_lock);
		return whole;
	} else {
		spin_unlock(&bdev_lock);
		bdput(whole);
		return ERR_PTR(err);
	}
}

858
#ifdef CONFIG_SYSFS
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
/*
 * One entry on a block device's bd_holder_disks list: a disk that holds
 * the device, together with a count of how often it was linked.
 */
struct bd_holder_disk {
	struct list_head	list;	/* link in bdev->bd_holder_disks */
	struct gendisk		*disk;	/* the holding disk */
	int			refcnt;	/* number of bd_link_disk_holder() calls */
};

static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
						  struct gendisk *disk)
{
	struct bd_holder_disk *holder;

	list_for_each_entry(holder, &bdev->bd_holder_disks, list)
		if (holder->disk == disk)
			return holder;
	return NULL;
}

876
/* Create a sysfs link in @from that points at @to and carries @to's name. */
static int add_symlink(struct kobject *from, struct kobject *to)
{
	const char *name = kobject_name(to);

	return sysfs_create_link(from, to, name);
}

/* Remove the sysfs link in @from that is named after @to. */
static void del_symlink(struct kobject *from, struct kobject *to)
{
	sysfs_remove_link(from, kobject_name(to));
}

886
/**
887
888
889
 * bd_link_disk_holder - create symlinks between holding disk and slave bdev
 * @bdev: the claimed slave bdev
 * @disk: the holding disk
890
 *
891
892
 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
 *
893
 * This functions creates the following sysfs symlinks.
894
 *
895
896
 * - from "slaves" directory of the holder @disk to the claimed @bdev
 * - from "holders" directory of the @bdev to the holder @disk
897
 *
898
899
 * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
 * passed to bd_link_disk_holder(), then:
900
 *
901
902
 *   /sys/block/dm-0/slaves/sda --> /sys/block/sda
 *   /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
903
 *
904
905
906
 * The caller must have claimed @bdev before calling this function and
 * ensure that both @bdev and @disk are valid during the creation and
 * lifetime of these symlinks.
907
 *
908
909
 * CONTEXT:
 * Might sleep.
910
 *
911
912
 * RETURNS:
 * 0 on success, -errno on failure.
913
 */
914
int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
915
{
916
	struct bd_holder_disk *holder;
917
	int ret = 0;
918

919
	mutex_lock(&bdev->bd_mutex);
920

921
	WARN_ON_ONCE(!bdev->bd_holder);
922

923
924
925
	/* FIXME: remove the following once add_disk() handles errors */
	if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
		goto out_unlock;
926

927
928
929
	holder = bd_find_holder_disk(bdev, disk);
	if (holder) {
		holder->refcnt++;
930
		goto out_unlock;
931
	}
932

933
934
935
	holder = kzalloc(sizeof(*holder), GFP_KERNEL);
	if (!holder) {
		ret = -ENOMEM;
936
937
		goto out_unlock;
	}
938

939
940
941
942
943
944
945
946
947
948
949
	INIT_LIST_HEAD(&holder->list);
	holder->disk = disk;
	holder->refcnt = 1;

	ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
	if (ret)
		goto out_free;

	ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
	if (ret)
		goto out_del;
950
951
952
953
954
	/*
	 * bdev could be deleted beneath us which would implicitly destroy
	 * the holder directory.  Hold on to it.
	 */
	kobject_get(bdev->bd_part->holder_dir);
955
956
957
958
959
960
961
962

	list_add(&holder->list, &bdev->bd_holder_disks);
	goto out_unlock;

out_del:
	del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
out_free:
	kfree(holder);
963
out_unlock:
964
	mutex_unlock(&bdev->bd_mutex);
965
	return ret;
966
}
967
EXPORT_SYMBOL_GPL(bd_link_disk_holder);
968

969
970
971
972
973
974
975
976
977
978
979
/**
 * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
 * @bdev: the calimed slave bdev
 * @disk: the holding disk
 *
 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
 *
 * CONTEXT:
 * Might sleep.
 */
void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
980
{
981
	struct bd_holder_disk *holder;
982

983
	mutex_lock(&bdev->bd_mutex);
984

985
986
987
988
989
990
	holder = bd_find_holder_disk(bdev, disk);

	if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
		del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
		del_symlink(bdev->bd_part->holder_dir,
			    &disk_to_dev(disk)->kobj);
991
		kobject_put(bdev->bd_part->holder_dir);
992
993
994
995
996
		list_del_init(&holder->list);
		kfree(holder);
	}

	mutex_unlock(&bdev->bd_mutex);
Linus Torvalds's avatar
Linus Torvalds committed
997
}
998
EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
#endif /* CONFIG_SYSFS */

1001
1002
1003
1004
/**
 * flush_disk - invalidates all buffer-cache entries on a disk
 *
 * @bdev:      struct block device to be flushed
1005
 * @kill_dirty: flag to guide handling of dirty inodes
1006
1007
1008
1009
1010
 *
 * Invalidates all buffer-cache entries on a disk. It should be called
 * when a disk has been changed -- either by a media change or online
 * resize.
 */
1011
static void flush_disk(struct block_device *bdev, bool kill_dirty)
1012
{
1013
	if (__invalidate_device(bdev, kill_dirty)) {
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
		char name[BDEVNAME_SIZE] = "";

		if (bdev->bd_disk)
			disk_name(bdev->bd_disk, 0, name);
		printk(KERN_WARNING "VFS: busy inodes on changed media or "
		       "resized disk %s\n", name);
	}

	if (!bdev->bd_disk)
		return;
Tejun Heo's avatar
Tejun Heo committed
1024
	if (disk_part_scan_enabled(bdev->bd_disk))
1025
1026
1027
		bdev->bd_invalidated = 1;
}

1028
/**
1029
 * check_disk_size_change - checks for disk size change and adjusts bdev size.
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
 * @disk: struct gendisk to check
 * @bdev: struct bdev to adjust.
 *
 * This routine checks to see if the bdev size does not match the disk size
 * and adjusts it if it differs.
 */
void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
{
	loff_t disk_size, bdev_size;

	disk_size = (loff_t)get_capacity(disk) << 9;
	bdev_size = i_size_read(bdev->bd_inode);
	if (disk_size != bdev_size) {
		char name[BDEVNAME_SIZE];

		disk_name(disk, 0, name);
		printk(KERN_INFO
		       "%s: detected capacity change from %lld to %lld\n",
		       name, bdev_size, disk_size);
		i_size_write(bdev->bd_inode, disk_size);
1050
		flush_disk(bdev, false);
1051
1052
1053
1054
	}
}
EXPORT_SYMBOL(check_disk_size_change);

1055
/**
1056
 * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
1057
1058
1059
1060
1061
1062
1063
1064
 * @disk: struct gendisk to be revalidated
 *
 * This routine is a wrapper for lower-level driver's revalidate_disk
 * call-backs.  It is used to do common pre and post operations needed
 * for all revalidate_disk operations.
 */
int revalidate_disk(struct gendisk *disk)
{
1065
	struct block_device *bdev;
1066
1067
1068
1069
1070
	int ret = 0;

	if (disk->fops->revalidate_disk)
		ret = disk->fops->revalidate_disk(disk);

1071
1072
1073
1074
1075
1076
1077
1078
	bdev = bdget_disk(disk, 0);
	if (!bdev)
		return ret;

	mutex_lock(&bdev->bd_mutex);
	check_disk_size_change(disk, bdev);
	mutex_unlock(&bdev->bd_mutex);
	bdput(bdev);
1079
1080
1081
1082
	return ret;
}
EXPORT_SYMBOL(revalidate_disk);

Linus Torvalds's avatar
Linus Torvalds committed
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
/*
 * This routine checks whether a removable media has been changed,
 * and invalidates all buffer-cache-entries in that case. This
 * is a relatively slow routine, so we have to try to minimize using
 * it. Thus it is called only upon a 'mount' or 'open'. This
 * is the best way of combining speed and utility, I think.
 * People changing diskettes in the middle of an operation deserve
 * to lose :-)
 */
int check_disk_change(struct block_device *bdev)
{
	struct gendisk *disk = bdev->bd_disk;
1095
	const struct block_device_operations *bdops = disk->fops;
1096
	unsigned int events;
Linus Torvalds's avatar
Linus Torvalds committed
1097

1098
1099
1100
	events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
				   DISK_EVENT_EJECT_REQUEST);
	if (!(events & DISK_EVENT_MEDIA_CHANGE))
Linus Torvalds's avatar
Linus Torvalds committed
1101
1102
		return 0;

1103
	flush_disk(bdev, true);
Linus Torvalds's avatar
Linus Torvalds committed
1104
1105
1106
1107
1108
1109
1110
1111
1112
	if (bdops->revalidate_disk)
		bdops->revalidate_disk(bdev->bd_disk);
	return 1;
}

EXPORT_SYMBOL(check_disk_change);

/*
 * Set the size of @bdev's inode to @size and derive the block size from it.
 *
 * The block size starts at the device's logical block size and is doubled
 * for as long as it is below PAGE_CACHE_SIZE and the corresponding bit of
 * @size is clear.  The result is stored in bd_block_size, and the matching
 * bit count in the inode's i_blkbits.
 */
void bd_set_size(struct block_device *bdev, loff_t size)
{
	unsigned bsize = bdev_logical_block_size(bdev);

	bdev->bd_inode->i_size = size;
	while (bsize < PAGE_CACHE_SIZE) {
		/* Stop once @size is not a multiple of the next size up. */
		if (size & bsize)
			break;
		bsize <<= 1;
	}
	bdev->bd_block_size = bsize;
	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
}
EXPORT_SYMBOL(bd_set_size);

Al Viro's avatar
Al Viro committed
1126
/* Declared early because __blkdev_get() calls it on its error path. */
static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
1127

1128
1129
1130
1131
1132
1133
1134
/*
 * bd_mutex locking:
 *
 *  mutex_lock(part->bd_mutex)
 *    mutex_lock_nested(whole->bd_mutex, 1)
 */

1135
static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
Linus Torvalds's avatar
Linus Torvalds committed
1136
1137
{
	struct gendisk *disk;
1138
	struct module *owner;
1139
	int ret;
1140
	int partno;
1141
1142
	int perm = 0;

1143
	if (mode & FMODE_READ)
1144
		perm |= MAY_READ;
1145
	if (mode & FMODE_WRITE)
1146
1147
1148
1149
		perm |= MAY_WRITE;
	/*
	 * hooks: /n/, see "layering violations".
	 */
1150
1151
1152
1153
1154
1155
	if (!for_part) {
		ret = devcgroup_inode_permission(bdev->bd_inode, perm);
		if (ret != 0) {
			bdput(bdev);
			return ret;
		}
1156
	}
1157

1158
 restart:
Tejun Heo's avatar
Tejun Heo committed
1159

1160
	ret = -ENXIO;
1161
	disk = get_gendisk(bdev->bd_dev, &partno);
Tejun Heo's avatar
Tejun Heo committed
1162
	if (!disk)
1163
		goto out;
1164
	owner = disk->fops->owner;
Linus Torvalds's avatar
Linus Torvalds committed
1165

1166
	disk_block_events(disk);
1167
	mutex_lock_nested(&bdev->bd_mutex, for_part);
Linus Torvalds's avatar
Linus Torvalds committed
1168
1169
	if (!bdev->bd_openers) {
		bdev->bd_disk = disk;
1170
		bdev->bd_queue = disk->queue;
Linus Torvalds's avatar
Linus Torvalds committed
1171
		bdev->bd_contains = bdev;
1172
		if (!partno) {
Linus Torvalds's avatar
Linus Torvalds committed
1173
			struct backing_dev_info *bdi;
1174
1175
1176
1177
1178
1179

			ret = -ENXIO;
			bdev->bd_part = disk_get_part(disk, partno);
			if (!bdev->bd_part)
				goto out_clear;

1180
			ret = 0;
Linus Torvalds's avatar
Linus Torvalds committed
1181
			if (disk->fops->open) {
1182
				ret = disk->fops->open(bdev, mode);
1183
1184
1185
1186
1187
1188
1189
1190
				if (ret == -ERESTARTSYS) {
					/* Lost a race with 'disk' being
					 * deleted, try again.
					 * See md.c
					 */
					disk_put_part(bdev->bd_part);
					bdev->bd_part = NULL;
					bdev->bd_disk = NULL;
1191
					bdev->bd_queue = NULL;
1192
					mutex_unlock(&bdev->bd_mutex);
1193
1194
					disk_unblock_events(disk);
					put_disk(disk);
1195
					module_put(owner);
1196
1197
					goto restart;
				}
Linus Torvalds's avatar
Linus Torvalds committed
1198
			}
1199
1200
1201
1202
1203
1204
1205
1206
1207

			if (!ret && !bdev->bd_openers) {
				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
				bdi = blk_get_backing_dev_info(bdev);
				if (bdi == NULL)
					bdi = &default_backing_dev_info;
				bdev_inode_switch_bdi(bdev->bd_inode, bdi);
			}

1208
1209
1210
1211
1212
1213
			/*
			 * If the device is invalidated, rescan partition
			 * if open succeeded or failed with -ENOMEDIUM.
			 * The latter is necessary to prevent ghost
			 * partitions on a removed medium.
			 */
1214
1215
1216
1217
1218
1219
			if (bdev->bd_invalidated) {
				if (!ret)
					rescan_partitions(disk, bdev);
				else if (ret == -ENOMEDIUM)
					invalidate_partitions(disk, bdev);
			}
1220
1221
			if (ret)
				goto out_clear;
Linus Torvalds's avatar
Linus Torvalds committed
1222
1223
1224
1225
1226
		} else {
			struct block_device *whole;
			whole = bdget_disk(disk, 0);
			ret = -ENOMEM;
			if (!whole)
Tejun Heo's avatar
Tejun Heo committed
1227
				goto out_clear;
1228
			BUG_ON(for_part);
1229
			ret = __blkdev_get(whole, mode, 1);
Linus Torvalds's avatar
Linus Torvalds committed
1230
			if (ret)
Tejun Heo's avatar
Tejun Heo committed
1231
				goto out_clear;
Linus Torvalds's avatar
Linus Torvalds committed
1232
			bdev->bd_contains = whole;
1233
1234
			bdev_inode_switch_bdi(bdev->bd_inode,
				whole->bd_inode->i_data.backing_dev_info);
1235
			bdev->bd_part = disk_get_part(disk, partno);
1236
			if (!(disk->flags & GENHD_FL_UP) ||
1237
			    !bdev->bd_part || !bdev->bd_part->nr_sects) {
Linus Torvalds's avatar
Linus Torvalds committed
1238
				ret = -ENXIO;
Tejun Heo's avatar
Tejun Heo committed
1239
				goto out_clear;
Linus Torvalds's avatar
Linus Torvalds committed
1240
			}
1241
			bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
Linus Torvalds's avatar
Linus Torvalds committed
1242
1243
1244
		}
	} else {
		if (bdev->bd_contains == bdev) {
1245
1246
			ret = 0;
			if (bdev->bd_disk->fops->open)
1247
				ret = bdev->bd_disk->fops->open(bdev, mode);
1248
			/* the same as first opener case, read comment there */
1249
1250
1251
1252
1253
1254
			if (bdev->bd_invalidated) {
				if (!ret)
					rescan_partitions(bdev->bd_disk, bdev);
				else if (ret == -ENOMEDIUM)
					invalidate_partitions(bdev->bd_disk, bdev);
			}
1255
1256
			if (ret)
				goto out_unlock_bdev;
Linus Torvalds's avatar
Linus Torvalds committed
1257
		}
1258
1259
		/* only one opener holds refs to the module and disk */
		put_disk(disk);
1260
		module_put(owner);
Linus Torvalds's avatar
Linus Torvalds committed
1261
1262
	}
	bdev->bd_openers++;
1263
1264
	if (for_part)
		bdev->bd_part_count++;
1265
	mutex_unlock(&bdev->bd_mutex);
1266
	disk_unblock_events(disk);
Linus Torvalds's avatar
Linus Torvalds committed
1267
1268
	return 0;

Tejun Heo's avatar
Tejun Heo committed
1269
 out_clear:
1270
	disk_put_part(bdev->bd_part);
Linus Torvalds's avatar
Linus Torvalds committed
1271
	bdev->bd_disk = NULL;
Tejun Heo's avatar
Tejun Heo committed
1272
	bdev->bd_part = NULL;
1273
	bdev->bd_queue = NULL;
1274
	bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
Linus Torvalds's avatar
Linus Torvalds committed
1275
	if (bdev != bdev->bd_contains)
1276
		__blkdev_put(bdev->bd_contains, mode, 1);
Linus Torvalds's avatar
Linus Torvalds committed
1277
	bdev->bd_contains = NULL;
Tejun Heo's avatar
Tejun Heo committed
1278
 out_unlock_bdev:
1279
	mutex_unlock(&bdev->bd_mutex);
1280
	disk_unblock_events(disk);
Tejun Heo's avatar
Tejun Heo committed
1281
	put_disk(disk);