/*
 *  linux/fs/namespace.c
 *
 * (C) Copyright Al Viro 2000, 2001
 *	Released under GPL v2.
 *
 * Based on code from fs/super.c, copyright Linus Torvalds and others.
 * Heavily rewritten.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/capability.h>
#include <linux/mnt_namespace.h>
#include <linux/user_namespace.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/idr.h>
#include <linux/init.h>		/* init_rootfs */
#include <linux/fs_struct.h>	/* get_fs_root et al. */
#include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
#include <linux/uaccess.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/memblock.h>
#include <linux/task_work.h>
#include <linux/sched/task.h>

#include "pnode.h"
#include "internal.h"

/* Maximum number of mounts in a mount namespace */
unsigned int sysctl_mount_max __read_mostly = 100000;

static unsigned int m_hash_mask __read_mostly;
static unsigned int m_hash_shift __read_mostly;
static unsigned int mp_hash_mask __read_mostly;
static unsigned int mp_hash_shift __read_mostly;

static __initdata unsigned long mhash_entries;
static int __init set_mhash_entries(char *str)
{
	if (!str)
		return 0;
	mhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("mhash_entries=", set_mhash_entries);

static __initdata unsigned long mphash_entries;
static int __init set_mphash_entries(char *str)
{
	if (!str)
		return 0;
	mphash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("mphash_entries=", set_mphash_entries);

static u64 event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);

static struct hlist_head *mount_hashtable __read_mostly;
static struct hlist_head *mountpoint_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
static DECLARE_RWSEM(namespace_sem);

/* /sys/fs */
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);

/*
 * vfsmount lock may be taken for read to prevent changes to the
 * vfsmount hash, i.e. during mountpoint lookups or walking back
 * up the tree.
 *
 * It should be taken for write in all cases where the vfsmount
 * tree or hash is modified or when a vfsmount structure is modified.
 */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
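
/*
 * Illustrative reader-side sketch (not from the original file): lockless
 * readers sample mount_lock and retry if a writer intervened, e.g.
 *
 *	unsigned seq;
 *	do {
 *		seq = read_seqbegin(&mount_lock);
 *		... walk the mount hash ...
 *	} while (read_seqretry(&mount_lock, seq));
 *
 * lookup_mnt() below follows exactly this pattern.
 */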

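/*
 * Hash a (mnt, dentry) pair into mount_hashtable; mp_hash() below does the
 * same for bare mountpoint dentries.
 */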
static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
{
	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> m_hash_shift);
	return &mount_hashtable[tmp & m_hash_mask];
}

static inline struct hlist_head *mp_hash(struct dentry *dentry)
{
	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> mp_hash_shift);
	return &mountpoint_hashtable[tmp & mp_hash_mask];
}

static int mnt_alloc_id(struct mount *mnt)
{
	int res = ida_alloc(&mnt_id_ida, GFP_KERNEL);

	if (res < 0)
		return res;
	mnt->mnt_id = res;
	return 0;
}

static void mnt_free_id(struct mount *mnt)
{
	ida_free(&mnt_id_ida, mnt->mnt_id);
}

/*
 * Allocate a new peer group ID
 */
static int mnt_alloc_group_id(struct mount *mnt)
{
	int res = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL);

	if (res < 0)
		return res;
	mnt->mnt_group_id = res;
	return 0;
}

/*
 * Release a peer group ID
 */
void mnt_release_group_id(struct mount *mnt)
{
	ida_free(&mnt_group_ida, mnt->mnt_group_id);
	mnt->mnt_group_id = 0;
}

/*
 * vfsmount lock must be held for read
 */
static inline void mnt_add_count(struct mount *mnt, int n)
{
#ifdef CONFIG_SMP
	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
#else
	preempt_disable();
	mnt->mnt_count += n;
	preempt_enable();
#endif
}

/*
 * vfsmount lock must be held for write
 */
unsigned int mnt_get_count(struct mount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
	}

	return count;
#else
	return mnt->mnt_count;
#endif
}

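/*
 * fs_pin callback: drop the pinned ex-mountpoint dentry and release the
 * mount itself; wired up via init_fs_pin() in alloc_vfsmnt().
 */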
static void drop_mountpoint(struct fs_pin *p)
{
	struct mount *m = container_of(p, struct mount, mnt_umount);
	dput(m->mnt_ex_mountpoint);
	pin_remove(p);
	mntput(&m->mnt);
}

static struct mount *alloc_vfsmnt(const char *name)
{
	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
	if (mnt) {
		int err;

		err = mnt_alloc_id(mnt);
		if (err)
			goto out_free_cache;

		if (name) {
			mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL);
			if (!mnt->mnt_devname)
				goto out_free_id;
		}

#ifdef CONFIG_SMP
		mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
		if (!mnt->mnt_pcp)
			goto out_free_devname;

		this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
#else
		mnt->mnt_count = 1;
		mnt->mnt_writers = 0;
#endif

		INIT_HLIST_NODE(&mnt->mnt_hash);
		INIT_LIST_HEAD(&mnt->mnt_child);
		INIT_LIST_HEAD(&mnt->mnt_mounts);
		INIT_LIST_HEAD(&mnt->mnt_list);
		INIT_LIST_HEAD(&mnt->mnt_expire);
		INIT_LIST_HEAD(&mnt->mnt_share);
		INIT_LIST_HEAD(&mnt->mnt_slave_list);
		INIT_LIST_HEAD(&mnt->mnt_slave);
		INIT_HLIST_NODE(&mnt->mnt_mp_list);
		INIT_LIST_HEAD(&mnt->mnt_umounting);
		init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
	}
	return mnt;

#ifdef CONFIG_SMP
out_free_devname:
	kfree_const(mnt->mnt_devname);
#endif
out_free_id:
	mnt_free_id(mnt);
out_free_cache:
	kmem_cache_free(mnt_cache, mnt);
	return NULL;
}

/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
 * We must keep track of when those operations start
 * (for permission checks) and when they end, so that
 * we can determine when writes are able to occur to
 * a filesystem.
 */
/*
 * __mnt_is_readonly: check whether a mount is read-only
 * @mnt: the mount to check for its write status
 *
 * This shouldn't be used directly outside of the VFS.
 * It does not guarantee that the filesystem will stay
 * r/w, just that it is right *now*.  This cannot and
 * should not be used in place of IS_RDONLY(inode).
 * mnt_want/drop_write() will _keep_ the filesystem
 * r/w.
 */
int __mnt_is_readonly(struct vfsmount *mnt)
{
	if (mnt->mnt_flags & MNT_READONLY)
		return 1;
	if (sb_rdonly(mnt->mnt_sb))
		return 1;
	return 0;
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);

static inline void mnt_inc_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers++;
#endif
}

static inline void mnt_dec_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers--;
#endif
}

static unsigned int mnt_get_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
	}

	return count;
#else
	return mnt->mnt_writers;
#endif
}

static int mnt_is_readonly(struct vfsmount *mnt)
{
	if (mnt->mnt_sb->s_readonly_remount)
		return 1;
	/* Order wrt setting s_flags/s_readonly_remount in do_remount() */
	smp_rmb();
	return __mnt_is_readonly(mnt);
}

/*
 * Most r/o & frozen checks on a fs are for operations that take discrete
 * amounts of time, like a write() or unlink().  We must keep track of when
 * those operations start (for permission checks) and when they end, so that we
 * can determine when writes are able to occur to a filesystem.
 */
/**
 * __mnt_want_write - get write access to a mount without freeze protection
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write) before
 * returning success. This operation does not protect against the filesystem
 * being frozen. When the write operation is finished, __mnt_drop_write() must
 * be called. This is effectively a refcount.
 */
int __mnt_want_write(struct vfsmount *m)
{
	struct mount *mnt = real_mount(m);
	int ret = 0;

	preempt_disable();
	mnt_inc_writers(mnt);
	/*
	 * The store to mnt_inc_writers must be visible before we pass
	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
	 * incremented count after it has set MNT_WRITE_HOLD.
	 */
	smp_mb();
	while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
		cpu_relax();
	/*
	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
	 * be set to match its requirements. So we must not load that until
	 * MNT_WRITE_HOLD is cleared.
	 */
	smp_rmb();
	if (mnt_is_readonly(m)) {
		mnt_dec_writers(mnt);
		ret = -EROFS;
	}
	preempt_enable();

	return ret;
}

/**
 * mnt_want_write - get write access to a mount
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write, filesystem
 * is not frozen) before returning success.  When the write operation is
 * finished, mnt_drop_write() must be called.  This is effectively a refcount.
 */
int mnt_want_write(struct vfsmount *m)
{
	int ret;

	sb_start_write(m->mnt_sb);
	ret = __mnt_want_write(m);
	if (ret)
		sb_end_write(m->mnt_sb);
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);
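
/*
 * Typical caller pattern (an illustrative sketch, not taken from this file):
 *
 *	err = mnt_want_write(path->mnt);
 *	if (err)
 *		return err;
 *	... modify the filesystem ...
 *	mnt_drop_write(path->mnt);
 */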

/**
 * mnt_clone_write - get write access to a mount
 * @mnt: the mount on which to take a write
 *
 * This is effectively like mnt_want_write, except
 * it must only be used to take an extra write reference
 * on a mountpoint that we already know has a write reference
 * on it. This allows some optimisation.
 *
 * When finished, mnt_drop_write must be called as usual to
 * drop the reference.
 */
int mnt_clone_write(struct vfsmount *mnt)
{
	/* superblock may be r/o */
	if (__mnt_is_readonly(mnt))
		return -EROFS;
	preempt_disable();
	mnt_inc_writers(real_mount(mnt));
	preempt_enable();
	return 0;
}
EXPORT_SYMBOL_GPL(mnt_clone_write);

/**
 * __mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount on which to take a write
 *
 * This is like __mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int __mnt_want_write_file(struct file *file)
{
	if (!(file->f_mode & FMODE_WRITER))
		return __mnt_want_write(file->f_path.mnt);
	else
		return mnt_clone_write(file->f_path.mnt);
}

/**
 * mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount on which to take a write
 *
 * This is like mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int mnt_want_write_file(struct file *file)
{
	int ret;

	sb_start_write(file_inode(file)->i_sb);
	ret = __mnt_want_write_file(file);
	if (ret)
		sb_end_write(file_inode(file)->i_sb);
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write_file);

/**
 * __mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it.  Must be matched with
 * __mnt_want_write() call above.
 */
void __mnt_drop_write(struct vfsmount *mnt)
{
	preempt_disable();
	mnt_dec_writers(real_mount(mnt));
	preempt_enable();
}

/**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done performing writes to it and
 * also allows filesystem to be frozen again.  Must be matched with
 * mnt_want_write() call above.
 */
void mnt_drop_write(struct vfsmount *mnt)
{
	__mnt_drop_write(mnt);
	sb_end_write(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);

void __mnt_drop_write_file(struct file *file)
{
	__mnt_drop_write(file->f_path.mnt);
}

void mnt_drop_write_file(struct file *file)
{
	__mnt_drop_write_file(file);
	sb_end_write(file_inode(file)->i_sb);
}
EXPORT_SYMBOL(mnt_drop_write_file);

static int mnt_make_readonly(struct mount *mnt)
{
	int ret = 0;

	lock_mount_hash();
	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
	/*
	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
	 * should be visible before we do.
	 */
	smp_mb();

	/*
	 * With writers on hold, if this value is zero, then there are
	 * definitely no active writers (although held writers may subsequently
	 * increment the count, they'll have to wait, and decrement it after
	 * seeing MNT_READONLY).
	 *
	 * It is OK to have counter incremented on one CPU and decremented on
	 * another: the sum will add up correctly. The danger would be when we
	 * sum up each counter, if we read a counter before it is incremented,
	 * but then read another CPU's count which it has been subsequently
	 * decremented from -- we would see more decrements than we should.
	 * MNT_WRITE_HOLD protects against this scenario, because
	 * mnt_want_write first increments count, then smp_mb, then spins on
	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
	 * we're counting up here.
	 */
	if (mnt_get_writers(mnt) > 0)
		ret = -EBUSY;
	else
		mnt->mnt.mnt_flags |= MNT_READONLY;
	/*
	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
	 * that become unheld will see MNT_READONLY.
	 */
	smp_wmb();
	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
	unlock_mount_hash();
	return ret;
}

static void __mnt_unmake_readonly(struct mount *mnt)
{
	lock_mount_hash();
	mnt->mnt.mnt_flags &= ~MNT_READONLY;
	unlock_mount_hash();
}

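/*
 * Called before a remount flips the whole superblock read-only: returns
 * -EBUSY unless no mount of @sb has active writers.
 */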
int sb_prepare_remount_readonly(struct super_block *sb)
{
	struct mount *mnt;
	int err = 0;

	/* Racy optimization.  Recheck the counter under MNT_WRITE_HOLD */
	if (atomic_long_read(&sb->s_remove_count))
		return -EBUSY;

	lock_mount_hash();
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
			mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
			smp_mb();
			if (mnt_get_writers(mnt) > 0) {
				err = -EBUSY;
				break;
			}
		}
	}
	if (!err && atomic_long_read(&sb->s_remove_count))
		err = -EBUSY;

	if (!err) {
		sb->s_readonly_remount = 1;
		smp_wmb();
	}
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
	}
	unlock_mount_hash();

	return err;
}

static void free_vfsmnt(struct mount *mnt)
{
	kfree_const(mnt->mnt_devname);
#ifdef CONFIG_SMP
	free_percpu(mnt->mnt_pcp);
#endif
	kmem_cache_free(mnt_cache, mnt);
}

static void delayed_free_vfsmnt(struct rcu_head *head)
{
	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
}

/* call under rcu_read_lock */
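/*
 * Returns 0 if a reference was grabbed, 1 on failure with nothing to undo,
 * and -1 on failure where the caller must mntput() the mount itself (see
 * legitimize_mnt() below).
 */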
int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
	struct mount *mnt;
	if (read_seqretry(&mount_lock, seq))
		return 1;
	if (bastard == NULL)
		return 0;
	mnt = real_mount(bastard);
	mnt_add_count(mnt, 1);
	smp_mb();			// see mntput_no_expire()
	if (likely(!read_seqretry(&mount_lock, seq)))
		return 0;
	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
		mnt_add_count(mnt, -1);
		return 1;
	}
	lock_mount_hash();
	if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
		mnt_add_count(mnt, -1);
		unlock_mount_hash();
		return 1;
	}
	unlock_mount_hash();
	/* caller will mntput() */
	return -1;
}

/* call under rcu_read_lock */
bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
	int res = __legitimize_mnt(bastard, seq);
	if (likely(!res))
		return true;
	if (unlikely(res < 0)) {
		rcu_read_unlock();
		mntput(bastard);
		rcu_read_lock();
	}
	return false;
}

/*
 * find the first mount at @dentry on vfsmount @mnt.
 * call under rcu_read_lock()
 */
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
	struct hlist_head *head = m_hash(mnt, dentry);
	struct mount *p;

	hlist_for_each_entry_rcu(p, head, mnt_hash)
		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
			return p;
	return NULL;
}

/*
 * lookup_mnt - Return the first child mount mounted at path
 *
 * "First" means first mounted chronologically.  If you create the
 * following mounts:
 *
 * mount /dev/sda1 /mnt
 * mount /dev/sda2 /mnt
 * mount /dev/sda3 /mnt
 *
 * Then lookup_mnt() on the base /mnt dentry in the root mount will
 * return successively the root dentry and vfsmount of /dev/sda1, then
 * /dev/sda2, then /dev/sda3, then NULL.
 *
 * lookup_mnt takes a reference to the found vfsmount.
 */
struct vfsmount *lookup_mnt(const struct path *path)
{
	struct mount *child_mnt;
	struct vfsmount *m;
	unsigned seq;

	rcu_read_lock();
	do {
		seq = read_seqbegin(&mount_lock);
		child_mnt = __lookup_mnt(path->mnt, path->dentry);
		m = child_mnt ? &child_mnt->mnt : NULL;
	} while (!legitimize_mnt(m, seq));
	rcu_read_unlock();
	return m;
}

/*
 * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
 *                         current mount namespace.
 *
 * The common case is dentries are not mountpoints at all and that
 * test is handled inline.  For the slow case when we are actually
 * dealing with a mountpoint of some kind, walk through all of the
 * mounts in the current mount namespace and test to see if the dentry
 * is a mountpoint.
 *
 * The mount_hashtable is not usable in this context because we
 * need to identify all mounts that may be in the current mount
 * namespace not just a mount that happens to have some specified
 * parent mount.
 */
bool __is_local_mountpoint(struct dentry *dentry)
{
	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
	struct mount *mnt;
	bool is_covered = false;

	if (!d_mountpoint(dentry))
		goto out;

	down_read(&namespace_sem);
	list_for_each_entry(mnt, &ns->list, mnt_list) {
		is_covered = (mnt->mnt_mountpoint == dentry);
		if (is_covered)
			break;
	}
	up_read(&namespace_sem);
out:
	return is_covered;
}

static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
{
	struct hlist_head *chain = mp_hash(dentry);
	struct mountpoint *mp;

	hlist_for_each_entry(mp, chain, m_hash) {
		if (mp->m_dentry == dentry) {
			mp->m_count++;
			return mp;
		}
	}
	return NULL;
}

static struct mountpoint *get_mountpoint(struct dentry *dentry)
{
	struct mountpoint *mp, *new = NULL;
	int ret;

	if (d_mountpoint(dentry)) {
		/* might be worth a WARN_ON() */
		if (d_unlinked(dentry))
			return ERR_PTR(-ENOENT);
mountpoint:
		read_seqlock_excl(&mount_lock);
		mp = lookup_mountpoint(dentry);
		read_sequnlock_excl(&mount_lock);
		if (mp)
			goto done;
	}

	if (!new)
		new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
	if (!new)
		return ERR_PTR(-ENOMEM);


	/* Exactly one process may set d_mounted */
	ret = d_set_mounted(dentry);

	/* Someone else set d_mounted? */
	if (ret == -EBUSY)
		goto mountpoint;

	/* The dentry is not available as a mountpoint? */
	mp = ERR_PTR(ret);
	if (ret)
		goto done;

	/* Add the new mountpoint to the hash table */
	read_seqlock_excl(&mount_lock);
	new->m_dentry = dentry;
	new->m_count = 1;
	hlist_add_head(&new->m_hash, mp_hash(dentry));
	INIT_HLIST_HEAD(&new->m_list);
	read_sequnlock_excl(&mount_lock);

	mp = new;
	new = NULL;
done:
	kfree(new);
	return mp;
}

static void put_mountpoint(struct mountpoint *mp)
{
	if (!--mp->m_count) {
		struct dentry *dentry = mp->m_dentry;
		BUG_ON(!hlist_empty(&mp->m_list));
		spin_lock(&dentry->d_lock);
		dentry->d_flags &= ~DCACHE_MOUNTED;
		spin_unlock(&dentry->d_lock);
		hlist_del(&mp->m_hash);
		kfree(mp);
	}
}

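/* Does this mount belong to the caller's mount namespace? */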
static inline int check_mnt(struct mount *mnt)
{
	return mnt->mnt_ns == current->nsproxy->mnt_ns;
}

/*
 * vfsmount lock must be held for write
 */
static void touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns) {
		ns->event = ++event;
		wake_up_interruptible(&ns->poll);
	}
}

/*
 * vfsmount lock must be held for write
 */
static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns && ns->event != event) {
		ns->event = event;
		wake_up_interruptible(&ns->poll);
	}
}

/*
 * vfsmount lock must be held for write
 */
static void unhash_mnt(struct mount *mnt)
{
	mnt->mnt_parent = mnt;
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	list_del_init(&mnt->mnt_child);
	hlist_del_init_rcu(&mnt->mnt_hash);
	hlist_del_init(&mnt->mnt_mp_list);
	put_mountpoint(mnt->mnt_mp);
	mnt->mnt_mp = NULL;
}

/*
 * vfsmount lock must be held for write
 */
static void detach_mnt(struct mount *mnt, struct path *old_path)
{
	old_path->dentry = mnt->mnt_mountpoint;
	old_path->mnt = &mnt->mnt_parent->mnt;
	unhash_mnt(mnt);
}

/*
 * vfsmount lock must be held for write
 */
static void umount_mnt(struct mount *mnt)
{
	/* old mountpoint will be dropped when we can do that */
	mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint;
	unhash_mnt(mnt);
}

/*
 * vfsmount lock must be held for write
 */
void mnt_set_mountpoint(struct mount *mnt,
			struct mountpoint *mp,
			struct mount *child_mnt)
{
	mp->m_count++;
	mnt_add_count(mnt, 1);	/* essentially, that's mntget */
	child_mnt->mnt_mountpoint = dget(mp->m_dentry);
	child_mnt->mnt_parent = mnt;
	child_mnt->mnt_mp = mp;
	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
}

static void __attach_mnt(struct mount *mnt, struct mount *parent)
{
	hlist_add_head_rcu(&mnt->mnt_hash,
			   m_hash(&parent->mnt, mnt->mnt_mountpoint));
	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
}

/*
 * vfsmount lock must be held for write
 */
static void attach_mnt(struct mount *mnt,
			struct mount *parent,
			struct mountpoint *mp)
{
	mnt_set_mountpoint(parent, mp, mnt);
	__attach_mnt(mnt, parent);
}

void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
{
	struct mountpoint *old_mp = mnt->mnt_mp;
	struct dentry *old_mountpoint = mnt->mnt_mountpoint;
	struct mount *old_parent = mnt->mnt_parent;

	list_del_init(&mnt->mnt_child);
	hlist_del_init(&mnt->mnt_mp_list);
	hlist_del_init_rcu(&mnt->mnt_hash);

	attach_mnt(mnt, parent, mp);

	put_mountpoint(old_mp);

	/*
	 * Safely avoid even the suggestion this code might sleep or
	 * lock the mount hash by taking advantage of the knowledge that
	 * mnt_change_mountpoint will not release the final reference
	 * to a mountpoint.
	 *
	 * During mounting, the mount passed in as the parent mount will
	 * continue to use the old mountpoint and during unmounting, the
	 * old mountpoint will continue to exist until namespace_unlock,
	 * which happens well after mnt_change_mountpoint.
	 */
	spin_lock(&old_mountpoint->d_lock);
	old_mountpoint->d_lockref.count--;
	spin_unlock(&old_mountpoint->d_lock);

	mnt_add_count(old_parent, -1);
}

/*
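 * commit_tree - link the cloned tree at @mnt into its parent's namespace
 * and into the mount hash.
 *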
 * vfsmount lock must be held for write
 */
static void commit_tree(struct mount *mnt)
{
	struct mount *parent = mnt->mnt_parent;
	struct mount *m;
	LIST_HEAD(head);
	struct mnt_namespace *n = parent->mnt_ns;

	BUG_ON(parent == mnt);

	list_add_tail(&head, &mnt->mnt_list);
	list_for_each_entry(m, &head, mnt_list)
		m->mnt_ns = n;

	list_splice(&head, n->list.prev);

	n->mounts += n->pending_mounts;
	n->pending_mounts = 0;

	__attach_mnt(mnt, parent);
	touch_mnt_namespace(n);
}

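/* Depth-first iterator: return the mount after @p in the tree under @root. */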
static struct mount *next_mnt(struct mount *p, struct mount *root)
{
	struct list_head *next = p->mnt_mounts.next;
	if (next == &p->mnt_mounts) {
		while (1) {
			if (p == root)
				return NULL;
			next = p->mnt_child.next;
			if (next != &p->mnt_parent->mnt_mounts)
				break;
			p = p->mnt_parent;
		}
	}
	return list_entry(next, struct mount, mnt_child);
}

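/* Descend to the last (deepest, last-mounted) mount in @p's subtree. */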
static struct mount *skip_mnt_tree(struct mount *p)
{
	struct list_head *prev = p->mnt_mounts.prev;
	while (prev != &p->mnt_mounts) {
		p = list_entry(prev, struct mount, mnt_child);
		prev = p->mnt_mounts.prev;
	}
	return p;
}

struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
	struct mount *mnt;
	struct dentry *root;

	if (!type)
		return ERR_PTR(-ENODEV);

	mnt = alloc_vfsmnt(name);
	if (!mnt)
		return ERR_PTR(-ENOMEM);

	if (flags & SB_KERNMOUNT)
		mnt->mnt.mnt_flags = MNT_INTERNAL;

	root = mount_fs(type, flags, name, data);
	if (IS_ERR(root)) {
		mnt_free_id(mnt);
		free_vfsmnt(mnt);
		return ERR_CAST(root);
	}

	mnt->mnt.mnt_root = root;
	mnt->mnt.mnt_sb = root->d_sb;
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	mnt->mnt_parent = mnt;
	lock_mount_hash();
	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
	unlock_mount_hash();
	return &mnt->mnt;
}
EXPORT_SYMBOL_GPL(vfs_kern_mount);
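
/*
 * Illustrative use (a sketch, not from this file): kern_mount()-style
 * callers create an internal mount of a registered filesystem type
 * roughly like so:
 *
 *	struct vfsmount *m = vfs_kern_mount(type, SB_KERNMOUNT,
 *					    type->name, NULL);
 *	if (IS_ERR(m))
 *		return PTR_ERR(m);
 */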

struct vfsmount *
vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
	     const char *name, void *data)
{
	/* Until it is worked out how to pass the user namespace
	 * through from the parent mount to the submount, don't support
	 * unprivileged mounts with submounts.
	 */
	if (mountpoint->d_sb->s_user_ns != &init_user_ns)
		return ERR_PTR(-EPERM);

	return vfs_kern_mount(type, SB_SUBMOUNT, name, data);
}
EXPORT_SYMBOL_GPL(vfs_submount);

static struct mount *clone_mnt(struct mount *old, struct dentry *root,
					int flag)
{
	struct super_block *sb = old->mnt.mnt_sb;
	struct mount *mnt;
	int err;

	mnt = alloc_vfsmnt(old->mnt_devname);
	if (!mnt)
		return ERR_PTR(-ENOMEM);

	if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
		mnt->mnt_group_id = 0; /* not a peer of original */
	else
		mnt->mnt_group_id = old->mnt_group_id;

	if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
		err = mnt_alloc_group_id(mnt);
		if (err)
			goto out_free;
	}

	mnt->mnt.mnt_flags = old->mnt.mnt_flags;
	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
	/* Don't allow unprivileged users to change mount flags */
	if (flag & CL_UNPRIVILEGED) {
		mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;

		if (mnt->mnt.mnt_flags & MNT_READONLY)
			mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;

		if (mnt->mnt.mnt_flags & MNT_NODEV)
			mnt->mnt.mnt_flags |= MNT_LOCK_NODEV;

		if (mnt->mnt.mnt_flags & MNT_NOSUID)
			mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID;

		if (mnt->mnt.mnt_flags & MNT_NOEXEC)
			mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC;
	}

	/* Don't allow unprivileged users to reveal what is under a mount */
	if ((flag & CL_UNPRIVILEGED) &&
	    (!(flag & CL_EXPIRE) || list_empty(&old->mnt_expire)))
		mnt->mnt.mnt_flags |= MNT_LOCKED;

	atomic_inc(&sb->s_active);
	mnt->mnt.mnt_sb = sb;
	mnt->mnt.mnt_root = dget(root);
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	mnt->mnt_parent = mnt;
	lock_mount_hash();
	list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
	unlock_mount_hash();

	if ((flag & CL_SLAVE) ||
	    ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
		list_add(&mnt->mnt_slave, &old->mnt_slave_list);
		mnt->mnt_master = old;
		CLEAR_MNT_SHARED(mnt);
	} else if (!(flag & CL_PRIVATE)) {
		if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
			list_add(&mnt->mnt_share, &old->mnt_share);
		if (IS_MNT_SLAVE(old))
			list_add(&mnt->mnt_slave, &old->mnt_slave);
		mnt->mnt_master = old->mnt_master;
	} else {
		CLEAR_MNT_SHARED(mnt);
	}
	if (flag & CL_MAKE_SHARED)
		set_mnt_shared(mnt);

	/* stick the duplicate mount on the same expiry list
	 * as the original if that was on one */
	if (flag & CL_EXPIRE) {
		if (!list_empty(&old->mnt_expire))
			list_add(&mnt->mnt_expire, &old->mnt_expire);
	}

	return mnt;

 out_free:
	mnt_free_id(mnt);
	free_vfsmnt(mnt);
	return ERR_PTR(err);