/*
 *  linux/fs/namespace.c
 *
 * (C) Copyright Al Viro 2000, 2001
 *	Released under GPL v2.
 *
 * Based on code from fs/super.c, copyright Linus Torvalds and others.
 * Heavily rewritten.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/capability.h>
#include <linux/mnt_namespace.h>
#include <linux/user_namespace.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/idr.h>
#include <linux/init.h>		/* init_rootfs */
#include <linux/fs_struct.h>	/* get_fs_root et al. */
#include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
#include <linux/uaccess.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/bootmem.h>
#include <linux/task_work.h>
#include <linux/sched/task.h>

#include "pnode.h"
#include "internal.h"

/* Maximum number of mounts in a mount namespace */
unsigned int sysctl_mount_max __read_mostly = 100000;

static unsigned int m_hash_mask __read_mostly;
static unsigned int m_hash_shift __read_mostly;
static unsigned int mp_hash_mask __read_mostly;
static unsigned int mp_hash_shift __read_mostly;

static __initdata unsigned long mhash_entries;
static int __init set_mhash_entries(char *str)
{
	if (!str)
		return 0;
	mhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("mhash_entries=", set_mhash_entries);

static __initdata unsigned long mphash_entries;
static int __init set_mphash_entries(char *str)
{
	if (!str)
		return 0;
	mphash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("mphash_entries=", set_mphash_entries);

static u64 event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);
static DEFINE_SPINLOCK(mnt_id_lock);
static int mnt_id_start = 0;
static int mnt_group_start = 1;

static struct hlist_head *mount_hashtable __read_mostly;
static struct hlist_head *mountpoint_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
static DECLARE_RWSEM(namespace_sem);

/* /sys/fs */
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);

/*
 * vfsmount lock may be taken for read to prevent changes to the
 * vfsmount hash, ie. during mountpoint lookups or walking back
 * up the tree.
 *
 * It should be taken for write in all cases where the vfsmount
 * tree or hash is modified or when a vfsmount structure is modified.
 */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);

static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
{
	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> m_hash_shift);
	return &mount_hashtable[tmp & m_hash_mask];
}

static inline struct hlist_head *mp_hash(struct dentry *dentry)
{
	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> mp_hash_shift);
	return &mountpoint_hashtable[tmp & mp_hash_mask];
}
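
/*
 * Both hash helpers above key on pointer values: dividing by L1_CACHE_BYTES
 * drops the low bits, which are almost always zero for cache-aligned
 * objects, and folding with "tmp + (tmp >> shift)" mixes the high bits into
 * the index before it is masked to the table size.
 */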

static int mnt_alloc_id(struct mount *mnt)
{
	int res;

retry:
	ida_pre_get(&mnt_id_ida, GFP_KERNEL);
	spin_lock(&mnt_id_lock);
	res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
	if (!res)
		mnt_id_start = mnt->mnt_id + 1;
	spin_unlock(&mnt_id_lock);
	if (res == -EAGAIN)
		goto retry;

	return res;
}
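
/*
 * Note the legacy two-step IDA pattern above: ida_pre_get() preallocates
 * memory without holding mnt_id_lock, and ida_get_new_above() performs the
 * allocation under the lock, returning -EAGAIN when its preallocation was
 * consumed by a racing caller -- hence the retry loop.
 */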

static void mnt_free_id(struct mount *mnt)
{
	int id = mnt->mnt_id;
	spin_lock(&mnt_id_lock);
	ida_remove(&mnt_id_ida, id);
	if (mnt_id_start > id)
		mnt_id_start = id;
	spin_unlock(&mnt_id_lock);
}

/*
 * Allocate a new peer group ID
 *
 * mnt_group_ida is protected by namespace_sem
 */
static int mnt_alloc_group_id(struct mount *mnt)
{
	int res;

	if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
		return -ENOMEM;

	res = ida_get_new_above(&mnt_group_ida,
				mnt_group_start,
				&mnt->mnt_group_id);
	if (!res)
		mnt_group_start = mnt->mnt_group_id + 1;

	return res;
}

/*
 * Release a peer group ID
 */
void mnt_release_group_id(struct mount *mnt)
{
	int id = mnt->mnt_group_id;
	ida_remove(&mnt_group_ida, id);
	if (mnt_group_start > id)
		mnt_group_start = id;
	mnt->mnt_group_id = 0;
}

/*
 * vfsmount lock must be held for read
 */
static inline void mnt_add_count(struct mount *mnt, int n)
{
#ifdef CONFIG_SMP
	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
#else
	preempt_disable();
	mnt->mnt_count += n;
	preempt_enable();
#endif
}

/*
 * vfsmount lock must be held for write
 */
unsigned int mnt_get_count(struct mount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
	}

	return count;
#else
	return mnt->mnt_count;
#endif
}

static void drop_mountpoint(struct fs_pin *p)
{
	struct mount *m = container_of(p, struct mount, mnt_umount);
	dput(m->mnt_ex_mountpoint);
	pin_remove(p);
	mntput(&m->mnt);
}
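
/*
 * drop_mountpoint() is the release callback wired up by init_fs_pin() in
 * alloc_vfsmnt() below; when the pin is torn down it drops the stashed
 * ex-mountpoint dentry and the reference on the mount itself.
 */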

static struct mount *alloc_vfsmnt(const char *name)
{
	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
	if (mnt) {
		int err;

		err = mnt_alloc_id(mnt);
		if (err)
			goto out_free_cache;

		if (name) {
			mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL);
			if (!mnt->mnt_devname)
				goto out_free_id;
		}

#ifdef CONFIG_SMP
		mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
		if (!mnt->mnt_pcp)
			goto out_free_devname;

		this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
#else
		mnt->mnt_count = 1;
		mnt->mnt_writers = 0;
#endif

		INIT_HLIST_NODE(&mnt->mnt_hash);
		INIT_LIST_HEAD(&mnt->mnt_child);
		INIT_LIST_HEAD(&mnt->mnt_mounts);
		INIT_LIST_HEAD(&mnt->mnt_list);
		INIT_LIST_HEAD(&mnt->mnt_expire);
		INIT_LIST_HEAD(&mnt->mnt_share);
		INIT_LIST_HEAD(&mnt->mnt_slave_list);
		INIT_LIST_HEAD(&mnt->mnt_slave);
		INIT_HLIST_NODE(&mnt->mnt_mp_list);
		INIT_LIST_HEAD(&mnt->mnt_umounting);
		init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
	}
	return mnt;

#ifdef CONFIG_SMP
out_free_devname:
	kfree_const(mnt->mnt_devname);
#endif
out_free_id:
	mnt_free_id(mnt);
out_free_cache:
	kmem_cache_free(mnt_cache, mnt);
	return NULL;
}

/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
 * We must keep track of when those operations start
 * (for permission checks) and when they end, so that
 * we can determine when writes are able to occur to
 * a filesystem.
 */
/*
 * __mnt_is_readonly: check whether a mount is read-only
 * @mnt: the mount to check for its write status
 *
 * This shouldn't be used directly outside of the VFS.
 * It does not guarantee that the filesystem will stay
 * r/w, just that it is right *now*.  This can not and
 * should not be used in place of IS_RDONLY(inode).
 * mnt_want/drop_write() will _keep_ the filesystem
 * r/w.
 */
int __mnt_is_readonly(struct vfsmount *mnt)
{
	if (mnt->mnt_flags & MNT_READONLY)
		return 1;
	if (sb_rdonly(mnt->mnt_sb))
		return 1;
	return 0;
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);

static inline void mnt_inc_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers++;
#endif
}

static inline void mnt_dec_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers--;
#endif
}

static unsigned int mnt_get_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
	}

	return count;
#else
	return mnt->mnt_writers;
#endif
}

static int mnt_is_readonly(struct vfsmount *mnt)
{
	if (mnt->mnt_sb->s_readonly_remount)
		return 1;
	/* Order wrt setting s_flags/s_readonly_remount in do_remount() */
	smp_rmb();
	return __mnt_is_readonly(mnt);
}
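
/*
 * The smp_rmb() above is presumably paired with the smp_wmb() that follows
 * the store to sb->s_readonly_remount in sb_prepare_remount_readonly()
 * below: the read of s_readonly_remount is ordered before the flag tests
 * in __mnt_is_readonly().
 */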

/*
 * Most r/o & frozen checks on a fs are for operations that take discrete
 * amounts of time, like a write() or unlink().  We must keep track of when
 * those operations start (for permission checks) and when they end, so that we
 * can determine when writes are able to occur to a filesystem.
 */
/**
 * __mnt_want_write - get write access to a mount without freeze protection
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mnt is read-write) before
 * returning success. This operation does not protect against the filesystem
 * being frozen. When the write operation is finished, __mnt_drop_write() must
 * be called. This is effectively a refcount.
 */
int __mnt_want_write(struct vfsmount *m)
{
	struct mount *mnt = real_mount(m);
	int ret = 0;

	preempt_disable();
	mnt_inc_writers(mnt);
	/*
	 * The store to mnt_inc_writers must be visible before we pass
	 * the MNT_WRITE_HOLD loop below, so that the slowpath can see our
	 * incremented count after it has set MNT_WRITE_HOLD.
	 */
	smp_mb();
	while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
		cpu_relax();
	/*
	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
	 * be set to match its requirements. So we must not load that until
	 * MNT_WRITE_HOLD is cleared.
	 */
	smp_rmb();
	if (mnt_is_readonly(m)) {
		mnt_dec_writers(mnt);
		ret = -EROFS;
	}
	preempt_enable();

	return ret;
}

/**
 * mnt_want_write - get write access to a mount
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write, filesystem
 * is not frozen) before returning success.  When the write operation is
 * finished, mnt_drop_write() must be called.  This is effectively a refcount.
 */
int mnt_want_write(struct vfsmount *m)
{
	int ret;

	sb_start_write(m->mnt_sb);
	ret = __mnt_want_write(m);
	if (ret)
		sb_end_write(m->mnt_sb);
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);
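
/*
 * Typical caller-side pattern (sketch, for illustration only):
 *
 *	err = mnt_want_write(path->mnt);
 *	if (err)
 *		return err;
 *	... modify the filesystem ...
 *	mnt_drop_write(path->mnt);
 *
 * Every successful mnt_want_write() must be balanced by one
 * mnt_drop_write() once the write is finished.
 */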

/**
 * mnt_clone_write - get write access to a mount
 * @mnt: the mount on which to take a write
 *
 * This is effectively like mnt_want_write, except
 * it must only be used to take an extra write reference
 * on a mountpoint that we already know has a write reference
 * on it. This allows some optimisation.
 *
 * After finished, mnt_drop_write must be called as usual to
 * drop the reference.
 */
int mnt_clone_write(struct vfsmount *mnt)
{
	/* superblock may be r/o */
	if (__mnt_is_readonly(mnt))
		return -EROFS;
	preempt_disable();
	mnt_inc_writers(real_mount(mnt));
	preempt_enable();
	return 0;
}
EXPORT_SYMBOL_GPL(mnt_clone_write);

/**
 * __mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount to take a write on
 *
 * This is like __mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int __mnt_want_write_file(struct file *file)
{
	if (!(file->f_mode & FMODE_WRITER))
		return __mnt_want_write(file->f_path.mnt);
	else
		return mnt_clone_write(file->f_path.mnt);
}

/**
 * mnt_want_write_file_path - get write access to a file's mount
 * @file: the file whose mount to take a write on
 *
 * This is like mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 *
 * Called by the vfs for cases when we have an open file at hand, but will do an
 * inode operation on it (important distinction for files opened on overlayfs,
 * since the file operations will come from the real underlying file, while
 * inode operations come from the overlay).
 */
int mnt_want_write_file_path(struct file *file)
{
	int ret;

	sb_start_write(file->f_path.mnt->mnt_sb);
	ret = __mnt_want_write_file(file);
	if (ret)
		sb_end_write(file->f_path.mnt->mnt_sb);
	return ret;
}

static inline int may_write_real(struct file *file)
{
	struct dentry *dentry = file->f_path.dentry;
	struct dentry *upperdentry;

	/* Writable file? */
	if (file->f_mode & FMODE_WRITER)
		return 0;

	/* Not overlayfs? */
	if (likely(!(dentry->d_flags & DCACHE_OP_REAL)))
		return 0;

	/* File refers to upper, writable layer? */
	upperdentry = d_real(dentry, NULL, 0, D_REAL_UPPER);
	if (upperdentry &&
	    (file_inode(file) == d_inode(upperdentry) ||
	     file_inode(file) == d_inode(dentry)))
		return 0;

	/* Lower layer: can't write to real file, sorry... */
	return -EPERM;
}

/**
 * mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount to take a write on
 *
 * This is like mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 *
 * Mostly called by filesystems from their ioctl operation before performing
 * modification.  On overlayfs this needs to check if the file is on a read-only
 * lower layer and deny access in that case.
 */
int mnt_want_write_file(struct file *file)
{
	int ret;

	ret = may_write_real(file);
	if (!ret) {
		sb_start_write(file_inode(file)->i_sb);
		ret = __mnt_want_write_file(file);
		if (ret)
			sb_end_write(file_inode(file)->i_sb);
	}
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write_file);

/**
 * __mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it.  Must be matched with
 * __mnt_want_write() call above.
 */
void __mnt_drop_write(struct vfsmount *mnt)
{
	preempt_disable();
	mnt_dec_writers(real_mount(mnt));
	preempt_enable();
}

/**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done performing writes to it and
 * also allows filesystem to be frozen again.  Must be matched with
 * mnt_want_write() call above.
 */
void mnt_drop_write(struct vfsmount *mnt)
{
	__mnt_drop_write(mnt);
	sb_end_write(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);

void __mnt_drop_write_file(struct file *file)
{
	__mnt_drop_write(file->f_path.mnt);
}

void mnt_drop_write_file_path(struct file *file)
{
	mnt_drop_write(file->f_path.mnt);
}

void mnt_drop_write_file(struct file *file)
{
	__mnt_drop_write(file->f_path.mnt);
	sb_end_write(file_inode(file)->i_sb);
}
EXPORT_SYMBOL(mnt_drop_write_file);

static int mnt_make_readonly(struct mount *mnt)
{
	int ret = 0;

	lock_mount_hash();
	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
	/*
	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
	 * should be visible before we do.
	 */
	smp_mb();

	/*
	 * With writers on hold, if this value is zero, then there are
	 * definitely no active writers (although held writers may subsequently
	 * increment the count, they'll have to wait, and decrement it after
	 * seeing MNT_READONLY).
	 *
	 * It is OK to have counter incremented on one CPU and decremented on
	 * another: the sum will add up correctly. The danger would be when we
	 * sum up each counter, if we read a counter before it is incremented,
	 * but then read another CPU's count which it has been subsequently
	 * decremented from -- we would see more decrements than we should.
	 * MNT_WRITE_HOLD protects against this scenario, because
	 * mnt_want_write first increments count, then smp_mb, then spins on
	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
	 * we're counting up here.
	 */
	if (mnt_get_writers(mnt) > 0)
		ret = -EBUSY;
	else
		mnt->mnt.mnt_flags |= MNT_READONLY;
	/*
	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
	 * that become unheld will see MNT_READONLY.
	 */
	smp_wmb();
	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
	unlock_mount_hash();
	return ret;
}
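
/*
 * Sketch of the MNT_WRITE_HOLD handshake between __mnt_want_write() (fast
 * path) and mnt_make_readonly() (slow path):
 *
 *	fast path			slow path
 *	---------			---------
 *	mnt_inc_writers()		set MNT_WRITE_HOLD
 *	smp_mb()			smp_mb()
 *	spin while MNT_WRITE_HOLD	mnt_get_writers() == 0?
 *	smp_rmb()			set MNT_READONLY; smp_wmb()
 *	mnt_is_readonly()?		clear MNT_WRITE_HOLD
 *
 * Either the slow path observes the writer's increment and fails with
 * -EBUSY, or the writer waits out MNT_WRITE_HOLD and then observes
 * MNT_READONLY, failing with -EROFS.
 */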

static void __mnt_unmake_readonly(struct mount *mnt)
{
	lock_mount_hash();
	mnt->mnt.mnt_flags &= ~MNT_READONLY;
	unlock_mount_hash();
}

int sb_prepare_remount_readonly(struct super_block *sb)
{
	struct mount *mnt;
	int err = 0;

	/* Racy optimization.  Recheck the counter under MNT_WRITE_HOLD */
	if (atomic_long_read(&sb->s_remove_count))
		return -EBUSY;

	lock_mount_hash();
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
			mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
			smp_mb();
			if (mnt_get_writers(mnt) > 0) {
				err = -EBUSY;
				break;
			}
		}
	}
	if (!err && atomic_long_read(&sb->s_remove_count))
		err = -EBUSY;

	if (!err) {
		sb->s_readonly_remount = 1;
		smp_wmb();
	}
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
	}
	unlock_mount_hash();

	return err;
}
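
/*
 * This is the same MNT_WRITE_HOLD handshake as mnt_make_readonly(), but
 * swept across every mount of the superblock via sb->s_mounts, with
 * s_readonly_remount published so mnt_is_readonly() starts failing new
 * writers before the remount itself completes.
 */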

static void free_vfsmnt(struct mount *mnt)
{
	kfree_const(mnt->mnt_devname);
#ifdef CONFIG_SMP
	free_percpu(mnt->mnt_pcp);
#endif
	kmem_cache_free(mnt_cache, mnt);
}

static void delayed_free_vfsmnt(struct rcu_head *head)
{
	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
}

/* call under rcu_read_lock */
int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
	struct mount *mnt;
	if (read_seqretry(&mount_lock, seq))
		return 1;
	if (bastard == NULL)
		return 0;
	mnt = real_mount(bastard);
	mnt_add_count(mnt, 1);
	smp_mb();			// see mntput_no_expire()
	if (likely(!read_seqretry(&mount_lock, seq)))
		return 0;
	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
		mnt_add_count(mnt, -1);
		return 1;
	}
	lock_mount_hash();
	if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
		mnt_add_count(mnt, -1);
		unlock_mount_hash();
		return 1;
	}
	unlock_mount_hash();
	/* caller will mntput() */
	return -1;
}

/* call under rcu_read_lock */
bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
	int res = __legitimize_mnt(bastard, seq);
	if (likely(!res))
		return true;
	if (unlikely(res < 0)) {
		rcu_read_unlock();
		mntput(bastard);
		rcu_read_lock();
	}
	return false;
}
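
/*
 * legitimize_mnt() is the validation half of the lockless mount walk:
 * sample mount_lock with read_seqbegin() under rcu_read_lock(), find a
 * candidate mount, then call legitimize_mnt() to grab a reference iff
 * nothing changed in between.  lookup_mnt() below is the canonical user
 * of this retry pattern.
 */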

/*
 * find the first mount at @dentry on vfsmount @mnt.
 * call under rcu_read_lock()
 */
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
	struct hlist_head *head = m_hash(mnt, dentry);
	struct mount *p;

	hlist_for_each_entry_rcu(p, head, mnt_hash)
		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
			return p;
	return NULL;
}

/*
 * lookup_mnt - Return the first child mount mounted at path
 *
 * "First" means first mounted chronologically.  If you create the
 * following mounts:
 *
 * mount /dev/sda1 /mnt
 * mount /dev/sda2 /mnt
 * mount /dev/sda3 /mnt
 *
 * Then lookup_mnt() on the base /mnt dentry in the root mount will
 * return successively the root dentry and vfsmount of /dev/sda1, then
 * /dev/sda2, then /dev/sda3, then NULL.
 *
 * lookup_mnt takes a reference to the found vfsmount.
 */
struct vfsmount *lookup_mnt(const struct path *path)
{
	struct mount *child_mnt;
	struct vfsmount *m;
	unsigned seq;

	rcu_read_lock();
	do {
		seq = read_seqbegin(&mount_lock);
		child_mnt = __lookup_mnt(path->mnt, path->dentry);
		m = child_mnt ? &child_mnt->mnt : NULL;
	} while (!legitimize_mnt(m, seq));
	rcu_read_unlock();
	return m;
}

/*
 * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
 *                         current mount namespace.
 *
 * The common case is dentries are not mountpoints at all and that
 * test is handled inline.  For the slow case when we are actually
 * dealing with a mountpoint of some kind, walk through all of the
 * mounts in the current mount namespace and test to see if the dentry
 * is a mountpoint.
 *
 * The mount_hashtable is not usable in the context because we
 * need to identify all mounts that may be in the current mount
 * namespace not just a mount that happens to have some specified
 * parent mount.
 */
bool __is_local_mountpoint(struct dentry *dentry)
{
	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
	struct mount *mnt;
	bool is_covered = false;

	if (!d_mountpoint(dentry))
		goto out;

	down_read(&namespace_sem);
	list_for_each_entry(mnt, &ns->list, mnt_list) {
		is_covered = (mnt->mnt_mountpoint == dentry);
		if (is_covered)
			break;
	}
	up_read(&namespace_sem);
out:
	return is_covered;
}

static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
{
	struct hlist_head *chain = mp_hash(dentry);
	struct mountpoint *mp;

	hlist_for_each_entry(mp, chain, m_hash) {
		if (mp->m_dentry == dentry) {
			/* might be worth a WARN_ON() */
			if (d_unlinked(dentry))
				return ERR_PTR(-ENOENT);
			mp->m_count++;
			return mp;
		}
	}
	return NULL;
}

static struct mountpoint *get_mountpoint(struct dentry *dentry)
{
	struct mountpoint *mp, *new = NULL;
	int ret;

	if (d_mountpoint(dentry)) {
mountpoint:
		read_seqlock_excl(&mount_lock);
		mp = lookup_mountpoint(dentry);
		read_sequnlock_excl(&mount_lock);
		if (mp)
			goto done;
	}

	if (!new)
		new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
	if (!new)
		return ERR_PTR(-ENOMEM);

	/* Exactly one process may set d_mounted */
	ret = d_set_mounted(dentry);

	/* Someone else set d_mounted? */
	if (ret == -EBUSY)
		goto mountpoint;

	/* The dentry is not available as a mountpoint? */
	mp = ERR_PTR(ret);
	if (ret)
		goto done;

	/* Add the new mountpoint to the hash table */
	read_seqlock_excl(&mount_lock);
	new->m_dentry = dentry;
	new->m_count = 1;
	hlist_add_head(&new->m_hash, mp_hash(dentry));
	INIT_HLIST_HEAD(&new->m_list);
	read_sequnlock_excl(&mount_lock);

	mp = new;
	new = NULL;
done:
	kfree(new);
	return mp;
}
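
/*
 * get_mountpoint() is a lookup-or-create helper: it first tries
 * lookup_mountpoint() under the mount_lock seqlock and only on a miss
 * allocates a new struct mountpoint and claims the dentry via
 * d_set_mounted().  -EBUSY from d_set_mounted() means another task won
 * the race, so we loop back and look up the winner's mountpoint instead.
 */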

static void put_mountpoint(struct mountpoint *mp)
{
	if (!--mp->m_count) {
		struct dentry *dentry = mp->m_dentry;
		BUG_ON(!hlist_empty(&mp->m_list));
		spin_lock(&dentry->d_lock);
		dentry->d_flags &= ~DCACHE_MOUNTED;
		spin_unlock(&dentry->d_lock);
		hlist_del(&mp->m_hash);
		kfree(mp);
	}
}

static inline int check_mnt(struct mount *mnt)
{
	return mnt->mnt_ns == current->nsproxy->mnt_ns;
}

/*
 * vfsmount lock must be held for write
 */
static void touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns) {
		ns->event = ++event;
		wake_up_interruptible(&ns->poll);
	}
}

/*
 * vfsmount lock must be held for write
 */
static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns && ns->event != event) {
		ns->event = event;
		wake_up_interruptible(&ns->poll);
	}
}

/*
 * vfsmount lock must be held for write
 */
static void unhash_mnt(struct mount *mnt)
{
	mnt->mnt_parent = mnt;
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	list_del_init(&mnt->mnt_child);
	hlist_del_init_rcu(&mnt->mnt_hash);
	hlist_del_init(&mnt->mnt_mp_list);
	put_mountpoint(mnt->mnt_mp);
	mnt->mnt_mp = NULL;
}

/*
 * vfsmount lock must be held for write
 */
static void detach_mnt(struct mount *mnt, struct path *old_path)
{
	old_path->dentry = mnt->mnt_mountpoint;
	old_path->mnt = &mnt->mnt_parent->mnt;
	unhash_mnt(mnt);
}

/*
 * vfsmount lock must be held for write
 */
static void umount_mnt(struct mount *mnt)
{
	/* old mountpoint will be dropped when we can do that */
	mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint;
	unhash_mnt(mnt);
}

/*
 * vfsmount lock must be held for write
 */
void mnt_set_mountpoint(struct mount *mnt,
			struct mountpoint *mp,
			struct mount *child_mnt)
{
	mp->m_count++;
	mnt_add_count(mnt, 1);	/* essentially, that's mntget */
	child_mnt->mnt_mountpoint = dget(mp->m_dentry);
	child_mnt->mnt_parent = mnt;
	child_mnt->mnt_mp = mp;
	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
}

static void __attach_mnt(struct mount *mnt, struct mount *parent)
{
	hlist_add_head_rcu(&mnt->mnt_hash,
			   m_hash(&parent->mnt, mnt->mnt_mountpoint));
	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
}

/*
 * vfsmount lock must be held for write
 */
static void attach_mnt(struct mount *mnt,
			struct mount *parent,
			struct mountpoint *mp)
{
	mnt_set_mountpoint(parent, mp, mnt);
	__attach_mnt(mnt, parent);
}

void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
{
	struct mountpoint *old_mp = mnt->mnt_mp;
	struct dentry *old_mountpoint = mnt->mnt_mountpoint;
	struct mount *old_parent = mnt->mnt_parent;

	list_del_init(&mnt->mnt_child);
	hlist_del_init(&mnt->mnt_mp_list);
	hlist_del_init_rcu(&mnt->mnt_hash);

	attach_mnt(mnt, parent, mp);

	put_mountpoint(old_mp);

	/*
	 * Safely avoid even the suggestion this code might sleep or
	 * lock the mount hash by taking advantage of the knowledge that
	 * mnt_change_mountpoint will not release the final reference
	 * to a mountpoint.
	 *
	 * During mounting, the mount passed in as the parent mount will
	 * continue to use the old mountpoint and during unmounting, the
	 * old mountpoint will continue to exist until namespace_unlock,
	 * which happens well after mnt_change_mountpoint.
	 */
	spin_lock(&old_mountpoint->d_lock);
	old_mountpoint->d_lockref.count--;
	spin_unlock(&old_mountpoint->d_lock);

	mnt_add_count(old_parent, -1);
}

/*
 * vfsmount lock must be held for write
 */
static void commit_tree(struct mount *mnt)
{
	struct mount *parent = mnt->mnt_parent;
	struct mount *m;
	LIST_HEAD(head);
	struct mnt_namespace *n = parent->mnt_ns;

	BUG_ON(parent == mnt);

	list_add_tail(&head, &mnt->mnt_list);
	list_for_each_entry(m, &head, mnt_list)
		m->mnt_ns = n;

	list_splice(&head, n->list.prev);

	n->mounts += n->pending_mounts;
	n->pending_mounts = 0;

	__attach_mnt(mnt, parent);
	touch_mnt_namespace(n);
}

static struct mount *next_mnt(struct mount *p, struct mount *root)
{
	struct list_head *next = p->mnt_mounts.next;
	if (next == &p->mnt_mounts) {
		while (1) {
			if (p == root)
				return NULL;
			next = p->mnt_child.next;
			if (next != &p->mnt_parent->mnt_mounts)
				break;
			p = p->mnt_parent;
		}
	}
	return list_entry(next, struct mount, mnt_child);
}

static struct mount *skip_mnt_tree(struct mount *p)
{
	struct list_head *prev = p->mnt_mounts.prev;
	while (prev != &p->mnt_mounts) {
		p = list_entry(prev, struct mount, mnt_child);
		prev = p->mnt_mounts.prev;
	}
	return p;
}
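
/*
 * next_mnt() walks a mount tree depth-first (children before siblings,
 * climbing back up through mnt_parent), and skip_mnt_tree() jumps to the
 * last mount in p's subtree so a caller can skip that whole subtree
 * during such a walk.
 */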

struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
	struct mount *mnt;
	struct dentry *root;

	if (!type)
		return ERR_PTR(-ENODEV);

	mnt = alloc_vfsmnt(name);
	if (!mnt)
		return ERR_PTR(-ENOMEM);

	if (flags & SB_KERNMOUNT)
		mnt->mnt.mnt_flags = MNT_INTERNAL;

	root = mount_fs(type, flags, name, data);
	if (IS_ERR(root)) {
		mnt_free_id(mnt);
		free_vfsmnt(mnt);
		return ERR_CAST(root);
	}

	mnt->mnt.mnt_root = root;
	mnt->mnt.mnt_sb = root->d_sb;
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	mnt->mnt_parent = mnt;
	lock_mount_hash();
	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
	unlock_mount_hash();
	return &mnt->mnt;
}
EXPORT_SYMBOL_GPL(vfs_kern_mount);
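
/*
 * Illustrative (hypothetical) kernel-internal use; SB_KERNMOUNT marks the
 * result MNT_INTERNAL, as seen above:
 *
 *	struct vfsmount *mnt = vfs_kern_mount(type, SB_KERNMOUNT,
 *					      type->name, NULL);
 *	if (IS_ERR(mnt))
 *		return PTR_ERR(mnt);
 */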

struct vfsmount *
vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
	     const char *name, void *data)
{
	/* Until it is worked out how to pass the user namespace
	 * through from the parent mount to the submount don't support
	 * unprivileged mounts with submounts.
	 */
	if (mountpoint->d_sb->s_user_ns != &init_user_ns)
		return ERR_PTR(-EPERM);

	return vfs_kern_mount(type, SB_SUBMOUNT, name, data);
}
EXPORT_SYMBOL_GPL(vfs_submount);

static struct mount *clone_mnt(struct mount *old, struct dentry *root,
					int flag)
{
	struct super_block *sb = old->mnt.mnt_sb;
	struct mount *mnt;
	int err;

	mnt = alloc_vfsmnt(old->mnt_devname);
	if (!mnt)
		return ERR_PTR(-ENOMEM);

	if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
		mnt->mnt_group_id = 0; /* not a peer of original */
	else
		mnt->mnt_group_id = old->mnt_group_id;

	if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
		err = mnt_alloc_group_id(mnt);
		if (err)
			goto out_free;
	}

	mnt->mnt.mnt_flags = old->mnt.mnt_flags;
	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
	/* Don't allow unprivileged users to change mount flags */
	if (flag & CL_UNPRIVILEGED) {
		mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;

		if (mnt->mnt.mnt_flags & MNT_READONLY)
			mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;

		if (mnt->mnt.mnt_flags & MNT_NODEV)
			mnt->mnt.mnt_flags |= MNT_LOCK_NODEV;

		if (mnt->mnt.mnt_flags & MNT_NOSUID)
			mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID;

		if (mnt->mnt.mnt_flags & MNT_NOEXEC)
			mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC;
	}

	/* Don't allow unprivileged users to reveal what is under a mount */
	if ((flag & CL_UNPRIVILEGED) &&
	    (!(flag & CL_EXPIRE) || list_empty(&old->mnt_expire)))
		mnt->mnt.mnt_flags |= MNT_LOCKED;

	atomic_inc(&sb->s_active);
	mnt->mnt.mnt_sb = sb;
	mnt->mnt.mnt_root = dget(root);
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	mnt->mnt_parent = mnt;
	lock_mount_hash();
	list_add_tail(&mnt->mnt_instance, &sb->s_mounts);