/*
 *  linux/fs/namespace.c
 *
 * (C) Copyright Al Viro 2000, 2001
 *	Released under GPL v2.
 *
 * Based on code from fs/super.c, copyright Linus Torvalds and others.
 * Heavily rewritten.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/capability.h>
#include <linux/mnt_namespace.h>
#include <linux/user_namespace.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/idr.h>
#include <linux/init.h>		/* init_rootfs */
#include <linux/fs_struct.h>	/* get_fs_root et al. */
#include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
#include <linux/uaccess.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/bootmem.h>
#include <linux/task_work.h>
#include <linux/sched/task.h>

#include "pnode.h"
#include "internal.h"

/* Maximum number of mounts in a mount namespace */
unsigned int sysctl_mount_max __read_mostly = 100000;

static unsigned int m_hash_mask __read_mostly;
static unsigned int m_hash_shift __read_mostly;
static unsigned int mp_hash_mask __read_mostly;
static unsigned int mp_hash_shift __read_mostly;

static __initdata unsigned long mhash_entries;
static int __init set_mhash_entries(char *str)
{
	if (!str)
		return 0;
	mhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("mhash_entries=", set_mhash_entries);

static __initdata unsigned long mphash_entries;
static int __init set_mphash_entries(char *str)
{
	if (!str)
		return 0;
	mphash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("mphash_entries=", set_mphash_entries);

static u64 event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);
static DEFINE_SPINLOCK(mnt_id_lock);
static int mnt_id_start = 0;
static int mnt_group_start = 1;

static struct hlist_head *mount_hashtable __read_mostly;
static struct hlist_head *mountpoint_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
static DECLARE_RWSEM(namespace_sem);

/* /sys/fs */
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);

/*
 * vfsmount lock may be taken for read to prevent changes to the
 * vfsmount hash, ie. during mountpoint lookups or walking back
 * up the tree.
 *
 * It should be taken for write in all cases where the vfsmount
 * tree or hash is modified or when a vfsmount structure is modified.
 */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
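
/*
 * A minimal usage sketch, assuming the lock_mount_hash()/unlock_mount_hash()
 * helpers from fs/mount.h:
 *
 *	seq = read_seqbegin(&mount_lock);	// read side: sample the count,
 *	...walk the hash/tree...
 *	if (read_seqretry(&mount_lock, seq))	// ...and retry if it moved
 *		goto retry;
 *
 *	lock_mount_hash();			// write side: exclusive
 *	...modify the hash/tree...
 *	unlock_mount_hash();
 */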

static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
{
	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> m_hash_shift);
	return &mount_hashtable[tmp & m_hash_mask];
}

static inline struct hlist_head *mp_hash(struct dentry *dentry)
{
	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> mp_hash_shift);
	return &mountpoint_hashtable[tmp & mp_hash_mask];
}

static int mnt_alloc_id(struct mount *mnt)
{
	int res;

retry:
	ida_pre_get(&mnt_id_ida, GFP_KERNEL);
	spin_lock(&mnt_id_lock);
	res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
	if (!res)
		mnt_id_start = mnt->mnt_id + 1;
	spin_unlock(&mnt_id_lock);
	if (res == -EAGAIN)
		goto retry;

	return res;
}
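
/*
 * A sketch of the legacy IDA protocol used above (the pre-4.19 API):
 * ida_pre_get() preallocates memory outside the spinlock, and
 * ida_get_new_above() returns -EAGAIN when that preallocation has been
 * consumed by someone else, hence the retry loop:
 *
 *	do {
 *		if (!ida_pre_get(&ida, GFP_KERNEL))
 *			return -ENOMEM;
 *		spin_lock(&lock);
 *		err = ida_get_new_above(&ida, start, &id);
 *		spin_unlock(&lock);
 *	} while (err == -EAGAIN);
 */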

static void mnt_free_id(struct mount *mnt)
{
	int id = mnt->mnt_id;
	spin_lock(&mnt_id_lock);
	ida_remove(&mnt_id_ida, id);
	if (mnt_id_start > id)
		mnt_id_start = id;
	spin_unlock(&mnt_id_lock);
}

/*
 * Allocate a new peer group ID
 *
 * mnt_group_ida is protected by namespace_sem
 */
static int mnt_alloc_group_id(struct mount *mnt)
{
	int res;

	if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
		return -ENOMEM;

	res = ida_get_new_above(&mnt_group_ida,
				mnt_group_start,
				&mnt->mnt_group_id);
	if (!res)
		mnt_group_start = mnt->mnt_group_id + 1;

	return res;
}

/*
 * Release a peer group ID
 */
void mnt_release_group_id(struct mount *mnt)
{
	int id = mnt->mnt_group_id;
	ida_remove(&mnt_group_ida, id);
	if (mnt_group_start > id)
		mnt_group_start = id;
	mnt->mnt_group_id = 0;
}

/*
 * vfsmount lock must be held for read
 */
static inline void mnt_add_count(struct mount *mnt, int n)
{
#ifdef CONFIG_SMP
	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
#else
	preempt_disable();
	mnt->mnt_count += n;
	preempt_enable();
#endif
}

/*
 * vfsmount lock must be held for write
 */
unsigned int mnt_get_count(struct mount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
	}

	return count;
#else
	return mnt->mnt_count;
#endif
}

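/*
 * Release callback for the mnt_umount pin wired up via init_fs_pin() in
 * alloc_vfsmnt() below: once the pin is killed, drop the stashed
 * ex-mountpoint dentry and the mount itself.
 */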
static void drop_mountpoint(struct fs_pin *p)
{
	struct mount *m = container_of(p, struct mount, mnt_umount);
	dput(m->mnt_ex_mountpoint);
	pin_remove(p);
	mntput(&m->mnt);
}

static struct mount *alloc_vfsmnt(const char *name)
{
	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
	if (mnt) {
		int err;

		err = mnt_alloc_id(mnt);
		if (err)
			goto out_free_cache;

		if (name) {
			mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL);
			if (!mnt->mnt_devname)
				goto out_free_id;
		}

#ifdef CONFIG_SMP
		mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
		if (!mnt->mnt_pcp)
			goto out_free_devname;

		this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
#else
		mnt->mnt_count = 1;
		mnt->mnt_writers = 0;
#endif

		INIT_HLIST_NODE(&mnt->mnt_hash);
		INIT_LIST_HEAD(&mnt->mnt_child);
		INIT_LIST_HEAD(&mnt->mnt_mounts);
		INIT_LIST_HEAD(&mnt->mnt_list);
		INIT_LIST_HEAD(&mnt->mnt_expire);
		INIT_LIST_HEAD(&mnt->mnt_share);
		INIT_LIST_HEAD(&mnt->mnt_slave_list);
		INIT_LIST_HEAD(&mnt->mnt_slave);
		INIT_HLIST_NODE(&mnt->mnt_mp_list);
		INIT_LIST_HEAD(&mnt->mnt_umounting);
		init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
	}
	return mnt;

#ifdef CONFIG_SMP
out_free_devname:
	kfree_const(mnt->mnt_devname);
#endif
out_free_id:
	mnt_free_id(mnt);
out_free_cache:
	kmem_cache_free(mnt_cache, mnt);
	return NULL;
}

/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
 * We must keep track of when those operations start
 * (for permission checks) and when they end, so that
 * we can determine when writes are able to occur to
 * a filesystem.
 */
/*
 * __mnt_is_readonly: check whether a mount is read-only
 * @mnt: the mount to check for its write status
 *
 * This shouldn't be used directly outside of the VFS.
 * It does not guarantee that the filesystem will stay
 * r/w, just that it is right *now*.  This can not and
 * should not be used in place of IS_RDONLY(inode).
 * mnt_want/drop_write() will _keep_ the filesystem
 * r/w.
 */
int __mnt_is_readonly(struct vfsmount *mnt)
{
	if (mnt->mnt_flags & MNT_READONLY)
		return 1;
	if (sb_rdonly(mnt->mnt_sb))
		return 1;
	return 0;
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);

static inline void mnt_inc_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers++;
#endif
}

static inline void mnt_dec_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers--;
#endif
}

static unsigned int mnt_get_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
	}

	return count;
#else
	return mnt->mnt_writers;
#endif
}

static int mnt_is_readonly(struct vfsmount *mnt)
{
	if (mnt->mnt_sb->s_readonly_remount)
		return 1;
	/* Order wrt setting s_flags/s_readonly_remount in do_remount() */
	smp_rmb();
	return __mnt_is_readonly(mnt);
}

/*
 * Most r/o & frozen checks on a fs are for operations that take discrete
 * amounts of time, like a write() or unlink().  We must keep track of when
 * those operations start (for permission checks) and when they end, so that we
 * can determine when writes are able to occur to a filesystem.
 */
/**
 * __mnt_want_write - get write access to a mount without freeze protection
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write) before
 * returning success. This operation does not protect against the filesystem
 * being frozen. When the write operation is finished, __mnt_drop_write() must
 * be called. This is effectively a refcount.
 */
int __mnt_want_write(struct vfsmount *m)
{
	struct mount *mnt = real_mount(m);
	int ret = 0;

	preempt_disable();
	mnt_inc_writers(mnt);
	/*
	 * The store to mnt_inc_writers must be visible before we pass
	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
	 * incremented count after it has set MNT_WRITE_HOLD.
	 */
	smp_mb();
	while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
		cpu_relax();
	/*
	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
	 * be set to match its requirements. So we must not load that until
	 * MNT_WRITE_HOLD is cleared.
	 */
	smp_rmb();
	if (mnt_is_readonly(m)) {
		mnt_dec_writers(mnt);
		ret = -EROFS;
	}
	preempt_enable();

	return ret;
}

/**
 * mnt_want_write - get write access to a mount
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write, filesystem
 * is not frozen) before returning success.  When the write operation is
 * finished, mnt_drop_write() must be called.  This is effectively a refcount.
 */
int mnt_want_write(struct vfsmount *m)
{
	int ret;

	sb_start_write(m->mnt_sb);
	ret = __mnt_want_write(m);
	if (ret)
		sb_end_write(m->mnt_sb);
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);
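
/*
 * A minimal caller-side sketch (the usual pairing, not a new requirement):
 *
 *	err = mnt_want_write(path->mnt);
 *	if (err)
 *		return err;
 *	...modify the filesystem...
 *	mnt_drop_write(path->mnt);
 */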

/**
 * mnt_clone_write - get write access to a mount
 * @mnt: the mount on which to take a write
 *
 * This is effectively like mnt_want_write, except
 * it must only be used to take an extra write reference
 * on a mountpoint that we already know has a write reference
 * on it. This allows some optimisation.
 *
 * After finished, mnt_drop_write must be called as usual to
 * drop the reference.
 */
int mnt_clone_write(struct vfsmount *mnt)
{
	/* superblock may be r/o */
	if (__mnt_is_readonly(mnt))
		return -EROFS;
	preempt_disable();
	mnt_inc_writers(real_mount(mnt));
	preempt_enable();
	return 0;
}
EXPORT_SYMBOL_GPL(mnt_clone_write);

/**
 * __mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount to take a write on
 *
 * This is like __mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int __mnt_want_write_file(struct file *file)
{
	if (!(file->f_mode & FMODE_WRITER))
		return __mnt_want_write(file->f_path.mnt);
	else
		return mnt_clone_write(file->f_path.mnt);
}

/**
 * mnt_want_write_file_path - get write access to a file's mount
 * @file: the file whose mount to take a write on
 *
 * This is like mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 *
 * Called by the vfs for cases when we have an open file at hand, but will do an
 * inode operation on it (important distinction for files opened on overlayfs,
 * since the file operations will come from the real underlying file, while
 * inode operations come from the overlay).
 */
int mnt_want_write_file_path(struct file *file)
{
	int ret;

	sb_start_write(file->f_path.mnt->mnt_sb);
	ret = __mnt_want_write_file(file);
	if (ret)
		sb_end_write(file->f_path.mnt->mnt_sb);
	return ret;
}

static inline int may_write_real(struct file *file)
{
	struct dentry *dentry = file->f_path.dentry;
	struct dentry *upperdentry;

	/* Writable file? */
	if (file->f_mode & FMODE_WRITER)
		return 0;

	/* Not overlayfs? */
	if (likely(!(dentry->d_flags & DCACHE_OP_REAL)))
		return 0;

	/* File refers to upper, writable layer? */
	upperdentry = d_real(dentry, NULL, 0, D_REAL_UPPER);
	if (upperdentry &&
	    (file_inode(file) == d_inode(upperdentry) ||
	     file_inode(file) == d_inode(dentry)))
		return 0;

	/* Lower layer: can't write to real file, sorry... */
	return -EPERM;
}

/**
 * mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount to take a write on
 *
 * This is like mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 *
 * Mostly called by filesystems from their ioctl operation before performing
 * modification.  On overlayfs this needs to check if the file is on a read-only
 * lower layer and deny access in that case.
 */
int mnt_want_write_file(struct file *file)
{
	int ret;

	ret = may_write_real(file);
	if (!ret) {
		sb_start_write(file_inode(file)->i_sb);
		ret = __mnt_want_write_file(file);
		if (ret)
			sb_end_write(file_inode(file)->i_sb);
	}
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write_file);
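
/*
 * Typical ioctl-path sketch for the file-based variant:
 *
 *	err = mnt_want_write_file(file);
 *	if (err)
 *		return err;
 *	...modify the inode...
 *	mnt_drop_write_file(file);
 */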

/**
 * __mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it.  Must be matched with
 * __mnt_want_write() call above.
 */
void __mnt_drop_write(struct vfsmount *mnt)
{
	preempt_disable();
	mnt_dec_writers(real_mount(mnt));
	preempt_enable();
}

/**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done performing writes to it and
 * also allows filesystem to be frozen again.  Must be matched with
 * mnt_want_write() call above.
 */
void mnt_drop_write(struct vfsmount *mnt)
{
	__mnt_drop_write(mnt);
	sb_end_write(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);

void __mnt_drop_write_file(struct file *file)
{
	__mnt_drop_write(file->f_path.mnt);
}

void mnt_drop_write_file_path(struct file *file)
{
	mnt_drop_write(file->f_path.mnt);
}

void mnt_drop_write_file(struct file *file)
{
	__mnt_drop_write(file->f_path.mnt);
	sb_end_write(file_inode(file)->i_sb);
}
EXPORT_SYMBOL(mnt_drop_write_file);

static int mnt_make_readonly(struct mount *mnt)
{
	int ret = 0;

	lock_mount_hash();
	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
	/*
	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
	 * should be visible before we do.
	 */
	smp_mb();

	/*
	 * With writers on hold, if this value is zero, then there are
	 * definitely no active writers (although held writers may subsequently
	 * increment the count, they'll have to wait, and decrement it after
	 * seeing MNT_READONLY).
	 *
	 * It is OK to have counter incremented on one CPU and decremented on
	 * another: the sum will add up correctly. The danger would be when we
	 * sum up each counter, if we read a counter before it is incremented,
	 * but then read another CPU's count which it has been subsequently
	 * decremented from -- we would see more decrements than we should.
	 * MNT_WRITE_HOLD protects against this scenario, because
	 * mnt_want_write first increments count, then smp_mb, then spins on
	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
	 * we're counting up here.
	 */
	if (mnt_get_writers(mnt) > 0)
		ret = -EBUSY;
	else
		mnt->mnt.mnt_flags |= MNT_READONLY;
	/*
	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
	 * that become unheld will see MNT_READONLY.
	 */
	smp_wmb();
	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
	unlock_mount_hash();
	return ret;
}
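
/*
 * The above pairs with __mnt_want_write(); a sketch of the two sides:
 *
 *	__mnt_want_write()		mnt_make_readonly()
 *	  mnt_inc_writers()		  set MNT_WRITE_HOLD
 *	  smp_mb()			  smp_mb()
 *	  spin while MNT_WRITE_HOLD	  writers ? -EBUSY : set MNT_READONLY
 *	  smp_rmb()			  smp_wmb()
 *	  check mnt_is_readonly()	  clear MNT_WRITE_HOLD
 */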

static void __mnt_unmake_readonly(struct mount *mnt)
{
	lock_mount_hash();
	mnt->mnt.mnt_flags &= ~MNT_READONLY;
	unlock_mount_hash();
}

int sb_prepare_remount_readonly(struct super_block *sb)
{
	struct mount *mnt;
	int err = 0;

	/* Racy optimization.  Recheck the counter under MNT_WRITE_HOLD */
	if (atomic_long_read(&sb->s_remove_count))
		return -EBUSY;

	lock_mount_hash();
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
			mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
			smp_mb();
			if (mnt_get_writers(mnt) > 0) {
				err = -EBUSY;
				break;
			}
		}
	}
	if (!err && atomic_long_read(&sb->s_remove_count))
		err = -EBUSY;

	if (!err) {
		sb->s_readonly_remount = 1;
		smp_wmb();
	}
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
	}
	unlock_mount_hash();

	return err;
}

static void free_vfsmnt(struct mount *mnt)
{
	kfree_const(mnt->mnt_devname);
#ifdef CONFIG_SMP
	free_percpu(mnt->mnt_pcp);
#endif
	kmem_cache_free(mnt_cache, mnt);
}

static void delayed_free_vfsmnt(struct rcu_head *head)
{
	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
}

/* call under rcu_read_lock */
int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
	struct mount *mnt;
	if (read_seqretry(&mount_lock, seq))
		return 1;
	if (bastard == NULL)
		return 0;
	mnt = real_mount(bastard);
	mnt_add_count(mnt, 1);
	smp_mb();			// see mntput_no_expire()
	if (likely(!read_seqretry(&mount_lock, seq)))
		return 0;
	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
		mnt_add_count(mnt, -1);
		return 1;
	}
	lock_mount_hash();
	if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
		mnt_add_count(mnt, -1);
		unlock_mount_hash();
		return 1;
	}
	unlock_mount_hash();
	/* caller will mntput() */
	return -1;
}

/* call under rcu_read_lock */
bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
	int res = __legitimize_mnt(bastard, seq);
	if (likely(!res))
		return true;
	if (unlikely(res < 0)) {
		rcu_read_unlock();
		mntput(bastard);
		rcu_read_lock();
	}
	return false;
}
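
/*
 * Return-value contract of __legitimize_mnt(), as consumed above: 0 means
 * the reference was taken and the sequence still matches; 1 means the
 * attempt should simply be abandoned (stale sequence, NULL mount, or a
 * mount in MNT_SYNC_UMOUNT/MNT_DOOMED teardown); -1 means an unwanted
 * reference was taken, so legitimize_mnt() must drop it with mntput()
 * outside the RCU-protected section.
 */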

/*
 * find the first mount at @dentry on vfsmount @mnt.
 * call under rcu_read_lock()
 */
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
	struct hlist_head *head = m_hash(mnt, dentry);
	struct mount *p;

	hlist_for_each_entry_rcu(p, head, mnt_hash)
		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
			return p;
	return NULL;
}

/*
 * lookup_mnt - Return the first child mount mounted at path
 *
 * "First" means first mounted chronologically.  If you create the
 * following mounts:
 *
 * mount /dev/sda1 /mnt
 * mount /dev/sda2 /mnt
 * mount /dev/sda3 /mnt
 *
 * Then lookup_mnt() on the base /mnt dentry in the root mount will
 * return successively the root dentry and vfsmount of /dev/sda1, then
 * /dev/sda2, then /dev/sda3, then NULL.
 *
 * lookup_mnt takes a reference to the found vfsmount.
 */
struct vfsmount *lookup_mnt(const struct path *path)
{
	struct mount *child_mnt;
	struct vfsmount *m;
	unsigned seq;

	rcu_read_lock();
	do {
		seq = read_seqbegin(&mount_lock);
		child_mnt = __lookup_mnt(path->mnt, path->dentry);
		m = child_mnt ? &child_mnt->mnt : NULL;
	} while (!legitimize_mnt(m, seq));
	rcu_read_unlock();
	return m;
}

/*
 * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
 *                         current mount namespace.
 *
 * The common case is dentries are not mountpoints at all and that
 * test is handled inline.  For the slow case when we are actually
 * dealing with a mountpoint of some kind, walk through all of the
 * mounts in the current mount namespace and test to see if the dentry
 * is a mountpoint.
 *
 * The mount_hashtable is not usable in the context because we
 * need to identify all mounts that may be in the current mount
 * namespace not just a mount that happens to have some specified
 * parent mount.
 */
bool __is_local_mountpoint(struct dentry *dentry)
{
	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
	struct mount *mnt;
	bool is_covered = false;

	if (!d_mountpoint(dentry))
		goto out;

	down_read(&namespace_sem);
	list_for_each_entry(mnt, &ns->list, mnt_list) {
		is_covered = (mnt->mnt_mountpoint == dentry);
		if (is_covered)
			break;
	}
	up_read(&namespace_sem);
out:
	return is_covered;
}

static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
{
	struct hlist_head *chain = mp_hash(dentry);
	struct mountpoint *mp;

	hlist_for_each_entry(mp, chain, m_hash) {
		if (mp->m_dentry == dentry) {
			/* might be worth a WARN_ON() */
			if (d_unlinked(dentry))
				return ERR_PTR(-ENOENT);
			mp->m_count++;
			return mp;
		}
	}
	return NULL;
}

static struct mountpoint *get_mountpoint(struct dentry *dentry)
{
	struct mountpoint *mp, *new = NULL;
	int ret;

	if (d_mountpoint(dentry)) {
mountpoint:
		read_seqlock_excl(&mount_lock);
		mp = lookup_mountpoint(dentry);
		read_sequnlock_excl(&mount_lock);
		if (mp)
			goto done;
	}

	if (!new)
		new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
	if (!new)
		return ERR_PTR(-ENOMEM);

	/* Exactly one process may set d_mounted */
	ret = d_set_mounted(dentry);

	/* Someone else set d_mounted? */
	if (ret == -EBUSY)
		goto mountpoint;

	/* The dentry is not available as a mountpoint? */
	mp = ERR_PTR(ret);
	if (ret)
		goto done;

	/* Add the new mountpoint to the hash table */
	read_seqlock_excl(&mount_lock);
	new->m_dentry = dentry;
	new->m_count = 1;
	hlist_add_head(&new->m_hash, mp_hash(dentry));
	INIT_HLIST_HEAD(&new->m_list);
	read_sequnlock_excl(&mount_lock);

	mp = new;
	new = NULL;
done:
	kfree(new);
	return mp;
}

static void put_mountpoint(struct mountpoint *mp)
{
	if (!--mp->m_count) {
		struct dentry *dentry = mp->m_dentry;
		BUG_ON(!hlist_empty(&mp->m_list));
		spin_lock(&dentry->d_lock);
		dentry->d_flags &= ~DCACHE_MOUNTED;
		spin_unlock(&dentry->d_lock);
		hlist_del(&mp->m_hash);
		kfree(mp);
	}
}

static inline int check_mnt(struct mount *mnt)
{
	return mnt->mnt_ns == current->nsproxy->mnt_ns;
}

/*
 * vfsmount lock must be held for write
 */
static void touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns) {
		ns->event = ++event;
		wake_up_interruptible(&ns->poll);
	}
}

/*
 * vfsmount lock must be held for write
 */
static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns && ns->event != event) {
		ns->event = event;
		wake_up_interruptible(&ns->poll);
	}
}

/*
 * vfsmount lock must be held for write
 */
static void unhash_mnt(struct mount *mnt)
{
	mnt->mnt_parent = mnt;
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	list_del_init(&mnt->mnt_child);
	hlist_del_init_rcu(&mnt->mnt_hash);
	hlist_del_init(&mnt->mnt_mp_list);
	put_mountpoint(mnt->mnt_mp);
	mnt->mnt_mp = NULL;
}

/*
 * vfsmount lock must be held for write
 */
static void detach_mnt(struct mount *mnt, struct path *old_path)
{
	old_path->dentry = mnt->mnt_mountpoint;
	old_path->mnt = &mnt->mnt_parent->mnt;
	unhash_mnt(mnt);
}

/*
 * vfsmount lock must be held for write
 */
static void umount_mnt(struct mount *mnt)
{
	/* old mountpoint will be dropped when we can do that */
	mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint;
	unhash_mnt(mnt);
}

/*
 * vfsmount lock must be held for write
 */
void mnt_set_mountpoint(struct mount *mnt,
			struct mountpoint *mp,
			struct mount *child_mnt)
{
	mp->m_count++;
	mnt_add_count(mnt, 1);	/* essentially, that's mntget */
	child_mnt->mnt_mountpoint = dget(mp->m_dentry);
	child_mnt->mnt_parent = mnt;
	child_mnt->mnt_mp = mp;
	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
}

static void __attach_mnt(struct mount *mnt, struct mount *parent)
{
	hlist_add_head_rcu(&mnt->mnt_hash,
			   m_hash(&parent->mnt, mnt->mnt_mountpoint));
	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
}

/*
 * vfsmount lock must be held for write
 */
static void attach_mnt(struct mount *mnt,
			struct mount *parent,
			struct mountpoint *mp)
{
	mnt_set_mountpoint(parent, mp, mnt);
	__attach_mnt(mnt, parent);
}

void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
{
	struct mountpoint *old_mp = mnt->mnt_mp;
	struct dentry *old_mountpoint = mnt->mnt_mountpoint;
	struct mount *old_parent = mnt->mnt_parent;

	list_del_init(&mnt->mnt_child);
	hlist_del_init(&mnt->mnt_mp_list);
	hlist_del_init_rcu(&mnt->mnt_hash);

	attach_mnt(mnt, parent, mp);

	put_mountpoint(old_mp);

	/*
	 * Safely avoid even the suggestion this code might sleep or
	 * lock the mount hash by taking advantage of the knowledge that
	 * mnt_change_mountpoint will not release the final reference
	 * to a mountpoint.
	 *
	 * During mounting, the mount passed in as the parent mount will
	 * continue to use the old mountpoint and during unmounting, the
	 * old mountpoint will continue to exist until namespace_unlock,
	 * which happens well after mnt_change_mountpoint.
	 */
	spin_lock(&old_mountpoint->d_lock);
	old_mountpoint->d_lockref.count--;
	spin_unlock(&old_mountpoint->d_lock);

	mnt_add_count(old_parent, -1);
}

/*
 * vfsmount lock must be held for write
 */
static void commit_tree(struct mount *mnt)
{
	struct mount *parent = mnt->mnt_parent;
	struct mount *m;
	LIST_HEAD(head);
	struct mnt_namespace *n = parent->mnt_ns;

	BUG_ON(parent == mnt);

	list_add_tail(&head, &mnt->mnt_list);
	list_for_each_entry(m, &head, mnt_list)
		m->mnt_ns = n;

	list_splice(&head, n->list.prev);

	n->mounts += n->pending_mounts;
	n->pending_mounts = 0;

	__attach_mnt(mnt, parent);
	touch_mnt_namespace(n);
}

static struct mount *next_mnt(struct mount *p, struct mount *root)
{
	struct list_head *next = p->mnt_mounts.next;
	if (next == &p->mnt_mounts) {
		while (1) {
			if (p == root)
				return NULL;
			next = p->mnt_child.next;
			if (next != &p->mnt_parent->mnt_mounts)
				break;
			p = p->mnt_parent;
		}
	}
	return list_entry(next, struct mount, mnt_child);
}
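
/*
 * next_mnt() yields a depth-first, parent-before-child walk of the mount
 * tree.  The usual iteration pattern over a subtree rooted at r is:
 *
 *	for (p = r; p; p = next_mnt(p, r))
 *		...visit p...
 */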

static struct mount *skip_mnt_tree(struct mount *p)
{