/*
 * fs/dcache.c
 *
 * Complete reimplementation
 * (C) 1997 Thomas Schoebel-Theuer,
 * with heavy changes by Linus Torvalds
 */

/*
 * Notes on the allocation strategy:
 *
 * The dcache is a master of the icache - whenever a dcache entry
 * exists, the inode will always exist. "iput()" is done either when
 * the dcache entry is deleted or garbage collected.
 */

#include <linux/syscalls.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/fsnotify.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/cache.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/security.h>
#include <linux/seqlock.h>
#include <linux/swap.h>
#include <linux/bootmem.h>
#include <linux/fs_struct.h>
#include <linux/hardirq.h>
#include <linux/bit_spinlock.h>
#include <linux/rculist_bl.h>
#include <linux/prefetch.h>
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
#include <linux/kasan.h>

#include "internal.h"
#include "mount.h"

/*
 * Usage:
 * dcache->d_inode->i_lock protects:
 *   - i_dentry, d_u.d_alias, d_inode of aliases
 * dcache_hash_bucket lock protects:
 *   - the dcache hash table
 * s_anon bl list spinlock protects:
 *   - the s_anon list (see __d_drop)
 * dentry->d_sb->s_dentry_lru_lock protects:
 *   - the dcache lru lists and counters
 * d_lock protects:
 *   - d_flags
 *   - d_name
 *   - d_lru
 *   - d_count
 *   - d_unhashed()
 *   - d_parent and d_subdirs
 *   - children's d_child and d_parent
 *   - d_u.d_alias, d_inode
 *
 * Ordering:
 * dentry->d_inode->i_lock
 *   dentry->d_lock
 *     dentry->d_sb->s_dentry_lru_lock
 *     dcache_hash_bucket lock
 *     s_anon lock
 *
 * If there is an ancestor relationship:
 * dentry->d_parent->...->d_parent->d_lock
 *   ...
 *     dentry->d_parent->d_lock
 *       dentry->d_lock
 *
 * If no ancestor relationship:
 * if (dentry1 < dentry2)
 *   dentry1->d_lock
 *     dentry2->d_lock
 */
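
/*
 * Illustrative sketch (not part of the original file): the "no ancestor
 * relationship" rule above means two unrelated dentries are locked in
 * address order.  A hypothetical helper obeying that rule could look
 * like this:
 *
 *	static void lock_two_unrelated(struct dentry *d1, struct dentry *d2)
 *	{
 *		if (d2 < d1)
 *			swap(d1, d2);
 *		spin_lock(&d1->d_lock);
 *		spin_lock_nested(&d2->d_lock, DENTRY_D_LOCK_NESTED);
 *	}
 */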
int sysctl_vfs_cache_pressure __read_mostly = 100;
EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);

__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);

EXPORT_SYMBOL(rename_lock);

static struct kmem_cache *dentry_cache __read_mostly;

const struct qstr empty_name = QSTR_INIT("", 0);
EXPORT_SYMBOL(empty_name);
const struct qstr slash_name = QSTR_INIT("/", 1);
EXPORT_SYMBOL(slash_name);

/*
 * This is the single most critical data structure when it comes
 * to the dcache: the hashtable for lookups. Somebody should try
 * to make this good - I've just made it work.
 *
 * This hash-function tries to avoid losing too many bits of hash
 * information, yet avoid using a prime hash-size or similar.
 */

static unsigned int d_hash_mask __read_mostly;
static unsigned int d_hash_shift __read_mostly;

static struct hlist_bl_head *dentry_hashtable __read_mostly;

static inline struct hlist_bl_head *d_hash(unsigned int hash)
{
	return dentry_hashtable + (hash >> (32 - d_hash_shift));
}
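
/*
 * Illustrative note (not from the original file): d_hash() indexes the
 * table by the top d_hash_shift bits of the 32-bit name hash; with, say,
 * d_hash_shift == 12, a hash of 0xabcd1234 selects bucket 0xabc of a
 * 4096-bucket table.
 */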

#define IN_LOOKUP_SHIFT 10
static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT];

static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent,
					unsigned int hash)
{
	hash += (unsigned long) parent / L1_CACHE_BYTES;
	return in_lookup_hashtable + hash_32(hash, IN_LOOKUP_SHIFT);
}


Linus Torvalds's avatar
Linus Torvalds committed
128 129 130 131 132
/* Statistics gathering. */
struct dentry_stat_t dentry_stat = {
	.age_limit = 45,
};

static DEFINE_PER_CPU(long, nr_dentry);
static DEFINE_PER_CPU(long, nr_dentry_unused);

#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)

/*
 * Here we resort to our own counters instead of using generic per-cpu counters
 * for consistency with what the vfs inode code does. We are expected to harvest
 * better code and performance by having our own specialized counters.
 *
 * Please note that the loop is done over all possible CPUs, not over all online
 * CPUs. The reason for this is that we don't want to play games with CPUs going
 * on and off. If one of them goes off, we will just keep their counters.
 *
 * glommer: See cffbc8a for details, and if you ever intend to change this,
 * please update all vfs counters to match.
 */
static long get_nr_dentry(void)
{
	int i;
	long sum = 0;
	for_each_possible_cpu(i)
		sum += per_cpu(nr_dentry, i);
	return sum < 0 ? 0 : sum;
}

static long get_nr_dentry_unused(void)
{
	int i;
	long sum = 0;
	for_each_possible_cpu(i)
		sum += per_cpu(nr_dentry_unused, i);
	return sum < 0 ? 0 : sum;
}

int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer,
		   size_t *lenp, loff_t *ppos)
{
	dentry_stat.nr_dentry = get_nr_dentry();
	dentry_stat.nr_unused = get_nr_dentry_unused();
	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
#endif

/*
 * Compare 2 name strings, return 0 if they match, otherwise non-zero.
 * The strings are both count bytes long, and count is non-zero.
 */
#ifdef CONFIG_DCACHE_WORD_ACCESS

#include <asm/word-at-a-time.h>
/*
 * NOTE! 'cs' and 'scount' come from a dentry, so it has an
 * aligned allocation for this particular component. We don't
 * strictly need the load_unaligned_zeropad() safety, but it
 * doesn't hurt either.
 *
 * In contrast, 'ct' and 'tcount' can be from a pathname, and do
 * need the careful unaligned handling.
 */
static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
{
	unsigned long a,b,mask;

	for (;;) {
		a = *(unsigned long *)cs;
		b = load_unaligned_zeropad(ct);
		if (tcount < sizeof(unsigned long))
			break;
		if (unlikely(a != b))
			return 1;
		cs += sizeof(unsigned long);
		ct += sizeof(unsigned long);
		tcount -= sizeof(unsigned long);
		if (!tcount)
			return 0;
	}
	mask = bytemask_from_count(tcount);
	return unlikely(!!((a ^ b) & mask));
}

#else

static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
{
	do {
		if (*cs != *ct)
			return 1;
		cs++;
		ct++;
		tcount--;
	} while (tcount);
	return 0;
}

#endif

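/*
 * Worked example (illustrative, not in the original): comparing "foobar"
 * against "foobaz" with tcount == 6 on a 64-bit machine loads each name
 * as a single 8-byte word; bytemask_from_count(6) masks off the two high
 * bytes, so stale data past the NUL terminator cannot cause a false
 * mismatch.
 */
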
static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount)
{
	/*
	 * Be careful about RCU walk racing with rename:
	 * use 'READ_ONCE' to fetch the name pointer.
	 *
	 * NOTE! Even if a rename will mean that the length
	 * was not loaded atomically, we don't care. The
	 * RCU walk will check the sequence count eventually,
	 * and catch it. And we won't overrun the buffer,
	 * because we're reading the name pointer atomically,
	 * and a dentry name is guaranteed to be properly
	 * terminated with a NUL byte.
	 *
	 * End result: even if 'len' is wrong, we'll exit
	 * early because the data cannot match (there can
	 * be no NUL in the ct/tcount data)
	 */
	const unsigned char *cs = READ_ONCE(dentry->d_name.name);

	return dentry_string_cmp(cs, ct, tcount);
}

struct external_name {
	union {
		atomic_t count;
		struct rcu_head head;
	} u;
	unsigned char name[];
};

static inline struct external_name *external_name(struct dentry *dentry)
{
	return container_of(dentry->d_name.name, struct external_name, name[0]);
}

static void __d_free(struct rcu_head *head)
{
	struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);

	kmem_cache_free(dentry_cache, dentry);
}

static void __d_free_external(struct rcu_head *head)
{
	struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
	kfree(external_name(dentry));
	kmem_cache_free(dentry_cache, dentry);
}

static inline int dname_external(const struct dentry *dentry)
{
	return dentry->d_name.name != dentry->d_iname;
}

void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry)
{
	spin_lock(&dentry->d_lock);
	if (unlikely(dname_external(dentry))) {
		struct external_name *p = external_name(dentry);
		atomic_inc(&p->u.count);
		spin_unlock(&dentry->d_lock);
		name->name = p->name;
	} else {
		memcpy(name->inline_name, dentry->d_iname, DNAME_INLINE_LEN);
		spin_unlock(&dentry->d_lock);
		name->name = name->inline_name;
	}
}
EXPORT_SYMBOL(take_dentry_name_snapshot);

void release_dentry_name_snapshot(struct name_snapshot *name)
{
	if (unlikely(name->name != name->inline_name)) {
		struct external_name *p;
		p = container_of(name->name, struct external_name, name[0]);
		if (unlikely(atomic_dec_and_test(&p->u.count)))
			kfree_rcu(p, u.head);
	}
}
EXPORT_SYMBOL(release_dentry_name_snapshot);
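
/*
 * Usage sketch (illustrative, not part of the original file): a caller
 * that needs the name to stay stable across a possible rename pairs the
 * two helpers, e.g.:
 *
 *	struct name_snapshot snap;
 *
 *	take_dentry_name_snapshot(&snap, dentry);
 *	use_name(snap.name);		// use_name() is hypothetical
 *	release_dentry_name_snapshot(&snap);
 */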

static inline void __d_set_inode_and_type(struct dentry *dentry,
					  struct inode *inode,
					  unsigned type_flags)
{
	unsigned flags;

	dentry->d_inode = inode;
	flags = READ_ONCE(dentry->d_flags);
	flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
	flags |= type_flags;
	WRITE_ONCE(dentry->d_flags, flags);
}

static inline void __d_clear_type_and_inode(struct dentry *dentry)
{
	unsigned flags = READ_ONCE(dentry->d_flags);

	flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
	WRITE_ONCE(dentry->d_flags, flags);
	dentry->d_inode = NULL;
}

static void dentry_free(struct dentry *dentry)
{
	WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias));
	if (unlikely(dname_external(dentry))) {
		struct external_name *p = external_name(dentry);
		if (likely(atomic_dec_and_test(&p->u.count))) {
			call_rcu(&dentry->d_u.d_rcu, __d_free_external);
			return;
		}
	}
	/* if dentry was never visible to RCU, immediate free is OK */
	if (!(dentry->d_flags & DCACHE_RCUACCESS))
		__d_free(&dentry->d_u.d_rcu);
	else
		call_rcu(&dentry->d_u.d_rcu, __d_free);
}

/*
 * Release the dentry's inode, using the filesystem
 * d_iput() operation if defined.
 */
static void dentry_unlink_inode(struct dentry * dentry)
	__releases(dentry->d_lock)
	__releases(dentry->d_inode->i_lock)
{
	struct inode *inode = dentry->d_inode;
	bool hashed = !d_unhashed(dentry);

	if (hashed)
		raw_write_seqcount_begin(&dentry->d_seq);
	__d_clear_type_and_inode(dentry);
	hlist_del_init(&dentry->d_u.d_alias);
	if (hashed)
		raw_write_seqcount_end(&dentry->d_seq);
	spin_unlock(&dentry->d_lock);
	spin_unlock(&inode->i_lock);
	if (!inode->i_nlink)
		fsnotify_inoderemove(inode);
	if (dentry->d_op && dentry->d_op->d_iput)
		dentry->d_op->d_iput(dentry, inode);
	else
		iput(inode);
}

/*
 * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry
 * is in use - which includes both the "real" per-superblock
 * LRU list _and_ the DCACHE_SHRINK_LIST use.
 *
 * The DCACHE_SHRINK_LIST bit is set whenever the dentry is
 * on the shrink list (ie not on the superblock LRU list).
 *
 * The per-cpu "nr_dentry_unused" counters are updated with
 * the DCACHE_LRU_LIST bit.
 *
 * These helper functions make sure we always follow the
 * rules. d_lock must be held by the caller.
 */
#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x))
static void d_lru_add(struct dentry *dentry)
{
	D_FLAG_VERIFY(dentry, 0);
	dentry->d_flags |= DCACHE_LRU_LIST;
	this_cpu_inc(nr_dentry_unused);
	WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
}

static void d_lru_del(struct dentry *dentry)
{
	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
	dentry->d_flags &= ~DCACHE_LRU_LIST;
	this_cpu_dec(nr_dentry_unused);
	WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
}

static void d_shrink_del(struct dentry *dentry)
{
	D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
	list_del_init(&dentry->d_lru);
	dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
	this_cpu_dec(nr_dentry_unused);
}

static void d_shrink_add(struct dentry *dentry, struct list_head *list)
{
	D_FLAG_VERIFY(dentry, 0);
	list_add(&dentry->d_lru, list);
	dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST;
	this_cpu_inc(nr_dentry_unused);
}

/*
 * These can only be called under the global LRU lock, ie during the
 * callback for freeing the LRU list. "isolate" removes it from the
 * LRU lists entirely, while shrink_move moves it to the indicated
 * private list.
 */
static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry)
{
	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
	dentry->d_flags &= ~DCACHE_LRU_LIST;
	this_cpu_dec(nr_dentry_unused);
	list_lru_isolate(lru, &dentry->d_lru);
}

static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
			      struct list_head *list)
{
	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
	dentry->d_flags |= DCACHE_SHRINK_LIST;
	list_lru_isolate_move(lru, &dentry->d_lru, list);
}

/*
 * dentry_lru_add() must be called with d_lock held.
 */
static void dentry_lru_add(struct dentry *dentry)
{
	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
		d_lru_add(dentry);
	else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
		dentry->d_flags |= DCACHE_REFERENCED;
}

/**
 * d_drop - drop a dentry
 * @dentry: dentry to drop
 *
 * d_drop() unhashes the entry from the parent dentry hashes, so that it won't
 * be found through a VFS lookup any more. Note that this is different from
 * deleting the dentry - d_delete will try to mark the dentry negative if
 * possible, giving a successful _negative_ lookup, while d_drop will
 * just make the cache lookup fail.
 *
 * d_drop() is used mainly for stuff that wants to invalidate a dentry for some
 * reason (NFS timeouts or autofs deletes).
 *
 * __d_drop requires dentry->d_lock.
 */
void __d_drop(struct dentry *dentry)
{
	if (!d_unhashed(dentry)) {
		struct hlist_bl_head *b;
		/*
		 * Hashed dentries are normally on the dentry hashtable,
		 * with the exception of those newly allocated by
		 * d_obtain_alias, which are always IS_ROOT:
		 */
		if (unlikely(IS_ROOT(dentry)))
			b = &dentry->d_sb->s_anon;
		else
			b = d_hash(dentry->d_name.hash);

		hlist_bl_lock(b);
		__hlist_bl_del(&dentry->d_hash);
		dentry->d_hash.pprev = NULL;
		hlist_bl_unlock(b);
		/* After this call, in-progress rcu-walk path lookup will fail. */
		write_seqcount_invalidate(&dentry->d_seq);
	}
}
EXPORT_SYMBOL(__d_drop);

void d_drop(struct dentry *dentry)
{
	spin_lock(&dentry->d_lock);
	__d_drop(dentry);
	spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(d_drop);

static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent)
{
	struct dentry *next;
	/*
	 * Inform d_walk() and shrink_dentry_list() that we are no longer
	 * attached to the dentry tree
	 */
	dentry->d_flags |= DCACHE_DENTRY_KILLED;
	if (unlikely(list_empty(&dentry->d_child)))
		return;
	__list_del_entry(&dentry->d_child);
	/*
	 * Cursors can move around the list of children.  While we'd been
	 * a normal list member, it didn't matter - ->d_child.next would've
	 * been updated.  However, from now on it won't be and for the
	 * things like d_walk() it might end up with a nasty surprise.
	 * Normally d_walk() doesn't care about cursors moving around -
	 * ->d_lock on parent prevents that and since a cursor has no children
	 * of its own, we get through it without ever unlocking the parent.
	 * There is one exception, though - if we ascend from a child that
	 * gets killed as soon as we unlock it, the next sibling is found
	 * using the value left in its ->d_child.next.  And if _that_
	 * pointed to a cursor, and cursor got moved (e.g. by lseek())
	 * before d_walk() regains parent->d_lock, we'll end up skipping
	 * everything the cursor had been moved past.
	 *
	 * Solution: make sure that the pointer left behind in ->d_child.next
	 * points to something that won't be moving around.  I.e. skip the
	 * cursors.
	 */
	while (dentry->d_child.next != &parent->d_subdirs) {
		next = list_entry(dentry->d_child.next, struct dentry, d_child);
		if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR)))
			break;
		dentry->d_child.next = next->d_child.next;
	}
}

static void __dentry_kill(struct dentry *dentry)
{
	struct dentry *parent = NULL;
	bool can_free = true;
	if (!IS_ROOT(dentry))
		parent = dentry->d_parent;

	/*
	 * The dentry is now unrecoverably dead to the world.
	 */
	lockref_mark_dead(&dentry->d_lockref);

	/*
	 * inform the fs via d_prune that this dentry is about to be
	 * unhashed and destroyed.
	 */
	if (dentry->d_flags & DCACHE_OP_PRUNE)
		dentry->d_op->d_prune(dentry);

	if (dentry->d_flags & DCACHE_LRU_LIST) {
		if (!(dentry->d_flags & DCACHE_SHRINK_LIST))
			d_lru_del(dentry);
	}
	/* if it was on the hash then remove it */
	__d_drop(dentry);
	dentry_unlist(dentry, parent);
	if (parent)
		spin_unlock(&parent->d_lock);
	if (dentry->d_inode)
		dentry_unlink_inode(dentry);
	else
		spin_unlock(&dentry->d_lock);
	this_cpu_dec(nr_dentry);
	if (dentry->d_op && dentry->d_op->d_release)
		dentry->d_op->d_release(dentry);

	spin_lock(&dentry->d_lock);
	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
		dentry->d_flags |= DCACHE_MAY_FREE;
		can_free = false;
	}
	spin_unlock(&dentry->d_lock);
	if (likely(can_free))
		dentry_free(dentry);
}

/*
 * Finish off a dentry we've decided to kill.
 * dentry->d_lock must be held, returns with it unlocked.
 * If ref is non-zero, then decrement the refcount too.
 * Returns dentry requiring refcount drop, or NULL if we're done.
 */
static struct dentry *dentry_kill(struct dentry *dentry)
	__releases(dentry->d_lock)
{
	struct inode *inode = dentry->d_inode;
	struct dentry *parent = NULL;

	if (inode && unlikely(!spin_trylock(&inode->i_lock)))
		goto failed;

	if (!IS_ROOT(dentry)) {
		parent = dentry->d_parent;
		if (unlikely(!spin_trylock(&parent->d_lock))) {
			if (inode)
				spin_unlock(&inode->i_lock);
			goto failed;
		}
	}

	__dentry_kill(dentry);
	return parent;

failed:
	spin_unlock(&dentry->d_lock);
	return dentry; /* try again with same dentry */
}

static inline struct dentry *lock_parent(struct dentry *dentry)
{
	struct dentry *parent = dentry->d_parent;
	if (IS_ROOT(dentry))
		return NULL;
	if (unlikely(dentry->d_lockref.count < 0))
		return NULL;
	if (likely(spin_trylock(&parent->d_lock)))
		return parent;
	rcu_read_lock();
	spin_unlock(&dentry->d_lock);
again:
	parent = READ_ONCE(dentry->d_parent);
	spin_lock(&parent->d_lock);
	/*
	 * We can't blindly lock dentry until we are sure
	 * that we won't violate the locking order.
	 * Any changes of dentry->d_parent must have
	 * been done with parent->d_lock held, so
	 * spin_lock() above is enough of a barrier
	 * for checking if it's still our child.
	 */
	if (unlikely(parent != dentry->d_parent)) {
		spin_unlock(&parent->d_lock);
		goto again;
	}
	rcu_read_unlock();
	if (parent != dentry)
		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
	else
		parent = NULL;
	return parent;
}
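
/*
 * Illustrative caller pattern (not from the original file): lock_parent()
 * can drop and retake dentry->d_lock, so anything previously read under
 * d_lock must be re-checked once it returns, e.g.:
 *
 *	spin_lock(&dentry->d_lock);
 *	parent = lock_parent(dentry);
 *	if (likely(!dentry->d_lockref.count))
 *		__dentry_kill(dentry);		// both locks are held here
 */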

/*
 * Try to do a lockless dput(), and return whether that was successful.
 *
 * If unsuccessful, we return false, having already taken the dentry lock.
 *
 * The caller needs to hold the RCU read lock, so that the dentry is
 * guaranteed to stay around even if the refcount goes down to zero!
 */
static inline bool fast_dput(struct dentry *dentry)
{
	int ret;
	unsigned int d_flags;

	/*
	 * If we have a d_op->d_delete() operation, we should not
	 * let the dentry count go to zero, so use "put_or_lock".
	 */
	if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
		return lockref_put_or_lock(&dentry->d_lockref);

	/*
	 * .. otherwise, we can try to just decrement the
	 * lockref optimistically.
	 */
	ret = lockref_put_return(&dentry->d_lockref);

	/*
	 * If the lockref_put_return() failed due to the lock being held
	 * by somebody else, the fast path has failed. We will need to
	 * get the lock, and then check the count again.
	 */
	if (unlikely(ret < 0)) {
		spin_lock(&dentry->d_lock);
		if (dentry->d_lockref.count > 1) {
			dentry->d_lockref.count--;
			spin_unlock(&dentry->d_lock);
			return 1;
		}
		return 0;
	}

	/*
	 * If we weren't the last ref, we're done.
	 */
	if (ret)
		return 1;

	/*
	 * Careful, careful. The reference count went down
	 * to zero, but we don't hold the dentry lock, so
	 * somebody else could get it again, and do another
	 * dput(), and we need to not race with that.
	 *
	 * However, there is a very special and common case
	 * where we don't care, because there is nothing to
	 * do: the dentry is still hashed, it does not have
	 * a 'delete' op, and it's referenced and already on
	 * the LRU list.
	 *
	 * NOTE! Since we aren't locked, these values are
	 * not "stable". However, it is sufficient that at
	 * some point after we dropped the reference the
	 * dentry was hashed and the flags had the proper
	 * value. Other dentry users may have re-gotten
	 * a reference to the dentry and change that, but
	 * our work is done - we can leave the dentry
	 * around with a zero refcount.
	 */
	smp_rmb();
	d_flags = READ_ONCE(dentry->d_flags);
	d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_DISCONNECTED;

	/* Nothing to do? Dropping the reference was all we needed? */
	if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry))
		return 1;

	/*
	 * Not the fast normal case? Get the lock. We've already decremented
	 * the refcount, but we'll need to re-check the situation after
	 * getting the lock.
	 */
	spin_lock(&dentry->d_lock);

	/*
	 * Did somebody else grab a reference to it in the meantime, and
	 * we're no longer the last user after all? Alternatively, somebody
	 * else could have killed it and marked it dead. Either way, we
	 * don't need to do anything else.
	 */
	if (dentry->d_lockref.count) {
		spin_unlock(&dentry->d_lock);
		return 1;
	}

	/*
	 * Re-get the reference we optimistically dropped. We hold the
	 * lock, and we just tested that it was zero, so we can just
	 * set it to 1.
	 */
	dentry->d_lockref.count = 1;
	return 0;
}


/* 
 * This is dput
 *
 * This is complicated by the fact that we do not want to put
 * dentries that are no longer on any hash chain on the unused
 * list: we'd much rather just get rid of them immediately.
 *
 * However, that implies that we have to traverse the dentry
 * tree upwards to the parents which might _also_ now be
 * scheduled for deletion (it may have been only waiting for
 * its last child to go away).
 *
 * This tail recursion is done by hand as we don't want to depend
 * on the compiler to always get this right (gcc generally doesn't).
 * Real recursion would eat up our stack space.
 */

/*
 * dput - release a dentry
 * @dentry: dentry to release 
 *
 * Release a dentry. This will drop the usage count and if appropriate
 * call the dentry unlink method as well as removing it from the queues and
 * releasing its resources. If the parent dentries were scheduled for release
 * they too may now get deleted.
 */
void dput(struct dentry *dentry)
{
	if (unlikely(!dentry))
		return;

repeat:
	might_sleep();

	rcu_read_lock();
	if (likely(fast_dput(dentry))) {
		rcu_read_unlock();
		return;
	}

	/* Slow case: now with the dentry lock held */
	rcu_read_unlock();

	WARN_ON(d_in_lookup(dentry));

	/* Unreachable? Get rid of it */
	if (unlikely(d_unhashed(dentry)))
		goto kill_it;

	if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
		goto kill_it;

	if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) {
		if (dentry->d_op->d_delete(dentry))
			goto kill_it;
	}

	dentry_lru_add(dentry);

	dentry->d_lockref.count--;
	spin_unlock(&dentry->d_lock);
	return;

kill_it:
	dentry = dentry_kill(dentry);
	if (dentry) {
		cond_resched();
		goto repeat;
	}
}
EXPORT_SYMBOL(dput);
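
/*
 * Usage sketch (illustrative, not part of the original file): every
 * reference taken with dget() must eventually be balanced by dput():
 *
 *	struct dentry *d = dget(some_dentry);	// some_dentry is hypothetical
 *
 *	use_dentry(d);				// use_dentry() is hypothetical
 *	dput(d);	// may sleep; see might_sleep() above
 */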


/* This must be called with d_lock held */
static inline void __dget_dlock(struct dentry *dentry)
{
	dentry->d_lockref.count++;
}

static inline void __dget(struct dentry *dentry)
{
	lockref_get(&dentry->d_lockref);
}

struct dentry *dget_parent(struct dentry *dentry)
{
	int gotref;
	struct dentry *ret;

	/*
	 * Do optimistic parent lookup without any
	 * locking.
	 */
	rcu_read_lock();
	ret = READ_ONCE(dentry->d_parent);
	gotref = lockref_get_not_zero(&ret->d_lockref);
	rcu_read_unlock();
	if (likely(gotref)) {
		if (likely(ret == READ_ONCE(dentry->d_parent)))
			return ret;
		dput(ret);
	}

repeat:
	/*
	 * Don't need rcu_dereference because we re-check it was correct under
	 * the lock.
	 */
	rcu_read_lock();
	ret = dentry->d_parent;
	spin_lock(&ret->d_lock);
	if (unlikely(ret != dentry->d_parent)) {
		spin_unlock(&ret->d_lock);
		rcu_read_unlock();
		goto repeat;
	}
	rcu_read_unlock();
	BUG_ON(!ret->d_lockref.count);
	ret->d_lockref.count++;
	spin_unlock(&ret->d_lock);
	return ret;
}
EXPORT_SYMBOL(dget_parent);
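
/*
 * Usage sketch (illustrative): dget_parent() hands back a referenced
 * parent that stays valid even if the child is moved concurrently:
 *
 *	struct dentry *parent = dget_parent(dentry);
 *
 *	visit(parent);		// visit() is hypothetical
 *	dput(parent);
 */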

/**
 * d_find_alias - grab a hashed alias of inode
 * @inode: inode in question
 *
 * If inode has a hashed alias, or is a directory and has any alias,
 * acquire the reference to alias and return it. Otherwise return NULL.
 * Notice that if inode is a directory there can be only one alias and
 * it can be unhashed only if it has no children, or if it is the root
 * of a filesystem, or if the directory was renamed and d_revalidate
 * was the first vfs operation to notice.
 *
 * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
 * any other hashed alias over that one.
 */
static struct dentry *__d_find_alias(struct inode *inode)
{
	struct dentry *alias, *discon_alias;

again:
	discon_alias = NULL;
	hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
		spin_lock(&alias->d_lock);
		if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
			if (IS_ROOT(alias) &&
			    (alias->d_flags & DCACHE_DISCONNECTED)) {
				discon_alias = alias;
			} else {
				__dget_dlock(alias);
				spin_unlock(&alias->d_lock);
				return alias;
			}
		}
		spin_unlock(&alias->d_lock);
	}
	if (discon_alias) {
		alias = discon_alias;
		spin_lock(&alias->d_lock);
		if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
			__dget_dlock(alias);
			spin_unlock(&alias->d_lock);
			return alias;
		}
		spin_unlock(&alias->d_lock);
		goto again;
	}
	return NULL;
}

struct dentry *d_find_alias(struct inode *inode)
{
	struct dentry *de = NULL;

	if (!hlist_empty(&inode->i_dentry)) {
		spin_lock(&inode->i_lock);
		de = __d_find_alias(inode);
		spin_unlock(&inode->i_lock);
	}
	return de;
}
EXPORT_SYMBOL(d_find_alias);
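
/*
 * Usage sketch (illustrative): a filesystem holding only an inode can ask
 * the dcache for a cached name with d_find_alias(); a non-NULL result
 * carries a reference that must be dropped:
 *
 *	struct dentry *alias = d_find_alias(inode);
 *
 *	if (alias) {
 *		pr_debug("alias: %pd\n", alias);
 *		dput(alias);
 *	}
 */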

/*
 *	Try to kill dentries associated with this inode.
 * WARNING: you must own a reference to inode.
 */
void d_prune_aliases(struct inode *inode)
{
	struct dentry *dentry;
restart:
	spin_lock(&inode->i_lock);
	hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
		spin_lock(&dentry->d_lock);
		if (!dentry->d_lockref.count) {
			struct dentry *parent = lock_parent(dentry);
			if (likely(!dentry->d_lockref.count)) {
				__dentry_kill(dentry);
				dput(parent);
				goto restart;
			}
			if (parent)
				spin_unlock(&parent->d_lock);
		}
		spin_unlock(&dentry->d_lock);
	}
	spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_prune_aliases);

static void shrink_dentry_list(struct list_head *list)
{
	struct dentry *dentry, *parent;

	while (!list_empty(list)) {
		struct inode *inode;
		dentry = list_entry(list->prev, struct dentry, d_lru);
		spin_lock(&dentry->d_lock);
		parent = lock_parent(dentry);

		/*
		 * The dispose list is isolated and dentries are not accounted
		 * to the LRU here, so we can simply remove it from the list
		 * here regardless of whether it is referenced or not.
		 */
		d_shrink_del(dentry);

		/*
		 * We found an inuse dentry which was not removed from
		 * the LRU because of laziness during lookup. Do not free it.
		 */
		if (dentry->d_lockref.count > 0) {
			spin_unlock(&dentry->d_lock);
			if (parent)
				spin_unlock(&parent->d_lock);
			continue;
		}

		if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) {
			bool can_free = dentry->d_flags & DCACHE_MAY_FREE;
			spin_unlock(&dentry->d_lock);
			if (parent)
				spin_unlock(&parent->d_lock);
			if (can_free)
				dentry_free(dentry);
			continue;
		}

		inode = dentry->d_inode;
		if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
			d_shrink_add(dentry, list);
			spin_unlock(&dentry->d_lock);
			if (parent)
				spin_unlock(&parent->d_lock);
			continue;
		}

		__dentry_kill(dentry);

		/*
		 * We need to prune ancestors too. This is necessary to prevent
		 * quadratic behavior of shrink_dcache_parent(), but is also
		 * expected to be beneficial in reducing dentry cache
		 * fragmentation.
		 */
		dentry = parent;
		while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) {
			parent = lock_parent(dentry);
			if (dentry->d_lockref.count != 1) {
				dentry->d_lockref.count--;
				spin_unlock(&dentry->d_lock);
				if (parent)
					spin_unlock(&parent->d_lock);
				break;
			}
			inode = dentry->d_inode;	/* can't be NULL */
			if (unlikely(!spin_trylock(&inode->i_lock))) {
				spin_unlock(&dentry->d_lock);
				if (parent)
					spin_unlock(&parent->d_lock);
				cpu_relax();
				continue;
			}
			__dentry_kill(dentry);
			dentry = parent;
		}
	}
}

static enum lru_status dentry_lru_isolate(struct list_head *item,
		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
	struct list_head *freeable = arg;
	struct dentry	*dentry = container_of(item, struct dentry, d_lru);


	/*
	 * we are inverting the lru lock/dentry->d_lock here,
	 * so use a trylock. If we fail to get the lock, just skip
	 * it
	 */
	if (!spin_trylock(&dentry->d_lock))
		return LRU_SKIP;

	/*
	 * Referenced dentries are still in use. If they have active
	 * counts, just remove them from the LRU. Otherwise give them
	 * another pass through the LRU.
	 */
	if (dentry->d_lockref.count) {
		d_lru_isolate(lru, dentry);
		spin_unlock(&dentry->d_lock);
		return LRU_REMOVED;
	}

	if (dentry->d_flags & DCACHE_REFERENCED) {
		dentry->d_flags &= ~DCACHE_REFERENCED;
		spin_unlock(&dentry->d_lock);

		/*
		 * The list move itself will be made by the common LRU code. At
		 * this point, we've dropped the dentry->d_lock but keep the
		 * lru lock. This is safe to do, since every list movement is
		 * protected by the lru lock even if both locks are held.
		 *
		 * This is guaranteed by the fact that all LRU management
		 * functions are intermediated by the LRU API calls like
		 * list_lru_add and list_lru_del. List movement in this file
		 * only ever occurs through these functions or through callbacks
		 * like this one, that are called from the LRU API.
		 *
		 * The only exceptions to this are functions like
		 * shrink_dentry_list, and code that first checks for the
		 * DCACHE_SHRINK_LIST flag.  Those are guaranteed to be
		 * operating only with stack provided lists after they are
		 * properly isolated from the main list.  It is thus, always a
		 * local access.
		 */
		return LRU_ROTATE;
	}

	d_lru_shrink_move(lru, dentry, freeable);
	spin_unlock(&dentry->d_lock);

	return LRU_REMOVED;
}

/**
 * prune_dcache_sb - shrink the dcache
 * @sb: superblock
 * @sc: shrink control, passed to list_lru_shrink_walk()
 *
 * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
 * is done when we need more memory and called from the superblock shrinker
 * function.
 *
 * This function may fail to free any resources if all the dentries are in
 * use.
 */
long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
{
	LIST_HEAD(dispose);
	long freed;

	freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
				     dentry_lru_isolate, &dispose);
	shrink_dentry_list(&dispose);
	return freed;
}
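
/*
 * Illustrative note (not part of the original file): this is driven by
 * the superblock shrinker, roughly as in super_cache_scan():
 *
 *	freed = prune_dcache_sb(sb, sc);	// at most sc->nr_to_scan entries
 */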

static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
	struct list_head *freeable = arg;
	struct dentry	*dentry = container_of(item, struct dentry, d_lru);

	/*
	 * we are inverting the lru lock/dentry->d_lock here,
	 * so use a trylock. If we fail to get the lock, just skip
	 * it
	 */
	if (!spin_trylock(&dentry->d_lock))
		return LRU_SKIP;

	d_lru_shrink_move(lru, dentry, freeable);
	spin_unlock(&dentry->d_lock);

	return LRU_REMOVED;
}


/**
 * shrink_dcache_sb - shrink dcache for a superblock
 * @sb: superblock
 *
 * Shrink the dcache for the specified super block. This is used to free
 * the dcache before unmounting a file system.
 */
void shrink_dcache_sb(struct super_block *sb)
{