/*
 * linux/mm/slab.c
 * Written by Mark Hemment, 1996/97.
 * (markhe@nextd.demon.co.uk)
 *
 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
 *
 * Major cleanup, different bufctl logic, per-cpu arrays
 *	(c) 2000 Manfred Spraul
 *
 * Cleanup, make the head arrays unconditional, preparation for NUMA
 * 	(c) 2002 Manfred Spraul
 *
 * An implementation of the Slab Allocator as described in outline in;
 *	UNIX Internals: The New Frontiers by Uresh Vahalia
 *	Pub: Prentice Hall	ISBN 0-13-101908-2
 * or with a little more detail in;
 *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
 *	Jeff Bonwick (Sun Microsystems).
 *	Presented at: USENIX Summer 1994 Technical Conference
 *
 * The memory is organized in caches, one cache for each object type.
 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
 * Each cache consists out of many slabs (they are small (usually one
 * page long) and always contiguous), and each slab contains multiple
 * initialized objects.
 *
 * This means that your constructor is used only for newly allocated
 * slabs and you must pass objects with the same initializations to
 * kmem_cache_free.
 *
 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
 * normal). If you need a special memory type, then you must create a new
 * cache for that memory type.
 *
 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
 *   full slabs with 0 free objects
 *   partial slabs
 *   empty slabs with no allocated objects
 *
 * If partial slabs exist, then new allocations come from these slabs,
 * otherwise from empty slabs or new slabs are allocated.
 *
 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
 *
 * Each cache has a short per-cpu head array, most allocs
 * and frees go into that array, and if that array overflows, then 1/2
 * of the entries in the array are given back into the global cache.
 * The head array is strictly LIFO and should improve the cache hit rates.
 * On SMP, it additionally reduces the spinlock operations.
 *
 * The c_cpuarray may not be read with enabled local interrupts -
 * it's changed with a smp_call_function().
 *
 * SMP synchronization:
 *  constructors and destructors are called without any locking.
 *  Several members in struct kmem_cache and struct slab never change, they
 *	are accessed without any locking.
 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
 *  	and local interrupts are disabled so slab code is preempt-safe.
 *  The non-constant members are protected with a per-cache irq spinlock.
 *
 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
 * in 2000 - many ideas in the current implementation are derived from
 * his patch.
 *
 * Further notes from the original documentation:
 *
 * 11 April '97.  Started multi-threading - markhe
 *	The global cache-chain is protected by the mutex 'cache_chain_mutex'.
 *	The mutex is only needed when accessing/extending the cache-chain, which
 *	can never happen inside an interrupt (kmem_cache_create(),
 *	kmem_cache_shrink() and kmem_cache_reap()).
 *
 *	At present, each engine can be growing a cache.  This should be blocked.
 *
 * 15 March 2005. NUMA slab allocator.
 *	Shai Fultheim <shai@scalex86.org>.
 *	Shobhit Dayal <shobhit@calsoftinc.com>
 *	Alok N Kataria <alokk@calsoftinc.com>
 *	Christoph Lameter <christoph@lameter.com>
 *
 *	Modified the slab allocator to be node aware on NUMA systems.
 *	Each node has its own list of partial, free and full slabs.
 *	All object allocations for a node occur from node specific slab lists.
 */
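
/*
 * Illustrative sketch of the cache API described above (the object type
 * and cache name below are hypothetical, not taken from this file):
 *
 *	struct my_obj { int refcount; };
 *	static struct kmem_cache *my_obj_cache;
 *	struct my_obj *obj;
 *
 *	my_obj_cache = kmem_cache_create("my_obj", sizeof(struct my_obj),
 *					 0, SLAB_HWCACHE_ALIGN, NULL);
 *	obj = kmem_cache_alloc(my_obj_cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(my_obj_cache, obj);
 *	kmem_cache_destroy(my_obj_cache);
 */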

#include	<linux/slab.h>
#include	<linux/mm.h>
#include	<linux/poison.h>
#include	<linux/swap.h>
#include	<linux/cache.h>
#include	<linux/interrupt.h>
#include	<linux/init.h>
#include	<linux/compiler.h>
#include	<linux/cpuset.h>
#include	<linux/proc_fs.h>
#include	<linux/seq_file.h>
#include	<linux/notifier.h>
#include	<linux/kallsyms.h>
#include	<linux/cpu.h>
#include	<linux/sysctl.h>
#include	<linux/module.h>
#include	<linux/rcupdate.h>
#include	<linux/string.h>
#include	<linux/uaccess.h>
#include	<linux/nodemask.h>
#include	<linux/kmemleak.h>
#include	<linux/mempolicy.h>
#include	<linux/mutex.h>
#include	<linux/fault-inject.h>
#include	<linux/rtmutex.h>
#include	<linux/reciprocal_div.h>
#include	<linux/debugobjects.h>
#include	<linux/kmemcheck.h>
#include	<linux/memory.h>

#include	<asm/cacheflush.h>
#include	<asm/tlbflush.h>
#include	<asm/page.h>

/*
 * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
 *		  0 for faster, smaller code (especially in the critical paths).
 *
 * STATS	- 1 to collect stats for /proc/slabinfo.
 *		  0 for faster, smaller code (especially in the critical paths).
 *
 * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
 */

#ifdef CONFIG_DEBUG_SLAB
#define	DEBUG		1
#define	STATS		1
#define	FORCED_DEBUG	1
#else
#define	DEBUG		0
#define	STATS		0
#define	FORCED_DEBUG	0
#endif

/* Shouldn't this be in a header file somewhere? */
#define	BYTES_PER_WORD		sizeof(void *)
#define	REDZONE_ALIGN		max(BYTES_PER_WORD, __alignof__(unsigned long long))

#ifndef ARCH_KMALLOC_FLAGS
#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
#endif

/* Legal flag mask for kmem_cache_create(). */
#if DEBUG
# define CREATE_MASK	(SLAB_RED_ZONE | \
			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
			 SLAB_CACHE_DMA | \
			 SLAB_STORE_USER | \
			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
#else
# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
			 SLAB_CACHE_DMA | \
			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
#endif

/*
 * kmem_bufctl_t:
 *
 * Bufctl's are used for linking objs within a slab
 * linked offsets.
 *
 * This implementation relies on "struct page" for locating the cache &
 * slab an object belongs to.
 * This allows the bufctl structure to be small (one int), but limits
 * the number of objects a slab (not a cache) can contain when off-slab
 * bufctls are used. The limit is the size of the largest general cache
 * that does not use off-slab slabs.
 * For 32bit archs with 4 kB pages, this is 56.
 * This is not serious, as it is only for large objects, when it is unwise
 * to have too many per slab.
 * Note: This limit can be raised by introducing a general cache whose size
 * is less than 512 (PAGE_SIZE<<3), but greater than 256.
 */

typedef unsigned int kmem_bufctl_t;
#define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
#define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
#define	BUFCTL_ACTIVE	(((kmem_bufctl_t)(~0U))-2)
#define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-3)

/*
 * struct slab
 *
 * Manages the objs in a slab. Placed either at the beginning of mem allocated
 * for a slab, or allocated from a general cache.
 * Slabs are chained into three lists: fully used, partial, fully free slabs.
 */
struct slab {
	struct list_head list;
	unsigned long colouroff;
	void *s_mem;		/* including colour offset */
	unsigned int inuse;	/* num of objs active in slab */
	kmem_bufctl_t free;
	unsigned short nodeid;
};
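
/*
 * Illustrative sketch of how the free list is chained (simplified; the
 * real code uses helpers such as index_to_obj() defined further down):
 * the kmem_bufctl_t array sits directly after struct slab in memory and
 * slabp->free holds the index of the first free object.  visit() below
 * is a placeholder.
 *
 *	kmem_bufctl_t *bufctl = (kmem_bufctl_t *)(slabp + 1);
 *	kmem_bufctl_t i;
 *
 *	for (i = slabp->free; i != BUFCTL_END; i = bufctl[i])
 *		visit(slabp->s_mem + cachep->buffer_size * i);
 */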

/*
 * struct slab_rcu
 *
 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
 * arrange for kmem_freepages to be called via RCU.  This is useful if
 * we need to approach a kernel structure obliquely, from its address
 * obtained without the usual locking.  We can lock the structure to
 * stabilize it and check it's still at the given address, only if we
 * can be sure that the memory has not been meanwhile reused for some
 * other kind of object (which our subsystem's lock might corrupt).
 *
 * rcu_read_lock before reading the address, then rcu_read_unlock after
 * taking the spinlock within the structure expected at that address.
 *
 * We assume struct slab_rcu can overlay struct slab when destroying.
 */
struct slab_rcu {
	struct rcu_head head;
	struct kmem_cache *cachep;
	void *addr;
};
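
/*
 * Illustrative sketch of the lookup pattern the comment above describes
 * (structure, lock and helper names are hypothetical):
 *
 *	rcu_read_lock();
 *	obj = lookup_without_locks(key);
 *	if (obj) {
 *		spin_lock(&obj->lock);
 *		if (!obj_matches(obj, key))	<- memory may have been reused
 *			obj = NULL;
 *		...
 *	}
 *	rcu_read_unlock();
 *
 * This is only safe because SLAB_DESTROY_BY_RCU keeps the memory inside
 * the same cache until a grace period has elapsed, so the spinlock taken
 * above always belongs to an object of the expected type.
 */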

/*
 * struct array_cache
 *
 * Purpose:
 * - LIFO ordering, to hand out cache-warm objects from _alloc
 * - reduce the number of linked list operations
 * - reduce spinlock operations
 *
 * The limit is stored in the per-cpu structure to reduce the data cache
 * footprint.
 *
 */
struct array_cache {
	unsigned int avail;
	unsigned int limit;
	unsigned int batchcount;
	unsigned int touched;
	spinlock_t lock;
	void *entry[];	/*
			 * Must have this definition in here for the proper
			 * alignment of array_cache. Also simplifies accessing
			 * the entries.
			 */
};
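
/*
 * Illustrative sketch of the LIFO behaviour (simplified; the real fast
 * paths elsewhere in this file also handle refill, batching and stats):
 *
 *	free:	ac->entry[ac->avail++] = objp;
 *	alloc:	objp = ac->entry[--ac->avail];
 *
 * The most recently freed object is handed out first, which is the one
 * most likely to still be cache-hot.
 */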

/*
 * bootstrap: The caches do not work without cpuarrays anymore, but the
 * cpuarrays are allocated from the generic caches...
 */
#define BOOT_CPUCACHE_ENTRIES	1
struct arraycache_init {
	struct array_cache cache;
	void *entries[BOOT_CPUCACHE_ENTRIES];
};

/*
 * The slab lists for all objects.
 */
struct kmem_list3 {
	struct list_head slabs_partial;	/* partial list first, better asm code */
	struct list_head slabs_full;
	struct list_head slabs_free;
	unsigned long free_objects;
	unsigned int free_limit;
	unsigned int colour_next;	/* Per-node cache coloring */
	spinlock_t list_lock;
	struct array_cache *shared;	/* shared per node */
	struct array_cache **alien;	/* on other nodes */
	unsigned long next_reap;	/* updated without locking */
	int free_touched;		/* updated without locking */
};

/*
 * Need this for bootstrapping a per node allocator.
 */
#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
#define	CACHE_CACHE 0
#define	SIZE_AC MAX_NUMNODES
#define	SIZE_L3 (2 * MAX_NUMNODES)

static int drain_freelist(struct kmem_cache *cache,
			struct kmem_list3 *l3, int tofree);
static void free_block(struct kmem_cache *cachep, void **objpp, int len,
			int node);
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
static void cache_reap(struct work_struct *unused);

/*
 * This function must be completely optimized away if a constant is passed to
 * it.  Mostly the same as what is in linux/slab.h except it returns an index.
 */
static __always_inline int index_of(const size_t size)
{
	extern void __bad_size(void);

	if (__builtin_constant_p(size)) {
		int i = 0;

#define CACHE(x) \
	if (size <=x) \
		return i; \
	else \
		i++;
#include <linux/kmalloc_sizes.h>
#undef CACHE
		__bad_size();
	} else
		__bad_size();
	return 0;
}
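
/*
 * Illustrative expansion (sketch; the exact size list comes from
 * kmalloc_sizes.h and is configuration dependent): with sizes
 * 32, 64, 96, 128, ... the CACHE() block above unrolls, after constant
 * folding, to
 *
 *	if (size <= 32) return 0; else i++;
 *	if (size <= 64) return 1; else i++;
 *	if (size <= 96) return 2; else i++;
 *	...
 *
 * so a constant-size call such as index_of(24) folds to a compile-time
 * constant and generates no code.
 */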

static int slab_early_init = 1;

#define INDEX_AC index_of(sizeof(struct arraycache_init))
#define INDEX_L3 index_of(sizeof(struct kmem_list3))

static void kmem_list3_init(struct kmem_list3 *parent)
{
	INIT_LIST_HEAD(&parent->slabs_full);
	INIT_LIST_HEAD(&parent->slabs_partial);
	INIT_LIST_HEAD(&parent->slabs_free);
	parent->shared = NULL;
	parent->alien = NULL;
	parent->colour_next = 0;
	spin_lock_init(&parent->list_lock);
	parent->free_objects = 0;
	parent->free_touched = 0;
}

#define MAKE_LIST(cachep, listp, slab, nodeid)				\
	do {								\
		INIT_LIST_HEAD(listp);					\
		list_splice(&(cachep->nodelists[nodeid]->slab), listp);	\
	} while (0)

#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
	do {								\
	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
	} while (0)

#define CFLGS_OFF_SLAB		(0x80000000UL)
#define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)

#define BATCHREFILL_LIMIT	16
/*
 * Optimization question: fewer reaps means less probability for unnecessary
 * cpucache drain/refill cycles.
 *
 * OTOH the cpuarrays can contain lots of objects,
 * which could lock up otherwise freeable slabs.
 */
#define REAPTIMEOUT_CPUC	(2*HZ)
#define REAPTIMEOUT_LIST3	(4*HZ)

#if STATS
#define	STATS_INC_ACTIVE(x)	((x)->num_active++)
#define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
#define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
#define	STATS_INC_GROWN(x)	((x)->grown++)
#define	STATS_ADD_REAPED(x,y)	((x)->reaped += (y))
#define	STATS_SET_HIGH(x)						\
	do {								\
		if ((x)->num_active > (x)->high_mark)			\
			(x)->high_mark = (x)->num_active;		\
	} while (0)
#define	STATS_INC_ERR(x)	((x)->errors++)
#define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
#define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
#define STATS_INC_ACOVERFLOW(x)   ((x)->node_overflow++)
#define	STATS_SET_FREEABLE(x, i)					\
	do {								\
		if ((x)->max_freeable < i)				\
			(x)->max_freeable = i;				\
	} while (0)
#define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
#define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
#define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
#define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
#else
#define	STATS_INC_ACTIVE(x)	do { } while (0)
#define	STATS_DEC_ACTIVE(x)	do { } while (0)
#define	STATS_INC_ALLOCED(x)	do { } while (0)
#define	STATS_INC_GROWN(x)	do { } while (0)
#define	STATS_ADD_REAPED(x,y)	do { (void)(y); } while (0)
#define	STATS_SET_HIGH(x)	do { } while (0)
#define	STATS_INC_ERR(x)	do { } while (0)
#define	STATS_INC_NODEALLOCS(x)	do { } while (0)
#define	STATS_INC_NODEFREES(x)	do { } while (0)
#define STATS_INC_ACOVERFLOW(x)   do { } while (0)
#define	STATS_SET_FREEABLE(x, i) do { } while (0)
#define STATS_INC_ALLOCHIT(x)	do { } while (0)
#define STATS_INC_ALLOCMISS(x)	do { } while (0)
#define STATS_INC_FREEHIT(x)	do { } while (0)
#define STATS_INC_FREEMISS(x)	do { } while (0)
#endif

#if DEBUG

/*
 * memory layout of objects:
 * 0		: objp
 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
 * 		the end of an object is aligned with the end of the real
 * 		allocation. Catches writes behind the end of the allocation.
 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
 * 		redzone word.
 * cachep->obj_offset: The real object.
 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
 *					[BYTES_PER_WORD long]
 */
static int obj_offset(struct kmem_cache *cachep)
{
	return cachep->obj_offset;
}

static int obj_size(struct kmem_cache *cachep)
{
	return cachep->obj_size;
}

static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
	return (unsigned long long*) (objp + obj_offset(cachep) -
				      sizeof(unsigned long long));
}

static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
	if (cachep->flags & SLAB_STORE_USER)
		return (unsigned long long *)(objp + cachep->buffer_size -
					      sizeof(unsigned long long) -
					      REDZONE_ALIGN);
	return (unsigned long long *) (objp + cachep->buffer_size -
				       sizeof(unsigned long long));
}

static void **dbg_userword(struct kmem_cache *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
	return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
}

#else

#define obj_offset(x)			0
#define obj_size(cachep)		(cachep->buffer_size)
#define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
#define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
#define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})

#endif

#ifdef CONFIG_TRACING
size_t slab_buffer_size(struct kmem_cache *cachep)
{
	return cachep->buffer_size;
}
EXPORT_SYMBOL(slab_buffer_size);
#endif

/*
 * Do not go above this order unless 0 objects fit into the slab.
 */
#define	BREAK_GFP_ORDER_HI	1
#define	BREAK_GFP_ORDER_LO	0
static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;

/*
 * Functions for storing/retrieving the cachep and or slab from the page
 * allocator.  These are used to find the slab an obj belongs to.  With kfree(),
 * these are used to find the cache which an obj belongs to.
Linus Torvalds's avatar
Linus Torvalds committed
488
 */
489 490 491 492 493 494 495
static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
{
	page->lru.next = (struct list_head *)cache;
}

static inline struct kmem_cache *page_get_cache(struct page *page)
{
	page = compound_head(page);
	BUG_ON(!PageSlab(page));
	return (struct kmem_cache *)page->lru.next;
}

static inline void page_set_slab(struct page *page, struct slab *slab)
{
	page->lru.prev = (struct list_head *)slab;
}

static inline struct slab *page_get_slab(struct page *page)
{
	BUG_ON(!PageSlab(page));
	return (struct slab *)page->lru.prev;
}

static inline struct kmem_cache *virt_to_cache(const void *obj)
{
	struct page *page = virt_to_head_page(obj);
	return page_get_cache(page);
}

static inline struct slab *virt_to_slab(const void *obj)
{
	struct page *page = virt_to_head_page(obj);
	return page_get_slab(page);
}

static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
				 unsigned int idx)
{
	return slab->s_mem + cache->buffer_size * idx;
}

/*
 * We want to avoid an expensive divide : (offset / cache->buffer_size)
 *   Using the fact that buffer_size is a constant for a particular cache,
 *   we can replace (offset / cache->buffer_size) by
 *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
 */
static inline unsigned int obj_to_index(const struct kmem_cache *cache,
					const struct slab *slab, void *obj)
{
	u32 offset = (obj - slab->s_mem);
	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
}
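
/*
 * Illustrative example with hypothetical numbers: for a cache whose
 * buffer_size is 256, reciprocal_buffer_size is initialized elsewhere
 * via reciprocal_value(256), and
 *
 *	obj_to_index(cache, slab, slab->s_mem + 512)
 *
 * computes reciprocal_divide(512, ...) == 512 / 256 == 2 without a
 * runtime division instruction.
 */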

/*
 * These are the default caches for kmalloc. Custom caches can have other sizes.
 */
struct cache_sizes malloc_sizes[] = {
#define CACHE(x) { .cs_size = (x) },
#include <linux/kmalloc_sizes.h>
	CACHE(ULONG_MAX)
#undef CACHE
};
EXPORT_SYMBOL(malloc_sizes);

/* Must match cache_sizes above. Out of line to keep cache footprint low. */
struct cache_names {
	char *name;
	char *name_dma;
};

static struct cache_names __initdata cache_names[] = {
#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
#include <linux/kmalloc_sizes.h>
	{NULL,}
#undef CACHE
};

static struct arraycache_init initarray_cache __initdata =
    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
static struct arraycache_init initarray_generic =
    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };

/* internal cache of cache description objs */
static struct kmem_cache cache_cache = {
	.batchcount = 1,
	.limit = BOOT_CPUCACHE_ENTRIES,
	.shared = 1,
	.buffer_size = sizeof(struct kmem_cache),
	.name = "kmem_cache",
};

#define BAD_ALIEN_MAGIC 0x01020304ul

/*
 * chicken and egg problem: delay the per-cpu array allocation
 * until the general caches are up.
 */
static enum {
	NONE,
	PARTIAL_AC,
	PARTIAL_L3,
	EARLY,
	FULL
} g_cpucache_up;

/*
 * used by boot code to determine if it can use slab based allocator
 */
int slab_is_available(void)
{
	return g_cpucache_up >= EARLY;
}

#ifdef CONFIG_LOCKDEP

/*
 * Slab sometimes uses the kmalloc slabs to store the slab headers
 * for other slabs "off slab".
 * The locking for this is tricky in that it nests within the locks
 * of all other slabs in a few places; to deal with this special
 * locking we put on-slab caches into a separate lock-class.
 *
 * We set lock class for alien array caches which are up during init.
 * The lock annotation will be lost if all cpus of a node go down and
 * then come back up during hotplug
 */
static struct lock_class_key on_slab_l3_key;
static struct lock_class_key on_slab_alc_key;

static void init_node_lock_keys(int q)
{
	struct cache_sizes *s = malloc_sizes;

	if (g_cpucache_up != FULL)
		return;

	for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
		struct array_cache **alc;
		struct kmem_list3 *l3;
		int r;

		l3 = s->cs_cachep->nodelists[q];
		if (!l3 || OFF_SLAB(s->cs_cachep))
			continue;
		lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
		alc = l3->alien;
		/*
		 * FIXME: This check for BAD_ALIEN_MAGIC
		 * should go away when common slab code is taught to
		 * work even without alien caches.
		 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
		 * for alloc_alien_cache,
		 */
		if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
			continue;
		for_each_node(r) {
			if (alc[r])
				lockdep_set_class(&alc[r]->lock,
					&on_slab_alc_key);
		}
	}
}

static inline void init_lock_keys(void)
{
	int node;

	for_each_node(node)
		init_node_lock_keys(node);
}
#else
static void init_node_lock_keys(int q)
{
}

static inline void init_lock_keys(void)
{
}
#endif

/*
 * Guard access to the cache-chain.
 */
static DEFINE_MUTEX(cache_chain_mutex);
static struct list_head cache_chain;

static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);

static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
{
	return cachep->array[smp_processor_id()];
}

static inline struct kmem_cache *__find_general_cachep(size_t size,
							gfp_t gfpflags)
{
	struct cache_sizes *csizep = malloc_sizes;

#if DEBUG
	/* This happens if someone tries to call
	 * kmem_cache_create(), or __kmalloc(), before
	 * the generic caches are initialized.
	 */
	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
#endif
	if (!size)
		return ZERO_SIZE_PTR;

	while (size > csizep->cs_size)
		csizep++;

	/*
	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
	 * has cs_{dma,}cachep==NULL. Thus no special case
	 * for large kmalloc calls required.
	 */
#ifdef CONFIG_ZONE_DMA
	if (unlikely(gfpflags & GFP_DMA))
		return csizep->cs_dmacachep;
#endif
	return csizep->cs_cachep;
}

static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
{
	return __find_general_cachep(size, gfpflags);
}

static size_t slab_mgmt_size(size_t nr_objs, size_t align)
{
	return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
}

/*
 * Calculate the number of objects and left-over bytes for a given buffer size.
 */
static void cache_estimate(unsigned long gfporder, size_t buffer_size,
			   size_t align, int flags, size_t *left_over,
			   unsigned int *num)
{
	int nr_objs;
	size_t mgmt_size;
	size_t slab_size = PAGE_SIZE << gfporder;

	/*
	 * The slab management structure can be either off the slab or
	 * on it. For the latter case, the memory allocated for a
	 * slab is used for:
	 *
	 * - The struct slab
	 * - One kmem_bufctl_t for each object
	 * - Padding to respect alignment of @align
	 * - @buffer_size bytes for each object
	 *
	 * If the slab management structure is off the slab, then the
	 * alignment will already be calculated into the size. Because
	 * the slabs are all pages aligned, the objects will be at the
	 * correct alignment when allocated.
	 */
	if (flags & CFLGS_OFF_SLAB) {
		mgmt_size = 0;
		nr_objs = slab_size / buffer_size;

		if (nr_objs > SLAB_LIMIT)
			nr_objs = SLAB_LIMIT;
	} else {
		/*
		 * Ignore padding for the initial guess. The padding
		 * is at most @align-1 bytes, and @buffer_size is at
		 * least @align. In the worst case, this result will
		 * be one greater than the number of objects that fit
		 * into the memory allocation when taking the padding
		 * into account.
		 */
		nr_objs = (slab_size - sizeof(struct slab)) /
			  (buffer_size + sizeof(kmem_bufctl_t));

		/*
		 * This calculated number will be either the right
		 * amount, or one greater than what we want.
		 */
		if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
		       > slab_size)
			nr_objs--;

		if (nr_objs > SLAB_LIMIT)
			nr_objs = SLAB_LIMIT;

		mgmt_size = slab_mgmt_size(nr_objs, align);
	}
	*num = nr_objs;
	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
}
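
/*
 * Worked example (hypothetical sizes, assuming sizeof(struct slab) == 32
 * and sizeof(kmem_bufctl_t) == 4): an order-0 (4096 byte) slab of
 * 128 byte objects with on-slab management and align == 32 gives
 *
 *	initial guess:	(4096 - 32) / (128 + 4)		= 30 objects
 *	mgmt_size:	ALIGN(32 + 30 * 4, 32)		= 160 bytes
 *	fit check:	160 + 30 * 128 = 4000 <= 4096	-> keep 30
 *	left_over:	4096 - 30 * 128 - 160		= 96 bytes
 *
 * The left-over bytes are what cache colouring later distributes.
 */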

#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)

static void __slab_error(const char *function, struct kmem_cache *cachep,
			char *msg)
{
	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
	       function, cachep->name, msg);
	dump_stack();
}

/*
 * By default on NUMA we use alien caches to stage the freeing of
 * objects allocated from other nodes. This causes massive memory
 * inefficiencies when using fake NUMA setup to split memory into a
 * large number of small nodes, so it can be disabled on the command
 * line
  */

static int use_alien_caches __read_mostly = 1;
static int __init noaliencache_setup(char *s)
{
	use_alien_caches = 0;
	return 1;
}
__setup("noaliencache", noaliencache_setup);

#ifdef CONFIG_NUMA
/*
 * Special reaping functions for NUMA systems called from cache_reap().
 * These take care of doing round robin flushing of alien caches (containing
 * objects freed on different nodes from which they were allocated) and the
 * flushing of remote pcps by calling drain_node_pages.
 */
817
static DEFINE_PER_CPU(unsigned long, slab_reap_node);
818 819 820 821 822

static void init_reap_node(int cpu)
{
	int node;

823
	node = next_node(cpu_to_mem(cpu), node_online_map);
824
	if (node == MAX_NUMNODES)
825
		node = first_node(node_online_map);
826

827
	per_cpu(slab_reap_node, cpu) = node;
828 829 830 831
}

static void next_reap_node(void)
{
832
	int node = __get_cpu_var(slab_reap_node);
833 834 835 836

	node = next_node(node, node_online_map);
	if (unlikely(node >= MAX_NUMNODES))
		node = first_node(node_online_map);
837
	__get_cpu_var(slab_reap_node) = node;
838 839 840 841 842 843 844
}

#else
#define init_reap_node(cpu) do { } while (0)
#define next_reap_node(void) do { } while (0)
#endif

Linus Torvalds's avatar
Linus Torvalds committed
845 846 847 848 849 850 851
/*
 * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
 * via the workqueue/eventd.
 * Add the CPU number into the expiration time to minimize the possibility of
 * the CPUs getting into lockstep and contending for the global cache chain
 * lock.
 */
static void __cpuinit start_cpu_timer(int cpu)
{
	struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);

	/*
	 * When this gets called from do_initcalls via cpucache_init(),
	 * init_workqueues() has already run, so keventd will be setup
	 * at that time.
	 */
	if (keventd_up() && reap_work->work.func == NULL) {
		init_reap_node(cpu);
		INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap);
		schedule_delayed_work_on(cpu, reap_work,
					__round_jiffies_relative(HZ, cpu));
	}
}

static struct array_cache *alloc_arraycache(int node, int entries,
					    int batchcount, gfp_t gfp)
{
	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
	struct array_cache *nc = NULL;

	nc = kmalloc_node(memsize, gfp, node);
	/*
	 * The array_cache structures contain pointers to free objects.
	 * However, when such objects are allocated or transferred to another
	 * cache the pointers are not cleared and they could be counted as
	 * valid references during a kmemleak scan. Therefore, kmemleak must
	 * not scan such objects.
	 */
	kmemleak_no_scan(nc);
	if (nc) {
		nc->avail = 0;
		nc->limit = entries;
		nc->batchcount = batchcount;
		nc->touched = 0;
		spin_lock_init(&nc->lock);
	}
	return nc;
}

/*
 * Transfer objects in one arraycache to another.
 * Locking must be handled by the caller.
 *
 * Return the number of entries transferred.
 */
static int transfer_objects(struct array_cache *to,
		struct array_cache *from, unsigned int max)
{
	/* Figure out how many entries to transfer */
	int nr = min3(from->avail, max, to->limit - to->avail);

	if (!nr)
		return 0;

	memcpy(to->entry + to->avail, from->entry + from->avail -nr,
			sizeof(void *) *nr);

	from->avail -= nr;
	to->avail += nr;
	return nr;
}

#ifndef CONFIG_NUMA

#define drain_alien_cache(cachep, alien) do { } while (0)
#define reap_alien(cachep, l3) do { } while (0)

static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
{
	return (struct array_cache **)BAD_ALIEN_MAGIC;
}

static inline void free_alien_cache(struct array_cache **ac_ptr)
{
}

static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
{
	return 0;
}

static inline void *alternate_node_alloc(struct kmem_cache *cachep,
		gfp_t flags)
{
	return NULL;
}

static inline void *____cache_alloc_node(struct kmem_cache *cachep,
		 gfp_t flags, int nodeid)
{
	return NULL;
}

#else	/* CONFIG_NUMA */

static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
static void *alternate_node_alloc(struct kmem_cache *, gfp_t);

static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
{
	struct array_cache **ac_ptr;
	int memsize = sizeof(void *) * nr_node_ids;
	int i;

	if (limit > 1)
		limit = 12;
	ac_ptr = kzalloc_node(memsize, gfp, node);
	if (ac_ptr) {
		for_each_node(i) {
			if (i == node || !node_online(i))
				continue;
			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
			if (!ac_ptr[i]) {
				for (i--; i >= 0; i--)
					kfree(ac_ptr[i]);
				kfree(ac_ptr);
				return NULL;
			}
		}
	}
	return ac_ptr;
}

static void free_alien_cache(struct array_cache **ac_ptr)
{
	int i;

	if (!ac_ptr)
		return;
	for_each_node(i)
	    kfree(ac_ptr[i]);
	kfree(ac_ptr);
}

static void __drain_alien_cache(struct kmem_cache *cachep,
				struct array_cache *ac, int node)
{
	struct kmem_list3 *rl3 = cachep->nodelists[node];

	if (ac->avail) {
		spin_lock(&rl3->list_lock);
		/*
		 * Stuff objects into the remote nodes shared array first.
		 * That way we could avoid the overhead of putting the objects
		 * into the free lists and getting them back later.
		 */
		if (rl3->shared)
			transfer_objects(rl3->shared, ac, ac->limit);

		free_block(cachep, ac->entry, ac->avail, node);
		ac->avail = 0;
		spin_unlock(&rl3->list_lock);
	}
}

/*
 * Called from cache_reap() to regularly drain alien caches round robin.
 */
static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
{
	int node = __get_cpu_var(slab_reap_node);

	if (l3->alien) {
		struct array_cache *ac = l3->alien[node];

		if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
			__drain_alien_cache(cachep, ac, node);
			spin_unlock_irq(&ac->lock);
		}
	}
}

static void drain_alien_cache(struct kmem_cache *cachep,
				struct array_cache **alien)
{
	int i = 0;
	struct array_cache *ac;
	unsigned long flags;

	for_each_online_node(i) {
		ac = alien[i];
		if (ac) {
			spin_lock_irqsave(&ac->lock, flags);
			__drain_alien_cache(cachep, ac, i);
			spin_unlock_irqrestore(&ac->lock, flags);
		}
	}
}

static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
{
	struct slab *slabp = virt_to_slab(objp);
	int nodeid = slabp->nodeid;
	struct kmem_list3 *l3;
	struct array_cache *alien = NULL;
	int node;

	node = numa_mem_id();

	/*
	 * Make sure we are not freeing an object from another node to the array
	 * cache on this cpu.
	 */
	if (likely(slabp->nodeid == node))
		return 0;

	l3 = cachep->nodelists[node];
	STATS_INC_NODEFREES(cachep);
	if (l3->alien && l3->alien[nodeid]) {
		alien = l3->alien[nodeid];
		spin_lock(&alien->lock);
		if (unlikely(alien->avail == alien->limit)) {
			STATS_INC_ACOVERFLOW(cachep);
			__drain_alien_cache(cachep, alien, nodeid);
		}
		alien->entry[alien->avail++] = objp;
		spin_unlock(&alien->lock);
	} else {
		spin_lock(&(cachep->nodelists[nodeid])->list_lock);
		free_block(cachep, &objp, 1, nodeid);
		spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
	}
	return 1;
}
#endif

/*
 * Allocates and initializes nodelists for a node on each slab cache, used for
 * either memory or cpu hotplug.  If memory is being hot-added, the kmem_list3
 * will be allocated off-node since memory is not yet online for the new node.
 * When hotplugging memory or a cpu, existing nodelists are not replaced if
 * already in use.
 *
 * Must hold cache_chain_mutex.
 */
static int init_cache_nodelists_node(int node)
{
	struct kmem_cache *cachep;
	struct kmem_list3 *l3;
	const int memsize = sizeof(struct kmem_list3);

	list_for_each_entry(cachep, &cache_chain, next) {
		/*
		 * Set up the size64 kmemlist for cpu before we can
		 * begin anything. Make sure some other cpu on this
		 * node has not already allocated this
		 */
		if (!cachep->nodelists[node]) {
			l3 = kmalloc_node(memsize, GFP_KERNEL, node);
			if (!l3)
				return -ENOMEM;
			kmem_list3_init(l3);
			l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
			    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;

			/*
			 * The l3s don't come and go as CPUs come and
			 * go.  cache_chain_mutex is sufficient
			 * protection here.
			 */
			cachep->nodelists[node] = l3;
		}

		spin_lock_irq(&cachep->nodelists[node]->list_lock);
		cachep->nodelists[node]->free_limit =
			(1 + nr_cpus_node(node)) *
			cachep->batchcount + cachep->num;
		spin_unlock_irq(&cachep->nodelists[node]->list_lock);
	}
	return 0;
}

static void __cpuinit cpuup_canceled(long cpu)
{
	struct kmem_cache *cachep;
	struct kmem_list3 *l3 = NULL;
	int node = cpu_to_mem(cpu);
	const struct cpumask *mask = cpumask_of_node(node);

	list_for_each_entry(cachep, &cache_chain, next) {
		struct array_cache *nc;
		struct array_cache *shared;
		struct array_cache **alien;

		/* cpu is dead; no one can alloc from it. */
		nc = cachep->array[cpu];
		cachep->array[cpu] = NULL;
		l3 = cachep->nodelists[node];

		if (!l3)
			goto free_array_cache;

		spin_lock_irq(&l3->list_lock);

		/* Free limit for this kmem_list3 */
		l3->free_limit -= cachep->batchcount;
		if (nc)
			free_block(cachep, nc->entry, nc->avail, node);

		if (!cpumask_empty(mask)) {
			spin_unlock_irq(&l3->list_lock);
			goto free_array_cache;
		}

		shared = l3->shared;
		if (shared) {
			free_block(cachep, shared->entry,
				   shared->avail, node);
			l3->shared = NULL;
		}

		alien = l3->alien;
		l3->alien = NULL;

		spin_unlock_irq(&l3->list_lock);

		kfree(shared);
		if (alien) {
			drain_alien_cache(cachep, alien);
			free_alien_cache(alien);
		}
free_array_cache:
		kfree(nc);
	}
	/*
	 * In the previous loop, all the objects were freed to
	 * the respective cache's slabs,  now we can go ahead and
	 * shrink each nodelist to its limit.
	 */
	list_for_each_entry(cachep, &cache_chain, next) {
		l3 = cachep->nodelists[node];
		if (!l3)
			continue;
		drain_freelist(cachep, l3, l3->free_objects);
	}
}

static int __cpuinit cpuup_prepare(long cpu)
{
	struct kmem_cache *cachep;
	struct kmem_list3 *l3 = NULL;
	int node = cpu_to_mem(cpu);
	int err;

	/*
	 * We need to do this right in the beginning since
	 * alloc_arraycache's are going to use this list.
	 * kmalloc_node allows us to add the slab to the right
	 * kmem_list3 and not this cpu's kmem_list3
	 */
	err = init_cache_nodelists_node(node);
	if (err < 0)
		goto bad;

	/*
	 * Now we can go ahead with allocating the shared arrays and
	 * array caches
	 */
	list_for_each_entry(cachep, &cache_chain, next) {
		struct array_cache *nc;
		struct array_cache *shared = NULL;
		struct array_cache **alien = NULL;

		nc = alloc_arraycache(node, cachep->limit,
					cachep->batchcount, GFP_KERNEL);
		if (!nc)
			goto bad;
		if (cachep->shared) {
			shared = alloc_arraycache(node,
				cachep->shared * cachep->batchcount,
				0xbaadf00d, GFP_KERNEL);
			if (!shared) {
				kfree(nc);
				goto bad;
			}
		}
		if (use_alien_caches) {