memory_hotplug.c 20.9 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/*
 *  linux/mm/memory_hotplug.c
 *
 *  Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/pagevec.h>
16
#include <linux/writeback.h>
17
18
19
20
21
22
23
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
24
#include <linux/ioport.h>
25
26
27
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
28
#include <linux/pfn.h>
29
30
31

#include <asm/tlbflush.h>

32
33
#include "internal.h"

34
35
36
37
38
39
40
41
42
43
/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
	struct resource *res;
	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
	BUG_ON(!res);

	res->name = "System RAM";
	res->start = start;
	res->end = start + size - 1;
44
	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
	if (request_resource(&iomem_resource, res) < 0) {
		printk("System RAM resource %llx - %llx cannot be added\n",
		(unsigned long long)res->start, (unsigned long long)res->end);
		kfree(res);
		res = NULL;
	}
	return res;
}

static void release_memory_resource(struct resource *res)
{
	if (!res)
		return;
	release_resource(res);
	kfree(res);
	return;
}

63
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
64
#ifndef CONFIG_SPARSEMEM_VMEMMAP
65
static void get_page_bootmem(unsigned long info,  struct page *page, int type)
66
{
67
	atomic_set(&page->_mapcount, type);
68
69
70
71
72
73
74
	SetPagePrivate(page);
	set_page_private(page, info);
	atomic_inc(&page->_count);
}

void put_page_bootmem(struct page *page)
{
75
	int type;
76

77
78
	type = atomic_read(&page->_mapcount);
	BUG_ON(type >= -1);
79
80
81
82
83
84
85
86
87
88

	if (atomic_dec_return(&page->_count) == 1) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
		reset_page_mapcount(page);
		__free_pages_bootmem(page, 0);
	}

}

89
static void register_page_bootmem_info_section(unsigned long start_pfn)
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	if (!pfn_valid(start_pfn))
		return;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	/* Get section's memmap address */
	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	/*
	 * Get page for the memmap's phys address
	 * XXX: need more consideration for sparse_vmemmap...
	 */
	page = virt_to_page(memmap);
	mapsize = sizeof(struct page) * PAGES_PER_SECTION;
	mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;

	/* remember memmap's page */
	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, SECTION_INFO);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
122
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161

}

void register_page_bootmem_info_node(struct pglist_data *pgdat)
{
	unsigned long i, pfn, end_pfn, nr_pages;
	int node = pgdat->node_id;
	struct page *page;
	struct zone *zone;

	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
	page = virt_to_page(pgdat);

	for (i = 0; i < nr_pages; i++, page++)
		get_page_bootmem(node, page, NODE_INFO);

	zone = &pgdat->node_zones[0];
	for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
		if (zone->wait_table) {
			nr_pages = zone->wait_table_hash_nr_entries
				* sizeof(wait_queue_head_t);
			nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
			page = virt_to_page(zone->wait_table);

			for (i = 0; i < nr_pages; i++, page++)
				get_page_bootmem(node, page, NODE_INFO);
		}
	}

	pfn = pgdat->node_start_pfn;
	end_pfn = pfn + pgdat->node_spanned_pages;

	/* register_section info */
	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION)
		register_page_bootmem_info_section(pfn);

}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
			   unsigned long end_pfn)
{
	unsigned long old_zone_end_pfn;

	zone_span_writelock(zone);

	old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
	if (start_pfn < zone->zone_start_pfn)
		zone->zone_start_pfn = start_pfn;

	zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
				zone->zone_start_pfn;

	zone_span_writeunlock(zone);
}

static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
			    unsigned long end_pfn)
{
	unsigned long old_pgdat_end_pfn =
		pgdat->node_start_pfn + pgdat->node_spanned_pages;

	if (start_pfn < pgdat->node_start_pfn)
		pgdat->node_start_pfn = start_pfn;

	pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
					pgdat->node_start_pfn;
}

Al Viro's avatar
Al Viro committed
192
static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
193
194
195
196
197
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int nid = pgdat->node_id;
	int zone_type;
198
	unsigned long flags;
199
200

	zone_type = zone - pgdat->node_zones;
201
202
203
204
205
206
207
208
209
210
211
212
213
	if (!zone->wait_table) {
		int ret;

		ret = init_currently_empty_zone(zone, phys_start_pfn,
						nr_pages, MEMMAP_HOTPLUG);
		if (ret)
			return ret;
	}
	pgdat_resize_lock(zone->zone_pgdat, &flags);
	grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
	grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
			phys_start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
Dave Hansen's avatar
Dave Hansen committed
214
215
	memmap_init_zone(nr_pages, nid, zone_type,
			 phys_start_pfn, MEMMAP_HOTPLUG);
216
	return 0;
217
218
}

219
220
static int __meminit __add_section(int nid, struct zone *zone,
					unsigned long phys_start_pfn)
221
222
223
224
{
	int nr_pages = PAGES_PER_SECTION;
	int ret;

225
226
227
	if (pfn_valid(phys_start_pfn))
		return -EEXIST;

228
	ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
229
230
231
232

	if (ret < 0)
		return ret;

233
234
235
236
237
	ret = __add_zone(zone, phys_start_pfn);

	if (ret < 0)
		return ret;

238
	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
239
240
}

241
242
243
244
245
246
247
248
249
250
#ifdef CONFIG_SPARSEMEM_VMEMMAP
static int __remove_section(struct zone *zone, struct mem_section *ms)
{
	/*
	 * XXX: Freeing memmap with vmemmap is not implement yet.
	 *      This should be removed later.
	 */
	return -EBUSY;
}
#else
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
static int __remove_section(struct zone *zone, struct mem_section *ms)
{
	unsigned long flags;
	struct pglist_data *pgdat = zone->zone_pgdat;
	int ret = -EINVAL;

	if (!valid_section(ms))
		return ret;

	ret = unregister_memory_section(ms);
	if (ret)
		return ret;

	pgdat_resize_lock(pgdat, &flags);
	sparse_remove_one_section(zone, ms);
	pgdat_resize_unlock(pgdat, &flags);
	return 0;
}
269
#endif
270

271
272
273
274
275
276
/*
 * Reasonably generic function for adding memory.  It is
 * expected that archs that support memory hotplug will
 * call this function after deciding the zone to which to
 * add the new pages.
 */
277
278
int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
			unsigned long nr_pages)
279
280
281
{
	unsigned long i;
	int err = 0;
282
283
284
285
	int start_sec, end_sec;
	/* during initialize mem_map, align hot-added range to section */
	start_sec = pfn_to_section_nr(phys_start_pfn);
	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
286

287
	for (i = start_sec; i <= end_sec; i++) {
288
		err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);
289

290
		/*
Simon Arlott's avatar
Simon Arlott committed
291
		 * EEXIST is finally dealt with by ioresource collision
292
293
		 * check. see add_memory() => register_memory_resource()
		 * Warning will be printed if there is collision.
294
295
		 */
		if (err && (err != -EEXIST))
296
			break;
297
		err = 0;
298
299
300
301
	}

	return err;
}
302
EXPORT_SYMBOL_GPL(__add_pages);
303

304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
/**
 * __remove_pages() - remove sections of pages from a zone
 * @zone: zone from which pages need to be removed
 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. Caller needs to make
 * sure that pages are marked reserved and zones are adjust properly by
 * calling offline_pages().
 */
int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
		 unsigned long nr_pages)
{
	unsigned long i, ret = 0;
	int sections_to_remove;

	/*
	 * We can only remove entire sections
	 */
	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
	BUG_ON(nr_pages % PAGES_PER_SECTION);

	sections_to_remove = nr_pages / PAGES_PER_SECTION;
	for (i = 0; i < sections_to_remove; i++) {
		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
330
331
		release_mem_region(pfn << PAGE_SHIFT,
				   PAGES_PER_SECTION << PAGE_SHIFT);
332
333
334
335
336
337
338
339
		ret = __remove_section(zone, __pfn_to_section(pfn));
		if (ret)
			break;
	}
	return ret;
}
EXPORT_SYMBOL_GPL(__remove_pages);

340
341
void online_page(struct page *page)
{
342
343
	unsigned long pfn = page_to_pfn(page);

344
	totalram_pages++;
345
346
	if (pfn >= num_physpages)
		num_physpages = pfn + 1;
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361

#ifdef CONFIG_HIGHMEM
	if (PageHighMem(page))
		totalhigh_pages++;
#endif

#ifdef CONFIG_FLATMEM
	max_mapnr = max(page_to_pfn(page), max_mapnr);
#endif

	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);
}

362
363
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
			void *arg)
364
365
{
	unsigned long i;
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
	unsigned long onlined_pages = *(unsigned long *)arg;
	struct page *page;
	if (PageReserved(pfn_to_page(start_pfn)))
		for (i = 0; i < nr_pages; i++) {
			page = pfn_to_page(start_pfn + i);
			online_page(page);
			onlined_pages++;
		}
	*(unsigned long *)arg = onlined_pages;
	return 0;
}


int online_pages(unsigned long pfn, unsigned long nr_pages)
{
381
382
	unsigned long onlined_pages = 0;
	struct zone *zone;
383
	int need_zonelists_rebuild = 0;
384
385
386
387
388
389
390
391
392
393
394
	int nid;
	int ret;
	struct memory_notify arg;

	arg.start_pfn = pfn;
	arg.nr_pages = nr_pages;
	arg.status_change_nid = -1;

	nid = page_to_nid(pfn_to_page(pfn));
	if (node_present_pages(nid) == 0)
		arg.status_change_nid = nid;
395

396
397
398
399
400
401
	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret) {
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		return ret;
	}
402
403
404
	/*
	 * This doesn't need a lock to do pfn_to_page().
	 * The section can't be removed here because of the
405
	 * memory_block->state_mutex.
406
407
	 */
	zone = page_zone(pfn_to_page(pfn));
408
409
410
411
412
413
414
415
	/*
	 * If this zone is not populated, then it is not in zonelist.
	 * This means the page allocator ignores this zone.
	 * So, zonelist must be updated after online.
	 */
	if (!populated_zone(zone))
		need_zonelists_rebuild = 1;

416
	ret = walk_memory_resource(pfn, nr_pages, &onlined_pages,
417
		online_pages_range);
418
419
420
421
422
423
424
	if (ret) {
		printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
			nr_pages, pfn);
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		return ret;
	}

425
	zone->present_pages += onlined_pages;
426
	zone->zone_pgdat->node_present_pages += onlined_pages;
427

428
	zone_pcp_update(zone);
429
	setup_per_zone_wmarks();
430
	calculate_zone_inactive_ratio(zone);
431
432
433
434
	if (onlined_pages) {
		kswapd_run(zone_to_nid(zone));
		node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
	}
435

436
437
	if (need_zonelists_rebuild)
		build_all_zonelists();
438
439
440
	else
		vm_total_pages = nr_free_pagecache_pages();

441
	writeback_set_ratelimit();
442
443
444
445

	if (onlined_pages)
		memory_notify(MEM_ONLINE, &arg);

446
447
	return 0;
}
448
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
449

450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
{
	struct pglist_data *pgdat;
	unsigned long zones_size[MAX_NR_ZONES] = {0};
	unsigned long zholes_size[MAX_NR_ZONES] = {0};
	unsigned long start_pfn = start >> PAGE_SHIFT;

	pgdat = arch_alloc_nodedata(nid);
	if (!pgdat)
		return NULL;

	arch_refresh_nodedata(nid, pgdat);

	/* we can use NODE_DATA(nid) from here */

	/* init node's zones as empty zones, we don't have any present pages.*/
466
	free_area_init_node(nid, zones_size, start_pfn, zholes_size);
467
468
469
470
471
472
473
474
475
476
477

	return pgdat;
}

static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
	arch_refresh_nodedata(nid, NULL);
	arch_free_nodedata(pgdat);
	return;
}

478

Al Viro's avatar
Al Viro committed
479
480
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
int __ref add_memory(int nid, u64 start, u64 size)
481
{
482
483
	pg_data_t *pgdat = NULL;
	int new_pgdat = 0;
484
	struct resource *res;
485
486
	int ret;

487
488
489
490
	res = register_memory_resource(start, size);
	if (!res)
		return -EEXIST;

491
492
493
494
495
496
497
	if (!node_online(nid)) {
		pgdat = hotadd_new_pgdat(nid, start);
		if (!pgdat)
			return -ENOMEM;
		new_pgdat = 1;
	}

498
499
500
	/* call arch's memory hotadd */
	ret = arch_add_memory(nid, start, size);

501
502
503
	if (ret < 0)
		goto error;

504
	/* we online node here. we can't roll back from here. */
505
506
	node_set_online(nid);

507
508
509
510
511
512
513
514
515
516
	if (new_pgdat) {
		ret = register_one_node(nid);
		/*
		 * If sysfs file of new node can't create, cpu on the node
		 * can't be hot-added. There is no rollback way now.
		 * So, check by BUG_ON() to catch it reluctantly..
		 */
		BUG_ON(ret);
	}

517
518
519
520
521
	return ret;
error:
	/* rollback pgdat allocation and others */
	if (new_pgdat)
		rollback_node_hotadd(nid, pgdat);
522
523
	if (res)
		release_memory_resource(res);
524

525
526
527
	return ret;
}
EXPORT_SYMBOL_GPL(add_memory);
528
529

#ifdef CONFIG_MEMORY_HOTREMOVE
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
/*
 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
 * set and the size of the free page is given by page_order(). Using this,
 * the function determines if the pageblock contains only free pages.
 * Due to buddy contraints, a free page at least the size of a pageblock will
 * be located at the start of the pageblock
 */
static inline int pageblock_free(struct page *page)
{
	return PageBuddy(page) && page_order(page) >= pageblock_order;
}

/* Return the start of the next active pageblock after a given page */
static struct page *next_active_pageblock(struct page *page)
{
	int pageblocks_stride;

	/* Ensure the starting page is pageblock-aligned */
	BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));

	/* Move forward by at least 1 * pageblock_nr_pages */
	pageblocks_stride = 1;

	/* If the entire pageblock is free, move to the end of free page */
	if (pageblock_free(page))
		pageblocks_stride += page_order(page) - pageblock_order;

	return page + (pageblocks_stride * pageblock_nr_pages);
}

/* Checks if this range of memory is likely to be hot-removable. */
int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
	int type;
	struct page *page = pfn_to_page(start_pfn);
	struct page *end_page = page + nr_pages;

	/* Check the starting page of each pageblock within the range */
	for (; page < end_page; page = next_active_pageblock(page)) {
		type = get_pageblock_migratetype(page);

		/*
		 * A pageblock containing MOVABLE or free pages is considered
		 * removable
		 */
		if (type != MIGRATE_MOVABLE && !pageblock_free(page))
			return 0;

		/*
		 * A pageblock starting with a PageReserved page is not
		 * considered removable.
		 */
		if (PageReserved(page))
			return 0;
	}

	/* All pageblocks in the memory block are likely to be hot-removable */
	return 1;
}

590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
/*
 * Confirm all pages in a range [start, end) is belongs to the same zone.
 */
static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct zone *zone = NULL;
	struct page *page;
	int i;
	for (pfn = start_pfn;
	     pfn < end_pfn;
	     pfn += MAX_ORDER_NR_PAGES) {
		i = 0;
		/* This is just a CONFIG_HOLES_IN_ZONE check.*/
		while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
			i++;
		if (i == MAX_ORDER_NR_PAGES)
			continue;
		page = pfn_to_page(pfn + i);
		if (zone && page_zone(page) != zone)
			return 0;
		zone = page_zone(page);
	}
	return 1;
}

/*
 * Scanning pfn is much easier than scanning lru list.
 * Scan pfn from start to end and Find LRU page.
 */
int scan_lru_pages(unsigned long start, unsigned long end)
{
	unsigned long pfn;
	struct page *page;
	for (pfn = start; pfn < end; pfn++) {
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);
			if (PageLRU(page))
				return pfn;
		}
	}
	return 0;
}

static struct page *
635
hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
636
{
637
638
	/* This should be improooooved!! */
	return alloc_page(GFP_HIGHUSER_MOVABLE);
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
}

#define NR_OFFLINE_AT_ONCE_PAGES	(256)
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct page *page;
	int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
	int not_managed = 0;
	int ret = 0;
	LIST_HEAD(source);

	for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);
		if (!page_count(page))
			continue;
		/*
		 * We can skip free pages. And we can only deal with pages on
		 * LRU.
		 */
662
		ret = isolate_lru_page(page);
663
		if (!ret) { /* Success */
664
			list_add_tail(&page->lru, &source);
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
			move_pages--;
		} else {
			/* Becasue we don't have big zone->lock. we should
			   check this again here. */
			if (page_count(page))
				not_managed++;
#ifdef CONFIG_DEBUG_VM
			printk(KERN_INFO "removing from LRU failed"
					 " %lx/%d/%lx\n",
				pfn, page_count(page), page->flags);
#endif
		}
	}
	ret = -EBUSY;
	if (not_managed) {
		if (!list_empty(&source))
			putback_lru_pages(&source);
		goto out;
	}
	ret = 0;
	if (list_empty(&source))
		goto out;
	/* this function returns # of failed pages */
	ret = migrate_pages(&source, hotremove_migrate_alloc, 0);

out:
	return ret;
}

/*
 * remove from free_area[] and mark all as Reserved.
 */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
			void *data)
{
	__offline_isolated_pages(start, start + nr_pages);
	return 0;
}

static void
offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	walk_memory_resource(start_pfn, end_pfn - start_pfn, NULL,
				offline_isolated_pages_cb);
}

/*
 * Check all pages in range, recoreded as memory resource, are isolated.
 */
static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
			void *data)
{
	int ret;
	long offlined = *(long *)data;
	ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);
	offlined = nr_pages;
	if (!ret)
		*(long *)data += offlined;
	return ret;
}

static long
check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
{
	long offlined = 0;
	int ret;

	ret = walk_memory_resource(start_pfn, end_pfn - start_pfn, &offlined,
			check_pages_isolated_cb);
	if (ret < 0)
		offlined = (long)ret;
	return offlined;
}

int offline_pages(unsigned long start_pfn,
		  unsigned long end_pfn, unsigned long timeout)
{
	unsigned long pfn, nr_pages, expire;
	long offlined_pages;
746
	int ret, drain, retry_max, node;
747
	struct zone *zone;
748
	struct memory_notify arg;
749
750
751
752
753
754
755
756
757
758
759

	BUG_ON(start_pfn >= end_pfn);
	/* at least, alignment against pageblock is necessary */
	if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
		return -EINVAL;
	if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
		return -EINVAL;
	/* This makes hotplug much easier...and readable.
	   we assume this for now. .*/
	if (!test_pages_in_a_zone(start_pfn, end_pfn))
		return -EINVAL;
760
761
762
763
764

	zone = page_zone(pfn_to_page(start_pfn));
	node = zone_to_nid(zone);
	nr_pages = end_pfn - start_pfn;

765
766
767
768
	/* set above range as isolated */
	ret = start_isolate_page_range(start_pfn, end_pfn);
	if (ret)
		return ret;
769
770
771
772
773
774
775
776
777
778
779
780

	arg.start_pfn = start_pfn;
	arg.nr_pages = nr_pages;
	arg.status_change_nid = -1;
	if (nr_pages >= node_present_pages(node))
		arg.status_change_nid = node;

	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_removal;

781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
	pfn = start_pfn;
	expire = jiffies + timeout;
	drain = 0;
	retry_max = 5;
repeat:
	/* start memory hot removal */
	ret = -EAGAIN;
	if (time_after(jiffies, expire))
		goto failed_removal;
	ret = -EINTR;
	if (signal_pending(current))
		goto failed_removal;
	ret = 0;
	if (drain) {
		lru_add_drain_all();
		flush_scheduled_work();
		cond_resched();
798
		drain_all_pages();
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
	}

	pfn = scan_lru_pages(start_pfn, end_pfn);
	if (pfn) { /* We have page on LRU */
		ret = do_migrate_range(pfn, end_pfn);
		if (!ret) {
			drain = 1;
			goto repeat;
		} else {
			if (ret < 0)
				if (--retry_max == 0)
					goto failed_removal;
			yield();
			drain = 1;
			goto repeat;
		}
	}
	/* drain all zone's lru pagevec, this is asyncronous... */
	lru_add_drain_all();
	flush_scheduled_work();
	yield();
	/* drain pcp pages , this is synchrouns. */
821
	drain_all_pages();
822
823
824
825
826
827
828
829
830
831
	/* check again */
	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
	if (offlined_pages < 0) {
		ret = -EBUSY;
		goto failed_removal;
	}
	printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
	/* Ok, all of our target is islaoted.
	   We cannot do rollback at this point. */
	offline_isolated_pages(start_pfn, end_pfn);
832
833
	/* reset pagetype flags and makes migrate type to be MOVABLE */
	undo_isolate_page_range(start_pfn, end_pfn);
834
835
836
837
	/* removal success */
	zone->present_pages -= offlined_pages;
	zone->zone_pgdat->node_present_pages -= offlined_pages;
	totalram_pages -= offlined_pages;
838

839
840
841
	setup_per_zone_wmarks();
	calculate_zone_inactive_ratio(zone);

842
843
	vm_total_pages = nr_free_pagecache_pages();
	writeback_set_ratelimit();
844
845

	memory_notify(MEM_OFFLINE, &arg);
846
847
848
849
850
	return 0;

failed_removal:
	printk(KERN_INFO "memory offlining %lx to %lx failed\n",
		start_pfn, end_pfn);
851
	memory_notify(MEM_CANCEL_OFFLINE, &arg);
852
853
	/* pushback to free area */
	undo_isolate_page_range(start_pfn, end_pfn);
854

855
856
	return ret;
}
857
858
859
860
861
862
863
864
865

int remove_memory(u64 start, u64 size)
{
	unsigned long start_pfn, end_pfn;

	start_pfn = PFN_DOWN(start);
	end_pfn = start_pfn + PFN_DOWN(size);
	return offline_pages(start_pfn, end_pfn, 120 * HZ);
}
866
867
868
869
870
#else
int remove_memory(u64 start, u64 size)
{
	return -EINVAL;
}
871
#endif /* CONFIG_MEMORY_HOTREMOVE */
872
EXPORT_SYMBOL_GPL(remove_memory);