Commit bb776296 authored by Linus Torvalds

Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (22 commits)
  x86: fix system without memory on node0
  x86, mm: Fix node_possible_map logic
  mm, x86: remove MEMORY_HOTPLUG_RESERVE related code
  x86: make sparse mem work in non-NUMA mode
  x86: process.c, remove useless headers
  x86: merge process.c a bit
  x86: use sparse_memory_present_with_active_regions() on UMA
  x86: unify 64-bit UMA and NUMA paging_init()
  x86: Allow 1MB of slack between the e820 map and SRAT, not 4GB
  x86: Sanity check the e820 against the SRAT table using e820 map only
  x86: clean up and print out initial max_pfn_mapped
  x86/pci: remove rounding quirk from e820_setup_gap()
  x86, e820, pci: reserve extra free space near end of RAM
  x86: fix typo in address space documentation
  x86: 46 bit physical address support on 64 bits
  x86, mm: fault.c, use printk_once() in is_errata93()
  x86: move per-cpu mmu_gathers to mm/init.c
  x86: move max_pfn_mapped and max_low_pfn_mapped to setup.c
  x86: unify noexec handling
  x86: remove (null) in /sys kernel_page_tables
  ...
parents 48c72d1a 35d5a9a6
......@@ -150,11 +150,6 @@ NUMA
Otherwise, the remaining system RAM is allocated to an
additional node.
numa=hotadd=percent
Only allow hotadd memory to preallocate page structures up to
percent of already available memory.
numa=hotadd=0 will disable hotadd memory.
ACPI
acpi=off Don't enable ACPI
......
......@@ -6,10 +6,11 @@ Virtual memory map with 4 level page tables:
0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
hole caused by [48:63] sign extension
ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole
ffff880000000000 - ffffc0ffffffffff (=57 TB) direct mapping of all phys. memory
ffffc10000000000 - ffffc1ffffffffff (=40 bits) hole
ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space
ffffe20000000000 - ffffe2ffffffffff (=40 bits) virtual memory map (1TB)
ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
... unused hole ...
ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0
ffffffffa0000000 - fffffffffff00000 (=1536 MB) module mapping space
......
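A quick cross-check of the new layout (my own arithmetic, not part of the patch): the region sizes listed above follow directly from the hex bounds. A standalone userspace sketch:

#include <stdio.h>

int main(void)
{
	/* direct mapping: ffff880000000000 - ffffc7ffffffffff */
	unsigned long long direct = 0xffffc80000000000ULL - 0xffff880000000000ULL;
	/* vmalloc/ioremap: ffffc90000000000 - ffffe8ffffffffff */
	unsigned long long vmalloc = 0xffffe90000000000ULL - 0xffffc90000000000ULL;

	printf("direct map: %llu TB\n", direct >> 40);	/* 64 TB = 2^46 bytes */
	printf("vmalloc:    %llu TB\n", vmalloc >> 40);	/* 32 TB = 2^45 bytes */
	return 0;
}

The 64 TB direct map is exactly what the new 46-bit __PHYSICAL_MASK_SHIFT and MAX_PHYSMEM_BITS later in this diff allow.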
......@@ -17,9 +17,6 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks,
extern void numa_init_array(void);
extern int numa_off;
extern void srat_reserve_add_area(int nodeid);
extern int hotadd_percent;
extern s16 apicid_to_node[MAX_LOCAL_APIC];
extern unsigned long numa_free_all_bootmem(void);
......@@ -27,6 +24,13 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
unsigned long end);
#ifdef CONFIG_NUMA
/*
* Too small node sizes may confuse the VM badly. Usually they
* result from BIOS bugs. So don't recognize nodes as standalone
* NUMA entities that have less than this amount of RAM listed:
*/
#define NODE_MIN_SIZE (4*1024*1024)
extern void __init init_cpu_to_node(void);
extern void __cpuinit numa_set_node(int cpu, int node);
extern void __cpuinit numa_clear_node(int cpu);
......
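To make the cutoff concrete, here is a minimal userspace sketch (node_big_enough() is a hypothetical helper, not a kernel function) of how a 2 MB SRAT entry from a buggy BIOS would be rejected:

#include <stdio.h>

#define NODE_MIN_SIZE	(4*1024*1024)

/* Hypothetical helper: is a [start, end) RAM range big enough to be a node? */
static int node_big_enough(unsigned long long start, unsigned long long end)
{
	return (end - start) >= NODE_MIN_SIZE;
}

int main(void)
{
	/* a 2 MB range, as a buggy BIOS might report in the SRAT */
	printf("%d\n", node_big_enough(0x100000000ULL, 0x100200000ULL)); /* 0 */
	return 0;
}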
......@@ -54,10 +54,6 @@ extern unsigned int __VMALLOC_RESERVE;
extern int sysctl_legacy_va_layout;
extern void find_low_pfn_range(void);
extern unsigned long init_memory_mapping(unsigned long start,
unsigned long end);
extern void initmem_init(unsigned long, unsigned long);
extern void free_initmem(void);
extern void setup_bootmem_allocator(void);
#endif /* !__ASSEMBLY__ */
......
......@@ -39,7 +39,7 @@
#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
/* See Documentation/x86_64/mm.txt for a description of the memory map. */
/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
#define __PHYSICAL_MASK_SHIFT 46
#define __VIRTUAL_MASK_SHIFT 48
......@@ -63,12 +63,6 @@ extern unsigned long __phys_addr(unsigned long);
#define vmemmap ((struct page *)VMEMMAP_START)
extern unsigned long init_memory_mapping(unsigned long start,
unsigned long end);
extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
extern void free_initmem(void);
extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
......
......@@ -46,6 +46,12 @@ extern int devmem_is_allowed(unsigned long pagenr);
extern unsigned long max_low_pfn_mapped;
extern unsigned long max_pfn_mapped;
extern unsigned long init_memory_mapping(unsigned long start,
unsigned long end);
extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
extern void free_initmem(void);
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PAGE_DEFS_H */
......@@ -51,11 +51,11 @@ typedef struct { pteval_t pte; } pte_t;
#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE - 1))
/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
#define VMALLOC_START _AC(0xffffc20000000000, UL)
#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
#define VMEMMAP_START _AC(0xffffe20000000000, UL)
#define VMALLOC_START _AC(0xffffc90000000000, UL)
#define VMALLOC_END _AC(0xffffe8ffffffffff, UL)
#define VMEMMAP_START _AC(0xffffea0000000000, UL)
#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
#define MODULES_END _AC(0xffffffffff000000, UL)
#define MODULES_LEN (MODULES_END - MODULES_VADDR)
......
......@@ -273,7 +273,6 @@ typedef struct page *pgtable_t;
extern pteval_t __supported_pte_mask;
extern int nx_enabled;
extern void set_nx(void);
#define pgprot_writecombine pgprot_writecombine
extern pgprot_t pgprot_writecombine(pgprot_t prot);
......
......@@ -27,7 +27,7 @@
#else /* CONFIG_X86_32 */
# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
# define MAX_PHYSADDR_BITS 44
# define MAX_PHYSMEM_BITS 44 /* Can be max 45 bits */
# define MAX_PHYSMEM_BITS 46
#endif
#endif /* CONFIG_SPARSEMEM */
......
......@@ -2,6 +2,7 @@
#define _ASM_X86_TRAPS_H
#include <asm/debugreg.h>
#include <asm/siginfo.h> /* TRAP_TRACE, ... */
#ifdef CONFIG_X86_32
#define dotraplinkage
......
......@@ -617,7 +617,7 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
*/
__init void e820_setup_gap(void)
{
unsigned long gapstart, gapsize, round;
unsigned long gapstart, gapsize;
int found;
gapstart = 0x10000000;
......@@ -635,14 +635,9 @@ __init void e820_setup_gap(void)
#endif
/*
* See how much we want to round up: start off with
* rounding to the next 1MB area.
* e820_reserve_resources_late() already protects stolen RAM
*/
round = 0x100000;
while ((gapsize >> 4) > round)
round += round;
/* Fun with two's complement */
pci_mem_start = (gapstart + round) & -round;
pci_mem_start = gapstart;
printk(KERN_INFO
"Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
......@@ -1371,6 +1366,23 @@ void __init e820_reserve_resources(void)
}
}
/* How much should we pad the end of a RAM region, depending on where it is? */
static unsigned long ram_alignment(resource_size_t pos)
{
unsigned long mb = pos >> 20;
/* To 64kB in the first megabyte */
if (!mb)
return 64*1024;
/* To 1MB in the first 16MB */
if (mb < 16)
return 1024*1024;
/* To 32MB for anything above that */
return 32*1024*1024;
}
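To illustrate what this padding does (a userspace sketch; the round_up() macro below stands in for the kernel's power-of-two helper): a RAM region ending at 639 kB gets padded up to 640 kB, and one ending at 0xcff80000 gets padded to the next 32 MB boundary.

#include <stdio.h>

/* stand-in for the kernel's round_up(); valid for power-of-two alignments */
#define round_up(x, a)	(((x) + (a) - 1) & ~((unsigned long long)(a) - 1))

static unsigned long ram_alignment(unsigned long long pos)
{
	unsigned long mb = pos >> 20;

	if (!mb)
		return 64*1024;		/* to 64kB in the first megabyte */
	if (mb < 16)
		return 1024*1024;	/* to 1MB in the first 16MB */
	return 32*1024*1024;		/* to 32MB for anything above that */
}

int main(void)
{
	unsigned long long ends[] = { 0x9fc00ULL, 0xcff80000ULL };

	for (int i = 0; i < 2; i++)
		printf("%#llx -> %#llx\n", ends[i],
		       (unsigned long long)round_up(ends[i], ram_alignment(ends[i])));
	/* prints 0x9fc00 -> 0xa0000 and 0xcff80000 -> 0xd0000000 */
	return 0;
}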
void __init e820_reserve_resources_late(void)
{
int i;
......@@ -1382,6 +1394,24 @@ void __init e820_reserve_resources_late(void)
insert_resource_expand_to_fit(&iomem_resource, res);
res++;
}
/*
* Try to bump up RAM regions to reasonable boundaries to
* avoid stolen RAM:
*/
for (i = 0; i < e820.nr_map; i++) {
struct e820entry *entry = &e820_saved.map[i];
resource_size_t start, end;
if (entry->type != E820_RAM)
continue;
start = entry->addr + entry->size;
end = round_up(start, ram_alignment(start));
if (start == end)
continue;
reserve_region_with_split(&iomem_resource, start,
end - 1, "RAM buffer");
}
}
char *__init default_machine_specific_memory_setup(void)
......
......@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/clockchips.h>
#include <linux/random.h>
#include <trace/power.h>
#include <asm/system.h>
#include <asm/apic.h>
......@@ -614,3 +615,16 @@ static int __init idle_setup(char *str)
}
early_param("idle", idle_setup);
unsigned long arch_align_stack(unsigned long sp)
{
if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
sp -= get_random_int() % 8192;
return sp & ~0xf;
}
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
unsigned long range_end = mm->brk + 0x02000000;
return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}
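The arithmetic above gives the stack up to 8 kB of downward jitter while keeping the 16-byte ABI alignment, and lets the heap start anywhere in a 32 MB (0x02000000) window above mm->brk. A userspace sketch of the stack half, with rand() standing in for get_random_int():

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	unsigned long sp = 0x7fffffffe000UL;	/* made-up initial stack top */

	sp -= rand() % 8192;	/* up to 8 kB of downward jitter */
	sp &= ~0xfUL;		/* preserve 16-byte alignment */
	printf("randomized sp: %#lx\n", sp);
	return 0;
}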
......@@ -9,8 +9,6 @@
* This file handles the architecture-dependent parts of process handling.
*/
#include <stdarg.h>
#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
......@@ -33,7 +31,6 @@
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/personality.h>
#include <linux/tick.h>
#include <linux/percpu.h>
......@@ -497,15 +494,3 @@ unsigned long get_wchan(struct task_struct *p)
return 0;
}
unsigned long arch_align_stack(unsigned long sp)
{
if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
sp -= get_random_int() % 8192;
return sp & ~0xf;
}
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
unsigned long range_end = mm->brk + 0x02000000;
return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}
......@@ -14,8 +14,6 @@
* This file handles the architecture-dependent parts of process handling.
*/
#include <stdarg.h>
#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
......@@ -32,7 +30,6 @@
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
......@@ -660,15 +657,3 @@ long sys_arch_prctl(int code, unsigned long addr)
return do_arch_prctl(current, code, addr);
}
unsigned long arch_align_stack(unsigned long sp)
{
if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
sp -= get_random_int() % 8192;
return sp & ~0xf;
}
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
unsigned long range_end = mm->brk + 0x02000000;
return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}
......@@ -112,6 +112,14 @@
#define ARCH_SETUP
#endif
/*
* end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
* The direct mapping extends to max_pfn_mapped, so that we can directly access
* apertures, ACPI and other tables without having to play with fixmaps.
*/
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;
RESERVE_BRK(dmi_alloc, 65536);
unsigned int boot_cpu_id __read_mostly;
......@@ -860,12 +868,16 @@ void __init setup_arch(char **cmdline_p)
max_low_pfn = max_pfn;
high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
#endif
#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
setup_bios_corruption_check();
#endif
printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
max_pfn_mapped<<PAGE_SHIFT);
reserve_brk();
/* max_pfn_mapped is updated here */
......
......@@ -425,6 +425,14 @@ void __init setup_per_cpu_areas(void)
early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
#endif
#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
/*
* Make sure the boot CPU's node_number is correct when the boot CPU
* is on a node that has no memory installed
*/
per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
#endif
/* Setup node to cpumask map */
setup_node_to_cpumask_map();
......
......@@ -161,13 +161,14 @@ static void note_page(struct seq_file *m, struct pg_state *st,
st->current_address >= st->marker[1].start_address) {
const char *unit = units;
unsigned long delta;
int width = sizeof(unsigned long) * 2;
/*
* Now print the actual finished series
*/
seq_printf(m, "0x%p-0x%p ",
(void *)st->start_address,
(void *)st->current_address);
seq_printf(m, "0x%0*lx-0x%0*lx ",
width, st->start_address,
width, st->current_address);
delta = (st->current_address - st->start_address) >> 10;
while (!(delta & 1023) && unit[1]) {
......
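The switch from %p to %0*lx zero-pads every address to the full word width (16 hex digits on 64-bit, 8 on 32-bit), so the columns of the page-table dump line up. A quick userspace illustration for a 64-bit host:

#include <stdio.h>

int main(void)
{
	int width = sizeof(unsigned long) * 2;	/* 16 on 64-bit */
	unsigned long addr = 0xffff880000000000UL;

	printf("0x%0*lx\n", width, addr);	/* prints 0xffff880000000000 */
	return 0;
}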
......@@ -3,40 +3,16 @@
* Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
* Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
*/
#include <linux/interrupt.h>
#include <linux/mmiotrace.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/vt_kern.h>
#include <linux/signal.h>
#include <linux/kernel.h>
#include <linux/ptrace.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/errno.h>
#include <linux/magic.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/mman.h>
#include <linux/tty.h>
#include <linux/smp.h>
#include <linux/mm.h>
#include <asm-generic/sections.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
#include <asm/segment.h>
#include <asm/system.h>
#include <asm/proto.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <linux/magic.h> /* STACK_END_MAGIC */
#include <linux/sched.h> /* test_thread_flag(), ... */
#include <linux/kdebug.h> /* oops_begin/end, ... */
#include <linux/module.h> /* search_exception_table */
#include <linux/bootmem.h> /* max_low_pfn */
#include <linux/kprobes.h> /* __kprobes, ... */
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
/*
* Page fault error code bits:
......@@ -538,8 +514,6 @@ static void dump_pagetable(unsigned long address)
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
static int once;
if (address != regs->ip)
return 0;
......@@ -549,10 +523,7 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
address |= 0xffffffffUL << 32;
if ((address >= (u64)_stext && address <= (u64)_etext) ||
(address >= MODULES_VADDR && address <= MODULES_END)) {
if (!once) {
printk(errata93_warning);
once = 1;
}
printk_once(errata93_warning);
regs->ip = address;
return 1;
}
......
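printk_once() folds the open-coded static flag into a helper; it expands to roughly the following (paraphrased from include/linux/kernel.h of this era):

#define printk_once(x...) ({			\
	static bool __print_once;		\
						\
	if (!__print_once) {			\
		__print_once = true;		\
		printk(x);			\
	}					\
})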
......@@ -11,6 +11,9 @@
#include <asm/setup.h>
#include <asm/system.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
unsigned long __initdata e820_table_start;
unsigned long __meminitdata e820_table_end;
......@@ -24,6 +27,69 @@ int direct_gbpages
#endif
;
int nx_enabled;
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
static int disable_nx __cpuinitdata;
/*
* noexec = on|off
*
* Control non-executable mappings for processes.
*
* on Enable
* off Disable
*/
static int __init noexec_setup(char *str)
{
if (!str)
return -EINVAL;
if (!strncmp(str, "on", 2)) {
__supported_pte_mask |= _PAGE_NX;
disable_nx = 0;
} else if (!strncmp(str, "off", 3)) {
disable_nx = 1;
__supported_pte_mask &= ~_PAGE_NX;
}
return 0;
}
early_param("noexec", noexec_setup);
#endif
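Usage note: with the handler unified here, "noexec=off" on the kernel command line clears _PAGE_NX from __supported_pte_mask identically on 32-bit PAE and 64-bit, while "noexec=on" forces it on; on NX-capable hardware, set_nx() and check_efer() below keep EFER.NX and __supported_pte_mask consistent with whichever option was given.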
#ifdef CONFIG_X86_PAE
static void __init set_nx(void)
{
unsigned int v[4], l, h;
if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
if ((v[3] & (1 << 20)) && !disable_nx) {
rdmsr(MSR_EFER, l, h);
l |= EFER_NX;
wrmsr(MSR_EFER, l, h);
nx_enabled = 1;
__supported_pte_mask |= _PAGE_NX;
}
}
}
#else
static inline void set_nx(void)
{
}
#endif
#ifdef CONFIG_X86_64
void __cpuinit check_efer(void)
{
unsigned long efer;
rdmsrl(MSR_EFER, efer);
if (!(efer & EFER_NX) || disable_nx)
__supported_pte_mask &= ~_PAGE_NX;
}
#endif
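For reference (architectural facts, not part of this patch): MSR_EFER is MSR 0xc0000080 and EFER_NX is bit 11 of it, so check_efer() drops _PAGE_NX from __supported_pte_mask whenever the NX-enable bit was left clear or noexec=off was given.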
static void __init find_early_table_space(unsigned long end, int use_pse,
int use_gbpages)
{
......@@ -67,12 +133,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
*/
#ifdef CONFIG_X86_32
start = 0x7000;
e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
tables, PAGE_SIZE);
#else /* CONFIG_X86_64 */
#else
start = 0x8000;
e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE);
#endif
e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
tables, PAGE_SIZE);
if (e820_table_start == -1UL)
panic("Cannot find space for the kernel page tables");
......@@ -160,12 +225,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
use_gbpages = direct_gbpages;
#endif
#ifdef CONFIG_X86_32
#ifdef CONFIG_X86_PAE
set_nx();
if (nx_enabled)
printk(KERN_INFO "NX (Execute Disable) protection: active\n");
#endif
/* Enable PSE if available */
if (cpu_has_pse)
......@@ -176,7 +238,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
set_in_cr4(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
}
#endif
if (use_gbpages)
page_size_mask |= 1 << PG_LEVEL_1G;
......
......@@ -49,12 +49,9 @@
#include <asm/paravirt.h>
#include <asm/setup.h>
#include <asm/cacheflush.h>
#include <asm/page_types.h>
#include <asm/init.h>
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
unsigned long highstart_pfn, highend_pfn;
static noinline int do_test_wp_bit(void);
......@@ -587,61 +584,9 @@ void zap_low_mappings(void)
flush_tlb_all();
}
int nx_enabled;
pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
EXPORT_SYMBOL_GPL(__supported_pte_mask);
#ifdef CONFIG_X86_PAE
static int disable_nx __initdata;
/*
* noexec = on|off
*
* Control non-executable mappings.
*
* on Enable
* off Disable
*/
static int __init noexec_setup(char *str)
{
if (!str || !strcmp(str, "on")) {
if (cpu_has_nx) {
__supported_pte_mask |= _PAGE_NX;
disable_nx = 0;
}
} else {
if (!strcmp(str, "off")) {
disable_nx = 1;
__supported_pte_mask &= ~_PAGE_NX;
} else {
return -EINVAL;
}
}
return 0;
}
early_param("noexec", noexec_setup);
void __init set_nx(void)
{
unsigned int v[4], l, h;
if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
if ((v[3] & (1 << 20)) && !disable_nx) {
rdmsr(MSR_EFER, l, h);
l |= EFER_NX;
wrmsr(MSR_EFER, l, h);
nx_enabled = 1;
__supported_pte_mask |= _PAGE_NX;
}
}
}