Commit a57ed936 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86/asm changes from Ingo Molnar:
 "The biggest change (by line count) is the unification of the XOR code
  and then the introduction of an additional SSE based XOR assembly
  method.

  The other bigger change is the head_32.S rework/cleanup by Borislav
  Petkov.

  Last but not least there's the usual laundry list of small but
  dangerous (and hopefully perfectly tested) changes to subtle low level
  x86 code, plus cleanups."

* 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86, head_32: Give the 6 label a real name
  x86, head_32: Remove second CPUID detection from default_entry
  x86: Detect CPUID support early at boot
  x86, head_32: Remove i386 pieces
  x86: Require MOVBE feature in cpuid when we use it
  x86: Enable ARCH_USE_BUILTIN_BSWAP
  x86/xor: Add alternative SSE implementation only prefetching once per 64-byte line
  x86/xor: Unify SSE-base xor-block routines
  x86: Fix a typo
  x86/mm: Fix the argument passed to sync_global_pgds()
  x86/mm: Convert update_mmu_cache() and update_mmu_cache_pmd() to functions
  ix86: Tighten asmlinkage_protect() constraints
parents 5800700f 5e2a044d
......@@ -116,6 +116,7 @@ config X86
select MODULES_USE_ELF_RELA if X86_64
select CLONE_BACKWARDS if X86_32
select GENERIC_SIGALTSTACK
select ARCH_USE_BUILTIN_BSWAP
config INSTRUCTION_DECODER
def_bool y
......
......@@ -27,20 +27,20 @@
#define __asmlinkage_protect0(ret) \
__asmlinkage_protect_n(ret)
#define __asmlinkage_protect1(ret, arg1) \
__asmlinkage_protect_n(ret, "g" (arg1))
__asmlinkage_protect_n(ret, "m" (arg1))
#define __asmlinkage_protect2(ret, arg1, arg2) \
__asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2))
__asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2))
#define __asmlinkage_protect3(ret, arg1, arg2, arg3) \
__asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3))
__asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3))
#define __asmlinkage_protect4(ret, arg1, arg2, arg3, arg4) \
__asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
"g" (arg4))
__asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \
"m" (arg4))
#define __asmlinkage_protect5(ret, arg1, arg2, arg3, arg4, arg5) \
__asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
"g" (arg4), "g" (arg5))
__asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \
"m" (arg4), "m" (arg5))
#define __asmlinkage_protect6(ret, arg1, arg2, arg3, arg4, arg5, arg6) \
__asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
"g" (arg4), "g" (arg5), "g" (arg6))
__asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \
"m" (arg4), "m" (arg5), "m" (arg6))
#endif /* CONFIG_X86_32 */
......
......@@ -786,6 +786,18 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
memcpy(dst, src, count * sizeof(pgd_t));
}
/*
* The x86 doesn't have any external MMU info: the kernel page
* tables contain all the necessary information.
*/
static inline void update_mmu_cache(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
}
static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmd)
{
}
#include <asm-generic/pgtable.h>
#endif /* __ASSEMBLY__ */
......
......@@ -66,13 +66,6 @@ do { \
__flush_tlb_one((vaddr)); \
} while (0)
/*
* The i386 doesn't have any external MMU info: the kernel page
* tables contain all the necessary information.
*/
#define update_mmu_cache(vma, address, ptep) do { } while (0)
#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0)
#endif /* !__ASSEMBLY__ */
/*
......
......@@ -142,9 +142,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
#define pte_unmap(pte) ((void)(pte))/* NOP */
#define update_mmu_cache(vma, address, ptep) do { } while (0)
#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0)
/* Encode and de-code a swap entry */
#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
......
......@@ -47,6 +47,12 @@
# define NEED_NOPL 0
#endif
#ifdef CONFIG_MATOM
# define NEED_MOVBE (1<<(X86_FEATURE_MOVBE & 31))
#else
# define NEED_MOVBE 0
#endif
#ifdef CONFIG_X86_64
#ifdef CONFIG_PARAVIRT
/* Paravirtualized systems may not have PSE or PGE available */
......@@ -80,7 +86,7 @@
#define REQUIRED_MASK2 0
#define REQUIRED_MASK3 (NEED_NOPL)
#define REQUIRED_MASK4 0
#define REQUIRED_MASK4 (NEED_MOVBE)
#define REQUIRED_MASK5 0
#define REQUIRED_MASK6 0
#define REQUIRED_MASK7 0
......
#ifdef CONFIG_KMEMCHECK
/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
# include <asm-generic/xor.h>
#elif !defined(_ASM_X86_XOR_H)
#define _ASM_X86_XOR_H
/*
* Optimized RAID-5 checksumming functions for SSE.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* You should have received a copy of the GNU General Public License
* (for example /usr/src/linux/COPYING); if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* Cache avoiding checksumming functions utilizing KNI instructions
* Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
*/
/*
* Based on
* High-speed RAID5 checksumming functions utilizing SSE instructions.
* Copyright (C) 1998 Ingo Molnar.
*/
/*
* x86-64 changes / gcc fixes from Andi Kleen.
* Copyright 2002 Andi Kleen, SuSE Labs.
*
* This hasn't been optimized for the hammer yet, but there are likely
* no advantages to be gotten from x86-64 here anyways.
*/
#include <asm/i387.h>
#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif
#define OFFS(x) "16*("#x")"
#define PF_OFFS(x) "256+16*("#x")"
#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
#define NOP(x)
#define BLK64(pf, op, i) \
pf(i) \
op(i, 0) \
op(i + 1, 1) \
op(i + 2, 2) \
op(i + 3, 3)
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
unsigned long lines = bytes >> 8;
kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
LD(i, 0) \
LD(i + 1, 1) \
PF1(i) \
PF1(i + 2) \
LD(i + 2, 2) \
LD(i + 3, 3) \
PF0(i + 4) \
PF0(i + 6) \
XO1(i, 0) \
XO1(i + 1, 1) \
XO1(i + 2, 2) \
XO1(i + 3, 3) \
ST(i, 0) \
ST(i + 1, 1) \
ST(i + 2, 2) \
ST(i + 3, 3) \
PF0(0)
PF0(2)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines),
[p1] "+r" (p1), [p2] "+r" (p2)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");
kernel_fpu_end();
}
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
unsigned long lines = bytes >> 8;
kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
BLK64(PF0, LD, i) \
BLK64(PF1, XO1, i) \
BLK64(NOP, ST, i) \
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines),
[p1] "+r" (p1), [p2] "+r" (p2)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");
kernel_fpu_end();
}
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
{
unsigned long lines = bytes >> 8;
kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
PF1(i + 2) \
LD(i, 0) \
LD(i + 1, 1) \
LD(i + 2, 2) \
LD(i + 3, 3) \
PF2(i) \
PF2(i + 2) \
PF0(i + 4) \
PF0(i + 6) \
XO1(i, 0) \
XO1(i + 1, 1) \
XO1(i + 2, 2) \
XO1(i + 3, 3) \
XO2(i, 0) \
XO2(i + 1, 1) \
XO2(i + 2, 2) \
XO2(i + 3, 3) \
ST(i, 0) \
ST(i + 1, 1) \
ST(i + 2, 2) \
ST(i + 3, 3) \
PF0(0)
PF0(2)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines),
[p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");
kernel_fpu_end();
}
static void
xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
{
unsigned long lines = bytes >> 8;
kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
BLK64(PF0, LD, i) \
BLK64(PF1, XO1, i) \
BLK64(PF2, XO2, i) \
BLK64(NOP, ST, i) \
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines),
[p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");
kernel_fpu_end();
}
static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
{
unsigned long lines = bytes >> 8;
kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
PF1(i + 2) \
LD(i, 0) \
LD(i + 1, 1) \
LD(i + 2, 2) \
LD(i + 3, 3) \
PF2(i) \
PF2(i + 2) \
XO1(i, 0) \
XO1(i + 1, 1) \
XO1(i + 2, 2) \
XO1(i + 3, 3) \
PF3(i) \
PF3(i + 2) \
PF0(i + 4) \
PF0(i + 6) \
XO2(i, 0) \
XO2(i + 1, 1) \
XO2(i + 2, 2) \
XO2(i + 3, 3) \
XO3(i, 0) \
XO3(i + 1, 1) \
XO3(i + 2, 2) \
XO3(i + 3, 3) \
ST(i, 0) \
ST(i + 1, 1) \
ST(i + 2, 2) \
ST(i + 3, 3) \
PF0(0)
PF0(2)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" add %[inc], %[p4] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines), [p1] "+r" (p1),
[p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");
kernel_fpu_end();
}
static void
xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
{
unsigned long lines = bytes >> 8;
kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
BLK64(PF0, LD, i) \
BLK64(PF1, XO1, i) \
BLK64(PF2, XO2, i) \
BLK64(PF3, XO3, i) \
BLK64(NOP, ST, i) \
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" add %[inc], %[p4] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines), [p1] "+r" (p1),
[p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");
kernel_fpu_end();
}
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
unsigned long lines = bytes >> 8;
kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
PF1(i + 2) \
LD(i, 0) \
LD(i + 1, 1) \
LD(i + 2, 2) \
LD(i + 3, 3) \
PF2(i) \
PF2(i + 2) \
XO1(i, 0) \
XO1(i + 1, 1) \
XO1(i + 2, 2) \
XO1(i + 3, 3) \
PF3(i) \
PF3(i + 2) \
XO2(i, 0) \
XO2(i + 1, 1) \
XO2(i + 2, 2) \
XO2(i + 3, 3) \
PF4(i) \
PF4(i + 2) \
PF0(i + 4) \
PF0(i + 6) \
XO3(i, 0) \
XO3(i + 1, 1) \
XO3(i + 2, 2) \
XO3(i + 3, 3) \
XO4(i, 0) \
XO4(i + 1, 1) \
XO4(i + 2, 2) \
XO4(i + 3, 3) \
ST(i, 0) \
ST(i + 1, 1) \
ST(i + 2, 2) \
ST(i + 3, 3) \
PF0(0)
PF0(2)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" add %[inc], %[p4] ;\n"
" add %[inc], %[p5] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
[p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");
kernel_fpu_end();
}
static void
xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
unsigned long lines = bytes >> 8;
kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
BLK64(PF0, LD, i) \
BLK64(PF1, XO1, i) \
BLK64(PF2, XO2, i) \
BLK64(PF3, XO3, i) \
BLK64(PF4, XO4, i) \
BLK64(NOP, ST, i) \
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" add %[inc], %[p4] ;\n"
" add %[inc], %[p5] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
[p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");
kernel_fpu_end();
}
static struct xor_block_template xor_block_sse_pf64 = {
.name = "prefetch64-sse",
.do_2 = xor_sse_2_pf64,
.do_3 = xor_sse_3_pf64,
.do_4 = xor_sse_4_pf64,
.do_5 = xor_sse_5_pf64,
};
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK
#undef XOR_CONSTANT_CONSTRAINT
#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif
#endif
#define XOR_SELECT_TEMPLATE(FASTEST) \
AVX_SELECT(FASTEST)
#endif /* _ASM_X86_XOR_H */
......@@ -2,7 +2,7 @@
#define _ASM_X86_XOR_32_H
/*
* Optimized RAID-5 checksumming functions for MMX and SSE.
* Optimized RAID-5 checksumming functions for MMX.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
......@@ -529,290 +529,6 @@ static struct xor_block_template xor_block_p5_mmx = {
.do_5 = xor_p5_mmx_5,
};
/*
* Cache avoiding checksumming functions utilizing KNI instructions
* Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
*/
#define OFFS(x) "16*("#x")"
#define PF_OFFS(x) "256+16*("#x")"
#define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n"
#define LD(x, y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
#define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n"
#define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n"
#define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n"
#define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n"
#define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n"
#define XO1(x, y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
#define XO2(x, y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"