1. 23 Jun, 2009 1 commit
  2. 17 Jun, 2009 3 commits
    • Wu Fengguang's avatar
      mm: introduce PageHuge() for testing huge/gigantic pages · 20a0307c
      Wu Fengguang authored
      
      
      A series of patches to enhance the /proc/pagemap interface and to add a
      userspace executable which can be used to present the pagemap data.
      
      Export 10 more flags to end users (and more for kernel developers):
      
              11. KPF_MMAP            (pseudo flag) memory mapped page
              12. KPF_ANON            (pseudo flag) memory mapped page (anonymous)
              13. KPF_SWAPCACHE       page is in swap cache
              14. KPF_SWAPBACKED      page is swap/RAM backed
              15. KPF_COMPOUND_HEAD   (*)
              16. KPF_COMPOUND_TAIL   (*)
              17. KPF_HUGE		hugeTLB pages
              18. KPF_UNEVICTABLE     page is in the unevictable LRU list
              19. KPF_HWPOISON        hardware detected corruption
              20. KPF_NOPAGE          (pseudo flag) no page frame at the address
      
              (*) For compound pages, exporting _both_ head/tail info enables
                  users to tell where a compound page starts/ends, and its order.
      
      a simple demo of the page-types tool
      
      # ./page-types -h
      page-types [options]
                  -r|--raw                  Raw mode, for kernel developers
                  -a|--addr    addr-spec    Walk a range of pages
                  -b|--bits    bits-spec    Walk pages with specified bits
                  -l|--list                 Show page details in ranges
                  -L|--list-each            Show page details one by one
                  -N|--no-summary           Don't show summay info
                  -h|--help                 Show this usage message
      addr-spec:
                  N                         one page at offset N (unit: pages)
                  N+M                       pages range from N to N+M-1
                  N,M                       pages range from N to M-1
                  N,                        pages range from N to end
                  ,M                        pages range from 0 to M
      bits-spec:
                  bit1,bit2                 (flags & (bit1|bit2)) != 0
                  bit1,bit2=bit1            (flags & (bit1|bit2)) == bit1
                  bit1,~bit2                (flags & (bit1|bit2)) == bit1
                  =bit1,bit2                flags == (bit1|bit2)
      bit-names:
                locked              error         referenced           uptodate
                 dirty                lru             active               slab
             writeback            reclaim              buddy               mmap
             anonymous          swapcache         swapbacked      compound_head
         compound_tail               huge        unevictable           hwpoison
                nopage           reserved(r)         mlocked(r)    mappedtodisk(r)
               private(r)       private_2(r)   owner_private(r)            arch(r)
              uncached(r)       readahead(o)       slob_free(o)     slub_frozen(o)
            slub_debug(o)
                                         (r) raw mode bits  (o) overloaded bits
      
      # ./page-types
                   flags      page-count       MB  symbolic-flags                     long-symbolic-flags
      0x0000000000000000          487369     1903  _________________________________
      0x0000000000000014               5        0  __R_D____________________________  referenced,dirty
      0x0000000000000020               1        0  _____l___________________________  lru
      0x0000000000000024              34        0  __R__l___________________________  referenced,lru
      0x0000000000000028            3838       14  ___U_l___________________________  uptodate,lru
      0x0001000000000028              48        0  ___U_l_______________________I___  uptodate,lru,readahead
      0x000000000000002c            6478       25  __RU_l___________________________  referenced,uptodate,lru
      0x000100000000002c              47        0  __RU_l_______________________I___  referenced,uptodate,lru,readahead
      0x0000000000000040            8344       32  ______A__________________________  active
      0x0000000000000060               1        0  _____lA__________________________  lru,active
      0x0000000000000068             348        1  ___U_lA__________________________  uptodate,lru,active
      0x0001000000000068              12        0  ___U_lA______________________I___  uptodate,lru,active,readahead
      0x000000000000006c             988        3  __RU_lA__________________________  referenced,uptodate,lru,active
      0x000100000000006c              48        0  __RU_lA______________________I___  referenced,uptodate,lru,active,readahead
      0x0000000000004078               1        0  ___UDlA_______b__________________  uptodate,dirty,lru,active,swapbacked
      0x000000000000407c              34        0  __RUDlA_______b__________________  referenced,uptodate,dirty,lru,active,swapbacked
      0x0000000000000400             503        1  __________B______________________  buddy
      0x0000000000000804               1        0  __R________M_____________________  referenced,mmap
      0x0000000000000828            1029        4  ___U_l_____M_____________________  uptodate,lru,mmap
      0x0001000000000828              43        0  ___U_l_____M_________________I___  uptodate,lru,mmap,readahead
      0x000000000000082c             382        1  __RU_l_____M_____________________  referenced,uptodate,lru,mmap
      0x000100000000082c              12        0  __RU_l_____M_________________I___  referenced,uptodate,lru,mmap,readahead
      0x0000000000000868             192        0  ___U_lA____M_____________________  uptodate,lru,active,mmap
      0x0001000000000868              12        0  ___U_lA____M_________________I___  uptodate,lru,active,mmap,readahead
      0x000000000000086c             800        3  __RU_lA____M_____________________  referenced,uptodate,lru,active,mmap
      0x000100000000086c              31        0  __RU_lA____M_________________I___  referenced,uptodate,lru,active,mmap,readahead
      0x0000000000004878               2        0  ___UDlA____M__b__________________  uptodate,dirty,lru,active,mmap,swapbacked
      0x0000000000001000             492        1  ____________a____________________  anonymous
      0x0000000000005808               4        0  ___U_______Ma_b__________________  uptodate,mmap,anonymous,swapbacked
      0x0000000000005868            2839       11  ___U_lA____Ma_b__________________  uptodate,lru,active,mmap,anonymous,swapbacked
      0x000000000000586c              30        0  __RU_lA____Ma_b__________________  referenced,uptodate,lru,active,mmap,anonymous,swapbacked
                   total          513968     2007
      
      # ./page-types -r
                   flags      page-count       MB  symbolic-flags                     long-symbolic-flags
      0x0000000000000000          468002     1828  _________________________________
      0x0000000100000000           19102       74  _____________________r___________  reserved
      0x0000000000008000              41        0  _______________H_________________  compound_head
      0x0000000000010000             188        0  ________________T________________  compound_tail
      0x0000000000008014               1        0  __R_D__________H_________________  referenced,dirty,compound_head
      0x0000000000010014               4        0  __R_D___________T________________  referenced,dirty,compound_tail
      0x0000000000000020               1        0  _____l___________________________  lru
      0x0000000800000024              34        0  __R__l__________________P________  referenced,lru,private
      0x0000000000000028            3794       14  ___U_l___________________________  uptodate,lru
      0x0001000000000028              46        0  ___U_l_______________________I___  uptodate,lru,readahead
      0x0000000400000028              44        0  ___U_l_________________d_________  uptodate,lru,mappedtodisk
      0x0001000400000028               2        0  ___U_l_________________d_____I___  uptodate,lru,mappedtodisk,readahead
      0x000000000000002c            6434       25  __RU_l___________________________  referenced,uptodate,lru
      0x000100000000002c              47        0  __RU_l_______________________I___  referenced,uptodate,lru,readahead
      0x000000040000002c              14        0  __RU_l_________________d_________  referenced,uptodate,lru,mappedtodisk
      0x000000080000002c              30        0  __RU_l__________________P________  referenced,uptodate,lru,private
      0x0000000800000040            8124       31  ______A_________________P________  active,private
      0x0000000000000040             219        0  ______A__________________________  active
      0x0000000800000060               1        0  _____lA_________________P________  lru,active,private
      0x0000000000000068             322        1  ___U_lA__________________________  uptodate,lru,active
      0x0001000000000068              12        0  ___U_lA______________________I___  uptodate,lru,active,readahead
      0x0000000400000068              13        0  ___U_lA________________d_________  uptodate,lru,active,mappedtodisk
      0x0000000800000068              12        0  ___U_lA_________________P________  uptodate,lru,active,private
      0x000000000000006c             977        3  __RU_lA__________________________  referenced,uptodate,lru,active
      0x000100000000006c              48        0  __RU_lA______________________I___  referenced,uptodate,lru,active,readahead
      0x000000040000006c               5        0  __RU_lA________________d_________  referenced,uptodate,lru,active,mappedtodisk
      0x000000080000006c               3        0  __RU_lA_________________P________  referenced,uptodate,lru,active,private
      0x0000000c0000006c               3        0  __RU_lA________________dP________  referenced,uptodate,lru,active,mappedtodisk,private
      0x0000000c00000068               1        0  ___U_lA________________dP________  uptodate,lru,active,mappedtodisk,private
      0x0000000000004078               1        0  ___UDlA_______b__________________  uptodate,dirty,lru,active,swapbacked
      0x000000000000407c              34        0  __RUDlA_______b__________________  referenced,uptodate,dirty,lru,active,swapbacked
      0x0000000000000400             538        2  __________B______________________  buddy
      0x0000000000000804               1        0  __R________M_____________________  referenced,mmap
      0x0000000000000828            1029        4  ___U_l_____M_____________________  uptodate,lru,mmap
      0x0001000000000828              43        0  ___U_l_____M_________________I___  uptodate,lru,mmap,readahead
      0x000000000000082c             382        1  __RU_l_____M_____________________  referenced,uptodate,lru,mmap
      0x000100000000082c              12        0  __RU_l_____M_________________I___  referenced,uptodate,lru,mmap,readahead
      0x0000000000000868             192        0  ___U_lA____M_____________________  uptodate,lru,active,mmap
      0x0001000000000868              12        0  ___U_lA____M_________________I___  uptodate,lru,active,mmap,readahead
      0x000000000000086c             800        3  __RU_lA____M_____________________  referenced,uptodate,lru,active,mmap
      0x000100000000086c              31        0  __RU_lA____M_________________I___  referenced,uptodate,lru,active,mmap,readahead
      0x0000000000004878               2        0  ___UDlA____M__b__________________  uptodate,dirty,lru,active,mmap,swapbacked
      0x0000000000001000             492        1  ____________a____________________  anonymous
      0x0000000000005008               2        0  ___U________a_b__________________  uptodate,anonymous,swapbacked
      0x0000000000005808               4        0  ___U_______Ma_b__________________  uptodate,mmap,anonymous,swapbacked
      0x000000000000580c               1        0  __RU_______Ma_b__________________  referenced,uptodate,mmap,anonymous,swapbacked
      0x0000000000005868            2839       11  ___U_lA____Ma_b__________________  uptodate,lru,active,mmap,anonymous,swapbacked
      0x000000000000586c              29        0  __RU_lA____Ma_b__________________  referenced,uptodate,lru,active,mmap,anonymous,swapbacked
                   total          513968     2007
      
      # ./page-types --raw --list --no-summary --bits reserved
      offset  count   flags
      0       15      _____________________r___________
      31      4       _____________________r___________
      159     97      _____________________r___________
      4096    2067    _____________________r___________
      6752    2390    _____________________r___________
      9355    3       _____________________r___________
      9728    14526   _____________________r___________
      
      This patch:
      
      Introduce PageHuge(), which identifies huge/gigantic pages by their
      dedicated compound destructor functions.
      
      Also move prep_compound_gigantic_page() to hugetlb.c and make
      __free_pages_ok() non-static.
      Signed-off-by: default avatarWu Fengguang <fengguang.wu@intel.com>
      Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
      Cc: Andi Kleen <andi@firstfloor.org>
      Cc: Matt Mackall <mpm@selenic.com>
      Cc: Alexey Dobriyan <adobriyan@gmail.com>
      Cc: Ingo Molnar <mingo@elte.hu>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      20a0307c
    • Christoph Lameter's avatar
      page allocator: use a pre-calculated value instead of num_online_nodes() in fast paths · 62bc62a8
      Christoph Lameter authored
      
      
      num_online_nodes() is called in a number of places but most often by the
      page allocator when deciding whether the zonelist needs to be filtered
      based on cpusets or the zonelist cache.  This is actually a heavy function
      and touches a number of cache lines.
      
      This patch stores the number of online nodes at boot time and updates the
      value when nodes get onlined and offlined.  The value is then used in a
      number of important paths in place of num_online_nodes().
      
      [rientjes@google.com: do not override definition of node_set_online() with macro]
      Signed-off-by: default avatarChristoph Lameter <cl@linux-foundation.org>
      Signed-off-by: default avatarMel Gorman <mel@csn.ul.ie>
      Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
      Cc: Pekka Enberg <penberg@cs.helsinki.fi>
      Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
      Cc: Nick Piggin <nickpiggin@yahoo.com.au>
      Cc: Dave Hansen <dave@linux.vnet.ibm.com>
      Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
      Signed-off-by: default avatarDavid Rientjes <rientjes@google.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      62bc62a8
    • Mel Gorman's avatar
      page allocator: do not check NUMA node ID when the caller knows the node is valid · 6484eb3e
      Mel Gorman authored
      
      
      Callers of alloc_pages_node() can optionally specify -1 as a node to mean
      "allocate from the current node".  However, a number of the callers in
      fast paths know for a fact their node is valid.  To avoid a comparison and
      branch, this patch adds alloc_pages_exact_node() that only checks the nid
      with VM_BUG_ON().  Callers that know their node is valid are then
      converted.
      Signed-off-by: default avatarMel Gorman <mel@csn.ul.ie>
      Reviewed-by: default avatarChristoph Lameter <cl@linux-foundation.org>
      Reviewed-by: default avatarKOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
      Reviewed-by: default avatarPekka Enberg <penberg@cs.helsinki.fi>
      Acked-by: Paul Mundt <lethal@linux-sh.org>	[for the SLOB NUMA bits]
      Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
      Cc: Nick Piggin <nickpiggin@yahoo.com.au>
      Cc: Dave Hansen <dave@linux.vnet.ibm.com>
      Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      6484eb3e
  3. 29 May, 2009 1 commit
    • Mel Gorman's avatar
      mm: account for MAP_SHARED mappings using VM_MAYSHARE and not VM_SHARED in hugetlbfs · f83a275d
      Mel Gorman authored
      Addresses http://bugzilla.kernel.org/show_bug.cgi?id=13302
      
      
      
      hugetlbfs reserves huge pages but does not fault them at mmap() time to
      ensure that future faults succeed.  The reservation behaviour differs
      depending on whether the mapping was mapped MAP_SHARED or MAP_PRIVATE.
      For MAP_SHARED mappings, hugepages are reserved when mmap() is first
      called and are tracked based on information associated with the inode.
      Other processes mapping MAP_SHARED use the same reservation.  MAP_PRIVATE
      track the reservations based on the VMA created as part of the mmap()
      operation.  Each process mapping MAP_PRIVATE must make its own
      reservation.
      
      hugetlbfs currently checks if a VMA is MAP_SHARED with the VM_SHARED flag
      and not VM_MAYSHARE.  For file-backed mappings, such as hugetlbfs,
      VM_SHARED is set only if the mapping is MAP_SHARED and the file was opened
      read-write.  If a shared memory mapping was mapped shared-read-write for
      populating of data and mapped shared-read-only by other processes, then
      hugetlbfs would account for the mapping as if it was MAP_PRIVATE.  This
      causes processes to fail to map the file MAP_SHARED even though it should
      succeed as the reservation is there.
      
      This patch alters mm/hugetlb.c and replaces VM_SHARED with VM_MAYSHARE
      when the intent of the code was to check whether the VMA was mapped
      MAP_SHARED or MAP_PRIVATE.
      Signed-off-by: default avatarMel Gorman <mel@csn.ul.ie>
      Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
      Cc: Ingo Molnar <mingo@elte.hu>
      Cc: <stable@kernel.org>
      Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
      Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
      Cc: <starlight@binnacle.cx>
      Cc: Eric B Munson <ebmunson@us.ibm.com>
      Cc: Adam Litke <agl@us.ibm.com>
      Cc: Andy Whitcroft <apw@canonical.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      f83a275d
  4. 01 Apr, 2009 1 commit
  5. 11 Feb, 2009 1 commit
  6. 10 Feb, 2009 1 commit
    • Mel Gorman's avatar
      Do not account for the address space used by hugetlbfs using VM_ACCOUNT · 5a6fe125
      Mel Gorman authored
      When overcommit is disabled, the core VM accounts for pages used by anonymous
      shared, private mappings and special mappings. It keeps track of VMAs that
      should be accounted for with VM_ACCOUNT and VMAs that never had a reserve
      with VM_NORESERVE.
      
      Overcommit for hugetlbfs is much riskier than overcommit for base pages
      due to contiguity requirements. It avoids overcommiting on both shared and
      private mappings using reservation counters that are checked and updated
      during mmap(). This ensures (within limits) that hugepages exist in the
      future when faults occurs or it is too easy to applications to be SIGKILLed.
      
      As hugetlbfs makes its own reservations of a different unit to the base page
      size, VM_ACCOUNT should never be set. Even if the units were correct, we would
      double account for the usage in the core VM and hugetlbfs. VM_NORESERVE may
      be set because an application can request no reserves be made for hugetlbfs
      at the risk of getting killed later.
      
      With commit fc8744ad
      
      , VM_NORESERVE and
      VM_ACCOUNT are getting unconditionally set for hugetlbfs-backed mappings. This
      breaks the accounting for both the core VM and hugetlbfs, can trigger an
      OOM storm when hugepage pools are too small lockups and corrupted counters
      otherwise are used. This patch brings hugetlbfs more in line with how the
      core VM treats VM_NORESERVE but prevents VM_ACCOUNT being set.
      Signed-off-by: default avatarMel Gorman <mel@csn.ul.ie>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      5a6fe125
  7. 06 Jan, 2009 4 commits
  8. 13 Nov, 2008 1 commit
  9. 06 Nov, 2008 2 commits
    • Andy Whitcroft's avatar
      hugetlb: pull gigantic page initialisation out of the default path · 18229df5
      Andy Whitcroft authored
      
      
      As we can determine exactly when a gigantic page is in use we can optimise
      the common regular page cases by pulling out gigantic page initialisation
      into its own function.  As gigantic pages are never released to buddy we
      do not need a destructor.  This effectivly reverts the previous change to
      the main buddy allocator.  It also adds a paranoid check to ensure we
      never release gigantic pages from hugetlbfs to the main buddy.
      Signed-off-by: default avatarAndy Whitcroft <apw@shadowen.org>
      Cc: Jon Tollefson <kniht@linux.vnet.ibm.com>
      Cc: Mel Gorman <mel@csn.ul.ie>
      Cc: Nick Piggin <nickpiggin@yahoo.com.au>
      Cc: Christoph Lameter <cl@linux-foundation.org>
      Cc: <stable@kernel.org>		[2.6.27.x]
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      18229df5
    • Andy Whitcroft's avatar
      hugetlbfs: handle pages higher order than MAX_ORDER · 69d177c2
      Andy Whitcroft authored
      
      
      When working with hugepages, hugetlbfs assumes that those hugepages are
      smaller than MAX_ORDER.  Specifically it assumes that the mem_map is
      contigious and uses that to optimise access to the elements of the mem_map
      that represent the hugepage.  Gigantic pages (such as 16GB pages on
      powerpc) by definition are of greater order than MAX_ORDER (larger than
      MAX_ORDER_NR_PAGES in size).  This means that we can no longer make use of
      the buddy alloctor guarentees for the contiguity of the mem_map, which
      ensures that the mem_map is at least contigious for maximmally aligned
      areas of MAX_ORDER_NR_PAGES pages.
      
      This patch adds new mem_map accessors and iterator helpers which handle
      any discontiguity at MAX_ORDER_NR_PAGES boundaries.  It then uses these to
      implement gigantic page versions of copy_huge_page and clear_huge_page,
      and to allow follow_hugetlb_page handle gigantic pages.
      Signed-off-by: default avatarAndy Whitcroft <apw@shadowen.org>
      Cc: Jon Tollefson <kniht@linux.vnet.ibm.com>
      Cc: Mel Gorman <mel@csn.ul.ie>
      Cc: Nick Piggin <nickpiggin@yahoo.com.au>
      Cc: Christoph Lameter <cl@linux-foundation.org>
      Cc: <stable@kernel.org>		[2.6.27.x]
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      69d177c2
  10. 23 Oct, 2008 1 commit
  11. 20 Oct, 2008 3 commits
    • KOSAKI Motohiro's avatar
      hugepage: support ZERO_PAGE() · 4b2e38ad
      KOSAKI Motohiro authored
      
      
      Presently hugepage doesn't use zero page at all because zero page is only
      used for coredumping and hugepage can't core dump.
      
      However we have now implemented hugepage coredumping.  Therefore we should
      implement the zero page of hugepage.
      
      Implementation note:
      
      o Why do we only check VM_SHARED for zero page?
        normal page checked as ..
      
      	static inline int use_zero_page(struct vm_area_struct *vma)
      	{
      	        if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
      	                return 0;
      
      	        return !vma->vm_ops || !vma->vm_ops->fault;
      	}
      
      First, hugepages are never mlock()ed.  We aren't concerned with VM_LOCKED.
      
      Second, hugetlbfs is a pseudo filesystem, not a real filesystem and it
      doesn't have any file backing.  Thus ops->fault checking is meaningless.
      
      o Why don't we use zero page if !pte.
      
      !pte indicate {pud, pmd} doesn't exist or some error happened.  So we
      shouldn't return zero page if any error occurred.
      Signed-off-by: default avatarKOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
      Cc: Adam Litke <agl@us.ibm.com>
      Cc: Hugh Dickins <hugh@veritas.com>
      Cc: Kawai Hidehiro <hidehiro.kawai.ez@hitachi.com>
      Cc: Mel Gorman <mel@skynet.ie>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      4b2e38ad
    • Harvey Harrison's avatar
      mm: hugetlb.c make functions static, use NULL rather than 0 · 2a4b3ded
      Harvey Harrison authored
      
      
      mm/hugetlb.c:265:17: warning: symbol 'resv_map_alloc' was not declared. Should it be static?
      mm/hugetlb.c:277:6: warning: symbol 'resv_map_release' was not declared. Should it be static?
      mm/hugetlb.c:292:9: warning: Using plain integer as NULL pointer
      mm/hugetlb.c:1750:5: warning: symbol 'unmap_ref_private' was not declared. Should it be static?
      Signed-off-by: default avatarHarvey Harrison <harvey.harrison@gmail.com>
      Acked-by: default avatarAndy Whitcroft <apw@shadowen.org>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      2a4b3ded
    • Rik van Riel's avatar
      vmscan: split LRU lists into anon & file sets · 4f98a2fe
      Rik van Riel authored
      
      
      Split the LRU lists in two, one set for pages that are backed by real file
      systems ("file") and one for pages that are backed by memory and swap
      ("anon").  The latter includes tmpfs.
      
      The advantage of doing this is that the VM will not have to scan over lots
      of anonymous pages (which we generally do not want to swap out), just to
      find the page cache pages that it should evict.
      
      This patch has the infrastructure and a basic policy to balance how much
      we scan the anon lists and how much we scan the file lists.  The big
      policy changes are in separate patches.
      
      [lee.schermerhorn@hp.com: collect lru meminfo statistics from correct offset]
      [kosaki.motohiro@jp.fujitsu.com: prevent incorrect oom under split_lru]
      [kosaki.motohiro@jp.fujitsu.com: fix pagevec_move_tail() doesn't treat unevictable page]
      [hugh@veritas.com: memcg swapbacked pages active]
      [hugh@veritas.com: splitlru: BDI_CAP_SWAP_BACKED]
      [akpm@linux-foundation.org: fix /proc/vmstat units]
      [nishimura@mxp.nes.nec.co.jp: memcg: fix handling of shmem migration]
      [kosaki.motohiro@jp.fujitsu.com: adjust Quicklists field of /proc/meminfo]
      [kosaki.motohiro@jp.fujitsu.com: fix style issue of get_scan_ratio()]
      Signed-off-by: default avatarRik van Riel <riel@redhat.com>
      Signed-off-by: default avatarLee Schermerhorn <Lee.Schermerhorn@hp.com>
      Signed-off-by: default avatarKOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
      Signed-off-by: default avatarHugh Dickins <hugh@veritas.com>
      Signed-off-by: default avatarDaisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      4f98a2fe
  12. 16 Oct, 2008 1 commit
    • David Gibson's avatar
      hugetlb: handle updating of ACCESSED and DIRTY in hugetlb_fault() · b4d1d99f
      David Gibson authored
      
      
      The page fault path for normal pages, if the fault is neither a no-page
      fault nor a write-protect fault, will update the DIRTY and ACCESSED bits
      in the page table appropriately.
      
      The hugepage fault path, however, does not do this, handling only no-page
      or write-protect type faults.  It assumes that either the ACCESSED and
      DIRTY bits are irrelevant for hugepages (usually true, since they are
      never swapped) or that they are handled by the arch code.
      
      This is inconvenient for some software-loaded TLB architectures, where the
      _PAGE_ACCESSED (_PAGE_DIRTY) bits need to be set to enable read (write)
      access to the page at the TLB miss.  This could be worked around in the
      arch TLB miss code, but the TLB miss fast path can be made simple more
      easily if the hugetlb_fault() path handles this, as the normal page fault
      path does.
      Signed-off-by: default avatarDavid Gibson <david@gibson.dropbear.id.au>
      Cc: William Lee Irwin III <wli@holomorphy.com>
      Cc: Hugh Dickins <hugh@veritas.com>
      Cc: Adam Litke <agl@us.ibm.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      b4d1d99f
  13. 12 Aug, 2008 3 commits
  14. 06 Aug, 2008 1 commit
  15. 01 Aug, 2008 1 commit
  16. 29 Jul, 2008 1 commit
  17. 28 Jul, 2008 2 commits
    • Adrian Bunk's avatar
      mm/hugetlb.c must #include <asm/io.h> · 78a34ae2
      Adrian Bunk authored
      This patch fixes the following build error on sh caused by commit
      aa888a74
      
       ("hugetlb: support larger than
      MAX_ORDER"):
      
        mm/hugetlb.c: In function 'alloc_bootmem_huge_page':
        mm/hugetlb.c:958: error: implicit declaration of function 'virt_to_phys'
      Signed-off-by: default avatarAdrian Bunk <bunk@kernel.org>
      Cc: Hirokazu Takata <takata@linux-m32r.org>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      78a34ae2
    • Andrea Arcangeli's avatar
      mmu-notifiers: core · cddb8a5c
      Andrea Arcangeli authored
      
      
      With KVM/GFP/XPMEM there isn't just the primary CPU MMU pointing to pages.
       There are secondary MMUs (with secondary sptes and secondary tlbs) too.
      sptes in the kvm case are shadow pagetables, but when I say spte in
      mmu-notifier context, I mean "secondary pte".  In GRU case there's no
      actual secondary pte and there's only a secondary tlb because the GRU
      secondary MMU has no knowledge about sptes and every secondary tlb miss
      event in the MMU always generates a page fault that has to be resolved by
      the CPU (this is not the case of KVM where the a secondary tlb miss will
      walk sptes in hardware and it will refill the secondary tlb transparently
      to software if the corresponding spte is present).  The same way
      zap_page_range has to invalidate the pte before freeing the page, the spte
      (and secondary tlb) must also be invalidated before any page is freed and
      reused.
      
      Currently we take a page_count pin on every page mapped by sptes, but that
      means the pages can't be swapped whenever they're mapped by any spte
      because they're part of the guest working set.  Furthermore a spte unmap
      event can immediately lead to a page to be freed when the pin is released
      (so requiring the same complex and relatively slow tlb_gather smp safe
      logic we have in zap_page_range and that can be avoided completely if the
      spte unmap event doesn't require an unpin of the page previously mapped in
      the secondary MMU).
      
      The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and know
      when the VM is swapping or freeing or doing anything on the primary MMU so
      that the secondary MMU code can drop sptes before the pages are freed,
      avoiding all page pinning and allowing 100% reliable swapping of guest
      physical address space.  Furthermore it avoids the code that teardown the
      mappings of the secondary MMU, to implement a logic like tlb_gather in
      zap_page_range that would require many IPI to flush other cpu tlbs, for
      each fixed number of spte unmapped.
      
      To make an example: if what happens on the primary MMU is a protection
      downgrade (from writeable to wrprotect) the secondary MMU mappings will be
      invalidated, and the next secondary-mmu-page-fault will call
      get_user_pages and trigger a do_wp_page through get_user_pages if it
      called get_user_pages with write=1, and it'll re-establishing an updated
      spte or secondary-tlb-mapping on the copied page.  Or it will setup a
      readonly spte or readonly tlb mapping if it's a guest-read, if it calls
      get_user_pages with write=0.  This is just an example.
      
      This allows to map any page pointed by any pte (and in turn visible in the
      primary CPU MMU), into a secondary MMU (be it a pure tlb like GRU, or an
      full MMU with both sptes and secondary-tlb like the shadow-pagetable layer
      with kvm), or a remote DMA in software like XPMEM (hence needing of
      schedule in XPMEM code to send the invalidate to the remote node, while no
      need to schedule in kvm/gru as it's an immediate event like invalidating
      primary-mmu pte).
      
      At least for KVM without this patch it's impossible to swap guests
      reliably.  And having this feature and removing the page pin allows
      several other optimizations that simplify life considerably.
      
      Dependencies:
      
      1) mm_take_all_locks() to register the mmu notifier when the whole VM
         isn't doing anything with "mm".  This allows mmu notifier users to keep
         track if the VM is in the middle of the invalidate_range_begin/end
         critical section with an atomic counter incraese in range_begin and
         decreased in range_end.  No secondary MMU page fault is allowed to map
         any spte or secondary tlb reference, while the VM is in the middle of
         range_begin/end as any page returned by get_user_pages in that critical
         section could later immediately be freed without any further
         ->invalidate_page notification (invalidate_range_begin/end works on
         ranges and ->invalidate_page isn't called immediately before freeing
         the page).  To stop all page freeing and pagetable overwrites the
         mmap_sem must be taken in write mode and all other anon_vma/i_mmap
         locks must be taken too.
      
      2) It'd be a waste to add branches in the VM if nobody could possibly
         run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only enabled if
         CONFIG_KVM=m/y.  In the current kernel kvm won't yet take advantage of
         mmu notifiers, but this already allows to compile a KVM external module
         against a kernel with mmu notifiers enabled and from the next pull from
         kvm.git we'll start using them.  And GRU/XPMEM will also be able to
         continue the development by enabling KVM=m in their config, until they
         submit all GRU/XPMEM GPLv2 code to the mainline kernel.  Then they can
         also enable MMU_NOTIFIERS in the same way KVM does it (even if KVM=n).
         This guarantees nobody selects MMU_NOTIFIER=y if KVM and GRU and XPMEM
         are all =n.
      
      The mmu_notifier_register call can fail because mm_take_all_locks may be
      interrupted by a signal and return -EINTR.  Because mmu_notifier_reigster
      is used when a driver startup, a failure can be gracefully handled.  Here
      an example of the change applied to kvm to register the mmu notifiers.
      Usually when a driver startups other allocations are required anyway and
      -ENOMEM failure paths exists already.
      
       struct  kvm *kvm_arch_create_vm(void)
       {
              struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
      +       int err;
      
              if (!kvm)
                      return ERR_PTR(-ENOMEM);
      
              INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
      
      +       kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
      +       err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
      +       if (err) {
      +               kfree(kvm);
      +               return ERR_PTR(err);
      +       }
      +
              return kvm;
       }
      
      mmu_notifier_unregister returns void and it's reliable.
      
      The patch also adds a few needed but missing includes that would prevent
      kernel to compile after these changes on non-x86 archs (x86 didn't need
      them by luck).
      
      [akpm@linux-foundation.org: coding-style fixes]
      [akpm@linux-foundation.org: fix mm/filemap_xip.c build]
      [akpm@linux-foundation.org: fix mm/mmu_notifier.c build]
      Signed-off-by: default avatarAndrea Arcangeli <andrea@qumranet.com>
      Signed-off-by: default avatarNick Piggin <npiggin@suse.de>
      Signed-off-by: default avatarChristoph Lameter <cl@linux-foundation.org>
      Cc: Jack Steiner <steiner@sgi.com>
      Cc: Robin Holt <holt@sgi.com>
      Cc: Nick Piggin <npiggin@suse.de>
      Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
      Cc: Kanoj Sarcar <kanojsarcar@yahoo.com>
      Cc: Roland Dreier <rdreier@cisco.com>
      Cc: Steve Wise <swise@opengridcomputing.com>
      Cc: Avi Kivity <avi@qumranet.com>
      Cc: Hugh Dickins <hugh@veritas.com>
      Cc: Rusty Russell <rusty@rustcorp.com.au>
      Cc: Anthony Liguori <aliguori@us.ibm.com>
      Cc: Chris Wright <chrisw@redhat.com>
      Cc: Marcelo Tosatti <marcelo@kvack.org>
      Cc: Eric Dumazet <dada1@cosmosbay.com>
      Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
      Cc: Izik Eidus <izike@qumranet.com>
      Cc: Anthony Liguori <aliguori@us.ibm.com>
      Cc: Rik van Riel <riel@redhat.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      cddb8a5c
  18. 26 Jul, 2008 1 commit
  19. 25 Jul, 2008 1 commit
  20. 24 Jul, 2008 10 commits