Patch: improve huge page support

Wed May 4 07:49:50 PDT 2011

My big patch to improve huge page support in now in HotSpot upstream.
This is a backport.

The question is: to what branches should we apply this?  It makes the
use of huge pages much, much easier.  You no longer need root access
for anything except setting the size of the huge page pool, and the
huge pages aren't locked in memory, so it's a big improvement for
usability.  If the new huge page support doesn't work in the kernel,
it falls back to the old SysV-style huge page allocator.

Thoughts?

Andrew.


2011-05-04  Andrew Haley  <aph at redhat.com>

	* NEWS: Add S7037939, S7034464.
	* patches/icedtea-hugepage.patch: Support transparent huge pages
	on Linux available since 2.6.38
	Disable adaptive resizing if SHM large pages are used.

# HG changeset patch
# User iveresov
# Date 1303344724 25200
# Node ID 139667d9836ae218ab06a74d5813a5a700a076c9
# Parent  49a67202bc671d23d719c9e14752d80295a9e8f6
7034464: Support transparent large pages on Linux
Summary: Support transparent huge pages on Linux available since 2.6.38
Reviewed-by: iveresov, ysr
Contributed-by: aph at redhat.com

# HG changeset patch
# User iveresov
# Date 1303843594 25200
# Node ID c303b3532d4ada074facc42292219952c2a4204e
# Parent  d6cdc6c77582408f6450bdda3ffc831d44298714
7037939: NUMA: Disable adaptive resizing if SHM large pages are used
Summary: Make the NUMA allocator behave properly with SHM and ISM large pages.
Reviewed-by: ysr

diff -r -u openjdk.patched/hotspot/src/os/linux/vm/globals_linux.hpp openjdk/hotspot/src/os/linux/vm/globals_linux.hpp

--- openjdk.patched/hotspot/src/os/linux/vm/globals_linux.hpp	2011-03-16 02:30:16.000000000 +0000
+++ openjdk/hotspot/src/os/linux/vm/globals_linux.hpp	2011-05-03 15:28:57.506395330 +0100
@@ -29,13 +29,19 @@
 // Defines Linux specific flags. They are not available on other platforms.
 //
 #define RUNTIME_OS_FLAGS(develop, develop_pd, product, product_pd, diagnostic, notproduct) \
-  product(bool, UseOprofile, false,                                 \
-        "enable support for Oprofile profiler")                     \
-                                                                    \
-  product(bool, UseLinuxPosixThreadCPUClocks, true,                 \
-          "enable fast Linux Posix clocks where available")
-// NB: The default value of UseLinuxPosixThreadCPUClocks may be
-// overridden in Arguments::parse_each_vm_init_arg.
+  product(bool, UseOprofile, false,                                     \
+        "enable support for Oprofile profiler")                         \
+                                                                        \
+  product(bool, UseLinuxPosixThreadCPUClocks, true,                     \
+          "enable fast Linux Posix clocks where available")             \
+/*  NB: The default value of UseLinuxPosixThreadCPUClocks may be        \
+    overridden in Arguments::parse_each_vm_init_arg.  */                \
+                                                                        \
+  product(bool, UseHugeTLBFS, false,                                    \
+          "Use MAP_HUGETLB for large pages")                            \
+                                                                        \
+  product(bool, UseSHM, false,                                          \
+          "Use SYSV shared memory for large pages")

 //
 // Defines Linux-specific default values. The flags are available on all
diff -r -u openjdk.patched/hotspot/src/os/linux/vm/os_linux.cpp openjdk/hotspot/src/os/linux/vm/os_linux.cpp
--- openjdk.patched/hotspot/src/os/linux/vm/os_linux.cpp	2011-05-03 14:32:54.285722656 +0100
+++ openjdk/hotspot/src/os/linux/vm/os_linux.cpp	2011-05-04 11:54:46.645813794 +0100
@@ -2495,16 +2495,40 @@
   return res != (uintptr_t) MAP_FAILED;
 }

+// Define MAP_HUGETLB here so we can build HotSpot on old systems.
+#ifndef MAP_HUGETLB
+#define MAP_HUGETLB 0x40000
+#endif
+
+// Define MADV_HUGEPAGE here so we can build HotSpot on old systems.
+#ifndef MADV_HUGEPAGE
+#define MADV_HUGEPAGE 14
+#endif
+
 bool os::commit_memory(char* addr, size_t size, size_t alignment_hint,
                        bool exec) {
+  if (UseHugeTLBFS && alignment_hint > (size_t)vm_page_size()) {
+    int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
+    uintptr_t res =
+      (uintptr_t) ::mmap(addr, size, prot,
+                         MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS|MAP_HUGETLB,
+                         -1, 0);
+    return res != (uintptr_t) MAP_FAILED;
+  }
+
   return commit_memory(addr, size, exec);
 }

-void os::realign_memory(char *addr, size_t bytes, size_t alignment_hint) { }
+void os::realign_memory(char *addr, size_t bytes, size_t alignment_hint) {
+  if (UseHugeTLBFS && alignment_hint > (size_t)vm_page_size()) {
+    // We don't check the return value: madvise(MADV_HUGEPAGE) may not
+    // be supported or the memory may already be backed by huge pages.
+    ::madvise(addr, bytes, MADV_HUGEPAGE);
+  }
+}

 void os::free_memory(char *addr, size_t bytes) {
-  ::mmap(addr, bytes, PROT_READ | PROT_WRITE,
-         MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0);
+  ::madvise(addr, bytes, MADV_DONTNEED);
 }

 void os::numa_make_global(char *addr, size_t bytes) {
@@ -2870,7 +2894,16 @@
 static size_t _large_page_size = 0;

 bool os::large_page_init() {
-  if (!UseLargePages) return false;
+  if (!UseLargePages) {
+    UseHugeTLBFS = false;
+    UseSHM = false;
+    return false;
+  }
+
+  if (FLAG_IS_DEFAULT(UseHugeTLBFS) && FLAG_IS_DEFAULT(UseSHM)) {
+    // Our user has not expressed a preference, so we'll try both.
+    UseHugeTLBFS = UseSHM = true;
+  }

   if (LargePageSizeInBytes) {
     _large_page_size = LargePageSizeInBytes;
@@ -2915,6 +2948,9 @@
     }
   }

+  // print a warning if any large page related flag is specified on command line
+  bool warn_on_failure = !FLAG_IS_DEFAULT(UseHugeTLBFS);
+
   const size_t default_page_size = (size_t)Linux::page_size();
   if (_large_page_size > default_page_size) {
     _page_sizes[0] = _large_page_size;
@@ -2922,6 +2958,14 @@
     _page_sizes[2] = 0;
   }

+  UseHugeTLBFS = UseHugeTLBFS &&
+                 Linux::hugetlbfs_sanity_check(warn_on_failure, _large_page_size);
+
+  if (UseHugeTLBFS)
+    UseSHM = false;
+
+  UseLargePages = UseHugeTLBFS || UseSHM;
+
   // Large page support is available on 2.6 or newer kernel, some vendors
   // (e.g. Redhat) have backported it to their 2.4 based distributions.
   // We optimistically assume the support is available. If later it turns out
@@ -2936,7 +2980,7 @@
 char* os::reserve_memory_special(size_t bytes, char* req_addr, bool exec) {
   // "exec" is passed in but not used.  Creating the shared image for
   // the code cache doesn't have an SHM_X executable permission to check.
-  assert(UseLargePages, "only for large pages");
+  assert(UseLargePages && UseSHM, "only for SHM large pages");

   key_t key = IPC_PRIVATE;
   char *addr;
@@ -3003,19 +3047,19 @@
   return _large_page_size;
 }

-// Linux does not support anonymous mmap with large page memory. The only way
-// to reserve large page memory without file backing is through SysV shared
-// memory API. The entire memory region is committed and pinned upfront.
-// Hopefully this will change in the future...
+// HugeTLBFS allows application to commit large page memory on demand;
+// with SysV SHM the entire memory region must be allocated as shared
+// memory.
 bool os::can_commit_large_page_memory() {
-  return false;
+  return UseHugeTLBFS;
 }

 bool os::can_execute_large_page_memory() {
-  return false;
+  return UseHugeTLBFS;
 }

 // Reserve memory at an arbitrary address, only if that area is
+// Reserve memory at an arbitrary address, only if that area is
 // available (and not reserved for something else).

 char* os::attempt_reserve_memory_at(size_t bytes, char* requested_addr) {
@@ -4104,6 +4148,23 @@
         UseNUMA = false;
       }
     }
+    // With SHM large pages we cannot uncommit a page, so there's not way
+    // we can make the adaptive lgrp chunk resizing work. If the user specified
+    // both UseNUMA and UseLargePages (or UseSHM) on the command line - warn and
+    // disable adaptive resizing.
+    if (UseNUMA && UseLargePages && UseSHM) {
+      if (!FLAG_IS_DEFAULT(UseNUMA)) {
+        if (FLAG_IS_DEFAULT(UseLargePages) && FLAG_IS_DEFAULT(UseSHM)) {
+          UseLargePages = false;
+        } else {
+          warning("UseNUMA is not fully compatible with SHM large pages, disabling adaptive resizing");
+          UseAdaptiveSizePolicy = false;
+          UseAdaptiveNUMAChunkSizing = false;
+        }
+      } else {
+        UseNUMA = false;
+      }
+    }
     if (!UseNUMA && ForceNUMA) {
       UseNUMA = true;
     }
@@ -5009,6 +5070,43 @@
 // JSR166
 // -------------------------------------------------------

+bool os::Linux::hugetlbfs_sanity_check(bool warn, size_t page_size) {
+  bool result = false;
+  void *p = mmap (NULL, page_size, PROT_READ|PROT_WRITE,
+                  MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB,
+                  -1, 0);
+
+  if (p != (void *) -1) {
+    // We don't know if this really is a huge page or not.
+    FILE *fp = fopen("/proc/self/maps", "r");
+    if (fp) {
+      while (!feof(fp)) {
+        char chars[257];
+        long x = 0;
+        if (fgets(chars, sizeof(chars), fp)) {
+          if (sscanf(chars, "%lx-%*lx", &x) == 1
+              && x == (long)p) {
+            if (strstr (chars, "hugepage")) {
+              result = true;
+              break;
+            }
+          }
+        }
+      }
+      fclose(fp);
+    }
+    munmap (p, page_size);
+    if (result)
+      return true;
+  }
+
+  if (warn) {
+    warning("HugeTLBFS is not supported by the operating system.");
+  }
+
+  return result;
+}
+
 /*
  * The solaris and linux implementations of park/unpark are fairly
  * conservative for now, but can be improved. They currently use a
diff -r -u openjdk.patched/hotspot/src/os/linux/vm/os_linux.cpp.orig openjdk/hotspot/src/os/linux/vm/os_linux.cpp.orig
--- openjdk.patched/hotspot/src/os/linux/vm/os_linux.cpp.orig	2011-05-03 14:32:53.972649764 +0100
+++ openjdk/hotspot/src/os/linux/vm/os_linux.cpp.orig	2011-05-03 15:28:57.508395796 +0100
@@ -117,6 +117,10 @@
 # include <inttypes.h>
 # include <sys/ioctl.h>

+#if __x86_64__
+#include <asm/vsyscall.h>
+#endif
+
 #define MAX_PATH    (2 * K)

 // for timer info max values which include all bits
@@ -2491,16 +2495,40 @@
   return res != (uintptr_t) MAP_FAILED;
 }

+// Define MAP_HUGETLB here so we can build HotSpot on old systems.
+#ifndef MAP_HUGETLB
+#define MAP_HUGETLB 0x40000
+#endif
+
+// Define MADV_HUGEPAGE here so we can build HotSpot on old systems.
+#ifndef MADV_HUGEPAGE
+#define MADV_HUGEPAGE 14
+#endif
+
 bool os::commit_memory(char* addr, size_t size, size_t alignment_hint,
                        bool exec) {
+  if (UseHugeTLBFS && alignment_hint > (size_t)vm_page_size()) {
+    int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
+    uintptr_t res =
+      (uintptr_t) ::mmap(addr, size, prot,
+                         MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS|MAP_HUGETLB,
+                         -1, 0);
+    return res != (uintptr_t) MAP_FAILED;
+  }
+
   return commit_memory(addr, size, exec);
 }

-void os::realign_memory(char *addr, size_t bytes, size_t alignment_hint) { }
+void os::realign_memory(char *addr, size_t bytes, size_t alignment_hint) {
+  if (UseHugeTLBFS && alignment_hint > (size_t)vm_page_size()) {
+    // We don't check the return value: madvise(MADV_HUGEPAGE) may not
+    // be supported or the memory may already be backed by huge pages.
+    ::madvise(addr, bytes, MADV_HUGEPAGE);
+  }
+}

 void os::free_memory(char *addr, size_t bytes) {
-  ::mmap(addr, bytes, PROT_READ | PROT_WRITE,
-         MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0);
+  ::madvise(addr, bytes, MADV_DONTNEED);
 }

 void os::numa_make_global(char *addr, size_t bytes) {
@@ -2544,6 +2572,21 @@
   return end;
 }

+static int sched_getcpu_syscall(void) {
+  unsigned int cpu;
+  int retval = -1;
+
+#if __x86_64__
+  typedef long (*vgetcpu_t)(unsigned int *cpu, unsigned int *node, unsigned long *tcache);
+  vgetcpu_t vgetcpu = (vgetcpu_t)VSYSCALL_ADDR(__NR_vgetcpu);
+  retval = vgetcpu(&cpu, NULL, NULL);
+#elif __i386__
+  retval = syscall(SYS_getcpu, &cpu, NULL, NULL);
+#endif
+
+  return (retval == -1) ? retval : cpu;
+}
+
 extern "C" void numa_warn(int number, char *where, ...) { }
 extern "C" void numa_error(char *where) { }

@@ -2565,6 +2608,10 @@
   set_sched_getcpu(CAST_TO_FN_PTR(sched_getcpu_func_t,
                                   dlsym(RTLD_DEFAULT, "sched_getcpu")));

+  // If it's not, try a direct syscall.
+  if (sched_getcpu() == -1)
+    set_sched_getcpu(CAST_TO_FN_PTR(sched_getcpu_func_t, (void*)&sched_getcpu_syscall));
+
   if (sched_getcpu() != -1) { // Does it work?
     void *handle = dlopen("libnuma.so.1", RTLD_LAZY);
     if (handle != NULL) {
@@ -2847,7 +2894,16 @@
 static size_t _large_page_size = 0;

 bool os::large_page_init() {
-  if (!UseLargePages) return false;
+  if (!UseLargePages) {
+    UseHugeTLBFS = false;
+    UseSHM = false;
+    return false;
+  }
+
+  if (FLAG_IS_DEFAULT(UseHugeTLBFS) && FLAG_IS_DEFAULT(UseSHM)) {
+    // Our user has not expressed a preference, so we'll try both.
+    UseHugeTLBFS = UseSHM = true;
+  }

   if (LargePageSizeInBytes) {
     _large_page_size = LargePageSizeInBytes;
@@ -2892,6 +2948,9 @@
     }
   }

+  // print a warning if any large page related flag is specified on command line
+  bool warn_on_failure = !FLAG_IS_DEFAULT(UseHugeTLBFS);
+
   const size_t default_page_size = (size_t)Linux::page_size();
   if (_large_page_size > default_page_size) {
     _page_sizes[0] = _large_page_size;
@@ -2899,6 +2958,14 @@
     _page_sizes[2] = 0;
   }

+  UseHugeTLBFS = UseHugeTLBFS &&
+                 Linux::hugetlbfs_sanity_check(warn_on_failure, _large_page_size);
+
+  if (UseHugeTLBFS)
+    UseSHM = false;
+
+  UseLargePages = UseHugeTLBFS || UseSHM;
+
   // Large page support is available on 2.6 or newer kernel, some vendors
   // (e.g. Redhat) have backported it to their 2.4 based distributions.
   // We optimistically assume the support is available. If later it turns out
@@ -2913,7 +2980,7 @@
 char* os::reserve_memory_special(size_t bytes, char* req_addr, bool exec) {
   // "exec" is passed in but not used.  Creating the shared image for
   // the code cache doesn't have an SHM_X executable permission to check.
-  assert(UseLargePages, "only for large pages");
+  assert(UseLargePages && UseSHM, "only for SHM large pages");

   key_t key = IPC_PRIVATE;
   char *addr;
@@ -2980,18 +3047,18 @@
   return _large_page_size;
 }

-// Linux does not support anonymous mmap with large page memory. The only way
-// to reserve large page memory without file backing is through SysV shared
-// memory API. The entire memory region is committed and pinned upfront.
-// Hopefully this will change in the future...
+// HugeTLBFS allows application to commit large page memory on demand;
+// with SysV SHM the entire memory region must be allocated as shared
+// memory.
 bool os::can_commit_large_page_memory() {
-  return false;
+  return UseHugeTLBFS;
 }

 bool os::can_execute_large_page_memory() {
-  return false;
+  return UseHugeTLBFS;
 }

+ // Reserve memory at an arbitrary address, only if that area is
 // Reserve memory at an arbitrary address, only if that area is
 // available (and not reserved for something else).

@@ -4986,6 +5053,43 @@
 // JSR166
 // -------------------------------------------------------

+bool os::Linux::hugetlbfs_sanity_check(bool warn, size_t page_size) {
+  bool result = false;
+  void *p = mmap (NULL, page_size, PROT_READ|PROT_WRITE,
+                  MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB,
+                  -1, 0);
+
+  if (p != (void *) -1) {
+    // We don't know if this really is a huge page or not.
+    FILE *fp = fopen("/proc/self/maps", "r");
+    if (fp) {
+      while (!feof(fp)) {
+        char chars[257];
+        long x = 0;
+        if (fgets(chars, sizeof(chars), fp)) {
+          if (sscanf(chars, "%lx-%*lx", &x) == 1
+              && x == (long)p) {
+            if (strstr (chars, "hugepage")) {
+              result = true;
+              break;
+            }
+          }
+        }
+      }
+      fclose(fp);
+    }
+    munmap (p, page_size);
+    if (result)
+      return true;
+  }
+
+  if (warn) {
+    warning("HugeTLBFS is not supported by the operating system.");
+  }
+
+  return result;
+}
+
 /*
  * The solaris and linux implementations of park/unpark are fairly
  * conservative for now, but can be improved. They currently use a
Only in openjdk/hotspot/src/os/linux/vm: os_linux.cpp~
diff -r -u openjdk.patched/hotspot/src/os/linux/vm/os_linux.hpp openjdk/hotspot/src/os/linux/vm/os_linux.hpp
--- openjdk.patched/hotspot/src/os/linux/vm/os_linux.hpp	2011-03-16 02:30:16.000000000 +0000
+++ openjdk/hotspot/src/os/linux/vm/os_linux.hpp	2011-05-03 15:28:57.509396029 +0100
@@ -86,6 +86,9 @@

   static void rebuild_cpu_to_node_map();
   static GrowableArray<int>* cpu_to_node()    { return _cpu_to_node; }
+
+  static bool hugetlbfs_sanity_check(bool warn, size_t page_size);
+
  public:
   static void init_thread_fpu_state();
   static int  get_fpu_control_word();
diff -r -u openjdk.patched/hotspot/src/os/solaris/vm/os_solaris.cpp openjdk/hotspot/src/os/solaris/vm/os_solaris.cpp
--- openjdk.patched/hotspot/src/os/solaris/vm/os_solaris.cpp	2011-03-16 02:30:16.000000000 +0000
+++ openjdk/hotspot/src/os/solaris/vm/os_solaris.cpp	2011-05-04 11:53:24.342596468 +0100
@@ -2826,7 +2826,9 @@
 void os::realign_memory(char *addr, size_t bytes, size_t alignment_hint) {
   assert((intptr_t)addr % alignment_hint == 0, "Address should be aligned.");
   assert((intptr_t)(addr + bytes) % alignment_hint == 0, "End should be aligned.");
-  Solaris::set_mpss_range(addr, bytes, alignment_hint);
+  if (UseLargePages && UseMPSS) {
+    Solaris::set_mpss_range(addr, bytes, alignment_hint);
+  }
 }

 // Tell the OS to make the range local to the first-touching LWP
@@ -5042,6 +5044,20 @@
         UseNUMA = false;
       }
     }
+    // ISM is not compatible with the NUMA allocator - it always allocates
+    // pages round-robin across the lgroups.
+    if (UseNUMA && UseLargePages && UseISM) {
+      if (!FLAG_IS_DEFAULT(UseNUMA)) {
+        if (FLAG_IS_DEFAULT(UseLargePages) && FLAG_IS_DEFAULT(UseISM)) {
+          UseLargePages = false;
+        } else {
+          warning("UseNUMA is not compatible with ISM large pages, disabling NUMA allocator");
+          UseNUMA = false;
+        }
+      } else {
+        UseNUMA = false;
+      }
+    }
     if (!UseNUMA && ForceNUMA) {
       UseNUMA = true;
     }
diff -r b6472fccd0ec NEWS
--- a/NEWS      Tue May 03 13:05:05 2011 +0100
+++ b/NEWS      Wed May 04 15:49:23 2011 +0100
@@ -22,6 +22,8 @@
   - S6708580: Java applications slow when EXA enabled
   - S7029905: demo applets missing some html files
   - S6986968: Crash on XIM server restart
+  - S7034464: Support transparent large pages on Linux
+  - S7037939: NUMA: Disable adaptive resizing if SHM large pages are used
 * Bug fixes
   - PR637: make check should exit with an error code if any regression test failed.
   - G356743: Support libpng 1.5.