[PATCH] Linux NUMA support for HotSpot

Andi Kleen andi at firstfloor.org
Sun Mar 2 16:24:54 PST 2008


Hi,

Some time ago I played a bit with the NUMA heap support in the HotSpot sources.
In particular, I implemented an interface to the Linux libnuma library
and wrote some simple benchmarks to see if it was any faster on an Opteron
system (unfortunately it wasn't). After some debugging I concluded that
my Linux NUMA interface was most likely correct, but the NUMA heap
implementation seemed to be broken (I doubt it'll work very well even on
Solaris).

The implementation does not require libnuma to always be linked in;
instead it dlopen()s the library as needed, which avoids a little bit
of dll/so hell.
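
In case it helps to see the idea in isolation, here is a rough sketch of
that lazy-loading approach (try_load_libnuma is just an illustrative name;
the patch below does the real work in os::Linux::numa_init() and resolves
all the symbols it needs through a small macro):

#include <dlfcn.h>

static int (*my_numa_available)(void);

static int try_load_libnuma(void)
{
  // bind lazily; if libnuma is not installed we simply keep NUMA off
  void *handle = dlopen("libnuma.so.1", RTLD_LAZY);
  if (handle == NULL)
    return 0;
  my_numa_available = (int (*)(void)) dlsym(handle, "numa_available");
  // numa_available() returns a negative value on non-NUMA kernels
  return my_numa_available != NULL && my_numa_available() >= 0;
}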

It also uses the Linux getcpu() call, which means it can actually
adapt to threads migrating between nodes (unlike the Solaris
implementation).
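
The fast path boils down to something like this sketch (SYS_getcpu is
not defined by older libc headers, which is why the patch hardcodes the
i386 syscall number and uses the vsyscall entry on x86-64):

#include <unistd.h>
#include <sys/syscall.h>

// Return the NUMA node the calling thread is currently running on,
// or -1 if getcpu() is unavailable (kernels before 2.6.19).
static int current_node(void)
{
  unsigned cpu, node;
  if (syscall(SYS_getcpu, &cpu, &node, NULL) == 0)
    return (int) node;
  return -1;
}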

Anyway, just in case someone else wants to play with it, here is
libnuma support for Linux in HotSpot. It can be enabled with the usual
options, but is disabled by default.
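
For reference, enabling it is the usual -XX business (the application
name is just a placeholder):

  java -XX:+UseNUMA MyApp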

The patch was originally against a fairly old snapshot (b13). It still
applies cleanly to the latest 6 snapshot I downloaded. I wasn't able
to retest it, though, because I was unable to build the latest snapshot
even after fiddling for an hour with all the undocumented environment
variables.

-Andi


diff -u openjdk/hotspot/src/os/linux/vm/os_linux.cpp-o openjdk/hotspot/src/os/linux/vm/os_linux.cpp
--- openjdk/hotspot/src/os/linux/vm/os_linux.cpp-o	2007-05-24 09:30:53.000000000 +0200
+++ openjdk/hotspot/src/os/linux/vm/os_linux.cpp	2007-06-24 16:56:57.000000000 +0200
@@ -56,6 +56,11 @@
 # include <sys/ipc.h>
 # include <sys/shm.h>
 # include <link.h>
+# include <numa.h>
+# include <numaif.h>
+#ifdef __amd64__
+# include <asm/vsyscall.h>
+#endif
 
 #define MAX_PATH    (2 * K)
 
@@ -81,6 +86,15 @@
 char * os::Linux::_glibc_version = NULL;
 char * os::Linux::_libpthread_version = NULL;
 
+void (*os::Linux::_numa_interleave_memory)(void *, size_t, const nodemask_t *) = NULL;
+void (*os::Linux::_numa_setlocal_memory)(void *, size_t) = NULL;
+int (*os::Linux::_numa_max_node)(void) = NULL;
+nodemask_t (*os::Linux::_numa_get_run_node_mask)(void) = NULL;
+int (*os::Linux::_numa_available)(void) = NULL;
+int (*os::Linux::_getcpu)(unsigned *, unsigned *, void *) = NULL;
+int (*os::Linux::_numa_node_to_cpus)(int node, unsigned long *buffer, int buffer_len) = NULL;
+bool os::Linux::getcpu_broken = false;
+
 static jlong initial_time_count=0;
 
 static int clock_tics_per_sec = 100;
@@ -739,10 +753,7 @@
   osthread->set_thread_id(os::Linux::gettid());
 
   if (UseNUMA) {
-    int lgrp_id = os::numa_get_group_id();
-    if (lgrp_id != -1) {
-      thread->set_lgrp_id(lgrp_id);
-    }
+    thread->set_lgrp_id(-1);
   }
   // initialize signal mask for this thread
   os::Linux::hotspot_sigmask(thread);
@@ -916,10 +927,10 @@
   thread->set_osthread(osthread);
 
   if (UseNUMA) {
-    int lgrp_id = os::numa_get_group_id();
-    if (lgrp_id != -1) {
-      thread->set_lgrp_id(lgrp_id);
-    }
+    // Let the lgrp id be retrieved lazily on first access in thread context.
+    // Really this needs a timeout, or getcpu() should be called with
+    // the per-thread cache.
+    thread->set_lgrp_id(-1);
   }
 
   if (os::Linux::is_initial_thread()) {
@@ -2224,25 +2231,230 @@
 
 void os::realign_memory(char *addr, size_t bytes, size_t alignment_hint) { }
 void os::free_memory(char *addr, size_t bytes)         { }
-void os::numa_make_global(char *addr, size_t bytes)    { }
-void os::numa_make_local(char *addr, size_t bytes)     { }
-bool os::numa_topology_changed()                       { return false; }
-size_t os::numa_get_groups_num()                       { return 1; }
-int os::numa_get_group_id()                            { return 0; }
-size_t os::numa_get_leaf_groups(int *ids, size_t size) {
-  if (size > 0) {
-    ids[0] = 0;
-    return 1;
+
+#ifndef __amd64__ // x86-64 has a vsyscall
+
+#ifndef SYS_getcpu
+#ifdef __i386__
+#define SYS_getcpu 318
+#else
+#error define getcpu for architecture
+#endif
+#endif
+
+static int getcpu_syscall(unsigned *cpu, unsigned *node, void *cache)
+{
+  return syscall(SYS_getcpu, cpu, node, cache);
+}
+#endif
+
+static __thread nodemask_t current_node_mask;
+
+void os::Linux::numa_init(void)
+{
+  int err = 0;
+  // load libnuma lazily because it is not on all systems
+  void *lnuma = dlopen("libnuma.so.1", RTLD_LAZY);
+  if (!lnuma) { 
+    warning("NUMA requested but cannot open libnuma.so.1. NUMA disabled");
+    UseNUMA = false;
+    return;
+  }
+
+#define NSYM(sym) \
+  { typedef typeof(sym) f; \
+    _##sym = (f *)dlsym(lnuma, #sym); err += (_##sym == NULL); \
+  }
+  NSYM(numa_available);
+  NSYM(numa_interleave_memory);
+  NSYM(numa_setlocal_memory);
+  NSYM(numa_get_run_node_mask);
+  NSYM(numa_max_node);
+  NSYM(numa_node_to_cpus);
+  if (err) { 
+    warning("NUMA requested but cannot find required symbol in libnuma. NUMA disabled");
+    UseNUMA = false;
+    return;
+  } 
+#undef NSYM
+  // libnuma is never unloaded
+    
+#ifdef __x86_64__
+  // will return ENOSYS or work on all Linux x86-64 kernels
+  _getcpu = (int (*)(unsigned *,unsigned *,void *))VSYSCALL_ADDR(2);
+#else
+  _getcpu = getcpu_syscall; 
+#endif
+
+  if (_numa_available() < 0) { 
+    // don't warn here for now; this could simply be a non-NUMA system
+    UseNUMA = false;
+    return;
+  }
+
+  current_node_mask = _numa_get_run_node_mask();
+}
+
+// Make pages global: interleave over the current cpuset limit (the actual call is left disabled below)
+void os::numa_make_global(char *addr, size_t bytes) {
+//  os::Linux::_numa_interleave_memory(addr, bytes, &current_node_mask);   
+}
+
+// Local memory is the default, but set it anyway in case the global
+// policy was set differently by numactl.
+// This only sets a first-touch policy.
+// Linux also supports real page migration, but the NUMA allocator
+// here doesn't seem to use it.
+void os::numa_make_local(char *addr, size_t bytes) {
+  os::Linux::_numa_setlocal_memory(addr, bytes);
+}
+
+// We just return true if the cpuset changed.
+// RED-PEN this should probably be per-thread.
+bool os::numa_topology_changed() { 
+  nodemask_t newmask = os::Linux::_numa_get_run_node_mask();
+  if (nodemask_equal(&newmask, &current_node_mask))
+    return false;
+  fprintf(stderr,"numa topology changed\n");
+  current_node_mask = newmask;
+  return true;
+}
+
+size_t os::numa_get_groups_num() {
+  // older versions of libnuma are not cpuset aware;
+  // compute the group count from the run node mask instead
+  nodemask_t mask = os::Linux::_numa_get_run_node_mask();
+  int i, k;
+  k = 0;
+  for (i = 0; i < NUMA_NUM_NODES; i++)
+    if (nodemask_isset(&mask, i))
+      k++;
+  return k;
+}
+
+// copied here because the header file is still often missing
+struct getcpu_cache { 
+	unsigned long blob[128 / sizeof(long)];	
+};
+static __thread getcpu_cache node_cache;
+
+// slow and complicated fallback method when getcpu is not working
+// we do this only once per thread
+int os::Linux::fallback_get_group_id() {
+  FILE *f = fopen("/proc/self/stat", "r");
+  if (!f)
+    return 0;
+
+  size_t linesz = 0;
+  char *line = NULL; 
+  
+  int n = getline(&line, &linesz, f);
+  fclose(f);
+  if (n <= 0) { 
+    free(line);
+    return 0;
+  }
+  
+  // find processor field
+  
+  // skip non numbers (3 fields) at the beginning
+  char ch; 
+  int offset;   // %n stores into an int
+  if (sscanf(line, "%*d %*s %c%n", &ch, &offset) != 1) {
+    free(line);
+    return 0;
+  }
+
+  // process numbers; processor is the 39th field
+  char *p = line + offset;
+  int i, cpu;
+  for (i = 0; i < 39-3; i++) {
+    char *end;
+    cpu = strtol(p, &end, 0);
+    if (p == end) {
+      cpu = 0;
+      break;
+    }
+    p = end;
+  }
+  free(line);
+  
+  // convert to node using libnuma
+  int max_node = os::Linux::_numa_max_node();
+  for (i = 0; i <= max_node; i++) {
+    unsigned long cpus[128]; 
+    if (os::Linux::_numa_node_to_cpus(i, cpus, sizeof(cpus)) < 0) 
+      continue;
+    if (cpus[cpu / (8 * sizeof(long))] & (1UL << (cpu % (8 * sizeof(long))))) {
+      return i;
+    }
   }
   return 0;
 }
 
+int os::numa_get_group_id() {
+  // fast method, only available in Linux 2.6.19+
+  // this should be fast enough that it can be done every time
+  unsigned cpu, node;
+  if (!os::Linux::getcpu_broken) {
+    if (os::Linux::_getcpu(&cpu, &node, &node_cache) == 0)
+      return node;  
+    os::Linux::getcpu_broken = true;
+  }
+
+  // Otherwise use the fallback once and then keep using the
+  // cached value.
+  // It would be better to have some kind of timeout,
+  // but I don't know of a fast way to do this.
+  int id = Thread::current()->lgrp_id();
+  if (id != -1)
+    return id;
+
+  id = os::Linux::fallback_get_group_id();
+  Thread::current()->set_lgrp_id(id);
+  return id;
+}
+
+size_t os::numa_get_leaf_groups(int *ids, size_t size) {
+  nodemask_t nodes;
+  nodes = os::Linux::_numa_get_run_node_mask();
+  unsigned i, k;
+  k = 0;
+  for (i = 0; i < NUMA_NUM_NODES; i++)
+    if (nodemask_isset(&nodes, i)) { 
+      // could happen when the cpuset shrinks during runtime I think
+      if (k >= size)
+	return k;
+      ids[k++] = i;
+    }
+  return k;
+}
+
 bool os::get_page_info(char *start, page_info* info) {
+  unsigned pol;
+  info->size = 0;
+  info->lgrp_id = -1;
+  // no nice way to detect huge pages here
+  if (syscall(SYS_get_mempolicy, &pol, NULL, 0, start, MPOL_F_NODE|MPOL_F_ADDR) == 0) { 
+    info->size = os::Linux::_page_size;
+    info->lgrp_id = pol;
+    return true;
+  }
   return false;
 }
 
+
+// Scan the pages from start to end until a page different than
+// the one described in the info parameter is encountered.
 char *os::scan_pages(char *start, char* end, page_info* page_expected, page_info* page_found) {
-  return end;
+  while (start < end) { 
+    if (!get_page_info(start, page_found))
+      return NULL;
+    if (page_expected->lgrp_id != page_found->lgrp_id)
+      return start;
+    start += os::Linux::_page_size;
+  }
+  return start;
 }
 
 bool os::uncommit_memory(char* addr, size_t size) {
@@ -3571,6 +3783,9 @@
   // initialize thread priority policy
   prio_init();
 
+  if (UseNUMA)
+    Linux::numa_init();
+
   return JNI_OK;
 }
 
diff -u openjdk/hotspot/src/os/linux/vm/os_linux.hpp-o openjdk/hotspot/src/os/linux/vm/os_linux.hpp
--- openjdk/hotspot/src/os/linux/vm/os_linux.hpp-o	2007-05-24 09:30:53.000000000 +0200
+++ openjdk/hotspot/src/os/linux/vm/os_linux.hpp	2007-05-29 03:05:23.000000000 +0200
@@ -52,6 +52,17 @@
   static int (*_clock_gettime)(clockid_t, struct timespec *);
   static int (*_pthread_getcpuclockid)(pthread_t, clockid_t *);
 
+  static void (*_numa_interleave_memory)(void *, size_t, const nodemask_t *);
+  static void (*_numa_setlocal_memory)(void *, size_t);
+  static nodemask_t (*_numa_get_run_node_mask)(void);
+  static int (*_numa_node_to_cpus)(int node, unsigned long *buffer, int buffer_len);
+  static int (*_numa_available)(void);
+  static int (*_numa_max_node)(void);
+  static int (*_getcpu)(unsigned *, unsigned *, void *);
+  static void numa_init();
+  static bool getcpu_broken; 
+  static int fallback_get_group_id();
+
   static address   _initial_thread_stack_bottom;
   static uintptr_t _initial_thread_stack_size;
 


