[PATCH] JDK NUMA Interleaving issue

amith pawar amith.pawar at gmail.com
Fri Nov 9 12:23:32 UTC 2018


Hi all,


The flag UseNUMA (or UseNUMAInterleaving) has to interleave the old gen, S1
and S2 regions (and any other) memory areas

on requested Numa nodes and it should not configure itself to access other
Numa nodes. This issue is observed only when Java

is allowed to run on fewer NUMA nodes than available on the system with
numactl membind and interleave options. Running

on all the nodes does not have any effect. This will cause some
applications (objects residing in old gen and survivor region) to

run slower on system with large Numa nodes.



I have described below in more detail for both numactl MEMBIND and
INTERLEAVE options. Addresses from both GC log and

process numa maps clearly shows that the JAVA process is configured to
access other memory nodes even though it is not allowed.



Two scenarios are:

          1. Numactl membind case : numactl -m 0-1, -N 0-1 <command and its
arguments>


Numa map shows these regions are INTERLEAVED ON ALL NODES instead of
specified Numa memory nodes 0 and 1.



----------------------------------- GC
HEAP--------------------------------------------------------------------------------------------------------------------------------------------------------------------

GC Options: -Xlog:gc*=info,gc+heap=debug

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[602.180s][debug][gc,heap        ] GC(20) Heap before GC invocations=21
(full 4): PSYoungGen      total 120604672K, used 11907587K
[0x00002afc4b200000, 0x00002b198b200000, 0x00002b198b200000)

[602.180s][debug][gc,heap        ] GC(20)   eden space 118525952K, 8% used
[0x00002afc4b200000,0x00002b0bb1b376e0,0x00002b188d600000)

[602.180s][debug][gc,heap        ] GC(20)     lgrp 0 space 59262976K, 8%
used [0x00002afc4b200000,0x00002afd89bef450,0x00002b0a6c400000)

[602.180s][debug][gc,heap        ] GC(20)     lgrp 1 space 59262976K, 8%
used [0x00002b0a6c400000,0x00002b0bb1b376e0,0x00002b188d600000)

[602.180s][debug][gc,heap        ] GC(20)   from space 2078720K, 65% used
[0x00002b190c400000,0x00002b195ef5a0d0,0x00002b198b200000)

[602.180s][debug][gc,heap        ] GC(20)   to   space 2078720K, 0% used
[0x00002b188d600000,0x00002b188d600000,0x00002b190c400000)

[602.180s][debug][gc,heap        ] GC(20)  ParOldGen       total 2097152K,
used 226685K [0x00002afbcb200000, 0x00002afc4b200000, 0x00002afc4b200000)

[602.180s][debug][gc,heap        ] GC(20)   object space 2097152K, 10% used
[0x00002afbcb200000,0x00002afbd8f5f460,0x00002afc4b200000)

[602.180s][debug][gc,heap        ] GC(20)  Metaspace       used 28462K,
capacity 29008K, committed 29184K, reserved 30720K

----------------------------------- GC HEAP
END--------------------------------------------------------------------------------------------------------------------------------------------------------------


---------------------------------- PROCESS NUMA Maps
---------------------------------------------------------------------------------------------------------------------------------------------------

Command : cat /proc/4947/numa_maps

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

    =======> Following addresses are interleaved on all nodes.

2afbb4f4c000 interleave:0-7 anon=16 dirty=16 N0=2 N1=2 N2=2 N3=2 N4=2 N5=2
N6=2 N7=2 kernelpagesize_kB=4

2afbb4f6c000 interleave:0-7

2afbb7e88000 interleave:0-7 anon=50 dirty=50 N0=7 N1=7 N2=6 N3=6 N4=6 N5=6
N6=6 N7=6 kernelpagesize_kB=4

2afbbc000000 interleave:0-7 anon=8704 dirty=8704 N0=1600 N1=1088 N2=1088
N3=576 N4=1088 N5=1088 N6=1088 N7=1088 kernelpagesize_kB=4

2afbc3be6000 interleave:0-7 anon=6682 dirty=6682 N0=1027 N1=1027 N2=515
N3=515 N4=515 N5=1027 N6=1028 N7=1028 kernelpagesize_kB=4

2afbcb000000 interleave:0-7 anon=50 dirty=50 N0=7 N1=7 N2=6 N3=6 N4=6 N5=6
N6=6 N7=6 kernelpagesize_kB=4

2afbcb200000 interleave:0-7 anon=524288 dirty=524288 N0=65536 N1=65536
N2=65536 N3=65536 N4=65536 N5=65536 N6=65536 N7=65536 kernelpagesize_kB=4

            ==> OLD GEN Address

2afc4b200000 prefer:0 anon=1536 dirty=1536 N0=1536 kernelpagesize_kB=4

2b0a6c400000 prefer:1 anon=512 dirty=512 N1=512 kernelpagesize_kB=4

2b188d600000 interleave:0-7 anon=1040384 dirty=1040384 N0=130048 N1=130048
N2=130048 N3=130048 N4=130048 N5=130048 N6=130048 N7=130048
kernelpagesize_kB=4

            ==> Survivor Region

2b198b600000 interleave:0-7 anon=60929 dirty=60929 N0=7233 N1=7744 N2=7232
N3=7744 N4=7744 N5=7744 N6=7744 N7=7744 kernelpagesize_kB=4

---------------------------------- PROCESS NUMA Maps
-----------------------------------------------------------------------------------------------------------------------------------------------------



---------------------------------- PROCESS status
-----------------------------------------------------------------------------------------------------------------------------------------------------------

Command : cat /proc/4947/status

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Cpus_allowed:   00000000,0000ffff,00000000,0000ffff

Cpus_allowed_list:      0-15,64-79

Mems_allowed:
00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,000000ff

Mems_allowed_list:      0-7

--------------------------------- PROCESS status
-------------------------------------------------------------------------------------------------------------------------------------------------------------




        2. NUMACTL Interleave  case: numactl -i 0-1, -N 0-1 <command and
its arguments>


NUMAmaps below shows interleaved on all nodes instead of specified Numa
memory nodes 0 and 1.

----------------------------------- GC
HEAP--------------------------------------------------------------------------------------------------------------------------------------------------------------------


GC Options: -Xlog:gc*=info,gc+heap=debug

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[2216.439s][debug][gc,heap        ] GC(159) Heap before GC invocations=160
(full 9): PSYoungGen      total 120604672K, used 30143454K
[0x00002b9d47c00000, 0x00002bba87c00000, 0x00002bba87c00000)

[2216.439s][debug][gc,heap        ] GC(159)   eden space 118525952K, 24%
used [0x00002b9d47c00000,0x00002ba458400000,0x00002bb98a000000)

[2216.439s][debug][gc,heap        ] GC(159)     lgrp 0 space 14815232K, 98%
used [0x00002b9d47c00000,0x00002ba0beb87c90,0x00002ba0d0000000)

[2216.439s][debug][gc,heap        ] GC(159)     lgrp 1 space 14815232K,
100% used [0x00002ba0d0000000,0x00002ba458400000,0x00002ba458400000)

       ==> Memory allocated on the following nodes is unused.

[2216.439s][debug][gc,heap        ] GC(159)     lgrp 2 space 14815232K, 0%
used [0x00002ba458400000,0x00002ba458400000,0x00002ba7e0800000)

[2216.439s][debug][gc,heap        ] GC(159)     lgrp 3 space 14815232K, 0%
used [0x00002ba7e0800000,0x00002ba7e0800000,0x00002bab68c00000)

[2216.439s][debug][gc,heap        ] GC(159)     lgrp 4 space 14815232K, 0%
used [0x00002bab68c00000,0x00002bab68c00000,0x00002baef1000000)

[2216.439s][debug][gc,heap        ] GC(159)     lgrp 5 space 14815232K, 0%
used [0x00002baef1000000,0x00002baef1000000,0x00002bb279400000)

[2216.439s][debug][gc,heap        ] GC(159)     lgrp 6 space 14815232K, 0%
used [0x00002bb279400000,0x00002bb279400000,0x00002bb601800000)

[2216.439s][debug][gc,heap        ] GC(159)     lgrp 7 space 14819328K, 0%
used [0x00002bb601800000,0x00002bb601800000,0x00002bb98a000000)

[2216.439s][debug][gc,heap        ] GC(159)   from space 2078720K, 38% used
[0x00002bba08e00000,0x00002bba3976fb70,0x00002bba87c00000)

[2216.439s][debug][gc,heap        ] GC(159)   to   space 2078720K, 0% used
[0x00002bb98a000000,0x00002bb98a000000,0x00002bba08e00000)

[2216.439s][debug][gc,heap        ] GC(159)  ParOldGen       total
2097152K, used 685229K [0x00002b9cc7c00000, 0x00002b9d47c00000,
0x00002b9d47c00000)

[2216.439s][debug][gc,heap        ] GC(159)   object space 2097152K, 32%
used [0x00002b9cc7c00000,0x00002b9cf192b6e8,0x00002b9d47c00000)

[2216.439s][debug][gc,heap        ] GC(159)  Metaspace       used 28753K,
capacity 29257K, committed 29440K, reserved 30720K

----------------------------------- GC HEAP
END-------------------------------------------------------------------------------------------------------------------------------------------------------------


---------------------------------- PROCESS NUMA Maps
--------------------------------------------------------------------------------------------------------------------------------------------------

Command : cat /proc/pid/numa_maps

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

   ==> Following addresses are interleaved on all the nodes.

2b9cb1992000 interleave:0-7 anon=16 dirty=16 N0=2 N1=2 N2=2 N3=2 N4=2 N5=2
N6=2 N7=2 kernelpagesize_kB=4

2b9cb19b2000 interleave:0-7

2b9cb3e65000 interleave:0-7 anon=50 dirty=50 N0=6 N1=6 N2=6 N3=6 N4=6 N5=7
N6=7 N7=6 kernelpagesize_kB=4

2b9cb8a69000 interleave:0-7 anon=8599 dirty=8599 N0=626 N1=1139 N2=1139
N3=1139 N4=1139 N5=1139 N6=1139 N7=1139 kernelpagesize_kB=4

2b9cc064f000 interleave:0-7 anon=6577 dirty=6577 N0=566 N1=566 N2=566
N3=1078 N4=1078 N5=1078 N6=1078 N7=567 kernelpagesize_kB=4

2b9cc7a69000 interleave:0-7 anon=50 dirty=50 N0=6 N1=7 N2=7 N3=6 N4=6 N5=6
N6=6 N7=6 kernelpagesize_kB=4

2b9cc7c00000 interleave:0-7 anon=524288 dirty=524288 N0=65536 N1=65536
N2=65536 N3=65536 N4=65536 N5=65536 N6=65536 N7=65536 kernelpagesize_kB=4

2b9d47c00000 prefer:0 anon=2560 dirty=2560 N0=2560 kernelpagesize_kB=4
   ==> Logical group 1

2ba0d0000000
prefer:1
==> Logical group 2

2ba458400000 prefer:2

==> This one and below all are unnecessary and leaving memory unused.

2ba7e0800000 prefer:3

2bab68c00000 prefer:4

2baef1000000 prefer:5

2bb279400000 prefer:6

2bb601800000 prefer:7

2bb98a000000 interleave:0-7 anon=1040384 dirty=1040384 N0=130048 N1=130048
N2=130048 N3=130048 N4=130048 N5=130048 N6=130048 N7=130048
kernelpagesize_kB=4

2bba88000000 interleave:0-7 anon=60929 dirty=60929 N0=7745 N1=7744 N2=7744
N3=7744 N4=7744 N5=7232 N6=7744 N7=7232 kernelpagesize_kB=4

---------------------------------- PROCESS NUMA Maps
--------------------------------------------------------------------------------------------------------------------------------------------------



 --------------------------------- PROCESS status
-----------------------------------------------------------------------------------------------------------------------------------------------------------

Command : cat /proc/pid/status

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Cpus_allowed:   00000000,0000ffff,00000000,0000ffff

Cpus_allowed_list:      0-15,64-79

Mems_allowed:
00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,000000ff

Mems_allowed_list:      0-7

 --------------------------------- PROCESS status
-----------------------------------------------------------------------------------------------------------------------------------------------------------






Please note that the “Mems_allowed” and “Mems_allowed_list” fields also show
an incorrect range. This could be a LIBNUMA issue in the membind case.

When the application is externally bound to fewer nodes, calls to libNUMA
APIs SHOULD FAIL to interleave on all nodes, but right now they

do not with the APIs currently used/called in the JDK.



How does the patch work?

The patch gets bitmask structure by calling following API's (Man page
definition for these functions are also given below).

 1. For Membind : Use numa_get_membind to get membind bitmask (already used
in the code)


    numa_get_membind() returns the mask of nodes from which memory can
currently be allocated.  If the returned mask is equal to numa_all_nodes,
then memory allocation is allowed from all nodes.

 2. For Interleave: use numa_get_interleave_mask to get interleave mask
(currently not used/called in JDK)
     numa_get_interleave_mask() returns the current interleave mask if the
task's memory allocation policy is page interleaved.  Otherwise, this
function returns an empty mask.



Check the node counts from both bitmasks to identify the current running mode.
When the interleave mask is EMPTY, the process is running in membind mode;
otherwise it is in interleave mode.

Calling the “numa_interleave_memory” function (invoked indirectly through the
“numa_make_global” function) with the correct bitmask identified above fixes
this issue.





Improvement:

This patch is tested on EPYC with SPECJBB benchmark and score improvements
are given below.

1. For NUMACTL membind

    Max-jOPS improved by 5-12 % and Critical-jOPS by 2-6 %

2. For NUMACTL interleave (This patch fixes memory usage when invoked with
numactl -i)

    Max-jOPS by 5-15% and Critical-jOPS by 11-100%.

3. With this fix, flag UseNUMAInterleaving turning on/off has no effect
when externally interleaved through NUMACTL.

4. Flag UseNUMA score improved by ~14 % when enabled for single NUMA node.
Currently it gets disabled when externally bound to single node.





---------------------- PATCH --------------------------
diff --git a/src/hotspot/os/linux/os_linux.cpp
b/src/hotspot/os/linux/os_linux.cpp
--- a/src/hotspot/os/linux/os_linux.cpp
+++ b/src/hotspot/os/linux/os_linux.cpp
@@ -2720,6 +2720,8 @@
 }

 void os::numa_make_global(char *addr, size_t bytes) {
+  if (!UseNUMAInterleaving)
+    return ;
   Linux::numa_interleave_memory(addr, bytes);
 }

@@ -2785,6 +2787,15 @@
       ids[i++] = node;
     }
   }
+
+  if (Linux::_numa_interleave_ptr != NULL ) {
+    i = 0;
+    for (int node = 0; node <= highest_node_number; node++) {
+      if (Linux::_numa_bitmask_isbitset(Linux::_numa_interleave_ptr,
node)) {
+        ids[i++] = node;
+      }
+    }
+  }
   return i;
 }

@@ -2884,11 +2895,18 @@
                                        libnuma_dlsym(handle,
"numa_distance")));
       set_numa_get_membind(CAST_TO_FN_PTR(numa_get_membind_func_t,
                                           libnuma_v2_dlsym(handle,
"numa_get_membind")));
+
set_numa_get_interleave_mask(CAST_TO_FN_PTR(numa_get_interleave_mask_func_t,
+                                          libnuma_v2_dlsym(handle,
"numa_get_interleave_mask")));

       if (numa_available() != -1) {
+        struct bitmask *bmp;
         set_numa_all_nodes((unsigned long*)libnuma_dlsym(handle,
"numa_all_nodes"));
         set_numa_all_nodes_ptr((struct bitmask **)libnuma_dlsym(handle,
"numa_all_nodes_ptr"));
         set_numa_nodes_ptr((struct bitmask **)libnuma_dlsym(handle,
"numa_nodes_ptr"));
+        bmp = _numa_get_interleave_mask();
+        set_numa_interleave_ptr(&bmp);
+        bmp = _numa_get_membind();
+        set_numa_membind_ptr(&bmp);
         // Create an index -> node mapping, since nodes are not always
consecutive
         _nindex_to_node = new (ResourceObj::C_HEAP, mtInternal)
GrowableArray<int>(0, true);
         rebuild_nindex_to_node_map();
@@ -3015,9 +3033,12 @@
 os::Linux::numa_bitmask_isbitset_func_t os::Linux::_numa_bitmask_isbitset;
 os::Linux::numa_distance_func_t os::Linux::_numa_distance;
 os::Linux::numa_get_membind_func_t os::Linux::_numa_get_membind;
+os::Linux::numa_get_interleave_mask_func_t
os::Linux::_numa_get_interleave_mask;
 unsigned long* os::Linux::_numa_all_nodes;
 struct bitmask* os::Linux::_numa_all_nodes_ptr;
 struct bitmask* os::Linux::_numa_nodes_ptr;
+struct bitmask* os::Linux::_numa_interleave_ptr;
+struct bitmask* os::Linux::_numa_membind_ptr;

 bool os::pd_uncommit_memory(char* addr, size_t size) {
   uintptr_t res = (uintptr_t) ::mmap(addr, size, PROT_NONE,
@@ -4997,13 +5018,38 @@
     if (!Linux::libnuma_init()) {
       UseNUMA = false;
     } else {
-      if ((Linux::numa_max_node() < 1) || Linux::isbound_to_single_node())
{
-        // If there's only one node (they start from 0) or if the process
-        // is bound explicitly to a single node using membind, disable
NUMA.
-        UseNUMA = false;
+
+    // Identify whether running in membind or interleave mode.
+    struct bitmask *bmp;
+    bool _is_membind = false;
+    bool _is_interleaved = false;
+
+    // Check for membind mode.
+    bmp = Linux::_numa_membind_ptr;
+    for (int node = 0; node <= Linux::numa_max_node() ; node++) {
+      if (Linux::_numa_bitmask_isbitset(bmp, node)) {
+        _is_membind = true;
       }
     }

+    // Check for interleave mode.
+    bmp = Linux::_numa_interleave_ptr;
+    for (int node = 0; node <= Linux::numa_max_node() ; node++) {
+      if (Linux::_numa_bitmask_isbitset(bmp, node)) {
+        _is_interleaved = true;
+        // Set membind to false as interleave mode allows all nodes to be
used.
+        _is_membind = false;
+      }
+    }
+
+    if (_is_membind)
+      Linux::set_numa_interleave_ptr (NULL);
+
+    if (_is_interleaved)
+      Linux::set_numa_membind_ptr (NULL);
+
+    }
+
     if (UseParallelGC && UseNUMA && UseLargePages &&
!can_commit_large_page_memory()) {
       // With SHM and HugeTLBFS large pages we cannot uncommit a page, so
there's no way
       // we can make the adaptive lgrp chunk resizing work. If the user
specified both
diff --git a/src/hotspot/os/linux/os_linux.hpp
b/src/hotspot/os/linux/os_linux.hpp
--- a/src/hotspot/os/linux/os_linux.hpp
+++ b/src/hotspot/os/linux/os_linux.hpp
@@ -222,6 +222,7 @@
   typedef void (*numa_interleave_memory_func_t)(void *start, size_t size,
unsigned long *nodemask);
   typedef void (*numa_interleave_memory_v2_func_t)(void *start, size_t
size, struct bitmask* mask);
   typedef struct bitmask* (*numa_get_membind_func_t)(void);
+  typedef struct bitmask* (*numa_get_interleave_mask_func_t)(void);

   typedef void (*numa_set_bind_policy_func_t)(int policy);
   typedef int (*numa_bitmask_isbitset_func_t)(struct bitmask *bmp,
unsigned int n);
@@ -239,9 +240,12 @@
   static numa_bitmask_isbitset_func_t _numa_bitmask_isbitset;
   static numa_distance_func_t _numa_distance;
   static numa_get_membind_func_t _numa_get_membind;
+  static numa_get_interleave_mask_func_t _numa_get_interleave_mask;
   static unsigned long* _numa_all_nodes;
   static struct bitmask* _numa_all_nodes_ptr;
   static struct bitmask* _numa_nodes_ptr;
+  static struct bitmask* _numa_interleave_ptr;
+  static struct bitmask* _numa_membind_ptr;

   static void set_sched_getcpu(sched_getcpu_func_t func) { _sched_getcpu =
func; }
   static void set_numa_node_to_cpus(numa_node_to_cpus_func_t func) {
_numa_node_to_cpus = func; }
@@ -255,9 +259,12 @@
   static void set_numa_bitmask_isbitset(numa_bitmask_isbitset_func_t func)
{ _numa_bitmask_isbitset = func; }
   static void set_numa_distance(numa_distance_func_t func) {
_numa_distance = func; }
   static void set_numa_get_membind(numa_get_membind_func_t func) {
_numa_get_membind = func; }
+  static void set_numa_get_interleave_mask(numa_get_interleave_mask_func_t
func) { _numa_get_interleave_mask = func; }
   static void set_numa_all_nodes(unsigned long* ptr) { _numa_all_nodes =
ptr; }
   static void set_numa_all_nodes_ptr(struct bitmask **ptr) {
_numa_all_nodes_ptr = (ptr == NULL ? NULL : *ptr); }
   static void set_numa_nodes_ptr(struct bitmask **ptr) { _numa_nodes_ptr =
(ptr == NULL ? NULL : *ptr); }
+  static void set_numa_interleave_ptr(struct bitmask **ptr) {
_numa_interleave_ptr = (ptr == NULL ? NULL : *ptr); }
+  static void set_numa_membind_ptr(struct bitmask **ptr) {
_numa_membind_ptr = (ptr == NULL ? NULL : *ptr); }
   static int sched_getcpu_syscall(void);
  public:
   static int sched_getcpu()  { return _sched_getcpu != NULL ?
_sched_getcpu() : -1; }
@@ -275,7 +282,10 @@
   static void numa_interleave_memory(void *start, size_t size) {
     // Use v2 api if available
     if (_numa_interleave_memory_v2 != NULL && _numa_all_nodes_ptr != NULL)
{
-      _numa_interleave_memory_v2(start, size, _numa_all_nodes_ptr);
+      if (_numa_interleave_ptr != NULL)
+        _numa_interleave_memory_v2(start, size, _numa_interleave_ptr);
+      else
+        _numa_interleave_memory_v2(start, size, _numa_membind_ptr);
     } else if (_numa_interleave_memory != NULL && _numa_all_nodes != NULL)
{
       _numa_interleave_memory(start, size, _numa_all_nodes);
     }
----------------------- PATCH -----------------------------------


Thanks
Amit Pawar


More information about the hotspot-dev mailing list