UseNUMA membind Issue in OpenJDK

Swati Sharma swatibits14 at gmail.com
Fri Jun 1 11:10:28 UTC 2018


Hi Gustavo,

I will fix the thread binding issue in a separate patch.

I have updated the previous patch by removing the structure and using the
methods provided by the numa API. Here is the updated patch with the changes
(also attached).
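
For anyone who wants to exercise the same detection logic outside HotSpot
first, below is a minimal standalone sketch (my own illustration, not part of
the patch) using only documented libnuma v2 calls; build with
'gcc check_membind.c -lnuma' (the file name is just an example):

  #include <stdio.h>
  #include <numa.h>

  int main(void) {
    if (numa_available() == -1) {
      printf("libnuma is not available\n");
      return 1;
    }
    // Fetch the current membind mask and count the nodes set in it,
    // the same way the patch does in isbound_to_single_node().
    struct bitmask* bmp = numa_get_membind();
    unsigned int max_nodes = numa_bitmask_nbytes(bmp) * 8;
    int bound_nodes = 0;
    for (unsigned int node = 0; node < max_nodes; node++) {
      if (numa_bitmask_isbitset(bmp, node)) {
        bound_nodes++;
      }
    }
    printf("bound to %d node(s)\n", bound_nodes);
    numa_bitmask_free(bmp);
    return 0;
  }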

========================PATCH=========================
diff --git a/src/hotspot/os/linux/os_linux.cpp b/src/hotspot/os/linux/os_linux.cpp
--- a/src/hotspot/os/linux/os_linux.cpp
+++ b/src/hotspot/os/linux/os_linux.cpp
@@ -2831,9 +2831,10 @@

   // Map all node ids in which is possible to allocate memory. Also nodes are
   // not always consecutively available, i.e. available from 0 to the highest
-  // node number.
+  // node number. If the nodes have been bound explicitly using numactl membind,
+  // then allocate memory from those nodes only.
   for (size_t node = 0; node <= highest_node_number; node++) {
-    if (Linux::isnode_in_configured_nodes(node)) {
+    if (Linux::isnode_in_bound_nodes(node)) {
       ids[i++] = node;
     }
   }
@@ -2930,6 +2931,12 @@
                                                libnuma_dlsym(handle, "numa_bitmask_isbitset")));
       set_numa_distance(CAST_TO_FN_PTR(numa_distance_func_t,
                                        libnuma_dlsym(handle, "numa_distance")));
+      set_numa_set_membind(CAST_TO_FN_PTR(numa_set_membind_func_t,
+                                          libnuma_dlsym(handle, "numa_set_membind")));
+      set_numa_get_membind(CAST_TO_FN_PTR(numa_get_membind_func_t,
+                                          libnuma_v2_dlsym(handle, "numa_get_membind")));
+      set_numa_bitmask_nbytes(CAST_TO_FN_PTR(numa_bitmask_nbytes_func_t,
+                                             libnuma_dlsym(handle, "numa_bitmask_nbytes")));

       if (numa_available() != -1) {
         set_numa_all_nodes((unsigned long*)libnuma_dlsym(handle, "numa_all_nodes"));
@@ -3054,6 +3061,9 @@
 os::Linux::numa_set_bind_policy_func_t os::Linux::_numa_set_bind_policy;
 os::Linux::numa_bitmask_isbitset_func_t os::Linux::_numa_bitmask_isbitset;
 os::Linux::numa_distance_func_t os::Linux::_numa_distance;
+os::Linux::numa_set_membind_func_t os::Linux::_numa_set_membind;
+os::Linux::numa_get_membind_func_t os::Linux::_numa_get_membind;
+os::Linux::numa_bitmask_nbytes_func_t os::Linux::_numa_bitmask_nbytes;
 unsigned long* os::Linux::_numa_all_nodes;
 struct bitmask* os::Linux::_numa_all_nodes_ptr;
 struct bitmask* os::Linux::_numa_nodes_ptr;
@@ -4962,8 +4972,9 @@
     if (!Linux::libnuma_init()) {
       UseNUMA = false;
     } else {
-      if ((Linux::numa_max_node() < 1)) {
-        // There's only one node(they start from 0), disable NUMA.
+      if ((Linux::numa_max_node() < 1) || Linux::isbound_to_single_node()) {
+        // If there's only one node(they start from 0) or if the process
+        // is bound explicitly to a single node using membind, disable NUMA.
         UseNUMA = false;
       }
     }
diff --git a/src/hotspot/os/linux/os_linux.hpp b/src/hotspot/os/linux/os_linux.hpp
--- a/src/hotspot/os/linux/os_linux.hpp
+++ b/src/hotspot/os/linux/os_linux.hpp
@@ -228,6 +228,9 @@
   typedef int (*numa_tonode_memory_func_t)(void *start, size_t size, int node);
   typedef void (*numa_interleave_memory_func_t)(void *start, size_t size, unsigned long *nodemask);
   typedef void (*numa_interleave_memory_v2_func_t)(void *start, size_t size, struct bitmask* mask);
+  typedef void (*numa_set_membind_func_t)(struct bitmask *mask);
+  typedef struct bitmask* (*numa_get_membind_func_t)(void);
+  typedef unsigned int (*numa_bitmask_nbytes_func_t)(struct bitmask *mask);

   typedef void (*numa_set_bind_policy_func_t)(int policy);
   typedef int (*numa_bitmask_isbitset_func_t)(struct bitmask *bmp, unsigned int n);
@@ -244,6 +247,9 @@
   static numa_set_bind_policy_func_t _numa_set_bind_policy;
   static numa_bitmask_isbitset_func_t _numa_bitmask_isbitset;
   static numa_distance_func_t _numa_distance;
+  static numa_set_membind_func_t _numa_set_membind;
+  static numa_get_membind_func_t _numa_get_membind;
+  static numa_bitmask_nbytes_func_t _numa_bitmask_nbytes;
   static unsigned long* _numa_all_nodes;
   static struct bitmask* _numa_all_nodes_ptr;
   static struct bitmask* _numa_nodes_ptr;
@@ -259,6 +265,9 @@
   static void set_numa_set_bind_policy(numa_set_bind_policy_func_t func) { _numa_set_bind_policy = func; }
   static void set_numa_bitmask_isbitset(numa_bitmask_isbitset_func_t func) { _numa_bitmask_isbitset = func; }
   static void set_numa_distance(numa_distance_func_t func) { _numa_distance = func; }
+  static void set_numa_set_membind(numa_set_membind_func_t func) { _numa_set_membind = func; }
+  static void set_numa_get_membind(numa_get_membind_func_t func) { _numa_get_membind = func; }
+  static void set_numa_bitmask_nbytes(numa_bitmask_nbytes_func_t func) { _numa_bitmask_nbytes = func; }
   static void set_numa_all_nodes(unsigned long* ptr) { _numa_all_nodes = ptr; }
   static void set_numa_all_nodes_ptr(struct bitmask **ptr) { _numa_all_nodes_ptr = (ptr == NULL ? NULL : *ptr); }
   static void set_numa_nodes_ptr(struct bitmask **ptr) { _numa_nodes_ptr = (ptr == NULL ? NULL : *ptr); }
@@ -320,6 +329,41 @@
     } else
       return 0;
   }
+  // Check if node is in bound node set.
+  static bool isnode_in_bound_nodes(int node) {
+    if (_numa_get_membind != NULL && _numa_bitmask_isbitset != NULL) {
+      return _numa_bitmask_isbitset(_numa_get_membind(), node);
+    } else {
+      return false;
+    }
+  }
+  // Check if bound to only one numa node.
+  // Returns true if bound to a single numa node, otherwise returns false.
+  static bool isbound_to_single_node() {
+    int single_node = 0;
+    struct bitmask* bmp = NULL;
+    unsigned int node = 0;
+    unsigned int max_number_of_nodes = 0;
+    if (_numa_get_membind != NULL && _numa_bitmask_nbytes != NULL) {
+      bmp = _numa_get_membind();
+      max_number_of_nodes = _numa_bitmask_nbytes(bmp) * 8;
+    } else {
+      return false;
+    }
+    for (node = 0; node < max_number_of_nodes; node++) {
+      if (_numa_bitmask_isbitset(bmp, node)) {
+        single_node++;
+        if (single_node == 2) {
+          return false;
+        }
+      }
+    }
+    if (single_node == 1) {
+      return true;
+    } else {
+      return false;
+    }
+  }
 };

 #endif // OS_LINUX_VM_OS_LINUX_HPP
=======================================================
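
To see the effect of the patch (assuming a machine with at least two NUMA
nodes; node ids 0 and 1 below are just an example), one can compare:

  # Bound to a single node: with the patch, UseNUMA should be turned off.
  numactl --membind=0 java -XX:+UseNUMA -XX:+PrintFlagsFinal -version | grep UseNUMA
  # Bound to two nodes: UseNUMA stays on, allocating only from nodes 0 and 1.
  numactl --membind=0,1 java -XX:+UseNUMA -XX:+PrintFlagsFinal -version | grep UseNUMA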

Swati



On Tue, May 29, 2018 at 6:53 PM, Gustavo Romero <gromero at linux.vnet.ibm.com> wrote:
>
> Hi Swati,
>
> On 05/29/2018 06:12 AM, Swati Sharma wrote:
>>
>> I have incorporated some changes suggested by you.
>>
>> The use of struct bitmask's maskp for checking 64 bits in a single iteration
>> is more optimized compared to numa_bitmask_isbitset(), as with that we would
>> need to check each bit individually, 1024 times (SUSE case) or 64 times
>> (Ubuntu case). If it's fine to iterate at initialization time then I can
>> change it.
>
>
> Yes, I know, your version is more optimized. The libnuma API should provide a
> ready-made solution for that... but that's another story. I'm curious to know
> what the time difference is in the worst case for both ways, though. Anyway, I
> just would like to point out that, regardless of performance, it's possible to
> achieve the same result with the current libnuma API.
>
>
>> For the answer to your question:
>> If it picks up node 16, not so bad, but what if it picks up node 0 or 1?
>> It can be checked based on numa_distance instead of picking up the lgrps
>> randomly.
>
>
> That seems like a good solution. You can do the checking very early, so that
> lgrp_spaces()->find() does not even fail (return -1), i.e. by changing the
> CPU-to-node mapping on initialization (avoiding changing cas_allocate()). In
> that checking, both the numa distance and whether the node is bound would be
> considered to generate the map.
>
>
> Best regards,
> Gustavo
>
>> Thanks,
>> Swati
>>
>>
>>
>> On Fri, May 25, 2018 at 4:54 AM, Gustavo Romero <gromero at linux.vnet.ibm.com> wrote:
>>
>>     Hi Swati,
>>
>>
>>     Thanks for CC:ing me. Sorry for the delay replying, I had to reserve a
>>     few specific machines before trying your patch :-)
>>
>>     I think that UseNUMA's original task was to figure out the best binding
>>     setup for the JVM automatically, but I understand that it also has to be
>>     aware that sometimes, for some (new) particular reasons, its binding task
>>     is "modulated" by other external agents. Thanks for proposing a fix.
>>
>>     I have just a question/concern on the proposal: how should the JVM behave
>>     if CPUs are not bound in accordance with the bound memory nodes? For
>>     instance, what happens if no '--cpunodebind' is passed and
>>     '--membind=0,1,16' is passed at the same time on this numa topology:
>>
>>     brianh at p215n12:~$ numactl -H
>>     available: 4 nodes (0-1,16-17)
>>     node 0 cpus: 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27 32 33 34 35
>>     node 0 size: 65342 MB
>>     node 0 free: 56902 MB
>>     node 1 cpus: 40 41 42 43 48 49 50 51 56 57 58 59 64 65 66 67 72 73 74 75
>>     node 1 size: 65447 MB
>>     node 1 free: 58322 MB
>>     node 16 cpus: 80 81 82 83 88 89 90 91 96 97 98 99 104 105 106 107 112 113 114 115
>>     node 16 size: 65448 MB
>>     node 16 free: 63096 MB
>>     node 17 cpus: 120 121 122 123 128 129 130 131 136 137 138 139 144 145 146 147 152 153 154 155
>>     node 17 size: 65175 MB
>>     node 17 free: 61522 MB
>>     node distances:
>>     node   0   1  16  17
>>        0:  10  20  40  40
>>        1:  20  10  40  40
>>       16:  40  40  10  20
>>       17:  40  40  20  10
>>
>>
>>     In that case the JVM will spawn threads that will run on all CPUs,
>>     including those CPUs in numa node 17. Then once in
>>     src/hotspot/share/gc/parallel/mutableNUMASpace.cpp, in cas_allocate():
>>
>>       834 // This version is lock-free.
>>       835 HeapWord* MutableNUMASpace::cas_allocate(size_t size) {
>>       836   Thread* thr = Thread::current();
>>       837   int lgrp_id = thr->lgrp_id();
>>       838   if (lgrp_id == -1 || !os::numa_has_group_homing()) {
>>       839     lgrp_id = os::numa_get_group_id();
>>       840     thr->set_lgrp_id(lgrp_id);
>>       841   }
>>
>>     a newly created thread will try to be mapped to a numa node given its
>>     CPU ID. So if that CPU is in numa node 17 it will then not find it in:
>>
>>       843   int i = lgrp_spaces()->find(&lgrp_id, LGRPSpace::equals);
>>
>>     and will fall back to a random map, picking up a random numa node among
>>     nodes 0, 1, and 16:
>>
>>       846   if (i == -1) {
>>       847     i = os::random() % lgrp_spaces()->length();
>>       848   }
>>
>>     If it picks up node 16, not so bad, but what if it picks up node 0 or 1?
>>
>>     I see that if one binds memory but leaves CPUs unbound one has to know
>>     exactly what she/he is doing, because it is likely suboptimal. On the
>>     other hand, letting the node be picked up randomly when there are memory
>>     nodes bound but no CPUs seems even more suboptimal in some scenarios.
>>     Thus, should the JVM deal with it?
>>
>>     @Zhengyu, do you have any opinion on that?
>>
>>     Please find a few nits / comments inline.
>>
>>     Note that I'm not a (R)eviewer so you still need two official reviews.
>>
>>
>>     Best regards,
>>     Gustavo
>>
>>     On 05/21/2018 01:44 PM, Swati Sharma wrote:
>>
>>         ======================PATCH==============================
>>         diff --git a/src/hotspot/os/linux/os_linux.cpp b/src/hotspot/os/linux/os_linux.cpp
>>         --- a/src/hotspot/os/linux/os_linux.cpp
>>         +++ b/src/hotspot/os/linux/os_linux.cpp
>>         @@ -2832,14 +2832,42 @@
>>              // Map all node ids in which is possible to allocate memory. Also nodes are
>>              // not always consecutively available, i.e. available from 0 to the highest
>>              // node number.
>>         +  // If the nodes have been bound explicitly using numactl membind, then
>>         +  // allocate memory from those nodes only.
>>
>>
>>     I think it's ok to place that comment on the same existing line, like:
>>
>>     -  // node number.
>>     +  // node number. If the nodes have been bound explicitly using numactl membind,
>>     +  // then allocate memory from these nodes only.
>>
>>
>>              for (size_t node = 0; node <= highest_node_number; node++) {
>>         -    if (Linux::isnode_in_configured_nodes(node)) {
>>         +    if (Linux::isnode_in_bounded_nodes(node)) {
>>
>>     ---------------------------------^ s/bounded/bound/
>>
>>
>>                  ids[i++] = node;
>>                }
>>              }
>>              return i;
>>            }
>>         +extern "C"  struct bitmask {
>>         +  unsigned long size; /* number of bits in the map */
>>         +  unsigned long *maskp;
>>         +};
>>
>>
>>     I think it's possible to move the function below to os_linux.hpp with its
>>     friends and cope with the forward declaration of 'struct bitmask*' by
>>     using the functions from the numa API, notably numa_bitmask_nbytes() and
>>     numa_bitmask_isbitset() only, avoiding the member dereferencing issue and
>>     the need to add the above struct explicitly.
>>
>>
>>         +// Check if single memory node bound.
>>         +// Returns true if single memory node bound.
>>
>>
>>     I suggest a minuscule improvement, something like:
>>
>>     +// Check if bound to only one numa node.
>>     +// Returns true if bound to a single numa node, otherwise returns false.
>>
>>
>>         +bool os::Linux::issingle_node_bound() {
>>
>>
>>     What about s/issingle_node_bound/isbound_to_single_node/ ?
>>
>>
>>         +  struct bitmask* bmp = _numa_get_membind != NULL ? _numa_get_membind() : NULL;
>>         +  if(!(bmp != NULL && bmp->maskp != NULL)) return false;
>>
>>     Are you sure this checking is necessary? I think if numa_get_membind
>>     succeeds, bmp->maskp is always != NULL.
>>
>>     Indentation here is odd: no space before 'if', and the return is on the
>>     same line.
>>
>>     I would try to avoid lines over 80 chars.
>>
>>
>>         +  int issingle = 0;
>>         +  // System can have more than 64 nodes so check in all the elements of
>>         +  // unsigned long array
>>         +  for (unsigned long i = 0; i < (bmp->size / (8 * sizeof(unsigned long))); i++) {
>>         +    if (bmp->maskp[i] == 0) {
>>         +      continue;
>>         +    } else if ((bmp->maskp[i] & (bmp->maskp[i] - 1)) == 0) {
>>         +      issingle++;
>>         +    } else {
>>         +      return false;
>>         +    }
>>         +  }
>>         +  if (issingle == 1)
>>         +    return true;
>>         +  return false;
>>         +}
>>         +
>>
>>
>>     As I mentioned, I think it could be moved to os_linux.hpp instead. Also,
>>     it could be something like:
>>
>>     +bool os::Linux::isbound_to_single_node(void) {
>>     +  struct bitmask* bmp;
>>     +  unsigned long mask; // a mask element in the mask array
>>     +  unsigned long max_num_masks;
>>     +  int single_node = 0;
>>     +
>>     +  if (_numa_get_membind != NULL) {
>>     +    bmp = _numa_get_membind();
>>     +  } else {
>>     +    return false;
>>     +  }
>>     +
>>     +  max_num_masks = bmp->size / (8 * sizeof(unsigned long));
>>     +
>>     +  for (mask = 0; mask < max_num_masks; mask++) {
>>     +    if (bmp->maskp[mask] != 0) { // at least one numa node in the mask
>>     +      if ((bmp->maskp[mask] & (bmp->maskp[mask] - 1)) == 0) {
>>     +        single_node++; // a single numa node in the mask
>>     +      } else {
>>     +        return false;
>>     +      }
>>     +    }
>>     +  }
>>     +
>>     +  if (single_node == 1) {
>>     +    return true; // only a single mask with a single numa node
>>     +  } else {
>>     +    return false;
>>     +  }
>>     +}
>>
>>
>>            bool os::get_page_info(char *start, page_info* info) {
>>              return false;
>>            }
>>         @@ -2930,6 +2958,10 @@
>>                                                  libnuma_dlsym(handle, "numa_bitmask_isbitset")));
>>                  set_numa_distance(CAST_TO_FN_PTR(numa_distance_func_t,
>>                                                   libnuma_dlsym(handle, "numa_distance")));
>>         +      set_numa_set_membind(CAST_TO_FN_PTR(numa_set_membind_func_t,
>>         +                                          libnuma_dlsym(handle, "numa_set_membind")));
>>         +      set_numa_get_membind(CAST_TO_FN_PTR(numa_get_membind_func_t,
>>         +                                          libnuma_v2_dlsym(handle, "numa_get_membind")));
>>                  if (numa_available() != -1) {
>>                    set_numa_all_nodes((unsigned long*)libnuma_dlsym(handle, "numa_all_nodes"));
>>         @@ -3054,6 +3086,8 @@
>>            os::Linux::numa_set_bind_policy_func_t os::Linux::_numa_set_bind_policy;
>>            os::Linux::numa_bitmask_isbitset_func_t os::Linux::_numa_bitmask_isbitset;
>>            os::Linux::numa_distance_func_t os::Linux::_numa_distance;
>>         +os::Linux::numa_set_membind_func_t os::Linux::_numa_set_membind;
>>         +os::Linux::numa_get_membind_func_t os::Linux::_numa_get_membind;
>>            unsigned long* os::Linux::_numa_all_nodes;
>>            struct bitmask* os::Linux::_numa_all_nodes_ptr;
>>            struct bitmask* os::Linux::_numa_nodes_ptr;
>>         @@ -4962,8 +4996,9 @@
>>                if (!Linux::libnuma_init()) {
>>                  UseNUMA = false;
>>                } else {
>>         -      if ((Linux::numa_max_node() < 1)) {
>>         -        // There's only one node(they start from 0), disable NUMA.
>>         +      if ((Linux::numa_max_node() < 1) || Linux::issingle_node_bound()) {
>>         +        // If there's only one node(they start from 0) or if the process
>>         +        // is bound explicitly to a single node using membind, disable NUMA.
>>                    UseNUMA = false;
>>                  }
>>                }
>>         diff --git a/src/hotspot/os/linux/os_linux.hpp b/src/hotspot/os/linux/os_linux.hpp
>>         --- a/src/hotspot/os/linux/os_linux.hpp
>>         +++ b/src/hotspot/os/linux/os_linux.hpp
>>         @@ -228,6 +228,8 @@
>>              typedef int (*numa_tonode_memory_func_t)(void *start, size_t size, int node);
>>              typedef void (*numa_interleave_memory_func_t)(void *start, size_t size, unsigned long *nodemask);
>>              typedef void (*numa_interleave_memory_v2_func_t)(void *start, size_t size, struct bitmask* mask);
>>         +  typedef void (*numa_set_membind_func_t)(struct bitmask *mask);
>>         +  typedef struct bitmask* (*numa_get_membind_func_t)(void);
>>              typedef void (*numa_set_bind_policy_func_t)(int policy);
>>              typedef int (*numa_bitmask_isbitset_func_t)(struct bitmask *bmp, unsigned int n);
>>         @@ -244,6 +246,8 @@
>>              static numa_set_bind_policy_func_t _numa_set_bind_policy;
>>              static numa_bitmask_isbitset_func_t _numa_bitmask_isbitset;
>>              static numa_distance_func_t _numa_distance;
>>         +  static numa_set_membind_func_t _numa_set_membind;
>>         +  static numa_get_membind_func_t _numa_get_membind;
>>              static unsigned long* _numa_all_nodes;
>>              static struct bitmask* _numa_all_nodes_ptr;
>>              static struct bitmask* _numa_nodes_ptr;
>>         @@ -259,6 +263,8 @@
>>              static void set_numa_set_bind_policy(numa_set_bind_policy_func_t func) { _numa_set_bind_policy = func; }
>>              static void set_numa_bitmask_isbitset(numa_bitmask_isbitset_func_t func) { _numa_bitmask_isbitset = func; }
>>              static void set_numa_distance(numa_distance_func_t func) { _numa_distance = func; }
>>         +  static void set_numa_set_membind(numa_set_membind_func_t func) { _numa_set_membind = func; }
>>         +  static void set_numa_get_membind(numa_get_membind_func_t func) { _numa_get_membind = func; }
>>              static void set_numa_all_nodes(unsigned long* ptr) { _numa_all_nodes = ptr; }
>>              static void set_numa_all_nodes_ptr(struct bitmask **ptr) { _numa_all_nodes_ptr = (ptr == NULL ? NULL : *ptr); }
>>              static void set_numa_nodes_ptr(struct bitmask **ptr) { _numa_nodes_ptr = (ptr == NULL ? NULL : *ptr); }
>>         @@ -320,6 +326,15 @@
>>                } else
>>                  return 0;
>>              }
>>         +  // Check if node in bounded nodes
>>
>>
>>     +  // Check if node is in bound node set. Maybe?
>>
>>
>>         +  static bool isnode_in_bounded_nodes(int node) {
>>         +    struct bitmask* bmp = _numa_get_membind != NULL ? _numa_get_membind() : NULL;
>>         +    if (bmp != NULL && _numa_bitmask_isbitset != NULL && _numa_bitmask_isbitset(bmp, node)) {
>>         +      return true;
>>         +    } else
>>         +      return false;
>>         +  }
>>         +  static bool issingle_node_bound();
>>
>>
>>     Looks like it can be re-written like:
>>
>>     +  static bool isnode_in_bound_nodes(int node) {
>>     +    if (_numa_get_membind != NULL && _numa_bitmask_isbitset != NULL) {
>>     +      return _numa_bitmask_isbitset(_numa_get_membind(), node);
>>     +    } else {
>>     +      return false;
>>     +    }
>>     +  }
>>
>>     ?
>>
>>
>>            };
>>            #endif // OS_LINUX_VM_OS_LINUX_HPP
>>
>>
>>
>

