RFR: 8352075: Perf regression accessing fields [v23]

Tue Jun 10 14:22:39 UTC 2025

On Tue, 10 Jun 2025 12:49:55 GMT, Johan Sjölen <jsjolen at openjdk.org> wrote:

>> @coleenp Can't find the comment to reply... I've replaced all `_r.next_uint()` with just `next_uint()`, it's a bikeshed argument.
>
> Hi @rvansa ,
> 
> What about this type of API for dealing with the compressed table? We do the 8 byte accesses as unsigned chars (important so that they're 0-extended and not sign extended) and write the compressed KV down in little-endian. We use bitwise OR to not squash whatever was there before. Comparing GCC x64 and PPC (power), it looks good.
> 
> ```c++
> #include <cstdint>
> #include <cstdio>
> #include <cstdlib>
> #include <cstring>
> 
> 
> // For this hard-coded example we use 2 byte keys
> // and 3 byte values -- for an element_bytes of 5
> uint32_t key_mask = (1 << 16) - 1;
> uint32_t value_mask = (1 << 24) - 1;
> uint32_t value_shift = 16;
> 
> uint32_t element_bytes = 5;
> 
> uint64_t* kv_array;
> 
> uint32_t unpack_key(uint64_t kv) {
>   return kv & key_mask;
> }
> uint32_t unpack_value(uint64_t kv) { return (kv >> value_shift) & value_mask; }
> 
> uint64_t pack_kv(uint64_t k, uint64_t v) {
>   return k | (v << value_shift);
> }
> 
> 
> uint32_t align_down(uint32_t x, uint32_t align) { return x & -align; }
> uint32_t align_up(uint32_t x, uint32_t align) {
>   return (x + (align - 1)) & -align;
> }
> 
> uint64_t u64s_required(int n) {
>   uint32_t bytes_required = element_bytes * n;
>   return align_up(bytes_required, 8) / 8;
> }
> 
> uint64_t read_u8(uint8_t* p) {
>   uint64_t result = 0;
>   result |= ((uint64_t)*(p + 0)) << (0 * 8);
>   result |= ((uint64_t)*(p + 1)) << (1 * 8);
>   result |= ((uint64_t)*(p + 2)) << (2 * 8);
>   result |= ((uint64_t)*(p + 3)) << (3 * 8);
>   result |= ((uint64_t)*(p + 4)) << (4 * 8);
>   result |= ((uint64_t)*(p + 5)) << (5 * 8);
>   result |= ((uint64_t)*(p + 6)) << (6 * 8);
>   result |= ((uint64_t)*(p + 7)) << (7 * 8);
>   return result;
> }
> 
> void write_u8(uint8_t* p, uint64_t u8) {
>   p[0] |= u8             & 0xFF;
>   p[1] |= (u8 >> 8*1) & 0xFF;
>   p[2] |= (u8 >> 8*2) & 0xFF;
>   p[3] |= (u8 >> 8*3) & 0xFF;
>   p[4] |= (u8 >> 8*4) & 0xFF;
>   p[5] |= (u8 >> 8*5) & 0xFF;
>   p[6] |= (u8 >> 8*6) & 0xFF;
>   p[7] |= (u8 >> 8*7) & 0xFF;
> }
> 
> uint64_t read(int n) {
>   uint32_t byte_index = element_bytes * n;
>   return read_u8(&reinterpret_cast<uint8_t*>(kv_array)[byte_index]);
> }
> 
> void fill(int n, uint64_t kv) {
>   uint32_t byte_index = element_bytes * n;
>   write_u8(&reinterpret_cast<uint8_t*>(kv_array)[byte_index], kv);
>   return;
> }
> 
> int main() {
>   int num_elts = 65536;
>   uint64_t sz = u64s_required(num_elts);
>   kv_array = (uint64_t*)malloc(sz * 8);
>   for (int i = 0; i < sz; i++) {
>     kv_array[i] = 0;
>   }
> 
>   for (int i = 0; i < num_elts; i++) {
>     uint64_t kv = pack_kv(i, 2 * i);
>     fill(i, kv);
>   }
>   for (int i = 0; i < num_...

> Shouldn't the Copy package or memcpy do all of this? @jdksjolen I don't think this should be in the packedTable code.

> @jdksjolen We'd need to solve the case without padding at the end of the table (or add the padding to ensure that we're not accessing past allocated area). However, I did not really get what problem are you trying to solve?

My worry was that `memcpy` would have incorrect semantics when on a BE system. As the code no longer uses memcpy, we're fine.

-------------

PR Comment: https://git.openjdk.org/jdk/pull/24847#issuecomment-2959448541