RFR: 8338591: Improve performance of MemorySegment::copy

Francesco Nigro duke at openjdk.org
Tue Sep 3 12:36:24 UTC 2024


On Tue, 3 Sep 2024 07:52:44 GMT, Per Minborg <pminborg at openjdk.org> wrote:

> This PR proposes to handle smaller FFM copy operations with Java code rather than transitioning to native code. This will improve performance. In this PR, copy operations involving zero to 63 bytes will be handled by Java code.
> 
> Here is what it looks like for Windows x64:
> 
> ![image](https://github.com/user-attachments/assets/6b31206e-3b24-4b34-bf38-a1be393186d3)
> 
> Here is another chart for Linux a64:
> 
> ![image](https://github.com/user-attachments/assets/b679bfac-670a-42a5-802b-2b17adf5ec79)
> 
> Other platforms exhibit similar behavior. It should be noted that the gain with this PR is pronounced for certain common sizes that are more likely to appear in code (e.g. 8, 16, 24, and 32)
> 
> It would be possible to use the same code path for the 7arg  `MemorySegment::copy` method if it is similar to:
> 
> 
> MemorySegment.copy(heapSrcSegment, JAVA_BYTE, 0, heapDstSegment, JAVA_BYTE, 0, ELEM_SIZE);
> 
> 
> This could be added in a separate PR.
> 
> This PR has been tested with tier1-3 and passed.

I would suggest this additional benchmark as well


import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.CompilerControl;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.BenchmarkParams;

import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.nio.ByteBuffer;
import java.util.concurrent.TimeUnit;

/**
 * JMH benchmark measuring the 5-argument {@code MemorySegment.copy} for small
 * copy lengths, with and without a "pollution" phase executed in {@link #setup}.
 *
 * <p>Each benchmark copies {@code ELEM_SIZE} bytes from offset 0 of a source
 * segment to offset 0 of a destination segment, either heap-to-heap or
 * native-to-native. Two variants of each exist: one left to the JIT's normal
 * inlining decisions and one annotated with
 * {@code CompilerControl.Mode.DONT_INLINE}.
 *
 * <p>When {@code polluteCopy} is {@code true}, {@link #setup} performs
 * additional copies over several heap/native segment combinations before
 * measurement starts. The stated intent (per the accompanying review comment)
 * is to pollute the type profile of the copy methods so that the measurement
 * reflects a mix of heap and off-heap segment types.
 */
@BenchmarkMode(Mode.AverageTime)
@Warmup(iterations = 5, time = 500, timeUnit = TimeUnit.MILLISECONDS)
@Measurement(iterations = 10, time = 500, timeUnit = TimeUnit.MILLISECONDS)
@State(Scope.Thread)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Fork(value = 3)
public class PolluteCopyTest {

    /**
     * Number of bytes copied per benchmark invocation; also the size of every
     * source and destination segment created in {@link #setup}.
     * Every size from 0 to 33 is covered, then a sparser selection up to 128.
     */
    @Param({"0", "1", "2", "3", "4", "5", "6", "7", "8",
            "9", "10", "11", "12", "13", "14", "15", "16",
            "17", "18", "19", "20", "21", "22", "23", "24",
            "25", "26", "27", "28", "29", "30", "31", "32",
            "33", "36", "40", "44", "48", "52", "56", "60", "63", "64", "128"})
    public int ELEM_SIZE;

    // Source/destination segments backed by Java byte arrays.
    MemorySegment heapSrcSegment;
    MemorySegment heapDstSegment;
    // Source/destination segments allocated from an automatic arena.
    MemorySegment nativeSrcSegment;
    MemorySegment nativeDstSegment;

    /**
     * When {@code true}, {@link #setup} runs extra copy calls before the
     * benchmark is measured; when {@code false}, no such calls are made.
     */
    @Param({"false", "true"})
    public boolean polluteCopy;

    /**
     * Allocates the four segments (each {@code ELEM_SIZE} bytes) and, if
     * {@code polluteCopy} is set, runs 15,000 iterations of copy calls before
     * measurement.
     *
     * <p>The set of copy calls depends on the name of the benchmark being run:
     * <ul>
     *   <li>if the name contains {@code "not_inlined"}, the two
     *       {@code DONT_INLINE} benchmark methods (heap-to-heap and
     *       native-to-native) are invoked;</li>
     *   <li>otherwise {@code MemorySegment.copy} is called directly for all four
     *       combinations: heap-to-heap, native-to-native, heap-to-native and
     *       native-to-heap.</li>
     * </ul>
     *
     * @param params JMH-provided parameters of the current run; only the
     *               benchmark name is read
     */
    @Setup
    public void setup(BenchmarkParams params) {
        byte[] srcArray = new byte[ELEM_SIZE];
        byte[] dstArray = new byte[ELEM_SIZE];
        heapSrcSegment = MemorySegment.ofArray(srcArray);
        heapDstSegment = MemorySegment.ofArray(dstArray);
        nativeSrcSegment = Arena.ofAuto().allocate(ELEM_SIZE);
        nativeDstSegment = Arena.ofAuto().allocate(ELEM_SIZE);
        if (polluteCopy) {
            // 15_000 iterations: presumably chosen to exceed the JIT compilation
            // thresholds so that profiles are collected - TODO confirm.
            if (params.getBenchmark().contains("not_inlined")) {
                // NOTE(review): unlike the branch below, this one performs no mixed
                // heap<->native copies - confirm the asymmetry is intentional.
                for (int i = 0; i < 15_000; i++) {
                    heap_segment_copy5Arg_not_inlined();
                    native_segment_copy5Arg_not_inlined();
                }
            } else {
                for (int i = 0; i < 15_000; i++) {
                    MemorySegment.copy(heapSrcSegment, 0, heapDstSegment, 0, ELEM_SIZE);
                    MemorySegment.copy(nativeSrcSegment, 0, nativeDstSegment, 0, ELEM_SIZE);
                    MemorySegment.copy(heapSrcSegment, 0, nativeDstSegment, 0, ELEM_SIZE);
                    MemorySegment.copy(nativeSrcSegment, 0, heapDstSegment, 0, ELEM_SIZE);
                }
            }
        }
    }

    /**
     * Heap-to-heap copy of {@code ELEM_SIZE} bytes; this method itself is
     * excluded from inlining into its caller via {@code DONT_INLINE}.
     */
    @Benchmark
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public void heap_segment_copy5Arg_not_inlined() {
        MemorySegment.copy(heapSrcSegment, 0, heapDstSegment, 0, ELEM_SIZE);
    }

    /**
     * Native-to-native copy of {@code ELEM_SIZE} bytes; this method itself is
     * excluded from inlining into its caller via {@code DONT_INLINE}.
     */
    @Benchmark
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public void native_segment_copy5Arg_not_inlined() {
        MemorySegment.copy(nativeSrcSegment, 0, nativeDstSegment, 0, ELEM_SIZE);
    }

    /** Heap-to-heap copy of {@code ELEM_SIZE} bytes, with no compiler control applied. */
    @Benchmark
    public void heap_segment_copy5Arg() {
        MemorySegment.copy(heapSrcSegment, 0, heapDstSegment, 0, ELEM_SIZE);
    }

    /** Native-to-native copy of {@code ELEM_SIZE} bytes, with no compiler control applied. */
    @Benchmark
    public void native_segment_copy5Arg() {
        MemorySegment.copy(nativeSrcSegment, 0, nativeDstSegment, 0, ELEM_SIZE);
    }

}

which is not super stable (we could disable background compilation and enable type profiling to make sure that we are doing things right by the end of `Setup` - or by the end of warmup at least)

It pollutes the type profile of the existing copy methods in different ways - but once compiled, the outcome will ultimately depend on how the types are handled.
There's no branch misprediction here; rather, the question is "what happens if we move the ops to Java and the Java code is type-poisoned/polluted?" - because that is what could happen in the real world (a mix of heap/off-heap segment types), and we want to capture what this PR gets in such a case as well.

-------------

PR Comment: https://git.openjdk.org/jdk/pull/20829#issuecomment-2326404582


More information about the core-libs-dev mailing list