[PATCH] Reduce Chance Of Mistakenly Early Backing Memory Cleanup
Ben Walsh
ben_walsh at uk.ibm.com
Mon Feb 19 16:37:14 UTC 2018
As requested, here are the results with modifications to the annotations
on Reference.reachabilityFence. Much more promising ...
* Benchmark 1 *
Test Code :
package org.sample;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import java.nio.ByteBuffer;
/**
 * Benchmark 1: measures a single relative one-byte {@code put} into a direct
 * {@link ByteBuffer} that is re-allocated before every invocation.
 */
public class ByteBufferBenchmark {
/** Benchmark-scoped JMH state that owns the buffer under test. */
@State(Scope.Benchmark)
public static class ByteBufferContainer {
// Buffer the benchmark method writes to; reassigned by initByteBuffer().
ByteBuffer bb;
/**
 * Allocates a new direct buffer with a capacity of one byte.
 * Runs at Level.Invocation, i.e. before each call of the benchmark method,
 * so every measured put sees a buffer whose position is 0.
 */
@Setup(Level.Invocation)
public void initByteBuffer() {
bb = ByteBuffer.allocateDirect(1);
}
/** Returns the buffer currently held by this state object. */
ByteBuffer getByteBuffer() {
return bb;
}
}
/**
 * Writes the byte value 42 at the buffer's current position.
 *
 * @param bbC state object supplying the direct buffer
 */
@Benchmark
public void benchmark_byte_buffer_put(ByteBufferContainer bbC) {
// The capacity is 1, so only one relative put fits per allocated buffer.
bbC.getByteBuffer().put((byte)42);
}
}
Results :
- Unmodified Build -
Benchmark Mode Cnt Score
Error Units
ByteBufferBenchmark.benchmark_byte_buffer_put thrpt 200 35604933.518 ±
654975.515 ops/s
- Build With Reference.reachabilityFences Added -
Benchmark Mode Cnt Score
Error Units Impact
ByteBufferBenchmark.benchmark_byte_buffer_put thrpt 200 33100911.857 ±
747461.951 ops/s -7.033%
- Build With Reference.reachabilityFences Added And DontInline Replaced
With ForceInline -
Benchmark Mode Cnt Score
Error Units Impact
ByteBufferBenchmark.benchmark_byte_buffer_put thrpt 200 34836320.294 ±
640188.408 ops/s -2.159%
- Build With Reference.reachabilityFences Added And DontInline Removed -
Benchmark Mode Cnt Score
Error Units Impact
ByteBufferBenchmark.benchmark_byte_buffer_put thrpt 200 34740015.332 ±
556578.542 ops/s -2.429%
* Benchmark 2 *
Test Code :
package org.sample;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import java.nio.ByteBuffer;
/**
 * Benchmark 2 (loop benchmark): measures {@code L} consecutive relative
 * one-byte {@code put} calls into a direct {@link ByteBuffer}.
 */
@State(Scope.Benchmark)
public class ByteBufferBenchmark {
// Number of put calls per benchmark invocation; JMH runs once per listed value.
@Param({"1", "10", "100", "1000", "10000"})
public int L;
/** Benchmark-scoped JMH state that owns the buffer under test. */
@State(Scope.Benchmark)
public static class ByteBufferContainer {
// Buffer the benchmark method writes to; reassigned by initByteBuffer().
ByteBuffer bb;
/**
 * Allocates a new 10000-byte direct buffer.
 * Runs at Level.Invocation, i.e. before each call of the benchmark method.
 * The capacity equals the largest value of L, so the loop never overflows it.
 */
@Setup(Level.Invocation)
public void initByteBuffer() {
bb = ByteBuffer.allocateDirect(10000);
}
/** Returns the buffer currently held by this state object. */
ByteBuffer getByteBuffer() {
return bb;
}
}
/**
 * Performs {@code L} relative puts, writing the loop index narrowed to a byte.
 *
 * @param bbC state object supplying the direct buffer
 * @return the buffer that was written to
 */
@Benchmark
public ByteBuffer benchmark_byte_buffer_put(ByteBufferContainer bbC) {
ByteBuffer bb = bbC.getByteBuffer();
for (int i = 0; i < L; i++) {
// (byte) cast keeps only the low 8 bits of i.
bb.put((byte)i);
}
// NOTE(review): presumably returned so that JMH consumes the result — confirm.
return bb;
}
}
Results :
- Unmodified Build -
Benchmark (L) Mode Cnt Score
Error Units
ByteBufferBenchmark.benchmark_byte_buffer_put 1 thrpt 200
29303145.752 ± 635979.750 ops/s
ByteBufferBenchmark.benchmark_byte_buffer_put 10 thrpt 200
24260859.017 ± 528891.303 ops/s
ByteBufferBenchmark.benchmark_byte_buffer_put 100 thrpt 200
8512366.637 ± 136615.070 ops/s
ByteBufferBenchmark.benchmark_byte_buffer_put 1000 thrpt 200
1323756.037 ± 21485.369 ops/s
ByteBufferBenchmark.benchmark_byte_buffer_put 10000 thrpt 200
145965.305 ± 1301.469 ops/s
- Build With Reference.reachabilityFences Added -
Benchmark (L) Mode Cnt Score
Error Units Impact
ByteBufferBenchmark.benchmark_byte_buffer_put 1 thrpt 200
28893540.122 ± 754554.747 ops/s -1.398%
ByteBufferBenchmark.benchmark_byte_buffer_put 10 thrpt 200
15317696.355 ± 231621.608 ops/s -36.863%
ByteBufferBenchmark.benchmark_byte_buffer_put 100 thrpt 200
2546599.578 ± 32136.873 ops/s -70.084%
ByteBufferBenchmark.benchmark_byte_buffer_put 1000 thrpt 200
288832.514 ± 3854.522 ops/s -78.181%
ByteBufferBenchmark.benchmark_byte_buffer_put 10000 thrpt 200 29747.386
± 214.831 ops/s -79.620%
- Build With Reference.reachabilityFences Added And DontInline Replaced
With ForceInline -
Benchmark (L) Mode Cnt Score
Error Units Impact
ByteBufferBenchmark.benchmark_byte_buffer_put 1 thrpt 200
29372326.859 ± 525988.179 ops/s +0.236%
ByteBufferBenchmark.benchmark_byte_buffer_put 10 thrpt 200
24326735.480 ± 484358.862 ops/s +0.272%
ByteBufferBenchmark.benchmark_byte_buffer_put 100 thrpt 200
8492692.912 ± 120924.878 ops/s -0.231%
ByteBufferBenchmark.benchmark_byte_buffer_put 1000 thrpt 200
1332131.417 ± 14981.587 ops/s +0.633%
ByteBufferBenchmark.benchmark_byte_buffer_put 10000 thrpt 200
144990.569 ± 1518.877 ops/s -0.668%
- Build With Reference.reachabilityFences Added And DontInline Removed -
Benchmark (L) Mode Cnt Score
Error Units Impact
ByteBufferBenchmark.benchmark_byte_buffer_put 1 thrpt 200
29842696.017 ± 462902.634 ops/s +1.841%
ByteBufferBenchmark.benchmark_byte_buffer_put 10 thrpt 200
24842729.069 ± 436174.452 ops/s +2.398%
ByteBufferBenchmark.benchmark_byte_buffer_put 100 thrpt 200
8518393.953 ± 129254.536 ops/s +0.071%
ByteBufferBenchmark.benchmark_byte_buffer_put 1000 thrpt 200
1344772.370 ± 15916.867 ops/s +1.588%
ByteBufferBenchmark.benchmark_byte_buffer_put 10000 thrpt 200
145087.256 ± 1277.491 ops/s -0.602%
* Benchmark 3 *
Test Code :
package org.sample;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import java.nio.ByteBuffer;
/**
 * Benchmark 3 (vectorization benchmark): measures {@code L} consecutive
 * relative {@code getInt} reads from a pre-filled direct {@link ByteBuffer}
 * and sums the values read.
 */
@State(Scope.Benchmark)
public class ByteBufferBenchmark {
// Number of getInt calls per benchmark invocation; JMH runs once per listed value.
@Param({"1", "10", "100", "1000", "10000"})
public int L;
/** Benchmark-scoped JMH state that owns the buffer under test. */
@State(Scope.Benchmark)
public static class ByteBufferContainer {
// Buffer the benchmark method reads from; reassigned by initByteBuffer().
ByteBuffer bb;
/**
 * Allocates a direct buffer of 4 * 10000 bytes and fills it with the
 * ints 0..9999 using relative putInt calls.
 * Runs at Level.Invocation, i.e. before each call of the benchmark method.
 */
@Setup(Level.Invocation)
public void initByteBuffer() {
bb = ByteBuffer.allocateDirect(4 * 10000);
for (int i = 0; i < 10000; i++) {
bb.putInt(i);
}
}
/** Returns the buffer currently held by this state object. */
ByteBuffer getByteBuffer() {
return bb;
}
}
/**
 * Reads the first {@code L} ints from the buffer and returns their sum.
 * Note: despite the "put" in its name, this method only reads from the buffer.
 *
 * @param bbC state object supplying the pre-filled direct buffer
 * @return the sum of the ints read
 */
@Benchmark
public int benchmark_byte_buffer_put(ByteBufferContainer bbC) {
ByteBuffer bb = bbC.getByteBuffer();
// Setup leaves the position at the end of the written data; rewind before reading.
bb.position(0);
int sum = 0;
for (int i = 0; i < L; i++) {
sum += bb.getInt();
}
return sum;
}
}
Results :
- Unmodified Build -
Benchmark (L) Mode Cnt Score
Error Units
ByteBufferBenchmark.benchmark_byte_buffer_put 1 thrpt 200
29677205.748 ± 544721.142 ops/s
ByteBufferBenchmark.benchmark_byte_buffer_put 10 thrpt 200
18219951.454 ± 320724.793 ops/s
ByteBufferBenchmark.benchmark_byte_buffer_put 100 thrpt 200
7767650.826 ± 121798.910 ops/s
ByteBufferBenchmark.benchmark_byte_buffer_put 1000 thrpt 200
1646075.010 ± 9804.499 ops/s
ByteBufferBenchmark.benchmark_byte_buffer_put 10000 thrpt 200
183489.418 ± 1355.967 ops/s
- Build With Reference.reachabilityFences Added -
Benchmark (L) Mode Cnt Score
Error Units Impact
ByteBufferBenchmark.benchmark_byte_buffer_put 1 thrpt 200
15230086.695 ± 390174.190 ops/s -48.681%
ByteBufferBenchmark.benchmark_byte_buffer_put 10 thrpt 200
8126310.728 ± 123661.342 ops/s -55.399%
ByteBufferBenchmark.benchmark_byte_buffer_put 100 thrpt 200
1582699.233 ± 7278.744 ops/s -79.624%
ByteBufferBenchmark.benchmark_byte_buffer_put 1000 thrpt 200
179726.465 ± 802.333 ops/s -89.082%
ByteBufferBenchmark.benchmark_byte_buffer_put 10000 thrpt 200 18327.049
± 9.506 ops/s -90.012%
- Build With Reference.reachabilityFences Added And DontInline Replaced
With ForceInline -
Benchmark (L) Mode Cnt Score
Error Units Impact
ByteBufferBenchmark.benchmark_byte_buffer_put 1 thrpt 200
29839190.147 ± 576585.796 ops/s +0.546%
ByteBufferBenchmark.benchmark_byte_buffer_put 10 thrpt 200
18397768.759 ± 338144.327 ops/s +0.976%
ByteBufferBenchmark.benchmark_byte_buffer_put 100 thrpt 200
7746079.875 ± 101621.105 ops/s -0.278%
ByteBufferBenchmark.benchmark_byte_buffer_put 1000 thrpt 200
1629413.444 ± 24163.399 ops/s -1.012%
ByteBufferBenchmark.benchmark_byte_buffer_put 10000 thrpt 200
182250.811 ± 2028.461 ops/s -0.675%
- Build With Reference.reachabilityFences Added And DontInline Removed -
Benchmark (L) Mode Cnt Score
Error Units Impact
ByteBufferBenchmark.benchmark_byte_buffer_put 1 thrpt 200
29442980.464 ± 556324.877 ops/s -0.789%
ByteBufferBenchmark.benchmark_byte_buffer_put 10 thrpt 200
18401757.539 ± 419383.901 ops/s +0.998%
ByteBufferBenchmark.benchmark_byte_buffer_put 100 thrpt 200
7816766.062 ± 100144.611 ops/s +0.632%
ByteBufferBenchmark.benchmark_byte_buffer_put 1000 thrpt 200
1636811.564 ± 13811.447 ops/s -0.563%
ByteBufferBenchmark.benchmark_byte_buffer_put 10000 thrpt 200
183463.292 ± 2056.016 ops/s -0.014%
Regards,
Ben
From: Paul Sandoz <paul.sandoz at oracle.com>
To: Ben Walsh <ben_walsh at uk.ibm.com>
Cc: core-libs-dev <core-libs-dev at openjdk.java.net>
Date: 08/02/2018 16:54
Subject: Re: [PATCH] Reduce Chance Of Mistakenly Early Backing
Memory Cleanup
Hi Ben,
Thanks. I anticipated a performance hit, but not necessarily a 10x one. Without
looking at the generated code of the benchmark method it is hard to be
sure [*], but I believe the fence is interfering with loop unrolling
and/or vectorization. The comparative differences between byte and int may
be related to vectorization (for byte there may be less or limited support
for vectorization).
How about we now try another experiment: comment out the @DontInline on
the fence method and re-run the benchmarks. From Peter’s observations and
Vladimir’s analysis we should be able to remove that, or even, contrary to
what we initially expected when adding this feature, change it to @ForceInline!
Thanks,
Paul.
[*] If you are running on Linux you can use the excellent JMH perfasm
feature to dump the hot parts of HotSpot's generated code.
> On Feb 8, 2018, at 8:22 AM, Ben Walsh <ben_walsh at uk.ibm.com> wrote:
>
> Hi Paul,
>
> Following up with the requested loop and vectorization benchmarks ...
>
>
> (Do the vectorization benchmark results imply that the Hotspot compiler
> has been unable to perform the vectorization optimisation due to the
> presence of the reachabilityFence ?)
>
>
>
-----------------------------------------------------------------------------------------------------------------------
>
>
> Loop Benchmarking
> ---- ------------
>
> package org.sample;
>
> import org.openjdk.jmh.annotations.Benchmark;
> import org.openjdk.jmh.annotations.Level;
> import org.openjdk.jmh.annotations.Param;
> import org.openjdk.jmh.annotations.Scope;
> import org.openjdk.jmh.annotations.Setup;
> import org.openjdk.jmh.annotations.State;
>
> import java.nio.ByteBuffer;
>
> @State(Scope.Benchmark)
> public class ByteBufferBenchmark {
>
> @Param({"1", "10", "100", "1000", "10000"})
> public int L;
>
> @State(Scope.Benchmark)
> public static class ByteBufferContainer {
>
> ByteBuffer bb;
>
> @Setup(Level.Invocation)
> public void initByteBuffer() {
> bb = ByteBuffer.allocateDirect(10000);
> }
>
> ByteBuffer getByteBuffer() {
> return bb;
> }
> }
>
> @Benchmark
> public ByteBuffer benchmark_byte_buffer_put(ByteBufferContainer bbC)
{
>
> ByteBuffer bb = bbC.getByteBuffer();
>
> for (int i = 0; i < L; i++) {
> bb.put((byte)i);
> }
>
> return bb;
> }
>
> }
>
>
> Without Changes
>
> Benchmark (L) Mode Cnt Score
> Error Units
> ByteBufferBenchmark.benchmark_byte_buffer_put 1 thrpt 200
> 29303145.752 ± 635979.750 ops/s
> ByteBufferBenchmark.benchmark_byte_buffer_put 10 thrpt 200
> 24260859.017 ± 528891.303 ops/s
> ByteBufferBenchmark.benchmark_byte_buffer_put 100 thrpt 200
> 8512366.637 ± 136615.070 ops/s
> ByteBufferBenchmark.benchmark_byte_buffer_put 1000 thrpt 200
> 1323756.037 ± 21485.369 ops/s
> ByteBufferBenchmark.benchmark_byte_buffer_put 10000 thrpt 200
> 145965.305 ± 1301.469 ops/s
>
>
> With Changes
>
> Benchmark (L) Mode Cnt Score
> Error Units Impact
> ByteBufferBenchmark.benchmark_byte_buffer_put 1 thrpt 200
> 28893540.122 ± 754554.747 ops/s -1.398%
> ByteBufferBenchmark.benchmark_byte_buffer_put 10 thrpt 200
> 15317696.355 ± 231621.608 ops/s -36.863%
> ByteBufferBenchmark.benchmark_byte_buffer_put 100 thrpt 200
> 2546599.578 ± 32136.873 ops/s -70.084%
> ByteBufferBenchmark.benchmark_byte_buffer_put 1000 thrpt 200
> 288832.514 ± 3854.522 ops/s -78.181%
> ByteBufferBenchmark.benchmark_byte_buffer_put 10000 thrpt 200
29747.386
> ± 214.831 ops/s -79.620%
>
>
>
-----------------------------------------------------------------------------------------------------------------------
>
>
> Vectorization Benchmarking
> ------------- ------------
>
> package org.sample;
>
> import org.openjdk.jmh.annotations.Benchmark;
> import org.openjdk.jmh.annotations.Level;
> import org.openjdk.jmh.annotations.Param;
> import org.openjdk.jmh.annotations.Scope;
> import org.openjdk.jmh.annotations.Setup;
> import org.openjdk.jmh.annotations.State;
>
> import java.nio.ByteBuffer;
>
> @State(Scope.Benchmark)
> public class ByteBufferBenchmark {
>
> @Param({"1", "10", "100", "1000", "10000"})
> public int L;
>
> @State(Scope.Benchmark)
> public static class ByteBufferContainer {
>
> ByteBuffer bb;
>
> @Setup(Level.Invocation)
> public void initByteBuffer() {
> bb = ByteBuffer.allocateDirect(4 * 10000);
>
> for (int i = 0; i < 10000; i++) {
> bb.putInt(i);
> }
> }
>
> ByteBuffer getByteBuffer() {
> return bb;
> }
>
> }
>
> @Benchmark
> public int benchmark_byte_buffer_put(ByteBufferContainer bbC) {
>
> ByteBuffer bb = bbC.getByteBuffer();
>
> bb.position(0);
>
> int sum = 0;
>
> for (int i = 0; i < L; i++) {
> sum += bb.getInt();
> }
>
> return sum;
>
> }
>
> }
>
>
> Without Changes
>
> Benchmark (L) Mode Cnt Score
> Error Units
> ByteBufferBenchmark.benchmark_byte_buffer_put 1 thrpt 200
> 29677205.748 ± 544721.142 ops/s
> ByteBufferBenchmark.benchmark_byte_buffer_put 10 thrpt 200
> 18219951.454 ± 320724.793 ops/s
> ByteBufferBenchmark.benchmark_byte_buffer_put 100 thrpt 200
> 7767650.826 ± 121798.910 ops/s
> ByteBufferBenchmark.benchmark_byte_buffer_put 1000 thrpt 200
> 1646075.010 ± 9804.499 ops/s
> ByteBufferBenchmark.benchmark_byte_buffer_put 10000 thrpt 200
> 183489.418 ± 1355.967 ops/s
>
>
> With Changes
>
> Benchmark (L) Mode Cnt Score
> Error Units Impact
> ByteBufferBenchmark.benchmark_byte_buffer_put 1 thrpt 200
> 15230086.695 ± 390174.190 ops/s -48.681%
> ByteBufferBenchmark.benchmark_byte_buffer_put 10 thrpt 200
> 8126310.728 ± 123661.342 ops/s -55.399%
> ByteBufferBenchmark.benchmark_byte_buffer_put 100 thrpt 200
> 1582699.233 ± 7278.744 ops/s -79.624%
> ByteBufferBenchmark.benchmark_byte_buffer_put 1000 thrpt 200
> 179726.465 ± 802.333 ops/s -89.082%
> ByteBufferBenchmark.benchmark_byte_buffer_put 10000 thrpt 200
18327.049
> ± 9.506 ops/s -90.012%
>
>
>
> NB : For reference - for this and previous benchmarking results ...
>
> "Without Changes" and "With Changes" - java -version ...
>
> openjdk version "10-internal" 2018-03-20
> OpenJDK Runtime Environment (build 10-internal+0-adhoc.walshbp.jdk)
> OpenJDK 64-Bit Server VM (build 10-internal+0-adhoc.walshbp.jdk, mixed
> mode)
>
>
>
-----------------------------------------------------------------------------------------------------------------------
>
>
> Regards,
> Ben Walsh
>
Unless stated otherwise above:
IBM United Kingdom Limited - Registered in England and Wales with number
741598.
Registered office: PO Box 41, North Harbour, Portsmouth, Hampshire PO6 3AU
More information about the core-libs-dev
mailing list