RFR: 8333893: Optimization for StringBuilder append boolean & null
Shaojin Wen
duke at openjdk.org
Mon Jun 10 23:13:13 UTC 2024
On Mon, 10 Jun 2024 14:32:46 GMT, Emanuel Peter <epeter at openjdk.org> wrote:
>> # 1. Compare with unsafe branch
>>
>> 1. current (`5e815`) https://github.com/wenshao/jdk/tree/optim_str_builder_append_202406
>> 2. unsafe (`adc220`) https://github.com/wenshao/jdk/tree/optim_str_builder_append_202406_unsafe
>>
>> I think the performance of the Unsafe branch may be the best data for the C2 optimizer. @eme64 can help me see if C2 can do it?
>>
>> # 2. Benchmark Commands
>>
>> make test TEST="micro:java.lang.StringBuilders.toStringCharWithBool8"
>> make test TEST="micro:java.lang.StringBuilders.toStringCharWithNull8"
>>
>>
>> # 3. Implementation of Unsafe Branch
>>
>> class AbstractStringBuilder {
>> static final Unsafe UNSAFE = Unsafe.getUnsafe();
>>
>> static final int NULL_LATIN1;
>> static final int TRUE_LATIN1;
>> static final int FALS_LATIN1;
>>
>> static final long NULL_UTF16;
>> static final long TRUE_UTF16;
>> static final long FALS_UTF16;
>>
>> static {
>> byte[] bytes4 = new byte[4];
>> byte[] bytes8 = new byte[8];
>>
>> bytes4[0] = 'n';
>> bytes4[1] = 'u';
>> bytes4[2] = 'l';
>> bytes4[3] = 'l';
>> NULL_LATIN1 = UNSAFE.getInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET);
>> StringUTF16.inflate(bytes4, 0, bytes8, 0, 4);
>> NULL_UTF16 = UNSAFE.getLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET);
>>
>> bytes4[0] = 't';
>> bytes4[1] = 'r';
>> bytes4[2] = 'u';
>> bytes4[3] = 'e';
>> TRUE_LATIN1 = UNSAFE.getInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET);
>> StringUTF16.inflate(bytes4, 0, bytes8, 0, 4);
>> TRUE_UTF16 = UNSAFE.getLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET);
>>
>> bytes4[0] = 'f';
>> bytes4[1] = 'a';
>> bytes4[2] = 'l';
>> bytes4[3] = 's';
>> FALS_LATIN1 = UNSAFE.getInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET);
>> StringUTF16.inflate(bytes4, 0, bytes8, 0, 4);
>> FALS_UTF16 = UNSAFE.getLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET);
>> }
>>
>> private AbstractStringBuilder appendNull() {
>> ensureCapacityInternal(count + 4);
>> int count = this.count;
>> byte[] val = this.value;
>> if (isLatin1()) {
>> UNSAFE.putInt(
>> val,
>> Unsafe.ARRAY_BYTE_BASE_OFFSET + count,
>> NULL_LATIN1);
>> } else {
>> UNSAFE.putLong(
>> val,
>> Unsafe.ARRAY_BYTE_BASE_OFFSET + (count << 1),
>> NULL_UTF16);
>> ...
>
> @wenshao
>> I think the performance of the Unsafe branch may be the best data for the C2 optimizer. @eme64 can help me see if C2 can do it?
>
> Have you tried to see if the optimization actually was done/taken? You can use the `TraceMergeStores,` flag. Can you present the generated assembly code of the benchmarks, and explain the difference based on the generated assembly code? You can run JMH penchmarks with `perf`. These two blogs may help you:
>
> http://psy-lob-saw.blogspot.com/2015/07/jmh-perfasm.html
> https://shipilev.net/blog/2016/arrays-wisdom-ancients/#_meet_jmh_prof_perfasm
>
> @liach I don't think it makes a difference if it is `int` or `byte` constants. Or what exactly is the code change you are proposing?
@eme64 It seems that when the following code uses StringUTF16.putChar, C2's optimization is not as good as the manual merging and storage effect.
class AbstractStringBuilder {
private AbstractStringBuilder appendNull() {
// ...
StringUTF16.putCharsAt(val, count, 'n', 'u', 'l', 'l');
// ...
}
public AbstractStringBuilder append(boolean b) {
// ...
StringUTF16.putCharsAt(val, count, 't', 'r', 'u', 'e');
// ...
StringUTF16.putCharsAt(val, count, 'f', 'a', 'l', 's', 'e');
// ...
}
}
class StringUTF16 {
public static void putCharsAt(byte[] value, int i, char c1, char c2, char c3, char c4) {
putChar(value, i , c1);
putChar(value, i + 1, c2);
putChar(value, i + 2, c3);
putChar(value, i + 3, c4);
}
@IntrinsicCandidate
// intrinsic performs no bounds checks
static void putChar(byte[] val, int index, int c) {
assert index >= 0 && index < length(val) : "Trusted caller missed bounds check";
index <<= 1;
val[index++] = (byte)(c >> HI_BYTE_SHIFT);
val[index] = (byte)(c >> LO_BYTE_SHIFT);
}
}
The code for manually merging storage is as follows, without using Unsafe:
class AbstractStringBuilder {
static final long NULL_UTF16;
static final long TRUE_UTF16;
static final long FALS_UTF16;
static {
byte[] bytes = new byte[8];
StringUTF16.putCharsAt(bytes, 0, 'n', 'u', 'l', 'l');
NULL_UTF16 = getLong(bytes, 0);
StringUTF16.putCharsAt(bytes, 0, 't', 'r', 'u', 'e');
TRUE_UTF16 = getLong(bytes, 0);
StringUTF16.putCharsAt(bytes, 0, 'f', 'a', 'l', 's');
FALS_UTF16 = getLong(bytes, 0);
}
private static long getLong(byte[] bytes, int offset) {
return (((long)bytes[offset ] & 0xff) ) |
(((long)bytes[offset + 1] & 0xff) << 8) |
(((long)bytes[offset + 2] & 0xff) << 16) |
(((long)bytes[offset + 3] & 0xff) << 24) |
(((long)bytes[offset + 4] & 0xff) << 32) |
(((long)bytes[offset + 5] & 0xff) << 40) |
(((long)bytes[offset + 6] & 0xff) << 48) |
(((long)bytes[offset + 7] & 0xff) << 56);
}
private static void setLong(byte[] array, int offset, long value) {
array[offset] = (byte) value;
array[offset + 1] = (byte) (value >> 8);
array[offset + 2] = (byte) (value >> 16);
array[offset + 3] = (byte) (value >> 24);
array[offset + 4] = (byte) (value >> 32);
array[offset + 5] = (byte) (value >> 40);
array[offset + 6] = (byte) (value >> 48);
array[offset + 7] = (byte) (value >> 56);
}
private AbstractStringBuilder appendNull() {
int count = this.count;
ensureCapacityInternal(count + 4);
byte[] val = this.value;
if (isLatin1()) {
val[count ] = 'n';
val[count + 1] = 'u';
val[count + 2] = 'l';
val[count + 3] = 'l';
} else {
setLong(val, count, NULL_UTF16);
}
this.count = count + 4;
return this;
}
public AbstractStringBuilder append(boolean b) {
int count = this.count;
int spaceNeeded = count + (b ? 4 : 5);
ensureCapacityInternal(spaceNeeded);
byte[] val = this.value;
if (isLatin1()) {
if (b) {
val[count ] = 't';
val[count + 1] = 'r';
val[count + 2] = 'u';
val[count + 3] = 'e';
} else {
val[count ] = 'f';
val[count + 1] = 'a';
val[count + 2] = 'l';
val[count + 3] = 's';
val[count + 4] = 'e';
}
} else {
setLong(val, count, b ? TRUE_UTF16 : FALS_UTF16);
if (!b) {
StringUTF16.putChar(val, count + 4, 'e');
}
}
this.count = spaceNeeded;
return this;
}
}
The getLong/setLong methods here can be optimized and merged by C2. Maybe we need a public class that does not use Unsafe to implement these methods. They are needed in many places. Maybe it is appropriate to improve them based on ByteArray/ByteArrayLittleEndian.
-------------
PR Comment: https://git.openjdk.org/jdk/pull/19626#issuecomment-2159458425
More information about the hotspot-compiler-dev
mailing list