RFR: 8333893: Optimization for StringBuilder append boolean & null

Mon Jun 10 23:13:13 UTC 2024

On Mon, 10 Jun 2024 14:32:46 GMT, Emanuel Peter <epeter at openjdk.org> wrote:

>> # 1. Compare with unsafe branch
>> 
>> 1. current (`5e815`) https://github.com/wenshao/jdk/tree/optim_str_builder_append_202406
>> 2. unsafe (`adc220`) https://github.com/wenshao/jdk/tree/optim_str_builder_append_202406_unsafe
>> 
>> I think the performance of the Unsafe branch may be the best data for the C2 optimizer. @eme64 can help me see if C2 can do it?
>> 
>> # 2. Benchmark Commands
>> 
>> make test TEST="micro:java.lang.StringBuilders.toStringCharWithBool8"
>> make test TEST="micro:java.lang.StringBuilders.toStringCharWithNull8"
>> 
>> 
>> # 3. Implementation of Unsafe Branch
>> 
>> class AbstractStringBuilder {
>>     static final Unsafe UNSAFE = Unsafe.getUnsafe();
>> 
>>     static final int NULL_LATIN1;
>>     static final int TRUE_LATIN1;
>>     static final int FALS_LATIN1;
>> 
>>     static final long NULL_UTF16;
>>     static final long TRUE_UTF16;
>>     static final long FALS_UTF16;
>> 
>>     static {
>>         byte[] bytes4 = new byte[4];
>>         byte[] bytes8 = new byte[8];
>> 
>>         bytes4[0] = 'n';
>>         bytes4[1] = 'u';
>>         bytes4[2] = 'l';
>>         bytes4[3] = 'l';
>>         NULL_LATIN1 = UNSAFE.getInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET);
>>         StringUTF16.inflate(bytes4, 0, bytes8, 0, 4);
>>         NULL_UTF16 = UNSAFE.getLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET);
>> 
>>         bytes4[0] = 't';
>>         bytes4[1] = 'r';
>>         bytes4[2] = 'u';
>>         bytes4[3] = 'e';
>>         TRUE_LATIN1 = UNSAFE.getInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET);
>>         StringUTF16.inflate(bytes4, 0, bytes8, 0, 4);
>>         TRUE_UTF16 = UNSAFE.getLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET);
>> 
>>         bytes4[0] = 'f';
>>         bytes4[1] = 'a';
>>         bytes4[2] = 'l';
>>         bytes4[3] = 's';
>>         FALS_LATIN1 = UNSAFE.getInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET);
>>         StringUTF16.inflate(bytes4, 0, bytes8, 0, 4);
>>         FALS_UTF16 = UNSAFE.getLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET);
>>     }
>> 
>>     private AbstractStringBuilder appendNull() {
>>         ensureCapacityInternal(count + 4);
>>         int count = this.count;
>>         byte[] val = this.value;
>>         if (isLatin1()) {
>>             UNSAFE.putInt(
>>                     val,
>>                     Unsafe.ARRAY_BYTE_BASE_OFFSET + count,
>>                     NULL_LATIN1);
>>         } else {
>>             UNSAFE.putLong(
>>                     val,
>>                     Unsafe.ARRAY_BYTE_BASE_OFFSET + (count << 1),
>>                     NULL_UTF16);
>>      ...
>
> @wenshao 
>> I think the performance of the Unsafe branch may be the best data for the C2 optimizer. @eme64 can help me see if C2 can do it?
> 
> Have you tried to see if the optimization actually was done/taken? You can use the `TraceMergeStores,` flag. Can you present the generated assembly code of the benchmarks, and explain the difference based on the generated assembly code? You can run JMH penchmarks with `perf`. These two blogs may help you:
> 
> http://psy-lob-saw.blogspot.com/2015/07/jmh-perfasm.html
> https://shipilev.net/blog/2016/arrays-wisdom-ancients/#_meet_jmh_prof_perfasm
> 
> @liach I don't think it makes a difference if it is `int` or `byte` constants. Or what exactly is the code change you are proposing?

@eme64 It seems that when the following code uses StringUTF16.putChar, C2's optimization is not as good as the manual merging and storage effect.

class AbstractStringBuilder {
    private AbstractStringBuilder appendNull() {
    	// ...
		StringUTF16.putCharsAt(val, count, 'n', 'u', 'l', 'l');
		// ...
	}

	public AbstractStringBuilder append(boolean b) {
	    // ...
		StringUTF16.putCharsAt(val, count, 't', 'r', 'u', 'e');
		// ...
		StringUTF16.putCharsAt(val, count, 'f', 'a', 'l', 's', 'e');
		// ...
	}
}

class StringUTF16 {
   public static void putCharsAt(byte[] value, int i, char c1, char c2, char c3, char c4) {
        putChar(value, i    , c1);
        putChar(value, i + 1, c2);
        putChar(value, i + 2, c3);
        putChar(value, i + 3, c4);
    }

    @IntrinsicCandidate
    // intrinsic performs no bounds checks
    static void putChar(byte[] val, int index, int c) {
        assert index >= 0 && index < length(val) : "Trusted caller missed bounds check";
        index <<= 1;
        val[index++] = (byte)(c >> HI_BYTE_SHIFT);
        val[index]   = (byte)(c >> LO_BYTE_SHIFT);
    }
}

The code for manually merging storage is as follows, without using Unsafe:

class AbstractStringBuilder {
    static final long NULL_UTF16;
    static final long TRUE_UTF16;
    static final long FALS_UTF16;

    static {
        byte[] bytes = new byte[8];

        StringUTF16.putCharsAt(bytes, 0, 'n', 'u', 'l', 'l');
        NULL_UTF16 = getLong(bytes, 0);

        StringUTF16.putCharsAt(bytes, 0, 't', 'r', 'u', 'e');
        TRUE_UTF16 = getLong(bytes, 0);

        StringUTF16.putCharsAt(bytes, 0, 'f', 'a', 'l', 's');
        FALS_UTF16 = getLong(bytes, 0);
    }

    private static long getLong(byte[] bytes, int offset) {
        return (((long)bytes[offset    ] & 0xff)      ) |
               (((long)bytes[offset + 1] & 0xff) <<  8) |
               (((long)bytes[offset + 2] & 0xff) << 16) |
               (((long)bytes[offset + 3] & 0xff) << 24) |
               (((long)bytes[offset + 4] & 0xff) << 32) |
               (((long)bytes[offset + 5] & 0xff) << 40) |
               (((long)bytes[offset + 6] & 0xff) << 48) |
               (((long)bytes[offset + 7] & 0xff) << 56);
    }

    private static void setLong(byte[] array, int offset, long value) {
        array[offset]     = (byte)  value;
        array[offset + 1] = (byte) (value >> 8);
        array[offset + 2] = (byte) (value >> 16);
        array[offset + 3] = (byte) (value >> 24);
        array[offset + 4] = (byte) (value >> 32);
        array[offset + 5] = (byte) (value >> 40);
        array[offset + 6] = (byte) (value >> 48);
        array[offset + 7] = (byte) (value >> 56);
    }

    private AbstractStringBuilder appendNull() {
        int count = this.count;
        ensureCapacityInternal(count + 4);
        byte[] val = this.value;
        if (isLatin1()) {
            val[count    ] = 'n';
            val[count + 1] = 'u';
            val[count + 2] = 'l';
            val[count + 3] = 'l';
        } else {
            setLong(val, count, NULL_UTF16);
        }
        this.count = count + 4;
        return this;
    }

    public AbstractStringBuilder append(boolean b) {
        int count = this.count;
        int spaceNeeded = count + (b ? 4 : 5);
        ensureCapacityInternal(spaceNeeded);
        byte[] val = this.value;
        if (isLatin1()) {
            if (b) {
                val[count    ] = 't';
                val[count + 1] = 'r';
                val[count + 2] = 'u';
                val[count + 3] = 'e';
            } else {
                val[count    ] = 'f';
                val[count + 1] = 'a';
                val[count + 2] = 'l';
                val[count + 3] = 's';
                val[count + 4] = 'e';
            }
        } else {
            setLong(val, count, b ? TRUE_UTF16 : FALS_UTF16);
            if (!b) {
                StringUTF16.putChar(val, count + 4, 'e');
            }
        }
        this.count = spaceNeeded;
        return this;
    }
}

The getLong/setLong methods here can be optimized and merged by C2. Maybe we need a public class that does not use Unsafe to implement these methods. They are needed in many places. Maybe it is appropriate to improve them based on ByteArray/ByteArrayLittleEndian.

-------------

PR Comment: https://git.openjdk.org/jdk/pull/19626#issuecomment-2159458425