RFR: 8311220: Optimization for StringLatin UpperLower [v3]

Fri Sep 1 19:55:51 UTC 2023

On Thu, 31 Aug 2023 11:39:57 GMT, Claes Redestad <redestad at openjdk.org> wrote:

>> 温绍锦 has updated the pull request incrementally with one additional commit since the last revision:
>> 
>>   add method CharacterDataLatin1#isLowerCaseEx
>
> src/java.base/share/classes/java/lang/CharacterDataLatin1.java.template line 94:
> 
>> 92: 
>> 93:     boolean isLowerCaseEx(int ch) {
>> 94:         return ch >= 'a' && (ch <= 'z' || ch == 181 || (ch >= 223 && ch != 247));
> 
> What is the contract for this? Specifically, there are two special superscript code points (170 and 186) which are lower-case (`Character.isLowerCase(170) => true`) but don't have an upper-case form (`Character.toUpperCase(170) => 170`). It seems reasonable to exclude them if the method is only used for operations like toUpper/toLower (since they won't change), but it should be spelled out to avoid surprises.
> 
> For consistency I think we should use hex literals in this file, e.g. `0xDF` instead of `223`

The current implementation of the isLowerCaseEx method and the previous implementation, "cp != CharacterDataLatin1.instance.toUpperCaseEx(cp)", produce exactly the same result.

The code below compares the two for every byte value in [-128, 127], i.e. every Latin-1 code point in [0, 255]:

import sun.misc.Unsafe;

import java.lang.invoke.MethodHandle;
import java.lang.invoke.MethodHandles;
import java.lang.invoke.MethodType;
import java.lang.reflect.Field;
import java.nio.charset.StandardCharsets;

import static com.alibaba.fastjson2.util.JDKUtils.UNSAFE;

/**
 * Ad-hoc equivalence check between the arithmetic {@link #isLowerCaseEx(int)}
 * predicate and the expression {@code cp != toUpperCaseEx(cp)} evaluated
 * against the JDK-internal {@code java.lang.CharacterDataLatin1} instance.
 *
 * <p>Every code point in {@code [0, 255]} is checked. Code points that the JDK
 * maps to a different upper-case value are listed, and any disagreement
 * between the two implementations is reported with an {@code error} line.
 */
public class CharacterDataLatin1Test {
    /**
     * Handle for {@code CharacterDataLatin1.toUpperCaseEx(int)}, already bound
     * to the singleton instance. Resolved on first use so that the reflective
     * setup runs once rather than on every call. Not synchronized: this
     * program is single-threaded.
     */
    private static MethodHandle toUpperCaseExHandle;

    /**
     * Runs the comparison over all 256 byte values and prints the results.
     *
     * @param args unused
     * @throws Throwable if the reflective access to the JDK internals fails
     */
    public static void main(String[] args) throws Throwable {
        for (int i = Byte.MIN_VALUE; i <= Byte.MAX_VALUE; ++i) {
            byte b = (byte) i;
            // Reinterpret the signed byte as an unsigned Latin-1 code point.
            int cp = b & 0xff;

            boolean r0 = cp != toUpperCaseEx(cp);
            boolean r1 = isLowerCaseEx(cp);
            if (r0) {
                System.out.println(cp + "\t0x" + Integer.toHexString(cp)
                        + "\t" + new String(new byte[] {b}, StandardCharsets.ISO_8859_1));
            }

            if (r0 != r1) {
                System.out.println("error " + i);
            }
        }
    }

    /**
     * Arithmetic candidate under test: true for {@code 'a'..'z'}, for
     * {@code 0xb5}, and for {@code 0xdf..} upwards except {@code 0xf7}.
     *
     * @param ch the code point to classify
     * @return whether {@code ch} falls into one of the ranges above
     */
    static boolean isLowerCaseEx(int ch) {
        return ch >= 'a' && (ch <= 'z' || ch == 0xb5 || (ch >= 0xdf && ch != 0xf7));
    }

    /**
     * Invokes the package-private {@code CharacterDataLatin1.toUpperCaseEx(int)}
     * on the JDK's singleton instance.
     *
     * @param cp the code point to convert
     * @return whatever the JDK method returns for {@code cp}
     * @throws Throwable if the internals cannot be resolved or the call fails
     */
    static int toUpperCaseEx(int cp) throws Throwable {
        MethodHandle handle = toUpperCaseExHandle;
        if (handle == null) {
            handle = resolveToUpperCaseEx();
            toUpperCaseExHandle = handle;
        }
        // The bound handle has the exact type (int)int.
        return (int) handle.invokeExact(cp);
    }

    /**
     * Resolves {@code toUpperCaseEx} through the trusted {@code IMPL_LOOKUP}
     * and binds it to the {@code CharacterDataLatin1.instance} singleton.
     */
    private static MethodHandle resolveToUpperCaseEx() throws ReflectiveOperationException {
        Field theUnsafeField = Unsafe.class.getDeclaredField("theUnsafe");
        theUnsafeField.setAccessible(true);
        Unsafe unsafe = (Unsafe) theUnsafeField.get(null);

        Class<?> latin1Class = Class.forName("java.lang.CharacterDataLatin1");
        Object instance = readStaticField(unsafe, latin1Class.getDeclaredField("instance"));

        // Use the locally obtained Unsafe for IMPL_LOOKUP as well, so no
        // third-party helper is needed.
        MethodHandles.Lookup trustedLookup = (MethodHandles.Lookup) readStaticField(
                unsafe, MethodHandles.Lookup.class.getDeclaredField("IMPL_LOOKUP"));

        return trustedLookup
                .findVirtual(latin1Class, "toUpperCaseEx", MethodType.methodType(int.class, int.class))
                .bindTo(instance);
    }

    /** Reads a static reference field via its Unsafe base object and offset, bypassing access checks. */
    private static Object readStaticField(Unsafe unsafe, Field field) {
        return unsafe.getObject(unsafe.staticFieldBase(field), unsafe.staticFieldOffset(field));
    }
}

-------------

PR Review Comment: https://git.openjdk.org/jdk/pull/14751#discussion_r1313453795