hg: jdk7/tl/jdk: 6642323: Speeding up Single Byte Decoders; ...

Ulf Zibis Ulf.Zibis at gmx.de
Sat Jan 17 18:01:33 PST 2009


Am 17.01.2009 23:48, Christian Thalinger schrieb:
>
> I tried that one and it's slightly slower with my new version (unsigned
> byte loads).  Btw. I ran the benchmarks on an Intel Core 2 Duo.
>
> -- Christian
>
>   
Hi Christian,

I have an Intel Pentium M, 2 GHz, FSB 533 MHz (notebook).
I found that the CPU clocks down after some time because of 
overheating, so comparing absolute times became meaningless.

Will you please check your HotSpot optimizations with this code:

/**
 * Micro-benchmark comparing two ways of turning a signed {@code byte} into an
 * index of a 256-entry lookup table: masking ({@code a & 0xFF}) and offsetting
 * ({@code a + 0x80}). Each form is measured twice, once through a small helper
 * method and once written directly inside the copy loop.
 *
 * <p>For the same byte the two forms address different table slots, so the
 * output arrays differ between variants; only the lookup cost is compared.
 *
 * @author Ulf Zibis <Ulf.Zibis at CoSoCo.de>
 */
public class DecoderBenchmark {

    public static void main(String[] args) {
        foo();
    }

    /** Lookup table indexed by the converted byte value. */
    static final char[] map = new char[256];
    /** Input buffer; be aware of exceeding the CPU cache when resizing. */
    static final byte[] src = new byte[16384];
    /** Output buffer; be aware of exceeding the CPU cache when resizing. */
    static final char[] dst = new char[16384];

    /**
     * Runs the benchmark: fills the arrays, performs two timed warm-up passes,
     * then measures the four variants interleaved and prints the totals in
     * milliseconds.
     */
    static void foo() {
        // Fill the arrays with non-trivial values to force real memory loads
        // and keep HotSpot from reducing the lookup to a plain increment
        // (maybe a candidate for a sophisticated HotSpot optimization ;-) ).
        for (int k = 0; k < map.length; k++) {
            map[k] = (char) (59 * (227 - k));
        }
        for (int k = 0; k < src.length; k++) {
            src[k] = (byte) (13 * (17 - k));
        }

        // warm up:
        long mark = System.nanoTime();
        warmUp();
        long now = System.nanoTime();
        System.out.println("time for 1st warm up: " + (now - mark) / 1000000 + " ms");
        mark = now;

        warmUp();
        now = System.nanoTime();
        System.out.println("time for 2nd warm up: " + (now - mark) / 1000000 + " ms");
        mark = now;

        long maskedCall = 0;
        long offsetCall = 0;
        long maskedInline = 0;
        long offsetInline = 0;
        // Interleave the variants in short slices so that other processes and
        // CPU clock-down caused by overheating affect all four about equally.
        for (int round = 0; round < 100; round++) {
            for (int rep = 0; rep < 1000; rep++) {
                bar1(src, dst);
            }
            now = System.nanoTime();
            maskedCall += now - mark;
            mark = now;

            for (int rep = 0; rep < 1000; rep++) {
                bar2(src, dst);
            }
            now = System.nanoTime();
            offsetCall += now - mark;
            mark = now;

            for (int rep = 0; rep < 1000; rep++) {
                bar3(src, dst);
            }
            now = System.nanoTime();
            maskedInline += now - mark;
            mark = now;

            for (int rep = 0; rep < 1000; rep++) {
                bar4(src, dst);
            }
            now = System.nanoTime();
            offsetInline += now - mark;
            mark = now;
        }
        System.out.println("time for map[a & 0xFF]: " + maskedCall / 1000000 + " ms");
        System.out.println("time for map[a + 0x80]: " + offsetCall / 1000000 + " ms");
        System.out.println("time for inlined map[a & 0xFF]: " + maskedInline / 1000000 + " ms");
        System.out.println("time for inlined map[a + 0x80]: " + offsetInline / 1000000 + " ms");
    }

    /** One warm-up pass: 100 rounds of 100 calls to each variant, in order. */
    private static void warmUp() {
        for (int round = 0; round < 100; round++) {
            for (int rep = 0; rep < 100; rep++) {
                bar1(src, dst);
            }
            for (int rep = 0; rep < 100; rep++) {
                bar2(src, dst);
            }
            for (int rep = 0; rep < 100; rep++) {
                bar3(src, dst);
            }
            for (int rep = 0; rep < 100; rep++) {
                bar4(src, dst);
            }
        }
    }

    // The four kernels below are the code under measurement. The difference
    // between calling decodeN and writing the lookup in the loop is deliberate.

    /** Copies {@code src} to {@code dst} through {@link #decode1(byte)}. */
    static void bar1(byte[] src, char[] dst) {
        for (int pos = 0; pos < src.length; pos++) {
            dst[pos] = decode1(src[pos]);
        }
    }

    /** Copies {@code src} to {@code dst} through {@link #decode2(byte)}. */
    static void bar2(byte[] src, char[] dst) {
        for (int pos = 0; pos < src.length; pos++) {
            dst[pos] = decode2(src[pos]);
        }
    }

    /** Copies {@code src} to {@code dst} with the masking lookup written inline. */
    static void bar3(byte[] src, char[] dst) {
        for (int pos = 0; pos < src.length; pos++) {
            dst[pos] = map[src[pos] & 0xFF];
        }
    }

    /** Copies {@code src} to {@code dst} with the offset lookup written inline. */
    static void bar4(byte[] src, char[] dst) {
        for (int pos = 0; pos < src.length; pos++) {
            dst[pos] = map[src[pos] + 0x80];
        }
    }

    /**
     * Looks up {@code a} by masking it to its unsigned value (0..255).
     *
     * @param a the byte to look up
     * @return the table entry at index {@code a & 0xFF}
     */
    public static char decode1(byte a) {
        return map[a & 0xFF];
    }

    /**
     * Looks up {@code a} by shifting the signed range -128..127 to 0..255.
     *
     * @param a the byte to look up
     * @return the table entry at index {@code a + 0x80}
     */
    public static char decode2(byte a) {
        return map[a + 0x80];
    }

}


My results with JDK 1.6.0_03:

time for 1st warm up: 3136 ms
time for 2nd warm up: 3098 ms
time for map[a & 0xFF]: 7909 ms
time for map[a + 0x80]: 8081 ms
time for inlined map[a & 0xFF]: 8841 ms
time for inlined map[a + 0x80]: 9150 ms


My results with JDK 1.6.0_11:

time for 1st warm up: 3152 ms
time for 2nd warm up: 3117 ms
time for map[a & 0xFF]: 7147 ms
time for map[a + 0x80]: 7124 ms
time for inlined map[a & 0xFF]: 7969 ms
time for inlined map[a + 0x80]: 7990 ms


My results with JDK 1.7.0 ea b43:

time for 1st warm up: 3091 ms
time for 2nd warm up: 3100 ms
time for map[a & 0xFF]: 7207 ms
time for map[a + 0x80]: 7225 ms
time for inlined map[a & 0xFF]: 8100 ms
time for inlined map[a + 0x80]: 8084 ms


- Ulf










More information about the hotspot-dev mailing list