slow performance of loom continuations

Tue Sep 18 05:17:15 UTC 2018

Hi Seth,

Thank you for comparing continuation performance between kilim and loom.
After I spent some time to study the continuation(called Task?) 
implementation in kilim, I'm interested in another 2 performance data 
for kilim/loom continuation.
1. when continuation stack depth grows, what's the performance?
2. when continuation number grows, what's the performance?

I refactored your code a little to get above performance data for loom. 
(code is attached at the bottom)

following is data with continuation stack depth N changing from 1 to 
10,000 and only 1 continuation in total:
(command to run: java -XX:+UnlockDiagnosticVMOptions -XX:+UseNewCode 
-XX:+UseNewCode2 -XX:+UseNewCode3 Xorshift 200000 10 N)
continuation stack depth: 1,     perf: 1610.26 nanos/op
continuation stack depth: 10,    perf: 1607.82 nanos/op
continuation stack depth: 100,   perf: 1566.41 nanos/op
continuation stack depth: *500*,   perf: *2179.94* nanos/op
continuation stack depth: *1000*,  perf: *2145.76* nanos/op
continuation stack depth: *3000*,  perf: *2105.57* nanos/op
continuation stack depth: 5000,  perf: 1517.82 nanos/op
continuation stack depth: 10000, perf: 1673.76 nanos/op

following is data with continuation number N changing from 1 to 100,000 
and continuation stack depth only 1 for every continuation:
(command to run: java -XX:+UnlockDiagnosticVMOptions -XX:+UseNewCode 
-XX:+UseNewCode2 -XX:+UseNewCode3 Xorshift 200000 10 1 N)
continuation number: 1,      perf: 1514.10 nanos/op
continuation number: 10,     perf: 1936.75 nanos/op
continuation number: 30,     perf: 2455.95 nanos/op
continuation number: *50*,     perf: *3214.05* nanos/op
continuation number: *100*,    perf: *4241.50* nanos/op
continuation number: *110*,    perf: *3969.00* nanos/op
continuation number: *120*,    perf: *4822.41* nanos/op
continuation number: *125*,    perf: *5080.64* nanos/op
continuation number: *128*,    perf: *7963.00* nanos/op
continuation number: 130,    perf: 1907.96 nanos/op
continuation number: 500,    perf: 1854.22 nanos/op
continuation number: 1000,   perf: 1844.98 nanos/op
continuation number: 10000,  perf: 2064.03 nanos/op
continuation number: 100000, perf: 2156.77 nanos/op

Above data for loom shows that although base is quite high(more than 
1000 nanos/op with 1 continuation and stack depth 1), when stack depth 
or continuation number grows, time/op grows very very slowly and it even 
reduces when continuation number and stack depth keep growing(I'm not 
sure what's the cause).

I'm interested in the data for kilim, I tried to get it myself, but I 
failed to build kilim myself, would you mind to help to get the similar 
data for kilim?
With this data, we can compare kilim with loom more comprehensively, and 
it might help to find out the performance bottleneck and improve the 
loom's continuation performance in the future too.

Thank you
-Hamlin

==================================================================
public class Xorshift {
     static final ContinuationScope SCOPE = new ContinuationScope() {};
     Continuation ctus[];
     long result;
     int depth;
     boolean debug;
     int ctuNum;
     double sum;
     long times;

     void average() {
         System.out.format("==== loom average : %-10.2f nanos/op%n", 
sum/times);
     }

     Xorshift(int depth, int ctuNum, boolean debug) {
         this.depth = depth;
         this.ctuNum = ctuNum;
         this.debug = debug;
         ctus = new Continuation[ctuNum];
         for (int i = 0; i < ctuNum; i++) {
             ctus[i] = new Continuation(SCOPE, this::recursiveCall);
         }
     }

     void warmup(long num) {
         long warmup = 5000000000L;
         final long start = System.nanoTime();
         long dummy = 0;
         while (System.nanoTime() - start < warmup)
             dummy = dummy ^ loop(num);
         System.out.println("warmup: " + dummy);
     }
     void cycle(long num) {
         final long start = System.nanoTime();
         long val = loop(num);
         long duration = System.nanoTime() - start;
         double tmp = 1.0*duration/num;
         sum += tmp;
         times++;
         System.out.format("loom : %-10.2f nanos/op, %30d\n", tmp, val);
     }
     public long loop(long num) {
         long val = 0;
         for (int ii=0; ii < num; ii++) {
             ctus[ii%ctuNum].run();
             val = val ^ result;
         }
         return val;
     }

     void recursiveCall() {
         //System.out.println("initial d: " + depth);
         recursiveCall(depth);
     }
     void recursiveCall(int d) {
         //System.out.println("d: " + d);
         while(d > 0) {
             recursiveCall(--d);
         }
         execute();
     }
     public void execute() {
         if (debug) {
             new Throwable().printStackTrace();
         }
         long x, y, s0=103, s1=17;
         while (true) {
             x = s0;
             y = s1;
             s0 = y;
             x ^= (x << 23);
             s1 = x ^ y ^ (x >> 17) ^ (y >> 26);
             result = (s1 + y);
             Continuation.yield(SCOPE);
         }
     }

     public static void main(String[] args) {

         long cycles = 200000;
         int reps = 10;
         int depth = 0;
         boolean debug = false;
         int ctuNum = 1;

         try { cycles = Long.parseLong(args[0]); } catch (Exception ex) {}
         try { reps = Integer.parseInt(args[1]); } catch (Exception ex) {}
         try { depth = Integer.parseInt(args[2]); } catch (Exception ex) {}
         try { ctuNum = Integer.parseInt(args[3]); } catch (Exception ex) {}
         try { debug = Boolean.parseBoolean(args[4]); } catch (Exception 
ex) {}

         if (args.length == 0) {
             System.out.println("args: number of cycles, number of 
repeats, depth of recursive calls, continuation number, debug");
             System.out.format("\t no args provided using defaults: %d 
%d %d %d %b\n",cycles, reps, depth, ctuNum, debug);
         } else {
             System.out.format("\t cycle: %d, reps: %d, depth: %d, 
ctuNum: %d, debug: %b\n",cycles, reps, depth, ctuNum, debug);
         }

         new Xorshift(depth, ctuNum, debug).warmup(cycles);
         Xorshift xor = new Xorshift(depth, ctuNum, debug);

         for (int jj=0; jj < reps; jj++) {
             //Xorshift xor = new Xorshift(depth, ctuNum, debug);
             xor.cycle(cycles);
         }
         xor.average();
     }
}

On 2018/9/7 12:47 PM, seth lytle wrote:
> i ported a Xorshift implementation from kilim to project loom using the
> prototype that was announced in august. the Continuation apis are similar -
> the only changes are ctu.run() instead of run()
> and Continuation.yield(SCOPE) instead of kilim.Fiber.yield(), and the code
> runs and produces the same answers. however, performance with loom is on
> the order of 500x slower. i tried both f46dc5c01b7d (2% faster) and
> 544bfe4ccd84
>
> kilim: 32.30      nanos/op,           -4971871656801550503
> kilim: 36.99      nanos/op,            2357119851256129241
> kilim: 32.60      nanos/op,            8340372355868382387
>
> loom : 18303.18   nanos/op,           -4971871656801550503
> loom : 18107.61   nanos/op,            2357119851256129241
> loom : 18184.15   nanos/op,            8340372355868382387
>
>
> with -XX:+UnlockExperimentalVMOptions -XX:+UseNewCode
>
> loom : 2072.82    nanos/op,           -4971871656801550503
> loom : 1879.37    nanos/op,            2357119851256129241
> loom : 1841.58    nanos/op,            8340372355868382387
>
>
> am i using the api correctly ? i do use exceptions in other continuations,
> so UseNewCode isn't really an option for me
>
> i realize that this is an early prototype. do you have any idea what
> performance you're ultimately shooting for for this sort of loop ?
>
> is there a tradeoff between performance in these simple cases and being
> able to weave "most" production code ?
>
>
>
> here's the full example:
>
>
> public class Xorshift {
>      static final ContinuationScope SCOPE = new ContinuationScope() {};
>      Continuation ctu = new Continuation(SCOPE,this::execute);
>      long result;
>
>      void warmup(long num) {
>          long warmup = 5000000000L;
>          final long start = System.nanoTime();
>          long dummy = 0;
>          while (System.nanoTime() - start < warmup)
>              dummy = dummy ^ loop(num);
>          System.out.println("warmup: " + dummy);
>      }
>      void cycle(long num) {
>          final long start = System.nanoTime();
>          long val = loop(num);
>          long duration = System.nanoTime() - start;
>          System.out.format("loom : %-10.2f nanos/op, %30d\n",
> 1.0*duration/num, val);
>      }
>      public long loop(long num) {
>          long val = 0;
>          for (int ii=0; ii < num; ii++) {
>              ctu.run();
>              val = val ^ result;
>          }
>          return val;
>      }
>      public void execute() {
>          long x, y, s0=103, s1=17;
>          while (true) {
>              x = s0;
>              y = s1;
>              s0 = y;
>              x ^= (x << 23);
>              s1 = x ^ y ^ (x >> 17) ^ (y >> 26);
>              result = (s1 + y);
>              Continuation.yield(SCOPE);
>          }
>      }
>
>      public static void main(String[] args) {
>
>          long cycles = 200000;
>          int reps = 10;
>          if (args.length == 0) {
>              System.out.println("args: number of cycles, number of repeats");
>              System.out.format("\t no args provided using defaults: %d
> %d\n",cycles,reps);
>          }
>          try { cycles = Long.parseLong(args[0]); } catch (Exception ex) {}
>          try { reps = Integer.parseInt(args[1]); } catch (Exception ex) {}
>
>          new Xorshift().warmup(cycles);
>          Xorshift xor = new Xorshift();
>
>          for (int jj=0; jj < reps; jj++)
>              xor.cycle(cycles);
>      }
> }