RSA and Diffie-Hellman performance [Was: RFR(L): 8069539: RSA acceleration]

Thu May 28 09:25:38 UTC 2015

On 05/28/2015 08:51 AM, Andrew Haley wrote:
> On 27/05/15 21:35, Anthony Scarpino wrote:
> 
>> How much of the montgomery multiply and sqaure are you planning to
>> intrinsify?  Are you doing the whole thing or just portions of the
>> operations similar to multiplyToLen and squareToLen?
> 
> I'm doing the whole montgomery multiply and square operation in a
> routine which interleaves multiplication and reduction so that the
> intermediate product is never written into memory.

For a bit more information, here's the algorithm in C:

Andrew.

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2)                                  \
do {                                                            \
  unsigned long hi, lo;                                         \
  __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
           : "r"(A), "a"(B));                                   \
 } while(0)

// Fast Montgomery multiplication.  The derivation of the algorithm is
// in  A Cryptographic Library for the Motorola DSP56000,
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.

static void __attribute__((noinline))
montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
                    unsigned long m[], unsigned long inv, int len) {
  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  int i;

  assert(inv * n[0] == -1UL);

  for (i = 0; i < len; i++) {
    int j;
    for (j = 0; j < i; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    MACC(a[i], b[0], t0, t1, t2);
    m[i] = t0 * inv;
    MACC(m[i], n[0], t0, t1, t2);

    assert(t0 == 0);
    t0 = t1; t1 = t2; t2 = 0;
  }

  for (i = len; i < 2*len; i++) {
    int j;
    for (j = i-len+1; j < len; j++) {
      MACC(a[j], b[i-j], t0, t1, t2);
      MACC(m[j], n[i-j], t0, t1, t2);
    }
    m[i-len] = t0;
    t0 = t1; t1 = t2; t2 = 0;
  }

  m[len] = t0;
  while(cmp(m, n, len+1) > 0)
    sub(m, n, len+1);
}