RSA and Diffie-Hellman performance [Was: RFR(L): 8069539: RSA acceleration]
Andrew Haley
aph at redhat.com
Thu May 28 09:25:38 UTC 2015
On 05/28/2015 08:51 AM, Andrew Haley wrote:
> On 27/05/15 21:35, Anthony Scarpino wrote:
>
>> How much of the montgomery multiply and sqaure are you planning to
>> intrinsify? Are you doing the whole thing or just portions of the
>> operations similar to multiplyToLen and squareToLen?
>
> I'm doing the whole montgomery multiply and square operation in a
> routine which interleaves multiplication and reduction so that the
> intermediate product is never written into memory.
For a bit more information, here's the algorithm in C:
Andrew.
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2) \
do { \
unsigned long hi, lo; \
__asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
: "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
: "r"(A), "a"(B)); \
} while(0)
// Fast Montgomery multiplication. The derivation of the algorithm is
// in A Cryptographic Library for the Motorola DSP56000,
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
static void __attribute__((noinline))
montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
unsigned long m[], unsigned long inv, int len) {
unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
int i;
assert(inv * n[0] == -1UL);
for (i = 0; i < len; i++) {
int j;
for (j = 0; j < i; j++) {
MACC(a[j], b[i-j], t0, t1, t2);
MACC(m[j], n[i-j], t0, t1, t2);
}
MACC(a[i], b[0], t0, t1, t2);
m[i] = t0 * inv;
MACC(m[i], n[0], t0, t1, t2);
assert(t0 == 0);
t0 = t1; t1 = t2; t2 = 0;
}
for (i = len; i < 2*len; i++) {
int j;
for (j = i-len+1; j < len; j++) {
MACC(a[j], b[i-j], t0, t1, t2);
MACC(m[j], n[i-j], t0, t1, t2);
}
m[i-len] = t0;
t0 = t1; t1 = t2; t2 = 0;
}
m[len] = t0;
while(cmp(m, n, len+1) > 0)
sub(m, n, len+1);
}
More information about the security-dev
mailing list