Add fast multiply asm for AVR (#50)

This commit is contained in:
Ken MacKay
2016-01-07 23:13:58 -08:00
parent d18c132149
commit f870194aae
4 changed files with 3418 additions and 18 deletions
+2 -2
View File
@@ -3,8 +3,6 @@
#ifndef _UECC_ASM_ARM_H_
#define _UECC_ASM_ARM_H_
#include "asm_arm_mult_square.inc"
/* When either secp256r1 or secp256k1 is enabled, uECC_MIN_WORDS is set to 8.
   NOTE(review): 8 words for a 256-bit value implies 32-bit words in this file --
   confirm against the definition of uECC_word_t. */
#if (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1)
#define uECC_MIN_WORDS 8
#endif
@@ -158,6 +156,8 @@ uECC_VLI_API uECC_word_t uECC_vli_sub(uECC_word_t *result,
#if (uECC_OPTIMIZATION_LEVEL >= 3)
#include "asm_arm_mult_square.inc"
#define FAST_MULT_ASM_5_TO_6 \
"cmp r3, #5 \n\t" \
"beq 1f \n\t" \
+76
View File
@@ -3,6 +3,22 @@
#ifndef _UECC_ASM_AVR_H_
#define _UECC_ASM_AVR_H_
/*
 * Select uECC_MIN_WORDS from the smallest curve that is enabled:
 *   secp160r1 -> 20, secp192r1 -> 24, secp224r1 -> 28,
 *   secp256r1 / secp256k1 -> 32.
 * The branches are tested smallest-first, so the first match is the minimum.
 * If no curve is enabled the macro is left undefined.
 * NOTE(review): the values equal the curve sizes in bytes, which suggests
 * one-byte words in this file -- confirm against uECC_word_t.
 */
#if uECC_SUPPORTS_secp160r1
#undef uECC_MIN_WORDS
#define uECC_MIN_WORDS 20
#elif uECC_SUPPORTS_secp192r1
#undef uECC_MIN_WORDS
#define uECC_MIN_WORDS 24
#elif uECC_SUPPORTS_secp224r1
#undef uECC_MIN_WORDS
#define uECC_MIN_WORDS 28
#elif (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1)
#define uECC_MIN_WORDS 32
#endif
#if __AVR_HAVE_EIJMP_EICALL__
#define IJMP "eijmp \n\t"
#else
@@ -189,6 +205,64 @@ uECC_VLI_API uECC_word_t uECC_vli_sub(uECC_word_t *result,
}
#define asm_sub 1
#if (uECC_OPTIMIZATION_LEVEL >= 3)
#include "asm_avr_mult_square.inc"
/*
 * AVR inline-assembly implementation of uECC_vli_mult().
 *
 * Parameters:
 *   result    - bound to the Z pointer register (read-write operand).
 *   left      - bound to the X pointer register (read-write operand).
 *   right     - bound to the Y pointer register (read-write operand).
 *   num_words - passed to the asm in r18.
 *
 * The asm body is stitched together at preprocessing time from FAST_MULT_ASM_*
 * string fragments: uECC_MIN_WORDS picks the base fragment (20/24/28/32) and
 * uECC_MAX_WORDS decides which *_TO_* extension fragments follow it.
 * NOTE(review): the fragments are presumably defined in asm_avr_mult_square.inc
 * (included just before this function) -- confirm there.
 *
 * NOTE(review): noinline is presumably required because the asm defines the
 * plain label "done:", which would be emitted more than once if this body were
 * inlined at several call sites -- confirm.
 */
__attribute((noinline))
uECC_VLI_API void uECC_vli_mult(uECC_word_t *result,
const uECC_word_t *left,
const uECC_word_t *right,
wordcount_t num_words) {
/* num_words should already be in r18. The explicit register variable pins it
   there so it can be handed to the asm as an input operand. */
register wordcount_t r18 __asm__("r18") = num_words;
__asm__ volatile (
/* Save r18 across the base fragment; each branch below restores it with a pop
   immediately after that fragment.
   NOTE(review): presumably the base fragment overwrites r18 and the *_TO_*
   fragments need num_words back -- confirm in the .inc file.
   NOTE(review): if uECC_MIN_WORDS is none of 20/24/28/32, no branch below is
   taken and this push has no matching pop. */
"push r18 \n\t"
#if (uECC_MIN_WORDS == 20)
FAST_MULT_ASM_20
"pop r18 \n\t"
#if (uECC_MAX_WORDS > 20)
FAST_MULT_ASM_20_TO_24
#endif
#if (uECC_MAX_WORDS > 24)
FAST_MULT_ASM_24_TO_28
#endif
#if (uECC_MAX_WORDS > 28)
FAST_MULT_ASM_28_TO_32
#endif
#elif (uECC_MIN_WORDS == 24)
FAST_MULT_ASM_24
"pop r18 \n\t"
#if (uECC_MAX_WORDS > 24)
FAST_MULT_ASM_24_TO_28
#endif
#if (uECC_MAX_WORDS > 28)
FAST_MULT_ASM_28_TO_32
#endif
#elif (uECC_MIN_WORDS == 28)
FAST_MULT_ASM_28
"pop r18 \n\t"
#if (uECC_MAX_WORDS > 28)
FAST_MULT_ASM_28_TO_32
#endif
#elif (uECC_MIN_WORDS == 32)
FAST_MULT_ASM_32
"pop r18 \n\t"
#endif
/* NOTE(review): "done" is presumably the exit label the fragments branch to
   once the requested size has been handled -- confirm in the .inc file. */
"done: \n\t"
/* Clear r1 before returning to compiled code: avr-gcc treats r1 as the
   always-zero register, and the mul instruction writes its product to r1:r0. */
"eor r1, r1 \n\t"
/* Outputs: all three pointers are read-write ("+"), so the asm is allowed to
   modify X, Y and Z. */
: "+x" (left), "+y" (right), "+z" (result)
/* Input: num_words, held in r18 through the register variable above. */
: "r" (r18)
/* Clobbers: every general register except r1 (re-zeroed above), r18 (the
   input, restored by the pop) and r26-r31 (X/Y/Z, listed as operands), plus
   the condition flags and memory. */
: "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
"r11", "r12", "r13", "r14", "r15", "r16", "r17", "r19", "r20",
"r21", "r22", "r23", "r24", "r25", "cc", "memory"
);
}
#define asm_mult 1
#endif /* (uECC_OPTIMIZATION_LEVEL >= 3) */
#if uECC_SUPPORTS_secp160r1
static const struct uECC_Curve_t curve_secp160r1;
static void vli_mmod_fast_secp160r1(uECC_word_t *result, uECC_word_t *product) {
@@ -704,6 +778,8 @@ static void vli_mmod_fast_secp256r1(uECC_word_t *result, uECC_word_t *product) {
#endif /* (uECC_OPTIMIZATION_LEVEL >= 2) */
/* ---- "Small" implementations ---- */
#if !asm_add
uECC_VLI_API uECC_word_t uECC_vli_add(uECC_word_t *result,
const uECC_word_t *left,
+3197 -16
View File
File diff suppressed because it is too large Load Diff
+143
View File
@@ -0,0 +1,143 @@
#!/usr/bin/env python
import sys
# The operand size in bytes is the script's only command-line argument.
# print() is written in call form so the script parses under Python 2 and 3.
if len(sys.argv) < 2:
    print("Provide the integer size in bytes")
    sys.exit(1)

size = int(sys.argv[1])
def lhi(i):
    """Return the register number (i + 2) used for byte i of the left "high" group."""
    return i + 2
def rhi(i):
    """Return the register number (i + 6) used for byte i of the right "high" group."""
    return i + 6
# Register numbers that currently hold the four "low" bytes of each operand.
# update_low() rotates these lists as it loads new bytes.
left_lo = [10, 11, 12, 13]
right_lo = [14, 15, 16, 17]


def llo(i):
    """Return the register number currently at position i of the left low window."""
    return left_lo[i]


def rlo(i):
    """Return the register number currently at position i of the right low window."""
    return right_lo[i]
def emit(line, *args):
    r"""Print one assembly line as a C string-literal fragment.

    `line` is wrapped in double quotes with the literal text ` \n\t` appended
    inside the quotes, then %-formatted with `args` and printed to stdout.
    print() is used in call form so this works under both Python 2 and 3.
    """
    s = '"' + line + r' \n\t"'
    print(s % args)
def update_low():
    """Slide both low windows forward by one byte.

    Each list is rotated left by one, so the register that held the oldest
    byte becomes the last entry. A fresh byte is then loaded into that
    register, through X for the left operand and Y for the right, with
    post-increment.
    """
    global left_lo
    global right_lo
    left_lo = left_lo[1:] + left_lo[:1]
    right_lo = right_lo[1:] + right_lo[:1]
    emit("ld r%s, x+", left_lo[3])
    emit("ld r%s, y+", right_lo[3])
# Register numbers of the three-byte accumulator, least significant first.
accum = [19, 20, 21]


def acc(i):
    """Return the register number at position i of the accumulator."""
    return accum[i]


def rotate_acc():
    """Rotate the accumulator left by one register.

    Called after acc(0) has been stored: the old acc(1) becomes the new
    acc(0), and the stored register is reused as the new acc(2).
    """
    global accum
    accum = accum[1:] + accum[:1]
# Load high values
# Four bytes of each operand are read through X (left) and Y (right) with
# post-increment. range() is used instead of xrange() so the script also
# runs under Python 3.
for i in range(4):
    emit("ld r%s, x+", lhi(i))
    emit("ld r%s, y+", rhi(i))

# Step X (r26) and Y (r28) back by size + 4: the 4 bytes just read plus
# `size` more, so the low loads below start `size` bytes before the high
# group. Z (r30) is stepped back by `size`.
emit("sbiw r26, %s", size + 4)
emit("sbiw r28, %s", size + 4)
emit("sbiw r30, %s", size)

# Load low values
for i in range(4):
    emit("ld r%s, x+", llo(i))
    emit("ld r%s, y+", rlo(i))
print("")
# Compute initial triangles
# First output byte: lhi(0)*rlo(0) seeds the accumulator, the byte already at
# Z is added in, then rhi(0)*llo(0) is accumulated and the low byte stored.
# NOTE(review): r25 is only ever used as the operand of carry-propagating
# adc instructions, so it presumably holds zero; nothing emitted by this
# script sets it -- confirm in the code that precedes the generated fragment.
emit("mul r%s, r%s", lhi(0), rlo(0))
emit("mov r%s, r0", acc(0))
emit("mov r%s, r1", acc(1))
emit("ldi r%s, 0", acc(2))
emit("ld r0, z")
emit("add r%s, r0", acc(0))
emit("adc r%s, r25", acc(1))
emit("mul r%s, r%s", rhi(0), llo(0))
emit("add r%s, r0", acc(0))
emit("adc r%s, r1", acc(1))
emit("adc r%s, r25", acc(2))
emit("st z+, r%s", acc(0))
print("")
rotate_acc()
# Output bytes 1..3 of the initial triangles. For each byte: clear the new top
# accumulator register, add the byte already at Z, accumulate every
# high*low cross product whose indices sum to i (both lhi*rlo and rhi*llo),
# then store the low accumulator byte and rotate the accumulator.
for i in range(1, 4):
    emit("ldi r%s, 0", acc(2))
    emit("ld r0, z")
    emit("add r%s, r0", acc(0))
    emit("adc r%s, r25", acc(1))
    for j in range(i + 1):
        emit("mul r%s, r%s", lhi(j), rlo(i-j))
        emit("add r%s, r0", acc(0))
        emit("adc r%s, r1", acc(1))
        emit("adc r%s, r25", acc(2))
        emit("mul r%s, r%s", rhi(j), llo(i-j))
        emit("add r%s, r0", acc(0))
        emit("adc r%s, r1", acc(1))
        emit("adc r%s, r25", acc(2))
    emit("st z+, r%s", acc(0))
    print("")
    rotate_acc()
# Compute rows overlapping old block
# For each remaining byte up to `size`: add the byte already at Z, slide the
# low windows forward by one byte (update_low), then accumulate the four
# lhi*rlo and four rhi*llo products pairing high byte j with low position 3-j.
for i in range(4, size):
    emit("ldi r%s, 0", acc(2))
    emit("ld r0, z")
    emit("add r%s, r0", acc(0))
    emit("adc r%s, r25", acc(1))
    update_low()
    for j in range(4):
        emit("mul r%s, r%s", lhi(j), rlo(3-j))
        emit("add r%s, r0", acc(0))
        emit("adc r%s, r1", acc(1))
        emit("adc r%s, r25", acc(2))
        emit("mul r%s, r%s", rhi(j), llo(3-j))
        emit("add r%s, r0", acc(0))
        emit("adc r%s, r1", acc(1))
        emit("adc r%s, r25", acc(2))
    emit("st z+, r%s", acc(0))
    print("")
    rotate_acc()
# Compute new triangle
# Seven registers per operand: low-window positions 1..3 followed by the four
# high registers. The lists are built here, after all update_low() calls, so
# they capture the final low-window register assignment.
left_combined = [llo(1), llo(2), llo(3), lhi(0), lhi(1), lhi(2), lhi(3)]
right_combined = [rlo(1), rlo(2), rlo(3), rhi(0), rhi(1), rhi(2), rhi(3)]


def left(i):
    """Return the register number at position i of the combined left list."""
    return left_combined[i]


def right(i):
    """Return the register number at position i of the combined right list."""
    return right_combined[i]


# Six output bytes, each summing the products left(i+j) * right(6-j); the
# number of products shrinks from 7 to 2 as i grows.
for i in range(6):
    emit("ldi r%s, 0", acc(2))
    for j in range(7 - i):
        emit("mul r%s, r%s", left(i+j), right(6-j))
        emit("add r%s, r0", acc(0))
        emit("adc r%s, r1", acc(1))
        emit("adc r%s, r25", acc(2))
    emit("st z+, r%s", acc(0))
    print("")
    rotate_acc()

# Final product: its two result bytes are stored directly, with no third
# accumulator byte.
emit("mul r%s, r%s", left(6), right(6))
emit("add r%s, r0", acc(0))
emit("adc r%s, r1", acc(1))
emit("st z+, r%s", acc(0))
emit("st z+, r%s", acc(1))
# Advance X (r26) and Y (r28) by 4 bytes.
emit("adiw r26, 4")
emit("adiw r28, 4")