mirror of
https://github.com/kmackay/micro-ecc.git
synced 2026-06-05 21:14:51 +00:00
Add fast multiply asm for AVR (#50)
This commit is contained in:
+2
-2
@@ -3,8 +3,6 @@
|
||||
#ifndef _UECC_ASM_ARM_H_
|
||||
#define _UECC_ASM_ARM_H_
|
||||
|
||||
#include "asm_arm_mult_square.inc"
|
||||
|
||||
#if (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1)
|
||||
#define uECC_MIN_WORDS 8
|
||||
#endif
|
||||
@@ -158,6 +156,8 @@ uECC_VLI_API uECC_word_t uECC_vli_sub(uECC_word_t *result,
|
||||
|
||||
#if (uECC_OPTIMIZATION_LEVEL >= 3)
|
||||
|
||||
#include "asm_arm_mult_square.inc"
|
||||
|
||||
#define FAST_MULT_ASM_5_TO_6 \
|
||||
"cmp r3, #5 \n\t" \
|
||||
"beq 1f \n\t" \
|
||||
|
||||
+76
@@ -3,6 +3,22 @@
|
||||
#ifndef _UECC_ASM_AVR_H_
|
||||
#define _UECC_ASM_AVR_H_
|
||||
|
||||
/*
 * Choose uECC_MIN_WORDS from the enabled curves.  The checks run from the
 * largest size (32) down to the smallest (20) and every later match replaces
 * the earlier definition, so the macro ends up as the smallest size among the
 * enabled curves.  If none of these curves is enabled, this block leaves the
 * macro undefined.
 * NOTE(review): 32/28/24/20 match the curve sizes in bytes, so a "word" is
 * presumably one byte on AVR -- confirm against uECC_word_t.
 */
#if (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1)
|
||||
#define uECC_MIN_WORDS 32
|
||||
#endif
|
||||
#if uECC_SUPPORTS_secp224r1
|
||||
#undef uECC_MIN_WORDS
|
||||
#define uECC_MIN_WORDS 28
|
||||
#endif
|
||||
#if uECC_SUPPORTS_secp192r1
|
||||
#undef uECC_MIN_WORDS
|
||||
#define uECC_MIN_WORDS 24
|
||||
#endif
|
||||
#if uECC_SUPPORTS_secp160r1
|
||||
#undef uECC_MIN_WORDS
|
||||
#define uECC_MIN_WORDS 20
|
||||
#endif
|
||||
|
||||
#if __AVR_HAVE_EIJMP_EICALL__
|
||||
#define IJMP "eijmp \n\t"
|
||||
#else
|
||||
@@ -189,6 +205,64 @@ uECC_VLI_API uECC_word_t uECC_vli_sub(uECC_word_t *result,
|
||||
}
|
||||
#define asm_sub 1
|
||||
|
||||
#if (uECC_OPTIMIZATION_LEVEL >= 3)
|
||||
|
||||
#include "asm_avr_mult_square.inc"
|
||||
|
||||
/*
 * uECC_vli_mult: AVR inline-assembly multiply of `left` by `right`, written
 * through `result`.
 *
 * The asm body is assembled at compile time from FAST_MULT_ASM_* string
 * macros (presumably defined in asm_avr_mult_square.inc, included just
 * above): one base block selected by uECC_MIN_WORDS, followed by the *_TO_*
 * extension blocks that uECC_MAX_WORDS enables.
 *
 * Operands: left, right and result are bound read-write to the X, Y and Z
 * pointer registers; num_words is passed to the asm in r18.
 *
 * NOTE(review): the required size of the result buffer is not visible here;
 * presumably 2 * num_words words -- confirm against the generic version.
 * NOTE(review): noinline presumably keeps the non-local `done` label from
 * being emitted more than once -- confirm.
 */
__attribute((noinline))
|
||||
uECC_VLI_API void uECC_vli_mult(uECC_word_t *result,
|
||||
const uECC_word_t *left,
|
||||
|
||||
const uECC_word_t *right,
|
||||
wordcount_t num_words) {
|
||||
/* num_words should already be in r18; the explicit register variable below
   makes the asm input operand use exactly that register.
   NOTE(review): presumably r18 is where the calling convention places the
   fourth argument -- confirm against the avr-gcc ABI. */
|
||||
register wordcount_t r18 __asm__("r18") = num_words;
|
||||
|
||||
__asm__ volatile (
|
||||
/* Save num_words; each branch below restores it with "pop r18" right after
   its base block.  NOTE(review): presumably the base block overwrites r18 --
   confirm against the FAST_MULT_ASM_* definitions. */
"push r18 \n\t"
|
||||
/* Exactly one base block is emitted, chosen by uECC_MIN_WORDS, followed by
   the extension blocks up to uECC_MAX_WORDS.
   NOTE(review): if uECC_MIN_WORDS is none of 20/24/28/32 no branch is taken
   and the push above has no matching pop. */
#if (uECC_MIN_WORDS == 20)
|
||||
FAST_MULT_ASM_20
|
||||
"pop r18 \n\t"
|
||||
#if (uECC_MAX_WORDS > 20)
|
||||
FAST_MULT_ASM_20_TO_24
|
||||
#endif
|
||||
#if (uECC_MAX_WORDS > 24)
|
||||
FAST_MULT_ASM_24_TO_28
|
||||
#endif
|
||||
#if (uECC_MAX_WORDS > 28)
|
||||
FAST_MULT_ASM_28_TO_32
|
||||
#endif
|
||||
#elif (uECC_MIN_WORDS == 24)
|
||||
FAST_MULT_ASM_24
|
||||
"pop r18 \n\t"
|
||||
#if (uECC_MAX_WORDS > 24)
|
||||
FAST_MULT_ASM_24_TO_28
|
||||
#endif
|
||||
#if (uECC_MAX_WORDS > 28)
|
||||
FAST_MULT_ASM_28_TO_32
|
||||
#endif
|
||||
#elif (uECC_MIN_WORDS == 28)
|
||||
FAST_MULT_ASM_28
|
||||
"pop r18 \n\t"
|
||||
#if (uECC_MAX_WORDS > 28)
|
||||
FAST_MULT_ASM_28_TO_32
|
||||
#endif
|
||||
#elif (uECC_MIN_WORDS == 32)
|
||||
FAST_MULT_ASM_32
|
||||
"pop r18 \n\t"
|
||||
#endif
|
||||
/* Common exit label.  NOTE(review): presumably the *_TO_* blocks branch here
   once r18 shows that no larger size is needed -- confirm in the macros. */
"done: \n\t"
|
||||
/* Clear r1 before leaving the asm.  NOTE(review): r1 is not in the clobber
   list, so the compiler presumably expects it to be zero afterwards --
   confirm against the avr-gcc ABI. */
"eor r1, r1 \n\t"
|
||||
/* Read-write operands: left in X, right in Y, result in Z. */
: "+x" (left), "+y" (right), "+z" (result)
|
||||
/* Input operand: num_words, held in r18 by the register variable above. */
: "r" (r18)
|
||||
/* Clobbers: every register from r0 to r25 except r1 (cleared above) and r18
   (the input operand), plus the condition flags and memory. */
: "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
|
||||
"r11", "r12", "r13", "r14", "r15", "r16", "r17", "r19", "r20",
|
||||
"r21", "r22", "r23", "r24", "r25", "cc", "memory"
|
||||
);
|
||||
}
|
||||
#define asm_mult 1
|
||||
|
||||
#endif /* (uECC_OPTIMIZATION_LEVEL >= 3) */
|
||||
|
||||
#if uECC_SUPPORTS_secp160r1
|
||||
static const struct uECC_Curve_t curve_secp160r1;
|
||||
static void vli_mmod_fast_secp160r1(uECC_word_t *result, uECC_word_t *product) {
|
||||
@@ -704,6 +778,8 @@ static void vli_mmod_fast_secp256r1(uECC_word_t *result, uECC_word_t *product) {
|
||||
|
||||
#endif /* (uECC_OPTIMIZATION_LEVEL >= 2) */
|
||||
|
||||
/* ---- "Small" implementations ---- */
|
||||
|
||||
#if !asm_add
|
||||
uECC_VLI_API uECC_word_t uECC_vli_add(uECC_word_t *result,
|
||||
const uECC_word_t *left,
|
||||
|
||||
+3197
-16
File diff suppressed because it is too large
Load Diff
Executable
+143
@@ -0,0 +1,143 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import sys
|
||||
|
||||
# Command-line handling: the single required argument is the integer size in
# bytes; the row count and pointer adjustments emitted below derive from it.
if len(sys.argv) < 2:
    # The generated assembly is printed on stdout, so the usage message goes
    # to stderr to keep it out of redirected output.  Writing through
    # sys.stderr also avoids the Python-2-only print statement.
    sys.stderr.write("Provide the integer size in bytes\n")
    sys.exit(1)

size = int(sys.argv[1])
|
||||
|
||||
def lhi(i):
    """Return the register number used for high value i of the left operand.

    The four left high values are loaded into r2..r5, so index 0 maps to 2.
    """
    first_left_high_reg = 2
    return first_left_high_reg + i
|
||||
|
||||
def rhi(i):
    """Return the register number used for high value i of the right operand.

    The four right high values are loaded into r6..r9, so index 0 maps to 6.
    """
    first_right_high_reg = 6
    return first_right_high_reg + i
|
||||
|
||||
# Register numbers for the four "low" values of each operand: r10..r13 for the
# left operand and r14..r17 for the right.  update_low() rotates these lists,
# so the slot -> register mapping changes while the script runs.
left_lo = list(range(10, 14))
right_lo = list(range(14, 18))


def llo(i):
    """Return the register currently in slot i of the left low list."""
    reg = left_lo[i]
    return reg


def rlo(i):
    """Return the register currently in slot i of the right low list."""
    reg = right_lo[i]
    return reg
|
||||
|
||||
def emit(line, *args):
    """Print one instruction as a C string-literal fragment.

    `line` is a %-style template that is filled in with `args`.  The printed
    text is the instruction wrapped in double quotes and ending in a literal
    backslash-n backslash-t sequence (escape text, not control characters),
    one fragment per output line.
    """
    template = '"' + line + r' \n\t"'
    # A single parenthesised argument prints identically as a Python 2 print
    # statement and as a Python 3 print() call, so this form runs on both.
    print(template % args)
|
||||
|
||||
def update_low():
    """Advance both low register lists by one value.

    Each list is rotated left by one position, which moves the register that
    was in slot 0 to the last slot; a load is then emitted to refill that
    register from x+ (left) and y+ (right).
    """
    global left_lo, right_lo
    left_lo = left_lo[1:] + [left_lo[0]]
    right_lo = right_lo[1:] + [right_lo[0]]
    emit("ld r%s, x+", left_lo[-1])
    emit("ld r%s, y+", right_lo[-1])
|
||||
|
||||
# Register numbers of the three-register accumulator (r19..r21).  The order is
# rotated after every stored row, so slot 0 always names the register that
# receives the next `add` and is stored next.
accum = list(range(19, 22))


def acc(i):
    """Return the register currently in accumulator slot i."""
    reg = accum[i]
    return reg


def rotate_acc():
    """Rotate the accumulator slots left by one (slot 0 moves to the end)."""
    global accum
    head, tail = accum[:1], accum[1:]
    accum = tail + head
|
||||
|
||||
# Load high values: four values of each operand are read through x+ and y+
# into the registers chosen by lhi()/rhi().
# range() and print("") are used instead of xrange()/print "" so the script
# runs on Python 3 as well as Python 2; the emitted text is unchanged.
for i in range(4):
    emit("ld r%s, x+", lhi(i))
    emit("ld r%s, y+", rhi(i))

# Step the pointer pairs back: r26 and r28 (the pairs used by x+ and y+) by
# size + 4, r30 (the pair used by z) by size.
emit("sbiw r26, %s", size + 4)
emit("sbiw r28, %s", size + 4)
emit("sbiw r30, %s", size)

# Load low values: the next four values of each operand go into the registers
# currently listed by llo()/rlo().
for i in range(4):
    emit("ld r%s, x+", llo(i))
    emit("ld r%s, y+", rlo(i))
print("")
|
||||
|
||||
# Compute initial triangles
# Every row adds the value already stored at z to the accumulator, adds that
# row's products, stores accumulator slot 0 through z+ and rotates the
# accumulator.  r25 is the second operand of each carry-only adc.
# NOTE(review): r25 is presumably zero at this point -- confirm in the
# surrounding assembly.
# range() and print("") replace the Python-2-only xrange()/print "".

# Row 0: the accumulator is seeded by moving the first product out of r0/r1.
emit("mul r%s, r%s", lhi(0), rlo(0))
emit("mov r%s, r0", acc(0))
emit("mov r%s, r1", acc(1))
emit("ldi r%s, 0", acc(2))
emit("ld r0, z")
emit("add r%s, r0", acc(0))
emit("adc r%s, r25", acc(1))
emit("mul r%s, r%s", rhi(0), llo(0))
emit("add r%s, r0", acc(0))
emit("adc r%s, r1", acc(1))
emit("adc r%s, r25", acc(2))
emit("st z+, r%s", acc(0))
print("")
rotate_acc()

# Rows 1..3: row i adds i + 1 pairs of products (left-high * right-low and
# right-high * left-low).
for i in range(1, 4):
    emit("ldi r%s, 0", acc(2))
    emit("ld r0, z")
    emit("add r%s, r0", acc(0))
    emit("adc r%s, r25", acc(1))
    for j in range(i + 1):
        emit("mul r%s, r%s", lhi(j), rlo(i-j))
        emit("add r%s, r0", acc(0))
        emit("adc r%s, r1", acc(1))
        emit("adc r%s, r25", acc(2))
        emit("mul r%s, r%s", rhi(j), llo(i-j))
        emit("add r%s, r0", acc(0))
        emit("adc r%s, r1", acc(1))
        emit("adc r%s, r25", acc(2))
    emit("st z+, r%s", acc(0))
    print("")
    rotate_acc()
|
||||
|
||||
# Compute rows overlapping old block
# For rows 4..size-1: add the value already stored at z to the accumulator,
# advance the low register lists by one value (update_low emits the two
# loads), add four pairs of products, store accumulator slot 0 through z+ and
# rotate the accumulator.
# range() and print("") replace the Python-2-only xrange()/print "".
for i in range(4, size):
    emit("ldi r%s, 0", acc(2))
    emit("ld r0, z")
    emit("add r%s, r0", acc(0))
    emit("adc r%s, r25", acc(1))
    update_low()
    for j in range(4):
        emit("mul r%s, r%s", lhi(j), rlo(3-j))
        emit("add r%s, r0", acc(0))
        emit("adc r%s, r1", acc(1))
        emit("adc r%s, r25", acc(2))
        emit("mul r%s, r%s", rhi(j), llo(3-j))
        emit("add r%s, r0", acc(0))
        emit("adc r%s, r1", acc(1))
        emit("adc r%s, r25", acc(2))
    emit("st z+, r%s", acc(0))
    print("")
    rotate_acc()
|
||||
|
||||
# Compute new triangle
# Seven registers per operand, in order: low slots 1..3 as they stand after
# the last update_low(), followed by the four high registers.
left_combined = list(map(llo, (1, 2, 3))) + list(map(lhi, range(4)))
right_combined = list(map(rlo, (1, 2, 3))) + list(map(rhi, range(4)))


def left(i):
    """Return entry i of the combined left register list."""
    reg = left_combined[i]
    return reg


def right(i):
    """Return entry i of the combined right register list."""
    reg = right_combined[i]
    return reg
|
||||
|
||||
# Rows of the new triangle: row i adds the 7 - i products
# left(i + j) * right(6 - j), stores accumulator slot 0 through z+ and rotates
# the accumulator.  No value is read back from z in these rows.
# range() and print("") replace the Python-2-only xrange()/print "".
for i in range(6):
    emit("ldi r%s, 0", acc(2))
    for j in range(7 - i):
        emit("mul r%s, r%s", left(i+j), right(6-j))
        emit("add r%s, r0", acc(0))
        emit("adc r%s, r1", acc(1))
        emit("adc r%s, r25", acc(2))
    emit("st z+, r%s", acc(0))
    print("")
    rotate_acc()

# Last product: add it and store the two remaining accumulator registers.
emit("mul r%s, r%s", left(6), right(6))
emit("add r%s, r0", acc(0))
emit("adc r%s, r1", acc(1))
emit("st z+, r%s", acc(0))
emit("st z+, r%s", acc(1))
# Advance the r26 and r28 pointer pairs by 4.
emit("adiw r26, 4")
emit("adiw r28, 4")
|
||||
Reference in New Issue
Block a user