mirror of
https://github.com/kmackay/micro-ecc.git
synced 2026-06-05 21:14:51 +00:00
Add fast multiply asm for AVR (#50)
This commit is contained in:
+2
-2
@@ -3,8 +3,6 @@
|
|||||||
#ifndef _UECC_ASM_ARM_H_
|
#ifndef _UECC_ASM_ARM_H_
|
||||||
#define _UECC_ASM_ARM_H_
|
#define _UECC_ASM_ARM_H_
|
||||||
|
|
||||||
#include "asm_arm_mult_square.inc"
|
|
||||||
|
|
||||||
#if (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1)
|
#if (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1)
|
||||||
#define uECC_MIN_WORDS 8
|
#define uECC_MIN_WORDS 8
|
||||||
#endif
|
#endif
|
||||||
@@ -158,6 +156,8 @@ uECC_VLI_API uECC_word_t uECC_vli_sub(uECC_word_t *result,
|
|||||||
|
|
||||||
#if (uECC_OPTIMIZATION_LEVEL >= 3)
|
#if (uECC_OPTIMIZATION_LEVEL >= 3)
|
||||||
|
|
||||||
|
#include "asm_arm_mult_square.inc"
|
||||||
|
|
||||||
#define FAST_MULT_ASM_5_TO_6 \
|
#define FAST_MULT_ASM_5_TO_6 \
|
||||||
"cmp r3, #5 \n\t" \
|
"cmp r3, #5 \n\t" \
|
||||||
"beq 1f \n\t" \
|
"beq 1f \n\t" \
|
||||||
|
|||||||
+76
@@ -3,6 +3,22 @@
|
|||||||
#ifndef _UECC_ASM_AVR_H_
|
#ifndef _UECC_ASM_AVR_H_
|
||||||
#define _UECC_ASM_AVR_H_
|
#define _UECC_ASM_AVR_H_
|
||||||
|
|
||||||
|
/* Select uECC_MIN_WORDS from the enabled curves. The blocks run from the largest
   value (32) down to the smallest (20), and every later block #undefs and redefines
   the macro, so the smallest enabled curve decides the final value. If none of the
   listed curves is enabled the macro is left undefined by this section.
   NOTE(review): the values equal the curve sizes in bytes (256/8 = 32, 224/8 = 28,
   192/8 = 24, 160/8 = 20), which suggests one word is one byte here - confirm. */
#if (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1)
|
||||||
|
#define uECC_MIN_WORDS 32
|
||||||
|
#endif
|
||||||
|
#if uECC_SUPPORTS_secp224r1
|
||||||
|
#undef uECC_MIN_WORDS
|
||||||
|
#define uECC_MIN_WORDS 28
|
||||||
|
#endif
|
||||||
|
#if uECC_SUPPORTS_secp192r1
|
||||||
|
#undef uECC_MIN_WORDS
|
||||||
|
#define uECC_MIN_WORDS 24
|
||||||
|
#endif
|
||||||
|
#if uECC_SUPPORTS_secp160r1
|
||||||
|
#undef uECC_MIN_WORDS
|
||||||
|
#define uECC_MIN_WORDS 20
|
||||||
|
#endif
|
||||||
|
|
||||||
#if __AVR_HAVE_EIJMP_EICALL__
|
#if __AVR_HAVE_EIJMP_EICALL__
|
||||||
#define IJMP "eijmp \n\t"
|
#define IJMP "eijmp \n\t"
|
||||||
#else
|
#else
|
||||||
@@ -189,6 +205,64 @@ uECC_VLI_API uECC_word_t uECC_vli_sub(uECC_word_t *result,
|
|||||||
}
|
}
|
||||||
#define asm_sub 1
|
#define asm_sub 1
|
||||||
|
|
||||||
|
#if (uECC_OPTIMIZATION_LEVEL >= 3)
|
||||||
|
|
||||||
|
#include "asm_avr_mult_square.inc"
|
||||||
|
|
||||||
|
/* AVR inline-assembly version of uECC_vli_mult(), assembled from the
   FAST_MULT_ASM_* macro fragments (defined outside this function;
   NOTE(review): presumably in asm_avr_mult_square.inc - confirm).

   left, right and result are handed to the assembly in the X, Y and Z pointer
   register pairs respectively; num_words is pinned to r18.

   NOTE(review): what is read from left/right and how much is written to result is
   decided by the macro fragments, not by the code visible here - presumably
   left and right hold num_words words each and result receives their full
   product; confirm against the macro definitions. */
__attribute((noinline))
|
||||||
|
uECC_VLI_API void uECC_vli_mult(uECC_word_t *result,
|
||||||
|
const uECC_word_t *left,
|
||||||
|
const uECC_word_t *right,
|
||||||
|
wordcount_t num_words) {
|
||||||
|
/* num_words should already be in r18. */
|
||||||
|
/* The explicit register variable lets num_words be passed to the asm as an
   input operand that lives in r18. */
register wordcount_t r18 __asm__("r18") = num_words;
|
||||||
|
|
||||||
|
__asm__ volatile (
|
||||||
|
/* Save num_words on the stack; each base-size branch below pops it back into
   r18 right after its base multiply fragment.
   NOTE(review): presumably the base fragments use r18 as scratch - confirm. */
"push r18 \n\t"
|
||||||
|
/* Emit the base multiply for the smallest configured size (uECC_MIN_WORDS), then
   append one *_TO_* extension fragment for every larger size up to
   uECC_MAX_WORDS.
   NOTE(review): the extension fragments presumably test r18 and jump to `done`
   when num_words is already covered - confirm in the macro definitions.
   If uECC_MIN_WORDS is none of 20/24/28/32, no branch is emitted and the push
   above has no matching pop. */
#if (uECC_MIN_WORDS == 20)
|
||||||
|
FAST_MULT_ASM_20
|
||||||
|
"pop r18 \n\t"
|
||||||
|
#if (uECC_MAX_WORDS > 20)
|
||||||
|
FAST_MULT_ASM_20_TO_24
|
||||||
|
#endif
|
||||||
|
#if (uECC_MAX_WORDS > 24)
|
||||||
|
FAST_MULT_ASM_24_TO_28
|
||||||
|
#endif
|
||||||
|
#if (uECC_MAX_WORDS > 28)
|
||||||
|
FAST_MULT_ASM_28_TO_32
|
||||||
|
#endif
|
||||||
|
#elif (uECC_MIN_WORDS == 24)
|
||||||
|
FAST_MULT_ASM_24
|
||||||
|
"pop r18 \n\t"
|
||||||
|
#if (uECC_MAX_WORDS > 24)
|
||||||
|
FAST_MULT_ASM_24_TO_28
|
||||||
|
#endif
|
||||||
|
#if (uECC_MAX_WORDS > 28)
|
||||||
|
FAST_MULT_ASM_28_TO_32
|
||||||
|
#endif
|
||||||
|
#elif (uECC_MIN_WORDS == 28)
|
||||||
|
FAST_MULT_ASM_28
|
||||||
|
"pop r18 \n\t"
|
||||||
|
#if (uECC_MAX_WORDS > 28)
|
||||||
|
FAST_MULT_ASM_28_TO_32
|
||||||
|
#endif
|
||||||
|
#elif (uECC_MIN_WORDS == 32)
|
||||||
|
FAST_MULT_ASM_32
|
||||||
|
"pop r18 \n\t"
|
||||||
|
#endif
|
||||||
|
/* Common exit point. `done` is a plain (non-local) assembler label.
   NOTE(review): presumably the jump target of the extension fragments, and the
   reason the function is marked noinline (a duplicated copy of this asm would
   define the label twice) - confirm. */
"done: \n\t"
|
||||||
|
/* Zero r1 before returning to compiled code.
   NOTE(review): avr-gcc expects r1 to hold zero and `mul` leaves its result in
   r1:r0 - presumably that is why r1 is cleared here; confirm. */
"eor r1, r1 \n\t"
|
||||||
|
/* Read-write operands: left in X, right in Y, result in Z. */
: "+x" (left), "+y" (right), "+z" (result)
|
||||||
|
/* Input operand: num_words, held in r18 via the register variable above. */
: "r" (r18)
|
||||||
|
/* Clobbers: r0, r2-r17 and r19-r25, plus condition codes and memory. r1 and r18
   are not listed; the asm zeroes r1 and pops r18 itself. */
: "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
|
||||||
|
"r11", "r12", "r13", "r14", "r15", "r16", "r17", "r19", "r20",
|
||||||
|
"r21", "r22", "r23", "r24", "r25", "cc", "memory"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
#define asm_mult 1
|
||||||
|
|
||||||
|
#endif /* (uECC_OPTIMIZATION_LEVEL >= 3) */
|
||||||
|
|
||||||
#if uECC_SUPPORTS_secp160r1
|
#if uECC_SUPPORTS_secp160r1
|
||||||
static const struct uECC_Curve_t curve_secp160r1;
|
static const struct uECC_Curve_t curve_secp160r1;
|
||||||
static void vli_mmod_fast_secp160r1(uECC_word_t *result, uECC_word_t *product) {
|
static void vli_mmod_fast_secp160r1(uECC_word_t *result, uECC_word_t *product) {
|
||||||
@@ -704,6 +778,8 @@ static void vli_mmod_fast_secp256r1(uECC_word_t *result, uECC_word_t *product) {
|
|||||||
|
|
||||||
#endif /* (uECC_OPTIMIZATION_LEVEL >= 2) */
|
#endif /* (uECC_OPTIMIZATION_LEVEL >= 2) */
|
||||||
|
|
||||||
|
/* ---- "Small" implementations ---- */
|
||||||
|
|
||||||
#if !asm_add
|
#if !asm_add
|
||||||
uECC_VLI_API uECC_word_t uECC_vli_add(uECC_word_t *result,
|
uECC_VLI_API uECC_word_t uECC_vli_add(uECC_word_t *result,
|
||||||
const uECC_word_t *left,
|
const uECC_word_t *left,
|
||||||
|
|||||||
+3197
-16
File diff suppressed because it is too large
Load Diff
Executable
+143
@@ -0,0 +1,143 @@
|
|||||||
|
#!/usr/bin/env python
"""Emit AVR assembly, as quoted C string lines, for a 4-byte multiply extension.

Usage: <script> SIZE

SIZE is the integer size in bytes. Every emitted instruction is wrapped as
``"<instruction> \\n\\t"`` so the output can be pasted into a C inline-asm macro.

The emitted code loads four new high bytes of each operand through X and Y,
rewinds X/Y by SIZE + 4 and Z by SIZE, then produces SIZE + 8 output bytes
through Z: SIZE columns that are added to the bytes already stored at Z,
followed by the columns of the new high triangle.

NOTE(review): the emitted code uses r25 as the carry-only addend
(``adc rN, r25``) and never sets it, so r25 is presumably zeroed by the
surrounding assembly - confirm. Presumably the output is the body of a
FAST_MULT_ASM_<SIZE>_TO_<SIZE + 4> macro - confirm.

The script runs unchanged on Python 2 and Python 3.
"""

import sys

# Registers holding the four "current" low bytes of each operand.
# update_low() rotates these lists while new bytes are streamed in.
left_lo = [10, 11, 12, 13]
right_lo = [14, 15, 16, 17]

# Three-register column accumulator; rotate_acc() shifts it by one byte.
accum = [19, 20, 21]

# Register order used for the final triangle; filled in by generate() once the
# low-byte rotation has finished.
left_combined = []
right_combined = []

# Lines collected by emit()/emit_blank() during the current generate() call.
_lines = []


def lhi(i):
    """Register holding high byte i of the left operand (r2..r5)."""
    return i + 2


def rhi(i):
    """Register holding high byte i of the right operand (r6..r9)."""
    return i + 6


def llo(i):
    """Register currently holding low byte i of the left operand."""
    return left_lo[i]


def rlo(i):
    """Register currently holding low byte i of the right operand."""
    return right_lo[i]


def emit(line, *args):
    """Record one instruction, formatted with args and wrapped as a C string."""
    wrapped = '"' + line + r' \n\t"'
    _lines.append(wrapped % args)


def emit_blank():
    """Record an empty separator line."""
    _lines.append("")


def update_low():
    """Rotate the low-byte windows and emit loads for the newly exposed slot."""
    global left_lo
    global right_lo
    left_lo = left_lo[1:] + left_lo[:1]
    right_lo = right_lo[1:] + right_lo[:1]
    emit("ld r%s, x+", left_lo[3])
    emit("ld r%s, y+", right_lo[3])


def acc(i):
    """Register currently serving as accumulator byte i."""
    return accum[i]


def rotate_acc():
    """Shift the accumulator window by one byte after a column is stored."""
    global accum
    accum = accum[1:] + accum[:1]


def left(i):
    """Left-operand register i in final-triangle order."""
    return left_combined[i]


def right(i):
    """Right-operand register i in final-triangle order."""
    return right_combined[i]


def generate(size):
    """Return the output lines (blank separators included) for SIZE bytes."""
    global left_lo
    global right_lo
    global accum
    global left_combined
    global right_combined

    # Start from the initial register assignment so repeated calls agree.
    left_lo = [10, 11, 12, 13]
    right_lo = [14, 15, 16, 17]
    accum = [19, 20, 21]
    del _lines[:]

    # Load high values
    for i in range(4):
        emit("ld r%s, x+", lhi(i))
        emit("ld r%s, y+", rhi(i))

    emit("sbiw r26, %s", size + 4)
    emit("sbiw r28, %s", size + 4)
    emit("sbiw r30, %s", size)

    # Load low values
    for i in range(4):
        emit("ld r%s, x+", llo(i))
        emit("ld r%s, y+", rlo(i))
    emit_blank()

    # Compute initial triangles
    emit("mul r%s, r%s", lhi(0), rlo(0))
    emit("mov r%s, r0", acc(0))
    emit("mov r%s, r1", acc(1))
    emit("ldi r%s, 0", acc(2))
    emit("ld r0, z")
    emit("add r%s, r0", acc(0))
    emit("adc r%s, r25", acc(1))
    emit("mul r%s, r%s", rhi(0), llo(0))
    emit("add r%s, r0", acc(0))
    emit("adc r%s, r1", acc(1))
    emit("adc r%s, r25", acc(2))
    emit("st z+, r%s", acc(0))
    emit_blank()
    rotate_acc()

    for i in range(1, 4):
        emit("ldi r%s, 0", acc(2))
        emit("ld r0, z")
        emit("add r%s, r0", acc(0))
        emit("adc r%s, r25", acc(1))
        for j in range(i + 1):
            emit("mul r%s, r%s", lhi(j), rlo(i-j))
            emit("add r%s, r0", acc(0))
            emit("adc r%s, r1", acc(1))
            emit("adc r%s, r25", acc(2))
            emit("mul r%s, r%s", rhi(j), llo(i-j))
            emit("add r%s, r0", acc(0))
            emit("adc r%s, r1", acc(1))
            emit("adc r%s, r25", acc(2))
        emit("st z+, r%s", acc(0))
        emit_blank()
        rotate_acc()

    # Compute rows overlapping old block
    for i in range(4, size):
        emit("ldi r%s, 0", acc(2))
        emit("ld r0, z")
        emit("add r%s, r0", acc(0))
        emit("adc r%s, r25", acc(1))
        update_low()
        for j in range(4):
            emit("mul r%s, r%s", lhi(j), rlo(3-j))
            emit("add r%s, r0", acc(0))
            emit("adc r%s, r1", acc(1))
            emit("adc r%s, r25", acc(2))
            emit("mul r%s, r%s", rhi(j), llo(3-j))
            emit("add r%s, r0", acc(0))
            emit("adc r%s, r1", acc(1))
            emit("adc r%s, r25", acc(2))
        emit("st z+, r%s", acc(0))
        emit_blank()
        rotate_acc()

    # Compute new triangle
    # Captured only now, because update_low() has rotated the low windows.
    left_combined = [llo(1), llo(2), llo(3), lhi(0), lhi(1), lhi(2), lhi(3)]
    right_combined = [rlo(1), rlo(2), rlo(3), rhi(0), rhi(1), rhi(2), rhi(3)]

    for i in range(6):
        emit("ldi r%s, 0", acc(2))
        for j in range(7 - i):
            emit("mul r%s, r%s", left(i+j), right(6-j))
            emit("add r%s, r0", acc(0))
            emit("adc r%s, r1", acc(1))
            emit("adc r%s, r25", acc(2))
        emit("st z+, r%s", acc(0))
        emit_blank()
        rotate_acc()

    emit("mul r%s, r%s", left(6), right(6))
    emit("add r%s, r0", acc(0))
    emit("adc r%s, r1", acc(1))
    emit("st z+, r%s", acc(0))
    emit("st z+, r%s", acc(1))
    emit("adiw r26, 4")
    emit("adiw r28, 4")

    return list(_lines)


def main(argv):
    """Command-line entry point; returns the process exit status."""
    if len(argv) < 2:
        # Single-argument print() behaves the same on Python 2 and Python 3.
        print("Provide the integer size in bytes")
        return 1
    for line in generate(int(argv[1])):
        print(line)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
|
||||||
Reference in New Issue
Block a user