Add ARM fast multiply/square for secp224r1.

This commit is contained in:
Ken MacKay
2015-06-28 21:47:16 -07:00
parent 1015fe5c43
commit 05cdd402f2
3 changed files with 567 additions and 8 deletions
+548
View File
@@ -502,6 +502,331 @@ static void vli_mult(uint32_t *result, const uint32_t *left, const uint32_t *rig
#define asm_mult 1
#endif /* (uECC_WORDS == 6) */
#if (uECC_WORDS == 7)
/* Multiplies two 7-word (uECC_WORDS == 7) little-endian-word integers with
   hand-scheduled ARM assembly and writes the 14-word product.

   Notation used in the comments below: x = left, y = right, z = result.

   result: receives 14 words, z[0]..z[13].
   left:   7 input words, x[0]..x[6].
   right:  7 input words, y[0]..y[6].

   The product is built column by column: for column k, every partial product
   x[i] * y[j] with i + j == k is added into a three-register accumulator, the
   lowest accumulator word is stored to z[k], and the accumulator registers
   rotate for the next column. The work is done in two passes because only
   three x words (r3-r5) and three y words (r6-r8) are held in registers at a
   time:
     pre-step: z[6..7]  = x[0] * y[6]
     pass 1:   z[3..10] = x[0..3] * y[3..6]   (folds in the pre-step words)
     pass 2:   all remaining partial products, adding the pass-1 words back in
               from z[3..10] as each column is reached.

   NOTE(review): z[3..10] are written before all of left/right have been read,
   so result presumably must not overlap either input - confirm with callers. */
static void vli_mult(uint32_t *result, const uint32_t *left, const uint32_t *right) {
/* The pointers are pinned to r0-r2 because the assembly names those registers
   directly and advances them ("+r" operands below). */
register uint32_t *r0 __asm__("r0") = result;
register const uint32_t *r1 __asm__("r1") = left;
register const uint32_t *r2 __asm__("r2") = right;
__asm__ volatile (
".syntax unified \n\t"
/* Pre-step: r3 = x[0], r6 = y[6]; z[6..7] = x[0] * y[6]. */
"add r0, 24 \n\t"
"add r2, 24 \n\t"
"ldmia r1!, {r3} \n\t"
"ldmia r2!, {r6} \n\t"
"umull r9, r10, r3, r6 \n\t"
"stmia r0!, {r9, r10} \n\t"
/* Pass 1 setup: r0 -> z[3], r2 -> y[3]; r6-r8 = y[3..5], r4-r5 = x[1..2]
   (r3 still holds x[0]). */
"sub r0, 20 \n\t"
"sub r2, 16 \n\t"
"ldmia r2!, {r6, r7, r8} \n\t"
"ldmia r1!, {r4, r5} \n\t"
/* Pass 1, column 3: x0*y3. */
"umull r9, r10, r3, r6 \n\t"
"stmia r0!, {r9} \n\t"
/* Pass 1, column 4: x0*y4 + x1*y3. */
"mov r14, #0 \n\t"
"umull r9, r12, r3, r7 \n\t"
"adds r10, r9 \n\t"
"adc r12, #0 \n\t"
"umull r9, r11, r4, r6 \n\t"
"adds r10, r9 \n\t"
"adcs r12, r11 \n\t"
"adc r14, #0 \n\t"
"stmia r0!, {r10} \n\t"
/* Pass 1, column 5: x0*y5 + x1*y4 + x2*y3. */
"mov r9, #0 \n\t"
"umull r10, r11, r3, r8 \n\t"
"adds r12, r10 \n\t"
"adcs r14, r11 \n\t"
"adc r9, #0 \n\t"
"umull r10, r11, r4, r7 \n\t"
"adds r12, r10 \n\t"
"adcs r14, r11 \n\t"
"adc r9, #0 \n\t"
"umull r10, r11, r5, r6 \n\t"
"adds r12, r10 \n\t"
"adcs r14, r11 \n\t"
"adc r9, #0 \n\t"
"stmia r0!, {r12} \n\t"
/* Pass 1, column 6: r3 = x[3]; x1*y5 + x2*y4 + x3*y3, plus the pre-step low
   word already sitting in z[6]. */
"ldmia r1!, {r3} \n\t"
"mov r10, #0 \n\t"
"umull r11, r12, r4, r8 \n\t"
"adds r14, r11 \n\t"
"adcs r9, r12 \n\t"
"adc r10, #0 \n\t"
"umull r11, r12, r5, r7 \n\t"
"adds r14, r11 \n\t"
"adcs r9, r12 \n\t"
"adc r10, #0 \n\t"
"umull r11, r12, r3, r6 \n\t"
"adds r14, r11 \n\t"
"adcs r9, r12 \n\t"
"adc r10, #0 \n\t"
"ldr r11, [r0] \n\t"
"adds r14, r11 \n\t"
"adcs r9, #0 \n\t"
"adc r10, #0 \n\t"
"stmia r0!, {r14} \n\t"
/* Pass 1, column 7: r6 = y[6]; x1*y6 + x2*y5 + x3*y4, plus the pre-step high
   word in z[7]. */
"ldmia r2!, {r6} \n\t"
"mov r11, #0 \n\t"
"umull r12, r14, r4, r6 \n\t"
"adds r9, r12 \n\t"
"adcs r10, r14 \n\t"
"adc r11, #0 \n\t"
"umull r12, r14, r5, r8 \n\t"
"adds r9, r12 \n\t"
"adcs r10, r14 \n\t"
"adc r11, #0 \n\t"
"umull r12, r14, r3, r7 \n\t"
"adds r9, r12 \n\t"
"adcs r10, r14 \n\t"
"adc r11, #0 \n\t"
"ldr r12, [r0] \n\t"
"adds r9, r12 \n\t"
"adcs r10, #0 \n\t"
"adc r11, #0 \n\t"
"stmia r0!, {r9} \n\t"
/* Pass 1, column 8: x2*y6 + x3*y5. */
"mov r12, #0 \n\t"
"umull r14, r9, r5, r6 \n\t"
"adds r10, r14 \n\t"
"adcs r11, r9 \n\t"
"adc r12, #0 \n\t"
"umull r14, r9, r3, r8 \n\t"
"adds r10, r14 \n\t"
"adcs r11, r9 \n\t"
"adc r12, #0 \n\t"
"stmia r0!, {r10} \n\t"
/* Pass 1, columns 9-10: x3*y6; the two remaining accumulator words go to
   z[9] and z[10]. */
"umull r9, r10, r3, r6 \n\t"
"adds r11, r9 \n\t"
"adc r12, r10 \n\t"
"stmia r0!, {r11, r12} \n\t"
/* Pass 2 setup: rewind r0 -> z[0], r1 -> x[0], r2 -> y[0];
   r3-r5 = x[0..2], r6-r8 = y[0..2]. */
"sub r0, 44 \n\t"
"sub r1, 16 \n\t"
"sub r2, 28 \n\t"
"ldmia r1!, {r3,r4,r5} \n\t"
"ldmia r2!, {r6,r7,r8} \n\t"
/* Pass 2, column 0: x0*y0. */
"umull r9, r10, r3, r6 \n\t"
"stmia r0!, {r9} \n\t"
/* Pass 2, column 1: x0*y1 + x1*y0. */
"mov r14, #0 \n\t"
"umull r9, r12, r3, r7 \n\t"
"adds r10, r9 \n\t"
"adc r12, #0 \n\t"
"umull r9, r11, r4, r6 \n\t"
"adds r10, r9 \n\t"
"adcs r12, r11 \n\t"
"adc r14, #0 \n\t"
"stmia r0!, {r10} \n\t"
/* Pass 2, column 2: x0*y2 + x1*y1 + x2*y0. */
"mov r9, #0 \n\t"
"umull r10, r11, r3, r8 \n\t"
"adds r12, r10 \n\t"
"adcs r14, r11 \n\t"
"adc r9, #0 \n\t"
"umull r10, r11, r4, r7 \n\t"
"adds r12, r10 \n\t"
"adcs r14, r11 \n\t"
"adc r9, #0 \n\t"
"umull r10, r11, r5, r6 \n\t"
"adds r12, r10 \n\t"
"adcs r14, r11 \n\t"
"adc r9, #0 \n\t"
"stmia r0!, {r12} \n\t"
/* Pass 2, column 3: r3 = x[3]; x1*y2 + x2*y1 + x3*y0 + pass-1 word z[3]. */
"ldmia r1!, {r3} \n\t"
"mov r10, #0 \n\t"
"umull r11, r12, r4, r8 \n\t"
"adds r14, r11 \n\t"
"adcs r9, r12 \n\t"
"adc r10, #0 \n\t"
"umull r11, r12, r5, r7 \n\t"
"adds r14, r11 \n\t"
"adcs r9, r12 \n\t"
"adc r10, #0 \n\t"
"umull r11, r12, r3, r6 \n\t"
"adds r14, r11 \n\t"
"adcs r9, r12 \n\t"
"adc r10, #0 \n\t"
"ldr r11, [r0] \n\t"
"adds r14, r11 \n\t"
"adcs r9, #0 \n\t"
"adc r10, #0 \n\t"
"stmia r0!, {r14} \n\t"
/* Pass 2, column 4: r4 = x[4]; x2*y2 + x3*y1 + x4*y0 + pass-1 word z[4]. */
"ldmia r1!, {r4} \n\t"
"mov r11, #0 \n\t"
"umull r12, r14, r5, r8 \n\t"
"adds r9, r12 \n\t"
"adcs r10, r14 \n\t"
"adc r11, #0 \n\t"
"umull r12, r14, r3, r7 \n\t"
"adds r9, r12 \n\t"
"adcs r10, r14 \n\t"
"adc r11, #0 \n\t"
"umull r12, r14, r4, r6 \n\t"
"adds r9, r12 \n\t"
"adcs r10, r14 \n\t"
"adc r11, #0 \n\t"
"ldr r12, [r0] \n\t"
"adds r9, r12 \n\t"
"adcs r10, #0 \n\t"
"adc r11, #0 \n\t"
"stmia r0!, {r9} \n\t"
/* Pass 2, column 5: r5 = x[5]; x3*y2 + x4*y1 + x5*y0 + pass-1 word z[5]. */
"ldmia r1!, {r5} \n\t"
"mov r12, #0 \n\t"
"umull r14, r9, r3, r8 \n\t"
"adds r10, r14 \n\t"
"adcs r11, r9 \n\t"
"adc r12, #0 \n\t"
"umull r14, r9, r4, r7 \n\t"
"adds r10, r14 \n\t"
"adcs r11, r9 \n\t"
"adc r12, #0 \n\t"
"umull r14, r9, r5, r6 \n\t"
"adds r10, r14 \n\t"
"adcs r11, r9 \n\t"
"adc r12, #0 \n\t"
"ldr r14, [r0] \n\t"
"adds r10, r14 \n\t"
"adcs r11, #0 \n\t"
"adc r12, #0 \n\t"
"stmia r0!, {r10} \n\t"
/* Pass 2, column 6: r3 = x[6]; x4*y2 + x5*y1 + x6*y0 + pass-1 word z[6]. */
"ldmia r1!, {r3} \n\t"
"mov r14, #0 \n\t"
"umull r9, r10, r4, r8 \n\t"
"adds r11, r9 \n\t"
"adcs r12, r10 \n\t"
"adc r14, #0 \n\t"
"umull r9, r10, r5, r7 \n\t"
"adds r11, r9 \n\t"
"adcs r12, r10 \n\t"
"adc r14, #0 \n\t"
"umull r9, r10, r3, r6 \n\t"
"adds r11, r9 \n\t"
"adcs r12, r10 \n\t"
"adc r14, #0 \n\t"
"ldr r9, [r0] \n\t"
"adds r11, r9 \n\t"
"adcs r12, #0 \n\t"
"adc r14, #0 \n\t"
"stmia r0!, {r11} \n\t"
/* Pass 2, column 7: r6 = y[3]; x4*y3 + x5*y2 + x6*y1 + pass-1 word z[7]. */
"ldmia r2!, {r6} \n\t"
"mov r9, #0 \n\t"
"umull r10, r11, r4, r6 \n\t"
"adds r12, r10 \n\t"
"adcs r14, r11 \n\t"
"adc r9, #0 \n\t"
"umull r10, r11, r5, r8 \n\t"
"adds r12, r10 \n\t"
"adcs r14, r11 \n\t"
"adc r9, #0 \n\t"
"umull r10, r11, r3, r7 \n\t"
"adds r12, r10 \n\t"
"adcs r14, r11 \n\t"
"adc r9, #0 \n\t"
"ldr r10, [r0] \n\t"
"adds r12, r10 \n\t"
"adcs r14, #0 \n\t"
"adc r9, #0 \n\t"
"stmia r0!, {r12} \n\t"
/* Pass 2, column 8: r7 = y[4]; x4*y4 + x5*y3 + x6*y2 + pass-1 word z[8]. */
"ldmia r2!, {r7} \n\t"
"mov r10, #0 \n\t"
"umull r11, r12, r4, r7 \n\t"
"adds r14, r11 \n\t"
"adcs r9, r12 \n\t"
"adc r10, #0 \n\t"
"umull r11, r12, r5, r6 \n\t"
"adds r14, r11 \n\t"
"adcs r9, r12 \n\t"
"adc r10, #0 \n\t"
"umull r11, r12, r3, r8 \n\t"
"adds r14, r11 \n\t"
"adcs r9, r12 \n\t"
"adc r10, #0 \n\t"
"ldr r11, [r0] \n\t"
"adds r14, r11 \n\t"
"adcs r9, #0 \n\t"
"adc r10, #0 \n\t"
"stmia r0!, {r14} \n\t"
/* Pass 2, column 9: r8 = y[5]; x4*y5 + x5*y4 + x6*y3 + pass-1 word z[9]. */
"ldmia r2!, {r8} \n\t"
"mov r11, #0 \n\t"
"umull r12, r14, r4, r8 \n\t"
"adds r9, r12 \n\t"
"adcs r10, r14 \n\t"
"adc r11, #0 \n\t"
"umull r12, r14, r5, r7 \n\t"
"adds r9, r12 \n\t"
"adcs r10, r14 \n\t"
"adc r11, #0 \n\t"
"umull r12, r14, r3, r6 \n\t"
"adds r9, r12 \n\t"
"adcs r10, r14 \n\t"
"adc r11, #0 \n\t"
"ldr r12, [r0] \n\t"
"adds r9, r12 \n\t"
"adcs r10, #0 \n\t"
"adc r11, #0 \n\t"
"stmia r0!, {r9} \n\t"
/* Pass 2, column 10: r6 = y[6]; x4*y6 + x5*y5 + x6*y4 + pass-1 word z[10]. */
"ldmia r2!, {r6} \n\t"
"mov r12, #0 \n\t"
"umull r14, r9, r4, r6 \n\t"
"adds r10, r14 \n\t"
"adcs r11, r9 \n\t"
"adc r12, #0 \n\t"
"umull r14, r9, r5, r8 \n\t"
"adds r10, r14 \n\t"
"adcs r11, r9 \n\t"
"adc r12, #0 \n\t"
"umull r14, r9, r3, r7 \n\t"
"adds r10, r14 \n\t"
"adcs r11, r9 \n\t"
"adc r12, #0 \n\t"
"ldr r14, [r0] \n\t"
"adds r10, r14 \n\t"
"adcs r11, #0 \n\t"
"adc r12, #0 \n\t"
"stmia r0!, {r10} \n\t"
/* Pass 2, column 11: x5*y6 + x6*y5. */
"mov r14, #0 \n\t"
"umull r9, r10, r5, r6 \n\t"
"adds r11, r9 \n\t"
"adcs r12, r10 \n\t"
"adc r14, #0 \n\t"
"umull r9, r10, r3, r8 \n\t"
"adds r11, r9 \n\t"
"adcs r12, r10 \n\t"
"adc r14, #0 \n\t"
"stmia r0!, {r11} \n\t"
/* Pass 2, columns 12-13: x6*y6; the two remaining accumulator words go to
   z[12] and z[13]. */
"umull r10, r11, r3, r6 \n\t"
"adds r12, r10 \n\t"
"adc r14, r11 \n\t"
"stmia r0!, {r12, r14} \n\t"
/* Switch the assembler back to divided syntax on every platform except
   Thumb-2. */
#if (uECC_PLATFORM != uECC_arm_thumb2)
".syntax divided \n\t"
#endif
/* r0-r2 are read-write operands because the assembly advances them; every
   other register it names, the condition flags and memory are clobbered. */
: "+r" (r0), "+r" (r1), "+r" (r2)
:
: "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
);
}
#define asm_mult 1
#endif /* (uECC_WORDS == 7) */
#if (uECC_WORDS == 8)
static void vli_mult(uint32_t *result, const uint32_t *left, const uint32_t *right) {
register uint32_t *r0 __asm__("r0") = result;
@@ -1200,6 +1525,229 @@ static void vli_square(uint32_t *result, const uint32_t *left) {
#define asm_square 1
#endif /* (uECC_WORDS == 6) */
#if (uECC_WORDS == 7)
/* Squares a 7-word (uECC_WORDS == 7) little-endian-word integer with
   hand-scheduled ARM assembly and writes the 14-word result.

   Notation used in the comments below: x = left, z = result.

   result: receives 14 words, z[0]..z[13].
   left:   7 input words, x[0]..x[6].

   The square is built column by column. For column k the cross products
   x[i] * x[j] (i < j, i + j == k) are summed, that sum is doubled, and the
   diagonal term x[k/2]^2 is added once when k is even. The low word of the
   three-register accumulator is stored to z[k] and the registers rotate.

   Only six input words fit in r2-r7, so x[0] * x[6] is computed up front and
   parked in z[6..7]; columns 6 and 7 add those words back in before doubling.
   After column 5, r2 is reloaded with x[6] (x[0] is not used again).

   Recurring idiom: `mov r14, hi; umlal lo, hi, a, b; cmp r14, hi; it hi;
   adchi top, #0` accumulates a 64-bit product into hi:lo and adds one to the
   third accumulator word when the high word became smaller than it was before
   the umlal (the `hi` condition implies the carry flag is set, so `adc #0`
   adds 1).

   NOTE(review): z[6..7] are written before x[6] is read for the main pass, so
   result presumably must not overlap left - confirm with callers. */
static void vli_square(uint32_t *result, const uint32_t *left) {
/* The pointers are pinned to r0/r1 because the assembly names those registers
   directly and modifies them ("+r" operands below). */
register uint32_t *r0 __asm__("r0") = result;
register const uint32_t *r1 __asm__("r1") = left;
__asm__ volatile (
".syntax unified \n\t"
/* Pre-step: r2 = x[0], r5 = x[6]; z[6..7] = x[0] * x[6] (not yet doubled). */
"ldmia r1!, {r2} \n\t"
"add r1, 20 \n\t"
"ldmia r1!, {r5} \n\t"
"add r0, 24 \n\t"
"umull r8, r9, r2, r5 \n\t"
"stmia r0!, {r8, r9} \n\t"
/* Rewind r0 -> z[0], r1 -> x[0]; then r2-r7 = x[0..5], leaving r1 -> x[6]. */
"sub r0, 32 \n\t"
"sub r1, 28 \n\t"
"ldmia r1!, {r2, r3, r4, r5, r6, r7} \n\t"
/* Column 0: x0*x0. */
"umull r11, r12, r2, r2 \n\t"
"stmia r0!, {r11} \n\t"
/* Column 1: 2 * x0*x1 (the product is added in twice). */
"mov r9, #0 \n\t"
"umull r10, r11, r2, r3 \n\t"
"adds r12, r10 \n\t"
"adcs r8, r11, #0 \n\t"
"adc r9, #0 \n\t"
"adds r12, r10 \n\t"
"adcs r8, r11 \n\t"
"adc r9, #0 \n\t"
"stmia r0!, {r12} \n\t"
/* Column 2: 2 * x0*x2 + x1*x1. */
"mov r10, #0 \n\t"
"umull r11, r12, r2, r4 \n\t"
"adds r11, r11 \n\t"
"adcs r12, r12 \n\t"
"adc r10, #0 \n\t"
"adds r8, r11 \n\t"
"adcs r9, r12 \n\t"
"adc r10, #0 \n\t"
"umull r11, r12, r3, r3 \n\t"
"adds r8, r11 \n\t"
"adcs r9, r12 \n\t"
"adc r10, #0 \n\t"
"stmia r0!, {r8} \n\t"
/* Column 3: 2 * (x0*x3 + x1*x2), plus the carry-in words r9:r10. */
"mov r12, #0 \n\t"
"umull r8, r11, r2, r5 \n\t"
"mov r14, r11 \n\t"
"umlal r8, r11, r3, r4 \n\t"
"cmp r14, r11 \n\t"
"it hi \n\t"
"adchi r12, #0 \n\t"
"adds r8, r8 \n\t"
"adcs r11, r11 \n\t"
"adc r12, r12 \n\t"
"adds r8, r9 \n\t"
"adcs r11, r10 \n\t"
"adc r12, #0 \n\t"
"stmia r0!, {r8} \n\t"
/* Column 4: 2 * (x0*x4 + x1*x3) + x2*x2, plus carry-in r11:r12. */
"mov r10, #0 \n\t"
"umull r8, r9, r2, r6 \n\t"
"mov r14, r9 \n\t"
"umlal r8, r9, r3, r5 \n\t"
"cmp r14, r9 \n\t"
"it hi \n\t"
"adchi r10, #0 \n\t"
"adds r8, r8 \n\t"
"adcs r9, r9 \n\t"
"adc r10, r10 \n\t"
"mov r14, r9 \n\t"
"umlal r8, r9, r4, r4 \n\t"
"cmp r14, r9 \n\t"
"it hi \n\t"
"adchi r10, #0 \n\t"
"adds r8, r11 \n\t"
"adcs r9, r12 \n\t"
"adc r10, #0 \n\t"
"stmia r0!, {r8} \n\t"
/* Column 5: 2 * (x0*x5 + x1*x4 + x2*x3), plus carry-in r9:r10. */
"mov r12, #0 \n\t"
"umull r8, r11, r2, r7 \n\t"
"mov r14, r11 \n\t"
"umlal r8, r11, r3, r6 \n\t"
"cmp r14, r11 \n\t"
"it hi \n\t"
"adchi r12, #0 \n\t"
"mov r14, r11 \n\t"
"umlal r8, r11, r4, r5 \n\t"
"cmp r14, r11 \n\t"
"it hi \n\t"
"adchi r12, #0 \n\t"
"adds r8, r8 \n\t"
"adcs r11, r11 \n\t"
"adc r12, r12 \n\t"
"adds r8, r9 \n\t"
"adcs r11, r10 \n\t"
"adc r12, #0 \n\t"
"stmia r0!, {r8} \n\t"
/* Column 6: r2 = x[6]; 2 * (x1*x5 + x2*x4 + z[6]) + x3*x3, plus carry-in
   r11:r12. z[6] is the low word of the pre-step x0*x6 product, added before
   the doubling so that it is doubled too. */
"ldmia r1!, {r2} \n\t"
"mov r10, #0 \n\t"
"umull r8, r9, r3, r7 \n\t"
"mov r14, r9 \n\t"
"umlal r8, r9, r4, r6 \n\t"
"cmp r14, r9 \n\t"
"it hi \n\t"
"adchi r10, #0 \n\t"
"ldr r14, [r0] \n\t"
"adds r8, r14 \n\t"
"adcs r9, #0 \n\t"
"adc r10, #0 \n\t"
"adds r8, r8 \n\t"
"adcs r9, r9 \n\t"
"adc r10, r10 \n\t"
"mov r14, r9 \n\t"
"umlal r8, r9, r5, r5 \n\t"
"cmp r14, r9 \n\t"
"it hi \n\t"
"adchi r10, #0 \n\t"
"adds r8, r11 \n\t"
"adcs r9, r12 \n\t"
"adc r10, #0 \n\t"
"stmia r0!, {r8} \n\t"
/* Column 7: 2 * (x1*x6 + x2*x5 + x3*x4 + z[7]), plus carry-in r9:r10.
   z[7] is the high word of the pre-step x0*x6 product. */
"mov r12, #0 \n\t"
"umull r8, r11, r3, r2 \n\t"
"mov r14, r11 \n\t"
"umlal r8, r11, r4, r7 \n\t"
"cmp r14, r11 \n\t"
"it hi \n\t"
"adchi r12, #0 \n\t"
"mov r14, r11 \n\t"
"umlal r8, r11, r5, r6 \n\t"
"cmp r14, r11 \n\t"
"it hi \n\t"
"adchi r12, #0 \n\t"
"ldr r14, [r0] \n\t"
"adds r8, r14 \n\t"
"adcs r11, #0 \n\t"
"adc r12, #0 \n\t"
"adds r8, r8 \n\t"
"adcs r11, r11 \n\t"
"adc r12, r12 \n\t"
"adds r8, r9 \n\t"
"adcs r11, r10 \n\t"
"adc r12, #0 \n\t"
"stmia r0!, {r8} \n\t"
/* Column 8: 2 * (x2*x6 + x3*x5) + x4*x4, plus carry-in r11:r12. */
"mov r10, #0 \n\t"
"umull r8, r9, r4, r2 \n\t"
"mov r14, r9 \n\t"
"umlal r8, r9, r5, r7 \n\t"
"cmp r14, r9 \n\t"
"it hi \n\t"
"adchi r10, #0 \n\t"
"adds r8, r8 \n\t"
"adcs r9, r9 \n\t"
"adc r10, r10 \n\t"
"mov r14, r9 \n\t"
"umlal r8, r9, r6, r6 \n\t"
"cmp r14, r9 \n\t"
"it hi \n\t"
"adchi r10, #0 \n\t"
"adds r8, r11 \n\t"
"adcs r9, r12 \n\t"
"adc r10, #0 \n\t"
"stmia r0!, {r8} \n\t"
/* Column 9: 2 * (x3*x6 + x4*x5), plus carry-in r9:r10. */
"mov r12, #0 \n\t"
"umull r8, r11, r5, r2 \n\t"
"mov r14, r11 \n\t"
"umlal r8, r11, r6, r7 \n\t"
"cmp r14, r11 \n\t"
"it hi \n\t"
"adchi r12, #0 \n\t"
"adds r8, r8 \n\t"
"adcs r11, r11 \n\t"
"adc r12, r12 \n\t"
"adds r8, r9 \n\t"
"adcs r11, r10 \n\t"
"adc r12, #0 \n\t"
"stmia r0!, {r8} \n\t"
/* Column 10: 2 * x4*x6 + x5*x5. The input pointer r1 is not used for loads
   past this point and is reused as a scratch register. */
"mov r8, #0 \n\t"
"umull r1, r10, r6, r2 \n\t"
"adds r1, r1 \n\t"
"adcs r10, r10 \n\t"
"adc r8, #0 \n\t"
"adds r11, r1 \n\t"
"adcs r12, r10 \n\t"
"adc r8, #0 \n\t"
"umull r1, r10, r7, r7 \n\t"
"adds r11, r1 \n\t"
"adcs r12, r10 \n\t"
"adc r8, #0 \n\t"
"stmia r0!, {r11} \n\t"
/* Column 11: 2 * x5*x6. */
"mov r11, #0 \n\t"
"umull r1, r10, r7, r2 \n\t"
"adds r1, r1 \n\t"
"adcs r10, r10 \n\t"
"adc r11, #0 \n\t"
"adds r12, r1 \n\t"
"adcs r8, r10 \n\t"
"adc r11, #0 \n\t"
"stmia r0!, {r12} \n\t"
/* Columns 12-13: x6*x6; the two remaining accumulator words go to z[12] and
   z[13]. */
"umull r1, r10, r2, r2 \n\t"
"adds r8, r1 \n\t"
"adcs r11, r10 \n\t"
"stmia r0!, {r8, r11} \n\t"
/* Switch the assembler back to divided syntax on every platform except
   Thumb-2. */
#if (uECC_PLATFORM != uECC_arm_thumb2)
".syntax divided \n\t"
#endif
/* r0/r1 are read-write operands because the assembly modifies them; every
   other register it names, the condition flags and memory are clobbered. */
: "+r" (r0), "+r" (r1)
:
: "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
);
}
#define asm_square 1
#endif /* (uECC_WORDS == 7) */
#if (uECC_WORDS == 8)
static void vli_square(uint32_t *result, const uint32_t *left) {
register uint32_t *r0 __asm__("r0") = result;
+4 -4
View File
@@ -26,8 +26,8 @@ ry = [6, 7, 8]
emit("add r0, %s", (size - init_size) * 4) # move z
emit("add r2, %s", (size - init_size) * 4) # move y
emit("ldmia r1!, {%s}", ",".join(["r%s" % (rx[i]) for i in xrange(init_size)]))
emit("ldmia r2!, {%s}", ",".join(["r%s" % (ry[i]) for i in xrange(init_size)]))
emit("ldmia r1!, {%s}", ", ".join(["r%s" % (rx[i]) for i in xrange(init_size)]))
emit("ldmia r2!, {%s}", ", ".join(["r%s" % (ry[i]) for i in xrange(init_size)]))
print ""
if init_size == 1:
@@ -84,11 +84,11 @@ emit("sub r0, %s", (2 * init_size + 3) * 4)
emit("sub r2, %s", (init_size + 3) * 4)
#### load y registers
emit("ldmia r2!, {%s}", ",".join(["r%s" % (ry[i]) for i in xrange(3)]))
emit("ldmia r2!, {%s}", ", ".join(["r%s" % (ry[i]) for i in xrange(3)]))
#### load additional x registers
if init_size != 3:
emit("ldmia r1!, {%s}", ",".join(["r%s" % (rx[i]) for i in xrange(init_size, 3)]))
emit("ldmia r1!, {%s}", ", ".join(["r%s" % (rx[i]) for i in xrange(init_size, 3)]))
print ""
prev_size = init_size
+15 -4
View File
@@ -8,7 +8,7 @@ if len(sys.argv) < 2:
size = int(sys.argv[1])
if size > 6 and size != 8:
if size > 8:
print "This script doesn't work with integer size %s due to laziness" % (size)
sys.exit(1)
@@ -37,8 +37,19 @@ r = [2, 3, 4, 5, 6, 7]
s = size - init_size
# Note that I just implemented the init_size = 2 case directly
if init_size > 0:
if init_size == 1:
emit("ldmia r1!, {r2}")
emit("add r1, %s", (size - init_size * 2) * 4)
emit("ldmia r1!, {r5}")
emit("add r0, %s", (size - init_size) * 4)
emit("umull r8, r9, r2, r5")
emit("stmia r0!, {r8, r9}")
emit("sub r0, %s", (size + init_size) * 4)
emit("sub r1, %s", (size) * 4)
print ""
elif init_size == 2:
emit("ldmia r1!, {r2, r3}")
emit("add r1, %s", (size - init_size * 2) * 4)
emit("ldmia r1!, {r5, r6}")
@@ -66,7 +77,7 @@ if init_size > 0:
emit("sub r1, %s", (size) * 4)
# load input words
emit("ldmia r1!, {%s}", ",".join(["r%s" % (r[i]) for i in xrange(s)]))
emit("ldmia r1!, {%s}", ", ".join(["r%s" % (r[i]) for i in xrange(s)]))
print ""
emit("umull r11, r12, r2, r2")