Skip to content

Commit e9f1489

Browse files
committed
SP ECC: minor speed improvement on x64
ARM32/Thumb2: for safer code, do two reductions in the Montgomery triple (mont_tpl) after doing the doubling part.
1 parent 7d85e39 commit e9f1489

5 files changed

Lines changed: 210 additions & 297 deletions

File tree

wolfcrypt/src/sp_arm32.c

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71903,8 +71903,16 @@ static void sp_256_mont_tpl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
7190371903
"sbcs r9, r9, #0\n\t"
7190471904
"sbcs r10, r10, r12, LSR #31\n\t"
7190571905
"sbcs r11, r11, r12\n\t"
71906-
"rsb r12, r12, #0\n\t"
71907-
"sbc r12, r12, #0\n\t"
71906+
"sbc r2, r2, r2\n\t"
71907+
"sub r12, r12, r2\n\t"
71908+
"subs r4, r4, r12\n\t"
71909+
"sbcs r5, r5, r12\n\t"
71910+
"sbcs r6, r6, r12\n\t"
71911+
"sbcs r7, r7, #0\n\t"
71912+
"sbcs r8, r8, #0\n\t"
71913+
"sbcs r9, r9, #0\n\t"
71914+
"sbcs r10, r10, r12, LSR #31\n\t"
71915+
"sbc r11, r11, r12\n\t"
7190871916
"ldm %[a]!, {r2, r3}\n\t"
7190971917
"adds r4, r4, r2\n\t"
7191071918
"adcs r5, r5, r3\n\t"

wolfcrypt/src/sp_cortexm.c

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33116,8 +33116,16 @@ static void sp_256_mont_tpl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
3311633116
"SBCS r9, r9, #0x0\n\t"
3311733117
"SBCS r10, r10, r12, LSR #31\n\t"
3311833118
"SBCS r11, r11, r12\n\t"
33119-
"RSB r12, r12, #0x0\n\t"
33120-
"SBC r12, r12, #0x0\n\t"
33119+
"SBC r2, r2, r2\n\t"
33120+
"SUB r12, r12, r2\n\t"
33121+
"SUBS r4, r4, r12\n\t"
33122+
"SBCS r5, r5, r12\n\t"
33123+
"SBCS r6, r6, r12\n\t"
33124+
"SBCS r7, r7, #0x0\n\t"
33125+
"SBCS r8, r8, #0x0\n\t"
33126+
"SBCS r9, r9, #0x0\n\t"
33127+
"SBCS r10, r10, r12, LSR #31\n\t"
33128+
"SBC r11, r11, r12\n\t"
3312133129
"LDM %[a]!, {r2, r3}\n\t"
3312233130
"ADDS r4, r4, r2\n\t"
3312333131
"ADCS r5, r5, r3\n\t"

wolfcrypt/src/sp_x86_64.c

Lines changed: 28 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -8607,7 +8607,7 @@ extern void sp_256_div2_4(sp_digit* r, const sp_digit* a, const sp_digit* m);
86078607
#ifdef __cplusplus
86088608
extern "C" {
86098609
#endif
8610-
extern void sp_256_mont_sub_dbl_4(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m);
8610+
extern void sp_256_mont_rsb_sub_dbl_4(sp_digit* r, const sp_digit* a, sp_digit* b, const sp_digit* m);
86118611
#ifdef __cplusplus
86128612
}
86138613
#endif
@@ -8661,9 +8661,8 @@ static void sp_256_proj_point_dbl_4(sp_point_256* r, const sp_point_256* p,
86618661
/* X = T1 * T1 */
86628662
sp_256_mont_sqr_4(x, t1, p256_mod, p256_mp_mod);
86638663
/* X = X - 2*Y */
8664-
sp_256_mont_sub_dbl_4(x, x, y, p256_mod);
86658664
/* Y = Y - X */
8666-
sp_256_mont_sub_4(y, y, x, p256_mod);
8665+
sp_256_mont_rsb_sub_dbl_4(x, x, y, p256_mod);
86678666
/* Y = Y * T1 */
86688667
sp_256_mont_mul_4(y, y, t1, p256_mod, p256_mp_mod);
86698668
/* Y = Y - T2 */
@@ -8775,15 +8774,14 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
87758774
break;
87768775
case 14:
87778776
/* X = X - 2*Y */
8778-
sp_256_mont_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod);
8777+
/* Y = Y - X */
8778+
sp_256_mont_rsb_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod);
87798779
ctx->state = 15;
87808780
break;
87818781
case 15:
87828782
ctx->state = 16;
87838783
break;
87848784
case 16:
8785-
/* Y = Y - X */
8786-
sp_256_mont_sub_4(ctx->y, ctx->y, ctx->x, p256_mod);
87878785
ctx->state = 17;
87888786
break;
87898787
case 17:
@@ -8808,13 +8806,6 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
88088806
return err;
88098807
}
88108808
#endif /* WOLFSSL_SP_NONBLOCK */
8811-
#ifdef __cplusplus
8812-
extern "C" {
8813-
#endif
8814-
extern void sp_256_mont_dbl_sub_4(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m);
8815-
#ifdef __cplusplus
8816-
}
8817-
#endif
88188809
/* Double the Montgomery form projective point p a number of times.
88198810
*
88208811
* r Result of repeated doubling of point.
@@ -8858,9 +8849,9 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i,
88588849
sp_256_mont_mul_4(b, t1, x, p256_mod, p256_mp_mod);
88598850
/* X = A^2 - 2B */
88608851
sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod);
8861-
sp_256_mont_sub_dbl_4(x, x, b, p256_mod);
8852+
sp_256_mont_rsb_sub_dbl_4(x, x, b, p256_mod);
88628853
/* B = 2.(B - X) */
8863-
sp_256_mont_dbl_sub_4(b, b, x, p256_mod);
8854+
sp_256_mont_dbl_4(b, b, p256_mod);
88648855
/* Z = Z*Y */
88658856
sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod);
88668857
/* t1 = Y^4 */
@@ -8886,9 +8877,9 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i,
88868877
sp_256_mont_mul_4(b, t1, x, p256_mod, p256_mp_mod);
88878878
/* X = A^2 - 2B */
88888879
sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod);
8889-
sp_256_mont_sub_dbl_4(x, x, b, p256_mod);
8880+
sp_256_mont_rsb_sub_dbl_4(x, x, b, p256_mod);
88908881
/* B = 2.(B - X) */
8891-
sp_256_mont_dbl_sub_4(b, b, x, p256_mod);
8882+
sp_256_mont_dbl_4(b, b, p256_mod);
88928883
/* Z = Z*Y */
88938884
sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod);
88948885
/* t1 = Y^4 */
@@ -8981,9 +8972,8 @@ static void sp_256_proj_point_add_4(sp_point_256* r,
89818972
sp_256_mont_sqr_4(x, t4, p256_mod, p256_mp_mod);
89828973
sp_256_mont_sub_4(x, x, t5, p256_mod);
89838974
sp_256_mont_mul_4(t5, t5, t3, p256_mod, p256_mp_mod);
8984-
sp_256_mont_sub_dbl_4(x, x, y, p256_mod);
89858975
/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
8986-
sp_256_mont_sub_4(y, y, x, p256_mod);
8976+
sp_256_mont_rsb_sub_dbl_4(x, x, y, p256_mod);
89878977
sp_256_mont_mul_4(y, y, t4, p256_mod, p256_mp_mod);
89888978
sp_256_mont_sub_4(y, y, t5, p256_mod);
89898979
{
@@ -9159,12 +9149,11 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
91599149
ctx->state = 20;
91609150
break;
91619151
case 20:
9162-
sp_256_mont_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod);
9152+
/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
9153+
sp_256_mont_rsb_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod);
91639154
ctx->state = 21;
91649155
break;
91659156
case 21:
9166-
/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
9167-
sp_256_mont_sub_4(ctx->y, ctx->y, ctx->x, p256_mod);
91689157
ctx->state = 22;
91699158
break;
91709159
case 22:
@@ -9263,9 +9252,9 @@ static void sp_256_proj_point_dbl_n_store_4(sp_point_256* r,
92639252
x = r[j].x;
92649253
/* X = A^2 - 2B */
92659254
sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod);
9266-
sp_256_mont_sub_dbl_4(x, x, b, p256_mod);
9255+
sp_256_mont_rsb_sub_dbl_4(x, x, b, p256_mod);
92679256
/* B = 2.(B - X) */
9268-
sp_256_mont_dbl_sub_4(b, b, x, p256_mod);
9257+
sp_256_mont_dbl_4(b, b, p256_mod);
92699258
/* Z = Z*Y */
92709259
sp_256_mont_mul_4(r[j].z, z, y, p256_mod, p256_mp_mod);
92719260
z = r[j].z;
@@ -9764,7 +9753,7 @@ extern void sp_256_div2_avx2_4(sp_digit* r, const sp_digit* a, const sp_digit* m
97649753
#ifdef __cplusplus
97659754
}
97669755
#endif
9767-
#define sp_256_mont_sub_dbl_avx2_4 sp_256_mont_sub_dbl_4
9756+
#define sp_256_mont_rsb_sub_dbl_avx2_4 sp_256_mont_rsb_sub_dbl_4
97689757
/* Double the Montgomery form projective point p.
97699758
*
97709759
* r Result of doubling point.
@@ -9815,9 +9804,8 @@ static void sp_256_proj_point_dbl_avx2_4(sp_point_256* r, const sp_point_256* p,
98159804
/* X = T1 * T1 */
98169805
sp_256_mont_sqr_avx2_4(x, t1, p256_mod, p256_mp_mod);
98179806
/* X = X - 2*Y */
9818-
sp_256_mont_sub_dbl_avx2_4(x, x, y, p256_mod);
98199807
/* Y = Y - X */
9820-
sp_256_mont_sub_avx2_4(y, y, x, p256_mod);
9808+
sp_256_mont_rsb_sub_dbl_avx2_4(x, x, y, p256_mod);
98219809
/* Y = Y * T1 */
98229810
sp_256_mont_mul_avx2_4(y, y, t1, p256_mod, p256_mp_mod);
98239811
/* Y = Y - T2 */
@@ -9929,15 +9917,14 @@ static int sp_256_proj_point_dbl_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r
99299917
break;
99309918
case 14:
99319919
/* X = X - 2*Y */
9932-
sp_256_mont_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod);
9920+
/* Y = Y - X */
9921+
sp_256_mont_rsb_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod);
99339922
ctx->state = 15;
99349923
break;
99359924
case 15:
99369925
ctx->state = 16;
99379926
break;
99389927
case 16:
9939-
/* Y = Y - X */
9940-
sp_256_mont_sub_avx2_4(ctx->y, ctx->y, ctx->x, p256_mod);
99419928
ctx->state = 17;
99429929
break;
99439930
case 17:
@@ -9962,7 +9949,6 @@ static int sp_256_proj_point_dbl_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r
99629949
return err;
99639950
}
99649951
#endif /* WOLFSSL_SP_NONBLOCK */
9965-
#define sp_256_mont_dbl_sub_avx2_4 sp_256_mont_dbl_sub_4
99669952
/* Double the Montgomery form projective point p a number of times.
99679953
*
99689954
* r Result of repeated doubling of point.
@@ -10006,9 +9992,9 @@ static void sp_256_proj_point_dbl_n_avx2_4(sp_point_256* p, int i,
100069992
sp_256_mont_mul_avx2_4(b, t1, x, p256_mod, p256_mp_mod);
100079993
/* X = A^2 - 2B */
100089994
sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod);
10009-
sp_256_mont_sub_dbl_avx2_4(x, x, b, p256_mod);
9995+
sp_256_mont_rsb_sub_dbl_avx2_4(x, x, b, p256_mod);
100109996
/* B = 2.(B - X) */
10011-
sp_256_mont_dbl_sub_avx2_4(b, b, x, p256_mod);
9997+
sp_256_mont_dbl_avx2_4(b, b, p256_mod);
100129998
/* Z = Z*Y */
100139999
sp_256_mont_mul_avx2_4(z, z, y, p256_mod, p256_mp_mod);
1001410000
/* t1 = Y^4 */
@@ -10034,9 +10020,9 @@ static void sp_256_proj_point_dbl_n_avx2_4(sp_point_256* p, int i,
1003410020
sp_256_mont_mul_avx2_4(b, t1, x, p256_mod, p256_mp_mod);
1003510021
/* X = A^2 - 2B */
1003610022
sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod);
10037-
sp_256_mont_sub_dbl_avx2_4(x, x, b, p256_mod);
10023+
sp_256_mont_rsb_sub_dbl_avx2_4(x, x, b, p256_mod);
1003810024
/* B = 2.(B - X) */
10039-
sp_256_mont_dbl_sub_avx2_4(b, b, x, p256_mod);
10025+
sp_256_mont_dbl_avx2_4(b, b, p256_mod);
1004010026
/* Z = Z*Y */
1004110027
sp_256_mont_mul_avx2_4(z, z, y, p256_mod, p256_mp_mod);
1004210028
/* t1 = Y^4 */
@@ -10105,9 +10091,8 @@ static void sp_256_proj_point_add_avx2_4(sp_point_256* r,
1010510091
sp_256_mont_sqr_avx2_4(x, t4, p256_mod, p256_mp_mod);
1010610092
sp_256_mont_sub_avx2_4(x, x, t5, p256_mod);
1010710093
sp_256_mont_mul_avx2_4(t5, t5, t3, p256_mod, p256_mp_mod);
10108-
sp_256_mont_sub_dbl_avx2_4(x, x, y, p256_mod);
1010910094
/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
10110-
sp_256_mont_sub_avx2_4(y, y, x, p256_mod);
10095+
sp_256_mont_rsb_sub_dbl_avx2_4(x, x, y, p256_mod);
1011110096
sp_256_mont_mul_avx2_4(y, y, t4, p256_mod, p256_mp_mod);
1011210097
sp_256_mont_sub_avx2_4(y, y, t5, p256_mod);
1011310098
{
@@ -10283,12 +10268,11 @@ static int sp_256_proj_point_add_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r
1028310268
ctx->state = 20;
1028410269
break;
1028510270
case 20:
10286-
sp_256_mont_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod);
10271+
/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
10272+
sp_256_mont_rsb_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod);
1028710273
ctx->state = 21;
1028810274
break;
1028910275
case 21:
10290-
/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
10291-
sp_256_mont_sub_avx2_4(ctx->y, ctx->y, ctx->x, p256_mod);
1029210276
ctx->state = 22;
1029310277
break;
1029410278
case 22:
@@ -10387,9 +10371,9 @@ static void sp_256_proj_point_dbl_n_store_avx2_4(sp_point_256* r,
1038710371
x = r[j].x;
1038810372
/* X = A^2 - 2B */
1038910373
sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod);
10390-
sp_256_mont_sub_dbl_avx2_4(x, x, b, p256_mod);
10374+
sp_256_mont_rsb_sub_dbl_avx2_4(x, x, b, p256_mod);
1039110375
/* B = 2.(B - X) */
10392-
sp_256_mont_dbl_sub_avx2_4(b, b, x, p256_mod);
10376+
sp_256_mont_dbl_avx2_4(b, b, p256_mod);
1039310377
/* Z = Z*Y */
1039410378
sp_256_mont_mul_avx2_4(r[j].z, z, y, p256_mod, p256_mp_mod);
1039510379
z = r[j].z;
@@ -10689,9 +10673,8 @@ static void sp_256_proj_point_add_qz1_4(sp_point_256* r,
1068910673
sp_256_mont_mul_4(t1, t1, t2, p256_mod, p256_mp_mod);
1069010674
sp_256_mont_sqr_4(t2, t4, p256_mod, p256_mp_mod);
1069110675
sp_256_mont_sub_4(t2, t2, t1, p256_mod);
10692-
sp_256_mont_sub_dbl_4(x, t2, t3, p256_mod);
10676+
sp_256_mont_rsb_sub_dbl_4(x, t2, t3, p256_mod);
1069310677
/* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */
10694-
sp_256_mont_sub_4(t3, t3, x, p256_mod);
1069510678
sp_256_mont_mul_4(t3, t3, t4, p256_mod, p256_mp_mod);
1069610679
sp_256_mont_mul_4(t1, t1, p->y, p256_mod, p256_mp_mod);
1069710680
sp_256_mont_sub_4(y, t3, t1, p256_mod);
@@ -11178,9 +11161,8 @@ static void sp_256_proj_point_add_qz1_avx2_4(sp_point_256* r,
1117811161
sp_256_mont_mul_avx2_4(t1, t1, t2, p256_mod, p256_mp_mod);
1117911162
sp_256_mont_sqr_avx2_4(t2, t4, p256_mod, p256_mp_mod);
1118011163
sp_256_mont_sub_avx2_4(t2, t2, t1, p256_mod);
11181-
sp_256_mont_sub_dbl_avx2_4(x, t2, t3, p256_mod);
11164+
sp_256_mont_rsb_sub_dbl_avx2_4(x, t2, t3, p256_mod);
1118211165
/* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */
11183-
sp_256_mont_sub_avx2_4(t3, t3, x, p256_mod);
1118411166
sp_256_mont_mul_avx2_4(t3, t3, t4, p256_mod, p256_mp_mod);
1118511167
sp_256_mont_mul_avx2_4(t1, t1, p->y, p256_mod, p256_mp_mod);
1118611168
sp_256_mont_sub_avx2_4(y, t3, t1, p256_mod);

0 commit comments

Comments
 (0)