Skip to content

Commit 2cc5ecf

Browse files
authored
Merge pull request #7759 from JacobBarthelmeh/poly1305
w64wrapper for poly1305
2 parents a30d9c9 + f1ace62 commit 2cc5ecf

4 files changed

Lines changed: 171 additions & 7 deletions

File tree

wolfcrypt/src/misc.c

Lines changed: 64 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -709,13 +709,23 @@ WC_MISC_STATIC WC_INLINE void w64SetLow32(w64wrapper *n, word32 low) {
709709

710710
WC_MISC_STATIC WC_INLINE w64wrapper w64Add32(w64wrapper a, word32 b, byte *wrap)
711711
{
712-
a.n = a.n + b;
712+
a.n += b;
713713
if (a.n < b && wrap != NULL)
714714
*wrap = 1;
715715

716716
return a;
717717
}
718718

719+
/* Add two 64-bit wrapper values (build where the wrapper holds a single
 * native 64-bit member 'n').
 *
 * a     first operand, passed by value and returned holding the sum
 * b     second operand
 * wrap  optional out flag; set to 1 when the unsigned sum wrapped around.
 *       It is never cleared here, and may be NULL if the caller does not
 *       care about overflow.
 *
 * Returns a + b, truncated to the width of 'n'.
 */
WC_MISC_STATIC WC_INLINE w64wrapper w64Add(w64wrapper a, w64wrapper b,
    byte *wrap)
{
    a.n = a.n + b.n;

    /* An unsigned sum smaller than one of its operands means it wrapped. */
    if (wrap != NULL && a.n < b.n) {
        *wrap = 1;
    }

    return a;
}
728+
719729
WC_MISC_STATIC WC_INLINE w64wrapper w64Sub32(w64wrapper a, word32 b, byte *wrap)
720730
{
721731
if (a.n < b && wrap != NULL)
@@ -796,6 +806,13 @@ WC_MISC_STATIC WC_INLINE w64wrapper w64ShiftLeft(w64wrapper a, int shift)
796806
return a;
797807
}
798808

809+
/* Multiply two 32-bit values into a 64-bit wrapper (build where the wrapper
 * holds a single native 64-bit member 'n').
 *
 * a, b  32-bit unsigned factors
 *
 * Returns the full product; both factors are widened to word64 before the
 * multiplication so no high bits are lost.
 */
WC_MISC_STATIC WC_INLINE w64wrapper w64Mul(word32 a, word32 b)
{
    w64wrapper product;
    const word64 lhs = (word64)a;
    const word64 rhs = (word64)b;

    product.n = lhs * rhs;

    return product;
}
815+
799816
#else
800817

801818
WC_MISC_STATIC WC_INLINE void w64Increment(w64wrapper *n)
@@ -831,7 +848,7 @@ WC_MISC_STATIC WC_INLINE void w64SetLow32(w64wrapper *n, word32 low)
831848

832849
WC_MISC_STATIC WC_INLINE w64wrapper w64Add32(w64wrapper a, word32 b, byte *wrap)
833850
{
834-
a.n[1] = a.n[1] + b;
851+
a.n[1] += b;
835852
if (a.n[1] < b) {
836853
a.n[0]++;
837854
if (wrap != NULL && a.n[0] == 0)
@@ -841,6 +858,24 @@ WC_MISC_STATIC WC_INLINE w64wrapper w64Add32(w64wrapper a, word32 b, byte *wrap)
841858
return a;
842859
}
843860

861+
/* Add two 64-bit wrapper values (build where the wrapper is a pair of
 * 32-bit words: n[0] is the high word, n[1] is the low word).
 *
 * a     first operand, passed by value and returned holding the sum
 * b     second operand
 * wrap  optional out flag; set to 1 when the 64-bit sum wrapped around.
 *       It is never cleared here, and may be NULL if the caller does not
 *       care about overflow.
 *
 * Returns a + b modulo 2^64.
 */
WC_MISC_STATIC WC_INLINE w64wrapper w64Add(w64wrapper a, w64wrapper b,
    byte *wrap)
{
    byte overflowed = 0;

    /* Low words first; a result smaller than the addend means a carry. */
    a.n[1] += b.n[1];
    if (a.n[1] < b.n[1]) {
        a.n[0]++;
        /* The carry itself can roll the high word over to zero. */
        if (a.n[0] == 0) {
            overflowed = 1;
        }
    }

    /* High words; wrapping here is an overflow of the whole 64-bit value. */
    a.n[0] += b.n[0];
    if (a.n[0] < b.n[0]) {
        overflowed = 1;
    }

    if (overflowed && wrap != NULL) {
        *wrap = 1;
    }

    return a;
}
878+
844879
WC_MISC_STATIC WC_INLINE w64wrapper w64Sub32(w64wrapper a, word32 b, byte *wrap)
845880
{
846881
byte _underflow = 0;
@@ -894,7 +929,7 @@ WC_MISC_STATIC WC_INLINE byte w64IsZero(w64wrapper a)
894929
return a.n[0] == 0 && a.n[1] == 0;
895930
}
896931

897-
WC_MISC_STATIC WC_INLINE void c64toa(w64wrapper *a, byte *out)
932+
WC_MISC_STATIC WC_INLINE void c64toa(const w64wrapper *a, byte *out)
898933
{
899934
#ifdef BIG_ENDIAN_ORDER
900935
word32 *_out = (word32*)(out);
@@ -939,7 +974,7 @@ WC_MISC_STATIC WC_INLINE byte w64LT(w64wrapper a, w64wrapper b)
939974
WC_MISC_STATIC WC_INLINE w64wrapper w64ShiftRight(w64wrapper a, int shift)
940975
{
941976
if (shift < 32) {
942-
a.n[1] = (a.n[1] >> shift) || (a.n[0] << (32 - shift));
977+
a.n[1] = (a.n[1] >> shift) | (a.n[0] << (32 - shift));
943978
a.n[0] >>= shift;
944979
}
945980
else {
@@ -951,7 +986,7 @@ WC_MISC_STATIC WC_INLINE w64wrapper w64ShiftRight(w64wrapper a, int shift)
951986
WC_MISC_STATIC WC_INLINE w64wrapper w64ShiftLeft(w64wrapper a, int shift)
952987
{
953988
if (shift < 32) {
954-
a.n[0] = (a.n[0] << shift) || (a.n[1] >> (32 - shift));
989+
a.n[0] = (a.n[0] << shift) | (a.n[1] >> (32 - shift));
955990
a.n[1] <<= shift;
956991
}
957992
else {
@@ -961,6 +996,30 @@ WC_MISC_STATIC WC_INLINE w64wrapper w64ShiftLeft(w64wrapper a, int shift)
961996
return a;
962997
}
963998

999+
/* Multiply two 32-bit values into a 64-bit wrapper without using a native
 * 64-bit type.
 *
 * Each factor is split into 16-bit halves (a = aHi*2^16 + aLo, likewise b),
 * so that every partial product fits in a word32:
 *
 *   a * b = hiHi*2^32 + (loHi + hiLo)*2^16 + loLo
 *
 * The sum is assembled with the wrapper helpers by starting from hiHi,
 * shifting left 16, adding both cross terms, shifting left 16 again and
 * finally adding loLo.
 *
 * a, b  32-bit unsigned factors
 *
 * Returns the 64-bit product.
 */
WC_MISC_STATIC WC_INLINE w64wrapper w64Mul(word32 a, word32 b)
{
    w64wrapper acc;
    word16 aLo, aHi, bLo, bHi;
    word32 loLo, hiLo, loHi, hiHi;

    /* 16-bit halves of each factor */
    aLo = a & 0xFFFF;
    aHi = (a >> 16) & 0xFFFF;
    bLo = b & 0xFFFF;
    bHi = (b >> 16) & 0xFFFF;

    /* four 16x16 -> 32 bit partial products */
    loLo = (word32)aLo * (word32)bLo;
    hiLo = (word32)aHi * (word32)bLo;
    loHi = (word32)aLo * (word32)bHi;
    hiHi = (word32)aHi * (word32)bHi;

    /* ((hiHi << 16) + loHi + hiLo) << 16, then + loLo */
    acc = w64From32(0, hiHi);
    acc = w64ShiftLeft(acc, 16);
    acc = w64Add32(acc, loHi, NULL);
    acc = w64Add32(acc, hiLo, NULL);
    acc = w64ShiftLeft(acc, 16);
    acc = w64Add32(acc, loLo, NULL);

    return acc;
}
1022+
9641023
#endif /* WORD64_AVAILABLE && !WOLFSSL_W64_WRAPPER_TEST */
9651024
#endif /* WOLFSSL_W64_WRAPPER */
9661025

wolfcrypt/src/poly1305.c

Lines changed: 104 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,13 @@ and Daniel J. Bernstein
2929
*/
3030

3131

32+
/*
33+
* WOLFSSL_W64_WRAPPER Uses wrappers around word64 types for a system that does
34+
* not have word64 available. As expected it reduces
35+
* performance. Benchmarks collected July 2024 show
36+
* 303.004 MiB/s with and 1874.194 MiB/s without.
37+
*/
38+
3239
#ifdef HAVE_CONFIG_H
3340
#include <config.h>
3441
#endif
@@ -332,8 +339,22 @@ static int poly1305_blocks(Poly1305* ctx, const unsigned char *m,
332339
word32 r0,r1,r2,r3,r4;
333340
word32 s1,s2,s3,s4;
334341
word32 h0,h1,h2,h3,h4;
335-
word64 d0,d1,d2,d3,d4;
336342
word32 c;
343+
#ifdef WOLFSSL_W64_WRAPPER
344+
#ifdef WOLFSSL_SMALL_STACK
345+
w64wrapper* d;
346+
347+
d = (w64wrapper*)XMALLOC(5 * sizeof(w64wrapper), NULL,
348+
DYNAMIC_TYPE_TMP_BUFFER);
349+
if (d == NULL) {
350+
return MEMORY_E;
351+
}
352+
#else
353+
w64wrapper d[5];
354+
#endif
355+
#else
356+
word64 d0,d1,d2,d3,d4;
357+
#endif
337358

338359

339360
r0 = ctx->r[0];
@@ -362,6 +383,41 @@ static int poly1305_blocks(Poly1305* ctx, const unsigned char *m,
362383
h4 += (U8TO32(m+12) >> 8) | hibit;
363384

364385
/* h *= r */
386+
#ifdef WOLFSSL_W64_WRAPPER
387+
{
388+
w64wrapper tmp;
389+
390+
d[0] = w64Mul(h0, r0); tmp = w64Mul(h1, s4);
391+
d[0] = w64Add(d[0], tmp, NULL); tmp = w64Mul(h2, s3);
392+
d[0] = w64Add(d[0], tmp, NULL); tmp = w64Mul(h3, s2);
393+
d[0] = w64Add(d[0], tmp, NULL); tmp = w64Mul(h4, s1);
394+
d[0] = w64Add(d[0], tmp, NULL);
395+
396+
d[1] = w64Mul(h0, r1); tmp = w64Mul(h1, r0);
397+
d[1] = w64Add(d[1], tmp, NULL); tmp = w64Mul(h2, s4);
398+
d[1] = w64Add(d[1], tmp, NULL); tmp = w64Mul(h3, s3);
399+
d[1] = w64Add(d[1], tmp, NULL); tmp = w64Mul(h4, s2);
400+
d[1] = w64Add(d[1], tmp, NULL);
401+
402+
d[2] = w64Mul(h0, r2); tmp = w64Mul(h1, r1);
403+
d[2] = w64Add(d[2], tmp, NULL); tmp = w64Mul(h2, r0);
404+
d[2] = w64Add(d[2], tmp, NULL); tmp = w64Mul(h3, s4);
405+
d[2] = w64Add(d[2], tmp, NULL); tmp = w64Mul(h4, s3);
406+
d[2] = w64Add(d[2], tmp, NULL);
407+
408+
d[3] = w64Mul(h0, r3); tmp = w64Mul(h1, r2);
409+
d[3] = w64Add(d[3], tmp, NULL); tmp = w64Mul(h2, r1);
410+
d[3] = w64Add(d[3], tmp, NULL); tmp = w64Mul(h3, r0);
411+
d[3] = w64Add(d[3], tmp, NULL); tmp = w64Mul(h4, s4);
412+
d[3] = w64Add(d[3], tmp, NULL);
413+
414+
d[4] = w64Mul(h0, r4); tmp = w64Mul(h1, r3);
415+
d[4] = w64Add(d[4], tmp, NULL); tmp = w64Mul(h2, r2);
416+
d[4] = w64Add(d[4], tmp, NULL); tmp = w64Mul(h3, r1);
417+
d[4] = w64Add(d[4], tmp, NULL); tmp = w64Mul(h4, r0);
418+
d[4] = w64Add(d[4], tmp, NULL);
419+
}
420+
#else
365421
d0 = ((word64)h0 * r0) + ((word64)h1 * s4) + ((word64)h2 * s3) +
366422
((word64)h3 * s2) + ((word64)h4 * s1);
367423
d1 = ((word64)h0 * r1) + ((word64)h1 * r0) + ((word64)h2 * s4) +
@@ -372,13 +428,31 @@ static int poly1305_blocks(Poly1305* ctx, const unsigned char *m,
372428
((word64)h3 * r0) + ((word64)h4 * s4);
373429
d4 = ((word64)h0 * r4) + ((word64)h1 * r3) + ((word64)h2 * r2) +
374430
((word64)h3 * r1) + ((word64)h4 * r0);
431+
#endif
375432

376433
/* (partial) h %= p */
434+
#ifdef WOLFSSL_W64_WRAPPER
435+
c = w64GetLow32(w64ShiftRight(d[0], 26));
436+
h0 = w64GetLow32(d[0]) & 0x3ffffff;
437+
d[1] = w64Add32(d[1], c, NULL);
438+
c = w64GetLow32(w64ShiftRight(d[1], 26));
439+
h1 = w64GetLow32(d[1]) & 0x3ffffff;
440+
d[2] = w64Add32(d[2], c, NULL);
441+
c = w64GetLow32(w64ShiftRight(d[2], 26));
442+
h2 = w64GetLow32(d[2]) & 0x3ffffff;
443+
d[3] = w64Add32(d[3], c, NULL);
444+
c = w64GetLow32(w64ShiftRight(d[3], 26));
445+
h3 = w64GetLow32(d[3]) & 0x3ffffff;
446+
d[4] = w64Add32(d[4], c, NULL);
447+
c = w64GetLow32(w64ShiftRight(d[4], 26));
448+
h4 = w64GetLow32(d[4]) & 0x3ffffff;
449+
#else
377450
c = (word32)(d0 >> 26); h0 = (word32)d0 & 0x3ffffff;
378451
d1 += c; c = (word32)(d1 >> 26); h1 = (word32)d1 & 0x3ffffff;
379452
d2 += c; c = (word32)(d2 >> 26); h2 = (word32)d2 & 0x3ffffff;
380453
d3 += c; c = (word32)(d3 >> 26); h3 = (word32)d3 & 0x3ffffff;
381454
d4 += c; c = (word32)(d4 >> 26); h4 = (word32)d4 & 0x3ffffff;
455+
#endif
382456
h0 += c * 5; c = (h0 >> 26); h0 = h0 & 0x3ffffff;
383457
h1 += c;
384458

@@ -392,6 +466,10 @@ static int poly1305_blocks(Poly1305* ctx, const unsigned char *m,
392466
ctx->h[3] = h3;
393467
ctx->h[4] = h4;
394468

469+
#if defined(WOLFSSL_W64_WRAPPER) && defined(WOLFSSL_SMALL_STACK)
470+
XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
471+
#endif
472+
395473
return 0;
396474

397475
#endif /* end of 64 bit cpu blocks or 32 bit cpu */
@@ -517,7 +595,11 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac)
517595

518596
word32 h0,h1,h2,h3,h4,c;
519597
word32 g0,g1,g2,g3,g4;
598+
#ifdef WOLFSSL_W64_WRAPPER
599+
w64wrapper f;
600+
#else
520601
word64 f;
602+
#endif
521603
word32 mask;
522604

523605
#endif
@@ -656,10 +738,31 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac)
656738
h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff;
657739

658740
/* mac = (h + pad) % (2^128) */
741+
#ifdef WOLFSSL_W64_WRAPPER
742+
f = w64From32(0, h0);
743+
f = w64Add32(f, ctx->pad[0], NULL);
744+
h0 = w64GetLow32(f);
745+
746+
f = w64ShiftRight(f, 32);
747+
f = w64Add32(f, h1, NULL);
748+
f = w64Add32(f, ctx->pad[1], NULL);
749+
h1 = w64GetLow32(f);
750+
751+
f = w64ShiftRight(f, 32);
752+
f = w64Add32(f, h2, NULL);
753+
f = w64Add32(f, ctx->pad[2], NULL);
754+
h2 = w64GetLow32(f);
755+
756+
f = w64ShiftRight(f, 32);
757+
f = w64Add32(f, h3, NULL);
758+
f = w64Add32(f, ctx->pad[3], NULL);
759+
h3 = w64GetLow32(f);
760+
#else
659761
f = (word64)h0 + ctx->pad[0] ; h0 = (word32)f;
660762
f = (word64)h1 + ctx->pad[1] + (f >> 32); h1 = (word32)f;
661763
f = (word64)h2 + ctx->pad[2] + (f >> 32); h2 = (word32)f;
662764
f = (word64)h3 + ctx->pad[3] + (f >> 32); h3 = (word32)f;
765+
#endif
663766

664767
U32TO8(mac + 0, h0);
665768
U32TO8(mac + 4, h1);

wolfssl/wolfcrypt/misc.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ WOLFSSL_LOCAL word32 w64GetLow32(w64wrapper n);
145145
WOLFSSL_LOCAL word32 w64GetHigh32(w64wrapper n);
146146
WOLFSSL_LOCAL void w64SetLow32(w64wrapper *n, word32 low);
147147
WOLFSSL_LOCAL w64wrapper w64Add32(w64wrapper a, word32 b, byte *wrap);
148+
WOLFSSL_LOCAL w64wrapper w64Add(w64wrapper a, w64wrapper b, byte *wrap);
148149
WOLFSSL_LOCAL w64wrapper w64Sub32(w64wrapper a, word32 b, byte *wrap);
149150
WOLFSSL_LOCAL byte w64GT(w64wrapper a, w64wrapper b);
150151
WOLFSSL_LOCAL byte w64IsZero(w64wrapper a);
@@ -157,6 +158,7 @@ WOLFSSL_LOCAL w64wrapper w64Sub(w64wrapper a, w64wrapper b);
157158
WOLFSSL_LOCAL void w64Zero(w64wrapper *a);
158159
WOLFSSL_LOCAL w64wrapper w64ShiftRight(w64wrapper a, int shift);
159160
WOLFSSL_LOCAL w64wrapper w64ShiftLeft(w64wrapper a, int shift);
161+
WOLFSSL_LOCAL w64wrapper w64Mul(word32 a, word32 b);
160162

161163
#else /* !NO_INLINE */
162164

wolfssl/wolfcrypt/poly1305.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@
5757

5858
#if defined(USE_INTEL_POLY1305_SPEEDUP)
5959
#elif (defined(WC_HAS_SIZEOF_INT128_64BIT) || defined(WC_HAS_MSVC_64BIT) || \
60-
defined(WC_HAS_GCC_4_4_64BIT))
60+
defined(WC_HAS_GCC_4_4_64BIT)) && !defined(WOLFSSL_W64_WRAPPER_TEST)
6161
#define POLY130564
6262
#else
6363
#define POLY130532

0 commit comments

Comments
 (0)