@@ -29,6 +29,13 @@ and Daniel J. Bernstein
2929*/
3030
3131
32+ /*
33+ * WOLFSSL_W64_WRAPPER Uses wrappers around word64 types for a system that does
34+ * not have word64 available. As expected it reduces
35+ * performance. Benchmarks collected July 2024 show
36+ * 303.004 MiB/s with the wrapper and 1874.194 MiB/s without it.
37+ */
38+
3239#ifdef HAVE_CONFIG_H
3340 #include <config.h>
3441#endif
@@ -332,8 +339,22 @@ static int poly1305_blocks(Poly1305* ctx, const unsigned char *m,
332339 word32 r0 ,r1 ,r2 ,r3 ,r4 ;
333340 word32 s1 ,s2 ,s3 ,s4 ;
334341 word32 h0 ,h1 ,h2 ,h3 ,h4 ;
335- word64 d0 ,d1 ,d2 ,d3 ,d4 ;
336342 word32 c ;
343+ #ifdef WOLFSSL_W64_WRAPPER
344+ #ifdef WOLFSSL_SMALL_STACK
345+ w64wrapper * d ;
346+
347+ d = (w64wrapper * )XMALLOC (5 * sizeof (w64wrapper ), NULL ,
348+ DYNAMIC_TYPE_TMP_BUFFER );
349+ if (d == NULL ) {
350+ return MEMORY_E ;
351+ }
352+ #else
353+ w64wrapper d [5 ];
354+ #endif
355+ #else
356+ word64 d0 ,d1 ,d2 ,d3 ,d4 ;
357+ #endif
337358
338359
339360 r0 = ctx -> r [0 ];
@@ -362,6 +383,41 @@ static int poly1305_blocks(Poly1305* ctx, const unsigned char *m,
362383 h4 += (U8TO32 (m + 12 ) >> 8 ) | hibit ;
363384
364385 /* h *= r */
386+ #ifdef WOLFSSL_W64_WRAPPER
387+ {
388+ w64wrapper tmp ;
389+
390+ d [0 ] = w64Mul (h0 , r0 ); tmp = w64Mul (h1 , s4 );
391+ d [0 ] = w64Add (d [0 ], tmp , NULL ); tmp = w64Mul (h2 , s3 );
392+ d [0 ] = w64Add (d [0 ], tmp , NULL ); tmp = w64Mul (h3 , s2 );
393+ d [0 ] = w64Add (d [0 ], tmp , NULL ); tmp = w64Mul (h4 , s1 );
394+ d [0 ] = w64Add (d [0 ], tmp , NULL );
395+
396+ d [1 ] = w64Mul (h0 , r1 ); tmp = w64Mul (h1 , r0 );
397+ d [1 ] = w64Add (d [1 ], tmp , NULL ); tmp = w64Mul (h2 , s4 );
398+ d [1 ] = w64Add (d [1 ], tmp , NULL ); tmp = w64Mul (h3 , s3 );
399+ d [1 ] = w64Add (d [1 ], tmp , NULL ); tmp = w64Mul (h4 , s2 );
400+ d [1 ] = w64Add (d [1 ], tmp , NULL );
401+
402+ d [2 ] = w64Mul (h0 , r2 ); tmp = w64Mul (h1 , r1 );
403+ d [2 ] = w64Add (d [2 ], tmp , NULL ); tmp = w64Mul (h2 , r0 );
404+ d [2 ] = w64Add (d [2 ], tmp , NULL ); tmp = w64Mul (h3 , s4 );
405+ d [2 ] = w64Add (d [2 ], tmp , NULL ); tmp = w64Mul (h4 , s3 );
406+ d [2 ] = w64Add (d [2 ], tmp , NULL );
407+
408+ d [3 ] = w64Mul (h0 , r3 ); tmp = w64Mul (h1 , r2 );
409+ d [3 ] = w64Add (d [3 ], tmp , NULL ); tmp = w64Mul (h2 , r1 );
410+ d [3 ] = w64Add (d [3 ], tmp , NULL ); tmp = w64Mul (h3 , r0 );
411+ d [3 ] = w64Add (d [3 ], tmp , NULL ); tmp = w64Mul (h4 , s4 );
412+ d [3 ] = w64Add (d [3 ], tmp , NULL );
413+
414+ d [4 ] = w64Mul (h0 , r4 ); tmp = w64Mul (h1 , r3 );
415+ d [4 ] = w64Add (d [4 ], tmp , NULL ); tmp = w64Mul (h2 , r2 );
416+ d [4 ] = w64Add (d [4 ], tmp , NULL ); tmp = w64Mul (h3 , r1 );
417+ d [4 ] = w64Add (d [4 ], tmp , NULL ); tmp = w64Mul (h4 , r0 );
418+ d [4 ] = w64Add (d [4 ], tmp , NULL );
419+ }
420+ #else
365421 d0 = ((word64 )h0 * r0 ) + ((word64 )h1 * s4 ) + ((word64 )h2 * s3 ) +
366422 ((word64 )h3 * s2 ) + ((word64 )h4 * s1 );
367423 d1 = ((word64 )h0 * r1 ) + ((word64 )h1 * r0 ) + ((word64 )h2 * s4 ) +
@@ -372,13 +428,31 @@ static int poly1305_blocks(Poly1305* ctx, const unsigned char *m,
372428 ((word64 )h3 * r0 ) + ((word64 )h4 * s4 );
373429 d4 = ((word64 )h0 * r4 ) + ((word64 )h1 * r3 ) + ((word64 )h2 * r2 ) +
374430 ((word64 )h3 * r1 ) + ((word64 )h4 * r0 );
431+ #endif
375432
376433 /* (partial) h %= p */
434+ #ifdef WOLFSSL_W64_WRAPPER
435+ c = w64GetLow32 (w64ShiftRight (d [0 ], 26 ));
436+ h0 = w64GetLow32 (d [0 ]) & 0x3ffffff ;
437+ d [1 ] = w64Add32 (d [1 ], c , NULL );
438+ c = w64GetLow32 (w64ShiftRight (d [1 ], 26 ));
439+ h1 = w64GetLow32 (d [1 ]) & 0x3ffffff ;
440+ d [2 ] = w64Add32 (d [2 ], c , NULL );
441+ c = w64GetLow32 (w64ShiftRight (d [2 ], 26 ));
442+ h2 = w64GetLow32 (d [2 ]) & 0x3ffffff ;
443+ d [3 ] = w64Add32 (d [3 ], c , NULL );
444+ c = w64GetLow32 (w64ShiftRight (d [3 ], 26 ));
445+ h3 = w64GetLow32 (d [3 ]) & 0x3ffffff ;
446+ d [4 ] = w64Add32 (d [4 ], c , NULL );
447+ c = w64GetLow32 (w64ShiftRight (d [4 ], 26 ));
448+ h4 = w64GetLow32 (d [4 ]) & 0x3ffffff ;
449+ #else
377450 c = (word32 )(d0 >> 26 ); h0 = (word32 )d0 & 0x3ffffff ;
378451 d1 += c ; c = (word32 )(d1 >> 26 ); h1 = (word32 )d1 & 0x3ffffff ;
379452 d2 += c ; c = (word32 )(d2 >> 26 ); h2 = (word32 )d2 & 0x3ffffff ;
380453 d3 += c ; c = (word32 )(d3 >> 26 ); h3 = (word32 )d3 & 0x3ffffff ;
381454 d4 += c ; c = (word32 )(d4 >> 26 ); h4 = (word32 )d4 & 0x3ffffff ;
455+ #endif
382456 h0 += c * 5 ; c = (h0 >> 26 ); h0 = h0 & 0x3ffffff ;
383457 h1 += c ;
384458
@@ -392,6 +466,10 @@ static int poly1305_blocks(Poly1305* ctx, const unsigned char *m,
392466 ctx -> h [3 ] = h3 ;
393467 ctx -> h [4 ] = h4 ;
394468
469+ #if defined(WOLFSSL_W64_WRAPPER ) && defined(WOLFSSL_SMALL_STACK )
470+ XFREE (d , NULL , DYNAMIC_TYPE_TMP_BUFFER );
471+ #endif
472+
395473 return 0 ;
396474
397475#endif /* end of 64 bit cpu blocks or 32 bit cpu */
@@ -517,7 +595,11 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac)
517595
518596 word32 h0 ,h1 ,h2 ,h3 ,h4 ,c ;
519597 word32 g0 ,g1 ,g2 ,g3 ,g4 ;
598+ #ifdef WOLFSSL_W64_WRAPPER
599+ w64wrapper f ;
600+ #else
520601 word64 f ;
602+ #endif
521603 word32 mask ;
522604
523605#endif
@@ -656,10 +738,31 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac)
656738 h3 = ((h3 >> 18 ) | (h4 << 8 )) & 0xffffffff ;
657739
658740 /* mac = (h + pad) % (2^128) */
741+ #ifdef WOLFSSL_W64_WRAPPER
742+ f = w64From32 (0 , h0 );
743+ f = w64Add32 (f , ctx -> pad [0 ], NULL );
744+ h0 = w64GetLow32 (f );
745+
746+ f = w64ShiftRight (f , 32 );
747+ f = w64Add32 (f , h1 , NULL );
748+ f = w64Add32 (f , ctx -> pad [1 ], NULL );
749+ h1 = w64GetLow32 (f );
750+
751+ f = w64ShiftRight (f , 32 );
752+ f = w64Add32 (f , h2 , NULL );
753+ f = w64Add32 (f , ctx -> pad [2 ], NULL );
754+ h2 = w64GetLow32 (f );
755+
756+ f = w64ShiftRight (f , 32 );
757+ f = w64Add32 (f , h3 , NULL );
758+ f = w64Add32 (f , ctx -> pad [3 ], NULL );
759+ h3 = w64GetLow32 (f );
760+ #else
659761 f = (word64 )h0 + ctx -> pad [0 ] ; h0 = (word32 )f ;
660762 f = (word64 )h1 + ctx -> pad [1 ] + (f >> 32 ); h1 = (word32 )f ;
661763 f = (word64 )h2 + ctx -> pad [2 ] + (f >> 32 ); h2 = (word32 )f ;
662764 f = (word64 )h3 + ctx -> pad [3 ] + (f >> 32 ); h3 = (word32 )f ;
765+ #endif
663766
664767 U32TO8 (mac + 0 , h0 );
665768 U32TO8 (mac + 4 , h1 );
0 commit comments