Skip to content

Commit 9d79135

Browse files
authored
Merge pull request #7460 from douzzer/20240423-linuxkm-sha-2-3-asm-save-vector-regs
20240423-linuxkm-sha-2-3-asm-save-vector-regs
2 parents a75c2be + 5d9154e commit 9d79135

4 files changed

Lines changed: 333 additions & 5 deletions

File tree

wolfcrypt/src/sha256.c

Lines changed: 157 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -371,25 +371,172 @@ static int InitSha256(wc_Sha256* sha256)
371371
} /* extern "C" */
372372
#endif
373373

374+
static word32 intel_flags;
375+
static int Transform_Sha256_is_vectorized = 0;
376+
377+
#ifdef WC_NO_INTERNAL_FUNCTION_POINTERS
378+
379+
/* Which SHA-256 transform the dispatch helpers should call.  Starts out as
 * SHA256_UNSET and is assigned once by Sha256_SetTransform().
 */
static enum {
    SHA256_UNSET,      /* CPU not probed yet */
    SHA256_AVX1,       /* AVX1; recorded both with and without SHA extensions */
    SHA256_AVX2,       /* AVX2 without BMI2 (or RORX not compiled in) */
    SHA256_AVX1_RORX,  /* AVX1 with BMI2 */
    SHA256_AVX2_RORX,  /* AVX2 with BMI2 */
    SHA256_SSE2,       /* SHA extensions present, AVX1 path not taken */
    SHA256_C           /* no accelerated path: portable Transform_Sha256() */
} sha_method = SHA256_UNSET;
382+
383+
static void Sha256_SetTransform(void)
384+
{
385+
386+
if (sha_method != SHA256_UNSET)
387+
return;
388+
389+
intel_flags = cpuid_get_flags();
390+
391+
if (IS_INTEL_SHA(intel_flags)) {
392+
#ifdef HAVE_INTEL_AVX1
393+
if (IS_INTEL_AVX1(intel_flags)) {
394+
sha_method = SHA256_AVX1;
395+
Transform_Sha256_is_vectorized = 1;
396+
}
397+
else
398+
#endif
399+
{
400+
sha_method = SHA256_SSE2;
401+
Transform_Sha256_is_vectorized = 1;
402+
}
403+
}
404+
else
405+
#ifdef HAVE_INTEL_AVX2
406+
if (IS_INTEL_AVX2(intel_flags)) {
407+
#ifdef HAVE_INTEL_RORX
408+
if (IS_INTEL_BMI2(intel_flags)) {
409+
sha_method = SHA256_AVX2_RORX;
410+
Transform_Sha256_is_vectorized = 1;
411+
}
412+
else
413+
#endif
414+
{
415+
sha_method = SHA256_AVX2;
416+
Transform_Sha256_is_vectorized = 1;
417+
}
418+
}
419+
else
420+
#endif
421+
#ifdef HAVE_INTEL_AVX1
422+
if (IS_INTEL_AVX1(intel_flags)) {
423+
#ifdef HAVE_INTEL_RORX
424+
if (IS_INTEL_BMI2(intel_flags)) {
425+
sha_method = SHA256_AVX1_RORX;
426+
Transform_Sha256_is_vectorized = 1;
427+
}
428+
else
429+
#endif
430+
{
431+
sha_method = SHA256_AVX1;
432+
Transform_Sha256_is_vectorized = 1;
433+
}
434+
}
435+
else
436+
#endif
437+
{
438+
sha_method = SHA256_C;
439+
Transform_Sha256_is_vectorized = 0;
440+
}
441+
}
442+
443+
/* Run one SHA-256 block transform using the implementation recorded in
 * sha_method by Sha256_SetTransform().
 *
 * S: hash state, passed through to the selected transform.
 * D: input block, passed through to the selected transform.
 *
 * Returns the selected transform's result.  If SAVE_VECTOR_REGISTERS()
 * fails, its failure clause returns _svr_ret before any transform runs.
 */
static WC_INLINE int inline_XTRANSFORM(wc_Sha256* S, const byte* D) {
    int ret;

    /* The portable C transform is called outside the save/restore bracket. */
    if (sha_method == SHA256_C)
        return Transform_Sha256(S, D);

    SAVE_VECTOR_REGISTERS(return _svr_ret;);
    switch (sha_method) {
    case SHA256_AVX2:
        ret = Transform_Sha256_AVX2(S, D);
        break;
    case SHA256_AVX2_RORX:
        ret = Transform_Sha256_AVX2_RORX(S, D);
        break;
    case SHA256_AVX1:
        /* SHA256_AVX1 is recorded both when the CPU reports SHA extensions
         * and when it reports AVX1 alone.  Only the former may run the _Sha
         * variant; an AVX1-only CPU must use the plain AVX1 transform, as
         * calling the SHA-extension code there would execute unsupported
         * instructions.
         */
        if (IS_INTEL_SHA(intel_flags))
            ret = Transform_Sha256_AVX1_Sha(S, D);
        else
            ret = Transform_Sha256_AVX1(S, D);
        break;
    case SHA256_AVX1_RORX:
        ret = Transform_Sha256_AVX1_RORX(S, D);
        break;
    case SHA256_SSE2:
        ret = Transform_Sha256_SSE2_Sha(S, D);
        break;
    case SHA256_C:
    case SHA256_UNSET:
    default:
        /* Method not selected yet (or unknown): use the portable code. */
        ret = Transform_Sha256(S, D);
        break;
    }
    RESTORE_VECTOR_REGISTERS();

    return ret;
}
473+
#define XTRANSFORM(...) inline_XTRANSFORM(__VA_ARGS__)
474+
475+
/* Run the multi-block ("_Len") SHA-256 transform using the implementation
 * recorded in sha_method by Sha256_SetTransform().
 *
 * S: hash state, passed through to the selected transform.
 * D: input data, passed through to the selected transform.
 * L: length argument, passed through to the selected transform.
 *
 * Returns the selected transform's result.  If SAVE_VECTOR_REGISTERS()
 * fails, its failure clause returns _svr_ret before any transform runs.
 *
 * There is no multi-block C transform here: for SHA256_C, SHA256_UNSET or an
 * unknown method this returns 0 WITHOUT consuming any input, so callers must
 * check sha_method before relying on this helper.
 */
static WC_INLINE int inline_XTRANSFORM_LEN(wc_Sha256* S, const byte* D, word32 L) {
    int ret;

    SAVE_VECTOR_REGISTERS(return _svr_ret;);
    switch (sha_method) {
    case SHA256_AVX2:
        ret = Transform_Sha256_AVX2_Len(S, D, L);
        break;
    case SHA256_AVX2_RORX:
        ret = Transform_Sha256_AVX2_RORX_Len(S, D, L);
        break;
    case SHA256_AVX1:
        /* SHA256_AVX1 is recorded both when the CPU reports SHA extensions
         * and when it reports AVX1 alone.  Only the former may run the _Sha
         * variant; an AVX1-only CPU must use the plain AVX1 transform, as
         * calling the SHA-extension code there would execute unsupported
         * instructions.
         */
        if (IS_INTEL_SHA(intel_flags))
            ret = Transform_Sha256_AVX1_Sha_Len(S, D, L);
        else
            ret = Transform_Sha256_AVX1_Len(S, D, L);
        break;
    case SHA256_AVX1_RORX:
        ret = Transform_Sha256_AVX1_RORX_Len(S, D, L);
        break;
    case SHA256_SSE2:
        ret = Transform_Sha256_SSE2_Sha_Len(S, D, L);
        break;
    case SHA256_C:
    case SHA256_UNSET:
    default:
        /* No multi-block implementation for these methods; nothing is
         * hashed (see the function comment).
         */
        ret = 0;
        break;
    }
    RESTORE_VECTOR_REGISTERS();

    return ret;
}
503+
#define XTRANSFORM_LEN(...) inline_XTRANSFORM_LEN(__VA_ARGS__)
504+
505+
#else /* !WC_NO_INTERNAL_FUNCTION_POINTERS */
506+
374507
static int (*Transform_Sha256_p)(wc_Sha256* sha256, const byte* data);
375508
/* = _Transform_Sha256 */
376509
static int (*Transform_Sha256_Len_p)(wc_Sha256* sha256, const byte* data,
377510
word32 len);
378511
/* = NULL */
379512
static int transform_check = 0;
380-
static word32 intel_flags;
381-
static int Transform_Sha256_is_vectorized = 0;
382513

383514
/* Function-pointer build: run one block transform through
 * Transform_Sha256_p.
 *
 * In WOLFSSL_LINUXKM builds the call is bracketed by
 * SAVE_VECTOR_REGISTERS()/RESTORE_VECTOR_REGISTERS() whenever
 * Transform_Sha256_is_vectorized is set; if the save fails, its failure
 * clause returns _svr_ret and the transform is not called.
 *
 * Returns the result of the transform pointed to by Transform_Sha256_p.
 */
static WC_INLINE int inline_XTRANSFORM(wc_Sha256* S, const byte* D) {
    int rc;

#ifdef WOLFSSL_LINUXKM
    if (Transform_Sha256_is_vectorized) {
        SAVE_VECTOR_REGISTERS(return _svr_ret;);
    }
#endif

    rc = (*Transform_Sha256_p)(S, D);

#ifdef WOLFSSL_LINUXKM
    if (Transform_Sha256_is_vectorized) {
        RESTORE_VECTOR_REGISTERS();
    }
#endif

    return rc;
}
388527
#define XTRANSFORM(...) inline_XTRANSFORM(__VA_ARGS__)
389528

390529
/* Function-pointer build: run the multi-block transform through
 * Transform_Sha256_Len_p.  The pointer is dereferenced unconditionally, so
 * the caller is responsible for checking that it is non-NULL.
 *
 * In WOLFSSL_LINUXKM builds the call is bracketed by
 * SAVE_VECTOR_REGISTERS()/RESTORE_VECTOR_REGISTERS() whenever
 * Transform_Sha256_is_vectorized is set; if the save fails, its failure
 * clause returns _svr_ret and the transform is not called.
 *
 * Returns the result of the transform pointed to by Transform_Sha256_Len_p.
 */
static WC_INLINE int inline_XTRANSFORM_LEN(wc_Sha256* S, const byte* D, word32 L) {
    int rc;

#ifdef WOLFSSL_LINUXKM
    if (Transform_Sha256_is_vectorized) {
        SAVE_VECTOR_REGISTERS(return _svr_ret;);
    }
#endif

    rc = (*Transform_Sha256_Len_p)(S, D, L);

#ifdef WOLFSSL_LINUXKM
    if (Transform_Sha256_is_vectorized) {
        RESTORE_VECTOR_REGISTERS();
    }
#endif

    return rc;
}
395542
#define XTRANSFORM_LEN(...) inline_XTRANSFORM_LEN(__VA_ARGS__)
@@ -463,6 +610,8 @@ static int InitSha256(wc_Sha256* sha256)
463610
transform_check = 1;
464611
}
465612

613+
#endif /* !WC_NO_INTERNAL_FUNCTION_POINTERS */
614+
466615
#if !defined(WOLFSSL_KCAPI_HASH)
467616
int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
468617
{
@@ -1162,7 +1311,13 @@ static int InitSha256(wc_Sha256* sha256)
11621311
#ifdef XTRANSFORM_LEN
11631312
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
11641313
(defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
1314+
1315+
#ifdef WC_NO_INTERNAL_FUNCTION_POINTERS
1316+
if (sha_method != SHA256_C)
1317+
#else
11651318
if (Transform_Sha256_Len_p != NULL)
1319+
#endif
1320+
11661321
#endif
11671322
{
11681323
if (len >= WC_SHA256_BLOCK_SIZE) {

wolfcrypt/src/sha3.c

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -651,6 +651,10 @@ static int Sha3Update(wc_Sha3* sha3, const byte* data, word32 len, byte p)
651651
word32 i;
652652
word32 blocks;
653653

654+
#if defined(WOLFSSL_LINUXKM) && defined(USE_INTEL_SPEEDUP)
655+
if (sha3_block == sha3_block_avx2)
656+
SAVE_VECTOR_REGISTERS(return _svr_ret;);
657+
#endif
654658
if (sha3->i > 0) {
655659
byte *t;
656660
byte l = (byte)(p * 8 - sha3->i);
@@ -699,6 +703,10 @@ static int Sha3Update(wc_Sha3* sha3, const byte* data, word32 len, byte p)
699703
len -= p * 8;
700704
data += p * 8;
701705
}
706+
#if defined(WOLFSSL_LINUXKM) && defined(USE_INTEL_SPEEDUP)
707+
if (sha3_block == sha3_block_avx2)
708+
RESTORE_VECTOR_REGISTERS();
709+
#endif
702710
XMEMCPY(sha3->t, data, len);
703711
sha3->i += (byte)len;
704712

@@ -732,6 +740,12 @@ static int Sha3Final(wc_Sha3* sha3, byte padChar, byte* hash, byte p, word32 l)
732740
for (i = 0; i < p; i++) {
733741
sha3->s[i] ^= Load64BitBigEndian(sha3->t + 8 * i);
734742
}
743+
744+
#if defined(WOLFSSL_LINUXKM) && defined(USE_INTEL_SPEEDUP)
745+
if (sha3_block == sha3_block_avx2)
746+
SAVE_VECTOR_REGISTERS(return _svr_ret;);
747+
#endif
748+
735749
for (j = 0; l - j >= rate; j += rate) {
736750
#ifdef USE_INTEL_SPEEDUP
737751
(*sha3_block)(sha3->s);
@@ -755,6 +769,11 @@ static int Sha3Final(wc_Sha3* sha3, byte padChar, byte* hash, byte p, word32 l)
755769
#endif
756770
XMEMCPY(hash + j, sha3->s, l - j);
757771
}
772+
#if defined(WOLFSSL_LINUXKM) && defined(USE_INTEL_SPEEDUP)
773+
if (sha3_block == sha3_block_avx2)
774+
RESTORE_VECTOR_REGISTERS();
775+
#endif
776+
758777
return 0;
759778
}
760779

@@ -1328,6 +1347,10 @@ int wc_Shake128_Absorb(wc_Shake* shake, const byte* data, word32 len)
13281347
*/
13291348
int wc_Shake128_SqueezeBlocks(wc_Shake* shake, byte* out, word32 blockCnt)
13301349
{
1350+
#if defined(WOLFSSL_LINUXKM) && defined(USE_INTEL_SPEEDUP)
1351+
if (sha3_block == sha3_block_avx2)
1352+
SAVE_VECTOR_REGISTERS(return _svr_ret;);
1353+
#endif
13311354
for (; (blockCnt > 0); blockCnt--) {
13321355
#ifdef USE_INTEL_SPEEDUP
13331356
(*sha3_block)(shake->s);
@@ -1341,6 +1364,10 @@ int wc_Shake128_SqueezeBlocks(wc_Shake* shake, byte* out, word32 blockCnt)
13411364
#endif
13421365
out += WC_SHA3_128_COUNT * 8;
13431366
}
1367+
#if defined(WOLFSSL_LINUXKM) && defined(USE_INTEL_SPEEDUP)
1368+
if (sha3_block == sha3_block_avx2)
1369+
RESTORE_VECTOR_REGISTERS();
1370+
#endif
13441371

13451372
return 0;
13461373
}
@@ -1458,6 +1485,10 @@ int wc_Shake256_Absorb(wc_Shake* shake, const byte* data, word32 len)
14581485
*/
14591486
int wc_Shake256_SqueezeBlocks(wc_Shake* shake, byte* out, word32 blockCnt)
14601487
{
1488+
#if defined(WOLFSSL_LINUXKM) && defined(USE_INTEL_SPEEDUP)
1489+
if (sha3_block == sha3_block_avx2)
1490+
SAVE_VECTOR_REGISTERS(return _svr_ret;);
1491+
#endif
14611492
for (; (blockCnt > 0); blockCnt--) {
14621493
#ifdef USE_INTEL_SPEEDUP
14631494
(*sha3_block)(shake->s);
@@ -1471,6 +1502,10 @@ int wc_Shake256_SqueezeBlocks(wc_Shake* shake, byte* out, word32 blockCnt)
14711502
#endif
14721503
out += WC_SHA3_256_COUNT * 8;
14731504
}
1505+
#if defined(WOLFSSL_LINUXKM) && defined(USE_INTEL_SPEEDUP)
1506+
if (sha3_block == sha3_block_avx2)
1507+
RESTORE_VECTOR_REGISTERS();
1508+
#endif
14741509

14751510
return 0;
14761511
}

0 commit comments

Comments
 (0)