@@ -1236,8 +1236,9 @@ void kyber_keygen(sword16* priv, sword16* pub, sword16* e, const sword16* a,
12361236 int kp )
12371237{
12381238#ifdef USE_INTEL_SPEEDUP
1239- if (IS_INTEL_AVX2 (cpuid_flags )) {
1239+ if (( IS_INTEL_AVX2 (cpuid_flags )) && ( SAVE_VECTOR_REGISTERS2 () == 0 )) {
12401240 kyber_keygen_avx2 (priv , pub , e , a , kp );
1241+ RESTORE_VECTOR_REGISTERS ();
12411242 }
12421243 else
12431244#endif
@@ -1314,8 +1315,9 @@ void kyber_encapsulate(const sword16* pub, sword16* bp, sword16* v,
13141315 const sword16 * m , int kp )
13151316{
13161317#ifdef USE_INTEL_SPEEDUP
1317- if (IS_INTEL_AVX2 (cpuid_flags )) {
1318+ if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
13181319 kyber_encapsulate_avx2 (pub , bp , v , at , sp , ep , epp , m , kp );
1320+ RESTORE_VECTOR_REGISTERS ();
13191321 }
13201322 else
13211323#endif
@@ -1365,8 +1367,9 @@ void kyber_decapsulate(const sword16* priv, sword16* mp, sword16* bp,
13651367 const sword16 * v , int kp )
13661368{
13671369#ifdef USE_INTEL_SPEEDUP
1368- if (IS_INTEL_AVX2 (cpuid_flags )) {
1370+ if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
13691371 kyber_decapsulate_avx2 (priv , mp , bp , v , kp );
1372+ RESTORE_VECTOR_REGISTERS ();
13701373 }
13711374 else
13721375#endif
@@ -1569,8 +1572,9 @@ static int kyber_gen_matrix_k3_avx2(sword16* a, byte* seed, int transposed)
15691572 if (IS_INTEL_BMI2 (cpuid_flags )) {
15701573 sha3_block_bmi2 (state );
15711574 }
1572- else if (IS_INTEL_AVX2 (cpuid_flags )) {
1575+ else if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
15731576 sha3_block_avx2 (state );
1577+ RESTORE_VECTOR_REGISTERS ();
15741578 }
15751579 else {
15761580 BlockSha3 (state );
@@ -1582,8 +1586,9 @@ static int kyber_gen_matrix_k3_avx2(sword16* a, byte* seed, int transposed)
15821586 if (IS_INTEL_BMI2 (cpuid_flags )) {
15831587 sha3_block_bmi2 (state );
15841588 }
1585- else if (IS_INTEL_AVX2 (cpuid_flags )) {
1589+ else if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
15861590 sha3_block_avx2 (state );
1591+ RESTORE_VECTOR_REGISTERS ();
15871592 }
15881593 else {
15891594 BlockSha3 (state );
@@ -2058,8 +2063,9 @@ static int kyber_prf(wc_Shake* shake256, byte* out, unsigned int outLen,
20582063 if (IS_INTEL_BMI2 (cpuid_flags )) {
20592064 sha3_block_bmi2 (state );
20602065 }
2061- else if (IS_INTEL_AVX2 (cpuid_flags )) {
2066+ else if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
20622067 sha3_block_avx2 (state );
2068+ RESTORE_VECTOR_REGISTERS ();
20632069 }
20642070 else {
20652071 BlockSha3 (state );
@@ -2105,8 +2111,9 @@ int kyber_kdf(byte* seed, int seedLen, byte* out, int outLen)
21052111 if (IS_INTEL_BMI2 (cpuid_flags )) {
21062112 sha3_block_bmi2 (state );
21072113 }
2108- else if (IS_INTEL_AVX2 (cpuid_flags )) {
2114+ else if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
21092115 sha3_block_avx2 (state );
2116+ RESTORE_VECTOR_REGISTERS ();
21102117 }
21112118 else {
21122119 BlockSha3 (state );
@@ -2376,8 +2383,9 @@ int kyber_gen_matrix(KYBER_PRF_T* prf, sword16* a, int kp, byte* seed,
23762383 ret = kyber_gen_matrix_k2_aarch64 (a , seed , transposed );
23772384#else
23782385 #ifdef USE_INTEL_SPEEDUP
2379- if (IS_INTEL_AVX2 (cpuid_flags )) {
2386+ if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
23802387 ret = kyber_gen_matrix_k2_avx2 (a , seed , transposed );
2388+ RESTORE_VECTOR_REGISTERS ();
23812389 }
23822390 else
23832391 #endif
@@ -2394,8 +2402,9 @@ int kyber_gen_matrix(KYBER_PRF_T* prf, sword16* a, int kp, byte* seed,
23942402 ret = kyber_gen_matrix_k3_aarch64 (a , seed , transposed );
23952403#else
23962404 #ifdef USE_INTEL_SPEEDUP
2397- if (IS_INTEL_AVX2 (cpuid_flags )) {
2405+ if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
23982406 ret = kyber_gen_matrix_k3_avx2 (a , seed , transposed );
2407+ RESTORE_VECTOR_REGISTERS ();
23992408 }
24002409 else
24012410 #endif
@@ -2412,8 +2421,9 @@ int kyber_gen_matrix(KYBER_PRF_T* prf, sword16* a, int kp, byte* seed,
24122421 ret = kyber_gen_matrix_k4_aarch64 (a , seed , transposed );
24132422#else
24142423 #ifdef USE_INTEL_SPEEDUP
2415- if (IS_INTEL_AVX2 (cpuid_flags )) {
2424+ if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
24162425 ret = kyber_gen_matrix_k4_avx2 (a , seed , transposed );
2426+ RESTORE_VECTOR_REGISTERS ();
24172427 }
24182428 else
24192429 #endif
@@ -3213,8 +3223,9 @@ int kyber_get_noise(KYBER_PRF_T* prf, int kp, sword16* vec1,
32133223 ret = kyber_get_noise_k2_aarch64 (vec1 , vec2 , poly , seed );
32143224#else
32153225 #ifdef USE_INTEL_SPEEDUP
3216- if (IS_INTEL_AVX2 (cpuid_flags )) {
3226+ if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
32173227 ret = kyber_get_noise_k2_avx2 (prf , vec1 , vec2 , poly , seed );
3228+ RESTORE_VECTOR_REGISTERS ();
32183229 }
32193230 else
32203231 #endif
@@ -3236,8 +3247,9 @@ int kyber_get_noise(KYBER_PRF_T* prf, int kp, sword16* vec1,
32363247 ret = kyber_get_noise_k3_aarch64 (vec1 , vec2 , poly , seed );
32373248#else
32383249 #ifdef USE_INTEL_SPEEDUP
3239- if (IS_INTEL_AVX2 (cpuid_flags )) {
3250+ if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
32403251 ret = kyber_get_noise_k3_avx2 (vec1 , vec2 , poly , seed );
3252+ RESTORE_VECTOR_REGISTERS ();
32413253 }
32423254 else
32433255 #endif
@@ -3255,8 +3267,9 @@ int kyber_get_noise(KYBER_PRF_T* prf, int kp, sword16* vec1,
32553267 ret = kyber_get_noise_k4_aarch64 (vec1 , vec2 , poly , seed );
32563268#else
32573269 #ifdef USE_INTEL_SPEEDUP
3258- if (IS_INTEL_AVX2 (cpuid_flags )) {
3270+ if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
32593271 ret = kyber_get_noise_k4_avx2 (prf , vec1 , vec2 , poly , seed );
3272+ RESTORE_VECTOR_REGISTERS ();
32603273 }
32613274 else
32623275 #endif
@@ -3317,8 +3330,9 @@ int kyber_cmp(const byte* a, const byte* b, int sz)
33173330 int fail ;
33183331
33193332#ifdef USE_INTEL_SPEEDUP
3320- if (IS_INTEL_AVX2 (cpuid_flags )) {
3333+ if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
33213334 fail = kyber_cmp_avx2 (a , b , sz );
3335+ RESTORE_VECTOR_REGISTERS ();
33223336 }
33233337 else
33243338#endif
@@ -3555,8 +3569,9 @@ static void kyber_vec_compress_10_c(byte* r, sword16* v, unsigned int kp)
35553569void kyber_vec_compress_10 (byte * r , sword16 * v , unsigned int kp )
35563570{
35573571#ifdef USE_INTEL_SPEEDUP
3558- if (IS_INTEL_AVX2 (cpuid_flags )) {
3572+ if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
35593573 kyber_compress_10_avx2 (r , v , kp );
3574+ RESTORE_VECTOR_REGISTERS ();
35603575 }
35613576 else
35623577#endif
@@ -3648,8 +3663,9 @@ static void kyber_vec_compress_11_c(byte* r, sword16* v)
36483663void kyber_vec_compress_11 (byte * r , sword16 * v )
36493664{
36503665#ifdef USE_INTEL_SPEEDUP
3651- if (IS_INTEL_AVX2 (cpuid_flags )) {
3666+ if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
36523667 kyber_compress_11_avx2 (r , v , 4 );
3668+ RESTORE_VECTOR_REGISTERS ();
36533669 }
36543670 else
36553671#endif
@@ -3746,8 +3762,9 @@ void kyber_vec_decompress_10(sword16* v, const unsigned char* b,
37463762 unsigned int kp )
37473763{
37483764#ifdef USE_INTEL_SPEEDUP
3749- if (IS_INTEL_AVX2 (cpuid_flags )) {
3765+ if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
37503766 kyber_decompress_10_avx2 (v , b , kp );
3767+ RESTORE_VECTOR_REGISTERS ();
37513768 }
37523769 else
37533770#endif
@@ -3829,8 +3846,9 @@ static void kyber_vec_decompress_11_c(sword16* v, const unsigned char* b)
38293846void kyber_vec_decompress_11 (sword16 * v , const unsigned char * b )
38303847{
38313848#ifdef USE_INTEL_SPEEDUP
3832- if (IS_INTEL_AVX2 (cpuid_flags )) {
3849+ if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
38333850 kyber_decompress_11_avx2 (v , b , 4 );
3851+ RESTORE_VECTOR_REGISTERS ();
38343852 }
38353853 else
38363854#endif
@@ -3979,8 +3997,9 @@ static void kyber_compress_4_c(byte* b, sword16* p)
39793997void kyber_compress_4 (byte * b , sword16 * p )
39803998{
39813999#ifdef USE_INTEL_SPEEDUP
3982- if (IS_INTEL_AVX2 (cpuid_flags )) {
4000+ if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
39834001 kyber_compress_4_avx2 (b , p );
4002+ RESTORE_VECTOR_REGISTERS ();
39844003 }
39854004 else
39864005#endif
@@ -4052,8 +4071,9 @@ static void kyber_compress_5_c(byte* b, sword16* p)
40524071void kyber_compress_5 (byte * b , sword16 * p )
40534072{
40544073#ifdef USE_INTEL_SPEEDUP
4055- if (IS_INTEL_AVX2 (cpuid_flags )) {
4074+ if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
40564075 kyber_compress_5_avx2 (b , p );
4076+ RESTORE_VECTOR_REGISTERS ();
40574077 }
40584078 else
40594079#endif
@@ -4112,8 +4132,9 @@ static void kyber_decompress_4_c(sword16* p, const unsigned char* b)
41124132void kyber_decompress_4 (sword16 * p , const unsigned char * b )
41134133{
41144134#ifdef USE_INTEL_SPEEDUP
4115- if (IS_INTEL_AVX2 (cpuid_flags )) {
4135+ if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
41164136 kyber_decompress_4_avx2 (p , b );
4137+ RESTORE_VECTOR_REGISTERS ();
41174138 }
41184139 else
41194140#endif
@@ -4186,8 +4207,9 @@ static void kyber_decompress_5_c(sword16* p, const unsigned char* b)
41864207void kyber_decompress_5 (sword16 * p , const unsigned char * b )
41874208{
41884209#ifdef USE_INTEL_SPEEDUP
4189- if (IS_INTEL_AVX2 (cpuid_flags )) {
4210+ if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
41904211 kyber_decompress_5_avx2 (p , b );
4212+ RESTORE_VECTOR_REGISTERS ();
41914213 }
41924214 else
41934215#endif
@@ -4253,8 +4275,9 @@ static void kyber_from_msg_c(sword16* p, const byte* msg)
42534275void kyber_from_msg (sword16 * p , const byte * msg )
42544276{
42554277#ifdef USE_INTEL_SPEEDUP
4256- if (IS_INTEL_AVX2 (cpuid_flags )) {
4278+ if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
42574279 kyber_from_msg_avx2 (p , msg );
4280+ RESTORE_VECTOR_REGISTERS ();
42584281 }
42594282 else
42604283#endif
@@ -4342,9 +4365,10 @@ static void kyber_to_msg_c(byte* msg, sword16* p)
43424365void kyber_to_msg (byte * msg , sword16 * p )
43434366{
43444367#ifdef USE_INTEL_SPEEDUP
4345- if (IS_INTEL_AVX2 (cpuid_flags )) {
4368+ if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
43464369 /* Convert the polynomial into an array of bytes (message). */
43474370 kyber_to_msg_avx2 (msg , p );
4371+ RESTORE_VECTOR_REGISTERS ();
43484372 }
43494373 else
43504374#endif
@@ -4414,14 +4438,16 @@ static void kyber_from_bytes_c(sword16* p, const byte* b, int k)
44144438void kyber_from_bytes (sword16 * p , const byte * b , int k )
44154439{
44164440#ifdef USE_INTEL_SPEEDUP
4417- if (IS_INTEL_AVX2 (cpuid_flags )) {
4441+ if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
44184442 int i ;
44194443
44204444 for (i = 0 ; i < k ; i ++ ) {
44214445 kyber_from_bytes_avx2 (p , b );
44224446 p += KYBER_N ;
44234447 b += KYBER_POLY_SIZE ;
44244448 }
4449+
4450+ RESTORE_VECTOR_REGISTERS ();
44254451 }
44264452 else
44274453#endif
@@ -4473,14 +4499,16 @@ static void kyber_to_bytes_c(byte* b, sword16* p, int k)
44734499void kyber_to_bytes (byte * b , sword16 * p , int k )
44744500{
44754501#ifdef USE_INTEL_SPEEDUP
4476- if (IS_INTEL_AVX2 (cpuid_flags )) {
4502+ if (IS_INTEL_AVX2 (cpuid_flags ) && ( SAVE_VECTOR_REGISTERS2 () == 0 ) ) {
44774503 int i ;
44784504
44794505 for (i = 0 ; i < k ; i ++ ) {
44804506 kyber_to_bytes_avx2 (b , p );
44814507 p += KYBER_N ;
44824508 b += KYBER_POLY_SIZE ;
44834509 }
4510+
4511+ RESTORE_VECTOR_REGISTERS ();
44844512 }
44854513 else
44864514#endif
0 commit comments