@@ -1395,10 +1395,10 @@ static WC_INLINE int wc_chacha_encrypt_256(const word32* input, const byte* m,
13951395 /* Odd Round */
13961396 QUARTER_ROUND_ODD_4 ()
13971397 ODD_SHUFFLE_4 ()
1398+ "addi a3, a3, -1\n\t"
13981399 /* Even Round */
13991400 QUARTER_ROUND_EVEN_4 ()
14001401 EVEN_SHUFFLE_4 ()
1401- "addi a3, a3, -1\n\t"
14021402 "bnez a3, L_chacha20_riscv_256_loop\n\t"
14031403 /* Load message */
14041404 "mv t2, %[m]\n\t"
@@ -1770,13 +1770,13 @@ static WC_INLINE void wc_chacha_encrypt_64(const word32* input, const byte* m,
17701770 EIGHT_QUARTER_ROUNDS (REG_V0 , REG_V1 , REG_V2 , REG_V3 , REG_V12 )
17711771 EIGHT_QUARTER_ROUNDS (REG_V0 , REG_V1 , REG_V2 , REG_V3 , REG_V12 )
17721772 EIGHT_QUARTER_ROUNDS (REG_V0 , REG_V1 , REG_V2 , REG_V3 , REG_V12 )
1773+ "addi t1, %[bytes], -64\n\t"
17731774 /* Add back state */
17741775 VADD_VV (REG_V0 , REG_V0 , REG_V8 )
17751776 VADD_VV (REG_V1 , REG_V1 , REG_V9 )
17761777 VADD_VV (REG_V2 , REG_V2 , REG_V10 )
17771778 VADD_VV (REG_V3 , REG_V3 , REG_V11 )
1778- "addi t2, %[bytes], -64\n\t"
1779- "bltz t2, L_chacha20_riscv_64_lt_64\n\t"
1779+ "bltz t1, L_chacha20_riscv_64_lt_64\n\t"
17801780 "mv t2, %[m]\n\t"
17811781 VL4RE32_V (REG_V4 , REG_T2 )
17821782 VXOR_VV (REG_V4 , REG_V4 , REG_V0 )
@@ -1785,73 +1785,73 @@ static WC_INLINE void wc_chacha_encrypt_64(const word32* input, const byte* m,
17851785 VXOR_VV (REG_V7 , REG_V7 , REG_V3 )
17861786 "mv t2, %[c]\n\t"
17871787 VS4R_V (REG_V4 , REG_T2 )
1788+ "addi %[bytes], %[bytes], -64\n\t"
17881789 "addi %[c], %[c], 64\n\t"
17891790 "addi %[m], %[m], 64\n\t"
1790- "addi %[bytes], %[bytes], -64\n\t"
17911791 VADD_VV (REG_V11 , REG_V11 , REG_V13 )
17921792 "bnez %[bytes], L_chacha20_riscv_64_loop\n\t"
17931793 "beqz %[bytes], L_chacha20_riscv_64_done\n\t"
17941794 "\n"
17951795 "L_chacha20_riscv_64_lt_64:\n\t"
17961796 "mv t2, %[over]\n\t"
1797+ "addi t1, %[bytes], -32\n\t"
17971798 VS4R_V (REG_V0 , REG_T2 )
17981799
1799- "addi t2, %[bytes], -32\n\t"
1800- "bltz t2, L_chacha20_riscv_64_lt_32\n\t"
1800+ "bltz t1, L_chacha20_riscv_64_lt_32\n\t"
18011801 "mv t2, %[m]\n\t"
18021802 VL2RE32_V (REG_V4 , REG_T2 )
18031803 VXOR_VV (REG_V4 , REG_V4 , REG_V0 )
18041804 VXOR_VV (REG_V5 , REG_V5 , REG_V1 )
18051805 "mv t2, %[c]\n\t"
18061806 VS2R_V (REG_V4 , REG_T2 )
1807+ "addi %[bytes], %[bytes], -32\n\t"
18071808 "addi %[c], %[c], 32\n\t"
18081809 "addi %[m], %[m], 32\n\t"
1809- "addi %[bytes], %[bytes], -32\n\t"
18101810 "beqz %[bytes], L_chacha20_riscv_64_done\n\t"
18111811 VMVR_V (REG_V0 , REG_V2 , 2 )
18121812 "\n"
18131813 "L_chacha20_riscv_64_lt_32:\n\t"
1814- "addi t2 , %[bytes], -16\n\t"
1815- "bltz t2 , L_chacha20_riscv_64_lt_16\n\t"
1814+ "addi t1 , %[bytes], -16\n\t"
1815+ "bltz t1 , L_chacha20_riscv_64_lt_16\n\t"
18161816 "mv t2, %[m]\n\t"
18171817 VL1RE32_V (REG_V4 , REG_T2 )
18181818 VXOR_VV (REG_V4 , REG_V4 , REG_V0 )
18191819 "mv t2, %[c]\n\t"
18201820 VS1R_V (REG_V4 , REG_T2 )
1821+ "addi %[bytes], %[bytes], -16\n\t"
18211822 "addi %[c], %[c], 16\n\t"
18221823 "addi %[m], %[m], 16\n\t"
1823- "addi %[bytes], %[bytes], -16\n\t"
18241824 "beqz %[bytes], L_chacha20_riscv_64_done\n\t"
18251825 VMV_V_V (REG_V0 , REG_V1 )
18261826 "\n"
18271827 "L_chacha20_riscv_64_lt_16:\n\t"
1828- "addi t2 , %[bytes], -8\n\t"
1829- "bltz t2 , L_chacha20_riscv_64_lt_8\n\t"
1828+ "addi t1 , %[bytes], -8\n\t"
1829+ "bltz t1 , L_chacha20_riscv_64_lt_8\n\t"
18301830 VSETIVLI (REG_X0 , 2 , 1 , 1 , 0b011 , 0b000 )
18311831 VMV_X_S (REG_T0 , REG_V0 )
18321832 VSETIVLI (REG_X0 , 4 , 1 , 1 , 0b010 , 0b000 )
18331833 "ld t1, (%[m])\n\t"
18341834 "xor t1, t1, t0\n\t"
18351835 "sd t1, (%[c])\n\t"
1836+ "addi %[bytes], %[bytes], -8\n\t"
18361837 "addi %[c], %[c], 8\n\t"
18371838 "addi %[m], %[m], 8\n\t"
1838- "addi %[bytes], %[bytes], -8\n\t"
18391839 "beqz %[bytes], L_chacha20_riscv_64_done\n\t"
18401840 VSLIDEDOWN_VI (REG_V0 , REG_V0 , 2 )
18411841 "\n"
18421842 "L_chacha20_riscv_64_lt_8:\n\t"
1843+ "addi %[bytes], %[bytes], -1\n\t"
18431844 VSETIVLI (REG_X0 , 2 , 1 , 1 , 0b011 , 0b000 )
18441845 VMV_X_S (REG_T0 , REG_V0 )
18451846 VSETIVLI (REG_X0 , 4 , 1 , 1 , 0b010 , 0b000 )
1846- "addi %[bytes], %[bytes], -1\n\t"
18471847 "\n"
18481848 "L_chacha20_riscv_64_loop_lt_8:\n\t"
1849+ "addi %[bytes], %[bytes], -1\n\t"
18491850 "lb t1, (%[m])\n\t"
18501851 "addi %[m], %[m], 1\n\t"
18511852 "xor t1, t1, t0\n\t"
18521853 "sb t1, (%[c])\n\t"
18531854 "addi %[c], %[c], 1\n\t"
1854- "addi %[bytes], %[bytes], -1\n\t"
18551855 "srli t0, t0, 8\n\t"
18561856 "bgez %[bytes], L_chacha20_riscv_64_loop_lt_8\n\t"
18571857 "\n"
@@ -2085,9 +2085,11 @@ static void wc_chacha_encrypt_bytes(ChaCha* ctx, const byte* m, byte* c,
20852085static WC_INLINE void wc_chacha_encrypt (const word32 * input , const byte * m ,
20862086 byte * c , word32 bytes , word32 * over )
20872087{
2088- word64 bytes64 = (word64 )bytes ;
2089-
20902088 __asm__ __volatile__ (
2089+ /* Ensure 64-bit bytes has top bits clear. */
2090+ "slli %[bytes], %[bytes], 32\n\t"
2091+ "srli %[bytes], %[bytes], 32\n\t"
2092+
20912093 "L_chacha20_riscv_outer:\n\t"
20922094 /* Move state into regular registers */
20932095 "ld a4, 0(%[input])\n\t"
@@ -2113,11 +2115,13 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m,
21132115 "L_chacha20_riscv_loop:\n\t"
21142116 /* Odd Round */
21152117 QUARTER_ROUND_ODD ()
2118+ "addi a3, a3, -1\n\t"
21162119 /* Even Round */
21172120 QUARTER_ROUND_EVEN ()
2118- "addi a3, a3, -1\n\t"
21192121 "bnez a3, L_chacha20_riscv_loop\n\t"
21202122
2123+ "addi %[bytes], %[bytes], -64\n\t"
2124+
21212125 "ld t0, 0(%[input])\n\t"
21222126 "ld t1, 8(%[input])\n\t"
21232127 "ld t2, 16(%[input])\n\t"
@@ -2141,89 +2145,20 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m,
21412145 "add s2, s2, t0\n\t"
21422146 "add s4, s4, t1\n\t"
21432147 "add s6, s6, t2\n\t"
2148+ "addi t2, t2, 1\n\t"
21442149 "add s8, s8, s1\n\t"
21452150 "srli t0, t0, 32\n\t"
21462151 "srli t1, t1, 32\n\t"
2152+ "sw t2, 48(%[input])\n\t"
21472153 "srli t2, t2, 32\n\t"
21482154 "srli s1, s1, 32\n\t"
21492155 "add s3, s3, t0\n\t"
21502156 "add s5, s5, t1\n\t"
21512157 "add s7, s7, t2\n\t"
21522158 "add s9, s9, s1\n\t"
21532159
2154- "addi %[bytes], %[bytes], -64\n\t"
2155- "bgez %[bytes], L_chacha20_riscv_xor\n\t"
2156- "addi a3, %[bytes], 64\n\t"
2157-
2158- "sw a4, 0(%[over])\n\t"
2159- "sw a5, 4(%[over])\n\t"
2160- "sw a6, 8(%[over])\n\t"
2161- "sw a7, 12(%[over])\n\t"
2162- "sw t3, 16(%[over])\n\t"
2163- "sw t4, 20(%[over])\n\t"
2164- "sw t5, 24(%[over])\n\t"
2165- "sw t6, 28(%[over])\n\t"
2166- "sw s2, 32(%[over])\n\t"
2167- "sw s3, 36(%[over])\n\t"
2168- "sw s4, 40(%[over])\n\t"
2169- "sw s5, 44(%[over])\n\t"
2170- "sw s6, 48(%[over])\n\t"
2171- "sw s7, 52(%[over])\n\t"
2172- "sw s8, 56(%[over])\n\t"
2173- "sw s9, 60(%[over])\n\t"
2174-
2175- "addi t0, a3, -8\n\t"
2176- "bltz t0, L_chacha20_riscv_32bit\n\t"
2177- "addi a3, a3, -1\n\t"
2178- "L_chacha20_riscv_64bit_loop:\n\t"
2179- "ld t0, (%[m])\n\t"
2180- "ld t1, (%[over])\n\t"
2181- "xor t0, t0, t1\n\t"
2182- "sd t0, (%[c])\n\t"
2183- "addi %[m], %[m], 8\n\t"
2184- "addi %[c], %[c], 8\n\t"
2185- "addi %[over], %[over], 8\n\t"
2186- "addi a3, a3, -8\n\t"
2187- "bgez a3, L_chacha20_riscv_64bit_loop\n\t"
2188- "addi a3, a3, 1\n\t"
2189-
2190- "L_chacha20_riscv_32bit:\n\t"
2191- "addi t0, a3, -4\n\t"
2192- "bltz t0, L_chacha20_riscv_16bit\n\t"
2193- "lw t0, (%[m])\n\t"
2194- "lw t1, (%[over])\n\t"
2195- "xor t0, t0, t1\n\t"
2196- "sw t0, (%[c])\n\t"
2197- "addi %[m], %[m], 4\n\t"
2198- "addi %[c], %[c], 4\n\t"
2199- "addi %[over], %[over], 4\n\t"
2200-
2201- "L_chacha20_riscv_16bit:\n\t"
2202- "addi t0, a3, -2\n\t"
2203- "bltz t0, L_chacha20_riscv_8bit\n\t"
2204- "lh t0, (%[m])\n\t"
2205- "lh t1, (%[over])\n\t"
2206- "xor t0, t0, t1\n\t"
2207- "sh t0, (%[c])\n\t"
2208- "addi %[m], %[m], 2\n\t"
2209- "addi %[c], %[c], 2\n\t"
2210- "addi %[over], %[over], 2\n\t"
2211-
2212- "L_chacha20_riscv_8bit:\n\t"
2213- "addi t0, a3, -1\n\t"
2214- "bltz t0, L_chacha20_riscv_bytes_done\n\t"
2215- "lb t0, (%[m])\n\t"
2216- "lb t1, (%[over])\n\t"
2217- "xor t0, t0, t1\n\t"
2218- "sb t0, (%[c])\n\t"
2219-
2220- "L_chacha20_riscv_bytes_done:\n\t"
2221- "lw t0, 48(%[input])\n\t"
2222- "addi t0, t0, 1\n\t"
2223- "sw t0, 48(%[input])\n\t"
2224- "bltz %[bytes], L_chacha20_riscv_done\n\t"
2160+ "bltz %[bytes], L_chacha20_riscv_over\n\t"
22252161
2226- "L_chacha20_riscv_xor:\n\t"
22272162#if !defined(WOLFSSL_RISCV_BIT_MANIPULATION )
22282163 "ld t0, 0(%[m])\n\t"
22292164 "ld t1, 8(%[m])\n\t"
@@ -2308,16 +2243,80 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m,
23082243 "sd s8, 56(%[c])\n\t"
23092244#endif
23102245
2311- "lw t0, 48(%[input])\n\t"
23122246 "addi %[m], %[m], 64\n\t"
2313- "addi t0, t0, 1\n\t"
23142247 "addi %[c], %[c], 64\n\t"
2315- "sw t0, 48(%[input])\n\t"
23162248
23172249 "bnez %[bytes], L_chacha20_riscv_outer\n\t"
2250+ "beqz %[bytes], L_chacha20_riscv_done\n\t"
2251+
2252+ "L_chacha20_riscv_over:\n\t"
2253+ "addi a3, %[bytes], 64\n\t"
2254+
2255+ "sw a4, 0(%[over])\n\t"
2256+ "sw a5, 4(%[over])\n\t"
2257+ "sw a6, 8(%[over])\n\t"
2258+ "sw a7, 12(%[over])\n\t"
2259+ "sw t3, 16(%[over])\n\t"
2260+ "sw t4, 20(%[over])\n\t"
2261+ "sw t5, 24(%[over])\n\t"
2262+ "sw t6, 28(%[over])\n\t"
2263+ "sw s2, 32(%[over])\n\t"
2264+ "sw s3, 36(%[over])\n\t"
2265+ "sw s4, 40(%[over])\n\t"
2266+ "sw s5, 44(%[over])\n\t"
2267+ "sw s6, 48(%[over])\n\t"
2268+ "sw s7, 52(%[over])\n\t"
2269+ "sw s8, 56(%[over])\n\t"
2270+ "sw s9, 60(%[over])\n\t"
2271+
2272+ "addi t0, a3, -8\n\t"
2273+ "bltz t0, L_chacha20_riscv_32bit\n\t"
2274+ "addi a3, a3, -1\n\t"
2275+ "L_chacha20_riscv_64bit_loop:\n\t"
2276+ "ld t0, (%[m])\n\t"
2277+ "ld t1, (%[over])\n\t"
2278+ "xor t0, t0, t1\n\t"
2279+ "sd t0, (%[c])\n\t"
2280+ "addi %[m], %[m], 8\n\t"
2281+ "addi %[c], %[c], 8\n\t"
2282+ "addi %[over], %[over], 8\n\t"
2283+ "addi a3, a3, -8\n\t"
2284+ "bgez a3, L_chacha20_riscv_64bit_loop\n\t"
2285+ "addi a3, a3, 1\n\t"
2286+
2287+ "L_chacha20_riscv_32bit:\n\t"
2288+ "addi t0, a3, -4\n\t"
2289+ "bltz t0, L_chacha20_riscv_16bit\n\t"
2290+ "lw t0, (%[m])\n\t"
2291+ "lw t1, (%[over])\n\t"
2292+ "xor t0, t0, t1\n\t"
2293+ "sw t0, (%[c])\n\t"
2294+ "addi %[m], %[m], 4\n\t"
2295+ "addi %[c], %[c], 4\n\t"
2296+ "addi %[over], %[over], 4\n\t"
2297+
2298+ "L_chacha20_riscv_16bit:\n\t"
2299+ "addi t0, a3, -2\n\t"
2300+ "bltz t0, L_chacha20_riscv_8bit\n\t"
2301+ "lh t0, (%[m])\n\t"
2302+ "lh t1, (%[over])\n\t"
2303+ "xor t0, t0, t1\n\t"
2304+ "sh t0, (%[c])\n\t"
2305+ "addi %[m], %[m], 2\n\t"
2306+ "addi %[c], %[c], 2\n\t"
2307+ "addi %[over], %[over], 2\n\t"
2308+
2309+ "L_chacha20_riscv_8bit:\n\t"
2310+ "addi t0, a3, -1\n\t"
2311+ "bltz t0, L_chacha20_riscv_done\n\t\n\t"
2312+ "lb t0, (%[m])\n\t"
2313+ "lb t1, (%[over])\n\t"
2314+ "xor t0, t0, t1\n\t"
2315+ "sb t0, (%[c])\n\t"
2316+ "bltz %[bytes], L_chacha20_riscv_done\n\t"
23182317
23192318 "L_chacha20_riscv_done:\n\t"
2320- : [m ] "+ r " (m), [c] " + r " (c), [bytes] " + r " (bytes64 ), [over] " + r " (over)
2319+ : [m ] "+ r " (m), [c] " + r " (c), [bytes] " + r " (bytes ), [over] " + r " (over)
23212320 : [input ] "r" (input )
23222321 : "memory ", "t0" , "t1" , "t2" , "s1" , "a3" ,
23232322 "t3" , "t4" , "t5" , "t6" ,
@@ -2330,12 +2329,12 @@ static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m,
23302329/**
23312330 * Encrypt a stream of bytes
 *
 * Passes the whole request to wc_chacha_encrypt() along with the state words
 * in ctx->X and the ctx->over buffer, then stores in ctx->left the distance
 * from bytes up to the next multiple of CHACHA_CHUNK_BYTES (0 when bytes is
 * already a multiple). wc_Chacha_Process() uses ctx->left on a later call to
 * index into ctx->over at offset CHACHA_CHUNK_BYTES - ctx->left.
 *
 * NOTE(review): masking with (CHACHA_CHUNK_BYTES - 1) only produces that
 * remainder if CHACHA_CHUNK_BYTES is a power of two - confirm against its
 * definition.
 *
 * @param [in, out] ctx    ChaCha context; X and over are handed to
 *                         wc_chacha_encrypt(), left is overwritten.
 * @param [in]      m      Buffer the input bytes are read from.
 * @param [out]     c      Buffer the result bytes are written to.
 * @param [in]      bytes  Number of bytes to process. The visible caller,
 *                         wc_Chacha_Process(), only calls with a value
 *                         greater than zero.
23322331 */
2333- static void wc_chacha_encrypt_bytes (ChaCha * ctx , const byte * m , byte * c ,
2334- word32 bytes )
2332+ static WC_INLINE void wc_chacha_encrypt_bytes (ChaCha * ctx , const byte * m ,
2333+ byte * c , word32 bytes )
23352334{
23362335 wc_chacha_encrypt (ctx -> X , m , c , bytes , ctx -> over );
2337- ctx -> left = CHACHA_CHUNK_BYTES - (bytes & (CHACHA_CHUNK_BYTES - 1 ));
2338- ctx -> left &= CHACHA_CHUNK_BYTES - 1 ;
2336+ ctx -> left = ( CHACHA_CHUNK_BYTES - (bytes & (CHACHA_CHUNK_BYTES - 1 ))) &
2337+ ( CHACHA_CHUNK_BYTES - 1 ) ;
23392338}
23402339#endif
23412340
@@ -2350,24 +2349,20 @@ int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input,
23502349 if ((ctx == NULL ) || (output == NULL ) || (input == NULL )) {
23512350 ret = BAD_FUNC_ARG ;
23522351 }
2353- else {
2354- /* handle left overs */
2355- if (msglen > 0 && ctx -> left > 0 ) {
2356- byte * out ;
2357- word32 i ;
2358-
2359- out = (byte * )ctx -> over + CHACHA_CHUNK_BYTES - ctx -> left ;
2360- for (i = 0 ; i < msglen && i < ctx -> left ; i ++ ) {
2361- output [i ] = (byte )(input [i ] ^ out [i ]);
2362- }
2363- ctx -> left -= i ;
2364-
2365- msglen -= i ;
2366- output += i ;
2367- input += i ;
2352+ else if (msglen > 0 ) {
2353+ if (ctx -> left > 0 ) {
2354+ word32 processed = min (msglen , ctx -> left );
2355+ byte * out = (byte * )ctx -> over + CHACHA_CHUNK_BYTES - ctx -> left ;
2356+
2357+ xorbufout (output , input , out , processed );
2358+
2359+ ctx -> left -= processed ;
2360+ msglen -= processed ;
2361+ output += processed ;
2362+ input += processed ;
23682363 }
23692364
2370- if (msglen != 0 ) {
2365+ if (msglen > 0 ) {
23712366 wc_chacha_encrypt_bytes (ctx , input , output , msglen );
23722367 }
23732368 }
0 commit comments