...
 // Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
 //
 // Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+// Copyright (C) 2019 Google LLC <ebiggers@google.com>
 //
 // This program is free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License version 2 as
 // published by the Free Software Foundation.
 //

+// Derived from the x86 version:
 //
 // Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
 //
...
 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
-// Function API:
-// UINT16 crc_t10dif_pcl(
-//		UINT16 init_crc, //initial CRC value, 16 bits
-//		const unsigned char *buf, //buffer pointer to calculate CRC on
-//		UINT64 len //buffer length in bytes (64-bit data)
-// );
-//
 // Reference paper titled "Fast CRC Computation for Generic
 // Polynomials Using PCLMULQDQ Instruction"
 // URL: http://www.intel.com/content/dam/www/public/us/en/documents
 // /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
-//
 //

 #include <linux/linkage.h>
 #include <asm/assembler.h>

 	.text
-	.cpu		generic+crypto
+	.arch		armv8-a+crypto

-	arg1_low32	.req	w19
-	arg2		.req	x20
-	arg3		.req	x21
+	init_crc	.req	w0
+	buf		.req	x1
+	len		.req	x2
+	fold_consts_ptr	.req	x3

-	vzr		.req	v13
+	fold_consts	.req	v10

-ENTRY(crc_t10dif_pmull)
-	frame_push	3, 128
+	ad		.req	v14

-	mov		arg1_low32, w0
-	mov		arg2, x1
-	mov		arg3, x2
+	k00_16		.req	v15
+	k32_48		.req	v16

-	movi		vzr.16b, #0		// init zero register
+	t3		.req	v17
+	t4		.req	v18
+	t5		.req	v19
+	t6		.req	v20
+	t7		.req	v21
+	t8		.req	v22
+	t9		.req	v23

-	// adjust the 16-bit initial_crc value, scale it to 32 bits
-	lsl		arg1_low32, arg1_low32, #16
+	perm1		.req	v24
+	perm2		.req	v25
+	perm3		.req	v26
+	perm4		.req	v27

-	// check if smaller than 256
-	cmp		arg3, #256
+	bd1		.req	v28
+	bd2		.req	v29
+	bd3		.req	v30
+	bd4		.req	v31

-	// for sizes less than 128, we can't fold 64B at a time...
-	b.lt		_less_than_128
+	.macro		__pmull_init_p64
+	.endm

-	// load the initial crc value
-	// crc value does not need to be byte-reflected, but it needs
-	// to be moved to the high part of the register.
-	// because data will be byte-reflected and will align with
-	// initial crc at correct place.
-	movi		v10.16b, #0
-	mov		v10.s[3], arg1_low32	// initial crc
+	.macro		__pmull_pre_p64, bd
+	.endm

-	// receive the initial 64B data, xor the initial crc value
-	ldp		q0, q1, [arg2]
-	ldp		q2, q3, [arg2, #0x20]
-	ldp		q4, q5, [arg2, #0x40]
-	ldp		q6, q7, [arg2, #0x60]
-	add		arg2, arg2, #0x80
+	.macro		__pmull_init_p8
+	// k00_16 := 0x0000000000000000_000000000000ffff
+	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
+	movi		k32_48.2d, #0xffffffff
+	mov		k32_48.h[2], k32_48.h[0]
+	ushr		k00_16.2d, k32_48.2d, #32

-CPU_LE(	rev64		v0.16b, v0.16b			)
-CPU_LE(	rev64		v1.16b, v1.16b			)
-CPU_LE(	rev64		v2.16b, v2.16b			)
-CPU_LE(	rev64		v3.16b, v3.16b			)
-CPU_LE(	rev64		v4.16b, v4.16b			)
-CPU_LE(	rev64		v5.16b, v5.16b			)
-CPU_LE(	rev64		v6.16b, v6.16b			)
-CPU_LE(	rev64		v7.16b, v7.16b			)
+	// prepare the permutation vectors
+	mov_q		x5, 0x080f0e0d0c0b0a09
+	movi		perm4.8b, #8
+	dup		perm1.2d, x5
+	eor		perm1.16b, perm1.16b, perm4.16b
+	ushr		perm2.2d, perm1.2d, #8
+	ushr		perm3.2d, perm1.2d, #16
+	ushr		perm4.2d, perm1.2d, #24
+	sli		perm2.2d, perm1.2d, #56
+	sli		perm3.2d, perm1.2d, #48
+	sli		perm4.2d, perm1.2d, #40
+	.endm

-CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
-CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
-CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
-CPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
-CPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
-CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
-CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
-CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
+	.macro		__pmull_pre_p8, bd
+	tbl		bd1.16b, {\bd\().16b}, perm1.16b
+	tbl		bd2.16b, {\bd\().16b}, perm2.16b
+	tbl		bd3.16b, {\bd\().16b}, perm3.16b
+	tbl		bd4.16b, {\bd\().16b}, perm4.16b
+	.endm

-	// XOR the initial_crc value
-	eor		v0.16b, v0.16b, v10.16b
+SYM_FUNC_START_LOCAL(__pmull_p8_core)
+.L__pmull_p8_core:
+	ext		t4.8b, ad.8b, ad.8b, #1			// A1
+	ext		t5.8b, ad.8b, ad.8b, #2			// A2
+	ext		t6.8b, ad.8b, ad.8b, #3			// A3

-	ldr_l		q10, rk3, x8	// xmm10 has rk3 and rk4
-					// type of pmull instruction
-					// will determine which constant to use
+	pmull		t4.8h, t4.8b, fold_consts.8b		// F = A1*B
+	pmull		t8.8h, ad.8b, bd1.8b			// E = A*B1
+	pmull		t5.8h, t5.8b, fold_consts.8b		// H = A2*B
+	pmull		t7.8h, ad.8b, bd2.8b			// G = A*B2
+	pmull		t6.8h, t6.8b, fold_consts.8b		// J = A3*B
+	pmull		t9.8h, ad.8b, bd3.8b			// I = A*B3
+	pmull		t3.8h, ad.8b, bd4.8b			// K = A*B4
+	b		0f

-	//
-	// we subtract 256 instead of 128 to save one instruction from the loop
-	//
-	sub		arg3, arg3, #256
+.L__pmull_p8_core2:
+	tbl		t4.16b, {ad.16b}, perm1.16b		// A1
+	tbl		t5.16b, {ad.16b}, perm2.16b		// A2
+	tbl		t6.16b, {ad.16b}, perm3.16b		// A3

-	// at this section of the code, there is 64*x+y (0<=y<64) bytes of
-	// buffer. The _fold_64_B_loop will fold 64B at a time
-	// until we have 64+y Bytes of buffer
+	pmull2		t4.8h, t4.16b, fold_consts.16b		// F = A1*B
+	pmull2		t8.8h, ad.16b, bd1.16b			// E = A*B1
+	pmull2		t5.8h, t5.16b, fold_consts.16b		// H = A2*B
+	pmull2		t7.8h, ad.16b, bd2.16b			// G = A*B2
+	pmull2		t6.8h, t6.16b, fold_consts.16b		// J = A3*B
+	pmull2		t9.8h, ad.16b, bd3.16b			// I = A*B3
+	pmull2		t3.8h, ad.16b, bd4.16b			// K = A*B4

+0:	eor		t4.16b, t4.16b, t8.16b			// L = E + F
+	eor		t5.16b, t5.16b, t7.16b			// M = G + H
+	eor		t6.16b, t6.16b, t9.16b			// N = I + J

-	// fold 64B at a time. This section of the code folds 4 vector
-	// registers in parallel
-_fold_64_B_loop:
+	uzp1		t8.2d, t4.2d, t5.2d
+	uzp2		t4.2d, t4.2d, t5.2d
+	uzp1		t7.2d, t6.2d, t3.2d
+	uzp2		t6.2d, t6.2d, t3.2d

-	.macro		fold64, reg1, reg2
-	ldp		q11, q12, [arg2], #0x20
+	// t4 = (L) (P0 + P1) << 8
+	// t5 = (M) (P2 + P3) << 16
+	eor		t8.16b, t8.16b, t4.16b
+	and		t4.16b, t4.16b, k32_48.16b

-	pmull2		v8.1q, \reg1\().2d, v10.2d
-	pmull		\reg1\().1q, \reg1\().1d, v10.1d
+	// t6 = (N) (P4 + P5) << 24
+	// t7 = (K) (P6 + P7) << 32
+	eor		t7.16b, t7.16b, t6.16b
+	and		t6.16b, t6.16b, k00_16.16b
+
+	eor		t8.16b, t8.16b, t4.16b
+	eor		t7.16b, t7.16b, t6.16b
+
+	zip2		t5.2d, t8.2d, t4.2d
+	zip1		t4.2d, t8.2d, t4.2d
+	zip2		t3.2d, t7.2d, t6.2d
+	zip1		t6.2d, t7.2d, t6.2d
+
+	ext		t4.16b, t4.16b, t4.16b, #15
+	ext		t5.16b, t5.16b, t5.16b, #14
+	ext		t6.16b, t6.16b, t6.16b, #13
+	ext		t3.16b, t3.16b, t3.16b, #12
+
+	eor		t4.16b, t4.16b, t5.16b
+	eor		t6.16b, t6.16b, t3.16b
+	ret
+SYM_FUNC_END(__pmull_p8_core)
+
+	.macro		__pmull_p8, rq, ad, bd, i
+	.ifnc		\bd, fold_consts
+	.err
+	.endif
+	mov		ad.16b, \ad\().16b
+	.ifb		\i
+	pmull		\rq\().8h, \ad\().8b, \bd\().8b		// D = A*B
+	.else
+	pmull2		\rq\().8h, \ad\().16b, \bd\().16b	// D = A*B
+	.endif
+
+	bl		.L__pmull_p8_core\i
+
+	eor		\rq\().16b, \rq\().16b, t4.16b
+	eor		\rq\().16b, \rq\().16b, t6.16b
+	.endm
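
A note for readers of the p8 path: pmull/pmull2 on the .1d/.2d lanes, used by
the p64 path, perform a 64 x 64 -> 128-bit carryless multiply, and that is the
operation __pmull_p8_core rebuilds out of the baseline 8-bit pmull by XORing
byte-rotated partial products back together.  A scalar C model of the operation
being emulated (a reference sketch only, not kernel code):

	#include <stdint.h>

	/* 64 x 64 -> 128-bit carryless (polynomial) multiply: hi:lo is the
	 * XOR of 'a << i' over every set bit i of 'b', with no carries. */
	static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
	{
		uint64_t h = 0, l = 0;

		for (int i = 0; i < 64; i++) {
			if ((b >> i) & 1) {
				l ^= a << i;
				h ^= i ? a >> (64 - i) : 0;
			}
		}
		*hi = h;
		*lo = l;
	}

The sketches after the folding macros below reuse this helper.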
---|
+
+	// Fold reg1, reg2 into the next 32 data bytes, storing the result back
+	// into reg1, reg2.
+	.macro		fold_32_bytes, p, reg1, reg2
+	ldp		q11, q12, [buf], #0x20
+
+	__pmull_\p	v8, \reg1, fold_consts, 2
+	__pmull_\p	\reg1, \reg1, fold_consts

 CPU_LE(	rev64		v11.16b, v11.16b		)
 CPU_LE(	rev64		v12.16b, v12.16b		)

-	pmull2		v9.1q, \reg2\().2d, v10.2d
-	pmull		\reg2\().1q, \reg2\().1d, v10.1d
+	__pmull_\p	v9, \reg2, fold_consts, 2
+	__pmull_\p	\reg2, \reg2, fold_consts

 CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
 CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
...
 	eor		\reg2\().16b, \reg2\().16b, v12.16b
 	.endm
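
In scalar terms, the fold that fold_32_bytes applies to each 16-byte register
works like this: treat the register as the polynomial acc = hi*x^64 + lo;
folding it into the data 1024 bits ahead replaces it with
hi*(x^1088 mod G) + lo*(x^1024 mod G) + data, which is the pmull2/pmull pair
above combined with the two .Lfold_across_128_bytes_consts values.  A sketch
reusing clmul64() from the previous note (the names are illustrative, not from
the kernel):

	struct u128 { uint64_t hi, lo; };

	/* Fold 'acc' across a gap of n bits into 'data', given the two
	 * precomputed constants k_hi = x^(n+64) mod G and k_lo = x^n mod G. */
	static struct u128 fold_16B(struct u128 acc, struct u128 data,
				    uint64_t k_hi, uint64_t k_lo)
	{
		uint64_t phi, plo, qhi, qlo;

		clmul64(acc.hi, k_hi, &phi, &plo);	/* high half */
		clmul64(acc.lo, k_lo, &qhi, &qlo);	/* low half */
		data.hi ^= phi ^ qhi;
		data.lo ^= plo ^ qlo;
		return data;
	}

The same step serves fold_16_bytes below with the shorter-distance constants.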
---|

-	fold64		v0, v1
-	fold64		v2, v3
-	fold64		v4, v5
-	fold64		v6, v7
-
-	subs		arg3, arg3, #128
-
-	// check if there is another 64B in the buffer to be able to fold
-	b.lt		_fold_64_B_end
-
-	if_will_cond_yield_neon
-	stp		q0, q1, [sp, #.Lframe_local_offset]
-	stp		q2, q3, [sp, #.Lframe_local_offset + 32]
-	stp		q4, q5, [sp, #.Lframe_local_offset + 64]
-	stp		q6, q7, [sp, #.Lframe_local_offset + 96]
-	do_cond_yield_neon
-	ldp		q0, q1, [sp, #.Lframe_local_offset]
-	ldp		q2, q3, [sp, #.Lframe_local_offset + 32]
-	ldp		q4, q5, [sp, #.Lframe_local_offset + 64]
-	ldp		q6, q7, [sp, #.Lframe_local_offset + 96]
-	ldr_l		q10, rk3, x8
-	movi		vzr.16b, #0		// init zero register
-	endif_yield_neon
-
-	b		_fold_64_B_loop
-
-_fold_64_B_end:
-	// at this point, the buffer pointer is pointing at the last y Bytes
-	// of the buffer the 64B of folded data is in 4 of the vector
-	// registers: v0, v1, v2, v3
-
-	// fold the 8 vector registers to 1 vector register with different
-	// constants
-
-	ldr_l		q10, rk9, x8
-
-	.macro		fold16, reg, rk
-	pmull		v8.1q, \reg\().1d, v10.1d
-	pmull2		\reg\().1q, \reg\().2d, v10.2d
-	.ifnb		\rk
-	ldr_l		q10, \rk, x8
+	// Fold src_reg into dst_reg, optionally loading the next fold constants
+	.macro		fold_16_bytes, p, src_reg, dst_reg, load_next_consts
+	__pmull_\p	v8, \src_reg, fold_consts
+	__pmull_\p	\src_reg, \src_reg, fold_consts, 2
+	.ifnb		\load_next_consts
+	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
+	__pmull_pre_\p	fold_consts
 	.endif
-	eor		v7.16b, v7.16b, v8.16b
-	eor		v7.16b, v7.16b, \reg\().16b
+	eor		\dst_reg\().16b, \dst_reg\().16b, v8.16b
+	eor		\dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
 	.endm

-	fold16		v0, rk11
-	fold16		v1, rk13
-	fold16		v2, rk15
-	fold16		v3, rk17
-	fold16		v4, rk19
-	fold16		v5, rk1
-	fold16		v6
+	.macro		__pmull_p64, rd, rn, rm, n
+	.ifb		\n
+	pmull		\rd\().1q, \rn\().1d, \rm\().1d
+	.else
+	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
+	.endif
+	.endm

-	// instead of 64, we add 48 to the loop counter to save 1 instruction
-	// from the loop instead of a cmp instruction, we use the negative
-	// flag with the jl instruction
-	adds		arg3, arg3, #(128-16)
-	b.lt		_final_reduction_for_128
+	.macro		crc_t10dif_pmull, p
+	__pmull_init_\p

-	// now we have 16+y bytes left to reduce. 16 Bytes is in register v7
-	// and the rest is in memory. We can fold 16 bytes at a time if y>=16
-	// continue folding 16B at a time
+	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
+	cmp		len, #256
+	b.lt		.Lless_than_256_bytes_\@

-_16B_reduction_loop:
-	pmull		v8.1q, v7.1d, v10.1d
-	pmull2		v7.1q, v7.2d, v10.2d
+	adr_l		fold_consts_ptr, .Lfold_across_128_bytes_consts
+
+	// Load the first 128 data bytes.  Byte swapping is necessary to make
+	// the bit order match the polynomial coefficient order.
+	ldp		q0, q1, [buf]
+	ldp		q2, q3, [buf, #0x20]
+	ldp		q4, q5, [buf, #0x40]
+	ldp		q6, q7, [buf, #0x60]
+	add		buf, buf, #0x80
+CPU_LE(	rev64		v0.16b, v0.16b			)
+CPU_LE(	rev64		v1.16b, v1.16b			)
+CPU_LE(	rev64		v2.16b, v2.16b			)
+CPU_LE(	rev64		v3.16b, v3.16b			)
+CPU_LE(	rev64		v4.16b, v4.16b			)
+CPU_LE(	rev64		v5.16b, v5.16b			)
+CPU_LE(	rev64		v6.16b, v6.16b			)
+CPU_LE(	rev64		v7.16b, v7.16b			)
+CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
+CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
+CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
+CPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
+CPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
+CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
+CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
+CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
+
+	// XOR the first 16 data *bits* with the initial CRC value.
+	movi		v8.16b, #0
+	mov		v8.h[7], init_crc
+	eor		v0.16b, v0.16b, v8.16b
+
+	// Load the constants for folding across 128 bytes.
+	ld1		{fold_consts.2d}, [fold_consts_ptr]
+	__pmull_pre_\p	fold_consts
+
+	// Subtract 128 for the 128 data bytes just consumed.  Subtract another
+	// 128 to simplify the termination condition of the following loop.
+	sub		len, len, #256
+
+	// While >= 128 data bytes remain (not counting v0-v7), fold the 128
+	// bytes v0-v7 into them, storing the result back into v0-v7.
+.Lfold_128_bytes_loop_\@:
+	fold_32_bytes	\p, v0, v1
+	fold_32_bytes	\p, v2, v3
+	fold_32_bytes	\p, v4, v5
+	fold_32_bytes	\p, v6, v7
+
+	subs		len, len, #128
+	b.ge		.Lfold_128_bytes_loop_\@
+
+	// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
+
+	// Fold across 64 bytes.
+	add		fold_consts_ptr, fold_consts_ptr, #16
+	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
+	__pmull_pre_\p	fold_consts
+	fold_16_bytes	\p, v0, v4
+	fold_16_bytes	\p, v1, v5
+	fold_16_bytes	\p, v2, v6
+	fold_16_bytes	\p, v3, v7, 1
+	// Fold across 32 bytes.
+	fold_16_bytes	\p, v4, v6
+	fold_16_bytes	\p, v5, v7, 1
+	// Fold across 16 bytes.
+	fold_16_bytes	\p, v6, v7
+
+	// Add 128 to get the correct number of data bytes remaining in 0...127
+	// (not counting v7), following the previous extra subtraction by 128.
+	// Then subtract 16 to simplify the termination condition of the
+	// following loop.
+	adds		len, len, #(128-16)
+
+	// While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
+	// into them, storing the result back into v7.
+	b.lt		.Lfold_16_bytes_loop_done_\@
+.Lfold_16_bytes_loop_\@:
+	__pmull_\p	v8, v7, fold_consts
+	__pmull_\p	v7, v7, fold_consts, 2
 	eor		v7.16b, v7.16b, v8.16b
-
-	ldr		q0, [arg2], #16
+	ldr		q0, [buf], #16
 CPU_LE(	rev64		v0.16b, v0.16b )
 CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8 )
 	eor		v7.16b, v7.16b, v0.16b
-	subs		arg3, arg3, #16
+	subs		len, len, #16
+	b.ge		.Lfold_16_bytes_loop_\@

-	// instead of a cmp instruction, we utilize the flags with the
-	// jge instruction equivalent of: cmp arg3, 16-16
-	// check if there is any more 16B in the buffer to be able to fold
-	b.ge		_16B_reduction_loop
+.Lfold_16_bytes_loop_done_\@:
+	// Add 16 to get the correct number of data bytes remaining in 0...15
+	// (not counting v7), following the previous extra subtraction by 16.
+	adds		len, len, #16
+	b.eq		.Lreduce_final_16_bytes_\@

-	// now we have 16+z bytes left to reduce, where 0<= z < 16.
-	// first, we reduce the data in the xmm7 register
+.Lhandle_partial_segment_\@:
+	// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
+	// 16 bytes are in v7 and the rest are the remaining data in 'buf'.  To
+	// do this without needing a fold constant for each possible 'len',
+	// redivide the bytes into a first chunk of 'len' bytes and a second
+	// chunk of 16 bytes, then fold the first chunk into the second.

-_final_reduction_for_128:
-	// check if any more data to fold. If not, compute the CRC of
-	// the final 128 bits
-	adds		arg3, arg3, #16
-	b.eq		_128_done
+	// v0 = last 16 original data bytes
+	add		buf, buf, len
+	ldr		q0, [buf, #-16]
+CPU_LE(	rev64		v0.16b, v0.16b )
+CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8 )

-	// here we are getting data that is less than 16 bytes.
-	// since we know that there was data before the pointer, we can
-	// offset the input pointer before the actual point, to receive
-	// exactly 16 bytes. after that the registers need to be adjusted.
-_get_last_two_regs:
-	add		arg2, arg2, arg3
-	ldr		q1, [arg2, #-16]
-CPU_LE(	rev64		v1.16b, v1.16b )
-CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8 )
+	// v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
+	adr_l		x4, .Lbyteshift_table + 16
+	sub		x4, x4, len
+	ld1		{v2.16b}, [x4]
+	tbl		v1.16b, {v7.16b}, v2.16b

-	// get rid of the extra data that was loaded before
-	// load the shift constant
-	adr_l		x4, tbl_shf_table + 16
-	sub		x4, x4, arg3
-	ld1		{v0.16b}, [x4]
+	// v3 = first chunk: v7 right-shifted by '16-len' bytes.
+	movi		v3.16b, #0x80
+	eor		v2.16b, v2.16b, v3.16b
+	tbl		v3.16b, {v7.16b}, v2.16b

-	// shift v2 to the left by arg3 bytes
-	tbl		v2.16b, {v7.16b}, v0.16b
+	// Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
+	sshr		v2.16b, v2.16b, #7

-	// shift v7 to the right by 16-arg3 bytes
-	movi		v9.16b, #0x80
-	eor		v0.16b, v0.16b, v9.16b
-	tbl		v7.16b, {v7.16b}, v0.16b
+	// v2 = second chunk: 'len' bytes from v0 (low-order bytes),
+	// then '16-len' bytes from v1 (high-order bytes).
+	bsl		v2.16b, v1.16b, v0.16b

-	// blend
-	sshr		v0.16b, v0.16b, #7	// convert to 8-bit mask
-	bsl		v0.16b, v2.16b, v1.16b
-
-	// fold 16 Bytes
-	pmull		v8.1q, v7.1d, v10.1d
-	pmull2		v7.1q, v7.2d, v10.2d
-	eor		v7.16b, v7.16b, v8.16b
+	// Fold the first chunk into the second chunk, storing the result in v7.
+	__pmull_\p	v0, v3, fold_consts
+	__pmull_\p	v7, v3, fold_consts, 2
 	eor		v7.16b, v7.16b, v0.16b
+	eor		v7.16b, v7.16b, v2.16b

-_128_done:
-	// compute crc of a 128-bit value
-	ldr_l		q10, rk5, x8		// rk5 and rk6 in xmm10
+.Lreduce_final_16_bytes_\@:
+	// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.

-	// 64b fold
-	ext		v0.16b, vzr.16b, v7.16b, #8
-	mov		v7.d[0], v7.d[1]
-	pmull		v7.1q, v7.1d, v10.1d
-	eor		v7.16b, v7.16b, v0.16b
+	movi		v2.16b, #0		// init zero register

-	// 32b fold
-	ext		v0.16b, v7.16b, vzr.16b, #4
-	mov		v7.s[3], vzr.s[0]
-	pmull2		v0.1q, v0.2d, v10.2d
-	eor		v7.16b, v7.16b, v0.16b
+	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
+	__pmull_pre_\p	fold_consts

-	// barrett reduction
-_barrett:
-	ldr_l		q10, rk7, x8
-	mov		v0.d[0], v7.d[1]
+	// Fold the high 64 bits into the low 64 bits, while also multiplying by
+	// x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
+	// whose low 48 bits are 0.
+	ext		v0.16b, v2.16b, v7.16b, #8
+	__pmull_\p	v7, v7, fold_consts, 2	// high bits * x^48 * (x^80 mod G(x))
+	eor		v0.16b, v0.16b, v7.16b	// + low bits * x^64

-	pmull		v0.1q, v0.1d, v10.1d
-	ext		v0.16b, vzr.16b, v0.16b, #12
-	pmull2		v0.1q, v0.2d, v10.2d
-	ext		v0.16b, vzr.16b, v0.16b, #12
-	eor		v7.16b, v7.16b, v0.16b
-	mov		w0, v7.s[1]
+	// Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
+	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
+	ext		v1.16b, v0.16b, v2.16b, #12	// extract high 32 bits
+	mov		v0.s[3], v2.s[0]		// zero high 32 bits
+	__pmull_\p	v1, v1, fold_consts	// high 32 bits * x^48 * (x^48 mod G(x))
+	eor		v0.16b, v0.16b, v1.16b	// + low bits

-_cleanup:
-	// scale the result back to 16 bits
-	lsr		x0, x0, #16
-	frame_pop
+	// Load G(x) and floor(x^48 / G(x)).
+	ld1		{fold_consts.2d}, [fold_consts_ptr]
+	__pmull_pre_\p	fold_consts
+
+	// Use Barrett reduction to compute the final CRC value.
+	__pmull_\p	v1, v0, fold_consts, 2	// high 32 bits * floor(x^48 / G(x))
+	ushr		v1.2d, v1.2d, #32	// /= x^32
+	__pmull_\p	v1, v1, fold_consts	// *= G(x)
+	ushr		v0.2d, v0.2d, #48
+	eor		v0.16b, v0.16b, v1.16b	// + low 16 nonzero bits
+	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
+
+	umov		w0, v0.h[0]
+	.ifc		\p, p8
+	ldp		x29, x30, [sp], #16
+	.endif
 	ret

-_less_than_128:
-	cbz		arg3, _cleanup
+.Lless_than_256_bytes_\@:
+	// Checksumming a buffer of length 16...255 bytes

-	movi		v0.16b, #0
-	mov		v0.s[3], arg1_low32	// get the initial crc value
+	adr_l		fold_consts_ptr, .Lfold_across_16_bytes_consts

-	ldr		q7, [arg2], #0x10
+	// Load the first 16 data bytes.
+	ldr		q7, [buf], #0x10
 CPU_LE(	rev64		v7.16b, v7.16b )
 CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8 )
-	eor		v7.16b, v7.16b, v0.16b	// xor the initial crc value

-	cmp		arg3, #16
-	b.eq		_128_done		// exactly 16 left
-	b.lt		_less_than_16_left
+	// XOR the first 16 data *bits* with the initial CRC value.
+	movi		v0.16b, #0
+	mov		v0.h[7], init_crc
+	eor		v7.16b, v7.16b, v0.16b

-	ldr_l		q10, rk1, x8		// rk1 and rk2 in xmm10
+	// Load the fold-across-16-bytes constants.
+	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
+	__pmull_pre_\p	fold_consts

-	// update the counter. subtract 32 instead of 16 to save one
-	// instruction from the loop
-	subs		arg3, arg3, #32
-	b.ge		_16B_reduction_loop
+	cmp		len, #16
+	b.eq		.Lreduce_final_16_bytes_\@	// len == 16
+	subs		len, len, #32
+	b.ge		.Lfold_16_bytes_loop_\@		// 32 <= len <= 255
+	add		len, len, #16
+	b		.Lhandle_partial_segment_\@	// 17 <= len <= 31
+	.endm
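
The Barrett step in .Lreduce_final_16_bytes_\@ can be given a scalar gloss.
After the 32-bit fold, v0 holds T * x^48 for a 48-bit polynomial T that is
congruent to x^16 * M(x) mod G(x), so the final CRC is T mod G(x); Barrett
reduction obtains the quotient q = floor(T / G) with two carryless multiplies
instead of a division.  The following sketch, again reusing clmul64(), is a
reading of the code above rather than an authoritative derivation:

	/* Reduce a 48-bit polynomial t modulo G(x) = 0x18bb7, using the
	 * Barrett constant MU = floor(x^48 / G(x)) = 0x1f65a57f8. */
	static uint16_t barrett_mod_g(uint64_t t)
	{
		uint64_t hi, lo, q;

		/* q = floor(t / G), computed as ((t / x^16) * MU) / x^32;
		 * the high half of this product is zero at these sizes. */
		clmul64(t >> 16, 0x1f65a57f8, &hi, &lo);
		q = lo >> 32;

		/* t mod G = t + q*G; addition and subtraction are XOR */
		clmul64(q, 0x18bb7, &hi, &lo);
		return (uint16_t)(t ^ lo);
	}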
---|

-	add		arg3, arg3, #16
-	b		_get_last_two_regs
+//
+// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+SYM_FUNC_START(crc_t10dif_pmull_p8)
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
+	crc_t10dif_pmull p8
+SYM_FUNC_END(crc_t10dif_pmull_p8)

-_less_than_16_left:
-	// shl r9, 4
-	adr_l		x0, tbl_shf_table + 16
-	sub		x0, x0, arg3
-	ld1		{v0.16b}, [x0]
-	movi		v9.16b, #0x80
-	eor		v0.16b, v0.16b, v9.16b
-	tbl		v7.16b, {v7.16b}, v0.16b
-	b		_128_done
-ENDPROC(crc_t10dif_pmull)
+	.align		5
+//
+// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+SYM_FUNC_START(crc_t10dif_pmull_p64)
+	crc_t10dif_pmull p64
+SYM_FUNC_END(crc_t10dif_pmull_p64)
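
Both entry points compute ordinary CRC-T10DIF (polynomial 0x18bb7, bits
processed MSB first, no reflection), so either can be checked against a
self-contained bit-serial reference such as the following sketch:

	#include <stddef.h>
	#include <stdint.h>

	static uint16_t crc_t10dif_ref(uint16_t crc, const uint8_t *buf,
				       size_t len)
	{
		while (len--) {
			crc ^= (uint16_t)(*buf++ << 8);	/* next byte, MSB first */
			for (int i = 0; i < 8; i++)
				crc = (crc & 0x8000) ? (crc << 1) ^ 0x8bb7
						     : crc << 1;
		}
		return crc;
	}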
---|

-// precomputed constants
-// these constants are precomputed from the poly:
-//	0x8bb70000 (0x8bb7 scaled to 32 bits)
 	.section	".rodata", "a"
 	.align		4
-// Q = 0x18BB70000
-// rk1 = 2^(32*3) mod Q << 32
-// rk2 = 2^(32*5) mod Q << 32
-// rk3 = 2^(32*15) mod Q << 32
-// rk4 = 2^(32*17) mod Q << 32
-// rk5 = 2^(32*3) mod Q << 32
-// rk6 = 2^(32*2) mod Q << 32
-// rk7 = floor(2^64/Q)
-// rk8 = Q

-rk1:	.octa		0x06df0000000000002d56000000000000
-rk3:	.octa		0x7cf50000000000009d9d000000000000
-rk5:	.octa		0x13680000000000002d56000000000000
-rk7:	.octa		0x000000018bb7000000000001f65a57f8
-rk9:	.octa		0xbfd6000000000000ceae000000000000
-rk11:	.octa		0x713c0000000000001e16000000000000
-rk13:	.octa		0x80a6000000000000f7f9000000000000
-rk15:	.octa		0xe658000000000000044c000000000000
-rk17:	.octa		0xa497000000000000ad18000000000000
-rk19:	.octa		0xe7b50000000000006ee3000000000000
+// Fold constants precomputed from the polynomial 0x18bb7
+// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+.Lfold_across_128_bytes_consts:
+	.quad		0x0000000000006123	// x^(8*128)	mod G(x)
+	.quad		0x0000000000002295	// x^(8*128+64)	mod G(x)
+// .Lfold_across_64_bytes_consts:
+	.quad		0x0000000000001069	// x^(4*128)	mod G(x)
+	.quad		0x000000000000dd31	// x^(4*128+64)	mod G(x)
+// .Lfold_across_32_bytes_consts:
+	.quad		0x000000000000857d	// x^(2*128)	mod G(x)
+	.quad		0x0000000000007acc	// x^(2*128+64)	mod G(x)
+.Lfold_across_16_bytes_consts:
+	.quad		0x000000000000a010	// x^(1*128)	mod G(x)
+	.quad		0x0000000000001faa	// x^(1*128+64)	mod G(x)
+// .Lfinal_fold_consts:
+	.quad		0x1368000000000000	// x^48 * (x^48 mod G(x))
+	.quad		0x2d56000000000000	// x^48 * (x^80 mod G(x))
+// .Lbarrett_reduction_consts:
+	.quad		0x0000000000018bb7	// G(x)
+	.quad		0x00000001f65a57f8	// floor(x^48 / G(x))
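
Each x^n mod G(x) entry above can be reproduced mechanically (the two
.Lfinal_fold_consts entries carry an extra x^48 scaling factor, and the
Barrett pair are G(x) itself and floor(x^48 / G(x))).  A small verification
sketch; xpow_mod_g(8*128) is expected to match the first constant, 0x6123:

	/* x^n mod G(x): repeated multiplication by x with reduction by G */
	static uint16_t xpow_mod_g(unsigned int n)
	{
		uint32_t r = 1;			/* the polynomial 1 */

		while (n--) {
			r <<= 1;		/* multiply by x */
			if (r & 0x10000)
				r ^= 0x18bb7;	/* reduce: subtract G(x) */
		}
		return (uint16_t)r;
	}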
---|

-tbl_shf_table:
-// use these values for shift constants for the tbl/tbx instruction
-// different alignments result in values as shown:
-//	DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
-//	DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
-//	DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
-//	DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
-//	DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
-//	DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
-//	DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
-//	DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
-//	DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
-//	DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
-//	DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
-//	DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
-//	DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
-//	DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
-//	DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
-
+// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
+// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
+// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
+.Lbyteshift_table:
 	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
 	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
 	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
---|