| .. | .. |
|---|
| 2 | 2 | // Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions |
|---|
| 3 | 3 | // |
|---|
| 4 | 4 | // Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> |
|---|
| 5 | +// Copyright (C) 2019 Google LLC <ebiggers@google.com> |
|---|
| 5 | 6 | // |
|---|
| 6 | 7 | // This program is free software; you can redistribute it and/or modify |
|---|
| 7 | 8 | // it under the terms of the GNU General Public License version 2 as |
|---|
| 8 | 9 | // published by the Free Software Foundation. |
|---|
| 9 | 10 | // |
|---|
| 10 | 11 | |
|---|
| 12 | +// Derived from the x86 version: |
|---|
| 11 | 13 | // |
|---|
| 12 | 14 | // Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions |
|---|
| 13 | 15 | // |
|---|
| .. | .. |
|---|
| 54 | 56 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
|---|
| 55 | 57 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|---|
| 56 | 58 | // |
|---|
| 57 | | -// Function API: |
|---|
| 58 | | -// UINT16 crc_t10dif_pcl( |
|---|
| 59 | | -// UINT16 init_crc, //initial CRC value, 16 bits |
|---|
| 60 | | -// const unsigned char *buf, //buffer pointer to calculate CRC on |
|---|
| 61 | | -// UINT64 len //buffer length in bytes (64-bit data) |
|---|
| 62 | | -// ); |
|---|
| 63 | | -// |
|---|
| 64 | 59 | // Reference paper titled "Fast CRC Computation for Generic |
|---|
| 65 | 60 | // Polynomials Using PCLMULQDQ Instruction" |
|---|
| 66 | 61 | // URL: http://www.intel.com/content/dam/www/public/us/en/documents |
|---|
| 67 | 62 | // /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf |
|---|
| 68 | | -// |
|---|
| 69 | 63 | // |
|---|
| 70 | 64 | |
|---|
| 71 | 65 | #include <linux/linkage.h> |
|---|
| .. | .. |
|---|
| 78 | 72 | #endif |
|---|
| 79 | 73 | |
|---|
| 80 | 74 | .text |
|---|
| 75 | + .arch armv8-a |
|---|
| 81 | 76 | .fpu crypto-neon-fp-armv8 |
|---|
| 82 | 77 | |
|---|
| 83 | | - arg1_low32 .req r0 |
|---|
| 84 | | - arg2 .req r1 |
|---|
| 85 | | - arg3 .req r2 |
|---|
| 78 | + init_crc .req r0 |
|---|
| 79 | + buf .req r1 |
|---|
| 80 | + len .req r2 |
|---|
| 86 | 81 | |
|---|
| 87 | | - qzr .req q13 |
|---|
| 82 | + fold_consts_ptr .req ip |
|---|
| 88 | 83 | |
|---|
| 89 | 84 | q0l .req d0 |
|---|
| 90 | 85 | q0h .req d1 |
|---|
| .. | .. |
|---|
| 102 | 97 | q6h .req d13 |
|---|
| 103 | 98 | q7l .req d14 |
|---|
| 104 | 99 | q7h .req d15 |
|---|
| 100 | + q8l .req d16 |
|---|
| 101 | + q8h .req d17 |
|---|
| 102 | + q9l .req d18 |
|---|
| 103 | + q9h .req d19 |
|---|
| 104 | + q10l .req d20 |
|---|
| 105 | + q10h .req d21 |
|---|
| 106 | + q11l .req d22 |
|---|
| 107 | + q11h .req d23 |
|---|
| 108 | + q12l .req d24 |
|---|
| 109 | + q12h .req d25 |
|---|
| 105 | 110 | |
|---|
| 106 | | -ENTRY(crc_t10dif_pmull) |
|---|
| 107 | | - vmov.i8 qzr, #0 // init zero register |
|---|
| 111 | + FOLD_CONSTS .req q10 |
|---|
| 112 | + FOLD_CONST_L .req q10l |
|---|
| 113 | + FOLD_CONST_H .req q10h |
|---|
| 108 | 114 | |
|---|
| 109 | | - // adjust the 16-bit initial_crc value, scale it to 32 bits |
|---|
| 110 | | - lsl arg1_low32, arg1_low32, #16 |
|---|
| 115 | + // Fold reg1, reg2 into the next 32 data bytes, storing the result back |
|---|
| 116 | + // into reg1, reg2. |
|---|
| 117 | + .macro fold_32_bytes, reg1, reg2 |
|---|
| 118 | + vld1.64 {q11-q12}, [buf]! |
|---|
| 111 | 119 | |
|---|
| 112 | | - // check if smaller than 256 |
|---|
| 113 | | - cmp arg3, #256 |
|---|
| 120 | + vmull.p64 q8, \reg1\()h, FOLD_CONST_H |
|---|
| 121 | + vmull.p64 \reg1, \reg1\()l, FOLD_CONST_L |
|---|
| 122 | + vmull.p64 q9, \reg2\()h, FOLD_CONST_H |
|---|
| 123 | + vmull.p64 \reg2, \reg2\()l, FOLD_CONST_L |
|---|
| 114 | 124 | |
|---|
| 115 | | - // for sizes less than 128, we can't fold 64B at a time... |
|---|
| 116 | | - blt _less_than_128 |
|---|
| 117 | | - |
|---|
| 118 | | - // load the initial crc value |
|---|
| 119 | | - // crc value does not need to be byte-reflected, but it needs |
|---|
| 120 | | - // to be moved to the high part of the register. |
|---|
| 121 | | - // because data will be byte-reflected and will align with |
|---|
| 122 | | - // initial crc at correct place. |
|---|
| 123 | | - vmov s0, arg1_low32 // initial crc |
|---|
| 124 | | - vext.8 q10, qzr, q0, #4 |
|---|
| 125 | | - |
|---|
| 126 | | - // receive the initial 64B data, xor the initial crc value |
|---|
| 127 | | - vld1.64 {q0-q1}, [arg2]! |
|---|
| 128 | | - vld1.64 {q2-q3}, [arg2]! |
|---|
| 129 | | - vld1.64 {q4-q5}, [arg2]! |
|---|
| 130 | | - vld1.64 {q6-q7}, [arg2]! |
|---|
| 131 | | -CPU_LE( vrev64.8 q0, q0 ) |
|---|
| 132 | | -CPU_LE( vrev64.8 q1, q1 ) |
|---|
| 133 | | -CPU_LE( vrev64.8 q2, q2 ) |
|---|
| 134 | | -CPU_LE( vrev64.8 q3, q3 ) |
|---|
| 135 | | -CPU_LE( vrev64.8 q4, q4 ) |
|---|
| 136 | | -CPU_LE( vrev64.8 q5, q5 ) |
|---|
| 137 | | -CPU_LE( vrev64.8 q6, q6 ) |
|---|
| 138 | | -CPU_LE( vrev64.8 q7, q7 ) |
|---|
| 139 | | - |
|---|
| 140 | | - vswp d0, d1 |
|---|
| 141 | | - vswp d2, d3 |
|---|
| 142 | | - vswp d4, d5 |
|---|
| 143 | | - vswp d6, d7 |
|---|
| 144 | | - vswp d8, d9 |
|---|
| 145 | | - vswp d10, d11 |
|---|
| 146 | | - vswp d12, d13 |
|---|
| 147 | | - vswp d14, d15 |
|---|
| 148 | | - |
|---|
| 149 | | - // XOR the initial_crc value |
|---|
| 150 | | - veor.8 q0, q0, q10 |
|---|
| 151 | | - |
|---|
| 152 | | - adr ip, rk3 |
|---|
| 153 | | - vld1.64 {q10}, [ip, :128] // xmm10 has rk3 and rk4 |
|---|
| 154 | | - |
|---|
| 155 | | - // |
|---|
| 156 | | - // we subtract 256 instead of 128 to save one instruction from the loop |
|---|
| 157 | | - // |
|---|
| 158 | | - sub arg3, arg3, #256 |
|---|
| 159 | | - |
|---|
| 160 | | - // at this section of the code, there is 64*x+y (0<=y<64) bytes of |
|---|
| 161 | | - // buffer. The _fold_64_B_loop will fold 64B at a time |
|---|
| 162 | | - // until we have 64+y Bytes of buffer |
|---|
| 163 | | - |
|---|
| 164 | | - |
|---|
| 165 | | - // fold 64B at a time. This section of the code folds 4 vector |
|---|
| 166 | | - // registers in parallel |
|---|
| 167 | | -_fold_64_B_loop: |
|---|
| 168 | | - |
|---|
| 169 | | - .macro fold64, reg1, reg2 |
|---|
| 170 | | - vld1.64 {q11-q12}, [arg2]! |
|---|
| 171 | | - |
|---|
| 172 | | - vmull.p64 q8, \reg1\()h, d21 |
|---|
| 173 | | - vmull.p64 \reg1, \reg1\()l, d20 |
|---|
| 174 | | - vmull.p64 q9, \reg2\()h, d21 |
|---|
| 175 | | - vmull.p64 \reg2, \reg2\()l, d20 |
|---|
| 176 | | - |
|---|
| 177 | | -CPU_LE( vrev64.8 q11, q11 ) |
|---|
| 178 | | -CPU_LE( vrev64.8 q12, q12 ) |
|---|
| 179 | | - vswp d22, d23 |
|---|
| 180 | | - vswp d24, d25 |
|---|
| 125 | +CPU_LE( vrev64.8 q11, q11 ) |
|---|
| 126 | +CPU_LE( vrev64.8 q12, q12 ) |
|---|
| 127 | + vswp q11l, q11h |
|---|
| 128 | + vswp q12l, q12h |
|---|
| 181 | 129 | |
|---|
| 182 | 130 | veor.8 \reg1, \reg1, q8 |
|---|
| 183 | 131 | veor.8 \reg2, \reg2, q9 |
|---|
| .. | .. |
|---|
| 185 | 133 | veor.8 \reg2, \reg2, q12 |
|---|
| 186 | 134 | .endm |
|---|
| 187 | 135 | |
|---|
| 188 | | - fold64 q0, q1 |
|---|
| 189 | | - fold64 q2, q3 |
|---|
| 190 | | - fold64 q4, q5 |
|---|
| 191 | | - fold64 q6, q7 |
|---|
| 192 | | - |
|---|
| 193 | | - subs arg3, arg3, #128 |
|---|
| 194 | | - |
|---|
| 195 | | - // check if there is another 64B in the buffer to be able to fold |
|---|
| 196 | | - bge _fold_64_B_loop |
|---|
| 197 | | - |
|---|
| 198 | | - // at this point, the buffer pointer is pointing at the last y Bytes |
|---|
| 199 | | - // of the buffer the 64B of folded data is in 4 of the vector |
|---|
| 200 | | - // registers: v0, v1, v2, v3 |
|---|
| 201 | | - |
|---|
| 202 | | - // fold the 8 vector registers to 1 vector register with different |
|---|
| 203 | | - // constants |
|---|
| 204 | | - |
|---|
| 205 | | - adr ip, rk9 |
|---|
| 206 | | - vld1.64 {q10}, [ip, :128]! |
|---|
| 207 | | - |
|---|
| 208 | | - .macro fold16, reg, rk |
|---|
| 209 | | - vmull.p64 q8, \reg\()l, d20 |
|---|
| 210 | | - vmull.p64 \reg, \reg\()h, d21 |
|---|
| 211 | | - .ifnb \rk |
|---|
| 212 | | - vld1.64 {q10}, [ip, :128]! |
|---|
| 136 | + // Fold src_reg into dst_reg, optionally loading the next fold constants |
|---|
| 137 | + .macro fold_16_bytes, src_reg, dst_reg, load_next_consts |
|---|
| 138 | + vmull.p64 q8, \src_reg\()l, FOLD_CONST_L |
|---|
| 139 | + vmull.p64 \src_reg, \src_reg\()h, FOLD_CONST_H |
|---|
| 140 | + .ifnb \load_next_consts |
|---|
| 141 | + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! |
|---|
| 213 | 142 | .endif |
|---|
| 214 | | - veor.8 q7, q7, q8 |
|---|
| 215 | | - veor.8 q7, q7, \reg |
|---|
| 143 | + veor.8 \dst_reg, \dst_reg, q8 |
|---|
| 144 | + veor.8 \dst_reg, \dst_reg, \src_reg |
|---|
| 216 | 145 | .endm |
|---|
| 217 | 146 | |
|---|
| 218 | | - fold16 q0, rk11 |
|---|
| 219 | | - fold16 q1, rk13 |
|---|
| 220 | | - fold16 q2, rk15 |
|---|
| 221 | | - fold16 q3, rk17 |
|---|
| 222 | | - fold16 q4, rk19 |
|---|
| 223 | | - fold16 q5, rk1 |
|---|
| 224 | | - fold16 q6 |
|---|
| 147 | + .macro __adrl, out, sym |
|---|
| 148 | + movw \out, #:lower16:\sym |
|---|
| 149 | + movt \out, #:upper16:\sym |
|---|
| 150 | + .endm |
|---|
| 225 | 151 | |
|---|
| 226 | | - // instead of 64, we add 48 to the loop counter to save 1 instruction |
|---|
| 227 | | - // from the loop instead of a cmp instruction, we use the negative |
|---|
| 228 | | - // flag with the jl instruction |
|---|
| 229 | | - adds arg3, arg3, #(128-16) |
|---|
| 230 | | - blt _final_reduction_for_128 |
|---|
| 152 | +// |
|---|
| 153 | +// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len); |
|---|
| 154 | +// |
|---|
| 155 | +// Assumes len >= 16. |
|---|
| 156 | +// |
|---|
| 157 | +ENTRY(crc_t10dif_pmull) |
|---|
| 231 | 158 | |
|---|
| 232 | | - // now we have 16+y bytes left to reduce. 16 Bytes is in register v7 |
|---|
| 233 | | - // and the rest is in memory. We can fold 16 bytes at a time if y>=16 |
|---|
| 234 | | - // continue folding 16B at a time |
|---|
| 159 | + // For sizes less than 256 bytes, we can't fold 128 bytes at a time. |
|---|
| 160 | + cmp len, #256 |
|---|
| 161 | + blt .Lless_than_256_bytes |
|---|
| 235 | 162 | |
|---|
| 236 | | -_16B_reduction_loop: |
|---|
| 237 | | - vmull.p64 q8, d14, d20 |
|---|
| 238 | | - vmull.p64 q7, d15, d21 |
|---|
| 163 | + __adrl fold_consts_ptr, .Lfold_across_128_bytes_consts |
|---|
| 164 | + |
|---|
| 165 | + // Load the first 128 data bytes. Byte swapping is necessary to make |
|---|
| 166 | + // the bit order match the polynomial coefficient order. |
|---|
| 167 | + vld1.64 {q0-q1}, [buf]! |
|---|
| 168 | + vld1.64 {q2-q3}, [buf]! |
|---|
| 169 | + vld1.64 {q4-q5}, [buf]! |
|---|
| 170 | + vld1.64 {q6-q7}, [buf]! |
|---|
| 171 | +CPU_LE( vrev64.8 q0, q0 ) |
|---|
| 172 | +CPU_LE( vrev64.8 q1, q1 ) |
|---|
| 173 | +CPU_LE( vrev64.8 q2, q2 ) |
|---|
| 174 | +CPU_LE( vrev64.8 q3, q3 ) |
|---|
| 175 | +CPU_LE( vrev64.8 q4, q4 ) |
|---|
| 176 | +CPU_LE( vrev64.8 q5, q5 ) |
|---|
| 177 | +CPU_LE( vrev64.8 q6, q6 ) |
|---|
| 178 | +CPU_LE( vrev64.8 q7, q7 ) |
|---|
| 179 | + vswp q0l, q0h |
|---|
| 180 | + vswp q1l, q1h |
|---|
| 181 | + vswp q2l, q2h |
|---|
| 182 | + vswp q3l, q3h |
|---|
| 183 | + vswp q4l, q4h |
|---|
| 184 | + vswp q5l, q5h |
|---|
| 185 | + vswp q6l, q6h |
|---|
| 186 | + vswp q7l, q7h |
|---|
| 187 | + |
|---|
| 188 | + // XOR the first 16 data *bits* with the initial CRC value. |
|---|
| 189 | + vmov.i8 q8h, #0 |
|---|
| 190 | + vmov.u16 q8h[3], init_crc |
|---|
| 191 | + veor q0h, q0h, q8h |
|---|
| 192 | + |
|---|
| 193 | + // Load the constants for folding across 128 bytes. |
|---|
| 194 | + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! |
|---|
| 195 | + |
|---|
| 196 | + // Subtract 128 for the 128 data bytes just consumed. Subtract another |
|---|
| 197 | + // 128 to simplify the termination condition of the following loop. |
|---|
| 198 | + sub len, len, #256 |
|---|
| 199 | + |
|---|
| 200 | + // While >= 128 data bytes remain (not counting q0-q7), fold the 128 |
|---|
| 201 | + // bytes q0-q7 into them, storing the result back into q0-q7. |
|---|
| 202 | +.Lfold_128_bytes_loop: |
|---|
| 203 | + fold_32_bytes q0, q1 |
|---|
| 204 | + fold_32_bytes q2, q3 |
|---|
| 205 | + fold_32_bytes q4, q5 |
|---|
| 206 | + fold_32_bytes q6, q7 |
|---|
| 207 | + subs len, len, #128 |
|---|
| 208 | + bge .Lfold_128_bytes_loop |
|---|
| 209 | + |
|---|
| 210 | + // Now fold the 112 bytes in q0-q6 into the 16 bytes in q7. |
|---|
| 211 | + |
|---|
| 212 | + // Fold across 64 bytes. |
|---|
| 213 | + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! |
|---|
| 214 | + fold_16_bytes q0, q4 |
|---|
| 215 | + fold_16_bytes q1, q5 |
|---|
| 216 | + fold_16_bytes q2, q6 |
|---|
| 217 | + fold_16_bytes q3, q7, 1 |
|---|
| 218 | + // Fold across 32 bytes. |
|---|
| 219 | + fold_16_bytes q4, q6 |
|---|
| 220 | + fold_16_bytes q5, q7, 1 |
|---|
| 221 | + // Fold across 16 bytes. |
|---|
| 222 | + fold_16_bytes q6, q7 |
|---|
| 223 | + |
|---|
| 224 | + // Add 128 to get the correct number of data bytes remaining in 0...127 |
|---|
| 225 | + // (not counting q7), following the previous extra subtraction by 128. |
|---|
| 226 | + // Then subtract 16 to simplify the termination condition of the |
|---|
| 227 | + // following loop. |
|---|
| 228 | + adds len, len, #(128-16) |
|---|
| 229 | + |
|---|
| 230 | + // While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7 |
|---|
| 231 | + // into them, storing the result back into q7. |
|---|
| 232 | + blt .Lfold_16_bytes_loop_done |
|---|
| 233 | +.Lfold_16_bytes_loop: |
|---|
| 234 | + vmull.p64 q8, q7l, FOLD_CONST_L |
|---|
| 235 | + vmull.p64 q7, q7h, FOLD_CONST_H |
|---|
| 239 | 236 | veor.8 q7, q7, q8 |
|---|
| 240 | | - |
|---|
| 241 | | - vld1.64 {q0}, [arg2]! |
|---|
| 242 | | -CPU_LE( vrev64.8 q0, q0 ) |
|---|
| 243 | | - vswp d0, d1 |
|---|
| 237 | + vld1.64 {q0}, [buf]! |
|---|
| 238 | +CPU_LE( vrev64.8 q0, q0 ) |
|---|
| 239 | + vswp q0l, q0h |
|---|
| 244 | 240 | veor.8 q7, q7, q0 |
|---|
| 245 | | - subs arg3, arg3, #16 |
|---|
| 241 | + subs len, len, #16 |
|---|
| 242 | + bge .Lfold_16_bytes_loop |
|---|
| 246 | 243 | |
|---|
| 247 | | - // instead of a cmp instruction, we utilize the flags with the |
|---|
| 248 | | - // jge instruction equivalent of: cmp arg3, 16-16 |
|---|
| 249 | | - // check if there is any more 16B in the buffer to be able to fold |
|---|
| 250 | | - bge _16B_reduction_loop |
|---|
| 244 | +.Lfold_16_bytes_loop_done: |
|---|
| 245 | + // Add 16 to get the correct number of data bytes remaining in 0...15 |
|---|
| 246 | + // (not counting q7), following the previous extra subtraction by 16. |
|---|
| 247 | + adds len, len, #16 |
|---|
| 248 | + beq .Lreduce_final_16_bytes |
|---|
| 251 | 249 | |
|---|
| 252 | | - // now we have 16+z bytes left to reduce, where 0<= z < 16. |
|---|
| 253 | | - // first, we reduce the data in the xmm7 register |
|---|
| 250 | +.Lhandle_partial_segment: |
|---|
| 251 | + // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first |
|---|
| 252 | + // 16 bytes are in q7 and the rest are the remaining data in 'buf'. To |
|---|
| 253 | + // do this without needing a fold constant for each possible 'len', |
|---|
| 254 | + // redivide the bytes into a first chunk of 'len' bytes and a second |
|---|
| 255 | + // chunk of 16 bytes, then fold the first chunk into the second. |
|---|
| 254 | 256 | |
|---|
| 255 | | -_final_reduction_for_128: |
|---|
| 256 | | - // check if any more data to fold. If not, compute the CRC of |
|---|
| 257 | | - // the final 128 bits |
|---|
| 258 | | - adds arg3, arg3, #16 |
|---|
| 259 | | - beq _128_done |
|---|
| 257 | + // q0 = last 16 original data bytes |
|---|
| 258 | + add buf, buf, len |
|---|
| 259 | + sub buf, buf, #16 |
|---|
| 260 | + vld1.64 {q0}, [buf] |
|---|
| 261 | +CPU_LE( vrev64.8 q0, q0 ) |
|---|
| 262 | + vswp q0l, q0h |
|---|
| 260 | 263 | |
|---|
| 261 | | - // here we are getting data that is less than 16 bytes. |
|---|
| 262 | | - // since we know that there was data before the pointer, we can |
|---|
| 263 | | - // offset the input pointer before the actual point, to receive |
|---|
| 264 | | - // exactly 16 bytes. after that the registers need to be adjusted. |
|---|
| 265 | | -_get_last_two_regs: |
|---|
| 266 | | - add arg2, arg2, arg3 |
|---|
| 267 | | - sub arg2, arg2, #16 |
|---|
| 268 | | - vld1.64 {q1}, [arg2] |
|---|
| 269 | | -CPU_LE( vrev64.8 q1, q1 ) |
|---|
| 270 | | - vswp d2, d3 |
|---|
| 264 | + // q1 = high order part of second chunk: q7 left-shifted by 'len' bytes. |
|---|
| 265 | + __adrl r3, .Lbyteshift_table + 16 |
|---|
| 266 | + sub r3, r3, len |
|---|
| 267 | + vld1.8 {q2}, [r3] |
|---|
| 268 | + vtbl.8 q1l, {q7l-q7h}, q2l |
|---|
| 269 | + vtbl.8 q1h, {q7l-q7h}, q2h |
|---|
| 271 | 270 | |
|---|
| 272 | | - // get rid of the extra data that was loaded before |
|---|
| 273 | | - // load the shift constant |
|---|
| 274 | | - adr ip, tbl_shf_table + 16 |
|---|
| 275 | | - sub ip, ip, arg3 |
|---|
| 276 | | - vld1.8 {q0}, [ip] |
|---|
| 271 | + // q3 = first chunk: q7 right-shifted by '16-len' bytes. |
|---|
| 272 | + vmov.i8 q3, #0x80 |
|---|
| 273 | + veor.8 q2, q2, q3 |
|---|
| 274 | + vtbl.8 q3l, {q7l-q7h}, q2l |
|---|
| 275 | + vtbl.8 q3h, {q7l-q7h}, q2h |
|---|
| 277 | 276 | |
|---|
| 278 | | - // shift v2 to the left by arg3 bytes |
|---|
| 279 | | - vtbl.8 d4, {d14-d15}, d0 |
|---|
| 280 | | - vtbl.8 d5, {d14-d15}, d1 |
|---|
| 277 | + // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes. |
|---|
| 278 | + vshr.s8 q2, q2, #7 |
|---|
| 281 | 279 | |
|---|
| 282 | | - // shift v7 to the right by 16-arg3 bytes |
|---|
| 283 | | - vmov.i8 q9, #0x80 |
|---|
| 284 | | - veor.8 q0, q0, q9 |
|---|
| 285 | | - vtbl.8 d18, {d14-d15}, d0 |
|---|
| 286 | | - vtbl.8 d19, {d14-d15}, d1 |
|---|
| 280 | + // q2 = second chunk: 'len' bytes from q0 (low-order bytes), |
|---|
| 281 | + // then '16-len' bytes from q1 (high-order bytes). |
|---|
| 282 | + vbsl.8 q2, q1, q0 |
|---|
| 287 | 283 | |
|---|
| 288 | | - // blend |
|---|
| 289 | | - vshr.s8 q0, q0, #7 // convert to 8-bit mask |
|---|
| 290 | | - vbsl.8 q0, q2, q1 |
|---|
| 291 | | - |
|---|
| 292 | | - // fold 16 Bytes |
|---|
| 293 | | - vmull.p64 q8, d18, d20 |
|---|
| 294 | | - vmull.p64 q7, d19, d21 |
|---|
| 295 | | - veor.8 q7, q7, q8 |
|---|
| 284 | + // Fold the first chunk into the second chunk, storing the result in q7. |
|---|
| 285 | + vmull.p64 q0, q3l, FOLD_CONST_L |
|---|
| 286 | + vmull.p64 q7, q3h, FOLD_CONST_H |
|---|
| 296 | 287 | veor.8 q7, q7, q0 |
|---|
| 288 | + veor.8 q7, q7, q2 |
|---|
| 297 | 289 | |
|---|
| 298 | | -_128_done: |
|---|
| 299 | | - // compute crc of a 128-bit value |
|---|
| 300 | | - vldr d20, rk5 |
|---|
| 301 | | - vldr d21, rk6 // rk5 and rk6 in xmm10 |
|---|
| 290 | +.Lreduce_final_16_bytes: |
|---|
| 291 | + // Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC. |
|---|
| 302 | 292 | |
|---|
| 303 | | - // 64b fold |
|---|
| 304 | | - vext.8 q0, qzr, q7, #8 |
|---|
| 305 | | - vmull.p64 q7, d15, d20 |
|---|
| 306 | | - veor.8 q7, q7, q0 |
|---|
| 293 | + // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. |
|---|
| 294 | + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! |
|---|
| 307 | 295 | |
|---|
| 308 | | - // 32b fold |
|---|
| 309 | | - vext.8 q0, q7, qzr, #12 |
|---|
| 310 | | - vmov s31, s3 |
|---|
| 311 | | - vmull.p64 q0, d0, d21 |
|---|
| 312 | | - veor.8 q7, q0, q7 |
|---|
| 296 | + // Fold the high 64 bits into the low 64 bits, while also multiplying by |
|---|
| 297 | + // x^64. This produces a 128-bit value congruent to x^64 * M(x) and |
|---|
| 298 | + // whose low 48 bits are 0. |
|---|
| 299 | + vmull.p64 q0, q7h, FOLD_CONST_H // high bits * x^48 * (x^80 mod G(x)) |
|---|
| 300 | + veor.8 q0h, q0h, q7l // + low bits * x^64 |
|---|
| 313 | 301 | |
|---|
| 314 | | - // barrett reduction |
|---|
| 315 | | -_barrett: |
|---|
| 316 | | - vldr d20, rk7 |
|---|
| 317 | | - vldr d21, rk8 |
|---|
| 302 | + // Fold the high 32 bits into the low 96 bits. This produces a 96-bit |
|---|
| 303 | + // value congruent to x^64 * M(x) and whose low 48 bits are 0. |
|---|
| 304 | + vmov.i8 q1, #0 |
|---|
| 305 | + vmov s4, s3 // extract high 32 bits |
|---|
| 306 | + vmov s3, s5 // zero high 32 bits |
|---|
| 307 | + vmull.p64 q1, q1l, FOLD_CONST_L // high 32 bits * x^48 * (x^48 mod G(x)) |
|---|
| 308 | + veor.8 q0, q0, q1 // + low bits |
|---|
| 318 | 309 | |
|---|
| 319 | | - vmull.p64 q0, d15, d20 |
|---|
| 320 | | - vext.8 q0, qzr, q0, #12 |
|---|
| 321 | | - vmull.p64 q0, d1, d21 |
|---|
| 322 | | - vext.8 q0, qzr, q0, #12 |
|---|
| 323 | | - veor.8 q7, q7, q0 |
|---|
| 324 | | - vmov r0, s29 |
|---|
| 310 | + // Load G(x) and floor(x^48 / G(x)). |
|---|
| 311 | + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128] |
|---|
| 325 | 312 | |
|---|
| 326 | | -_cleanup: |
|---|
| 327 | | - // scale the result back to 16 bits |
|---|
| 328 | | - lsr r0, r0, #16 |
|---|
| 313 | + // Use Barrett reduction to compute the final CRC value. |
|---|
| 314 | + vmull.p64 q1, q0h, FOLD_CONST_H // high 32 bits * floor(x^48 / G(x)) |
|---|
| 315 | + vshr.u64 q1l, q1l, #32 // /= x^32 |
|---|
| 316 | + vmull.p64 q1, q1l, FOLD_CONST_L // *= G(x) |
|---|
| 317 | + vshr.u64 q0l, q0l, #48 |
|---|
| 318 | + veor.8 q0l, q0l, q1l // + low 16 nonzero bits |
|---|
| 319 | + // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of q0. |
|---|
| 320 | + |
|---|
| 321 | + vmov.u16 r0, q0l[0] |
|---|
| 329 | 322 | bx lr |
|---|
| 330 | 323 | |
|---|
| 331 | | -_less_than_128: |
|---|
| 332 | | - teq arg3, #0 |
|---|
| 333 | | - beq _cleanup |
|---|
| 324 | +.Lless_than_256_bytes: |
|---|
| 325 | + // Checksumming a buffer of length 16...255 bytes |
|---|
| 334 | 326 | |
|---|
| 335 | | - vmov.i8 q0, #0 |
|---|
| 336 | | - vmov s3, arg1_low32 // get the initial crc value |
|---|
| 327 | + __adrl fold_consts_ptr, .Lfold_across_16_bytes_consts |
|---|
| 337 | 328 | |
|---|
| 338 | | - vld1.64 {q7}, [arg2]! |
|---|
| 339 | | -CPU_LE( vrev64.8 q7, q7 ) |
|---|
| 340 | | - vswp d14, d15 |
|---|
| 341 | | - veor.8 q7, q7, q0 |
|---|
| 329 | + // Load the first 16 data bytes. |
|---|
| 330 | + vld1.64 {q7}, [buf]! |
|---|
| 331 | +CPU_LE( vrev64.8 q7, q7 ) |
|---|
| 332 | + vswp q7l, q7h |
|---|
| 342 | 333 | |
|---|
| 343 | | - cmp arg3, #16 |
|---|
| 344 | | - beq _128_done // exactly 16 left |
|---|
| 345 | | - blt _less_than_16_left |
|---|
| 334 | + // XOR the first 16 data *bits* with the initial CRC value. |
|---|
| 335 | + vmov.i8 q0h, #0 |
|---|
| 336 | + vmov.u16 q0h[3], init_crc |
|---|
| 337 | + veor.8 q7h, q7h, q0h |
|---|
| 346 | 338 | |
|---|
| 347 | | - // now if there is, load the constants |
|---|
| 348 | | - vldr d20, rk1 |
|---|
| 349 | | - vldr d21, rk2 // rk1 and rk2 in xmm10 |
|---|
| 339 | + // Load the fold-across-16-bytes constants. |
|---|
| 340 | + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! |
|---|
| 350 | 341 | |
|---|
| 351 | | - // check if there is enough buffer to be able to fold 16B at a time |
|---|
| 352 | | - subs arg3, arg3, #32 |
|---|
| 353 | | - addlt arg3, arg3, #16 |
|---|
| 354 | | - blt _get_last_two_regs |
|---|
| 355 | | - b _16B_reduction_loop |
|---|
| 356 | | - |
|---|
| 357 | | -_less_than_16_left: |
|---|
| 358 | | - // shl r9, 4 |
|---|
| 359 | | - adr ip, tbl_shf_table + 16 |
|---|
| 360 | | - sub ip, ip, arg3 |
|---|
| 361 | | - vld1.8 {q0}, [ip] |
|---|
| 362 | | - vmov.i8 q9, #0x80 |
|---|
| 363 | | - veor.8 q0, q0, q9 |
|---|
| 364 | | - vtbl.8 d18, {d14-d15}, d0 |
|---|
| 365 | | - vtbl.8 d15, {d14-d15}, d1 |
|---|
| 366 | | - vmov d14, d18 |
|---|
| 367 | | - b _128_done |
|---|
| 342 | + cmp len, #16 |
|---|
| 343 | + beq .Lreduce_final_16_bytes // len == 16 |
|---|
| 344 | + subs len, len, #32 |
|---|
| 345 | + addlt len, len, #16 |
|---|
| 346 | + blt .Lhandle_partial_segment // 17 <= len <= 31 |
|---|
| 347 | + b .Lfold_16_bytes_loop // 32 <= len <= 255 |
|---|
| 368 | 348 | ENDPROC(crc_t10dif_pmull) |
|---|
| 369 | 349 | |
|---|
| 370 | | -// precomputed constants |
|---|
| 371 | | -// these constants are precomputed from the poly: |
|---|
| 372 | | -// 0x8bb70000 (0x8bb7 scaled to 32 bits) |
|---|
| 350 | + .section ".rodata", "a" |
|---|
| 373 | 351 | .align 4 |
|---|
| 374 | | -// Q = 0x18BB70000 |
|---|
| 375 | | -// rk1 = 2^(32*3) mod Q << 32 |
|---|
| 376 | | -// rk2 = 2^(32*5) mod Q << 32 |
|---|
| 377 | | -// rk3 = 2^(32*15) mod Q << 32 |
|---|
| 378 | | -// rk4 = 2^(32*17) mod Q << 32 |
|---|
| 379 | | -// rk5 = 2^(32*3) mod Q << 32 |
|---|
| 380 | | -// rk6 = 2^(32*2) mod Q << 32 |
|---|
| 381 | | -// rk7 = floor(2^64/Q) |
|---|
| 382 | | -// rk8 = Q |
|---|
| 383 | 352 | |
|---|
| 384 | | -rk3: .quad 0x9d9d000000000000 |
|---|
| 385 | | -rk4: .quad 0x7cf5000000000000 |
|---|
| 386 | | -rk5: .quad 0x2d56000000000000 |
|---|
| 387 | | -rk6: .quad 0x1368000000000000 |
|---|
| 388 | | -rk7: .quad 0x00000001f65a57f8 |
|---|
| 389 | | -rk8: .quad 0x000000018bb70000 |
|---|
| 390 | | -rk9: .quad 0xceae000000000000 |
|---|
| 391 | | -rk10: .quad 0xbfd6000000000000 |
|---|
| 392 | | -rk11: .quad 0x1e16000000000000 |
|---|
| 393 | | -rk12: .quad 0x713c000000000000 |
|---|
| 394 | | -rk13: .quad 0xf7f9000000000000 |
|---|
| 395 | | -rk14: .quad 0x80a6000000000000 |
|---|
| 396 | | -rk15: .quad 0x044c000000000000 |
|---|
| 397 | | -rk16: .quad 0xe658000000000000 |
|---|
| 398 | | -rk17: .quad 0xad18000000000000 |
|---|
| 399 | | -rk18: .quad 0xa497000000000000 |
|---|
| 400 | | -rk19: .quad 0x6ee3000000000000 |
|---|
| 401 | | -rk20: .quad 0xe7b5000000000000 |
|---|
| 402 | | -rk1: .quad 0x2d56000000000000 |
|---|
| 403 | | -rk2: .quad 0x06df000000000000 |
|---|
| 353 | +// Fold constants precomputed from the polynomial 0x18bb7 |
|---|
| 354 | +// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 |
|---|
| 355 | +.Lfold_across_128_bytes_consts: |
|---|
| 356 | + .quad 0x0000000000006123 // x^(8*128) mod G(x) |
|---|
| 357 | + .quad 0x0000000000002295 // x^(8*128+64) mod G(x) |
|---|
| 358 | +// .Lfold_across_64_bytes_consts: |
|---|
| 359 | + .quad 0x0000000000001069 // x^(4*128) mod G(x) |
|---|
| 360 | + .quad 0x000000000000dd31 // x^(4*128+64) mod G(x) |
|---|
| 361 | +// .Lfold_across_32_bytes_consts: |
|---|
| 362 | + .quad 0x000000000000857d // x^(2*128) mod G(x) |
|---|
| 363 | + .quad 0x0000000000007acc // x^(2*128+64) mod G(x) |
|---|
| 364 | +.Lfold_across_16_bytes_consts: |
|---|
| 365 | + .quad 0x000000000000a010 // x^(1*128) mod G(x) |
|---|
| 366 | + .quad 0x0000000000001faa // x^(1*128+64) mod G(x) |
|---|
| 367 | +// .Lfinal_fold_consts: |
|---|
| 368 | + .quad 0x1368000000000000 // x^48 * (x^48 mod G(x)) |
|---|
| 369 | + .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x)) |
|---|
| 370 | +// .Lbarrett_reduction_consts: |
|---|
| 371 | + .quad 0x0000000000018bb7 // G(x) |
|---|
| 372 | + .quad 0x00000001f65a57f8 // floor(x^48 / G(x)) |
|---|
| 404 | 373 | |
|---|
| 405 | | -tbl_shf_table: |
|---|
| 406 | | -// use these values for shift constants for the tbl/tbx instruction |
|---|
| 407 | | -// different alignments result in values as shown: |
|---|
| 408 | | -// DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1 |
|---|
| 409 | | -// DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2 |
|---|
| 410 | | -// DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3 |
|---|
| 411 | | -// DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4 |
|---|
| 412 | | -// DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5 |
|---|
| 413 | | -// DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6 |
|---|
| 414 | | -// DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7 |
|---|
| 415 | | -// DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8 |
|---|
| 416 | | -// DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9 |
|---|
| 417 | | -// DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10 |
|---|
| 418 | | -// DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11 |
|---|
| 419 | | -// DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12 |
|---|
| 420 | | -// DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13 |
|---|
| 421 | | -// DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14 |
|---|
| 422 | | -// DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15 |
|---|
| 423 | | - |
|---|
| 374 | +// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - |
|---|
| 375 | +// len] is the index vector to shift left by 'len' bytes, and is also {0x80, |
|---|
| 376 | +// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes. |
|---|
| 377 | +.Lbyteshift_table: |
|---|
| 424 | 378 | .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87 |
|---|
| 425 | 379 | .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f |
|---|
| 426 | 380 | .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 |
|---|