...
 // Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
 //
 // Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+// Copyright (C) 2019 Google LLC <ebiggers@google.com>
 //
 // This program is free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License version 2 as
 // published by the Free Software Foundation.
 //

+// Derived from the x86 version:
 //
 // Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
 //
...
 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
-// Function API:
-// UINT16 crc_t10dif_pcl(
-//         UINT16 init_crc,          // initial CRC value, 16 bits
-//         const unsigned char *buf, // buffer pointer to calculate CRC on
-//         UINT64 len                // buffer length in bytes (64-bit data)
-// );
-//
 // Reference paper titled "Fast CRC Computation for Generic
 // Polynomials Using PCLMULQDQ Instruction"
 // URL: http://www.intel.com/content/dam/www/public/us/en/documents
 // /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
-//
 //

 #include <linux/linkage.h>
 #include <asm/assembler.h>

         .text
-        .cpu            generic+crypto
+        .arch           armv8-a+crypto

-        arg1_low32      .req    w19
-        arg2            .req    x20
-        arg3            .req    x21
+        init_crc        .req    w0
+        buf             .req    x1
+        len             .req    x2
+        fold_consts_ptr .req    x3

-        vzr             .req    v13
+        fold_consts     .req    v10

-ENTRY(crc_t10dif_pmull)
-        frame_push      3, 128
+        ad              .req    v14

-        mov             arg1_low32, w0
-        mov             arg2, x1
-        mov             arg3, x2
+        k00_16          .req    v15
+        k32_48          .req    v16

-        movi            vzr.16b, #0             // init zero register
+        t3              .req    v17
+        t4              .req    v18
+        t5              .req    v19
+        t6              .req    v20
+        t7              .req    v21
+        t8              .req    v22
+        t9              .req    v23

-        // adjust the 16-bit initial_crc value, scale it to 32 bits
-        lsl             arg1_low32, arg1_low32, #16
+        perm1           .req    v24
+        perm2           .req    v25
+        perm3           .req    v26
+        perm4           .req    v27

-        // check if smaller than 256
-        cmp             arg3, #256
+        bd1             .req    v28
+        bd2             .req    v29
+        bd3             .req    v30
+        bd4             .req    v31

-        // for sizes less than 128, we can't fold 64B at a time...
-        b.lt            _less_than_128
+        .macro          __pmull_init_p64
+        .endm

-        // load the initial crc value
-        // crc value does not need to be byte-reflected, but it needs
-        // to be moved to the high part of the register.
-        // because data will be byte-reflected and will align with
-        // initial crc at correct place.
-        movi            v10.16b, #0
-        mov             v10.s[3], arg1_low32    // initial crc
+        .macro          __pmull_pre_p64, bd
+        .endm

-        // receive the initial 64B data, xor the initial crc value
-        ldp             q0, q1, [arg2]
-        ldp             q2, q3, [arg2, #0x20]
-        ldp             q4, q5, [arg2, #0x40]
-        ldp             q6, q7, [arg2, #0x60]
-        add             arg2, arg2, #0x80
+        .macro          __pmull_init_p8
+        // k00_16 := 0x0000000000000000_000000000000ffff
+        // k32_48 := 0x00000000ffffffff_0000ffffffffffff
+        movi            k32_48.2d, #0xffffffff
+        mov             k32_48.h[2], k32_48.h[0]
+        ushr            k00_16.2d, k32_48.2d, #32

-CPU_LE( rev64           v0.16b, v0.16b          )
-CPU_LE( rev64           v1.16b, v1.16b          )
-CPU_LE( rev64           v2.16b, v2.16b          )
-CPU_LE( rev64           v3.16b, v3.16b          )
-CPU_LE( rev64           v4.16b, v4.16b          )
-CPU_LE( rev64           v5.16b, v5.16b          )
-CPU_LE( rev64           v6.16b, v6.16b          )
-CPU_LE( rev64           v7.16b, v7.16b          )
+        // prepare the permutation vectors
+        mov_q           x5, 0x080f0e0d0c0b0a09
+        movi            perm4.8b, #8
+        dup             perm1.2d, x5
+        eor             perm1.16b, perm1.16b, perm4.16b
+        ushr            perm2.2d, perm1.2d, #8
+        ushr            perm3.2d, perm1.2d, #16
+        ushr            perm4.2d, perm1.2d, #24
+        sli             perm2.2d, perm1.2d, #56
+        sli             perm3.2d, perm1.2d, #48
+        sli             perm4.2d, perm1.2d, #40
+        .endm

-CPU_LE( ext             v0.16b, v0.16b, v0.16b, #8 )
-CPU_LE( ext             v1.16b, v1.16b, v1.16b, #8 )
-CPU_LE( ext             v2.16b, v2.16b, v2.16b, #8 )
-CPU_LE( ext             v3.16b, v3.16b, v3.16b, #8 )
-CPU_LE( ext             v4.16b, v4.16b, v4.16b, #8 )
-CPU_LE( ext             v5.16b, v5.16b, v5.16b, #8 )
-CPU_LE( ext             v6.16b, v6.16b, v6.16b, #8 )
-CPU_LE( ext             v7.16b, v7.16b, v7.16b, #8 )
+        .macro          __pmull_pre_p8, bd
+        tbl             bd1.16b, {\bd\().16b}, perm1.16b
+        tbl             bd2.16b, {\bd\().16b}, perm2.16b
+        tbl             bd3.16b, {\bd\().16b}, perm3.16b
+        tbl             bd4.16b, {\bd\().16b}, perm4.16b
+        .endm

-        // XOR the initial_crc value
-        eor             v0.16b, v0.16b, v10.16b
+SYM_FUNC_START_LOCAL(__pmull_p8_core)
+.L__pmull_p8_core:
+        ext             t4.8b, ad.8b, ad.8b, #1         // A1
+        ext             t5.8b, ad.8b, ad.8b, #2         // A2
+        ext             t6.8b, ad.8b, ad.8b, #3         // A3

-        ldr_l           q10, rk3, x8    // xmm10 has rk3 and rk4
-                                        // type of pmull instruction
-                                        // will determine which constant to use
+        pmull           t4.8h, t4.8b, fold_consts.8b    // F = A1*B
+        pmull           t8.8h, ad.8b, bd1.8b            // E = A*B1
+        pmull           t5.8h, t5.8b, fold_consts.8b    // H = A2*B
+        pmull           t7.8h, ad.8b, bd2.8b            // G = A*B2
+        pmull           t6.8h, t6.8b, fold_consts.8b    // J = A3*B
+        pmull           t9.8h, ad.8b, bd3.8b            // I = A*B3
+        pmull           t3.8h, ad.8b, bd4.8b            // K = A*B4
+        b               0f

-        //
-        // we subtract 256 instead of 128 to save one instruction from the loop
-        //
-        sub             arg3, arg3, #256
+.L__pmull_p8_core2:
+        tbl             t4.16b, {ad.16b}, perm1.16b     // A1
+        tbl             t5.16b, {ad.16b}, perm2.16b     // A2
+        tbl             t6.16b, {ad.16b}, perm3.16b     // A3

-        // at this section of the code, there is 64*x+y (0<=y<64) bytes of
-        // buffer. The _fold_64_B_loop will fold 64B at a time
-        // until we have 64+y Bytes of buffer
+        pmull2          t4.8h, t4.16b, fold_consts.16b  // F = A1*B
+        pmull2          t8.8h, ad.16b, bd1.16b          // E = A*B1
+        pmull2          t5.8h, t5.16b, fold_consts.16b  // H = A2*B
+        pmull2          t7.8h, ad.16b, bd2.16b          // G = A*B2
+        pmull2          t6.8h, t6.16b, fold_consts.16b  // J = A3*B
+        pmull2          t9.8h, ad.16b, bd3.16b          // I = A*B3
+        pmull2          t3.8h, ad.16b, bd4.16b          // K = A*B4

+0:      eor             t4.16b, t4.16b, t8.16b          // L = E + F
+        eor             t5.16b, t5.16b, t7.16b          // M = G + H
+        eor             t6.16b, t6.16b, t9.16b          // N = I + J

-        // fold 64B at a time. This section of the code folds 4 vector
-        // registers in parallel
-_fold_64_B_loop:
+        uzp1            t8.2d, t4.2d, t5.2d
+        uzp2            t4.2d, t4.2d, t5.2d
+        uzp1            t7.2d, t6.2d, t3.2d
+        uzp2            t6.2d, t6.2d, t3.2d

-        .macro          fold64, reg1, reg2
-        ldp             q11, q12, [arg2], #0x20
+        // t4 = (L) (P0 + P1) << 8
+        // t5 = (M) (P2 + P3) << 16
+        eor             t8.16b, t8.16b, t4.16b
+        and             t4.16b, t4.16b, k32_48.16b

-        pmull2          v8.1q, \reg1\().2d, v10.2d
-        pmull           \reg1\().1q, \reg1\().1d, v10.1d
+        // t6 = (N) (P4 + P5) << 24
+        // t7 = (K) (P6 + P7) << 32
+        eor             t7.16b, t7.16b, t6.16b
+        and             t6.16b, t6.16b, k00_16.16b
+
+        eor             t8.16b, t8.16b, t4.16b
+        eor             t7.16b, t7.16b, t6.16b
+
+        zip2            t5.2d, t8.2d, t4.2d
+        zip1            t4.2d, t8.2d, t4.2d
+        zip2            t3.2d, t7.2d, t6.2d
+        zip1            t6.2d, t7.2d, t6.2d
+
+        ext             t4.16b, t4.16b, t4.16b, #15
+        ext             t5.16b, t5.16b, t5.16b, #14
+        ext             t6.16b, t6.16b, t6.16b, #13
+        ext             t3.16b, t3.16b, t3.16b, #12
+
+        eor             t4.16b, t4.16b, t5.16b
+        eor             t6.16b, t6.16b, t3.16b
+        ret
+SYM_FUNC_END(__pmull_p8_core)
+
+        .macro          __pmull_p8, rq, ad, bd, i
+        .ifnc           \bd, fold_consts
+        .err
+        .endif
+        mov             ad.16b, \ad\().16b
+        .ifb            \i
+        pmull           \rq\().8h, \ad\().8b, \bd\().8b         // D = A*B
+        .else
+        pmull2          \rq\().8h, \ad\().16b, \bd\().16b       // D = A*B
+        .endif
+
+        bl              .L__pmull_p8_core\i
+
+        eor             \rq\().16b, \rq\().16b, t4.16b
+        eor             \rq\().16b, \rq\().16b, t6.16b
+        .endm
+
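The __pmull_p8* helpers above emulate the 64x64 -> 128-bit carryless multiply (what the Crypto Extensions pmull/pmull2 on .1d/.2d do in one instruction) using only the baseline NEON 8x8 -> 16-bit polynomial multiply. The tbl-rotated operand copies (A1-A3, bd1-bd4) and the uzp/zip/ext shuffles schedule and realign the byte-wise partial products. As a minimal sketch of the arithmetic being computed, not of the register scheduling, in Python:

    def clmul8(a, b):
        # One lane of the NEON 'pmull .8h, .8b, .8b' form:
        # an 8 x 8 -> 16-bit carryless (polynomial) multiply.
        r = 0
        for i in range(8):
            if (b >> i) & 1:
                r ^= a << i
        return r

    def clmul64(a, b):
        # 64 x 64 -> 128-bit carryless multiply, as a schoolbook product
        # over the 8 bytes of each operand; __pmull_p8 arrives at the same
        # value with 7 pmull instructions plus shuffles and masking.
        r = 0
        for i in range(8):
            for j in range(8):
                r ^= clmul8((a >> 8*i) & 0xff, (b >> 8*j) & 0xff) << (8*(i + j))
        return r

On cores with the Crypto Extensions, the __pmull_p64 macro below produces the same 128-bit product with a single pmull/pmull2.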
+        // Fold reg1, reg2 into the next 32 data bytes, storing the result back
+        // into reg1, reg2.
+        .macro          fold_32_bytes, p, reg1, reg2
+        ldp             q11, q12, [buf], #0x20
+
+        __pmull_\p      v8, \reg1, fold_consts, 2
+        __pmull_\p      \reg1, \reg1, fold_consts

 CPU_LE( rev64           v11.16b, v11.16b        )
 CPU_LE( rev64           v12.16b, v12.16b        )

-        pmull2          v9.1q, \reg2\().2d, v10.2d
-        pmull           \reg2\().1q, \reg2\().1d, v10.1d
+        __pmull_\p      v9, \reg2, fold_consts, 2
+        __pmull_\p      \reg2, \reg2, fold_consts

 CPU_LE( ext             v11.16b, v11.16b, v11.16b, #8 )
 CPU_LE( ext             v12.16b, v12.16b, v12.16b, #8 )
...
         eor             \reg2\().16b, \reg2\().16b, v12.16b
         .endm

-        fold64          v0, v1
-        fold64          v2, v3
-        fold64          v4, v5
-        fold64          v6, v7
-
-        subs            arg3, arg3, #128
-
-        // check if there is another 64B in the buffer to be able to fold
-        b.lt            _fold_64_B_end
-
-        if_will_cond_yield_neon
-        stp             q0, q1, [sp, #.Lframe_local_offset]
-        stp             q2, q3, [sp, #.Lframe_local_offset + 32]
-        stp             q4, q5, [sp, #.Lframe_local_offset + 64]
-        stp             q6, q7, [sp, #.Lframe_local_offset + 96]
-        do_cond_yield_neon
-        ldp             q0, q1, [sp, #.Lframe_local_offset]
-        ldp             q2, q3, [sp, #.Lframe_local_offset + 32]
-        ldp             q4, q5, [sp, #.Lframe_local_offset + 64]
-        ldp             q6, q7, [sp, #.Lframe_local_offset + 96]
-        ldr_l           q10, rk3, x8
-        movi            vzr.16b, #0             // init zero register
-        endif_yield_neon
-
-        b               _fold_64_B_loop
-
-_fold_64_B_end:
-        // at this point, the buffer pointer is pointing at the last y Bytes
-        // of the buffer the 64B of folded data is in 4 of the vector
-        // registers: v0, v1, v2, v3
-
-        // fold the 8 vector registers to 1 vector register with different
-        // constants
-
-        ldr_l           q10, rk9, x8
-
-        .macro          fold16, reg, rk
-        pmull           v8.1q, \reg\().1d, v10.1d
-        pmull2          \reg\().1q, \reg\().2d, v10.2d
-        .ifnb           \rk
-        ldr_l           q10, \rk, x8
+        // Fold src_reg into dst_reg, optionally loading the next fold constants
+        .macro          fold_16_bytes, p, src_reg, dst_reg, load_next_consts
+        __pmull_\p      v8, \src_reg, fold_consts
+        __pmull_\p      \src_reg, \src_reg, fold_consts, 2
+        .ifnb           \load_next_consts
+        ld1             {fold_consts.2d}, [fold_consts_ptr], #16
+        __pmull_pre_\p  fold_consts
         .endif
-        eor             v7.16b, v7.16b, v8.16b
-        eor             v7.16b, v7.16b, \reg\().16b
+        eor             \dst_reg\().16b, \dst_reg\().16b, v8.16b
+        eor             \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
         .endm

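Both folding macros rely on a single identity over GF(2)[x]: if a 128-bit chunk F = H*x^64 + L lies N bytes ahead of the current position, then F*x^(8N) is congruent mod G(x) to H*(x^(8N+64) mod G) + L*(x^(8N) mod G), and the right-hand side still fits in 128 bits. The pmull2 form computes the H term from the high doublewords, pmull the L term from the low ones, and the eor instructions add in the data being folded into. A sketch under those definitions (the helper names here are mine, not the kernel's):

    G = 0x18bb7                       # G(x), the CRC-T10DIF polynomial

    def clmul(a, b):                  # carryless multiply over GF(2)
        r = 0
        while b:
            if b & 1:
                r ^= a
            a <<= 1
            b >>= 1
        return r

    def polymod(a, g):                # a(x) mod g(x) over GF(2)
        while a.bit_length() >= g.bit_length():
            a ^= g << (a.bit_length() - g.bit_length())
        return a

    def fold(F, D, nbytes=16):
        # Replace chunk F, sitting 'nbytes' ahead of chunk D, by a single
        # 128-bit value congruent to F * x^(8*nbytes) + D  (mod G).
        c1 = polymod(1 << (8 * nbytes), G)        # e.g. 0xa010 for nbytes=16
        c2 = polymod(1 << (8 * nbytes + 64), G)   # e.g. 0x1faa for nbytes=16
        H, L = F >> 64, F & ((1 << 64) - 1)
        return clmul(H, c2) ^ clmul(L, c1) ^ D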
-        fold16          v0, rk11
-        fold16          v1, rk13
-        fold16          v2, rk15
-        fold16          v3, rk17
-        fold16          v4, rk19
-        fold16          v5, rk1
-        fold16          v6
+        .macro          __pmull_p64, rd, rn, rm, n
+        .ifb            \n
+        pmull           \rd\().1q, \rn\().1d, \rm\().1d
+        .else
+        pmull2          \rd\().1q, \rn\().2d, \rm\().2d
+        .endif
+        .endm

-        // instead of 64, we add 48 to the loop counter to save 1 instruction
-        // from the loop instead of a cmp instruction, we use the negative
-        // flag with the jl instruction
-        adds            arg3, arg3, #(128-16)
-        b.lt            _final_reduction_for_128
+        .macro          crc_t10dif_pmull, p
+        __pmull_init_\p

-        // now we have 16+y bytes left to reduce. 16 Bytes is in register v7
-        // and the rest is in memory. We can fold 16 bytes at a time if y>=16
-        // continue folding 16B at a time
+        // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
+        cmp             len, #256
+        b.lt            .Lless_than_256_bytes_\@

-_16B_reduction_loop:
-        pmull           v8.1q, v7.1d, v10.1d
-        pmull2          v7.1q, v7.2d, v10.2d
+        adr_l           fold_consts_ptr, .Lfold_across_128_bytes_consts
+
+        // Load the first 128 data bytes.  Byte swapping is necessary to make
+        // the bit order match the polynomial coefficient order.
+        ldp             q0, q1, [buf]
+        ldp             q2, q3, [buf, #0x20]
+        ldp             q4, q5, [buf, #0x40]
+        ldp             q6, q7, [buf, #0x60]
+        add             buf, buf, #0x80
+CPU_LE( rev64           v0.16b, v0.16b          )
+CPU_LE( rev64           v1.16b, v1.16b          )
+CPU_LE( rev64           v2.16b, v2.16b          )
+CPU_LE( rev64           v3.16b, v3.16b          )
+CPU_LE( rev64           v4.16b, v4.16b          )
+CPU_LE( rev64           v5.16b, v5.16b          )
+CPU_LE( rev64           v6.16b, v6.16b          )
+CPU_LE( rev64           v7.16b, v7.16b          )
+CPU_LE( ext             v0.16b, v0.16b, v0.16b, #8 )
+CPU_LE( ext             v1.16b, v1.16b, v1.16b, #8 )
+CPU_LE( ext             v2.16b, v2.16b, v2.16b, #8 )
+CPU_LE( ext             v3.16b, v3.16b, v3.16b, #8 )
+CPU_LE( ext             v4.16b, v4.16b, v4.16b, #8 )
+CPU_LE( ext             v5.16b, v5.16b, v5.16b, #8 )
+CPU_LE( ext             v6.16b, v6.16b, v6.16b, #8 )
+CPU_LE( ext             v7.16b, v7.16b, v7.16b, #8 )
+
+        // XOR the first 16 data *bits* with the initial CRC value.
+        movi            v8.16b, #0
+        mov             v8.h[7], init_crc
+        eor             v0.16b, v0.16b, v8.16b
+
+        // Load the constants for folding across 128 bytes.
+        ld1             {fold_consts.2d}, [fold_consts_ptr]
+        __pmull_pre_\p  fold_consts
+
+        // Subtract 128 for the 128 data bytes just consumed.  Subtract another
+        // 128 to simplify the termination condition of the following loop.
+        sub             len, len, #256
+
+        // While >= 128 data bytes remain (not counting v0-v7), fold the 128
+        // bytes v0-v7 into them, storing the result back into v0-v7.
+.Lfold_128_bytes_loop_\@:
+        fold_32_bytes   \p, v0, v1
+        fold_32_bytes   \p, v2, v3
+        fold_32_bytes   \p, v4, v5
+        fold_32_bytes   \p, v6, v7
+
+        subs            len, len, #128
+        b.ge            .Lfold_128_bytes_loop_\@
+
+        // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
+
+        // Fold across 64 bytes.
+        add             fold_consts_ptr, fold_consts_ptr, #16
+        ld1             {fold_consts.2d}, [fold_consts_ptr], #16
+        __pmull_pre_\p  fold_consts
+        fold_16_bytes   \p, v0, v4
+        fold_16_bytes   \p, v1, v5
+        fold_16_bytes   \p, v2, v6
+        fold_16_bytes   \p, v3, v7, 1
+        // Fold across 32 bytes.
+        fold_16_bytes   \p, v4, v6
+        fold_16_bytes   \p, v5, v7, 1
+        // Fold across 16 bytes.
+        fold_16_bytes   \p, v6, v7
+
+        // Add 128 to get the correct number of data bytes remaining in 0...127
+        // (not counting v7), following the previous extra subtraction by 128.
+        // Then subtract 16 to simplify the termination condition of the
+        // following loop.
+        adds            len, len, #(128-16)
+
+        // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
+        // into them, storing the result back into v7.
+        b.lt            .Lfold_16_bytes_loop_done_\@
+.Lfold_16_bytes_loop_\@:
+        __pmull_\p      v8, v7, fold_consts
+        __pmull_\p      v7, v7, fold_consts, 2
         eor             v7.16b, v7.16b, v8.16b
-
-        ldr             q0, [arg2], #16
+        ldr             q0, [buf], #16
 CPU_LE( rev64           v0.16b, v0.16b          )
 CPU_LE( ext             v0.16b, v0.16b, v0.16b, #8 )
         eor             v7.16b, v7.16b, v0.16b
-        subs            arg3, arg3, #16
+        subs            len, len, #16
+        b.ge            .Lfold_16_bytes_loop_\@

-        // instead of a cmp instruction, we utilize the flags with the
-        // jge instruction equivalent of: cmp arg3, 16-16
-        // check if there is any more 16B in the buffer to be able to fold
-        b.ge            _16B_reduction_loop
+.Lfold_16_bytes_loop_done_\@:
+        // Add 16 to get the correct number of data bytes remaining in 0...15
+        // (not counting v7), following the previous extra subtraction by 16.
+        adds            len, len, #16
+        b.eq            .Lreduce_final_16_bytes_\@

-        // now we have 16+z bytes left to reduce, where 0<= z < 16.
-        // first, we reduce the data in the xmm7 register
+.Lhandle_partial_segment_\@:
+        // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
+        // 16 bytes are in v7 and the rest are the remaining data in 'buf'.  To
+        // do this without needing a fold constant for each possible 'len',
+        // redivide the bytes into a first chunk of 'len' bytes and a second
+        // chunk of 16 bytes, then fold the first chunk into the second.

-_final_reduction_for_128:
-        // check if any more data to fold. If not, compute the CRC of
-        // the final 128 bits
-        adds            arg3, arg3, #16
-        b.eq            _128_done
+        // v0 = last 16 original data bytes
+        add             buf, buf, len
+        ldr             q0, [buf, #-16]
+CPU_LE( rev64           v0.16b, v0.16b          )
+CPU_LE( ext             v0.16b, v0.16b, v0.16b, #8 )

-        // here we are getting data that is less than 16 bytes.
-        // since we know that there was data before the pointer, we can
-        // offset the input pointer before the actual point, to receive
-        // exactly 16 bytes. after that the registers need to be adjusted.
-_get_last_two_regs:
-        add             arg2, arg2, arg3
-        ldr             q1, [arg2, #-16]
-CPU_LE( rev64           v1.16b, v1.16b          )
-CPU_LE( ext             v1.16b, v1.16b, v1.16b, #8 )
+        // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
+        adr_l           x4, .Lbyteshift_table + 16
+        sub             x4, x4, len
+        ld1             {v2.16b}, [x4]
+        tbl             v1.16b, {v7.16b}, v2.16b

-        // get rid of the extra data that was loaded before
-        // load the shift constant
-        adr_l           x4, tbl_shf_table + 16
-        sub             x4, x4, arg3
-        ld1             {v0.16b}, [x4]
+        // v3 = first chunk: v7 right-shifted by '16-len' bytes.
+        movi            v3.16b, #0x80
+        eor             v2.16b, v2.16b, v3.16b
+        tbl             v3.16b, {v7.16b}, v2.16b

-        // shift v2 to the left by arg3 bytes
-        tbl             v2.16b, {v7.16b}, v0.16b
+        // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
+        sshr            v2.16b, v2.16b, #7

-        // shift v7 to the right by 16-arg3 bytes
-        movi            v9.16b, #0x80
-        eor             v0.16b, v0.16b, v9.16b
-        tbl             v7.16b, {v7.16b}, v0.16b
+        // v2 = second chunk: 'len' bytes from v0 (low-order bytes),
+        // then '16-len' bytes from v1 (high-order bytes).
+        bsl             v2.16b, v1.16b, v0.16b

-        // blend
-        sshr            v0.16b, v0.16b, #7      // convert to 8-bit mask
-        bsl             v0.16b, v2.16b, v1.16b
-
-        // fold 16 Bytes
-        pmull           v8.1q, v7.1d, v10.1d
-        pmull2          v7.1q, v7.2d, v10.2d
-        eor             v7.16b, v7.16b, v8.16b
+        // Fold the first chunk into the second chunk, storing the result in v7.
+        __pmull_\p      v0, v3, fold_consts
+        __pmull_\p      v7, v3, fold_consts, 2
         eor             v7.16b, v7.16b, v0.16b
+        eor             v7.16b, v7.16b, v2.16b

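In Python terms, the redivision performed by .Lhandle_partial_segment looks like the following (reusing fold() from the earlier sketch; big-endian byte strings model the coefficient order that the rev64/ext pairs establish, and the function name is mine):

    def handle_partial_segment(v7: bytes, tail: bytes):
        # v7: the 16 accumulated bytes; tail: the final 1..15 data bytes.
        n = len(tail)
        first = v7[:n]                   # oldest n bytes ('v3' in the asm)
        second = v7[n:] + tail           # exactly 16 bytes ('v2' in the asm)
        # The suffix v7 || tail equals first * x^128 + second, so the
        # ordinary fold-across-16-bytes constants finish the job:
        F = int.from_bytes(first, 'big')
        S = int.from_bytes(second, 'big')
        return fold(F, S)                # 128 bits, congruent mod G(x)

The vector code builds 'first' and 'second' in registers with the tbl/sshr/bsl sequence rather than by re-slicing memory, but the algebra is the same.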
-_128_done:
-        // compute crc of a 128-bit value
-        ldr_l           q10, rk5, x8            // rk5 and rk6 in xmm10
+.Lreduce_final_16_bytes_\@:
+        // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.

-        // 64b fold
-        ext             v0.16b, vzr.16b, v7.16b, #8
-        mov             v7.d[0], v7.d[1]
-        pmull           v7.1q, v7.1d, v10.1d
-        eor             v7.16b, v7.16b, v0.16b
+        movi            v2.16b, #0              // init zero register

-        // 32b fold
-        ext             v0.16b, v7.16b, vzr.16b, #4
-        mov             v7.s[3], vzr.s[0]
-        pmull2          v0.1q, v0.2d, v10.2d
-        eor             v7.16b, v7.16b, v0.16b
+        // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+        ld1             {fold_consts.2d}, [fold_consts_ptr], #16
+        __pmull_pre_\p  fold_consts

-        // barrett reduction
-_barrett:
-        ldr_l           q10, rk7, x8
-        mov             v0.d[0], v7.d[1]
+        // Fold the high 64 bits into the low 64 bits, while also multiplying by
+        // x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
+        // whose low 48 bits are 0.
+        ext             v0.16b, v2.16b, v7.16b, #8
+        __pmull_\p      v7, v7, fold_consts, 2  // high bits * x^48 * (x^80 mod G(x))
+        eor             v0.16b, v0.16b, v7.16b  // + low bits * x^64

-        pmull           v0.1q, v0.1d, v10.1d
-        ext             v0.16b, vzr.16b, v0.16b, #12
-        pmull2          v0.1q, v0.2d, v10.2d
-        ext             v0.16b, vzr.16b, v0.16b, #12
-        eor             v7.16b, v7.16b, v0.16b
-        mov             w0, v7.s[1]
+        // Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
+        // value congruent to x^64 * M(x) and whose low 48 bits are 0.
+        ext             v1.16b, v0.16b, v2.16b, #12     // extract high 32 bits
+        mov             v0.s[3], v2.s[0]                // zero high 32 bits
+        __pmull_\p      v1, v1, fold_consts     // high 32 bits * x^48 * (x^48 mod G(x))
+        eor             v0.16b, v0.16b, v1.16b  // + low bits

-_cleanup:
-        // scale the result back to 16 bits
-        lsr             x0, x0, #16
-        frame_pop
+        // Load G(x) and floor(x^48 / G(x)).
+        ld1             {fold_consts.2d}, [fold_consts_ptr]
+        __pmull_pre_\p  fold_consts
+
+        // Use Barrett reduction to compute the final CRC value.
+        __pmull_\p      v1, v0, fold_consts, 2  // high 32 bits * floor(x^48 / G(x))
+        ushr            v1.2d, v1.2d, #32       // /= x^32
+        __pmull_\p      v1, v1, fold_consts     // *= G(x)
+        ushr            v0.2d, v0.2d, #48
+        eor             v0.16b, v0.16b, v1.16b  // + low 16 nonzero bits
+        // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
+
+        umov            w0, v0.h[0]
+        .ifc            \p, p8
+        ldp             x29, x30, [sp], #16
+        .endif
         ret

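At this point the two partial folds have left v0 = T * x^48, where T is a 48-bit polynomial with T congruent to x^16 * M(x) mod G(x); Barrett reduction then computes T mod G(x) with two more carryless multiplies instead of a runtime polynomial division. A sketch with hypothetical helper names (clmul and polymod as in the fold sketch earlier):

    G = 0x18bb7

    def polydiv(a, g):
        # floor(a(x) / g(x)) over GF(2)
        q = 0
        while a.bit_length() >= g.bit_length():
            s = a.bit_length() - g.bit_length()
            q ^= 1 << s
            a ^= g << s
        return q

    MU = polydiv(1 << 48, G)            # floor(x^48 / G(x)) == 0x1f65a57f8

    def barrett_reduce(T):
        # T: 48-bit polynomial; returns T mod G(x), the final CRC.
        q = clmul(T >> 16, MU) >> 32    # matches floor(T(x)/G(x)) for deg T < 48
        return (T ^ clmul(q, G)) & 0xffff

    assert barrett_reduce(0x123456789abc) == polymod(0x123456789abc, G)

Since T ^ q*G(x) has degree below 16, the low halfword extracted by 'umov w0, v0.h[0]' is the finished CRC; the register's upper bits are allowed to stay nonzero.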
-_less_than_128:
-        cbz             arg3, _cleanup
+.Lless_than_256_bytes_\@:
+        // Checksumming a buffer of length 16...255 bytes

-        movi            v0.16b, #0
-        mov             v0.s[3], arg1_low32     // get the initial crc value
+        adr_l           fold_consts_ptr, .Lfold_across_16_bytes_consts

-        ldr             q7, [arg2], #0x10
+        // Load the first 16 data bytes.
+        ldr             q7, [buf], #0x10
 CPU_LE( rev64           v7.16b, v7.16b          )
 CPU_LE( ext             v7.16b, v7.16b, v7.16b, #8 )
-        eor             v7.16b, v7.16b, v0.16b  // xor the initial crc value

-        cmp             arg3, #16
-        b.eq            _128_done               // exactly 16 left
-        b.lt            _less_than_16_left
+        // XOR the first 16 data *bits* with the initial CRC value.
+        movi            v0.16b, #0
+        mov             v0.h[7], init_crc
+        eor             v7.16b, v7.16b, v0.16b

-        ldr_l           q10, rk1, x8            // rk1 and rk2 in xmm10
+        // Load the fold-across-16-bytes constants.
+        ld1             {fold_consts.2d}, [fold_consts_ptr], #16
+        __pmull_pre_\p  fold_consts

-        // update the counter. subtract 32 instead of 16 to save one
-        // instruction from the loop
-        subs            arg3, arg3, #32
-        b.ge            _16B_reduction_loop
+        cmp             len, #16
+        b.eq            .Lreduce_final_16_bytes_\@      // len == 16
+        subs            len, len, #32
+        b.ge            .Lfold_16_bytes_loop_\@         // 32 <= len <= 255
+        add             len, len, #16
+        b               .Lhandle_partial_segment_\@     // 17 <= len <= 31
+        .endm

-        add             arg3, arg3, #16
-        b               _get_last_two_regs
+//
+// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+SYM_FUNC_START(crc_t10dif_pmull_p8)
+        stp             x29, x30, [sp, #-16]!
+        mov             x29, sp
+        crc_t10dif_pmull p8
+SYM_FUNC_END(crc_t10dif_pmull_p8)

-_less_than_16_left:
-        // shl r9, 4
-        adr_l           x0, tbl_shf_table + 16
-        sub             x0, x0, arg3
-        ld1             {v0.16b}, [x0]
-        movi            v9.16b, #0x80
-        eor             v0.16b, v0.16b, v9.16b
-        tbl             v7.16b, {v7.16b}, v0.16b
-        b               _128_done
-ENDPROC(crc_t10dif_pmull)
+        .align          5
+//
+// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+SYM_FUNC_START(crc_t10dif_pmull_p64)
+        crc_t10dif_pmull p64
+SYM_FUNC_END(crc_t10dif_pmull_p64)

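For checking either entry point from a test harness, a bit-at-a-time reference for CRC-T10DIF (polynomial 0x8bb7, MSB first, zero initial value, no final XOR; check value 0xd0db per the RevEng catalogue). It also shows why the assembly merely XORs init_crc into the first 16 data bits: for an MSB-first CRC, the initial value enters the computation at exactly that position:

    def crc_t10dif(data: bytes, crc: int = 0) -> int:
        # Reference: CRC = M(x) * x^16 mod G(x), processed MSB first.
        for byte in data:
            crc ^= byte << 8
            for _ in range(8):
                crc <<= 1
                if crc & 0x10000:
                    crc ^= 0x18bb7
        return crc

    assert crc_t10dif(b'123456789') == 0xd0db     # catalogue check value

    # Folding the initial CRC into the first two message bytes is equivalent
    # to seeding the register with it (needs len >= 2, hence the len >= 16
    # requirement is more than enough):
    init, msg = 0x1234, bytearray(b'123456789')
    msg[0] ^= init >> 8
    msg[1] ^= init & 0xff
    assert crc_t10dif(bytes(msg)) == crc_t10dif(b'123456789', init)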
-// precomputed constants
-// these constants are precomputed from the poly:
-// 0x8bb70000 (0x8bb7 scaled to 32 bits)
         .section        ".rodata", "a"
         .align          4
-// Q = 0x18BB70000
-// rk1 = 2^(32*3) mod Q << 32
-// rk2 = 2^(32*5) mod Q << 32
-// rk3 = 2^(32*15) mod Q << 32
-// rk4 = 2^(32*17) mod Q << 32
-// rk5 = 2^(32*3) mod Q << 32
-// rk6 = 2^(32*2) mod Q << 32
-// rk7 = floor(2^64/Q)
-// rk8 = Q

-rk1:    .octa           0x06df0000000000002d56000000000000
-rk3:    .octa           0x7cf50000000000009d9d000000000000
-rk5:    .octa           0x13680000000000002d56000000000000
-rk7:    .octa           0x000000018bb7000000000001f65a57f8
-rk9:    .octa           0xbfd6000000000000ceae000000000000
-rk11:   .octa           0x713c0000000000001e16000000000000
-rk13:   .octa           0x80a6000000000000f7f9000000000000
-rk15:   .octa           0xe658000000000000044c000000000000
-rk17:   .octa           0xa497000000000000ad18000000000000
-rk19:   .octa           0xe7b50000000000006ee3000000000000
+// Fold constants precomputed from the polynomial 0x18bb7
+// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+.Lfold_across_128_bytes_consts:
+        .quad           0x0000000000006123      // x^(8*128)    mod G(x)
+        .quad           0x0000000000002295      // x^(8*128+64) mod G(x)
+// .Lfold_across_64_bytes_consts:
+        .quad           0x0000000000001069      // x^(4*128)    mod G(x)
+        .quad           0x000000000000dd31      // x^(4*128+64) mod G(x)
+// .Lfold_across_32_bytes_consts:
+        .quad           0x000000000000857d      // x^(2*128)    mod G(x)
+        .quad           0x0000000000007acc      // x^(2*128+64) mod G(x)
+.Lfold_across_16_bytes_consts:
+        .quad           0x000000000000a010      // x^(1*128)    mod G(x)
+        .quad           0x0000000000001faa      // x^(1*128+64) mod G(x)
+// .Lfinal_fold_consts:
+        .quad           0x1368000000000000      // x^48 * (x^48 mod G(x))
+        .quad           0x2d56000000000000      // x^48 * (x^80 mod G(x))
+// .Lbarrett_reduction_consts:
+        .quad           0x0000000000018bb7      // G(x)
+        .quad           0x00000001f65a57f8      // floor(x^48 / G(x))
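Every value in this table follows mechanically from G(x), so it can be regenerated (or audited) with a few lines of Python that implement the definitions given in the comments:

    G = 0x18bb7                         # G(x), including the x^16 term

    def polymod(a, g):                  # a(x) mod g(x) over GF(2)
        while a.bit_length() >= g.bit_length():
            a ^= g << (a.bit_length() - g.bit_length())
        return a

    def polydiv(a, g):                  # floor(a(x) / g(x)) over GF(2)
        q = 0
        while a.bit_length() >= g.bit_length():
            s = a.bit_length() - g.bit_length()
            q ^= 1 << s
            a ^= g << s
        return q

    for k in (8*128, 8*128 + 64, 4*128, 4*128 + 64,
              2*128, 2*128 + 64, 1*128, 1*128 + 64):
        print(f'.quad {polymod(1 << k, G):#018x}  // x^{k} mod G(x)')
    print(f'.quad {polymod(1 << 48, G) << 48:#018x}  // x^48 * (x^48 mod G(x))')
    print(f'.quad {polymod(1 << 80, G) << 48:#018x}  // x^48 * (x^80 mod G(x))')
    print(f'.quad {G:#018x}  // G(x)')
    print(f'.quad {polydiv(1 << 48, G):#018x}  // floor(x^48 / G(x))')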

-tbl_shf_table:
-// use these values for shift constants for the tbl/tbx instruction
-// different alignments result in values as shown:
-// DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
-// DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
-// DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
-// DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
-// DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
-// DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
-// DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
-// DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
-// DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
-// DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
-// DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
-// DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
-// DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
-// DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
-// DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
-
+// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
+// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
+// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
+.Lbyteshift_table:
         .byte           0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
         .byte           0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
         .byte           0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
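(The table's final .byte row is elided above.) The dual use the comment describes can be modeled directly: AArch64 TBL with a single source register returns 0x00 for any out-of-range index, so entries of the form 0x8N never select a byte. A sketch for len = 3, using the rows shown above (byte index i here means lane i of the vector):

    def tbl(src: bytes, idx: bytes) -> bytes:
        # AArch64 TBL, one source register: out-of-range index -> 0x00.
        return bytes(src[i] if i < 16 else 0 for i in idx)

    v7 = bytes(range(0x40, 0x50))                       # arbitrary 16-byte value
    idx = bytes([0x8d, 0x8e, 0x8f] + list(range(13)))   # table[16-3 .. 16-3+15]
    assert tbl(v7, idx) == bytes(3) + v7[:13]           # bytes move up by 3
    idx2 = bytes(b ^ 0x80 for b in idx)
    assert tbl(v7, idx2) == v7[13:] + bytes(13)         # bytes move down by 16-3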
|---|