```diff
 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-########################################################################
-# Function API:
-# UINT16 crc_t10dif_pcl(
-# UINT16 init_crc, //initial CRC value, 16 bits
-# const unsigned char *buf, //buffer pointer to calculate CRC on
-# UINT64 len //buffer length in bytes (64-bit data)
-# );
 #
 # Reference paper titled "Fast CRC Computation for Generic
 # Polynomials Using PCLMULQDQ Instruction"
 # URL: http://www.intel.com/content/dam/www/public/us/en/documents
 # /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
 #
-#

 #include <linux/linkage.h>

 .text

-#define arg1 %rdi
-#define arg2 %rsi
-#define arg3 %rdx
+#define init_crc %edi
+#define buf %rsi
+#define len %rdx

-#define arg1_low32 %edi
+#define FOLD_CONSTS %xmm10
+#define BSWAP_MASK %xmm11

-ENTRY(crc_t10dif_pcl)
+# Fold reg1, reg2 into the next 32 data bytes, storing the result back into
+# reg1, reg2.
+.macro fold_32_bytes offset, reg1, reg2
+	movdqu \offset(buf), %xmm9
+	movdqu \offset+16(buf), %xmm12
+	pshufb BSWAP_MASK, %xmm9
+	pshufb BSWAP_MASK, %xmm12
+	movdqa \reg1, %xmm8
+	movdqa \reg2, %xmm13
+	pclmulqdq $0x00, FOLD_CONSTS, \reg1
+	pclmulqdq $0x11, FOLD_CONSTS, %xmm8
+	pclmulqdq $0x00, FOLD_CONSTS, \reg2
+	pclmulqdq $0x11, FOLD_CONSTS, %xmm13
+	pxor %xmm9 , \reg1
+	xorps %xmm8 , \reg1
+	pxor %xmm12, \reg2
+	xorps %xmm13, \reg2
+.endm
+
+# Fold src_reg into dst_reg.
+.macro fold_16_bytes src_reg, dst_reg
+	movdqa \src_reg, %xmm8
+	pclmulqdq $0x11, FOLD_CONSTS, \src_reg
+	pclmulqdq $0x00, FOLD_CONSTS, %xmm8
+	pxor %xmm8, \dst_reg
+	xorps \src_reg, \dst_reg
+.endm
+
```
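The two macros above do the real work: each 64-bit half of a 128-bit accumulator is carry-less multiplied by one half of FOLD_CONSTS (an `x^N mod G(x)` constant) and XORed into data that lies N bits further along. A rough scalar model of one fold step, using a hypothetical `clmul64()`/`fold_chunk()` pair in place of PCLMULQDQ and ignoring the byte reflection handled by BSWAP_MASK (a sketch, not the kernel code):

```c
#include <stdint.h>

/* Software model of one PCLMULQDQ: carry-less multiply of two 64-bit
 * polynomials over GF(2), returning the 128-bit product as lo/hi halves. */
static void clmul64(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
{
	uint64_t l = 0, h = 0;

	for (int i = 0; i < 64; i++) {
		if (b & (1ULL << i)) {
			l ^= a << i;
			h ^= i ? a >> (64 - i) : 0;
		}
	}
	*lo = l;
	*hi = h;
}

struct chunk128 { uint64_t lo, hi; };

/* Model of fold_16_bytes: fold the 128-bit accumulator 'src' into the next
 * 128-bit chunk 'next'.  c_lo and c_hi stand for the two halves of
 * FOLD_CONSTS, i.e. x^N mod G(x) and x^(N+64) mod G(x). */
static struct chunk128 fold_chunk(struct chunk128 src, struct chunk128 next,
				  uint64_t c_lo, uint64_t c_hi)
{
	uint64_t l0, h0, l1, h1;

	clmul64(src.lo, c_lo, &l0, &h0);	/* pclmulqdq $0x00 */
	clmul64(src.hi, c_hi, &l1, &h1);	/* pclmulqdq $0x11 */
	next.lo ^= l0 ^ l1;
	next.hi ^= h0 ^ h1;
	return next;
}
```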
```diff
+#
+# u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);
+#
+# Assumes len >= 16.
+#
 .align 16
+SYM_FUNC_START(crc_t10dif_pcl)

-	# adjust the 16-bit initial_crc value, scale it to 32 bits
-	shl $16, arg1_low32
+	movdqa .Lbswap_mask(%rip), BSWAP_MASK

-	# Allocate Stack Space
-	mov %rsp, %rcx
-	sub $16*2, %rsp
-	# align stack to 16 byte boundary
-	and $~(0x10 - 1), %rsp
+	# For sizes less than 256 bytes, we can't fold 128 bytes at a time.
+	cmp $256, len
+	jl .Lless_than_256_bytes

-	# check if smaller than 256
-	cmp $256, arg3
+	# Load the first 128 data bytes. Byte swapping is necessary to make the
+	# bit order match the polynomial coefficient order.
+	movdqu 16*0(buf), %xmm0
+	movdqu 16*1(buf), %xmm1
+	movdqu 16*2(buf), %xmm2
+	movdqu 16*3(buf), %xmm3
+	movdqu 16*4(buf), %xmm4
+	movdqu 16*5(buf), %xmm5
+	movdqu 16*6(buf), %xmm6
+	movdqu 16*7(buf), %xmm7
+	add $128, buf
+	pshufb BSWAP_MASK, %xmm0
+	pshufb BSWAP_MASK, %xmm1
+	pshufb BSWAP_MASK, %xmm2
+	pshufb BSWAP_MASK, %xmm3
+	pshufb BSWAP_MASK, %xmm4
+	pshufb BSWAP_MASK, %xmm5
+	pshufb BSWAP_MASK, %xmm6
+	pshufb BSWAP_MASK, %xmm7

-	# for sizes less than 128, we can't fold 64B at a time...
-	jl _less_than_128
+	# XOR the first 16 data *bits* with the initial CRC value.
+	pxor %xmm8, %xmm8
+	pinsrw $7, init_crc, %xmm8
+	pxor %xmm8, %xmm0

+	movdqa .Lfold_across_128_bytes_consts(%rip), FOLD_CONSTS

-	# load the initial crc value
-	movd arg1_low32, %xmm10	# initial crc
+	# Subtract 128 for the 128 data bytes just consumed. Subtract another
+	# 128 to simplify the termination condition of the following loop.
+	sub $256, len

-	# crc value does not need to be byte-reflected, but it needs
-	# to be moved to the high part of the register.
-	# because data will be byte-reflected and will align with
-	# initial crc at correct place.
-	pslldq $12, %xmm10
+	# While >= 128 data bytes remain (not counting xmm0-7), fold the 128
+	# bytes xmm0-7 into them, storing the result back into xmm0-7.
+.Lfold_128_bytes_loop:
+	fold_32_bytes 0, %xmm0, %xmm1
+	fold_32_bytes 32, %xmm2, %xmm3
+	fold_32_bytes 64, %xmm4, %xmm5
+	fold_32_bytes 96, %xmm6, %xmm7
+	add $128, buf
+	sub $128, len
+	jge .Lfold_128_bytes_loop

-	movdqa SHUF_MASK(%rip), %xmm11
-	# receive the initial 64B data, xor the initial crc value
-	movdqu 16*0(arg2), %xmm0
-	movdqu 16*1(arg2), %xmm1
-	movdqu 16*2(arg2), %xmm2
-	movdqu 16*3(arg2), %xmm3
-	movdqu 16*4(arg2), %xmm4
-	movdqu 16*5(arg2), %xmm5
-	movdqu 16*6(arg2), %xmm6
-	movdqu 16*7(arg2), %xmm7
+	# Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7.

-	pshufb %xmm11, %xmm0
-	# XOR the initial_crc value
-	pxor %xmm10, %xmm0
-	pshufb %xmm11, %xmm1
-	pshufb %xmm11, %xmm2
-	pshufb %xmm11, %xmm3
-	pshufb %xmm11, %xmm4
-	pshufb %xmm11, %xmm5
-	pshufb %xmm11, %xmm6
-	pshufb %xmm11, %xmm7
+	# Fold across 64 bytes.
+	movdqa .Lfold_across_64_bytes_consts(%rip), FOLD_CONSTS
+	fold_16_bytes %xmm0, %xmm4
+	fold_16_bytes %xmm1, %xmm5
+	fold_16_bytes %xmm2, %xmm6
+	fold_16_bytes %xmm3, %xmm7
+	# Fold across 32 bytes.
+	movdqa .Lfold_across_32_bytes_consts(%rip), FOLD_CONSTS
+	fold_16_bytes %xmm4, %xmm6
+	fold_16_bytes %xmm5, %xmm7
+	# Fold across 16 bytes.
+	movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
+	fold_16_bytes %xmm6, %xmm7

-	movdqa rk3(%rip), %xmm10	#xmm10 has rk3 and rk4
-	#imm value of pclmulqdq instruction
-	#will determine which constant to use
+	# Add 128 to get the correct number of data bytes remaining in 0...127
+	# (not counting xmm7), following the previous extra subtraction by 128.
+	# Then subtract 16 to simplify the termination condition of the
+	# following loop.
+	add $128-16, len

-	#################################################################
-	# we subtract 256 instead of 128 to save one instruction from the loop
-	sub $256, arg3
-
-	# at this section of the code, there is 64*x+y (0<=y<64) bytes of
-	# buffer. The _fold_64_B_loop will fold 64B at a time
-	# until we have 64+y Bytes of buffer
-
-
-	# fold 64B at a time. This section of the code folds 4 xmm
-	# registers in parallel
-_fold_64_B_loop:
-
-	# update the buffer pointer
-	add $128, arg2	# buf += 64#
-
-	movdqu 16*0(arg2), %xmm9
-	movdqu 16*1(arg2), %xmm12
-	pshufb %xmm11, %xmm9
-	pshufb %xmm11, %xmm12
-	movdqa %xmm0, %xmm8
-	movdqa %xmm1, %xmm13
-	pclmulqdq $0x0 , %xmm10, %xmm0
-	pclmulqdq $0x11, %xmm10, %xmm8
-	pclmulqdq $0x0 , %xmm10, %xmm1
-	pclmulqdq $0x11, %xmm10, %xmm13
-	pxor %xmm9 , %xmm0
-	xorps %xmm8 , %xmm0
-	pxor %xmm12, %xmm1
-	xorps %xmm13, %xmm1
-
-	movdqu 16*2(arg2), %xmm9
-	movdqu 16*3(arg2), %xmm12
-	pshufb %xmm11, %xmm9
-	pshufb %xmm11, %xmm12
-	movdqa %xmm2, %xmm8
-	movdqa %xmm3, %xmm13
-	pclmulqdq $0x0, %xmm10, %xmm2
-	pclmulqdq $0x11, %xmm10, %xmm8
-	pclmulqdq $0x0, %xmm10, %xmm3
-	pclmulqdq $0x11, %xmm10, %xmm13
-	pxor %xmm9 , %xmm2
-	xorps %xmm8 , %xmm2
-	pxor %xmm12, %xmm3
-	xorps %xmm13, %xmm3
-
-	movdqu 16*4(arg2), %xmm9
-	movdqu 16*5(arg2), %xmm12
-	pshufb %xmm11, %xmm9
-	pshufb %xmm11, %xmm12
-	movdqa %xmm4, %xmm8
-	movdqa %xmm5, %xmm13
-	pclmulqdq $0x0, %xmm10, %xmm4
-	pclmulqdq $0x11, %xmm10, %xmm8
-	pclmulqdq $0x0, %xmm10, %xmm5
-	pclmulqdq $0x11, %xmm10, %xmm13
-	pxor %xmm9 , %xmm4
-	xorps %xmm8 , %xmm4
-	pxor %xmm12, %xmm5
-	xorps %xmm13, %xmm5
-
-	movdqu 16*6(arg2), %xmm9
-	movdqu 16*7(arg2), %xmm12
-	pshufb %xmm11, %xmm9
-	pshufb %xmm11, %xmm12
-	movdqa %xmm6 , %xmm8
-	movdqa %xmm7 , %xmm13
-	pclmulqdq $0x0 , %xmm10, %xmm6
-	pclmulqdq $0x11, %xmm10, %xmm8
-	pclmulqdq $0x0 , %xmm10, %xmm7
-	pclmulqdq $0x11, %xmm10, %xmm13
-	pxor %xmm9 , %xmm6
-	xorps %xmm8 , %xmm6
-	pxor %xmm12, %xmm7
-	xorps %xmm13, %xmm7
-
-	sub $128, arg3
-
-	# check if there is another 64B in the buffer to be able to fold
-	jge _fold_64_B_loop
-	##################################################################
-
-
-	add $128, arg2
-	# at this point, the buffer pointer is pointing at the last y Bytes
-	# of the buffer the 64B of folded data is in 4 of the xmm
-	# registers: xmm0, xmm1, xmm2, xmm3
-
-
-	# fold the 8 xmm registers to 1 xmm register with different constants
-
-	movdqa rk9(%rip), %xmm10
-	movdqa %xmm0, %xmm8
-	pclmulqdq $0x11, %xmm10, %xmm0
-	pclmulqdq $0x0 , %xmm10, %xmm8
-	pxor %xmm8, %xmm7
-	xorps %xmm0, %xmm7
-
-	movdqa rk11(%rip), %xmm10
-	movdqa %xmm1, %xmm8
-	pclmulqdq $0x11, %xmm10, %xmm1
-	pclmulqdq $0x0 , %xmm10, %xmm8
-	pxor %xmm8, %xmm7
-	xorps %xmm1, %xmm7
-
-	movdqa rk13(%rip), %xmm10
-	movdqa %xmm2, %xmm8
-	pclmulqdq $0x11, %xmm10, %xmm2
-	pclmulqdq $0x0 , %xmm10, %xmm8
-	pxor %xmm8, %xmm7
-	pxor %xmm2, %xmm7
-
-	movdqa rk15(%rip), %xmm10
-	movdqa %xmm3, %xmm8
-	pclmulqdq $0x11, %xmm10, %xmm3
-	pclmulqdq $0x0 , %xmm10, %xmm8
-	pxor %xmm8, %xmm7
-	xorps %xmm3, %xmm7
-
-	movdqa rk17(%rip), %xmm10
-	movdqa %xmm4, %xmm8
-	pclmulqdq $0x11, %xmm10, %xmm4
-	pclmulqdq $0x0 , %xmm10, %xmm8
-	pxor %xmm8, %xmm7
-	pxor %xmm4, %xmm7
-
-	movdqa rk19(%rip), %xmm10
-	movdqa %xmm5, %xmm8
-	pclmulqdq $0x11, %xmm10, %xmm5
-	pclmulqdq $0x0 , %xmm10, %xmm8
-	pxor %xmm8, %xmm7
-	xorps %xmm5, %xmm7
-
-	movdqa rk1(%rip), %xmm10	#xmm10 has rk1 and rk2
-	#imm value of pclmulqdq instruction
-	#will determine which constant to use
-	movdqa %xmm6, %xmm8
-	pclmulqdq $0x11, %xmm10, %xmm6
-	pclmulqdq $0x0 , %xmm10, %xmm8
-	pxor %xmm8, %xmm7
-	pxor %xmm6, %xmm7
-
-
-	# instead of 64, we add 48 to the loop counter to save 1 instruction
-	# from the loop instead of a cmp instruction, we use the negative
-	# flag with the jl instruction
-	add $128-16, arg3
-	jl _final_reduction_for_128
-
-	# now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7
-	# and the rest is in memory. We can fold 16 bytes at a time if y>=16
-	# continue folding 16B at a time
-
-_16B_reduction_loop:
+	# While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes
+	# xmm7 into them, storing the result back into xmm7.
+	jl .Lfold_16_bytes_loop_done
+.Lfold_16_bytes_loop:
 	movdqa %xmm7, %xmm8
-	pclmulqdq $0x11, %xmm10, %xmm7
-	pclmulqdq $0x0 , %xmm10, %xmm8
+	pclmulqdq $0x11, FOLD_CONSTS, %xmm7
+	pclmulqdq $0x00, FOLD_CONSTS, %xmm8
 	pxor %xmm8, %xmm7
-	movdqu (arg2), %xmm0
-	pshufb %xmm11, %xmm0
+	movdqu (buf), %xmm0
+	pshufb BSWAP_MASK, %xmm0
 	pxor %xmm0 , %xmm7
-	add $16, arg2
-	sub $16, arg3
-	# instead of a cmp instruction, we utilize the flags with the
-	# jge instruction equivalent of: cmp arg3, 16-16
-	# check if there is any more 16B in the buffer to be able to fold
-	jge _16B_reduction_loop
+	add $16, buf
+	sub $16, len
+	jge .Lfold_16_bytes_loop

-	#now we have 16+z bytes left to reduce, where 0<= z < 16.
-	#first, we reduce the data in the xmm7 register
+.Lfold_16_bytes_loop_done:
+	# Add 16 to get the correct number of data bytes remaining in 0...15
+	# (not counting xmm7), following the previous extra subtraction by 16.
+	add $16, len
+	je .Lreduce_final_16_bytes

+.Lhandle_partial_segment:
+	# Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16
+	# bytes are in xmm7 and the rest are the remaining data in 'buf'. To do
+	# this without needing a fold constant for each possible 'len', redivide
+	# the bytes into a first chunk of 'len' bytes and a second chunk of 16
+	# bytes, then fold the first chunk into the second.

-_final_reduction_for_128:
-	# check if any more data to fold. If not, compute the CRC of
-	# the final 128 bits
-	add $16, arg3
-	je _128_done
-
-	# here we are getting data that is less than 16 bytes.
-	# since we know that there was data before the pointer, we can
-	# offset the input pointer before the actual point, to receive
-	# exactly 16 bytes. after that the registers need to be adjusted.
-_get_last_two_xmms:
 	movdqa %xmm7, %xmm2

-	movdqu -16(arg2, arg3), %xmm1
-	pshufb %xmm11, %xmm1
+	# xmm1 = last 16 original data bytes
+	movdqu -16(buf, len), %xmm1
+	pshufb BSWAP_MASK, %xmm1

-	# get rid of the extra data that was loaded before
-	# load the shift constant
-	lea pshufb_shf_table+16(%rip), %rax
-	sub arg3, %rax
+	# xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes.
+	lea .Lbyteshift_table+16(%rip), %rax
+	sub len, %rax
 	movdqu (%rax), %xmm0
-
-	# shift xmm2 to the left by arg3 bytes
 	pshufb %xmm0, %xmm2

-	# shift xmm7 to the right by 16-arg3 bytes
-	pxor mask1(%rip), %xmm0
+	# xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes.
+	pxor .Lmask1(%rip), %xmm0
 	pshufb %xmm0, %xmm7
+
+	# xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes),
+	# then '16-len' bytes from xmm2 (high-order bytes).
 	pblendvb %xmm2, %xmm1	#xmm0 is implicit

-	# fold 16 Bytes
-	movdqa %xmm1, %xmm2
+	# Fold the first chunk into the second chunk, storing the result in xmm7.
 	movdqa %xmm7, %xmm8
-	pclmulqdq $0x11, %xmm10, %xmm7
-	pclmulqdq $0x0 , %xmm10, %xmm8
+	pclmulqdq $0x11, FOLD_CONSTS, %xmm7
+	pclmulqdq $0x00, FOLD_CONSTS, %xmm8
 	pxor %xmm8, %xmm7
-	pxor %xmm2, %xmm7
+	pxor %xmm1, %xmm7

-_128_done:
-	# compute crc of a 128-bit value
-	movdqa rk5(%rip), %xmm10	# rk5 and rk6 in xmm10
+.Lreduce_final_16_bytes:
+	# Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC
+
+	# Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+	movdqa .Lfinal_fold_consts(%rip), FOLD_CONSTS
+
+	# Fold the high 64 bits into the low 64 bits, while also multiplying by
+	# x^64. This produces a 128-bit value congruent to x^64 * M(x) and
+	# whose low 48 bits are 0.
 	movdqa %xmm7, %xmm0
+	pclmulqdq $0x11, FOLD_CONSTS, %xmm7	# high bits * x^48 * (x^80 mod G(x))
+	pslldq $8, %xmm0
+	pxor %xmm0, %xmm7	# + low bits * x^64

-	#64b fold
-	pclmulqdq $0x1, %xmm10, %xmm7
-	pslldq $8 , %xmm0
-	pxor %xmm0, %xmm7
-
-	#32b fold
+	# Fold the high 32 bits into the low 96 bits. This produces a 96-bit
+	# value congruent to x^64 * M(x) and whose low 48 bits are 0.
 	movdqa %xmm7, %xmm0
+	pand .Lmask2(%rip), %xmm0	# zero high 32 bits
+	psrldq $12, %xmm7	# extract high 32 bits
+	pclmulqdq $0x00, FOLD_CONSTS, %xmm7	# high 32 bits * x^48 * (x^48 mod G(x))
+	pxor %xmm0, %xmm7	# + low bits

-	pand mask2(%rip), %xmm0
+	# Load G(x) and floor(x^48 / G(x)).
+	movdqa .Lbarrett_reduction_consts(%rip), FOLD_CONSTS

-	psrldq $12, %xmm7
-	pclmulqdq $0x10, %xmm10, %xmm7
+	# Use Barrett reduction to compute the final CRC value.
+	movdqa %xmm7, %xmm0
+	pclmulqdq $0x11, FOLD_CONSTS, %xmm7	# high 32 bits * floor(x^48 / G(x))
+	psrlq $32, %xmm7	# /= x^32
+	pclmulqdq $0x00, FOLD_CONSTS, %xmm7	# *= G(x)
+	psrlq $48, %xmm0
+	pxor %xmm7, %xmm0	# + low 16 nonzero bits
+	# Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0.
+
+	pextrw $0, %xmm0, %eax
+	RET
+
+.align 16
+.Lless_than_256_bytes:
+	# Checksumming a buffer of length 16...255 bytes
+
+	# Load the first 16 data bytes.
+	movdqu (buf), %xmm7
+	pshufb BSWAP_MASK, %xmm7
+	add $16, buf
+
+	# XOR the first 16 data *bits* with the initial CRC value.
+	pxor %xmm0, %xmm0
+	pinsrw $7, init_crc, %xmm0
 	pxor %xmm0, %xmm7

-	#barrett reduction
-_barrett:
-	movdqa rk7(%rip), %xmm10	# rk7 and rk8 in xmm10
-	movdqa %xmm7, %xmm0
-	pclmulqdq $0x01, %xmm10, %xmm7
-	pslldq $4, %xmm7
-	pclmulqdq $0x11, %xmm10, %xmm7
-
-	pslldq $4, %xmm7
-	pxor %xmm0, %xmm7
-	pextrd $1, %xmm7, %eax
-
-_cleanup:
-	# scale the result back to 16 bits
-	shr $16, %eax
-	mov %rcx, %rsp
-	ret
-
-########################################################################
-
-.align 16
-_less_than_128:
-
-	# check if there is enough buffer to be able to fold 16B at a time
-	cmp $32, arg3
-	jl _less_than_32
-	movdqa SHUF_MASK(%rip), %xmm11
-
-	# now if there is, load the constants
-	movdqa rk1(%rip), %xmm10	# rk1 and rk2 in xmm10
-
-	movd arg1_low32, %xmm0	# get the initial crc value
-	pslldq $12, %xmm0	# align it to its correct place
-	movdqu (arg2), %xmm7	# load the plaintext
-	pshufb %xmm11, %xmm7	# byte-reflect the plaintext
-	pxor %xmm0, %xmm7
-
-
-	# update the buffer pointer
-	add $16, arg2
-
-	# update the counter. subtract 32 instead of 16 to save one
-	# instruction from the loop
-	sub $32, arg3
-
-	jmp _16B_reduction_loop
-
-
-.align 16
-_less_than_32:
-	# mov initial crc to the return value. this is necessary for
-	# zero-length buffers.
-	mov arg1_low32, %eax
-	test arg3, arg3
-	je _cleanup
-
-	movdqa SHUF_MASK(%rip), %xmm11
-
-	movd arg1_low32, %xmm0	# get the initial crc value
-	pslldq $12, %xmm0	# align it to its correct place
-
-	cmp $16, arg3
-	je _exact_16_left
-	jl _less_than_16_left
-
-	movdqu (arg2), %xmm7	# load the plaintext
-	pshufb %xmm11, %xmm7	# byte-reflect the plaintext
-	pxor %xmm0 , %xmm7	# xor the initial crc value
-	add $16, arg2
-	sub $16, arg3
-	movdqa rk1(%rip), %xmm10	# rk1 and rk2 in xmm10
-	jmp _get_last_two_xmms
-
-
-.align 16
-_less_than_16_left:
-	# use stack space to load data less than 16 bytes, zero-out
-	# the 16B in memory first.
-
-	pxor %xmm1, %xmm1
-	mov %rsp, %r11
-	movdqa %xmm1, (%r11)
-
-	cmp $4, arg3
-	jl _only_less_than_4
-
-	# backup the counter value
-	mov arg3, %r9
-	cmp $8, arg3
-	jl _less_than_8_left
-
-	# load 8 Bytes
-	mov (arg2), %rax
-	mov %rax, (%r11)
-	add $8, %r11
-	sub $8, arg3
-	add $8, arg2
-_less_than_8_left:
-
-	cmp $4, arg3
-	jl _less_than_4_left
-
-	# load 4 Bytes
-	mov (arg2), %eax
-	mov %eax, (%r11)
-	add $4, %r11
-	sub $4, arg3
-	add $4, arg2
-_less_than_4_left:
-
-	cmp $2, arg3
-	jl _less_than_2_left
-
-	# load 2 Bytes
-	mov (arg2), %ax
-	mov %ax, (%r11)
-	add $2, %r11
-	sub $2, arg3
-	add $2, arg2
-_less_than_2_left:
-	cmp $1, arg3
-	jl _zero_left
-
-	# load 1 Byte
-	mov (arg2), %al
-	mov %al, (%r11)
-_zero_left:
-	movdqa (%rsp), %xmm7
-	pshufb %xmm11, %xmm7
-	pxor %xmm0 , %xmm7	# xor the initial crc value
-
-	# shl r9, 4
-	lea pshufb_shf_table+16(%rip), %rax
-	sub %r9, %rax
-	movdqu (%rax), %xmm0
-	pxor mask1(%rip), %xmm0
-
-	pshufb %xmm0, %xmm7
-	jmp _128_done
-
-.align 16
-_exact_16_left:
-	movdqu (arg2), %xmm7
-	pshufb %xmm11, %xmm7
-	pxor %xmm0 , %xmm7	# xor the initial crc value
-
-	jmp _128_done
-
-_only_less_than_4:
-	cmp $3, arg3
-	jl _only_less_than_3
-
-	# load 3 Bytes
-	mov (arg2), %al
-	mov %al, (%r11)
-
-	mov 1(arg2), %al
-	mov %al, 1(%r11)
-
-	mov 2(arg2), %al
-	mov %al, 2(%r11)
-
-	movdqa (%rsp), %xmm7
-	pshufb %xmm11, %xmm7
-	pxor %xmm0 , %xmm7	# xor the initial crc value
-
-	psrldq $5, %xmm7
-
-	jmp _barrett
-_only_less_than_3:
-	cmp $2, arg3
-	jl _only_less_than_2
-
-	# load 2 Bytes
-	mov (arg2), %al
-	mov %al, (%r11)
-
-	mov 1(arg2), %al
-	mov %al, 1(%r11)
-
-	movdqa (%rsp), %xmm7
-	pshufb %xmm11, %xmm7
-	pxor %xmm0 , %xmm7	# xor the initial crc value
-
-	psrldq $6, %xmm7
-
-	jmp _barrett
-_only_less_than_2:
-
-	# load 1 Byte
-	mov (arg2), %al
-	mov %al, (%r11)
-
-	movdqa (%rsp), %xmm7
-	pshufb %xmm11, %xmm7
-	pxor %xmm0 , %xmm7	# xor the initial crc value
-
-	psrldq $7, %xmm7
-
-	jmp _barrett
-
-ENDPROC(crc_t10dif_pcl)
+	movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
+	cmp $16, len
+	je .Lreduce_final_16_bytes	# len == 16
+	sub $32, len
+	jge .Lfold_16_bytes_loop	# 32 <= len <= 255
+	add $16, len
+	jmp .Lhandle_partial_segment	# 17 <= len <= 31
+SYM_FUNC_END(crc_t10dif_pcl)

```
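For cross-checking, the value this routine returns is the plain MSB-first CRC-16 of the buffer with generator polynomial 0x18bb7 (no bit reflection, no final XOR), seeded with `init_crc`. A bit-at-a-time sketch of that reference computation (not the kernel's generic implementation):

```c
#include <stdint.h>
#include <stddef.h>

/* Bit-at-a-time CRC-T10DIF: polynomial 0x18bb7, MSB first, no reflection,
 * no final XOR.  A plain sketch useful for verifying the PCLMULQDQ path. */
static uint16_t crc_t10dif_ref(uint16_t crc, const uint8_t *buf, size_t len)
{
	while (len--) {
		crc ^= (uint16_t)(*buf++) << 8;
		for (int i = 0; i < 8; i++)
			crc = (crc & 0x8000) ? (crc << 1) ^ 0x8bb7 : crc << 1;
	}
	return crc;
}
```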
```diff
 .section .rodata, "a", @progbits
 .align 16
-# precomputed constants
-# these constants are precomputed from the poly:
-# 0x8bb70000 (0x8bb7 scaled to 32 bits)
-# Q = 0x18BB70000
-# rk1 = 2^(32*3) mod Q << 32
-# rk2 = 2^(32*5) mod Q << 32
-# rk3 = 2^(32*15) mod Q << 32
-# rk4 = 2^(32*17) mod Q << 32
-# rk5 = 2^(32*3) mod Q << 32
-# rk6 = 2^(32*2) mod Q << 32
-# rk7 = floor(2^64/Q)
-# rk8 = Q
-rk1:
-.quad 0x2d56000000000000
-rk2:
-.quad 0x06df000000000000
-rk3:
-.quad 0x9d9d000000000000
-rk4:
-.quad 0x7cf5000000000000
-rk5:
-.quad 0x2d56000000000000
-rk6:
-.quad 0x1368000000000000
-rk7:
-.quad 0x00000001f65a57f8
-rk8:
-.quad 0x000000018bb70000

-rk9:
-.quad 0xceae000000000000
-rk10:
-.quad 0xbfd6000000000000
-rk11:
-.quad 0x1e16000000000000
-rk12:
-.quad 0x713c000000000000
-rk13:
-.quad 0xf7f9000000000000
-rk14:
-.quad 0x80a6000000000000
-rk15:
-.quad 0x044c000000000000
-rk16:
-.quad 0xe658000000000000
-rk17:
-.quad 0xad18000000000000
-rk18:
-.quad 0xa497000000000000
-rk19:
-.quad 0x6ee3000000000000
-rk20:
-.quad 0xe7b5000000000000
-
-
+# Fold constants precomputed from the polynomial 0x18bb7
+# G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+.Lfold_across_128_bytes_consts:
+	.quad 0x0000000000006123	# x^(8*128) mod G(x)
+	.quad 0x0000000000002295	# x^(8*128+64) mod G(x)
+.Lfold_across_64_bytes_consts:
+	.quad 0x0000000000001069	# x^(4*128) mod G(x)
+	.quad 0x000000000000dd31	# x^(4*128+64) mod G(x)
+.Lfold_across_32_bytes_consts:
+	.quad 0x000000000000857d	# x^(2*128) mod G(x)
+	.quad 0x0000000000007acc	# x^(2*128+64) mod G(x)
+.Lfold_across_16_bytes_consts:
+	.quad 0x000000000000a010	# x^(1*128) mod G(x)
+	.quad 0x0000000000001faa	# x^(1*128+64) mod G(x)
+.Lfinal_fold_consts:
+	.quad 0x1368000000000000	# x^48 * (x^48 mod G(x))
+	.quad 0x2d56000000000000	# x^48 * (x^80 mod G(x))
+.Lbarrett_reduction_consts:
+	.quad 0x0000000000018bb7	# G(x)
+	.quad 0x00000001f65a57f8	# floor(x^48 / G(x))

```
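Every constant in the new table is either `x^n mod G(x)` or `floor(x^n / G(x))` for some n, so the table can be regenerated with plain GF(2) long division. A small sketch; per the comments in the table above, `gf2_xn_mod_g(128)` should reproduce the 0xa010 entry and `gf2_xn_div_g(48)` the 0x1f65a57f8 Barrett constant:

```c
#include <stdint.h>

#define G_POLY 0x18bb7		/* G(x), degree 16 */

/* x^n mod G(x) over GF(2): shift-and-reduce long division. */
static uint32_t gf2_xn_mod_g(unsigned int n)
{
	uint32_t r = 1;			/* x^0 */

	while (n--) {
		r <<= 1;		/* multiply by x */
		if (r & 0x10000)	/* degree reached 16: reduce */
			r ^= G_POLY;
	}
	return r;
}

/* floor(x^n / G(x)): the quotient bits of the same division. */
static uint64_t gf2_xn_div_g(unsigned int n)
{
	uint64_t q = 0;
	uint32_t r = 1;

	while (n--) {
		r <<= 1;
		q <<= 1;
		if (r & 0x10000) {
			r ^= G_POLY;
			q |= 1;
		}
	}
	return q;
}
```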
```diff
 .section .rodata.cst16.mask1, "aM", @progbits, 16
 .align 16
-mask1:
-.octa 0x80808080808080808080808080808080
+.Lmask1:
+	.octa 0x80808080808080808080808080808080

 .section .rodata.cst16.mask2, "aM", @progbits, 16
 .align 16
-mask2:
-.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
+.Lmask2:
+	.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF

-.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
+.section .rodata.cst16.bswap_mask, "aM", @progbits, 16
 .align 16
-SHUF_MASK:
-.octa 0x000102030405060708090A0B0C0D0E0F
+.Lbswap_mask:
+	.octa 0x000102030405060708090A0B0C0D0E0F

-.section .rodata.cst32.pshufb_shf_table, "aM", @progbits, 32
-.align 32
-pshufb_shf_table:
-# use these values for shift constants for the pshufb instruction
-# different alignments result in values as shown:
-# DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
-# DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
-# DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
-# DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
-# DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
-# DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
-# DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
-# DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
-# DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
-# DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
-# DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
-# DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
-# DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
-# DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
-# DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
-.octa 0x8f8e8d8c8b8a89888786858483828100
-.octa 0x000e0d0c0b0a09080706050403020100
+.section .rodata.cst32.byteshift_table, "aM", @progbits, 32
+.align 16
+# For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len]
+# is the index vector to shift left by 'len' bytes, and is also {0x80, ...,
+# 0x80} XOR the index vector to shift right by '16 - len' bytes.
+.Lbyteshift_table:
+	.byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
+	.byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
+	.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+	.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0
```
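The comment on `.Lbyteshift_table` describes the one table that serves both shifts in `.Lhandle_partial_segment`. A scalar model of PSHUFB makes that property easy to verify; the table contents below simply repeat the `.byte` values from the patch:

```c
#include <stdint.h>
#include <string.h>

/* Same values as .Lbyteshift_table in the patch. */
static const uint8_t byteshift_table[32] = {
	0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x00,
};

/* Scalar model of PSHUFB: an index byte with bit 7 set selects zero,
 * otherwise its low 4 bits select a source byte. */
static void pshufb(uint8_t out[16], const uint8_t in[16], const uint8_t idx[16])
{
	for (int i = 0; i < 16; i++)
		out[i] = (idx[i] & 0x80) ? 0 : in[idx[i] & 0x0f];
}

/* For 1 <= len <= 15, reproduce the two shifts used in
 * .Lhandle_partial_segment: 'left' is 'in' shifted left by len bytes
 * (like pslldq), 'right' is 'in' shifted right by 16 - len bytes
 * (like psrldq). */
static void byteshift_pair(uint8_t left[16], uint8_t right[16],
			   const uint8_t in[16], int len)
{
	uint8_t idx[16];

	memcpy(idx, &byteshift_table[16 - len], 16);
	pshufb(left, in, idx);		/* index vector used as-is */
	for (int i = 0; i < 16; i++)
		idx[i] ^= 0x80;		/* the pxor with .Lmask1 */
	pshufb(right, in, idx);
}
```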