2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/arch/x86/crypto/crct10dif-pcl-asm_64.S
@@ -43,609 +43,291 @@
4343 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
4444 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
4545 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
46
-########################################################################
47
-# Function API:
48
-# UINT16 crc_t10dif_pcl(
49
-# UINT16 init_crc, //initial CRC value, 16 bits
50
-# const unsigned char *buf, //buffer pointer to calculate CRC on
51
-# UINT64 len //buffer length in bytes (64-bit data)
52
-# );
5346 #
5447 # Reference paper titled "Fast CRC Computation for Generic
5548 # Polynomials Using PCLMULQDQ Instruction"
5649 # URL: http://www.intel.com/content/dam/www/public/us/en/documents
5750 # /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
5851 #
59
-#
6052
6153 #include <linux/linkage.h>
6254
6355 .text
6456
65
-#define arg1 %rdi
66
-#define arg2 %rsi
67
-#define arg3 %rdx
57
+#define init_crc %edi
58
+#define buf %rsi
59
+#define len %rdx
6860
69
-#define arg1_low32 %edi
61
+#define FOLD_CONSTS %xmm10
62
+#define BSWAP_MASK %xmm11
7063
71
-ENTRY(crc_t10dif_pcl)
64
+# Fold reg1, reg2 into the next 32 data bytes, storing the result back into
65
+# reg1, reg2.
66
+.macro fold_32_bytes offset, reg1, reg2
67
+ movdqu \offset(buf), %xmm9
68
+ movdqu \offset+16(buf), %xmm12
69
+ pshufb BSWAP_MASK, %xmm9
70
+ pshufb BSWAP_MASK, %xmm12
71
+ movdqa \reg1, %xmm8
72
+ movdqa \reg2, %xmm13
73
+ pclmulqdq $0x00, FOLD_CONSTS, \reg1
74
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm8
75
+ pclmulqdq $0x00, FOLD_CONSTS, \reg2
76
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm13
77
+ pxor %xmm9 , \reg1
78
+ xorps %xmm8 , \reg1
79
+ pxor %xmm12, \reg2
80
+ xorps %xmm13, \reg2
81
+.endm
82
+
83
+# Fold src_reg into dst_reg.
84
+.macro fold_16_bytes src_reg, dst_reg
85
+ movdqa \src_reg, %xmm8
86
+ pclmulqdq $0x11, FOLD_CONSTS, \src_reg
87
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm8
88
+ pxor %xmm8, \dst_reg
89
+ xorps \src_reg, \dst_reg
90
+.endm
91
+
92
+#
93
+# u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);
94
+#
95
+# Assumes len >= 16.
96
+#
7297 .align 16
98
+SYM_FUNC_START(crc_t10dif_pcl)
7399
74
- # adjust the 16-bit initial_crc value, scale it to 32 bits
75
- shl $16, arg1_low32
100
+ movdqa .Lbswap_mask(%rip), BSWAP_MASK
76101
77
- # Allocate Stack Space
78
- mov %rsp, %rcx
79
- sub $16*2, %rsp
80
- # align stack to 16 byte boundary
81
- and $~(0x10 - 1), %rsp
102
+ # For sizes less than 256 bytes, we can't fold 128 bytes at a time.
103
+ cmp $256, len
104
+ jl .Lless_than_256_bytes
82105
83
- # check if smaller than 256
84
- cmp $256, arg3
106
+ # Load the first 128 data bytes. Byte swapping is necessary to make the
107
+ # bit order match the polynomial coefficient order.
108
+ movdqu 16*0(buf), %xmm0
109
+ movdqu 16*1(buf), %xmm1
110
+ movdqu 16*2(buf), %xmm2
111
+ movdqu 16*3(buf), %xmm3
112
+ movdqu 16*4(buf), %xmm4
113
+ movdqu 16*5(buf), %xmm5
114
+ movdqu 16*6(buf), %xmm6
115
+ movdqu 16*7(buf), %xmm7
116
+ add $128, buf
117
+ pshufb BSWAP_MASK, %xmm0
118
+ pshufb BSWAP_MASK, %xmm1
119
+ pshufb BSWAP_MASK, %xmm2
120
+ pshufb BSWAP_MASK, %xmm3
121
+ pshufb BSWAP_MASK, %xmm4
122
+ pshufb BSWAP_MASK, %xmm5
123
+ pshufb BSWAP_MASK, %xmm6
124
+ pshufb BSWAP_MASK, %xmm7
85125
86
- # for sizes less than 128, we can't fold 64B at a time...
87
- jl _less_than_128
126
+ # XOR the first 16 data *bits* with the initial CRC value.
127
+ pxor %xmm8, %xmm8
128
+ pinsrw $7, init_crc, %xmm8
129
+ pxor %xmm8, %xmm0
88130
131
+ movdqa .Lfold_across_128_bytes_consts(%rip), FOLD_CONSTS
89132
90
- # load the initial crc value
91
- movd arg1_low32, %xmm10 # initial crc
133
+ # Subtract 128 for the 128 data bytes just consumed. Subtract another
134
+ # 128 to simplify the termination condition of the following loop.
135
+ sub $256, len
92136
93
- # crc value does not need to be byte-reflected, but it needs
94
- # to be moved to the high part of the register.
95
- # because data will be byte-reflected and will align with
96
- # initial crc at correct place.
97
- pslldq $12, %xmm10
137
+ # While >= 128 data bytes remain (not counting xmm0-7), fold the 128
138
+ # bytes xmm0-7 into them, storing the result back into xmm0-7.
139
+.Lfold_128_bytes_loop:
140
+ fold_32_bytes 0, %xmm0, %xmm1
141
+ fold_32_bytes 32, %xmm2, %xmm3
142
+ fold_32_bytes 64, %xmm4, %xmm5
143
+ fold_32_bytes 96, %xmm6, %xmm7
144
+ add $128, buf
145
+ sub $128, len
146
+ jge .Lfold_128_bytes_loop
98147
99
- movdqa SHUF_MASK(%rip), %xmm11
100
- # receive the initial 64B data, xor the initial crc value
101
- movdqu 16*0(arg2), %xmm0
102
- movdqu 16*1(arg2), %xmm1
103
- movdqu 16*2(arg2), %xmm2
104
- movdqu 16*3(arg2), %xmm3
105
- movdqu 16*4(arg2), %xmm4
106
- movdqu 16*5(arg2), %xmm5
107
- movdqu 16*6(arg2), %xmm6
108
- movdqu 16*7(arg2), %xmm7
148
+ # Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7.
109149
110
- pshufb %xmm11, %xmm0
111
- # XOR the initial_crc value
112
- pxor %xmm10, %xmm0
113
- pshufb %xmm11, %xmm1
114
- pshufb %xmm11, %xmm2
115
- pshufb %xmm11, %xmm3
116
- pshufb %xmm11, %xmm4
117
- pshufb %xmm11, %xmm5
118
- pshufb %xmm11, %xmm6
119
- pshufb %xmm11, %xmm7
150
+ # Fold across 64 bytes.
151
+ movdqa .Lfold_across_64_bytes_consts(%rip), FOLD_CONSTS
152
+ fold_16_bytes %xmm0, %xmm4
153
+ fold_16_bytes %xmm1, %xmm5
154
+ fold_16_bytes %xmm2, %xmm6
155
+ fold_16_bytes %xmm3, %xmm7
156
+ # Fold across 32 bytes.
157
+ movdqa .Lfold_across_32_bytes_consts(%rip), FOLD_CONSTS
158
+ fold_16_bytes %xmm4, %xmm6
159
+ fold_16_bytes %xmm5, %xmm7
160
+ # Fold across 16 bytes.
161
+ movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
162
+ fold_16_bytes %xmm6, %xmm7
120163
121
- movdqa rk3(%rip), %xmm10 #xmm10 has rk3 and rk4
122
- #imm value of pclmulqdq instruction
123
- #will determine which constant to use
164
+ # Add 128 to get the correct number of data bytes remaining in 0...127
165
+ # (not counting xmm7), following the previous extra subtraction by 128.
166
+ # Then subtract 16 to simplify the termination condition of the
167
+ # following loop.
168
+ add $128-16, len
124169
125
- #################################################################
126
- # we subtract 256 instead of 128 to save one instruction from the loop
127
- sub $256, arg3
128
-
129
- # at this section of the code, there is 64*x+y (0<=y<64) bytes of
130
- # buffer. The _fold_64_B_loop will fold 64B at a time
131
- # until we have 64+y Bytes of buffer
132
-
133
-
134
- # fold 64B at a time. This section of the code folds 4 xmm
135
- # registers in parallel
136
-_fold_64_B_loop:
137
-
138
- # update the buffer pointer
139
- add $128, arg2 # buf += 64#
140
-
141
- movdqu 16*0(arg2), %xmm9
142
- movdqu 16*1(arg2), %xmm12
143
- pshufb %xmm11, %xmm9
144
- pshufb %xmm11, %xmm12
145
- movdqa %xmm0, %xmm8
146
- movdqa %xmm1, %xmm13
147
- pclmulqdq $0x0 , %xmm10, %xmm0
148
- pclmulqdq $0x11, %xmm10, %xmm8
149
- pclmulqdq $0x0 , %xmm10, %xmm1
150
- pclmulqdq $0x11, %xmm10, %xmm13
151
- pxor %xmm9 , %xmm0
152
- xorps %xmm8 , %xmm0
153
- pxor %xmm12, %xmm1
154
- xorps %xmm13, %xmm1
155
-
156
- movdqu 16*2(arg2), %xmm9
157
- movdqu 16*3(arg2), %xmm12
158
- pshufb %xmm11, %xmm9
159
- pshufb %xmm11, %xmm12
160
- movdqa %xmm2, %xmm8
161
- movdqa %xmm3, %xmm13
162
- pclmulqdq $0x0, %xmm10, %xmm2
163
- pclmulqdq $0x11, %xmm10, %xmm8
164
- pclmulqdq $0x0, %xmm10, %xmm3
165
- pclmulqdq $0x11, %xmm10, %xmm13
166
- pxor %xmm9 , %xmm2
167
- xorps %xmm8 , %xmm2
168
- pxor %xmm12, %xmm3
169
- xorps %xmm13, %xmm3
170
-
171
- movdqu 16*4(arg2), %xmm9
172
- movdqu 16*5(arg2), %xmm12
173
- pshufb %xmm11, %xmm9
174
- pshufb %xmm11, %xmm12
175
- movdqa %xmm4, %xmm8
176
- movdqa %xmm5, %xmm13
177
- pclmulqdq $0x0, %xmm10, %xmm4
178
- pclmulqdq $0x11, %xmm10, %xmm8
179
- pclmulqdq $0x0, %xmm10, %xmm5
180
- pclmulqdq $0x11, %xmm10, %xmm13
181
- pxor %xmm9 , %xmm4
182
- xorps %xmm8 , %xmm4
183
- pxor %xmm12, %xmm5
184
- xorps %xmm13, %xmm5
185
-
186
- movdqu 16*6(arg2), %xmm9
187
- movdqu 16*7(arg2), %xmm12
188
- pshufb %xmm11, %xmm9
189
- pshufb %xmm11, %xmm12
190
- movdqa %xmm6 , %xmm8
191
- movdqa %xmm7 , %xmm13
192
- pclmulqdq $0x0 , %xmm10, %xmm6
193
- pclmulqdq $0x11, %xmm10, %xmm8
194
- pclmulqdq $0x0 , %xmm10, %xmm7
195
- pclmulqdq $0x11, %xmm10, %xmm13
196
- pxor %xmm9 , %xmm6
197
- xorps %xmm8 , %xmm6
198
- pxor %xmm12, %xmm7
199
- xorps %xmm13, %xmm7
200
-
201
- sub $128, arg3
202
-
203
- # check if there is another 64B in the buffer to be able to fold
204
- jge _fold_64_B_loop
205
- ##################################################################
206
-
207
-
208
- add $128, arg2
209
- # at this point, the buffer pointer is pointing at the last y Bytes
210
- # of the buffer the 64B of folded data is in 4 of the xmm
211
- # registers: xmm0, xmm1, xmm2, xmm3
212
-
213
-
214
- # fold the 8 xmm registers to 1 xmm register with different constants
215
-
216
- movdqa rk9(%rip), %xmm10
217
- movdqa %xmm0, %xmm8
218
- pclmulqdq $0x11, %xmm10, %xmm0
219
- pclmulqdq $0x0 , %xmm10, %xmm8
220
- pxor %xmm8, %xmm7
221
- xorps %xmm0, %xmm7
222
-
223
- movdqa rk11(%rip), %xmm10
224
- movdqa %xmm1, %xmm8
225
- pclmulqdq $0x11, %xmm10, %xmm1
226
- pclmulqdq $0x0 , %xmm10, %xmm8
227
- pxor %xmm8, %xmm7
228
- xorps %xmm1, %xmm7
229
-
230
- movdqa rk13(%rip), %xmm10
231
- movdqa %xmm2, %xmm8
232
- pclmulqdq $0x11, %xmm10, %xmm2
233
- pclmulqdq $0x0 , %xmm10, %xmm8
234
- pxor %xmm8, %xmm7
235
- pxor %xmm2, %xmm7
236
-
237
- movdqa rk15(%rip), %xmm10
238
- movdqa %xmm3, %xmm8
239
- pclmulqdq $0x11, %xmm10, %xmm3
240
- pclmulqdq $0x0 , %xmm10, %xmm8
241
- pxor %xmm8, %xmm7
242
- xorps %xmm3, %xmm7
243
-
244
- movdqa rk17(%rip), %xmm10
245
- movdqa %xmm4, %xmm8
246
- pclmulqdq $0x11, %xmm10, %xmm4
247
- pclmulqdq $0x0 , %xmm10, %xmm8
248
- pxor %xmm8, %xmm7
249
- pxor %xmm4, %xmm7
250
-
251
- movdqa rk19(%rip), %xmm10
252
- movdqa %xmm5, %xmm8
253
- pclmulqdq $0x11, %xmm10, %xmm5
254
- pclmulqdq $0x0 , %xmm10, %xmm8
255
- pxor %xmm8, %xmm7
256
- xorps %xmm5, %xmm7
257
-
258
- movdqa rk1(%rip), %xmm10 #xmm10 has rk1 and rk2
259
- #imm value of pclmulqdq instruction
260
- #will determine which constant to use
261
- movdqa %xmm6, %xmm8
262
- pclmulqdq $0x11, %xmm10, %xmm6
263
- pclmulqdq $0x0 , %xmm10, %xmm8
264
- pxor %xmm8, %xmm7
265
- pxor %xmm6, %xmm7
266
-
267
-
268
- # instead of 64, we add 48 to the loop counter to save 1 instruction
269
- # from the loop instead of a cmp instruction, we use the negative
270
- # flag with the jl instruction
271
- add $128-16, arg3
272
- jl _final_reduction_for_128
273
-
274
- # now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7
275
- # and the rest is in memory. We can fold 16 bytes at a time if y>=16
276
- # continue folding 16B at a time
277
-
278
-_16B_reduction_loop:
170
+ # While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes
171
+ # xmm7 into them, storing the result back into xmm7.
172
+ jl .Lfold_16_bytes_loop_done
173
+.Lfold_16_bytes_loop:
279174 movdqa %xmm7, %xmm8
280
- pclmulqdq $0x11, %xmm10, %xmm7
281
- pclmulqdq $0x0 , %xmm10, %xmm8
175
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm7
176
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm8
282177 pxor %xmm8, %xmm7
283
- movdqu (arg2), %xmm0
284
- pshufb %xmm11, %xmm0
178
+ movdqu (buf), %xmm0
179
+ pshufb BSWAP_MASK, %xmm0
285180 pxor %xmm0 , %xmm7
286
- add $16, arg2
287
- sub $16, arg3
288
- # instead of a cmp instruction, we utilize the flags with the
289
- # jge instruction equivalent of: cmp arg3, 16-16
290
- # check if there is any more 16B in the buffer to be able to fold
291
- jge _16B_reduction_loop
181
+ add $16, buf
182
+ sub $16, len
183
+ jge .Lfold_16_bytes_loop
292184
293
- #now we have 16+z bytes left to reduce, where 0<= z < 16.
294
- #first, we reduce the data in the xmm7 register
185
+.Lfold_16_bytes_loop_done:
186
+ # Add 16 to get the correct number of data bytes remaining in 0...15
187
+ # (not counting xmm7), following the previous extra subtraction by 16.
188
+ add $16, len
189
+ je .Lreduce_final_16_bytes
295190
191
+.Lhandle_partial_segment:
192
+ # Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16
193
+ # bytes are in xmm7 and the rest are the remaining data in 'buf'. To do
194
+ # this without needing a fold constant for each possible 'len', redivide
195
+ # the bytes into a first chunk of 'len' bytes and a second chunk of 16
196
+ # bytes, then fold the first chunk into the second.
296197
297
-_final_reduction_for_128:
298
- # check if any more data to fold. If not, compute the CRC of
299
- # the final 128 bits
300
- add $16, arg3
301
- je _128_done
302
-
303
- # here we are getting data that is less than 16 bytes.
304
- # since we know that there was data before the pointer, we can
305
- # offset the input pointer before the actual point, to receive
306
- # exactly 16 bytes. after that the registers need to be adjusted.
307
-_get_last_two_xmms:
308198 movdqa %xmm7, %xmm2
309199
310
- movdqu -16(arg2, arg3), %xmm1
311
- pshufb %xmm11, %xmm1
200
+ # xmm1 = last 16 original data bytes
201
+ movdqu -16(buf, len), %xmm1
202
+ pshufb BSWAP_MASK, %xmm1
312203
313
- # get rid of the extra data that was loaded before
314
- # load the shift constant
315
- lea pshufb_shf_table+16(%rip), %rax
316
- sub arg3, %rax
204
+ # xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes.
205
+ lea .Lbyteshift_table+16(%rip), %rax
206
+ sub len, %rax
317207 movdqu (%rax), %xmm0
318
-
319
- # shift xmm2 to the left by arg3 bytes
320208 pshufb %xmm0, %xmm2
321209
322
- # shift xmm7 to the right by 16-arg3 bytes
323
- pxor mask1(%rip), %xmm0
210
+ # xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes.
211
+ pxor .Lmask1(%rip), %xmm0
324212 pshufb %xmm0, %xmm7
213
+
214
+ # xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes),
215
+ # then '16-len' bytes from xmm2 (high-order bytes).
325216 pblendvb %xmm2, %xmm1 #xmm0 is implicit
326217
327
- # fold 16 Bytes
328
- movdqa %xmm1, %xmm2
218
+ # Fold the first chunk into the second chunk, storing the result in xmm7.
329219 movdqa %xmm7, %xmm8
330
- pclmulqdq $0x11, %xmm10, %xmm7
331
- pclmulqdq $0x0 , %xmm10, %xmm8
220
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm7
221
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm8
332222 pxor %xmm8, %xmm7
333
- pxor %xmm2, %xmm7
223
+ pxor %xmm1, %xmm7
334224
335
-_128_done:
336
- # compute crc of a 128-bit value
337
- movdqa rk5(%rip), %xmm10 # rk5 and rk6 in xmm10
225
+.Lreduce_final_16_bytes:
226
+ # Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC
227
+
228
+ # Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
229
+ movdqa .Lfinal_fold_consts(%rip), FOLD_CONSTS
230
+
231
+ # Fold the high 64 bits into the low 64 bits, while also multiplying by
232
+ # x^64. This produces a 128-bit value congruent to x^64 * M(x) and
233
+ # whose low 48 bits are 0.
338234 movdqa %xmm7, %xmm0
235
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x))
236
+ pslldq $8, %xmm0
237
+ pxor %xmm0, %xmm7 # + low bits * x^64
339238
340
- #64b fold
341
- pclmulqdq $0x1, %xmm10, %xmm7
342
- pslldq $8 , %xmm0
343
- pxor %xmm0, %xmm7
344
-
345
- #32b fold
239
+ # Fold the high 32 bits into the low 96 bits. This produces a 96-bit
240
+ # value congruent to x^64 * M(x) and whose low 48 bits are 0.
346241 movdqa %xmm7, %xmm0
242
+ pand .Lmask2(%rip), %xmm0 # zero high 32 bits
243
+ psrldq $12, %xmm7 # extract high 32 bits
244
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x))
245
+ pxor %xmm0, %xmm7 # + low bits
347246
348
- pand mask2(%rip), %xmm0
247
+ # Load G(x) and floor(x^48 / G(x)).
248
+ movdqa .Lbarrett_reduction_consts(%rip), FOLD_CONSTS
349249
350
- psrldq $12, %xmm7
351
- pclmulqdq $0x10, %xmm10, %xmm7
250
+ # Use Barrett reduction to compute the final CRC value.
251
+ movdqa %xmm7, %xmm0
252
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x))
253
+ psrlq $32, %xmm7 # /= x^32
254
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # *= G(x)
255
+ psrlq $48, %xmm0
256
+ pxor %xmm7, %xmm0 # + low 16 nonzero bits
257
+ # Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0.
258
+
259
+ pextrw $0, %xmm0, %eax
260
+ RET
261
+
262
+.align 16
263
+.Lless_than_256_bytes:
264
+ # Checksumming a buffer of length 16...255 bytes
265
+
266
+ # Load the first 16 data bytes.
267
+ movdqu (buf), %xmm7
268
+ pshufb BSWAP_MASK, %xmm7
269
+ add $16, buf
270
+
271
+ # XOR the first 16 data *bits* with the initial CRC value.
272
+ pxor %xmm0, %xmm0
273
+ pinsrw $7, init_crc, %xmm0
352274 pxor %xmm0, %xmm7
353275
354
- #barrett reduction
355
-_barrett:
356
- movdqa rk7(%rip), %xmm10 # rk7 and rk8 in xmm10
357
- movdqa %xmm7, %xmm0
358
- pclmulqdq $0x01, %xmm10, %xmm7
359
- pslldq $4, %xmm7
360
- pclmulqdq $0x11, %xmm10, %xmm7
361
-
362
- pslldq $4, %xmm7
363
- pxor %xmm0, %xmm7
364
- pextrd $1, %xmm7, %eax
365
-
366
-_cleanup:
367
- # scale the result back to 16 bits
368
- shr $16, %eax
369
- mov %rcx, %rsp
370
- ret
371
-
372
-########################################################################
373
-
374
-.align 16
375
-_less_than_128:
376
-
377
- # check if there is enough buffer to be able to fold 16B at a time
378
- cmp $32, arg3
379
- jl _less_than_32
380
- movdqa SHUF_MASK(%rip), %xmm11
381
-
382
- # now if there is, load the constants
383
- movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10
384
-
385
- movd arg1_low32, %xmm0 # get the initial crc value
386
- pslldq $12, %xmm0 # align it to its correct place
387
- movdqu (arg2), %xmm7 # load the plaintext
388
- pshufb %xmm11, %xmm7 # byte-reflect the plaintext
389
- pxor %xmm0, %xmm7
390
-
391
-
392
- # update the buffer pointer
393
- add $16, arg2
394
-
395
- # update the counter. subtract 32 instead of 16 to save one
396
- # instruction from the loop
397
- sub $32, arg3
398
-
399
- jmp _16B_reduction_loop
400
-
401
-
402
-.align 16
403
-_less_than_32:
404
- # mov initial crc to the return value. this is necessary for
405
- # zero-length buffers.
406
- mov arg1_low32, %eax
407
- test arg3, arg3
408
- je _cleanup
409
-
410
- movdqa SHUF_MASK(%rip), %xmm11
411
-
412
- movd arg1_low32, %xmm0 # get the initial crc value
413
- pslldq $12, %xmm0 # align it to its correct place
414
-
415
- cmp $16, arg3
416
- je _exact_16_left
417
- jl _less_than_16_left
418
-
419
- movdqu (arg2), %xmm7 # load the plaintext
420
- pshufb %xmm11, %xmm7 # byte-reflect the plaintext
421
- pxor %xmm0 , %xmm7 # xor the initial crc value
422
- add $16, arg2
423
- sub $16, arg3
424
- movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10
425
- jmp _get_last_two_xmms
426
-
427
-
428
-.align 16
429
-_less_than_16_left:
430
- # use stack space to load data less than 16 bytes, zero-out
431
- # the 16B in memory first.
432
-
433
- pxor %xmm1, %xmm1
434
- mov %rsp, %r11
435
- movdqa %xmm1, (%r11)
436
-
437
- cmp $4, arg3
438
- jl _only_less_than_4
439
-
440
- # backup the counter value
441
- mov arg3, %r9
442
- cmp $8, arg3
443
- jl _less_than_8_left
444
-
445
- # load 8 Bytes
446
- mov (arg2), %rax
447
- mov %rax, (%r11)
448
- add $8, %r11
449
- sub $8, arg3
450
- add $8, arg2
451
-_less_than_8_left:
452
-
453
- cmp $4, arg3
454
- jl _less_than_4_left
455
-
456
- # load 4 Bytes
457
- mov (arg2), %eax
458
- mov %eax, (%r11)
459
- add $4, %r11
460
- sub $4, arg3
461
- add $4, arg2
462
-_less_than_4_left:
463
-
464
- cmp $2, arg3
465
- jl _less_than_2_left
466
-
467
- # load 2 Bytes
468
- mov (arg2), %ax
469
- mov %ax, (%r11)
470
- add $2, %r11
471
- sub $2, arg3
472
- add $2, arg2
473
-_less_than_2_left:
474
- cmp $1, arg3
475
- jl _zero_left
476
-
477
- # load 1 Byte
478
- mov (arg2), %al
479
- mov %al, (%r11)
480
-_zero_left:
481
- movdqa (%rsp), %xmm7
482
- pshufb %xmm11, %xmm7
483
- pxor %xmm0 , %xmm7 # xor the initial crc value
484
-
485
- # shl r9, 4
486
- lea pshufb_shf_table+16(%rip), %rax
487
- sub %r9, %rax
488
- movdqu (%rax), %xmm0
489
- pxor mask1(%rip), %xmm0
490
-
491
- pshufb %xmm0, %xmm7
492
- jmp _128_done
493
-
494
-.align 16
495
-_exact_16_left:
496
- movdqu (arg2), %xmm7
497
- pshufb %xmm11, %xmm7
498
- pxor %xmm0 , %xmm7 # xor the initial crc value
499
-
500
- jmp _128_done
501
-
502
-_only_less_than_4:
503
- cmp $3, arg3
504
- jl _only_less_than_3
505
-
506
- # load 3 Bytes
507
- mov (arg2), %al
508
- mov %al, (%r11)
509
-
510
- mov 1(arg2), %al
511
- mov %al, 1(%r11)
512
-
513
- mov 2(arg2), %al
514
- mov %al, 2(%r11)
515
-
516
- movdqa (%rsp), %xmm7
517
- pshufb %xmm11, %xmm7
518
- pxor %xmm0 , %xmm7 # xor the initial crc value
519
-
520
- psrldq $5, %xmm7
521
-
522
- jmp _barrett
523
-_only_less_than_3:
524
- cmp $2, arg3
525
- jl _only_less_than_2
526
-
527
- # load 2 Bytes
528
- mov (arg2), %al
529
- mov %al, (%r11)
530
-
531
- mov 1(arg2), %al
532
- mov %al, 1(%r11)
533
-
534
- movdqa (%rsp), %xmm7
535
- pshufb %xmm11, %xmm7
536
- pxor %xmm0 , %xmm7 # xor the initial crc value
537
-
538
- psrldq $6, %xmm7
539
-
540
- jmp _barrett
541
-_only_less_than_2:
542
-
543
- # load 1 Byte
544
- mov (arg2), %al
545
- mov %al, (%r11)
546
-
547
- movdqa (%rsp), %xmm7
548
- pshufb %xmm11, %xmm7
549
- pxor %xmm0 , %xmm7 # xor the initial crc value
550
-
551
- psrldq $7, %xmm7
552
-
553
- jmp _barrett
554
-
555
-ENDPROC(crc_t10dif_pcl)
276
+ movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
277
+ cmp $16, len
278
+ je .Lreduce_final_16_bytes # len == 16
279
+ sub $32, len
280
+ jge .Lfold_16_bytes_loop # 32 <= len <= 255
281
+ add $16, len
282
+ jmp .Lhandle_partial_segment # 17 <= len <= 31
283
+SYM_FUNC_END(crc_t10dif_pcl)
556284
557285 .section .rodata, "a", @progbits
558286 .align 16
559
-# precomputed constants
560
-# these constants are precomputed from the poly:
561
-# 0x8bb70000 (0x8bb7 scaled to 32 bits)
562
-# Q = 0x18BB70000
563
-# rk1 = 2^(32*3) mod Q << 32
564
-# rk2 = 2^(32*5) mod Q << 32
565
-# rk3 = 2^(32*15) mod Q << 32
566
-# rk4 = 2^(32*17) mod Q << 32
567
-# rk5 = 2^(32*3) mod Q << 32
568
-# rk6 = 2^(32*2) mod Q << 32
569
-# rk7 = floor(2^64/Q)
570
-# rk8 = Q
571
-rk1:
572
-.quad 0x2d56000000000000
573
-rk2:
574
-.quad 0x06df000000000000
575
-rk3:
576
-.quad 0x9d9d000000000000
577
-rk4:
578
-.quad 0x7cf5000000000000
579
-rk5:
580
-.quad 0x2d56000000000000
581
-rk6:
582
-.quad 0x1368000000000000
583
-rk7:
584
-.quad 0x00000001f65a57f8
585
-rk8:
586
-.quad 0x000000018bb70000
587287
588
-rk9:
589
-.quad 0xceae000000000000
590
-rk10:
591
-.quad 0xbfd6000000000000
592
-rk11:
593
-.quad 0x1e16000000000000
594
-rk12:
595
-.quad 0x713c000000000000
596
-rk13:
597
-.quad 0xf7f9000000000000
598
-rk14:
599
-.quad 0x80a6000000000000
600
-rk15:
601
-.quad 0x044c000000000000
602
-rk16:
603
-.quad 0xe658000000000000
604
-rk17:
605
-.quad 0xad18000000000000
606
-rk18:
607
-.quad 0xa497000000000000
608
-rk19:
609
-.quad 0x6ee3000000000000
610
-rk20:
611
-.quad 0xe7b5000000000000
612
-
613
-
288
+# Fold constants precomputed from the polynomial 0x18bb7
289
+# G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
290
+.Lfold_across_128_bytes_consts:
291
+ .quad 0x0000000000006123 # x^(8*128) mod G(x)
292
+ .quad 0x0000000000002295 # x^(8*128+64) mod G(x)
293
+.Lfold_across_64_bytes_consts:
294
+ .quad 0x0000000000001069 # x^(4*128) mod G(x)
295
+ .quad 0x000000000000dd31 # x^(4*128+64) mod G(x)
296
+.Lfold_across_32_bytes_consts:
297
+ .quad 0x000000000000857d # x^(2*128) mod G(x)
298
+ .quad 0x0000000000007acc # x^(2*128+64) mod G(x)
299
+.Lfold_across_16_bytes_consts:
300
+ .quad 0x000000000000a010 # x^(1*128) mod G(x)
301
+ .quad 0x0000000000001faa # x^(1*128+64) mod G(x)
302
+.Lfinal_fold_consts:
303
+ .quad 0x1368000000000000 # x^48 * (x^48 mod G(x))
304
+ .quad 0x2d56000000000000 # x^48 * (x^80 mod G(x))
305
+.Lbarrett_reduction_consts:
306
+ .quad 0x0000000000018bb7 # G(x)
307
+ .quad 0x00000001f65a57f8 # floor(x^48 / G(x))
614308
615309 .section .rodata.cst16.mask1, "aM", @progbits, 16
616310 .align 16
617
-mask1:
618
-.octa 0x80808080808080808080808080808080
311
+.Lmask1:
312
+ .octa 0x80808080808080808080808080808080
619313
620314 .section .rodata.cst16.mask2, "aM", @progbits, 16
621315 .align 16
622
-mask2:
623
-.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
316
+.Lmask2:
317
+ .octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
624318
625
-.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
319
+.section .rodata.cst16.bswap_mask, "aM", @progbits, 16
626320 .align 16
627
-SHUF_MASK:
628
-.octa 0x000102030405060708090A0B0C0D0E0F
321
+.Lbswap_mask:
322
+ .octa 0x000102030405060708090A0B0C0D0E0F
629323
630
-.section .rodata.cst32.pshufb_shf_table, "aM", @progbits, 32
631
-.align 32
632
-pshufb_shf_table:
633
-# use these values for shift constants for the pshufb instruction
634
-# different alignments result in values as shown:
635
-# DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
636
-# DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
637
-# DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
638
-# DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
639
-# DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
640
-# DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
641
-# DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
642
-# DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
643
-# DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
644
-# DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
645
-# DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
646
-# DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
647
-# DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
648
-# DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
649
-# DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
650
-.octa 0x8f8e8d8c8b8a89888786858483828100
651
-.octa 0x000e0d0c0b0a09080706050403020100
324
+.section .rodata.cst32.byteshift_table, "aM", @progbits, 32
325
+.align 16
326
+# For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len]
327
+# is the index vector to shift left by 'len' bytes, and is also {0x80, ...,
328
+# 0x80} XOR the index vector to shift right by '16 - len' bytes.
329
+.Lbyteshift_table:
330
+ .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
331
+ .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
332
+ .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
333
+ .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0
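
For reference (not part of the patch): a minimal C sketch of the math the new code's comments describe. crc_t10dif_ref() is a bit-at-a-time CRC-16/T10-DIF (generator polynomial G(x) = 0x18bb7, no bit reflection, no final XOR), i.e. the value crc_t10dif_pcl() is expected to return, and xn_mod_g() evaluates x^n mod G(x), the quantity the fold-constant comments refer to. Both names are illustrative only, not kernel APIs.

#include <stddef.h>
#include <stdint.h>

#define CRCT10DIF_POLY	0x18bb7u	/* G(x), degree 16 */

/* x^n mod G(x) over GF(2). */
static uint16_t xn_mod_g(unsigned int n)
{
	uint32_t r = 1;			/* start from x^0 */

	while (n--) {
		r <<= 1;		/* multiply by x */
		if (r & 0x10000)	/* degree reached 16: reduce by G(x) */
			r ^= CRCT10DIF_POLY;
	}
	return (uint16_t)r;
}

/* Bit-at-a-time CRC-16/T10-DIF, MSB first; no length restriction. */
static uint16_t crc_t10dif_ref(uint16_t crc, const uint8_t *buf, size_t len)
{
	size_t i;
	int bit;

	for (i = 0; i < len; i++) {
		crc ^= (uint16_t)buf[i] << 8;	/* feed in the next byte */
		for (bit = 0; bit < 8; bit++)
			crc = (crc & 0x8000) ?
				(uint16_t)((crc << 1) ^ CRCT10DIF_POLY) :
				(uint16_t)(crc << 1);
	}
	return crc;
}

For example, xn_mod_g(1*128) and xn_mod_g(1*128 + 64) should reproduce the two quadwords that .Lfold_across_16_bytes_consts documents as x^(1*128) mod G(x) and x^(1*128+64) mod G(x); per their own comments, the .Lfinal_fold_consts entries are such remainders shifted left by 48 bits.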