2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/arch/arm64/crypto/crct10dif-ce-core.S
@@ -2,12 +2,14 @@
 // Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
 //
 // Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+// Copyright (C) 2019 Google LLC <ebiggers@google.com>
 //
 // This program is free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License version 2 as
 // published by the Free Software Foundation.
 //
 
+// Derived from the x86 version:
 //
 // Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
 //
@@ -54,115 +56,176 @@
 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
-// Function API:
-// UINT16 crc_t10dif_pcl(
-// UINT16 init_crc, //initial CRC value, 16 bits
-// const unsigned char *buf, //buffer pointer to calculate CRC on
-// UINT64 len //buffer length in bytes (64-bit data)
-// );
-//
 // Reference paper titled "Fast CRC Computation for Generic
 // Polynomials Using PCLMULQDQ Instruction"
 // URL: http://www.intel.com/content/dam/www/public/us/en/documents
 // /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
-//
 //
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 
         .text
-        .cpu generic+crypto
+        .arch armv8-a+crypto
 
-        arg1_low32 .req w19
-        arg2 .req x20
-        arg3 .req x21
+        init_crc .req w0
+        buf .req x1
+        len .req x2
+        fold_consts_ptr .req x3
 
-        vzr .req v13
+        fold_consts .req v10
 
-ENTRY(crc_t10dif_pmull)
-        frame_push 3, 128
+        ad .req v14
 
-        mov arg1_low32, w0
-        mov arg2, x1
-        mov arg3, x2
+        k00_16 .req v15
+        k32_48 .req v16
 
-        movi vzr.16b, #0 // init zero register
+        t3 .req v17
+        t4 .req v18
+        t5 .req v19
+        t6 .req v20
+        t7 .req v21
+        t8 .req v22
+        t9 .req v23
 
-        // adjust the 16-bit initial_crc value, scale it to 32 bits
-        lsl arg1_low32, arg1_low32, #16
+        perm1 .req v24
+        perm2 .req v25
+        perm3 .req v26
+        perm4 .req v27
 
-        // check if smaller than 256
-        cmp arg3, #256
+        bd1 .req v28
+        bd2 .req v29
+        bd3 .req v30
+        bd4 .req v31
 
-        // for sizes less than 128, we can't fold 64B at a time...
-        b.lt _less_than_128
+        .macro __pmull_init_p64
+        .endm
 
-        // load the initial crc value
-        // crc value does not need to be byte-reflected, but it needs
-        // to be moved to the high part of the register.
-        // because data will be byte-reflected and will align with
-        // initial crc at correct place.
-        movi v10.16b, #0
-        mov v10.s[3], arg1_low32 // initial crc
+        .macro __pmull_pre_p64, bd
+        .endm
 
-        // receive the initial 64B data, xor the initial crc value
-        ldp q0, q1, [arg2]
-        ldp q2, q3, [arg2, #0x20]
-        ldp q4, q5, [arg2, #0x40]
-        ldp q6, q7, [arg2, #0x60]
-        add arg2, arg2, #0x80
+        .macro __pmull_init_p8
+        // k00_16 := 0x0000000000000000_000000000000ffff
+        // k32_48 := 0x00000000ffffffff_0000ffffffffffff
+        movi k32_48.2d, #0xffffffff
+        mov k32_48.h[2], k32_48.h[0]
+        ushr k00_16.2d, k32_48.2d, #32
 
-CPU_LE( rev64 v0.16b, v0.16b )
-CPU_LE( rev64 v1.16b, v1.16b )
-CPU_LE( rev64 v2.16b, v2.16b )
-CPU_LE( rev64 v3.16b, v3.16b )
-CPU_LE( rev64 v4.16b, v4.16b )
-CPU_LE( rev64 v5.16b, v5.16b )
-CPU_LE( rev64 v6.16b, v6.16b )
-CPU_LE( rev64 v7.16b, v7.16b )
+        // prepare the permutation vectors
+        mov_q x5, 0x080f0e0d0c0b0a09
+        movi perm4.8b, #8
+        dup perm1.2d, x5
+        eor perm1.16b, perm1.16b, perm4.16b
+        ushr perm2.2d, perm1.2d, #8
+        ushr perm3.2d, perm1.2d, #16
+        ushr perm4.2d, perm1.2d, #24
+        sli perm2.2d, perm1.2d, #56
+        sli perm3.2d, perm1.2d, #48
+        sli perm4.2d, perm1.2d, #40
+        .endm
 
-CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
-CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
-CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 )
-CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 )
-CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 )
-CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 )
-CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 )
-CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
+        .macro __pmull_pre_p8, bd
+        tbl bd1.16b, {\bd\().16b}, perm1.16b
+        tbl bd2.16b, {\bd\().16b}, perm2.16b
+        tbl bd3.16b, {\bd\().16b}, perm3.16b
+        tbl bd4.16b, {\bd\().16b}, perm4.16b
+        .endm
 
-        // XOR the initial_crc value
-        eor v0.16b, v0.16b, v10.16b
+SYM_FUNC_START_LOCAL(__pmull_p8_core)
+.L__pmull_p8_core:
+        ext t4.8b, ad.8b, ad.8b, #1 // A1
+        ext t5.8b, ad.8b, ad.8b, #2 // A2
+        ext t6.8b, ad.8b, ad.8b, #3 // A3
 
-        ldr_l q10, rk3, x8 // xmm10 has rk3 and rk4
-        // type of pmull instruction
-        // will determine which constant to use
+        pmull t4.8h, t4.8b, fold_consts.8b // F = A1*B
+        pmull t8.8h, ad.8b, bd1.8b // E = A*B1
+        pmull t5.8h, t5.8b, fold_consts.8b // H = A2*B
+        pmull t7.8h, ad.8b, bd2.8b // G = A*B2
+        pmull t6.8h, t6.8b, fold_consts.8b // J = A3*B
+        pmull t9.8h, ad.8b, bd3.8b // I = A*B3
+        pmull t3.8h, ad.8b, bd4.8b // K = A*B4
+        b 0f
 
-        //
-        // we subtract 256 instead of 128 to save one instruction from the loop
-        //
-        sub arg3, arg3, #256
+.L__pmull_p8_core2:
+        tbl t4.16b, {ad.16b}, perm1.16b // A1
+        tbl t5.16b, {ad.16b}, perm2.16b // A2
+        tbl t6.16b, {ad.16b}, perm3.16b // A3
 
-        // at this section of the code, there is 64*x+y (0<=y<64) bytes of
-        // buffer. The _fold_64_B_loop will fold 64B at a time
-        // until we have 64+y Bytes of buffer
+        pmull2 t4.8h, t4.16b, fold_consts.16b // F = A1*B
+        pmull2 t8.8h, ad.16b, bd1.16b // E = A*B1
+        pmull2 t5.8h, t5.16b, fold_consts.16b // H = A2*B
+        pmull2 t7.8h, ad.16b, bd2.16b // G = A*B2
+        pmull2 t6.8h, t6.16b, fold_consts.16b // J = A3*B
+        pmull2 t9.8h, ad.16b, bd3.16b // I = A*B3
+        pmull2 t3.8h, ad.16b, bd4.16b // K = A*B4
 
+0:      eor t4.16b, t4.16b, t8.16b // L = E + F
+        eor t5.16b, t5.16b, t7.16b // M = G + H
+        eor t6.16b, t6.16b, t9.16b // N = I + J
 
-        // fold 64B at a time. This section of the code folds 4 vector
-        // registers in parallel
-_fold_64_B_loop:
+        uzp1 t8.2d, t4.2d, t5.2d
+        uzp2 t4.2d, t4.2d, t5.2d
+        uzp1 t7.2d, t6.2d, t3.2d
+        uzp2 t6.2d, t6.2d, t3.2d
 
-        .macro fold64, reg1, reg2
-        ldp q11, q12, [arg2], #0x20
+        // t4 = (L) (P0 + P1) << 8
+        // t5 = (M) (P2 + P3) << 16
+        eor t8.16b, t8.16b, t4.16b
+        and t4.16b, t4.16b, k32_48.16b
 
-        pmull2 v8.1q, \reg1\().2d, v10.2d
-        pmull \reg1\().1q, \reg1\().1d, v10.1d
+        // t6 = (N) (P4 + P5) << 24
+        // t7 = (K) (P6 + P7) << 32
+        eor t7.16b, t7.16b, t6.16b
+        and t6.16b, t6.16b, k00_16.16b
+
+        eor t8.16b, t8.16b, t4.16b
+        eor t7.16b, t7.16b, t6.16b
+
+        zip2 t5.2d, t8.2d, t4.2d
+        zip1 t4.2d, t8.2d, t4.2d
+        zip2 t3.2d, t7.2d, t6.2d
+        zip1 t6.2d, t7.2d, t6.2d
+
+        ext t4.16b, t4.16b, t4.16b, #15
+        ext t5.16b, t5.16b, t5.16b, #14
+        ext t6.16b, t6.16b, t6.16b, #13
+        ext t3.16b, t3.16b, t3.16b, #12
+
+        eor t4.16b, t4.16b, t5.16b
+        eor t6.16b, t6.16b, t3.16b
+        ret
+SYM_FUNC_END(__pmull_p8_core)
+
+        .macro __pmull_p8, rq, ad, bd, i
+        .ifnc \bd, fold_consts
+        .err
+        .endif
+        mov ad.16b, \ad\().16b
+        .ifb \i
+        pmull \rq\().8h, \ad\().8b, \bd\().8b // D = A*B
+        .else
+        pmull2 \rq\().8h, \ad\().16b, \bd\().16b // D = A*B
+        .endif
+
+        bl .L__pmull_p8_core\i
+
+        eor \rq\().16b, \rq\().16b, t4.16b
+        eor \rq\().16b, \rq\().16b, t6.16b
+        .endm
+
+        // Fold reg1, reg2 into the next 32 data bytes, storing the result back
+        // into reg1, reg2.
+        .macro fold_32_bytes, p, reg1, reg2
+        ldp q11, q12, [buf], #0x20
+
+        __pmull_\p v8, \reg1, fold_consts, 2
+        __pmull_\p \reg1, \reg1, fold_consts
 
 CPU_LE( rev64 v11.16b, v11.16b )
 CPU_LE( rev64 v12.16b, v12.16b )
 
-        pmull2 v9.1q, \reg2\().2d, v10.2d
-        pmull \reg2\().1q, \reg2\().1d, v10.1d
+        __pmull_\p v9, \reg2, fold_consts, 2
+        __pmull_\p \reg2, \reg2, fold_consts
 
 CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
 CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
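Note on the calling convention: the new register aliases above (init_crc .req w0, buf .req x1, len .req x2) line up with the AAPCS64 argument registers of the two C prototypes declared later in this patch. As a rough, hypothetical illustration only (not part of the patch), glue code in the style of arch/arm64/crypto/crct10dif-ce-glue.c would be expected to wrap the entry points roughly like this; the 16-byte minimum and the crc_t10dif_generic() fallback are assumptions based on the usual arm64 SIMD glue pattern:

#include <linux/crc-t10dif.h>
#include <asm/neon.h>
#include <crypto/internal/simd.h>

/* Assembly entry points provided by this file; both require len >= 16. */
asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);

/* Hypothetical wrapper: use PMULL only when NEON may be used in this context. */
static u16 crct10dif_update(u16 crc, const u8 *data, size_t length)
{
        if (length >= 16 && crypto_simd_usable()) {
                kernel_neon_begin();
                crc = crc_t10dif_pmull_p64(crc, data, length);
                kernel_neon_end();
                return crc;
        }
        return crc_t10dif_generic(crc, data, length);
}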
@@ -173,244 +236,279 @@
         eor \reg2\().16b, \reg2\().16b, v12.16b
         .endm
 
-        fold64 v0, v1
-        fold64 v2, v3
-        fold64 v4, v5
-        fold64 v6, v7
-
-        subs arg3, arg3, #128
-
-        // check if there is another 64B in the buffer to be able to fold
-        b.lt _fold_64_B_end
-
-        if_will_cond_yield_neon
-        stp q0, q1, [sp, #.Lframe_local_offset]
-        stp q2, q3, [sp, #.Lframe_local_offset + 32]
-        stp q4, q5, [sp, #.Lframe_local_offset + 64]
-        stp q6, q7, [sp, #.Lframe_local_offset + 96]
-        do_cond_yield_neon
-        ldp q0, q1, [sp, #.Lframe_local_offset]
-        ldp q2, q3, [sp, #.Lframe_local_offset + 32]
-        ldp q4, q5, [sp, #.Lframe_local_offset + 64]
-        ldp q6, q7, [sp, #.Lframe_local_offset + 96]
-        ldr_l q10, rk3, x8
-        movi vzr.16b, #0 // init zero register
-        endif_yield_neon
-
-        b _fold_64_B_loop
-
-_fold_64_B_end:
-        // at this point, the buffer pointer is pointing at the last y Bytes
-        // of the buffer the 64B of folded data is in 4 of the vector
-        // registers: v0, v1, v2, v3
-
-        // fold the 8 vector registers to 1 vector register with different
-        // constants
-
-        ldr_l q10, rk9, x8
-
-        .macro fold16, reg, rk
-        pmull v8.1q, \reg\().1d, v10.1d
-        pmull2 \reg\().1q, \reg\().2d, v10.2d
-        .ifnb \rk
-        ldr_l q10, \rk, x8
+        // Fold src_reg into dst_reg, optionally loading the next fold constants
+        .macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts
+        __pmull_\p v8, \src_reg, fold_consts
+        __pmull_\p \src_reg, \src_reg, fold_consts, 2
+        .ifnb \load_next_consts
+        ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+        __pmull_pre_\p fold_consts
         .endif
-        eor v7.16b, v7.16b, v8.16b
-        eor v7.16b, v7.16b, \reg\().16b
+        eor \dst_reg\().16b, \dst_reg\().16b, v8.16b
+        eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
         .endm
 
-        fold16 v0, rk11
-        fold16 v1, rk13
-        fold16 v2, rk15
-        fold16 v3, rk17
-        fold16 v4, rk19
-        fold16 v5, rk1
-        fold16 v6
+        .macro __pmull_p64, rd, rn, rm, n
+        .ifb \n
+        pmull \rd\().1q, \rn\().1d, \rm\().1d
+        .else
+        pmull2 \rd\().1q, \rn\().2d, \rm\().2d
+        .endif
+        .endm
 
-        // instead of 64, we add 48 to the loop counter to save 1 instruction
-        // from the loop instead of a cmp instruction, we use the negative
-        // flag with the jl instruction
-        adds arg3, arg3, #(128-16)
-        b.lt _final_reduction_for_128
+        .macro crc_t10dif_pmull, p
+        __pmull_init_\p
 
-        // now we have 16+y bytes left to reduce. 16 Bytes is in register v7
-        // and the rest is in memory. We can fold 16 bytes at a time if y>=16
-        // continue folding 16B at a time
+        // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
+        cmp len, #256
+        b.lt .Lless_than_256_bytes_\@
 
-_16B_reduction_loop:
-        pmull v8.1q, v7.1d, v10.1d
-        pmull2 v7.1q, v7.2d, v10.2d
+        adr_l fold_consts_ptr, .Lfold_across_128_bytes_consts
+
+        // Load the first 128 data bytes. Byte swapping is necessary to make
+        // the bit order match the polynomial coefficient order.
+        ldp q0, q1, [buf]
+        ldp q2, q3, [buf, #0x20]
+        ldp q4, q5, [buf, #0x40]
+        ldp q6, q7, [buf, #0x60]
+        add buf, buf, #0x80
+CPU_LE( rev64 v0.16b, v0.16b )
+CPU_LE( rev64 v1.16b, v1.16b )
+CPU_LE( rev64 v2.16b, v2.16b )
+CPU_LE( rev64 v3.16b, v3.16b )
+CPU_LE( rev64 v4.16b, v4.16b )
+CPU_LE( rev64 v5.16b, v5.16b )
+CPU_LE( rev64 v6.16b, v6.16b )
+CPU_LE( rev64 v7.16b, v7.16b )
+CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
+CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
+CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 )
+CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 )
+CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 )
+CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 )
+CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 )
+CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
+
+        // XOR the first 16 data *bits* with the initial CRC value.
+        movi v8.16b, #0
+        mov v8.h[7], init_crc
+        eor v0.16b, v0.16b, v8.16b
+
+        // Load the constants for folding across 128 bytes.
+        ld1 {fold_consts.2d}, [fold_consts_ptr]
+        __pmull_pre_\p fold_consts
+
+        // Subtract 128 for the 128 data bytes just consumed. Subtract another
+        // 128 to simplify the termination condition of the following loop.
+        sub len, len, #256
+
+        // While >= 128 data bytes remain (not counting v0-v7), fold the 128
+        // bytes v0-v7 into them, storing the result back into v0-v7.
+.Lfold_128_bytes_loop_\@:
+        fold_32_bytes \p, v0, v1
+        fold_32_bytes \p, v2, v3
+        fold_32_bytes \p, v4, v5
+        fold_32_bytes \p, v6, v7
+
+        subs len, len, #128
+        b.ge .Lfold_128_bytes_loop_\@
+
+        // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
+
+        // Fold across 64 bytes.
+        add fold_consts_ptr, fold_consts_ptr, #16
+        ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+        __pmull_pre_\p fold_consts
+        fold_16_bytes \p, v0, v4
+        fold_16_bytes \p, v1, v5
+        fold_16_bytes \p, v2, v6
+        fold_16_bytes \p, v3, v7, 1
+        // Fold across 32 bytes.
+        fold_16_bytes \p, v4, v6
+        fold_16_bytes \p, v5, v7, 1
+        // Fold across 16 bytes.
+        fold_16_bytes \p, v6, v7
+
+        // Add 128 to get the correct number of data bytes remaining in 0...127
+        // (not counting v7), following the previous extra subtraction by 128.
+        // Then subtract 16 to simplify the termination condition of the
+        // following loop.
+        adds len, len, #(128-16)
+
+        // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
+        // into them, storing the result back into v7.
+        b.lt .Lfold_16_bytes_loop_done_\@
+.Lfold_16_bytes_loop_\@:
+        __pmull_\p v8, v7, fold_consts
+        __pmull_\p v7, v7, fold_consts, 2
         eor v7.16b, v7.16b, v8.16b
-
-        ldr q0, [arg2], #16
+        ldr q0, [buf], #16
 CPU_LE( rev64 v0.16b, v0.16b )
 CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
         eor v7.16b, v7.16b, v0.16b
-        subs arg3, arg3, #16
+        subs len, len, #16
+        b.ge .Lfold_16_bytes_loop_\@
 
-        // instead of a cmp instruction, we utilize the flags with the
-        // jge instruction equivalent of: cmp arg3, 16-16
-        // check if there is any more 16B in the buffer to be able to fold
-        b.ge _16B_reduction_loop
+.Lfold_16_bytes_loop_done_\@:
+        // Add 16 to get the correct number of data bytes remaining in 0...15
+        // (not counting v7), following the previous extra subtraction by 16.
+        adds len, len, #16
+        b.eq .Lreduce_final_16_bytes_\@
 
-        // now we have 16+z bytes left to reduce, where 0<= z < 16.
-        // first, we reduce the data in the xmm7 register
+.Lhandle_partial_segment_\@:
+        // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
+        // 16 bytes are in v7 and the rest are the remaining data in 'buf'. To
+        // do this without needing a fold constant for each possible 'len',
+        // redivide the bytes into a first chunk of 'len' bytes and a second
+        // chunk of 16 bytes, then fold the first chunk into the second.
 
-_final_reduction_for_128:
-        // check if any more data to fold. If not, compute the CRC of
-        // the final 128 bits
-        adds arg3, arg3, #16
-        b.eq _128_done
+        // v0 = last 16 original data bytes
+        add buf, buf, len
+        ldr q0, [buf, #-16]
+CPU_LE( rev64 v0.16b, v0.16b )
+CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
 
-        // here we are getting data that is less than 16 bytes.
-        // since we know that there was data before the pointer, we can
-        // offset the input pointer before the actual point, to receive
-        // exactly 16 bytes. after that the registers need to be adjusted.
-_get_last_two_regs:
-        add arg2, arg2, arg3
-        ldr q1, [arg2, #-16]
-CPU_LE( rev64 v1.16b, v1.16b )
-CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
+        // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
+        adr_l x4, .Lbyteshift_table + 16
+        sub x4, x4, len
+        ld1 {v2.16b}, [x4]
+        tbl v1.16b, {v7.16b}, v2.16b
 
-        // get rid of the extra data that was loaded before
-        // load the shift constant
-        adr_l x4, tbl_shf_table + 16
-        sub x4, x4, arg3
-        ld1 {v0.16b}, [x4]
+        // v3 = first chunk: v7 right-shifted by '16-len' bytes.
+        movi v3.16b, #0x80
+        eor v2.16b, v2.16b, v3.16b
+        tbl v3.16b, {v7.16b}, v2.16b
 
-        // shift v2 to the left by arg3 bytes
-        tbl v2.16b, {v7.16b}, v0.16b
+        // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
+        sshr v2.16b, v2.16b, #7
 
-        // shift v7 to the right by 16-arg3 bytes
-        movi v9.16b, #0x80
-        eor v0.16b, v0.16b, v9.16b
-        tbl v7.16b, {v7.16b}, v0.16b
+        // v2 = second chunk: 'len' bytes from v0 (low-order bytes),
+        // then '16-len' bytes from v1 (high-order bytes).
+        bsl v2.16b, v1.16b, v0.16b
 
-        // blend
-        sshr v0.16b, v0.16b, #7 // convert to 8-bit mask
-        bsl v0.16b, v2.16b, v1.16b
-
-        // fold 16 Bytes
-        pmull v8.1q, v7.1d, v10.1d
-        pmull2 v7.1q, v7.2d, v10.2d
-        eor v7.16b, v7.16b, v8.16b
+        // Fold the first chunk into the second chunk, storing the result in v7.
+        __pmull_\p v0, v3, fold_consts
+        __pmull_\p v7, v3, fold_consts, 2
         eor v7.16b, v7.16b, v0.16b
+        eor v7.16b, v7.16b, v2.16b
 
-_128_done:
-        // compute crc of a 128-bit value
-        ldr_l q10, rk5, x8 // rk5 and rk6 in xmm10
+.Lreduce_final_16_bytes_\@:
+        // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
 
-        // 64b fold
-        ext v0.16b, vzr.16b, v7.16b, #8
-        mov v7.d[0], v7.d[1]
-        pmull v7.1q, v7.1d, v10.1d
-        eor v7.16b, v7.16b, v0.16b
+        movi v2.16b, #0 // init zero register
 
-        // 32b fold
-        ext v0.16b, v7.16b, vzr.16b, #4
-        mov v7.s[3], vzr.s[0]
-        pmull2 v0.1q, v0.2d, v10.2d
-        eor v7.16b, v7.16b, v0.16b
+        // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+        ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+        __pmull_pre_\p fold_consts
 
-        // barrett reduction
-_barrett:
-        ldr_l q10, rk7, x8
-        mov v0.d[0], v7.d[1]
+        // Fold the high 64 bits into the low 64 bits, while also multiplying by
+        // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
+        // whose low 48 bits are 0.
+        ext v0.16b, v2.16b, v7.16b, #8
+        __pmull_\p v7, v7, fold_consts, 2 // high bits * x^48 * (x^80 mod G(x))
+        eor v0.16b, v0.16b, v7.16b // + low bits * x^64
 
-        pmull v0.1q, v0.1d, v10.1d
-        ext v0.16b, vzr.16b, v0.16b, #12
-        pmull2 v0.1q, v0.2d, v10.2d
-        ext v0.16b, vzr.16b, v0.16b, #12
-        eor v7.16b, v7.16b, v0.16b
-        mov w0, v7.s[1]
+        // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
+        // value congruent to x^64 * M(x) and whose low 48 bits are 0.
+        ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits
+        mov v0.s[3], v2.s[0] // zero high 32 bits
+        __pmull_\p v1, v1, fold_consts // high 32 bits * x^48 * (x^48 mod G(x))
+        eor v0.16b, v0.16b, v1.16b // + low bits
 
-_cleanup:
-        // scale the result back to 16 bits
-        lsr x0, x0, #16
-        frame_pop
+        // Load G(x) and floor(x^48 / G(x)).
+        ld1 {fold_consts.2d}, [fold_consts_ptr]
+        __pmull_pre_\p fold_consts
+
+        // Use Barrett reduction to compute the final CRC value.
+        __pmull_\p v1, v0, fold_consts, 2 // high 32 bits * floor(x^48 / G(x))
+        ushr v1.2d, v1.2d, #32 // /= x^32
+        __pmull_\p v1, v1, fold_consts // *= G(x)
+        ushr v0.2d, v0.2d, #48
+        eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits
+        // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
+
+        umov w0, v0.h[0]
+        .ifc \p, p8
+        ldp x29, x30, [sp], #16
+        .endif
         ret
 
-_less_than_128:
-        cbz arg3, _cleanup
+.Lless_than_256_bytes_\@:
+        // Checksumming a buffer of length 16...255 bytes
 
-        movi v0.16b, #0
-        mov v0.s[3], arg1_low32 // get the initial crc value
+        adr_l fold_consts_ptr, .Lfold_across_16_bytes_consts
 
-        ldr q7, [arg2], #0x10
+        // Load the first 16 data bytes.
+        ldr q7, [buf], #0x10
CPU_LE( rev64 v7.16b, v7.16b )
CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
-        eor v7.16b, v7.16b, v0.16b // xor the initial crc value
 
-        cmp arg3, #16
-        b.eq _128_done // exactly 16 left
-        b.lt _less_than_16_left
+        // XOR the first 16 data *bits* with the initial CRC value.
+        movi v0.16b, #0
+        mov v0.h[7], init_crc
+        eor v7.16b, v7.16b, v0.16b
 
-        ldr_l q10, rk1, x8 // rk1 and rk2 in xmm10
+        // Load the fold-across-16-bytes constants.
+        ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+        __pmull_pre_\p fold_consts
 
-        // update the counter. subtract 32 instead of 16 to save one
-        // instruction from the loop
-        subs arg3, arg3, #32
-        b.ge _16B_reduction_loop
+        cmp len, #16
+        b.eq .Lreduce_final_16_bytes_\@ // len == 16
+        subs len, len, #32
+        b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255
+        add len, len, #16
+        b .Lhandle_partial_segment_\@ // 17 <= len <= 31
+        .endm
 
-        add arg3, arg3, #16
-        b _get_last_two_regs
+//
+// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+SYM_FUNC_START(crc_t10dif_pmull_p8)
+        stp x29, x30, [sp, #-16]!
+        mov x29, sp
+        crc_t10dif_pmull p8
+SYM_FUNC_END(crc_t10dif_pmull_p8)
 
-_less_than_16_left:
-        // shl r9, 4
-        adr_l x0, tbl_shf_table + 16
-        sub x0, x0, arg3
-        ld1 {v0.16b}, [x0]
-        movi v9.16b, #0x80
-        eor v0.16b, v0.16b, v9.16b
-        tbl v7.16b, {v7.16b}, v0.16b
-        b _128_done
-ENDPROC(crc_t10dif_pmull)
+        .align 5
+//
+// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+SYM_FUNC_START(crc_t10dif_pmull_p64)
+        crc_t10dif_pmull p64
+SYM_FUNC_END(crc_t10dif_pmull_p64)
 
-// precomputed constants
-// these constants are precomputed from the poly:
-// 0x8bb70000 (0x8bb7 scaled to 32 bits)
         .section ".rodata", "a"
         .align 4
-// Q = 0x18BB70000
-// rk1 = 2^(32*3) mod Q << 32
-// rk2 = 2^(32*5) mod Q << 32
-// rk3 = 2^(32*15) mod Q << 32
-// rk4 = 2^(32*17) mod Q << 32
-// rk5 = 2^(32*3) mod Q << 32
-// rk6 = 2^(32*2) mod Q << 32
-// rk7 = floor(2^64/Q)
-// rk8 = Q
 
-rk1: .octa 0x06df0000000000002d56000000000000
-rk3: .octa 0x7cf50000000000009d9d000000000000
-rk5: .octa 0x13680000000000002d56000000000000
-rk7: .octa 0x000000018bb7000000000001f65a57f8
-rk9: .octa 0xbfd6000000000000ceae000000000000
-rk11: .octa 0x713c0000000000001e16000000000000
-rk13: .octa 0x80a6000000000000f7f9000000000000
-rk15: .octa 0xe658000000000000044c000000000000
-rk17: .octa 0xa497000000000000ad18000000000000
-rk19: .octa 0xe7b50000000000006ee3000000000000
+// Fold constants precomputed from the polynomial 0x18bb7
+// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+.Lfold_across_128_bytes_consts:
+        .quad 0x0000000000006123 // x^(8*128) mod G(x)
+        .quad 0x0000000000002295 // x^(8*128+64) mod G(x)
+// .Lfold_across_64_bytes_consts:
+        .quad 0x0000000000001069 // x^(4*128) mod G(x)
+        .quad 0x000000000000dd31 // x^(4*128+64) mod G(x)
+// .Lfold_across_32_bytes_consts:
+        .quad 0x000000000000857d // x^(2*128) mod G(x)
+        .quad 0x0000000000007acc // x^(2*128+64) mod G(x)
+.Lfold_across_16_bytes_consts:
+        .quad 0x000000000000a010 // x^(1*128) mod G(x)
+        .quad 0x0000000000001faa // x^(1*128+64) mod G(x)
+// .Lfinal_fold_consts:
+        .quad 0x1368000000000000 // x^48 * (x^48 mod G(x))
+        .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x))
+// .Lbarrett_reduction_consts:
+        .quad 0x0000000000018bb7 // G(x)
+        .quad 0x00000001f65a57f8 // floor(x^48 / G(x))
 
-tbl_shf_table:
-// use these values for shift constants for the tbl/tbx instruction
-// different alignments result in values as shown:
-// DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
-// DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
-// DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
-// DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
-// DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
-// DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
-// DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
-// DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
-// DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
-// DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
-// DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
-// DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
-// DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
-// DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
-// DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
-
+// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
+// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
+// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
+.Lbyteshift_table:
         .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
         .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
         .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
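
For reference, and not part of the patch: the fold constants in the new .rodata table are documented as x^n mod G(x) for the CRC-T10DIF generator G(x) = 0x18bb7, plus the Barrett constant floor(x^48 / G(x)). A small stand-alone helper along these lines can recompute constants of the x^n mod G(x) form; it is a sketch for cross-checking the table comments, not the generator actually used to produce them:

#include <stdint.h>
#include <stdio.h>

#define G 0x18bb7u      /* CRC-T10DIF generator polynomial, degree 16 */

/* x^n mod G(x) over GF(2); the result fits in 16 bits since deg(G) = 16. */
static uint32_t xn_mod_g(unsigned int n)
{
        uint32_t rem = 1;       /* the polynomial "1" */

        while (n--) {
                rem <<= 1;              /* multiply by x */
                if (rem & 0x10000)      /* degree reached 16: reduce by G(x) */
                        rem ^= G;
        }
        return rem;
}

int main(void)
{
        /* Expected to reproduce the fold-across-128-bytes and -16-bytes entries. */
        printf("x^(8*128)    mod G(x) = 0x%04x\n", xn_mod_g(8 * 128));
        printf("x^(8*128+64) mod G(x) = 0x%04x\n", xn_mod_g(8 * 128 + 64));
        printf("x^(1*128)    mod G(x) = 0x%04x\n", xn_mod_g(1 * 128));
        printf("x^(1*128+64) mod G(x) = 0x%04x\n", xn_mod_g(1 * 128 + 64));
        return 0;
}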