hc
2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/arch/arm/crypto/crct10dif-ce-core.S
....@@ -2,12 +2,14 @@
22 // Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
33 //
44 // Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
5
+// Copyright (C) 2019 Google LLC <ebiggers@google.com>
56 //
67 // This program is free software; you can redistribute it and/or modify
78 // it under the terms of the GNU General Public License version 2 as
89 // published by the Free Software Foundation.
910 //
1011
12
+// Derived from the x86 version:
1113 //
1214 // Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
1315 //
....@@ -54,18 +56,10 @@
5456 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
5557 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5658 //
57
-// Function API:
58
-// UINT16 crc_t10dif_pcl(
59
-// UINT16 init_crc, //initial CRC value, 16 bits
60
-// const unsigned char *buf, //buffer pointer to calculate CRC on
61
-// UINT64 len //buffer length in bytes (64-bit data)
62
-// );
63
-//
6459 // Reference paper titled "Fast CRC Computation for Generic
6560 // Polynomials Using PCLMULQDQ Instruction"
6661 // URL: http://www.intel.com/content/dam/www/public/us/en/documents
6762 // /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
68
-//
6963 //
7064
7165 #include <linux/linkage.h>
....@@ -78,13 +72,14 @@
7872 #endif
7973
8074 .text
75
+ .arch armv8-a
8176 .fpu crypto-neon-fp-armv8
8277
83
- arg1_low32 .req r0
84
- arg2 .req r1
85
- arg3 .req r2
78
+ init_crc .req r0
79
+ buf .req r1
80
+ len .req r2
8681
87
- qzr .req q13
82
+ fold_consts_ptr .req ip
8883
8984 q0l .req d0
9085 q0h .req d1
....@@ -102,82 +97,35 @@
10297 q6h .req d13
10398 q7l .req d14
10499 q7h .req d15
100
+ q8l .req d16
101
+ q8h .req d17
102
+ q9l .req d18
103
+ q9h .req d19
104
+ q10l .req d20
105
+ q10h .req d21
106
+ q11l .req d22
107
+ q11h .req d23
108
+ q12l .req d24
109
+ q12h .req d25
105110
106
-ENTRY(crc_t10dif_pmull)
107
- vmov.i8 qzr, #0 // init zero register
111
+ FOLD_CONSTS .req q10
112
+ FOLD_CONST_L .req q10l
113
+ FOLD_CONST_H .req q10h
108114
109
- // adjust the 16-bit initial_crc value, scale it to 32 bits
110
- lsl arg1_low32, arg1_low32, #16
115
+ // Fold reg1, reg2 into the next 32 data bytes, storing the result back
116
+ // into reg1, reg2.
117
+ .macro fold_32_bytes, reg1, reg2
118
+ vld1.64 {q11-q12}, [buf]!
111119
112
- // check if smaller than 256
113
- cmp arg3, #256
120
+ vmull.p64 q8, \reg1\()h, FOLD_CONST_H
121
+ vmull.p64 \reg1, \reg1\()l, FOLD_CONST_L
122
+ vmull.p64 q9, \reg2\()h, FOLD_CONST_H
123
+ vmull.p64 \reg2, \reg2\()l, FOLD_CONST_L
114124
115
- // for sizes less than 128, we can't fold 64B at a time...
116
- blt _less_than_128
117
-
118
- // load the initial crc value
119
- // crc value does not need to be byte-reflected, but it needs
120
- // to be moved to the high part of the register.
121
- // because data will be byte-reflected and will align with
122
- // initial crc at correct place.
123
- vmov s0, arg1_low32 // initial crc
124
- vext.8 q10, qzr, q0, #4
125
-
126
- // receive the initial 64B data, xor the initial crc value
127
- vld1.64 {q0-q1}, [arg2]!
128
- vld1.64 {q2-q3}, [arg2]!
129
- vld1.64 {q4-q5}, [arg2]!
130
- vld1.64 {q6-q7}, [arg2]!
131
-CPU_LE( vrev64.8 q0, q0 )
132
-CPU_LE( vrev64.8 q1, q1 )
133
-CPU_LE( vrev64.8 q2, q2 )
134
-CPU_LE( vrev64.8 q3, q3 )
135
-CPU_LE( vrev64.8 q4, q4 )
136
-CPU_LE( vrev64.8 q5, q5 )
137
-CPU_LE( vrev64.8 q6, q6 )
138
-CPU_LE( vrev64.8 q7, q7 )
139
-
140
- vswp d0, d1
141
- vswp d2, d3
142
- vswp d4, d5
143
- vswp d6, d7
144
- vswp d8, d9
145
- vswp d10, d11
146
- vswp d12, d13
147
- vswp d14, d15
148
-
149
- // XOR the initial_crc value
150
- veor.8 q0, q0, q10
151
-
152
- adr ip, rk3
153
- vld1.64 {q10}, [ip, :128] // xmm10 has rk3 and rk4
154
-
155
- //
156
- // we subtract 256 instead of 128 to save one instruction from the loop
157
- //
158
- sub arg3, arg3, #256
159
-
160
- // at this section of the code, there is 64*x+y (0<=y<64) bytes of
161
- // buffer. The _fold_64_B_loop will fold 64B at a time
162
- // until we have 64+y Bytes of buffer
163
-
164
-
165
- // fold 64B at a time. This section of the code folds 4 vector
166
- // registers in parallel
167
-_fold_64_B_loop:
168
-
169
- .macro fold64, reg1, reg2
170
- vld1.64 {q11-q12}, [arg2]!
171
-
172
- vmull.p64 q8, \reg1\()h, d21
173
- vmull.p64 \reg1, \reg1\()l, d20
174
- vmull.p64 q9, \reg2\()h, d21
175
- vmull.p64 \reg2, \reg2\()l, d20
176
-
177
-CPU_LE( vrev64.8 q11, q11 )
178
-CPU_LE( vrev64.8 q12, q12 )
179
- vswp d22, d23
180
- vswp d24, d25
125
+CPU_LE( vrev64.8 q11, q11 )
126
+CPU_LE( vrev64.8 q12, q12 )
127
+ vswp q11l, q11h
128
+ vswp q12l, q12h
181129
182130 veor.8 \reg1, \reg1, q8
183131 veor.8 \reg2, \reg2, q9
....@@ -185,242 +133,248 @@
185133 veor.8 \reg2, \reg2, q12
186134 .endm
187135
188
- fold64 q0, q1
189
- fold64 q2, q3
190
- fold64 q4, q5
191
- fold64 q6, q7
192
-
193
- subs arg3, arg3, #128
194
-
195
- // check if there is another 64B in the buffer to be able to fold
196
- bge _fold_64_B_loop
197
-
198
- // at this point, the buffer pointer is pointing at the last y Bytes
199
- // of the buffer the 64B of folded data is in 4 of the vector
200
- // registers: v0, v1, v2, v3
201
-
202
- // fold the 8 vector registers to 1 vector register with different
203
- // constants
204
-
205
- adr ip, rk9
206
- vld1.64 {q10}, [ip, :128]!
207
-
208
- .macro fold16, reg, rk
209
- vmull.p64 q8, \reg\()l, d20
210
- vmull.p64 \reg, \reg\()h, d21
211
- .ifnb \rk
212
- vld1.64 {q10}, [ip, :128]!
136
+ // Fold src_reg into dst_reg, optionally loading the next fold constants
137
+ .macro fold_16_bytes, src_reg, dst_reg, load_next_consts
138
+ vmull.p64 q8, \src_reg\()l, FOLD_CONST_L
139
+ vmull.p64 \src_reg, \src_reg\()h, FOLD_CONST_H
140
+ .ifnb \load_next_consts
141
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
213142 .endif
214
- veor.8 q7, q7, q8
215
- veor.8 q7, q7, \reg
143
+ veor.8 \dst_reg, \dst_reg, q8
144
+ veor.8 \dst_reg, \dst_reg, \src_reg
216145 .endm
217146
218
- fold16 q0, rk11
219
- fold16 q1, rk13
220
- fold16 q2, rk15
221
- fold16 q3, rk17
222
- fold16 q4, rk19
223
- fold16 q5, rk1
224
- fold16 q6
147
+ .macro __adrl, out, sym
148
+ movw \out, #:lower16:\sym
149
+ movt \out, #:upper16:\sym
150
+ .endm
225151
226
- // instead of 64, we add 48 to the loop counter to save 1 instruction
227
- // from the loop instead of a cmp instruction, we use the negative
228
- // flag with the jl instruction
229
- adds arg3, arg3, #(128-16)
230
- blt _final_reduction_for_128
152
+//
153
+// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
154
+//
155
+// Assumes len >= 16.
156
+//
157
+ENTRY(crc_t10dif_pmull)
231158
232
- // now we have 16+y bytes left to reduce. 16 Bytes is in register v7
233
- // and the rest is in memory. We can fold 16 bytes at a time if y>=16
234
- // continue folding 16B at a time
159
+ // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
160
+ cmp len, #256
161
+ blt .Lless_than_256_bytes
235162
236
-_16B_reduction_loop:
237
- vmull.p64 q8, d14, d20
238
- vmull.p64 q7, d15, d21
163
+ __adrl fold_consts_ptr, .Lfold_across_128_bytes_consts
164
+
165
+ // Load the first 128 data bytes. Byte swapping is necessary to make
166
+ // the bit order match the polynomial coefficient order.
167
+ vld1.64 {q0-q1}, [buf]!
168
+ vld1.64 {q2-q3}, [buf]!
169
+ vld1.64 {q4-q5}, [buf]!
170
+ vld1.64 {q6-q7}, [buf]!
171
+CPU_LE( vrev64.8 q0, q0 )
172
+CPU_LE( vrev64.8 q1, q1 )
173
+CPU_LE( vrev64.8 q2, q2 )
174
+CPU_LE( vrev64.8 q3, q3 )
175
+CPU_LE( vrev64.8 q4, q4 )
176
+CPU_LE( vrev64.8 q5, q5 )
177
+CPU_LE( vrev64.8 q6, q6 )
178
+CPU_LE( vrev64.8 q7, q7 )
179
+ vswp q0l, q0h
180
+ vswp q1l, q1h
181
+ vswp q2l, q2h
182
+ vswp q3l, q3h
183
+ vswp q4l, q4h
184
+ vswp q5l, q5h
185
+ vswp q6l, q6h
186
+ vswp q7l, q7h
187
+
188
+ // XOR the first 16 data *bits* with the initial CRC value.
189
+ vmov.i8 q8h, #0
190
+ vmov.u16 q8h[3], init_crc
191
+ veor q0h, q0h, q8h
192
+
193
+ // Load the constants for folding across 128 bytes.
194
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
195
+
196
+ // Subtract 128 for the 128 data bytes just consumed. Subtract another
197
+ // 128 to simplify the termination condition of the following loop.
198
+ sub len, len, #256
199
+
200
+ // While >= 128 data bytes remain (not counting q0-q7), fold the 128
201
+ // bytes q0-q7 into them, storing the result back into q0-q7.
202
+.Lfold_128_bytes_loop:
203
+ fold_32_bytes q0, q1
204
+ fold_32_bytes q2, q3
205
+ fold_32_bytes q4, q5
206
+ fold_32_bytes q6, q7
207
+ subs len, len, #128
208
+ bge .Lfold_128_bytes_loop
209
+
210
+ // Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.
211
+
212
+ // Fold across 64 bytes.
213
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
214
+ fold_16_bytes q0, q4
215
+ fold_16_bytes q1, q5
216
+ fold_16_bytes q2, q6
217
+ fold_16_bytes q3, q7, 1
218
+ // Fold across 32 bytes.
219
+ fold_16_bytes q4, q6
220
+ fold_16_bytes q5, q7, 1
221
+ // Fold across 16 bytes.
222
+ fold_16_bytes q6, q7
223
+
224
+ // Add 128 to get the correct number of data bytes remaining in 0...127
225
+ // (not counting q7), following the previous extra subtraction by 128.
226
+ // Then subtract 16 to simplify the termination condition of the
227
+ // following loop.
228
+ adds len, len, #(128-16)
229
+
230
+ // While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7
231
+ // into them, storing the result back into q7.
232
+ blt .Lfold_16_bytes_loop_done
233
+.Lfold_16_bytes_loop:
234
+ vmull.p64 q8, q7l, FOLD_CONST_L
235
+ vmull.p64 q7, q7h, FOLD_CONST_H
239236 veor.8 q7, q7, q8
240
-
241
- vld1.64 {q0}, [arg2]!
242
-CPU_LE( vrev64.8 q0, q0 )
243
- vswp d0, d1
237
+ vld1.64 {q0}, [buf]!
238
+CPU_LE( vrev64.8 q0, q0 )
239
+ vswp q0l, q0h
244240 veor.8 q7, q7, q0
245
- subs arg3, arg3, #16
241
+ subs len, len, #16
242
+ bge .Lfold_16_bytes_loop
246243
247
- // instead of a cmp instruction, we utilize the flags with the
248
- // jge instruction equivalent of: cmp arg3, 16-16
249
- // check if there is any more 16B in the buffer to be able to fold
250
- bge _16B_reduction_loop
244
+.Lfold_16_bytes_loop_done:
245
+ // Add 16 to get the correct number of data bytes remaining in 0...15
246
+ // (not counting q7), following the previous extra subtraction by 16.
247
+ adds len, len, #16
248
+ beq .Lreduce_final_16_bytes
251249
252
- // now we have 16+z bytes left to reduce, where 0<= z < 16.
253
- // first, we reduce the data in the xmm7 register
250
+.Lhandle_partial_segment:
251
+ // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
252
+ // 16 bytes are in q7 and the rest are the remaining data in 'buf'. To
253
+ // do this without needing a fold constant for each possible 'len',
254
+ // redivide the bytes into a first chunk of 'len' bytes and a second
255
+ // chunk of 16 bytes, then fold the first chunk into the second.
254256
255
-_final_reduction_for_128:
256
- // check if any more data to fold. If not, compute the CRC of
257
- // the final 128 bits
258
- adds arg3, arg3, #16
259
- beq _128_done
257
+ // q0 = last 16 original data bytes
258
+ add buf, buf, len
259
+ sub buf, buf, #16
260
+ vld1.64 {q0}, [buf]
261
+CPU_LE( vrev64.8 q0, q0 )
262
+ vswp q0l, q0h
260263
261
- // here we are getting data that is less than 16 bytes.
262
- // since we know that there was data before the pointer, we can
263
- // offset the input pointer before the actual point, to receive
264
- // exactly 16 bytes. after that the registers need to be adjusted.
265
-_get_last_two_regs:
266
- add arg2, arg2, arg3
267
- sub arg2, arg2, #16
268
- vld1.64 {q1}, [arg2]
269
-CPU_LE( vrev64.8 q1, q1 )
270
- vswp d2, d3
264
+ // q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
265
+ __adrl r3, .Lbyteshift_table + 16
266
+ sub r3, r3, len
267
+ vld1.8 {q2}, [r3]
268
+ vtbl.8 q1l, {q7l-q7h}, q2l
269
+ vtbl.8 q1h, {q7l-q7h}, q2h
271270
272
- // get rid of the extra data that was loaded before
273
- // load the shift constant
274
- adr ip, tbl_shf_table + 16
275
- sub ip, ip, arg3
276
- vld1.8 {q0}, [ip]
271
+ // q3 = first chunk: q7 right-shifted by '16-len' bytes.
272
+ vmov.i8 q3, #0x80
273
+ veor.8 q2, q2, q3
274
+ vtbl.8 q3l, {q7l-q7h}, q2l
275
+ vtbl.8 q3h, {q7l-q7h}, q2h
277276
278
- // shift v2 to the left by arg3 bytes
279
- vtbl.8 d4, {d14-d15}, d0
280
- vtbl.8 d5, {d14-d15}, d1
277
+ // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
278
+ vshr.s8 q2, q2, #7
281279
282
- // shift v7 to the right by 16-arg3 bytes
283
- vmov.i8 q9, #0x80
284
- veor.8 q0, q0, q9
285
- vtbl.8 d18, {d14-d15}, d0
286
- vtbl.8 d19, {d14-d15}, d1
280
+ // q2 = second chunk: 'len' bytes from q0 (low-order bytes),
281
+ // then '16-len' bytes from q1 (high-order bytes).
282
+ vbsl.8 q2, q1, q0
287283
288
- // blend
289
- vshr.s8 q0, q0, #7 // convert to 8-bit mask
290
- vbsl.8 q0, q2, q1
291
-
292
- // fold 16 Bytes
293
- vmull.p64 q8, d18, d20
294
- vmull.p64 q7, d19, d21
295
- veor.8 q7, q7, q8
284
+ // Fold the first chunk into the second chunk, storing the result in q7.
285
+ vmull.p64 q0, q3l, FOLD_CONST_L
286
+ vmull.p64 q7, q3h, FOLD_CONST_H
296287 veor.8 q7, q7, q0
288
+ veor.8 q7, q7, q2
297289
298
-_128_done:
299
- // compute crc of a 128-bit value
300
- vldr d20, rk5
301
- vldr d21, rk6 // rk5 and rk6 in xmm10
290
+.Lreduce_final_16_bytes:
291
+ // Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.
302292
303
- // 64b fold
304
- vext.8 q0, qzr, q7, #8
305
- vmull.p64 q7, d15, d20
306
- veor.8 q7, q7, q0
293
+ // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
294
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
307295
308
- // 32b fold
309
- vext.8 q0, q7, qzr, #12
310
- vmov s31, s3
311
- vmull.p64 q0, d0, d21
312
- veor.8 q7, q0, q7
296
+ // Fold the high 64 bits into the low 64 bits, while also multiplying by
297
+ // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
298
+ // whose low 48 bits are 0.
299
+ vmull.p64 q0, q7h, FOLD_CONST_H // high bits * x^48 * (x^80 mod G(x))
300
+ veor.8 q0h, q0h, q7l // + low bits * x^64
313301
314
- // barrett reduction
315
-_barrett:
316
- vldr d20, rk7
317
- vldr d21, rk8
302
+ // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
303
+ // value congruent to x^64 * M(x) and whose low 48 bits are 0.
304
+ vmov.i8 q1, #0
305
+ vmov s4, s3 // extract high 32 bits
306
+ vmov s3, s5 // zero high 32 bits
307
+ vmull.p64 q1, q1l, FOLD_CONST_L // high 32 bits * x^48 * (x^48 mod G(x))
308
+ veor.8 q0, q0, q1 // + low bits
318309
319
- vmull.p64 q0, d15, d20
320
- vext.8 q0, qzr, q0, #12
321
- vmull.p64 q0, d1, d21
322
- vext.8 q0, qzr, q0, #12
323
- veor.8 q7, q7, q0
324
- vmov r0, s29
310
+ // Load G(x) and floor(x^48 / G(x)).
311
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]
325312
326
-_cleanup:
327
- // scale the result back to 16 bits
328
- lsr r0, r0, #16
313
+ // Use Barrett reduction to compute the final CRC value.
314
+ vmull.p64 q1, q0h, FOLD_CONST_H // high 32 bits * floor(x^48 / G(x))
315
+ vshr.u64 q1l, q1l, #32 // /= x^32
316
+ vmull.p64 q1, q1l, FOLD_CONST_L // *= G(x)
317
+ vshr.u64 q0l, q0l, #48
318
+ veor.8 q0l, q0l, q1l // + low 16 nonzero bits
319
+ // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of q0.
320
+
321
+ vmov.u16 r0, q0l[0]
329322 bx lr
330323
331
-_less_than_128:
332
- teq arg3, #0
333
- beq _cleanup
324
+.Lless_than_256_bytes:
325
+ // Checksumming a buffer of length 16...255 bytes
334326
335
- vmov.i8 q0, #0
336
- vmov s3, arg1_low32 // get the initial crc value
327
+ __adrl fold_consts_ptr, .Lfold_across_16_bytes_consts
337328
338
- vld1.64 {q7}, [arg2]!
339
-CPU_LE( vrev64.8 q7, q7 )
340
- vswp d14, d15
341
- veor.8 q7, q7, q0
329
+ // Load the first 16 data bytes.
330
+ vld1.64 {q7}, [buf]!
331
+CPU_LE( vrev64.8 q7, q7 )
332
+ vswp q7l, q7h
342333
343
- cmp arg3, #16
344
- beq _128_done // exactly 16 left
345
- blt _less_than_16_left
334
+ // XOR the first 16 data *bits* with the initial CRC value.
335
+ vmov.i8 q0h, #0
336
+ vmov.u16 q0h[3], init_crc
337
+ veor.8 q7h, q7h, q0h
346338
347
- // now if there is, load the constants
348
- vldr d20, rk1
349
- vldr d21, rk2 // rk1 and rk2 in xmm10
339
+ // Load the fold-across-16-bytes constants.
340
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
350341
351
- // check if there is enough buffer to be able to fold 16B at a time
352
- subs arg3, arg3, #32
353
- addlt arg3, arg3, #16
354
- blt _get_last_two_regs
355
- b _16B_reduction_loop
356
-
357
-_less_than_16_left:
358
- // shl r9, 4
359
- adr ip, tbl_shf_table + 16
360
- sub ip, ip, arg3
361
- vld1.8 {q0}, [ip]
362
- vmov.i8 q9, #0x80
363
- veor.8 q0, q0, q9
364
- vtbl.8 d18, {d14-d15}, d0
365
- vtbl.8 d15, {d14-d15}, d1
366
- vmov d14, d18
367
- b _128_done
342
+ cmp len, #16
343
+ beq .Lreduce_final_16_bytes // len == 16
344
+ subs len, len, #32
345
+ addlt len, len, #16
346
+ blt .Lhandle_partial_segment // 17 <= len <= 31
347
+ b .Lfold_16_bytes_loop // 32 <= len <= 255
368348 ENDPROC(crc_t10dif_pmull)
369349
370
-// precomputed constants
371
-// these constants are precomputed from the poly:
372
-// 0x8bb70000 (0x8bb7 scaled to 32 bits)
350
+ .section ".rodata", "a"
373351 .align 4
374
-// Q = 0x18BB70000
375
-// rk1 = 2^(32*3) mod Q << 32
376
-// rk2 = 2^(32*5) mod Q << 32
377
-// rk3 = 2^(32*15) mod Q << 32
378
-// rk4 = 2^(32*17) mod Q << 32
379
-// rk5 = 2^(32*3) mod Q << 32
380
-// rk6 = 2^(32*2) mod Q << 32
381
-// rk7 = floor(2^64/Q)
382
-// rk8 = Q
383352
384
-rk3: .quad 0x9d9d000000000000
385
-rk4: .quad 0x7cf5000000000000
386
-rk5: .quad 0x2d56000000000000
387
-rk6: .quad 0x1368000000000000
388
-rk7: .quad 0x00000001f65a57f8
389
-rk8: .quad 0x000000018bb70000
390
-rk9: .quad 0xceae000000000000
391
-rk10: .quad 0xbfd6000000000000
392
-rk11: .quad 0x1e16000000000000
393
-rk12: .quad 0x713c000000000000
394
-rk13: .quad 0xf7f9000000000000
395
-rk14: .quad 0x80a6000000000000
396
-rk15: .quad 0x044c000000000000
397
-rk16: .quad 0xe658000000000000
398
-rk17: .quad 0xad18000000000000
399
-rk18: .quad 0xa497000000000000
400
-rk19: .quad 0x6ee3000000000000
401
-rk20: .quad 0xe7b5000000000000
402
-rk1: .quad 0x2d56000000000000
403
-rk2: .quad 0x06df000000000000
353
+// Fold constants precomputed from the polynomial 0x18bb7
354
+// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
355
+.Lfold_across_128_bytes_consts:
356
+ .quad 0x0000000000006123 // x^(8*128) mod G(x)
357
+ .quad 0x0000000000002295 // x^(8*128+64) mod G(x)
358
+// .Lfold_across_64_bytes_consts:
359
+ .quad 0x0000000000001069 // x^(4*128) mod G(x)
360
+ .quad 0x000000000000dd31 // x^(4*128+64) mod G(x)
361
+// .Lfold_across_32_bytes_consts:
362
+ .quad 0x000000000000857d // x^(2*128) mod G(x)
363
+ .quad 0x0000000000007acc // x^(2*128+64) mod G(x)
364
+.Lfold_across_16_bytes_consts:
365
+ .quad 0x000000000000a010 // x^(1*128) mod G(x)
366
+ .quad 0x0000000000001faa // x^(1*128+64) mod G(x)
367
+// .Lfinal_fold_consts:
368
+ .quad 0x1368000000000000 // x^48 * (x^48 mod G(x))
369
+ .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x))
370
+// .Lbarrett_reduction_consts:
371
+ .quad 0x0000000000018bb7 // G(x)
372
+ .quad 0x00000001f65a57f8 // floor(x^48 / G(x))
404373
405
-tbl_shf_table:
406
-// use these values for shift constants for the tbl/tbx instruction
407
-// different alignments result in values as shown:
408
-// DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
409
-// DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
410
-// DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
411
-// DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
412
-// DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
413
-// DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
414
-// DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
415
-// DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
416
-// DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
417
-// DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
418
-// DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
419
-// DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
420
-// DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
421
-// DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
422
-// DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
423
-
374
+// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
375
+// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
376
+// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
377
+.Lbyteshift_table:
424378 .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
425379 .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
426380 .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7