~hc/RK356X_SDK_RELEASE.git

..	..	@@ -2,12 +2,14 @@
2	2	// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
3	3	//
4	4	// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
	5	+// Copyright (C) 2019 Google LLC <ebiggers@google.com>
5	6	//
6	7	// This program is free software; you can redistribute it and/or modify
7	8	// it under the terms of the GNU General Public License version 2 as
8	9	// published by the Free Software Foundation.
9	10	//
10	11
	12	+// Derived from the x86 version:
11	13	//
12	14	// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
13	15	//
..	..	@@ -54,18 +56,10 @@
54	56	// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
55	57	// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
56	58	//
57		-// Function API:
58		-// UINT16 crc_t10dif_pcl(
59		-// UINT16 init_crc, //initial CRC value, 16 bits
60		-// const unsigned char *buf, //buffer pointer to calculate CRC on
61		-// UINT64 len //buffer length in bytes (64-bit data)
62		-// );
63		-//
64	59	// Reference paper titled "Fast CRC Computation for Generic
65	60	// Polynomials Using PCLMULQDQ Instruction"
66	61	// URL: http://www.intel.com/content/dam/www/public/us/en/documents
67	62	// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
68		-//
69	63	//
70	64
71	65	#include <linux/linkage.h>
..	..	@@ -78,13 +72,14 @@
78	72	#endif
79	73
80	74	.text
	75	+ .arch armv8-a
81	76	.fpu crypto-neon-fp-armv8
82	77
83		- arg1_low32 .req r0
84		- arg2 .req r1
85		- arg3 .req r2
	78	+ init_crc .req r0 // initial CRC argument; r0 also carries the returned CRC
	79	+ buf .req r1 // pointer to the next unread data byte
	80	+ len .req r2 // byte count; reused as the remaining-bytes counter
86	81
87		- qzr .req q13
	82	+ fold_consts_ptr .req ip // cursor into the fold constant table
88	83
89	84	q0l .req d0
90	85	q0h .req d1
..	..	@@ -102,82 +97,35 @@
102	97	q6h .req d13
103	98	q7l .req d14
104	99	q7h .req d15
	100	+ q8l .req d16
	101	+ q8h .req d17
	102	+ q9l .req d18
	103	+ q9h .req d19
	104	+ q10l .req d20
	105	+ q10h .req d21
	106	+ q11l .req d22
	107	+ q11h .req d23
	108	+ q12l .req d24
	109	+ q12h .req d25
105	110
106		-ENTRY(crc_t10dif_pmull)
107		- vmov.i8 qzr, #0 // init zero register
	111	+ FOLD_CONSTS .req q10 // the currently loaded pair of fold constants
	112	+ FOLD_CONST_L .req q10l // low 64 bits of FOLD_CONSTS
	113	+ FOLD_CONST_H .req q10h // high 64 bits of FOLD_CONSTS
108	114
109		- // adjust the 16-bit initial_crc value, scale it to 32 bits
110		- lsl arg1_low32, arg1_low32, #16
	115	+ // Fold reg1, reg2 into the next 32 data bytes, storing the result back
	116	+ // into reg1, reg2. Advances buf by 32 bytes; overwrites q8, q9, q11 and q12.
	117	+ .macro fold_32_bytes, reg1, reg2
	118	+ vld1.64 {q11-q12}, [buf]! // next 32 data bytes; buf is post-incremented
111	119
112		- // check if smaller than 256
113		- cmp arg3, #256
	120	+ vmull.p64 q8, \reg1\()h, FOLD_CONST_H // q8 = high half of reg1 times the high constant
	121	+ vmull.p64 \reg1, \reg1\()l, FOLD_CONST_L
	122	+ vmull.p64 q9, \reg2\()h, FOLD_CONST_H // q9 = high half of reg2 times the high constant
	123	+ vmull.p64 \reg2, \reg2\()l, FOLD_CONST_L
114	124
115		- // for sizes less than 128, we can't fold 64B at a time...
116		- blt _less_than_128
117		-
118		- // load the initial crc value
119		- // crc value does not need to be byte-reflected, but it needs
120		- // to be moved to the high part of the register.
121		- // because data will be byte-reflected and will align with
122		- // initial crc at correct place.
123		- vmov s0, arg1_low32 // initial crc
124		- vext.8 q10, qzr, q0, #4
125		-
126		- // receive the initial 64B data, xor the initial crc value
127		- vld1.64 {q0-q1}, [arg2]!
128		- vld1.64 {q2-q3}, [arg2]!
129		- vld1.64 {q4-q5}, [arg2]!
130		- vld1.64 {q6-q7}, [arg2]!
131		-CPU_LE( vrev64.8 q0, q0 )
132		-CPU_LE( vrev64.8 q1, q1 )
133		-CPU_LE( vrev64.8 q2, q2 )
134		-CPU_LE( vrev64.8 q3, q3 )
135		-CPU_LE( vrev64.8 q4, q4 )
136		-CPU_LE( vrev64.8 q5, q5 )
137		-CPU_LE( vrev64.8 q6, q6 )
138		-CPU_LE( vrev64.8 q7, q7 )
139		-
140		- vswp d0, d1
141		- vswp d2, d3
142		- vswp d4, d5
143		- vswp d6, d7
144		- vswp d8, d9
145		- vswp d10, d11
146		- vswp d12, d13
147		- vswp d14, d15
148		-
149		- // XOR the initial_crc value
150		- veor.8 q0, q0, q10
151		-
152		- adr ip, rk3
153		- vld1.64 {q10}, [ip, :128] // xmm10 has rk3 and rk4
154		-
155		- //
156		- // we subtract 256 instead of 128 to save one instruction from the loop
157		- //
158		- sub arg3, arg3, #256
159		-
160		- // at this section of the code, there is 64*x+y (0<=y<64) bytes of
161		- // buffer. The _fold_64_B_loop will fold 64B at a time
162		- // until we have 64+y Bytes of buffer
163		-
164		-
165		- // fold 64B at a time. This section of the code folds 4 vector
166		- // registers in parallel
167		-_fold_64_B_loop:
168		-
169		- .macro fold64, reg1, reg2
170		- vld1.64 {q11-q12}, [arg2]!
171		-
172		- vmull.p64 q8, \reg1\()h, d21
173		- vmull.p64 \reg1, \reg1\()l, d20
174		- vmull.p64 q9, \reg2\()h, d21
175		- vmull.p64 \reg2, \reg2\()l, d20
176		-
177		-CPU_LE( vrev64.8 q11, q11 )
178		-CPU_LE( vrev64.8 q12, q12 )
179		- vswp d22, d23
180		- vswp d24, d25
	125	+CPU_LE( vrev64.8 q11, q11 )
	126	+CPU_LE( vrev64.8 q12, q12 )
	127	+ vswp q11l, q11h // exchange the two 64-bit halves
	128	+ vswp q12l, q12h
181	129
182	130	veor.8 \reg1, \reg1, q8 // combine the low-half and high-half products
183	131	veor.8 \reg2, \reg2, q9
..	..	@@ -185,242 +133,248 @@
185	133	veor.8 \reg2, \reg2, q12 // XOR in the newly loaded data
186	134	.endm
187	135
188		- fold64 q0, q1
189		- fold64 q2, q3
190		- fold64 q4, q5
191		- fold64 q6, q7
192		-
193		- subs arg3, arg3, #128
194		-
195		- // check if there is another 64B in the buffer to be able to fold
196		- bge _fold_64_B_loop
197		-
198		- // at this point, the buffer pointer is pointing at the last y Bytes
199		- // of the buffer the 64B of folded data is in 4 of the vector
200		- // registers: v0, v1, v2, v3
201		-
202		- // fold the 8 vector registers to 1 vector register with different
203		- // constants
204		-
205		- adr ip, rk9
206		- vld1.64 {q10}, [ip, :128]!
207		-
208		- .macro fold16, reg, rk
209		- vmull.p64 q8, \reg\()l, d20
210		- vmull.p64 \reg, \reg\()h, d21
211		- .ifnb \rk
212		- vld1.64 {q10}, [ip, :128]!
	136	+ // Fold src_reg into dst_reg, optionally loading the next fold constants. Overwrites src_reg and q8.
	137	+ .macro fold_16_bytes, src_reg, dst_reg, load_next_consts
	138	+ vmull.p64 q8, \src_reg\()l, FOLD_CONST_L // q8 = low half of src_reg times the low constant
	139	+ vmull.p64 \src_reg, \src_reg\()h, FOLD_CONST_H // src_reg is overwritten with its high-half product
	140	+ .ifnb \load_next_consts // any non-blank third argument requests the reload
	141	+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! // placed after both multiplies so they used the old constants
213	142	.endif
214		- veor.8 q7, q7, q8
215		- veor.8 q7, q7, \reg
	143	+ veor.8 \dst_reg, \dst_reg, q8
	144	+ veor.8 \dst_reg, \dst_reg, \src_reg
216	145	.endm
217	146
218		- fold16 q0, rk11
219		- fold16 q1, rk13
220		- fold16 q2, rk15
221		- fold16 q3, rk17
222		- fold16 q4, rk19
223		- fold16 q5, rk1
224		- fold16 q6
	147	+ .macro __adrl, out, sym // out = address of sym, built from two 16-bit immediates
	148	+ movw \out, #:lower16:\sym // low 16 bits of the address
	149	+ movt \out, #:upper16:\sym // high 16 bits of the address
	150	+ .endm
225	151
226		- // instead of 64, we add 48 to the loop counter to save 1 instruction
227		- // from the loop instead of a cmp instruction, we use the negative
228		- // flag with the jl instruction
229		- adds arg3, arg3, #(128-16)
230		- blt _final_reduction_for_128
	152	+//
	153	+// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
	154	+// Arguments arrive in r0 (init_crc), r1 (buf) and r2 (len); the CRC is returned in r0.
	155	+// Assumes len >= 16.
	156	+// Modifies buf, len, r3, ip, q0-q12 and the condition flags; nothing is saved or restored here.
	157	+ENTRY(crc_t10dif_pmull)
231	158
232		- // now we have 16+y bytes left to reduce. 16 Bytes is in register v7
233		- // and the rest is in memory. We can fold 16 bytes at a time if y>=16
234		- // continue folding 16B at a time
	159	+ // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
	160	+ cmp len, #256
	161	+ blt .Lless_than_256_bytes // NOTE(review): signed test on a size_t; len >= 2^31 would be mishandled - confirm callers
235	162
236		-_16B_reduction_loop:
237		- vmull.p64 q8, d14, d20
238		- vmull.p64 q7, d15, d21
	163	+ __adrl fold_consts_ptr, .Lfold_across_128_bytes_consts
	164	+
	165	+ // Load the first 128 data bytes. Byte swapping is necessary to make
	166	+ // the bit order match the polynomial coefficient order.
	167	+ vld1.64 {q0-q1}, [buf]!
	168	+ vld1.64 {q2-q3}, [buf]!
	169	+ vld1.64 {q4-q5}, [buf]!
	170	+ vld1.64 {q6-q7}, [buf]!
	171	+CPU_LE( vrev64.8 q0, q0 )
	172	+CPU_LE( vrev64.8 q1, q1 )
	173	+CPU_LE( vrev64.8 q2, q2 )
	174	+CPU_LE( vrev64.8 q3, q3 )
	175	+CPU_LE( vrev64.8 q4, q4 )
	176	+CPU_LE( vrev64.8 q5, q5 )
	177	+CPU_LE( vrev64.8 q6, q6 )
	178	+CPU_LE( vrev64.8 q7, q7 )
	179	+ vswp q0l, q0h
	180	+ vswp q1l, q1h
	181	+ vswp q2l, q2h
	182	+ vswp q3l, q3h
	183	+ vswp q4l, q4h
	184	+ vswp q5l, q5h
	185	+ vswp q6l, q6h
	186	+ vswp q7l, q7h
	187	+
	188	+ // XOR the first 16 data bits with the initial CRC value.
	189	+ vmov.i8 q8h, #0
	190	+ vmov.u16 q8h[3], init_crc // place init_crc in the top 16-bit lane
	191	+ veor q0h, q0h, q8h
	192	+
	193	+ // Load the constants for folding across 128 bytes.
	194	+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
	195	+
	196	+ // Subtract 128 for the 128 data bytes just consumed. Subtract another
	197	+ // 128 to simplify the termination condition of the following loop.
	198	+ sub len, len, #256
	199	+
	200	+ // While >= 128 data bytes remain (not counting q0-q7), fold the 128
	201	+ // bytes q0-q7 into them, storing the result back into q0-q7.
	202	+.Lfold_128_bytes_loop:
	203	+ fold_32_bytes q0, q1
	204	+ fold_32_bytes q2, q3
	205	+ fold_32_bytes q4, q5
	206	+ fold_32_bytes q6, q7
	207	+ subs len, len, #128
	208	+ bge .Lfold_128_bytes_loop
	209	+
	210	+ // Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.
	211	+
	212	+ // Fold across 64 bytes.
	213	+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
	214	+ fold_16_bytes q0, q4
	215	+ fold_16_bytes q1, q5
	216	+ fold_16_bytes q2, q6
	217	+ fold_16_bytes q3, q7, 1 // non-blank third argument: also load the next (fold-across-32-bytes) constants
	218	+ // Fold across 32 bytes.
	219	+ fold_16_bytes q4, q6
	220	+ fold_16_bytes q5, q7, 1 // also load the next (fold-across-16-bytes) constants
	221	+ // Fold across 16 bytes.
	222	+ fold_16_bytes q6, q7
	223	+
	224	+ // Add 128 to get the correct number of data bytes remaining in 0...127
	225	+ // (not counting q7), following the previous extra subtraction by 128.
	226	+ // Then subtract 16 to simplify the termination condition of the
	227	+ // following loop.
	228	+ adds len, len, #(128-16)
	229	+
	230	+ // While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7
	231	+ // into them, storing the result back into q7.
	232	+ blt .Lfold_16_bytes_loop_done // fewer than 16 data bytes remain
	233	+.Lfold_16_bytes_loop:
	234	+ vmull.p64 q8, q7l, FOLD_CONST_L
	235	+ vmull.p64 q7, q7h, FOLD_CONST_H
239	236	veor.8 q7, q7, q8
240		-
241		- vld1.64 {q0}, [arg2]!
242		-CPU_LE( vrev64.8 q0, q0 )
243		- vswp d0, d1
	237	+ vld1.64 {q0}, [buf]!
	238	+CPU_LE( vrev64.8 q0, q0 )
	239	+ vswp q0l, q0h
244	240	veor.8 q7, q7, q0
245		- subs arg3, arg3, #16
	241	+ subs len, len, #16
	242	+ bge .Lfold_16_bytes_loop
246	243
247		- // instead of a cmp instruction, we utilize the flags with the
248		- // jge instruction equivalent of: cmp arg3, 16-16
249		- // check if there is any more 16B in the buffer to be able to fold
250		- bge _16B_reduction_loop
	244	+.Lfold_16_bytes_loop_done:
	245	+ // Add 16 to get the correct number of data bytes remaining in 0...15
	246	+ // (not counting q7), following the previous extra subtraction by 16.
	247	+ adds len, len, #16
	248	+ beq .Lreduce_final_16_bytes // no partial segment left
251	249
252		- // now we have 16+z bytes left to reduce, where 0<= z < 16.
253		- // first, we reduce the data in the xmm7 register
	250	+.Lhandle_partial_segment:
	251	+ // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
	252	+ // 16 bytes are in q7 and the rest are the remaining data in 'buf'. To
	253	+ // do this without needing a fold constant for each possible 'len',
	254	+ // redivide the bytes into a first chunk of 'len' bytes and a second
	255	+ // chunk of 16 bytes, then fold the first chunk into the second.
254	256
255		-_final_reduction_for_128:
256		- // check if any more data to fold. If not, compute the CRC of
257		- // the final 128 bits
258		- adds arg3, arg3, #16
259		- beq _128_done
	257	+ // q0 = last 16 original data bytes
	258	+ add buf, buf, len
	259	+ sub buf, buf, #16
	260	+ vld1.64 {q0}, [buf]
	261	+CPU_LE( vrev64.8 q0, q0 )
	262	+ vswp q0l, q0h
260	263
261		- // here we are getting data that is less than 16 bytes.
262		- // since we know that there was data before the pointer, we can
263		- // offset the input pointer before the actual point, to receive
264		- // exactly 16 bytes. after that the registers need to be adjusted.
265		-_get_last_two_regs:
266		- add arg2, arg2, arg3
267		- sub arg2, arg2, #16
268		- vld1.64 {q1}, [arg2]
269		-CPU_LE( vrev64.8 q1, q1 )
270		- vswp d2, d3
	264	+ // q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
	265	+ __adrl r3, .Lbyteshift_table + 16
	266	+ sub r3, r3, len
	267	+ vld1.8 {q2}, [r3] // q2 = 16 index bytes starting at .Lbyteshift_table + 16 - len
	268	+ vtbl.8 q1l, {q7l-q7h}, q2l
	269	+ vtbl.8 q1h, {q7l-q7h}, q2h
271	270
272		- // get rid of the extra data that was loaded before
273		- // load the shift constant
274		- adr ip, tbl_shf_table + 16
275		- sub ip, ip, arg3
276		- vld1.8 {q0}, [ip]
	271	+ // q3 = first chunk: q7 right-shifted by '16-len' bytes.
	272	+ vmov.i8 q3, #0x80
	273	+ veor.8 q2, q2, q3
	274	+ vtbl.8 q3l, {q7l-q7h}, q2l
	275	+ vtbl.8 q3h, {q7l-q7h}, q2h
277	276
278		- // shift v2 to the left by arg3 bytes
279		- vtbl.8 d4, {d14-d15}, d0
280		- vtbl.8 d5, {d14-d15}, d1
	277	+ // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
	278	+ vshr.s8 q2, q2, #7
281	279
282		- // shift v7 to the right by 16-arg3 bytes
283		- vmov.i8 q9, #0x80
284		- veor.8 q0, q0, q9
285		- vtbl.8 d18, {d14-d15}, d0
286		- vtbl.8 d19, {d14-d15}, d1
	280	+ // q2 = second chunk: 'len' bytes from q0 (low-order bytes),
	281	+ // then '16-len' bytes from q1 (high-order bytes).
	282	+ vbsl.8 q2, q1, q0
287	283
288		- // blend
289		- vshr.s8 q0, q0, #7 // convert to 8-bit mask
290		- vbsl.8 q0, q2, q1
291		-
292		- // fold 16 Bytes
293		- vmull.p64 q8, d18, d20
294		- vmull.p64 q7, d19, d21
295		- veor.8 q7, q7, q8
	284	+ // Fold the first chunk into the second chunk, storing the result in q7.
	285	+ vmull.p64 q0, q3l, FOLD_CONST_L
	286	+ vmull.p64 q7, q3h, FOLD_CONST_H
296	287	veor.8 q7, q7, q0
	288	+ veor.8 q7, q7, q2
297	289
298		-_128_done:
299		- // compute crc of a 128-bit value
300		- vldr d20, rk5
301		- vldr d21, rk6 // rk5 and rk6 in xmm10
	290	+.Lreduce_final_16_bytes:
	291	+ // Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.
302	292
303		- // 64b fold
304		- vext.8 q0, qzr, q7, #8
305		- vmull.p64 q7, d15, d20
306		- veor.8 q7, q7, q0
	293	+ // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
	294	+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
307	295
308		- // 32b fold
309		- vext.8 q0, q7, qzr, #12
310		- vmov s31, s3
311		- vmull.p64 q0, d0, d21
312		- veor.8 q7, q0, q7
	296	+ // Fold the high 64 bits into the low 64 bits, while also multiplying by
	297	+ // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
	298	+ // whose low 48 bits are 0.
	299	+ vmull.p64 q0, q7h, FOLD_CONST_H // high bits * x^48 * (x^80 mod G(x))
	300	+ veor.8 q0h, q0h, q7l // + low bits * x^64
313	301
314		- // barrett reduction
315		-_barrett:
316		- vldr d20, rk7
317		- vldr d21, rk8
	302	+ // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
	303	+ // value congruent to x^64 * M(x) and whose low 48 bits are 0.
	304	+ vmov.i8 q1, #0
	305	+ vmov s4, s3 // extract high 32 bits
	306	+ vmov s3, s5 // zero high 32 bits
	307	+ vmull.p64 q1, q1l, FOLD_CONST_L // high 32 bits * x^48 * (x^48 mod G(x))
	308	+ veor.8 q0, q0, q1 // + low bits
318	309
319		- vmull.p64 q0, d15, d20
320		- vext.8 q0, qzr, q0, #12
321		- vmull.p64 q0, d1, d21
322		- vext.8 q0, qzr, q0, #12
323		- veor.8 q7, q7, q0
324		- vmov r0, s29
	310	+ // Load G(x) and floor(x^48 / G(x)).
	311	+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128] // last constants used, so no post-increment
325	312
326		-_cleanup:
327		- // scale the result back to 16 bits
328		- lsr r0, r0, #16
	313	+ // Use Barrett reduction to compute the final CRC value.
	314	+ vmull.p64 q1, q0h, FOLD_CONST_H // high 32 bits * floor(x^48 / G(x))
	315	+ vshr.u64 q1l, q1l, #32 // /= x^32
	316	+ vmull.p64 q1, q1l, FOLD_CONST_L // *= G(x)
	317	+ vshr.u64 q0l, q0l, #48
	318	+ veor.8 q0l, q0l, q1l // + low 16 nonzero bits
	319	+ // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of q0.
	320	+
	321	+ vmov.u16 r0, q0l[0] // return value: low 16 bits of q0
329	322	bx lr
330	323
331		-_less_than_128:
332		- teq arg3, #0
333		- beq _cleanup
	324	+.Lless_than_256_bytes:
	325	+ // Checksumming a buffer of length 16...255 bytes
334	326
335		- vmov.i8 q0, #0
336		- vmov s3, arg1_low32 // get the initial crc value
	327	+ __adrl fold_consts_ptr, .Lfold_across_16_bytes_consts
337	328
338		- vld1.64 {q7}, [arg2]!
339		-CPU_LE( vrev64.8 q7, q7 )
340		- vswp d14, d15
341		- veor.8 q7, q7, q0
	329	+ // Load the first 16 data bytes.
	330	+ vld1.64 {q7}, [buf]!
	331	+CPU_LE( vrev64.8 q7, q7 )
	332	+ vswp q7l, q7h
342	333
343		- cmp arg3, #16
344		- beq _128_done // exactly 16 left
345		- blt _less_than_16_left
	334	+ // XOR the first 16 data bits with the initial CRC value.
	335	+ vmov.i8 q0h, #0
	336	+ vmov.u16 q0h[3], init_crc
	337	+ veor.8 q7h, q7h, q0h
346	338
347		- // now if there is, load the constants
348		- vldr d20, rk1
349		- vldr d21, rk2 // rk1 and rk2 in xmm10
	339	+ // Load the fold-across-16-bytes constants.
	340	+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! // post-increment leaves fold_consts_ptr at the final fold constants
350	341
351		- // check if there is enough buffer to be able to fold 16B at a time
352		- subs arg3, arg3, #32
353		- addlt arg3, arg3, #16
354		- blt _get_last_two_regs
355		- b _16B_reduction_loop
356		-
357		-_less_than_16_left:
358		- // shl r9, 4
359		- adr ip, tbl_shf_table + 16
360		- sub ip, ip, arg3
361		- vld1.8 {q0}, [ip]
362		- vmov.i8 q9, #0x80
363		- veor.8 q0, q0, q9
364		- vtbl.8 d18, {d14-d15}, d0
365		- vtbl.8 d15, {d14-d15}, d1
366		- vmov d14, d18
367		- b _128_done
	342	+ cmp len, #16
	343	+ beq .Lreduce_final_16_bytes // len == 16
	344	+ subs len, len, #32
	345	+ addlt len, len, #16 // len = original len - 16; does not set flags, so blt below still tests the subs result
	346	+ blt .Lhandle_partial_segment // 17 <= len <= 31
	347	+ b .Lfold_16_bytes_loop // 32 <= len <= 255
368	348	ENDPROC(crc_t10dif_pmull)
369	349
370		-// precomputed constants
371		-// these constants are precomputed from the poly:
372		-// 0x8bb70000 (0x8bb7 scaled to 32 bits)
	350	+ .section ".rodata", "a"
373	351	.align 4 // 2^4 = 16-byte alignment, as required by the :128-qualified vld1 loads
374		-// Q = 0x18BB70000
375		-// rk1 = 2^(32*3) mod Q << 32
376		-// rk2 = 2^(32*5) mod Q << 32
377		-// rk3 = 2^(32*15) mod Q << 32
378		-// rk4 = 2^(32*17) mod Q << 32
379		-// rk5 = 2^(32*3) mod Q << 32
380		-// rk6 = 2^(32*2) mod Q << 32
381		-// rk7 = floor(2^64/Q)
382		-// rk8 = Q
383	352
384		-rk3: .quad 0x9d9d000000000000
385		-rk4: .quad 0x7cf5000000000000
386		-rk5: .quad 0x2d56000000000000
387		-rk6: .quad 0x1368000000000000
388		-rk7: .quad 0x00000001f65a57f8
389		-rk8: .quad 0x000000018bb70000
390		-rk9: .quad 0xceae000000000000
391		-rk10: .quad 0xbfd6000000000000
392		-rk11: .quad 0x1e16000000000000
393		-rk12: .quad 0x713c000000000000
394		-rk13: .quad 0xf7f9000000000000
395		-rk14: .quad 0x80a6000000000000
396		-rk15: .quad 0x044c000000000000
397		-rk16: .quad 0xe658000000000000
398		-rk17: .quad 0xad18000000000000
399		-rk18: .quad 0xa497000000000000
400		-rk19: .quad 0x6ee3000000000000
401		-rk20: .quad 0xe7b5000000000000
402		-rk1: .quad 0x2d56000000000000
403		-rk2: .quad 0x06df000000000000
	353	+// Fold constants precomputed from the polynomial 0x18bb7
	354	+// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
	355	+.Lfold_across_128_bytes_consts:
	356	+ .quad 0x0000000000006123 // x^(8*128) mod G(x)
	357	+ .quad 0x0000000000002295 // x^(8*128+64) mod G(x)
	358	+// .Lfold_across_64_bytes_consts: (unlabelled entries are reached by post-incrementing fold_consts_ptr, so their order matters)
	359	+ .quad 0x0000000000001069 // x^(4*128) mod G(x)
	360	+ .quad 0x000000000000dd31 // x^(4*128+64) mod G(x)
	361	+// .Lfold_across_32_bytes_consts:
	362	+ .quad 0x000000000000857d // x^(2*128) mod G(x)
	363	+ .quad 0x0000000000007acc // x^(2*128+64) mod G(x)
	364	+.Lfold_across_16_bytes_consts:
	365	+ .quad 0x000000000000a010 // x^(1*128) mod G(x)
	366	+ .quad 0x0000000000001faa // x^(1*128+64) mod G(x)
	367	+// .Lfinal_fold_consts:
	368	+ .quad 0x1368000000000000 // x^48 * (x^48 mod G(x))
	369	+ .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x))
	370	+// .Lbarrett_reduction_consts:
	371	+ .quad 0x0000000000018bb7 // G(x)
	372	+ .quad 0x00000001f65a57f8 // floor(x^48 / G(x))
404	373
405		-tbl_shf_table:
406		-// use these values for shift constants for the tbl/tbx instruction
407		-// different alignments result in values as shown:
408		-// DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
409		-// DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
410		-// DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
411		-// DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
412		-// DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
413		-// DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
414		-// DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
415		-// DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
416		-// DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
417		-// DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
418		-// DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
419		-// DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
420		-// DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
421		-// DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
422		-// DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
423		-
	374	+// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
	375	+// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
	376	+// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
	377	+.Lbyteshift_table:
424	378	.byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
425	379	.byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
426	380	.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7