2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/arch/arm64/crypto/crct10dif-ce-core.S
@@ -2,12 +2,14 @@
 // Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
 //
 // Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+// Copyright (C) 2019 Google LLC <ebiggers@google.com>
 //
 // This program is free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License version 2 as
 // published by the Free Software Foundation.
 //
 
+// Derived from the x86 version:
 //
 // Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
 //
@@ -54,115 +56,176 @@
 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
-// Function API:
-// UINT16 crc_t10dif_pcl(
-// UINT16 init_crc, //initial CRC value, 16 bits
-// const unsigned char *buf, //buffer pointer to calculate CRC on
-// UINT64 len //buffer length in bytes (64-bit data)
-// );
-//
 // Reference paper titled "Fast CRC Computation for Generic
 // Polynomials Using PCLMULQDQ Instruction"
 // URL: http://www.intel.com/content/dam/www/public/us/en/documents
 // /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
-//
 //
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 
         .text
-        .cpu generic+crypto
+        .arch armv8-a+crypto
 
-        arg1_low32 .req w19
-        arg2 .req x20
-        arg3 .req x21
+        init_crc .req w0
+        buf .req x1
+        len .req x2
+        fold_consts_ptr .req x3
 
-        vzr .req v13
+        fold_consts .req v10
 
-ENTRY(crc_t10dif_pmull)
-        frame_push 3, 128
+        ad .req v14
 
-        mov arg1_low32, w0
-        mov arg2, x1
-        mov arg3, x2
+        k00_16 .req v15
+        k32_48 .req v16
 
-        movi vzr.16b, #0 // init zero register
+        t3 .req v17
+        t4 .req v18
+        t5 .req v19
+        t6 .req v20
+        t7 .req v21
+        t8 .req v22
+        t9 .req v23
 
-        // adjust the 16-bit initial_crc value, scale it to 32 bits
-        lsl arg1_low32, arg1_low32, #16
+        perm1 .req v24
+        perm2 .req v25
+        perm3 .req v26
+        perm4 .req v27
 
-        // check if smaller than 256
-        cmp arg3, #256
+        bd1 .req v28
+        bd2 .req v29
+        bd3 .req v30
+        bd4 .req v31
 
-        // for sizes less than 128, we can't fold 64B at a time...
-        b.lt _less_than_128
+        .macro __pmull_init_p64
+        .endm
 
-        // load the initial crc value
-        // crc value does not need to be byte-reflected, but it needs
-        // to be moved to the high part of the register.
-        // because data will be byte-reflected and will align with
-        // initial crc at correct place.
-        movi v10.16b, #0
-        mov v10.s[3], arg1_low32 // initial crc
+        .macro __pmull_pre_p64, bd
+        .endm
 
-        // receive the initial 64B data, xor the initial crc value
-        ldp q0, q1, [arg2]
-        ldp q2, q3, [arg2, #0x20]
-        ldp q4, q5, [arg2, #0x40]
-        ldp q6, q7, [arg2, #0x60]
-        add arg2, arg2, #0x80
+        .macro __pmull_init_p8
+        // k00_16 := 0x0000000000000000_000000000000ffff
+        // k32_48 := 0x00000000ffffffff_0000ffffffffffff
+        movi k32_48.2d, #0xffffffff
+        mov k32_48.h[2], k32_48.h[0]
+        ushr k00_16.2d, k32_48.2d, #32
 
-CPU_LE( rev64 v0.16b, v0.16b )
-CPU_LE( rev64 v1.16b, v1.16b )
-CPU_LE( rev64 v2.16b, v2.16b )
-CPU_LE( rev64 v3.16b, v3.16b )
-CPU_LE( rev64 v4.16b, v4.16b )
-CPU_LE( rev64 v5.16b, v5.16b )
-CPU_LE( rev64 v6.16b, v6.16b )
-CPU_LE( rev64 v7.16b, v7.16b )
+        // prepare the permutation vectors
+        mov_q x5, 0x080f0e0d0c0b0a09
+        movi perm4.8b, #8
+        dup perm1.2d, x5
+        eor perm1.16b, perm1.16b, perm4.16b
+        ushr perm2.2d, perm1.2d, #8
+        ushr perm3.2d, perm1.2d, #16
+        ushr perm4.2d, perm1.2d, #24
+        sli perm2.2d, perm1.2d, #56
+        sli perm3.2d, perm1.2d, #48
+        sli perm4.2d, perm1.2d, #40
+        .endm
 
-CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
-CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
-CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 )
-CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 )
-CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 )
-CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 )
-CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 )
-CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
+        .macro __pmull_pre_p8, bd
+        tbl bd1.16b, {\bd\().16b}, perm1.16b
+        tbl bd2.16b, {\bd\().16b}, perm2.16b
+        tbl bd3.16b, {\bd\().16b}, perm3.16b
+        tbl bd4.16b, {\bd\().16b}, perm4.16b
+        .endm
 
-        // XOR the initial_crc value
-        eor v0.16b, v0.16b, v10.16b
+SYM_FUNC_START_LOCAL(__pmull_p8_core)
+.L__pmull_p8_core:
+        ext t4.8b, ad.8b, ad.8b, #1 // A1
+        ext t5.8b, ad.8b, ad.8b, #2 // A2
+        ext t6.8b, ad.8b, ad.8b, #3 // A3
 
-        ldr_l q10, rk3, x8 // xmm10 has rk3 and rk4
-        // type of pmull instruction
-        // will determine which constant to use
+        pmull t4.8h, t4.8b, fold_consts.8b // F = A1*B
+        pmull t8.8h, ad.8b, bd1.8b // E = A*B1
+        pmull t5.8h, t5.8b, fold_consts.8b // H = A2*B
+        pmull t7.8h, ad.8b, bd2.8b // G = A*B2
+        pmull t6.8h, t6.8b, fold_consts.8b // J = A3*B
+        pmull t9.8h, ad.8b, bd3.8b // I = A*B3
+        pmull t3.8h, ad.8b, bd4.8b // K = A*B4
+        b 0f
 
-        //
-        // we subtract 256 instead of 128 to save one instruction from the loop
-        //
-        sub arg3, arg3, #256
+.L__pmull_p8_core2:
+        tbl t4.16b, {ad.16b}, perm1.16b // A1
+        tbl t5.16b, {ad.16b}, perm2.16b // A2
+        tbl t6.16b, {ad.16b}, perm3.16b // A3
 
-        // at this section of the code, there is 64*x+y (0<=y<64) bytes of
-        // buffer. The _fold_64_B_loop will fold 64B at a time
-        // until we have 64+y Bytes of buffer
+        pmull2 t4.8h, t4.16b, fold_consts.16b // F = A1*B
+        pmull2 t8.8h, ad.16b, bd1.16b // E = A*B1
+        pmull2 t5.8h, t5.16b, fold_consts.16b // H = A2*B
+        pmull2 t7.8h, ad.16b, bd2.16b // G = A*B2
+        pmull2 t6.8h, t6.16b, fold_consts.16b // J = A3*B
+        pmull2 t9.8h, ad.16b, bd3.16b // I = A*B3
+        pmull2 t3.8h, ad.16b, bd4.16b // K = A*B4
 
+0:      eor t4.16b, t4.16b, t8.16b // L = E + F
+        eor t5.16b, t5.16b, t7.16b // M = G + H
+        eor t6.16b, t6.16b, t9.16b // N = I + J
 
-        // fold 64B at a time. This section of the code folds 4 vector
-        // registers in parallel
-_fold_64_B_loop:
+        uzp1 t8.2d, t4.2d, t5.2d
+        uzp2 t4.2d, t4.2d, t5.2d
+        uzp1 t7.2d, t6.2d, t3.2d
+        uzp2 t6.2d, t6.2d, t3.2d
 
-        .macro fold64, reg1, reg2
-        ldp q11, q12, [arg2], #0x20
+        // t4 = (L) (P0 + P1) << 8
+        // t5 = (M) (P2 + P3) << 16
+        eor t8.16b, t8.16b, t4.16b
+        and t4.16b, t4.16b, k32_48.16b
 
-        pmull2 v8.1q, \reg1\().2d, v10.2d
-        pmull \reg1\().1q, \reg1\().1d, v10.1d
+        // t6 = (N) (P4 + P5) << 24
+        // t7 = (K) (P6 + P7) << 32
+        eor t7.16b, t7.16b, t6.16b
+        and t6.16b, t6.16b, k00_16.16b
+
+        eor t8.16b, t8.16b, t4.16b
+        eor t7.16b, t7.16b, t6.16b
+
+        zip2 t5.2d, t8.2d, t4.2d
+        zip1 t4.2d, t8.2d, t4.2d
+        zip2 t3.2d, t7.2d, t6.2d
+        zip1 t6.2d, t7.2d, t6.2d
+
+        ext t4.16b, t4.16b, t4.16b, #15
+        ext t5.16b, t5.16b, t5.16b, #14
+        ext t6.16b, t6.16b, t6.16b, #13
+        ext t3.16b, t3.16b, t3.16b, #12
+
+        eor t4.16b, t4.16b, t5.16b
+        eor t6.16b, t6.16b, t3.16b
+        ret
+SYM_FUNC_END(__pmull_p8_core)
+
+        .macro __pmull_p8, rq, ad, bd, i
+        .ifnc \bd, fold_consts
+        .err
+        .endif
+        mov ad.16b, \ad\().16b
+        .ifb \i
+        pmull \rq\().8h, \ad\().8b, \bd\().8b // D = A*B
+        .else
+        pmull2 \rq\().8h, \ad\().16b, \bd\().16b // D = A*B
+        .endif
+
+        bl .L__pmull_p8_core\i
+
+        eor \rq\().16b, \rq\().16b, t4.16b
+        eor \rq\().16b, \rq\().16b, t6.16b
+        .endm
+
+        // Fold reg1, reg2 into the next 32 data bytes, storing the result back
+        // into reg1, reg2.
+        .macro fold_32_bytes, p, reg1, reg2
+        ldp q11, q12, [buf], #0x20
+
+        __pmull_\p v8, \reg1, fold_consts, 2
+        __pmull_\p \reg1, \reg1, fold_consts
 
 CPU_LE( rev64 v11.16b, v11.16b )
 CPU_LE( rev64 v12.16b, v12.16b )
 
-        pmull2 v9.1q, \reg2\().2d, v10.2d
-        pmull \reg2\().1q, \reg2\().1d, v10.1d
+        __pmull_\p v9, \reg2, fold_consts, 2
+        __pmull_\p \reg2, \reg2, fold_consts
 
 CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
 CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
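Note on the calling convention: the new register aliases above (init_crc .req w0, buf .req x1, len .req x2) line up with the AAPCS64 argument registers of the two C prototypes declared later in this patch. As a rough, hypothetical illustration only (not part of the patch), glue code in the style of arch/arm64/crypto/crct10dif-ce-glue.c would be expected to wrap the entry points roughly like this; the 16-byte minimum and the crc_t10dif_generic() fallback are assumptions based on the usual arm64 SIMD glue pattern:

#include <linux/crc-t10dif.h>
#include <asm/neon.h>
#include <crypto/internal/simd.h>

/* Assembly entry points provided by this file; both require len >= 16. */
asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);

/* Hypothetical wrapper: use PMULL only when NEON may be used in this context. */
static u16 crct10dif_update(u16 crc, const u8 *data, size_t length)
{
        if (length >= 16 && crypto_simd_usable()) {
                kernel_neon_begin();
                crc = crc_t10dif_pmull_p64(crc, data, length);
                kernel_neon_end();
                return crc;
        }
        return crc_t10dif_generic(crc, data, length);
}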
@@ -173,244 +236,279 @@
         eor \reg2\().16b, \reg2\().16b, v12.16b
         .endm
 
-        fold64 v0, v1
-        fold64 v2, v3
-        fold64 v4, v5
-        fold64 v6, v7
-
-        subs arg3, arg3, #128
-
-        // check if there is another 64B in the buffer to be able to fold
-        b.lt _fold_64_B_end
-
-        if_will_cond_yield_neon
-        stp q0, q1, [sp, #.Lframe_local_offset]
-        stp q2, q3, [sp, #.Lframe_local_offset + 32]
-        stp q4, q5, [sp, #.Lframe_local_offset + 64]
-        stp q6, q7, [sp, #.Lframe_local_offset + 96]
-        do_cond_yield_neon
-        ldp q0, q1, [sp, #.Lframe_local_offset]
-        ldp q2, q3, [sp, #.Lframe_local_offset + 32]
-        ldp q4, q5, [sp, #.Lframe_local_offset + 64]
-        ldp q6, q7, [sp, #.Lframe_local_offset + 96]
-        ldr_l q10, rk3, x8
-        movi vzr.16b, #0 // init zero register
-        endif_yield_neon
-
-        b _fold_64_B_loop
-
-_fold_64_B_end:
-        // at this point, the buffer pointer is pointing at the last y Bytes
-        // of the buffer the 64B of folded data is in 4 of the vector
-        // registers: v0, v1, v2, v3
-
-        // fold the 8 vector registers to 1 vector register with different
-        // constants
-
-        ldr_l q10, rk9, x8
-
-        .macro fold16, reg, rk
-        pmull v8.1q, \reg\().1d, v10.1d
-        pmull2 \reg\().1q, \reg\().2d, v10.2d
-        .ifnb \rk
-        ldr_l q10, \rk, x8
+        // Fold src_reg into dst_reg, optionally loading the next fold constants
+        .macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts
+        __pmull_\p v8, \src_reg, fold_consts
+        __pmull_\p \src_reg, \src_reg, fold_consts, 2
+        .ifnb \load_next_consts
+        ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+        __pmull_pre_\p fold_consts
         .endif
-        eor v7.16b, v7.16b, v8.16b
-        eor v7.16b, v7.16b, \reg\().16b
+        eor \dst_reg\().16b, \dst_reg\().16b, v8.16b
+        eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
         .endm
 
-        fold16 v0, rk11
-        fold16 v1, rk13
-        fold16 v2, rk15
-        fold16 v3, rk17
-        fold16 v4, rk19
-        fold16 v5, rk1
-        fold16 v6
+        .macro __pmull_p64, rd, rn, rm, n
+        .ifb \n
+        pmull \rd\().1q, \rn\().1d, \rm\().1d
+        .else
+        pmull2 \rd\().1q, \rn\().2d, \rm\().2d
+        .endif
+        .endm
 
-        // instead of 64, we add 48 to the loop counter to save 1 instruction
-        // from the loop instead of a cmp instruction, we use the negative
-        // flag with the jl instruction
-        adds arg3, arg3, #(128-16)
-        b.lt _final_reduction_for_128
+        .macro crc_t10dif_pmull, p
+        __pmull_init_\p
 
-        // now we have 16+y bytes left to reduce. 16 Bytes is in register v7
-        // and the rest is in memory. We can fold 16 bytes at a time if y>=16
-        // continue folding 16B at a time
+        // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
+        cmp len, #256
+        b.lt .Lless_than_256_bytes_\@
 
-_16B_reduction_loop:
-        pmull v8.1q, v7.1d, v10.1d
-        pmull2 v7.1q, v7.2d, v10.2d
+        adr_l fold_consts_ptr, .Lfold_across_128_bytes_consts
+
+        // Load the first 128 data bytes. Byte swapping is necessary to make
+        // the bit order match the polynomial coefficient order.
+        ldp q0, q1, [buf]
+        ldp q2, q3, [buf, #0x20]
+        ldp q4, q5, [buf, #0x40]
+        ldp q6, q7, [buf, #0x60]
+        add buf, buf, #0x80
+CPU_LE( rev64 v0.16b, v0.16b )
+CPU_LE( rev64 v1.16b, v1.16b )
+CPU_LE( rev64 v2.16b, v2.16b )
+CPU_LE( rev64 v3.16b, v3.16b )
+CPU_LE( rev64 v4.16b, v4.16b )
+CPU_LE( rev64 v5.16b, v5.16b )
+CPU_LE( rev64 v6.16b, v6.16b )
+CPU_LE( rev64 v7.16b, v7.16b )
+CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
+CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
+CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 )
+CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 )
+CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 )
+CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 )
+CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 )
+CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
+
+        // XOR the first 16 data *bits* with the initial CRC value.
+        movi v8.16b, #0
+        mov v8.h[7], init_crc
+        eor v0.16b, v0.16b, v8.16b
+
+        // Load the constants for folding across 128 bytes.
+        ld1 {fold_consts.2d}, [fold_consts_ptr]
+        __pmull_pre_\p fold_consts
+
+        // Subtract 128 for the 128 data bytes just consumed. Subtract another
+        // 128 to simplify the termination condition of the following loop.
+        sub len, len, #256
+
+        // While >= 128 data bytes remain (not counting v0-v7), fold the 128
+        // bytes v0-v7 into them, storing the result back into v0-v7.
+.Lfold_128_bytes_loop_\@:
+        fold_32_bytes \p, v0, v1
+        fold_32_bytes \p, v2, v3
+        fold_32_bytes \p, v4, v5
+        fold_32_bytes \p, v6, v7
+
+        subs len, len, #128
+        b.ge .Lfold_128_bytes_loop_\@
+
+        // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
+
+        // Fold across 64 bytes.
+        add fold_consts_ptr, fold_consts_ptr, #16
+        ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+        __pmull_pre_\p fold_consts
+        fold_16_bytes \p, v0, v4
+        fold_16_bytes \p, v1, v5
+        fold_16_bytes \p, v2, v6
+        fold_16_bytes \p, v3, v7, 1
+        // Fold across 32 bytes.
+        fold_16_bytes \p, v4, v6
+        fold_16_bytes \p, v5, v7, 1
+        // Fold across 16 bytes.
+        fold_16_bytes \p, v6, v7
+
+        // Add 128 to get the correct number of data bytes remaining in 0...127
+        // (not counting v7), following the previous extra subtraction by 128.
+        // Then subtract 16 to simplify the termination condition of the
+        // following loop.
+        adds len, len, #(128-16)
+
+        // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
+        // into them, storing the result back into v7.
+        b.lt .Lfold_16_bytes_loop_done_\@
+.Lfold_16_bytes_loop_\@:
+        __pmull_\p v8, v7, fold_consts
+        __pmull_\p v7, v7, fold_consts, 2
         eor v7.16b, v7.16b, v8.16b
-
-        ldr q0, [arg2], #16
+        ldr q0, [buf], #16
 CPU_LE( rev64 v0.16b, v0.16b )
 CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
         eor v7.16b, v7.16b, v0.16b
-        subs arg3, arg3, #16
+        subs len, len, #16
+        b.ge .Lfold_16_bytes_loop_\@
 
-        // instead of a cmp instruction, we utilize the flags with the
-        // jge instruction equivalent of: cmp arg3, 16-16
-        // check if there is any more 16B in the buffer to be able to fold
-        b.ge _16B_reduction_loop
+.Lfold_16_bytes_loop_done_\@:
+        // Add 16 to get the correct number of data bytes remaining in 0...15
+        // (not counting v7), following the previous extra subtraction by 16.
+        adds len, len, #16
+        b.eq .Lreduce_final_16_bytes_\@
 
-        // now we have 16+z bytes left to reduce, where 0<= z < 16.
-        // first, we reduce the data in the xmm7 register
+.Lhandle_partial_segment_\@:
+        // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
+        // 16 bytes are in v7 and the rest are the remaining data in 'buf'. To
+        // do this without needing a fold constant for each possible 'len',
+        // redivide the bytes into a first chunk of 'len' bytes and a second
+        // chunk of 16 bytes, then fold the first chunk into the second.
 
-_final_reduction_for_128:
-        // check if any more data to fold. If not, compute the CRC of
-        // the final 128 bits
-        adds arg3, arg3, #16
-        b.eq _128_done
+        // v0 = last 16 original data bytes
+        add buf, buf, len
+        ldr q0, [buf, #-16]
+CPU_LE( rev64 v0.16b, v0.16b )
+CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
 
-        // here we are getting data that is less than 16 bytes.
-        // since we know that there was data before the pointer, we can
-        // offset the input pointer before the actual point, to receive
-        // exactly 16 bytes. after that the registers need to be adjusted.
-_get_last_two_regs:
-        add arg2, arg2, arg3
-        ldr q1, [arg2, #-16]
-CPU_LE( rev64 v1.16b, v1.16b )
-CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
+        // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
+        adr_l x4, .Lbyteshift_table + 16
+        sub x4, x4, len
+        ld1 {v2.16b}, [x4]
+        tbl v1.16b, {v7.16b}, v2.16b
 
-        // get rid of the extra data that was loaded before
-        // load the shift constant
-        adr_l x4, tbl_shf_table + 16
-        sub x4, x4, arg3
-        ld1 {v0.16b}, [x4]
+        // v3 = first chunk: v7 right-shifted by '16-len' bytes.
+        movi v3.16b, #0x80
+        eor v2.16b, v2.16b, v3.16b
+        tbl v3.16b, {v7.16b}, v2.16b
 
-        // shift v2 to the left by arg3 bytes
-        tbl v2.16b, {v7.16b}, v0.16b
+        // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
+        sshr v2.16b, v2.16b, #7
 
-        // shift v7 to the right by 16-arg3 bytes
-        movi v9.16b, #0x80
-        eor v0.16b, v0.16b, v9.16b
-        tbl v7.16b, {v7.16b}, v0.16b
+        // v2 = second chunk: 'len' bytes from v0 (low-order bytes),
+        // then '16-len' bytes from v1 (high-order bytes).
+        bsl v2.16b, v1.16b, v0.16b
 
-        // blend
-        sshr v0.16b, v0.16b, #7 // convert to 8-bit mask
-        bsl v0.16b, v2.16b, v1.16b
-
-        // fold 16 Bytes
-        pmull v8.1q, v7.1d, v10.1d
-        pmull2 v7.1q, v7.2d, v10.2d
-        eor v7.16b, v7.16b, v8.16b
+        // Fold the first chunk into the second chunk, storing the result in v7.
+        __pmull_\p v0, v3, fold_consts
+        __pmull_\p v7, v3, fold_consts, 2
         eor v7.16b, v7.16b, v0.16b
+        eor v7.16b, v7.16b, v2.16b
 
-_128_done:
-        // compute crc of a 128-bit value
-        ldr_l q10, rk5, x8 // rk5 and rk6 in xmm10
+.Lreduce_final_16_bytes_\@:
+        // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
 
-        // 64b fold
-        ext v0.16b, vzr.16b, v7.16b, #8
-        mov v7.d[0], v7.d[1]
-        pmull v7.1q, v7.1d, v10.1d
-        eor v7.16b, v7.16b, v0.16b
+        movi v2.16b, #0 // init zero register
 
-        // 32b fold
-        ext v0.16b, v7.16b, vzr.16b, #4
-        mov v7.s[3], vzr.s[0]
-        pmull2 v0.1q, v0.2d, v10.2d
-        eor v7.16b, v7.16b, v0.16b
+        // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+        ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+        __pmull_pre_\p fold_consts
 
-        // barrett reduction
-_barrett:
-        ldr_l q10, rk7, x8
-        mov v0.d[0], v7.d[1]
+        // Fold the high 64 bits into the low 64 bits, while also multiplying by
+        // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
+        // whose low 48 bits are 0.
+        ext v0.16b, v2.16b, v7.16b, #8
+        __pmull_\p v7, v7, fold_consts, 2 // high bits * x^48 * (x^80 mod G(x))
+        eor v0.16b, v0.16b, v7.16b // + low bits * x^64
 
-        pmull v0.1q, v0.1d, v10.1d
-        ext v0.16b, vzr.16b, v0.16b, #12
-        pmull2 v0.1q, v0.2d, v10.2d
-        ext v0.16b, vzr.16b, v0.16b, #12
-        eor v7.16b, v7.16b, v0.16b
-        mov w0, v7.s[1]
+        // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
+        // value congruent to x^64 * M(x) and whose low 48 bits are 0.
+        ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits
+        mov v0.s[3], v2.s[0] // zero high 32 bits
+        __pmull_\p v1, v1, fold_consts // high 32 bits * x^48 * (x^48 mod G(x))
+        eor v0.16b, v0.16b, v1.16b // + low bits
 
-_cleanup:
-        // scale the result back to 16 bits
-        lsr x0, x0, #16
-        frame_pop
+        // Load G(x) and floor(x^48 / G(x)).
+        ld1 {fold_consts.2d}, [fold_consts_ptr]
+        __pmull_pre_\p fold_consts
+
+        // Use Barrett reduction to compute the final CRC value.
+        __pmull_\p v1, v0, fold_consts, 2 // high 32 bits * floor(x^48 / G(x))
+        ushr v1.2d, v1.2d, #32 // /= x^32
+        __pmull_\p v1, v1, fold_consts // *= G(x)
+        ushr v0.2d, v0.2d, #48
+        eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits
+        // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
+
+        umov w0, v0.h[0]
+        .ifc \p, p8
+        ldp x29, x30, [sp], #16
+        .endif
         ret
 
-_less_than_128:
-        cbz arg3, _cleanup
+.Lless_than_256_bytes_\@:
+        // Checksumming a buffer of length 16...255 bytes
 
-        movi v0.16b, #0
-        mov v0.s[3], arg1_low32 // get the initial crc value
+        adr_l fold_consts_ptr, .Lfold_across_16_bytes_consts
 
-        ldr q7, [arg2], #0x10
+        // Load the first 16 data bytes.
+        ldr q7, [buf], #0x10
CPU_LE( rev64 v7.16b, v7.16b )
CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
-        eor v7.16b, v7.16b, v0.16b // xor the initial crc value
 
-        cmp arg3, #16
-        b.eq _128_done // exactly 16 left
-        b.lt _less_than_16_left
+        // XOR the first 16 data *bits* with the initial CRC value.
+        movi v0.16b, #0
+        mov v0.h[7], init_crc
+        eor v7.16b, v7.16b, v0.16b
 
-        ldr_l q10, rk1, x8 // rk1 and rk2 in xmm10
+        // Load the fold-across-16-bytes constants.
+        ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+        __pmull_pre_\p fold_consts
 
-        // update the counter. subtract 32 instead of 16 to save one
-        // instruction from the loop
-        subs arg3, arg3, #32
-        b.ge _16B_reduction_loop
+        cmp len, #16
+        b.eq .Lreduce_final_16_bytes_\@ // len == 16
+        subs len, len, #32
+        b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255
+        add len, len, #16
+        b .Lhandle_partial_segment_\@ // 17 <= len <= 31
+        .endm
 
-        add arg3, arg3, #16
-        b _get_last_two_regs
+//
+// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+SYM_FUNC_START(crc_t10dif_pmull_p8)
+        stp x29, x30, [sp, #-16]!
+        mov x29, sp
+        crc_t10dif_pmull p8
+SYM_FUNC_END(crc_t10dif_pmull_p8)
 
-_less_than_16_left:
-        // shl r9, 4
-        adr_l x0, tbl_shf_table + 16
-        sub x0, x0, arg3
-        ld1 {v0.16b}, [x0]
-        movi v9.16b, #0x80
-        eor v0.16b, v0.16b, v9.16b
-        tbl v7.16b, {v7.16b}, v0.16b
-        b _128_done
-ENDPROC(crc_t10dif_pmull)
+        .align 5
+//
+// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+SYM_FUNC_START(crc_t10dif_pmull_p64)
+        crc_t10dif_pmull p64
+SYM_FUNC_END(crc_t10dif_pmull_p64)
 
-// precomputed constants
-// these constants are precomputed from the poly:
-// 0x8bb70000 (0x8bb7 scaled to 32 bits)
         .section ".rodata", "a"
         .align 4
-// Q = 0x18BB70000
-// rk1 = 2^(32*3) mod Q << 32
-// rk2 = 2^(32*5) mod Q << 32
-// rk3 = 2^(32*15) mod Q << 32
-// rk4 = 2^(32*17) mod Q << 32
-// rk5 = 2^(32*3) mod Q << 32
-// rk6 = 2^(32*2) mod Q << 32
-// rk7 = floor(2^64/Q)
-// rk8 = Q
 
-rk1: .octa 0x06df0000000000002d56000000000000
-rk3: .octa 0x7cf50000000000009d9d000000000000
-rk5: .octa 0x13680000000000002d56000000000000
-rk7: .octa 0x000000018bb7000000000001f65a57f8
-rk9: .octa 0xbfd6000000000000ceae000000000000
-rk11: .octa 0x713c0000000000001e16000000000000
-rk13: .octa 0x80a6000000000000f7f9000000000000
-rk15: .octa 0xe658000000000000044c000000000000
-rk17: .octa 0xa497000000000000ad18000000000000
-rk19: .octa 0xe7b50000000000006ee3000000000000
+// Fold constants precomputed from the polynomial 0x18bb7
+// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+.Lfold_across_128_bytes_consts:
+        .quad 0x0000000000006123 // x^(8*128) mod G(x)
+        .quad 0x0000000000002295 // x^(8*128+64) mod G(x)
+// .Lfold_across_64_bytes_consts:
+        .quad 0x0000000000001069 // x^(4*128) mod G(x)
+        .quad 0x000000000000dd31 // x^(4*128+64) mod G(x)
+// .Lfold_across_32_bytes_consts:
+        .quad 0x000000000000857d // x^(2*128) mod G(x)
+        .quad 0x0000000000007acc // x^(2*128+64) mod G(x)
+.Lfold_across_16_bytes_consts:
+        .quad 0x000000000000a010 // x^(1*128) mod G(x)
+        .quad 0x0000000000001faa // x^(1*128+64) mod G(x)
+// .Lfinal_fold_consts:
+        .quad 0x1368000000000000 // x^48 * (x^48 mod G(x))
+        .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x))
+// .Lbarrett_reduction_consts:
+        .quad 0x0000000000018bb7 // G(x)
+        .quad 0x00000001f65a57f8 // floor(x^48 / G(x))
 
-tbl_shf_table:
-// use these values for shift constants for the tbl/tbx instruction
-// different alignments result in values as shown:
-// DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
-// DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
-// DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
-// DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
-// DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
-// DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
-// DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
-// DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
-// DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
-// DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
-// DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
-// DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
-// DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
-// DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
-// DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
-
+// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
+// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
+// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
+.Lbyteshift_table:
         .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
         .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
         .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
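
For reference, and not part of the patch: the fold constants in the new .rodata table are documented as x^n mod G(x) for the CRC-T10DIF generator G(x) = 0x18bb7, plus the Barrett constant floor(x^48 / G(x)). A small stand-alone helper along these lines can recompute constants of the x^n mod G(x) form; it is a sketch for cross-checking the table comments, not the generator actually used to produce them:

#include <stdint.h>
#include <stdio.h>

#define G 0x18bb7u      /* CRC-T10DIF generator polynomial, degree 16 */

/* x^n mod G(x) over GF(2); the result fits in 16 bits since deg(G) = 16. */
static uint32_t xn_mod_g(unsigned int n)
{
        uint32_t rem = 1;       /* the polynomial "1" */

        while (n--) {
                rem <<= 1;              /* multiply by x */
                if (rem & 0x10000)      /* degree reached 16: reduce by G(x) */
                        rem ^= G;
        }
        return rem;
}

int main(void)
{
        /* Expected to reproduce the fold-across-128-bytes and -16-bytes entries. */
        printf("x^(8*128)    mod G(x) = 0x%04x\n", xn_mod_g(8 * 128));
        printf("x^(8*128+64) mod G(x) = 0x%04x\n", xn_mod_g(8 * 128 + 64));
        printf("x^(1*128)    mod G(x) = 0x%04x\n", xn_mod_g(1 * 128));
        printf("x^(1*128+64) mod G(x) = 0x%04x\n", xn_mod_g(1 * 128 + 64));
        return 0;
}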