2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/arch/x86/crypto/crct10dif-pcl-asm_64.S
@@ -43,609 +43,291 @@
4343 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
4444 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
4545 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
46
-########################################################################
47
-# Function API:
48
-# UINT16 crc_t10dif_pcl(
49
-# UINT16 init_crc, //initial CRC value, 16 bits
50
-# const unsigned char *buf, //buffer pointer to calculate CRC on
51
-# UINT64 len //buffer length in bytes (64-bit data)
52
-# );
5346 #
5447 # Reference paper titled "Fast CRC Computation for Generic
5548 # Polynomials Using PCLMULQDQ Instruction"
5649 # URL: http://www.intel.com/content/dam/www/public/us/en/documents
5750 # /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
5851 #
59
-#
6052
6153 #include <linux/linkage.h>
6254
6355 .text
6456
65
-#define arg1 %rdi
66
-#define arg2 %rsi
67
-#define arg3 %rdx
57
+#define init_crc %edi
58
+#define buf %rsi
59
+#define len %rdx
6860
69
-#define arg1_low32 %edi
61
+#define FOLD_CONSTS %xmm10
62
+#define BSWAP_MASK %xmm11
7063
71
-ENTRY(crc_t10dif_pcl)
64
+# Fold reg1, reg2 into the next 32 data bytes, storing the result back into
65
+# reg1, reg2.
66
+.macro fold_32_bytes offset, reg1, reg2
67
+ movdqu \offset(buf), %xmm9
68
+ movdqu \offset+16(buf), %xmm12
69
+ pshufb BSWAP_MASK, %xmm9
70
+ pshufb BSWAP_MASK, %xmm12
71
+ movdqa \reg1, %xmm8
72
+ movdqa \reg2, %xmm13
73
+ pclmulqdq $0x00, FOLD_CONSTS, \reg1
74
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm8
75
+ pclmulqdq $0x00, FOLD_CONSTS, \reg2
76
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm13
77
+ pxor %xmm9 , \reg1
78
+ xorps %xmm8 , \reg1
79
+ pxor %xmm12, \reg2
80
+ xorps %xmm13, \reg2
81
+.endm
82
+
83
+# Fold src_reg into dst_reg.
84
+.macro fold_16_bytes src_reg, dst_reg
85
+ movdqa \src_reg, %xmm8
86
+ pclmulqdq $0x11, FOLD_CONSTS, \src_reg
87
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm8
88
+ pxor %xmm8, \dst_reg
89
+ xorps \src_reg, \dst_reg
90
+.endm
91
+
92
+#
93
+# u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);
94
+#
95
+# Assumes len >= 16.
96
+#
7297 .align 16
98
+SYM_FUNC_START(crc_t10dif_pcl)
7399
74
- # adjust the 16-bit initial_crc value, scale it to 32 bits
75
- shl $16, arg1_low32
100
+ movdqa .Lbswap_mask(%rip), BSWAP_MASK
76101
77
- # Allocate Stack Space
78
- mov %rsp, %rcx
79
- sub $16*2, %rsp
80
- # align stack to 16 byte boundary
81
- and $~(0x10 - 1), %rsp
102
+ # For sizes less than 256 bytes, we can't fold 128 bytes at a time.
103
+ cmp $256, len
104
+ jl .Lless_than_256_bytes
82105
83
- # check if smaller than 256
84
- cmp $256, arg3
106
+ # Load the first 128 data bytes. Byte swapping is necessary to make the
107
+ # bit order match the polynomial coefficient order.
108
+ movdqu 16*0(buf), %xmm0
109
+ movdqu 16*1(buf), %xmm1
110
+ movdqu 16*2(buf), %xmm2
111
+ movdqu 16*3(buf), %xmm3
112
+ movdqu 16*4(buf), %xmm4
113
+ movdqu 16*5(buf), %xmm5
114
+ movdqu 16*6(buf), %xmm6
115
+ movdqu 16*7(buf), %xmm7
116
+ add $128, buf
117
+ pshufb BSWAP_MASK, %xmm0
118
+ pshufb BSWAP_MASK, %xmm1
119
+ pshufb BSWAP_MASK, %xmm2
120
+ pshufb BSWAP_MASK, %xmm3
121
+ pshufb BSWAP_MASK, %xmm4
122
+ pshufb BSWAP_MASK, %xmm5
123
+ pshufb BSWAP_MASK, %xmm6
124
+ pshufb BSWAP_MASK, %xmm7
85125
86
- # for sizes less than 128, we can't fold 64B at a time...
87
- jl _less_than_128
126
+ # XOR the first 16 data *bits* with the initial CRC value.
127
+ pxor %xmm8, %xmm8
128
+ pinsrw $7, init_crc, %xmm8
129
+ pxor %xmm8, %xmm0
88130
131
+ movdqa .Lfold_across_128_bytes_consts(%rip), FOLD_CONSTS
89132
90
- # load the initial crc value
91
- movd arg1_low32, %xmm10 # initial crc
133
+ # Subtract 128 for the 128 data bytes just consumed. Subtract another
134
+ # 128 to simplify the termination condition of the following loop.
135
+ sub $256, len
92136
93
- # crc value does not need to be byte-reflected, but it needs
94
- # to be moved to the high part of the register.
95
- # because data will be byte-reflected and will align with
96
- # initial crc at correct place.
97
- pslldq $12, %xmm10
137
+ # While >= 128 data bytes remain (not counting xmm0-7), fold the 128
138
+ # bytes xmm0-7 into them, storing the result back into xmm0-7.
139
+.Lfold_128_bytes_loop:
140
+ fold_32_bytes 0, %xmm0, %xmm1
141
+ fold_32_bytes 32, %xmm2, %xmm3
142
+ fold_32_bytes 64, %xmm4, %xmm5
143
+ fold_32_bytes 96, %xmm6, %xmm7
144
+ add $128, buf
145
+ sub $128, len
146
+ jge .Lfold_128_bytes_loop
98147
99
- movdqa SHUF_MASK(%rip), %xmm11
100
- # receive the initial 64B data, xor the initial crc value
101
- movdqu 16*0(arg2), %xmm0
102
- movdqu 16*1(arg2), %xmm1
103
- movdqu 16*2(arg2), %xmm2
104
- movdqu 16*3(arg2), %xmm3
105
- movdqu 16*4(arg2), %xmm4
106
- movdqu 16*5(arg2), %xmm5
107
- movdqu 16*6(arg2), %xmm6
108
- movdqu 16*7(arg2), %xmm7
148
+ # Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7.
109149
110
- pshufb %xmm11, %xmm0
111
- # XOR the initial_crc value
112
- pxor %xmm10, %xmm0
113
- pshufb %xmm11, %xmm1
114
- pshufb %xmm11, %xmm2
115
- pshufb %xmm11, %xmm3
116
- pshufb %xmm11, %xmm4
117
- pshufb %xmm11, %xmm5
118
- pshufb %xmm11, %xmm6
119
- pshufb %xmm11, %xmm7
150
+ # Fold across 64 bytes.
151
+ movdqa .Lfold_across_64_bytes_consts(%rip), FOLD_CONSTS
152
+ fold_16_bytes %xmm0, %xmm4
153
+ fold_16_bytes %xmm1, %xmm5
154
+ fold_16_bytes %xmm2, %xmm6
155
+ fold_16_bytes %xmm3, %xmm7
156
+ # Fold across 32 bytes.
157
+ movdqa .Lfold_across_32_bytes_consts(%rip), FOLD_CONSTS
158
+ fold_16_bytes %xmm4, %xmm6
159
+ fold_16_bytes %xmm5, %xmm7
160
+ # Fold across 16 bytes.
161
+ movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
162
+ fold_16_bytes %xmm6, %xmm7
120163
121
- movdqa rk3(%rip), %xmm10 #xmm10 has rk3 and rk4
122
- #imm value of pclmulqdq instruction
123
- #will determine which constant to use
164
+ # Add 128 to get the correct number of data bytes remaining in 0...127
165
+ # (not counting xmm7), following the previous extra subtraction by 128.
166
+ # Then subtract 16 to simplify the termination condition of the
167
+ # following loop.
168
+ add $128-16, len
124169
125
- #################################################################
126
- # we subtract 256 instead of 128 to save one instruction from the loop
127
- sub $256, arg3
128
-
129
- # at this section of the code, there is 64*x+y (0<=y<64) bytes of
130
- # buffer. The _fold_64_B_loop will fold 64B at a time
131
- # until we have 64+y Bytes of buffer
132
-
133
-
134
- # fold 64B at a time. This section of the code folds 4 xmm
135
- # registers in parallel
136
-_fold_64_B_loop:
137
-
138
- # update the buffer pointer
139
- add $128, arg2 # buf += 64#
140
-
141
- movdqu 16*0(arg2), %xmm9
142
- movdqu 16*1(arg2), %xmm12
143
- pshufb %xmm11, %xmm9
144
- pshufb %xmm11, %xmm12
145
- movdqa %xmm0, %xmm8
146
- movdqa %xmm1, %xmm13
147
- pclmulqdq $0x0 , %xmm10, %xmm0
148
- pclmulqdq $0x11, %xmm10, %xmm8
149
- pclmulqdq $0x0 , %xmm10, %xmm1
150
- pclmulqdq $0x11, %xmm10, %xmm13
151
- pxor %xmm9 , %xmm0
152
- xorps %xmm8 , %xmm0
153
- pxor %xmm12, %xmm1
154
- xorps %xmm13, %xmm1
155
-
156
- movdqu 16*2(arg2), %xmm9
157
- movdqu 16*3(arg2), %xmm12
158
- pshufb %xmm11, %xmm9
159
- pshufb %xmm11, %xmm12
160
- movdqa %xmm2, %xmm8
161
- movdqa %xmm3, %xmm13
162
- pclmulqdq $0x0, %xmm10, %xmm2
163
- pclmulqdq $0x11, %xmm10, %xmm8
164
- pclmulqdq $0x0, %xmm10, %xmm3
165
- pclmulqdq $0x11, %xmm10, %xmm13
166
- pxor %xmm9 , %xmm2
167
- xorps %xmm8 , %xmm2
168
- pxor %xmm12, %xmm3
169
- xorps %xmm13, %xmm3
170
-
171
- movdqu 16*4(arg2), %xmm9
172
- movdqu 16*5(arg2), %xmm12
173
- pshufb %xmm11, %xmm9
174
- pshufb %xmm11, %xmm12
175
- movdqa %xmm4, %xmm8
176
- movdqa %xmm5, %xmm13
177
- pclmulqdq $0x0, %xmm10, %xmm4
178
- pclmulqdq $0x11, %xmm10, %xmm8
179
- pclmulqdq $0x0, %xmm10, %xmm5
180
- pclmulqdq $0x11, %xmm10, %xmm13
181
- pxor %xmm9 , %xmm4
182
- xorps %xmm8 , %xmm4
183
- pxor %xmm12, %xmm5
184
- xorps %xmm13, %xmm5
185
-
186
- movdqu 16*6(arg2), %xmm9
187
- movdqu 16*7(arg2), %xmm12
188
- pshufb %xmm11, %xmm9
189
- pshufb %xmm11, %xmm12
190
- movdqa %xmm6 , %xmm8
191
- movdqa %xmm7 , %xmm13
192
- pclmulqdq $0x0 , %xmm10, %xmm6
193
- pclmulqdq $0x11, %xmm10, %xmm8
194
- pclmulqdq $0x0 , %xmm10, %xmm7
195
- pclmulqdq $0x11, %xmm10, %xmm13
196
- pxor %xmm9 , %xmm6
197
- xorps %xmm8 , %xmm6
198
- pxor %xmm12, %xmm7
199
- xorps %xmm13, %xmm7
200
-
201
- sub $128, arg3
202
-
203
- # check if there is another 64B in the buffer to be able to fold
204
- jge _fold_64_B_loop
205
- ##################################################################
206
-
207
-
208
- add $128, arg2
209
- # at this point, the buffer pointer is pointing at the last y Bytes
210
- # of the buffer the 64B of folded data is in 4 of the xmm
211
- # registers: xmm0, xmm1, xmm2, xmm3
212
-
213
-
214
- # fold the 8 xmm registers to 1 xmm register with different constants
215
-
216
- movdqa rk9(%rip), %xmm10
217
- movdqa %xmm0, %xmm8
218
- pclmulqdq $0x11, %xmm10, %xmm0
219
- pclmulqdq $0x0 , %xmm10, %xmm8
220
- pxor %xmm8, %xmm7
221
- xorps %xmm0, %xmm7
222
-
223
- movdqa rk11(%rip), %xmm10
224
- movdqa %xmm1, %xmm8
225
- pclmulqdq $0x11, %xmm10, %xmm1
226
- pclmulqdq $0x0 , %xmm10, %xmm8
227
- pxor %xmm8, %xmm7
228
- xorps %xmm1, %xmm7
229
-
230
- movdqa rk13(%rip), %xmm10
231
- movdqa %xmm2, %xmm8
232
- pclmulqdq $0x11, %xmm10, %xmm2
233
- pclmulqdq $0x0 , %xmm10, %xmm8
234
- pxor %xmm8, %xmm7
235
- pxor %xmm2, %xmm7
236
-
237
- movdqa rk15(%rip), %xmm10
238
- movdqa %xmm3, %xmm8
239
- pclmulqdq $0x11, %xmm10, %xmm3
240
- pclmulqdq $0x0 , %xmm10, %xmm8
241
- pxor %xmm8, %xmm7
242
- xorps %xmm3, %xmm7
243
-
244
- movdqa rk17(%rip), %xmm10
245
- movdqa %xmm4, %xmm8
246
- pclmulqdq $0x11, %xmm10, %xmm4
247
- pclmulqdq $0x0 , %xmm10, %xmm8
248
- pxor %xmm8, %xmm7
249
- pxor %xmm4, %xmm7
250
-
251
- movdqa rk19(%rip), %xmm10
252
- movdqa %xmm5, %xmm8
253
- pclmulqdq $0x11, %xmm10, %xmm5
254
- pclmulqdq $0x0 , %xmm10, %xmm8
255
- pxor %xmm8, %xmm7
256
- xorps %xmm5, %xmm7
257
-
258
- movdqa rk1(%rip), %xmm10 #xmm10 has rk1 and rk2
259
- #imm value of pclmulqdq instruction
260
- #will determine which constant to use
261
- movdqa %xmm6, %xmm8
262
- pclmulqdq $0x11, %xmm10, %xmm6
263
- pclmulqdq $0x0 , %xmm10, %xmm8
264
- pxor %xmm8, %xmm7
265
- pxor %xmm6, %xmm7
266
-
267
-
268
- # instead of 64, we add 48 to the loop counter to save 1 instruction
269
- # from the loop instead of a cmp instruction, we use the negative
270
- # flag with the jl instruction
271
- add $128-16, arg3
272
- jl _final_reduction_for_128
273
-
274
- # now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7
275
- # and the rest is in memory. We can fold 16 bytes at a time if y>=16
276
- # continue folding 16B at a time
277
-
278
-_16B_reduction_loop:
170
+ # While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes
171
+ # xmm7 into them, storing the result back into xmm7.
172
+ jl .Lfold_16_bytes_loop_done
173
+.Lfold_16_bytes_loop:
279174 movdqa %xmm7, %xmm8
280
- pclmulqdq $0x11, %xmm10, %xmm7
281
- pclmulqdq $0x0 , %xmm10, %xmm8
175
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm7
176
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm8
282177 pxor %xmm8, %xmm7
283
- movdqu (arg2), %xmm0
284
- pshufb %xmm11, %xmm0
178
+ movdqu (buf), %xmm0
179
+ pshufb BSWAP_MASK, %xmm0
285180 pxor %xmm0 , %xmm7
286
- add $16, arg2
287
- sub $16, arg3
288
- # instead of a cmp instruction, we utilize the flags with the
289
- # jge instruction equivalent of: cmp arg3, 16-16
290
- # check if there is any more 16B in the buffer to be able to fold
291
- jge _16B_reduction_loop
181
+ add $16, buf
182
+ sub $16, len
183
+ jge .Lfold_16_bytes_loop
292184
293
- #now we have 16+z bytes left to reduce, where 0<= z < 16.
294
- #first, we reduce the data in the xmm7 register
185
+.Lfold_16_bytes_loop_done:
186
+ # Add 16 to get the correct number of data bytes remaining in 0...15
187
+ # (not counting xmm7), following the previous extra subtraction by 16.
188
+ add $16, len
189
+ je .Lreduce_final_16_bytes
295190
191
+.Lhandle_partial_segment:
192
+ # Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16
193
+ # bytes are in xmm7 and the rest are the remaining data in 'buf'. To do
194
+ # this without needing a fold constant for each possible 'len', redivide
195
+ # the bytes into a first chunk of 'len' bytes and a second chunk of 16
196
+ # bytes, then fold the first chunk into the second.
296197
297
-_final_reduction_for_128:
298
- # check if any more data to fold. If not, compute the CRC of
299
- # the final 128 bits
300
- add $16, arg3
301
- je _128_done
302
-
303
- # here we are getting data that is less than 16 bytes.
304
- # since we know that there was data before the pointer, we can
305
- # offset the input pointer before the actual point, to receive
306
- # exactly 16 bytes. after that the registers need to be adjusted.
307
-_get_last_two_xmms:
308198 movdqa %xmm7, %xmm2
309199
310
- movdqu -16(arg2, arg3), %xmm1
311
- pshufb %xmm11, %xmm1
200
+ # xmm1 = last 16 original data bytes
201
+ movdqu -16(buf, len), %xmm1
202
+ pshufb BSWAP_MASK, %xmm1
312203
313
- # get rid of the extra data that was loaded before
314
- # load the shift constant
315
- lea pshufb_shf_table+16(%rip), %rax
316
- sub arg3, %rax
204
+ # xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes.
205
+ lea .Lbyteshift_table+16(%rip), %rax
206
+ sub len, %rax
317207 movdqu (%rax), %xmm0
318
-
319
- # shift xmm2 to the left by arg3 bytes
320208 pshufb %xmm0, %xmm2
321209
322
- # shift xmm7 to the right by 16-arg3 bytes
323
- pxor mask1(%rip), %xmm0
210
+ # xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes.
211
+ pxor .Lmask1(%rip), %xmm0
324212 pshufb %xmm0, %xmm7
213
+
214
+ # xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes),
215
+ # then '16-len' bytes from xmm2 (high-order bytes).
325216 pblendvb %xmm2, %xmm1 #xmm0 is implicit
326217
327
- # fold 16 Bytes
328
- movdqa %xmm1, %xmm2
218
+ # Fold the first chunk into the second chunk, storing the result in xmm7.
329219 movdqa %xmm7, %xmm8
330
- pclmulqdq $0x11, %xmm10, %xmm7
331
- pclmulqdq $0x0 , %xmm10, %xmm8
220
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm7
221
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm8
332222 pxor %xmm8, %xmm7
333
- pxor %xmm2, %xmm7
223
+ pxor %xmm1, %xmm7
334224
335
-_128_done:
336
- # compute crc of a 128-bit value
337
- movdqa rk5(%rip), %xmm10 # rk5 and rk6 in xmm10
225
+.Lreduce_final_16_bytes:
226
+ # Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC
227
+
228
+ # Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
229
+ movdqa .Lfinal_fold_consts(%rip), FOLD_CONSTS
230
+
231
+ # Fold the high 64 bits into the low 64 bits, while also multiplying by
232
+ # x^64. This produces a 128-bit value congruent to x^64 * M(x) and
233
+ # whose low 48 bits are 0.
338234 movdqa %xmm7, %xmm0
235
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x))
236
+ pslldq $8, %xmm0
237
+ pxor %xmm0, %xmm7 # + low bits * x^64
339238
340
- #64b fold
341
- pclmulqdq $0x1, %xmm10, %xmm7
342
- pslldq $8 , %xmm0
343
- pxor %xmm0, %xmm7
344
-
345
- #32b fold
239
+ # Fold the high 32 bits into the low 96 bits. This produces a 96-bit
240
+ # value congruent to x^64 * M(x) and whose low 48 bits are 0.
346241 movdqa %xmm7, %xmm0
242
+ pand .Lmask2(%rip), %xmm0 # zero high 32 bits
243
+ psrldq $12, %xmm7 # extract high 32 bits
244
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x))
245
+ pxor %xmm0, %xmm7 # + low bits
347246
348
- pand mask2(%rip), %xmm0
247
+ # Load G(x) and floor(x^48 / G(x)).
248
+ movdqa .Lbarrett_reduction_consts(%rip), FOLD_CONSTS
349249
350
- psrldq $12, %xmm7
351
- pclmulqdq $0x10, %xmm10, %xmm7
250
+ # Use Barrett reduction to compute the final CRC value.
251
+ movdqa %xmm7, %xmm0
252
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x))
253
+ psrlq $32, %xmm7 # /= x^32
254
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # *= G(x)
255
+ psrlq $48, %xmm0
256
+ pxor %xmm7, %xmm0 # + low 16 nonzero bits
257
+ # Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0.
258
+
259
+ pextrw $0, %xmm0, %eax
260
+ RET
261
+
262
+.align 16
263
+.Lless_than_256_bytes:
264
+ # Checksumming a buffer of length 16...255 bytes
265
+
266
+ # Load the first 16 data bytes.
267
+ movdqu (buf), %xmm7
268
+ pshufb BSWAP_MASK, %xmm7
269
+ add $16, buf
270
+
271
+ # XOR the first 16 data *bits* with the initial CRC value.
272
+ pxor %xmm0, %xmm0
273
+ pinsrw $7, init_crc, %xmm0
352274 pxor %xmm0, %xmm7
353275
354
- #barrett reduction
355
-_barrett:
356
- movdqa rk7(%rip), %xmm10 # rk7 and rk8 in xmm10
357
- movdqa %xmm7, %xmm0
358
- pclmulqdq $0x01, %xmm10, %xmm7
359
- pslldq $4, %xmm7
360
- pclmulqdq $0x11, %xmm10, %xmm7
361
-
362
- pslldq $4, %xmm7
363
- pxor %xmm0, %xmm7
364
- pextrd $1, %xmm7, %eax
365
-
366
-_cleanup:
367
- # scale the result back to 16 bits
368
- shr $16, %eax
369
- mov %rcx, %rsp
370
- ret
371
-
372
-########################################################################
373
-
374
-.align 16
375
-_less_than_128:
376
-
377
- # check if there is enough buffer to be able to fold 16B at a time
378
- cmp $32, arg3
379
- jl _less_than_32
380
- movdqa SHUF_MASK(%rip), %xmm11
381
-
382
- # now if there is, load the constants
383
- movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10
384
-
385
- movd arg1_low32, %xmm0 # get the initial crc value
386
- pslldq $12, %xmm0 # align it to its correct place
387
- movdqu (arg2), %xmm7 # load the plaintext
388
- pshufb %xmm11, %xmm7 # byte-reflect the plaintext
389
- pxor %xmm0, %xmm7
390
-
391
-
392
- # update the buffer pointer
393
- add $16, arg2
394
-
395
- # update the counter. subtract 32 instead of 16 to save one
396
- # instruction from the loop
397
- sub $32, arg3
398
-
399
- jmp _16B_reduction_loop
400
-
401
-
402
-.align 16
403
-_less_than_32:
404
- # mov initial crc to the return value. this is necessary for
405
- # zero-length buffers.
406
- mov arg1_low32, %eax
407
- test arg3, arg3
408
- je _cleanup
409
-
410
- movdqa SHUF_MASK(%rip), %xmm11
411
-
412
- movd arg1_low32, %xmm0 # get the initial crc value
413
- pslldq $12, %xmm0 # align it to its correct place
414
-
415
- cmp $16, arg3
416
- je _exact_16_left
417
- jl _less_than_16_left
418
-
419
- movdqu (arg2), %xmm7 # load the plaintext
420
- pshufb %xmm11, %xmm7 # byte-reflect the plaintext
421
- pxor %xmm0 , %xmm7 # xor the initial crc value
422
- add $16, arg2
423
- sub $16, arg3
424
- movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10
425
- jmp _get_last_two_xmms
426
-
427
-
428
-.align 16
429
-_less_than_16_left:
430
- # use stack space to load data less than 16 bytes, zero-out
431
- # the 16B in memory first.
432
-
433
- pxor %xmm1, %xmm1
434
- mov %rsp, %r11
435
- movdqa %xmm1, (%r11)
436
-
437
- cmp $4, arg3
438
- jl _only_less_than_4
439
-
440
- # backup the counter value
441
- mov arg3, %r9
442
- cmp $8, arg3
443
- jl _less_than_8_left
444
-
445
- # load 8 Bytes
446
- mov (arg2), %rax
447
- mov %rax, (%r11)
448
- add $8, %r11
449
- sub $8, arg3
450
- add $8, arg2
451
-_less_than_8_left:
452
-
453
- cmp $4, arg3
454
- jl _less_than_4_left
455
-
456
- # load 4 Bytes
457
- mov (arg2), %eax
458
- mov %eax, (%r11)
459
- add $4, %r11
460
- sub $4, arg3
461
- add $4, arg2
462
-_less_than_4_left:
463
-
464
- cmp $2, arg3
465
- jl _less_than_2_left
466
-
467
- # load 2 Bytes
468
- mov (arg2), %ax
469
- mov %ax, (%r11)
470
- add $2, %r11
471
- sub $2, arg3
472
- add $2, arg2
473
-_less_than_2_left:
474
- cmp $1, arg3
475
- jl _zero_left
476
-
477
- # load 1 Byte
478
- mov (arg2), %al
479
- mov %al, (%r11)
480
-_zero_left:
481
- movdqa (%rsp), %xmm7
482
- pshufb %xmm11, %xmm7
483
- pxor %xmm0 , %xmm7 # xor the initial crc value
484
-
485
- # shl r9, 4
486
- lea pshufb_shf_table+16(%rip), %rax
487
- sub %r9, %rax
488
- movdqu (%rax), %xmm0
489
- pxor mask1(%rip), %xmm0
490
-
491
- pshufb %xmm0, %xmm7
492
- jmp _128_done
493
-
494
-.align 16
495
-_exact_16_left:
496
- movdqu (arg2), %xmm7
497
- pshufb %xmm11, %xmm7
498
- pxor %xmm0 , %xmm7 # xor the initial crc value
499
-
500
- jmp _128_done
501
-
502
-_only_less_than_4:
503
- cmp $3, arg3
504
- jl _only_less_than_3
505
-
506
- # load 3 Bytes
507
- mov (arg2), %al
508
- mov %al, (%r11)
509
-
510
- mov 1(arg2), %al
511
- mov %al, 1(%r11)
512
-
513
- mov 2(arg2), %al
514
- mov %al, 2(%r11)
515
-
516
- movdqa (%rsp), %xmm7
517
- pshufb %xmm11, %xmm7
518
- pxor %xmm0 , %xmm7 # xor the initial crc value
519
-
520
- psrldq $5, %xmm7
521
-
522
- jmp _barrett
523
-_only_less_than_3:
524
- cmp $2, arg3
525
- jl _only_less_than_2
526
-
527
- # load 2 Bytes
528
- mov (arg2), %al
529
- mov %al, (%r11)
530
-
531
- mov 1(arg2), %al
532
- mov %al, 1(%r11)
533
-
534
- movdqa (%rsp), %xmm7
535
- pshufb %xmm11, %xmm7
536
- pxor %xmm0 , %xmm7 # xor the initial crc value
537
-
538
- psrldq $6, %xmm7
539
-
540
- jmp _barrett
541
-_only_less_than_2:
542
-
543
- # load 1 Byte
544
- mov (arg2), %al
545
- mov %al, (%r11)
546
-
547
- movdqa (%rsp), %xmm7
548
- pshufb %xmm11, %xmm7
549
- pxor %xmm0 , %xmm7 # xor the initial crc value
550
-
551
- psrldq $7, %xmm7
552
-
553
- jmp _barrett
554
-
555
-ENDPROC(crc_t10dif_pcl)
276
+ movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
277
+ cmp $16, len
278
+ je .Lreduce_final_16_bytes # len == 16
279
+ sub $32, len
280
+ jge .Lfold_16_bytes_loop # 32 <= len <= 255
281
+ add $16, len
282
+ jmp .Lhandle_partial_segment # 17 <= len <= 31
283
+SYM_FUNC_END(crc_t10dif_pcl)
556284
557285 .section .rodata, "a", @progbits
558286 .align 16
559
-# precomputed constants
560
-# these constants are precomputed from the poly:
561
-# 0x8bb70000 (0x8bb7 scaled to 32 bits)
562
-# Q = 0x18BB70000
563
-# rk1 = 2^(32*3) mod Q << 32
564
-# rk2 = 2^(32*5) mod Q << 32
565
-# rk3 = 2^(32*15) mod Q << 32
566
-# rk4 = 2^(32*17) mod Q << 32
567
-# rk5 = 2^(32*3) mod Q << 32
568
-# rk6 = 2^(32*2) mod Q << 32
569
-# rk7 = floor(2^64/Q)
570
-# rk8 = Q
571
-rk1:
572
-.quad 0x2d56000000000000
573
-rk2:
574
-.quad 0x06df000000000000
575
-rk3:
576
-.quad 0x9d9d000000000000
577
-rk4:
578
-.quad 0x7cf5000000000000
579
-rk5:
580
-.quad 0x2d56000000000000
581
-rk6:
582
-.quad 0x1368000000000000
583
-rk7:
584
-.quad 0x00000001f65a57f8
585
-rk8:
586
-.quad 0x000000018bb70000
587287
588
-rk9:
589
-.quad 0xceae000000000000
590
-rk10:
591
-.quad 0xbfd6000000000000
592
-rk11:
593
-.quad 0x1e16000000000000
594
-rk12:
595
-.quad 0x713c000000000000
596
-rk13:
597
-.quad 0xf7f9000000000000
598
-rk14:
599
-.quad 0x80a6000000000000
600
-rk15:
601
-.quad 0x044c000000000000
602
-rk16:
603
-.quad 0xe658000000000000
604
-rk17:
605
-.quad 0xad18000000000000
606
-rk18:
607
-.quad 0xa497000000000000
608
-rk19:
609
-.quad 0x6ee3000000000000
610
-rk20:
611
-.quad 0xe7b5000000000000
612
-
613
-
288
+# Fold constants precomputed from the polynomial 0x18bb7
289
+# G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
290
+.Lfold_across_128_bytes_consts:
291
+ .quad 0x0000000000006123 # x^(8*128) mod G(x)
292
+ .quad 0x0000000000002295 # x^(8*128+64) mod G(x)
293
+.Lfold_across_64_bytes_consts:
294
+ .quad 0x0000000000001069 # x^(4*128) mod G(x)
295
+ .quad 0x000000000000dd31 # x^(4*128+64) mod G(x)
296
+.Lfold_across_32_bytes_consts:
297
+ .quad 0x000000000000857d # x^(2*128) mod G(x)
298
+ .quad 0x0000000000007acc # x^(2*128+64) mod G(x)
299
+.Lfold_across_16_bytes_consts:
300
+ .quad 0x000000000000a010 # x^(1*128) mod G(x)
301
+ .quad 0x0000000000001faa # x^(1*128+64) mod G(x)
302
+.Lfinal_fold_consts:
303
+ .quad 0x1368000000000000 # x^48 * (x^48 mod G(x))
304
+ .quad 0x2d56000000000000 # x^48 * (x^80 mod G(x))
305
+.Lbarrett_reduction_consts:
306
+ .quad 0x0000000000018bb7 # G(x)
307
+ .quad 0x00000001f65a57f8 # floor(x^48 / G(x))
614308
615309 .section .rodata.cst16.mask1, "aM", @progbits, 16
616310 .align 16
617
-mask1:
618
-.octa 0x80808080808080808080808080808080
311
+.Lmask1:
312
+ .octa 0x80808080808080808080808080808080
619313
620314 .section .rodata.cst16.mask2, "aM", @progbits, 16
621315 .align 16
622
-mask2:
623
-.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
316
+.Lmask2:
317
+ .octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
624318
625
-.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
319
+.section .rodata.cst16.bswap_mask, "aM", @progbits, 16
626320 .align 16
627
-SHUF_MASK:
628
-.octa 0x000102030405060708090A0B0C0D0E0F
321
+.Lbswap_mask:
322
+ .octa 0x000102030405060708090A0B0C0D0E0F
629323
630
-.section .rodata.cst32.pshufb_shf_table, "aM", @progbits, 32
631
-.align 32
632
-pshufb_shf_table:
633
-# use these values for shift constants for the pshufb instruction
634
-# different alignments result in values as shown:
635
-# DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
636
-# DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
637
-# DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
638
-# DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
639
-# DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
640
-# DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
641
-# DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
642
-# DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
643
-# DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
644
-# DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
645
-# DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
646
-# DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
647
-# DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
648
-# DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
649
-# DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
650
-.octa 0x8f8e8d8c8b8a89888786858483828100
651
-.octa 0x000e0d0c0b0a09080706050403020100
324
+.section .rodata.cst32.byteshift_table, "aM", @progbits, 32
325
+.align 16
326
+# For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len]
327
+# is the index vector to shift left by 'len' bytes, and is also {0x80, ...,
328
+# 0x80} XOR the index vector to shift right by '16 - len' bytes.
329
+.Lbyteshift_table:
330
+ .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
331
+ .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
332
+ .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
333
+ .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0
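
For reference (not part of the patch): a minimal C sketch of the math the new code's comments describe. crc_t10dif_ref() is a bit-at-a-time CRC-16/T10-DIF (generator polynomial G(x) = 0x18bb7, no bit reflection, no final XOR), i.e. the value crc_t10dif_pcl() is expected to return, and xn_mod_g() evaluates x^n mod G(x), the quantity the fold-constant comments refer to. Both names are illustrative only, not kernel APIs.

#include <stddef.h>
#include <stdint.h>

#define CRCT10DIF_POLY	0x18bb7u	/* G(x), degree 16 */

/* x^n mod G(x) over GF(2). */
static uint16_t xn_mod_g(unsigned int n)
{
	uint32_t r = 1;			/* start from x^0 */

	while (n--) {
		r <<= 1;		/* multiply by x */
		if (r & 0x10000)	/* degree reached 16: reduce by G(x) */
			r ^= CRCT10DIF_POLY;
	}
	return (uint16_t)r;
}

/* Bit-at-a-time CRC-16/T10-DIF, MSB first; no length restriction. */
static uint16_t crc_t10dif_ref(uint16_t crc, const uint8_t *buf, size_t len)
{
	size_t i;
	int bit;

	for (i = 0; i < len; i++) {
		crc ^= (uint16_t)buf[i] << 8;	/* feed in the next byte */
		for (bit = 0; bit < 8; bit++)
			crc = (crc & 0x8000) ?
				(uint16_t)((crc << 1) ^ CRCT10DIF_POLY) :
				(uint16_t)(crc << 1);
	}
	return crc;
}

For example, xn_mod_g(1*128) and xn_mod_g(1*128 + 64) should reproduce the two quadwords that .Lfold_across_16_bytes_consts documents as x^(1*128) mod G(x) and x^(1*128+64) mod G(x); per their own comments, the .Lfinal_fold_consts entries are such remainders shifted left by 48 bits.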