From 1f93a7dfd1f8d5ff7a5c53246c7534fe2332d6f4 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Mon, 11 Dec 2023 02:46:07 +0000
Subject: [PATCH] add audio

---
 kernel/arch/arm64/crypto/crct10dif-ce-core.S |  642 +++++++++++++++++++++++++++++++++------------------------
 1 files changed, 370 insertions(+), 272 deletions(-)

diff --git a/kernel/arch/arm64/crypto/crct10dif-ce-core.S b/kernel/arch/arm64/crypto/crct10dif-ce-core.S
index 663ea71..dce6dce 100644
--- a/kernel/arch/arm64/crypto/crct10dif-ce-core.S
+++ b/kernel/arch/arm64/crypto/crct10dif-ce-core.S
@@ -2,12 +2,14 @@
 // Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
 //
 // Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+// Copyright (C) 2019 Google LLC <ebiggers@google.com>
 //
 // This program is free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License version 2 as
 // published by the Free Software Foundation.
 //
 
+// Derived from the x86 version:
 //
 // Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
 //
@@ -54,115 +56,176 @@
 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
-//       Function API:
-//       UINT16 crc_t10dif_pcl(
-//               UINT16 init_crc, //initial CRC value, 16 bits
-//               const unsigned char *buf, //buffer pointer to calculate CRC on
-//               UINT64 len //buffer length in bytes (64-bit data)
-//       );
-//
 //       Reference paper titled "Fast CRC Computation for Generic
 //	Polynomials Using PCLMULQDQ Instruction"
 //       URL: http://www.intel.com/content/dam/www/public/us/en/documents
 //  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
-//
 //
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 
 	.text
-	.cpu		generic+crypto
+	.arch		armv8-a+crypto
 
-	arg1_low32	.req	w19
-	arg2		.req	x20
-	arg3		.req	x21
+	init_crc	.req	w0	// arg 0: initial CRC value (u16)
+	buf		.req	x1	// arg 1: data pointer, advanced as bytes are consumed
+	len		.req	x2	// arg 2: byte count, reused as the biased loop counter
+	fold_consts_ptr	.req	x3	// scratch: cursor into the fold constants in .rodata
 
-	vzr		.req	v13
+	fold_consts	.req	v10
 
-ENTRY(crc_t10dif_pmull)
-	frame_push	3, 128
+	ad		.req	v14
 
-	mov		arg1_low32, w0
-	mov		arg2, x1
-	mov		arg3, x2
+	k00_16		.req	v15
+	k32_48		.req	v16
 
-	movi		vzr.16b, #0		// init zero register
+	t3		.req	v17
+	t4		.req	v18
+	t5		.req	v19
+	t6		.req	v20
+	t7		.req	v21
+	t8		.req	v22
+	t9		.req	v23
 
-	// adjust the 16-bit initial_crc value, scale it to 32 bits
-	lsl		arg1_low32, arg1_low32, #16
+	perm1		.req	v24
+	perm2		.req	v25
+	perm3		.req	v26
+	perm4		.req	v27
 
-	// check if smaller than 256
-	cmp		arg3, #256
+	bd1		.req	v28
+	bd2		.req	v29
+	bd3		.req	v30
+	bd4		.req	v31
 
-	// for sizes less than 128, we can't fold 64B at a time...
-	b.lt		_less_than_128
+	.macro		__pmull_init_p64	// empty: the p64 variant needs no one-time setup
+	.endm
 
-	// load the initial crc value
-	// crc value does not need to be byte-reflected, but it needs
-	// to be moved to the high part of the register.
-	// because data will be byte-reflected and will align with
-	// initial crc at correct place.
-	movi		v10.16b, #0
-	mov		v10.s[3], arg1_low32		// initial crc
+	.macro		__pmull_pre_p64, bd	// empty: p64 uses the constants as loaded
+	.endm
 
-	// receive the initial 64B data, xor the initial crc value
-	ldp		q0, q1, [arg2]
-	ldp		q2, q3, [arg2, #0x20]
-	ldp		q4, q5, [arg2, #0x40]
-	ldp		q6, q7, [arg2, #0x60]
-	add		arg2, arg2, #0x80
+	.macro		__pmull_init_p8
+	// k00_16 := 0x0000000000000000_000000000000ffff
+	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
+	movi		k32_48.2d, #0xffffffff		// both lanes = 0x00000000ffffffff
+	mov		k32_48.h[2], k32_48.h[0]	// low lane = 0x0000ffffffffffff
+	ushr		k00_16.2d, k32_48.2d, #32	// low lane = 0xffff, high lane = 0
 
-CPU_LE(	rev64		v0.16b, v0.16b			)
-CPU_LE(	rev64		v1.16b, v1.16b			)
-CPU_LE(	rev64		v2.16b, v2.16b			)
-CPU_LE(	rev64		v3.16b, v3.16b			)
-CPU_LE(	rev64		v4.16b, v4.16b			)
-CPU_LE(	rev64		v5.16b, v5.16b			)
-CPU_LE(	rev64		v6.16b, v6.16b			)
-CPU_LE(	rev64		v7.16b, v7.16b			)
+	// Build the tbl index vectors: permN rotates each 64-bit half by N bytes.
+	mov_q		x5, 0x080f0e0d0c0b0a09		// byte indices 9..15,8
+	movi		perm4.8b, #8			// 0x08 in the low eight bytes only
+	dup		perm1.2d, x5
+	eor		perm1.16b, perm1.16b, perm4.16b	// low half becomes 1..7,0
+	ushr		perm2.2d, perm1.2d, #8
+	ushr		perm3.2d, perm1.2d, #16
+	ushr		perm4.2d, perm1.2d, #24
+	sli		perm2.2d, perm1.2d, #56		// ushr + sli = per-lane rotate, 1 more byte
+	sli		perm3.2d, perm1.2d, #48		// ... 2 more bytes
+	sli		perm4.2d, perm1.2d, #40		// ... 3 more bytes
+	.endm
 
-CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
-CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
-CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
-CPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
-CPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
-CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
-CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
-CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
+	.macro		__pmull_pre_p8, bd
+	tbl		bd1.16b, {\bd\().16b}, perm1.16b	// B1: each half rotated by 1 byte
+	tbl		bd2.16b, {\bd\().16b}, perm2.16b	// B2: each half rotated by 2 bytes
+	tbl		bd3.16b, {\bd\().16b}, perm3.16b	// B3: each half rotated by 3 bytes
+	tbl		bd4.16b, {\bd\().16b}, perm4.16b	// B4: each half rotated by 4 bytes
+	.endm
 
-	// XOR the initial_crc value
-	eor		v0.16b, v0.16b, v10.16b
+SYM_FUNC_START_LOCAL(__pmull_p8_core)
+.L__pmull_p8_core:	// entry for the low 64 bits of ad (pmull on .8b)
+	ext		t4.8b, ad.8b, ad.8b, #1			// A1
+	ext		t5.8b, ad.8b, ad.8b, #2			// A2
+	ext		t6.8b, ad.8b, ad.8b, #3			// A3
 
-	ldr_l		q10, rk3, x8	// xmm10 has rk3 and rk4
-					// type of pmull instruction
-					// will determine which constant to use
+	pmull		t4.8h, t4.8b, fold_consts.8b		// F = A1*B
+	pmull		t8.8h, ad.8b, bd1.8b			// E = A*B1
+	pmull		t5.8h, t5.8b, fold_consts.8b		// H = A2*B
+	pmull		t7.8h, ad.8b, bd2.8b			// G = A*B2
+	pmull		t6.8h, t6.8b, fold_consts.8b		// J = A3*B
+	pmull		t9.8h, ad.8b, bd3.8b			// I = A*B3
+	pmull		t3.8h, ad.8b, bd4.8b			// K = A*B4
+	b		0f	// skip the high-half variant, join the shared tail
 
-	//
-	// we subtract 256 instead of 128 to save one instruction from the loop
-	//
-	sub		arg3, arg3, #256
+.L__pmull_p8_core2:	// entry for the high 64 bits of ad (pmull2 on .16b)
+	tbl		t4.16b, {ad.16b}, perm1.16b		// A1
+	tbl		t5.16b, {ad.16b}, perm2.16b		// A2
+	tbl		t6.16b, {ad.16b}, perm3.16b		// A3
 
-	// at this section of the code, there is 64*x+y (0<=y<64) bytes of
-	// buffer. The _fold_64_B_loop will fold 64B at a time
-	// until we have 64+y Bytes of buffer
+	pmull2		t4.8h, t4.16b, fold_consts.16b		// F = A1*B
+	pmull2		t8.8h, ad.16b, bd1.16b			// E = A*B1
+	pmull2		t5.8h, t5.16b, fold_consts.16b		// H = A2*B
+	pmull2		t7.8h, ad.16b, bd2.16b			// G = A*B2
+	pmull2		t6.8h, t6.16b, fold_consts.16b		// J = A3*B
+	pmull2		t9.8h, ad.16b, bd3.16b			// I = A*B3
+	pmull2		t3.8h, ad.16b, bd4.16b			// K = A*B4
 
+0:	eor		t4.16b, t4.16b, t8.16b			// L = E + F
+	eor		t5.16b, t5.16b, t7.16b			// M = G + H
+	eor		t6.16b, t6.16b, t9.16b			// N = I + J
 
-	// fold 64B at a time. This section of the code folds 4 vector
-	// registers in parallel
-_fold_64_B_loop:
+	uzp1		t8.2d, t4.2d, t5.2d	// t8 = low halves of L and M
+	uzp2		t4.2d, t4.2d, t5.2d	// t4 = high halves of L and M
+	uzp1		t7.2d, t6.2d, t3.2d	// t7 = low halves of N and K
+	uzp2		t6.2d, t6.2d, t3.2d	// t6 = high halves of N and K
 
-	.macro		fold64, reg1, reg2
-	ldp		q11, q12, [arg2], #0x20
+	// t4 = (L) (P0 + P1) << 8
+	// t5 = (M) (P2 + P3) << 16
+	eor		t8.16b, t8.16b, t4.16b
+	and		t4.16b, t4.16b, k32_48.16b
 
-	pmull2		v8.1q, \reg1\().2d, v10.2d
-	pmull		\reg1\().1q, \reg1\().1d, v10.1d
+	// t6 = (N) (P4 + P5) << 24
+	// t7 = (K) (P6 + P7) << 32
+	eor		t7.16b, t7.16b, t6.16b
+	and		t6.16b, t6.16b, k00_16.16b
+
+	eor		t8.16b, t8.16b, t4.16b
+	eor		t7.16b, t7.16b, t6.16b
+
+	zip2		t5.2d, t8.2d, t4.2d	// t5 = M, halves re-paired
+	zip1		t4.2d, t8.2d, t4.2d	// t4 = L, halves re-paired
+	zip2		t3.2d, t7.2d, t6.2d	// t3 = K, halves re-paired
+	zip1		t6.2d, t7.2d, t6.2d	// t6 = N, halves re-paired
+
+	ext		t4.16b, t4.16b, t4.16b, #15	// L << 8  (128-bit byte rotate)
+	ext		t5.16b, t5.16b, t5.16b, #14	// M << 16
+	ext		t6.16b, t6.16b, t6.16b, #13	// N << 24
+	ext		t3.16b, t3.16b, t3.16b, #12	// K << 32
+
+	eor		t4.16b, t4.16b, t5.16b
+	eor		t6.16b, t6.16b, t3.16b
+	ret	// results left in t4 and t6 for the caller to XOR into D
+SYM_FUNC_END(__pmull_p8_core)
+
+	.macro		__pmull_p8, rq, ad, bd, i
+	.ifnc		\bd, fold_consts
+	.err		// the core reads fold_consts and bd1..bd4 directly
+	.endif
+	mov		ad.16b, \ad\().16b	// copy the operand into the fixed register the core reads
+	.ifb		\i
+	pmull		\rq\().8h, \ad\().8b, \bd\().8b		// D = A*B
+	.else
+	pmull2		\rq\().8h, \ad\().16b, \bd\().16b	// D = A*B
+	.endif
+
+	bl		.L__pmull_p8_core\i	// overwrites x30; blank i = low-half entry, 2 = high-half entry
+
+	eor		\rq\().16b, \rq\().16b, t4.16b	// add the partial products returned in t4
+	eor		\rq\().16b, \rq\().16b, t6.16b	// ... and in t6
+	.endm
+
+	// Fold reg1, reg2 into the next 32 data bytes, storing the result back
+	// into reg1, reg2.
+	.macro		fold_32_bytes, p, reg1, reg2
+	ldp		q11, q12, [buf], #0x20
+
+	__pmull_\p	v8, \reg1, fold_consts, 2
+	__pmull_\p	\reg1, \reg1, fold_consts
 
 CPU_LE(	rev64		v11.16b, v11.16b		)
 CPU_LE(	rev64		v12.16b, v12.16b		)
 
-	pmull2		v9.1q, \reg2\().2d, v10.2d
-	pmull		\reg2\().1q, \reg2\().1d, v10.1d
+	__pmull_\p	v9, \reg2, fold_consts, 2
+	__pmull_\p	\reg2, \reg2, fold_consts
 
 CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
 CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
@@ -173,244 +236,279 @@
 	eor		\reg2\().16b, \reg2\().16b, v12.16b
 	.endm
 
-	fold64		v0, v1
-	fold64		v2, v3
-	fold64		v4, v5
-	fold64		v6, v7
-
-	subs		arg3, arg3, #128
-
-	// check if there is another 64B in the buffer to be able to fold
-	b.lt		_fold_64_B_end
-
-	if_will_cond_yield_neon
-	stp		q0, q1, [sp, #.Lframe_local_offset]
-	stp		q2, q3, [sp, #.Lframe_local_offset + 32]
-	stp		q4, q5, [sp, #.Lframe_local_offset + 64]
-	stp		q6, q7, [sp, #.Lframe_local_offset + 96]
-	do_cond_yield_neon
-	ldp		q0, q1, [sp, #.Lframe_local_offset]
-	ldp		q2, q3, [sp, #.Lframe_local_offset + 32]
-	ldp		q4, q5, [sp, #.Lframe_local_offset + 64]
-	ldp		q6, q7, [sp, #.Lframe_local_offset + 96]
-	ldr_l		q10, rk3, x8
-	movi		vzr.16b, #0		// init zero register
-	endif_yield_neon
-
-	b		_fold_64_B_loop
-
-_fold_64_B_end:
-	// at this point, the buffer pointer is pointing at the last y Bytes
-	// of the buffer the 64B of folded data is in 4 of the vector
-	// registers: v0, v1, v2, v3
-
-	// fold the 8 vector registers to 1 vector register with different
-	// constants
-
-	ldr_l		q10, rk9, x8
-
-	.macro		fold16, reg, rk
-	pmull		v8.1q, \reg\().1d, v10.1d
-	pmull2		\reg\().1q, \reg\().2d, v10.2d
-	.ifnb		\rk
-	ldr_l		q10, \rk, x8
+	// dst_reg ^= fold(src_reg); clobbers v8 and src_reg.  A non-blank 4th argument loads the next fold constants.
+	.macro		fold_16_bytes, p, src_reg, dst_reg, load_next_consts
+	__pmull_\p	v8, \src_reg, fold_consts	// v8 = low halves multiplied
+	__pmull_\p	\src_reg, \src_reg, fold_consts, 2	// src = high halves multiplied
+	.ifnb		\load_next_consts
+	ld1		{fold_consts.2d}, [fold_consts_ptr], #16	// advance to the next constant pair
+	__pmull_pre_\p	fold_consts
 	.endif
-	eor		v7.16b, v7.16b, v8.16b
-	eor		v7.16b, v7.16b, \reg\().16b
+	eor		\dst_reg\().16b, \dst_reg\().16b, v8.16b	// accumulate the low product
+	eor		\dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b	// accumulate the high product
 	.endm
 
-	fold16		v0, rk11
-	fold16		v1, rk13
-	fold16		v2, rk15
-	fold16		v3, rk17
-	fold16		v4, rk19
-	fold16		v5, rk1
-	fold16		v6
+	.macro		__pmull_p64, rd, rn, rm, n
+	.ifb		\n
+	pmull		\rd\().1q, \rn\().1d, \rm\().1d	// n blank: multiply the low 64-bit halves
+	.else
+	pmull2		\rd\().1q, \rn\().2d, \rm\().2d	// n given: multiply the high 64-bit halves
+	.endif
+	.endm
 
-	// instead of 64, we add 48 to the loop counter to save 1 instruction
-	// from the loop instead of a cmp instruction, we use the negative
-	// flag with the jl instruction
-	adds		arg3, arg3, #(128-16)
-	b.lt		_final_reduction_for_128
+	.macro		crc_t10dif_pmull, p
+	__pmull_init_\p		// variant-specific one-time setup (empty for p64)
 
-	// now we have 16+y bytes left to reduce. 16 Bytes is in register v7
-	// and the rest is in memory. We can fold 16 bytes at a time if y>=16
-	// continue folding 16B at a time
+	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
+	cmp		len, #256
+	b.lt		.Lless_than_256_bytes_\@	// signed compare: assumes len < 2^63
 
-_16B_reduction_loop:
-	pmull		v8.1q, v7.1d, v10.1d
-	pmull2		v7.1q, v7.2d, v10.2d
+	adr_l		fold_consts_ptr, .Lfold_across_128_bytes_consts
+
+	// Load the first 128 data bytes.  Byte swapping is necessary to make
+	// the bit order match the polynomial coefficient order.
+	ldp		q0, q1, [buf]
+	ldp		q2, q3, [buf, #0x20]
+	ldp		q4, q5, [buf, #0x40]
+	ldp		q6, q7, [buf, #0x60]
+	add		buf, buf, #0x80		// buf now points past the 128 bytes just loaded
+CPU_LE(	rev64		v0.16b, v0.16b			)
+CPU_LE(	rev64		v1.16b, v1.16b			)
+CPU_LE(	rev64		v2.16b, v2.16b			)
+CPU_LE(	rev64		v3.16b, v3.16b			)
+CPU_LE(	rev64		v4.16b, v4.16b			)
+CPU_LE(	rev64		v5.16b, v5.16b			)
+CPU_LE(	rev64		v6.16b, v6.16b			)
+CPU_LE(	rev64		v7.16b, v7.16b			)
+CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
+CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
+CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
+CPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
+CPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
+CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
+CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
+CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
+
+	// XOR the first 16 data *bits* with the initial CRC value.
+	movi		v8.16b, #0
+	mov		v8.h[7], init_crc	// place the CRC in the top halfword of the vector
+	eor		v0.16b, v0.16b, v8.16b
+
+	// Load the constants for folding across 128 bytes.
+	ld1		{fold_consts.2d}, [fold_consts_ptr]	// no post-increment: pointer stays on this pair
+	__pmull_pre_\p	fold_consts
+
+	// Subtract 128 for the 128 data bytes just consumed.  Subtract another
+	// 128 to simplify the termination condition of the following loop.
+	sub		len, len, #256
+
+	// While >= 128 data bytes remain (not counting v0-v7), fold the 128
+	// bytes v0-v7 into them, storing the result back into v0-v7.
+.Lfold_128_bytes_loop_\@:
+	fold_32_bytes	\p, v0, v1
+	fold_32_bytes	\p, v2, v3
+	fold_32_bytes	\p, v4, v5
+	fold_32_bytes	\p, v6, v7
+
+	subs		len, len, #128
+	b.ge		.Lfold_128_bytes_loop_\@	// non-negative: another full 128 bytes remain
+
+	// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
+
+	// Fold across 64 bytes.
+	add		fold_consts_ptr, fold_consts_ptr, #16	// step over the 128-byte pair loaded earlier
+	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
+	__pmull_pre_\p	fold_consts
+	fold_16_bytes	\p, v0, v4
+	fold_16_bytes	\p, v1, v5
+	fold_16_bytes	\p, v2, v6
+	fold_16_bytes	\p, v3, v7, 1	// also loads the fold-across-32-bytes constants
+	// Fold across 32 bytes.
+	fold_16_bytes	\p, v4, v6
+	fold_16_bytes	\p, v5, v7, 1	// also loads the fold-across-16-bytes constants
+	// Fold across 16 bytes.
+	fold_16_bytes	\p, v6, v7
+
+	// Add 128 to get the correct number of data bytes remaining in 0...127
+	// (not counting v7), following the previous extra subtraction by 128.
+	// Then subtract 16 to simplify the termination condition of the
+	// following loop.
+	adds		len, len, #(128-16)
+
+	// While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
+	// into them, storing the result back into v7.
+	b.lt		.Lfold_16_bytes_loop_done_\@
+.Lfold_16_bytes_loop_\@:
+	__pmull_\p	v8, v7, fold_consts
+	__pmull_\p	v7, v7, fold_consts, 2
 	eor		v7.16b, v7.16b, v8.16b
-
-	ldr		q0, [arg2], #16
+	ldr		q0, [buf], #16	// next 16 data bytes
 CPU_LE(	rev64		v0.16b, v0.16b			)
 CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
 	eor		v7.16b, v7.16b, v0.16b
-	subs		arg3, arg3, #16
+	subs		len, len, #16
+	b.ge		.Lfold_16_bytes_loop_\@
 
-	// instead of a cmp instruction, we utilize the flags with the
-	// jge instruction equivalent of: cmp arg3, 16-16
-	// check if there is any more 16B in the buffer to be able to fold
-	b.ge		_16B_reduction_loop
+.Lfold_16_bytes_loop_done_\@:
+	// Add 16 to get the correct number of data bytes remaining in 0...15
+	// (not counting v7), following the previous extra subtraction by 16.
+	adds		len, len, #16
+	b.eq		.Lreduce_final_16_bytes_\@
 
-	// now we have 16+z bytes left to reduce, where 0<= z < 16.
-	// first, we reduce the data in the xmm7 register
+.Lhandle_partial_segment_\@:
+	// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
+	// 16 bytes are in v7 and the rest are the remaining data in 'buf'.  To
+	// do this without needing a fold constant for each possible 'len',
+	// redivide the bytes into a first chunk of 'len' bytes and a second
+	// chunk of 16 bytes, then fold the first chunk into the second.
 
-_final_reduction_for_128:
-	// check if any more data to fold. If not, compute the CRC of
-	// the final 128 bits
-	adds		arg3, arg3, #16
-	b.eq		_128_done
+	// v0 = last 16 original data bytes
+	add		buf, buf, len
+	ldr		q0, [buf, #-16]	// overlaps the 16-len bytes already consumed
+CPU_LE(	rev64		v0.16b, v0.16b			)
+CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
 
-	// here we are getting data that is less than 16 bytes.
-	// since we know that there was data before the pointer, we can
-	// offset the input pointer before the actual point, to receive
-	// exactly 16 bytes. after that the registers need to be adjusted.
-_get_last_two_regs:
-	add		arg2, arg2, arg3
-	ldr		q1, [arg2, #-16]
-CPU_LE(	rev64		v1.16b, v1.16b			)
-CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
+	// v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
+	adr_l		x4, .Lbyteshift_table + 16
+	sub		x4, x4, len
+	ld1		{v2.16b}, [x4]
+	tbl		v1.16b, {v7.16b}, v2.16b
 
-	// get rid of the extra data that was loaded before
-	// load the shift constant
-	adr_l		x4, tbl_shf_table + 16
-	sub		x4, x4, arg3
-	ld1		{v0.16b}, [x4]
+	// v3 = first chunk: v7 right-shifted by '16-len' bytes.
+	movi		v3.16b, #0x80
+	eor		v2.16b, v2.16b, v3.16b
+	tbl		v3.16b, {v7.16b}, v2.16b
 
-	// shift v2 to the left by arg3 bytes
-	tbl		v2.16b, {v7.16b}, v0.16b
+	// Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
+	sshr		v2.16b, v2.16b, #7
 
-	// shift v7 to the right by 16-arg3 bytes
-	movi		v9.16b, #0x80
-	eor		v0.16b, v0.16b, v9.16b
-	tbl		v7.16b, {v7.16b}, v0.16b
+	// v2 = second chunk: 'len' bytes from v0 (low-order bytes),
+	// then '16-len' bytes from v1 (high-order bytes).
+	bsl		v2.16b, v1.16b, v0.16b
 
-	// blend
-	sshr		v0.16b, v0.16b, #7	// convert to 8-bit mask
-	bsl		v0.16b, v2.16b, v1.16b
-
-	// fold 16 Bytes
-	pmull		v8.1q, v7.1d, v10.1d
-	pmull2		v7.1q, v7.2d, v10.2d
-	eor		v7.16b, v7.16b, v8.16b
+	// Fold the first chunk into the second chunk, storing the result in v7.
+	__pmull_\p	v0, v3, fold_consts
+	__pmull_\p	v7, v3, fold_consts, 2
 	eor		v7.16b, v7.16b, v0.16b
+	eor		v7.16b, v7.16b, v2.16b
 
-_128_done:
-	// compute crc of a 128-bit value
-	ldr_l		q10, rk5, x8		// rk5 and rk6 in xmm10
+.Lreduce_final_16_bytes_\@:
+	// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
 
-	// 64b fold
-	ext		v0.16b, vzr.16b, v7.16b, #8
-	mov		v7.d[0], v7.d[1]
-	pmull		v7.1q, v7.1d, v10.1d
-	eor		v7.16b, v7.16b, v0.16b
+	movi		v2.16b, #0		// init zero register
 
-	// 32b fold
-	ext		v0.16b, v7.16b, vzr.16b, #4
-	mov		v7.s[3], vzr.s[0]
-	pmull2		v0.1q, v0.2d, v10.2d
-	eor		v7.16b, v7.16b, v0.16b
+	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
+	__pmull_pre_\p	fold_consts
 
-	// barrett reduction
-_barrett:
-	ldr_l		q10, rk7, x8
-	mov		v0.d[0], v7.d[1]
+	// Fold the high 64 bits into the low 64 bits, while also multiplying by
+	// x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
+	// whose low 48 bits are 0.
+	ext		v0.16b, v2.16b, v7.16b, #8
+	__pmull_\p	v7, v7, fold_consts, 2	// high bits * x^48 * (x^80 mod G(x))
+	eor		v0.16b, v0.16b, v7.16b	// + low bits * x^64
 
-	pmull		v0.1q, v0.1d, v10.1d
-	ext		v0.16b, vzr.16b, v0.16b, #12
-	pmull2		v0.1q, v0.2d, v10.2d
-	ext		v0.16b, vzr.16b, v0.16b, #12
-	eor		v7.16b, v7.16b, v0.16b
-	mov		w0, v7.s[1]
+	// Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
+	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
+	ext		v1.16b, v0.16b, v2.16b, #12	// extract high 32 bits
+	mov		v0.s[3], v2.s[0]	// zero high 32 bits
+	__pmull_\p	v1, v1, fold_consts	// high 32 bits * x^48 * (x^48 mod G(x))
+	eor		v0.16b, v0.16b, v1.16b	// + low bits
 
-_cleanup:
-	// scale the result back to 16 bits
-	lsr		x0, x0, #16
-	frame_pop
+	// Load G(x) and floor(x^48 / G(x)).
+	ld1		{fold_consts.2d}, [fold_consts_ptr]
+	__pmull_pre_\p	fold_consts
+
+	// Use Barrett reduction to compute the final CRC value.
+	__pmull_\p	v1, v0, fold_consts, 2	// high 32 bits * floor(x^48 / G(x))
+	ushr		v1.2d, v1.2d, #32	// /= x^32
+	__pmull_\p	v1, v1, fold_consts	// *= G(x)
+	ushr		v0.2d, v0.2d, #48
+	eor		v0.16b, v0.16b, v1.16b	// + low 16 nonzero bits
+	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
+
+	umov		w0, v0.h[0]	// return value: the 16-bit CRC
+	.ifc		\p, p8
+	ldp		x29, x30, [sp], #16	// p8 only: pop the frame pushed in crc_t10dif_pmull_p8
+	.endif
 	ret
 
-_less_than_128:
-	cbz		arg3, _cleanup
+.Lless_than_256_bytes_\@:
+	// Checksumming a buffer of length 16...255 bytes
 
-	movi		v0.16b, #0
-	mov		v0.s[3], arg1_low32	// get the initial crc value
+	adr_l		fold_consts_ptr, .Lfold_across_16_bytes_consts
 
-	ldr		q7, [arg2], #0x10
+	// Load the first 16 data bytes.
+	ldr		q7, [buf], #0x10
 CPU_LE(	rev64		v7.16b, v7.16b			)
 CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
-	eor		v7.16b, v7.16b, v0.16b	// xor the initial crc value
 
-	cmp		arg3, #16
-	b.eq		_128_done		// exactly 16 left
-	b.lt		_less_than_16_left
+	// XOR the first 16 data *bits* with the initial CRC value.
+	movi		v0.16b, #0
+	mov		v0.h[7], init_crc
+	eor		v7.16b, v7.16b, v0.16b
 
-	ldr_l		q10, rk1, x8		// rk1 and rk2 in xmm10
+	// Load the fold-across-16-bytes constants.
+	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
+	__pmull_pre_\p	fold_consts
 
-	// update the counter. subtract 32 instead of 16 to save one
-	// instruction from the loop
-	subs		arg3, arg3, #32
-	b.ge		_16B_reduction_loop
+	cmp		len, #16
+	b.eq		.Lreduce_final_16_bytes_\@	// len == 16
+	subs		len, len, #32
+	b.ge		.Lfold_16_bytes_loop_\@		// 32 <= len <= 255
+	add		len, len, #16	// len = bytes beyond the first 16 (1..15)
+	b		.Lhandle_partial_segment_\@	// 17 <= len <= 31
+	.endm
 
-	add		arg3, arg3, #16
-	b		_get_last_two_regs
+// 8-bit PMULL variant: 64x64 multiplies are built up by __pmull_p8_core.
+// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+// Pushes a frame because the p8 path uses bl; the macro exit path pops it.
+SYM_FUNC_START(crc_t10dif_pmull_p8)
+	stp		x29, x30, [sp, #-16]!	// save FP/LR: bl in __pmull_p8 overwrites x30
+	mov		x29, sp
+	crc_t10dif_pmull p8	// expands the whole body, including the ldp and ret
+SYM_FUNC_END(crc_t10dif_pmull_p8)
 
-_less_than_16_left:
-	// shl r9, 4
-	adr_l		x0, tbl_shf_table + 16
-	sub		x0, x0, arg3
-	ld1		{v0.16b}, [x0]
-	movi		v9.16b, #0x80
-	eor		v0.16b, v0.16b, v9.16b
-	tbl		v7.16b, {v7.16b}, v0.16b
-	b		_128_done
-ENDPROC(crc_t10dif_pmull)
+	.align		5	// 2^5 = 32-byte alignment for the entry point
+// 64-bit PMULL variant: uses pmull/pmull2 on .1d/.2d operands directly.
+// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+// No frame is pushed: this expansion contains no bl.
+SYM_FUNC_START(crc_t10dif_pmull_p64)
+	crc_t10dif_pmull	p64	// expands the whole body, including the final ret
+SYM_FUNC_END(crc_t10dif_pmull_p64)
 
-// precomputed constants
-// these constants are precomputed from the poly:
-// 0x8bb70000 (0x8bb7 scaled to 32 bits)
 	.section	".rodata", "a"
 	.align		4
-// Q = 0x18BB70000
-// rk1 = 2^(32*3) mod Q << 32
-// rk2 = 2^(32*5) mod Q << 32
-// rk3 = 2^(32*15) mod Q << 32
-// rk4 = 2^(32*17) mod Q << 32
-// rk5 = 2^(32*3) mod Q << 32
-// rk6 = 2^(32*2) mod Q << 32
-// rk7 = floor(2^64/Q)
-// rk8 = Q
 
-rk1:	.octa		0x06df0000000000002d56000000000000
-rk3:	.octa		0x7cf50000000000009d9d000000000000
-rk5:	.octa		0x13680000000000002d56000000000000
-rk7:	.octa		0x000000018bb7000000000001f65a57f8
-rk9:	.octa		0xbfd6000000000000ceae000000000000
-rk11:	.octa		0x713c0000000000001e16000000000000
-rk13:	.octa		0x80a6000000000000f7f9000000000000
-rk15:	.octa		0xe658000000000000044c000000000000
-rk17:	.octa		0xa497000000000000ad18000000000000
-rk19:	.octa		0xe7b50000000000006ee3000000000000
+// Fold constants precomputed from the polynomial 0x18bb7
+// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+.Lfold_across_128_bytes_consts:
+	.quad		0x0000000000006123	// x^(8*128)	mod G(x)
+	.quad		0x0000000000002295	// x^(8*128+64)	mod G(x)
+// .Lfold_across_64_bytes_consts:
+	.quad		0x0000000000001069	// x^(4*128)	mod G(x)
+	.quad		0x000000000000dd31	// x^(4*128+64)	mod G(x)
+// .Lfold_across_32_bytes_consts:
+	.quad		0x000000000000857d	// x^(2*128)	mod G(x)
+	.quad		0x0000000000007acc	// x^(2*128+64)	mod G(x)
+.Lfold_across_16_bytes_consts:
+	.quad		0x000000000000a010	// x^(1*128)	mod G(x)
+	.quad		0x0000000000001faa	// x^(1*128+64)	mod G(x)
+// .Lfinal_fold_consts:
+	.quad		0x1368000000000000	// x^48 * (x^48 mod G(x))
+	.quad		0x2d56000000000000	// x^48 * (x^80 mod G(x))
+// .Lbarrett_reduction_consts:
+	.quad		0x0000000000018bb7	// G(x)
+	.quad		0x00000001f65a57f8	// floor(x^48 / G(x))
 
-tbl_shf_table:
-// use these values for shift constants for the tbl/tbx instruction
-// different alignments result in values as shown:
-//	DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
-//	DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
-//	DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
-//	DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
-//	DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
-//	DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
-//	DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
-//	DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
-//	DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
-//	DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
-//	DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
-//	DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
-//	DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
-//	DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
-//	DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15
-
+// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
+// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
+// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
+.Lbyteshift_table:
 	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
 	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
 	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7

--
Gitblit v1.6.2