2023-12-08 01573e231f18eb2d99162747186f59511f56b64d
kernel/arch/arm64/crypto/ghash-ce-core.S
@@ -1,11 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Accelerated GHASH implementation with ARMv8 PMULL instructions.
  *
  * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
  */

 #include <linux/linkage.h>
@@ -16,8 +13,8 @@
 	T1 .req v2
 	T2 .req v3
 	MASK .req v4
-	XL .req v5
-	XM .req v6
+	XM .req v5
+	XL .req v6
 	XH .req v7
 	IN1 .req v7

@@ -353,28 +350,45 @@
 	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
 	 *			   struct ghash_key const *k, const char *head)
 	 */
-ENTRY(pmull_ghash_update_p64)
+SYM_FUNC_START(pmull_ghash_update_p64)
 	__pmull_ghash p64
-ENDPROC(pmull_ghash_update_p64)
+SYM_FUNC_END(pmull_ghash_update_p64)

-ENTRY(pmull_ghash_update_p8)
+SYM_FUNC_START(pmull_ghash_update_p8)
 	__pmull_ghash p8
-ENDPROC(pmull_ghash_update_p8)
+SYM_FUNC_END(pmull_ghash_update_p8)

-	KS0 .req v12
-	KS1 .req v13
-	INP0 .req v14
-	INP1 .req v15
+	KS0 .req v8
+	KS1 .req v9
+	KS2 .req v10
+	KS3 .req v11

-	.macro load_round_keys, rounds, rk
-	cmp \rounds, #12
-	blo 2222f /* 128 bits */
-	beq 1111f /* 192 bits */
-	ld1 {v17.4s-v18.4s}, [\rk], #32
-1111:	ld1 {v19.4s-v20.4s}, [\rk], #32
-2222:	ld1 {v21.4s-v24.4s}, [\rk], #64
-	ld1 {v25.4s-v28.4s}, [\rk], #64
-	ld1 {v29.4s-v31.4s}, [\rk]
+	INP0 .req v21
+	INP1 .req v22
+	INP2 .req v23
+	INP3 .req v24
+
+	K0 .req v25
+	K1 .req v26
+	K2 .req v27
+	K3 .req v28
+	K4 .req v12
+	K5 .req v13
+	K6 .req v4
+	K7 .req v5
+	K8 .req v14
+	K9 .req v15
+	KK .req v29
+	KL .req v30
+	KM .req v31
+
+	.macro load_round_keys, rounds, rk, tmp
+	add \tmp, \rk, #64
+	ld1 {K0.4s-K3.4s}, [\rk]
+	ld1 {K4.4s-K5.4s}, [\tmp]
+	add \tmp, \rk, \rounds, lsl #4
+	sub \tmp, \tmp, #32
+	ld1 {KK.4s-KM.4s}, [\tmp]
 	.endm

 	.macro enc_round, state, key
@@ -382,197 +396,382 @@
 	aesmc \state\().16b, \state\().16b
 	.endm

-	.macro enc_block, state, rounds
-	cmp \rounds, #12
-	b.lo 2222f /* 128 bits */
-	b.eq 1111f /* 192 bits */
-	enc_round \state, v17
-	enc_round \state, v18
-1111:	enc_round \state, v19
-	enc_round \state, v20
-2222:	.irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29
+	.macro enc_qround, s0, s1, s2, s3, key
+	enc_round \s0, \key
+	enc_round \s1, \key
+	enc_round \s2, \key
+	enc_round \s3, \key
+	.endm
+
+	.macro enc_block, state, rounds, rk, tmp
+	add \tmp, \rk, #96
+	ld1 {K6.4s-K7.4s}, [\tmp], #32
+	.irp key, K0, K1, K2, K3, K4, K5
 	enc_round \state, \key
 	.endr
-	aese \state\().16b, v30.16b
-	eor \state\().16b, \state\().16b, v31.16b
+
+	tbnz \rounds, #2, .Lnot128_\@
+.Lout256_\@:
+	enc_round \state, K6
+	enc_round \state, K7
+
+.Lout192_\@:
+	enc_round \state, KK
+	aese \state\().16b, KL.16b
+	eor \state\().16b, \state\().16b, KM.16b
+
+	.subsection 1
+.Lnot128_\@:
+	ld1 {K8.4s-K9.4s}, [\tmp], #32
+	enc_round \state, K6
+	enc_round \state, K7
+	ld1 {K6.4s-K7.4s}, [\tmp]
+	enc_round \state, K8
+	enc_round \state, K9
+	tbz \rounds, #1, .Lout192_\@
+	b .Lout256_\@
+	.previous
 	.endm

+	.align 6
 	.macro pmull_gcm_do_crypt, enc
-	ld1 {SHASH.2d}, [x4], #16
-	ld1 {HH.2d}, [x4]
-	ld1 {XL.2d}, [x1]
-	ldr x8, [x5, #8] // load lower counter
+	stp x29, x30, [sp, #-32]!
+	mov x29, sp
+	str x19, [sp, #24]

-	movi MASK.16b, #0xe1
+	load_round_keys x7, x6, x8
+
+	ld1 {SHASH.2d}, [x3], #16
+	ld1 {HH.2d-HH4.2d}, [x3]
+
 	trn1 SHASH2.2d, SHASH.2d, HH.2d
 	trn2 T1.2d, SHASH.2d, HH.2d
-CPU_LE( rev x8, x8 )
-	shl MASK.2d, MASK.2d, #57
 	eor SHASH2.16b, SHASH2.16b, T1.16b

-	.if \enc == 1
-	ldr x10, [sp]
-	ld1 {KS0.16b-KS1.16b}, [x10]
+	trn1 HH34.2d, HH3.2d, HH4.2d
+	trn2 T1.2d, HH3.2d, HH4.2d
+	eor HH34.16b, HH34.16b, T1.16b
+
+	ld1 {XL.2d}, [x4]
+
+	cbz x0, 3f // tag only?
+
+	ldr w8, [x5, #12] // load lower counter
+CPU_LE( rev w8, w8 )
+
+0:	mov w9, #4 // max blocks per round
+	add x10, x0, #0xf
+	lsr x10, x10, #4 // remaining blocks
+
+	subs x0, x0, #64
+	csel w9, w10, w9, mi
+	add w8, w8, w9
+
+	bmi 1f
+	ld1 {INP0.16b-INP3.16b}, [x2], #64
+	.subsection 1
+	/*
+	 * Populate the four input registers right to left with up to 63 bytes
+	 * of data, using overlapping loads to avoid branches.
+	 *
+	 *                  INP0     INP1     INP2     INP3
+	 *  1 byte        |        |        |        |x       |
+	 * 16 bytes       |        |        |        |xxxxxxxx|
+	 * 17 bytes       |        |        |xxxxxxxx|x       |
+	 * 47 bytes       |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
+	 * etc etc
+	 *
+	 * Note that this code may read up to 15 bytes before the start of
+	 * the input. It is up to the calling code to ensure this is safe if
+	 * this happens in the first iteration of the loop (i.e., when the
+	 * input size is < 16 bytes)
+	 */
+1:	mov x15, #16
+	ands x19, x0, #0xf
+	csel x19, x19, x15, ne
+	adr_l x17, .Lpermute_table + 16
+
+	sub x11, x15, x19
+	add x12, x17, x11
+	sub x17, x17, x11
+	ld1 {T1.16b}, [x12]
+	sub x10, x1, x11
+	sub x11, x2, x11
+
+	cmp x0, #-16
+	csel x14, x15, xzr, gt
+	cmp x0, #-32
+	csel x15, x15, xzr, gt
+	cmp x0, #-48
+	csel x16, x19, xzr, gt
+	csel x1, x1, x10, gt
+	csel x2, x2, x11, gt
+
+	ld1 {INP0.16b}, [x2], x14
+	ld1 {INP1.16b}, [x2], x15
+	ld1 {INP2.16b}, [x2], x16
+	ld1 {INP3.16b}, [x2]
+	tbl INP3.16b, {INP3.16b}, T1.16b
+	b 2f
+	.previous
+
+2:	.if \enc == 0
+	bl pmull_gcm_ghash_4x
 	.endif

-	cbnz x6, 4f
+	bl pmull_gcm_enc_4x

-0:	ld1 {INP0.16b-INP1.16b}, [x3], #32
+	tbnz x0, #63, 6f
+	st1 {INP0.16b-INP3.16b}, [x1], #64
+	.if \enc == 1
+	bl pmull_gcm_ghash_4x
+	.endif
+	bne 0b

-	rev x9, x8
-	add x11, x8, #1
-	add x8, x8, #2
+3:	ldp x19, x10, [sp, #24]
+	cbz x10, 5f // output tag?
+
+	ld1 {INP3.16b}, [x10] // load lengths[]
+	mov w9, #1
+	bl pmull_gcm_ghash_4x
+
+	mov w11, #(0x1 << 24) // BE '1U'
+	ld1 {KS0.16b}, [x5]
+	mov KS0.s[3], w11
+
+	enc_block KS0, x7, x6, x12
+
+	ext XL.16b, XL.16b, XL.16b, #8
+	rev64 XL.16b, XL.16b
+	eor XL.16b, XL.16b, KS0.16b

 	.if \enc == 1
-	eor INP0.16b, INP0.16b, KS0.16b // encrypt input
-	eor INP1.16b, INP1.16b, KS1.16b
+	st1 {XL.16b}, [x10] // store tag
+	.else
+	ldp x11, x12, [sp, #40] // load tag pointer and authsize
+	adr_l x17, .Lpermute_table
+	ld1 {KS0.16b}, [x11] // load supplied tag
+	add x17, x17, x12
+	ld1 {KS1.16b}, [x17] // load permute vector
+
+	cmeq XL.16b, XL.16b, KS0.16b // compare tags
+	mvn XL.16b, XL.16b // -1 for fail, 0 for pass
+	tbl XL.16b, {XL.16b}, KS1.16b // keep authsize bytes only
+	sminv b0, XL.16b // signed minimum across XL
+	smov w0, v0.b[0] // return b0
 	.endif

-	ld1 {KS0.8b}, [x5] // load upper counter
-	rev x11, x11
-	sub w0, w0, #2
-	mov KS1.8b, KS0.8b
-	ins KS0.d[1], x9 // set lower counter
-	ins KS1.d[1], x11
-
-	rev64 T1.16b, INP1.16b
-
-	cmp w7, #12
-	b.ge 2f // AES-192/256?
-
-1:	enc_round KS0, v21
-	ext IN1.16b, T1.16b, T1.16b, #8
-
-	enc_round KS1, v21
-	pmull2 XH2.1q, SHASH.2d, IN1.2d // a1 * b1
-
-	enc_round KS0, v22
-	eor T1.16b, T1.16b, IN1.16b
-
-	enc_round KS1, v22
-	pmull XL2.1q, SHASH.1d, IN1.1d // a0 * b0
-
-	enc_round KS0, v23
-	pmull XM2.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
-
-	enc_round KS1, v23
-	rev64 T1.16b, INP0.16b
-	ext T2.16b, XL.16b, XL.16b, #8
-
-	enc_round KS0, v24
-	ext IN1.16b, T1.16b, T1.16b, #8
-	eor T1.16b, T1.16b, T2.16b
-
-	enc_round KS1, v24
-	eor XL.16b, XL.16b, IN1.16b
-
-	enc_round KS0, v25
-	eor T1.16b, T1.16b, XL.16b
-
-	enc_round KS1, v25
-	pmull2 XH.1q, HH.2d, XL.2d // a1 * b1
-
-	enc_round KS0, v26
-	pmull XL.1q, HH.1d, XL.1d // a0 * b0
-
-	enc_round KS1, v26
-	pmull2 XM.1q, SHASH2.2d, T1.2d // (a1 + a0)(b1 + b0)
-
-	enc_round KS0, v27
-	eor XL.16b, XL.16b, XL2.16b
-	eor XH.16b, XH.16b, XH2.16b
-
-	enc_round KS1, v27
-	eor XM.16b, XM.16b, XM2.16b
-	ext T1.16b, XL.16b, XH.16b, #8
-
-	enc_round KS0, v28
-	eor T2.16b, XL.16b, XH.16b
-	eor XM.16b, XM.16b, T1.16b
-
-	enc_round KS1, v28
-	eor XM.16b, XM.16b, T2.16b
-
-	enc_round KS0, v29
-	pmull T2.1q, XL.1d, MASK.1d
-
-	enc_round KS1, v29
-	mov XH.d[0], XM.d[1]
-	mov XM.d[1], XL.d[0]
-
-	aese KS0.16b, v30.16b
-	eor XL.16b, XM.16b, T2.16b
-
-	aese KS1.16b, v30.16b
-	ext T2.16b, XL.16b, XL.16b, #8
-
-	eor KS0.16b, KS0.16b, v31.16b
-	pmull XL.1q, XL.1d, MASK.1d
-	eor T2.16b, T2.16b, XH.16b
-
-	eor KS1.16b, KS1.16b, v31.16b
-	eor XL.16b, XL.16b, T2.16b
-
-	.if \enc == 0
-	eor INP0.16b, INP0.16b, KS0.16b
-	eor INP1.16b, INP1.16b, KS1.16b
-	.endif
-
-	st1 {INP0.16b-INP1.16b}, [x2], #32
-
-	cbnz w0, 0b
-
-CPU_LE( rev x8, x8 )
-	st1 {XL.2d}, [x1]
-	str x8, [x5, #8] // store lower counter
-
-	.if \enc == 1
-	st1 {KS0.16b-KS1.16b}, [x10]
-	.endif
-
+4:	ldp x29, x30, [sp], #32
 	ret

-2:	b.eq 3f // AES-192?
-	enc_round KS0, v17
-	enc_round KS1, v17
-	enc_round KS0, v18
-	enc_round KS1, v18
-3:	enc_round KS0, v19
-	enc_round KS1, v19
-	enc_round KS0, v20
-	enc_round KS1, v20
-	b 1b
+5:
+CPU_LE( rev w8, w8 )
+	str w8, [x5, #12] // store lower counter
+	st1 {XL.2d}, [x4]
+	b 4b

-4:	load_round_keys w7, x6
-	b 0b
+6:	ld1 {T1.16b-T2.16b}, [x17], #32 // permute vectors
+	sub x17, x17, x19, lsl #1
+
+	cmp w9, #1
+	beq 7f
+	.subsection 1
+7:	ld1 {INP2.16b}, [x1]
+	tbx INP2.16b, {INP3.16b}, T1.16b
+	mov INP3.16b, INP2.16b
+	b 8f
+	.previous
+
+	st1 {INP0.16b}, [x1], x14
+	st1 {INP1.16b}, [x1], x15
+	st1 {INP2.16b}, [x1], x16
+	tbl INP3.16b, {INP3.16b}, T1.16b
+	tbx INP3.16b, {INP2.16b}, T2.16b
+8:	st1 {INP3.16b}, [x1]
+
+	.if \enc == 1
+	ld1 {T1.16b}, [x17]
+	tbl INP3.16b, {INP3.16b}, T1.16b // clear non-data bits
+	bl pmull_gcm_ghash_4x
+	.endif
+	b 3b
 	.endm

 	/*
-	 * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
-	 *			  struct ghash_key const *k, u8 ctr[],
-	 *			  int rounds, u8 ks[])
+	 * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
+	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
+	 *			  int rounds, u8 tag)
 	 */
-ENTRY(pmull_gcm_encrypt)
+SYM_FUNC_START(pmull_gcm_encrypt)
 	pmull_gcm_do_crypt 1
-ENDPROC(pmull_gcm_encrypt)
+SYM_FUNC_END(pmull_gcm_encrypt)

 	/*
-	 * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
-	 *			  struct ghash_key const *k, u8 ctr[],
-	 *			  int rounds)
+	 * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
+	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
+	 *			  int rounds, u8 tag)
 	 */
-ENTRY(pmull_gcm_decrypt)
+SYM_FUNC_START(pmull_gcm_decrypt)
 	pmull_gcm_do_crypt 0
-ENDPROC(pmull_gcm_decrypt)
+SYM_FUNC_END(pmull_gcm_decrypt)

-	/*
-	 * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
-	 */
-ENTRY(pmull_gcm_encrypt_block)
-	cbz x2, 0f
-	load_round_keys w3, x2
-0:	ld1 {v0.16b}, [x1]
-	enc_block v0, w3
-	st1 {v0.16b}, [x0]
+SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
+	movi MASK.16b, #0xe1
+	shl MASK.2d, MASK.2d, #57
+
+	rev64 T1.16b, INP0.16b
+	rev64 T2.16b, INP1.16b
+	rev64 TT3.16b, INP2.16b
+	rev64 TT4.16b, INP3.16b
+
+	ext XL.16b, XL.16b, XL.16b, #8
+
+	tbz w9, #2, 0f // <4 blocks?
+	.subsection 1
+0:	movi XH2.16b, #0
+	movi XM2.16b, #0
+	movi XL2.16b, #0
+
+	tbz w9, #0, 1f // 2 blocks?
+	tbz w9, #1, 2f // 1 block?
+
+	eor T2.16b, T2.16b, XL.16b
+	ext T1.16b, T2.16b, T2.16b, #8
+	b .Lgh3
+
+1:	eor TT3.16b, TT3.16b, XL.16b
+	ext T2.16b, TT3.16b, TT3.16b, #8
+	b .Lgh2
+
+2:	eor TT4.16b, TT4.16b, XL.16b
+	ext IN1.16b, TT4.16b, TT4.16b, #8
+	b .Lgh1
+	.previous
+
+	eor T1.16b, T1.16b, XL.16b
+	ext IN1.16b, T1.16b, T1.16b, #8
+
+	pmull2 XH2.1q, HH4.2d, IN1.2d // a1 * b1
+	eor T1.16b, T1.16b, IN1.16b
+	pmull XL2.1q, HH4.1d, IN1.1d // a0 * b0
+	pmull2 XM2.1q, HH34.2d, T1.2d // (a1 + a0)(b1 + b0)
+
+	ext T1.16b, T2.16b, T2.16b, #8
+.Lgh3:	eor T2.16b, T2.16b, T1.16b
+	pmull2 XH.1q, HH3.2d, T1.2d // a1 * b1
+	pmull XL.1q, HH3.1d, T1.1d // a0 * b0
+	pmull XM.1q, HH34.1d, T2.1d // (a1 + a0)(b1 + b0)
+
+	eor XH2.16b, XH2.16b, XH.16b
+	eor XL2.16b, XL2.16b, XL.16b
+	eor XM2.16b, XM2.16b, XM.16b
+
+	ext T2.16b, TT3.16b, TT3.16b, #8
+.Lgh2:	eor TT3.16b, TT3.16b, T2.16b
+	pmull2 XH.1q, HH.2d, T2.2d // a1 * b1
+	pmull XL.1q, HH.1d, T2.1d // a0 * b0
+	pmull2 XM.1q, SHASH2.2d, TT3.2d // (a1 + a0)(b1 + b0)
+
+	eor XH2.16b, XH2.16b, XH.16b
+	eor XL2.16b, XL2.16b, XL.16b
+	eor XM2.16b, XM2.16b, XM.16b
+
+	ext IN1.16b, TT4.16b, TT4.16b, #8
+.Lgh1:	eor TT4.16b, TT4.16b, IN1.16b
+	pmull XL.1q, SHASH.1d, IN1.1d // a0 * b0
+	pmull2 XH.1q, SHASH.2d, IN1.2d // a1 * b1
+	pmull XM.1q, SHASH2.1d, TT4.1d // (a1 + a0)(b1 + b0)
+
+	eor XH.16b, XH.16b, XH2.16b
+	eor XL.16b, XL.16b, XL2.16b
+	eor XM.16b, XM.16b, XM2.16b
+
+	eor T2.16b, XL.16b, XH.16b
+	ext T1.16b, XL.16b, XH.16b, #8
+	eor XM.16b, XM.16b, T2.16b
+
+	__pmull_reduce_p64
+
+	eor T2.16b, T2.16b, XH.16b
+	eor XL.16b, XL.16b, T2.16b
+
 	ret
-ENDPROC(pmull_gcm_encrypt_block)
+SYM_FUNC_END(pmull_gcm_ghash_4x)
+
+SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
+	ld1 {KS0.16b}, [x5] // load upper counter
+	sub w10, w8, #4
+	sub w11, w8, #3
+	sub w12, w8, #2
+	sub w13, w8, #1
+	rev w10, w10
+	rev w11, w11
+	rev w12, w12
+	rev w13, w13
+	mov KS1.16b, KS0.16b
+	mov KS2.16b, KS0.16b
+	mov KS3.16b, KS0.16b
+	ins KS0.s[3], w10 // set lower counter
+	ins KS1.s[3], w11
+	ins KS2.s[3], w12
+	ins KS3.s[3], w13
+
+	add x10, x6, #96 // round key pointer
+	ld1 {K6.4s-K7.4s}, [x10], #32
+	.irp key, K0, K1, K2, K3, K4, K5
+	enc_qround KS0, KS1, KS2, KS3, \key
+	.endr
+
+	tbnz x7, #2, .Lnot128
+	.subsection 1
+.Lnot128:
+	ld1 {K8.4s-K9.4s}, [x10], #32
+	.irp key, K6, K7
+	enc_qround KS0, KS1, KS2, KS3, \key
+	.endr
+	ld1 {K6.4s-K7.4s}, [x10]
+	.irp key, K8, K9
+	enc_qround KS0, KS1, KS2, KS3, \key
+	.endr
+	tbz x7, #1, .Lout192
+	b .Lout256
+	.previous
+
+.Lout256:
+	.irp key, K6, K7
+	enc_qround KS0, KS1, KS2, KS3, \key
+	.endr
+
+.Lout192:
+	enc_qround KS0, KS1, KS2, KS3, KK
+
+	aese KS0.16b, KL.16b
+	aese KS1.16b, KL.16b
+	aese KS2.16b, KL.16b
+	aese KS3.16b, KL.16b
+
+	eor KS0.16b, KS0.16b, KM.16b
+	eor KS1.16b, KS1.16b, KM.16b
+	eor KS2.16b, KS2.16b, KM.16b
+	eor KS3.16b, KS3.16b, KM.16b
+
+	eor INP0.16b, INP0.16b, KS0.16b
+	eor INP1.16b, INP1.16b, KS1.16b
+	eor INP2.16b, INP2.16b, KS2.16b
+	eor INP3.16b, INP3.16b, KS3.16b
+
+	ret
+SYM_FUNC_END(pmull_gcm_enc_4x)
+
+	.section ".rodata", "a"
+	.align 6
+.Lpermute_table:
+	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+	.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
+	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+	.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
+	.previous
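
Illustrative note, not part of the patch: the new pmull_gcm_enc_4x helper builds four AES-CTR keystream blocks per call by copying the counter block kept at ctr[] (x5) and splicing four consecutive lower-counter values, big-endian, into its last 32-bit word (the sub/rev/ins KSn.s[3] sequence above). A minimal C sketch of that counter construction, using a hypothetical make_ctr_blocks() helper rather than any kernel API:

#include <stdint.h>
#include <string.h>

/*
 * Sketch only: build the four 16-byte counter blocks that one iteration of
 * pmull_gcm_enc_4x encrypts. 'iv' is the counter block held at ctr[] (x5);
 * 'ctr' is the lower counter of the first block in the batch. The assembly
 * keeps the already-advanced counter in w8 and derives ctr .. ctr+3 as
 * w8-4 .. w8-1 before byte-reversing them into KS0.s[3] .. KS3.s[3].
 */
static void make_ctr_blocks(const uint8_t iv[16], uint32_t ctr, uint8_t out[4][16])
{
	for (int i = 0; i < 4; i++) {
		uint32_t c = ctr + (uint32_t)i;

		memcpy(out[i], iv, 16);			/* copy counter block; last word is rewritten below */
		out[i][12] = (uint8_t)(c >> 24);	/* bytes 12..15: big-endian block counter */
		out[i][13] = (uint8_t)(c >> 16);
		out[i][14] = (uint8_t)(c >> 8);
		out[i][15] = (uint8_t)c;
	}
}

Each out[i] would then be AES-encrypted with the loaded round keys and XORed into INP0..INP3, which is what the enc_qround/aese/eor sequence in pmull_gcm_enc_4x does for four blocks at a time.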