2023-12-08 01573e231f18eb2d99162747186f59511f56b64d
kernel/arch/arm64/crypto/ghash-ce-core.S
@@ -1,11 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Accelerated GHASH implementation with ARMv8 PMULL instructions.
  *
  * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
  */

 #include <linux/linkage.h>
@@ -16,8 +13,8 @@
 	T1 .req v2
 	T2 .req v3
 	MASK .req v4
-	XL .req v5
-	XM .req v6
+	XM .req v5
+	XL .req v6
 	XH .req v7
 	IN1 .req v7

@@ -353,28 +350,45 @@
 	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
 	 *			   struct ghash_key const *k, const char *head)
 	 */
-ENTRY(pmull_ghash_update_p64)
+SYM_FUNC_START(pmull_ghash_update_p64)
 	__pmull_ghash p64
-ENDPROC(pmull_ghash_update_p64)
+SYM_FUNC_END(pmull_ghash_update_p64)

-ENTRY(pmull_ghash_update_p8)
+SYM_FUNC_START(pmull_ghash_update_p8)
 	__pmull_ghash p8
-ENDPROC(pmull_ghash_update_p8)
+SYM_FUNC_END(pmull_ghash_update_p8)

-	KS0 .req v12
-	KS1 .req v13
-	INP0 .req v14
-	INP1 .req v15
+	KS0 .req v8
+	KS1 .req v9
+	KS2 .req v10
+	KS3 .req v11

-	.macro load_round_keys, rounds, rk
-	cmp \rounds, #12
-	blo 2222f /* 128 bits */
-	beq 1111f /* 192 bits */
-	ld1 {v17.4s-v18.4s}, [\rk], #32
-1111:	ld1 {v19.4s-v20.4s}, [\rk], #32
-2222:	ld1 {v21.4s-v24.4s}, [\rk], #64
-	ld1 {v25.4s-v28.4s}, [\rk], #64
-	ld1 {v29.4s-v31.4s}, [\rk]
+	INP0 .req v21
+	INP1 .req v22
+	INP2 .req v23
+	INP3 .req v24
+
+	K0 .req v25
+	K1 .req v26
+	K2 .req v27
+	K3 .req v28
+	K4 .req v12
+	K5 .req v13
+	K6 .req v4
+	K7 .req v5
+	K8 .req v14
+	K9 .req v15
+	KK .req v29
+	KL .req v30
+	KM .req v31
+
+	.macro load_round_keys, rounds, rk, tmp
+	add \tmp, \rk, #64
+	ld1 {K0.4s-K3.4s}, [\rk]
+	ld1 {K4.4s-K5.4s}, [\tmp]
+	add \tmp, \rk, \rounds, lsl #4
+	sub \tmp, \tmp, #32
+	ld1 {KK.4s-KM.4s}, [\tmp]
 	.endm

 	.macro enc_round, state, key
@@ -382,197 +396,382 @@
 	aesmc \state\().16b, \state\().16b
 	.endm

-	.macro enc_block, state, rounds
-	cmp \rounds, #12
-	b.lo 2222f /* 128 bits */
-	b.eq 1111f /* 192 bits */
-	enc_round \state, v17
-	enc_round \state, v18
-1111:	enc_round \state, v19
-	enc_round \state, v20
-2222:	.irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29
+	.macro enc_qround, s0, s1, s2, s3, key
+	enc_round \s0, \key
+	enc_round \s1, \key
+	enc_round \s2, \key
+	enc_round \s3, \key
+	.endm
+
+	.macro enc_block, state, rounds, rk, tmp
+	add \tmp, \rk, #96
+	ld1 {K6.4s-K7.4s}, [\tmp], #32
+	.irp key, K0, K1, K2, K3, K4, K5
 	enc_round \state, \key
 	.endr
-	aese \state\().16b, v30.16b
-	eor \state\().16b, \state\().16b, v31.16b
+
+	tbnz \rounds, #2, .Lnot128_\@
+.Lout256_\@:
+	enc_round \state, K6
+	enc_round \state, K7
+
+.Lout192_\@:
+	enc_round \state, KK
+	aese \state\().16b, KL.16b
+	eor \state\().16b, \state\().16b, KM.16b
+
+	.subsection 1
+.Lnot128_\@:
+	ld1 {K8.4s-K9.4s}, [\tmp], #32
+	enc_round \state, K6
+	enc_round \state, K7
+	ld1 {K6.4s-K7.4s}, [\tmp]
+	enc_round \state, K8
+	enc_round \state, K9
+	tbz \rounds, #1, .Lout192_\@
+	b .Lout256_\@
+	.previous
 	.endm

+	.align 6
 	.macro pmull_gcm_do_crypt, enc
-	ld1 {SHASH.2d}, [x4], #16
-	ld1 {HH.2d}, [x4]
-	ld1 {XL.2d}, [x1]
-	ldr x8, [x5, #8] // load lower counter
+	stp x29, x30, [sp, #-32]!
+	mov x29, sp
+	str x19, [sp, #24]

-	movi MASK.16b, #0xe1
+	load_round_keys x7, x6, x8
+
+	ld1 {SHASH.2d}, [x3], #16
+	ld1 {HH.2d-HH4.2d}, [x3]
+
 	trn1 SHASH2.2d, SHASH.2d, HH.2d
 	trn2 T1.2d, SHASH.2d, HH.2d
-CPU_LE( rev x8, x8 )
-	shl MASK.2d, MASK.2d, #57
 	eor SHASH2.16b, SHASH2.16b, T1.16b

-	.if \enc == 1
-	ldr x10, [sp]
-	ld1 {KS0.16b-KS1.16b}, [x10]
+	trn1 HH34.2d, HH3.2d, HH4.2d
+	trn2 T1.2d, HH3.2d, HH4.2d
+	eor HH34.16b, HH34.16b, T1.16b
+
+	ld1 {XL.2d}, [x4]
+
+	cbz x0, 3f // tag only?
+
+	ldr w8, [x5, #12] // load lower counter
+CPU_LE( rev w8, w8 )
+
+0:	mov w9, #4 // max blocks per round
+	add x10, x0, #0xf
+	lsr x10, x10, #4 // remaining blocks
+
+	subs x0, x0, #64
+	csel w9, w10, w9, mi
+	add w8, w8, w9
+
+	bmi 1f
+	ld1 {INP0.16b-INP3.16b}, [x2], #64
+	.subsection 1
+	/*
+	 * Populate the four input registers right to left with up to 63 bytes
+	 * of data, using overlapping loads to avoid branches.
+	 *
+	 *                  INP0     INP1     INP2     INP3
+	 *  1 byte        |        |        |        |x       |
+	 * 16 bytes       |        |        |        |xxxxxxxx|
+	 * 17 bytes       |        |        |xxxxxxxx|x       |
+	 * 47 bytes       |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
+	 * etc etc
+	 *
+	 * Note that this code may read up to 15 bytes before the start of
+	 * the input. It is up to the calling code to ensure this is safe if
+	 * this happens in the first iteration of the loop (i.e., when the
+	 * input size is < 16 bytes)
+	 */
+1:	mov x15, #16
+	ands x19, x0, #0xf
+	csel x19, x19, x15, ne
+	adr_l x17, .Lpermute_table + 16
+
+	sub x11, x15, x19
+	add x12, x17, x11
+	sub x17, x17, x11
+	ld1 {T1.16b}, [x12]
+	sub x10, x1, x11
+	sub x11, x2, x11
+
+	cmp x0, #-16
+	csel x14, x15, xzr, gt
+	cmp x0, #-32
+	csel x15, x15, xzr, gt
+	cmp x0, #-48
+	csel x16, x19, xzr, gt
+	csel x1, x1, x10, gt
+	csel x2, x2, x11, gt
+
+	ld1 {INP0.16b}, [x2], x14
+	ld1 {INP1.16b}, [x2], x15
+	ld1 {INP2.16b}, [x2], x16
+	ld1 {INP3.16b}, [x2]
+	tbl INP3.16b, {INP3.16b}, T1.16b
+	b 2f
+	.previous
+
+2:	.if \enc == 0
+	bl pmull_gcm_ghash_4x
 	.endif

-	cbnz x6, 4f
+	bl pmull_gcm_enc_4x

-0:	ld1 {INP0.16b-INP1.16b}, [x3], #32
+	tbnz x0, #63, 6f
+	st1 {INP0.16b-INP3.16b}, [x1], #64
+	.if \enc == 1
+	bl pmull_gcm_ghash_4x
+	.endif
+	bne 0b

-	rev x9, x8
-	add x11, x8, #1
-	add x8, x8, #2
+3:	ldp x19, x10, [sp, #24]
+	cbz x10, 5f // output tag?
+
+	ld1 {INP3.16b}, [x10] // load lengths[]
+	mov w9, #1
+	bl pmull_gcm_ghash_4x
+
+	mov w11, #(0x1 << 24) // BE '1U'
+	ld1 {KS0.16b}, [x5]
+	mov KS0.s[3], w11
+
+	enc_block KS0, x7, x6, x12
+
+	ext XL.16b, XL.16b, XL.16b, #8
+	rev64 XL.16b, XL.16b
+	eor XL.16b, XL.16b, KS0.16b

 	.if \enc == 1
-	eor INP0.16b, INP0.16b, KS0.16b // encrypt input
-	eor INP1.16b, INP1.16b, KS1.16b
+	st1 {XL.16b}, [x10] // store tag
+	.else
+	ldp x11, x12, [sp, #40] // load tag pointer and authsize
+	adr_l x17, .Lpermute_table
+	ld1 {KS0.16b}, [x11] // load supplied tag
+	add x17, x17, x12
+	ld1 {KS1.16b}, [x17] // load permute vector
+
+	cmeq XL.16b, XL.16b, KS0.16b // compare tags
+	mvn XL.16b, XL.16b // -1 for fail, 0 for pass
+	tbl XL.16b, {XL.16b}, KS1.16b // keep authsize bytes only
+	sminv b0, XL.16b // signed minimum across XL
+	smov w0, v0.b[0] // return b0
 	.endif

-	ld1 {KS0.8b}, [x5] // load upper counter
-	rev x11, x11
-	sub w0, w0, #2
-	mov KS1.8b, KS0.8b
-	ins KS0.d[1], x9 // set lower counter
-	ins KS1.d[1], x11
-
-	rev64 T1.16b, INP1.16b
-
-	cmp w7, #12
-	b.ge 2f // AES-192/256?
-
-1:	enc_round KS0, v21
-	ext IN1.16b, T1.16b, T1.16b, #8
-
-	enc_round KS1, v21
-	pmull2 XH2.1q, SHASH.2d, IN1.2d // a1 * b1
-
-	enc_round KS0, v22
-	eor T1.16b, T1.16b, IN1.16b
-
-	enc_round KS1, v22
-	pmull XL2.1q, SHASH.1d, IN1.1d // a0 * b0
-
-	enc_round KS0, v23
-	pmull XM2.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
-
-	enc_round KS1, v23
-	rev64 T1.16b, INP0.16b
-	ext T2.16b, XL.16b, XL.16b, #8
-
-	enc_round KS0, v24
-	ext IN1.16b, T1.16b, T1.16b, #8
-	eor T1.16b, T1.16b, T2.16b
-
-	enc_round KS1, v24
-	eor XL.16b, XL.16b, IN1.16b
-
-	enc_round KS0, v25
-	eor T1.16b, T1.16b, XL.16b
-
-	enc_round KS1, v25
-	pmull2 XH.1q, HH.2d, XL.2d // a1 * b1
-
-	enc_round KS0, v26
-	pmull XL.1q, HH.1d, XL.1d // a0 * b0
-
-	enc_round KS1, v26
-	pmull2 XM.1q, SHASH2.2d, T1.2d // (a1 + a0)(b1 + b0)
-
-	enc_round KS0, v27
-	eor XL.16b, XL.16b, XL2.16b
-	eor XH.16b, XH.16b, XH2.16b
-
-	enc_round KS1, v27
-	eor XM.16b, XM.16b, XM2.16b
-	ext T1.16b, XL.16b, XH.16b, #8
-
-	enc_round KS0, v28
-	eor T2.16b, XL.16b, XH.16b
-	eor XM.16b, XM.16b, T1.16b
-
-	enc_round KS1, v28
-	eor XM.16b, XM.16b, T2.16b
-
-	enc_round KS0, v29
-	pmull T2.1q, XL.1d, MASK.1d
-
-	enc_round KS1, v29
-	mov XH.d[0], XM.d[1]
-	mov XM.d[1], XL.d[0]
-
-	aese KS0.16b, v30.16b
-	eor XL.16b, XM.16b, T2.16b
-
-	aese KS1.16b, v30.16b
-	ext T2.16b, XL.16b, XL.16b, #8
-
-	eor KS0.16b, KS0.16b, v31.16b
-	pmull XL.1q, XL.1d, MASK.1d
-	eor T2.16b, T2.16b, XH.16b
-
-	eor KS1.16b, KS1.16b, v31.16b
-	eor XL.16b, XL.16b, T2.16b
-
-	.if \enc == 0
-	eor INP0.16b, INP0.16b, KS0.16b
-	eor INP1.16b, INP1.16b, KS1.16b
-	.endif
-
-	st1 {INP0.16b-INP1.16b}, [x2], #32
-
-	cbnz w0, 0b
-
-CPU_LE( rev x8, x8 )
-	st1 {XL.2d}, [x1]
-	str x8, [x5, #8] // store lower counter
-
-	.if \enc == 1
-	st1 {KS0.16b-KS1.16b}, [x10]
-	.endif
-
+4:	ldp x29, x30, [sp], #32
 	ret

-2:	b.eq 3f // AES-192?
-	enc_round KS0, v17
-	enc_round KS1, v17
-	enc_round KS0, v18
-	enc_round KS1, v18
-3:	enc_round KS0, v19
-	enc_round KS1, v19
-	enc_round KS0, v20
-	enc_round KS1, v20
-	b 1b
+5:
+CPU_LE( rev w8, w8 )
+	str w8, [x5, #12] // store lower counter
+	st1 {XL.2d}, [x4]
+	b 4b

-4:	load_round_keys w7, x6
-	b 0b
+6:	ld1 {T1.16b-T2.16b}, [x17], #32 // permute vectors
+	sub x17, x17, x19, lsl #1
+
+	cmp w9, #1
+	beq 7f
+	.subsection 1
+7:	ld1 {INP2.16b}, [x1]
+	tbx INP2.16b, {INP3.16b}, T1.16b
+	mov INP3.16b, INP2.16b
+	b 8f
+	.previous
+
+	st1 {INP0.16b}, [x1], x14
+	st1 {INP1.16b}, [x1], x15
+	st1 {INP2.16b}, [x1], x16
+	tbl INP3.16b, {INP3.16b}, T1.16b
+	tbx INP3.16b, {INP2.16b}, T2.16b
+8:	st1 {INP3.16b}, [x1]
+
+	.if \enc == 1
+	ld1 {T1.16b}, [x17]
+	tbl INP3.16b, {INP3.16b}, T1.16b // clear non-data bits
+	bl pmull_gcm_ghash_4x
+	.endif
+	b 3b
 	.endm

 	/*
-	 * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
-	 *			  struct ghash_key const *k, u8 ctr[],
-	 *			  int rounds, u8 ks[])
+	 * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
+	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
+	 *			  int rounds, u8 tag)
 	 */
-ENTRY(pmull_gcm_encrypt)
+SYM_FUNC_START(pmull_gcm_encrypt)
 	pmull_gcm_do_crypt 1
-ENDPROC(pmull_gcm_encrypt)
+SYM_FUNC_END(pmull_gcm_encrypt)

 	/*
-	 * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
-	 *			  struct ghash_key const *k, u8 ctr[],
-	 *			  int rounds)
+	 * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
+	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
+	 *			  int rounds, u8 tag)
 	 */
-ENTRY(pmull_gcm_decrypt)
+SYM_FUNC_START(pmull_gcm_decrypt)
 	pmull_gcm_do_crypt 0
-ENDPROC(pmull_gcm_decrypt)
+SYM_FUNC_END(pmull_gcm_decrypt)

-	/*
-	 * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
-	 */
-ENTRY(pmull_gcm_encrypt_block)
-	cbz x2, 0f
-	load_round_keys w3, x2
-0:	ld1 {v0.16b}, [x1]
-	enc_block v0, w3
-	st1 {v0.16b}, [x0]
+SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
+	movi MASK.16b, #0xe1
+	shl MASK.2d, MASK.2d, #57
+
+	rev64 T1.16b, INP0.16b
+	rev64 T2.16b, INP1.16b
+	rev64 TT3.16b, INP2.16b
+	rev64 TT4.16b, INP3.16b
+
+	ext XL.16b, XL.16b, XL.16b, #8
+
+	tbz w9, #2, 0f // <4 blocks?
+	.subsection 1
+0:	movi XH2.16b, #0
+	movi XM2.16b, #0
+	movi XL2.16b, #0
+
+	tbz w9, #0, 1f // 2 blocks?
+	tbz w9, #1, 2f // 1 block?
+
+	eor T2.16b, T2.16b, XL.16b
+	ext T1.16b, T2.16b, T2.16b, #8
+	b .Lgh3
+
+1:	eor TT3.16b, TT3.16b, XL.16b
+	ext T2.16b, TT3.16b, TT3.16b, #8
+	b .Lgh2
+
+2:	eor TT4.16b, TT4.16b, XL.16b
+	ext IN1.16b, TT4.16b, TT4.16b, #8
+	b .Lgh1
+	.previous
+
+	eor T1.16b, T1.16b, XL.16b
+	ext IN1.16b, T1.16b, T1.16b, #8
+
+	pmull2 XH2.1q, HH4.2d, IN1.2d // a1 * b1
+	eor T1.16b, T1.16b, IN1.16b
+	pmull XL2.1q, HH4.1d, IN1.1d // a0 * b0
+	pmull2 XM2.1q, HH34.2d, T1.2d // (a1 + a0)(b1 + b0)
+
+	ext T1.16b, T2.16b, T2.16b, #8
+.Lgh3:	eor T2.16b, T2.16b, T1.16b
+	pmull2 XH.1q, HH3.2d, T1.2d // a1 * b1
+	pmull XL.1q, HH3.1d, T1.1d // a0 * b0
+	pmull XM.1q, HH34.1d, T2.1d // (a1 + a0)(b1 + b0)
+
+	eor XH2.16b, XH2.16b, XH.16b
+	eor XL2.16b, XL2.16b, XL.16b
+	eor XM2.16b, XM2.16b, XM.16b
+
+	ext T2.16b, TT3.16b, TT3.16b, #8
+.Lgh2:	eor TT3.16b, TT3.16b, T2.16b
+	pmull2 XH.1q, HH.2d, T2.2d // a1 * b1
+	pmull XL.1q, HH.1d, T2.1d // a0 * b0
+	pmull2 XM.1q, SHASH2.2d, TT3.2d // (a1 + a0)(b1 + b0)
+
+	eor XH2.16b, XH2.16b, XH.16b
+	eor XL2.16b, XL2.16b, XL.16b
+	eor XM2.16b, XM2.16b, XM.16b
+
+	ext IN1.16b, TT4.16b, TT4.16b, #8
+.Lgh1:	eor TT4.16b, TT4.16b, IN1.16b
+	pmull XL.1q, SHASH.1d, IN1.1d // a0 * b0
+	pmull2 XH.1q, SHASH.2d, IN1.2d // a1 * b1
+	pmull XM.1q, SHASH2.1d, TT4.1d // (a1 + a0)(b1 + b0)
+
+	eor XH.16b, XH.16b, XH2.16b
+	eor XL.16b, XL.16b, XL2.16b
+	eor XM.16b, XM.16b, XM2.16b
+
+	eor T2.16b, XL.16b, XH.16b
+	ext T1.16b, XL.16b, XH.16b, #8
+	eor XM.16b, XM.16b, T2.16b
+
+	__pmull_reduce_p64
+
+	eor T2.16b, T2.16b, XH.16b
+	eor XL.16b, XL.16b, T2.16b
+
 	ret
-ENDPROC(pmull_gcm_encrypt_block)
+SYM_FUNC_END(pmull_gcm_ghash_4x)
+
+SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
+	ld1 {KS0.16b}, [x5] // load upper counter
+	sub w10, w8, #4
+	sub w11, w8, #3
+	sub w12, w8, #2
+	sub w13, w8, #1
+	rev w10, w10
+	rev w11, w11
+	rev w12, w12
+	rev w13, w13
+	mov KS1.16b, KS0.16b
+	mov KS2.16b, KS0.16b
+	mov KS3.16b, KS0.16b
+	ins KS0.s[3], w10 // set lower counter
+	ins KS1.s[3], w11
+	ins KS2.s[3], w12
+	ins KS3.s[3], w13
+
+	add x10, x6, #96 // round key pointer
+	ld1 {K6.4s-K7.4s}, [x10], #32
+	.irp key, K0, K1, K2, K3, K4, K5
+	enc_qround KS0, KS1, KS2, KS3, \key
+	.endr
+
+	tbnz x7, #2, .Lnot128
+	.subsection 1
+.Lnot128:
+	ld1 {K8.4s-K9.4s}, [x10], #32
+	.irp key, K6, K7
+	enc_qround KS0, KS1, KS2, KS3, \key
+	.endr
+	ld1 {K6.4s-K7.4s}, [x10]
+	.irp key, K8, K9
+	enc_qround KS0, KS1, KS2, KS3, \key
+	.endr
+	tbz x7, #1, .Lout192
+	b .Lout256
+	.previous
+
+.Lout256:
+	.irp key, K6, K7
+	enc_qround KS0, KS1, KS2, KS3, \key
+	.endr
+
+.Lout192:
+	enc_qround KS0, KS1, KS2, KS3, KK
+
+	aese KS0.16b, KL.16b
+	aese KS1.16b, KL.16b
+	aese KS2.16b, KL.16b
+	aese KS3.16b, KL.16b
+
+	eor KS0.16b, KS0.16b, KM.16b
+	eor KS1.16b, KS1.16b, KM.16b
+	eor KS2.16b, KS2.16b, KM.16b
+	eor KS3.16b, KS3.16b, KM.16b
+
+	eor INP0.16b, INP0.16b, KS0.16b
+	eor INP1.16b, INP1.16b, KS1.16b
+	eor INP2.16b, INP2.16b, KS2.16b
+	eor INP3.16b, INP3.16b, KS3.16b
+
+	ret
+SYM_FUNC_END(pmull_gcm_enc_4x)
+
+	.section ".rodata", "a"
+	.align 6
+.Lpermute_table:
+	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+	.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
+	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+	.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
+	.previous
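
Illustrative note, not part of the patch: the new pmull_gcm_enc_4x helper builds four AES-CTR keystream blocks per call by copying the counter block kept at ctr[] (x5) and splicing four consecutive lower-counter values, big-endian, into its last 32-bit word (the sub/rev/ins KSn.s[3] sequence above). A minimal C sketch of that counter construction, using a hypothetical make_ctr_blocks() helper rather than any kernel API:

#include <stdint.h>
#include <string.h>

/*
 * Sketch only: build the four 16-byte counter blocks that one iteration of
 * pmull_gcm_enc_4x encrypts. 'iv' is the counter block held at ctr[] (x5);
 * 'ctr' is the lower counter of the first block in the batch. The assembly
 * keeps the already-advanced counter in w8 and derives ctr .. ctr+3 as
 * w8-4 .. w8-1 before byte-reversing them into KS0.s[3] .. KS3.s[3].
 */
static void make_ctr_blocks(const uint8_t iv[16], uint32_t ctr, uint8_t out[4][16])
{
	for (int i = 0; i < 4; i++) {
		uint32_t c = ctr + (uint32_t)i;

		memcpy(out[i], iv, 16);			/* copy counter block; last word is rewritten below */
		out[i][12] = (uint8_t)(c >> 24);	/* bytes 12..15: big-endian block counter */
		out[i][13] = (uint8_t)(c >> 16);
		out[i][14] = (uint8_t)(c >> 8);
		out[i][15] = (uint8_t)c;
	}
}

Each out[i] would then be AES-encrypted with the loaded round keys and XORed into INP0..INP3, which is what the enc_qround/aese/eor sequence in pmull_gcm_enc_4x does for four blocks at a time.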