| .. | .. |
|---|
| 1 | +/* SPDX-License-Identifier: GPL-2.0-only */ |
|---|
| 1 | 2 | /* |
|---|
| 2 | 3 | * Accelerated GHASH implementation with ARMv8 PMULL instructions. |
|---|
| 3 | 4 | * |
|---|
| 4 | 5 | * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org> |
|---|
| 5 | | - * |
|---|
| 6 | | - * This program is free software; you can redistribute it and/or modify it |
|---|
| 7 | | - * under the terms of the GNU General Public License version 2 as published |
|---|
| 8 | | - * by the Free Software Foundation. |
|---|
| 9 | 6 | */ |
|---|
| 10 | 7 | |
|---|
| 11 | 8 | #include <linux/linkage.h> |
|---|
| .. | .. |
|---|
| 16 | 13 | T1 .req v2 |
|---|
| 17 | 14 | T2 .req v3 |
|---|
| 18 | 15 | MASK .req v4 |
|---|
| 19 | | - XL .req v5 |
|---|
| 20 | | - XM .req v6 |
|---|
| 16 | + XM .req v5 |
|---|
| 17 | + XL .req v6 |
|---|
| 21 | 18 | XH .req v7 |
|---|
| 22 | 19 | IN1 .req v7 |
|---|
| 23 | 20 | |
|---|
| .. | .. |
|---|
| 353 | 350 | * void pmull_ghash_update(int blocks, u64 dg[], const char *src, |
|---|
| 354 | 351 | * struct ghash_key const *k, const char *head) |
|---|
| 355 | 352 | */ |
|---|
| 356 | | -ENTRY(pmull_ghash_update_p64) |
|---|
| 353 | +SYM_FUNC_START(pmull_ghash_update_p64) |
|---|
| 357 | 354 | __pmull_ghash p64 |
|---|
| 358 | | -ENDPROC(pmull_ghash_update_p64) |
|---|
| 355 | +SYM_FUNC_END(pmull_ghash_update_p64) |
|---|
| 359 | 356 | |
|---|
| 360 | | -ENTRY(pmull_ghash_update_p8) |
|---|
| 357 | +SYM_FUNC_START(pmull_ghash_update_p8) |
|---|
| 361 | 358 | __pmull_ghash p8 |
|---|
| 362 | | -ENDPROC(pmull_ghash_update_p8) |
|---|
| 359 | +SYM_FUNC_END(pmull_ghash_update_p8) |
|---|
| 363 | 360 | |
|---|
| 364 | | - KS0 .req v12 |
|---|
| 365 | | - KS1 .req v13 |
|---|
| 366 | | - INP0 .req v14 |
|---|
| 367 | | - INP1 .req v15 |
|---|
| 361 | + KS0 .req v8 |
|---|
| 362 | + KS1 .req v9 |
|---|
| 363 | + KS2 .req v10 |
|---|
| 364 | + KS3 .req v11 |
|---|
| 368 | 365 | |
|---|
| 369 | | - .macro load_round_keys, rounds, rk |
|---|
| 370 | | - cmp \rounds, #12 |
|---|
| 371 | | - blo 2222f /* 128 bits */ |
|---|
| 372 | | - beq 1111f /* 192 bits */ |
|---|
| 373 | | - ld1 {v17.4s-v18.4s}, [\rk], #32 |
|---|
| 374 | | -1111: ld1 {v19.4s-v20.4s}, [\rk], #32 |
|---|
| 375 | | -2222: ld1 {v21.4s-v24.4s}, [\rk], #64 |
|---|
| 376 | | - ld1 {v25.4s-v28.4s}, [\rk], #64 |
|---|
| 377 | | - ld1 {v29.4s-v31.4s}, [\rk] |
|---|
| 366 | + INP0 .req v21 |
|---|
| 367 | + INP1 .req v22 |
|---|
| 368 | + INP2 .req v23 |
|---|
| 369 | + INP3 .req v24 |
|---|
| 370 | + |
|---|
| 371 | + K0 .req v25 |
|---|
| 372 | + K1 .req v26 |
|---|
| 373 | + K2 .req v27 |
|---|
| 374 | + K3 .req v28 |
|---|
| 375 | + K4 .req v12 |
|---|
| 376 | + K5 .req v13 |
|---|
| 377 | + K6 .req v4 |
|---|
| 378 | + K7 .req v5 |
|---|
| 379 | + K8 .req v14 |
|---|
| 380 | + K9 .req v15 |
|---|
| 381 | + KK .req v29 |
|---|
| 382 | + KL .req v30 |
|---|
| 383 | + KM .req v31 |
|---|
| 384 | + |
|---|
| 385 | + .macro load_round_keys, rounds, rk, tmp |
|---|
| 386 | + add \tmp, \rk, #64 |
|---|
| 387 | + ld1 {K0.4s-K3.4s}, [\rk] |
|---|
| 388 | + ld1 {K4.4s-K5.4s}, [\tmp] |
|---|
| 389 | + add \tmp, \rk, \rounds, lsl #4 |
|---|
| 390 | + sub \tmp, \tmp, #32 |
|---|
| 391 | + ld1 {KK.4s-KM.4s}, [\tmp] |
|---|
| 378 | 392 | .endm |
|---|
| 379 | 393 | |
|---|
| 380 | 394 | .macro enc_round, state, key |
|---|
| .. | .. |
|---|
| 382 | 396 | aesmc \state\().16b, \state\().16b |
|---|
| 383 | 397 | .endm |
|---|
| 384 | 398 | |
|---|
| 385 | | - .macro enc_block, state, rounds |
|---|
| 386 | | - cmp \rounds, #12 |
|---|
| 387 | | - b.lo 2222f /* 128 bits */ |
|---|
| 388 | | - b.eq 1111f /* 192 bits */ |
|---|
| 389 | | - enc_round \state, v17 |
|---|
| 390 | | - enc_round \state, v18 |
|---|
| 391 | | -1111: enc_round \state, v19 |
|---|
| 392 | | - enc_round \state, v20 |
|---|
| 393 | | -2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29 |
|---|
| 399 | + .macro enc_qround, s0, s1, s2, s3, key |
|---|
| 400 | + enc_round \s0, \key |
|---|
| 401 | + enc_round \s1, \key |
|---|
| 402 | + enc_round \s2, \key |
|---|
| 403 | + enc_round \s3, \key |
|---|
| 404 | + .endm |
|---|
| 405 | + |
|---|
| 406 | + .macro enc_block, state, rounds, rk, tmp |
|---|
| 407 | + add \tmp, \rk, #96 |
|---|
| 408 | + ld1 {K6.4s-K7.4s}, [\tmp], #32 |
|---|
| 409 | + .irp key, K0, K1, K2, K3, K4, K5 |
|---|
| 394 | 410 | enc_round \state, \key |
|---|
| 395 | 411 | .endr |
|---|
| 396 | | - aese \state\().16b, v30.16b |
|---|
| 397 | | - eor \state\().16b, \state\().16b, v31.16b |
|---|
| 412 | + |
|---|
| 413 | + tbnz \rounds, #2, .Lnot128_\@ |
|---|
| 414 | +.Lout256_\@: |
|---|
| 415 | + enc_round \state, K6 |
|---|
| 416 | + enc_round \state, K7 |
|---|
| 417 | + |
|---|
| 418 | +.Lout192_\@: |
|---|
| 419 | + enc_round \state, KK |
|---|
| 420 | + aese \state\().16b, KL.16b |
|---|
| 421 | + eor \state\().16b, \state\().16b, KM.16b |
|---|
| 422 | + |
|---|
| 423 | + .subsection 1 |
|---|
| 424 | +.Lnot128_\@: |
|---|
| 425 | + ld1 {K8.4s-K9.4s}, [\tmp], #32 |
|---|
| 426 | + enc_round \state, K6 |
|---|
| 427 | + enc_round \state, K7 |
|---|
| 428 | + ld1 {K6.4s-K7.4s}, [\tmp] |
|---|
| 429 | + enc_round \state, K8 |
|---|
| 430 | + enc_round \state, K9 |
|---|
| 431 | + tbz \rounds, #1, .Lout192_\@ |
|---|
| 432 | + b .Lout256_\@ |
|---|
| 433 | + .previous |
|---|
| 398 | 434 | .endm |
|---|
| 399 | 435 | |
|---|
| 436 | + .align 6 |
|---|
| 400 | 437 | .macro pmull_gcm_do_crypt, enc |
|---|
| 401 | | - ld1 {SHASH.2d}, [x4], #16 |
|---|
| 402 | | - ld1 {HH.2d}, [x4] |
|---|
| 403 | | - ld1 {XL.2d}, [x1] |
|---|
| 404 | | - ldr x8, [x5, #8] // load lower counter |
|---|
| 438 | + stp x29, x30, [sp, #-32]! |
|---|
| 439 | + mov x29, sp |
|---|
| 440 | + str x19, [sp, #24] |
|---|
| 405 | 441 | |
|---|
| 406 | | - movi MASK.16b, #0xe1 |
|---|
| 442 | + load_round_keys x7, x6, x8 |
|---|
| 443 | + |
|---|
| 444 | + ld1 {SHASH.2d}, [x3], #16 |
|---|
| 445 | + ld1 {HH.2d-HH4.2d}, [x3] |
|---|
| 446 | + |
|---|
| 407 | 447 | trn1 SHASH2.2d, SHASH.2d, HH.2d |
|---|
| 408 | 448 | trn2 T1.2d, SHASH.2d, HH.2d |
|---|
| 409 | | -CPU_LE( rev x8, x8 ) |
|---|
| 410 | | - shl MASK.2d, MASK.2d, #57 |
|---|
| 411 | 449 | eor SHASH2.16b, SHASH2.16b, T1.16b |
|---|
| 412 | 450 | |
|---|
| 413 | | - .if \enc == 1 |
|---|
| 414 | | - ldr x10, [sp] |
|---|
| 415 | | - ld1 {KS0.16b-KS1.16b}, [x10] |
|---|
| 451 | + trn1 HH34.2d, HH3.2d, HH4.2d |
|---|
| 452 | + trn2 T1.2d, HH3.2d, HH4.2d |
|---|
| 453 | + eor HH34.16b, HH34.16b, T1.16b |
|---|
| 454 | + |
|---|
| 455 | + ld1 {XL.2d}, [x4] |
|---|
| 456 | + |
|---|
| 457 | + cbz x0, 3f // tag only? |
|---|
| 458 | + |
|---|
| 459 | + ldr w8, [x5, #12] // load lower counter |
|---|
| 460 | +CPU_LE( rev w8, w8 ) |
|---|
| 461 | + |
|---|
| 462 | +0: mov w9, #4 // max blocks per round |
|---|
| 463 | + add x10, x0, #0xf |
|---|
| 464 | + lsr x10, x10, #4 // remaining blocks |
|---|
| 465 | + |
|---|
| 466 | + subs x0, x0, #64 |
|---|
| 467 | + csel w9, w10, w9, mi |
|---|
| 468 | + add w8, w8, w9 |
|---|
| 469 | + |
|---|
| 470 | + bmi 1f |
|---|
| 471 | + ld1 {INP0.16b-INP3.16b}, [x2], #64 |
|---|
| 472 | + .subsection 1 |
|---|
| 473 | + /* |
|---|
| 474 | + * Populate the four input registers right to left with up to 63 bytes |
|---|
| 475 | + * of data, using overlapping loads to avoid branches. |
|---|
| 476 | + * |
|---|
| 477 | + * INP0 INP1 INP2 INP3 |
|---|
| 478 | + * 1 byte | | | |x | |
|---|
| 479 | + * 16 bytes | | | |xxxxxxxx| |
|---|
| 480 | + * 17 bytes | | |xxxxxxxx|x | |
|---|
| 481 | + * 47 bytes | |xxxxxxxx|xxxxxxxx|xxxxxxx | |
|---|
| 482 | + * etc etc |
|---|
| 483 | + * |
|---|
| 484 | + * Note that this code may read up to 15 bytes before the start of |
|---|
| 485 | + * the input. It is up to the calling code to ensure this is safe if |
|---|
| 486 | + * this happens in the first iteration of the loop (i.e., when the |
|---|
| 487 | + * input size is < 16 bytes) |
|---|
| 488 | + */ |
|---|
| 489 | +1: mov x15, #16 |
|---|
| 490 | + ands x19, x0, #0xf |
|---|
| 491 | + csel x19, x19, x15, ne |
|---|
| 492 | + adr_l x17, .Lpermute_table + 16 |
|---|
| 493 | + |
|---|
| 494 | + sub x11, x15, x19 |
|---|
| 495 | + add x12, x17, x11 |
|---|
| 496 | + sub x17, x17, x11 |
|---|
| 497 | + ld1 {T1.16b}, [x12] |
|---|
| 498 | + sub x10, x1, x11 |
|---|
| 499 | + sub x11, x2, x11 |
|---|
| 500 | + |
|---|
| 501 | + cmp x0, #-16 |
|---|
| 502 | + csel x14, x15, xzr, gt |
|---|
| 503 | + cmp x0, #-32 |
|---|
| 504 | + csel x15, x15, xzr, gt |
|---|
| 505 | + cmp x0, #-48 |
|---|
| 506 | + csel x16, x19, xzr, gt |
|---|
| 507 | + csel x1, x1, x10, gt |
|---|
| 508 | + csel x2, x2, x11, gt |
|---|
| 509 | + |
|---|
| 510 | + ld1 {INP0.16b}, [x2], x14 |
|---|
| 511 | + ld1 {INP1.16b}, [x2], x15 |
|---|
| 512 | + ld1 {INP2.16b}, [x2], x16 |
|---|
| 513 | + ld1 {INP3.16b}, [x2] |
|---|
| 514 | + tbl INP3.16b, {INP3.16b}, T1.16b |
|---|
| 515 | + b 2f |
|---|
| 516 | + .previous |
|---|
| 517 | + |
|---|
| 518 | +2: .if \enc == 0 |
|---|
| 519 | + bl pmull_gcm_ghash_4x |
|---|
| 416 | 520 | .endif |
|---|
| 417 | 521 | |
|---|
| 418 | | - cbnz x6, 4f |
|---|
| 522 | + bl pmull_gcm_enc_4x |
|---|
| 419 | 523 | |
|---|
| 420 | | -0: ld1 {INP0.16b-INP1.16b}, [x3], #32 |
|---|
| 524 | + tbnz x0, #63, 6f |
|---|
| 525 | + st1 {INP0.16b-INP3.16b}, [x1], #64 |
|---|
| 526 | + .if \enc == 1 |
|---|
| 527 | + bl pmull_gcm_ghash_4x |
|---|
| 528 | + .endif |
|---|
| 529 | + bne 0b |
|---|
| 421 | 530 | |
|---|
| 422 | | - rev x9, x8 |
|---|
| 423 | | - add x11, x8, #1 |
|---|
| 424 | | - add x8, x8, #2 |
|---|
| 531 | +3: ldp x19, x10, [sp, #24] |
|---|
| 532 | + cbz x10, 5f // output tag? |
|---|
| 533 | + |
|---|
| 534 | + ld1 {INP3.16b}, [x10] // load lengths[] |
|---|
| 535 | + mov w9, #1 |
|---|
| 536 | + bl pmull_gcm_ghash_4x |
|---|
| 537 | + |
|---|
| 538 | + mov w11, #(0x1 << 24) // BE '1U' |
|---|
| 539 | + ld1 {KS0.16b}, [x5] |
|---|
| 540 | + mov KS0.s[3], w11 |
|---|
| 541 | + |
|---|
| 542 | + enc_block KS0, x7, x6, x12 |
|---|
| 543 | + |
|---|
| 544 | + ext XL.16b, XL.16b, XL.16b, #8 |
|---|
| 545 | + rev64 XL.16b, XL.16b |
|---|
| 546 | + eor XL.16b, XL.16b, KS0.16b |
|---|
| 425 | 547 | |
|---|
| 426 | 548 | .if \enc == 1 |
|---|
| 427 | | - eor INP0.16b, INP0.16b, KS0.16b // encrypt input |
|---|
| 428 | | - eor INP1.16b, INP1.16b, KS1.16b |
|---|
| 549 | + st1 {XL.16b}, [x10] // store tag |
|---|
| 550 | + .else |
|---|
| 551 | + ldp x11, x12, [sp, #40] // load tag pointer and authsize |
|---|
| 552 | + adr_l x17, .Lpermute_table |
|---|
| 553 | + ld1 {KS0.16b}, [x11] // load supplied tag |
|---|
| 554 | + add x17, x17, x12 |
|---|
| 555 | + ld1 {KS1.16b}, [x17] // load permute vector |
|---|
| 556 | + |
|---|
| 557 | + cmeq XL.16b, XL.16b, KS0.16b // compare tags |
|---|
| 558 | + mvn XL.16b, XL.16b // -1 for fail, 0 for pass |
|---|
| 559 | + tbl XL.16b, {XL.16b}, KS1.16b // keep authsize bytes only |
|---|
| 560 | + sminv b0, XL.16b // signed minimum across XL |
|---|
| 561 | + smov w0, v0.b[0] // return b0 |
|---|
| 429 | 562 | .endif |
|---|
| 430 | 563 | |
|---|
| 431 | | - ld1 {KS0.8b}, [x5] // load upper counter |
|---|
| 432 | | - rev x11, x11 |
|---|
| 433 | | - sub w0, w0, #2 |
|---|
| 434 | | - mov KS1.8b, KS0.8b |
|---|
| 435 | | - ins KS0.d[1], x9 // set lower counter |
|---|
| 436 | | - ins KS1.d[1], x11 |
|---|
| 437 | | - |
|---|
| 438 | | - rev64 T1.16b, INP1.16b |
|---|
| 439 | | - |
|---|
| 440 | | - cmp w7, #12 |
|---|
| 441 | | - b.ge 2f // AES-192/256? |
|---|
| 442 | | - |
|---|
| 443 | | -1: enc_round KS0, v21 |
|---|
| 444 | | - ext IN1.16b, T1.16b, T1.16b, #8 |
|---|
| 445 | | - |
|---|
| 446 | | - enc_round KS1, v21 |
|---|
| 447 | | - pmull2 XH2.1q, SHASH.2d, IN1.2d // a1 * b1 |
|---|
| 448 | | - |
|---|
| 449 | | - enc_round KS0, v22 |
|---|
| 450 | | - eor T1.16b, T1.16b, IN1.16b |
|---|
| 451 | | - |
|---|
| 452 | | - enc_round KS1, v22 |
|---|
| 453 | | - pmull XL2.1q, SHASH.1d, IN1.1d // a0 * b0 |
|---|
| 454 | | - |
|---|
| 455 | | - enc_round KS0, v23 |
|---|
| 456 | | - pmull XM2.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0) |
|---|
| 457 | | - |
|---|
| 458 | | - enc_round KS1, v23 |
|---|
| 459 | | - rev64 T1.16b, INP0.16b |
|---|
| 460 | | - ext T2.16b, XL.16b, XL.16b, #8 |
|---|
| 461 | | - |
|---|
| 462 | | - enc_round KS0, v24 |
|---|
| 463 | | - ext IN1.16b, T1.16b, T1.16b, #8 |
|---|
| 464 | | - eor T1.16b, T1.16b, T2.16b |
|---|
| 465 | | - |
|---|
| 466 | | - enc_round KS1, v24 |
|---|
| 467 | | - eor XL.16b, XL.16b, IN1.16b |
|---|
| 468 | | - |
|---|
| 469 | | - enc_round KS0, v25 |
|---|
| 470 | | - eor T1.16b, T1.16b, XL.16b |
|---|
| 471 | | - |
|---|
| 472 | | - enc_round KS1, v25 |
|---|
| 473 | | - pmull2 XH.1q, HH.2d, XL.2d // a1 * b1 |
|---|
| 474 | | - |
|---|
| 475 | | - enc_round KS0, v26 |
|---|
| 476 | | - pmull XL.1q, HH.1d, XL.1d // a0 * b0 |
|---|
| 477 | | - |
|---|
| 478 | | - enc_round KS1, v26 |
|---|
| 479 | | - pmull2 XM.1q, SHASH2.2d, T1.2d // (a1 + a0)(b1 + b0) |
|---|
| 480 | | - |
|---|
| 481 | | - enc_round KS0, v27 |
|---|
| 482 | | - eor XL.16b, XL.16b, XL2.16b |
|---|
| 483 | | - eor XH.16b, XH.16b, XH2.16b |
|---|
| 484 | | - |
|---|
| 485 | | - enc_round KS1, v27 |
|---|
| 486 | | - eor XM.16b, XM.16b, XM2.16b |
|---|
| 487 | | - ext T1.16b, XL.16b, XH.16b, #8 |
|---|
| 488 | | - |
|---|
| 489 | | - enc_round KS0, v28 |
|---|
| 490 | | - eor T2.16b, XL.16b, XH.16b |
|---|
| 491 | | - eor XM.16b, XM.16b, T1.16b |
|---|
| 492 | | - |
|---|
| 493 | | - enc_round KS1, v28 |
|---|
| 494 | | - eor XM.16b, XM.16b, T2.16b |
|---|
| 495 | | - |
|---|
| 496 | | - enc_round KS0, v29 |
|---|
| 497 | | - pmull T2.1q, XL.1d, MASK.1d |
|---|
| 498 | | - |
|---|
| 499 | | - enc_round KS1, v29 |
|---|
| 500 | | - mov XH.d[0], XM.d[1] |
|---|
| 501 | | - mov XM.d[1], XL.d[0] |
|---|
| 502 | | - |
|---|
| 503 | | - aese KS0.16b, v30.16b |
|---|
| 504 | | - eor XL.16b, XM.16b, T2.16b |
|---|
| 505 | | - |
|---|
| 506 | | - aese KS1.16b, v30.16b |
|---|
| 507 | | - ext T2.16b, XL.16b, XL.16b, #8 |
|---|
| 508 | | - |
|---|
| 509 | | - eor KS0.16b, KS0.16b, v31.16b |
|---|
| 510 | | - pmull XL.1q, XL.1d, MASK.1d |
|---|
| 511 | | - eor T2.16b, T2.16b, XH.16b |
|---|
| 512 | | - |
|---|
| 513 | | - eor KS1.16b, KS1.16b, v31.16b |
|---|
| 514 | | - eor XL.16b, XL.16b, T2.16b |
|---|
| 515 | | - |
|---|
| 516 | | - .if \enc == 0 |
|---|
| 517 | | - eor INP0.16b, INP0.16b, KS0.16b |
|---|
| 518 | | - eor INP1.16b, INP1.16b, KS1.16b |
|---|
| 519 | | - .endif |
|---|
| 520 | | - |
|---|
| 521 | | - st1 {INP0.16b-INP1.16b}, [x2], #32 |
|---|
| 522 | | - |
|---|
| 523 | | - cbnz w0, 0b |
|---|
| 524 | | - |
|---|
| 525 | | -CPU_LE( rev x8, x8 ) |
|---|
| 526 | | - st1 {XL.2d}, [x1] |
|---|
| 527 | | - str x8, [x5, #8] // store lower counter |
|---|
| 528 | | - |
|---|
| 529 | | - .if \enc == 1 |
|---|
| 530 | | - st1 {KS0.16b-KS1.16b}, [x10] |
|---|
| 531 | | - .endif |
|---|
| 532 | | - |
|---|
| 564 | +4: ldp x29, x30, [sp], #32 |
|---|
| 533 | 565 | ret |
|---|
| 534 | 566 | |
|---|
| 535 | | -2: b.eq 3f // AES-192? |
|---|
| 536 | | - enc_round KS0, v17 |
|---|
| 537 | | - enc_round KS1, v17 |
|---|
| 538 | | - enc_round KS0, v18 |
|---|
| 539 | | - enc_round KS1, v18 |
|---|
| 540 | | -3: enc_round KS0, v19 |
|---|
| 541 | | - enc_round KS1, v19 |
|---|
| 542 | | - enc_round KS0, v20 |
|---|
| 543 | | - enc_round KS1, v20 |
|---|
| 544 | | - b 1b |
|---|
| 567 | +5: |
|---|
| 568 | +CPU_LE( rev w8, w8 ) |
|---|
| 569 | + str w8, [x5, #12] // store lower counter |
|---|
| 570 | + st1 {XL.2d}, [x4] |
|---|
| 571 | + b 4b |
|---|
| 545 | 572 | |
|---|
| 546 | | -4: load_round_keys w7, x6 |
|---|
| 547 | | - b 0b |
|---|
| 573 | +6: ld1 {T1.16b-T2.16b}, [x17], #32 // permute vectors |
|---|
| 574 | + sub x17, x17, x19, lsl #1 |
|---|
| 575 | + |
|---|
| 576 | + cmp w9, #1 |
|---|
| 577 | + beq 7f |
|---|
| 578 | + .subsection 1 |
|---|
| 579 | +7: ld1 {INP2.16b}, [x1] |
|---|
| 580 | + tbx INP2.16b, {INP3.16b}, T1.16b |
|---|
| 581 | + mov INP3.16b, INP2.16b |
|---|
| 582 | + b 8f |
|---|
| 583 | + .previous |
|---|
| 584 | + |
|---|
| 585 | + st1 {INP0.16b}, [x1], x14 |
|---|
| 586 | + st1 {INP1.16b}, [x1], x15 |
|---|
| 587 | + st1 {INP2.16b}, [x1], x16 |
|---|
| 588 | + tbl INP3.16b, {INP3.16b}, T1.16b |
|---|
| 589 | + tbx INP3.16b, {INP2.16b}, T2.16b |
|---|
| 590 | +8: st1 {INP3.16b}, [x1] |
|---|
| 591 | + |
|---|
| 592 | + .if \enc == 1 |
|---|
| 593 | + ld1 {T1.16b}, [x17] |
|---|
| 594 | + tbl INP3.16b, {INP3.16b}, T1.16b // clear non-data bits |
|---|
| 595 | + bl pmull_gcm_ghash_4x |
|---|
| 596 | + .endif |
|---|
| 597 | + b 3b |
|---|
| 548 | 598 | .endm |
|---|
| 549 | 599 | |
|---|
| 550 | 600 | /* |
|---|
| 551 | | - * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[], |
|---|
| 552 | | - * struct ghash_key const *k, u8 ctr[], |
|---|
| 553 | | - * int rounds, u8 ks[]) |
|---|
| 601 | + * void pmull_gcm_encrypt(int bytes, u8 dst[], const u8 src[], |
|---|
| 602 | + * struct ghash_key const *k, u64 dg[], u8 ctr[], |
|---|
| 603 | + * int rounds, u8 tag) |
|---|
| 554 | 604 | */ |
|---|
| 555 | | -ENTRY(pmull_gcm_encrypt) |
|---|
| 605 | +SYM_FUNC_START(pmull_gcm_encrypt) |
|---|
| 556 | 606 | pmull_gcm_do_crypt 1 |
|---|
| 557 | | -ENDPROC(pmull_gcm_encrypt) |
|---|
| 607 | +SYM_FUNC_END(pmull_gcm_encrypt) |
|---|
| 558 | 608 | |
|---|
| 559 | 609 | /* |
|---|
| 560 | | - * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[], |
|---|
| 561 | | - * struct ghash_key const *k, u8 ctr[], |
|---|
| 562 | | - * int rounds) |
|---|
| 610 | + * void pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[], |
|---|
| 611 | + * struct ghash_key const *k, u64 dg[], u8 ctr[], |
|---|
| 612 | + * int rounds, u8 tag) |
|---|
| 563 | 613 | */ |
|---|
| 564 | | -ENTRY(pmull_gcm_decrypt) |
|---|
| 614 | +SYM_FUNC_START(pmull_gcm_decrypt) |
|---|
| 565 | 615 | pmull_gcm_do_crypt 0 |
|---|
| 566 | | -ENDPROC(pmull_gcm_decrypt) |
|---|
| 616 | +SYM_FUNC_END(pmull_gcm_decrypt) |
|---|
| 567 | 617 | |
|---|
| 568 | | - /* |
|---|
| 569 | | - * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds) |
|---|
| 570 | | - */ |
|---|
| 571 | | -ENTRY(pmull_gcm_encrypt_block) |
|---|
| 572 | | - cbz x2, 0f |
|---|
| 573 | | - load_round_keys w3, x2 |
|---|
| 574 | | -0: ld1 {v0.16b}, [x1] |
|---|
| 575 | | - enc_block v0, w3 |
|---|
| 576 | | - st1 {v0.16b}, [x0] |
|---|
| 618 | +SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x) |
|---|
| 619 | + movi MASK.16b, #0xe1 |
|---|
| 620 | + shl MASK.2d, MASK.2d, #57 |
|---|
| 621 | + |
|---|
| 622 | + rev64 T1.16b, INP0.16b |
|---|
| 623 | + rev64 T2.16b, INP1.16b |
|---|
| 624 | + rev64 TT3.16b, INP2.16b |
|---|
| 625 | + rev64 TT4.16b, INP3.16b |
|---|
| 626 | + |
|---|
| 627 | + ext XL.16b, XL.16b, XL.16b, #8 |
|---|
| 628 | + |
|---|
| 629 | + tbz w9, #2, 0f // <4 blocks? |
|---|
| 630 | + .subsection 1 |
|---|
| 631 | +0: movi XH2.16b, #0 |
|---|
| 632 | + movi XM2.16b, #0 |
|---|
| 633 | + movi XL2.16b, #0 |
|---|
| 634 | + |
|---|
| 635 | + tbz w9, #0, 1f // 2 blocks? |
|---|
| 636 | + tbz w9, #1, 2f // 1 block? |
|---|
| 637 | + |
|---|
| 638 | + eor T2.16b, T2.16b, XL.16b |
|---|
| 639 | + ext T1.16b, T2.16b, T2.16b, #8 |
|---|
| 640 | + b .Lgh3 |
|---|
| 641 | + |
|---|
| 642 | +1: eor TT3.16b, TT3.16b, XL.16b |
|---|
| 643 | + ext T2.16b, TT3.16b, TT3.16b, #8 |
|---|
| 644 | + b .Lgh2 |
|---|
| 645 | + |
|---|
| 646 | +2: eor TT4.16b, TT4.16b, XL.16b |
|---|
| 647 | + ext IN1.16b, TT4.16b, TT4.16b, #8 |
|---|
| 648 | + b .Lgh1 |
|---|
| 649 | + .previous |
|---|
| 650 | + |
|---|
| 651 | + eor T1.16b, T1.16b, XL.16b |
|---|
| 652 | + ext IN1.16b, T1.16b, T1.16b, #8 |
|---|
| 653 | + |
|---|
| 654 | + pmull2 XH2.1q, HH4.2d, IN1.2d // a1 * b1 |
|---|
| 655 | + eor T1.16b, T1.16b, IN1.16b |
|---|
| 656 | + pmull XL2.1q, HH4.1d, IN1.1d // a0 * b0 |
|---|
| 657 | + pmull2 XM2.1q, HH34.2d, T1.2d // (a1 + a0)(b1 + b0) |
|---|
| 658 | + |
|---|
| 659 | + ext T1.16b, T2.16b, T2.16b, #8 |
|---|
| 660 | +.Lgh3: eor T2.16b, T2.16b, T1.16b |
|---|
| 661 | + pmull2 XH.1q, HH3.2d, T1.2d // a1 * b1 |
|---|
| 662 | + pmull XL.1q, HH3.1d, T1.1d // a0 * b0 |
|---|
| 663 | + pmull XM.1q, HH34.1d, T2.1d // (a1 + a0)(b1 + b0) |
|---|
| 664 | + |
|---|
| 665 | + eor XH2.16b, XH2.16b, XH.16b |
|---|
| 666 | + eor XL2.16b, XL2.16b, XL.16b |
|---|
| 667 | + eor XM2.16b, XM2.16b, XM.16b |
|---|
| 668 | + |
|---|
| 669 | + ext T2.16b, TT3.16b, TT3.16b, #8 |
|---|
| 670 | +.Lgh2: eor TT3.16b, TT3.16b, T2.16b |
|---|
| 671 | + pmull2 XH.1q, HH.2d, T2.2d // a1 * b1 |
|---|
| 672 | + pmull XL.1q, HH.1d, T2.1d // a0 * b0 |
|---|
| 673 | + pmull2 XM.1q, SHASH2.2d, TT3.2d // (a1 + a0)(b1 + b0) |
|---|
| 674 | + |
|---|
| 675 | + eor XH2.16b, XH2.16b, XH.16b |
|---|
| 676 | + eor XL2.16b, XL2.16b, XL.16b |
|---|
| 677 | + eor XM2.16b, XM2.16b, XM.16b |
|---|
| 678 | + |
|---|
| 679 | + ext IN1.16b, TT4.16b, TT4.16b, #8 |
|---|
| 680 | +.Lgh1: eor TT4.16b, TT4.16b, IN1.16b |
|---|
| 681 | + pmull XL.1q, SHASH.1d, IN1.1d // a0 * b0 |
|---|
| 682 | + pmull2 XH.1q, SHASH.2d, IN1.2d // a1 * b1 |
|---|
| 683 | + pmull XM.1q, SHASH2.1d, TT4.1d // (a1 + a0)(b1 + b0) |
|---|
| 684 | + |
|---|
| 685 | + eor XH.16b, XH.16b, XH2.16b |
|---|
| 686 | + eor XL.16b, XL.16b, XL2.16b |
|---|
| 687 | + eor XM.16b, XM.16b, XM2.16b |
|---|
| 688 | + |
|---|
| 689 | + eor T2.16b, XL.16b, XH.16b |
|---|
| 690 | + ext T1.16b, XL.16b, XH.16b, #8 |
|---|
| 691 | + eor XM.16b, XM.16b, T2.16b |
|---|
| 692 | + |
|---|
| 693 | + __pmull_reduce_p64 |
|---|
| 694 | + |
|---|
| 695 | + eor T2.16b, T2.16b, XH.16b |
|---|
| 696 | + eor XL.16b, XL.16b, T2.16b |
|---|
| 697 | + |
|---|
| 577 | 698 | ret |
|---|
| 578 | | -ENDPROC(pmull_gcm_encrypt_block) |
|---|
| 699 | +SYM_FUNC_END(pmull_gcm_ghash_4x) |
|---|
| 700 | + |
|---|
| 701 | +SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x) |
|---|
| 702 | + ld1 {KS0.16b}, [x5] // load upper counter |
|---|
| 703 | + sub w10, w8, #4 |
|---|
| 704 | + sub w11, w8, #3 |
|---|
| 705 | + sub w12, w8, #2 |
|---|
| 706 | + sub w13, w8, #1 |
|---|
| 707 | + rev w10, w10 |
|---|
| 708 | + rev w11, w11 |
|---|
| 709 | + rev w12, w12 |
|---|
| 710 | + rev w13, w13 |
|---|
| 711 | + mov KS1.16b, KS0.16b |
|---|
| 712 | + mov KS2.16b, KS0.16b |
|---|
| 713 | + mov KS3.16b, KS0.16b |
|---|
| 714 | + ins KS0.s[3], w10 // set lower counter |
|---|
| 715 | + ins KS1.s[3], w11 |
|---|
| 716 | + ins KS2.s[3], w12 |
|---|
| 717 | + ins KS3.s[3], w13 |
|---|
| 718 | + |
|---|
| 719 | + add x10, x6, #96 // round key pointer |
|---|
| 720 | + ld1 {K6.4s-K7.4s}, [x10], #32 |
|---|
| 721 | + .irp key, K0, K1, K2, K3, K4, K5 |
|---|
| 722 | + enc_qround KS0, KS1, KS2, KS3, \key |
|---|
| 723 | + .endr |
|---|
| 724 | + |
|---|
| 725 | + tbnz x7, #2, .Lnot128 |
|---|
| 726 | + .subsection 1 |
|---|
| 727 | +.Lnot128: |
|---|
| 728 | + ld1 {K8.4s-K9.4s}, [x10], #32 |
|---|
| 729 | + .irp key, K6, K7 |
|---|
| 730 | + enc_qround KS0, KS1, KS2, KS3, \key |
|---|
| 731 | + .endr |
|---|
| 732 | + ld1 {K6.4s-K7.4s}, [x10] |
|---|
| 733 | + .irp key, K8, K9 |
|---|
| 734 | + enc_qround KS0, KS1, KS2, KS3, \key |
|---|
| 735 | + .endr |
|---|
| 736 | + tbz x7, #1, .Lout192 |
|---|
| 737 | + b .Lout256 |
|---|
| 738 | + .previous |
|---|
| 739 | + |
|---|
| 740 | +.Lout256: |
|---|
| 741 | + .irp key, K6, K7 |
|---|
| 742 | + enc_qround KS0, KS1, KS2, KS3, \key |
|---|
| 743 | + .endr |
|---|
| 744 | + |
|---|
| 745 | +.Lout192: |
|---|
| 746 | + enc_qround KS0, KS1, KS2, KS3, KK |
|---|
| 747 | + |
|---|
| 748 | + aese KS0.16b, KL.16b |
|---|
| 749 | + aese KS1.16b, KL.16b |
|---|
| 750 | + aese KS2.16b, KL.16b |
|---|
| 751 | + aese KS3.16b, KL.16b |
|---|
| 752 | + |
|---|
| 753 | + eor KS0.16b, KS0.16b, KM.16b |
|---|
| 754 | + eor KS1.16b, KS1.16b, KM.16b |
|---|
| 755 | + eor KS2.16b, KS2.16b, KM.16b |
|---|
| 756 | + eor KS3.16b, KS3.16b, KM.16b |
|---|
| 757 | + |
|---|
| 758 | + eor INP0.16b, INP0.16b, KS0.16b |
|---|
| 759 | + eor INP1.16b, INP1.16b, KS1.16b |
|---|
| 760 | + eor INP2.16b, INP2.16b, KS2.16b |
|---|
| 761 | + eor INP3.16b, INP3.16b, KS3.16b |
|---|
| 762 | + |
|---|
| 763 | + ret |
|---|
| 764 | +SYM_FUNC_END(pmull_gcm_enc_4x) |
|---|
| 765 | + |
|---|
| 766 | + .section ".rodata", "a" |
|---|
| 767 | + .align 6 |
|---|
| 768 | +.Lpermute_table: |
|---|
| 769 | + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
|---|
| 770 | + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
|---|
| 771 | + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 |
|---|
| 772 | + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf |
|---|
| 773 | + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
|---|
| 774 | + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
|---|
| 775 | + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 |
|---|
| 776 | + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf |
|---|
| 777 | + .previous |
|---|