.. | .. |
---|
| 1 | +/* SPDX-License-Identifier: GPL-2.0-only */ |
---|
1 | 2 | /* |
---|
2 | 3 | * Accelerated GHASH implementation with ARMv8 PMULL instructions. |
---|
3 | 4 | * |
---|
4 | 5 | * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org> |
---|
5 | | - * |
---|
6 | | - * This program is free software; you can redistribute it and/or modify it |
---|
7 | | - * under the terms of the GNU General Public License version 2 as published |
---|
8 | | - * by the Free Software Foundation. |
---|
9 | 6 | */ |
---|
10 | 7 | |
---|
11 | 8 | #include <linux/linkage.h> |
---|
.. | .. |
---|
16 | 13 | T1 .req v2 |
---|
17 | 14 | T2 .req v3 |
---|
18 | 15 | MASK .req v4 |
---|
19 | | - XL .req v5 |
---|
20 | | - XM .req v6 |
---|
| 16 | + XM .req v5 |
---|
| 17 | + XL .req v6 |
---|
21 | 18 | XH .req v7 |
---|
22 | 19 | IN1 .req v7 |
---|
23 | 20 | |
---|
.. | .. |
---|
353 | 350 | * void pmull_ghash_update(int blocks, u64 dg[], const char *src, |
---|
354 | 351 | * struct ghash_key const *k, const char *head) |
---|
355 | 352 | */ |
---|
356 | | -ENTRY(pmull_ghash_update_p64) |
---|
| 353 | +SYM_FUNC_START(pmull_ghash_update_p64) |
---|
357 | 354 | __pmull_ghash p64 |
---|
358 | | -ENDPROC(pmull_ghash_update_p64) |
---|
| 355 | +SYM_FUNC_END(pmull_ghash_update_p64) |
---|
359 | 356 | |
---|
360 | | -ENTRY(pmull_ghash_update_p8) |
---|
| 357 | +SYM_FUNC_START(pmull_ghash_update_p8) |
---|
361 | 358 | __pmull_ghash p8 |
---|
362 | | -ENDPROC(pmull_ghash_update_p8) |
---|
| 359 | +SYM_FUNC_END(pmull_ghash_update_p8) |
---|
363 | 360 | |
---|
364 | | - KS0 .req v12 |
---|
365 | | - KS1 .req v13 |
---|
366 | | - INP0 .req v14 |
---|
367 | | - INP1 .req v15 |
---|
| 361 | + KS0 .req v8 |
---|
| 362 | + KS1 .req v9 |
---|
| 363 | + KS2 .req v10 |
---|
| 364 | + KS3 .req v11 |
---|
368 | 365 | |
---|
369 | | - .macro load_round_keys, rounds, rk |
---|
370 | | - cmp \rounds, #12 |
---|
371 | | - blo 2222f /* 128 bits */ |
---|
372 | | - beq 1111f /* 192 bits */ |
---|
373 | | - ld1 {v17.4s-v18.4s}, [\rk], #32 |
---|
374 | | -1111: ld1 {v19.4s-v20.4s}, [\rk], #32 |
---|
375 | | -2222: ld1 {v21.4s-v24.4s}, [\rk], #64 |
---|
376 | | - ld1 {v25.4s-v28.4s}, [\rk], #64 |
---|
377 | | - ld1 {v29.4s-v31.4s}, [\rk] |
---|
| 366 | + INP0 .req v21 |
---|
| 367 | + INP1 .req v22 |
---|
| 368 | + INP2 .req v23 |
---|
| 369 | + INP3 .req v24 |
---|
| 370 | + |
---|
| 371 | + K0 .req v25 |
---|
| 372 | + K1 .req v26 |
---|
| 373 | + K2 .req v27 |
---|
| 374 | + K3 .req v28 |
---|
| 375 | + K4 .req v12 |
---|
| 376 | + K5 .req v13 |
---|
| 377 | + K6 .req v4 |
---|
| 378 | + K7 .req v5 |
---|
| 379 | + K8 .req v14 |
---|
| 380 | + K9 .req v15 |
---|
| 381 | + KK .req v29 |
---|
| 382 | + KL .req v30 |
---|
| 383 | + KM .req v31 |
---|
| 384 | + |
---|
| 385 | + .macro load_round_keys, rounds, rk, tmp |
---|
| 386 | + add \tmp, \rk, #64 |
---|
| 387 | + ld1 {K0.4s-K3.4s}, [\rk] |
---|
| 388 | + ld1 {K4.4s-K5.4s}, [\tmp] |
---|
| 389 | + add \tmp, \rk, \rounds, lsl #4 |
---|
| 390 | + sub \tmp, \tmp, #32 |
---|
| 391 | + ld1 {KK.4s-KM.4s}, [\tmp] |
---|
378 | 392 | .endm |
---|
379 | 393 | |
---|
380 | 394 | .macro enc_round, state, key |
---|
.. | .. |
---|
382 | 396 | aesmc \state\().16b, \state\().16b |
---|
383 | 397 | .endm |
---|
384 | 398 | |
---|
385 | | - .macro enc_block, state, rounds |
---|
386 | | - cmp \rounds, #12 |
---|
387 | | - b.lo 2222f /* 128 bits */ |
---|
388 | | - b.eq 1111f /* 192 bits */ |
---|
389 | | - enc_round \state, v17 |
---|
390 | | - enc_round \state, v18 |
---|
391 | | -1111: enc_round \state, v19 |
---|
392 | | - enc_round \state, v20 |
---|
393 | | -2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29 |
---|
| 399 | + .macro enc_qround, s0, s1, s2, s3, key |
---|
| 400 | + enc_round \s0, \key |
---|
| 401 | + enc_round \s1, \key |
---|
| 402 | + enc_round \s2, \key |
---|
| 403 | + enc_round \s3, \key |
---|
| 404 | + .endm |
---|
| 405 | + |
---|
| 406 | + .macro enc_block, state, rounds, rk, tmp |
---|
| 407 | + add \tmp, \rk, #96 |
---|
| 408 | + ld1 {K6.4s-K7.4s}, [\tmp], #32 |
---|
| 409 | + .irp key, K0, K1, K2, K3, K4 K5 |
---|
394 | 410 | enc_round \state, \key |
---|
395 | 411 | .endr |
---|
396 | | - aese \state\().16b, v30.16b |
---|
397 | | - eor \state\().16b, \state\().16b, v31.16b |
---|
| 412 | + |
---|
| 413 | + tbnz \rounds, #2, .Lnot128_\@ |
---|
| 414 | +.Lout256_\@: |
---|
| 415 | + enc_round \state, K6 |
---|
| 416 | + enc_round \state, K7 |
---|
| 417 | + |
---|
| 418 | +.Lout192_\@: |
---|
| 419 | + enc_round \state, KK |
---|
| 420 | + aese \state\().16b, KL.16b |
---|
| 421 | + eor \state\().16b, \state\().16b, KM.16b |
---|
| 422 | + |
---|
| 423 | + .subsection 1 |
---|
| 424 | +.Lnot128_\@: |
---|
| 425 | + ld1 {K8.4s-K9.4s}, [\tmp], #32 |
---|
| 426 | + enc_round \state, K6 |
---|
| 427 | + enc_round \state, K7 |
---|
| 428 | + ld1 {K6.4s-K7.4s}, [\tmp] |
---|
| 429 | + enc_round \state, K8 |
---|
| 430 | + enc_round \state, K9 |
---|
| 431 | + tbz \rounds, #1, .Lout192_\@ |
---|
| 432 | + b .Lout256_\@ |
---|
| 433 | + .previous |
---|
398 | 434 | .endm |
---|
399 | 435 | |
---|
| 436 | + .align 6 |
---|
400 | 437 | .macro pmull_gcm_do_crypt, enc |
---|
401 | | - ld1 {SHASH.2d}, [x4], #16 |
---|
402 | | - ld1 {HH.2d}, [x4] |
---|
403 | | - ld1 {XL.2d}, [x1] |
---|
404 | | - ldr x8, [x5, #8] // load lower counter |
---|
| 438 | + stp x29, x30, [sp, #-32]! |
---|
| 439 | + mov x29, sp |
---|
| 440 | + str x19, [sp, #24] |
---|
405 | 441 | |
---|
406 | | - movi MASK.16b, #0xe1 |
---|
| 442 | + load_round_keys x7, x6, x8 |
---|
| 443 | + |
---|
| 444 | + ld1 {SHASH.2d}, [x3], #16 |
---|
| 445 | + ld1 {HH.2d-HH4.2d}, [x3] |
---|
| 446 | + |
---|
407 | 447 | trn1 SHASH2.2d, SHASH.2d, HH.2d |
---|
408 | 448 | trn2 T1.2d, SHASH.2d, HH.2d |
---|
409 | | -CPU_LE( rev x8, x8 ) |
---|
410 | | - shl MASK.2d, MASK.2d, #57 |
---|
411 | 449 | eor SHASH2.16b, SHASH2.16b, T1.16b |
---|
412 | 450 | |
---|
413 | | - .if \enc == 1 |
---|
414 | | - ldr x10, [sp] |
---|
415 | | - ld1 {KS0.16b-KS1.16b}, [x10] |
---|
| 451 | + trn1 HH34.2d, HH3.2d, HH4.2d |
---|
| 452 | + trn2 T1.2d, HH3.2d, HH4.2d |
---|
| 453 | + eor HH34.16b, HH34.16b, T1.16b |
---|
| 454 | + |
---|
| 455 | + ld1 {XL.2d}, [x4] |
---|
| 456 | + |
---|
| 457 | + cbz x0, 3f // tag only? |
---|
| 458 | + |
---|
| 459 | + ldr w8, [x5, #12] // load lower counter |
---|
| 460 | +CPU_LE( rev w8, w8 ) |
---|
| 461 | + |
---|
| 462 | +0: mov w9, #4 // max blocks per round |
---|
| 463 | + add x10, x0, #0xf |
---|
| 464 | + lsr x10, x10, #4 // remaining blocks |
---|
| 465 | + |
---|
| 466 | + subs x0, x0, #64 |
---|
| 467 | + csel w9, w10, w9, mi |
---|
| 468 | + add w8, w8, w9 |
---|
| 469 | + |
---|
| 470 | + bmi 1f |
---|
| 471 | + ld1 {INP0.16b-INP3.16b}, [x2], #64 |
---|
| 472 | + .subsection 1 |
---|
| 473 | + /* |
---|
| 474 | + * Populate the four input registers right to left with up to 63 bytes |
---|
| 475 | + * of data, using overlapping loads to avoid branches. |
---|
| 476 | + * |
---|
| 477 | + * INP0 INP1 INP2 INP3 |
---|
| 478 | + * 1 byte | | | |x | |
---|
| 479 | + * 16 bytes | | | |xxxxxxxx| |
---|
| 480 | + * 17 bytes | | |xxxxxxxx|x | |
---|
| 481 | + * 47 bytes | |xxxxxxxx|xxxxxxxx|xxxxxxx | |
---|
| 482 | + * etc etc |
---|
| 483 | + * |
---|
| 484 | + * Note that this code may read up to 15 bytes before the start of |
---|
| 485 | + * the input. It is up to the calling code to ensure this is safe if |
---|
| 486 | + * this happens in the first iteration of the loop (i.e., when the |
---|
| 487 | + * input size is < 16 bytes) |
---|
| 488 | + */ |
---|
| 489 | +1: mov x15, #16 |
---|
| 490 | + ands x19, x0, #0xf |
---|
| 491 | + csel x19, x19, x15, ne |
---|
| 492 | + adr_l x17, .Lpermute_table + 16 |
---|
| 493 | + |
---|
| 494 | + sub x11, x15, x19 |
---|
| 495 | + add x12, x17, x11 |
---|
| 496 | + sub x17, x17, x11 |
---|
| 497 | + ld1 {T1.16b}, [x12] |
---|
| 498 | + sub x10, x1, x11 |
---|
| 499 | + sub x11, x2, x11 |
---|
| 500 | + |
---|
| 501 | + cmp x0, #-16 |
---|
| 502 | + csel x14, x15, xzr, gt |
---|
| 503 | + cmp x0, #-32 |
---|
| 504 | + csel x15, x15, xzr, gt |
---|
| 505 | + cmp x0, #-48 |
---|
| 506 | + csel x16, x19, xzr, gt |
---|
| 507 | + csel x1, x1, x10, gt |
---|
| 508 | + csel x2, x2, x11, gt |
---|
| 509 | + |
---|
| 510 | + ld1 {INP0.16b}, [x2], x14 |
---|
| 511 | + ld1 {INP1.16b}, [x2], x15 |
---|
| 512 | + ld1 {INP2.16b}, [x2], x16 |
---|
| 513 | + ld1 {INP3.16b}, [x2] |
---|
| 514 | + tbl INP3.16b, {INP3.16b}, T1.16b |
---|
| 515 | + b 2f |
---|
| 516 | + .previous |
---|
| 517 | + |
---|
| 518 | +2: .if \enc == 0 |
---|
| 519 | + bl pmull_gcm_ghash_4x |
---|
416 | 520 | .endif |
---|
417 | 521 | |
---|
418 | | - cbnz x6, 4f |
---|
| 522 | + bl pmull_gcm_enc_4x |
---|
419 | 523 | |
---|
420 | | -0: ld1 {INP0.16b-INP1.16b}, [x3], #32 |
---|
| 524 | + tbnz x0, #63, 6f |
---|
| 525 | + st1 {INP0.16b-INP3.16b}, [x1], #64 |
---|
| 526 | + .if \enc == 1 |
---|
| 527 | + bl pmull_gcm_ghash_4x |
---|
| 528 | + .endif |
---|
| 529 | + bne 0b |
---|
421 | 530 | |
---|
422 | | - rev x9, x8 |
---|
423 | | - add x11, x8, #1 |
---|
424 | | - add x8, x8, #2 |
---|
| 531 | +3: ldp x19, x10, [sp, #24] |
---|
| 532 | + cbz x10, 5f // output tag? |
---|
| 533 | + |
---|
| 534 | + ld1 {INP3.16b}, [x10] // load lengths[] |
---|
| 535 | + mov w9, #1 |
---|
| 536 | + bl pmull_gcm_ghash_4x |
---|
| 537 | + |
---|
| 538 | + mov w11, #(0x1 << 24) // BE '1U' |
---|
| 539 | + ld1 {KS0.16b}, [x5] |
---|
| 540 | + mov KS0.s[3], w11 |
---|
| 541 | + |
---|
| 542 | + enc_block KS0, x7, x6, x12 |
---|
| 543 | + |
---|
| 544 | + ext XL.16b, XL.16b, XL.16b, #8 |
---|
| 545 | + rev64 XL.16b, XL.16b |
---|
| 546 | + eor XL.16b, XL.16b, KS0.16b |
---|
425 | 547 | |
---|
426 | 548 | .if \enc == 1 |
---|
427 | | - eor INP0.16b, INP0.16b, KS0.16b // encrypt input |
---|
428 | | - eor INP1.16b, INP1.16b, KS1.16b |
---|
| 549 | + st1 {XL.16b}, [x10] // store tag |
---|
| 550 | + .else |
---|
| 551 | + ldp x11, x12, [sp, #40] // load tag pointer and authsize |
---|
| 552 | + adr_l x17, .Lpermute_table |
---|
| 553 | + ld1 {KS0.16b}, [x11] // load supplied tag |
---|
| 554 | + add x17, x17, x12 |
---|
| 555 | + ld1 {KS1.16b}, [x17] // load permute vector |
---|
| 556 | + |
---|
| 557 | + cmeq XL.16b, XL.16b, KS0.16b // compare tags |
---|
| 558 | + mvn XL.16b, XL.16b // -1 for fail, 0 for pass |
---|
| 559 | + tbl XL.16b, {XL.16b}, KS1.16b // keep authsize bytes only |
---|
| 560 | + sminv b0, XL.16b // signed minimum across XL |
---|
| 561 | + smov w0, v0.b[0] // return b0 |
---|
429 | 562 | .endif |
---|
430 | 563 | |
---|
431 | | - ld1 {KS0.8b}, [x5] // load upper counter |
---|
432 | | - rev x11, x11 |
---|
433 | | - sub w0, w0, #2 |
---|
434 | | - mov KS1.8b, KS0.8b |
---|
435 | | - ins KS0.d[1], x9 // set lower counter |
---|
436 | | - ins KS1.d[1], x11 |
---|
437 | | - |
---|
438 | | - rev64 T1.16b, INP1.16b |
---|
439 | | - |
---|
440 | | - cmp w7, #12 |
---|
441 | | - b.ge 2f // AES-192/256? |
---|
442 | | - |
---|
443 | | -1: enc_round KS0, v21 |
---|
444 | | - ext IN1.16b, T1.16b, T1.16b, #8 |
---|
445 | | - |
---|
446 | | - enc_round KS1, v21 |
---|
447 | | - pmull2 XH2.1q, SHASH.2d, IN1.2d // a1 * b1 |
---|
448 | | - |
---|
449 | | - enc_round KS0, v22 |
---|
450 | | - eor T1.16b, T1.16b, IN1.16b |
---|
451 | | - |
---|
452 | | - enc_round KS1, v22 |
---|
453 | | - pmull XL2.1q, SHASH.1d, IN1.1d // a0 * b0 |
---|
454 | | - |
---|
455 | | - enc_round KS0, v23 |
---|
456 | | - pmull XM2.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0) |
---|
457 | | - |
---|
458 | | - enc_round KS1, v23 |
---|
459 | | - rev64 T1.16b, INP0.16b |
---|
460 | | - ext T2.16b, XL.16b, XL.16b, #8 |
---|
461 | | - |
---|
462 | | - enc_round KS0, v24 |
---|
463 | | - ext IN1.16b, T1.16b, T1.16b, #8 |
---|
464 | | - eor T1.16b, T1.16b, T2.16b |
---|
465 | | - |
---|
466 | | - enc_round KS1, v24 |
---|
467 | | - eor XL.16b, XL.16b, IN1.16b |
---|
468 | | - |
---|
469 | | - enc_round KS0, v25 |
---|
470 | | - eor T1.16b, T1.16b, XL.16b |
---|
471 | | - |
---|
472 | | - enc_round KS1, v25 |
---|
473 | | - pmull2 XH.1q, HH.2d, XL.2d // a1 * b1 |
---|
474 | | - |
---|
475 | | - enc_round KS0, v26 |
---|
476 | | - pmull XL.1q, HH.1d, XL.1d // a0 * b0 |
---|
477 | | - |
---|
478 | | - enc_round KS1, v26 |
---|
479 | | - pmull2 XM.1q, SHASH2.2d, T1.2d // (a1 + a0)(b1 + b0) |
---|
480 | | - |
---|
481 | | - enc_round KS0, v27 |
---|
482 | | - eor XL.16b, XL.16b, XL2.16b |
---|
483 | | - eor XH.16b, XH.16b, XH2.16b |
---|
484 | | - |
---|
485 | | - enc_round KS1, v27 |
---|
486 | | - eor XM.16b, XM.16b, XM2.16b |
---|
487 | | - ext T1.16b, XL.16b, XH.16b, #8 |
---|
488 | | - |
---|
489 | | - enc_round KS0, v28 |
---|
490 | | - eor T2.16b, XL.16b, XH.16b |
---|
491 | | - eor XM.16b, XM.16b, T1.16b |
---|
492 | | - |
---|
493 | | - enc_round KS1, v28 |
---|
494 | | - eor XM.16b, XM.16b, T2.16b |
---|
495 | | - |
---|
496 | | - enc_round KS0, v29 |
---|
497 | | - pmull T2.1q, XL.1d, MASK.1d |
---|
498 | | - |
---|
499 | | - enc_round KS1, v29 |
---|
500 | | - mov XH.d[0], XM.d[1] |
---|
501 | | - mov XM.d[1], XL.d[0] |
---|
502 | | - |
---|
503 | | - aese KS0.16b, v30.16b |
---|
504 | | - eor XL.16b, XM.16b, T2.16b |
---|
505 | | - |
---|
506 | | - aese KS1.16b, v30.16b |
---|
507 | | - ext T2.16b, XL.16b, XL.16b, #8 |
---|
508 | | - |
---|
509 | | - eor KS0.16b, KS0.16b, v31.16b |
---|
510 | | - pmull XL.1q, XL.1d, MASK.1d |
---|
511 | | - eor T2.16b, T2.16b, XH.16b |
---|
512 | | - |
---|
513 | | - eor KS1.16b, KS1.16b, v31.16b |
---|
514 | | - eor XL.16b, XL.16b, T2.16b |
---|
515 | | - |
---|
516 | | - .if \enc == 0 |
---|
517 | | - eor INP0.16b, INP0.16b, KS0.16b |
---|
518 | | - eor INP1.16b, INP1.16b, KS1.16b |
---|
519 | | - .endif |
---|
520 | | - |
---|
521 | | - st1 {INP0.16b-INP1.16b}, [x2], #32 |
---|
522 | | - |
---|
523 | | - cbnz w0, 0b |
---|
524 | | - |
---|
525 | | -CPU_LE( rev x8, x8 ) |
---|
526 | | - st1 {XL.2d}, [x1] |
---|
527 | | - str x8, [x5, #8] // store lower counter |
---|
528 | | - |
---|
529 | | - .if \enc == 1 |
---|
530 | | - st1 {KS0.16b-KS1.16b}, [x10] |
---|
531 | | - .endif |
---|
532 | | - |
---|
| 564 | +4: ldp x29, x30, [sp], #32 |
---|
533 | 565 | ret |
---|
534 | 566 | |
---|
535 | | -2: b.eq 3f // AES-192? |
---|
536 | | - enc_round KS0, v17 |
---|
537 | | - enc_round KS1, v17 |
---|
538 | | - enc_round KS0, v18 |
---|
539 | | - enc_round KS1, v18 |
---|
540 | | -3: enc_round KS0, v19 |
---|
541 | | - enc_round KS1, v19 |
---|
542 | | - enc_round KS0, v20 |
---|
543 | | - enc_round KS1, v20 |
---|
544 | | - b 1b |
---|
| 567 | +5: |
---|
| 568 | +CPU_LE( rev w8, w8 ) |
---|
| 569 | + str w8, [x5, #12] // store lower counter |
---|
| 570 | + st1 {XL.2d}, [x4] |
---|
| 571 | + b 4b |
---|
545 | 572 | |
---|
546 | | -4: load_round_keys w7, x6 |
---|
547 | | - b 0b |
---|
| 573 | +6: ld1 {T1.16b-T2.16b}, [x17], #32 // permute vectors |
---|
| 574 | + sub x17, x17, x19, lsl #1 |
---|
| 575 | + |
---|
| 576 | + cmp w9, #1 |
---|
| 577 | + beq 7f |
---|
| 578 | + .subsection 1 |
---|
| 579 | +7: ld1 {INP2.16b}, [x1] |
---|
| 580 | + tbx INP2.16b, {INP3.16b}, T1.16b |
---|
| 581 | + mov INP3.16b, INP2.16b |
---|
| 582 | + b 8f |
---|
| 583 | + .previous |
---|
| 584 | + |
---|
| 585 | + st1 {INP0.16b}, [x1], x14 |
---|
| 586 | + st1 {INP1.16b}, [x1], x15 |
---|
| 587 | + st1 {INP2.16b}, [x1], x16 |
---|
| 588 | + tbl INP3.16b, {INP3.16b}, T1.16b |
---|
| 589 | + tbx INP3.16b, {INP2.16b}, T2.16b |
---|
| 590 | +8: st1 {INP3.16b}, [x1] |
---|
| 591 | + |
---|
| 592 | + .if \enc == 1 |
---|
| 593 | + ld1 {T1.16b}, [x17] |
---|
| 594 | + tbl INP3.16b, {INP3.16b}, T1.16b // clear non-data bits |
---|
| 595 | + bl pmull_gcm_ghash_4x |
---|
| 596 | + .endif |
---|
| 597 | + b 3b |
---|
548 | 598 | .endm |
---|
549 | 599 | |
---|
550 | 600 | /* |
---|
551 | | - * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[], |
---|
552 | | - * struct ghash_key const *k, u8 ctr[], |
---|
553 | | - * int rounds, u8 ks[]) |
---|
| 601 | + * void pmull_gcm_encrypt(int bytes, u8 dst[], const u8 src[], |
---|
| 602 | + * struct ghash_key const *k, u64 dg[], u8 ctr[], |
---|
| 603 | + * u32 const rk[], int rounds, u8 tag[]) |
---|
554 | 604 | */ |
---|
555 | | -ENTRY(pmull_gcm_encrypt) |
---|
| 605 | +SYM_FUNC_START(pmull_gcm_encrypt) |
---|
556 | 606 | pmull_gcm_do_crypt 1 |
---|
557 | | -ENDPROC(pmull_gcm_encrypt) |
---|
| 607 | +SYM_FUNC_END(pmull_gcm_encrypt) |
---|
558 | 608 | |
---|
559 | 609 | /* |
---|
560 | | - * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[], |
---|
561 | | - * struct ghash_key const *k, u8 ctr[], |
---|
562 | | - * int rounds) |
---|
| 610 | + * int pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[], |
---|
| 611 | + * struct ghash_key const *k, u64 dg[], u8 ctr[], |
---|
| 612 | + * u32 const rk[], int rounds, const u8 l[], const u8 tag[], u64 authsize) |
---|
563 | 613 | */ |
---|
564 | | -ENTRY(pmull_gcm_decrypt) |
---|
| 614 | +SYM_FUNC_START(pmull_gcm_decrypt) |
---|
565 | 615 | pmull_gcm_do_crypt 0 |
---|
566 | | -ENDPROC(pmull_gcm_decrypt) |
---|
| 616 | +SYM_FUNC_END(pmull_gcm_decrypt) |
---|
567 | 617 | |
---|
568 | | - /* |
---|
569 | | - * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds) |
---|
570 | | - */ |
---|
571 | | -ENTRY(pmull_gcm_encrypt_block) |
---|
572 | | - cbz x2, 0f |
---|
573 | | - load_round_keys w3, x2 |
---|
574 | | -0: ld1 {v0.16b}, [x1] |
---|
575 | | - enc_block v0, w3 |
---|
576 | | - st1 {v0.16b}, [x0] |
---|
| 618 | +SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x) |
---|
| 619 | + movi MASK.16b, #0xe1 |
---|
| 620 | + shl MASK.2d, MASK.2d, #57 |
---|
| 621 | + |
---|
| 622 | + rev64 T1.16b, INP0.16b |
---|
| 623 | + rev64 T2.16b, INP1.16b |
---|
| 624 | + rev64 TT3.16b, INP2.16b |
---|
| 625 | + rev64 TT4.16b, INP3.16b |
---|
| 626 | + |
---|
| 627 | + ext XL.16b, XL.16b, XL.16b, #8 |
---|
| 628 | + |
---|
| 629 | + tbz w9, #2, 0f // <4 blocks? |
---|
| 630 | + .subsection 1 |
---|
| 631 | +0: movi XH2.16b, #0 |
---|
| 632 | + movi XM2.16b, #0 |
---|
| 633 | + movi XL2.16b, #0 |
---|
| 634 | + |
---|
| 635 | + tbz w9, #0, 1f // 2 blocks? |
---|
| 636 | + tbz w9, #1, 2f // 1 block? |
---|
| 637 | + |
---|
| 638 | + eor T2.16b, T2.16b, XL.16b |
---|
| 639 | + ext T1.16b, T2.16b, T2.16b, #8 |
---|
| 640 | + b .Lgh3 |
---|
| 641 | + |
---|
| 642 | +1: eor TT3.16b, TT3.16b, XL.16b |
---|
| 643 | + ext T2.16b, TT3.16b, TT3.16b, #8 |
---|
| 644 | + b .Lgh2 |
---|
| 645 | + |
---|
| 646 | +2: eor TT4.16b, TT4.16b, XL.16b |
---|
| 647 | + ext IN1.16b, TT4.16b, TT4.16b, #8 |
---|
| 648 | + b .Lgh1 |
---|
| 649 | + .previous |
---|
| 650 | + |
---|
| 651 | + eor T1.16b, T1.16b, XL.16b |
---|
| 652 | + ext IN1.16b, T1.16b, T1.16b, #8 |
---|
| 653 | + |
---|
| 654 | + pmull2 XH2.1q, HH4.2d, IN1.2d // a1 * b1 |
---|
| 655 | + eor T1.16b, T1.16b, IN1.16b |
---|
| 656 | + pmull XL2.1q, HH4.1d, IN1.1d // a0 * b0 |
---|
| 657 | + pmull2 XM2.1q, HH34.2d, T1.2d // (a1 + a0)(b1 + b0) |
---|
| 658 | + |
---|
| 659 | + ext T1.16b, T2.16b, T2.16b, #8 |
---|
| 660 | +.Lgh3: eor T2.16b, T2.16b, T1.16b |
---|
| 661 | + pmull2 XH.1q, HH3.2d, T1.2d // a1 * b1 |
---|
| 662 | + pmull XL.1q, HH3.1d, T1.1d // a0 * b0 |
---|
| 663 | + pmull XM.1q, HH34.1d, T2.1d // (a1 + a0)(b1 + b0) |
---|
| 664 | + |
---|
| 665 | + eor XH2.16b, XH2.16b, XH.16b |
---|
| 666 | + eor XL2.16b, XL2.16b, XL.16b |
---|
| 667 | + eor XM2.16b, XM2.16b, XM.16b |
---|
| 668 | + |
---|
| 669 | + ext T2.16b, TT3.16b, TT3.16b, #8 |
---|
| 670 | +.Lgh2: eor TT3.16b, TT3.16b, T2.16b |
---|
| 671 | + pmull2 XH.1q, HH.2d, T2.2d // a1 * b1 |
---|
| 672 | + pmull XL.1q, HH.1d, T2.1d // a0 * b0 |
---|
| 673 | + pmull2 XM.1q, SHASH2.2d, TT3.2d // (a1 + a0)(b1 + b0) |
---|
| 674 | + |
---|
| 675 | + eor XH2.16b, XH2.16b, XH.16b |
---|
| 676 | + eor XL2.16b, XL2.16b, XL.16b |
---|
| 677 | + eor XM2.16b, XM2.16b, XM.16b |
---|
| 678 | + |
---|
| 679 | + ext IN1.16b, TT4.16b, TT4.16b, #8 |
---|
| 680 | +.Lgh1: eor TT4.16b, TT4.16b, IN1.16b |
---|
| 681 | + pmull XL.1q, SHASH.1d, IN1.1d // a0 * b0 |
---|
| 682 | + pmull2 XH.1q, SHASH.2d, IN1.2d // a1 * b1 |
---|
| 683 | + pmull XM.1q, SHASH2.1d, TT4.1d // (a1 + a0)(b1 + b0) |
---|
| 684 | + |
---|
| 685 | + eor XH.16b, XH.16b, XH2.16b |
---|
| 686 | + eor XL.16b, XL.16b, XL2.16b |
---|
| 687 | + eor XM.16b, XM.16b, XM2.16b |
---|
| 688 | + |
---|
| 689 | + eor T2.16b, XL.16b, XH.16b |
---|
| 690 | + ext T1.16b, XL.16b, XH.16b, #8 |
---|
| 691 | + eor XM.16b, XM.16b, T2.16b |
---|
| 692 | + |
---|
| 693 | + __pmull_reduce_p64 |
---|
| 694 | + |
---|
| 695 | + eor T2.16b, T2.16b, XH.16b |
---|
| 696 | + eor XL.16b, XL.16b, T2.16b |
---|
| 697 | + |
---|
577 | 698 | ret |
---|
578 | | -ENDPROC(pmull_gcm_encrypt_block) |
---|
| 699 | +SYM_FUNC_END(pmull_gcm_ghash_4x) |
---|
| 700 | + |
---|
| 701 | +SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x) |
---|
| 702 | + ld1 {KS0.16b}, [x5] // load upper counter |
---|
| 703 | + sub w10, w8, #4 |
---|
| 704 | + sub w11, w8, #3 |
---|
| 705 | + sub w12, w8, #2 |
---|
| 706 | + sub w13, w8, #1 |
---|
| 707 | + rev w10, w10 |
---|
| 708 | + rev w11, w11 |
---|
| 709 | + rev w12, w12 |
---|
| 710 | + rev w13, w13 |
---|
| 711 | + mov KS1.16b, KS0.16b |
---|
| 712 | + mov KS2.16b, KS0.16b |
---|
| 713 | + mov KS3.16b, KS0.16b |
---|
| 714 | + ins KS0.s[3], w10 // set lower counter |
---|
| 715 | + ins KS1.s[3], w11 |
---|
| 716 | + ins KS2.s[3], w12 |
---|
| 717 | + ins KS3.s[3], w13 |
---|
| 718 | + |
---|
| 719 | + add x10, x6, #96 // round key pointer |
---|
| 720 | + ld1 {K6.4s-K7.4s}, [x10], #32 |
---|
| 721 | + .irp key, K0, K1, K2, K3, K4, K5 |
---|
| 722 | + enc_qround KS0, KS1, KS2, KS3, \key |
---|
| 723 | + .endr |
---|
| 724 | + |
---|
| 725 | + tbnz x7, #2, .Lnot128 |
---|
| 726 | + .subsection 1 |
---|
| 727 | +.Lnot128: |
---|
| 728 | + ld1 {K8.4s-K9.4s}, [x10], #32 |
---|
| 729 | + .irp key, K6, K7 |
---|
| 730 | + enc_qround KS0, KS1, KS2, KS3, \key |
---|
| 731 | + .endr |
---|
| 732 | + ld1 {K6.4s-K7.4s}, [x10] |
---|
| 733 | + .irp key, K8, K9 |
---|
| 734 | + enc_qround KS0, KS1, KS2, KS3, \key |
---|
| 735 | + .endr |
---|
| 736 | + tbz x7, #1, .Lout192 |
---|
| 737 | + b .Lout256 |
---|
| 738 | + .previous |
---|
| 739 | + |
---|
| 740 | +.Lout256: |
---|
| 741 | + .irp key, K6, K7 |
---|
| 742 | + enc_qround KS0, KS1, KS2, KS3, \key |
---|
| 743 | + .endr |
---|
| 744 | + |
---|
| 745 | +.Lout192: |
---|
| 746 | + enc_qround KS0, KS1, KS2, KS3, KK |
---|
| 747 | + |
---|
| 748 | + aese KS0.16b, KL.16b |
---|
| 749 | + aese KS1.16b, KL.16b |
---|
| 750 | + aese KS2.16b, KL.16b |
---|
| 751 | + aese KS3.16b, KL.16b |
---|
| 752 | + |
---|
| 753 | + eor KS0.16b, KS0.16b, KM.16b |
---|
| 754 | + eor KS1.16b, KS1.16b, KM.16b |
---|
| 755 | + eor KS2.16b, KS2.16b, KM.16b |
---|
| 756 | + eor KS3.16b, KS3.16b, KM.16b |
---|
| 757 | + |
---|
| 758 | + eor INP0.16b, INP0.16b, KS0.16b |
---|
| 759 | + eor INP1.16b, INP1.16b, KS1.16b |
---|
| 760 | + eor INP2.16b, INP2.16b, KS2.16b |
---|
| 761 | + eor INP3.16b, INP3.16b, KS3.16b |
---|
| 762 | + |
---|
| 763 | + ret |
---|
| 764 | +SYM_FUNC_END(pmull_gcm_enc_4x) |
---|
| 765 | + |
---|
| 766 | + .section ".rodata", "a" |
---|
| 767 | + .align 6 |
---|
| 768 | +.Lpermute_table: |
---|
| 769 | + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
---|
| 770 | + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
---|
| 771 | + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 |
---|
| 772 | + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf |
---|
| 773 | + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
---|
| 774 | + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
---|
| 775 | + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 |
---|
| 776 | + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf |
---|
| 777 | + .previous |
---|