| .. | .. |
|---|
| 1 | +/* SPDX-License-Identifier: GPL-2.0-only */ |
|---|
| 1 | 2 | /* |
|---|
| 2 | 3 | * Bit sliced AES using NEON instructions |
|---|
| 3 | 4 | * |
|---|
| 4 | 5 | * Copyright (C) 2017 Linaro Ltd. |
|---|
| 5 | 6 | * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org> |
|---|
| 6 | | - * |
|---|
| 7 | | - * This program is free software; you can redistribute it and/or modify |
|---|
| 8 | | - * it under the terms of the GNU General Public License version 2 as |
|---|
| 9 | | - * published by the Free Software Foundation. |
|---|
| 10 | 7 | */ |
|---|
| 11 | 8 | |
|---|
| 12 | 9 | /* |
|---|
| .. | .. |
|---|
| 78 | 75 | .macro __ldr, out, sym |
|---|
| 79 | 76 | vldr \out\()l, \sym |
|---|
| 80 | 77 | vldr \out\()h, \sym + 8 |
|---|
| 81 | | - .endm |
|---|
| 82 | | - |
|---|
| 83 | | - .macro __adr, reg, lbl |
|---|
| 84 | | - adr \reg, \lbl |
|---|
| 85 | | -THUMB( orr \reg, \reg, #1 ) |
|---|
| 86 | 78 | .endm |
|---|
| 87 | 79 | |
|---|
| 88 | 80 | .macro in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7 |
|---|
| .. | .. |
|---|
| 632 | 624 | push {r4-r6, lr} |
|---|
| 633 | 625 | ldr r5, [sp, #16] // number of blocks |
|---|
| 634 | 626 | |
|---|
| 635 | | -99: __adr ip, 0f |
|---|
| 627 | +99: adr ip, 0f |
|---|
| 636 | 628 | and lr, r5, #7 |
|---|
| 637 | 629 | cmp r5, #8 |
|---|
| 638 | 630 | sub ip, ip, lr, lsl #2 |
|---|
| 639 | | - bxlt ip // computed goto if blocks < 8 |
|---|
| 631 | + movlt pc, ip // computed goto if blocks < 8 |
|---|
| 640 | 632 | |
|---|
| 641 | 633 | vld1.8 {q0}, [r1]! |
|---|
| 642 | 634 | vld1.8 {q1}, [r1]! |
|---|
| .. | .. |
|---|
| 651 | 643 | mov rounds, r3 |
|---|
| 652 | 644 | bl \do8 |
|---|
| 653 | 645 | |
|---|
| 654 | | - __adr ip, 1f |
|---|
| 646 | + adr ip, 1f |
|---|
| 655 | 647 | and lr, r5, #7 |
|---|
| 656 | 648 | cmp r5, #8 |
|---|
| 657 | 649 | sub ip, ip, lr, lsl #2 |
|---|
| 658 | | - bxlt ip // computed goto if blocks < 8 |
|---|
| 650 | + movlt pc, ip // computed goto if blocks < 8 |
|---|
| 659 | 651 | |
|---|
| 660 | 652 | vst1.8 {\o0}, [r0]! |
|---|
| 661 | 653 | vst1.8 {\o1}, [r0]! |
|---|
| .. | .. |
|---|
| 692 | 684 | push {r4-r6, lr} |
|---|
| 693 | 685 | ldm ip, {r5-r6} // load args 4-5 |
|---|
| 694 | 686 | |
|---|
| 695 | | -99: __adr ip, 0f |
|---|
| 687 | +99: adr ip, 0f |
|---|
| 696 | 688 | and lr, r5, #7 |
|---|
| 697 | 689 | cmp r5, #8 |
|---|
| 698 | 690 | sub ip, ip, lr, lsl #2 |
|---|
| 699 | 691 | mov lr, r1 |
|---|
| 700 | | - bxlt ip // computed goto if blocks < 8 |
|---|
| 692 | + movlt pc, ip // computed goto if blocks < 8 |
|---|
| 701 | 693 | |
|---|
| 702 | 694 | vld1.8 {q0}, [lr]! |
|---|
| 703 | 695 | vld1.8 {q1}, [lr]! |
|---|
| .. | .. |
|---|
| 721 | 713 | vmov q14, q8 |
|---|
| 722 | 714 | vmov q15, q8 |
|---|
| 723 | 715 | |
|---|
| 724 | | - __adr ip, 1f |
|---|
| 716 | + adr ip, 1f |
|---|
| 725 | 717 | and lr, r5, #7 |
|---|
| 726 | 718 | cmp r5, #8 |
|---|
| 727 | 719 | sub ip, ip, lr, lsl #2 |
|---|
| 728 | | - bxlt ip // computed goto if blocks < 8 |
|---|
| 720 | + movlt pc, ip // computed goto if blocks < 8 |
|---|
| 729 | 721 | |
|---|
| 730 | 722 | vld1.8 {q9}, [r1]! |
|---|
| 731 | 723 | vld1.8 {q10}, [r1]! |
|---|
| .. | .. |
|---|
| 736 | 728 | vld1.8 {q15}, [r1]! |
|---|
| 737 | 729 | W(nop) |
|---|
| 738 | 730 | |
|---|
| 739 | | -1: __adr ip, 2f |
|---|
| 731 | +1: adr ip, 2f |
|---|
| 740 | 732 | sub ip, ip, lr, lsl #3 |
|---|
| 741 | | - bxlt ip // computed goto if blocks < 8 |
|---|
| 733 | + movlt pc, ip // computed goto if blocks < 8 |
|---|
| 742 | 734 | |
|---|
| 743 | 735 | veor q0, q0, q8 |
|---|
| 744 | 736 | vst1.8 {q0}, [r0]! |
|---|
| .. | .. |
|---|
| 807 | 799 | vmov q6, q0 |
|---|
| 808 | 800 | vmov q7, q0 |
|---|
| 809 | 801 | |
|---|
| 810 | | - __adr ip, 0f |
|---|
| 802 | + adr ip, 0f |
|---|
| 811 | 803 | sub lr, r5, #1 |
|---|
| 812 | 804 | and lr, lr, #7 |
|---|
| 813 | 805 | cmp r5, #8 |
|---|
| 814 | 806 | sub ip, ip, lr, lsl #5 |
|---|
| 815 | 807 | sub ip, ip, lr, lsl #2 |
|---|
| 816 | | - bxlt ip // computed goto if blocks < 8 |
|---|
| 808 | + movlt pc, ip // computed goto if blocks < 8 |
|---|
| 817 | 809 | |
|---|
| 818 | 810 | next_ctr q1 |
|---|
| 819 | 811 | next_ctr q2 |
|---|
| .. | .. |
|---|
| 827 | 819 | mov rounds, r3 |
|---|
| 828 | 820 | bl aesbs_encrypt8 |
|---|
| 829 | 821 | |
|---|
| 830 | | - __adr ip, 1f |
|---|
| 822 | + adr ip, 1f |
|---|
| 831 | 823 | and lr, r5, #7 |
|---|
| 832 | 824 | cmp r5, #8 |
|---|
| 833 | 825 | movgt r4, #0 |
|---|
| 834 | 826 | ldrle r4, [sp, #40] // load final in the last round |
|---|
| 835 | 827 | sub ip, ip, lr, lsl #2 |
|---|
| 836 | | - bxlt ip // computed goto if blocks < 8 |
|---|
| 828 | + movlt pc, ip // computed goto if blocks < 8 |
|---|
| 837 | 829 | |
|---|
| 838 | 830 | vld1.8 {q8}, [r1]! |
|---|
| 839 | 831 | vld1.8 {q9}, [r1]! |
|---|
| .. | .. |
|---|
| 846 | 838 | 1: bne 2f |
|---|
| 847 | 839 | vld1.8 {q15}, [r1]! |
|---|
| 848 | 840 | |
|---|
| 849 | | -2: __adr ip, 3f |
|---|
| 841 | +2: adr ip, 3f |
|---|
| 850 | 842 | cmp r5, #8 |
|---|
| 851 | 843 | sub ip, ip, lr, lsl #3 |
|---|
| 852 | | - bxlt ip // computed goto if blocks < 8 |
|---|
| 844 | + movlt pc, ip // computed goto if blocks < 8 |
|---|
| 853 | 845 | |
|---|
| 854 | 846 | veor q0, q0, q8 |
|---|
| 855 | 847 | vst1.8 {q0}, [r0]! |
|---|
| .. | .. |
|---|
| 890 | 882 | veor \out, \out, \tmp |
|---|
| 891 | 883 | .endm |
|---|
| 892 | 884 | |
|---|
| 893 | | - .align 4 |
|---|
| 894 | | -.Lxts_mul_x: |
|---|
| 895 | | - .quad 1, 0x87 |
|---|
| 896 | | - |
|---|
| 897 | 885 | /* |
|---|
| 898 | 886 | * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, |
|---|
| 899 | | - * int blocks, u8 iv[]) |
|---|
| 887 | + * int blocks, u8 iv[], int reorder_last_tweak) |
|---|
| 900 | 888 | * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, |
|---|
| 901 | | - * int blocks, u8 iv[]) |
|---|
| 889 | + * int blocks, u8 iv[], int reorder_last_tweak) |
|---|
| 902 | 890 | */ |
|---|
| 903 | 891 | __xts_prepare8: |
|---|
| 904 | 892 | vld1.8 {q14}, [r7] // load iv |
|---|
| 905 | | - __ldr q15, .Lxts_mul_x // load tweak mask |
|---|
| 893 | + vmov.i32 d30, #0x87 // compose tweak mask vector |
|---|
| 894 | + vmovl.u32 q15, d30 |
|---|
| 895 | + vshr.u64 d30, d31, #7 |
|---|
| 906 | 896 | vmov q12, q14 |
|---|
| 907 | 897 | |
|---|
| 908 | | - __adr ip, 0f |
|---|
| 898 | + adr ip, 0f |
|---|
| 909 | 899 | and r4, r6, #7 |
|---|
| 910 | 900 | cmp r6, #8 |
|---|
| 911 | 901 | sub ip, ip, r4, lsl #5 |
|---|
| 912 | 902 | mov r4, sp |
|---|
| 913 | | - bxlt ip // computed goto if blocks < 8 |
|---|
| 903 | + movlt pc, ip // computed goto if blocks < 8 |
|---|
| 914 | 904 | |
|---|
| 915 | 905 | vld1.8 {q0}, [r1]! |
|---|
| 916 | 906 | next_tweak q12, q14, q15, q13 |
|---|
| .. | .. |
|---|
| 949 | 939 | |
|---|
| 950 | 940 | vld1.8 {q7}, [r1]! |
|---|
| 951 | 941 | next_tweak q14, q12, q15, q13 |
|---|
| 952 | | - veor q7, q7, q12 |
|---|
| 942 | +THUMB( itt le ) |
|---|
| 943 | + W(cmple) r8, #0 |
|---|
| 944 | + ble 1f |
|---|
| 945 | +0: veor q7, q7, q12 |
|---|
| 953 | 946 | vst1.8 {q12}, [r4, :128] |
|---|
| 954 | 947 | |
|---|
| 955 | | -0: vst1.8 {q14}, [r7] // store next iv |
|---|
| 948 | + vst1.8 {q14}, [r7] // store next iv |
|---|
| 956 | 949 | bx lr |
|---|
| 950 | + |
|---|
| 951 | +1: vswp q12, q14 |
|---|
| 952 | + b 0b |
|---|
| 957 | 953 | ENDPROC(__xts_prepare8) |
|---|
| 958 | 954 | |
|---|
| 959 | 955 | .macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7 |
|---|
| 960 | 956 | push {r4-r8, lr} |
|---|
| 961 | 957 | mov r5, sp // preserve sp |
|---|
| 962 | 958 | ldrd r6, r7, [sp, #24] // get blocks and iv args |
|---|
| 959 | + rsb r8, ip, #1 |
|---|
| 963 | 960 | sub ip, sp, #128 // make room for 8x tweak |
|---|
| 964 | 961 | bic ip, ip, #0xf // align sp to 16 bytes |
|---|
| 965 | 962 | mov sp, ip |
|---|
| .. | .. |
|---|
| 970 | 967 | mov rounds, r3 |
|---|
| 971 | 968 | bl \do8 |
|---|
| 972 | 969 | |
|---|
| 973 | | - __adr ip, 0f |
|---|
| 970 | + adr ip, 0f |
|---|
| 974 | 971 | and lr, r6, #7 |
|---|
| 975 | 972 | cmp r6, #8 |
|---|
| 976 | 973 | sub ip, ip, lr, lsl #2 |
|---|
| 977 | 974 | mov r4, sp |
|---|
| 978 | | - bxlt ip // computed goto if blocks < 8 |
|---|
| 975 | + movlt pc, ip // computed goto if blocks < 8 |
|---|
| 979 | 976 | |
|---|
| 980 | 977 | vld1.8 {q8}, [r4, :128]! |
|---|
| 981 | 978 | vld1.8 {q9}, [r4, :128]! |
|---|
| .. | .. |
|---|
| 986 | 983 | vld1.8 {q14}, [r4, :128]! |
|---|
| 987 | 984 | vld1.8 {q15}, [r4, :128] |
|---|
| 988 | 985 | |
|---|
| 989 | | -0: __adr ip, 1f |
|---|
| 986 | +0: adr ip, 1f |
|---|
| 990 | 987 | sub ip, ip, lr, lsl #3 |
|---|
| 991 | | - bxlt ip // computed goto if blocks < 8 |
|---|
| 988 | + movlt pc, ip // computed goto if blocks < 8 |
|---|
| 992 | 989 | |
|---|
| 993 | 990 | veor \o0, \o0, q8 |
|---|
| 994 | 991 | vst1.8 {\o0}, [r0]! |
|---|
| .. | .. |
|---|
| 1015 | 1012 | .endm |
|---|
| 1016 | 1013 | |
|---|
| 1017 | 1014 | ENTRY(aesbs_xts_encrypt) |
|---|
| 1015 | + mov ip, #0 // never reorder final tweak |
|---|
| 1018 | 1016 | __xts_crypt aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5 |
|---|
| 1019 | 1017 | ENDPROC(aesbs_xts_encrypt) |
|---|
| 1020 | 1018 | |
|---|
| 1021 | 1019 | ENTRY(aesbs_xts_decrypt) |
|---|
| 1020 | + ldr ip, [sp, #8] // reorder final tweak? |
|---|
| 1022 | 1021 | __xts_crypt aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5 |
|---|
| 1023 | 1022 | ENDPROC(aesbs_xts_decrypt) |
|---|