| .. | .. |
|---|
| 1 | +/* SPDX-License-Identifier: GPL-2.0-only */ |
|---|
| 1 | 2 | /* |
|---|
| 2 | 3 | * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES |
|---|
| 3 | 4 | * |
|---|
| 4 | 5 | * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> |
|---|
| 5 | | - * |
|---|
| 6 | | - * This program is free software; you can redistribute it and/or modify |
|---|
| 7 | | - * it under the terms of the GNU General Public License version 2 as |
|---|
| 8 | | - * published by the Free Software Foundation. |
|---|
| 9 | 6 | */ |
|---|
| 10 | 7 | |
|---|
| 11 | 8 | /* included by aes-ce.S and aes-neon.S */ |
|---|
| .. | .. |
|---|
| 13 | 10 | .text |
|---|
| 14 | 11 | .align 4 |
|---|
| 15 | 12 | |
|---|
| 16 | | -aes_encrypt_block4x: |
|---|
| 17 | | - encrypt_block4x v0, v1, v2, v3, w22, x21, x8, w7 |
|---|
| 18 | | - ret |
|---|
| 19 | | -ENDPROC(aes_encrypt_block4x) |
|---|
| 13 | +#ifndef MAX_STRIDE |
|---|
| 14 | +#define MAX_STRIDE 4 |
|---|
| 15 | +#endif |
|---|
| 20 | 16 | |
|---|
| 21 | | -aes_decrypt_block4x: |
|---|
| 22 | | - decrypt_block4x v0, v1, v2, v3, w22, x21, x8, w7 |
|---|
| 17 | +#if MAX_STRIDE == 4 /* ST4()/ST5() wrap lines that belong to only one interleave factor */ |
|---|
| 18 | +#define ST4(x...) x /* 4-way build: ST4() lines are emitted as written */ |
|---|
| 19 | +#define ST5(x...) /* 4-way build: ST5() lines expand to nothing */ |
|---|
| 20 | +#else /* any other MAX_STRIDE value is treated as the 5-way build */ |
|---|
| 21 | +#define ST4(x...) /* 5-way build: ST4() lines expand to nothing */ |
|---|
| 22 | +#define ST5(x...) x /* 5-way build: ST5() lines are emitted as written */ |
|---|
| 23 | +#endif |
|---|
| 24 | + |
|---|
| 25 | +SYM_FUNC_START_LOCAL(aes_encrypt_block4x) |
|---|
| 26 | + encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7 |
|---|
| 23 | 27 | ret |
|---|
| 24 | | -ENDPROC(aes_decrypt_block4x) |
|---|
| 28 | +SYM_FUNC_END(aes_encrypt_block4x) |
|---|
| 29 | + |
|---|
| 30 | +SYM_FUNC_START_LOCAL(aes_decrypt_block4x) |
|---|
| 31 | + decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7 |
|---|
| 32 | + ret |
|---|
| 33 | +SYM_FUNC_END(aes_decrypt_block4x) |
|---|
| 34 | + |
|---|
| 35 | +#if MAX_STRIDE == 5 /* the 5-way helpers are assembled only for the 5-block stride */ |
|---|
| 36 | +SYM_FUNC_START_LOCAL(aes_encrypt_block5x) /* helper reached via bl: runs encrypt_block5x on v0-v4; callers read the results back from v0-v4 */ |
|---|
| 37 | + encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7 /* macro defined outside this chunk; presumably w3 = rounds, x2 = round keys, x8/w7 = scratch - confirm */ |
|---|
| 38 | + ret |
|---|
| 39 | +SYM_FUNC_END(aes_encrypt_block5x) |
|---|
| 40 | + |
|---|
| 41 | +SYM_FUNC_START_LOCAL(aes_decrypt_block5x) /* decryption counterpart, same register layout */ |
|---|
| 42 | + decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7 /* macro defined outside this chunk; operands mirror encrypt_block5x */ |
|---|
| 43 | + ret |
|---|
| 44 | +SYM_FUNC_END(aes_decrypt_block5x) |
|---|
| 45 | +#endif |
|---|
| 25 | 46 | |
|---|
| 26 | 47 | /* |
|---|
| 27 | 48 | * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, |
|---|
| .. | .. |
|---|
| 30 | 51 | * int blocks) |
|---|
| 31 | 52 | */ |
|---|
| 32 | 53 | |
|---|
| 33 | | -AES_ENTRY(aes_ecb_encrypt) |
|---|
| 34 | | - frame_push 5 |
|---|
| 54 | +AES_FUNC_START(aes_ecb_encrypt) |
|---|
| 55 | + stp x29, x30, [sp, #-16]! |
|---|
| 56 | + mov x29, sp |
|---|
| 35 | 57 | |
|---|
| 36 | | - mov x19, x0 |
|---|
| 37 | | - mov x20, x1 |
|---|
| 38 | | - mov x21, x2 |
|---|
| 39 | | - mov x22, x3 |
|---|
| 40 | | - mov x23, x4 |
|---|
| 41 | | - |
|---|
| 42 | | -.Lecbencrestart: |
|---|
| 43 | | - enc_prepare w22, x21, x5 |
|---|
| 58 | + enc_prepare w3, x2, x5 |
|---|
| 44 | 59 | |
|---|
| 45 | 60 | .LecbencloopNx: |
|---|
| 46 | | - subs w23, w23, #4 |
|---|
| 61 | + subs w4, w4, #MAX_STRIDE |
|---|
| 47 | 62 | bmi .Lecbenc1x |
|---|
| 48 | | - ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 pt blocks */ |
|---|
| 49 | | - bl aes_encrypt_block4x |
|---|
| 50 | | - st1 {v0.16b-v3.16b}, [x19], #64 |
|---|
| 51 | | - cond_yield_neon .Lecbencrestart |
|---|
| 63 | + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ |
|---|
| 64 | +ST4( bl aes_encrypt_block4x ) |
|---|
| 65 | +ST5( ld1 {v4.16b}, [x1], #16 ) |
|---|
| 66 | +ST5( bl aes_encrypt_block5x ) |
|---|
| 67 | + st1 {v0.16b-v3.16b}, [x0], #64 |
|---|
| 68 | +ST5( st1 {v4.16b}, [x0], #16 ) |
|---|
| 52 | 69 | b .LecbencloopNx |
|---|
| 53 | 70 | .Lecbenc1x: |
|---|
| 54 | | - adds w23, w23, #4 |
|---|
| 71 | + adds w4, w4, #MAX_STRIDE |
|---|
| 55 | 72 | beq .Lecbencout |
|---|
| 56 | 73 | .Lecbencloop: |
|---|
| 57 | | - ld1 {v0.16b}, [x20], #16 /* get next pt block */ |
|---|
| 58 | | - encrypt_block v0, w22, x21, x5, w6 |
|---|
| 59 | | - st1 {v0.16b}, [x19], #16 |
|---|
| 60 | | - subs w23, w23, #1 |
|---|
| 74 | + ld1 {v0.16b}, [x1], #16 /* get next pt block */ |
|---|
| 75 | + encrypt_block v0, w3, x2, x5, w6 |
|---|
| 76 | + st1 {v0.16b}, [x0], #16 |
|---|
| 77 | + subs w4, w4, #1 |
|---|
| 61 | 78 | bne .Lecbencloop |
|---|
| 62 | 79 | .Lecbencout: |
|---|
| 63 | | - frame_pop |
|---|
| 80 | + ldp x29, x30, [sp], #16 |
|---|
| 64 | 81 | ret |
|---|
| 65 | | -AES_ENDPROC(aes_ecb_encrypt) |
|---|
| 82 | +AES_FUNC_END(aes_ecb_encrypt) |
|---|
| 66 | 83 | |
|---|
| 67 | 84 | |
|---|
| 68 | | -AES_ENTRY(aes_ecb_decrypt) |
|---|
| 69 | | - frame_push 5 |
|---|
| 85 | +AES_FUNC_START(aes_ecb_decrypt) |
|---|
| 86 | + stp x29, x30, [sp, #-16]! |
|---|
| 87 | + mov x29, sp |
|---|
| 70 | 88 | |
|---|
| 71 | | - mov x19, x0 |
|---|
| 72 | | - mov x20, x1 |
|---|
| 73 | | - mov x21, x2 |
|---|
| 74 | | - mov x22, x3 |
|---|
| 75 | | - mov x23, x4 |
|---|
| 76 | | - |
|---|
| 77 | | -.Lecbdecrestart: |
|---|
| 78 | | - dec_prepare w22, x21, x5 |
|---|
| 89 | + dec_prepare w3, x2, x5 |
|---|
| 79 | 90 | |
|---|
| 80 | 91 | .LecbdecloopNx: |
|---|
| 81 | | - subs w23, w23, #4 |
|---|
| 92 | + subs w4, w4, #MAX_STRIDE |
|---|
| 82 | 93 | bmi .Lecbdec1x |
|---|
| 83 | | - ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 ct blocks */ |
|---|
| 84 | | - bl aes_decrypt_block4x |
|---|
| 85 | | - st1 {v0.16b-v3.16b}, [x19], #64 |
|---|
| 86 | | - cond_yield_neon .Lecbdecrestart |
|---|
| 94 | + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ |
|---|
| 95 | +ST4( bl aes_decrypt_block4x ) |
|---|
| 96 | +ST5( ld1 {v4.16b}, [x1], #16 ) |
|---|
| 97 | +ST5( bl aes_decrypt_block5x ) |
|---|
| 98 | + st1 {v0.16b-v3.16b}, [x0], #64 |
|---|
| 99 | +ST5( st1 {v4.16b}, [x0], #16 ) |
|---|
| 87 | 100 | b .LecbdecloopNx |
|---|
| 88 | 101 | .Lecbdec1x: |
|---|
| 89 | | - adds w23, w23, #4 |
|---|
| 102 | + adds w4, w4, #MAX_STRIDE |
|---|
| 90 | 103 | beq .Lecbdecout |
|---|
| 91 | 104 | .Lecbdecloop: |
|---|
| 92 | | - ld1 {v0.16b}, [x20], #16 /* get next ct block */ |
|---|
| 93 | | - decrypt_block v0, w22, x21, x5, w6 |
|---|
| 94 | | - st1 {v0.16b}, [x19], #16 |
|---|
| 95 | | - subs w23, w23, #1 |
|---|
| 105 | + ld1 {v0.16b}, [x1], #16 /* get next ct block */ |
|---|
| 106 | + decrypt_block v0, w3, x2, x5, w6 |
|---|
| 107 | + st1 {v0.16b}, [x0], #16 |
|---|
| 108 | + subs w4, w4, #1 |
|---|
| 96 | 109 | bne .Lecbdecloop |
|---|
| 97 | 110 | .Lecbdecout: |
|---|
| 98 | | - frame_pop |
|---|
| 111 | + ldp x29, x30, [sp], #16 |
|---|
| 99 | 112 | ret |
|---|
| 100 | | -AES_ENDPROC(aes_ecb_decrypt) |
|---|
| 113 | +AES_FUNC_END(aes_ecb_decrypt) |
|---|
| 101 | 114 | |
|---|
| 102 | 115 | |
|---|
| 103 | 116 | /* |
|---|
| .. | .. |
|---|
| 105 | 118 | * int blocks, u8 iv[]) |
|---|
| 106 | 119 | * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, |
|---|
| 107 | 120 | * int blocks, u8 iv[]) |
|---|
| 121 | + * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[], |
|---|
| 122 | + * int rounds, int blocks, u8 iv[], |
|---|
| 123 | + * u32 const rk2[]); |
|---|
| 124 | + * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[], |
|---|
| 125 | + * int rounds, int blocks, u8 iv[], |
|---|
| 126 | + * u32 const rk2[]); |
|---|
| 108 | 127 | */ |
|---|
| 109 | 128 | |
|---|
| 110 | | -AES_ENTRY(aes_cbc_encrypt) |
|---|
| 111 | | - frame_push 6 |
|---|
| 129 | +AES_FUNC_START(aes_essiv_cbc_encrypt) |
|---|
| 130 | + ld1 {v4.16b}, [x5] /* get iv */ |
|---|
| 112 | 131 | |
|---|
| 113 | | - mov x19, x0 |
|---|
| 114 | | - mov x20, x1 |
|---|
| 115 | | - mov x21, x2 |
|---|
| 116 | | - mov x22, x3 |
|---|
| 117 | | - mov x23, x4 |
|---|
| 118 | | - mov x24, x5 |
|---|
| 132 | + mov w8, #14 /* AES-256: 14 rounds */ |
|---|
| 133 | + enc_prepare w8, x6, x7 |
|---|
| 134 | + encrypt_block v4, w8, x6, x7, w9 |
|---|
| 135 | + enc_switch_key w3, x2, x6 |
|---|
| 136 | + b .Lcbcencloop4x |
|---|
| 119 | 137 | |
|---|
| 120 | | -.Lcbcencrestart: |
|---|
| 121 | | - ld1 {v4.16b}, [x24] /* get iv */ |
|---|
| 122 | | - enc_prepare w22, x21, x6 |
|---|
| 138 | +AES_FUNC_START(aes_cbc_encrypt) |
|---|
| 139 | + ld1 {v4.16b}, [x5] /* get iv */ |
|---|
| 140 | + enc_prepare w3, x2, x6 |
|---|
| 123 | 141 | |
|---|
| 124 | 142 | .Lcbcencloop4x: |
|---|
| 125 | | - subs w23, w23, #4 |
|---|
| 143 | + subs w4, w4, #4 |
|---|
| 126 | 144 | bmi .Lcbcenc1x |
|---|
| 127 | | - ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 pt blocks */ |
|---|
| 145 | + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ |
|---|
| 128 | 146 | eor v0.16b, v0.16b, v4.16b /* ..and xor with iv */ |
|---|
| 129 | | - encrypt_block v0, w22, x21, x6, w7 |
|---|
| 147 | + encrypt_block v0, w3, x2, x6, w7 |
|---|
| 130 | 148 | eor v1.16b, v1.16b, v0.16b |
|---|
| 131 | | - encrypt_block v1, w22, x21, x6, w7 |
|---|
| 149 | + encrypt_block v1, w3, x2, x6, w7 |
|---|
| 132 | 150 | eor v2.16b, v2.16b, v1.16b |
|---|
| 133 | | - encrypt_block v2, w22, x21, x6, w7 |
|---|
| 151 | + encrypt_block v2, w3, x2, x6, w7 |
|---|
| 134 | 152 | eor v3.16b, v3.16b, v2.16b |
|---|
| 135 | | - encrypt_block v3, w22, x21, x6, w7 |
|---|
| 136 | | - st1 {v0.16b-v3.16b}, [x19], #64 |
|---|
| 153 | + encrypt_block v3, w3, x2, x6, w7 |
|---|
| 154 | + st1 {v0.16b-v3.16b}, [x0], #64 |
|---|
| 137 | 155 | mov v4.16b, v3.16b |
|---|
| 138 | | - st1 {v4.16b}, [x24] /* return iv */ |
|---|
| 139 | | - cond_yield_neon .Lcbcencrestart |
|---|
| 140 | 156 | b .Lcbcencloop4x |
|---|
| 141 | 157 | .Lcbcenc1x: |
|---|
| 142 | | - adds w23, w23, #4 |
|---|
| 158 | + adds w4, w4, #4 |
|---|
| 143 | 159 | beq .Lcbcencout |
|---|
| 144 | 160 | .Lcbcencloop: |
|---|
| 145 | | - ld1 {v0.16b}, [x20], #16 /* get next pt block */ |
|---|
| 161 | + ld1 {v0.16b}, [x1], #16 /* get next pt block */ |
|---|
| 146 | 162 | eor v4.16b, v4.16b, v0.16b /* ..and xor with iv */ |
|---|
| 147 | | - encrypt_block v4, w22, x21, x6, w7 |
|---|
| 148 | | - st1 {v4.16b}, [x19], #16 |
|---|
| 149 | | - subs w23, w23, #1 |
|---|
| 163 | + encrypt_block v4, w3, x2, x6, w7 |
|---|
| 164 | + st1 {v4.16b}, [x0], #16 |
|---|
| 165 | + subs w4, w4, #1 |
|---|
| 150 | 166 | bne .Lcbcencloop |
|---|
| 151 | 167 | .Lcbcencout: |
|---|
| 152 | | - st1 {v4.16b}, [x24] /* return iv */ |
|---|
| 153 | | - frame_pop |
|---|
| 168 | + st1 {v4.16b}, [x5] /* return iv */ |
|---|
| 154 | 169 | ret |
|---|
| 155 | | -AES_ENDPROC(aes_cbc_encrypt) |
|---|
| 170 | +AES_FUNC_END(aes_cbc_encrypt) |
|---|
| 171 | +AES_FUNC_END(aes_essiv_cbc_encrypt) |
|---|
| 156 | 172 | |
|---|
| 173 | +AES_FUNC_START(aes_essiv_cbc_decrypt) |
|---|
| 174 | + stp x29, x30, [sp, #-16]! |
|---|
| 175 | + mov x29, sp |
|---|
| 157 | 176 | |
|---|
| 158 | | -AES_ENTRY(aes_cbc_decrypt) |
|---|
| 159 | | - frame_push 6 |
|---|
| 177 | + ld1 {cbciv.16b}, [x5] /* get iv */ |
|---|
| 160 | 178 | |
|---|
| 161 | | - mov x19, x0 |
|---|
| 162 | | - mov x20, x1 |
|---|
| 163 | | - mov x21, x2 |
|---|
| 164 | | - mov x22, x3 |
|---|
| 165 | | - mov x23, x4 |
|---|
| 166 | | - mov x24, x5 |
|---|
| 179 | + mov w8, #14 /* AES-256: 14 rounds */ |
|---|
| 180 | + enc_prepare w8, x6, x7 |
|---|
| 181 | + encrypt_block cbciv, w8, x6, x7, w9 |
|---|
| 182 | + b .Lessivcbcdecstart |
|---|
| 167 | 183 | |
|---|
| 168 | | -.Lcbcdecrestart: |
|---|
| 169 | | - ld1 {v7.16b}, [x24] /* get iv */ |
|---|
| 170 | | - dec_prepare w22, x21, x6 |
|---|
| 184 | +AES_FUNC_START(aes_cbc_decrypt) |
|---|
| 185 | + stp x29, x30, [sp, #-16]! |
|---|
| 186 | + mov x29, sp |
|---|
| 187 | + |
|---|
| 188 | + ld1 {cbciv.16b}, [x5] /* get iv */ |
|---|
| 189 | +.Lessivcbcdecstart: |
|---|
| 190 | + dec_prepare w3, x2, x6 |
|---|
| 171 | 191 | |
|---|
| 172 | 192 | .LcbcdecloopNx: |
|---|
| 173 | | - subs w23, w23, #4 |
|---|
| 193 | + subs w4, w4, #MAX_STRIDE |
|---|
| 174 | 194 | bmi .Lcbcdec1x |
|---|
| 175 | | - ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 ct blocks */ |
|---|
| 195 | + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ |
|---|
| 196 | +#if MAX_STRIDE == 5 |
|---|
| 197 | + ld1 {v4.16b}, [x1], #16 /* get 1 ct block */ |
|---|
| 198 | + mov v5.16b, v0.16b |
|---|
| 199 | + mov v6.16b, v1.16b |
|---|
| 200 | + mov v7.16b, v2.16b |
|---|
| 201 | + bl aes_decrypt_block5x |
|---|
| 202 | + sub x1, x1, #32 |
|---|
| 203 | + eor v0.16b, v0.16b, cbciv.16b |
|---|
| 204 | + eor v1.16b, v1.16b, v5.16b |
|---|
| 205 | + ld1 {v5.16b}, [x1], #16 /* reload 1 ct block */ |
|---|
| 206 | + ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */ |
|---|
| 207 | + eor v2.16b, v2.16b, v6.16b |
|---|
| 208 | + eor v3.16b, v3.16b, v7.16b |
|---|
| 209 | + eor v4.16b, v4.16b, v5.16b |
|---|
| 210 | +#else |
|---|
| 176 | 211 | mov v4.16b, v0.16b |
|---|
| 177 | 212 | mov v5.16b, v1.16b |
|---|
| 178 | 213 | mov v6.16b, v2.16b |
|---|
| 179 | 214 | bl aes_decrypt_block4x |
|---|
| 180 | | - sub x20, x20, #16 |
|---|
| 181 | | - eor v0.16b, v0.16b, v7.16b |
|---|
| 215 | + sub x1, x1, #16 |
|---|
| 216 | + eor v0.16b, v0.16b, cbciv.16b |
|---|
| 182 | 217 | eor v1.16b, v1.16b, v4.16b |
|---|
| 183 | | - ld1 {v7.16b}, [x20], #16 /* reload 1 ct block */ |
|---|
| 218 | + ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */ |
|---|
| 184 | 219 | eor v2.16b, v2.16b, v5.16b |
|---|
| 185 | 220 | eor v3.16b, v3.16b, v6.16b |
|---|
| 186 | | - st1 {v0.16b-v3.16b}, [x19], #64 |
|---|
| 187 | | - st1 {v7.16b}, [x24] /* return iv */ |
|---|
| 188 | | - cond_yield_neon .Lcbcdecrestart |
|---|
| 221 | +#endif |
|---|
| 222 | + st1 {v0.16b-v3.16b}, [x0], #64 |
|---|
| 223 | +ST5( st1 {v4.16b}, [x0], #16 ) |
|---|
| 189 | 224 | b .LcbcdecloopNx |
|---|
| 190 | 225 | .Lcbcdec1x: |
|---|
| 191 | | - adds w23, w23, #4 |
|---|
| 226 | + adds w4, w4, #MAX_STRIDE |
|---|
| 192 | 227 | beq .Lcbcdecout |
|---|
| 193 | 228 | .Lcbcdecloop: |
|---|
| 194 | | - ld1 {v1.16b}, [x20], #16 /* get next ct block */ |
|---|
| 229 | + ld1 {v1.16b}, [x1], #16 /* get next ct block */ |
|---|
| 195 | 230 | mov v0.16b, v1.16b /* ...and copy to v0 */ |
|---|
| 196 | | - decrypt_block v0, w22, x21, x6, w7 |
|---|
| 197 | | - eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */ |
|---|
| 198 | | - mov v7.16b, v1.16b /* ct is next iv */ |
|---|
| 199 | | - st1 {v0.16b}, [x19], #16 |
|---|
| 200 | | - subs w23, w23, #1 |
|---|
| 231 | + decrypt_block v0, w3, x2, x6, w7 |
|---|
| 232 | + eor v0.16b, v0.16b, cbciv.16b /* xor with iv => pt */ |
|---|
| 233 | + mov cbciv.16b, v1.16b /* ct is next iv */ |
|---|
| 234 | + st1 {v0.16b}, [x0], #16 |
|---|
| 235 | + subs w4, w4, #1 |
|---|
| 201 | 236 | bne .Lcbcdecloop |
|---|
| 202 | 237 | .Lcbcdecout: |
|---|
| 203 | | - st1 {v7.16b}, [x24] /* return iv */ |
|---|
| 204 | | - frame_pop |
|---|
| 238 | + st1 {cbciv.16b}, [x5] /* return iv */ |
|---|
| 239 | + ldp x29, x30, [sp], #16 |
|---|
| 205 | 240 | ret |
|---|
| 206 | | -AES_ENDPROC(aes_cbc_decrypt) |
|---|
| 241 | +AES_FUNC_END(aes_cbc_decrypt) |
|---|
| 242 | +AES_FUNC_END(aes_essiv_cbc_decrypt) |
|---|
| 243 | + |
|---|
| 244 | + |
|---|
| 245 | + /* |
|---|
| 246 | + * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[], |
|---|
| 247 | + * int rounds, int bytes, u8 const iv[]) |
|---|
| 248 | + * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[], |
|---|
| 249 | + * int rounds, int bytes, u8 const iv[]) |
|---|
| 250 | + */ |
|---|
| 251 | + |
|---|
| 252 | +AES_FUNC_START(aes_cbc_cts_encrypt) /* CBC ciphertext-stealing tail: one full block plus a final block of bytes - 16; per the prototype x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = bytes, x5 = iv; presumably 16 < bytes <= 32 so both mask loads stay inside the 48-byte table - confirm against caller */ |
|---|
| 253 | + adr_l x8, .Lcts_permute_table /* adr_l is defined outside this chunk; presumably x8 = address of the table */ |
|---|
| 254 | + sub x4, x4, #16 /* x4 = bytes - 16 = length of the final block; NOTE(review): uses all 64 bits of x4 although bytes is declared int, so this relies on the upper half being zero - confirm */ |
|---|
| 255 | + add x9, x8, #32 /* x9 = table + 32 */ |
|---|
| 256 | + add x8, x8, x4 /* x8 = table + (bytes - 16) */ |
|---|
| 257 | + sub x9, x9, x4 /* x9 = table + 32 - (bytes - 16) */ |
|---|
| 258 | + ld1 {v3.16b}, [x8] /* v3 = mask that moves the leading bytes of a block to its tail */ |
|---|
| 259 | + ld1 {v4.16b}, [x9] /* v4 = mask that moves the trailing bytes of a block to its front */ |
|---|
| 260 | + |
|---|
| 261 | + ld1 {v0.16b}, [x1], x4 /* overlapping loads */ |
|---|
| 262 | + ld1 {v1.16b}, [x1] /* v1 = last 16 bytes of the input, read at in + (bytes - 16) */ |
|---|
| 263 | + |
|---|
| 264 | + ld1 {v5.16b}, [x5] /* get iv */ |
|---|
| 265 | + enc_prepare w3, x2, x6 |
|---|
| 266 | + |
|---|
| 267 | + eor v0.16b, v0.16b, v5.16b /* xor with iv */ |
|---|
| 268 | + tbl v1.16b, {v1.16b}, v4.16b /* final block moved to the front; TBL writes 0 for the 0xff indices, so it is zero padded */ |
|---|
| 269 | + encrypt_block v0, w3, x2, x6, w7 |
|---|
| 270 | + |
|---|
| 271 | + eor v1.16b, v1.16b, v0.16b /* CBC chaining: padded final block xor previous ciphertext */ |
|---|
| 272 | + tbl v0.16b, {v0.16b}, v3.16b /* leading bytes of the first ciphertext moved to the tail for the store at out + (bytes - 16) */ |
|---|
| 273 | + encrypt_block v1, w3, x2, x6, w7 |
|---|
| 274 | + |
|---|
| 275 | + add x4, x0, x4 /* x4 = out + (bytes - 16) */ |
|---|
| 276 | + st1 {v0.16b}, [x4] /* overlapping stores */ |
|---|
| 277 | + st1 {v1.16b}, [x0] /* full block at out; stored second so it overwrites the zero bytes written just above */ |
|---|
| 278 | + ret |
|---|
| 279 | +AES_FUNC_END(aes_cbc_cts_encrypt) |
|---|
| 280 | + |
|---|
| 281 | +AES_FUNC_START(aes_cbc_cts_decrypt) /* CBC ciphertext-stealing tail, decrypt direction; per the prototype x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = bytes, x5 = iv; presumably 16 < bytes <= 32 - confirm against caller */ |
|---|
| 282 | + adr_l x8, .Lcts_permute_table /* adr_l is defined outside this chunk; presumably x8 = address of the table */ |
|---|
| 283 | + sub x4, x4, #16 /* x4 = bytes - 16 = length of the final block; NOTE(review): 64-bit use of an int argument, relies on the upper half being zero - confirm */ |
|---|
| 284 | + add x9, x8, #32 /* x9 = table + 32 */ |
|---|
| 285 | + add x8, x8, x4 /* x8 = table + (bytes - 16) */ |
|---|
| 286 | + sub x9, x9, x4 /* x9 = table + 32 - (bytes - 16) */ |
|---|
| 287 | + ld1 {v3.16b}, [x8] /* v3 = mask that moves the leading bytes of a block to its tail */ |
|---|
| 288 | + ld1 {v4.16b}, [x9] /* v4 = mask that moves the trailing bytes of a block to its front */ |
|---|
| 289 | + |
|---|
| 290 | + ld1 {v0.16b}, [x1], x4 /* overlapping loads */ |
|---|
| 291 | + ld1 {v1.16b}, [x1] /* v1 = last 16 bytes of the input, read at in + (bytes - 16) */ |
|---|
| 292 | + |
|---|
| 293 | + ld1 {v5.16b}, [x5] /* get iv */ |
|---|
| 294 | + dec_prepare w3, x2, x6 |
|---|
| 295 | + |
|---|
| 296 | + decrypt_block v0, w3, x2, x6, w7 /* decrypt the first 16 input bytes */ |
|---|
| 297 | + tbl v2.16b, {v0.16b}, v3.16b /* leading bytes of that result moved to the tail, zeros in front */ |
|---|
| 298 | + eor v2.16b, v2.16b, v1.16b /* tail of v2 = final partial plaintext; its leading bytes are overwritten by the second store below */ |
|---|
| 299 | + |
|---|
| 300 | + tbx v0.16b, {v1.16b}, v4.16b /* TBX keeps v0 where the index is 0xff: trailing input bytes go to the front, the rest stays decrypted data */ |
|---|
| 301 | + decrypt_block v0, w3, x2, x6, w7 /* decrypt the reassembled block */ |
|---|
| 302 | + eor v0.16b, v0.16b, v5.16b /* xor with iv */ |
|---|
| 303 | + |
|---|
| 304 | + add x4, x0, x4 /* x4 = out + (bytes - 16) */ |
|---|
| 305 | + st1 {v2.16b}, [x4] /* overlapping stores */ |
|---|
| 306 | + st1 {v0.16b}, [x0] /* full plaintext block at out, stored second so it wins the overlap */ |
|---|
| 307 | + ret |
|---|
| 308 | +AES_FUNC_END(aes_cbc_cts_decrypt) |
|---|
| 309 | + |
|---|
| 310 | + .section ".rodata", "a" /* the table lives in .rodata rather than in the current section */ |
|---|
| 311 | + .align 6 /* power-of-two alignment on AArch64 gas: 2^6 = 64 bytes */ |
|---|
| 312 | +.Lcts_permute_table: /* 48 bytes: 16 x 0xff, identity 0x0..0xf, 16 x 0xff; a 16-byte window taken at offset k (0..32) is a byte-shift index vector for TBL/TBX */ |
|---|
| 313 | + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff /* 0xff is out of range for a one-register table: TBL yields 0, TBX keeps the destination byte */ |
|---|
| 314 | + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
|---|
| 315 | + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 /* identity indices, low half */ |
|---|
| 316 | + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf /* identity indices, high half */ |
|---|
| 317 | + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
|---|
| 318 | + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
|---|
| 319 | + .previous /* switch back to the section that was active before .rodata */ |
|---|
| 207 | 320 | |
|---|
| 208 | 321 | |
|---|
| 209 | 322 | /* |
|---|
| .. | .. |
|---|
| 211 | 324 | * int blocks, u8 ctr[]) |
|---|
| 212 | 325 | */ |
|---|
| 213 | 326 | |
|---|
| 214 | | -AES_ENTRY(aes_ctr_encrypt) |
|---|
| 215 | | - frame_push 6 |
|---|
| 327 | +AES_FUNC_START(aes_ctr_encrypt) |
|---|
| 328 | + stp x29, x30, [sp, #-16]! |
|---|
| 329 | + mov x29, sp |
|---|
| 216 | 330 | |
|---|
| 217 | | - mov x19, x0 |
|---|
| 218 | | - mov x20, x1 |
|---|
| 219 | | - mov x21, x2 |
|---|
| 220 | | - mov x22, x3 |
|---|
| 221 | | - mov x23, x4 |
|---|
| 222 | | - mov x24, x5 |
|---|
| 331 | + enc_prepare w3, x2, x6 |
|---|
| 332 | + ld1 {vctr.16b}, [x5] |
|---|
| 223 | 333 | |
|---|
| 224 | | -.Lctrrestart: |
|---|
| 225 | | - enc_prepare w22, x21, x6 |
|---|
| 226 | | - ld1 {v4.16b}, [x24] |
|---|
| 227 | | - |
|---|
| 228 | | - umov x6, v4.d[1] /* keep swabbed ctr in reg */ |
|---|
| 334 | + umov x6, vctr.d[1] /* keep swabbed ctr in reg */ |
|---|
| 229 | 335 | rev x6, x6 |
|---|
| 336 | + cmn w6, w4 /* 32 bit overflow? */ |
|---|
| 337 | + bcs .Lctrloop |
|---|
| 230 | 338 | .LctrloopNx: /* each pass encrypts MAX_STRIDE consecutive counter blocks and XORs them into the input */ |
|---|
| 231 | | - subs w23, w23, #4 |
|---|
| 339 | + subs w4, w4, #MAX_STRIDE /* fewer than MAX_STRIDE blocks left? then fall back to the 1x path */ |
|---|
| 232 | 340 | bmi .Lctr1x |
|---|
| 233 | | - cmn w6, #4 /* 32 bit overflow? */ |
|---|
| 234 | | - bcs .Lctr1x |
|---|
| 235 | 341 | add w7, w6, #1 /* 32-bit adds suffice here: the cmn w6, w4 check before the loop already diverted the overflow case */ |
|---|
| 236 | | - mov v0.16b, v4.16b |
|---|
| 342 | + mov v0.16b, vctr.16b |
|---|
| 237 | 343 | add w8, w6, #2 |
|---|
| 238 | | - mov v1.16b, v4.16b |
|---|
| 344 | + mov v1.16b, vctr.16b |
|---|
| 239 | 345 | add w9, w6, #3 |
|---|
| 240 | | - mov v2.16b, v4.16b |
|---|
| 346 | + mov v2.16b, vctr.16b |
|---|
| 347 | + add w9, w6, #3 /* NOTE(review): repeats the add two instructions up; neither w6 nor w9 changes in between, so this added line is redundant */ |
|---|
| 241 | 348 | rev w7, w7 |
|---|
| 242 | | - mov v3.16b, v4.16b |
|---|
| 349 | + mov v3.16b, vctr.16b |
|---|
| 243 | 350 | rev w8, w8 |
|---|
| 351 | +ST5( mov v4.16b, vctr.16b ) |
|---|
| 244 | 352 | mov v1.s[3], w7 /* byte-swapped ctr + 1 goes into the last 32-bit lane; v0 keeps the current counter */ |
|---|
| 245 | 353 | rev w9, w9 |
|---|
| 354 | +ST5( add w10, w6, #4 ) |
|---|
| 246 | 355 | mov v2.s[3], w8 |
|---|
| 356 | +ST5( rev w10, w10 ) |
|---|
| 247 | 357 | mov v3.s[3], w9 |
|---|
| 248 | | - ld1 {v5.16b-v7.16b}, [x20], #48 /* get 3 input blocks */ |
|---|
| 249 | | - bl aes_encrypt_block4x |
|---|
| 358 | +ST5( mov v4.s[3], w10 ) |
|---|
| 359 | + ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */ |
|---|
| 360 | +ST4( bl aes_encrypt_block4x ) |
|---|
| 361 | +ST5( bl aes_encrypt_block5x ) |
|---|
| 250 | 362 | eor v0.16b, v5.16b, v0.16b |
|---|
| 251 | | - ld1 {v5.16b}, [x20], #16 /* get 1 input block */ |
|---|
| 363 | +ST4( ld1 {v5.16b}, [x1], #16 ) |
|---|
| 252 | 364 | eor v1.16b, v6.16b, v1.16b /* v6 is consumed here, before the ST5 load below reuses it */ |
|---|
| 365 | +ST5( ld1 {v5.16b-v6.16b}, [x1], #32 ) |
|---|
| 253 | 366 | eor v2.16b, v7.16b, v2.16b |
|---|
| 254 | 367 | eor v3.16b, v5.16b, v3.16b |
|---|
| 255 | | - st1 {v0.16b-v3.16b}, [x19], #64 |
|---|
| 256 | | - add x6, x6, #4 |
|---|
| 368 | +ST5( eor v4.16b, v6.16b, v4.16b ) |
|---|
| 369 | + st1 {v0.16b-v3.16b}, [x0], #64 |
|---|
| 370 | +ST5( st1 {v4.16b}, [x0], #16 ) |
|---|
| 371 | + add x6, x6, #MAX_STRIDE /* advance the counter copy kept in x6 */ |
|---|
| 257 | 372 | rev x7, x6 |
|---|
| 258 | | - ins v4.d[1], x7 |
|---|
| 259 | | - cbz w23, .Lctrout |
|---|
| 260 | | - st1 {v4.16b}, [x24] /* return next CTR value */ |
|---|
| 261 | | - cond_yield_neon .Lctrrestart |
|---|
| 373 | + ins vctr.d[1], x7 /* write the byte-swapped counter back into vctr */ |
|---|
| 374 | + cbz w4, .Lctrout |
|---|
| 262 | 375 | b .LctrloopNx |
|---|
| 263 | 376 | .Lctr1x: |
|---|
| 264 | | - adds w23, w23, #4 |
|---|
| 377 | + adds w4, w4, #MAX_STRIDE |
|---|
| 265 | 378 | beq .Lctrout |
|---|
| 266 | 379 | .Lctrloop: |
|---|
| 267 | | - mov v0.16b, v4.16b |
|---|
| 268 | | - encrypt_block v0, w22, x21, x8, w7 |
|---|
| 380 | + mov v0.16b, vctr.16b |
|---|
| 381 | + encrypt_block v0, w3, x2, x8, w7 |
|---|
| 269 | 382 | |
|---|
| 270 | 383 | adds x6, x6, #1 /* increment BE ctr */ |
|---|
| 271 | 384 | rev x7, x6 |
|---|
| 272 | | - ins v4.d[1], x7 |
|---|
| 385 | + ins vctr.d[1], x7 |
|---|
| 273 | 386 | bcs .Lctrcarry /* overflow? */ |
|---|
| 274 | 387 | |
|---|
| 275 | 388 | .Lctrcarrydone: |
|---|
| 276 | | - subs w23, w23, #1 |
|---|
| 389 | + subs w4, w4, #1 |
|---|
| 277 | 390 | bmi .Lctrtailblock /* blocks <0 means tail block */ |
|---|
| 278 | | - ld1 {v3.16b}, [x20], #16 |
|---|
| 391 | + ld1 {v3.16b}, [x1], #16 |
|---|
| 279 | 392 | eor v3.16b, v0.16b, v3.16b |
|---|
| 280 | | - st1 {v3.16b}, [x19], #16 |
|---|
| 393 | + st1 {v3.16b}, [x0], #16 |
|---|
| 281 | 394 | bne .Lctrloop |
|---|
| 282 | 395 | |
|---|
| 283 | 396 | .Lctrout: |
|---|
| 284 | | - st1 {v4.16b}, [x24] /* return next CTR value */ |
|---|
| 285 | | -.Lctrret: |
|---|
| 286 | | - frame_pop |
|---|
| 397 | + st1 {vctr.16b}, [x5] /* return next CTR value */ |
|---|
| 398 | + ldp x29, x30, [sp], #16 |
|---|
| 287 | 399 | ret |
|---|
| 288 | 400 | |
|---|
| 289 | 401 | .Lctrtailblock: |
|---|
| 290 | | - st1 {v0.16b}, [x19] |
|---|
| 291 | | - b .Lctrret |
|---|
| 402 | + st1 {v0.16b}, [x0] |
|---|
| 403 | + b .Lctrout |
|---|
| 292 | 404 | |
|---|
| 293 | 405 | .Lctrcarry: |
|---|
| 294 | | - umov x7, v4.d[0] /* load upper word of ctr */ |
|---|
| 406 | + umov x7, vctr.d[0] /* load upper word of ctr */ |
|---|
| 295 | 407 | rev x7, x7 /* ... to handle the carry */ |
|---|
| 296 | 408 | add x7, x7, #1 |
|---|
| 297 | 409 | rev x7, x7 |
|---|
| 298 | | - ins v4.d[0], x7 |
|---|
| 410 | + ins vctr.d[0], x7 |
|---|
| 299 | 411 | b .Lctrcarrydone |
|---|
| 300 | | -AES_ENDPROC(aes_ctr_encrypt) |
|---|
| 301 | | - .ltorg |
|---|
| 412 | +AES_FUNC_END(aes_ctr_encrypt) |
|---|
| 302 | 413 | |
|---|
| 303 | 414 | |
|---|
| 304 | 415 | /* |
|---|
| 416 | + * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, |
|---|
| 417 | + * int bytes, u8 const rk2[], u8 iv[], int first) |
|---|
| 305 | 418 | * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, |
|---|
| 306 | | - * int blocks, u8 const rk2[], u8 iv[], int first) |
|---|
| 307 | | - * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, |
|---|
| 308 | | - * int blocks, u8 const rk2[], u8 iv[], int first) |
|---|
| 419 | + * int bytes, u8 const rk2[], u8 iv[], int first) |
|---|
| 309 | 420 | */ |
|---|
| 310 | 421 | |
|---|
| 311 | | - .macro next_tweak, out, in, const, tmp |
|---|
| 422 | + .macro next_tweak, out, in, tmp |
|---|
| 312 | 423 | sshr \tmp\().2d, \in\().2d, #63 |
|---|
| 313 | | - and \tmp\().16b, \tmp\().16b, \const\().16b |
|---|
| 424 | + and \tmp\().16b, \tmp\().16b, xtsmask.16b |
|---|
| 314 | 425 | add \out\().2d, \in\().2d, \in\().2d |
|---|
| 315 | 426 | ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8 |
|---|
| 316 | 427 | eor \out\().16b, \out\().16b, \tmp\().16b |
|---|
| 317 | 428 | .endm |
|---|
| 318 | 429 | |
|---|
| 319 | | -.Lxts_mul_x: |
|---|
| 320 | | -CPU_LE( .quad 1, 0x87 ) |
|---|
| 321 | | -CPU_BE( .quad 0x87, 1 ) |
|---|
| 430 | + .macro xts_load_mask, tmp /* builds the tweak mask used by next_tweak in xtsmask from immediates; clobbers tmp; xtsmask is defined outside this chunk, presumably a vector register alias */ |
|---|
| 431 | + movi xtsmask.2s, #0x1 /* xtsmask.4s = { 1, 1, 0, 0 }: a 64-bit MOVI clears the upper half */ |
|---|
| 432 | + movi \tmp\().2s, #0x87 /* tmp.4s = { 0x87, 0x87, 0, 0 } */ |
|---|
| 433 | + uzp1 xtsmask.4s, xtsmask.4s, \tmp\().4s /* even lanes of both -> { 1, 0, 0x87, 0 }, i.e. .2d = { 1, 0x87 }, the value the removed .Lxts_mul_x literal held on little-endian */ |
|---|
| 434 | + .endm |
|---|
| 322 | 435 | |
|---|
| 323 | | -AES_ENTRY(aes_xts_encrypt) |
|---|
| 324 | | - frame_push 6 |
|---|
| 436 | +AES_FUNC_START(aes_xts_encrypt) |
|---|
| 437 | + stp x29, x30, [sp, #-16]! |
|---|
| 438 | + mov x29, sp |
|---|
| 325 | 439 | |
|---|
| 326 | | - mov x19, x0 |
|---|
| 327 | | - mov x20, x1 |
|---|
| 328 | | - mov x21, x2 |
|---|
| 329 | | - mov x22, x3 |
|---|
| 330 | | - mov x23, x4 |
|---|
| 331 | | - mov x24, x6 |
|---|
| 332 | | - |
|---|
| 333 | | - ld1 {v4.16b}, [x24] |
|---|
| 440 | + ld1 {v4.16b}, [x6] |
|---|
| 441 | + xts_load_mask v8 |
|---|
| 334 | 442 | cbz w7, .Lxtsencnotfirst |
|---|
| 335 | 443 | |
|---|
| 336 | 444 | enc_prepare w3, x5, x8 |
|---|
| 445 | + xts_cts_skip_tw w7, .LxtsencNx |
|---|
| 337 | 446 | encrypt_block v4, w3, x5, x8, w7 /* first tweak */ |
|---|
| 338 | 447 | enc_switch_key w3, x2, x8 |
|---|
| 339 | | - ldr q7, .Lxts_mul_x |
|---|
| 340 | 448 | b .LxtsencNx |
|---|
| 341 | 449 | |
|---|
| 342 | | -.Lxtsencrestart: |
|---|
| 343 | | - ld1 {v4.16b}, [x24] |
|---|
| 344 | 450 | .Lxtsencnotfirst: |
|---|
| 345 | | - enc_prepare w22, x21, x8 |
|---|
| 451 | + enc_prepare w3, x2, x8 |
|---|
| 346 | 452 | .LxtsencloopNx: |
|---|
| 347 | | - ldr q7, .Lxts_mul_x |
|---|
| 348 | | - next_tweak v4, v4, v7, v8 |
|---|
| 453 | + next_tweak v4, v4, v8 |
|---|
| 349 | 454 | .LxtsencNx: |
|---|
| 350 | | - subs w23, w23, #4 |
|---|
| 455 | + subs w4, w4, #64 |
|---|
| 351 | 456 | bmi .Lxtsenc1x |
|---|
| 352 | | - ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 pt blocks */ |
|---|
| 353 | | - next_tweak v5, v4, v7, v8 |
|---|
| 457 | + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ |
|---|
| 458 | + next_tweak v5, v4, v8 |
|---|
| 354 | 459 | eor v0.16b, v0.16b, v4.16b |
|---|
| 355 | | - next_tweak v6, v5, v7, v8 |
|---|
| 460 | + next_tweak v6, v5, v8 |
|---|
| 356 | 461 | eor v1.16b, v1.16b, v5.16b |
|---|
| 357 | 462 | eor v2.16b, v2.16b, v6.16b |
|---|
| 358 | | - next_tweak v7, v6, v7, v8 |
|---|
| 463 | + next_tweak v7, v6, v8 |
|---|
| 359 | 464 | eor v3.16b, v3.16b, v7.16b |
|---|
| 360 | 465 | bl aes_encrypt_block4x |
|---|
| 361 | 466 | eor v3.16b, v3.16b, v7.16b |
|---|
| 362 | 467 | eor v0.16b, v0.16b, v4.16b |
|---|
| 363 | 468 | eor v1.16b, v1.16b, v5.16b |
|---|
| 364 | 469 | eor v2.16b, v2.16b, v6.16b |
|---|
| 365 | | - st1 {v0.16b-v3.16b}, [x19], #64 |
|---|
| 470 | + st1 {v0.16b-v3.16b}, [x0], #64 |
|---|
| 366 | 471 | mov v4.16b, v7.16b |
|---|
| 367 | | - cbz w23, .Lxtsencout |
|---|
| 368 | | - st1 {v4.16b}, [x24] |
|---|
| 369 | | - cond_yield_neon .Lxtsencrestart |
|---|
| 472 | + cbz w4, .Lxtsencret |
|---|
| 473 | + xts_reload_mask v8 |
|---|
| 370 | 474 | b .LxtsencloopNx |
|---|
| 371 | 475 | .Lxtsenc1x: |
|---|
| 372 | | - adds w23, w23, #4 |
|---|
| 476 | + adds w4, w4, #64 |
|---|
| 373 | 477 | beq .Lxtsencout |
|---|
| 478 | + subs w4, w4, #16 |
|---|
| 479 | + bmi .LxtsencctsNx |
|---|
| 374 | 480 | .Lxtsencloop: |
|---|
| 375 | | - ld1 {v1.16b}, [x20], #16 |
|---|
| 376 | | - eor v0.16b, v1.16b, v4.16b |
|---|
| 377 | | - encrypt_block v0, w22, x21, x8, w7 |
|---|
| 481 | + ld1 {v0.16b}, [x1], #16 |
|---|
| 482 | +.Lxtsencctsout: |
|---|
| 378 | 483 | eor v0.16b, v0.16b, v4.16b |
|---|
| 379 | | - st1 {v0.16b}, [x19], #16 |
|---|
| 380 | | - subs w23, w23, #1 |
|---|
| 381 | | - beq .Lxtsencout |
|---|
| 382 | | - next_tweak v4, v4, v7, v8 |
|---|
| 484 | + encrypt_block v0, w3, x2, x8, w7 |
|---|
| 485 | + eor v0.16b, v0.16b, v4.16b |
|---|
| 486 | + cbz w4, .Lxtsencout |
|---|
| 487 | + subs w4, w4, #16 |
|---|
| 488 | + next_tweak v4, v4, v8 |
|---|
| 489 | + bmi .Lxtsenccts |
|---|
| 490 | + st1 {v0.16b}, [x0], #16 |
|---|
| 383 | 491 | b .Lxtsencloop |
|---|
| 384 | 492 | .Lxtsencout: |
|---|
| 385 | | - st1 {v4.16b}, [x24] |
|---|
| 386 | | - frame_pop |
|---|
| 493 | + st1 {v0.16b}, [x0] |
|---|
| 494 | +.Lxtsencret: |
|---|
| 495 | + st1 {v4.16b}, [x6] |
|---|
| 496 | + ldp x29, x30, [sp], #16 |
|---|
| 387 | 497 | ret |
|---|
| 388 | | -AES_ENDPROC(aes_xts_encrypt) |
|---|
| 389 | 498 | |
|---|
| 499 | +.LxtsencctsNx: |
|---|
| 500 | + mov v0.16b, v3.16b |
|---|
| 501 | + sub x0, x0, #16 |
|---|
| 502 | +.Lxtsenccts: |
|---|
| 503 | + adr_l x8, .Lcts_permute_table |
|---|
| 390 | 504 | |
|---|
| 391 | | -AES_ENTRY(aes_xts_decrypt) |
|---|
| 392 | | - frame_push 6 |
|---|
| 505 | + add x1, x1, w4, sxtw /* rewind input pointer */ |
|---|
| 506 | + add w4, w4, #16 /* # bytes in final block */ |
|---|
| 507 | + add x9, x8, #32 |
|---|
| 508 | + add x8, x8, x4 |
|---|
| 509 | + sub x9, x9, x4 |
|---|
| 510 | + add x4, x0, x4 /* output address of final block */ |
|---|
| 393 | 511 | |
|---|
| 394 | | - mov x19, x0 |
|---|
| 395 | | - mov x20, x1 |
|---|
| 396 | | - mov x21, x2 |
|---|
| 397 | | - mov x22, x3 |
|---|
| 398 | | - mov x23, x4 |
|---|
| 399 | | - mov x24, x6 |
|---|
| 512 | + ld1 {v1.16b}, [x1] /* load final block */ |
|---|
| 513 | + ld1 {v2.16b}, [x8] |
|---|
| 514 | + ld1 {v3.16b}, [x9] |
|---|
| 400 | 515 | |
|---|
| 401 | | - ld1 {v4.16b}, [x24] |
|---|
| 516 | + tbl v2.16b, {v0.16b}, v2.16b |
|---|
| 517 | + tbx v0.16b, {v1.16b}, v3.16b |
|---|
| 518 | + st1 {v2.16b}, [x4] /* overlapping stores */ |
|---|
| 519 | + mov w4, wzr |
|---|
| 520 | + b .Lxtsencctsout |
|---|
| 521 | +AES_FUNC_END(aes_xts_encrypt) |
|---|
| 522 | + |
|---|
| 523 | +AES_FUNC_START(aes_xts_decrypt) /* as used below: x0 = output, x1 = input, w4 = byte count, x6 = tweak buffer, w7 = 0 selects .Lxtsdecnotfirst */ |
|---|
| 524 | + stp x29, x30, [sp, #-16]! /* save FP/LR: the bl further down overwrites x30 */ |
|---|
| 525 | + mov x29, sp |
|---|
| 526 | + |
|---|
| 527 | + /* subtract 16 bytes if we are doing CTS */ |
|---|
| 528 | + sub w8, w4, #0x10 /* w8 = byte count - 16 */ |
|---|
| 529 | + tst w4, #0xf /* Z set iff byte count is a multiple of 16 */ |
|---|
| 530 | + csel w4, w4, w8, eq /* multiple of 16: keep w4, otherwise use w8 */ |
|---|
| 531 | + |
|---|
| 532 | + ld1 {v4.16b}, [x6] /* v4 = 16 bytes at [x6] */ |
|---|
| 533 | + xts_load_mask v8 /* macro defined elsewhere; presumably prepares v8 for next_tweak */ |
|---|
| 534 | + xts_cts_skip_tw w7, .Lxtsdecskiptw /* macro defined elsewhere; presumably may branch to .Lxtsdecskiptw */ |
|---|
| 402 | 535 | cbz w7, .Lxtsdecnotfirst /* w7 == 0: v4 is not encrypted here */ |
|---|
| 403 | 536 | |
|---|
| 404 | 537 | enc_prepare w3, x5, x8 /* note: key at x5 here, key at x2 for dec_prepare below */ |
|---|
| 405 | 538 | encrypt_block v4, w3, x5, x8, w7 /* first tweak */ |
|---|
| 539 | +.Lxtsdecskiptw: |
|---|
| 406 | 540 | dec_prepare w3, x2, x8 |
|---|
| 407 | | - ldr q7, .Lxts_mul_x |
|---|
| 408 | 541 | b .LxtsdecNx /* enter the loop past its leading next_tweak */ |
|---|
| 409 | 542 | |
|---|
| 410 | | -.Lxtsdecrestart: |
|---|
| 411 | | - ld1 {v4.16b}, [x24] |
|---|
| 412 | 543 | .Lxtsdecnotfirst: |
|---|
| 413 | | - dec_prepare w22, x21, x8 |
|---|
| 544 | + dec_prepare w3, x2, x8 |
|---|
| 414 | 545 | .LxtsdecloopNx: |
|---|
| 415 | | - ldr q7, .Lxts_mul_x |
|---|
| 416 | | - next_tweak v4, v4, v7, v8 |
|---|
| 546 | + next_tweak v4, v4, v8 |
|---|
| 417 | 547 | .LxtsdecNx: |
|---|
| 418 | | - subs w23, w23, #4 |
|---|
| 548 | + subs w4, w4, #64 /* at least 64 bytes (4 blocks) left? */ |
|---|
| 419 | 549 | bmi .Lxtsdec1x /* no: handle the rest one block at a time */ |
|---|
| 420 | | - ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 ct blocks */ |
|---|
| 421 | | - next_tweak v5, v4, v7, v8 |
|---|
| 550 | + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ |
|---|
| 551 | + next_tweak v5, v4, v8 /* v4..v7 end up as the tweaks for v0..v3 */ |
|---|
| 422 | 552 | eor v0.16b, v0.16b, v4.16b |
|---|
| 423 | | - next_tweak v6, v5, v7, v8 |
|---|
| 553 | + next_tweak v6, v5, v8 |
|---|
| 424 | 554 | eor v1.16b, v1.16b, v5.16b |
|---|
| 425 | 555 | eor v2.16b, v2.16b, v6.16b |
|---|
| 426 | | - next_tweak v7, v6, v7, v8 |
|---|
| 556 | + next_tweak v7, v6, v8 |
|---|
| 427 | 557 | eor v3.16b, v3.16b, v7.16b |
|---|
| 428 | 558 | bl aes_decrypt_block4x /* operates on v0-v3 */ |
|---|
| 429 | 559 | eor v3.16b, v3.16b, v7.16b /* xor the same tweaks into the results */ |
|---|
| 430 | 560 | eor v0.16b, v0.16b, v4.16b |
|---|
| 431 | 561 | eor v1.16b, v1.16b, v5.16b |
|---|
| 432 | 562 | eor v2.16b, v2.16b, v6.16b |
|---|
| 433 | | - st1 {v0.16b-v3.16b}, [x19], #64 |
|---|
| 563 | + st1 {v0.16b-v3.16b}, [x0], #64 |
|---|
| 434 | 564 | mov v4.16b, v7.16b /* v4 = last tweak consumed */ |
|---|
| 435 | | - cbz w23, .Lxtsdecout |
|---|
| 436 | | - st1 {v4.16b}, [x24] |
|---|
| 437 | | - cond_yield_neon .Lxtsdecrestart |
|---|
| 565 | + cbz w4, .Lxtsdecout /* byte counter reached zero: done */ |
|---|
| 566 | + xts_reload_mask v8 |
|---|
| 438 | 567 | b .LxtsdecloopNx |
|---|
| 439 | 568 | .Lxtsdec1x: |
|---|
| 440 | | - adds w23, w23, #4 |
|---|
| 569 | + adds w4, w4, #64 /* undo the subs at .LxtsdecNx */ |
|---|
| 441 | 570 | beq .Lxtsdecout /* nothing left */ |
|---|
| 571 | + subs w4, w4, #16 /* flags are consumed by the bmi after the load */ |
|---|
| 442 | 572 | .Lxtsdecloop: |
|---|
| 443 | | - ld1 {v1.16b}, [x20], #16 |
|---|
| 444 | | - eor v0.16b, v1.16b, v4.16b |
|---|
| 445 | | - decrypt_block v0, w22, x21, x8, w7 |
|---|
| 573 | + ld1 {v0.16b}, [x1], #16 |
|---|
| 574 | + bmi .Lxtsdeccts /* w4 went negative: take the CTS path */ |
|---|
| 575 | +.Lxtsdecctsout: |
|---|
| 446 | 576 | eor v0.16b, v0.16b, v4.16b |
|---|
| 447 | | - st1 {v0.16b}, [x19], #16 |
|---|
| 448 | | - subs w23, w23, #1 |
|---|
| 449 | | - beq .Lxtsdecout |
|---|
| 450 | | - next_tweak v4, v4, v7, v8 |
|---|
| 577 | + decrypt_block v0, w3, x2, x8, w7 |
|---|
| 578 | + eor v0.16b, v0.16b, v4.16b |
|---|
| 579 | + st1 {v0.16b}, [x0], #16 |
|---|
| 580 | + cbz w4, .Lxtsdecout /* byte counter reached zero: done */ |
|---|
| 581 | + subs w4, w4, #16 /* sets the flags for the bmi in the next iteration */ |
|---|
| 582 | + next_tweak v4, v4, v8 |
|---|
| 451 | 583 | b .Lxtsdecloop |
|---|
| 452 | 584 | .Lxtsdecout: |
|---|
| 453 | | - st1 {v4.16b}, [x24] |
|---|
| 454 | | - frame_pop |
|---|
| 585 | + st1 {v4.16b}, [x6] /* write v4 back to [x6] */ |
|---|
| 586 | + ldp x29, x30, [sp], #16 |
|---|
| 455 | 587 | ret |
|---|
| 456 | | -AES_ENDPROC(aes_xts_decrypt) |
|---|
| 588 | + |
|---|
| 589 | +.Lxtsdeccts: |
|---|
| 590 | + adr_l x8, .Lcts_permute_table |
|---|
| 591 | + |
|---|
| 592 | + add x1, x1, w4, sxtw /* rewind input pointer */ |
|---|
| 593 | + add w4, w4, #16 /* # bytes in final block */ |
|---|
| 594 | + add x9, x8, #32 /* x9 = table + 32 */ |
|---|
| 595 | + add x8, x8, x4 /* x8 = table + # bytes in final block */ |
|---|
| 596 | + sub x9, x9, x4 /* x9 = table + 32 - # bytes in final block */ |
|---|
| 597 | + add x4, x0, x4 /* output address of final block */ |
|---|
| 598 | + |
|---|
| 599 | + next_tweak v5, v4, v8 /* this block uses v5; v4 is kept for .Lxtsdecctsout */ |
|---|
| 600 | + |
|---|
| 601 | + ld1 {v1.16b}, [x1] /* load final block */ |
|---|
| 602 | + ld1 {v2.16b}, [x8] /* index vectors for the tbl/tbx below */ |
|---|
| 603 | + ld1 {v3.16b}, [x9] |
|---|
| 604 | + |
|---|
| 605 | + eor v0.16b, v0.16b, v5.16b |
|---|
| 606 | + decrypt_block v0, w3, x2, x8, w7 |
|---|
| 607 | + eor v0.16b, v0.16b, v5.16b |
|---|
| 608 | + |
|---|
| 609 | + tbl v2.16b, {v0.16b}, v2.16b /* v2 = bytes of v0 selected by the indices loaded from [x8] */ |
|---|
| 610 | + tbx v0.16b, {v1.16b}, v3.16b /* insert bytes of v1 into v0 per the indices loaded from [x9] */ |
|---|
| 611 | + |
|---|
| 612 | + st1 {v2.16b}, [x4] /* overlapping stores */ |
|---|
| 613 | + mov w4, wzr /* w4 = 0 so the cbz after the next store exits */ |
|---|
| 614 | + b .Lxtsdecctsout /* process v0 once more, now with tweak v4 */ |
|---|
| 615 | +AES_FUNC_END(aes_xts_decrypt) |
|---|
| 457 | 616 | |
|---|
| 458 | 617 | /* |
|---|
| 459 | 618 | * aes_mac_update(u8 const in[], u32 const rk[], int rounds, |
|---|
| 460 | 619 | * int blocks, u8 dg[], int enc_before, int enc_after) |
|---|
| 461 | 620 | */ |
|---|
| 462 | | -AES_ENTRY(aes_mac_update) |
|---|
| 463 | | - frame_push 6 |
|---|
| 464 | | - |
|---|
| 465 | | - mov x19, x0 |
|---|
| 466 | | - mov x20, x1 |
|---|
| 467 | | - mov x21, x2 |
|---|
| 468 | | - mov x22, x3 |
|---|
| 469 | | - mov x23, x4 |
|---|
| 470 | | - mov x24, x6 |
|---|
| 471 | | - |
|---|
| 472 | | - ld1 {v0.16b}, [x23] /* get dg */ |
|---|
| 621 | +AES_FUNC_START(aes_mac_update) /* x0 = in, x1 = rk, w2 = rounds, w3 = blocks, x4 = dg, w5 = enc_before, w6 = enc_after */ |
|---|
| 622 | + ld1 {v0.16b}, [x4] /* get dg */ |
|---|
| 473 | 623 | enc_prepare w2, x1, x7 |
|---|
| 474 | 624 | cbz w5, .Lmacloop4x /* enc_before == 0: do not encrypt dg first */ |
|---|
| 475 | 625 | |
|---|
| 476 | 626 | encrypt_block v0, w2, x1, x7, w8 |
|---|
| 477 | 627 | |
|---|
| 478 | 628 | .Lmacloop4x: |
|---|
| 479 | | - subs w22, w22, #4 |
|---|
| 629 | + subs w3, w3, #4 /* at least 4 blocks left? */ |
|---|
| 480 | 630 | bmi .Lmac1x /* no: finish block by block */ |
|---|
| 481 | | - ld1 {v1.16b-v4.16b}, [x19], #64 /* get next pt block */ |
|---|
| 631 | + ld1 {v1.16b-v4.16b}, [x0], #64 /* get next pt block */ |
|---|
| 482 | 632 | eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */ |
|---|
| 483 | | - encrypt_block v0, w21, x20, x7, w8 |
|---|
| 633 | + encrypt_block v0, w2, x1, x7, w8 |
|---|
| 484 | 634 | eor v0.16b, v0.16b, v2.16b |
|---|
| 485 | | - encrypt_block v0, w21, x20, x7, w8 |
|---|
| 635 | + encrypt_block v0, w2, x1, x7, w8 |
|---|
| 486 | 636 | eor v0.16b, v0.16b, v3.16b |
|---|
| 487 | | - encrypt_block v0, w21, x20, x7, w8 |
|---|
| 637 | + encrypt_block v0, w2, x1, x7, w8 |
|---|
| 488 | 638 | eor v0.16b, v0.16b, v4.16b |
|---|
| 489 | | - cmp w22, wzr |
|---|
| 490 | | - csinv x5, x24, xzr, eq |
|---|
| 639 | + cmp w3, wzr |
|---|
| 640 | + csinv x5, x6, xzr, eq /* x5 = enc_after if no blocks are left, else all ones */ |
|---|
| 491 | 641 | cbz w5, .Lmacout /* no blocks left and enc_after == 0: skip the final encryption */ |
|---|
| 492 | | - encrypt_block v0, w21, x20, x7, w8 |
|---|
| 493 | | - st1 {v0.16b}, [x23] /* return dg */ |
|---|
| 494 | | - cond_yield_neon .Lmacrestart |
|---|
| 642 | + encrypt_block v0, w2, x1, x7, w8 |
|---|
| 643 | + st1 {v0.16b}, [x4] /* return dg */ |
|---|
| 644 | + cond_yield .Lmacout, x7, x8 /* macro defined elsewhere; presumably branches to .Lmacout when a yield is due */ |
|---|
| 495 | 645 | b .Lmacloop4x |
|---|
| 496 | 646 | .Lmac1x: |
|---|
| 497 | | - add w22, w22, #4 |
|---|
| 647 | + add w3, w3, #4 /* undo the subs at .Lmacloop4x */ |
|---|
| 498 | 648 | .Lmacloop: |
|---|
| 499 | | - cbz w22, .Lmacout |
|---|
| 500 | | - ld1 {v1.16b}, [x19], #16 /* get next pt block */ |
|---|
| 649 | + cbz w3, .Lmacout |
|---|
| 650 | + ld1 {v1.16b}, [x0], #16 /* get next pt block */ |
|---|
| 501 | 651 | eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */ |
|---|
| 502 | 652 | |
|---|
| 503 | | - subs w22, w22, #1 |
|---|
| 504 | | - csinv x5, x24, xzr, eq |
|---|
| 653 | + subs w3, w3, #1 |
|---|
| 654 | + csinv x5, x6, xzr, eq /* same final-block test as in the 4x loop */ |
|---|
| 505 | 655 | cbz w5, .Lmacout |
|---|
| 506 | 656 | |
|---|
| 507 | 657 | .Lmacenc: |
|---|
| 508 | | - encrypt_block v0, w21, x20, x7, w8 |
|---|
| 658 | + encrypt_block v0, w2, x1, x7, w8 |
|---|
| 509 | 659 | b .Lmacloop |
|---|
| 510 | 660 | |
|---|
| 511 | 661 | .Lmacout: |
|---|
| 512 | | - st1 {v0.16b}, [x23] /* return dg */ |
|---|
| 513 | | - frame_pop |
|---|
| 662 | + st1 {v0.16b}, [x4] /* return dg */ |
|---|
| 663 | + mov w0, w3 /* return value: block count still left in w3 */ |
|---|
| 514 | 664 | ret |
|---|
| 515 | | - |
|---|
| 516 | | -.Lmacrestart: |
|---|
| 517 | | - ld1 {v0.16b}, [x23] /* get dg */ |
|---|
| 518 | | - enc_prepare w21, x20, x0 |
|---|
| 519 | | - b .Lmacloop4x |
|---|
| 520 | | -AES_ENDPROC(aes_mac_update) |
|---|
| 665 | +AES_FUNC_END(aes_mac_update) |
|---|