.. | .. |
---|
120 | 120 | ## |
---|
121 | 121 | |
---|
122 | 122 | #include <linux/linkage.h> |
---|
123 | | -#include <asm/inst.h> |
---|
124 | 123 | |
---|
125 | 124 | # constants in mergeable sections, linker can reorder and merge |
---|
126 | 125 | .section .rodata.cst16.POLY, "aM", @progbits, 16 |
---|
.. | .. |
---|
182 | 181 | .text |
---|
183 | 182 | |
---|
184 | 183 | |
---|
185 | | -##define the fields of the gcm aes context |
---|
186 | | -#{ |
---|
187 | | -# u8 expanded_keys[16*11] store expanded keys |
---|
188 | | -# u8 shifted_hkey_1[16] store HashKey <<1 mod poly here |
---|
189 | | -# u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here |
---|
190 | | -# u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here |
---|
191 | | -# u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here |
---|
192 | | -# u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here |
---|
193 | | -# u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here |
---|
194 | | -# u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here |
---|
195 | | -# u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here |
---|
196 | | -# u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes) |
---|
197 | | -# u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes) |
---|
198 | | -# u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes) |
---|
199 | | -# u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes) |
---|
200 | | -# u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes) |
---|
201 | | -# u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes) |
---|
202 | | -# u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes) |
---|
203 | | -# u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes) |
---|
204 | | -#} gcm_ctx# |
---|
| 184 | +#define AadHash 16*0 |
---|
| 185 | +#define AadLen 16*1 |
---|
| 186 | +#define InLen (16*1)+8 |
---|
| 187 | +#define PBlockEncKey 16*2 |
---|
| 188 | +#define OrigIV 16*3 |
---|
| 189 | +#define CurCount 16*4 |
---|
| 190 | +#define PBlockLen 16*5 |
---|
205 | 191 | |
---|
206 | | -HashKey = 16*11 # store HashKey <<1 mod poly here |
---|
207 | | -HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here |
---|
208 | | -HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here |
---|
209 | | -HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here |
---|
210 | | -HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here |
---|
211 | | -HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here |
---|
212 | | -HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here |
---|
213 | | -HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here |
---|
214 | | -HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) |
---|
215 | | -HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes) |
---|
216 | | -HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) |
---|
217 | | -HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) |
---|
218 | | -HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) |
---|
219 | | -HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) |
---|
220 | | -HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) |
---|
221 | | -HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) |
---|
| 192 | +HashKey = 16*6 # store HashKey <<1 mod poly here |
---|
| 193 | +HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here |
---|
| 194 | +HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here |
---|
| 195 | +HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here |
---|
| 196 | +HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here |
---|
| 197 | +HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here |
---|
| 198 | +HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here |
---|
| 199 | +HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here |
---|
| 200 | +HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) |
---|
| 201 | +HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes) |
---|
| 202 | +HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) |
---|
| 203 | +HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) |
---|
| 204 | +HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) |
---|
| 205 | +HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) |
---|
| 206 | +HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) |
---|
| 207 | +HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) |
---|
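The offsets above no longer sit at the tail of the expanded-key area addressed through arg1; they index a separate per-request context passed in arg2. Below is a minimal C sketch of the layout these defines imply. The field names are illustrative only (the glue code's actual structure, gcm_context_data, may name and pad things differently); the static asserts merely confirm the sketch reproduces the offsets used here.

    #include <stddef.h>
    #include <stdint.h>

    /* Illustrative layout only; not the kernel's exact struct definition. */
    struct gcm_ctx_sketch {
        uint8_t  aad_hash[16];              /* AadHash       = 16*0       */
        uint64_t aad_length;                /* AadLen        = 16*1       */
        uint64_t in_length;                 /* InLen         = (16*1)+8   */
        uint8_t  partial_block_enc_key[16]; /* PBlockEncKey  = 16*2       */
        uint8_t  orig_iv[16];               /* OrigIV        = 16*3       */
        uint8_t  current_counter[16];       /* CurCount      = 16*4       */
        uint64_t partial_block_len;         /* PBlockLen     = 16*5       */
        uint64_t padding;                   /* keeps the keys 16B-aligned */
        uint8_t  hash_keys[16][16];         /* HashKey .. HashKey_8_k     */
    };

    _Static_assert(offsetof(struct gcm_ctx_sketch, current_counter) == 16*4,
                   "CurCount offset");
    _Static_assert(offsetof(struct gcm_ctx_sketch, hash_keys) == 16*6,
                   "HashKey offset");
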
222 | 208 | |
---|
223 | 209 | #define arg1 %rdi |
---|
224 | 210 | #define arg2 %rsi |
---|
.. | .. |
---|
229 | 215 | #define arg7 STACK_OFFSET+8*1(%r14) |
---|
230 | 216 | #define arg8 STACK_OFFSET+8*2(%r14) |
---|
231 | 217 | #define arg9 STACK_OFFSET+8*3(%r14) |
---|
| 218 | +#define arg10 STACK_OFFSET+8*4(%r14) |
---|
| 219 | +#define keysize 2*15*16(arg1) |
---|
232 | 220 | |
---|
233 | 221 | i = 0 |
---|
234 | 222 | j = 0 |
---|
.. | .. |
---|
267 | 255 | # Utility Macros |
---|
268 | 256 | ################################ |
---|
269 | 257 | |
---|
270 | | -# Encryption of a single block |
---|
271 | | -.macro ENCRYPT_SINGLE_BLOCK XMM0 |
---|
272 | | - vpxor (arg1), \XMM0, \XMM0 |
---|
273 | | - i = 1 |
---|
274 | | - setreg |
---|
275 | | -.rep 9 |
---|
276 | | - vaesenc 16*i(arg1), \XMM0, \XMM0 |
---|
277 | | - i = (i+1) |
---|
278 | | - setreg |
---|
279 | | -.endr |
---|
280 | | - vaesenclast 16*10(arg1), \XMM0, \XMM0 |
---|
| 258 | +.macro FUNC_SAVE |
---|
| 259 | + #the number of pushes must equal STACK_OFFSET |
---|
| 260 | + push %r12 |
---|
| 261 | + push %r13 |
---|
| 262 | + push %r14 |
---|
| 263 | + push %r15 |
---|
| 264 | + |
---|
| 265 | + mov %rsp, %r14 |
---|
| 266 | + |
---|
| 267 | + |
---|
| 268 | + |
---|
| 269 | + sub $VARIABLE_OFFSET, %rsp |
---|
| 270 | + and $~63, %rsp # align rsp to 64 bytes |
---|
281 | 271 | .endm |
---|
282 | 272 | |
---|
283 | | -#ifdef CONFIG_AS_AVX |
---|
| 273 | +.macro FUNC_RESTORE |
---|
| 274 | + mov %r14, %rsp |
---|
| 275 | + |
---|
| 276 | + pop %r15 |
---|
| 277 | + pop %r14 |
---|
| 278 | + pop %r13 |
---|
| 279 | + pop %r12 |
---|
| 280 | +.endm |
---|
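FUNC_SAVE and FUNC_RESTORE factor out the prologue/epilogue the entry points previously open-coded: the callee-saved registers are pushed, the original %rsp is kept in %r14, and a 64-byte-aligned scratch area of VARIABLE_OFFSET bytes is carved out. The scratch-area address works out to the following (a sketch of the pointer arithmetic only):

    #include <stdint.h>

    /* After FUNC_SAVE, the scratch area starts at the original stack pointer
     * minus VARIABLE_OFFSET, rounded down to a 64-byte boundary
     * (sub $VARIABLE_OFFSET, %rsp ; and $~63, %rsp). */
    static uintptr_t aligned_scratch(uintptr_t saved_rsp, uintptr_t variable_offset)
    {
        return (saved_rsp - variable_offset) & ~(uintptr_t)63;
    }
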
| 281 | + |
---|
| 282 | +# Encryption of a single block |
---|
| 283 | +.macro ENCRYPT_SINGLE_BLOCK REP XMM0 |
---|
| 284 | + vpxor (arg1), \XMM0, \XMM0 |
---|
| 285 | + i = 1 |
---|
| 286 | + setreg |
---|
| 287 | +.rep \REP |
---|
| 288 | + vaesenc 16*i(arg1), \XMM0, \XMM0 |
---|
| 289 | + i = (i+1) |
---|
| 290 | + setreg |
---|
| 291 | +.endr |
---|
| 292 | + vaesenclast 16*i(arg1), \XMM0, \XMM0 |
---|
| 293 | +.endm |
---|
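ENCRYPT_SINGLE_BLOCK now takes the round count as a parameter instead of hard-coding nine rounds plus a final round key at 16*10, so one macro serves AES-128/192/256. A rough equivalent with AES-NI intrinsics is sketched below; round_keys is assumed to hold the expanded key schedule, as the assembly reads it from arg1, and rounds corresponds to REP (9, 11 or 13).

    #include <wmmintrin.h>

    /* One whitening XOR, `rounds` AESENC rounds, then AESENCLAST with the
     * final round key. */
    static __m128i encrypt_single_block(__m128i block, const __m128i *round_keys,
                                        int rounds)
    {
        block = _mm_xor_si128(block, round_keys[0]);        /* vpxor (arg1) */
        for (int i = 1; i <= rounds; i++)                   /* .rep \REP    */
            block = _mm_aesenc_si128(block, round_keys[i]);
        return _mm_aesenclast_si128(block, round_keys[rounds + 1]);
    }
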
| 294 | + |
---|
| 295 | +# combined for GCM encrypt and decrypt functions |
---|
| 296 | +# clobbering all xmm registers |
---|
| 297 | +# clobbering r10, r11, r12, r13, r14, r15 |
---|
| 298 | +.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP |
---|
| 299 | + vmovdqu AadHash(arg2), %xmm8 |
---|
| 300 | + vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey |
---|
| 301 | + add arg5, InLen(arg2) |
---|
| 302 | + |
---|
| 303 | + # initialize the data pointer offset as zero |
---|
| 304 | + xor %r11d, %r11d |
---|
| 305 | + |
---|
| 306 | + PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC |
---|
| 307 | + sub %r11, arg5 |
---|
| 308 | + |
---|
| 309 | + mov arg5, %r13 # save the number of bytes of plaintext/ciphertext |
---|
| 310 | + and $-16, %r13 # r13 = r13 - (r13 mod 16) |
---|
| 311 | + |
---|
| 312 | + mov %r13, %r12 |
---|
| 313 | + shr $4, %r12 |
---|
| 314 | + and $7, %r12 |
---|
| 315 | + jz _initial_num_blocks_is_0\@ |
---|
| 316 | + |
---|
| 317 | + cmp $7, %r12 |
---|
| 318 | + je _initial_num_blocks_is_7\@ |
---|
| 319 | + cmp $6, %r12 |
---|
| 320 | + je _initial_num_blocks_is_6\@ |
---|
| 321 | + cmp $5, %r12 |
---|
| 322 | + je _initial_num_blocks_is_5\@ |
---|
| 323 | + cmp $4, %r12 |
---|
| 324 | + je _initial_num_blocks_is_4\@ |
---|
| 325 | + cmp $3, %r12 |
---|
| 326 | + je _initial_num_blocks_is_3\@ |
---|
| 327 | + cmp $2, %r12 |
---|
| 328 | + je _initial_num_blocks_is_2\@ |
---|
| 329 | + |
---|
| 330 | + jmp _initial_num_blocks_is_1\@ |
---|
| 331 | + |
---|
| 332 | +_initial_num_blocks_is_7\@: |
---|
| 333 | + \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC |
---|
| 334 | + sub $16*7, %r13 |
---|
| 335 | + jmp _initial_blocks_encrypted\@ |
---|
| 336 | + |
---|
| 337 | +_initial_num_blocks_is_6\@: |
---|
| 338 | + \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC |
---|
| 339 | + sub $16*6, %r13 |
---|
| 340 | + jmp _initial_blocks_encrypted\@ |
---|
| 341 | + |
---|
| 342 | +_initial_num_blocks_is_5\@: |
---|
| 343 | + \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC |
---|
| 344 | + sub $16*5, %r13 |
---|
| 345 | + jmp _initial_blocks_encrypted\@ |
---|
| 346 | + |
---|
| 347 | +_initial_num_blocks_is_4\@: |
---|
| 348 | + \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC |
---|
| 349 | + sub $16*4, %r13 |
---|
| 350 | + jmp _initial_blocks_encrypted\@ |
---|
| 351 | + |
---|
| 352 | +_initial_num_blocks_is_3\@: |
---|
| 353 | + \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC |
---|
| 354 | + sub $16*3, %r13 |
---|
| 355 | + jmp _initial_blocks_encrypted\@ |
---|
| 356 | + |
---|
| 357 | +_initial_num_blocks_is_2\@: |
---|
| 358 | + \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC |
---|
| 359 | + sub $16*2, %r13 |
---|
| 360 | + jmp _initial_blocks_encrypted\@ |
---|
| 361 | + |
---|
| 362 | +_initial_num_blocks_is_1\@: |
---|
| 363 | + \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC |
---|
| 364 | + sub $16*1, %r13 |
---|
| 365 | + jmp _initial_blocks_encrypted\@ |
---|
| 366 | + |
---|
| 367 | +_initial_num_blocks_is_0\@: |
---|
| 368 | + \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC |
---|
| 369 | + |
---|
| 370 | + |
---|
| 371 | +_initial_blocks_encrypted\@: |
---|
| 372 | + test %r13, %r13 |
---|
| 373 | + je _zero_cipher_left\@ |
---|
| 374 | + |
---|
| 375 | + sub $128, %r13 |
---|
| 376 | + je _eight_cipher_left\@ |
---|
| 377 | + |
---|
| 378 | + |
---|
| 379 | + |
---|
| 380 | + |
---|
| 381 | + vmovd %xmm9, %r15d |
---|
| 382 | + and $255, %r15d |
---|
| 383 | + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 |
---|
| 384 | + |
---|
| 385 | + |
---|
| 386 | +_encrypt_by_8_new\@: |
---|
| 387 | + cmp $(255-8), %r15d |
---|
| 388 | + jg _encrypt_by_8\@ |
---|
| 389 | + |
---|
| 390 | + |
---|
| 391 | + |
---|
| 392 | + add $8, %r15b |
---|
| 393 | + \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC |
---|
| 394 | + add $128, %r11 |
---|
| 395 | + sub $128, %r13 |
---|
| 396 | + jne _encrypt_by_8_new\@ |
---|
| 397 | + |
---|
| 398 | + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 |
---|
| 399 | + jmp _eight_cipher_left\@ |
---|
| 400 | + |
---|
| 401 | +_encrypt_by_8\@: |
---|
| 402 | + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 |
---|
| 403 | + add $8, %r15b |
---|
| 404 | + \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC |
---|
| 405 | + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 |
---|
| 406 | + add $128, %r11 |
---|
| 407 | + sub $128, %r13 |
---|
| 408 | + jne _encrypt_by_8_new\@ |
---|
| 409 | + |
---|
| 410 | + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 |
---|
| 411 | + |
---|
| 412 | + |
---|
| 413 | + |
---|
| 414 | + |
---|
| 415 | +_eight_cipher_left\@: |
---|
| 416 | + \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 |
---|
| 417 | + |
---|
| 418 | + |
---|
| 419 | +_zero_cipher_left\@: |
---|
| 420 | + vmovdqu %xmm14, AadHash(arg2) |
---|
| 421 | + vmovdqu %xmm9, CurCount(arg2) |
---|
| 422 | + |
---|
| 423 | + # check for 0 length |
---|
| 424 | + mov arg5, %r13 |
---|
| 425 | + and $15, %r13 # r13 = (arg5 mod 16) |
---|
| 426 | + |
---|
| 427 | + je _multiple_of_16_bytes\@ |
---|
| 428 | + |
---|
| 429 | + # handle the last <16 Byte block separately |
---|
| 430 | + |
---|
| 431 | + mov %r13, PBlockLen(arg2) |
---|
| 432 | + |
---|
| 433 | + vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn |
---|
| 434 | + vmovdqu %xmm9, CurCount(arg2) |
---|
| 435 | + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 |
---|
| 436 | + |
---|
| 437 | + ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn) |
---|
| 438 | + vmovdqu %xmm9, PBlockEncKey(arg2) |
---|
| 439 | + |
---|
| 440 | + cmp $16, arg5 |
---|
| 441 | + jge _large_enough_update\@ |
---|
| 442 | + |
---|
| 443 | + lea (arg4,%r11,1), %r10 |
---|
| 444 | + mov %r13, %r12 |
---|
| 445 | + |
---|
| 446 | + READ_PARTIAL_BLOCK %r10 %r12 %xmm1 |
---|
| 447 | + |
---|
| 448 | + lea SHIFT_MASK+16(%rip), %r12 |
---|
| 449 | + sub %r13, %r12 # adjust the shuffle mask pointer to be |
---|
| 450 | + # able to shift 16-r13 bytes (r13 is the |
---|
| 451 | + # number of bytes in plaintext mod 16) |
---|
| 452 | + |
---|
| 453 | + jmp _final_ghash_mul\@ |
---|
| 454 | + |
---|
| 455 | +_large_enough_update\@: |
---|
| 456 | + sub $16, %r11 |
---|
| 457 | + add %r13, %r11 |
---|
| 458 | + |
---|
| 459 | + # receive the last <16 Byte block |
---|
| 460 | + vmovdqu (arg4, %r11, 1), %xmm1 |
---|
| 461 | + |
---|
| 462 | + sub %r13, %r11 |
---|
| 463 | + add $16, %r11 |
---|
| 464 | + |
---|
| 465 | + lea SHIFT_MASK+16(%rip), %r12 |
---|
| 466 | + # adjust the shuffle mask pointer to be able to shift 16-r13 bytes |
---|
| 467 | + # (r13 is the number of bytes in plaintext mod 16) |
---|
| 468 | + sub %r13, %r12 |
---|
| 469 | + # get the appropriate shuffle mask |
---|
| 470 | + vmovdqu (%r12), %xmm2 |
---|
| 471 | + # shift right 16-r13 bytes |
---|
| 472 | + vpshufb %xmm2, %xmm1, %xmm1 |
---|
| 473 | + |
---|
| 474 | +_final_ghash_mul\@: |
---|
| 475 | + .if \ENC_DEC == DEC |
---|
| 476 | + vmovdqa %xmm1, %xmm2 |
---|
| 477 | + vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) |
---|
| 478 | + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to |
---|
| 479 | + # mask out top 16-r13 bytes of xmm9 |
---|
| 480 | + vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 |
---|
| 481 | + vpand %xmm1, %xmm2, %xmm2 |
---|
| 482 | + vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 |
---|
| 483 | + vpxor %xmm2, %xmm14, %xmm14 |
---|
| 484 | + |
---|
| 485 | + vmovdqu %xmm14, AadHash(arg2) |
---|
| 486 | + .else |
---|
| 487 | + vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) |
---|
| 488 | + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to |
---|
| 489 | + # mask out top 16-r13 bytes of xmm9 |
---|
| 490 | + vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 |
---|
| 491 | + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 |
---|
| 492 | + vpxor %xmm9, %xmm14, %xmm14 |
---|
| 493 | + |
---|
| 494 | + vmovdqu %xmm14, AadHash(arg2) |
---|
| 495 | + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext |
---|
| 496 | + .endif |
---|
| 497 | + |
---|
| 498 | + |
---|
| 499 | + ############################# |
---|
| 500 | + # output r13 Bytes |
---|
| 501 | + vmovq %xmm9, %rax |
---|
| 502 | + cmp $8, %r13 |
---|
| 503 | + jle _less_than_8_bytes_left\@ |
---|
| 504 | + |
---|
| 505 | + mov %rax, (arg3 , %r11) |
---|
| 506 | + add $8, %r11 |
---|
| 507 | + vpsrldq $8, %xmm9, %xmm9 |
---|
| 508 | + vmovq %xmm9, %rax |
---|
| 509 | + sub $8, %r13 |
---|
| 510 | + |
---|
| 511 | +_less_than_8_bytes_left\@: |
---|
| 512 | + movb %al, (arg3 , %r11) |
---|
| 513 | + add $1, %r11 |
---|
| 514 | + shr $8, %rax |
---|
| 515 | + sub $1, %r13 |
---|
| 516 | + jne _less_than_8_bytes_left\@ |
---|
| 517 | + ############################# |
---|
| 518 | + |
---|
| 519 | +_multiple_of_16_bytes\@: |
---|
| 520 | +.endm |
---|
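GCM_ENC_DEC is now a shared macro parameterized by the single-block/8-block helpers, the round count (REP) and the direction (ENC_DEC). Its control flow first drains any partial block left over from a previous update call, then handles (len/16) mod 8 "initial" blocks, then full 8-block batches, and finally stashes a sub-16-byte tail in PBlockLen for the next call. The partitioning amounts to the sketch below, where len is the byte count remaining after PARTIAL_BLOCK.

    #include <stddef.h>

    static void split_update(size_t len, size_t *initial_blocks,
                             size_t *eight_block_batches, size_t *tail_bytes)
    {
        size_t full_blocks = len / 16;

        *initial_blocks      = full_blocks % 8;                   /* 0..7  */
        *eight_block_batches = (full_blocks - *initial_blocks) / 8;
        *tail_bytes          = len % 16;   /* stashed via PBlockLen, 0..15 */
    }
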
| 521 | + |
---|
| 522 | + |
---|
| 523 | +# GCM_COMPLETE: Finishes the tag computation, folding in any remaining partial block |
---|
| 524 | +# Output: Authentication Tag (AUTH_TAG) |
---|
| 525 | +# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 |
---|
| 526 | +.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN |
---|
| 527 | + vmovdqu AadHash(arg2), %xmm14 |
---|
| 528 | + vmovdqu HashKey(arg2), %xmm13 |
---|
| 529 | + |
---|
| 530 | + mov PBlockLen(arg2), %r12 |
---|
| 531 | + test %r12, %r12 |
---|
| 532 | + je _partial_done\@ |
---|
| 533 | + |
---|
| 534 | + #GHASH computation for the last <16 Byte block |
---|
| 535 | + \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 |
---|
| 536 | + |
---|
| 537 | +_partial_done\@: |
---|
| 538 | + mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes) |
---|
| 539 | + shl $3, %r12 # convert into number of bits |
---|
| 540 | + vmovd %r12d, %xmm15 # len(A) in xmm15 |
---|
| 541 | + |
---|
| 542 | + mov InLen(arg2), %r12 |
---|
| 543 | + shl $3, %r12 # len(C) in bits (*128) |
---|
| 544 | + vmovq %r12, %xmm1 |
---|
| 545 | + vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 |
---|
| 546 | + vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) |
---|
| 547 | + |
---|
| 548 | + vpxor %xmm15, %xmm14, %xmm14 |
---|
| 549 | + \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation |
---|
| 550 | + vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap |
---|
| 551 | + |
---|
| 552 | + vmovdqu OrigIV(arg2), %xmm9 |
---|
| 553 | + |
---|
| 554 | + ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0) |
---|
| 555 | + |
---|
| 556 | + vpxor %xmm14, %xmm9, %xmm9 |
---|
| 557 | + |
---|
| 558 | + |
---|
| 559 | + |
---|
| 560 | +_return_T\@: |
---|
| 561 | + mov \AUTH_TAG, %r10 # r10 = authTag |
---|
| 562 | + mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len |
---|
| 563 | + |
---|
| 564 | + cmp $16, %r11 |
---|
| 565 | + je _T_16\@ |
---|
| 566 | + |
---|
| 567 | + cmp $8, %r11 |
---|
| 568 | + jl _T_4\@ |
---|
| 569 | + |
---|
| 570 | +_T_8\@: |
---|
| 571 | + vmovq %xmm9, %rax |
---|
| 572 | + mov %rax, (%r10) |
---|
| 573 | + add $8, %r10 |
---|
| 574 | + sub $8, %r11 |
---|
| 575 | + vpsrldq $8, %xmm9, %xmm9 |
---|
| 576 | + test %r11, %r11 |
---|
| 577 | + je _return_T_done\@ |
---|
| 578 | +_T_4\@: |
---|
| 579 | + vmovd %xmm9, %eax |
---|
| 580 | + mov %eax, (%r10) |
---|
| 581 | + add $4, %r10 |
---|
| 582 | + sub $4, %r11 |
---|
| 583 | + vpsrldq $4, %xmm9, %xmm9 |
---|
| 584 | + test %r11, %r11 |
---|
| 585 | + je _return_T_done\@ |
---|
| 586 | +_T_123\@: |
---|
| 587 | + vmovd %xmm9, %eax |
---|
| 588 | + cmp $2, %r11 |
---|
| 589 | + jl _T_1\@ |
---|
| 590 | + mov %ax, (%r10) |
---|
| 591 | + cmp $2, %r11 |
---|
| 592 | + je _return_T_done\@ |
---|
| 593 | + add $2, %r10 |
---|
| 594 | + sar $16, %eax |
---|
| 595 | +_T_1\@: |
---|
| 596 | + mov %al, (%r10) |
---|
| 597 | + jmp _return_T_done\@ |
---|
| 598 | + |
---|
| 599 | +_T_16\@: |
---|
| 600 | + vmovdqu %xmm9, (%r10) |
---|
| 601 | + |
---|
| 602 | +_return_T_done\@: |
---|
| 603 | +.endm |
---|
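GCM_COMPLETE folds the bit lengths of the AAD and of the ciphertext into the hash as one final 16-byte block, multiplies by the hash key once more, and XORs the result with E(K, Y0) to form the tag. A sketch of how that length block is assembled on a little-endian host, before the final SHUF_MASK byte swap, is below.

    #include <stdint.h>
    #include <string.h>

    /* Builds the block XORed into the hash in _partial_done: len(C) in bits
     * in the low qword (vmovq %r12, %xmm1), len(A) in bits in the high qword
     * (vmovd + vpslldq $8). */
    static void build_length_block(uint64_t aad_len_bytes, uint64_t text_len_bytes,
                                   uint8_t out[16])
    {
        uint64_t aad_bits  = aad_len_bytes  * 8;    /* shl $3, %r12 */
        uint64_t text_bits = text_len_bytes * 8;

        memcpy(out,     &text_bits, 8);
        memcpy(out + 8, &aad_bits,  8);
    }
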
| 604 | + |
---|
| 605 | +.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8 |
---|
| 606 | + |
---|
| 607 | + mov \AAD, %r10 # r10 = AAD |
---|
| 608 | + mov \AADLEN, %r12 # r12 = aadLen |
---|
| 609 | + |
---|
| 610 | + |
---|
| 611 | + mov %r12, %r11 |
---|
| 612 | + |
---|
| 613 | + vpxor \T8, \T8, \T8 |
---|
| 614 | + vpxor \T7, \T7, \T7 |
---|
| 615 | + cmp $16, %r11 |
---|
| 616 | + jl _get_AAD_rest8\@ |
---|
| 617 | +_get_AAD_blocks\@: |
---|
| 618 | + vmovdqu (%r10), \T7 |
---|
| 619 | + vpshufb SHUF_MASK(%rip), \T7, \T7 |
---|
| 620 | + vpxor \T7, \T8, \T8 |
---|
| 621 | + \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6 |
---|
| 622 | + add $16, %r10 |
---|
| 623 | + sub $16, %r12 |
---|
| 624 | + sub $16, %r11 |
---|
| 625 | + cmp $16, %r11 |
---|
| 626 | + jge _get_AAD_blocks\@ |
---|
| 627 | + vmovdqu \T8, \T7 |
---|
| 628 | + test %r11, %r11 |
---|
| 629 | + je _get_AAD_done\@ |
---|
| 630 | + |
---|
| 631 | + vpxor \T7, \T7, \T7 |
---|
| 632 | + |
---|
| 633 | + /* read the last <16B of AAD. since we have at least 4B of |
---|
| 634 | + data right after the AAD (the ICV, and maybe some CT), we can |
---|
| 635 | + read 4B/8B blocks safely, and then get rid of the extra stuff */ |
---|
| 636 | +_get_AAD_rest8\@: |
---|
| 637 | + cmp $4, %r11 |
---|
| 638 | + jle _get_AAD_rest4\@ |
---|
| 639 | + movq (%r10), \T1 |
---|
| 640 | + add $8, %r10 |
---|
| 641 | + sub $8, %r11 |
---|
| 642 | + vpslldq $8, \T1, \T1 |
---|
| 643 | + vpsrldq $8, \T7, \T7 |
---|
| 644 | + vpxor \T1, \T7, \T7 |
---|
| 645 | + jmp _get_AAD_rest8\@ |
---|
| 646 | +_get_AAD_rest4\@: |
---|
| 647 | + test %r11, %r11 |
---|
| 648 | + jle _get_AAD_rest0\@ |
---|
| 649 | + mov (%r10), %eax |
---|
| 650 | + movq %rax, \T1 |
---|
| 651 | + add $4, %r10 |
---|
| 652 | + sub $4, %r11 |
---|
| 653 | + vpslldq $12, \T1, \T1 |
---|
| 654 | + vpsrldq $4, \T7, \T7 |
---|
| 655 | + vpxor \T1, \T7, \T7 |
---|
| 656 | +_get_AAD_rest0\@: |
---|
| 657 | + /* finalize: shift out the extra bytes we read, and align |
---|
| 658 | + left. since pslldq can only shift by an immediate, we use |
---|
| 659 | + vpshufb and an array of shuffle masks */ |
---|
| 660 | + movq %r12, %r11 |
---|
| 661 | + salq $4, %r11 |
---|
| 662 | + vmovdqu aad_shift_arr(%r11), \T1 |
---|
| 663 | + vpshufb \T1, \T7, \T7 |
---|
| 664 | +_get_AAD_rest_final\@: |
---|
| 665 | + vpshufb SHUF_MASK(%rip), \T7, \T7 |
---|
| 666 | + vpxor \T8, \T7, \T7 |
---|
| 667 | + \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6 |
---|
| 668 | + |
---|
| 669 | +_get_AAD_done\@: |
---|
| 670 | + vmovdqu \T7, AadHash(arg2) |
---|
| 671 | +.endm |
---|
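CALC_AAD_HASH is the block-level GHASH over the additional authenticated data: each full 16-byte block is XORed into the running hash and multiplied by H, and the remainder is packed into a zero-padded final block. The sketch below shows the same loop with the textbook bit-serial GF(2^128) multiply from the GCM specification standing in for GHASH_MUL; the assembly instead uses PCLMULQDQ with Karatsuba on byte-swapped values, and the 4/8-byte tail-reading trick with aad_shift_arr is omitted here.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Bit-serial GF(2^128) multiply, X = X * H, as defined in the GCM spec
     * (slow but simple). */
    static void ghash_mul(uint8_t x[16], const uint8_t h[16])
    {
        uint8_t z[16] = { 0 }, v[16];
        memcpy(v, h, 16);

        for (int i = 0; i < 128; i++) {
            if (x[i / 8] & (0x80 >> (i % 8)))      /* bit i of X, MSB first */
                for (int j = 0; j < 16; j++)
                    z[j] ^= v[j];

            int lsb = v[15] & 1;                   /* V = V * x, then reduce */
            for (int j = 15; j > 0; j--)
                v[j] = (uint8_t)((v[j] >> 1) | ((v[j - 1] & 1) << 7));
            v[0] >>= 1;
            if (lsb)
                v[0] ^= 0xE1;
        }
        memcpy(x, z, 16);
    }

    /* Hash full 16-byte AAD blocks, then a zero-padded tail, as in
     * _get_AAD_blocks / _get_AAD_rest*. */
    static void calc_aad_hash(const uint8_t *aad, size_t aad_len,
                              const uint8_t h[16], uint8_t hash[16])
    {
        memset(hash, 0, 16);
        while (aad_len >= 16) {
            for (int i = 0; i < 16; i++)
                hash[i] ^= aad[i];
            ghash_mul(hash, h);
            aad += 16;
            aad_len -= 16;
        }
        if (aad_len) {
            uint8_t block[16] = { 0 };
            memcpy(block, aad, aad_len);
            for (int i = 0; i < 16; i++)
                hash[i] ^= block[i];
            ghash_mul(hash, h);
        }
    }
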
| 672 | + |
---|
| 673 | +.macro INIT GHASH_MUL PRECOMPUTE |
---|
| 674 | + mov arg6, %r11 |
---|
| 675 | + mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length |
---|
| 676 | + xor %r11d, %r11d |
---|
| 677 | + mov %r11, InLen(arg2) # ctx_data.in_length = 0 |
---|
| 678 | + |
---|
| 679 | + mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0 |
---|
| 680 | + mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0 |
---|
| 681 | + mov arg3, %rax |
---|
| 682 | + movdqu (%rax), %xmm0 |
---|
| 683 | + movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv |
---|
| 684 | + |
---|
| 685 | + vpshufb SHUF_MASK(%rip), %xmm0, %xmm0 |
---|
| 686 | + movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv |
---|
| 687 | + |
---|
| 688 | + vmovdqu (arg4), %xmm6 # xmm6 = HashKey |
---|
| 689 | + |
---|
| 690 | + vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 |
---|
| 691 | + ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey |
---|
| 692 | + vmovdqa %xmm6, %xmm2 |
---|
| 693 | + vpsllq $1, %xmm6, %xmm6 |
---|
| 694 | + vpsrlq $63, %xmm2, %xmm2 |
---|
| 695 | + vmovdqa %xmm2, %xmm1 |
---|
| 696 | + vpslldq $8, %xmm2, %xmm2 |
---|
| 697 | + vpsrldq $8, %xmm1, %xmm1 |
---|
| 698 | + vpor %xmm2, %xmm6, %xmm6 |
---|
| 699 | + #reduction |
---|
| 700 | + vpshufd $0b00100100, %xmm1, %xmm2 |
---|
| 701 | + vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 |
---|
| 702 | + vpand POLY(%rip), %xmm2, %xmm2 |
---|
| 703 | + vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly |
---|
| 704 | + ####################################################################### |
---|
| 705 | + vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly |
---|
| 706 | + |
---|
| 707 | + CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0 |
---|
| 708 | + |
---|
| 709 | + \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 |
---|
| 710 | +.endm |
---|
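INIT derives HashKey<<1 mod poly from the raw hash key: shift the 128-bit value left by one bit and, if a bit fell off the top, fold the reduction constant POLY back in. Treating the byte-swapped key as two 64-bit halves, the computation is roughly the following sketch.

    #include <stdint.h>

    struct u128 { uint64_t lo, hi; };

    /* H <<= 1; if a bit was shifted out of bit 127, XOR in POLY
     * (0xC2000000000000000000000000000001). */
    static struct u128 hashkey_shl1_mod_poly(struct u128 h)
    {
        int carry_out = (int)(h.hi >> 63);
        struct u128 r;

        r.hi = (h.hi << 1) | (h.lo >> 63);
        r.lo = h.lo << 1;

        if (carry_out) {
            r.hi ^= 0xC200000000000000ULL;
            r.lo ^= 0x0000000000000001ULL;
        }
        return r;
    }
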
| 711 | + |
---|
| 712 | + |
---|
| 713 | +# Reads DLEN bytes starting at DPTR and stores in XMMDst |
---|
| 714 | +# where 0 < DLEN < 16 |
---|
| 715 | +# Clobbers %rax, DLEN |
---|
| 716 | +.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst |
---|
| 717 | + vpxor \XMMDst, \XMMDst, \XMMDst |
---|
| 718 | + |
---|
| 719 | + cmp $8, \DLEN |
---|
| 720 | + jl _read_lt8_\@ |
---|
| 721 | + mov (\DPTR), %rax |
---|
| 722 | + vpinsrq $0, %rax, \XMMDst, \XMMDst |
---|
| 723 | + sub $8, \DLEN |
---|
| 724 | + jz _done_read_partial_block_\@ |
---|
| 725 | + xor %eax, %eax |
---|
| 726 | +_read_next_byte_\@: |
---|
| 727 | + shl $8, %rax |
---|
| 728 | + mov 7(\DPTR, \DLEN, 1), %al |
---|
| 729 | + dec \DLEN |
---|
| 730 | + jnz _read_next_byte_\@ |
---|
| 731 | + vpinsrq $1, %rax, \XMMDst, \XMMDst |
---|
| 732 | + jmp _done_read_partial_block_\@ |
---|
| 733 | +_read_lt8_\@: |
---|
| 734 | + xor %eax, %eax |
---|
| 735 | +_read_next_byte_lt8_\@: |
---|
| 736 | + shl $8, %rax |
---|
| 737 | + mov -1(\DPTR, \DLEN, 1), %al |
---|
| 738 | + dec \DLEN |
---|
| 739 | + jnz _read_next_byte_lt8_\@ |
---|
| 740 | + vpinsrq $0, %rax, \XMMDst, \XMMDst |
---|
| 741 | +_done_read_partial_block_\@: |
---|
| 742 | +.endm |
---|
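READ_PARTIAL_BLOCK assembles a 1..15-byte tail into a zero-padded 16-byte block without reading past the end of the buffer, building the bytes in %rax and inserting them with vpinsrq. The net effect is simply the following (the assembly's value is in avoiding the out-of-bounds read while still using wide loads):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Same result as READ_PARTIAL_BLOCK for 0 < len < 16: only `len` source
     * bytes are touched, the rest of the destination block stays zero. */
    static void read_partial_block(const uint8_t *src, size_t len, uint8_t dst[16])
    {
        memset(dst, 0, 16);        /* vpxor \XMMDst, \XMMDst, \XMMDst */
        memcpy(dst, src, len);
    }
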
| 743 | + |
---|
| 744 | +# PARTIAL_BLOCK: Handles the encryption/decryption and tag update of partial blocks |
---|
| 745 | +# carried between update calls. |
---|
| 746 | +# Requires the input data to be at least 1 byte long due to READ_PARTIAL_BLOCK |
---|
| 747 | +# Outputs encrypted bytes, and updates hash and partial info in gcm_context_data |
---|
| 748 | +# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 |
---|
| 749 | +.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ |
---|
| 750 | + AAD_HASH ENC_DEC |
---|
| 751 | + mov PBlockLen(arg2), %r13 |
---|
| 752 | + test %r13, %r13 |
---|
| 753 | + je _partial_block_done_\@ # Leave Macro if no partial blocks |
---|
| 754 | + # Read in input data without over reading |
---|
| 755 | + cmp $16, \PLAIN_CYPH_LEN |
---|
| 756 | + jl _fewer_than_16_bytes_\@ |
---|
| 757 | + vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm |
---|
| 758 | + jmp _data_read_\@ |
---|
| 759 | + |
---|
| 760 | +_fewer_than_16_bytes_\@: |
---|
| 761 | + lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 |
---|
| 762 | + mov \PLAIN_CYPH_LEN, %r12 |
---|
| 763 | + READ_PARTIAL_BLOCK %r10 %r12 %xmm1 |
---|
| 764 | + |
---|
| 765 | + mov PBlockLen(arg2), %r13 |
---|
| 766 | + |
---|
| 767 | +_data_read_\@: # Finished reading in data |
---|
| 768 | + |
---|
| 769 | + vmovdqu PBlockEncKey(arg2), %xmm9 |
---|
| 770 | + vmovdqu HashKey(arg2), %xmm13 |
---|
| 771 | + |
---|
| 772 | + lea SHIFT_MASK(%rip), %r12 |
---|
| 773 | + |
---|
| 774 | + # adjust the shuffle mask pointer to be able to shift r13 bytes |
---|
| 775 | +	# (r13 is the number of bytes already processed in the partial block) |
---|
| 776 | + add %r13, %r12 |
---|
| 777 | + vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask |
---|
| 778 | + vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes |
---|
| 779 | + |
---|
| 780 | +.if \ENC_DEC == DEC |
---|
| 781 | + vmovdqa %xmm1, %xmm3 |
---|
| 782 | + pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn) |
---|
| 783 | + |
---|
| 784 | + mov \PLAIN_CYPH_LEN, %r10 |
---|
| 785 | + add %r13, %r10 |
---|
| 786 | + # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling |
---|
| 787 | + sub $16, %r10 |
---|
| 788 | +	# Determine if partial block is not being filled and |
---|
| 789 | + # shift mask accordingly |
---|
| 790 | + jge _no_extra_mask_1_\@ |
---|
| 791 | + sub %r10, %r12 |
---|
| 792 | +_no_extra_mask_1_\@: |
---|
| 793 | + |
---|
| 794 | + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 |
---|
| 795 | + # get the appropriate mask to mask out bottom r13 bytes of xmm9 |
---|
| 796 | + vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9 |
---|
| 797 | + |
---|
| 798 | + vpand %xmm1, %xmm3, %xmm3 |
---|
| 799 | + vmovdqa SHUF_MASK(%rip), %xmm10 |
---|
| 800 | + vpshufb %xmm10, %xmm3, %xmm3 |
---|
| 801 | + vpshufb %xmm2, %xmm3, %xmm3 |
---|
| 802 | + vpxor %xmm3, \AAD_HASH, \AAD_HASH |
---|
| 803 | + |
---|
| 804 | + test %r10, %r10 |
---|
| 805 | + jl _partial_incomplete_1_\@ |
---|
| 806 | + |
---|
| 807 | + # GHASH computation for the last <16 Byte block |
---|
| 808 | + \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 |
---|
| 809 | + xor %eax,%eax |
---|
| 810 | + |
---|
| 811 | + mov %rax, PBlockLen(arg2) |
---|
| 812 | + jmp _dec_done_\@ |
---|
| 813 | +_partial_incomplete_1_\@: |
---|
| 814 | + add \PLAIN_CYPH_LEN, PBlockLen(arg2) |
---|
| 815 | +_dec_done_\@: |
---|
| 816 | + vmovdqu \AAD_HASH, AadHash(arg2) |
---|
| 817 | +.else |
---|
| 818 | + vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) |
---|
| 819 | + |
---|
| 820 | + mov \PLAIN_CYPH_LEN, %r10 |
---|
| 821 | + add %r13, %r10 |
---|
| 822 | + # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling |
---|
| 823 | + sub $16, %r10 |
---|
| 824 | +	# Determine if partial block is not being filled and |
---|
| 825 | + # shift mask accordingly |
---|
| 826 | + jge _no_extra_mask_2_\@ |
---|
| 827 | + sub %r10, %r12 |
---|
| 828 | +_no_extra_mask_2_\@: |
---|
| 829 | + |
---|
| 830 | + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 |
---|
| 831 | + # get the appropriate mask to mask out bottom r13 bytes of xmm9 |
---|
| 832 | + vpand %xmm1, %xmm9, %xmm9 |
---|
| 833 | + |
---|
| 834 | + vmovdqa SHUF_MASK(%rip), %xmm1 |
---|
| 835 | + vpshufb %xmm1, %xmm9, %xmm9 |
---|
| 836 | + vpshufb %xmm2, %xmm9, %xmm9 |
---|
| 837 | + vpxor %xmm9, \AAD_HASH, \AAD_HASH |
---|
| 838 | + |
---|
| 839 | + test %r10, %r10 |
---|
| 840 | + jl _partial_incomplete_2_\@ |
---|
| 841 | + |
---|
| 842 | + # GHASH computation for the last <16 Byte block |
---|
| 843 | + \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 |
---|
| 844 | + xor %eax,%eax |
---|
| 845 | + |
---|
| 846 | + mov %rax, PBlockLen(arg2) |
---|
| 847 | + jmp _encode_done_\@ |
---|
| 848 | +_partial_incomplete_2_\@: |
---|
| 849 | + add \PLAIN_CYPH_LEN, PBlockLen(arg2) |
---|
| 850 | +_encode_done_\@: |
---|
| 851 | + vmovdqu \AAD_HASH, AadHash(arg2) |
---|
| 852 | + |
---|
| 853 | + vmovdqa SHUF_MASK(%rip), %xmm10 |
---|
| 854 | + # shuffle xmm9 back to output as ciphertext |
---|
| 855 | + vpshufb %xmm10, %xmm9, %xmm9 |
---|
| 856 | + vpshufb %xmm2, %xmm9, %xmm9 |
---|
| 857 | +.endif |
---|
| 858 | + # output encrypted Bytes |
---|
| 859 | + test %r10, %r10 |
---|
| 860 | + jl _partial_fill_\@ |
---|
| 861 | + mov %r13, %r12 |
---|
| 862 | + mov $16, %r13 |
---|
| 863 | + # Set r13 to be the number of bytes to write out |
---|
| 864 | + sub %r12, %r13 |
---|
| 865 | + jmp _count_set_\@ |
---|
| 866 | +_partial_fill_\@: |
---|
| 867 | + mov \PLAIN_CYPH_LEN, %r13 |
---|
| 868 | +_count_set_\@: |
---|
| 869 | + vmovdqa %xmm9, %xmm0 |
---|
| 870 | + vmovq %xmm0, %rax |
---|
| 871 | + cmp $8, %r13 |
---|
| 872 | + jle _less_than_8_bytes_left_\@ |
---|
| 873 | + |
---|
| 874 | + mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) |
---|
| 875 | + add $8, \DATA_OFFSET |
---|
| 876 | + psrldq $8, %xmm0 |
---|
| 877 | + vmovq %xmm0, %rax |
---|
| 878 | + sub $8, %r13 |
---|
| 879 | +_less_than_8_bytes_left_\@: |
---|
| 880 | + movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) |
---|
| 881 | + add $1, \DATA_OFFSET |
---|
| 882 | + shr $8, %rax |
---|
| 883 | + sub $1, %r13 |
---|
| 884 | + jne _less_than_8_bytes_left_\@ |
---|
| 885 | +_partial_block_done_\@: |
---|
| 886 | +.endm # PARTIAL_BLOCK |
---|
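PARTIAL_BLOCK stitches a sub-16-byte tail from a previous update call together with the new data: if the combined bytes complete a block, the block is hashed and PBlockLen is cleared; otherwise the new bytes are appended and PBlockLen grows. The length bookkeeping reduces to this sketch:

    #include <stddef.h>

    /* partial_len = PBlockLen on entry, in_len = bytes supplied to this call. */
    static void partial_block_accounting(size_t partial_len, size_t in_len,
                                         size_t *bytes_out, size_t *new_partial_len)
    {
        if (partial_len + in_len >= 16) {      /* block completed: hash it  */
            *bytes_out       = 16 - partial_len;
            *new_partial_len = 0;              /* mov %rax, PBlockLen(arg2) */
        } else {                               /* _partial_incomplete_*     */
            *bytes_out       = in_len;
            *new_partial_len = partial_len + in_len;
        }
    }
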
| 887 | + |
---|
284 | 888 | ############################################################################### |
---|
285 | 889 | # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) |
---|
286 | 890 | # Input: A and B (128-bits each, bit-reflected) |
---|
.. | .. |
---|
341 | 945 | |
---|
342 | 946 | vpshufd $0b01001110, \T5, \T1 |
---|
343 | 947 | vpxor \T5, \T1, \T1 |
---|
344 | | - vmovdqa \T1, HashKey_k(arg1) |
---|
| 948 | + vmovdqu \T1, HashKey_k(arg2) |
---|
345 | 949 | |
---|
346 | 950 | GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly |
---|
347 | | - vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly |
---|
| 951 | + vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly |
---|
348 | 952 | vpshufd $0b01001110, \T5, \T1 |
---|
349 | 953 | vpxor \T5, \T1, \T1 |
---|
350 | | - vmovdqa \T1, HashKey_2_k(arg1) |
---|
| 954 | + vmovdqu \T1, HashKey_2_k(arg2) |
---|
351 | 955 | |
---|
352 | 956 | GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly |
---|
353 | | - vmovdqa \T5, HashKey_3(arg1) |
---|
| 957 | + vmovdqu \T5, HashKey_3(arg2) |
---|
354 | 958 | vpshufd $0b01001110, \T5, \T1 |
---|
355 | 959 | vpxor \T5, \T1, \T1 |
---|
356 | | - vmovdqa \T1, HashKey_3_k(arg1) |
---|
| 960 | + vmovdqu \T1, HashKey_3_k(arg2) |
---|
357 | 961 | |
---|
358 | 962 | GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly |
---|
359 | | - vmovdqa \T5, HashKey_4(arg1) |
---|
| 963 | + vmovdqu \T5, HashKey_4(arg2) |
---|
360 | 964 | vpshufd $0b01001110, \T5, \T1 |
---|
361 | 965 | vpxor \T5, \T1, \T1 |
---|
362 | | - vmovdqa \T1, HashKey_4_k(arg1) |
---|
| 966 | + vmovdqu \T1, HashKey_4_k(arg2) |
---|
363 | 967 | |
---|
364 | 968 | GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly |
---|
365 | | - vmovdqa \T5, HashKey_5(arg1) |
---|
| 969 | + vmovdqu \T5, HashKey_5(arg2) |
---|
366 | 970 | vpshufd $0b01001110, \T5, \T1 |
---|
367 | 971 | vpxor \T5, \T1, \T1 |
---|
368 | | - vmovdqa \T1, HashKey_5_k(arg1) |
---|
| 972 | + vmovdqu \T1, HashKey_5_k(arg2) |
---|
369 | 973 | |
---|
370 | 974 | GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly |
---|
371 | | - vmovdqa \T5, HashKey_6(arg1) |
---|
| 975 | + vmovdqu \T5, HashKey_6(arg2) |
---|
372 | 976 | vpshufd $0b01001110, \T5, \T1 |
---|
373 | 977 | vpxor \T5, \T1, \T1 |
---|
374 | | - vmovdqa \T1, HashKey_6_k(arg1) |
---|
| 978 | + vmovdqu \T1, HashKey_6_k(arg2) |
---|
375 | 979 | |
---|
376 | 980 | GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly |
---|
377 | | - vmovdqa \T5, HashKey_7(arg1) |
---|
| 981 | + vmovdqu \T5, HashKey_7(arg2) |
---|
378 | 982 | vpshufd $0b01001110, \T5, \T1 |
---|
379 | 983 | vpxor \T5, \T1, \T1 |
---|
380 | | - vmovdqa \T1, HashKey_7_k(arg1) |
---|
| 984 | + vmovdqu \T1, HashKey_7_k(arg2) |
---|
381 | 985 | |
---|
382 | 986 | GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly |
---|
383 | | - vmovdqa \T5, HashKey_8(arg1) |
---|
| 987 | + vmovdqu \T5, HashKey_8(arg2) |
---|
384 | 988 | vpshufd $0b01001110, \T5, \T1 |
---|
385 | 989 | vpxor \T5, \T1, \T1 |
---|
386 | | - vmovdqa \T1, HashKey_8_k(arg1) |
---|
| 990 | + vmovdqu \T1, HashKey_8_k(arg2) |
---|
387 | 991 | |
---|
388 | 992 | .endm |
---|
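Alongside each power of the hash key, the precompute step also stores a *_k value (now written to the context at arg2): the XOR of the two 64-bit halves of that power, produced by the vpshufd/vpxor pair above. These are the second operands of the Karatsuba middle product, which lets each 128-bit carry-less multiply be built from three 64-bit PCLMULQDQs instead of four. What gets stored is just:

    #include <stdint.h>

    struct u128 { uint64_t lo, hi; };

    /* Each *_k slot holds hi ^ lo of the corresponding hash-key power. */
    static uint64_t karatsuba_k(struct u128 hkey_power)
    {
        return hkey_power.hi ^ hkey_power.lo;
    }
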
389 | 993 | |
---|
.. | .. |
---|
392 | 996 | ## num_initial_blocks = b mod 8 |
---|
393 | 997 | ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext |
---|
394 | 998 | ## r10, r11, r12, rax are clobbered |
---|
395 | | -## arg1, arg2, arg3, r14 are used as a pointer only, not modified |
---|
| 999 | +## arg1, arg3, arg4, r14 are used as a pointer only, not modified |
---|
396 | 1000 | |
---|
397 | | -.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC |
---|
| 1001 | +.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC |
---|
398 | 1002 | i = (8-\num_initial_blocks) |
---|
399 | | - j = 0 |
---|
400 | 1003 | setreg |
---|
401 | | - |
---|
402 | | - mov arg6, %r10 # r10 = AAD |
---|
403 | | - mov arg7, %r12 # r12 = aadLen |
---|
404 | | - |
---|
405 | | - |
---|
406 | | - mov %r12, %r11 |
---|
407 | | - |
---|
408 | | - vpxor reg_j, reg_j, reg_j |
---|
409 | | - vpxor reg_i, reg_i, reg_i |
---|
410 | | - cmp $16, %r11 |
---|
411 | | - jl _get_AAD_rest8\@ |
---|
412 | | -_get_AAD_blocks\@: |
---|
413 | | - vmovdqu (%r10), reg_i |
---|
414 | | - vpshufb SHUF_MASK(%rip), reg_i, reg_i |
---|
415 | | - vpxor reg_i, reg_j, reg_j |
---|
416 | | - GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 |
---|
417 | | - add $16, %r10 |
---|
418 | | - sub $16, %r12 |
---|
419 | | - sub $16, %r11 |
---|
420 | | - cmp $16, %r11 |
---|
421 | | - jge _get_AAD_blocks\@ |
---|
422 | | - vmovdqu reg_j, reg_i |
---|
423 | | - cmp $0, %r11 |
---|
424 | | - je _get_AAD_done\@ |
---|
425 | | - |
---|
426 | | - vpxor reg_i, reg_i, reg_i |
---|
427 | | - |
---|
428 | | - /* read the last <16B of AAD. since we have at least 4B of |
---|
429 | | - data right after the AAD (the ICV, and maybe some CT), we can |
---|
430 | | - read 4B/8B blocks safely, and then get rid of the extra stuff */ |
---|
431 | | -_get_AAD_rest8\@: |
---|
432 | | - cmp $4, %r11 |
---|
433 | | - jle _get_AAD_rest4\@ |
---|
434 | | - movq (%r10), \T1 |
---|
435 | | - add $8, %r10 |
---|
436 | | - sub $8, %r11 |
---|
437 | | - vpslldq $8, \T1, \T1 |
---|
438 | | - vpsrldq $8, reg_i, reg_i |
---|
439 | | - vpxor \T1, reg_i, reg_i |
---|
440 | | - jmp _get_AAD_rest8\@ |
---|
441 | | -_get_AAD_rest4\@: |
---|
442 | | - cmp $0, %r11 |
---|
443 | | - jle _get_AAD_rest0\@ |
---|
444 | | - mov (%r10), %eax |
---|
445 | | - movq %rax, \T1 |
---|
446 | | - add $4, %r10 |
---|
447 | | - sub $4, %r11 |
---|
448 | | - vpslldq $12, \T1, \T1 |
---|
449 | | - vpsrldq $4, reg_i, reg_i |
---|
450 | | - vpxor \T1, reg_i, reg_i |
---|
451 | | -_get_AAD_rest0\@: |
---|
452 | | - /* finalize: shift out the extra bytes we read, and align |
---|
453 | | - left. since pslldq can only shift by an immediate, we use |
---|
454 | | - vpshufb and an array of shuffle masks */ |
---|
455 | | - movq %r12, %r11 |
---|
456 | | - salq $4, %r11 |
---|
457 | | - movdqu aad_shift_arr(%r11), \T1 |
---|
458 | | - vpshufb \T1, reg_i, reg_i |
---|
459 | | -_get_AAD_rest_final\@: |
---|
460 | | - vpshufb SHUF_MASK(%rip), reg_i, reg_i |
---|
461 | | - vpxor reg_j, reg_i, reg_i |
---|
462 | | - GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6 |
---|
463 | | - |
---|
464 | | -_get_AAD_done\@: |
---|
465 | | - # initialize the data pointer offset as zero |
---|
466 | | - xor %r11d, %r11d |
---|
| 1004 | + vmovdqu AadHash(arg2), reg_i |
---|
467 | 1005 | |
---|
468 | 1006 | # start AES for num_initial_blocks blocks |
---|
469 | | - mov arg5, %rax # rax = *Y0 |
---|
470 | | - vmovdqu (%rax), \CTR # CTR = Y0 |
---|
471 | | - vpshufb SHUF_MASK(%rip), \CTR, \CTR |
---|
472 | | - |
---|
| 1007 | + vmovdqu CurCount(arg2), \CTR |
---|
473 | 1008 | |
---|
474 | 1009 | i = (9-\num_initial_blocks) |
---|
475 | 1010 | setreg |
---|
.. | .. |
---|
490 | 1025 | setreg |
---|
491 | 1026 | .endr |
---|
492 | 1027 | |
---|
493 | | - j = 1 |
---|
494 | | - setreg |
---|
495 | | -.rep 9 |
---|
496 | | - vmovdqa 16*j(arg1), \T_key |
---|
| 1028 | + j = 1 |
---|
| 1029 | + setreg |
---|
| 1030 | +.rep \REP |
---|
| 1031 | + vmovdqa 16*j(arg1), \T_key |
---|
497 | 1032 | i = (9-\num_initial_blocks) |
---|
498 | 1033 | setreg |
---|
499 | 1034 | .rep \num_initial_blocks |
---|
.. | .. |
---|
502 | 1037 | setreg |
---|
503 | 1038 | .endr |
---|
504 | 1039 | |
---|
505 | | - j = (j+1) |
---|
506 | | - setreg |
---|
| 1040 | + j = (j+1) |
---|
| 1041 | + setreg |
---|
507 | 1042 | .endr |
---|
508 | 1043 | |
---|
509 | | - |
---|
510 | | - vmovdqa 16*10(arg1), \T_key |
---|
| 1044 | + vmovdqa 16*j(arg1), \T_key |
---|
511 | 1045 | i = (9-\num_initial_blocks) |
---|
512 | 1046 | setreg |
---|
513 | 1047 | .rep \num_initial_blocks |
---|
.. | .. |
---|
519 | 1053 | i = (9-\num_initial_blocks) |
---|
520 | 1054 | setreg |
---|
521 | 1055 | .rep \num_initial_blocks |
---|
522 | | - vmovdqu (arg3, %r11), \T1 |
---|
| 1056 | + vmovdqu (arg4, %r11), \T1 |
---|
523 | 1057 | vpxor \T1, reg_i, reg_i |
---|
524 | | - vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks |
---|
| 1058 | + vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks |
---|
525 | 1059 | add $16, %r11 |
---|
526 | 1060 | .if \ENC_DEC == DEC |
---|
527 | 1061 | vmovdqa \T1, reg_i |
---|
.. | .. |
---|
595 | 1129 | vpxor \T_key, \XMM7, \XMM7 |
---|
596 | 1130 | vpxor \T_key, \XMM8, \XMM8 |
---|
597 | 1131 | |
---|
598 | | - i = 1 |
---|
599 | | - setreg |
---|
600 | | -.rep 9 # do 9 rounds |
---|
| 1132 | + i = 1 |
---|
| 1133 | + setreg |
---|
| 1134 | +.rep \REP # do REP rounds |
---|
601 | 1135 | vmovdqa 16*i(arg1), \T_key |
---|
602 | 1136 | vaesenc \T_key, \XMM1, \XMM1 |
---|
603 | 1137 | vaesenc \T_key, \XMM2, \XMM2 |
---|
.. | .. |
---|
607 | 1141 | vaesenc \T_key, \XMM6, \XMM6 |
---|
608 | 1142 | vaesenc \T_key, \XMM7, \XMM7 |
---|
609 | 1143 | vaesenc \T_key, \XMM8, \XMM8 |
---|
610 | | - i = (i+1) |
---|
611 | | - setreg |
---|
| 1144 | + i = (i+1) |
---|
| 1145 | + setreg |
---|
612 | 1146 | .endr |
---|
613 | | - |
---|
614 | 1147 | |
---|
615 | 1148 | vmovdqa 16*i(arg1), \T_key |
---|
616 | 1149 | vaesenclast \T_key, \XMM1, \XMM1 |
---|
.. | .. |
---|
622 | 1155 | vaesenclast \T_key, \XMM7, \XMM7 |
---|
623 | 1156 | vaesenclast \T_key, \XMM8, \XMM8 |
---|
624 | 1157 | |
---|
625 | | - vmovdqu (arg3, %r11), \T1 |
---|
| 1158 | + vmovdqu (arg4, %r11), \T1 |
---|
626 | 1159 | vpxor \T1, \XMM1, \XMM1 |
---|
627 | | - vmovdqu \XMM1, (arg2 , %r11) |
---|
| 1160 | + vmovdqu \XMM1, (arg3 , %r11) |
---|
628 | 1161 | .if \ENC_DEC == DEC |
---|
629 | 1162 | vmovdqa \T1, \XMM1 |
---|
630 | 1163 | .endif |
---|
631 | 1164 | |
---|
632 | | - vmovdqu 16*1(arg3, %r11), \T1 |
---|
| 1165 | + vmovdqu 16*1(arg4, %r11), \T1 |
---|
633 | 1166 | vpxor \T1, \XMM2, \XMM2 |
---|
634 | | - vmovdqu \XMM2, 16*1(arg2 , %r11) |
---|
| 1167 | + vmovdqu \XMM2, 16*1(arg3 , %r11) |
---|
635 | 1168 | .if \ENC_DEC == DEC |
---|
636 | 1169 | vmovdqa \T1, \XMM2 |
---|
637 | 1170 | .endif |
---|
638 | 1171 | |
---|
639 | | - vmovdqu 16*2(arg3, %r11), \T1 |
---|
| 1172 | + vmovdqu 16*2(arg4, %r11), \T1 |
---|
640 | 1173 | vpxor \T1, \XMM3, \XMM3 |
---|
641 | | - vmovdqu \XMM3, 16*2(arg2 , %r11) |
---|
| 1174 | + vmovdqu \XMM3, 16*2(arg3 , %r11) |
---|
642 | 1175 | .if \ENC_DEC == DEC |
---|
643 | 1176 | vmovdqa \T1, \XMM3 |
---|
644 | 1177 | .endif |
---|
645 | 1178 | |
---|
646 | | - vmovdqu 16*3(arg3, %r11), \T1 |
---|
| 1179 | + vmovdqu 16*3(arg4, %r11), \T1 |
---|
647 | 1180 | vpxor \T1, \XMM4, \XMM4 |
---|
648 | | - vmovdqu \XMM4, 16*3(arg2 , %r11) |
---|
| 1181 | + vmovdqu \XMM4, 16*3(arg3 , %r11) |
---|
649 | 1182 | .if \ENC_DEC == DEC |
---|
650 | 1183 | vmovdqa \T1, \XMM4 |
---|
651 | 1184 | .endif |
---|
652 | 1185 | |
---|
653 | | - vmovdqu 16*4(arg3, %r11), \T1 |
---|
| 1186 | + vmovdqu 16*4(arg4, %r11), \T1 |
---|
654 | 1187 | vpxor \T1, \XMM5, \XMM5 |
---|
655 | | - vmovdqu \XMM5, 16*4(arg2 , %r11) |
---|
| 1188 | + vmovdqu \XMM5, 16*4(arg3 , %r11) |
---|
656 | 1189 | .if \ENC_DEC == DEC |
---|
657 | 1190 | vmovdqa \T1, \XMM5 |
---|
658 | 1191 | .endif |
---|
659 | 1192 | |
---|
660 | | - vmovdqu 16*5(arg3, %r11), \T1 |
---|
| 1193 | + vmovdqu 16*5(arg4, %r11), \T1 |
---|
661 | 1194 | vpxor \T1, \XMM6, \XMM6 |
---|
662 | | - vmovdqu \XMM6, 16*5(arg2 , %r11) |
---|
| 1195 | + vmovdqu \XMM6, 16*5(arg3 , %r11) |
---|
663 | 1196 | .if \ENC_DEC == DEC |
---|
664 | 1197 | vmovdqa \T1, \XMM6 |
---|
665 | 1198 | .endif |
---|
666 | 1199 | |
---|
667 | | - vmovdqu 16*6(arg3, %r11), \T1 |
---|
| 1200 | + vmovdqu 16*6(arg4, %r11), \T1 |
---|
668 | 1201 | vpxor \T1, \XMM7, \XMM7 |
---|
669 | | - vmovdqu \XMM7, 16*6(arg2 , %r11) |
---|
| 1202 | + vmovdqu \XMM7, 16*6(arg3 , %r11) |
---|
670 | 1203 | .if \ENC_DEC == DEC |
---|
671 | 1204 | vmovdqa \T1, \XMM7 |
---|
672 | 1205 | .endif |
---|
673 | 1206 | |
---|
674 | | - vmovdqu 16*7(arg3, %r11), \T1 |
---|
| 1207 | + vmovdqu 16*7(arg4, %r11), \T1 |
---|
675 | 1208 | vpxor \T1, \XMM8, \XMM8 |
---|
676 | | - vmovdqu \XMM8, 16*7(arg2 , %r11) |
---|
| 1209 | + vmovdqu \XMM8, 16*7(arg3 , %r11) |
---|
677 | 1210 | .if \ENC_DEC == DEC |
---|
678 | 1211 | vmovdqa \T1, \XMM8 |
---|
679 | 1212 | .endif |
---|
.. | .. |
---|
698 | 1231 | |
---|
699 | 1232 | # encrypt 8 blocks at a time |
---|
700 | 1233 | # ghash the 8 previously encrypted ciphertext blocks |
---|
701 | | -# arg1, arg2, arg3 are used as pointers only, not modified |
---|
| 1234 | +# arg1, arg3, arg4 are used as pointers only, not modified |
---|
702 | 1235 | # r11 is the data offset value |
---|
703 | | -.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC |
---|
| 1236 | +.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC |
---|
704 | 1237 | |
---|
705 | 1238 | vmovdqa \XMM1, \T2 |
---|
706 | 1239 | vmovdqa \XMM2, TMP2(%rsp) |
---|
.. | .. |
---|
784 | 1317 | |
---|
785 | 1318 | ####################################################################### |
---|
786 | 1319 | |
---|
787 | | - vmovdqa HashKey_8(arg1), \T5 |
---|
| 1320 | + vmovdqu HashKey_8(arg2), \T5 |
---|
788 | 1321 | vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 |
---|
789 | 1322 | vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 |
---|
790 | 1323 | |
---|
791 | 1324 | vpshufd $0b01001110, \T2, \T6 |
---|
792 | 1325 | vpxor \T2, \T6, \T6 |
---|
793 | 1326 | |
---|
794 | | - vmovdqa HashKey_8_k(arg1), \T5 |
---|
| 1327 | + vmovdqu HashKey_8_k(arg2), \T5 |
---|
795 | 1328 | vpclmulqdq $0x00, \T5, \T6, \T6 |
---|
796 | 1329 | |
---|
797 | 1330 | vmovdqu 16*3(arg1), \T1 |
---|
.. | .. |
---|
805 | 1338 | vaesenc \T1, \XMM8, \XMM8 |
---|
806 | 1339 | |
---|
807 | 1340 | vmovdqa TMP2(%rsp), \T1 |
---|
808 | | - vmovdqa HashKey_7(arg1), \T5 |
---|
| 1341 | + vmovdqu HashKey_7(arg2), \T5 |
---|
809 | 1342 | vpclmulqdq $0x11, \T5, \T1, \T3 |
---|
810 | 1343 | vpxor \T3, \T4, \T4 |
---|
811 | 1344 | vpclmulqdq $0x00, \T5, \T1, \T3 |
---|
.. | .. |
---|
813 | 1346 | |
---|
814 | 1347 | vpshufd $0b01001110, \T1, \T3 |
---|
815 | 1348 | vpxor \T1, \T3, \T3 |
---|
816 | | - vmovdqa HashKey_7_k(arg1), \T5 |
---|
| 1349 | + vmovdqu HashKey_7_k(arg2), \T5 |
---|
817 | 1350 | vpclmulqdq $0x10, \T5, \T3, \T3 |
---|
818 | 1351 | vpxor \T3, \T6, \T6 |
---|
819 | 1352 | |
---|
.. | .. |
---|
830 | 1363 | ####################################################################### |
---|
831 | 1364 | |
---|
832 | 1365 | vmovdqa TMP3(%rsp), \T1 |
---|
833 | | - vmovdqa HashKey_6(arg1), \T5 |
---|
| 1366 | + vmovdqu HashKey_6(arg2), \T5 |
---|
834 | 1367 | vpclmulqdq $0x11, \T5, \T1, \T3 |
---|
835 | 1368 | vpxor \T3, \T4, \T4 |
---|
836 | 1369 | vpclmulqdq $0x00, \T5, \T1, \T3 |
---|
.. | .. |
---|
838 | 1371 | |
---|
839 | 1372 | vpshufd $0b01001110, \T1, \T3 |
---|
840 | 1373 | vpxor \T1, \T3, \T3 |
---|
841 | | - vmovdqa HashKey_6_k(arg1), \T5 |
---|
| 1374 | + vmovdqu HashKey_6_k(arg2), \T5 |
---|
842 | 1375 | vpclmulqdq $0x10, \T5, \T3, \T3 |
---|
843 | 1376 | vpxor \T3, \T6, \T6 |
---|
844 | 1377 | |
---|
.. | .. |
---|
853 | 1386 | vaesenc \T1, \XMM8, \XMM8 |
---|
854 | 1387 | |
---|
855 | 1388 | vmovdqa TMP4(%rsp), \T1 |
---|
856 | | - vmovdqa HashKey_5(arg1), \T5 |
---|
| 1389 | + vmovdqu HashKey_5(arg2), \T5 |
---|
857 | 1390 | vpclmulqdq $0x11, \T5, \T1, \T3 |
---|
858 | 1391 | vpxor \T3, \T4, \T4 |
---|
859 | 1392 | vpclmulqdq $0x00, \T5, \T1, \T3 |
---|
.. | .. |
---|
861 | 1394 | |
---|
862 | 1395 | vpshufd $0b01001110, \T1, \T3 |
---|
863 | 1396 | vpxor \T1, \T3, \T3 |
---|
864 | | - vmovdqa HashKey_5_k(arg1), \T5 |
---|
| 1397 | + vmovdqu HashKey_5_k(arg2), \T5 |
---|
865 | 1398 | vpclmulqdq $0x10, \T5, \T3, \T3 |
---|
866 | 1399 | vpxor \T3, \T6, \T6 |
---|
867 | 1400 | |
---|
.. | .. |
---|
877 | 1410 | |
---|
878 | 1411 | |
---|
879 | 1412 | vmovdqa TMP5(%rsp), \T1 |
---|
880 | | - vmovdqa HashKey_4(arg1), \T5 |
---|
| 1413 | + vmovdqu HashKey_4(arg2), \T5 |
---|
881 | 1414 | vpclmulqdq $0x11, \T5, \T1, \T3 |
---|
882 | 1415 | vpxor \T3, \T4, \T4 |
---|
883 | 1416 | vpclmulqdq $0x00, \T5, \T1, \T3 |
---|
.. | .. |
---|
885 | 1418 | |
---|
886 | 1419 | vpshufd $0b01001110, \T1, \T3 |
---|
887 | 1420 | vpxor \T1, \T3, \T3 |
---|
888 | | - vmovdqa HashKey_4_k(arg1), \T5 |
---|
| 1421 | + vmovdqu HashKey_4_k(arg2), \T5 |
---|
889 | 1422 | vpclmulqdq $0x10, \T5, \T3, \T3 |
---|
890 | 1423 | vpxor \T3, \T6, \T6 |
---|
891 | 1424 | |
---|
.. | .. |
---|
900 | 1433 | vaesenc \T1, \XMM8, \XMM8 |
---|
901 | 1434 | |
---|
902 | 1435 | vmovdqa TMP6(%rsp), \T1 |
---|
903 | | - vmovdqa HashKey_3(arg1), \T5 |
---|
| 1436 | + vmovdqu HashKey_3(arg2), \T5 |
---|
904 | 1437 | vpclmulqdq $0x11, \T5, \T1, \T3 |
---|
905 | 1438 | vpxor \T3, \T4, \T4 |
---|
906 | 1439 | vpclmulqdq $0x00, \T5, \T1, \T3 |
---|
.. | .. |
---|
908 | 1441 | |
---|
909 | 1442 | vpshufd $0b01001110, \T1, \T3 |
---|
910 | 1443 | vpxor \T1, \T3, \T3 |
---|
911 | | - vmovdqa HashKey_3_k(arg1), \T5 |
---|
| 1444 | + vmovdqu HashKey_3_k(arg2), \T5 |
---|
912 | 1445 | vpclmulqdq $0x10, \T5, \T3, \T3 |
---|
913 | 1446 | vpxor \T3, \T6, \T6 |
---|
914 | 1447 | |
---|
.. | .. |
---|
924 | 1457 | vaesenc \T1, \XMM8, \XMM8 |
---|
925 | 1458 | |
---|
926 | 1459 | vmovdqa TMP7(%rsp), \T1 |
---|
927 | | - vmovdqa HashKey_2(arg1), \T5 |
---|
| 1460 | + vmovdqu HashKey_2(arg2), \T5 |
---|
928 | 1461 | vpclmulqdq $0x11, \T5, \T1, \T3 |
---|
929 | 1462 | vpxor \T3, \T4, \T4 |
---|
930 | 1463 | vpclmulqdq $0x00, \T5, \T1, \T3 |
---|
.. | .. |
---|
932 | 1465 | |
---|
933 | 1466 | vpshufd $0b01001110, \T1, \T3 |
---|
934 | 1467 | vpxor \T1, \T3, \T3 |
---|
935 | | - vmovdqa HashKey_2_k(arg1), \T5 |
---|
| 1468 | + vmovdqu HashKey_2_k(arg2), \T5 |
---|
936 | 1469 | vpclmulqdq $0x10, \T5, \T3, \T3 |
---|
937 | 1470 | vpxor \T3, \T6, \T6 |
---|
938 | 1471 | |
---|
.. | .. |
---|
949 | 1482 | vaesenc \T5, \XMM8, \XMM8 |
---|
950 | 1483 | |
---|
951 | 1484 | vmovdqa TMP8(%rsp), \T1 |
---|
952 | | - vmovdqa HashKey(arg1), \T5 |
---|
| 1485 | + vmovdqu HashKey(arg2), \T5 |
---|
953 | 1486 | vpclmulqdq $0x11, \T5, \T1, \T3 |
---|
954 | 1487 | vpxor \T3, \T4, \T4 |
---|
955 | 1488 | vpclmulqdq $0x00, \T5, \T1, \T3 |
---|
.. | .. |
---|
957 | 1490 | |
---|
958 | 1491 | vpshufd $0b01001110, \T1, \T3 |
---|
959 | 1492 | vpxor \T1, \T3, \T3 |
---|
960 | | - vmovdqa HashKey_k(arg1), \T5 |
---|
| 1493 | + vmovdqu HashKey_k(arg2), \T5 |
---|
961 | 1494 | vpclmulqdq $0x10, \T5, \T3, \T3 |
---|
962 | 1495 | vpxor \T3, \T6, \T6 |
---|
963 | 1496 | |
---|
.. | .. |
---|
966 | 1499 | |
---|
967 | 1500 | vmovdqu 16*10(arg1), \T5 |
---|
968 | 1501 | |
---|
| 1502 | + i = 11 |
---|
| 1503 | + setreg |
---|
| 1504 | +.rep (\REP-9) |
---|
| 1505 | + |
---|
| 1506 | + vaesenc \T5, \XMM1, \XMM1 |
---|
| 1507 | + vaesenc \T5, \XMM2, \XMM2 |
---|
| 1508 | + vaesenc \T5, \XMM3, \XMM3 |
---|
| 1509 | + vaesenc \T5, \XMM4, \XMM4 |
---|
| 1510 | + vaesenc \T5, \XMM5, \XMM5 |
---|
| 1511 | + vaesenc \T5, \XMM6, \XMM6 |
---|
| 1512 | + vaesenc \T5, \XMM7, \XMM7 |
---|
| 1513 | + vaesenc \T5, \XMM8, \XMM8 |
---|
| 1514 | + |
---|
| 1515 | + vmovdqu 16*i(arg1), \T5 |
---|
| 1516 | + i = i + 1 |
---|
| 1517 | + setreg |
---|
| 1518 | +.endr |
---|
| 1519 | + |
---|
969 | 1520 | i = 0 |
---|
970 | 1521 | j = 1 |
---|
971 | 1522 | setreg |
---|
972 | 1523 | .rep 8 |
---|
973 | | - vpxor 16*i(arg3, %r11), \T5, \T2 |
---|
| 1524 | + vpxor 16*i(arg4, %r11), \T5, \T2 |
---|
974 | 1525 | .if \ENC_DEC == ENC |
---|
975 | 1526 | vaesenclast \T2, reg_j, reg_j |
---|
976 | 1527 | .else |
---|
977 | 1528 | vaesenclast \T2, reg_j, \T3 |
---|
978 | | - vmovdqu 16*i(arg3, %r11), reg_j |
---|
979 | | - vmovdqu \T3, 16*i(arg2, %r11) |
---|
| 1529 | + vmovdqu 16*i(arg4, %r11), reg_j |
---|
| 1530 | + vmovdqu \T3, 16*i(arg3, %r11) |
---|
980 | 1531 | .endif |
---|
981 | 1532 | i = (i+1) |
---|
982 | 1533 | j = (j+1) |
---|
.. | .. |
---|
1008 | 1559 | vpxor \T2, \T7, \T7 # first phase of the reduction complete |
---|
1009 | 1560 | ####################################################################### |
---|
1010 | 1561 | .if \ENC_DEC == ENC |
---|
1011 | | - vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer |
---|
1012 | | - vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer |
---|
1013 | | - vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer |
---|
1014 | | - vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer |
---|
1015 | | - vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer |
---|
1016 | | - vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer |
---|
1017 | | - vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer |
---|
1018 | | - vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer |
---|
| 1562 | + vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer |
---|
| 1563 | + vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer |
---|
| 1564 | + vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer |
---|
| 1565 | + vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer |
---|
| 1566 | + vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer |
---|
| 1567 | + vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer |
---|
| 1568 | + vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer |
---|
| 1569 | + vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer |
---|
1019 | 1570 | .endif |
---|
1020 | 1571 | |
---|
1021 | 1572 | ####################################################################### |
---|
.. | .. |
---|
1056 | 1607 | |
---|
1057 | 1608 | vpshufd $0b01001110, \XMM1, \T2 |
---|
1058 | 1609 | vpxor \XMM1, \T2, \T2 |
---|
1059 | | - vmovdqa HashKey_8(arg1), \T5 |
---|
| 1610 | + vmovdqu HashKey_8(arg2), \T5 |
---|
1060 | 1611 | vpclmulqdq $0x11, \T5, \XMM1, \T6 |
---|
1061 | 1612 | vpclmulqdq $0x00, \T5, \XMM1, \T7 |
---|
1062 | 1613 | |
---|
1063 | | - vmovdqa HashKey_8_k(arg1), \T3 |
---|
| 1614 | + vmovdqu HashKey_8_k(arg2), \T3 |
---|
1064 | 1615 | vpclmulqdq $0x00, \T3, \T2, \XMM1 |
---|
1065 | 1616 | |
---|
1066 | 1617 | ###################### |
---|
1067 | 1618 | |
---|
1068 | 1619 | vpshufd $0b01001110, \XMM2, \T2 |
---|
1069 | 1620 | vpxor \XMM2, \T2, \T2 |
---|
1070 | | - vmovdqa HashKey_7(arg1), \T5 |
---|
| 1621 | + vmovdqu HashKey_7(arg2), \T5 |
---|
1071 | 1622 | vpclmulqdq $0x11, \T5, \XMM2, \T4 |
---|
1072 | 1623 | vpxor \T4, \T6, \T6 |
---|
1073 | 1624 | |
---|
1074 | 1625 | vpclmulqdq $0x00, \T5, \XMM2, \T4 |
---|
1075 | 1626 | vpxor \T4, \T7, \T7 |
---|
1076 | 1627 | |
---|
1077 | | - vmovdqa HashKey_7_k(arg1), \T3 |
---|
| 1628 | + vmovdqu HashKey_7_k(arg2), \T3 |
---|
1078 | 1629 | vpclmulqdq $0x00, \T3, \T2, \T2 |
---|
1079 | 1630 | vpxor \T2, \XMM1, \XMM1 |
---|
1080 | 1631 | |
---|
.. | .. |
---|
1082 | 1633 | |
---|
1083 | 1634 | vpshufd $0b01001110, \XMM3, \T2 |
---|
1084 | 1635 | vpxor \XMM3, \T2, \T2 |
---|
1085 | | - vmovdqa HashKey_6(arg1), \T5 |
---|
| 1636 | + vmovdqu HashKey_6(arg2), \T5 |
---|
1086 | 1637 | vpclmulqdq $0x11, \T5, \XMM3, \T4 |
---|
1087 | 1638 | vpxor \T4, \T6, \T6 |
---|
1088 | 1639 | |
---|
1089 | 1640 | vpclmulqdq $0x00, \T5, \XMM3, \T4 |
---|
1090 | 1641 | vpxor \T4, \T7, \T7 |
---|
1091 | 1642 | |
---|
1092 | | - vmovdqa HashKey_6_k(arg1), \T3 |
---|
| 1643 | + vmovdqu HashKey_6_k(arg2), \T3 |
---|
1093 | 1644 | vpclmulqdq $0x00, \T3, \T2, \T2 |
---|
1094 | 1645 | vpxor \T2, \XMM1, \XMM1 |
---|
1095 | 1646 | |
---|
.. | .. |
---|
1097 | 1648 | |
---|
1098 | 1649 | vpshufd $0b01001110, \XMM4, \T2 |
---|
1099 | 1650 | vpxor \XMM4, \T2, \T2 |
---|
1100 | | - vmovdqa HashKey_5(arg1), \T5 |
---|
| 1651 | + vmovdqu HashKey_5(arg2), \T5 |
---|
1101 | 1652 | vpclmulqdq $0x11, \T5, \XMM4, \T4 |
---|
1102 | 1653 | vpxor \T4, \T6, \T6 |
---|
1103 | 1654 | |
---|
1104 | 1655 | vpclmulqdq $0x00, \T5, \XMM4, \T4 |
---|
1105 | 1656 | vpxor \T4, \T7, \T7 |
---|
1106 | 1657 | |
---|
1107 | | - vmovdqa HashKey_5_k(arg1), \T3 |
---|
| 1658 | + vmovdqu HashKey_5_k(arg2), \T3 |
---|
1108 | 1659 | vpclmulqdq $0x00, \T3, \T2, \T2 |
---|
1109 | 1660 | vpxor \T2, \XMM1, \XMM1 |
---|
1110 | 1661 | |
---|
.. | .. |
---|
1112 | 1663 | |
---|
1113 | 1664 | vpshufd $0b01001110, \XMM5, \T2 |
---|
1114 | 1665 | vpxor \XMM5, \T2, \T2 |
---|
1115 | | - vmovdqa HashKey_4(arg1), \T5 |
---|
| 1666 | + vmovdqu HashKey_4(arg2), \T5 |
---|
1116 | 1667 | vpclmulqdq $0x11, \T5, \XMM5, \T4 |
---|
1117 | 1668 | vpxor \T4, \T6, \T6 |
---|
1118 | 1669 | |
---|
1119 | 1670 | vpclmulqdq $0x00, \T5, \XMM5, \T4 |
---|
1120 | 1671 | vpxor \T4, \T7, \T7 |
---|
1121 | 1672 | |
---|
1122 | | - vmovdqa HashKey_4_k(arg1), \T3 |
---|
| 1673 | + vmovdqu HashKey_4_k(arg2), \T3 |
---|
1123 | 1674 | vpclmulqdq $0x00, \T3, \T2, \T2 |
---|
1124 | 1675 | vpxor \T2, \XMM1, \XMM1 |
---|
1125 | 1676 | |
---|
.. | .. |
---|
1127 | 1678 | |
---|
1128 | 1679 | vpshufd $0b01001110, \XMM6, \T2 |
---|
1129 | 1680 | vpxor \XMM6, \T2, \T2 |
---|
1130 | | - vmovdqa HashKey_3(arg1), \T5 |
---|
| 1681 | + vmovdqu HashKey_3(arg2), \T5 |
---|
1131 | 1682 | vpclmulqdq $0x11, \T5, \XMM6, \T4 |
---|
1132 | 1683 | vpxor \T4, \T6, \T6 |
---|
1133 | 1684 | |
---|
1134 | 1685 | vpclmulqdq $0x00, \T5, \XMM6, \T4 |
---|
1135 | 1686 | vpxor \T4, \T7, \T7 |
---|
1136 | 1687 | |
---|
1137 | | - vmovdqa HashKey_3_k(arg1), \T3 |
---|
| 1688 | + vmovdqu HashKey_3_k(arg2), \T3 |
---|
1138 | 1689 | vpclmulqdq $0x00, \T3, \T2, \T2 |
---|
1139 | 1690 | vpxor \T2, \XMM1, \XMM1 |
---|
1140 | 1691 | |
---|
.. | .. |
---|
1142 | 1693 | |
---|
1143 | 1694 | vpshufd $0b01001110, \XMM7, \T2 |
---|
1144 | 1695 | vpxor \XMM7, \T2, \T2 |
---|
1145 | | - vmovdqa HashKey_2(arg1), \T5 |
---|
| 1696 | + vmovdqu HashKey_2(arg2), \T5 |
---|
1146 | 1697 | vpclmulqdq $0x11, \T5, \XMM7, \T4 |
---|
1147 | 1698 | vpxor \T4, \T6, \T6 |
---|
1148 | 1699 | |
---|
1149 | 1700 | vpclmulqdq $0x00, \T5, \XMM7, \T4 |
---|
1150 | 1701 | vpxor \T4, \T7, \T7 |
---|
1151 | 1702 | |
---|
1152 | | - vmovdqa HashKey_2_k(arg1), \T3 |
---|
| 1703 | + vmovdqu HashKey_2_k(arg2), \T3 |
---|
1153 | 1704 | vpclmulqdq $0x00, \T3, \T2, \T2 |
---|
1154 | 1705 | vpxor \T2, \XMM1, \XMM1 |
---|
1155 | 1706 | |
---|
.. | .. |
---|
1157 | 1708 | |
---|
1158 | 1709 | vpshufd $0b01001110, \XMM8, \T2 |
---|
1159 | 1710 | vpxor \XMM8, \T2, \T2 |
---|
1160 | | - vmovdqa HashKey(arg1), \T5 |
---|
| 1711 | + vmovdqu HashKey(arg2), \T5 |
---|
1161 | 1712 | vpclmulqdq $0x11, \T5, \XMM8, \T4 |
---|
1162 | 1713 | vpxor \T4, \T6, \T6 |
---|
1163 | 1714 | |
---|
1164 | 1715 | vpclmulqdq $0x00, \T5, \XMM8, \T4 |
---|
1165 | 1716 | vpxor \T4, \T7, \T7 |
---|
1166 | 1717 | |
---|
1167 | | - vmovdqa HashKey_k(arg1), \T3 |
---|
| 1718 | + vmovdqu HashKey_k(arg2), \T3 |
---|
1168 | 1719 | vpclmulqdq $0x00, \T3, \T2, \T2 |
---|
1169 | 1720 | |
---|
1170 | 1721 | vpxor \T2, \XMM1, \XMM1 |
---|
.. | .. |
---|
1210 | 1761 | |
---|
1211 | 1762 | .endm |
---|
1212 | 1763 | |
---|
1213 | | - |
---|
1214 | | -# combined for GCM encrypt and decrypt functions |
---|
1215 | | -# clobbering all xmm registers |
---|
1216 | | -# clobbering r10, r11, r12, r13, r14, r15 |
---|
1217 | | -.macro GCM_ENC_DEC_AVX ENC_DEC |
---|
1218 | | - |
---|
1219 | | - #the number of pushes must equal STACK_OFFSET |
---|
1220 | | - push %r12 |
---|
1221 | | - push %r13 |
---|
1222 | | - push %r14 |
---|
1223 | | - push %r15 |
---|
1224 | | - |
---|
1225 | | - mov %rsp, %r14 |
---|
1226 | | - |
---|
1227 | | - |
---|
1228 | | - |
---|
1229 | | - |
---|
1230 | | - sub $VARIABLE_OFFSET, %rsp |
---|
1231 | | - and $~63, %rsp # align rsp to 64 bytes |
---|
1232 | | - |
---|
1233 | | - |
---|
1234 | | - vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey |
---|
1235 | | - |
---|
1236 | | - mov arg4, %r13 # save the number of bytes of plaintext/ciphertext |
---|
1237 | | - and $-16, %r13 # r13 = r13 - (r13 mod 16) |
---|
1238 | | - |
---|
1239 | | - mov %r13, %r12 |
---|
1240 | | - shr $4, %r12 |
---|
1241 | | - and $7, %r12 |
---|
1242 | | - jz _initial_num_blocks_is_0\@ |
---|
1243 | | - |
---|
1244 | | - cmp $7, %r12 |
---|
1245 | | - je _initial_num_blocks_is_7\@ |
---|
1246 | | - cmp $6, %r12 |
---|
1247 | | - je _initial_num_blocks_is_6\@ |
---|
1248 | | - cmp $5, %r12 |
---|
1249 | | - je _initial_num_blocks_is_5\@ |
---|
1250 | | - cmp $4, %r12 |
---|
1251 | | - je _initial_num_blocks_is_4\@ |
---|
1252 | | - cmp $3, %r12 |
---|
1253 | | - je _initial_num_blocks_is_3\@ |
---|
1254 | | - cmp $2, %r12 |
---|
1255 | | - je _initial_num_blocks_is_2\@ |
---|
1256 | | - |
---|
1257 | | - jmp _initial_num_blocks_is_1\@ |
---|
1258 | | - |
---|
1259 | | -_initial_num_blocks_is_7\@: |
---|
1260 | | - INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC |
---|
1261 | | - sub $16*7, %r13 |
---|
1262 | | - jmp _initial_blocks_encrypted\@ |
---|
1263 | | - |
---|
1264 | | -_initial_num_blocks_is_6\@: |
---|
1265 | | - INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC |
---|
1266 | | - sub $16*6, %r13 |
---|
1267 | | - jmp _initial_blocks_encrypted\@ |
---|
1268 | | - |
---|
1269 | | -_initial_num_blocks_is_5\@: |
---|
1270 | | - INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC |
---|
1271 | | - sub $16*5, %r13 |
---|
1272 | | - jmp _initial_blocks_encrypted\@ |
---|
1273 | | - |
---|
1274 | | -_initial_num_blocks_is_4\@: |
---|
1275 | | - INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC |
---|
1276 | | - sub $16*4, %r13 |
---|
1277 | | - jmp _initial_blocks_encrypted\@ |
---|
1278 | | - |
---|
1279 | | -_initial_num_blocks_is_3\@: |
---|
1280 | | - INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC |
---|
1281 | | - sub $16*3, %r13 |
---|
1282 | | - jmp _initial_blocks_encrypted\@ |
---|
1283 | | - |
---|
1284 | | -_initial_num_blocks_is_2\@: |
---|
1285 | | - INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC |
---|
1286 | | - sub $16*2, %r13 |
---|
1287 | | - jmp _initial_blocks_encrypted\@ |
---|
1288 | | - |
---|
1289 | | -_initial_num_blocks_is_1\@: |
---|
1290 | | - INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC |
---|
1291 | | - sub $16*1, %r13 |
---|
1292 | | - jmp _initial_blocks_encrypted\@ |
---|
1293 | | - |
---|
1294 | | -_initial_num_blocks_is_0\@: |
---|
1295 | | - INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC |
---|
1296 | | - |
---|
1297 | | - |
---|
1298 | | -_initial_blocks_encrypted\@: |
---|
1299 | | - cmp $0, %r13 |
---|
1300 | | - je _zero_cipher_left\@ |
---|
1301 | | - |
---|
1302 | | - sub $128, %r13 |
---|
1303 | | - je _eight_cipher_left\@ |
---|
1304 | | - |
---|
1305 | | - |
---|
1306 | | - |
---|
1307 | | - |
---|
1308 | | - vmovd %xmm9, %r15d |
---|
1309 | | - and $255, %r15d |
---|
1310 | | - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 |
---|
1311 | | - |
---|
1312 | | - |
---|
1313 | | -_encrypt_by_8_new\@: |
---|
1314 | | - cmp $(255-8), %r15d |
---|
1315 | | - jg _encrypt_by_8\@ |
---|
1316 | | - |
---|
1317 | | - |
---|
1318 | | - |
---|
1319 | | - add $8, %r15b |
---|
1320 | | - GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC |
---|
1321 | | - add $128, %r11 |
---|
1322 | | - sub $128, %r13 |
---|
1323 | | - jne _encrypt_by_8_new\@ |
---|
1324 | | - |
---|
1325 | | - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 |
---|
1326 | | - jmp _eight_cipher_left\@ |
---|
1327 | | - |
---|
1328 | | -_encrypt_by_8\@: |
---|
1329 | | - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 |
---|
1330 | | - add $8, %r15b |
---|
1331 | | - GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC |
---|
1332 | | - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 |
---|
1333 | | - add $128, %r11 |
---|
1334 | | - sub $128, %r13 |
---|
1335 | | - jne _encrypt_by_8_new\@ |
---|
1336 | | - |
---|
1337 | | - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 |
---|
1338 | | - |
---|
1339 | | - |
---|
1340 | | - |
---|
1341 | | - |
---|
1342 | | -_eight_cipher_left\@: |
---|
1343 | | - GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 |
---|
1344 | | - |
---|
1345 | | - |
---|
1346 | | -_zero_cipher_left\@: |
---|
1347 | | - cmp $16, arg4 |
---|
1348 | | - jl _only_less_than_16\@ |
---|
1349 | | - |
---|
1350 | | - mov arg4, %r13 |
---|
1351 | | - and $15, %r13 # r13 = (arg4 mod 16) |
---|
1352 | | - |
---|
1353 | | - je _multiple_of_16_bytes\@ |
---|
1354 | | - |
---|
1355 | | - # handle the last <16 Byte block seperately |
---|
1356 | | - |
---|
1357 | | - |
---|
1358 | | - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn |
---|
1359 | | - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 |
---|
1360 | | - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) |
---|
1361 | | - |
---|
1362 | | - sub $16, %r11 |
---|
1363 | | - add %r13, %r11 |
---|
1364 | | - vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block |
---|
1365 | | - |
---|
1366 | | - lea SHIFT_MASK+16(%rip), %r12 |
---|
1367 | | - sub %r13, %r12 # adjust the shuffle mask pointer to be |
---|
1368 | | - # able to shift 16-r13 bytes (r13 is the |
---|
1369 | | - # number of bytes in plaintext mod 16) |
---|
1370 | | - vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask |
---|
1371 | | - vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes |
---|
1372 | | - jmp _final_ghash_mul\@ |
---|
1373 | | - |
---|
1374 | | -_only_less_than_16\@: |
---|
1375 | | - # check for 0 length |
---|
1376 | | - mov arg4, %r13 |
---|
1377 | | - and $15, %r13 # r13 = (arg4 mod 16) |
---|
1378 | | - |
---|
1379 | | - je _multiple_of_16_bytes\@ |
---|
1380 | | - |
---|
1381 | | - # handle the last <16 Byte block seperately |
---|
1382 | | - |
---|
1383 | | - |
---|
1384 | | - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn |
---|
1385 | | - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 |
---|
1386 | | - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) |
---|
1387 | | - |
---|
1388 | | - |
---|
1389 | | - lea SHIFT_MASK+16(%rip), %r12 |
---|
1390 | | - sub %r13, %r12 # adjust the shuffle mask pointer to be |
---|
1391 | | - # able to shift 16-r13 bytes (r13 is the |
---|
1392 | | - # number of bytes in plaintext mod 16) |
---|
1393 | | - |
---|
1394 | | -_get_last_16_byte_loop\@: |
---|
1395 | | - movb (arg3, %r11), %al |
---|
1396 | | - movb %al, TMP1 (%rsp , %r11) |
---|
1397 | | - add $1, %r11 |
---|
1398 | | - cmp %r13, %r11 |
---|
1399 | | - jne _get_last_16_byte_loop\@ |
---|
1400 | | - |
---|
1401 | | - vmovdqu TMP1(%rsp), %xmm1 |
---|
1402 | | - |
---|
1403 | | - sub $16, %r11 |
---|
1404 | | - |
---|
1405 | | -_final_ghash_mul\@: |
---|
1406 | | - .if \ENC_DEC == DEC |
---|
1407 | | - vmovdqa %xmm1, %xmm2 |
---|
1408 | | - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) |
---|
1409 | | - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to |
---|
1410 | | - # mask out top 16-r13 bytes of xmm9 |
---|
1411 | | - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 |
---|
1412 | | - vpand %xmm1, %xmm2, %xmm2 |
---|
1413 | | - vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 |
---|
1414 | | - vpxor %xmm2, %xmm14, %xmm14 |
---|
1415 | | - #GHASH computation for the last <16 Byte block |
---|
1416 | | - GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 |
---|
1417 | | - sub %r13, %r11 |
---|
1418 | | - add $16, %r11 |
---|
1419 | | - .else |
---|
1420 | | - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) |
---|
1421 | | - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to |
---|
1422 | | - # mask out top 16-r13 bytes of xmm9 |
---|
1423 | | - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 |
---|
1424 | | - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 |
---|
1425 | | - vpxor %xmm9, %xmm14, %xmm14 |
---|
1426 | | - #GHASH computation for the last <16 Byte block |
---|
1427 | | - GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 |
---|
1428 | | - sub %r13, %r11 |
---|
1429 | | - add $16, %r11 |
---|
1430 | | - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext |
---|
1431 | | - .endif |
---|
1432 | | - |
---|
1433 | | - |
---|
1434 | | - ############################# |
---|
1435 | | - # output r13 Bytes |
---|
1436 | | - vmovq %xmm9, %rax |
---|
1437 | | - cmp $8, %r13 |
---|
1438 | | - jle _less_than_8_bytes_left\@ |
---|
1439 | | - |
---|
1440 | | - mov %rax, (arg2 , %r11) |
---|
1441 | | - add $8, %r11 |
---|
1442 | | - vpsrldq $8, %xmm9, %xmm9 |
---|
1443 | | - vmovq %xmm9, %rax |
---|
1444 | | - sub $8, %r13 |
---|
1445 | | - |
---|
1446 | | -_less_than_8_bytes_left\@: |
---|
1447 | | - movb %al, (arg2 , %r11) |
---|
1448 | | - add $1, %r11 |
---|
1449 | | - shr $8, %rax |
---|
1450 | | - sub $1, %r13 |
---|
1451 | | - jne _less_than_8_bytes_left\@ |
---|
1452 | | - ############################# |
---|
1453 | | - |
---|
1454 | | -_multiple_of_16_bytes\@: |
---|
1455 | | - mov arg7, %r12 # r12 = aadLen (number of bytes) |
---|
1456 | | - shl $3, %r12 # convert into number of bits |
---|
1457 | | - vmovd %r12d, %xmm15 # len(A) in xmm15 |
---|
1458 | | - |
---|
1459 | | - shl $3, arg4 # len(C) in bits (*128) |
---|
1460 | | - vmovq arg4, %xmm1 |
---|
1461 | | - vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 |
---|
1462 | | - vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) |
---|
1463 | | - |
---|
1464 | | - vpxor %xmm15, %xmm14, %xmm14 |
---|
1465 | | - GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation |
---|
1466 | | - vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap |
---|
1467 | | - |
---|
1468 | | - mov arg5, %rax # rax = *Y0 |
---|
1469 | | - vmovdqu (%rax), %xmm9 # xmm9 = Y0 |
---|
1470 | | - |
---|
1471 | | - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) |
---|
1472 | | - |
---|
1473 | | - vpxor %xmm14, %xmm9, %xmm9 |
---|
1474 | | - |
---|
1475 | | - |
---|
1476 | | - |
---|
1477 | | -_return_T\@: |
---|
1478 | | - mov arg8, %r10 # r10 = authTag |
---|
1479 | | - mov arg9, %r11 # r11 = auth_tag_len |
---|
1480 | | - |
---|
1481 | | - cmp $16, %r11 |
---|
1482 | | - je _T_16\@ |
---|
1483 | | - |
---|
1484 | | - cmp $8, %r11 |
---|
1485 | | - jl _T_4\@ |
---|
1486 | | - |
---|
1487 | | -_T_8\@: |
---|
1488 | | - vmovq %xmm9, %rax |
---|
1489 | | - mov %rax, (%r10) |
---|
1490 | | - add $8, %r10 |
---|
1491 | | - sub $8, %r11 |
---|
1492 | | - vpsrldq $8, %xmm9, %xmm9 |
---|
1493 | | - cmp $0, %r11 |
---|
1494 | | - je _return_T_done\@ |
---|
1495 | | -_T_4\@: |
---|
1496 | | - vmovd %xmm9, %eax |
---|
1497 | | - mov %eax, (%r10) |
---|
1498 | | - add $4, %r10 |
---|
1499 | | - sub $4, %r11 |
---|
1500 | | - vpsrldq $4, %xmm9, %xmm9 |
---|
1501 | | - cmp $0, %r11 |
---|
1502 | | - je _return_T_done\@ |
---|
1503 | | -_T_123\@: |
---|
1504 | | - vmovd %xmm9, %eax |
---|
1505 | | - cmp $2, %r11 |
---|
1506 | | - jl _T_1\@ |
---|
1507 | | - mov %ax, (%r10) |
---|
1508 | | - cmp $2, %r11 |
---|
1509 | | - je _return_T_done\@ |
---|
1510 | | - add $2, %r10 |
---|
1511 | | - sar $16, %eax |
---|
1512 | | -_T_1\@: |
---|
1513 | | - mov %al, (%r10) |
---|
1514 | | - jmp _return_T_done\@ |
---|
1515 | | - |
---|
1516 | | -_T_16\@: |
---|
1517 | | - vmovdqu %xmm9, (%r10) |
---|
1518 | | - |
---|
1519 | | -_return_T_done\@: |
---|
1520 | | - mov %r14, %rsp |
---|
1521 | | - |
---|
1522 | | - pop %r15 |
---|
1523 | | - pop %r14 |
---|
1524 | | - pop %r13 |
---|
1525 | | - pop %r12 |
---|
1526 | | -.endm |
---|
1527 | | - |
---|
1528 | | - |
---|
1529 | 1764 | ############################################################# |
---|
1530 | 1765 | #void aesni_gcm_precomp_avx_gen2 |
---|
1531 | 1766 | # (gcm_data *my_ctx_data, |
---|
1532 | | -# u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ |
---|
| 1767 | +# gcm_context_data *data, |
---|
| 1768 | +# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ |
---|
| 1769 | +# u8 *iv, /* Pre-counter block j0: 4 byte salt |
---|
| 1770 | +# (from Security Association) concatenated with 8 byte |
---|
| 1771 | +# Initialisation Vector (from IPSec ESP Payload) |
---|
| 1772 | +# concatenated with 0x00000001. 16-byte aligned pointer. */ |
---|
| 1773 | +# const u8 *aad, /* Additional Authentication Data (AAD)*/ |
---|
| 1774 | +# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ |
---|
1533 | 1775 | ############################################################# |
---|
1534 | | -ENTRY(aesni_gcm_precomp_avx_gen2) |
---|
1535 | | - #the number of pushes must equal STACK_OFFSET |
---|
1536 | | - push %r12 |
---|
1537 | | - push %r13 |
---|
1538 | | - push %r14 |
---|
1539 | | - push %r15 |
---|
1540 | | - |
---|
1541 | | - mov %rsp, %r14 |
---|
1542 | | - |
---|
1543 | | - |
---|
1544 | | - |
---|
1545 | | - sub $VARIABLE_OFFSET, %rsp |
---|
1546 | | - and $~63, %rsp # align rsp to 64 bytes |
---|
1547 | | - |
---|
1548 | | - vmovdqu (arg2), %xmm6 # xmm6 = HashKey |
---|
1549 | | - |
---|
1550 | | - vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 |
---|
1551 | | - ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey |
---|
1552 | | - vmovdqa %xmm6, %xmm2 |
---|
1553 | | - vpsllq $1, %xmm6, %xmm6 |
---|
1554 | | - vpsrlq $63, %xmm2, %xmm2 |
---|
1555 | | - vmovdqa %xmm2, %xmm1 |
---|
1556 | | - vpslldq $8, %xmm2, %xmm2 |
---|
1557 | | - vpsrldq $8, %xmm1, %xmm1 |
---|
1558 | | - vpor %xmm2, %xmm6, %xmm6 |
---|
1559 | | - #reduction |
---|
1560 | | - vpshufd $0b00100100, %xmm1, %xmm2 |
---|
1561 | | - vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 |
---|
1562 | | - vpand POLY(%rip), %xmm2, %xmm2 |
---|
1563 | | - vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly |
---|
1564 | | - ####################################################################### |
---|
1565 | | - vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly |
---|
1566 | | - |
---|
1567 | | - |
---|
1568 | | - PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 |
---|
1569 | | - |
---|
1570 | | - mov %r14, %rsp |
---|
1571 | | - |
---|
1572 | | - pop %r15 |
---|
1573 | | - pop %r14 |
---|
1574 | | - pop %r13 |
---|
1575 | | - pop %r12 |
---|
1576 | | - ret |
---|
1577 | | -ENDPROC(aesni_gcm_precomp_avx_gen2) |
---|
| 1776 | +SYM_FUNC_START(aesni_gcm_init_avx_gen2) |
---|
| 1777 | + FUNC_SAVE |
---|
| 1778 | + INIT GHASH_MUL_AVX, PRECOMPUTE_AVX |
---|
| 1779 | + FUNC_RESTORE |
---|
| 1780 | + RET |
---|
| 1781 | +SYM_FUNC_END(aesni_gcm_init_avx_gen2) |
---|
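# For reference: the open-coded body removed above loads the hash subkey H,
# byte-swaps it and derives HashKey = H<<1 mod poly before precomputing the
# hash-key powers; the new init entry point presumably performs the same
# steps through the shared INIT / PRECOMPUTE_AVX macros.  Schematically,
# the shift-and-reduce step reads
#
#      HashKey = (H << 1) xor (POLY if the bit shifted out of H is 1, else 0)
#
# i.e. a one-bit left shift of the 128-bit value with a conditional XOR of
# the reduction constant when the top bit is carried out.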
1578 | 1782 | |
---|
1579 | 1783 | ############################################################################### |
---|
1580 | | -#void aesni_gcm_enc_avx_gen2( |
---|
| 1784 | +#void aesni_gcm_enc_update_avx_gen2( |
---|
1581 | 1785 | # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ |
---|
| 1786 | +# gcm_context_data *data, |
---|
1582 | 1787 | # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ |
---|
1583 | 1788 | # const u8 *in, /* Plaintext input */ |
---|
1584 | | -# u64 plaintext_len, /* Length of data in Bytes for encryption. */ |
---|
1585 | | -# u8 *iv, /* Pre-counter block j0: 4 byte salt |
---|
1586 | | -# (from Security Association) concatenated with 8 byte |
---|
1587 | | -# Initialisation Vector (from IPSec ESP Payload) |
---|
1588 | | -# concatenated with 0x00000001. 16-byte aligned pointer. */ |
---|
1589 | | -# const u8 *aad, /* Additional Authentication Data (AAD)*/ |
---|
1590 | | -# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ |
---|
1591 | | -# u8 *auth_tag, /* Authenticated Tag output. */ |
---|
1592 | | -# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. |
---|
1593 | | -# Valid values are 16 (most likely), 12 or 8. */ |
---|
| 1789 | +# u64 plaintext_len) /* Length of data in Bytes for encryption. */ |
---|
1594 | 1790 | ############################################################################### |
---|
1595 | | -ENTRY(aesni_gcm_enc_avx_gen2) |
---|
1596 | | - GCM_ENC_DEC_AVX ENC |
---|
1597 | | - ret |
---|
1598 | | -ENDPROC(aesni_gcm_enc_avx_gen2) |
---|
| 1791 | +SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2) |
---|
| 1792 | + FUNC_SAVE |
---|
| 1793 | + mov keysize, %eax |
---|
| 1794 | + cmp $32, %eax |
---|
| 1795 | + je key_256_enc_update |
---|
| 1796 | + cmp $16, %eax |
---|
| 1797 | + je key_128_enc_update |
---|
| 1798 | + # must be 192 |
---|
| 1799 | + GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11 |
---|
| 1800 | + FUNC_RESTORE |
---|
| 1801 | + RET |
---|
| 1802 | +key_128_enc_update: |
---|
| 1803 | + GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9 |
---|
| 1804 | + FUNC_RESTORE |
---|
| 1805 | + RET |
---|
| 1806 | +key_256_enc_update: |
---|
| 1807 | + GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13 |
---|
| 1808 | + FUNC_RESTORE |
---|
| 1809 | + RET |
---|
| 1810 | +SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2) |
---|
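# The enc_update/dec_update/finalize entry points dispatch on the key length
# to pick the AES round count: the 9/11/13 passed to GCM_ENC_DEC and
# GCM_COMPLETE is the number of full rounds before the final vaesenclast,
# i.e. Nr-1 for AES-128/192/256.  A minimal C illustration of that mapping
# (hypothetical helper, not part of this file):
#
#      /* keysize in bytes: 16 -> 9, 24 -> 11, 32 -> 13 */
#      static inline int gcm_avx_rep_rounds(int keysize_bytes)
#      {
#              return keysize_bytes / 4 + 5;   /* Nr = keysize/4 + 6; last round issued separately */
#      }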
1599 | 1811 | |
---|
1600 | 1812 | ############################################################################### |
---|
1601 | | -#void aesni_gcm_dec_avx_gen2( |
---|
| 1813 | +#void aesni_gcm_dec_update_avx_gen2( |
---|
1602 | 1814 | # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ |
---|
| 1815 | +# gcm_context_data *data, |
---|
1603 | 1816 | # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ |
---|
1604 | 1817 | # const u8 *in, /* Ciphertext input */ |
---|
1605 | | -# u64 plaintext_len, /* Length of data in Bytes for encryption. */ |
---|
1606 | | -# u8 *iv, /* Pre-counter block j0: 4 byte salt |
---|
1607 | | -# (from Security Association) concatenated with 8 byte |
---|
1608 | | -# Initialisation Vector (from IPSec ESP Payload) |
---|
1609 | | -# concatenated with 0x00000001. 16-byte aligned pointer. */ |
---|
1610 | | -# const u8 *aad, /* Additional Authentication Data (AAD)*/ |
---|
1611 | | -# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ |
---|
| 1818 | +# u64 plaintext_len) /* Length of data in Bytes for encryption. */ |
---|
| 1819 | +############################################################################### |
---|
| 1820 | +SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2) |
---|
| 1821 | + FUNC_SAVE |
---|
| 1822 | + mov keysize,%eax |
---|
| 1823 | + cmp $32, %eax |
---|
| 1824 | + je key_256_dec_update |
---|
| 1825 | + cmp $16, %eax |
---|
| 1826 | + je key_128_dec_update |
---|
| 1827 | + # must be 192 |
---|
| 1828 | + GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11 |
---|
| 1829 | + FUNC_RESTORE |
---|
| 1830 | + RET |
---|
| 1831 | +key_128_dec_update: |
---|
| 1832 | + GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9 |
---|
| 1833 | + FUNC_RESTORE |
---|
| 1834 | + RET |
---|
| 1835 | +key_256_dec_update: |
---|
| 1836 | + GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13 |
---|
| 1837 | + FUNC_RESTORE |
---|
| 1838 | + RET |
---|
| 1839 | +SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2) |
---|
| 1840 | + |
---|
| 1841 | +############################################################################### |
---|
| 1842 | +#void aesni_gcm_finalize_avx_gen2( |
---|
| 1843 | +# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ |
---|
| 1844 | +# gcm_context_data *data, |
---|
1612 | 1845 | # u8 *auth_tag, /* Authenticated Tag output. */ |
---|
1613 | 1846 | # u64 auth_tag_len)# /* Authenticated Tag Length in bytes. |
---|
1614 | 1847 | # Valid values are 16 (most likely), 12 or 8. */ |
---|
1615 | 1848 | ############################################################################### |
---|
1616 | | -ENTRY(aesni_gcm_dec_avx_gen2) |
---|
1617 | | - GCM_ENC_DEC_AVX DEC |
---|
1618 | | - ret |
---|
1619 | | -ENDPROC(aesni_gcm_dec_avx_gen2) |
---|
1620 | | -#endif /* CONFIG_AS_AVX */ |
---|
| 1849 | +SYM_FUNC_START(aesni_gcm_finalize_avx_gen2) |
---|
| 1850 | + FUNC_SAVE |
---|
| 1851 | + mov keysize,%eax |
---|
| 1852 | + cmp $32, %eax |
---|
| 1853 | + je key_256_finalize |
---|
| 1854 | + cmp $16, %eax |
---|
| 1855 | + je key_128_finalize |
---|
| 1856 | + # must be 192 |
---|
| 1857 | + GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4 |
---|
| 1858 | + FUNC_RESTORE |
---|
| 1859 | + RET |
---|
| 1860 | +key_128_finalize: |
---|
| 1861 | + GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4 |
---|
| 1862 | + FUNC_RESTORE |
---|
| 1863 | + RET |
---|
| 1864 | +key_256_finalize: |
---|
| 1865 | + GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4 |
---|
| 1866 | + FUNC_RESTORE |
---|
| 1867 | + RET |
---|
| 1868 | +SYM_FUNC_END(aesni_gcm_finalize_avx_gen2) |
---|
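# Taken together, init / enc_update (or dec_update) / finalize replace the
# old single-call aesni_gcm_enc/dec_avx_gen2 entry points with a streaming
# interface driven from the C glue code (not shown here).  A sketch of the
# intended call order, with arguments paraphrased from the comment blocks
# above (illustrative only; error handling and scatterlist walking omitted,
# key_ctx is the gcm_data/expanded-key context passed in arg1):
#
#      aesni_gcm_init_avx_gen2(key_ctx, data, hash_subkey, iv, aad, aad_len);
#      while (more_plaintext)          /* update may be called repeatedly */
#              aesni_gcm_enc_update_avx_gen2(key_ctx, data, out, in, chunk_len);
#      aesni_gcm_finalize_avx_gen2(key_ctx, data, auth_tag, auth_tag_len);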
1621 | 1869 | |
---|
1622 | | -#ifdef CONFIG_AS_AVX2 |
---|
1623 | 1870 | ############################################################################### |
---|
1624 | 1871 | # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) |
---|
1625 | 1872 | # Input: A and B (128-bits each, bit-reflected) |
---|
.. | .. |
---|
1670 | 1917 | # HashKey_i_k holds the XORed values of the low and high parts of HashKey_i |
---|
1671 | 1918 | vmovdqa \HK, \T5 |
---|
1672 | 1919 | GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly |
---|
1673 | | - vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly |
---|
| 1920 | + vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly |
---|
1674 | 1921 | |
---|
1675 | 1922 | GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly |
---|
1676 | | - vmovdqa \T5, HashKey_3(arg1) |
---|
| 1923 | + vmovdqu \T5, HashKey_3(arg2) |
---|
1677 | 1924 | |
---|
1678 | 1925 | GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly |
---|
1679 | | - vmovdqa \T5, HashKey_4(arg1) |
---|
| 1926 | + vmovdqu \T5, HashKey_4(arg2) |
---|
1680 | 1927 | |
---|
1681 | 1928 | GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly |
---|
1682 | | - vmovdqa \T5, HashKey_5(arg1) |
---|
| 1929 | + vmovdqu \T5, HashKey_5(arg2) |
---|
1683 | 1930 | |
---|
1684 | 1931 | GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly |
---|
1685 | | - vmovdqa \T5, HashKey_6(arg1) |
---|
| 1932 | + vmovdqu \T5, HashKey_6(arg2) |
---|
1686 | 1933 | |
---|
1687 | 1934 | GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly |
---|
1688 | | - vmovdqa \T5, HashKey_7(arg1) |
---|
| 1935 | + vmovdqu \T5, HashKey_7(arg2) |
---|
1689 | 1936 | |
---|
1690 | 1937 | GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly |
---|
1691 | | - vmovdqa \T5, HashKey_8(arg1) |
---|
| 1938 | + vmovdqu \T5, HashKey_8(arg2) |
---|
1692 | 1939 | |
---|
1693 | 1940 | .endm |
---|
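# The precompute step above stores the hash-key powers HashKey..HashKey_8
# (each already <<1 mod poly) so that the eight-blocks-at-a-time path can
# fold eight ciphertext blocks into the running GHASH value Y with
# independent carry-less multiplies and a single reduction:
#
#      Y' = (Y xor C1)*H^8 xor C2*H^7 xor ... xor C8*H
#
# where * is multiplication in GF(2^128) modulo the polynomial
# x^128 + x^127 + x^126 + x^121 + 1 named in the GHASH_MUL header above.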
1694 | | - |
---|
1695 | 1941 | |
---|
1696 | 1942 | ## if a = number of total plaintext bytes |
---|
1697 | 1943 | ## b = floor(a/16) |
---|
1698 | 1944 | ## num_initial_blocks = b mod 4# |
---|
1699 | 1945 | ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext |
---|
1700 | 1946 | ## r10, r11, r12, rax are clobbered |
---|
1701 | | -## arg1, arg2, arg3, r14 are used as a pointer only, not modified |
---|
| 1947 | +## arg1, arg3, arg4, r14 are used as a pointer only, not modified |
---|
1702 | 1948 | |
---|
1703 | | -.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER |
---|
| 1949 | +.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER |
---|
1704 | 1950 | i = (8-\num_initial_blocks) |
---|
1705 | | - j = 0 |
---|
1706 | 1951 | setreg |
---|
1707 | | - |
---|
1708 | | - mov arg6, %r10 # r10 = AAD |
---|
1709 | | - mov arg7, %r12 # r12 = aadLen |
---|
1710 | | - |
---|
1711 | | - |
---|
1712 | | - mov %r12, %r11 |
---|
1713 | | - |
---|
1714 | | - vpxor reg_j, reg_j, reg_j |
---|
1715 | | - vpxor reg_i, reg_i, reg_i |
---|
1716 | | - |
---|
1717 | | - cmp $16, %r11 |
---|
1718 | | - jl _get_AAD_rest8\@ |
---|
1719 | | -_get_AAD_blocks\@: |
---|
1720 | | - vmovdqu (%r10), reg_i |
---|
1721 | | - vpshufb SHUF_MASK(%rip), reg_i, reg_i |
---|
1722 | | - vpxor reg_i, reg_j, reg_j |
---|
1723 | | - GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 |
---|
1724 | | - add $16, %r10 |
---|
1725 | | - sub $16, %r12 |
---|
1726 | | - sub $16, %r11 |
---|
1727 | | - cmp $16, %r11 |
---|
1728 | | - jge _get_AAD_blocks\@ |
---|
1729 | | - vmovdqu reg_j, reg_i |
---|
1730 | | - cmp $0, %r11 |
---|
1731 | | - je _get_AAD_done\@ |
---|
1732 | | - |
---|
1733 | | - vpxor reg_i, reg_i, reg_i |
---|
1734 | | - |
---|
1735 | | - /* read the last <16B of AAD. since we have at least 4B of |
---|
1736 | | - data right after the AAD (the ICV, and maybe some CT), we can |
---|
1737 | | - read 4B/8B blocks safely, and then get rid of the extra stuff */ |
---|
1738 | | -_get_AAD_rest8\@: |
---|
1739 | | - cmp $4, %r11 |
---|
1740 | | - jle _get_AAD_rest4\@ |
---|
1741 | | - movq (%r10), \T1 |
---|
1742 | | - add $8, %r10 |
---|
1743 | | - sub $8, %r11 |
---|
1744 | | - vpslldq $8, \T1, \T1 |
---|
1745 | | - vpsrldq $8, reg_i, reg_i |
---|
1746 | | - vpxor \T1, reg_i, reg_i |
---|
1747 | | - jmp _get_AAD_rest8\@ |
---|
1748 | | -_get_AAD_rest4\@: |
---|
1749 | | - cmp $0, %r11 |
---|
1750 | | - jle _get_AAD_rest0\@ |
---|
1751 | | - mov (%r10), %eax |
---|
1752 | | - movq %rax, \T1 |
---|
1753 | | - add $4, %r10 |
---|
1754 | | - sub $4, %r11 |
---|
1755 | | - vpslldq $12, \T1, \T1 |
---|
1756 | | - vpsrldq $4, reg_i, reg_i |
---|
1757 | | - vpxor \T1, reg_i, reg_i |
---|
1758 | | -_get_AAD_rest0\@: |
---|
1759 | | - /* finalize: shift out the extra bytes we read, and align |
---|
1760 | | - left. since pslldq can only shift by an immediate, we use |
---|
1761 | | - vpshufb and an array of shuffle masks */ |
---|
1762 | | - movq %r12, %r11 |
---|
1763 | | - salq $4, %r11 |
---|
1764 | | - movdqu aad_shift_arr(%r11), \T1 |
---|
1765 | | - vpshufb \T1, reg_i, reg_i |
---|
1766 | | -_get_AAD_rest_final\@: |
---|
1767 | | - vpshufb SHUF_MASK(%rip), reg_i, reg_i |
---|
1768 | | - vpxor reg_j, reg_i, reg_i |
---|
1769 | | - GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6 |
---|
1770 | | - |
---|
1771 | | -_get_AAD_done\@: |
---|
1772 | | - # initialize the data pointer offset as zero |
---|
1773 | | - xor %r11d, %r11d |
---|
| 1952 | + vmovdqu AadHash(arg2), reg_i |
---|
1774 | 1953 | |
---|
1775 | 1954 | # start AES for num_initial_blocks blocks |
---|
1776 | | - mov arg5, %rax # rax = *Y0 |
---|
1777 | | - vmovdqu (%rax), \CTR # CTR = Y0 |
---|
1778 | | - vpshufb SHUF_MASK(%rip), \CTR, \CTR |
---|
1779 | | - |
---|
| 1955 | + vmovdqu CurCount(arg2), \CTR |
---|
1780 | 1956 | |
---|
1781 | 1957 | i = (9-\num_initial_blocks) |
---|
1782 | 1958 | setreg |
---|
.. | .. |
---|
1799 | 1975 | |
---|
1800 | 1976 | j = 1 |
---|
1801 | 1977 | setreg |
---|
1802 | | -.rep 9 |
---|
| 1978 | +.rep \REP |
---|
1803 | 1979 | vmovdqa 16*j(arg1), \T_key |
---|
1804 | 1980 | i = (9-\num_initial_blocks) |
---|
1805 | 1981 | setreg |
---|
.. | .. |
---|
1814 | 1990 | .endr |
---|
1815 | 1991 | |
---|
1816 | 1992 | |
---|
1817 | | - vmovdqa 16*10(arg1), \T_key |
---|
| 1993 | + vmovdqa 16*j(arg1), \T_key |
---|
1818 | 1994 | i = (9-\num_initial_blocks) |
---|
1819 | 1995 | setreg |
---|
1820 | 1996 | .rep \num_initial_blocks |
---|
.. | .. |
---|
1826 | 2002 | i = (9-\num_initial_blocks) |
---|
1827 | 2003 | setreg |
---|
1828 | 2004 | .rep \num_initial_blocks |
---|
1829 | | - vmovdqu (arg3, %r11), \T1 |
---|
| 2005 | + vmovdqu (arg4, %r11), \T1 |
---|
1830 | 2006 | vpxor \T1, reg_i, reg_i |
---|
1831 | | - vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for |
---|
| 2007 | + vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for |
---|
1832 | 2008 | # num_initial_blocks blocks |
---|
1833 | 2009 | add $16, %r11 |
---|
1834 | 2010 | .if \ENC_DEC == DEC |
---|
.. | .. |
---|
1905 | 2081 | |
---|
1906 | 2082 | i = 1 |
---|
1907 | 2083 | setreg |
---|
1908 | | -.rep 9 # do 9 rounds |
---|
| 2084 | +.rep \REP # do REP rounds |
---|
1909 | 2085 | vmovdqa 16*i(arg1), \T_key |
---|
1910 | 2086 | vaesenc \T_key, \XMM1, \XMM1 |
---|
1911 | 2087 | vaesenc \T_key, \XMM2, \XMM2 |
---|
.. | .. |
---|
1930 | 2106 | vaesenclast \T_key, \XMM7, \XMM7 |
---|
1931 | 2107 | vaesenclast \T_key, \XMM8, \XMM8 |
---|
1932 | 2108 | |
---|
1933 | | - vmovdqu (arg3, %r11), \T1 |
---|
| 2109 | + vmovdqu (arg4, %r11), \T1 |
---|
1934 | 2110 | vpxor \T1, \XMM1, \XMM1 |
---|
1935 | | - vmovdqu \XMM1, (arg2 , %r11) |
---|
| 2111 | + vmovdqu \XMM1, (arg3 , %r11) |
---|
1936 | 2112 | .if \ENC_DEC == DEC |
---|
1937 | 2113 | vmovdqa \T1, \XMM1 |
---|
1938 | 2114 | .endif |
---|
1939 | 2115 | |
---|
1940 | | - vmovdqu 16*1(arg3, %r11), \T1 |
---|
| 2116 | + vmovdqu 16*1(arg4, %r11), \T1 |
---|
1941 | 2117 | vpxor \T1, \XMM2, \XMM2 |
---|
1942 | | - vmovdqu \XMM2, 16*1(arg2 , %r11) |
---|
| 2118 | + vmovdqu \XMM2, 16*1(arg3 , %r11) |
---|
1943 | 2119 | .if \ENC_DEC == DEC |
---|
1944 | 2120 | vmovdqa \T1, \XMM2 |
---|
1945 | 2121 | .endif |
---|
1946 | 2122 | |
---|
1947 | | - vmovdqu 16*2(arg3, %r11), \T1 |
---|
| 2123 | + vmovdqu 16*2(arg4, %r11), \T1 |
---|
1948 | 2124 | vpxor \T1, \XMM3, \XMM3 |
---|
1949 | | - vmovdqu \XMM3, 16*2(arg2 , %r11) |
---|
| 2125 | + vmovdqu \XMM3, 16*2(arg3 , %r11) |
---|
1950 | 2126 | .if \ENC_DEC == DEC |
---|
1951 | 2127 | vmovdqa \T1, \XMM3 |
---|
1952 | 2128 | .endif |
---|
1953 | 2129 | |
---|
1954 | | - vmovdqu 16*3(arg3, %r11), \T1 |
---|
| 2130 | + vmovdqu 16*3(arg4, %r11), \T1 |
---|
1955 | 2131 | vpxor \T1, \XMM4, \XMM4 |
---|
1956 | | - vmovdqu \XMM4, 16*3(arg2 , %r11) |
---|
| 2132 | + vmovdqu \XMM4, 16*3(arg3 , %r11) |
---|
1957 | 2133 | .if \ENC_DEC == DEC |
---|
1958 | 2134 | vmovdqa \T1, \XMM4 |
---|
1959 | 2135 | .endif |
---|
1960 | 2136 | |
---|
1961 | | - vmovdqu 16*4(arg3, %r11), \T1 |
---|
| 2137 | + vmovdqu 16*4(arg4, %r11), \T1 |
---|
1962 | 2138 | vpxor \T1, \XMM5, \XMM5 |
---|
1963 | | - vmovdqu \XMM5, 16*4(arg2 , %r11) |
---|
| 2139 | + vmovdqu \XMM5, 16*4(arg3 , %r11) |
---|
1964 | 2140 | .if \ENC_DEC == DEC |
---|
1965 | 2141 | vmovdqa \T1, \XMM5 |
---|
1966 | 2142 | .endif |
---|
1967 | 2143 | |
---|
1968 | | - vmovdqu 16*5(arg3, %r11), \T1 |
---|
| 2144 | + vmovdqu 16*5(arg4, %r11), \T1 |
---|
1969 | 2145 | vpxor \T1, \XMM6, \XMM6 |
---|
1970 | | - vmovdqu \XMM6, 16*5(arg2 , %r11) |
---|
| 2146 | + vmovdqu \XMM6, 16*5(arg3 , %r11) |
---|
1971 | 2147 | .if \ENC_DEC == DEC |
---|
1972 | 2148 | vmovdqa \T1, \XMM6 |
---|
1973 | 2149 | .endif |
---|
1974 | 2150 | |
---|
1975 | | - vmovdqu 16*6(arg3, %r11), \T1 |
---|
| 2151 | + vmovdqu 16*6(arg4, %r11), \T1 |
---|
1976 | 2152 | vpxor \T1, \XMM7, \XMM7 |
---|
1977 | | - vmovdqu \XMM7, 16*6(arg2 , %r11) |
---|
| 2153 | + vmovdqu \XMM7, 16*6(arg3 , %r11) |
---|
1978 | 2154 | .if \ENC_DEC == DEC |
---|
1979 | 2155 | vmovdqa \T1, \XMM7 |
---|
1980 | 2156 | .endif |
---|
1981 | 2157 | |
---|
1982 | | - vmovdqu 16*7(arg3, %r11), \T1 |
---|
| 2158 | + vmovdqu 16*7(arg4, %r11), \T1 |
---|
1983 | 2159 | vpxor \T1, \XMM8, \XMM8 |
---|
1984 | | - vmovdqu \XMM8, 16*7(arg2 , %r11) |
---|
| 2160 | + vmovdqu \XMM8, 16*7(arg3 , %r11) |
---|
1985 | 2161 | .if \ENC_DEC == DEC |
---|
1986 | 2162 | vmovdqa \T1, \XMM8 |
---|
1987 | 2163 | .endif |
---|
.. | .. |
---|
2010 | 2186 | |
---|
2011 | 2187 | # encrypt 8 blocks at a time |
---|
2012 | 2188 | # ghash the 8 previously encrypted ciphertext blocks |
---|
2013 | | -# arg1, arg2, arg3 are used as pointers only, not modified |
---|
| 2189 | +# arg1, arg3, arg4 are used as pointers only, not modified |
---|
2014 | 2190 | # r11 is the data offset value |
---|
2015 | | -.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC |
---|
| 2191 | +.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC |
---|
2016 | 2192 | |
---|
2017 | 2193 | vmovdqa \XMM1, \T2 |
---|
2018 | 2194 | vmovdqa \XMM2, TMP2(%rsp) |
---|
.. | .. |
---|
2096 | 2272 | |
---|
2097 | 2273 | ####################################################################### |
---|
2098 | 2274 | |
---|
2099 | | - vmovdqa HashKey_8(arg1), \T5 |
---|
| 2275 | + vmovdqu HashKey_8(arg2), \T5 |
---|
2100 | 2276 | vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 |
---|
2101 | 2277 | vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 |
---|
2102 | 2278 | vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0 |
---|
.. | .. |
---|
2114 | 2290 | vaesenc \T1, \XMM8, \XMM8 |
---|
2115 | 2291 | |
---|
2116 | 2292 | vmovdqa TMP2(%rsp), \T1 |
---|
2117 | | - vmovdqa HashKey_7(arg1), \T5 |
---|
| 2293 | + vmovdqu HashKey_7(arg2), \T5 |
---|
2118 | 2294 | vpclmulqdq $0x11, \T5, \T1, \T3 |
---|
2119 | 2295 | vpxor \T3, \T4, \T4 |
---|
2120 | 2296 | |
---|
.. | .. |
---|
2140 | 2316 | ####################################################################### |
---|
2141 | 2317 | |
---|
2142 | 2318 | vmovdqa TMP3(%rsp), \T1 |
---|
2143 | | - vmovdqa HashKey_6(arg1), \T5 |
---|
| 2319 | + vmovdqu HashKey_6(arg2), \T5 |
---|
2144 | 2320 | vpclmulqdq $0x11, \T5, \T1, \T3 |
---|
2145 | 2321 | vpxor \T3, \T4, \T4 |
---|
2146 | 2322 | |
---|
.. | .. |
---|
2164 | 2340 | vaesenc \T1, \XMM8, \XMM8 |
---|
2165 | 2341 | |
---|
2166 | 2342 | vmovdqa TMP4(%rsp), \T1 |
---|
2167 | | - vmovdqa HashKey_5(arg1), \T5 |
---|
| 2343 | + vmovdqu HashKey_5(arg2), \T5 |
---|
2168 | 2344 | vpclmulqdq $0x11, \T5, \T1, \T3 |
---|
2169 | 2345 | vpxor \T3, \T4, \T4 |
---|
2170 | 2346 | |
---|
.. | .. |
---|
2189 | 2365 | |
---|
2190 | 2366 | |
---|
2191 | 2367 | vmovdqa TMP5(%rsp), \T1 |
---|
2192 | | - vmovdqa HashKey_4(arg1), \T5 |
---|
| 2368 | + vmovdqu HashKey_4(arg2), \T5 |
---|
2193 | 2369 | vpclmulqdq $0x11, \T5, \T1, \T3 |
---|
2194 | 2370 | vpxor \T3, \T4, \T4 |
---|
2195 | 2371 | |
---|
.. | .. |
---|
2213 | 2389 | vaesenc \T1, \XMM8, \XMM8 |
---|
2214 | 2390 | |
---|
2215 | 2391 | vmovdqa TMP6(%rsp), \T1 |
---|
2216 | | - vmovdqa HashKey_3(arg1), \T5 |
---|
| 2392 | + vmovdqu HashKey_3(arg2), \T5 |
---|
2217 | 2393 | vpclmulqdq $0x11, \T5, \T1, \T3 |
---|
2218 | 2394 | vpxor \T3, \T4, \T4 |
---|
2219 | 2395 | |
---|
.. | .. |
---|
2237 | 2413 | vaesenc \T1, \XMM8, \XMM8 |
---|
2238 | 2414 | |
---|
2239 | 2415 | vmovdqa TMP7(%rsp), \T1 |
---|
2240 | | - vmovdqa HashKey_2(arg1), \T5 |
---|
| 2416 | + vmovdqu HashKey_2(arg2), \T5 |
---|
2241 | 2417 | vpclmulqdq $0x11, \T5, \T1, \T3 |
---|
2242 | 2418 | vpxor \T3, \T4, \T4 |
---|
2243 | 2419 | |
---|
.. | .. |
---|
2264 | 2440 | vaesenc \T5, \XMM8, \XMM8 |
---|
2265 | 2441 | |
---|
2266 | 2442 | vmovdqa TMP8(%rsp), \T1 |
---|
2267 | | - vmovdqa HashKey(arg1), \T5 |
---|
| 2443 | + vmovdqu HashKey(arg2), \T5 |
---|
2268 | 2444 | |
---|
2269 | 2445 | vpclmulqdq $0x00, \T5, \T1, \T3 |
---|
2270 | 2446 | vpxor \T3, \T7, \T7 |
---|
.. | .. |
---|
2281 | 2457 | |
---|
2282 | 2458 | vmovdqu 16*10(arg1), \T5 |
---|
2283 | 2459 | |
---|
| 2460 | + i = 11 |
---|
| 2461 | + setreg |
---|
| 2462 | +.rep (\REP-9) |
---|
| 2463 | + vaesenc \T5, \XMM1, \XMM1 |
---|
| 2464 | + vaesenc \T5, \XMM2, \XMM2 |
---|
| 2465 | + vaesenc \T5, \XMM3, \XMM3 |
---|
| 2466 | + vaesenc \T5, \XMM4, \XMM4 |
---|
| 2467 | + vaesenc \T5, \XMM5, \XMM5 |
---|
| 2468 | + vaesenc \T5, \XMM6, \XMM6 |
---|
| 2469 | + vaesenc \T5, \XMM7, \XMM7 |
---|
| 2470 | + vaesenc \T5, \XMM8, \XMM8 |
---|
| 2471 | + |
---|
| 2472 | + vmovdqu 16*i(arg1), \T5 |
---|
| 2473 | + i = i + 1 |
---|
| 2474 | + setreg |
---|
| 2475 | +.endr |
---|
| 2476 | + |
---|
2284 | 2477 | i = 0 |
---|
2285 | 2478 | j = 1 |
---|
2286 | 2479 | setreg |
---|
2287 | 2480 | .rep 8 |
---|
2288 | | - vpxor 16*i(arg3, %r11), \T5, \T2 |
---|
| 2481 | + vpxor 16*i(arg4, %r11), \T5, \T2 |
---|
2289 | 2482 | .if \ENC_DEC == ENC |
---|
2290 | 2483 | vaesenclast \T2, reg_j, reg_j |
---|
2291 | 2484 | .else |
---|
2292 | 2485 | vaesenclast \T2, reg_j, \T3 |
---|
2293 | | - vmovdqu 16*i(arg3, %r11), reg_j |
---|
2294 | | - vmovdqu \T3, 16*i(arg2, %r11) |
---|
| 2486 | + vmovdqu 16*i(arg4, %r11), reg_j |
---|
| 2487 | + vmovdqu \T3, 16*i(arg3, %r11) |
---|
2295 | 2488 | .endif |
---|
2296 | 2489 | i = (i+1) |
---|
2297 | 2490 | j = (j+1) |
---|
.. | .. |
---|
2317 | 2510 | vpxor \T2, \T7, \T7 # first phase of the reduction complete |
---|
2318 | 2511 | ####################################################################### |
---|
2319 | 2512 | .if \ENC_DEC == ENC |
---|
2320 | | - vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer |
---|
2321 | | - vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer |
---|
2322 | | - vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer |
---|
2323 | | - vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer |
---|
2324 | | - vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer |
---|
2325 | | - vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer |
---|
2326 | | - vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer |
---|
2327 | | - vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer |
---|
| 2513 | + vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer |
---|
| 2514 | + vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer |
---|
| 2515 | + vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer |
---|
| 2516 | + vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer |
---|
| 2517 | + vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer |
---|
| 2518 | + vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer |
---|
| 2519 | + vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer |
---|
| 2520 | + vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer |
---|
2328 | 2521 | .endif |
---|
2329 | 2522 | |
---|
2330 | 2523 | ####################################################################### |
---|
.. | .. |
---|
2361 | 2554 | |
---|
2362 | 2555 | ## Karatsuba Method |
---|
2363 | 2556 | |
---|
2364 | | - vmovdqa HashKey_8(arg1), \T5 |
---|
| 2557 | + vmovdqu HashKey_8(arg2), \T5 |
---|
2365 | 2558 | |
---|
2366 | 2559 | vpshufd $0b01001110, \XMM1, \T2 |
---|
2367 | 2560 | vpshufd $0b01001110, \T5, \T3 |
---|
.. | .. |
---|
2375 | 2568 | |
---|
2376 | 2569 | ###################### |
---|
2377 | 2570 | |
---|
2378 | | - vmovdqa HashKey_7(arg1), \T5 |
---|
| 2571 | + vmovdqu HashKey_7(arg2), \T5 |
---|
2379 | 2572 | vpshufd $0b01001110, \XMM2, \T2 |
---|
2380 | 2573 | vpshufd $0b01001110, \T5, \T3 |
---|
2381 | 2574 | vpxor \XMM2, \T2, \T2 |
---|
.. | .. |
---|
2393 | 2586 | |
---|
2394 | 2587 | ###################### |
---|
2395 | 2588 | |
---|
2396 | | - vmovdqa HashKey_6(arg1), \T5 |
---|
| 2589 | + vmovdqu HashKey_6(arg2), \T5 |
---|
2397 | 2590 | vpshufd $0b01001110, \XMM3, \T2 |
---|
2398 | 2591 | vpshufd $0b01001110, \T5, \T3 |
---|
2399 | 2592 | vpxor \XMM3, \T2, \T2 |
---|
.. | .. |
---|
2411 | 2604 | |
---|
2412 | 2605 | ###################### |
---|
2413 | 2606 | |
---|
2414 | | - vmovdqa HashKey_5(arg1), \T5 |
---|
| 2607 | + vmovdqu HashKey_5(arg2), \T5 |
---|
2415 | 2608 | vpshufd $0b01001110, \XMM4, \T2 |
---|
2416 | 2609 | vpshufd $0b01001110, \T5, \T3 |
---|
2417 | 2610 | vpxor \XMM4, \T2, \T2 |
---|
.. | .. |
---|
2429 | 2622 | |
---|
2430 | 2623 | ###################### |
---|
2431 | 2624 | |
---|
2432 | | - vmovdqa HashKey_4(arg1), \T5 |
---|
| 2625 | + vmovdqu HashKey_4(arg2), \T5 |
---|
2433 | 2626 | vpshufd $0b01001110, \XMM5, \T2 |
---|
2434 | 2627 | vpshufd $0b01001110, \T5, \T3 |
---|
2435 | 2628 | vpxor \XMM5, \T2, \T2 |
---|
.. | .. |
---|
2447 | 2640 | |
---|
2448 | 2641 | ###################### |
---|
2449 | 2642 | |
---|
2450 | | - vmovdqa HashKey_3(arg1), \T5 |
---|
| 2643 | + vmovdqu HashKey_3(arg2), \T5 |
---|
2451 | 2644 | vpshufd $0b01001110, \XMM6, \T2 |
---|
2452 | 2645 | vpshufd $0b01001110, \T5, \T3 |
---|
2453 | 2646 | vpxor \XMM6, \T2, \T2 |
---|
.. | .. |
---|
2465 | 2658 | |
---|
2466 | 2659 | ###################### |
---|
2467 | 2660 | |
---|
2468 | | - vmovdqa HashKey_2(arg1), \T5 |
---|
| 2661 | + vmovdqu HashKey_2(arg2), \T5 |
---|
2469 | 2662 | vpshufd $0b01001110, \XMM7, \T2 |
---|
2470 | 2663 | vpshufd $0b01001110, \T5, \T3 |
---|
2471 | 2664 | vpxor \XMM7, \T2, \T2 |
---|
.. | .. |
---|
2483 | 2676 | |
---|
2484 | 2677 | ###################### |
---|
2485 | 2678 | |
---|
2486 | | - vmovdqa HashKey(arg1), \T5 |
---|
| 2679 | + vmovdqu HashKey(arg2), \T5 |
---|
2487 | 2680 | vpshufd $0b01001110, \XMM8, \T2 |
---|
2488 | 2681 | vpshufd $0b01001110, \T5, \T3 |
---|
2489 | 2682 | vpxor \XMM8, \T2, \T2 |
---|
.. | .. |
---|
2536 | 2729 | |
---|
2537 | 2730 | |
---|
2538 | 2731 | |
---|
2539 | | -# combined for GCM encrypt and decrypt functions |
---|
2540 | | -# clobbering all xmm registers |
---|
2541 | | -# clobbering r10, r11, r12, r13, r14, r15 |
---|
2542 | | -.macro GCM_ENC_DEC_AVX2 ENC_DEC |
---|
2543 | | - |
---|
2544 | | - #the number of pushes must equal STACK_OFFSET |
---|
2545 | | - push %r12 |
---|
2546 | | - push %r13 |
---|
2547 | | - push %r14 |
---|
2548 | | - push %r15 |
---|
2549 | | - |
---|
2550 | | - mov %rsp, %r14 |
---|
2551 | | - |
---|
2552 | | - |
---|
2553 | | - |
---|
2554 | | - |
---|
2555 | | - sub $VARIABLE_OFFSET, %rsp |
---|
2556 | | - and $~63, %rsp # align rsp to 64 bytes |
---|
2557 | | - |
---|
2558 | | - |
---|
2559 | | - vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey |
---|
2560 | | - |
---|
2561 | | - mov arg4, %r13 # save the number of bytes of plaintext/ciphertext |
---|
2562 | | - and $-16, %r13 # r13 = r13 - (r13 mod 16) |
---|
2563 | | - |
---|
2564 | | - mov %r13, %r12 |
---|
2565 | | - shr $4, %r12 |
---|
2566 | | - and $7, %r12 |
---|
2567 | | - jz _initial_num_blocks_is_0\@ |
---|
2568 | | - |
---|
2569 | | - cmp $7, %r12 |
---|
2570 | | - je _initial_num_blocks_is_7\@ |
---|
2571 | | - cmp $6, %r12 |
---|
2572 | | - je _initial_num_blocks_is_6\@ |
---|
2573 | | - cmp $5, %r12 |
---|
2574 | | - je _initial_num_blocks_is_5\@ |
---|
2575 | | - cmp $4, %r12 |
---|
2576 | | - je _initial_num_blocks_is_4\@ |
---|
2577 | | - cmp $3, %r12 |
---|
2578 | | - je _initial_num_blocks_is_3\@ |
---|
2579 | | - cmp $2, %r12 |
---|
2580 | | - je _initial_num_blocks_is_2\@ |
---|
2581 | | - |
---|
2582 | | - jmp _initial_num_blocks_is_1\@ |
---|
2583 | | - |
---|
2584 | | -_initial_num_blocks_is_7\@: |
---|
2585 | | - INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC |
---|
2586 | | - sub $16*7, %r13 |
---|
2587 | | - jmp _initial_blocks_encrypted\@ |
---|
2588 | | - |
---|
2589 | | -_initial_num_blocks_is_6\@: |
---|
2590 | | - INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC |
---|
2591 | | - sub $16*6, %r13 |
---|
2592 | | - jmp _initial_blocks_encrypted\@ |
---|
2593 | | - |
---|
2594 | | -_initial_num_blocks_is_5\@: |
---|
2595 | | - INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC |
---|
2596 | | - sub $16*5, %r13 |
---|
2597 | | - jmp _initial_blocks_encrypted\@ |
---|
2598 | | - |
---|
2599 | | -_initial_num_blocks_is_4\@: |
---|
2600 | | - INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC |
---|
2601 | | - sub $16*4, %r13 |
---|
2602 | | - jmp _initial_blocks_encrypted\@ |
---|
2603 | | - |
---|
2604 | | -_initial_num_blocks_is_3\@: |
---|
2605 | | - INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC |
---|
2606 | | - sub $16*3, %r13 |
---|
2607 | | - jmp _initial_blocks_encrypted\@ |
---|
2608 | | - |
---|
2609 | | -_initial_num_blocks_is_2\@: |
---|
2610 | | - INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC |
---|
2611 | | - sub $16*2, %r13 |
---|
2612 | | - jmp _initial_blocks_encrypted\@ |
---|
2613 | | - |
---|
2614 | | -_initial_num_blocks_is_1\@: |
---|
2615 | | - INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC |
---|
2616 | | - sub $16*1, %r13 |
---|
2617 | | - jmp _initial_blocks_encrypted\@ |
---|
2618 | | - |
---|
2619 | | -_initial_num_blocks_is_0\@: |
---|
2620 | | - INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC |
---|
2621 | | - |
---|
2622 | | - |
---|
2623 | | -_initial_blocks_encrypted\@: |
---|
2624 | | - cmp $0, %r13 |
---|
2625 | | - je _zero_cipher_left\@ |
---|
2626 | | - |
---|
2627 | | - sub $128, %r13 |
---|
2628 | | - je _eight_cipher_left\@ |
---|
2629 | | - |
---|
2630 | | - |
---|
2631 | | - |
---|
2632 | | - |
---|
2633 | | - vmovd %xmm9, %r15d |
---|
2634 | | - and $255, %r15d |
---|
2635 | | - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 |
---|
2636 | | - |
---|
2637 | | - |
---|
2638 | | -_encrypt_by_8_new\@: |
---|
2639 | | - cmp $(255-8), %r15d |
---|
2640 | | - jg _encrypt_by_8\@ |
---|
2641 | | - |
---|
2642 | | - |
---|
2643 | | - |
---|
2644 | | - add $8, %r15b |
---|
2645 | | - GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC |
---|
2646 | | - add $128, %r11 |
---|
2647 | | - sub $128, %r13 |
---|
2648 | | - jne _encrypt_by_8_new\@ |
---|
2649 | | - |
---|
2650 | | - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 |
---|
2651 | | - jmp _eight_cipher_left\@ |
---|
2652 | | - |
---|
2653 | | -_encrypt_by_8\@: |
---|
2654 | | - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 |
---|
2655 | | - add $8, %r15b |
---|
2656 | | - GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC |
---|
2657 | | - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 |
---|
2658 | | - add $128, %r11 |
---|
2659 | | - sub $128, %r13 |
---|
2660 | | - jne _encrypt_by_8_new\@ |
---|
2661 | | - |
---|
2662 | | - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 |
---|
2663 | | - |
---|
2664 | | - |
---|
2665 | | - |
---|
2666 | | - |
---|
2667 | | -_eight_cipher_left\@: |
---|
2668 | | - GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 |
---|
2669 | | - |
---|
2670 | | - |
---|
2671 | | -_zero_cipher_left\@: |
---|
2672 | | - cmp $16, arg4 |
---|
2673 | | - jl _only_less_than_16\@ |
---|
2674 | | - |
---|
2675 | | - mov arg4, %r13 |
---|
2676 | | - and $15, %r13 # r13 = (arg4 mod 16) |
---|
2677 | | - |
---|
2678 | | - je _multiple_of_16_bytes\@ |
---|
2679 | | - |
---|
2680 | | - # handle the last <16 Byte block seperately |
---|
2681 | | - |
---|
2682 | | - |
---|
2683 | | - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn |
---|
2684 | | - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 |
---|
2685 | | - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) |
---|
2686 | | - |
---|
2687 | | - sub $16, %r11 |
---|
2688 | | - add %r13, %r11 |
---|
2689 | | - vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block |
---|
2690 | | - |
---|
2691 | | - lea SHIFT_MASK+16(%rip), %r12 |
---|
2692 | | - sub %r13, %r12 # adjust the shuffle mask pointer |
---|
2693 | | - # to be able to shift 16-r13 bytes |
---|
2694 | | - # (r13 is the number of bytes in plaintext mod 16) |
---|
2695 | | - vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask |
---|
2696 | | - vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes |
---|
2697 | | - jmp _final_ghash_mul\@ |
---|
2698 | | - |
---|
2699 | | -_only_less_than_16\@: |
---|
2700 | | - # check for 0 length |
---|
2701 | | - mov arg4, %r13 |
---|
2702 | | - and $15, %r13 # r13 = (arg4 mod 16) |
---|
2703 | | - |
---|
2704 | | - je _multiple_of_16_bytes\@ |
---|
2705 | | - |
---|
2706 | | - # handle the last <16 Byte block seperately |
---|
2707 | | - |
---|
2708 | | - |
---|
2709 | | - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn |
---|
2710 | | - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 |
---|
2711 | | - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) |
---|
2712 | | - |
---|
2713 | | - |
---|
2714 | | - lea SHIFT_MASK+16(%rip), %r12 |
---|
2715 | | - sub %r13, %r12 # adjust the shuffle mask pointer to be |
---|
2716 | | - # able to shift 16-r13 bytes (r13 is the |
---|
2717 | | - # number of bytes in plaintext mod 16) |
---|
2718 | | - |
---|
2719 | | -_get_last_16_byte_loop\@: |
---|
2720 | | - movb (arg3, %r11), %al |
---|
2721 | | - movb %al, TMP1 (%rsp , %r11) |
---|
2722 | | - add $1, %r11 |
---|
2723 | | - cmp %r13, %r11 |
---|
2724 | | - jne _get_last_16_byte_loop\@ |
---|
2725 | | - |
---|
2726 | | - vmovdqu TMP1(%rsp), %xmm1 |
---|
2727 | | - |
---|
2728 | | - sub $16, %r11 |
---|
2729 | | - |
---|
2730 | | -_final_ghash_mul\@: |
---|
2731 | | - .if \ENC_DEC == DEC |
---|
2732 | | - vmovdqa %xmm1, %xmm2 |
---|
2733 | | - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) |
---|
2734 | | - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9 |
---|
2735 | | - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 |
---|
2736 | | - vpand %xmm1, %xmm2, %xmm2 |
---|
2737 | | - vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 |
---|
2738 | | - vpxor %xmm2, %xmm14, %xmm14 |
---|
2739 | | - #GHASH computation for the last <16 Byte block |
---|
2740 | | - GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 |
---|
2741 | | - sub %r13, %r11 |
---|
2742 | | - add $16, %r11 |
---|
2743 | | - .else |
---|
2744 | | - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) |
---|
2745 | | - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9 |
---|
2746 | | - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 |
---|
2747 | | - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 |
---|
2748 | | - vpxor %xmm9, %xmm14, %xmm14 |
---|
2749 | | - #GHASH computation for the last <16 Byte block |
---|
2750 | | - GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 |
---|
2751 | | - sub %r13, %r11 |
---|
2752 | | - add $16, %r11 |
---|
2753 | | - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext |
---|
2754 | | - .endif |
---|
2755 | | - |
---|
2756 | | - |
---|
2757 | | - ############################# |
---|
2758 | | - # output r13 Bytes |
---|
2759 | | - vmovq %xmm9, %rax |
---|
2760 | | - cmp $8, %r13 |
---|
2761 | | - jle _less_than_8_bytes_left\@ |
---|
2762 | | - |
---|
2763 | | - mov %rax, (arg2 , %r11) |
---|
2764 | | - add $8, %r11 |
---|
2765 | | - vpsrldq $8, %xmm9, %xmm9 |
---|
2766 | | - vmovq %xmm9, %rax |
---|
2767 | | - sub $8, %r13 |
---|
2768 | | - |
---|
2769 | | -_less_than_8_bytes_left\@: |
---|
2770 | | - movb %al, (arg2 , %r11) |
---|
2771 | | - add $1, %r11 |
---|
2772 | | - shr $8, %rax |
---|
2773 | | - sub $1, %r13 |
---|
2774 | | - jne _less_than_8_bytes_left\@ |
---|
2775 | | - ############################# |
---|
2776 | | - |
---|
2777 | | -_multiple_of_16_bytes\@: |
---|
2778 | | - mov arg7, %r12 # r12 = aadLen (number of bytes) |
---|
2779 | | - shl $3, %r12 # convert into number of bits |
---|
2780 | | - vmovd %r12d, %xmm15 # len(A) in xmm15 |
---|
2781 | | - |
---|
2782 | | - shl $3, arg4 # len(C) in bits (*128) |
---|
2783 | | - vmovq arg4, %xmm1 |
---|
2784 | | - vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 |
---|
2785 | | - vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) |
---|
2786 | | - |
---|
2787 | | - vpxor %xmm15, %xmm14, %xmm14 |
---|
2788 | | - GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation |
---|
2789 | | - vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap |
---|
2790 | | - |
---|
2791 | | - mov arg5, %rax # rax = *Y0 |
---|
2792 | | - vmovdqu (%rax), %xmm9 # xmm9 = Y0 |
---|
2793 | | - |
---|
2794 | | - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) |
---|
2795 | | - |
---|
2796 | | - vpxor %xmm14, %xmm9, %xmm9 |
---|
2797 | | - |
---|
2798 | | - |
---|
2799 | | - |
---|
2800 | | -_return_T\@: |
---|
2801 | | - mov arg8, %r10 # r10 = authTag |
---|
2802 | | - mov arg9, %r11 # r11 = auth_tag_len |
---|
2803 | | - |
---|
2804 | | - cmp $16, %r11 |
---|
2805 | | - je _T_16\@ |
---|
2806 | | - |
---|
2807 | | - cmp $8, %r11 |
---|
2808 | | - jl _T_4\@ |
---|
2809 | | - |
---|
2810 | | -_T_8\@: |
---|
2811 | | - vmovq %xmm9, %rax |
---|
2812 | | - mov %rax, (%r10) |
---|
2813 | | - add $8, %r10 |
---|
2814 | | - sub $8, %r11 |
---|
2815 | | - vpsrldq $8, %xmm9, %xmm9 |
---|
2816 | | - cmp $0, %r11 |
---|
2817 | | - je _return_T_done\@ |
---|
2818 | | -_T_4\@: |
---|
2819 | | - vmovd %xmm9, %eax |
---|
2820 | | - mov %eax, (%r10) |
---|
2821 | | - add $4, %r10 |
---|
2822 | | - sub $4, %r11 |
---|
2823 | | - vpsrldq $4, %xmm9, %xmm9 |
---|
2824 | | - cmp $0, %r11 |
---|
2825 | | - je _return_T_done\@ |
---|
2826 | | -_T_123\@: |
---|
2827 | | - vmovd %xmm9, %eax |
---|
2828 | | - cmp $2, %r11 |
---|
2829 | | - jl _T_1\@ |
---|
2830 | | - mov %ax, (%r10) |
---|
2831 | | - cmp $2, %r11 |
---|
2832 | | - je _return_T_done\@ |
---|
2833 | | - add $2, %r10 |
---|
2834 | | - sar $16, %eax |
---|
2835 | | -_T_1\@: |
---|
2836 | | - mov %al, (%r10) |
---|
2837 | | - jmp _return_T_done\@ |
---|
2838 | | - |
---|
2839 | | -_T_16\@: |
---|
2840 | | - vmovdqu %xmm9, (%r10) |
---|
2841 | | - |
---|
2842 | | -_return_T_done\@: |
---|
2843 | | - mov %r14, %rsp |
---|
2844 | | - |
---|
2845 | | - pop %r15 |
---|
2846 | | - pop %r14 |
---|
2847 | | - pop %r13 |
---|
2848 | | - pop %r12 |
---|
2849 | | -.endm |
---|
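The macro tail above folds the 128-bit length block (64-bit AAD length and 64-bit ciphertext length, both in bits) into the GHASH state, encrypts Y0, XORs the two, and then stores only auth_tag_len bytes of the result via the _return_T/_T_16/_T_8/_T_4/_T_123 paths. A minimal C sketch of that last step, with gcm_emit_tag being a hypothetical helper and both inputs assumed to be already computed:

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical helper mirroring _return_T above: T = GHASH ^ E(K, Y0),
     * truncated to auth_tag_len bytes (16 most likely, 12 or 8 also valid). */
    static void gcm_emit_tag(uint8_t *auth_tag, uint64_t auth_tag_len,
                             const uint8_t ghash_final[16], const uint8_t ek_y0[16])
    {
            uint8_t tag[16];
            int i;

            for (i = 0; i < 16; i++)
                    tag[i] = ghash_final[i] ^ ek_y0[i];   /* full one-block tag */
            if (auth_tag_len > 16)
                    auth_tag_len = 16;                    /* never more than one block */
            memcpy(auth_tag, tag, auth_tag_len);          /* copy only the requested bytes */
    }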
2850 | | - |
---|
2851 | | - |
---|
2852 | 2732 | ############################################################# |
---|
2853 | | -#void aesni_gcm_precomp_avx_gen4 |
---|
| 2733 | +#void aesni_gcm_init_avx_gen4 |
---|
2854 | 2734 | # (gcm_data *my_ctx_data, |
---|
2855 | | -# u8 *hash_subkey)# /* H, the Hash sub key input. |
---|
2856 | | -# Data starts on a 16-byte boundary. */ |
---|
2857 | | -############################################################# |
---|
2858 | | -ENTRY(aesni_gcm_precomp_avx_gen4) |
---|
2859 | | - #the number of pushes must equal STACK_OFFSET |
---|
2860 | | - push %r12 |
---|
2861 | | - push %r13 |
---|
2862 | | - push %r14 |
---|
2863 | | - push %r15 |
---|
2864 | | - |
---|
2865 | | - mov %rsp, %r14 |
---|
2866 | | - |
---|
2867 | | - |
---|
2868 | | - |
---|
2869 | | - sub $VARIABLE_OFFSET, %rsp |
---|
2870 | | - and $~63, %rsp # align rsp to 64 bytes |
---|
2871 | | - |
---|
2872 | | - vmovdqu (arg2), %xmm6 # xmm6 = HashKey |
---|
2873 | | - |
---|
2874 | | - vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 |
---|
2875 | | - ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey |
---|
2876 | | - vmovdqa %xmm6, %xmm2 |
---|
2877 | | - vpsllq $1, %xmm6, %xmm6 |
---|
2878 | | - vpsrlq $63, %xmm2, %xmm2 |
---|
2879 | | - vmovdqa %xmm2, %xmm1 |
---|
2880 | | - vpslldq $8, %xmm2, %xmm2 |
---|
2881 | | - vpsrldq $8, %xmm1, %xmm1 |
---|
2882 | | - vpor %xmm2, %xmm6, %xmm6 |
---|
2883 | | - #reduction |
---|
2884 | | - vpshufd $0b00100100, %xmm1, %xmm2 |
---|
2885 | | - vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 |
---|
2886 | | - vpand POLY(%rip), %xmm2, %xmm2 |
---|
2887 | | - vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly |
---|
2888 | | - ####################################################################### |
---|
2889 | | - vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly |
---|
2890 | | - |
---|
2891 | | - |
---|
2892 | | - PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 |
---|
2893 | | - |
---|
2894 | | - mov %r14, %rsp |
---|
2895 | | - |
---|
2896 | | - pop %r15 |
---|
2897 | | - pop %r14 |
---|
2898 | | - pop %r13 |
---|
2899 | | - pop %r12 |
---|
2900 | | - ret |
---|
2901 | | -ENDPROC(aesni_gcm_precomp_avx_gen4) |
---|
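The precomputation above derives HashKey<<1 mod poly: each 64-bit lane is shifted left by one, the carry bits are moved across lanes, and if a bit fell off the top the POLY constant is folded back in. A self-contained C sketch of the same doubling, assuming the byte-swapped subkey is held as a hi:lo pair of 64-bit words the way xmm6 holds it:

    #include <stdint.h>

    /* Double the hash subkey in the bit-reflected GF(2^128) representation,
     * i.e. compute HashKey<<1 mod poly.  The hi:lo mapping to the xmm6 lanes
     * after the SHUF_MASK byte swap is an assumption of this sketch. */
    static void ghash_double_key(uint64_t *hi, uint64_t *lo)
    {
            uint64_t carry = *hi >> 63;   /* bit 127, lost by the shift         */
            uint64_t mid   = *lo >> 63;   /* bit 63, carried into the high half */

            *hi = (*hi << 1) | mid;
            *lo =  *lo << 1;
            if (carry) {                  /* reduction with the POLY constant   */
                    *hi ^= 0xC200000000000000ULL;
                    *lo ^= 0x0000000000000001ULL;
            }
    }

Doing this doubling once up front is what allows the later vpclmulqdq-based GHASH reduction to work directly in the bit-reflected representation.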
2902 | | - |
---|
2903 | | - |
---|
2904 | | -############################################################################### |
---|
2905 | | -#void aesni_gcm_enc_avx_gen4( |
---|
2906 | | -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ |
---|
2907 | | -# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ |
---|
2908 | | -# const u8 *in, /* Plaintext input */ |
---|
2909 | | -# u64 plaintext_len, /* Length of data in Bytes for encryption. */ |
---|
2910 | | -# u8 *iv, /* Pre-counter block j0: 4 byte salt |
---|
2911 | | -# (from Security Association) concatenated with 8 byte |
---|
2912 | | -# Initialisation Vector (from IPSec ESP Payload) |
---|
2913 | | -# concatenated with 0x00000001. 16-byte aligned pointer. */ |
---|
2914 | | -# const u8 *aad, /* Additional Authentication Data (AAD)*/ |
---|
2915 | | -# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ |
---|
2916 | | -# u8 *auth_tag, /* Authenticated Tag output. */ |
---|
2917 | | -# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. |
---|
2918 | | -# Valid values are 16 (most likely), 12 or 8. */ |
---|
2919 | | -############################################################################### |
---|
2920 | | -ENTRY(aesni_gcm_enc_avx_gen4) |
---|
2921 | | - GCM_ENC_DEC_AVX2 ENC |
---|
2922 | | - ret |
---|
2923 | | -ENDPROC(aesni_gcm_enc_avx_gen4) |
---|
2924 | | - |
---|
2925 | | -############################################################################### |
---|
2926 | | -#void aesni_gcm_dec_avx_gen4( |
---|
2927 | | -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ |
---|
2928 | | -# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ |
---|
2929 | | -# const u8 *in, /* Ciphertext input */ |
---|
2930 | | -# u64 plaintext_len, /* Length of data in Bytes for encryption. */ |
---|
| 2735 | +# gcm_context_data *data, |
---|
2931 | 2736 | # u8 *iv, /* Pre-counter block j0: 4 byte salt |
---|
2932 | 2737 | # (from Security Association) concatenated with 8 byte |
---|
2933 | 2738 | # Initialisation Vector (from IPSec ESP Payload) |
---|
2934 | 2739 | # concatenated with 0x00000001. 16-byte aligned pointer. */ |
---|
| 2740 | +# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ |
---|
2935 | 2741 | # const u8 *aad, /* Additional Authentication Data (AAD)*/ |
---|
2936 | | -# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ |
---|
| 2742 | +# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ |
---|
| 2743 | +############################################################# |
---|
| 2744 | +SYM_FUNC_START(aesni_gcm_init_avx_gen4) |
---|
| 2745 | + FUNC_SAVE |
---|
| 2746 | + INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2 |
---|
| 2747 | + FUNC_RESTORE |
---|
| 2748 | + RET |
---|
| 2749 | +SYM_FUNC_END(aesni_gcm_init_avx_gen4) |
---|
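The iv argument documented above is not a raw 12-byte nonce but the complete pre-counter block j0. A self-contained sketch of that layout, with build_j0 being a hypothetical helper rather than anything the kernel exports:

    #include <stdint.h>
    #include <string.h>

    /* Assemble the RFC4106-style pre-counter block: 4-byte salt, 8-byte IV,
     * then the big-endian 32-bit constant 1, as described in the comment above. */
    static void build_j0(uint8_t j0[16], const uint8_t salt[4], const uint8_t iv[8])
    {
            memcpy(j0, salt, 4);       /* bytes 0..3:   salt from the Security Association */
            memcpy(j0 + 4, iv, 8);     /* bytes 4..11:  explicit IV from the ESP payload    */
            j0[12] = 0;                /* bytes 12..15: counter, big-endian 0x00000001      */
            j0[13] = 0;
            j0[14] = 0;
            j0[15] = 1;
    }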
| 2750 | + |
---|
| 2751 | +############################################################################### |
---|
| 2752 | +#void aesni_gcm_enc_update_avx_gen4( |
---|
| 2753 | +# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ |
---|
| 2754 | +# gcm_context_data *data, |
---|
| 2755 | +# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ |
---|
| 2756 | +# const u8 *in, /* Plaintext input */ |
---|
| 2757 | +# u64 plaintext_len) /* Length of data in Bytes for encryption. */ |
---|
| 2758 | +############################################################################### |
---|
| 2759 | +SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4) |
---|
| 2760 | + FUNC_SAVE |
---|
| 2761 | + mov keysize,%eax |
---|
| 2762 | + cmp $32, %eax |
---|
| 2763 | + je key_256_enc_update4 |
---|
| 2764 | + cmp $16, %eax |
---|
| 2765 | + je key_128_enc_update4 |
---|
| 2766 | + # must be 192 |
---|
| 2767 | + GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11 |
---|
| 2768 | + FUNC_RESTORE |
---|
| 2769 | + RET |
---|
| 2770 | +key_128_enc_update4: |
---|
| 2771 | + GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9 |
---|
| 2772 | + FUNC_RESTORE |
---|
| 2773 | + RET |
---|
| 2774 | +key_256_enc_update4: |
---|
| 2775 | + GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13 |
---|
| 2776 | + FUNC_RESTORE |
---|
| 2777 | + RET |
---|
| 2778 | +SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4) |
---|
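The keysize test above is how the update entry point picks the round-count argument that GCM_ENC_DEC is expanded with: 9 for 16-byte (AES-128) keys, 13 for 32-byte (AES-256) keys, and 11 otherwise, which the code assumes means AES-192. A hypothetical C rendering of that dispatch:

    /* Map the stored key length to the macro's AES round-count argument.
     * Hypothetical helper; the assembly performs the same comparison inline. */
    static int gcm_avx_nrounds(unsigned int key_len)
    {
            if (key_len == 32)
                    return 13;      /* AES-256 */
            if (key_len == 16)
                    return 9;       /* AES-128 */
            return 11;              /* must be 24 bytes, AES-192 */
    }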
| 2779 | + |
---|
| 2780 | +############################################################################### |
---|
| 2781 | +#void aesni_gcm_dec_update_avx_gen4( |
---|
| 2782 | +# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ |
---|
| 2783 | +# gcm_context_data *data, |
---|
| 2784 | +# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ |
---|
| 2785 | +# const u8 *in, /* Ciphertext input */ |
---|
| 2786 | +# u64 plaintext_len) /* Length of data in Bytes for decryption. */ |
---|
| 2787 | +############################################################################### |
---|
| 2788 | +SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4) |
---|
| 2789 | + FUNC_SAVE |
---|
| 2790 | + mov keysize,%eax |
---|
| 2791 | + cmp $32, %eax |
---|
| 2792 | + je key_256_dec_update4 |
---|
| 2793 | + cmp $16, %eax |
---|
| 2794 | + je key_128_dec_update4 |
---|
| 2795 | + # must be 192 |
---|
| 2796 | + GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11 |
---|
| 2797 | + FUNC_RESTORE |
---|
| 2798 | + RET |
---|
| 2799 | +key_128_dec_update4: |
---|
| 2800 | + GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9 |
---|
| 2801 | + FUNC_RESTORE |
---|
| 2802 | + RET |
---|
| 2803 | +key_256_dec_update4: |
---|
| 2804 | + GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13 |
---|
| 2805 | + FUNC_RESTORE |
---|
| 2806 | + RET |
---|
| 2807 | +SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4) |
---|
| 2808 | + |
---|
| 2809 | +############################################################################### |
---|
| 2810 | +#void aesni_gcm_finalize_avx_gen4( |
---|
| 2811 | +# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ |
---|
| 2812 | +# gcm_context_data *data, |
---|
2937 | 2813 | # u8 *auth_tag, /* Authenticated Tag output. */ |
---|
2938 | 2814 | # u64 auth_tag_len)# /* Authenticated Tag Length in bytes. |
---|
2939 | | -# Valid values are 16 (most likely), 12 or 8. */ |
---|
| 2815 | +# Valid values are 16 (most likely), 12 or 8. */ |
---|
2940 | 2816 | ############################################################################### |
---|
2941 | | -ENTRY(aesni_gcm_dec_avx_gen4) |
---|
2942 | | - GCM_ENC_DEC_AVX2 DEC |
---|
2943 | | - ret |
---|
2944 | | -ENDPROC(aesni_gcm_dec_avx_gen4) |
---|
2945 | | - |
---|
2946 | | -#endif /* CONFIG_AS_AVX2 */ |
---|
| 2817 | +SYM_FUNC_START(aesni_gcm_finalize_avx_gen4) |
---|
| 2818 | + FUNC_SAVE |
---|
| 2819 | + mov keysize,%eax |
---|
| 2820 | + cmp $32, %eax |
---|
| 2821 | + je key_256_finalize4 |
---|
| 2822 | + cmp $16, %eax |
---|
| 2823 | + je key_128_finalize4 |
---|
| 2824 | + # must be 192 |
---|
| 2825 | + GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4 |
---|
| 2826 | + FUNC_RESTORE |
---|
| 2827 | + RET |
---|
| 2828 | +key_128_finalize4: |
---|
| 2829 | + GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4 |
---|
| 2830 | + FUNC_RESTORE |
---|
| 2831 | + RET |
---|
| 2832 | +key_256_finalize4: |
---|
| 2833 | + GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4 |
---|
| 2834 | + FUNC_RESTORE |
---|
| 2835 | + RET |
---|
| 2836 | +SYM_FUNC_END(aesni_gcm_finalize_avx_gen4) |
---|
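Taken together, the three gen4 entry points defined above replace the old single-call encrypt/decrypt routines with a streaming init/update/finalize flow whose per-request state lives in gcm_context_data. A sketch of how glue code might drive them, using hypothetical C prototypes transcribed from the comment blocks above (the real declarations live in the kernel's AES-NI glue code):

    #include <stdint.h>

    struct gcm_context_data;        /* per-request GCM state (opaque here) */

    extern void aesni_gcm_init_avx_gen4(void *ctx, struct gcm_context_data *data,
                                        uint8_t *iv, uint8_t *hash_subkey,
                                        const uint8_t *aad, uint64_t aad_len);
    extern void aesni_gcm_enc_update_avx_gen4(void *ctx, struct gcm_context_data *data,
                                              uint8_t *out, const uint8_t *in,
                                              uint64_t plaintext_len);
    extern void aesni_gcm_finalize_avx_gen4(void *ctx, struct gcm_context_data *data,
                                            uint8_t *auth_tag, uint64_t auth_tag_len);

    /* Encrypt one request: init once, update on each chunk of plaintext
     * (a single chunk here), then finalize to produce the tag. */
    static void gcm_encrypt_one(void *ctx, struct gcm_context_data *data,
                                uint8_t *j0, uint8_t *hash_subkey,
                                const uint8_t *aad, uint64_t aad_len,
                                uint8_t *dst, const uint8_t *src, uint64_t len,
                                uint8_t *tag, uint64_t tag_len)
    {
            aesni_gcm_init_avx_gen4(ctx, data, j0, hash_subkey, aad, aad_len);
            aesni_gcm_enc_update_avx_gen4(ctx, data, dst, src, len);
            aesni_gcm_finalize_avx_gen4(ctx, data, tag, tag_len);
    }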