forked from ~ljy/RK356X_SDK_RELEASE

hc
2024-05-10 9999e48639b3cecb08ffb37358bcba3b48161b29
kernel/arch/x86/crypto/aesni-intel_avx-x86_64.S
....@@ -120,7 +120,6 @@
120120 ##
121121
122122 #include <linux/linkage.h>
123
-#include <asm/inst.h>
124123
125124 # constants in mergeable sections, linker can reorder and merge
126125 .section .rodata.cst16.POLY, "aM", @progbits, 16
....@@ -182,43 +181,30 @@
182181 .text
183182
184183
185
-##define the fields of the gcm aes context
186
-#{
187
-# u8 expanded_keys[16*11] store expanded keys
188
-# u8 shifted_hkey_1[16] store HashKey <<1 mod poly here
189
-# u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here
190
-# u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here
191
-# u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here
192
-# u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here
193
-# u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here
194
-# u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here
195
-# u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here
196
-# u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
197
-# u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
198
-# u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
199
-# u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
200
-# u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
201
-# u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
202
-# u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
203
-# u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
204
-#} gcm_ctx#
184
+#define AadHash 16*0
185
+#define AadLen 16*1
186
+#define InLen (16*1)+8
187
+#define PBlockEncKey 16*2
188
+#define OrigIV 16*3
189
+#define CurCount 16*4
190
+#define PBlockLen 16*5
205191
206
-HashKey = 16*11 # store HashKey <<1 mod poly here
207
-HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here
208
-HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here
209
-HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here
210
-HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here
211
-HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here
212
-HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here
213
-HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here
214
-HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
215
-HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
216
-HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
217
-HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
218
-HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
219
-HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
220
-HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
221
-HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
192
+HashKey = 16*6 # store HashKey <<1 mod poly here
193
+HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
194
+HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
195
+HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here
196
+HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here
197
+HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here
198
+HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here
199
+HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here
200
+HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
201
+HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
202
+HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
203
+HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
204
+HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
205
+HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
206
+HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
207
+HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
222208
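The #defines above are byte offsets into the per-request context block that arg2 points at (the gcm_context_data the C glue passes in). A minimal C sketch of a layout that satisfies those offsets — field names are illustrative, not taken from this patch; the authoritative definition lives in the C glue code:

#include <stddef.h>
#include <stdint.h>

struct gcm_context_data_sketch {
	uint8_t  aad_hash[16];              /* AadHash:      16*0           */
	uint64_t aad_length;                /* AadLen:       16*1           */
	uint64_t in_length;                 /* InLen:        16*1 + 8       */
	uint8_t  partial_block_enc_key[16]; /* PBlockEncKey: 16*2           */
	uint8_t  orig_iv[16];               /* OrigIV:       16*3           */
	uint8_t  current_counter[16];       /* CurCount:     16*4           */
	uint64_t partial_block_len;         /* PBlockLen:    16*5           */
	uint64_t unused;                    /* pad so HashKey lands at 16*6 */
	uint8_t  hash_keys[16 * 16];        /* HashKey .. HashKey_8_k: 16*6 .. 16*21 */
};

_Static_assert(offsetof(struct gcm_context_data_sketch, in_length) == 16*1 + 8,
	       "InLen offset");
_Static_assert(offsetof(struct gcm_context_data_sketch, hash_keys) == 16*6,
	       "HashKey offset");

The asm never sees the struct; it relies only on these offsets, which is why the #defines and the C-side layout must stay in lock step.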
223209 #define arg1 %rdi
224210 #define arg2 %rsi
....@@ -229,6 +215,8 @@
229215 #define arg7 STACK_OFFSET+8*1(%r14)
230216 #define arg8 STACK_OFFSET+8*2(%r14)
231217 #define arg9 STACK_OFFSET+8*3(%r14)
218
+#define arg10 STACK_OFFSET+8*4(%r14)
219
+#define keysize 2*15*16(arg1)
232220
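keysize reads a 32-bit value at byte offset 2*15*16 = 480 from arg1, the expanded AES key schedule. That offset only works if the schedule stores 240 bytes of encryption round keys, then 240 bytes of decryption round keys, then the key length in bytes. A sketch of such a layout (struct and field names assumed, mirroring the kernel's crypto_aes_ctx):

#include <stddef.h>
#include <stdint.h>

#define AES_MAX_KEYLENGTH (15 * 16)	/* room for 15 round keys of 16 bytes each */

struct crypto_aes_ctx_sketch {
	uint32_t key_enc[AES_MAX_KEYLENGTH / 4];	/* encryption round keys */
	uint32_t key_dec[AES_MAX_KEYLENGTH / 4];	/* decryption round keys */
	uint32_t key_length;				/* 16, 24 or 32 (bytes)  */
};

_Static_assert(offsetof(struct crypto_aes_ctx_sketch, key_length) == 2 * 15 * 16,
	       "key_length must sit at the offset the keysize macro dereferences");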
233221 i = 0
234222 j = 0
....@@ -267,20 +255,636 @@
267255 # Utility Macros
268256 ################################
269257
270
-# Encryption of a single block
271
-.macro ENCRYPT_SINGLE_BLOCK XMM0
272
- vpxor (arg1), \XMM0, \XMM0
273
- i = 1
274
- setreg
275
-.rep 9
276
- vaesenc 16*i(arg1), \XMM0, \XMM0
277
- i = (i+1)
278
- setreg
279
-.endr
280
- vaesenclast 16*10(arg1), \XMM0, \XMM0
258
+.macro FUNC_SAVE
259
+ #the number of pushes must equal STACK_OFFSET
260
+ push %r12
261
+ push %r13
262
+ push %r14
263
+ push %r15
264
+
265
+ mov %rsp, %r14
266
+
267
+
268
+
269
+ sub $VARIABLE_OFFSET, %rsp
270
+ and $~63, %rsp # align rsp to 64 bytes
281271 .endm
282272
283
-#ifdef CONFIG_AS_AVX
273
+.macro FUNC_RESTORE
274
+ mov %r14, %rsp
275
+
276
+ pop %r15
277
+ pop %r14
278
+ pop %r13
279
+ pop %r12
280
+.endm
281
+
282
+# Encryption of a single block
283
+.macro ENCRYPT_SINGLE_BLOCK REP XMM0
284
+ vpxor (arg1), \XMM0, \XMM0
285
+ i = 1
286
+ setreg
287
+.rep \REP
288
+ vaesenc 16*i(arg1), \XMM0, \XMM0
289
+ i = (i+1)
290
+ setreg
291
+.endr
292
+ vaesenclast 16*i(arg1), \XMM0, \XMM0
293
+.endm
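REP is the number of middle AES rounds (the vaesenc steps); the last round key is then applied with vaesenclast at 16*i once the loop exits. The entry points added at the end of this patch pass 9, 11 or 13, i.e. AES-128/192/256 with 10/12/14 rounds in total. A small C sketch of that mapping (helper name is illustrative):

static inline int aes_middle_rounds(unsigned int key_length_bytes)
{
	switch (key_length_bytes) {
	case 16: return 9;	/* AES-128: 10 rounds, 9 of them vaesenc */
	case 24: return 11;	/* AES-192: 12 rounds                    */
	case 32: return 13;	/* AES-256: 14 rounds                    */
	default: return -1;	/* unsupported key size                  */
	}
}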
294
+
295
+# combined for GCM encrypt and decrypt functions
296
+# clobbering all xmm registers
297
+# clobbering r10, r11, r12, r13, r14, r15
298
+.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
299
+ vmovdqu AadHash(arg2), %xmm8
300
+ vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
301
+ add arg5, InLen(arg2)
302
+
303
+ # initialize the data pointer offset as zero
304
+ xor %r11d, %r11d
305
+
306
+ PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
307
+ sub %r11, arg5
308
+
309
+ mov arg5, %r13 # save the number of bytes of plaintext/ciphertext
310
+ and $-16, %r13 # r13 = r13 - (r13 mod 16)
311
+
312
+ mov %r13, %r12
313
+ shr $4, %r12
314
+ and $7, %r12
315
+ jz _initial_num_blocks_is_0\@
316
+
317
+ cmp $7, %r12
318
+ je _initial_num_blocks_is_7\@
319
+ cmp $6, %r12
320
+ je _initial_num_blocks_is_6\@
321
+ cmp $5, %r12
322
+ je _initial_num_blocks_is_5\@
323
+ cmp $4, %r12
324
+ je _initial_num_blocks_is_4\@
325
+ cmp $3, %r12
326
+ je _initial_num_blocks_is_3\@
327
+ cmp $2, %r12
328
+ je _initial_num_blocks_is_2\@
329
+
330
+ jmp _initial_num_blocks_is_1\@
331
+
332
+_initial_num_blocks_is_7\@:
333
+ \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
334
+ sub $16*7, %r13
335
+ jmp _initial_blocks_encrypted\@
336
+
337
+_initial_num_blocks_is_6\@:
338
+ \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
339
+ sub $16*6, %r13
340
+ jmp _initial_blocks_encrypted\@
341
+
342
+_initial_num_blocks_is_5\@:
343
+ \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
344
+ sub $16*5, %r13
345
+ jmp _initial_blocks_encrypted\@
346
+
347
+_initial_num_blocks_is_4\@:
348
+ \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
349
+ sub $16*4, %r13
350
+ jmp _initial_blocks_encrypted\@
351
+
352
+_initial_num_blocks_is_3\@:
353
+ \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
354
+ sub $16*3, %r13
355
+ jmp _initial_blocks_encrypted\@
356
+
357
+_initial_num_blocks_is_2\@:
358
+ \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
359
+ sub $16*2, %r13
360
+ jmp _initial_blocks_encrypted\@
361
+
362
+_initial_num_blocks_is_1\@:
363
+ \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
364
+ sub $16*1, %r13
365
+ jmp _initial_blocks_encrypted\@
366
+
367
+_initial_num_blocks_is_0\@:
368
+ \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
369
+
370
+
371
+_initial_blocks_encrypted\@:
372
+ test %r13, %r13
373
+ je _zero_cipher_left\@
374
+
375
+ sub $128, %r13
376
+ je _eight_cipher_left\@
377
+
378
+
379
+
380
+
381
+ vmovd %xmm9, %r15d
382
+ and $255, %r15d
383
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
384
+
385
+
386
+_encrypt_by_8_new\@:
387
+ cmp $(255-8), %r15d
388
+ jg _encrypt_by_8\@
389
+
390
+
391
+
392
+ add $8, %r15b
393
+ \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
394
+ add $128, %r11
395
+ sub $128, %r13
396
+ jne _encrypt_by_8_new\@
397
+
398
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
399
+ jmp _eight_cipher_left\@
400
+
401
+_encrypt_by_8\@:
402
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
403
+ add $8, %r15b
404
+ \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
405
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
406
+ add $128, %r11
407
+ sub $128, %r13
408
+ jne _encrypt_by_8_new\@
409
+
410
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
411
+
412
+
413
+
414
+
415
+_eight_cipher_left\@:
416
+ \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
417
+
418
+
419
+_zero_cipher_left\@:
420
+ vmovdqu %xmm14, AadHash(arg2)
421
+ vmovdqu %xmm9, CurCount(arg2)
422
+
423
+ # check for 0 length
424
+ mov arg5, %r13
425
+ and $15, %r13 # r13 = (arg5 mod 16)
426
+
427
+ je _multiple_of_16_bytes\@
428
+
429
+ # handle the last <16 Byte block separately
430
+
431
+ mov %r13, PBlockLen(arg2)
432
+
433
+ vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
434
+ vmovdqu %xmm9, CurCount(arg2)
435
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
436
+
437
+ ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
438
+ vmovdqu %xmm9, PBlockEncKey(arg2)
439
+
440
+ cmp $16, arg5
441
+ jge _large_enough_update\@
442
+
443
+ lea (arg4,%r11,1), %r10
444
+ mov %r13, %r12
445
+
446
+ READ_PARTIAL_BLOCK %r10 %r12 %xmm1
447
+
448
+ lea SHIFT_MASK+16(%rip), %r12
449
+ sub %r13, %r12 # adjust the shuffle mask pointer to be
450
+ # able to shift 16-r13 bytes (r13 is the
451
+ # number of bytes in plaintext mod 16)
452
+
453
+ jmp _final_ghash_mul\@
454
+
455
+_large_enough_update\@:
456
+ sub $16, %r11
457
+ add %r13, %r11
458
+
459
+ # receive the last <16 Byte block
460
+ vmovdqu (arg4, %r11, 1), %xmm1
461
+
462
+ sub %r13, %r11
463
+ add $16, %r11
464
+
465
+ lea SHIFT_MASK+16(%rip), %r12
466
+ # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
467
+ # (r13 is the number of bytes in plaintext mod 16)
468
+ sub %r13, %r12
469
+ # get the appropriate shuffle mask
470
+ vmovdqu (%r12), %xmm2
471
+ # shift right 16-r13 bytes
472
+ vpshufb %xmm2, %xmm1, %xmm1
473
+
474
+_final_ghash_mul\@:
475
+ .if \ENC_DEC == DEC
476
+ vmovdqa %xmm1, %xmm2
477
+ vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
478
+ vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
479
+ # mask out top 16-r13 bytes of xmm9
480
+ vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
481
+ vpand %xmm1, %xmm2, %xmm2
482
+ vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
483
+ vpxor %xmm2, %xmm14, %xmm14
484
+
485
+ vmovdqu %xmm14, AadHash(arg2)
486
+ .else
487
+ vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
488
+ vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
489
+ # mask out top 16-r13 bytes of xmm9
490
+ vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
491
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
492
+ vpxor %xmm9, %xmm14, %xmm14
493
+
494
+ vmovdqu %xmm14, AadHash(arg2)
495
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
496
+ .endif
497
+
498
+
499
+ #############################
500
+ # output r13 Bytes
501
+ vmovq %xmm9, %rax
502
+ cmp $8, %r13
503
+ jle _less_than_8_bytes_left\@
504
+
505
+ mov %rax, (arg3 , %r11)
506
+ add $8, %r11
507
+ vpsrldq $8, %xmm9, %xmm9
508
+ vmovq %xmm9, %rax
509
+ sub $8, %r13
510
+
511
+_less_than_8_bytes_left\@:
512
+ movb %al, (arg3 , %r11)
513
+ add $1, %r11
514
+ shr $8, %rax
515
+ sub $1, %r13
516
+ jne _less_than_8_bytes_left\@
517
+ #############################
518
+
519
+_multiple_of_16_bytes\@:
520
+.endm
521
+
522
+
523
+# GCM_COMPLETE Finishes the tag update for the last partial block
524
+# Output: Authentication Tag (AUTH_TAG)
525
+# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
526
+.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
527
+ vmovdqu AadHash(arg2), %xmm14
528
+ vmovdqu HashKey(arg2), %xmm13
529
+
530
+ mov PBlockLen(arg2), %r12
531
+ test %r12, %r12
532
+ je _partial_done\@
533
+
534
+ #GHASH computation for the last <16 Byte block
535
+ \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
536
+
537
+_partial_done\@:
538
+ mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes)
539
+ shl $3, %r12 # convert into number of bits
540
+ vmovd %r12d, %xmm15 # len(A) in xmm15
541
+
542
+ mov InLen(arg2), %r12
543
+ shl $3, %r12 # len(C) in bits (*128)
544
+ vmovq %r12, %xmm1
545
+ vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
546
+ vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
547
+
548
+ vpxor %xmm15, %xmm14, %xmm14
549
+ \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
550
+ vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
551
+
552
+ vmovdqu OrigIV(arg2), %xmm9
553
+
554
+ ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0)
555
+
556
+ vpxor %xmm14, %xmm9, %xmm9
557
+
558
+
559
+
560
+_return_T\@:
561
+ mov \AUTH_TAG, %r10 # r10 = authTag
562
+ mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len
563
+
564
+ cmp $16, %r11
565
+ je _T_16\@
566
+
567
+ cmp $8, %r11
568
+ jl _T_4\@
569
+
570
+_T_8\@:
571
+ vmovq %xmm9, %rax
572
+ mov %rax, (%r10)
573
+ add $8, %r10
574
+ sub $8, %r11
575
+ vpsrldq $8, %xmm9, %xmm9
576
+ test %r11, %r11
577
+ je _return_T_done\@
578
+_T_4\@:
579
+ vmovd %xmm9, %eax
580
+ mov %eax, (%r10)
581
+ add $4, %r10
582
+ sub $4, %r11
583
+ vpsrldq $4, %xmm9, %xmm9
584
+ test %r11, %r11
585
+ je _return_T_done\@
586
+_T_123\@:
587
+ vmovd %xmm9, %eax
588
+ cmp $2, %r11
589
+ jl _T_1\@
590
+ mov %ax, (%r10)
591
+ cmp $2, %r11
592
+ je _return_T_done\@
593
+ add $2, %r10
594
+ sar $16, %eax
595
+_T_1\@:
596
+ mov %al, (%r10)
597
+ jmp _return_T_done\@
598
+
599
+_T_16\@:
600
+ vmovdqu %xmm9, (%r10)
601
+
602
+_return_T_done\@:
603
+.endm
604
+
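The _T_16/_T_8/_T_4/_T_123/_T_1 ladder above writes out only auth_tag_len bytes of the 16-byte tag held in xmm9, using 8/4/2/1-byte stores, for the tag lengths the prototype comments further down allow (16, 12 or 8). Functionally it is a bounded copy; a hedged C sketch of the net effect:

#include <stdint.h>
#include <string.h>

static void write_auth_tag(uint8_t *auth_tag, uint64_t auth_tag_len,
			   const uint8_t tag[16])
{
	uint64_t n = auth_tag_len > 16 ? 16 : auth_tag_len;

	memcpy(auth_tag, tag, n);	/* the asm open-codes this in chunks */
}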
605
+.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
606
+
607
+ mov \AAD, %r10 # r10 = AAD
608
+ mov \AADLEN, %r12 # r12 = aadLen
609
+
610
+
611
+ mov %r12, %r11
612
+
613
+ vpxor \T8, \T8, \T8
614
+ vpxor \T7, \T7, \T7
615
+ cmp $16, %r11
616
+ jl _get_AAD_rest8\@
617
+_get_AAD_blocks\@:
618
+ vmovdqu (%r10), \T7
619
+ vpshufb SHUF_MASK(%rip), \T7, \T7
620
+ vpxor \T7, \T8, \T8
621
+ \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
622
+ add $16, %r10
623
+ sub $16, %r12
624
+ sub $16, %r11
625
+ cmp $16, %r11
626
+ jge _get_AAD_blocks\@
627
+ vmovdqu \T8, \T7
628
+ test %r11, %r11
629
+ je _get_AAD_done\@
630
+
631
+ vpxor \T7, \T7, \T7
632
+
633
+ /* read the last <16B of AAD. since we have at least 4B of
634
+ data right after the AAD (the ICV, and maybe some CT), we can
635
+ read 4B/8B blocks safely, and then get rid of the extra stuff */
636
+_get_AAD_rest8\@:
637
+ cmp $4, %r11
638
+ jle _get_AAD_rest4\@
639
+ movq (%r10), \T1
640
+ add $8, %r10
641
+ sub $8, %r11
642
+ vpslldq $8, \T1, \T1
643
+ vpsrldq $8, \T7, \T7
644
+ vpxor \T1, \T7, \T7
645
+ jmp _get_AAD_rest8\@
646
+_get_AAD_rest4\@:
647
+ test %r11, %r11
648
+ jle _get_AAD_rest0\@
649
+ mov (%r10), %eax
650
+ movq %rax, \T1
651
+ add $4, %r10
652
+ sub $4, %r11
653
+ vpslldq $12, \T1, \T1
654
+ vpsrldq $4, \T7, \T7
655
+ vpxor \T1, \T7, \T7
656
+_get_AAD_rest0\@:
657
+ /* finalize: shift out the extra bytes we read, and align
658
+ left. since pslldq can only shift by an immediate, we use
659
+ vpshufb and an array of shuffle masks */
660
+ movq %r12, %r11
661
+ salq $4, %r11
662
+ vmovdqu aad_shift_arr(%r11), \T1
663
+ vpshufb \T1, \T7, \T7
664
+_get_AAD_rest_final\@:
665
+ vpshufb SHUF_MASK(%rip), \T7, \T7
666
+ vpxor \T8, \T7, \T7
667
+ \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
668
+
669
+_get_AAD_done\@:
670
+ vmovdqu \T7, AadHash(arg2)
671
+.endm
672
+
673
+.macro INIT GHASH_MUL PRECOMPUTE
674
+ mov arg6, %r11
675
+ mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
676
+ xor %r11d, %r11d
677
+ mov %r11, InLen(arg2) # ctx_data.in_length = 0
678
+
679
+ mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
680
+ mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
681
+ mov arg3, %rax
682
+ movdqu (%rax), %xmm0
683
+ movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
684
+
685
+ vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
686
+ movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
687
+
688
+ vmovdqu (arg4), %xmm6 # xmm6 = HashKey
689
+
690
+ vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
691
+ ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
692
+ vmovdqa %xmm6, %xmm2
693
+ vpsllq $1, %xmm6, %xmm6
694
+ vpsrlq $63, %xmm2, %xmm2
695
+ vmovdqa %xmm2, %xmm1
696
+ vpslldq $8, %xmm2, %xmm2
697
+ vpsrldq $8, %xmm1, %xmm1
698
+ vpor %xmm2, %xmm6, %xmm6
699
+ #reduction
700
+ vpshufd $0b00100100, %xmm1, %xmm2
701
+ vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
702
+ vpand POLY(%rip), %xmm2, %xmm2
703
+ vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
704
+ #######################################################################
705
+ vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
706
+
707
+ CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
708
+
709
+ \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
710
+.endm
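The block between the PRECOMPUTATION comment and the store to HashKey(arg2) doubles the (byte-swapped) hash key in GF(2^128): shift the 128-bit value left by one and, if a bit carries out of bit 127, fold it back in with the reduction polynomial x^128 + x^127 + x^126 + x^121 + 1 — the POLY constant 0xC2000000000000000000000000000001. The same computation on two 64-bit halves, as a C sketch:

#include <stdint.h>

/* h[1] is the most significant quadword of the byte-swapped hash key */
static void gf128_shl1_mod_poly(uint64_t h[2])
{
	uint64_t carry = h[1] >> 63;		/* bit shifted out of bit 127 */

	h[1] = (h[1] << 1) | (h[0] >> 63);
	h[0] <<= 1;
	if (carry) {
		h[1] ^= 0xC200000000000000ULL;	/* bits 127, 126, 121 */
		h[0] ^= 0x0000000000000001ULL;	/* bit 0 */
	}
}

The asm reaches the same result branch-free, using vpcmpeqd/vpand against the TWOONE and POLY constants instead of a conditional.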
711
+
712
+
713
+# Reads DLEN bytes starting at DPTR and stores in XMMDst
714
+# where 0 < DLEN < 16
715
+# Clobbers %rax, DLEN
716
+.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
717
+ vpxor \XMMDst, \XMMDst, \XMMDst
718
+
719
+ cmp $8, \DLEN
720
+ jl _read_lt8_\@
721
+ mov (\DPTR), %rax
722
+ vpinsrq $0, %rax, \XMMDst, \XMMDst
723
+ sub $8, \DLEN
724
+ jz _done_read_partial_block_\@
725
+ xor %eax, %eax
726
+_read_next_byte_\@:
727
+ shl $8, %rax
728
+ mov 7(\DPTR, \DLEN, 1), %al
729
+ dec \DLEN
730
+ jnz _read_next_byte_\@
731
+ vpinsrq $1, %rax, \XMMDst, \XMMDst
732
+ jmp _done_read_partial_block_\@
733
+_read_lt8_\@:
734
+ xor %eax, %eax
735
+_read_next_byte_lt8_\@:
736
+ shl $8, %rax
737
+ mov -1(\DPTR, \DLEN, 1), %al
738
+ dec \DLEN
739
+ jnz _read_next_byte_lt8_\@
740
+ vpinsrq $0, %rax, \XMMDst, \XMMDst
741
+_done_read_partial_block_\@:
742
+.endm
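READ_PARTIAL_BLOCK never touches memory past the DLEN input bytes: at most one 8-byte load from the start, then single-byte loads for the tail. Its net effect, sketched in C (assuming 0 < dlen < 16, as the comment above requires):

#include <stdint.h>
#include <string.h>

static void read_partial_block(const uint8_t *dptr, size_t dlen, uint8_t dst[16])
{
	memset(dst, 0, 16);			/* unread lanes stay zero */
	if (dlen >= 8) {
		memcpy(dst, dptr, 8);		/* low quadword: one 8-byte load */
		memcpy(dst + 8, dptr + 8, dlen - 8);	/* byte loop in the asm */
	} else {
		memcpy(dst, dptr, dlen);		/* byte loop in the asm */
	}
}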
743
+
744
+# PARTIAL_BLOCK: Handles encryption/decryption and GHASH of the partial blocks
745
+# left between update calls.
746
+# Requires the input data to be at least 1 byte long due to READ_PARTIAL_BLOCK
747
+# Outputs encrypted bytes, and updates the hash and partial-block info in gcm_context_data
748
+# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
749
+.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
750
+ AAD_HASH ENC_DEC
751
+ mov PBlockLen(arg2), %r13
752
+ test %r13, %r13
753
+ je _partial_block_done_\@ # Leave Macro if no partial blocks
754
+ # Read in input data without over reading
755
+ cmp $16, \PLAIN_CYPH_LEN
756
+ jl _fewer_than_16_bytes_\@
757
+ vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
758
+ jmp _data_read_\@
759
+
760
+_fewer_than_16_bytes_\@:
761
+ lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
762
+ mov \PLAIN_CYPH_LEN, %r12
763
+ READ_PARTIAL_BLOCK %r10 %r12 %xmm1
764
+
765
+ mov PBlockLen(arg2), %r13
766
+
767
+_data_read_\@: # Finished reading in data
768
+
769
+ vmovdqu PBlockEncKey(arg2), %xmm9
770
+ vmovdqu HashKey(arg2), %xmm13
771
+
772
+ lea SHIFT_MASK(%rip), %r12
773
+
774
+ # adjust the shuffle mask pointer to be able to shift r13 bytes
775
+ # (16-r13 is the number of bytes in plaintext mod 16)
776
+ add %r13, %r12
777
+ vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
778
+ vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes
779
+
780
+.if \ENC_DEC == DEC
781
+ vmovdqa %xmm1, %xmm3
782
+ pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
783
+
784
+ mov \PLAIN_CYPH_LEN, %r10
785
+ add %r13, %r10
786
+ # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
787
+ sub $16, %r10
788
+ # Determine if the partial block is not being filled and
789
+ # shift mask accordingly
790
+ jge _no_extra_mask_1_\@
791
+ sub %r10, %r12
792
+_no_extra_mask_1_\@:
793
+
794
+ vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
795
+ # get the appropriate mask to mask out bottom r13 bytes of xmm9
796
+ vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9
797
+
798
+ vpand %xmm1, %xmm3, %xmm3
799
+ vmovdqa SHUF_MASK(%rip), %xmm10
800
+ vpshufb %xmm10, %xmm3, %xmm3
801
+ vpshufb %xmm2, %xmm3, %xmm3
802
+ vpxor %xmm3, \AAD_HASH, \AAD_HASH
803
+
804
+ test %r10, %r10
805
+ jl _partial_incomplete_1_\@
806
+
807
+ # GHASH computation for the last <16 Byte block
808
+ \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
809
+ xor %eax,%eax
810
+
811
+ mov %rax, PBlockLen(arg2)
812
+ jmp _dec_done_\@
813
+_partial_incomplete_1_\@:
814
+ add \PLAIN_CYPH_LEN, PBlockLen(arg2)
815
+_dec_done_\@:
816
+ vmovdqu \AAD_HASH, AadHash(arg2)
817
+.else
818
+ vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
819
+
820
+ mov \PLAIN_CYPH_LEN, %r10
821
+ add %r13, %r10
822
+ # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
823
+ sub $16, %r10
824
+ # Determine if the partial block is not being filled and
825
+ # shift mask accordingly
826
+ jge _no_extra_mask_2_\@
827
+ sub %r10, %r12
828
+_no_extra_mask_2_\@:
829
+
830
+ vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
831
+ # get the appropriate mask to mask out bottom r13 bytes of xmm9
832
+ vpand %xmm1, %xmm9, %xmm9
833
+
834
+ vmovdqa SHUF_MASK(%rip), %xmm1
835
+ vpshufb %xmm1, %xmm9, %xmm9
836
+ vpshufb %xmm2, %xmm9, %xmm9
837
+ vpxor %xmm9, \AAD_HASH, \AAD_HASH
838
+
839
+ test %r10, %r10
840
+ jl _partial_incomplete_2_\@
841
+
842
+ # GHASH computation for the last <16 Byte block
843
+ \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
844
+ xor %eax,%eax
845
+
846
+ mov %rax, PBlockLen(arg2)
847
+ jmp _encode_done_\@
848
+_partial_incomplete_2_\@:
849
+ add \PLAIN_CYPH_LEN, PBlockLen(arg2)
850
+_encode_done_\@:
851
+ vmovdqu \AAD_HASH, AadHash(arg2)
852
+
853
+ vmovdqa SHUF_MASK(%rip), %xmm10
854
+ # shuffle xmm9 back to output as ciphertext
855
+ vpshufb %xmm10, %xmm9, %xmm9
856
+ vpshufb %xmm2, %xmm9, %xmm9
857
+.endif
858
+ # output encrypted Bytes
859
+ test %r10, %r10
860
+ jl _partial_fill_\@
861
+ mov %r13, %r12
862
+ mov $16, %r13
863
+ # Set r13 to be the number of bytes to write out
864
+ sub %r12, %r13
865
+ jmp _count_set_\@
866
+_partial_fill_\@:
867
+ mov \PLAIN_CYPH_LEN, %r13
868
+_count_set_\@:
869
+ vmovdqa %xmm9, %xmm0
870
+ vmovq %xmm0, %rax
871
+ cmp $8, %r13
872
+ jle _less_than_8_bytes_left_\@
873
+
874
+ mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
875
+ add $8, \DATA_OFFSET
876
+ psrldq $8, %xmm0
877
+ vmovq %xmm0, %rax
878
+ sub $8, %r13
879
+_less_than_8_bytes_left_\@:
880
+ movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
881
+ add $1, \DATA_OFFSET
882
+ shr $8, %rax
883
+ sub $1, %r13
884
+ jne _less_than_8_bytes_left_\@
885
+_partial_block_done_\@:
886
+.endm # PARTIAL_BLOCK
887
+
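Across update calls, PARTIAL_BLOCK either completes the pending partial block (GHASHes it and clears PBlockLen) or merely folds the new bytes into it. The byte accounting it implements, as a C sketch (helper name and signature are illustrative):

/* Returns how many of the n new input bytes the partial-block path consumed. */
static unsigned long partial_block_consumed(unsigned long p /* PBlockLen */,
					    unsigned long n /* new bytes */,
					    unsigned long *new_pblocklen)
{
	if (p == 0) {			/* no pending block: fast exit */
		*new_pblocklen = 0;
		return 0;
	}
	if (p + n >= 16) {		/* block completed: GHASH it, reset length */
		*new_pblocklen = 0;
		return 16 - p;
	}
	*new_pblocklen = p + n;		/* still partial: only accumulate */
	return n;
}

GCM_ENC_DEC subtracts exactly this count from arg5 (sub %r11, arg5) before dispatching the remaining whole blocks.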
284888 ###############################################################################
285889 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
286890 # Input: A and B (128-bits each, bit-reflected)
....@@ -341,49 +945,49 @@
341945
342946 vpshufd $0b01001110, \T5, \T1
343947 vpxor \T5, \T1, \T1
344
- vmovdqa \T1, HashKey_k(arg1)
948
+ vmovdqu \T1, HashKey_k(arg2)
345949
346950 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
347
- vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
951
+ vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
348952 vpshufd $0b01001110, \T5, \T1
349953 vpxor \T5, \T1, \T1
350
- vmovdqa \T1, HashKey_2_k(arg1)
954
+ vmovdqu \T1, HashKey_2_k(arg2)
351955
352956 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
353
- vmovdqa \T5, HashKey_3(arg1)
957
+ vmovdqu \T5, HashKey_3(arg2)
354958 vpshufd $0b01001110, \T5, \T1
355959 vpxor \T5, \T1, \T1
356
- vmovdqa \T1, HashKey_3_k(arg1)
960
+ vmovdqu \T1, HashKey_3_k(arg2)
357961
358962 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
359
- vmovdqa \T5, HashKey_4(arg1)
963
+ vmovdqu \T5, HashKey_4(arg2)
360964 vpshufd $0b01001110, \T5, \T1
361965 vpxor \T5, \T1, \T1
362
- vmovdqa \T1, HashKey_4_k(arg1)
966
+ vmovdqu \T1, HashKey_4_k(arg2)
363967
364968 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
365
- vmovdqa \T5, HashKey_5(arg1)
969
+ vmovdqu \T5, HashKey_5(arg2)
366970 vpshufd $0b01001110, \T5, \T1
367971 vpxor \T5, \T1, \T1
368
- vmovdqa \T1, HashKey_5_k(arg1)
972
+ vmovdqu \T1, HashKey_5_k(arg2)
369973
370974 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
371
- vmovdqa \T5, HashKey_6(arg1)
975
+ vmovdqu \T5, HashKey_6(arg2)
372976 vpshufd $0b01001110, \T5, \T1
373977 vpxor \T5, \T1, \T1
374
- vmovdqa \T1, HashKey_6_k(arg1)
978
+ vmovdqu \T1, HashKey_6_k(arg2)
375979
376980 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
377
- vmovdqa \T5, HashKey_7(arg1)
981
+ vmovdqu \T5, HashKey_7(arg2)
378982 vpshufd $0b01001110, \T5, \T1
379983 vpxor \T5, \T1, \T1
380
- vmovdqa \T1, HashKey_7_k(arg1)
984
+ vmovdqu \T1, HashKey_7_k(arg2)
381985
382986 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
383
- vmovdqa \T5, HashKey_8(arg1)
987
+ vmovdqu \T5, HashKey_8(arg2)
384988 vpshufd $0b01001110, \T5, \T1
385989 vpxor \T5, \T1, \T1
386
- vmovdqa \T1, HashKey_8_k(arg1)
990
+ vmovdqu \T1, HashKey_8_k(arg2)
387991
388992 .endm
389993
....@@ -392,84 +996,15 @@
392996 ## num_initial_blocks = b mod 4#
393997 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
394998 ## r10, r11, r12, rax are clobbered
395
-## arg1, arg2, arg3, r14 are used as a pointer only, not modified
999
+## arg1, arg3, arg4, r14 are used as a pointer only, not modified
3961000
397
-.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
1001
+.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
3981002 i = (8-\num_initial_blocks)
399
- j = 0
4001003 setreg
401
-
402
- mov arg6, %r10 # r10 = AAD
403
- mov arg7, %r12 # r12 = aadLen
404
-
405
-
406
- mov %r12, %r11
407
-
408
- vpxor reg_j, reg_j, reg_j
409
- vpxor reg_i, reg_i, reg_i
410
- cmp $16, %r11
411
- jl _get_AAD_rest8\@
412
-_get_AAD_blocks\@:
413
- vmovdqu (%r10), reg_i
414
- vpshufb SHUF_MASK(%rip), reg_i, reg_i
415
- vpxor reg_i, reg_j, reg_j
416
- GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6
417
- add $16, %r10
418
- sub $16, %r12
419
- sub $16, %r11
420
- cmp $16, %r11
421
- jge _get_AAD_blocks\@
422
- vmovdqu reg_j, reg_i
423
- cmp $0, %r11
424
- je _get_AAD_done\@
425
-
426
- vpxor reg_i, reg_i, reg_i
427
-
428
- /* read the last <16B of AAD. since we have at least 4B of
429
- data right after the AAD (the ICV, and maybe some CT), we can
430
- read 4B/8B blocks safely, and then get rid of the extra stuff */
431
-_get_AAD_rest8\@:
432
- cmp $4, %r11
433
- jle _get_AAD_rest4\@
434
- movq (%r10), \T1
435
- add $8, %r10
436
- sub $8, %r11
437
- vpslldq $8, \T1, \T1
438
- vpsrldq $8, reg_i, reg_i
439
- vpxor \T1, reg_i, reg_i
440
- jmp _get_AAD_rest8\@
441
-_get_AAD_rest4\@:
442
- cmp $0, %r11
443
- jle _get_AAD_rest0\@
444
- mov (%r10), %eax
445
- movq %rax, \T1
446
- add $4, %r10
447
- sub $4, %r11
448
- vpslldq $12, \T1, \T1
449
- vpsrldq $4, reg_i, reg_i
450
- vpxor \T1, reg_i, reg_i
451
-_get_AAD_rest0\@:
452
- /* finalize: shift out the extra bytes we read, and align
453
- left. since pslldq can only shift by an immediate, we use
454
- vpshufb and an array of shuffle masks */
455
- movq %r12, %r11
456
- salq $4, %r11
457
- movdqu aad_shift_arr(%r11), \T1
458
- vpshufb \T1, reg_i, reg_i
459
-_get_AAD_rest_final\@:
460
- vpshufb SHUF_MASK(%rip), reg_i, reg_i
461
- vpxor reg_j, reg_i, reg_i
462
- GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6
463
-
464
-_get_AAD_done\@:
465
- # initialize the data pointer offset as zero
466
- xor %r11d, %r11d
1004
+ vmovdqu AadHash(arg2), reg_i
4671005
4681006 # start AES for num_initial_blocks blocks
469
- mov arg5, %rax # rax = *Y0
470
- vmovdqu (%rax), \CTR # CTR = Y0
471
- vpshufb SHUF_MASK(%rip), \CTR, \CTR
472
-
1007
+ vmovdqu CurCount(arg2), \CTR
4731008
4741009 i = (9-\num_initial_blocks)
4751010 setreg
....@@ -490,10 +1025,10 @@
4901025 setreg
4911026 .endr
4921027
493
- j = 1
494
- setreg
495
-.rep 9
496
- vmovdqa 16*j(arg1), \T_key
1028
+ j = 1
1029
+ setreg
1030
+.rep \REP
1031
+ vmovdqa 16*j(arg1), \T_key
4971032 i = (9-\num_initial_blocks)
4981033 setreg
4991034 .rep \num_initial_blocks
....@@ -502,12 +1037,11 @@
5021037 setreg
5031038 .endr
5041039
505
- j = (j+1)
506
- setreg
1040
+ j = (j+1)
1041
+ setreg
5071042 .endr
5081043
509
-
510
- vmovdqa 16*10(arg1), \T_key
1044
+ vmovdqa 16*j(arg1), \T_key
5111045 i = (9-\num_initial_blocks)
5121046 setreg
5131047 .rep \num_initial_blocks
....@@ -519,9 +1053,9 @@
5191053 i = (9-\num_initial_blocks)
5201054 setreg
5211055 .rep \num_initial_blocks
522
- vmovdqu (arg3, %r11), \T1
1056
+ vmovdqu (arg4, %r11), \T1
5231057 vpxor \T1, reg_i, reg_i
524
- vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks
1058
+ vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks
5251059 add $16, %r11
5261060 .if \ENC_DEC == DEC
5271061 vmovdqa \T1, reg_i
....@@ -595,9 +1129,9 @@
5951129 vpxor \T_key, \XMM7, \XMM7
5961130 vpxor \T_key, \XMM8, \XMM8
5971131
598
- i = 1
599
- setreg
600
-.rep 9 # do 9 rounds
1132
+ i = 1
1133
+ setreg
1134
+.rep \REP # do REP rounds
6011135 vmovdqa 16*i(arg1), \T_key
6021136 vaesenc \T_key, \XMM1, \XMM1
6031137 vaesenc \T_key, \XMM2, \XMM2
....@@ -607,10 +1141,9 @@
6071141 vaesenc \T_key, \XMM6, \XMM6
6081142 vaesenc \T_key, \XMM7, \XMM7
6091143 vaesenc \T_key, \XMM8, \XMM8
610
- i = (i+1)
611
- setreg
1144
+ i = (i+1)
1145
+ setreg
6121146 .endr
613
-
6141147
6151148 vmovdqa 16*i(arg1), \T_key
6161149 vaesenclast \T_key, \XMM1, \XMM1
....@@ -622,58 +1155,58 @@
6221155 vaesenclast \T_key, \XMM7, \XMM7
6231156 vaesenclast \T_key, \XMM8, \XMM8
6241157
625
- vmovdqu (arg3, %r11), \T1
1158
+ vmovdqu (arg4, %r11), \T1
6261159 vpxor \T1, \XMM1, \XMM1
627
- vmovdqu \XMM1, (arg2 , %r11)
1160
+ vmovdqu \XMM1, (arg3 , %r11)
6281161 .if \ENC_DEC == DEC
6291162 vmovdqa \T1, \XMM1
6301163 .endif
6311164
632
- vmovdqu 16*1(arg3, %r11), \T1
1165
+ vmovdqu 16*1(arg4, %r11), \T1
6331166 vpxor \T1, \XMM2, \XMM2
634
- vmovdqu \XMM2, 16*1(arg2 , %r11)
1167
+ vmovdqu \XMM2, 16*1(arg3 , %r11)
6351168 .if \ENC_DEC == DEC
6361169 vmovdqa \T1, \XMM2
6371170 .endif
6381171
639
- vmovdqu 16*2(arg3, %r11), \T1
1172
+ vmovdqu 16*2(arg4, %r11), \T1
6401173 vpxor \T1, \XMM3, \XMM3
641
- vmovdqu \XMM3, 16*2(arg2 , %r11)
1174
+ vmovdqu \XMM3, 16*2(arg3 , %r11)
6421175 .if \ENC_DEC == DEC
6431176 vmovdqa \T1, \XMM3
6441177 .endif
6451178
646
- vmovdqu 16*3(arg3, %r11), \T1
1179
+ vmovdqu 16*3(arg4, %r11), \T1
6471180 vpxor \T1, \XMM4, \XMM4
648
- vmovdqu \XMM4, 16*3(arg2 , %r11)
1181
+ vmovdqu \XMM4, 16*3(arg3 , %r11)
6491182 .if \ENC_DEC == DEC
6501183 vmovdqa \T1, \XMM4
6511184 .endif
6521185
653
- vmovdqu 16*4(arg3, %r11), \T1
1186
+ vmovdqu 16*4(arg4, %r11), \T1
6541187 vpxor \T1, \XMM5, \XMM5
655
- vmovdqu \XMM5, 16*4(arg2 , %r11)
1188
+ vmovdqu \XMM5, 16*4(arg3 , %r11)
6561189 .if \ENC_DEC == DEC
6571190 vmovdqa \T1, \XMM5
6581191 .endif
6591192
660
- vmovdqu 16*5(arg3, %r11), \T1
1193
+ vmovdqu 16*5(arg4, %r11), \T1
6611194 vpxor \T1, \XMM6, \XMM6
662
- vmovdqu \XMM6, 16*5(arg2 , %r11)
1195
+ vmovdqu \XMM6, 16*5(arg3 , %r11)
6631196 .if \ENC_DEC == DEC
6641197 vmovdqa \T1, \XMM6
6651198 .endif
6661199
667
- vmovdqu 16*6(arg3, %r11), \T1
1200
+ vmovdqu 16*6(arg4, %r11), \T1
6681201 vpxor \T1, \XMM7, \XMM7
669
- vmovdqu \XMM7, 16*6(arg2 , %r11)
1202
+ vmovdqu \XMM7, 16*6(arg3 , %r11)
6701203 .if \ENC_DEC == DEC
6711204 vmovdqa \T1, \XMM7
6721205 .endif
6731206
674
- vmovdqu 16*7(arg3, %r11), \T1
1207
+ vmovdqu 16*7(arg4, %r11), \T1
6751208 vpxor \T1, \XMM8, \XMM8
676
- vmovdqu \XMM8, 16*7(arg2 , %r11)
1209
+ vmovdqu \XMM8, 16*7(arg3 , %r11)
6771210 .if \ENC_DEC == DEC
6781211 vmovdqa \T1, \XMM8
6791212 .endif
....@@ -698,9 +1231,9 @@
6981231
6991232 # encrypt 8 blocks at a time
7001233 # ghash the 8 previously encrypted ciphertext blocks
701
-# arg1, arg2, arg3 are used as pointers only, not modified
1234
+# arg1, arg3, arg4 are used as pointers only, not modified
7021235 # r11 is the data offset value
703
-.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1236
+.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
7041237
7051238 vmovdqa \XMM1, \T2
7061239 vmovdqa \XMM2, TMP2(%rsp)
....@@ -784,14 +1317,14 @@
7841317
7851318 #######################################################################
7861319
787
- vmovdqa HashKey_8(arg1), \T5
1320
+ vmovdqu HashKey_8(arg2), \T5
7881321 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
7891322 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
7901323
7911324 vpshufd $0b01001110, \T2, \T6
7921325 vpxor \T2, \T6, \T6
7931326
794
- vmovdqa HashKey_8_k(arg1), \T5
1327
+ vmovdqu HashKey_8_k(arg2), \T5
7951328 vpclmulqdq $0x00, \T5, \T6, \T6
7961329
7971330 vmovdqu 16*3(arg1), \T1
....@@ -805,7 +1338,7 @@
8051338 vaesenc \T1, \XMM8, \XMM8
8061339
8071340 vmovdqa TMP2(%rsp), \T1
808
- vmovdqa HashKey_7(arg1), \T5
1341
+ vmovdqu HashKey_7(arg2), \T5
8091342 vpclmulqdq $0x11, \T5, \T1, \T3
8101343 vpxor \T3, \T4, \T4
8111344 vpclmulqdq $0x00, \T5, \T1, \T3
....@@ -813,7 +1346,7 @@
8131346
8141347 vpshufd $0b01001110, \T1, \T3
8151348 vpxor \T1, \T3, \T3
816
- vmovdqa HashKey_7_k(arg1), \T5
1349
+ vmovdqu HashKey_7_k(arg2), \T5
8171350 vpclmulqdq $0x10, \T5, \T3, \T3
8181351 vpxor \T3, \T6, \T6
8191352
....@@ -830,7 +1363,7 @@
8301363 #######################################################################
8311364
8321365 vmovdqa TMP3(%rsp), \T1
833
- vmovdqa HashKey_6(arg1), \T5
1366
+ vmovdqu HashKey_6(arg2), \T5
8341367 vpclmulqdq $0x11, \T5, \T1, \T3
8351368 vpxor \T3, \T4, \T4
8361369 vpclmulqdq $0x00, \T5, \T1, \T3
....@@ -838,7 +1371,7 @@
8381371
8391372 vpshufd $0b01001110, \T1, \T3
8401373 vpxor \T1, \T3, \T3
841
- vmovdqa HashKey_6_k(arg1), \T5
1374
+ vmovdqu HashKey_6_k(arg2), \T5
8421375 vpclmulqdq $0x10, \T5, \T3, \T3
8431376 vpxor \T3, \T6, \T6
8441377
....@@ -853,7 +1386,7 @@
8531386 vaesenc \T1, \XMM8, \XMM8
8541387
8551388 vmovdqa TMP4(%rsp), \T1
856
- vmovdqa HashKey_5(arg1), \T5
1389
+ vmovdqu HashKey_5(arg2), \T5
8571390 vpclmulqdq $0x11, \T5, \T1, \T3
8581391 vpxor \T3, \T4, \T4
8591392 vpclmulqdq $0x00, \T5, \T1, \T3
....@@ -861,7 +1394,7 @@
8611394
8621395 vpshufd $0b01001110, \T1, \T3
8631396 vpxor \T1, \T3, \T3
864
- vmovdqa HashKey_5_k(arg1), \T5
1397
+ vmovdqu HashKey_5_k(arg2), \T5
8651398 vpclmulqdq $0x10, \T5, \T3, \T3
8661399 vpxor \T3, \T6, \T6
8671400
....@@ -877,7 +1410,7 @@
8771410
8781411
8791412 vmovdqa TMP5(%rsp), \T1
880
- vmovdqa HashKey_4(arg1), \T5
1413
+ vmovdqu HashKey_4(arg2), \T5
8811414 vpclmulqdq $0x11, \T5, \T1, \T3
8821415 vpxor \T3, \T4, \T4
8831416 vpclmulqdq $0x00, \T5, \T1, \T3
....@@ -885,7 +1418,7 @@
8851418
8861419 vpshufd $0b01001110, \T1, \T3
8871420 vpxor \T1, \T3, \T3
888
- vmovdqa HashKey_4_k(arg1), \T5
1421
+ vmovdqu HashKey_4_k(arg2), \T5
8891422 vpclmulqdq $0x10, \T5, \T3, \T3
8901423 vpxor \T3, \T6, \T6
8911424
....@@ -900,7 +1433,7 @@
9001433 vaesenc \T1, \XMM8, \XMM8
9011434
9021435 vmovdqa TMP6(%rsp), \T1
903
- vmovdqa HashKey_3(arg1), \T5
1436
+ vmovdqu HashKey_3(arg2), \T5
9041437 vpclmulqdq $0x11, \T5, \T1, \T3
9051438 vpxor \T3, \T4, \T4
9061439 vpclmulqdq $0x00, \T5, \T1, \T3
....@@ -908,7 +1441,7 @@
9081441
9091442 vpshufd $0b01001110, \T1, \T3
9101443 vpxor \T1, \T3, \T3
911
- vmovdqa HashKey_3_k(arg1), \T5
1444
+ vmovdqu HashKey_3_k(arg2), \T5
9121445 vpclmulqdq $0x10, \T5, \T3, \T3
9131446 vpxor \T3, \T6, \T6
9141447
....@@ -924,7 +1457,7 @@
9241457 vaesenc \T1, \XMM8, \XMM8
9251458
9261459 vmovdqa TMP7(%rsp), \T1
927
- vmovdqa HashKey_2(arg1), \T5
1460
+ vmovdqu HashKey_2(arg2), \T5
9281461 vpclmulqdq $0x11, \T5, \T1, \T3
9291462 vpxor \T3, \T4, \T4
9301463 vpclmulqdq $0x00, \T5, \T1, \T3
....@@ -932,7 +1465,7 @@
9321465
9331466 vpshufd $0b01001110, \T1, \T3
9341467 vpxor \T1, \T3, \T3
935
- vmovdqa HashKey_2_k(arg1), \T5
1468
+ vmovdqu HashKey_2_k(arg2), \T5
9361469 vpclmulqdq $0x10, \T5, \T3, \T3
9371470 vpxor \T3, \T6, \T6
9381471
....@@ -949,7 +1482,7 @@
9491482 vaesenc \T5, \XMM8, \XMM8
9501483
9511484 vmovdqa TMP8(%rsp), \T1
952
- vmovdqa HashKey(arg1), \T5
1485
+ vmovdqu HashKey(arg2), \T5
9531486 vpclmulqdq $0x11, \T5, \T1, \T3
9541487 vpxor \T3, \T4, \T4
9551488 vpclmulqdq $0x00, \T5, \T1, \T3
....@@ -957,7 +1490,7 @@
9571490
9581491 vpshufd $0b01001110, \T1, \T3
9591492 vpxor \T1, \T3, \T3
960
- vmovdqa HashKey_k(arg1), \T5
1493
+ vmovdqu HashKey_k(arg2), \T5
9611494 vpclmulqdq $0x10, \T5, \T3, \T3
9621495 vpxor \T3, \T6, \T6
9631496
....@@ -966,17 +1499,35 @@
9661499
9671500 vmovdqu 16*10(arg1), \T5
9681501
1502
+ i = 11
1503
+ setreg
1504
+.rep (\REP-9)
1505
+
1506
+ vaesenc \T5, \XMM1, \XMM1
1507
+ vaesenc \T5, \XMM2, \XMM2
1508
+ vaesenc \T5, \XMM3, \XMM3
1509
+ vaesenc \T5, \XMM4, \XMM4
1510
+ vaesenc \T5, \XMM5, \XMM5
1511
+ vaesenc \T5, \XMM6, \XMM6
1512
+ vaesenc \T5, \XMM7, \XMM7
1513
+ vaesenc \T5, \XMM8, \XMM8
1514
+
1515
+ vmovdqu 16*i(arg1), \T5
1516
+ i = i + 1
1517
+ setreg
1518
+.endr
1519
+
9691520 i = 0
9701521 j = 1
9711522 setreg
9721523 .rep 8
973
- vpxor 16*i(arg3, %r11), \T5, \T2
1524
+ vpxor 16*i(arg4, %r11), \T5, \T2
9741525 .if \ENC_DEC == ENC
9751526 vaesenclast \T2, reg_j, reg_j
9761527 .else
9771528 vaesenclast \T2, reg_j, \T3
978
- vmovdqu 16*i(arg3, %r11), reg_j
979
- vmovdqu \T3, 16*i(arg2, %r11)
1529
+ vmovdqu 16*i(arg4, %r11), reg_j
1530
+ vmovdqu \T3, 16*i(arg3, %r11)
9801531 .endif
9811532 i = (i+1)
9821533 j = (j+1)
....@@ -1008,14 +1559,14 @@
10081559 vpxor \T2, \T7, \T7 # first phase of the reduction complete
10091560 #######################################################################
10101561 .if \ENC_DEC == ENC
1011
- vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
1012
- vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
1013
- vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
1014
- vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
1015
- vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
1016
- vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
1017
- vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
1018
- vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
1562
+ vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
1563
+ vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
1564
+ vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
1565
+ vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
1566
+ vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
1567
+ vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
1568
+ vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
1569
+ vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
10191570 .endif
10201571
10211572 #######################################################################
....@@ -1056,25 +1607,25 @@
10561607
10571608 vpshufd $0b01001110, \XMM1, \T2
10581609 vpxor \XMM1, \T2, \T2
1059
- vmovdqa HashKey_8(arg1), \T5
1610
+ vmovdqu HashKey_8(arg2), \T5
10601611 vpclmulqdq $0x11, \T5, \XMM1, \T6
10611612 vpclmulqdq $0x00, \T5, \XMM1, \T7
10621613
1063
- vmovdqa HashKey_8_k(arg1), \T3
1614
+ vmovdqu HashKey_8_k(arg2), \T3
10641615 vpclmulqdq $0x00, \T3, \T2, \XMM1
10651616
10661617 ######################
10671618
10681619 vpshufd $0b01001110, \XMM2, \T2
10691620 vpxor \XMM2, \T2, \T2
1070
- vmovdqa HashKey_7(arg1), \T5
1621
+ vmovdqu HashKey_7(arg2), \T5
10711622 vpclmulqdq $0x11, \T5, \XMM2, \T4
10721623 vpxor \T4, \T6, \T6
10731624
10741625 vpclmulqdq $0x00, \T5, \XMM2, \T4
10751626 vpxor \T4, \T7, \T7
10761627
1077
- vmovdqa HashKey_7_k(arg1), \T3
1628
+ vmovdqu HashKey_7_k(arg2), \T3
10781629 vpclmulqdq $0x00, \T3, \T2, \T2
10791630 vpxor \T2, \XMM1, \XMM1
10801631
....@@ -1082,14 +1633,14 @@
10821633
10831634 vpshufd $0b01001110, \XMM3, \T2
10841635 vpxor \XMM3, \T2, \T2
1085
- vmovdqa HashKey_6(arg1), \T5
1636
+ vmovdqu HashKey_6(arg2), \T5
10861637 vpclmulqdq $0x11, \T5, \XMM3, \T4
10871638 vpxor \T4, \T6, \T6
10881639
10891640 vpclmulqdq $0x00, \T5, \XMM3, \T4
10901641 vpxor \T4, \T7, \T7
10911642
1092
- vmovdqa HashKey_6_k(arg1), \T3
1643
+ vmovdqu HashKey_6_k(arg2), \T3
10931644 vpclmulqdq $0x00, \T3, \T2, \T2
10941645 vpxor \T2, \XMM1, \XMM1
10951646
....@@ -1097,14 +1648,14 @@
10971648
10981649 vpshufd $0b01001110, \XMM4, \T2
10991650 vpxor \XMM4, \T2, \T2
1100
- vmovdqa HashKey_5(arg1), \T5
1651
+ vmovdqu HashKey_5(arg2), \T5
11011652 vpclmulqdq $0x11, \T5, \XMM4, \T4
11021653 vpxor \T4, \T6, \T6
11031654
11041655 vpclmulqdq $0x00, \T5, \XMM4, \T4
11051656 vpxor \T4, \T7, \T7
11061657
1107
- vmovdqa HashKey_5_k(arg1), \T3
1658
+ vmovdqu HashKey_5_k(arg2), \T3
11081659 vpclmulqdq $0x00, \T3, \T2, \T2
11091660 vpxor \T2, \XMM1, \XMM1
11101661
....@@ -1112,14 +1663,14 @@
11121663
11131664 vpshufd $0b01001110, \XMM5, \T2
11141665 vpxor \XMM5, \T2, \T2
1115
- vmovdqa HashKey_4(arg1), \T5
1666
+ vmovdqu HashKey_4(arg2), \T5
11161667 vpclmulqdq $0x11, \T5, \XMM5, \T4
11171668 vpxor \T4, \T6, \T6
11181669
11191670 vpclmulqdq $0x00, \T5, \XMM5, \T4
11201671 vpxor \T4, \T7, \T7
11211672
1122
- vmovdqa HashKey_4_k(arg1), \T3
1673
+ vmovdqu HashKey_4_k(arg2), \T3
11231674 vpclmulqdq $0x00, \T3, \T2, \T2
11241675 vpxor \T2, \XMM1, \XMM1
11251676
....@@ -1127,14 +1678,14 @@
11271678
11281679 vpshufd $0b01001110, \XMM6, \T2
11291680 vpxor \XMM6, \T2, \T2
1130
- vmovdqa HashKey_3(arg1), \T5
1681
+ vmovdqu HashKey_3(arg2), \T5
11311682 vpclmulqdq $0x11, \T5, \XMM6, \T4
11321683 vpxor \T4, \T6, \T6
11331684
11341685 vpclmulqdq $0x00, \T5, \XMM6, \T4
11351686 vpxor \T4, \T7, \T7
11361687
1137
- vmovdqa HashKey_3_k(arg1), \T3
1688
+ vmovdqu HashKey_3_k(arg2), \T3
11381689 vpclmulqdq $0x00, \T3, \T2, \T2
11391690 vpxor \T2, \XMM1, \XMM1
11401691
....@@ -1142,14 +1693,14 @@
11421693
11431694 vpshufd $0b01001110, \XMM7, \T2
11441695 vpxor \XMM7, \T2, \T2
1145
- vmovdqa HashKey_2(arg1), \T5
1696
+ vmovdqu HashKey_2(arg2), \T5
11461697 vpclmulqdq $0x11, \T5, \XMM7, \T4
11471698 vpxor \T4, \T6, \T6
11481699
11491700 vpclmulqdq $0x00, \T5, \XMM7, \T4
11501701 vpxor \T4, \T7, \T7
11511702
1152
- vmovdqa HashKey_2_k(arg1), \T3
1703
+ vmovdqu HashKey_2_k(arg2), \T3
11531704 vpclmulqdq $0x00, \T3, \T2, \T2
11541705 vpxor \T2, \XMM1, \XMM1
11551706
....@@ -1157,14 +1708,14 @@
11571708
11581709 vpshufd $0b01001110, \XMM8, \T2
11591710 vpxor \XMM8, \T2, \T2
1160
- vmovdqa HashKey(arg1), \T5
1711
+ vmovdqu HashKey(arg2), \T5
11611712 vpclmulqdq $0x11, \T5, \XMM8, \T4
11621713 vpxor \T4, \T6, \T6
11631714
11641715 vpclmulqdq $0x00, \T5, \XMM8, \T4
11651716 vpxor \T4, \T7, \T7
11661717
1167
- vmovdqa HashKey_k(arg1), \T3
1718
+ vmovdqu HashKey_k(arg2), \T3
11681719 vpclmulqdq $0x00, \T3, \T2, \T2
11691720
11701721 vpxor \T2, \XMM1, \XMM1
....@@ -1210,416 +1761,112 @@
12101761
12111762 .endm
12121763
1213
-
1214
-# combined for GCM encrypt and decrypt functions
1215
-# clobbering all xmm registers
1216
-# clobbering r10, r11, r12, r13, r14, r15
1217
-.macro GCM_ENC_DEC_AVX ENC_DEC
1218
-
1219
- #the number of pushes must equal STACK_OFFSET
1220
- push %r12
1221
- push %r13
1222
- push %r14
1223
- push %r15
1224
-
1225
- mov %rsp, %r14
1226
-
1227
-
1228
-
1229
-
1230
- sub $VARIABLE_OFFSET, %rsp
1231
- and $~63, %rsp # align rsp to 64 bytes
1232
-
1233
-
1234
- vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
1235
-
1236
- mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
1237
- and $-16, %r13 # r13 = r13 - (r13 mod 16)
1238
-
1239
- mov %r13, %r12
1240
- shr $4, %r12
1241
- and $7, %r12
1242
- jz _initial_num_blocks_is_0\@
1243
-
1244
- cmp $7, %r12
1245
- je _initial_num_blocks_is_7\@
1246
- cmp $6, %r12
1247
- je _initial_num_blocks_is_6\@
1248
- cmp $5, %r12
1249
- je _initial_num_blocks_is_5\@
1250
- cmp $4, %r12
1251
- je _initial_num_blocks_is_4\@
1252
- cmp $3, %r12
1253
- je _initial_num_blocks_is_3\@
1254
- cmp $2, %r12
1255
- je _initial_num_blocks_is_2\@
1256
-
1257
- jmp _initial_num_blocks_is_1\@
1258
-
1259
-_initial_num_blocks_is_7\@:
1260
- INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1261
- sub $16*7, %r13
1262
- jmp _initial_blocks_encrypted\@
1263
-
1264
-_initial_num_blocks_is_6\@:
1265
- INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1266
- sub $16*6, %r13
1267
- jmp _initial_blocks_encrypted\@
1268
-
1269
-_initial_num_blocks_is_5\@:
1270
- INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1271
- sub $16*5, %r13
1272
- jmp _initial_blocks_encrypted\@
1273
-
1274
-_initial_num_blocks_is_4\@:
1275
- INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1276
- sub $16*4, %r13
1277
- jmp _initial_blocks_encrypted\@
1278
-
1279
-_initial_num_blocks_is_3\@:
1280
- INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1281
- sub $16*3, %r13
1282
- jmp _initial_blocks_encrypted\@
1283
-
1284
-_initial_num_blocks_is_2\@:
1285
- INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1286
- sub $16*2, %r13
1287
- jmp _initial_blocks_encrypted\@
1288
-
1289
-_initial_num_blocks_is_1\@:
1290
- INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1291
- sub $16*1, %r13
1292
- jmp _initial_blocks_encrypted\@
1293
-
1294
-_initial_num_blocks_is_0\@:
1295
- INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1296
-
1297
-
1298
-_initial_blocks_encrypted\@:
1299
- cmp $0, %r13
1300
- je _zero_cipher_left\@
1301
-
1302
- sub $128, %r13
1303
- je _eight_cipher_left\@
1304
-
1305
-
1306
-
1307
-
1308
- vmovd %xmm9, %r15d
1309
- and $255, %r15d
1310
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1311
-
1312
-
1313
-_encrypt_by_8_new\@:
1314
- cmp $(255-8), %r15d
1315
- jg _encrypt_by_8\@
1316
-
1317
-
1318
-
1319
- add $8, %r15b
1320
- GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
1321
- add $128, %r11
1322
- sub $128, %r13
1323
- jne _encrypt_by_8_new\@
1324
-
1325
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1326
- jmp _eight_cipher_left\@
1327
-
1328
-_encrypt_by_8\@:
1329
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1330
- add $8, %r15b
1331
- GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
1332
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1333
- add $128, %r11
1334
- sub $128, %r13
1335
- jne _encrypt_by_8_new\@
1336
-
1337
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1338
-
1339
-
1340
-
1341
-
1342
-_eight_cipher_left\@:
1343
- GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
1344
-
1345
-
1346
-_zero_cipher_left\@:
1347
- cmp $16, arg4
1348
- jl _only_less_than_16\@
1349
-
1350
- mov arg4, %r13
1351
- and $15, %r13 # r13 = (arg4 mod 16)
1352
-
1353
- je _multiple_of_16_bytes\@
1354
-
1355
- # handle the last <16 Byte block seperately
1356
-
1357
-
1358
- vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
1359
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1360
- ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
1361
-
1362
- sub $16, %r11
1363
- add %r13, %r11
1364
- vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
1365
-
1366
- lea SHIFT_MASK+16(%rip), %r12
1367
- sub %r13, %r12 # adjust the shuffle mask pointer to be
1368
- # able to shift 16-r13 bytes (r13 is the
1369
- # number of bytes in plaintext mod 16)
1370
- vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
1371
- vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
1372
- jmp _final_ghash_mul\@
1373
-
1374
-_only_less_than_16\@:
1375
- # check for 0 length
1376
- mov arg4, %r13
1377
- and $15, %r13 # r13 = (arg4 mod 16)
1378
-
1379
- je _multiple_of_16_bytes\@
1380
-
1381
- # handle the last <16 Byte block seperately
1382
-
1383
-
1384
- vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
1385
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1386
- ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
1387
-
1388
-
1389
- lea SHIFT_MASK+16(%rip), %r12
1390
- sub %r13, %r12 # adjust the shuffle mask pointer to be
1391
- # able to shift 16-r13 bytes (r13 is the
1392
- # number of bytes in plaintext mod 16)
1393
-
1394
-_get_last_16_byte_loop\@:
1395
- movb (arg3, %r11), %al
1396
- movb %al, TMP1 (%rsp , %r11)
1397
- add $1, %r11
1398
- cmp %r13, %r11
1399
- jne _get_last_16_byte_loop\@
1400
-
1401
- vmovdqu TMP1(%rsp), %xmm1
1402
-
1403
- sub $16, %r11
1404
-
1405
-_final_ghash_mul\@:
1406
- .if \ENC_DEC == DEC
1407
- vmovdqa %xmm1, %xmm2
1408
- vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
1409
- vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
1410
- # mask out top 16-r13 bytes of xmm9
1411
- vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
1412
- vpand %xmm1, %xmm2, %xmm2
1413
- vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
1414
- vpxor %xmm2, %xmm14, %xmm14
1415
- #GHASH computation for the last <16 Byte block
1416
- GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1417
- sub %r13, %r11
1418
- add $16, %r11
1419
- .else
1420
- vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
1421
- vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
1422
- # mask out top 16-r13 bytes of xmm9
1423
- vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
1424
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1425
- vpxor %xmm9, %xmm14, %xmm14
1426
- #GHASH computation for the last <16 Byte block
1427
- GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1428
- sub %r13, %r11
1429
- add $16, %r11
1430
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
1431
- .endif
1432
-
1433
-
1434
- #############################
1435
- # output r13 Bytes
1436
- vmovq %xmm9, %rax
1437
- cmp $8, %r13
1438
- jle _less_than_8_bytes_left\@
1439
-
1440
- mov %rax, (arg2 , %r11)
1441
- add $8, %r11
1442
- vpsrldq $8, %xmm9, %xmm9
1443
- vmovq %xmm9, %rax
1444
- sub $8, %r13
1445
-
1446
-_less_than_8_bytes_left\@:
1447
- movb %al, (arg2 , %r11)
1448
- add $1, %r11
1449
- shr $8, %rax
1450
- sub $1, %r13
1451
- jne _less_than_8_bytes_left\@
1452
- #############################
1453
-
1454
-_multiple_of_16_bytes\@:
1455
- mov arg7, %r12 # r12 = aadLen (number of bytes)
1456
- shl $3, %r12 # convert into number of bits
1457
- vmovd %r12d, %xmm15 # len(A) in xmm15
1458
-
1459
- shl $3, arg4 # len(C) in bits (*128)
1460
- vmovq arg4, %xmm1
1461
- vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
1462
- vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
1463
-
1464
- vpxor %xmm15, %xmm14, %xmm14
1465
- GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
1466
- vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
1467
-
1468
- mov arg5, %rax # rax = *Y0
1469
- vmovdqu (%rax), %xmm9 # xmm9 = Y0
1470
-
1471
- ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
1472
-
1473
- vpxor %xmm14, %xmm9, %xmm9
1474
-
1475
-
1476
-
1477
-_return_T\@:
1478
- mov arg8, %r10 # r10 = authTag
1479
- mov arg9, %r11 # r11 = auth_tag_len
1480
-
1481
- cmp $16, %r11
1482
- je _T_16\@
1483
-
1484
- cmp $8, %r11
1485
- jl _T_4\@
1486
-
1487
-_T_8\@:
1488
- vmovq %xmm9, %rax
1489
- mov %rax, (%r10)
1490
- add $8, %r10
1491
- sub $8, %r11
1492
- vpsrldq $8, %xmm9, %xmm9
1493
- cmp $0, %r11
1494
- je _return_T_done\@
1495
-_T_4\@:
1496
- vmovd %xmm9, %eax
1497
- mov %eax, (%r10)
1498
- add $4, %r10
1499
- sub $4, %r11
1500
- vpsrldq $4, %xmm9, %xmm9
1501
- cmp $0, %r11
1502
- je _return_T_done\@
1503
-_T_123\@:
1504
- vmovd %xmm9, %eax
1505
- cmp $2, %r11
1506
- jl _T_1\@
1507
- mov %ax, (%r10)
1508
- cmp $2, %r11
1509
- je _return_T_done\@
1510
- add $2, %r10
1511
- sar $16, %eax
1512
-_T_1\@:
1513
- mov %al, (%r10)
1514
- jmp _return_T_done\@
1515
-
1516
-_T_16\@:
1517
- vmovdqu %xmm9, (%r10)
1518
-
1519
-_return_T_done\@:
1520
- mov %r14, %rsp
1521
-
1522
- pop %r15
1523
- pop %r14
1524
- pop %r13
1525
- pop %r12
1526
-.endm
1527
-
1528
-
15291764 #############################################################
15301765 #void aesni_gcm_precomp_avx_gen2
15311766 # (gcm_data *my_ctx_data,
1532
-# u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1767
+# gcm_context_data *data,
1768
+# u8 *iv, /* Pre-counter block j0: 4 byte salt
1769
+# (from Security Association) concatenated with 8 byte
1770
+# Initialisation Vector (from IPSec ESP Payload)
1771
+# concatenated with 0x00000001. 16-byte aligned pointer. */
1772
+# u8 *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1773
+# const u8 *aad, /* Additional Authentication Data (AAD)*/
1774
+# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
15331775 #############################################################
1534
-ENTRY(aesni_gcm_precomp_avx_gen2)
1535
- #the number of pushes must equal STACK_OFFSET
1536
- push %r12
1537
- push %r13
1538
- push %r14
1539
- push %r15
1540
-
1541
- mov %rsp, %r14
1542
-
1543
-
1544
-
1545
- sub $VARIABLE_OFFSET, %rsp
1546
- and $~63, %rsp # align rsp to 64 bytes
1547
-
1548
- vmovdqu (arg2), %xmm6 # xmm6 = HashKey
1549
-
1550
- vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
1551
- ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
1552
- vmovdqa %xmm6, %xmm2
1553
- vpsllq $1, %xmm6, %xmm6
1554
- vpsrlq $63, %xmm2, %xmm2
1555
- vmovdqa %xmm2, %xmm1
1556
- vpslldq $8, %xmm2, %xmm2
1557
- vpsrldq $8, %xmm1, %xmm1
1558
- vpor %xmm2, %xmm6, %xmm6
1559
- #reduction
1560
- vpshufd $0b00100100, %xmm1, %xmm2
1561
- vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
1562
- vpand POLY(%rip), %xmm2, %xmm2
1563
- vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
1564
- #######################################################################
1565
- vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
1566
-
1567
-
1568
- PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
1569
-
1570
- mov %r14, %rsp
1571
-
1572
- pop %r15
1573
- pop %r14
1574
- pop %r13
1575
- pop %r12
1576
- ret
1577
-ENDPROC(aesni_gcm_precomp_avx_gen2)
1776
+SYM_FUNC_START(aesni_gcm_init_avx_gen2)
1777
+ FUNC_SAVE
1778
+ INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
1779
+ FUNC_RESTORE
1780
+ RET
1781
+SYM_FUNC_END(aesni_gcm_init_avx_gen2)
15781782
15791783 ###############################################################################
1580
-#void aesni_gcm_enc_avx_gen2(
1784
+#void aesni_gcm_enc_update_avx_gen2(
15811785 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1786
+# gcm_context_data *data,
15821787 # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
15831788 # const u8 *in, /* Plaintext input */
1584
-# u64 plaintext_len, /* Length of data in Bytes for encryption. */
1585
-# u8 *iv, /* Pre-counter block j0: 4 byte salt
1586
-# (from Security Association) concatenated with 8 byte
1587
-# Initialisation Vector (from IPSec ESP Payload)
1588
-# concatenated with 0x00000001. 16-byte aligned pointer. */
1589
-# const u8 *aad, /* Additional Authentication Data (AAD)*/
1590
-# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1591
-# u8 *auth_tag, /* Authenticated Tag output. */
1592
-# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
1593
-# Valid values are 16 (most likely), 12 or 8. */
1789
+# u64 plaintext_len) /* Length of data in Bytes for encryption. */
15941790 ###############################################################################
1595
-ENTRY(aesni_gcm_enc_avx_gen2)
1596
- GCM_ENC_DEC_AVX ENC
1597
- ret
1598
-ENDPROC(aesni_gcm_enc_avx_gen2)
1791
+SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
1792
+ FUNC_SAVE
1793
+ mov keysize, %eax
1794
+ cmp $32, %eax
1795
+ je key_256_enc_update
1796
+ cmp $16, %eax
1797
+ je key_128_enc_update
1798
+ # must be 192
1799
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
1800
+ FUNC_RESTORE
1801
+ RET
1802
+key_128_enc_update:
1803
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
1804
+ FUNC_RESTORE
1805
+ RET
1806
+key_256_enc_update:
1807
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
1808
+ FUNC_RESTORE
1809
+ RET
1810
+SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
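###############################################################################
# Editor's note (illustrative sketch, not part of the original file): the
# keysize dispatch above passes 9, 11 or 13 as the last GCM_ENC_DEC argument.
# That value is the number of "middle" AES rounds (the .rep \REP vaesenc
# block), i.e. Nr - 1, with the final round done separately via vaesenclast.
# A minimal C sketch of the mapping (gcm_avx_rep_rounds is a hypothetical
# helper name):
#
#	static int gcm_avx_rep_rounds(unsigned int key_len_bytes)
#	{
#		switch (key_len_bytes) {
#		case 16: return 9;	/* AES-128: 10 rounds total */
#		case 24: return 11;	/* AES-192: 12 rounds total */
#		case 32: return 13;	/* AES-256: 14 rounds total */
#		default: return -1;	/* unsupported key size */
#		}
#	}
###############################################################################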
15991811
16001812 ###############################################################################
1601
-#void aesni_gcm_dec_avx_gen2(
1813
+#void aesni_gcm_dec_update_avx_gen2(
16021814 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1815
+# gcm_context_data *data,
16031816 # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
16041817 # const u8 *in, /* Ciphertext input */
1605
-# u64 plaintext_len, /* Length of data in Bytes for encryption. */
1606
-# u8 *iv, /* Pre-counter block j0: 4 byte salt
1607
-# (from Security Association) concatenated with 8 byte
1608
-# Initialisation Vector (from IPSec ESP Payload)
1609
-# concatenated with 0x00000001. 16-byte aligned pointer. */
1610
-# const u8 *aad, /* Additional Authentication Data (AAD)*/
1611
-# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1818
+# u64 plaintext_len) /* Length of data in Bytes for encryption. */
1819
+###############################################################################
1820
+SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
1821
+ FUNC_SAVE
1822
+ mov keysize,%eax
1823
+ cmp $32, %eax
1824
+ je key_256_dec_update
1825
+ cmp $16, %eax
1826
+ je key_128_dec_update
1827
+ # must be 192
1828
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
1829
+ FUNC_RESTORE
1830
+ RET
1831
+key_128_dec_update:
1832
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
1833
+ FUNC_RESTORE
1834
+ RET
1835
+key_256_dec_update:
1836
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
1837
+ FUNC_RESTORE
1838
+ RET
1839
+SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
1840
+
1841
+###############################################################################
1842
+#void aesni_gcm_finalize_avx_gen2(
1843
+# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1844
+# gcm_context_data *data,
16121845 # u8 *auth_tag, /* Authenticated Tag output. */
16131846 # u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
16141847 # Valid values are 16 (most likely), 12 or 8. */
16151848 ###############################################################################
1616
-ENTRY(aesni_gcm_dec_avx_gen2)
1617
- GCM_ENC_DEC_AVX DEC
1618
- ret
1619
-ENDPROC(aesni_gcm_dec_avx_gen2)
1620
-#endif /* CONFIG_AS_AVX */
1849
+SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
1850
+ FUNC_SAVE
1851
+ mov keysize,%eax
1852
+ cmp $32, %eax
1853
+ je key_256_finalize
1854
+ cmp $16, %eax
1855
+ je key_128_finalize
1856
+ # must be 192
1857
+ GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
1858
+ FUNC_RESTORE
1859
+ RET
1860
+key_128_finalize:
1861
+ GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
1862
+ FUNC_RESTORE
1863
+ RET
1864
+key_256_finalize:
1865
+ GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
1866
+ FUNC_RESTORE
1867
+ RET
1868
+SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
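###############################################################################
# Editor's note (illustrative sketch, not part of the original file): the gen2
# entry points above are meant to be driven from C as an init/update/finalize
# sequence.  Argument names and their order follow the prototype comments
# above; aes_ctx and data stand for the caller's key schedule and
# gcm_context_data buffers and are assumptions of this sketch.
#
#	/* one-shot AES-GCM encryption via the AVX (gen2) path, roughly: */
#	aesni_gcm_init_avx_gen2(aes_ctx, data, hash_subkey, iv, aad, aad_len);
#	aesni_gcm_enc_update_avx_gen2(aes_ctx, data, out, in, plaintext_len);
#	aesni_gcm_finalize_avx_gen2(aes_ctx, data, auth_tag, auth_tag_len);
#
# Decryption is the same sequence with aesni_gcm_dec_update_avx_gen2 in the
# middle, followed by comparing the computed tag against the received one.
###############################################################################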
16211869
1622
-#ifdef CONFIG_AS_AVX2
16231870 ###############################################################################
16241871 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
16251872 # Input: A and B (128-bits each, bit-reflected)
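###############################################################################
# Editor's note (reference sketch, not part of the original file): the
# polynomial written as (128,127,126,121,0) is the reversed (bit-reflected)
# form of the GCM polynomial x^128 + x^7 + x^2 + x + 1.  For comparison, a
# plain bit-at-a-time GF(2^128) multiply in the spec's own bit order looks
# roughly like the C below; the macro computes the same product with
# carry-less multiplies and a two-phase reduction on pre-reflected operands.
#
#	static void gf128_mul(const uint8_t X[16], const uint8_t Y[16],
#			      uint8_t out[16])
#	{
#		uint8_t Z[16] = { 0 }, V[16];
#		int i, k, lsb;
#
#		memcpy(V, Y, 16);		/* needs <string.h>, <stdint.h> */
#		for (i = 0; i < 128; i++) {
#			if ((X[i / 8] >> (7 - (i % 8))) & 1)	/* bit i of X */
#				for (k = 0; k < 16; k++)
#					Z[k] ^= V[k];
#			lsb = V[15] & 1;
#			for (k = 15; k > 0; k--)		/* V >>= 1 */
#				V[k] = (V[k] >> 1) | (V[k - 1] << 7);
#			V[0] >>= 1;
#			if (lsb)
#				V[0] ^= 0xE1;	/* xor with R = 0xE1 || 0^120 */
#		}
#		memcpy(out, Z, 16);
#	}
###############################################################################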
....@@ -1670,113 +1917,42 @@
16701917 # HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
16711918 vmovdqa \HK, \T5
16721919 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
1673
- vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
1920
+ vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
16741921
16751922 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
1676
- vmovdqa \T5, HashKey_3(arg1)
1923
+ vmovdqu \T5, HashKey_3(arg2)
16771924
16781925 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
1679
- vmovdqa \T5, HashKey_4(arg1)
1926
+ vmovdqu \T5, HashKey_4(arg2)
16801927
16811928 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
1682
- vmovdqa \T5, HashKey_5(arg1)
1929
+ vmovdqu \T5, HashKey_5(arg2)
16831930
16841931 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
1685
- vmovdqa \T5, HashKey_6(arg1)
1932
+ vmovdqu \T5, HashKey_6(arg2)
16861933
16871934 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
1688
- vmovdqa \T5, HashKey_7(arg1)
1935
+ vmovdqu \T5, HashKey_7(arg2)
16891936
16901937 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
1691
- vmovdqa \T5, HashKey_8(arg1)
1938
+ vmovdqu \T5, HashKey_8(arg2)
16921939
16931940 .endm
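###############################################################################
# Editor's note (conceptual sketch, not part of the original file): the macro
# above fills HashKey_2 .. HashKey_8 with consecutive powers of the hash
# subkey, each kept in the shifted (<<1 mod poly), byte-swapped form that the
# PCLMULQDQ code expects, so the 8-block GHASH loop can use one independent
# multiply per block.  Ignoring that representation detail, the table is
# built like this (gf128_mul as in the reference sketch further up):
#
#	static void precompute_hashkey_powers(uint8_t table[8][16],
#					      const uint8_t H[16])
#	{
#		int i;
#
#		memcpy(table[0], H, 16);			/* H^1 */
#		for (i = 1; i < 8; i++)
#			gf128_mul(table[i - 1], H, table[i]);	/* H^(i+1) */
#	}
###############################################################################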
1694
-
16951941
16961942 ## if a = number of total plaintext bytes
16971943 ## b = floor(a/16)
16981944 ## num_initial_blocks = b mod 4#
16991945 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
17001946 ## r10, r11, r12, rax are clobbered
1701
-## arg1, arg2, arg3, r14 are used as a pointer only, not modified
1947
+## arg1, arg3, arg4, r14 are used as a pointer only, not modified
17021948
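###############################################################################
# Editor's note (illustrative sketch, not part of the original file): the
# arithmetic described above, as a hypothetical C helper.  Note that the
# GCM_ENC_DEC dispatch actually takes the block count modulo 8 (see the
# "and $7, %r12" in that code), since the main loop processes 8 blocks per
# iteration; the "mod 4" above appears to be a stale comment.
#
#	static unsigned long initial_blocks(unsigned long plaintext_len)
#	{
#		unsigned long b = plaintext_len / 16;	/* b = floor(a/16) */
#
#		return b % 8;	/* blocks handled before the 8-wide loop */
#	}
###############################################################################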
1703
-.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1949
+.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
17041950 i = (8-\num_initial_blocks)
1705
- j = 0
17061951 setreg
1707
-
1708
- mov arg6, %r10 # r10 = AAD
1709
- mov arg7, %r12 # r12 = aadLen
1710
-
1711
-
1712
- mov %r12, %r11
1713
-
1714
- vpxor reg_j, reg_j, reg_j
1715
- vpxor reg_i, reg_i, reg_i
1716
-
1717
- cmp $16, %r11
1718
- jl _get_AAD_rest8\@
1719
-_get_AAD_blocks\@:
1720
- vmovdqu (%r10), reg_i
1721
- vpshufb SHUF_MASK(%rip), reg_i, reg_i
1722
- vpxor reg_i, reg_j, reg_j
1723
- GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6
1724
- add $16, %r10
1725
- sub $16, %r12
1726
- sub $16, %r11
1727
- cmp $16, %r11
1728
- jge _get_AAD_blocks\@
1729
- vmovdqu reg_j, reg_i
1730
- cmp $0, %r11
1731
- je _get_AAD_done\@
1732
-
1733
- vpxor reg_i, reg_i, reg_i
1734
-
1735
- /* read the last <16B of AAD. since we have at least 4B of
1736
- data right after the AAD (the ICV, and maybe some CT), we can
1737
- read 4B/8B blocks safely, and then get rid of the extra stuff */
1738
-_get_AAD_rest8\@:
1739
- cmp $4, %r11
1740
- jle _get_AAD_rest4\@
1741
- movq (%r10), \T1
1742
- add $8, %r10
1743
- sub $8, %r11
1744
- vpslldq $8, \T1, \T1
1745
- vpsrldq $8, reg_i, reg_i
1746
- vpxor \T1, reg_i, reg_i
1747
- jmp _get_AAD_rest8\@
1748
-_get_AAD_rest4\@:
1749
- cmp $0, %r11
1750
- jle _get_AAD_rest0\@
1751
- mov (%r10), %eax
1752
- movq %rax, \T1
1753
- add $4, %r10
1754
- sub $4, %r11
1755
- vpslldq $12, \T1, \T1
1756
- vpsrldq $4, reg_i, reg_i
1757
- vpxor \T1, reg_i, reg_i
1758
-_get_AAD_rest0\@:
1759
- /* finalize: shift out the extra bytes we read, and align
1760
- left. since pslldq can only shift by an immediate, we use
1761
- vpshufb and an array of shuffle masks */
1762
- movq %r12, %r11
1763
- salq $4, %r11
1764
- movdqu aad_shift_arr(%r11), \T1
1765
- vpshufb \T1, reg_i, reg_i
1766
-_get_AAD_rest_final\@:
1767
- vpshufb SHUF_MASK(%rip), reg_i, reg_i
1768
- vpxor reg_j, reg_i, reg_i
1769
- GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6
1770
-
1771
-_get_AAD_done\@:
1772
- # initialize the data pointer offset as zero
1773
- xor %r11d, %r11d
1952
+ vmovdqu AadHash(arg2), reg_i
17741953
17751954 # start AES for num_initial_blocks blocks
1776
- mov arg5, %rax # rax = *Y0
1777
- vmovdqu (%rax), \CTR # CTR = Y0
1778
- vpshufb SHUF_MASK(%rip), \CTR, \CTR
1779
-
1955
+ vmovdqu CurCount(arg2), \CTR
17801956
17811957 i = (9-\num_initial_blocks)
17821958 setreg
....@@ -1799,7 +1975,7 @@
17991975
18001976 j = 1
18011977 setreg
1802
-.rep 9
1978
+.rep \REP
18031979 vmovdqa 16*j(arg1), \T_key
18041980 i = (9-\num_initial_blocks)
18051981 setreg
....@@ -1814,7 +1990,7 @@
18141990 .endr
18151991
18161992
1817
- vmovdqa 16*10(arg1), \T_key
1993
+ vmovdqa 16*j(arg1), \T_key
18181994 i = (9-\num_initial_blocks)
18191995 setreg
18201996 .rep \num_initial_blocks
....@@ -1826,9 +2002,9 @@
18262002 i = (9-\num_initial_blocks)
18272003 setreg
18282004 .rep \num_initial_blocks
1829
- vmovdqu (arg3, %r11), \T1
2005
+ vmovdqu (arg4, %r11), \T1
18302006 vpxor \T1, reg_i, reg_i
1831
- vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for
2007
+ vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for
18322008 # num_initial_blocks blocks
18332009 add $16, %r11
18342010 .if \ENC_DEC == DEC
....@@ -1905,7 +2081,7 @@
19052081
19062082 i = 1
19072083 setreg
1908
-.rep 9 # do 9 rounds
2084
+.rep \REP # do REP rounds
19092085 vmovdqa 16*i(arg1), \T_key
19102086 vaesenc \T_key, \XMM1, \XMM1
19112087 vaesenc \T_key, \XMM2, \XMM2
....@@ -1930,58 +2106,58 @@
19302106 vaesenclast \T_key, \XMM7, \XMM7
19312107 vaesenclast \T_key, \XMM8, \XMM8
19322108
1933
- vmovdqu (arg3, %r11), \T1
2109
+ vmovdqu (arg4, %r11), \T1
19342110 vpxor \T1, \XMM1, \XMM1
1935
- vmovdqu \XMM1, (arg2 , %r11)
2111
+ vmovdqu \XMM1, (arg3 , %r11)
19362112 .if \ENC_DEC == DEC
19372113 vmovdqa \T1, \XMM1
19382114 .endif
19392115
1940
- vmovdqu 16*1(arg3, %r11), \T1
2116
+ vmovdqu 16*1(arg4, %r11), \T1
19412117 vpxor \T1, \XMM2, \XMM2
1942
- vmovdqu \XMM2, 16*1(arg2 , %r11)
2118
+ vmovdqu \XMM2, 16*1(arg3 , %r11)
19432119 .if \ENC_DEC == DEC
19442120 vmovdqa \T1, \XMM2
19452121 .endif
19462122
1947
- vmovdqu 16*2(arg3, %r11), \T1
2123
+ vmovdqu 16*2(arg4, %r11), \T1
19482124 vpxor \T1, \XMM3, \XMM3
1949
- vmovdqu \XMM3, 16*2(arg2 , %r11)
2125
+ vmovdqu \XMM3, 16*2(arg3 , %r11)
19502126 .if \ENC_DEC == DEC
19512127 vmovdqa \T1, \XMM3
19522128 .endif
19532129
1954
- vmovdqu 16*3(arg3, %r11), \T1
2130
+ vmovdqu 16*3(arg4, %r11), \T1
19552131 vpxor \T1, \XMM4, \XMM4
1956
- vmovdqu \XMM4, 16*3(arg2 , %r11)
2132
+ vmovdqu \XMM4, 16*3(arg3 , %r11)
19572133 .if \ENC_DEC == DEC
19582134 vmovdqa \T1, \XMM4
19592135 .endif
19602136
1961
- vmovdqu 16*4(arg3, %r11), \T1
2137
+ vmovdqu 16*4(arg4, %r11), \T1
19622138 vpxor \T1, \XMM5, \XMM5
1963
- vmovdqu \XMM5, 16*4(arg2 , %r11)
2139
+ vmovdqu \XMM5, 16*4(arg3 , %r11)
19642140 .if \ENC_DEC == DEC
19652141 vmovdqa \T1, \XMM5
19662142 .endif
19672143
1968
- vmovdqu 16*5(arg3, %r11), \T1
2144
+ vmovdqu 16*5(arg4, %r11), \T1
19692145 vpxor \T1, \XMM6, \XMM6
1970
- vmovdqu \XMM6, 16*5(arg2 , %r11)
2146
+ vmovdqu \XMM6, 16*5(arg3 , %r11)
19712147 .if \ENC_DEC == DEC
19722148 vmovdqa \T1, \XMM6
19732149 .endif
19742150
1975
- vmovdqu 16*6(arg3, %r11), \T1
2151
+ vmovdqu 16*6(arg4, %r11), \T1
19762152 vpxor \T1, \XMM7, \XMM7
1977
- vmovdqu \XMM7, 16*6(arg2 , %r11)
2153
+ vmovdqu \XMM7, 16*6(arg3 , %r11)
19782154 .if \ENC_DEC == DEC
19792155 vmovdqa \T1, \XMM7
19802156 .endif
19812157
1982
- vmovdqu 16*7(arg3, %r11), \T1
2158
+ vmovdqu 16*7(arg4, %r11), \T1
19832159 vpxor \T1, \XMM8, \XMM8
1984
- vmovdqu \XMM8, 16*7(arg2 , %r11)
2160
+ vmovdqu \XMM8, 16*7(arg3 , %r11)
19852161 .if \ENC_DEC == DEC
19862162 vmovdqa \T1, \XMM8
19872163 .endif
....@@ -2010,9 +2186,9 @@
20102186
20112187 # encrypt 8 blocks at a time
20122188 # ghash the 8 previously encrypted ciphertext blocks
2013
-# arg1, arg2, arg3 are used as pointers only, not modified
2189
+# arg1, arg3, arg4 are used as pointers only, not modified
20142190 # r11 is the data offset value
2015
-.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2191
+.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
20162192
20172193 vmovdqa \XMM1, \T2
20182194 vmovdqa \XMM2, TMP2(%rsp)
....@@ -2096,7 +2272,7 @@
20962272
20972273 #######################################################################
20982274
2099
- vmovdqa HashKey_8(arg1), \T5
2275
+ vmovdqu HashKey_8(arg2), \T5
21002276 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
21012277 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
21022278 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
....@@ -2114,7 +2290,7 @@
21142290 vaesenc \T1, \XMM8, \XMM8
21152291
21162292 vmovdqa TMP2(%rsp), \T1
2117
- vmovdqa HashKey_7(arg1), \T5
2293
+ vmovdqu HashKey_7(arg2), \T5
21182294 vpclmulqdq $0x11, \T5, \T1, \T3
21192295 vpxor \T3, \T4, \T4
21202296
....@@ -2140,7 +2316,7 @@
21402316 #######################################################################
21412317
21422318 vmovdqa TMP3(%rsp), \T1
2143
- vmovdqa HashKey_6(arg1), \T5
2319
+ vmovdqu HashKey_6(arg2), \T5
21442320 vpclmulqdq $0x11, \T5, \T1, \T3
21452321 vpxor \T3, \T4, \T4
21462322
....@@ -2164,7 +2340,7 @@
21642340 vaesenc \T1, \XMM8, \XMM8
21652341
21662342 vmovdqa TMP4(%rsp), \T1
2167
- vmovdqa HashKey_5(arg1), \T5
2343
+ vmovdqu HashKey_5(arg2), \T5
21682344 vpclmulqdq $0x11, \T5, \T1, \T3
21692345 vpxor \T3, \T4, \T4
21702346
....@@ -2189,7 +2365,7 @@
21892365
21902366
21912367 vmovdqa TMP5(%rsp), \T1
2192
- vmovdqa HashKey_4(arg1), \T5
2368
+ vmovdqu HashKey_4(arg2), \T5
21932369 vpclmulqdq $0x11, \T5, \T1, \T3
21942370 vpxor \T3, \T4, \T4
21952371
....@@ -2213,7 +2389,7 @@
22132389 vaesenc \T1, \XMM8, \XMM8
22142390
22152391 vmovdqa TMP6(%rsp), \T1
2216
- vmovdqa HashKey_3(arg1), \T5
2392
+ vmovdqu HashKey_3(arg2), \T5
22172393 vpclmulqdq $0x11, \T5, \T1, \T3
22182394 vpxor \T3, \T4, \T4
22192395
....@@ -2237,7 +2413,7 @@
22372413 vaesenc \T1, \XMM8, \XMM8
22382414
22392415 vmovdqa TMP7(%rsp), \T1
2240
- vmovdqa HashKey_2(arg1), \T5
2416
+ vmovdqu HashKey_2(arg2), \T5
22412417 vpclmulqdq $0x11, \T5, \T1, \T3
22422418 vpxor \T3, \T4, \T4
22432419
....@@ -2264,7 +2440,7 @@
22642440 vaesenc \T5, \XMM8, \XMM8
22652441
22662442 vmovdqa TMP8(%rsp), \T1
2267
- vmovdqa HashKey(arg1), \T5
2443
+ vmovdqu HashKey(arg2), \T5
22682444
22692445 vpclmulqdq $0x00, \T5, \T1, \T3
22702446 vpxor \T3, \T7, \T7
....@@ -2281,17 +2457,34 @@
22812457
22822458 vmovdqu 16*10(arg1), \T5
22832459
2460
+ i = 11
2461
+ setreg
2462
+.rep (\REP-9)
2463
+ vaesenc \T5, \XMM1, \XMM1
2464
+ vaesenc \T5, \XMM2, \XMM2
2465
+ vaesenc \T5, \XMM3, \XMM3
2466
+ vaesenc \T5, \XMM4, \XMM4
2467
+ vaesenc \T5, \XMM5, \XMM5
2468
+ vaesenc \T5, \XMM6, \XMM6
2469
+ vaesenc \T5, \XMM7, \XMM7
2470
+ vaesenc \T5, \XMM8, \XMM8
2471
+
2472
+ vmovdqu 16*i(arg1), \T5
2473
+ i = i + 1
2474
+ setreg
2475
+.endr
2476
+
22842477 i = 0
22852478 j = 1
22862479 setreg
22872480 .rep 8
2288
- vpxor 16*i(arg3, %r11), \T5, \T2
2481
+ vpxor 16*i(arg4, %r11), \T5, \T2
22892482 .if \ENC_DEC == ENC
22902483 vaesenclast \T2, reg_j, reg_j
22912484 .else
22922485 vaesenclast \T2, reg_j, \T3
2293
- vmovdqu 16*i(arg3, %r11), reg_j
2294
- vmovdqu \T3, 16*i(arg2, %r11)
2486
+ vmovdqu 16*i(arg4, %r11), reg_j
2487
+ vmovdqu \T3, 16*i(arg3, %r11)
22952488 .endif
22962489 i = (i+1)
22972490 j = (j+1)
....@@ -2317,14 +2510,14 @@
23172510 vpxor \T2, \T7, \T7 # first phase of the reduction complete
23182511 #######################################################################
23192512 .if \ENC_DEC == ENC
2320
- vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
2321
- vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
2322
- vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
2323
- vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
2324
- vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
2325
- vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
2326
- vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
2327
- vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
2513
+ vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
2514
+ vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
2515
+ vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
2516
+ vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
2517
+ vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
2518
+ vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
2519
+ vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
2520
+ vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
23282521 .endif
23292522
23302523 #######################################################################
....@@ -2361,7 +2554,7 @@
23612554
23622555 ## Karatsuba Method
23632556
2364
- vmovdqa HashKey_8(arg1), \T5
2557
+ vmovdqu HashKey_8(arg2), \T5
23652558
23662559 vpshufd $0b01001110, \XMM1, \T2
23672560 vpshufd $0b01001110, \T5, \T3
....@@ -2375,7 +2568,7 @@
23752568
23762569 ######################
23772570
2378
- vmovdqa HashKey_7(arg1), \T5
2571
+ vmovdqu HashKey_7(arg2), \T5
23792572 vpshufd $0b01001110, \XMM2, \T2
23802573 vpshufd $0b01001110, \T5, \T3
23812574 vpxor \XMM2, \T2, \T2
....@@ -2393,7 +2586,7 @@
23932586
23942587 ######################
23952588
2396
- vmovdqa HashKey_6(arg1), \T5
2589
+ vmovdqu HashKey_6(arg2), \T5
23972590 vpshufd $0b01001110, \XMM3, \T2
23982591 vpshufd $0b01001110, \T5, \T3
23992592 vpxor \XMM3, \T2, \T2
....@@ -2411,7 +2604,7 @@
24112604
24122605 ######################
24132606
2414
- vmovdqa HashKey_5(arg1), \T5
2607
+ vmovdqu HashKey_5(arg2), \T5
24152608 vpshufd $0b01001110, \XMM4, \T2
24162609 vpshufd $0b01001110, \T5, \T3
24172610 vpxor \XMM4, \T2, \T2
....@@ -2429,7 +2622,7 @@
24292622
24302623 ######################
24312624
2432
- vmovdqa HashKey_4(arg1), \T5
2625
+ vmovdqu HashKey_4(arg2), \T5
24332626 vpshufd $0b01001110, \XMM5, \T2
24342627 vpshufd $0b01001110, \T5, \T3
24352628 vpxor \XMM5, \T2, \T2
....@@ -2447,7 +2640,7 @@
24472640
24482641 ######################
24492642
2450
- vmovdqa HashKey_3(arg1), \T5
2643
+ vmovdqu HashKey_3(arg2), \T5
24512644 vpshufd $0b01001110, \XMM6, \T2
24522645 vpshufd $0b01001110, \T5, \T3
24532646 vpxor \XMM6, \T2, \T2
....@@ -2465,7 +2658,7 @@
24652658
24662659 ######################
24672660
2468
- vmovdqa HashKey_2(arg1), \T5
2661
+ vmovdqu HashKey_2(arg2), \T5
24692662 vpshufd $0b01001110, \XMM7, \T2
24702663 vpshufd $0b01001110, \T5, \T3
24712664 vpxor \XMM7, \T2, \T2
....@@ -2483,7 +2676,7 @@
24832676
24842677 ######################
24852678
2486
- vmovdqa HashKey(arg1), \T5
2679
+ vmovdqu HashKey(arg2), \T5
24872680 vpshufd $0b01001110, \XMM8, \T2
24882681 vpshufd $0b01001110, \T5, \T3
24892682 vpxor \XMM8, \T2, \T2
....@@ -2536,411 +2729,108 @@
25362729
25372730
25382731
2539
-# combined for GCM encrypt and decrypt functions
2540
-# clobbering all xmm registers
2541
-# clobbering r10, r11, r12, r13, r14, r15
2542
-.macro GCM_ENC_DEC_AVX2 ENC_DEC
2543
-
2544
- #the number of pushes must equal STACK_OFFSET
2545
- push %r12
2546
- push %r13
2547
- push %r14
2548
- push %r15
2549
-
2550
- mov %rsp, %r14
2551
-
2552
-
2553
-
2554
-
2555
- sub $VARIABLE_OFFSET, %rsp
2556
- and $~63, %rsp # align rsp to 64 bytes
2557
-
2558
-
2559
- vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
2560
-
2561
- mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
2562
- and $-16, %r13 # r13 = r13 - (r13 mod 16)
2563
-
2564
- mov %r13, %r12
2565
- shr $4, %r12
2566
- and $7, %r12
2567
- jz _initial_num_blocks_is_0\@
2568
-
2569
- cmp $7, %r12
2570
- je _initial_num_blocks_is_7\@
2571
- cmp $6, %r12
2572
- je _initial_num_blocks_is_6\@
2573
- cmp $5, %r12
2574
- je _initial_num_blocks_is_5\@
2575
- cmp $4, %r12
2576
- je _initial_num_blocks_is_4\@
2577
- cmp $3, %r12
2578
- je _initial_num_blocks_is_3\@
2579
- cmp $2, %r12
2580
- je _initial_num_blocks_is_2\@
2581
-
2582
- jmp _initial_num_blocks_is_1\@
2583
-
2584
-_initial_num_blocks_is_7\@:
2585
- INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2586
- sub $16*7, %r13
2587
- jmp _initial_blocks_encrypted\@
2588
-
2589
-_initial_num_blocks_is_6\@:
2590
- INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2591
- sub $16*6, %r13
2592
- jmp _initial_blocks_encrypted\@
2593
-
2594
-_initial_num_blocks_is_5\@:
2595
- INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2596
- sub $16*5, %r13
2597
- jmp _initial_blocks_encrypted\@
2598
-
2599
-_initial_num_blocks_is_4\@:
2600
- INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2601
- sub $16*4, %r13
2602
- jmp _initial_blocks_encrypted\@
2603
-
2604
-_initial_num_blocks_is_3\@:
2605
- INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2606
- sub $16*3, %r13
2607
- jmp _initial_blocks_encrypted\@
2608
-
2609
-_initial_num_blocks_is_2\@:
2610
- INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2611
- sub $16*2, %r13
2612
- jmp _initial_blocks_encrypted\@
2613
-
2614
-_initial_num_blocks_is_1\@:
2615
- INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2616
- sub $16*1, %r13
2617
- jmp _initial_blocks_encrypted\@
2618
-
2619
-_initial_num_blocks_is_0\@:
2620
- INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2621
-
2622
-
2623
-_initial_blocks_encrypted\@:
2624
- cmp $0, %r13
2625
- je _zero_cipher_left\@
2626
-
2627
- sub $128, %r13
2628
- je _eight_cipher_left\@
2629
-
2630
-
2631
-
2632
-
2633
- vmovd %xmm9, %r15d
2634
- and $255, %r15d
2635
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2636
-
2637
-
2638
-_encrypt_by_8_new\@:
2639
- cmp $(255-8), %r15d
2640
- jg _encrypt_by_8\@
2641
-
2642
-
2643
-
2644
- add $8, %r15b
2645
- GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
2646
- add $128, %r11
2647
- sub $128, %r13
2648
- jne _encrypt_by_8_new\@
2649
-
2650
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2651
- jmp _eight_cipher_left\@
2652
-
2653
-_encrypt_by_8\@:
2654
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2655
- add $8, %r15b
2656
- GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
2657
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2658
- add $128, %r11
2659
- sub $128, %r13
2660
- jne _encrypt_by_8_new\@
2661
-
2662
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2663
-
2664
-
2665
-
2666
-
2667
-_eight_cipher_left\@:
2668
- GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
2669
-
2670
-
2671
-_zero_cipher_left\@:
2672
- cmp $16, arg4
2673
- jl _only_less_than_16\@
2674
-
2675
- mov arg4, %r13
2676
- and $15, %r13 # r13 = (arg4 mod 16)
2677
-
2678
- je _multiple_of_16_bytes\@
2679
-
2680
-        # handle the last <16 Byte block separately
2681
-
2682
-
2683
- vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
2684
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2685
- ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
2686
-
2687
- sub $16, %r11
2688
- add %r13, %r11
2689
- vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
2690
-
2691
- lea SHIFT_MASK+16(%rip), %r12
2692
- sub %r13, %r12 # adjust the shuffle mask pointer
2693
- # to be able to shift 16-r13 bytes
2694
- # (r13 is the number of bytes in plaintext mod 16)
2695
- vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
2696
- vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
2697
- jmp _final_ghash_mul\@
2698
-
2699
-_only_less_than_16\@:
2700
- # check for 0 length
2701
- mov arg4, %r13
2702
- and $15, %r13 # r13 = (arg4 mod 16)
2703
-
2704
- je _multiple_of_16_bytes\@
2705
-
2706
-        # handle the last <16 Byte block separately
2707
-
2708
-
2709
- vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
2710
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2711
- ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
2712
-
2713
-
2714
- lea SHIFT_MASK+16(%rip), %r12
2715
- sub %r13, %r12 # adjust the shuffle mask pointer to be
2716
- # able to shift 16-r13 bytes (r13 is the
2717
- # number of bytes in plaintext mod 16)
2718
-
2719
-_get_last_16_byte_loop\@:
2720
- movb (arg3, %r11), %al
2721
- movb %al, TMP1 (%rsp , %r11)
2722
- add $1, %r11
2723
- cmp %r13, %r11
2724
- jne _get_last_16_byte_loop\@
2725
-
2726
- vmovdqu TMP1(%rsp), %xmm1
2727
-
2728
- sub $16, %r11
2729
-
2730
-_final_ghash_mul\@:
2731
- .if \ENC_DEC == DEC
2732
- vmovdqa %xmm1, %xmm2
2733
- vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
2734
- vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
2735
- vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
2736
- vpand %xmm1, %xmm2, %xmm2
2737
- vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
2738
- vpxor %xmm2, %xmm14, %xmm14
2739
- #GHASH computation for the last <16 Byte block
2740
- GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2741
- sub %r13, %r11
2742
- add $16, %r11
2743
- .else
2744
- vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
2745
- vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
2746
- vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
2747
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2748
- vpxor %xmm9, %xmm14, %xmm14
2749
- #GHASH computation for the last <16 Byte block
2750
- GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2751
- sub %r13, %r11
2752
- add $16, %r11
2753
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
2754
- .endif
2755
-
2756
-
2757
- #############################
2758
- # output r13 Bytes
2759
- vmovq %xmm9, %rax
2760
- cmp $8, %r13
2761
- jle _less_than_8_bytes_left\@
2762
-
2763
- mov %rax, (arg2 , %r11)
2764
- add $8, %r11
2765
- vpsrldq $8, %xmm9, %xmm9
2766
- vmovq %xmm9, %rax
2767
- sub $8, %r13
2768
-
2769
-_less_than_8_bytes_left\@:
2770
- movb %al, (arg2 , %r11)
2771
- add $1, %r11
2772
- shr $8, %rax
2773
- sub $1, %r13
2774
- jne _less_than_8_bytes_left\@
2775
- #############################
2776
-
2777
-_multiple_of_16_bytes\@:
2778
- mov arg7, %r12 # r12 = aadLen (number of bytes)
2779
- shl $3, %r12 # convert into number of bits
2780
- vmovd %r12d, %xmm15 # len(A) in xmm15
2781
-
2782
- shl $3, arg4 # len(C) in bits (*128)
2783
- vmovq arg4, %xmm1
2784
- vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
2785
- vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
2786
-
2787
- vpxor %xmm15, %xmm14, %xmm14
2788
- GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
2789
- vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
2790
-
2791
- mov arg5, %rax # rax = *Y0
2792
- vmovdqu (%rax), %xmm9 # xmm9 = Y0
2793
-
2794
- ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
2795
-
2796
- vpxor %xmm14, %xmm9, %xmm9
2797
-
2798
-
2799
-
2800
-_return_T\@:
2801
- mov arg8, %r10 # r10 = authTag
2802
- mov arg9, %r11 # r11 = auth_tag_len
2803
-
2804
- cmp $16, %r11
2805
- je _T_16\@
2806
-
2807
- cmp $8, %r11
2808
- jl _T_4\@
2809
-
2810
-_T_8\@:
2811
- vmovq %xmm9, %rax
2812
- mov %rax, (%r10)
2813
- add $8, %r10
2814
- sub $8, %r11
2815
- vpsrldq $8, %xmm9, %xmm9
2816
- cmp $0, %r11
2817
- je _return_T_done\@
2818
-_T_4\@:
2819
- vmovd %xmm9, %eax
2820
- mov %eax, (%r10)
2821
- add $4, %r10
2822
- sub $4, %r11
2823
- vpsrldq $4, %xmm9, %xmm9
2824
- cmp $0, %r11
2825
- je _return_T_done\@
2826
-_T_123\@:
2827
- vmovd %xmm9, %eax
2828
- cmp $2, %r11
2829
- jl _T_1\@
2830
- mov %ax, (%r10)
2831
- cmp $2, %r11
2832
- je _return_T_done\@
2833
- add $2, %r10
2834
- sar $16, %eax
2835
-_T_1\@:
2836
- mov %al, (%r10)
2837
- jmp _return_T_done\@
2838
-
2839
-_T_16\@:
2840
- vmovdqu %xmm9, (%r10)
2841
-
2842
-_return_T_done\@:
2843
- mov %r14, %rsp
2844
-
2845
- pop %r15
2846
- pop %r14
2847
- pop %r13
2848
- pop %r12
2849
-.endm
2850
-
2851
-
28522732 #############################################################
2853
-#void aesni_gcm_precomp_avx_gen4
2733
+#void aesni_gcm_init_avx_gen4
28542734 # (gcm_data *my_ctx_data,
2855
-# u8 *hash_subkey)# /* H, the Hash sub key input.
2856
-# Data starts on a 16-byte boundary. */
2857
-#############################################################
2858
-ENTRY(aesni_gcm_precomp_avx_gen4)
2859
- #the number of pushes must equal STACK_OFFSET
2860
- push %r12
2861
- push %r13
2862
- push %r14
2863
- push %r15
2864
-
2865
- mov %rsp, %r14
2866
-
2867
-
2868
-
2869
- sub $VARIABLE_OFFSET, %rsp
2870
- and $~63, %rsp # align rsp to 64 bytes
2871
-
2872
- vmovdqu (arg2), %xmm6 # xmm6 = HashKey
2873
-
2874
- vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
2875
- ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
2876
- vmovdqa %xmm6, %xmm2
2877
- vpsllq $1, %xmm6, %xmm6
2878
- vpsrlq $63, %xmm2, %xmm2
2879
- vmovdqa %xmm2, %xmm1
2880
- vpslldq $8, %xmm2, %xmm2
2881
- vpsrldq $8, %xmm1, %xmm1
2882
- vpor %xmm2, %xmm6, %xmm6
2883
- #reduction
2884
- vpshufd $0b00100100, %xmm1, %xmm2
2885
- vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
2886
- vpand POLY(%rip), %xmm2, %xmm2
2887
- vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
2888
- #######################################################################
2889
- vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
2890
-
2891
-
2892
- PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
2893
-
2894
- mov %r14, %rsp
2895
-
2896
- pop %r15
2897
- pop %r14
2898
- pop %r13
2899
- pop %r12
2900
- ret
2901
-ENDPROC(aesni_gcm_precomp_avx_gen4)
2902
-
2903
-
2904
-###############################################################################
2905
-#void aesni_gcm_enc_avx_gen4(
2906
-# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2907
-# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
2908
-# const u8 *in, /* Plaintext input */
2909
-# u64 plaintext_len, /* Length of data in Bytes for encryption. */
2910
-# u8 *iv, /* Pre-counter block j0: 4 byte salt
2911
-# (from Security Association) concatenated with 8 byte
2912
-# Initialisation Vector (from IPSec ESP Payload)
2913
-# concatenated with 0x00000001. 16-byte aligned pointer. */
2914
-# const u8 *aad, /* Additional Authentication Data (AAD)*/
2915
-# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2916
-# u8 *auth_tag, /* Authenticated Tag output. */
2917
-# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
2918
-# Valid values are 16 (most likely), 12 or 8. */
2919
-###############################################################################
2920
-ENTRY(aesni_gcm_enc_avx_gen4)
2921
- GCM_ENC_DEC_AVX2 ENC
2922
- ret
2923
-ENDPROC(aesni_gcm_enc_avx_gen4)
2924
-
2925
-###############################################################################
2926
-#void aesni_gcm_dec_avx_gen4(
2927
-# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2928
-# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
2929
-# const u8 *in, /* Ciphertext input */
2930
-# u64 plaintext_len, /* Length of data in Bytes for encryption. */
2735
+# gcm_context_data *data,
29312736 # u8 *iv, /* Pre-counter block j0: 4 byte salt
29322737 # (from Security Association) concatenated with 8 byte
29332738 # Initialisation Vector (from IPSec ESP Payload)
29342739 # concatenated with 0x00000001. 16-byte aligned pointer. */
2740
+# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
29352741 # const u8 *aad, /* Additional Authentication Data (AAD)*/
2936
-# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2742
+# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2743
+#############################################################
2744
+SYM_FUNC_START(aesni_gcm_init_avx_gen4)
2745
+ FUNC_SAVE
2746
+ INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
2747
+ FUNC_RESTORE
2748
+ RET
2749
+SYM_FUNC_END(aesni_gcm_init_avx_gen4)
2750
+
2751
+###############################################################################
2752
+#void aesni_gcm_enc_avx_gen4(
2753
+# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2754
+# gcm_context_data *data,
2755
+# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
2756
+# const u8 *in, /* Plaintext input */
2757
+# u64 plaintext_len) /* Length of data in Bytes for encryption. */
2758
+###############################################################################
2759
+SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
2760
+ FUNC_SAVE
2761
+ mov keysize,%eax
2762
+ cmp $32, %eax
2763
+ je key_256_enc_update4
2764
+ cmp $16, %eax
2765
+ je key_128_enc_update4
2766
+ # must be 192
2767
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
2768
+ FUNC_RESTORE
2769
+ RET
2770
+key_128_enc_update4:
2771
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
2772
+ FUNC_RESTORE
2773
+ RET
2774
+key_256_enc_update4:
2775
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
2776
+ FUNC_RESTORE
2777
+ RET
2778
+SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
2779
+
2780
+###############################################################################
2781
+#void aesni_gcm_dec_update_avx_gen4(
2782
+# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2783
+# gcm_context_data *data,
2784
+# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
2785
+# const u8 *in, /* Ciphertext input */
2786
+# u64 plaintext_len) /* Length of data in Bytes for encryption. */
2787
+###############################################################################
2788
+SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
2789
+ FUNC_SAVE
2790
+ mov keysize,%eax
2791
+ cmp $32, %eax
2792
+ je key_256_dec_update4
2793
+ cmp $16, %eax
2794
+ je key_128_dec_update4
2795
+ # must be 192
2796
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
2797
+ FUNC_RESTORE
2798
+ RET
2799
+key_128_dec_update4:
2800
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
2801
+ FUNC_RESTORE
2802
+ RET
2803
+key_256_dec_update4:
2804
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
2805
+ FUNC_RESTORE
2806
+ RET
2807
+SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
2808
+
2809
+###############################################################################
2810
+#void aesni_gcm_finalize_avx_gen4(
2811
+# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2812
+# gcm_context_data *data,
29372813 # u8 *auth_tag, /* Authenticated Tag output. */
29382814 # u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
2939
-# Valid values are 16 (most likely), 12 or 8. */
2815
+# Valid values are 16 (most likely), 12 or 8. */
29402816 ###############################################################################
2941
-ENTRY(aesni_gcm_dec_avx_gen4)
2942
- GCM_ENC_DEC_AVX2 DEC
2943
- ret
2944
-ENDPROC(aesni_gcm_dec_avx_gen4)
2945
-
2946
-#endif /* CONFIG_AS_AVX2 */
2817
+SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
2818
+ FUNC_SAVE
2819
+ mov keysize,%eax
2820
+ cmp $32, %eax
2821
+ je key_256_finalize4
2822
+ cmp $16, %eax
2823
+ je key_128_finalize4
2824
+ # must be 192
2825
+ GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
2826
+ FUNC_RESTORE
2827
+ RET
2828
+key_128_finalize4:
2829
+ GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
2830
+ FUNC_RESTORE
2831
+ RET
2832
+key_256_finalize4:
2833
+ GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
2834
+ FUNC_RESTORE
2835
+ RET
2836
+SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
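###############################################################################
# Editor's note (illustrative sketch, not part of the original file): the gen4
# entry points above mirror the gen2 ones one-for-one; gen4 was historically
# guarded by CONFIG_AS_AVX2 and relies on AVX2, while gen2 needs only AVX
# (both use PCLMULQDQ).  A caller is expected to pick one family from CPU
# features, roughly:
#
#	enum gcm_avx_gen { GCM_AVX_GEN2, GCM_AVX_GEN4 };
#
#	/* hypothetical selector; the real choice is made in the C glue code,
#	 * typically via CPU-feature tests such as boot_cpu_has() */
#	static enum gcm_avx_gen pick_gcm_path(int have_avx2)
#	{
#		return have_avx2 ? GCM_AVX_GEN4 : GCM_AVX_GEN2;
#	}
###############################################################################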