/* ~hc/RK356X_SDK_RELEASE.git */

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2b digest algorithm, NEON accelerated
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */
 
#include <linux/linkage.h>
 
    .text
    .fpu        neon

    // The arguments to blake2b_compress_neon(), in the order they appear in
    // its C prototype (state, block, nblocks, inc).
    STATE        .req    r0
    BLOCK        .req    r1
    NBLOCKS        .req    r2
    INC        .req    r3

    // Pointers to the rotation tables.  They are set up once on entry to
    // blake2b_compress_neon() and read by _blake2b_round each time it needs
    // a vtbl.8 index vector.
    ROR24_TABLE    .req    r4
    ROR16_TABLE    .req    r5

    // The original stack pointer, saved before sp is realigned for the
    // 32-byte spill buffer and restored just before returning.
    ORIG_SP        .req    r6

    // NEON registers which contain the message words of the current block.
    // M_0-M_3 are occasionally used for other purposes too.
    //
    // d16-d31 are the halves of q8-q15, so M_(2k) and M_(2k+1) are the low
    // and high 64 bits of q(8+k).  _blake2b_round borrows M_0 (d16) as the
    // vtbl.8 index register and q8-q9 (M_0-M_3) as temporaries for the
    // rotate by 63, reloading them from the stack copy afterwards.
    M_0        .req    d16
    M_1        .req    d17
    M_2        .req    d18
    M_3        .req    d19
    M_4        .req    d20
    M_5        .req    d21
    M_6        .req    d22
    M_7        .req    d23
    M_8        .req    d24
    M_9        .req    d25
    M_10        .req    d26
    M_11        .req    d27
    M_12        .req    d28
    M_13        .req    d29
    M_14        .req    d30
    M_15        .req    d31
 
    .align        4
    // Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
    // instruction.  This is the most efficient way to implement these
    // rotation amounts with NEON.  (On Cortex-A53 it's the same speed as
    // vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
.Lror24_table:
    .byte        3, 4, 5, 6, 7, 0, 1, 2
.Lror16_table:
    .byte        2, 3, 4, 5, 6, 7, 0, 1
    // The BLAKE2b initialization vector
.Lblake2b_IV:
    .quad        0x6a09e667f3bcc908, 0xbb67ae8584caa73b
    .quad        0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
    .quad        0x510e527fade682d1, 0x9b05688c2b3e6c1f
    .quad        0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
 
// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
// NEON registers q0-q7.  The message block is in q8..q15 (M_0-M_15).  The stack
// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
// (M_0-M_3), so that they can be reloaded if they are used as temporary
// registers.  The macro arguments s0-s15 give the order in which the message
// words are used in this round.  'final' is 1 if this is the final round.
//
// State layout, one 64-bit word per 'd' register:
//    a = v[0..3]   in d0-d3   (q0-q1)
//    b = v[4..7]   in d4-d7   (q2-q3)
//    c = v[8..11]  in d8-d11  (q4-q5)
//    d = v[12..15] in d12-d15 (q6-q7)
//
// Besides q0-q7, the macro reads ROR24_TABLE, ROR16_TABLE and sp, and it
// overwrites M_0-M_3 (q8-q9) as scratch.  q8-q9 hold the message words again
// when the macro ends, except when final=1, in which case they are left
// holding scratch values.
.macro    _blake2b_round    s0, s1, s2, s3, s4, s5, s6, s7, \
            s8, s9, s10, s11, s12, s13, s14, s15, final=0

    // Mix the columns:
    // (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
    // (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).
    //
    // With the layout above, column i uses the same lane of a, b, c and d,
    // so all four columns can be processed with full-width 'q' operations.

    // a += b + m[blake2b_sigma[r][2*i + 0]];
    vadd.u64    q0, q0, q2
    vadd.u64    q1, q1, q3
    vadd.u64    d0, d0, M_\s0
    vadd.u64    d1, d1, M_\s2
    vadd.u64    d2, d2, M_\s4
    vadd.u64    d3, d3, M_\s6

    // d = ror64(d ^ a, 32);
    //
    // Swapping the two 32-bit halves of each 64-bit lane is a rotate by 32.
    veor        q6, q6, q0
    veor        q7, q7, q1
    vrev64.32    q6, q6
    vrev64.32    q7, q7

    // c += d;
    vadd.u64    q4, q4, q6
    vadd.u64    q5, q5, q7

    // b = ror64(b ^ c, 24);
    //
    // M_0 is borrowed to hold the vtbl.8 index vector, which destroys the
    // message word it contained.
    vld1.8        {M_0}, [ROR24_TABLE, :64]
    veor        q2, q2, q4
    veor        q3, q3, q5
    vtbl.8        d4, {d4}, M_0
    vtbl.8        d5, {d5}, M_0
    vtbl.8        d6, {d6}, M_0
    vtbl.8        d7, {d7}, M_0

    // a += b + m[blake2b_sigma[r][2*i + 1]];
    //
    // M_0 got clobbered above, so we have to reload it if any of the four
    // message words this step needs happens to be M_0.  Otherwise we don't
    // need to reload it here, as it will just get clobbered again below.
.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
    vld1.8        {M_0}, [sp, :64]
.endif
    vadd.u64    q0, q0, q2
    vadd.u64    q1, q1, q3
    vadd.u64    d0, d0, M_\s1
    vadd.u64    d1, d1, M_\s3
    vadd.u64    d2, d2, M_\s5
    vadd.u64    d3, d3, M_\s7

    // d = ror64(d ^ a, 16);
    vld1.8        {M_0}, [ROR16_TABLE, :64]
    veor        q6, q6, q0
    veor        q7, q7, q1
    vtbl.8        d12, {d12}, M_0
    vtbl.8        d13, {d13}, M_0
    vtbl.8        d14, {d14}, M_0
    vtbl.8        d15, {d15}, M_0

    // c += d;
    vadd.u64    q4, q4, q6
    vadd.u64    q5, q5, q7

    // b = ror64(b ^ c, 63);
    //
    // This rotation amount isn't a multiple of 8, so it has to be
    // implemented using a pair of shifts, which requires temporary
    // registers.  Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
    //
    // With x = b ^ c held in q8-q9: vshr writes x >> 63 to b, then vsli
    // inserts x << 1 above that bit, giving (x >> 63) | (x << 1).
    veor        q8, q2, q4
    veor        q9, q3, q5
    vshr.u64    q2, q8, #63
    vshr.u64    q3, q9, #63
    vsli.u64    q2, q8, #1
    vsli.u64    q3, q9, #1
    vld1.8        {q8-q9}, [sp, :256]

    // Mix the diagonals:
    // (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
    // (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
    //
    // There are two possible ways to do this: use 'vext' instructions to
    // shift the rows of the matrix so that the diagonals become columns,
    // and undo it afterwards; or just use 64-bit operations on 'd'
    // registers instead of 128-bit operations on 'q' registers.  We use the
    // latter approach, as it performs much better on Cortex-A7.
    //
    // In register terms the four diagonals are
    // (d0, d5, d10, d15), (d1, d6, d11, d12),
    // (d2, d7, d8, d13), and (d3, d4, d9, d14),
    // which is why the operands below are listed in rotated order.

    // a += b + m[blake2b_sigma[r][2*i + 0]];
    vadd.u64    d0, d0, d5
    vadd.u64    d1, d1, d6
    vadd.u64    d2, d2, d7
    vadd.u64    d3, d3, d4
    vadd.u64    d0, d0, M_\s8
    vadd.u64    d1, d1, M_\s10
    vadd.u64    d2, d2, M_\s12
    vadd.u64    d3, d3, M_\s14

    // d = ror64(d ^ a, 32);
    veor        d15, d15, d0
    veor        d12, d12, d1
    veor        d13, d13, d2
    veor        d14, d14, d3
    vrev64.32    d15, d15
    vrev64.32    d12, d12
    vrev64.32    d13, d13
    vrev64.32    d14, d14

    // c += d;
    vadd.u64    d10, d10, d15
    vadd.u64    d11, d11, d12
    vadd.u64    d8, d8, d13
    vadd.u64    d9, d9, d14

    // b = ror64(b ^ c, 24);
    vld1.8        {M_0}, [ROR24_TABLE, :64]
    veor        d5, d5, d10
    veor        d6, d6, d11
    veor        d7, d7, d8
    veor        d4, d4, d9
    vtbl.8        d5, {d5}, M_0
    vtbl.8        d6, {d6}, M_0
    vtbl.8        d7, {d7}, M_0
    vtbl.8        d4, {d4}, M_0

    // a += b + m[blake2b_sigma[r][2*i + 1]];
    //
    // As in the column step, M_0 currently holds the rotation table, so it
    // is reloaded only if one of the words needed here is M_0.
.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
    vld1.8        {M_0}, [sp, :64]
.endif
    vadd.u64    d0, d0, d5
    vadd.u64    d1, d1, d6
    vadd.u64    d2, d2, d7
    vadd.u64    d3, d3, d4
    vadd.u64    d0, d0, M_\s9
    vadd.u64    d1, d1, M_\s11
    vadd.u64    d2, d2, M_\s13
    vadd.u64    d3, d3, M_\s15

    // d = ror64(d ^ a, 16);
    vld1.8        {M_0}, [ROR16_TABLE, :64]
    veor        d15, d15, d0
    veor        d12, d12, d1
    veor        d13, d13, d2
    veor        d14, d14, d3
    vtbl.8        d12, {d12}, M_0
    vtbl.8        d13, {d13}, M_0
    vtbl.8        d14, {d14}, M_0
    vtbl.8        d15, {d15}, M_0

    // c += d;
    vadd.u64    d10, d10, d15
    vadd.u64    d11, d11, d12
    vadd.u64    d8, d8, d13
    vadd.u64    d9, d9, d14

    // b = ror64(b ^ c, 63);
    //
    // The XOR results are written to d16-d19 in the order of the b
    // registers d4-d7 (not in diagonal order), so that the 'q' form shifts
    // below put each rotated word straight back into its own b register.
    veor        d16, d4, d9
    veor        d17, d5, d10
    veor        d18, d6, d11
    veor        d19, d7, d8
    vshr.u64    q2, q8, #63
    vshr.u64    q3, q9, #63
    vsli.u64    q2, q8, #1
    vsli.u64    q3, q9, #1
    // Reloading q8-q9 can be skipped on the final round.
.if ! \final
    vld1.8        {q8-q9}, [sp, :256]
.endif
.endm
 
//
// void blake2b_compress_neon(struct blake2b_state *state,
//                  const u8 *block, size_t nblocks, u32 inc);
//
// Compress 'nblocks' consecutive 128-byte message blocks starting at 'block'
// into state->h.  Before each block the 128-bit counter t is advanced by
// 'inc' and written back to the state.
//
// Only the first three fields of struct blake2b_state are used:
//    u64 h[8];    (inout)
//    u64 t[2];    (inout)
//    u64 f[2];    (in)
//
// Register usage:
//    r0-r3     STATE, BLOCK, NBLOCKS, INC; BLOCK is advanced and NBLOCKS
//              is decremented as blocks are consumed
//    r4, r5    ROR24_TABLE, ROR16_TABLE
//    r6        ORIG_SP
//    r7-r10    scratch for the counter increment and the IV pointer
//    ip        walking pointer into *state
//    q0-q7     state matrix v[0..15]; q0-q3 also carry h[0..7] from one
//              block to the next
//    q8-q15    message words M_0-M_15 (q14-q15 briefly hold t and f)
//
// r4-r10 are saved and restored here.  No NEON register is saved, although
// q0-q15 are all overwritten.
// NOTE(review): presumably the caller is responsible for making NEON usable
// and for preserving the NEON register file -- confirm against the caller.
//
// The block count is tested only after a block has been processed, and
// there is no check for nblocks == 0 on entry.
//
    .align        5
ENTRY(blake2b_compress_neon)
    // Save the core registers that are used as scratch below.
    push        {r4-r10}

    // Allocate a 32-byte stack buffer that is 32-byte aligned.
    // Stepping down by 32 and then clearing the low five bits leaves at
    // least 32 bytes between the new sp and the old one.
    mov        ORIG_SP, sp
    sub        ip, sp, #32
    bic        ip, ip, #31
    mov        sp, ip

    adr        ROR24_TABLE, .Lror24_table
    adr        ROR16_TABLE, .Lror16_table

    // h[0..7] is loaded only once; later iterations reuse the value left in
    // q0-q3 by the fold at the end of the loop.  The post-increments leave
    // ip pointing at t[0].
    mov        ip, STATE
    vld1.64        {q0-q1}, [ip]!        // Load h[0..3]
    vld1.64        {q2-q3}, [ip]!        // Load h[4..7]
.Lnext_block:
    // Here ip points at state->t, and q0-q3 hold the current h[0..7], which
    // doubles as v[0..7] of the new state matrix.
      adr        r10, .Lblake2b_IV
    vld1.64        {q14-q15}, [ip]        // Load t[0..1] and f[0..1]
    vld1.64        {q4-q5}, [r10]!        // Load IV[0..3]
      vmov        r7, r8, d28        // Copy t[0] to (r7, r8)
    vld1.64        {q6-q7}, [r10]        // Load IV[4..7]
      adds        r7, r7, INC        // Increment counter
    // Carry set means the low 32 bits of t[0] wrapped around.
    bcs        .Lslow_inc_ctr
    // Fast path: only the low 32 bits of t[0] changed.
    vmov.i32    d28[0], r7
    vst1.64        {d28}, [ip]        // Update t[0]
.Linc_ctr_done:
    // On both paths q14 now holds the incremented counter, and the same
    // value has been written back to the state.

    // Load the next message block and finish initializing the state matrix
    // 'v'.  Fortunately, there are exactly enough NEON registers to fit the
    // entire state matrix in q0-q7 and the entire message block in q8-15.
    //
    // However, _blake2b_round also needs some extra registers for rotates,
    // so we have to spill some registers.  It's better to spill the message
    // registers than the state registers, as the message doesn't change.
    // Therefore we store a copy of the first 32 bytes of the message block
    // (q8-q9) in an aligned buffer on the stack so that they can be
    // reloaded when needed.  (We could just reload directly from the
    // message buffer, but it's faster to use aligned loads.)
    //
    // q14-q15 still hold t and f until the two veor instructions below have
    // consumed them; only then are they overwritten with the last 32 bytes
    // of the message block.
    vld1.8        {q8-q9}, [BLOCK]!
      veor        q6, q6, q14    // v[12..13] = IV[4..5] ^ t[0..1]
    vld1.8        {q10-q11}, [BLOCK]!
      veor        q7, q7, q15    // v[14..15] = IV[6..7] ^ f[0..1]
    vld1.8        {q12-q13}, [BLOCK]!
    vst1.8        {q8-q9}, [sp, :256]
      mov        ip, STATE        // Point ip back at h for the fold below
    vld1.8        {q14-q15}, [BLOCK]!

    // Execute the rounds.  Each round is provided the order in which it
    // needs to use the message words.
    //
    // There are twelve rounds; the last two repeat the message word orders
    // of the first two.
    _blake2b_round    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    _blake2b_round    14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
    _blake2b_round    11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
    _blake2b_round    7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
    _blake2b_round    9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
    _blake2b_round    2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
    _blake2b_round    12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
    _blake2b_round    13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
    _blake2b_round    6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
    _blake2b_round    10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
    _blake2b_round    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    _blake2b_round    14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
            final=1

    // Fold the final state matrix into the hash chaining value:
    //
    //    for (i = 0; i < 8; i++)
    //        h[i] ^= v[i] ^ v[i + 8];
    //
    // The message registers are no longer needed, so q8-q11 are reused to
    // hold the old h[0..7] read back from the state.
      vld1.64    {q8-q9}, [ip]!        // Load old h[0..3]
    veor        q0, q0, q4        // v[0..1] ^= v[8..9]
    veor        q1, q1, q5        // v[2..3] ^= v[10..11]
      vld1.64    {q10-q11}, [ip]        // Load old h[4..7]
    veor        q2, q2, q6        // v[4..5] ^= v[12..13]
    veor        q3, q3, q7        // v[6..7] ^= v[14..15]
    veor        q0, q0, q8        // v[0..1] ^= h[0..1]
    veor        q1, q1, q9        // v[2..3] ^= h[2..3]
      mov        ip, STATE
      subs        NBLOCKS, NBLOCKS, #1    // nblocks--
      vst1.64    {q0-q1}, [ip]!        // Store new h[0..3]
    veor        q2, q2, q10        // v[4..5] ^= h[4..5]
    veor        q3, q3, q11        // v[6..7] ^= h[6..7]
      vst1.64    {q2-q3}, [ip]!        // Store new h[4..7]

    // Advance to the next block, if there is one.
    //
    // The branch uses the flags set by the subs above.  The final
    // post-increment has left ip pointing at state->t again, as
    // .Lnext_block expects, and q0-q3 hold the new h[0..7].
    bne        .Lnext_block        // nblocks != 0?

    // Drop the aligned buffer, restore the saved registers and return.
    mov        sp, ORIG_SP
    pop        {r4-r10}
    mov        pc, lr

.Lslow_inc_ctr:
    // Handle the case where the counter overflowed its low 32 bits, by
    // carrying the overflow bit into the full 128-bit counter.
    //
    // On entry r7 holds the already-incremented low word of t[0], r8 holds
    // its high word, and the carry flag from the adds is still set.  r9 and
    // r10 receive t[1], and the carry is rippled through r8, r9 and r10.
    vmov        r9, r10, d29
    adcs        r8, r8, #0
    adcs        r9, r9, #0
    adc        r10, r10, #0
    vmov        d28, r7, r8
    vmov        d29, r9, r10
    vst1.64        {q14}, [ip]        // Update t[0] and t[1]
    b        .Linc_ctr_done
ENDPROC(blake2b_compress_neon)