~hc/RK356X_SDK_RELEASE.git

/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on the glibc cortex-strings work originally authored by
 * Linaro, which can be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */
 
 
/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *    x0 - dest
 *    x1 - src
 *    x2 - n
 * Returns:
 *    x0 - dest
 */
/*
 * Symbolic register names used by the copy code below.  x5 is not
 * assigned a name and is not referenced by the code in this file.
 */
dstin    .req    x0    /* dest as passed in; only read below, so x0 is left holding dest */
src    .req    x1    /* source pointer */
count    .req    x2    /* byte count; biased by -128 (and further) in the bulk paths */
tmp1    .req    x3    /* scratch, 64-bit view */
tmp1w    .req    w3    /* scratch, 32-bit view of tmp1 */
tmp2    .req    x4    /* scratch; holds the bytes-to-alignment value at entry */
tmp2w    .req    w4    /* 32-bit view of tmp2 (not referenced in the code visible here) */
dst    .req    x6    /* working destination pointer, initialised from dstin */
 
/* Four register pairs; each pair stages 16 bytes, all four stage one 64-byte block. */
A_l    .req    x7
A_h    .req    x8
B_l    .req    x9
B_h    .req    x10
C_l    .req    x11
C_h    .req    x12
D_l    .req    x13
D_h    .req    x14
 
    /*
    * Entry point of the copy sequence.
    *
    * NOTE(review): ldrb1/strb1, ldrh1/strh1, ldr1/str1 and ldp1/stp1 are
    * not defined in this file.  From the way they are used here each one
    * appears to transfer the operand-sized datum and then advance the
    * pointer by the immediate (post-increment) - confirm against the
    * macro definitions supplied by the including file.
    */
    /* Work on a copy of the destination pointer so dstin (x0) is left intact. */
    mov    dst, dstin
    cmp    count, #16
    /*
    * When the length is less than 16 bytes, skip the alignment step and
    * copy directly; those accesses are not aligned.
    */
    b.lo    .Ltiny15
 
    /* tmp2 = (-src) & 15, i.e. the distance from src up to the next 16-byte boundary. */
    neg    tmp2, src
    ands    tmp2, tmp2, #15/* Bytes to reach alignment (0..15). */
    b.eq    .LSrcAligned
    /* Cannot underflow: count >= 16 on this path and tmp2 <= 15. */
    sub    count, count, tmp2
    /*
    * Copy the leading bytes from src to dst in increasing address
    * order.  This way the risk of overwriting not-yet-read source data
    * is eliminated when the distance between src and dst is less than
    * 16.  Each set bit of tmp2 selects one transfer (1, 2, 4, then 8
    * bytes); handling the low bits first means every source access in
    * this sequence is naturally aligned for its size.
    */
    tbz    tmp2, #0, 1f
    ldrb1    tmp1w, src, #1
    strb1    tmp1w, dst, #1
1:
    tbz    tmp2, #1, 2f
    ldrh1    tmp1w, src, #2
    strh1    tmp1w, dst, #2
2:
    tbz    tmp2, #2, 3f
    ldr1    tmp1w, src, #4
    str1    tmp1w, dst, #4
3:
    tbz    tmp2, #3, .LSrcAligned
    ldr1    tmp1, src, #8
    str1    tmp1, dst, #8
 
/*
 * src is 16-byte aligned from here on when reached via the alignment code
 * above.  Copies of 64 bytes or more branch to the bulk path; anything
 * smaller drops into the tail code.
 */
.LSrcAligned:
    cmp    count, #64
    /*
    * NOTE(review): b.ge is a signed comparison, so a count with bit 63
    * set would be treated as "less than 64" here - confirm callers never
    * pass such a length.
    */
    b.ge    .Lcpy_over64
    /*
    * Deal with small copies quickly by dropping straight into the
    * exit block.
    */
.Ltail63:
    /*
    * Copy up to 48 bytes of data. At this point we only need the
    * bottom 6 bits of count to be accurate (the bulk paths branch here
    * with count biased by a multiple of 64).
    *
    * Bits 5:4 of count give the number of whole 16-byte chunks:
    *   0x30 -> fall through all three ldp1/stp1 pairs (48 bytes)
    *   0x20 -> enter at 1: (32 bytes)
    *   0x10 -> enter at 2: (16 bytes)
    *   0x00 -> nothing to do here, go straight to .Ltiny15
    */
    ands    tmp1, count, #0x30
    b.eq    .Ltiny15
    cmp    tmp1w, #0x20
    b.eq    1f
    b.lt    2f
    ldp1    A_l, A_h, src, #16
    stp1    A_l, A_h, dst, #16
1:
    ldp1    A_l, A_h, src, #16
    stp1    A_l, A_h, dst, #16
2:
    ldp1    A_l, A_h, src, #16
    stp1    A_l, A_h, dst, #16
    /* Falls through into .Ltiny15 to copy the remaining 0..15 bytes. */
/*
 * Copy the final 0..15 bytes.  Only bits 3:0 of count are examined; each
 * set bit selects one transfer of 8, 4, 2 and 1 bytes respectively.
 */
.Ltiny15:
    /*
    * Prefer to break one ldp/stp into several loads/stores that access
    * memory in increasing address order, rather than loading/storing 16
    * bytes from (src-16) to (dst-16) after moving src back to an aligned
    * address, which is what the original cortex memcpy does. If the
    * original memcpy process were kept here, memmove would need to
    * satisfy the precondition that the src address is at least 16 bytes
    * bigger than the dst address, otherwise some source data would be
    * overwritten when memmove calls memcpy directly. To make memmove
    * simpler and to decouple memcpy from memmove, the original process
    * was withdrawn.
    */
    tbz    count, #3, 1f
    ldr1    tmp1, src, #8
    str1    tmp1, dst, #8
1:
    tbz    count, #2, 2f
    ldr1    tmp1w, src, #4
    str1    tmp1w, dst, #4
2:
    tbz    count, #1, 3f
    ldrh1    tmp1w, src, #2
    strh1    tmp1w, dst, #2
3:
    tbz    count, #0, .Lexitfunc
    ldrb1    tmp1w, src, #1
    strb1    tmp1w, dst, #1
 
    /* Skip over the bulk-copy code that follows. */
    b    .Lexitfunc
 
/*
 * Reached with count >= 64.  count is biased by -128 here: a non-negative
 * result means at least 128 bytes remain and the pipelined loop is used.
 */
.Lcpy_over64:
    subs    count, count, #128
    b.ge    .Lcpy_body_large
    /*
    * Less than 128 bytes to copy, so handle 64 here and then jump
    * to the tail.
    */
    ldp1    A_l, A_h, src, #16
    stp1    A_l, A_h, dst, #16
    ldp1    B_l, B_h, src, #16
    ldp1    C_l, C_h, src, #16
    stp1    B_l, B_h, dst, #16
    stp1    C_l, C_h, dst, #16
    ldp1    D_l, D_h, src, #16
    stp1    D_l, D_h, dst, #16
 
    /*
    * count is still biased by -128, but 128 and the 64 bytes just
    * copied are both multiples of 64, so the low 6 bits of count equal
    * the number of bytes left (0..63).  Zero means we are done.
    */
    tst    count, #0x3f
    b.ne    .Ltail63
    b    .Lexitfunc
 
    /*
    * Critical loop.  Start at a new cache line boundary.  Assuming
    * 64 bytes per line this ensures the entire loop is in one line.
    */
    /* L1_CACHE_SHIFT is not defined in this file; it comes from the including file's headers. */
    .p2align    L1_CACHE_SHIFT
/*
 * Bulk copy for 128 bytes or more.  On entry count holds
 * (bytes remaining - 128), which is >= 0.
 */
.Lcpy_body_large:
    /* pre-get 64 bytes data. */
    ldp1    A_l, A_h, src, #16
    ldp1    B_l, B_h, src, #16
    ldp1    C_l, C_h, src, #16
    ldp1    D_l, D_h, src, #16
1:
    /*
    * interlace the load of next 64 bytes data block with store of the last
    * loaded 64 bytes data.
    */
    stp1    A_l, A_h, dst, #16
    ldp1    A_l, A_h, src, #16
    stp1    B_l, B_h, dst, #16
    ldp1    B_l, B_h, src, #16
    stp1    C_l, C_h, dst, #16
    ldp1    C_l, C_h, src, #16
    stp1    D_l, D_h, dst, #16
    ldp1    D_l, D_h, src, #16
    /*
    * Each pass loads 64 bytes ahead of what has been stored.  Keep
    * looping while, after this pass, at least another 64 bytes remain
    * to be loaded (count stays >= 0).
    */
    subs    count, count, #64
    b.ge    1b
    /* Drain: store the last 64 bytes that were loaded but not yet written. */
    stp1    A_l, A_h, dst, #16
    stp1    B_l, B_h, dst, #16
    stp1    C_l, C_h, dst, #16
    stp1    D_l, D_h, dst, #16
 
    /*
    * count is negative here but only ever changed by multiples of 64
    * relative to the true remainder, so its low 6 bits are the number
    * of trailing bytes (0..63) still to copy.
    */
    tst    count, #0x3f
    b.ne    .Ltail63
/* Common exit label; nothing follows it in this file, the includer continues from here. */
.Lexitfunc: