// Copyright 2014 The Go Authors. All rights reserved.
|
// Use of this source code is governed by a BSD-style
|
// license that can be found in the LICENSE file.
|
|
// +build ppc64 ppc64le
|
|
#include "textflag.h"
|
|
// func memmove(to, from unsafe.Pointer, n uintptr)
//
// Copies n bytes from 'from' to 'to'. The two regions may overlap: when the
// destination starts inside the source region (0 <= to-from < n, unsigned)
// the copy runs from the high addresses down, otherwise from the low
// addresses up.
//
// Register roles:
//	R3	destination cursor (starts at 'to')
//	R4	source cursor (starts at 'from')
//	R5	n, the total byte count
//	R6	doubleword (8-byte) count, n>>3; later the leftover doublewords
//	R7	trailing byte count, n&7
//	R8	scratch: data being moved, 32-byte chunk count, index constant 16
//	R9	scratch: dest - src distance on the backward path
//	CR0	record-form results (n&7, chunk counts), CR1 doublewords vs 0,
//		CR2 overlap test
//	VS32, VS33	16-byte vector temporaries for the 32-byte loops
TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
	MOVD	to+0(FP), R3
	MOVD	from+8(FP), R4
	MOVD	n+16(FP), R5

	// Split the length into doublewords and trailing bytes so that
	// the bulk of the copy can use wider moves.
check:
	ANDCC	$7, R5, R7		// R7 = n & 7: trailing bytes; CR0[EQ] if none
	SRD	$3, R5, R6		// R6 = n >> 3: doublewords to copy
	CMP	R6, $0, CR1		// CR1[EQ] set if no doublewords to copy

	// Determine overlap by subtracting dest - src and comparing against the
	// length. The unsigned compare also catches the case where src and dest
	// are in different kinds of storage (such as stack and static data), so
	// that a backward move is not done when it is not necessary.

	SUB	R4, R3, R8		// R8 = dest - src
	CMPU	R8, R5, CR2		// unsigned: (dest - src) < len?
	BC	12, 8, backward		// BLT CR2, backward

	// No harmful overlap: copy forward.

	BC	12, 6, noforwardlarge	// BEQ CR1, noforwardlarge (no doublewords)
	SRDCC	$2, R6, R8		// R8 = number of 32-byte chunks
	BNE	forward32setup		// at least one chunk: use the vector loop
	MOVD	R6, CTR			// fewer than 4 doublewords: CTR = R6

	// Move one doubleword per iteration, CTR iterations.

forward8:
	MOVD	0(R4), R8		// load doubleword
	ADD	$8, R4
	MOVD	R8, 0(R3)		// store doubleword
	ADD	$8, R3
	BC	16, 0, forward8		// bdnz
	BR	noforwardlarge		// handle trailing bytes

	// Prepare for moves of 32 bytes at a time.

forward32setup:
	DCBTST	(R3)			// cache hint: dest will be stored to
	DCBT	(R4)			// cache hint: source will be loaded
	MOVD	R8, CTR			// CTR = 32-byte chunk count
	MOVD	$16, R8			// R8 = index of the second 16-byte half

forward32:
	LXVD2X	(R4+R0), VS32		// load 16 bytes
	LXVD2X	(R4+R8), VS33		// load the next 16 bytes
	ADD	$32, R4
	STXVD2X	VS32, (R3+R0)		// store 16 bytes
	STXVD2X	VS33, (R3+R8)		// store the next 16 bytes
	ADD	$32, R3			// bump up for next chunk
	BC	16, 0, forward32	// bdnz
	RLDCLCC	$61, R5, $3, R6		// R6 = remaining doublewords; sets CR0
	BEQ	noforwardlarge		// none left: go copy trailing bytes
	MOVD	R6, CTR			// CTR = remaining doublewords
	BR	forward8

noforwardlarge:
	CMP	R7, $0			// any trailing bytes?
	BC	4, 1, LR		// ble lr: return if none

forwardtail:
	MOVD	R7, CTR			// CTR = trailing byte count

forwardtailloop:
	MOVBZ	0(R4), R8		// move single bytes
	ADD	$1, R4
	MOVBZ	R8, 0(R3)
	ADD	$1, R3
	BC	16, 0, forwardtailloop	// bdnz
	RET

backward:
	// Copying backward proceeds by copying R7 bytes, then R6 doublewords.
	// R3 and R4 are first advanced to the end of the destination/source
	// buffers respectively and then moved back as we copy.

	ADD	R5, R4, R4		// R4 = end of source
	ADD	R3, R5, R3		// R3 = end of dest

	BEQ	nobackwardtail		// CR0 from the ANDCC above: no trailing bytes

	MOVD	R7, CTR			// CTR = trailing byte count

backwardtailloop:
	MOVBZ	-1(R4), R8		// load the byte just below the cursor
	SUB	$1, R4
	MOVBZ	R8, -1(R3)
	SUB	$1, R3
	BC	16, 0, backwardtailloop	// bdnz

nobackwardtail:
	BC	4, 5, LR		// ble CR1 lr: return if no doublewords

backwardlarge:
	MOVD	R6, CTR			// default: doubleword loop, CTR = R6
	// Use the vector loop only when dest is at least 32 bytes above src
	// and there is at least one full 32-byte chunk. The distance must be
	// dest - src: on this path dest >= src, so src - dest is never
	// positive and would always fail the signed compare below.
	SUB	R4, R3, R9		// R9 = dest - src (both cursors moved equally)
	CMP	R9, $32			// distance >= 32?
	BLT	backwardlargeloop	// no: doubleword loop
	SRDCC	$2, R6, R8		// R8 = number of 32-byte chunks
	BNE	backward32setup

backwardlargeloop:
	MOVD	-8(R4), R8		// load the doubleword below the cursor
	SUB	$8, R4
	MOVD	R8, -8(R3)
	SUB	$8, R3
	BC	16, 0, backwardlargeloop	// bdnz
	RET

backward32setup:
	MOVD	R8, CTR			// CTR = 32-byte chunk count
	MOVD	$16, R8			// R8 = index of the second 16-byte half

backward32loop:
	SUB	$32, R4
	SUB	$32, R3
	LXVD2X	(R4+R0), VS32		// load 16 bytes
	LXVD2X	(R4+R8), VS33		// load the next 16 bytes
	STXVD2X	VS32, (R3+R0)		// store 16 bytes
	STXVD2X	VS33, (R3+R8)		// store the next 16 bytes
	BC	16, 0, backward32loop	// bdnz
	// Only the doublewords not covered by the 32-byte chunks remain.
	// CR1 still reflects the original count, so it cannot be used here;
	// reduce R6 to the remainder and test that instead.
	ANDCC	$3, R6, R6		// R6 = leftover doublewords (0..3); sets CR0
	BC	12, 2, LR		// beq lr: return if none are left
	MOVD	R6, CTR			// CTR = leftover doublewords
	BR	backwardlargeloop
|