1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
| /*
| Copyright 2003 Richard Curnow, SuperH (UK) Ltd.
|
| This file is subject to the terms and conditions of the GNU General Public
| License. See the file "COPYING" in the main directory of this archive
| for more details.
|
| Tight version of memcpy for the case of just copying a page.
| Prefetch strategy empirically optimised against RTL simulations
| of SH5-101 cut2 eval chip with Cayman board DDR memory.
|
| Parameters:
| r2 : destination effective address (start of page)
| r3 : source effective address (start of page)
|
| Always copies 4096 bytes.
|
| Points to review.
| * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead.
| It seems like the prefetch needs to be at least 4 lines ahead to get
| the data into the cache in time, and the allocos contend with outstanding
| prefetches for the same cache set, so it's better to have the numbers
| different.
| */
|
| .section .text..SHmedia32,"ax"
| .little
|
| .balign 8
| .global copy_page ! exported entry point
| copy_page:
|
| /* Copy 4096 bytes worth of data from r3 to r2.
| Do prefetches 4 lines ahead.
| Do alloco 2 lines ahead
|
| NOTE: both prefetch sequences below are compiled out with "#if 0"
| (tagged TAKum03020), so as written only the alloco look-ahead is
| active, and every alloco is followed by a synco under the same tag.
| TAKum03020 is not explained in this file; presumably it identifies
| a hardware issue or workaround (TODO confirm).
|
| A line is 32 bytes here: the loop copies 32 bytes per iteration and
| "2 lines ahead" corresponds to the 0x40 alloco offset.
|
| Register usage, as set up below:
| r2 : destination pointer, advanced by 32 bytes per iteration
| r3 : source page start, used to form r60 (and by the disabled
| initial prefetch)
| r6 : dest + 3968 (4096 - 4 lines), bound for the disabled prefetch
| r7 : dest + 4032 (4096 - 2 lines), bound for the alloco
| r8 : dest + 4096, end of the destination page (loop bound)
| r60 : source - dest, so r2 + r60 addresses the matching source data
| r61, r62, r23 : r60 + 8, r60 + 16, r60 + 24
| r22 : r60 + 0x80, i.e. 4 lines ahead in the source (only used by
| the disabled prefetch)
| r36-r39 : the four quadwords of the line being copied
| tr0 : return target, loaded from r18
| tr1, tr2, tr3 : local labels 1, 2 and 3
|
| The /l and /u branch suffixes look like likely / unlikely hints
| (TODO confirm against the SH-5 instruction set manual). */
|
| pta 1f, tr1 ! tr1 -> label 1, top of the copy loop
| pta 2f, tr2 ! tr2 -> label 2, only targeted by the disabled prefetch skip
| pta 3f, tr3 ! tr3 -> label 3, target used to skip the alloco
| ptabs r18, tr0 ! tr0 -> return address taken from r18
|
| #if 0
| /* TAKum03020 */
| ld.q r3, 0x00, r63
| ld.q r3, 0x20, r63
| ld.q r3, 0x40, r63
| ld.q r3, 0x60, r63
| #endif
| alloco r2, 0x00 ! allocate the first destination line up front
| synco ! TAKum03020
| alloco r2, 0x20 ! allocate the second destination line up front
| synco ! TAKum03020
|
| movi 3968, r6 ! 3968 = 4096 - 128
| add r2, r6, r6 ! r6 = dest + 3968
| addi r6, 64, r7 ! r7 = dest + 4032
| addi r7, 64, r8 ! r8 = dest + 4096, end of page
| sub r3, r2, r60 ! r60 = source - dest
| addi r60, 8, r61 ! r61 = r60 + 8
| addi r61, 8, r62 ! r62 = r60 + 16
| addi r62, 8, r23 ! r23 = r60 + 24
| addi r60, 0x80, r22 ! r22 = r60 + 128, source offset 4 lines ahead
|
| /* Minimal code size. The extra branches inside the loop don't cost much
| because they overlap with the time spent waiting for prefetches to
| complete.
|
| Each iteration copies one 32-byte line (four quadwords) and the loop
| runs until r2 reaches r8, i.e. 128 iterations for the 4096-byte page.
| Labels 1 and 2 currently mark the same address because the prefetch
| between them is compiled out. */
| 1:
| #if 0
| /* TAKum03020 */
| bge/u r2, r6, tr2 ! skip prefetch for last 4 lines
| ldx.q r2, r22, r63 ! prefetch 4 lines hence
| #endif
| 2:
| bge/u r2, r7, tr3 ! skip alloco for last 2 lines
| alloco r2, 0x40 ! alloc destination line 2 lines ahead
| synco ! TAKum03020
| 3:
| ldx.q r2, r60, r36 ! load source quadword at offset 0 of this line
| ldx.q r2, r61, r37 ! load source quadword at offset 8
| ldx.q r2, r62, r38 ! load source quadword at offset 16
| ldx.q r2, r23, r39 ! load source quadword at offset 24
| st.q r2, 0, r36 ! store the four quadwords to the destination line
| st.q r2, 8, r37
| st.q r2, 16, r38
| st.q r2, 24, r39
| addi r2, 32, r2 ! advance destination pointer to the next line
| bgt/l r8, r2, tr1 ! loop while r2 is still below dest + 4096
|
| blink tr0, r63 ! return
|
|