/* SPDX-License-Identifier: GPL-2.0 */
	.section	.text..SHmedia32,"ax"
	.align	2
	.global	__udivdi3
__udivdi3:
	shlri r3,1,r4
	nsb r4,r22
	shlld r3,r22,r6
	shlri r6,49,r5
	movi 0xffffffffffffbaf1,r21 /* .l shift count 17. */
	sub r21,r5,r1
	mmulfx.w r1,r1,r4
	mshflo.w r1,r63,r1
	sub r63,r22,r20 // r63 == 64 % 64
	mmulfx.w r5,r4,r4
	pta large_divisor,tr0
	addi r20,32,r9
	msub.w r1,r4,r1
	madd.w r1,r1,r1
	mmulfx.w r1,r1,r4
	shlri r6,32,r7
	bgt/u r9,r63,tr0 // large_divisor
	mmulfx.w r5,r4,r4
	shlri r2,32+14,r19
	addi r22,-31,r0
	msub.w r1,r4,r1

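/* Aside: the mmulfx.w / msub.w / madd.w sequence above is a 16-bit
   fractional-multiply form of the Newton-Raphson reciprocal recurrence
   x' = x * (2 - d * x), which roughly doubles the number of correct
   bits per step.  A C sketch of one step, assuming 2.30 fixed point
   with the divisor d normalized to [1, 2) and x an estimate of 1/d in
   (0, 1]; the function name is illustrative only:

	#include <stdint.h>

	static uint32_t nr_recip_step(uint32_t d, uint32_t x)
	{
		uint32_t dx = (uint32_t)(((uint64_t)d * x) >> 30);
		// x * (2.0 - d*x), everything in 2.30 fixed point
		return (uint32_t)(((uint64_t)x * ((2u << 30) - dx)) >> 30);
	}
*/
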
	mulu.l r1,r7,r4
	addi r1,-3,r5
	mulu.l r5,r19,r5
	sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
	shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000
			 - or, as the case may be, %0000000000000000 000.11111111111, still */
	muls.l r1,r4,r4 /* leaving at least one sign bit. */
	mulu.l r5,r3,r8
	mshalds.l r1,r21,r1
	shari r4,26,r4
	shlld r8,r0,r8
	add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
	sub r2,r8,r2
	/* Can do second step of 64 : 32 div now, using r1 and the rest in r2. */

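/* Aside: each divide step estimates some quotient bits by multiplying
   the running rest with the reciprocal, then subtracts the matching
   multiple of the divisor from the rest.  A C sketch of the idea for a
   single 32-bit step, where the reciprocal comes from a hardware
   divide purely for illustration (the code above derives its 31-bit
   reciprocal with Newton-Raphson instead); the name is hypothetical:

	#include <stdint.h>

	static uint32_t div_by_recip(uint32_t n, uint32_t d)   // d != 0
	{
		uint64_t r = ((uint64_t)1 << 32) / d;   // floor(2^32 / d)
		// estimate is the true quotient or one less
		uint32_t q = (uint32_t)(((uint64_t)n * r) >> 32);
		if (n - q * d >= d)                     // one fix-up suffices
			q++;
		return q;
	}
*/
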
	shlri r2,22,r21
	mulu.l r21,r1,r21
	shlld r5,r0,r8
	addi r20,30-22,r0
	shlrd r21,r0,r21
	mulu.l r21,r3,r5
	add r8,r21,r8
	mcmpgt.l r21,r63,r21 // See Note 1
	addi r20,30,r0
	mshfhi.l r63,r21,r21
	sub r2,r5,r2
	andc r2,r21,r2

	/* small divisor: need a third divide step */
	mulu.l r2,r1,r7
	ptabs r18,tr0
	addi r2,1,r2
	shlrd r7,r0,r7
	mulu.l r7,r3,r5
	add r8,r7,r8
	sub r2,r3,r2
	cmpgt r2,r5,r5
	add r8,r5,r2
	/* could test r3 here to check for divide by zero. */
	blink tr0,r63

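/* Aside: the divide steps above follow the usual long-division scheme:
   peel the quotient off in fixed-size digits, most significant first,
   keeping a running rest.  A C sketch with k-bit digits, where the
   digit comes from a real divide for clarity rather than from a
   reciprocal multiply; names are illustrative, and k must divide 64
   (e.g. k == 16):

	#include <stdint.h>

	static uint64_t chunked_udiv(uint64_t n, uint32_t d, int k)
	{
		uint64_t q = 0, rest = 0;
		for (int shift = 64 - k; shift >= 0; shift -= k) {
			// bring down the next k bits of the dividend
			rest = (rest << k) | ((n >> shift) & ((((uint64_t)1) << k) - 1));
			uint64_t digit = rest / d;      // always < 2^k
			q = (q << k) | digit;
			rest -= digit * d;
		}
		return q;
	}
*/
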
large_divisor:
	mmulfx.w r5,r4,r4
	shlrd r2,r9,r25
	shlri r25,32,r8
	msub.w r1,r4,r1

	mulu.l r1,r7,r4
	addi r1,-3,r5
	mulu.l r5,r8,r5
	sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
	shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000
			 - or, as the case may be, %0000000000000000 000.11111111111, still */
	muls.l r1,r4,r4 /* leaving at least one sign bit. */
	shlri r5,14-1,r8
	mulu.l r8,r7,r5
	mshalds.l r1,r21,r1
	shari r4,26,r4
	add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
	sub r25,r5,r25
	/* Can do second step of 64 : 32 div now, using r1 and the rest in r25. */

	shlri r25,22,r21
	mulu.l r21,r1,r21
	pta no_lo_adj,tr0
	addi r22,32,r0
	shlri r21,40,r21
	mulu.l r21,r7,r5
	add r8,r21,r8
	shlld r2,r0,r2
	sub r25,r5,r25
	bgtu/u r7,r25,tr0 // no_lo_adj
	addi r8,1,r8
	sub r25,r7,r25
no_lo_adj:
	mextr4 r2,r25,r2

	/* large_divisor: only needs a few adjustments. */
	mulu.l r8,r6,r5
	ptabs r18,tr0
	/* bubble */
	cmpgtu r5,r2,r5
	sub r8,r5,r2
	blink tr0,r63

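/* Aside: the closing cmpgtu / sub pair is the usual one-off
   correction: a quotient estimate built from a truncated reciprocal
   can overshoot the true quotient by at most one, so multiply back and
   subtract the comparison result (0 or 1).  A C sketch, with widths
   and the name purely illustrative; the real code works on the
   normalized divisor, so the product cannot wrap:

	#include <stdint.h>

	static uint32_t fixup_overshoot(uint32_t q, uint32_t d, uint32_t n)
	{
		return q - ((uint64_t)q * d > n);   // q -= 1 iff q*d > n
	}
*/
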
/* Note 1: Shifting the result of the second divide stage so that it
   always fits into 32 bits, while still reducing the rest
   sufficiently, would require a lot of instructions to get the shifts
   just right.  Using the full 64-bit shift result to multiply with the
   divisor would require four extra instructions for the upper 32 bits
   (shift / mulu / shift / sub).  Fortunately, if the upper 32 bits of
   the shift result are nonzero, we know that the rest, after this
   partial result is taken into account, will fit into 32 bits.  So we
   just clear the upper 32 bits of the rest whenever the upper 32 bits
   of the partial result are nonzero. */
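
/* In C terms, the mcmpgt.l / mshfhi.l / andc sequence above amounts to
   the following branch-free mask trick (a sketch; the helper name is
   illustrative):

	#include <stdint.h>

	static uint64_t clamp_rest(uint64_t rest, uint64_t partial)
	{
		uint64_t hi = -(uint64_t)((partial >> 32) != 0);  // 0 or ~0
		return rest & ~(hi << 32);  // drop upper half iff hi is ~0
	}
*/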