1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
| # SIMD MMX dot product
| # Equivalent to the following C code:
| # long dotprod(signed short *a,signed short *b,int cnt)
| # {
| # long sum = 0;
| # cnt *= 4;
| # while(cnt--)
| # sum += *a++ * *b++;
| # return sum;
| # }
| # a and b should also be 64-bit aligned, or speed will suffer greatly
| # Copyright 1999, Phil Karn KA9Q
| # May be used under the terms of the GNU Lesser General Public License (LGPL)
|
|         .text
|         .globl  dotprod_mmx_assist
|         .type   dotprod_mmx_assist,@function
|
| # ----------------------------------------------------------------------
| # long dotprod_mmx_assist(signed short *a, signed short *b, int cnt)
| #
| # Syntax: GNU as, AT&T operand order, 32-bit x86 with MMX.
| # In:     arguments are read from the stack after the frame is set up:
| #           8(%ebp)  = a    pointer to 16-bit signed samples
| #           12(%ebp) = b    pointer to 16-bit signed samples
| #           16(%ebp) = cnt  number of 4-element groups to process
| # Out:    %eax = sum of a[i]*b[i] over the first 4*cnt elements;
| #         accumulation is done with 32-bit adds, which wrap on overflow.
| #         cnt <= 0 processes nothing and returns 0.
| # Saves:  %ebp, %esi, %edi, %ecx, %ebx are pushed on entry and restored.
| # Uses:   %mm0 (two 32-bit partial sums), %mm1 (scratch); emms is
| #         executed before returning.
| #
| # C equivalent:
| #       long sum = 0;
| #       cnt *= 4;
| #       while(cnt-- > 0)
| #               sum += *a++ * *b++;
| #       return sum;
| # ----------------------------------------------------------------------
| dotprod_mmx_assist:
|         pushl   %ebp
|         movl    %esp,%ebp
|         pushl   %esi
|         pushl   %edi
|         pushl   %ecx
|         pushl   %ebx
|
|         pxor    %mm0,%mm0               # mm0 = 0:0, the two partial sums
|         movl    16(%ebp),%ecx           # ecx = cnt (groups of 4 terms left)
|         movl    8(%ebp),%esi            # esi = a
|         movl    12(%ebp),%edi           # edi = b
|
|         # Unrolled pass: each trip consumes 4 groups = 16 terms = 32 bytes
|         # from each input. The loop is tested at the bottom, so jump to the
|         # test first.
|         jmp     .Lquad_test
|
|         .align 16
| .Lquad_body:
|         movq    (%esi),%mm1             # mm1 = a[3],a[2],a[1],a[0]
|         pmaddwd (%edi),%mm1             # mm1 = a3*b3+a2*b2 , a1*b1+a0*b0
|         paddd   %mm1,%mm0
|
|         movq    8(%esi),%mm1            # terms 4..7
|         pmaddwd 8(%edi),%mm1
|         paddd   %mm1,%mm0
|
|         movq    16(%esi),%mm1           # terms 8..11
|         pmaddwd 16(%edi),%mm1
|         paddd   %mm1,%mm0
|
|         movq    24(%esi),%mm1           # terms 12..15
|         pmaddwd 24(%edi),%mm1
|         paddd   %mm1,%mm0
|
|         addl    $32,%esi                # advance both inputs by 16 shorts
|         addl    $32,%edi
| .Lquad_test:
|         subl    $4,%ecx                 # reserve 4 groups for the next trip
|         jge     .Lquad_body             # signed: still >= 0 means we had >= 4
|
|         addl    $4,%ecx                 # undo the last reservation: 0..3 groups remain
|                                         # (or cnt itself if it was <= 0)
|
|         # Remainder pass: one group (4 terms, 8 bytes per input) per trip.
|         jmp     .Lone_test
| .Lone_body:
|         movq    (%esi),%mm1
|         pmaddwd (%edi),%mm1
|         paddd   %mm1,%mm0
|         addl    $8,%esi
|         addl    $8,%edi
| .Lone_test:
|         subl    $1,%ecx
|         jge     .Lone_body
|
|         # Horizontal add: fold the two 32-bit halves of mm0 into eax.
|         movd    %mm0,%eax               # eax = low partial sum
|         punpckhdq %mm0,%mm0             # copy high partial sum into low half
|         movd    %mm0,%ebx               # ebx = high partial sum
|         addl    %ebx,%eax               # eax = total (return value)
|         emms                            # leave MMX state before returning
|
|         popl    %ebx
|         popl    %ecx
|         popl    %edi
|         popl    %esi
|         leave                           # esp = ebp, then pop ebp
|         ret
|
|