1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
| # SIMD SSE2 dot product
| # Equivalent to the following C code:
| # long dotprod(signed short *a,signed short *b,int cnt)
| # {
| # long sum = 0;
| # cnt *= 8;
| # while(cnt--)
| # sum += *a++ * *b++;
| # return sum;
| # }
| # a and b must be 128-bit aligned
| # Copyright 2001, Phil Karn KA9Q
| # May be used under the terms of the GNU Lesser General Public License (LGPL)
|
| .text
| .global dotprod_sse2_assist
| .type dotprod_sse2_assist,@function
| # long dotprod_sse2_assist(signed short *a, signed short *b, int cnt)
| # ABI:   i386 System V cdecl; args on the stack at 8/12/16(%ebp)
| # In:    a, b = 16-byte aligned int16 arrays; cnt = number of 8-term blocks
| # Out:   eax = sum of a[i]*b[i] over all cnt*8 terms (32-bit result)
| # Preserves ebx/esi/edi/ebp (callee-saved); ecx is also saved/restored
| # even though the i386 ABI treats it as caller-saved (harmless extra save).
| dotprod_sse2_assist:
| pushl %ebp
| movl %esp,%ebp
| pushl %esi
| pushl %edi
| pushl %ecx # not required by the ABI (caller-saved); saved anyway
| pushl %ebx
| movl 8(%ebp),%esi # esi = a
| movl 12(%ebp),%edi # edi = b
| movl 16(%ebp),%ecx # ecx = cnt, in units of 8 int16 terms (16 bytes)
| pxor %xmm0,%xmm0 # clear running sum (four 32-bit dword lanes)
|
| # SSE2 dot product loop unrolled 4 times, crunching 32 terms per loop
| .align 16
| .Loop1: subl $4,%ecx
| jl .Loop1Done
|
| movdqa (%esi),%xmm1
| pmaddwd (%edi),%xmm1 # 8 int16 products, pairwise-added into 4 dwords
| paddd %xmm1,%xmm0
|
| movdqa 16(%esi),%xmm1
| pmaddwd 16(%edi),%xmm1
| paddd %xmm1,%xmm0
|
| movdqa 32(%esi),%xmm1
| pmaddwd 32(%edi),%xmm1
| paddd %xmm1,%xmm0
|
| movdqa 48(%esi),%xmm1
| addl $64,%esi # advance a past 4 blocks (64 bytes)
| pmaddwd 48(%edi),%xmm1
| addl $64,%edi # advance b past 4 blocks (64 bytes)
| paddd %xmm1,%xmm0
|
| jmp .Loop1
| .Loop1Done:
|
| addl $4,%ecx # undo final subtract: ecx = leftover blocks (0..3)
|
| # SSE2 dot product loop, not unrolled, crunching 8 terms per loop
| # This could be redone as Duff's Device on the unrolled loop above
| .Loop2: subl $1,%ecx
| jl .Loop2Done
|
| movdqa (%esi),%xmm1
| addl $16,%esi
| pmaddwd (%edi),%xmm1
| addl $16,%edi
| paddd %xmm1,%xmm0
| jmp .Loop2
| .Loop2Done:
|
| # Horizontal reduction: fold the four dword lanes of xmm0 into eax
| movdqa %xmm0,%xmm1
| psrldq $8,%xmm0 # shift right 8 bytes: lanes 2,3 drop to lanes 0,1
| paddd %xmm1,%xmm0 # lanes 0,1 now hold (0+2),(1+3)
| movd %xmm0,%eax # low 32 bits (lane 0+2) to eax
| psrldq $4,%xmm0
| movd %xmm0,%ebx # next lane (1+3) to ebx
| addl %ebx,%eax # eax = grand total
|
| popl %ebx
| popl %ecx
| popl %edi
| popl %esi
| movl %ebp,%esp
| popl %ebp
| ret
|
|