1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
| # MMX assist routines for sumsq
| # Copyright 2001 Phil Karn, KA9Q
| # May be used under the terms of the GNU Public License (GPL)
|
| .text
|
| # ----------------------------------------------------------------------
| # long long sumsq_mmx_assist(signed short *in,int cnt);
| #
| # Evaluate the sum of squares of signed 16-bit input samples, with a
| # full 64-bit accumulator.
| #
| # In:    8(%ebp)  = in   pointer to the samples (read with movq)
| #        12(%ebp) = cnt  number of samples
| # Out:   %edx:%eax = 64-bit sum of squares
| # Saves: %esi, %ecx and %ebx are pushed on entry and restored on exit.
| # MMX:   %mm0 and %mm6 are scratch; emms is executed before returning.
| #
| # Only complete groups of 8 samples are processed: the loop stops as
| # soon as fewer than 8 remain, so the trailing cnt % 8 samples are NOT
| # included in the result (and cnt <= 7, including negative values,
| # yields 0).
| # NOTE(review): presumably the C caller adds the leftover samples
| # itself - confirm against the caller.
| # ----------------------------------------------------------------------
| .global sumsq_mmx_assist
| .type sumsq_mmx_assist,@function
| .align 16
| sumsq_mmx_assist:
|         pushl   %ebp
|         movl    %esp,%ebp
|         pushl   %esi
|         pushl   %ecx
|         pushl   %ebx
|
|         movl    8(%ebp),%esi            # esi = in
|         movl    12(%ebp),%ecx           # ecx = samples remaining
|         xorl    %eax,%eax               # edx:eax = running 64-bit sum
|         xorl    %edx,%edx
|
| # Each pmaddwd dword is the sum of two squares, i.e. at most
| # 2 * 32768**2 = 2**31, which still fits an unsigned 32-bit lane.
| # Adding two such dwords with paddd can reach 2**32 and wrap to zero
| # (all four samples equal to -32768), so every dword is widened into
| # edx:eax individually with add/adc instead.
| 1:      subl    $8,%ecx
|         jl      2f                      # fewer than 8 samples left: done
|         movq    (%esi),%mm0             # S0 S1 S2 S3
|         pmaddwd %mm0,%mm0               # (S0^2+S1^2) (S2^2+S3^2)
|         movq    8(%esi),%mm6            # S4 S5 S6 S7
|         pmaddwd %mm6,%mm6               # (S4^2+S5^2) (S6^2+S7^2)
|
|         movd    %mm0,%ebx               # S0^2+S1^2
|         addl    %ebx,%eax
|         adcl    $0,%edx                 # propagate carry into the high word
|         psrlq   $32,%mm0
|         movd    %mm0,%ebx               # S2^2+S3^2
|         addl    %ebx,%eax
|         adcl    $0,%edx
|
|         movd    %mm6,%ebx               # S4^2+S5^2
|         addl    %ebx,%eax
|         adcl    $0,%edx
|         psrlq   $32,%mm6
|         movd    %mm6,%ebx               # S6^2+S7^2
|         addl    %ebx,%eax
|         adcl    $0,%edx
|
|         addl    $16,%esi                # advance past the 8 samples just used
|         jmp     1b
|
| 2:      emms                            # leave the FPU usable for the caller
|         popl    %ebx
|         popl    %ecx
|         popl    %esi
|         popl    %ebp
|         ret
|
| # ----------------------------------------------------------------------
| # long sumsq_wd_mmx_assist(signed short *in,int cnt);
| #
| # Fast sum of squares of signed 16-bit input samples, 32-bit result.
| #
| # In:    8(%ebp)  = in   pointer to the samples (read with movq)
| #        12(%ebp) = cnt  number of samples
| # Out:   %eax = sum of squares, modulo 2**32
| # Uses:  %ecx, %edx, %mm0-%mm2 (not preserved); %esi is saved/restored.
| #        emms is executed before returning.
| #
| # The running totals live in the two 32-bit lanes of %mm2 and are folded
| # together with a 32-bit add at the end, with no overflow detection.
| # This is the quick version: it is only safe for small numbers of small
| # input values.
| # Samples are consumed in whole groups of 8; a trailing cnt % 8 samples
| # are not included.
| # ----------------------------------------------------------------------
| .global sumsq_wd_mmx_assist
| .type sumsq_wd_mmx_assist,@function
| .align 16
| sumsq_wd_mmx_assist:
|         pushl   %ebp
|         movl    %esp,%ebp
|         pushl   %esi
|
|         pxor    %mm2,%mm2               # mm2 = two 32-bit partial sums, zeroed
|         movl    12(%ebp),%ecx           # ecx = samples remaining
|         movl    8(%ebp),%esi            # esi = in
|
|         subl    $8,%ecx
|         jl      .Lsumsq_wd_done         # not even one full group of 8
|
| .Lsumsq_wd_loop:
|         movq    (%esi),%mm0             # S0 S1 S2 S3
|         movq    8(%esi),%mm1            # S4 S5 S6 S7
|         pmaddwd %mm0,%mm0               # (S0*S0+S1*S1) (S2*S2+S3*S3)
|         pmaddwd %mm1,%mm1               # (S4*S4+S5*S5) (S6*S6+S7*S7)
|         paddd   %mm0,%mm2               # accumulate first half
|         paddd   %mm1,%mm2               # accumulate second half
|
|         addl    $16,%esi                # step over the 8 samples just used
|         subl    $8,%ecx
|         jge     .Lsumsq_wd_loop         # another full group is available
|
| .Lsumsq_wd_done:
|         movd    %mm2,%eax               # low-lane sum
|         psrlq   $32,%mm2
|         movd    %mm2,%edx               # high-lane sum
|         emms                            # leave the FPU usable for the caller
|         addl    %edx,%eax               # eax = total of both lanes
|
|         popl    %esi
|         popl    %ebp
|         ret
|
|