/* Intel SIMD (SSE) implementation of Viterbi ACS butterflies
   for the 64-state (k=7) convolutional code
   Copyright 2001 Phil Karn, KA9Q
   This code may be used under the terms of the GNU Lesser General Public License (LGPL)

   int update_viterbi27_blk_sse(struct v27 *vp, unsigned char syms[], int nbits);
*/

# SSE (64-bit integer SIMD) version
# Requires Pentium III or better

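# A minimal caller sketch (hedged: only the prototype above comes from this
# file; NBITS and the field initialization are illustrative, and the real
# setup lives elsewhere in the library):
#
#   struct v27 v;
#   unsigned char syms[2*NBITS];     /* two soft-decision symbols per bit */
#   /* point v.old_metrics/v.new_metrics at the metric buffers and
#      v.dp at a decision array, then: */
#   if (update_viterbi27_blk_sse(&v, syms, NBITS) < 0)
#       /* failed: vp was NULL */;
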
# These are offsets into struct v27, defined in viterbi27.h
        .set DP,128
        .set OLDMETRICS,132
        .set NEWMETRICS,136
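
# The offsets above imply a layout roughly like the following (a sketch
# inferred from this file alone; viterbi27.h is authoritative):
#
#   struct v27 {
#       unsigned char metrics1[64];  /* 64 one-byte path metrics, offset 0   */
#       unsigned char metrics2[64];  /* ping-pong partner,        offset 64  */
#       unsigned long *dp;           /* decision write pointer,   offset 128 */
#       unsigned char *old_metrics;  /* -> metrics1 or metrics2,  offset 132 */
#       unsigned char *new_metrics;  /* -> the other buffer,      offset 136 */
#   };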
        .text
        .global update_viterbi27_blk_sse,Branchtab27_sse
        .type update_viterbi27_blk_sse,@function
        .align 16

update_viterbi27_blk_sse:
        pushl %ebp
        movl %esp,%ebp
        pushl %esi
        pushl %edi
        pushl %edx
        pushl %ebx

        movl 8(%ebp),%edx       # edx = vp
        testl %edx,%edx
        jnz 0f
        movl $-1,%eax           # return -1 on a NULL vp
        jmp err
0:      movl OLDMETRICS(%edx),%esi      # esi -> old metrics
        movl NEWMETRICS(%edx),%edi      # edi -> new metrics
        movl DP(%edx),%edx              # edx -> decisions

1:      movl 16(%ebp),%eax      # eax = bits remaining
        decl %eax
        jl 2f                   # passed zero, we're done
        movl %eax,16(%ebp)

        xorl %eax,%eax
        movl 12(%ebp),%ebx      # ebx = syms
        movb (%ebx),%al
        movd %eax,%mm6          # mm6[0] = first symbol
        movb 1(%ebx),%al
        movd %eax,%mm5          # mm5[0] = second symbol
        addl $2,%ebx
        movl %ebx,12(%ebp)

        punpcklbw %mm6,%mm6     # mm6[1] = mm6[0]
        punpcklbw %mm5,%mm5
        movq thirtyones,%mm7    # constant 31 in every byte (5-bit mask)

        pshufw $0,%mm6,%mm6     # copy low word to upper 3
        pshufw $0,%mm5,%mm5
# mm6 now contains the first symbol in each byte, mm5 the second
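# Worked example of the broadcast idiom above (bytes shown high-to-low):
# starting from mm6 = 00 00 00 00 00 00 00 bb,
#   punpcklbw %mm6,%mm6  ->  00 00 00 00 00 00 bb bb
#   pshufw $0,%mm6,%mm6  ->  bb bb bb bb bb bb bb bb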

# Each invocation of this macro does 8 butterflies in parallel
.macro butterfly GROUP
        # compute branch metrics
        movq Branchtab27_sse+(8*\GROUP),%mm4
        movq Branchtab27_sse+32+(8*\GROUP),%mm3
        pxor %mm6,%mm4          # XOR received symbols against expected
        pxor %mm5,%mm3
        pavgb %mm3,%mm4         # mm4 contains branch metrics
        psrlw $3,%mm4
        pand %mm7,%mm4          # scale to 5 bits (0..31)
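
        # Per byte, the three steps above compute (a C sketch; s0,s1 are
        # the received symbols, g0,g1 the expected ones from Branchtab27_sse):
        #
        #   bm = (((s0 ^ g0) + (s1 ^ g1) + 1) >> 1) >> 3;  /* 0..31 */
        #
        # pavgb rounds the average up; the pand with 31 clears the bits
        # that psrlw (a 16-bit word shift) drags in from the adjacent byte.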

        movq (8*\GROUP)(%esi),%mm0      # incoming path metric, state high bit = 0
        movq ((8*\GROUP)+32)(%esi),%mm3 # incoming path metric, state high bit = 1
        movq %mm0,%mm2
        movq %mm3,%mm1
        paddusb %mm4,%mm0
        paddusb %mm4,%mm3

        # invert branch metrics. This works only because they're 5 bits
        pxor %mm7,%mm4

        paddusb %mm4,%mm1
        paddusb %mm4,%mm2

        # find survivors, leave in mm0,2
        pminub %mm1,%mm0
        pminub %mm3,%mm2
        # get decisions, leave in mm1,3
        pcmpeqb %mm0,%mm1
        pcmpeqb %mm2,%mm3
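
        # The add-compare-select above, per byte lane (a C sketch; old0/old1
        # are the predecessors whose state high bit is 0/1, bm is the 5-bit
        # branch metric and 31-bm its complement):
        #
        #   new_even = min(old0 + bm,        old1 + (31 - bm));
        #   new_odd  = min(old0 + (31 - bm), old1 + bm);
        #
        # pcmpeqb leaves 0xFF in a lane when the old1 path survived (ties go
        # to old1); pmovmskb below packs those flags into decision bits.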

        # interleave and store the new path metrics from mm0,2
        movq %mm0,%mm4
        punpckhbw %mm2,%mm0     # interleave second 8 new metrics
        punpcklbw %mm2,%mm4     # interleave first 8 new metrics
        movq %mm0,(16*\GROUP+8)(%edi)
        movq %mm4,(16*\GROUP)(%edi)

        # interleave decisions, accumulate into %ebx
        movq %mm1,%mm4
        punpckhbw %mm3,%mm1
        punpcklbw %mm3,%mm4
        # Due to an error in the Intel instruction set reference (the register
        # fields are swapped), gas assembles pmovmskb incorrectly.
        # See http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html
        .byte 0x0f,0xd7,0xc1    # pmovmskb %mm1,%eax
        shll $((16*\GROUP+8)&31),%eax
        orl %eax,%ebx
        .byte 0x0f,0xd7,0xc4    # pmovmskb %mm4,%eax
        shll $((16*\GROUP)&31),%eax
        orl %eax,%ebx
.endm

# invoke the macro 4 times for a total of 32 butterflies
        xorl %ebx,%ebx          # clear decisions
        butterfly GROUP=0
        butterfly GROUP=1
        movl %ebx,(%edx)        # stash first 32 decisions
        xorl %ebx,%ebx
        butterfly GROUP=2
        butterfly GROUP=3
        movl %ebx,4(%edx)       # stash second 32 decisions

        addl $8,%edx            # bump decision pointer
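
# Each trellis step thus emits 64 decision bits packed as two 32-bit words
# (8 bytes per step, matching the addl $8 above); a traceback routine would
# index these bits to recover the surviving path.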

        # see if we have to normalize
        movl (%edi),%eax        # extract first output metric
        andl $255,%eax
        cmpl $150,%eax          # is it greater than 150?
        movl $0,%eax            # (mov, not xor: must not clobber the flags)
        jle done                # no, no need to normalize

        # Normalize by finding the smallest metric and subtracting it
        # from all metrics
        movq (%edi),%mm0
        pminub 8(%edi),%mm0
        pminub 16(%edi),%mm0
        pminub 24(%edi),%mm0
        pminub 32(%edi),%mm0
        pminub 40(%edi),%mm0
        pminub 48(%edi),%mm0
        pminub 56(%edi),%mm0
        # mm0 holds the byte-wise minima of the 8 metric groups;
        # crunch down to the single lowest metric
        movq %mm0,%mm1
        psrlq $32,%mm0
        pminub %mm1,%mm0
        movq %mm0,%mm1
        psrlq $16,%mm0
        pminub %mm1,%mm0
        movq %mm0,%mm1
        psrlq $8,%mm0
        pminub %mm1,%mm0
        punpcklbw %mm0,%mm0     # expand the low byte to all 8 bytes
        pshufw $0,%mm0,%mm0
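
        # The reduction above, as a C sketch (min8() is a hypothetical
        # byte-wise minimum of two 8-byte vectors):
        #
        #   v = min8(m[0],m[1]); ...; v = min8(v,m[7]);  /* 64 -> 8 bytes */
        #   v = min8(v, v >> 32);                        /*  8 -> 4       */
        #   v = min8(v, v >> 16);                        /*  4 -> 2       */
        #   v = min8(v, v >> 8);                         /*  2 -> 1       */
        #
        # leaving the global minimum in the low byte, which is then
        # broadcast to all 8 bytes.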

        # mm0 now contains the lowest metric in all 8 bytes;
        # subtract it from every output metric.
        # Trashes %mm7 (it is reloaded from thirtyones each pass through the loop)
.macro PSUBUSBM REG,MEM
        movq \MEM,%mm7
        psubusb \REG,%mm7
        movq %mm7,\MEM
.endm

        PSUBUSBM %mm0,(%edi)
        PSUBUSBM %mm0,8(%edi)
        PSUBUSBM %mm0,16(%edi)
        PSUBUSBM %mm0,24(%edi)
        PSUBUSBM %mm0,32(%edi)
        PSUBUSBM %mm0,40(%edi)
        PSUBUSBM %mm0,48(%edi)
        PSUBUSBM %mm0,56(%edi)

        movd %mm0,%eax
        andl $0xff,%eax         # eax = amount subtracted from every metric

done:   # swap old and new metric pointers
        movl %esi,%eax
        movl %edi,%esi
        movl %eax,%edi
        jmp 1b

2:      emms                    # leave MMX state before returning to C
        movl 8(%ebp),%ebx       # ebx = vp
        # stash metric pointers
        movl %esi,OLDMETRICS(%ebx)
        movl %edi,NEWMETRICS(%ebx)
        movl %edx,DP(%ebx)      # stash incremented value of vp->dp
        xorl %eax,%eax          # return 0 on success
err:    popl %ebx
        popl %edx
        popl %edi
        popl %esi
        popl %ebp

        ret

        .data

        .align 16
thirtyones:
        .byte 31,31,31,31,31,31,31,31