/* Intel SIMD (SSE) implementation of Viterbi ACS butterflies
   for the 64-state (k=7) convolutional code
   Copyright 2001 Phil Karn, KA9Q
   This code may be used under the terms of the GNU Lesser General Public License (LGPL)

   int update_viterbi27_blk_sse(struct v27 *vp, unsigned char syms[], int nbits);
*/

# SSE (64-bit integer SIMD) version
# Requires Pentium III or better

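# A minimal caller sketch (hedged: only the prototype above comes from this
# file; NBITS and the field initialization are illustrative, and the real
# setup lives elsewhere in the library):
#
#   struct v27 v;
#   unsigned char syms[2*NBITS];     /* two soft-decision symbols per bit */
#   /* point v.old_metrics/v.new_metrics at the metric buffers and
#      v.dp at a decision array, then: */
#   if (update_viterbi27_blk_sse(&v, syms, NBITS) < 0)
#       /* failed: vp was NULL */;
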
# These are offsets into struct v27, defined in viterbi27.h
        .set DP,128
        .set OLDMETRICS,132
        .set NEWMETRICS,136
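
# The offsets above imply a layout roughly like the following (a sketch
# inferred from this file alone; viterbi27.h is authoritative):
#
#   struct v27 {
#       unsigned char metrics1[64];  /* 64 one-byte path metrics, offset 0   */
#       unsigned char metrics2[64];  /* ping-pong partner,        offset 64  */
#       unsigned long *dp;           /* decision write pointer,   offset 128 */
#       unsigned char *old_metrics;  /* -> metrics1 or metrics2,  offset 132 */
#       unsigned char *new_metrics;  /* -> the other buffer,      offset 136 */
#   };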
        .text
        .global update_viterbi27_blk_sse,Branchtab27_sse
        .type update_viterbi27_blk_sse,@function
        .align 16

update_viterbi27_blk_sse:
        pushl %ebp
        movl %esp,%ebp
        pushl %esi
        pushl %edi
        pushl %edx
        pushl %ebx

        movl 8(%ebp),%edx       # edx = vp
        testl %edx,%edx
        jnz 0f
        movl $-1,%eax           # return -1 on a NULL vp
        jmp err
0:      movl OLDMETRICS(%edx),%esi      # esi -> old metrics
        movl NEWMETRICS(%edx),%edi      # edi -> new metrics
        movl DP(%edx),%edx              # edx -> decisions

1:      movl 16(%ebp),%eax      # eax = bits remaining
        decl %eax
        jl 2f                   # passed zero, we're done
        movl %eax,16(%ebp)

        xorl %eax,%eax
        movl 12(%ebp),%ebx      # ebx = syms
        movb (%ebx),%al
        movd %eax,%mm6          # mm6[0] = first symbol
        movb 1(%ebx),%al
        movd %eax,%mm5          # mm5[0] = second symbol
        addl $2,%ebx
        movl %ebx,12(%ebp)

        punpcklbw %mm6,%mm6     # mm6[1] = mm6[0]
        punpcklbw %mm5,%mm5
        movq thirtyones,%mm7    # constant 31 in every byte (5-bit mask)

        pshufw $0,%mm6,%mm6     # copy low word to upper 3
        pshufw $0,%mm5,%mm5
# mm6 now contains the first symbol in each byte, mm5 the second
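# Worked example of the broadcast idiom above (bytes shown high-to-low):
# starting from mm6 = 00 00 00 00 00 00 00 bb,
#   punpcklbw %mm6,%mm6  ->  00 00 00 00 00 00 bb bb
#   pshufw $0,%mm6,%mm6  ->  bb bb bb bb bb bb bb bb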

# Each invocation of this macro does 8 butterflies in parallel
.macro butterfly GROUP
        # compute branch metrics
        movq Branchtab27_sse+(8*\GROUP),%mm4
        movq Branchtab27_sse+32+(8*\GROUP),%mm3
        pxor %mm6,%mm4          # XOR received symbols against expected
        pxor %mm5,%mm3
        pavgb %mm3,%mm4         # mm4 contains branch metrics
        psrlw $3,%mm4
        pand %mm7,%mm4          # scale to 5 bits (0..31)
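
        # Per byte, the three steps above compute (a C sketch; s0,s1 are
        # the received symbols, g0,g1 the expected ones from Branchtab27_sse):
        #
        #   bm = (((s0 ^ g0) + (s1 ^ g1) + 1) >> 1) >> 3;  /* 0..31 */
        #
        # pavgb rounds the average up; the pand with 31 clears the bits
        # that psrlw (a 16-bit word shift) drags in from the adjacent byte.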

        movq (8*\GROUP)(%esi),%mm0      # incoming path metric, state high bit = 0
        movq ((8*\GROUP)+32)(%esi),%mm3 # incoming path metric, state high bit = 1
        movq %mm0,%mm2
        movq %mm3,%mm1
        paddusb %mm4,%mm0
        paddusb %mm4,%mm3

        # invert branch metrics. This works only because they're 5 bits
        pxor %mm7,%mm4

        paddusb %mm4,%mm1
        paddusb %mm4,%mm2

        # find survivors, leave in mm0,2
        pminub %mm1,%mm0
        pminub %mm3,%mm2
        # get decisions, leave in mm1,3
        pcmpeqb %mm0,%mm1
        pcmpeqb %mm2,%mm3
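
        # The add-compare-select above, per byte lane (a C sketch; old0/old1
        # are the predecessors whose state high bit is 0/1, bm is the 5-bit
        # branch metric and 31-bm its complement):
        #
        #   new_even = min(old0 + bm,        old1 + (31 - bm));
        #   new_odd  = min(old0 + (31 - bm), old1 + bm);
        #
        # pcmpeqb leaves 0xFF in a lane when the old1 path survived (ties go
        # to old1); pmovmskb below packs those flags into decision bits.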

        # interleave and store the new path metrics from mm0,2
        movq %mm0,%mm4
        punpckhbw %mm2,%mm0     # interleave second 8 new metrics
        punpcklbw %mm2,%mm4     # interleave first 8 new metrics
        movq %mm0,(16*\GROUP+8)(%edi)
        movq %mm4,(16*\GROUP)(%edi)

        # interleave decisions, accumulate into %ebx
        movq %mm1,%mm4
        punpckhbw %mm3,%mm1
        punpcklbw %mm3,%mm4
        # Due to an error in the Intel instruction set reference (the register
        # fields are swapped), gas assembles pmovmskb incorrectly.
        # See http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html
        .byte 0x0f,0xd7,0xc1    # pmovmskb %mm1,%eax
        shll $((16*\GROUP+8)&31),%eax
        orl %eax,%ebx
        .byte 0x0f,0xd7,0xc4    # pmovmskb %mm4,%eax
        shll $((16*\GROUP)&31),%eax
        orl %eax,%ebx
.endm

# invoke the macro 4 times for a total of 32 butterflies
        xorl %ebx,%ebx          # clear decisions
        butterfly GROUP=0
        butterfly GROUP=1
        movl %ebx,(%edx)        # stash first 32 decisions
        xorl %ebx,%ebx
        butterfly GROUP=2
        butterfly GROUP=3
        movl %ebx,4(%edx)       # stash second 32 decisions

        addl $8,%edx            # bump decision pointer
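
# Each trellis step thus emits 64 decision bits packed as two 32-bit words
# (8 bytes per step, matching the addl $8 above); a traceback routine would
# index these bits to recover the surviving path.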

        # see if we have to normalize
        movl (%edi),%eax        # extract first output metric
        andl $255,%eax
        cmpl $150,%eax          # is it greater than 150?
        movl $0,%eax            # (mov, not xor: must not clobber the flags)
        jle done                # no, no need to normalize

        # Normalize by finding the smallest metric and subtracting it
        # from all metrics
        movq (%edi),%mm0
        pminub 8(%edi),%mm0
        pminub 16(%edi),%mm0
        pminub 24(%edi),%mm0
        pminub 32(%edi),%mm0
        pminub 40(%edi),%mm0
        pminub 48(%edi),%mm0
        pminub 56(%edi),%mm0
        # mm0 holds the byte-wise minima of the 8 metric groups;
        # crunch down to the single lowest metric
        movq %mm0,%mm1
        psrlq $32,%mm0
        pminub %mm1,%mm0
        movq %mm0,%mm1
        psrlq $16,%mm0
        pminub %mm1,%mm0
        movq %mm0,%mm1
        psrlq $8,%mm0
        pminub %mm1,%mm0
        punpcklbw %mm0,%mm0     # expand the low byte to all 8 bytes
        pshufw $0,%mm0,%mm0
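
        # The reduction above, as a C sketch (min8() is a hypothetical
        # byte-wise minimum of two 8-byte vectors):
        #
        #   v = min8(m[0],m[1]); ...; v = min8(v,m[7]);  /* 64 -> 8 bytes */
        #   v = min8(v, v >> 32);                        /*  8 -> 4       */
        #   v = min8(v, v >> 16);                        /*  4 -> 2       */
        #   v = min8(v, v >> 8);                         /*  2 -> 1       */
        #
        # leaving the global minimum in the low byte, which is then
        # broadcast to all 8 bytes.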

        # mm0 now contains the lowest metric in all 8 bytes;
        # subtract it from every output metric.
        # Trashes %mm7 (it is reloaded from thirtyones each pass through the loop)
.macro PSUBUSBM REG,MEM
        movq \MEM,%mm7
        psubusb \REG,%mm7
        movq %mm7,\MEM
.endm

        PSUBUSBM %mm0,(%edi)
        PSUBUSBM %mm0,8(%edi)
        PSUBUSBM %mm0,16(%edi)
        PSUBUSBM %mm0,24(%edi)
        PSUBUSBM %mm0,32(%edi)
        PSUBUSBM %mm0,40(%edi)
        PSUBUSBM %mm0,48(%edi)
        PSUBUSBM %mm0,56(%edi)

        movd %mm0,%eax
        andl $0xff,%eax         # eax = amount subtracted from every metric

done:   # swap old and new metric pointers
        movl %esi,%eax
        movl %edi,%esi
        movl %eax,%edi
        jmp 1b

2:      emms                    # leave MMX state before returning to C
        movl 8(%ebp),%ebx       # ebx = vp
        # stash metric pointers
        movl %esi,OLDMETRICS(%ebx)
        movl %edi,NEWMETRICS(%ebx)
        movl %edx,DP(%ebx)      # stash incremented value of vp->dp
        xorl %eax,%eax          # return 0 on success
err:    popl %ebx
        popl %edx
        popl %edi
        popl %esi
        popl %ebp

        ret

        .data

        .align 16
thirtyones:
        .byte 31,31,31,31,31,31,31,31