1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
| # SIMD SSE2 dot product
| # Equivalent to the following C code:
| # long dotprod(signed short *a,signed short *b,int cnt)
| # {
| # long sum = 0;
| # cnt *= 8;
| # while(cnt--)
| # sum += *a++ * *b++;
| # return sum;
| # }
| # a and b must be 128-bit aligned
| # Copyright 2001, Phil Karn KA9Q
| # May be used under the terms of the GNU Lesser General Public License (LGPL)
|
| .text
| .global dotprod_sse2_assist
| .type dotprod_sse2_assist,@function
| # long dotprod_sse2_assist(signed short *a, signed short *b, int cnt)
| # ABI:   i386 System V cdecl; args on the stack at 8/12/16(%ebp)
| # In:    a, b = 16-byte aligned int16 arrays; cnt = number of 8-term blocks
| # Out:   eax = sum of a[i]*b[i] over all cnt*8 terms (32-bit result)
| # Preserves ebx/esi/edi/ebp (callee-saved); ecx is also saved/restored
| # even though the i386 ABI treats it as caller-saved (harmless extra save).
| dotprod_sse2_assist:
| pushl %ebp
| movl %esp,%ebp
| pushl %esi
| pushl %edi
| pushl %ecx # not required by the ABI (caller-saved); saved anyway
| pushl %ebx
| movl 8(%ebp),%esi # esi = a
| movl 12(%ebp),%edi # edi = b
| movl 16(%ebp),%ecx # ecx = cnt, in units of 8 int16 terms (16 bytes)
| pxor %xmm0,%xmm0 # clear running sum (four 32-bit dword lanes)
|
| # SSE2 dot product loop unrolled 4 times, crunching 32 terms per loop
| .align 16
| .Loop1: subl $4,%ecx
| jl .Loop1Done
|
| movdqa (%esi),%xmm1
| pmaddwd (%edi),%xmm1 # 8 int16 products, pairwise-added into 4 dwords
| paddd %xmm1,%xmm0
|
| movdqa 16(%esi),%xmm1
| pmaddwd 16(%edi),%xmm1
| paddd %xmm1,%xmm0
|
| movdqa 32(%esi),%xmm1
| pmaddwd 32(%edi),%xmm1
| paddd %xmm1,%xmm0
|
| movdqa 48(%esi),%xmm1
| addl $64,%esi # advance a past 4 blocks (64 bytes)
| pmaddwd 48(%edi),%xmm1
| addl $64,%edi # advance b past 4 blocks (64 bytes)
| paddd %xmm1,%xmm0
|
| jmp .Loop1
| .Loop1Done:
|
| addl $4,%ecx # undo final subtract: ecx = leftover blocks (0..3)
|
| # SSE2 dot product loop, not unrolled, crunching 8 terms per loop
| # This could be redone as Duff's Device on the unrolled loop above
| .Loop2: subl $1,%ecx
| jl .Loop2Done
|
| movdqa (%esi),%xmm1
| addl $16,%esi
| pmaddwd (%edi),%xmm1
| addl $16,%edi
| paddd %xmm1,%xmm0
| jmp .Loop2
| .Loop2Done:
|
| # Horizontal reduction: fold the four dword lanes of xmm0 into eax
| movdqa %xmm0,%xmm1
| psrldq $8,%xmm0 # shift right 8 bytes: lanes 2,3 drop to lanes 0,1
| paddd %xmm1,%xmm0 # lanes 0,1 now hold (0+2),(1+3)
| movd %xmm0,%eax # low 32 bits (lane 0+2) to eax
| psrldq $4,%xmm0
| movd %xmm0,%ebx # next lane (1+3) to ebx
| addl %ebx,%eax # eax = grand total
|
| popl %ebx
| popl %ecx
| popl %edi
| popl %esi
| movl %ebp,%esp
| popl %ebp
| ret
|
|