; vim:filetype=nasm ts=8 ; libFLAC - Free Lossless Audio Codec library ; Copyright (C) 2001-2009 Josh Coalson ; Copyright (C) 2011-2016 Xiph.Org Foundation ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions ; are met: ; ; - Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; ; - Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the distribution. ; ; - Neither the name of the Xiph.org Foundation nor the names of its ; contributors may be used to endorse or promote products derived from ; this software without specific prior written permission. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "nasm.h"

data_section

cglobal FLAC__lpc_compute_autocorrelation_asm_ia32
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4_old
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8_old
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12_old
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16_old
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
cglobal FLAC__lpc_restore_signal_asm_ia32
cglobal FLAC__lpc_restore_signal_asm_ia32_mmx
cglobal FLAC__lpc_restore_signal_wide_asm_ia32

code_section

; **********************************************************************
;
; C reference for the routine below:
;
; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
; {
;	FLAC__real d;
;	unsigned sample, coeff;
;	const unsigned limit = data_len - lag;
;
;	FLAC__ASSERT(lag > 0);
;	FLAC__ASSERT(lag <= data_len);
;
;	for(coeff = 0; coeff < lag; coeff++)
;		autoc[coeff] = 0.0;
;	for(sample = 0; sample <= limit; sample++) {
;		d = data[sample];
;		for(coeff = 0; coeff < lag; coeff++)
;			autoc[coeff] += d * data[sample+coeff];
;	}
;	for(; sample < data_len; sample++) {
;		d = data[sample];
;		for(coeff = 0; coeff < data_len - sample; coeff++)
;			autoc[coeff] += d * data[sample+coeff];
;	}
; }
;
; NOTE(review): ia32 cdecl, x87 implementation.  The inner loop over `coeff`
; is fully unrolled (33 fld/fmul/fadd/fstp groups for coeff 32..0) and is
; entered through a computed jump in edx so that exactly `lag` groups
; execute.  The jump-target arithmetic below depends on the exact encoded
; size of every group (11 bytes each, with documented exceptions at the
; first and last groups) — do NOT reorder, re-encode, or insert
; instructions inside the unrolled region.
	ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32
	;[esp + 28] == autoc[]
	;[esp + 24] == lag
	;[esp + 20] == data_len
	;[esp + 16] == data[]
	;ASSERT(lag > 0)
	;ASSERT(lag <= 33)
	;ASSERT(lag <= data_len)
.begin:
	push esi
	push edi
	push ebx

	;	for(coeff = 0; coeff < lag; coeff++)
	;		autoc[coeff] = 0.0;
	mov edi, [esp + 28]		; edi == autoc
	mov ecx, [esp + 24]		; ecx = # of dwords (=lag) of 0 to write
	xor eax, eax			; zero source for rep stosd
	rep stosd

	;	const unsigned limit = data_len - lag;
	mov eax, [esp + 24]		; eax == lag
	mov ecx, [esp + 20]
	sub ecx, eax			; ecx == limit
	mov edi, [esp + 28]		; edi == autoc
	mov esi, [esp + 16]		; esi == data
	inc ecx				; we are looping <= limit so we add one to the counter

	;	for(sample = 0; sample <= limit; sample++) {
	;		d = data[sample];
	;		for(coeff = 0; coeff < lag; coeff++)
	;			autoc[coeff] += d * data[sample+coeff];
	;	}
	fld dword [esi]			; ST = d <- data[sample]
	; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
	lea edx, [eax + eax*2]
	neg edx
	lea edx, [eax + edx*4 + .jumper1_0 - .get_eip1]
	call .mov_eip_to_ebx		; PIC-safe way to read eip into ebx
.get_eip1:
	add edx, ebx
	inc edx				; compensate for the shorter opcode on the last iteration
	inc edx				; compensate for the shorter opcode on the last iteration
	inc edx				; compensate for the shorter opcode on the last iteration
	cmp eax, 33
	jne .loop1_start
	sub edx, byte 9			; compensate for the longer opcodes on the first iteration
.loop1_start:
	jmp edx				; enter the unrolled loop at group (32 - lag + 1)

.mov_eip_to_ebx:
	mov ebx, [esp]			; ebx = return address = eip at .get_eip*
	ret

	; ---- unrolled inner loop, one 11-byte group per coeff (32 down to 0) ----
	fld st0				; ST = d d
	fmul dword [esi + (32*4)]	; ST = d*data[sample+32] d	WATCHOUT: not a byte displacement here!
	fadd dword [edi + (32*4)]	; ST = autoc[32]+d*data[sample+32] d	WATCHOUT: not a byte displacement here!
	fstp dword [edi + (32*4)]	; autoc[32]+=d*data[sample+32]  ST = d	WATCHOUT: not a byte displacement here!
	fld st0				; ST = d d
	fmul dword [esi + (31*4)]	; ST = d*data[sample+31] d
	fadd dword [edi + (31*4)]	; ST = autoc[31]+d*data[sample+31] d
	fstp dword [edi + (31*4)]	; autoc[31]+=d*data[sample+31]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (30*4)]	; ST = d*data[sample+30] d
	fadd dword [edi + (30*4)]	; ST = autoc[30]+d*data[sample+30] d
	fstp dword [edi + (30*4)]	; autoc[30]+=d*data[sample+30]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (29*4)]	; ST = d*data[sample+29] d
	fadd dword [edi + (29*4)]	; ST = autoc[29]+d*data[sample+29] d
	fstp dword [edi + (29*4)]	; autoc[29]+=d*data[sample+29]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (28*4)]	; ST = d*data[sample+28] d
	fadd dword [edi + (28*4)]	; ST = autoc[28]+d*data[sample+28] d
	fstp dword [edi + (28*4)]	; autoc[28]+=d*data[sample+28]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (27*4)]	; ST = d*data[sample+27] d
	fadd dword [edi + (27*4)]	; ST = autoc[27]+d*data[sample+27] d
	fstp dword [edi + (27*4)]	; autoc[27]+=d*data[sample+27]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (26*4)]	; ST = d*data[sample+26] d
	fadd dword [edi + (26*4)]	; ST = autoc[26]+d*data[sample+26] d
	fstp dword [edi + (26*4)]	; autoc[26]+=d*data[sample+26]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (25*4)]	; ST = d*data[sample+25] d
	fadd dword [edi + (25*4)]	; ST = autoc[25]+d*data[sample+25] d
	fstp dword [edi + (25*4)]	; autoc[25]+=d*data[sample+25]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (24*4)]	; ST = d*data[sample+24] d
	fadd dword [edi + (24*4)]	; ST = autoc[24]+d*data[sample+24] d
	fstp dword [edi + (24*4)]	; autoc[24]+=d*data[sample+24]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (23*4)]	; ST = d*data[sample+23] d
	fadd dword [edi + (23*4)]	; ST = autoc[23]+d*data[sample+23] d
	fstp dword [edi + (23*4)]	; autoc[23]+=d*data[sample+23]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (22*4)]	; ST = d*data[sample+22] d
	fadd dword [edi + (22*4)]	; ST = autoc[22]+d*data[sample+22] d
	fstp dword [edi + (22*4)]	; autoc[22]+=d*data[sample+22]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (21*4)]	; ST = d*data[sample+21] d
	fadd dword [edi + (21*4)]	; ST = autoc[21]+d*data[sample+21] d
	fstp dword [edi + (21*4)]	; autoc[21]+=d*data[sample+21]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (20*4)]	; ST = d*data[sample+20] d
	fadd dword [edi + (20*4)]	; ST = autoc[20]+d*data[sample+20] d
	fstp dword [edi + (20*4)]	; autoc[20]+=d*data[sample+20]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (19*4)]	; ST = d*data[sample+19] d
	fadd dword [edi + (19*4)]	; ST = autoc[19]+d*data[sample+19] d
	fstp dword [edi + (19*4)]	; autoc[19]+=d*data[sample+19]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (18*4)]	; ST = d*data[sample+18] d
	fadd dword [edi + (18*4)]	; ST = autoc[18]+d*data[sample+18] d
	fstp dword [edi + (18*4)]	; autoc[18]+=d*data[sample+18]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (17*4)]	; ST = d*data[sample+17] d
	fadd dword [edi + (17*4)]	; ST = autoc[17]+d*data[sample+17] d
	fstp dword [edi + (17*4)]	; autoc[17]+=d*data[sample+17]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (16*4)]	; ST = d*data[sample+16] d
	fadd dword [edi + (16*4)]	; ST = autoc[16]+d*data[sample+16] d
	fstp dword [edi + (16*4)]	; autoc[16]+=d*data[sample+16]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (15*4)]	; ST = d*data[sample+15] d
	fadd dword [edi + (15*4)]	; ST = autoc[15]+d*data[sample+15] d
	fstp dword [edi + (15*4)]	; autoc[15]+=d*data[sample+15]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (14*4)]	; ST = d*data[sample+14] d
	fadd dword [edi + (14*4)]	; ST = autoc[14]+d*data[sample+14] d
	fstp dword [edi + (14*4)]	; autoc[14]+=d*data[sample+14]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (13*4)]	; ST = d*data[sample+13] d
	fadd dword [edi + (13*4)]	; ST = autoc[13]+d*data[sample+13] d
	fstp dword [edi + (13*4)]	; autoc[13]+=d*data[sample+13]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (12*4)]	; ST = d*data[sample+12] d
	fadd dword [edi + (12*4)]	; ST = autoc[12]+d*data[sample+12] d
	fstp dword [edi + (12*4)]	; autoc[12]+=d*data[sample+12]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (11*4)]	; ST = d*data[sample+11] d
	fadd dword [edi + (11*4)]	; ST = autoc[11]+d*data[sample+11] d
	fstp dword [edi + (11*4)]	; autoc[11]+=d*data[sample+11]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (10*4)]	; ST = d*data[sample+10] d
	fadd dword [edi + (10*4)]	; ST = autoc[10]+d*data[sample+10] d
	fstp dword [edi + (10*4)]	; autoc[10]+=d*data[sample+10]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 9*4)]	; ST = d*data[sample+9] d
	fadd dword [edi + ( 9*4)]	; ST = autoc[9]+d*data[sample+9] d
	fstp dword [edi + ( 9*4)]	; autoc[9]+=d*data[sample+9]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 8*4)]	; ST = d*data[sample+8] d
	fadd dword [edi + ( 8*4)]	; ST = autoc[8]+d*data[sample+8] d
	fstp dword [edi + ( 8*4)]	; autoc[8]+=d*data[sample+8]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 7*4)]	; ST = d*data[sample+7] d
	fadd dword [edi + ( 7*4)]	; ST = autoc[7]+d*data[sample+7] d
	fstp dword [edi + ( 7*4)]	; autoc[7]+=d*data[sample+7]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 6*4)]	; ST = d*data[sample+6] d
	fadd dword [edi + ( 6*4)]	; ST = autoc[6]+d*data[sample+6] d
	fstp dword [edi + ( 6*4)]	; autoc[6]+=d*data[sample+6]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 5*4)]	; ST = d*data[sample+5] d	(comment fixed: was mislabelled +4)
	fadd dword [edi + ( 5*4)]	; ST = autoc[5]+d*data[sample+5] d
	fstp dword [edi + ( 5*4)]	; autoc[5]+=d*data[sample+5]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 4*4)]	; ST = d*data[sample+4] d
	fadd dword [edi + ( 4*4)]	; ST = autoc[4]+d*data[sample+4] d
	fstp dword [edi + ( 4*4)]	; autoc[4]+=d*data[sample+4]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 3*4)]	; ST = d*data[sample+3] d
	fadd dword [edi + ( 3*4)]	; ST = autoc[3]+d*data[sample+3] d
	fstp dword [edi + ( 3*4)]	; autoc[3]+=d*data[sample+3]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 2*4)]	; ST = d*data[sample+2] d
	fadd dword [edi + ( 2*4)]	; ST = autoc[2]+d*data[sample+2] d
	fstp dword [edi + ( 2*4)]	; autoc[2]+=d*data[sample+2]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 1*4)]	; ST = d*data[sample+1] d
	fadd dword [edi + ( 1*4)]	; ST = autoc[1]+d*data[sample+1] d
	fstp dword [edi + ( 1*4)]	; autoc[1]+=d*data[sample+1]  ST = d
	fld st0				; ST = d d
	fmul dword [esi]		; ST = d*data[sample] d		WATCHOUT: no displacement byte here!
	fadd dword [edi]		; ST = autoc[0]+d*data[sample] d	WATCHOUT: no displacement byte here!
	fstp dword [edi]		; autoc[0]+=d*data[sample]  ST = d	WATCHOUT: no displacement byte here!
.jumper1_0:
	fstp st0			; pop d, ST = empty
	add esi, byte 4			; sample++
	dec ecx
	jz .loop1_end
	fld dword [esi]			; ST = d <- data[sample]
	jmp edx
.loop1_end:

	;	for(; sample < data_len; sample++) {
	;		d = data[sample];
	;		for(coeff = 0; coeff < data_len - sample; coeff++)
	;			autoc[coeff] += d * data[sample+coeff];
	;	}
	mov ecx, [esp + 24]		; ecx <- lag
	dec ecx				; ecx <- lag - 1
	jz near .end			; skip loop if 0 (i.e. lag == 1)

	fld dword [esi]			; ST = d <- data[sample]
	mov eax, ecx			; eax <- lag - 1 == data_len - sample the first time through
	; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
	lea edx, [eax + eax*2]
	neg edx
	lea edx, [eax + edx*4 + .jumper2_0 - .get_eip2]
	call .mov_eip_to_ebx
.get_eip2:
	add edx, ebx
	inc edx				; compensate for the shorter opcode on the last iteration
	inc edx				; compensate for the shorter opcode on the last iteration
	inc edx				; compensate for the shorter opcode on the last iteration
	jmp edx

	; ---- second unrolled loop, one 11-byte group per coeff (31 down to 0) ----
	fld st0				; ST = d d
	fmul dword [esi + (31*4)]	; ST = d*data[sample+31] d
	fadd dword [edi + (31*4)]	; ST = autoc[31]+d*data[sample+31] d
	fstp dword [edi + (31*4)]	; autoc[31]+=d*data[sample+31]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (30*4)]	; ST = d*data[sample+30] d
	fadd dword [edi + (30*4)]	; ST = autoc[30]+d*data[sample+30] d
	fstp dword [edi + (30*4)]	; autoc[30]+=d*data[sample+30]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (29*4)]	; ST = d*data[sample+29] d
	fadd dword [edi + (29*4)]	; ST = autoc[29]+d*data[sample+29] d
	fstp dword [edi + (29*4)]	; autoc[29]+=d*data[sample+29]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (28*4)]	; ST = d*data[sample+28] d
	fadd dword [edi + (28*4)]	; ST = autoc[28]+d*data[sample+28] d
	fstp dword [edi + (28*4)]	; autoc[28]+=d*data[sample+28]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (27*4)]	; ST = d*data[sample+27] d
	fadd dword [edi + (27*4)]	; ST = autoc[27]+d*data[sample+27] d
	fstp dword [edi + (27*4)]	; autoc[27]+=d*data[sample+27]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (26*4)]	; ST = d*data[sample+26] d
	fadd dword [edi + (26*4)]	; ST = autoc[26]+d*data[sample+26] d
	fstp dword [edi + (26*4)]	; autoc[26]+=d*data[sample+26]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (25*4)]	; ST = d*data[sample+25] d
	fadd dword [edi + (25*4)]	; ST = autoc[25]+d*data[sample+25] d
	fstp dword [edi + (25*4)]	; autoc[25]+=d*data[sample+25]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (24*4)]	; ST = d*data[sample+24] d
	fadd dword [edi + (24*4)]	; ST = autoc[24]+d*data[sample+24] d
	fstp dword [edi + (24*4)]	; autoc[24]+=d*data[sample+24]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (23*4)]	; ST = d*data[sample+23] d
	fadd dword [edi + (23*4)]	; ST = autoc[23]+d*data[sample+23] d
	fstp dword [edi + (23*4)]	; autoc[23]+=d*data[sample+23]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (22*4)]	; ST = d*data[sample+22] d
	fadd dword [edi + (22*4)]	; ST = autoc[22]+d*data[sample+22] d
	fstp dword [edi + (22*4)]	; autoc[22]+=d*data[sample+22]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (21*4)]	; ST = d*data[sample+21] d
	fadd dword [edi + (21*4)]	; ST = autoc[21]+d*data[sample+21] d
	fstp dword [edi + (21*4)]	; autoc[21]+=d*data[sample+21]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (20*4)]	; ST = d*data[sample+20] d
	fadd dword [edi + (20*4)]	; ST = autoc[20]+d*data[sample+20] d
	fstp dword [edi + (20*4)]	; autoc[20]+=d*data[sample+20]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (19*4)]	; ST = d*data[sample+19] d
	fadd dword [edi + (19*4)]	; ST = autoc[19]+d*data[sample+19] d
	fstp dword [edi + (19*4)]	; autoc[19]+=d*data[sample+19]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (18*4)]	; ST = d*data[sample+18] d
	fadd dword [edi + (18*4)]	; ST = autoc[18]+d*data[sample+18] d
	fstp dword [edi + (18*4)]	; autoc[18]+=d*data[sample+18]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (17*4)]	; ST = d*data[sample+17] d
	fadd dword [edi + (17*4)]	; ST = autoc[17]+d*data[sample+17] d
	fstp dword [edi + (17*4)]	; autoc[17]+=d*data[sample+17]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (16*4)]	; ST = d*data[sample+16] d
	fadd dword [edi + (16*4)]	; ST = autoc[16]+d*data[sample+16] d
	fstp dword [edi + (16*4)]	; autoc[16]+=d*data[sample+16]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (15*4)]	; ST = d*data[sample+15] d
	fadd dword [edi + (15*4)]	; ST = autoc[15]+d*data[sample+15] d
	fstp dword [edi + (15*4)]	; autoc[15]+=d*data[sample+15]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (14*4)]	; ST = d*data[sample+14] d
	fadd dword [edi + (14*4)]	; ST = autoc[14]+d*data[sample+14] d
	fstp dword [edi + (14*4)]	; autoc[14]+=d*data[sample+14]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (13*4)]	; ST = d*data[sample+13] d
	fadd dword [edi + (13*4)]	; ST = autoc[13]+d*data[sample+13] d
	fstp dword [edi + (13*4)]	; autoc[13]+=d*data[sample+13]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (12*4)]	; ST = d*data[sample+12] d
	fadd dword [edi + (12*4)]	; ST = autoc[12]+d*data[sample+12] d
	fstp dword [edi + (12*4)]	; autoc[12]+=d*data[sample+12]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (11*4)]	; ST = d*data[sample+11] d
	fadd dword [edi + (11*4)]	; ST = autoc[11]+d*data[sample+11] d
	fstp dword [edi + (11*4)]	; autoc[11]+=d*data[sample+11]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + (10*4)]	; ST = d*data[sample+10] d
	fadd dword [edi + (10*4)]	; ST = autoc[10]+d*data[sample+10] d
	fstp dword [edi + (10*4)]	; autoc[10]+=d*data[sample+10]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 9*4)]	; ST = d*data[sample+9] d
	fadd dword [edi + ( 9*4)]	; ST = autoc[9]+d*data[sample+9] d
	fstp dword [edi + ( 9*4)]	; autoc[9]+=d*data[sample+9]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 8*4)]	; ST = d*data[sample+8] d
	fadd dword [edi + ( 8*4)]	; ST = autoc[8]+d*data[sample+8] d
	fstp dword [edi + ( 8*4)]	; autoc[8]+=d*data[sample+8]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 7*4)]	; ST = d*data[sample+7] d
	fadd dword [edi + ( 7*4)]	; ST = autoc[7]+d*data[sample+7] d
	fstp dword [edi + ( 7*4)]	; autoc[7]+=d*data[sample+7]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 6*4)]	; ST = d*data[sample+6] d
	fadd dword [edi + ( 6*4)]	; ST = autoc[6]+d*data[sample+6] d
	fstp dword [edi + ( 6*4)]	; autoc[6]+=d*data[sample+6]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 5*4)]	; ST = d*data[sample+5] d	(comment fixed: was mislabelled +4)
	fadd dword [edi + ( 5*4)]	; ST = autoc[5]+d*data[sample+5] d
	fstp dword [edi + ( 5*4)]	; autoc[5]+=d*data[sample+5]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 4*4)]	; ST = d*data[sample+4] d
	fadd dword [edi + ( 4*4)]	; ST = autoc[4]+d*data[sample+4] d
	fstp dword [edi + ( 4*4)]	; autoc[4]+=d*data[sample+4]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 3*4)]	; ST = d*data[sample+3] d
	fadd dword [edi + ( 3*4)]	; ST = autoc[3]+d*data[sample+3] d
	fstp dword [edi + ( 3*4)]	; autoc[3]+=d*data[sample+3]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 2*4)]	; ST = d*data[sample+2] d
	fadd dword [edi + ( 2*4)]	; ST = autoc[2]+d*data[sample+2] d
	fstp dword [edi + ( 2*4)]	; autoc[2]+=d*data[sample+2]  ST = d
	fld st0				; ST = d d
	fmul dword [esi + ( 1*4)]	; ST = d*data[sample+1] d
	fadd dword [edi + ( 1*4)]	; ST = autoc[1]+d*data[sample+1] d
	fstp dword [edi + ( 1*4)]	; autoc[1]+=d*data[sample+1]  ST = d
	fld st0				; ST = d d
	fmul dword [esi]		; ST = d*data[sample] d		WATCHOUT: no displacement byte here!
	fadd dword [edi]		; ST = autoc[0]+d*data[sample] d	WATCHOUT: no displacement byte here!
	fstp dword [edi]		; autoc[0]+=d*data[sample]  ST = d	WATCHOUT: no displacement byte here!
.jumper2_0:
	fstp st0			; pop d, ST = empty
	add esi, byte 4			; sample++
	dec ecx
	jz .loop2_end
	add edx, byte 11		; adjust our inner loop counter by adjusting the jump target
	fld dword [esi]			; ST = d <- data[sample]
	jmp edx
.loop2_end:
.end:
	pop ebx
	pop edi
	pop esi
	ret

	ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4_old
	;[esp + 16] == autoc[]
	;[esp + 12] == lag
	;[esp + 8] == data_len
	;[esp + 4] == data[]
	;ASSERT(lag > 0)
	;ASSERT(lag <= 4)
	;ASSERT(lag <= data_len)

	;	for(coeff = 0; coeff < lag; coeff++)
	;		autoc[coeff] = 0.0;
	xorps xmm5, xmm5		; xmm5 = running autoc[3..0] accumulators

	mov edx, [esp + 8]		; edx == data_len
	mov eax, [esp + 4]		; eax == &data[sample] <- &data[0]

	movss xmm0, [eax]		; xmm0 = 0,0,0,data[0]
	add eax, 4
	movaps xmm2, xmm0		; xmm2 = 0,0,0,data[0]
	shufps xmm0, xmm0, 0		; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
.warmup:				; first iteration of loop, a bit different so it's outside the main loop
	; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample]
	mulps xmm0, xmm2		; xmm0 = xmm0 * xmm2
	addps xmm5, xmm0		; xmm5 += xmm0 * xmm2
	dec edx
	jz .loop_end
	ALIGN 16
.loop_start:
	; start by reading the next sample
	movss xmm0, [eax]		; xmm0 = 0,0,0,data[sample]
	add eax, 4
	shufps xmm0, xmm0, 0		; xmm0 = data[sample],data[sample],data[sample],data[sample]
	shufps xmm2, xmm2, 93h		; 93h=2-1-0-3 => xmm2 gets rotated left by one float
	movss xmm2, xmm0		; insert the new sample into lane 0 of the history window
	mulps xmm0, xmm2		; xmm0 = xmm0 * xmm2
	addps xmm5, xmm0		; xmm5 += xmm0 * xmm2
	dec edx
	jnz .loop_start
.loop_end:
	; store autoc
	mov edx, [esp + 16]		; edx == autoc
	movups [edx], xmm5
.end:
	ret

	ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8_old
	;[esp + 16] == autoc[]
	;[esp + 12] == lag
	;[esp + 8] == data_len
	;[esp + 4] == data[]
	;ASSERT(lag > 0)
	;ASSERT(lag <= 8)
	;ASSERT(lag <= data_len)

	;	for(coeff = 0; coeff < lag; coeff++)
	;		autoc[coeff] = 0.0;
	xorps xmm5, xmm5		; xmm6:xmm5 = running autoc[7..0] accumulators
	xorps xmm6, xmm6

	mov edx, [esp + 8]		; edx == data_len
	mov eax, [esp + 4]		; eax == &data[sample] <- &data[0]

	movss xmm0, [eax]		; xmm0 = 0,0,0,data[0]
	add eax, 4
	movaps xmm2, xmm0		; xmm2 = 0,0,0,data[0]
	shufps xmm0, xmm0, 0		; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
	movaps xmm1, xmm0		; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
	xorps xmm3, xmm3		; xmm3 = 0,0,0,0
.warmup:				; first iteration of loop, a bit different so it's outside the main loop
	; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
	mulps xmm0, xmm2
	mulps xmm1, xmm3		; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
	addps xmm5, xmm0
	addps xmm6, xmm1		; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
	dec edx
	jz .loop_end
	ALIGN 16
.loop_start:
	; start by reading the next sample
	movss xmm0, [eax]		; xmm0 = 0,0,0,data[sample]
	; here we reorder the instructions; see the (#) indexes for a logical order
	shufps xmm2, xmm2, 93h		; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
	add eax, 4			; (0)
	shufps xmm3, xmm3, 93h		; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
	shufps xmm0, xmm0, 0		; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
	movss xmm3, xmm2		; (5) carry the lane shifted out of xmm2 into xmm3
	movaps xmm1, xmm0		; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
	movss xmm2, xmm0		; (6) insert the new sample into lane 0
	mulps xmm1, xmm3		; (8)
	mulps xmm0, xmm2		; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
	addps xmm6, xmm1		; (10)
	addps xmm5, xmm0		; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
	dec edx
	jnz .loop_start
.loop_end:
	; store autoc
	mov edx, [esp + 16]		; edx == autoc
	movups [edx], xmm5
	movups [edx + 16], xmm6
.end:
	ret

	ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12_old
	;[esp + 16] == autoc[]
	;[esp + 12] == lag
	;[esp + 8] == data_len
	;[esp + 4] == data[]
	;ASSERT(lag > 0)
	;ASSERT(lag <= 12)
	;ASSERT(lag <= data_len)

	;	for(coeff = 0; coeff < lag; coeff++)
	;		autoc[coeff] = 0.0;
	xorps xmm5, xmm5		; xmm7:xmm6:xmm5 = running autoc[11..0] accumulators
	xorps xmm6, xmm6
	xorps xmm7, xmm7

	mov edx, [esp + 8]		; edx == data_len
	mov eax, [esp + 4]		; eax == &data[sample] <- &data[0]

	movss xmm0, [eax]		; xmm0 = 0,0,0,data[0]
	add eax, 4
	movaps xmm2, xmm0		; xmm2 = 0,0,0,data[0]
	shufps xmm0, xmm0, 0		; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
	xorps xmm3, xmm3		; xmm3 = 0,0,0,0
	xorps xmm4, xmm4		; xmm4 = 0,0,0,0
.warmup:				; first iteration of loop, a bit different so it's outside the main loop
	; xmm4:xmm3:xmm2 == data[sample-11],data[sample-10],...,data[sample]
	; (comment fixed: previously copied from the lag_8 routine)
	movaps xmm1, xmm0
	mulps xmm1, xmm2
	addps xmm5, xmm1
	movaps xmm1, xmm0
	mulps xmm1, xmm3
	addps xmm6, xmm1
	mulps xmm0, xmm4
	addps xmm7, xmm0		; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
	dec edx
	jz .loop_end
	ALIGN 16
.loop_start:
	; start by reading the next sample
	movss xmm0, [eax]		; xmm0 = 0,0,0,data[sample]
	add eax, 4
	shufps xmm0, xmm0, 0		; xmm0 = data[sample],data[sample],data[sample],data[sample]

	; shift xmm4:xmm3:xmm2 left by one float
	shufps xmm2, xmm2, 93h		; 93h=2-1-0-3 => xmm2 gets rotated left by one float
	shufps xmm3, xmm3, 93h		; 93h=2-1-0-3 => xmm3 gets rotated left by one float
	shufps xmm4, xmm4, 93h		; 93h=2-1-0-3 => xmm4 gets rotated left by one float
	movss xmm4, xmm3
	movss xmm3, xmm2
	movss xmm2, xmm0

	; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
	movaps xmm1, xmm0
	mulps xmm1, xmm2
	addps xmm5, xmm1
	movaps xmm1, xmm0
	mulps xmm1, xmm3
	addps xmm6, xmm1
	mulps xmm0, xmm4
	addps xmm7, xmm0

	dec edx
	jnz .loop_start
.loop_end:
	; store autoc
	mov edx, [esp + 16]		; edx == autoc
	movups [edx], xmm5
	movups [edx + 16], xmm6
	movups [edx + 32], xmm7
.end:
	ret

	ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16_old
	;[ebp + 20] == autoc[]
	;[ebp + 16] == lag
	;[ebp + 12] == data_len
	;[ebp + 8] == data[]
	;[esp] == __m128
	;[esp + 16] == __m128
	push ebp
	mov ebp, esp
	and esp, -16			; stack realign for SSE instructions 'movaps' and 'addps'
	sub esp, 32			; two spill slots for the autoc[15..8] accumulators
	;ASSERT(lag > 0)
	;ASSERT(lag <= 16)		; NOTE(review): original comment said 12 — copy/paste from the lag_12 routine
	;ASSERT(lag <= data_len)
	;ASSERT(data_len > 0)

	;	for(coeff = 0; coeff < lag; coeff++)
	;		autoc[coeff] = 0.0;
	xorps xmm5, xmm5
	xorps xmm6, xmm6
	movaps [esp], xmm5		; [esp+16]:[esp] = accumulators for autoc[15..8]
	movaps [esp + 16], xmm6

	mov edx, [ebp + 12]		; edx == data_len
	mov eax, [ebp + 8]		; eax == &data[sample] <- &data[0]

	movss xmm0, [eax]		; xmm0 = 0,0,0,data[0]
	add eax, 4
	movaps xmm1, xmm0		; xmm1 = 0,0,0,data[0]
	shufps xmm0, xmm0, 0		; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
	xorps xmm2, xmm2		; xmm2 = 0,0,0,0
	xorps xmm3, xmm3		; xmm3 = 0,0,0,0
	xorps xmm4, xmm4		; xmm4 = 0,0,0,0
	movaps xmm7, xmm0		; warmup iteration: only lane 0 of the window is valid
	mulps xmm7, xmm1
	addps xmm5, xmm7
	dec edx
	jz .loop_end
	ALIGN 16
.loop_start:
	; start by reading the next sample
	movss xmm0, [eax]		; xmm0 = 0,0,0,data[sample]
	add eax, 4
	shufps xmm0, xmm0, 0		; xmm0 = data[sample],data[sample],data[sample],data[sample]

	; shift xmm4:xmm3:xmm2:xmm1 left by one float
	shufps xmm1, xmm1, 93h
	shufps xmm2, xmm2, 93h
	shufps xmm3, xmm3, 93h
	shufps xmm4, xmm4, 93h
	movss xmm4, xmm3
	movss xmm3, xmm2
	movss xmm2, xmm1
	movss xmm1, xmm0

	; [esp+16]:[esp]:xmm6:xmm5 += xmm0:xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2:xmm1
	movaps xmm7, xmm0
	mulps xmm7, xmm1
	addps xmm5, xmm7
	movaps xmm7, xmm0
	mulps xmm7, xmm2
	addps xmm6, xmm7
	movaps xmm7, xmm0
	mulps xmm7, xmm3
	mulps xmm0, xmm4
	addps xmm7, [esp]
	addps xmm0, [esp + 16]
	movaps [esp], xmm7
	movaps [esp + 16], xmm0

	dec edx
	jnz .loop_start
.loop_end:
	; store autoc
	mov edx, [ebp + 20]		; edx == autoc
	movups [edx], xmm5
	movups [edx + 16], xmm6
	movaps xmm5, [esp]
	movaps xmm6, [esp + 16]
	movups [edx + 32], xmm5
	movups [edx + 48], xmm6
.end:
	mov esp, ebp
	pop ebp
	ret

;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
;
;	for(i = 0; i < data_len; i++) {
;		sum = 0;
;		for(j = 0; j < order; j++)
;			sum += qlp_coeff[j] * data[i-j-1];
;		residual[i] = data[i] - (sum >> lp_quantization);
;	}
;
; NOTE(review): for order <= 32 the inner loop is fully unrolled (one 9-byte
; mov/imul/add group per coefficient) and entered via a computed jump so that
; exactly `order` groups run; the jump arithmetic depends on those encodings.
	ALIGN 16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
	;[esp + 40] residual[]
	;[esp + 36] lp_quantization
	;[esp + 32] order
	;[esp + 28] qlp_coeff[]
	;[esp + 24] data_len
	;[esp + 20] data[]
	;ASSERT(order > 0)
	push ebp
	push ebx
	push esi
	push edi
	mov esi, [esp + 20]		; esi = data[]
	mov edi, [esp + 40]		; edi = residual[]
	mov eax, [esp + 32]		; eax = order
	mov ebx, [esp + 24]		; ebx = data_len
	test ebx, ebx
	jz near .end			; do nothing if data_len == 0
.begin:
	cmp eax, byte 1
	jg short .i_1more

	; special case: order == 1
	mov ecx, [esp + 28]
	mov edx, [ecx]			; edx = qlp_coeff[0]
	mov eax, [esi - 4]		; eax = data[-1]
	mov ecx, [esp + 36]		; cl = lp_quantization
	ALIGN 16
.i_1_loop_i:
	imul eax, edx			; eax = qlp_coeff[0] * data[i-1]
	sar eax, cl			; eax = sum >> lp_quantization
	neg eax
	add eax, [esi]			; eax = data[i] - (sum >> lp_quantization)
	mov [edi], eax			; residual[i] = eax
	mov eax, [esi]			; reload data[i]: it is data[i-1] next iteration
	add edi, byte 4
	add esi, byte 4
	dec ebx
	jnz .i_1_loop_i

	jmp .end

.i_1more:
	cmp eax, byte 32		; for order <= 32 there is a faster routine
	jbe short .i_32

	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
	ALIGN 16
.i_32more_loop_i:
	xor ebp, ebp			; ebp = sum
	mov ecx, [esp + 32]
	mov edx, ecx
	shl edx, 2
	add edx, [esp + 28]		; edx = &qlp_coeff[order]
	neg ecx				; ecx = -order, counts up to 0
	ALIGN 16
.i_32more_loop_j:
	sub edx, byte 4
	mov eax, [edx]
	imul eax, [esi + 4 * ecx]
	add ebp, eax			; sum += qlp_coeff[j] * data[i-j-1]
	inc ecx
	jnz short .i_32more_loop_j
	mov ecx, [esp + 36]
	sar ebp, cl
	neg ebp
	add ebp, [esi]			; ebp = data[i] - (sum >> lp_quantization)
	mov [edi], ebp
	add esi, byte 4
	add edi, byte 4
	dec ebx
	jnz .i_32more_loop_i

	jmp .end

.mov_eip_to_eax:
	mov eax, [esp]			; eax = return address = eip at .get_eip0
	ret

.i_32:
	sub edi, esi			; edi now holds (residual - data) so [edi + esi] addresses residual[i]
	neg eax
	lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0]	; each group is 9 bytes
	call .mov_eip_to_eax
.get_eip0:
	add edx, eax
	inc edx				; compensate for the shorter opcode on the last iteration
	mov eax, [esp + 28]		; eax = qlp_coeff[]
	xor ebp, ebp			; ebp = sum
	jmp edx				; enter the unrolled taps at (32 - order) groups in

	; ---- unrolled taps: sum += qlp_coeff[31]*data[i-32] ... qlp_coeff[0]*data[i-1] ----
	mov ecx, [eax + 124]
	imul ecx, [esi - 128]
	add ebp, ecx

	mov ecx, [eax + 120]
	imul ecx, [esi - 124]
	add ebp, ecx

	mov ecx, [eax + 116]
	imul ecx, [esi - 120]
	add ebp, ecx

	mov ecx, [eax + 112]
	imul ecx, [esi - 116]
	add ebp, ecx

	mov ecx, [eax + 108]
	imul ecx, [esi - 112]
	add ebp, ecx

	mov ecx, [eax + 104]
	imul ecx, [esi - 108]
	add ebp, ecx

	mov ecx, [eax + 100]
	imul ecx, [esi - 104]
	add ebp, ecx

	mov ecx, [eax + 96]
	imul ecx, [esi - 100]
	add ebp, ecx

	mov ecx, [eax + 92]
	imul ecx, [esi - 96]
	add ebp, ecx

	mov ecx, [eax + 88]
	imul ecx, [esi - 92]
	add ebp, ecx

	mov ecx, [eax + 84]
	imul ecx, [esi - 88]
	add ebp, ecx

	mov ecx, [eax + 80]
	imul ecx, [esi - 84]
	add ebp, ecx

	mov ecx, [eax + 76]
	imul ecx, [esi - 80]
	add ebp, ecx

	mov ecx, [eax + 72]
	imul ecx, [esi - 76]
	add ebp, ecx

	mov ecx, [eax + 68]
	imul ecx, [esi - 72]
	add ebp, ecx

	mov ecx, [eax + 64]
	imul ecx, [esi - 68]
	add ebp, ecx

	mov ecx, [eax + 60]
	imul ecx, [esi - 64]
	add ebp, ecx

	mov ecx, [eax + 56]
	imul ecx, [esi - 60]
	add ebp, ecx

	mov ecx, [eax + 52]
	imul ecx, [esi - 56]
	add ebp, ecx

	mov ecx, [eax + 48]
	imul ecx, [esi - 52]
	add ebp, ecx

	mov ecx, [eax + 44]
	imul ecx, [esi - 48]
	add ebp, ecx

	mov ecx, [eax + 40]
	imul ecx, [esi - 44]
	add ebp, ecx

	mov ecx, [eax + 36]
	imul ecx, [esi - 40]
	add ebp, ecx

	mov ecx, [eax + 32]
	imul ecx, [esi - 36]
	add ebp, ecx

	mov ecx, [eax + 28]
	imul ecx, [esi - 32]
	add ebp, ecx

	mov ecx, [eax + 24]
	imul ecx, [esi - 28]
	add ebp, ecx

	mov ecx, [eax + 20]
	imul ecx, [esi - 24]
	add ebp, ecx

	mov ecx, [eax + 16]
	imul ecx, [esi - 20]
	add ebp, ecx

	mov ecx, [eax + 12]
	imul ecx, [esi - 16]
	add ebp, ecx

	mov ecx, [eax + 8]
	imul ecx, [esi - 12]
	add ebp, ecx

	mov ecx, [eax + 4]
	imul ecx, [esi - 8]
	add ebp, ecx

	mov ecx, [eax]			; there is one byte missing
	imul ecx, [esi - 4]
	add ebp, ecx
.jumper_0:
	mov ecx, [esp + 36]
	sar ebp, cl			; ebp = sum >> lp_quantization
	neg ebp
	add ebp, [esi]			; ebp = data[i] - (sum >> lp_quantization)
	mov [edi + esi], ebp		; residual[i] (edi holds residual - data)
	add esi, byte 4

	dec ebx
	jz short .end
	xor ebp, ebp			; sum = 0 for the next i
	jmp edx

.end:
	pop edi
	pop esi
	pop ebx
	pop ebp
	ret

; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
; the channel and qlp_coeffs must be <= 16.  Especially note that this routine
; cannot be used for side-channel coded 16bps channels since the effective bps
; is 17.
	ALIGN 16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
	;[esp + 40] residual[]
	;[esp + 36] lp_quantization
	;[esp + 32] order
	;[esp + 28] qlp_coeff[]
	;[esp + 24] data_len
	;[esp + 20] data[]
	;ASSERT(order > 0)
	; NOTE(review): 16-bit MMX path (see WATCHOUT above).  The qlp_coeffs are
	; packed to 16 bits on the stack and processed two samples per iteration
	; with pmaddwd; odd data_len tail falls through to the plain ia32 routine.
	push ebp
	push ebx
	push esi
	push edi

	mov esi, [esp + 20]		; esi = data[]
	mov edi, [esp + 40]		; edi = residual[]
	mov eax, [esp + 32]		; eax = order
	mov ebx, [esp + 24]		; ebx = data_len

	test ebx, ebx
	jz near .end			; do nothing if data_len == 0
	dec ebx
	test ebx, ebx
	jz near .last_one		; only one sample left: let the x87 routine do it

	mov edx, [esp + 28]		; edx = qlp_coeff[]
	movd mm6, [esp + 36]		; mm6 = 0:lp_quantization

	; copy qlp_coeff[] to the stack as packed 16-bit words
	mov ebp, esp			; save esp; restored at .mmx_end
	and esp, 0xfffffff8		; 8-byte align for the movq reads below
	xor ecx, ecx
.copy_qlp_loop:
	push word [edx + 4 * ecx]	; push the low 16 bits of qlp_coeff[ecx]
	inc ecx
	cmp ecx, eax
	jnz short .copy_qlp_loop

	; zero-pad the coefficient count up to a multiple of 4
	and ecx, 0x3
	test ecx, ecx
	je short .za_end
	sub ecx, byte 4
.za_loop:
	push word 0
	inc eax				; order rounded up alongside the padding
	inc ecx
	jnz short .za_loop
.za_end:

	movq mm5, [esp + 2 * eax - 8]	; mm5 = last 4 packed coefficients
	movd mm4, [esi - 16]
	punpckldq mm4, [esi - 12]
	movd mm0, [esi - 8]
	punpckldq mm0, [esi - 4]
	packssdw mm4, mm0		; mm4 = data[i-1]:data[i-2]:data[i-3]:data[i-4] as words

	cmp eax, byte 4
	jnbe short .mmx_4more

	ALIGN 16
.mmx_4_loop_i:				; order <= 4: one pmaddwd pair per sample pair
	movd mm1, [esi]
	movq mm3, mm4
	punpckldq mm1, [esi + 4]	; mm1 = data[i+1]:data[i]
	psrlq mm4, 16
	movq mm0, mm1
	psllq mm0, 48
	por mm4, mm0			; slide data[i] into the history window
	movq mm2, mm4
	psrlq mm4, 16
	pxor mm0, mm0
	punpckhdq mm0, mm1
	pmaddwd mm3, mm5		; partial sums for sample i
	pmaddwd mm2, mm5		; partial sums for sample i+1
	psllq mm0, 16
	por mm4, mm0			; slide data[i+1] into the history window
	movq mm0, mm3
	punpckldq mm3, mm2
	punpckhdq mm0, mm2
	paddd mm3, mm0			; mm3 = sum(i+1):sum(i)
	psrad mm3, mm6			; >> lp_quantization (both lanes)
	psubd mm1, mm3			; residual = data - (sum >> lp_quantization)
	movd [edi], mm1
	punpckhdq mm1, mm1
	movd [edi + 4], mm1
	add edi, byte 8
	add esi, byte 8
	sub ebx, 2
	jg .mmx_4_loop_i

	jmp .mmx_end

.mmx_4more:
	shl eax, 2
	neg eax
	add eax, byte 16		; eax = -4*(order-4), offset from esi to the oldest tap

	ALIGN 16
.mmx_4more_loop_i:
	movd mm1, [esi]
	punpckldq mm1, [esi + 4]	; mm1 = data[i+1]:data[i]
	movq mm3, mm4
	psrlq mm4, 16
	movq mm0, mm1
	psllq mm0, 48
	por mm4, mm0			; slide data[i] into the history window
	movq mm2, mm4
	psrlq mm4, 16
	pxor mm0, mm0
	punpckhdq mm0, mm1
	pmaddwd mm3, mm5		; partial sums (newest 4 taps) for sample i
	pmaddwd mm2, mm5		; partial sums (newest 4 taps) for sample i+1
	psllq mm0, 16
	por mm4, mm0			; slide data[i+1] into the history window

	mov ecx, esi
	add ecx, eax			; ecx = &data[i - order + 4] (oldest taps first)
	mov edx, esp			; edx = packed 16-bit coefficients on the stack

	ALIGN 16
.mmx_4more_loop_j:			; remaining taps, 4 per iteration, for both samples
	movd mm0, [ecx - 16]
	movd mm7, [ecx - 8]
	punpckldq mm0, [ecx - 12]
	punpckldq mm7, [ecx - 4]
	packssdw mm0, mm7		; 4 history words for sample i
	pmaddwd mm0, [edx]
	punpckhdq mm7, mm7
	paddd mm3, mm0
	movd mm0, [ecx - 12]
	punpckldq mm0, [ecx - 8]
	punpckldq mm7, [ecx]
	packssdw mm0, mm7		; 4 history words for sample i+1
	pmaddwd mm0, [edx]
	paddd mm2, mm0
	add edx, byte 8
	add ecx, byte 16
	cmp ecx, esi
	jnz .mmx_4more_loop_j

	movq mm0, mm3
	punpckldq mm3, mm2
	punpckhdq mm0, mm2
	paddd mm3, mm0			; mm3 = sum(i+1):sum(i)
	psrad mm3, mm6			; >> lp_quantization (both lanes)
	psubd mm1, mm3			; residual = data - (sum >> lp_quantization)
	movd [edi], mm1
	punpckhdq mm1, mm1
	movd [edi + 4], mm1
	add edi, byte 8
	add esi, byte 8
	sub ebx, 2
	jg near .mmx_4more_loop_i

.mmx_end:
	emms
	mov esp, ebp
.last_one:
	mov eax, [esp + 32]
	inc ebx
	jnz near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin	; odd tail: reuse the x87 routine
.end:
	pop edi
	pop esi
	pop ebx
	pop ebp
	ret

; **********************************************************************
;
; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
; {
;	unsigned i, j;
;	FLAC__int32 sum;
;
;	FLAC__ASSERT(order > 0);
;
;	for(i = 0; i < data_len; i++) {
;		sum = 0;
;		for(j = 0; j < order; j++)
;			sum += qlp_coeff[j] * data[i-j-1];
;		data[i] = residual[i] + (sum >> lp_quantization);
;	}
; }
	ALIGN 16
cident FLAC__lpc_restore_signal_asm_ia32
	;[esp + 40] data[]
	;[esp + 36] lp_quantization
	;[esp + 32] order
	;[esp + 28] qlp_coeff[]
	;[esp + 24] data_len
	;[esp + 20] residual[]
	;ASSERT(order > 0)
	push ebp
	push ebx
	push esi
	push edi

	mov esi, [esp + 20]		; esi = residual[]
	mov edi, [esp + 40]		; edi = data[]
	mov eax, [esp + 32]		; eax = order
	mov ebx, [esp + 24]		; ebx = data_len

	test ebx, ebx
	jz near .end			; do nothing if data_len == 0

.begin:
	cmp eax, byte 1
	jg short .x87_1more

	; special case: order == 1
	mov ecx, [esp + 28]
	mov edx, [ecx]			; edx = qlp_coeff[0]
	mov eax, [edi - 4]		; eax = data[-1]
	mov ecx, [esp + 36]		; cl = lp_quantization
	ALIGN 16
.x87_1_loop_i:
	imul eax, edx			; eax = qlp_coeff[0] * data[i-1]
	sar eax, cl			; eax = sum >> lp_quantization
	add eax, [esi]			; eax = residual[i] + (sum >> lp_quantization)
	mov [edi], eax			; data[i] = eax; eax is data[i-1] next iteration
	add esi, byte 4
	add edi, byte 4
	dec ebx
	jnz .x87_1_loop_i

	jmp .end

.x87_1more:
	cmp eax, byte 32		; for order <= 32 there is a faster routine
	jbe short .x87_32

	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
	ALIGN 16
.x87_32more_loop_i:
	xor ebp, ebp			; ebp = sum
	mov ecx, [esp + 32]
	mov edx, ecx
	shl edx, 2
	add edx, [esp + 28]		; edx = &qlp_coeff[order]
	neg ecx				; ecx = -order, counts up to 0
	ALIGN 16
.x87_32more_loop_j:
	sub edx, byte 4
	mov eax, [edx]
	imul eax, [edi + 4 * ecx]
	add ebp, eax			; sum += qlp_coeff[j] * data[i-j-1]
	inc ecx
	jnz short .x87_32more_loop_j
	mov ecx, [esp + 36]
	sar ebp, cl
	add ebp, [esi]			; ebp = residual[i] + (sum >> lp_quantization)
	mov [edi], ebp			; data[i] = ebp
	add edi, byte 4
	add esi, byte 4
	dec ebx
	jnz .x87_32more_loop_i

	jmp .end

.mov_eip_to_eax:
	mov eax, [esp]			; eax = return address = eip at .get_eip0
	ret

.x87_32:
	sub esi, edi			; esi now holds (residual - data) so [esi + edi] addresses residual[i]
	neg eax
	lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0]	; each group is 9 bytes
	call .mov_eip_to_eax
.get_eip0:
	add edx, eax
	inc edx				; compensate for the shorter opcode on the last iteration
	mov eax, [esp + 28]		; eax = qlp_coeff[]
	xor ebp, ebp			; ebp = sum
	jmp edx				; enter the unrolled taps at (32 - order) groups in

	mov ecx, [eax + 124]		; ecx = qlp_coeff[31]
	imul ecx, [edi - 128]		; ecx = qlp_coeff[31] * data[i-32]
	add ebp, ecx			; sum += qlp_coeff[31] * data[i-32]

	mov ecx, [eax + 120]		; ecx = qlp_coeff[30]
	imul ecx, [edi - 124]		; ecx = qlp_coeff[30] * data[i-31]
	add ebp, ecx			; sum += qlp_coeff[30] * data[i-31]

	mov ecx, [eax + 116]		; ecx = qlp_coeff[29]
	imul ecx, [edi - 120]		; ecx = qlp_coeff[29] * data[i-30]
	add ebp, ecx			; sum += qlp_coeff[29] * data[i-30]

	mov ecx, [eax + 112]		; ecx = qlp_coeff[28]
	imul ecx, [edi - 116]		; ecx = qlp_coeff[28] * data[i-29]
	add ebp, ecx			; sum += qlp_coeff[28] * data[i-29]

	mov ecx, [eax + 108]		; ecx = qlp_coeff[27]
	imul ecx, [edi - 112]		; ecx = qlp_coeff[27] * data[i-28]
	add ebp, ecx			; sum += qlp_coeff[27] * data[i-28]

	mov ecx, [eax + 104]		; ecx = qlp_coeff[26]
	imul ecx, [edi - 108]		; ecx = qlp_coeff[26] * data[i-27]
	add ebp, ecx			; sum += qlp_coeff[26] * data[i-27]

	mov ecx, [eax + 100]		; ecx = qlp_coeff[25]
	imul ecx, [edi - 104]		; ecx = qlp_coeff[25] * data[i-26]
	add ebp, ecx			; sum += qlp_coeff[25] * data[i-26]

	mov ecx, [eax + 96]		; ecx = qlp_coeff[24]
	imul ecx, [edi - 100]		; ecx = qlp_coeff[24] * data[i-25]
	add ebp, ecx			; sum += qlp_coeff[24] * data[i-25]

	mov ecx, [eax + 92]		; ecx = qlp_coeff[23]
	imul ecx, [edi - 96]		; ecx = qlp_coeff[23] * data[i-24]
	add ebp, ecx			; sum += qlp_coeff[23] * data[i-24]
* data[i-24] mov ecx, [eax + 88] ; ecx = qlp_coeff[22] imul ecx, [edi - 92] ; ecx = qlp_coeff[22] * data[i-23] add ebp, ecx ; sum += qlp_coeff[22] * data[i-23] mov ecx, [eax + 84] ; ecx = qlp_coeff[21] imul ecx, [edi - 88] ; ecx = qlp_coeff[21] * data[i-22] add ebp, ecx ; sum += qlp_coeff[21] * data[i-22] mov ecx, [eax + 80] ; ecx = qlp_coeff[20] imul ecx, [edi - 84] ; ecx = qlp_coeff[20] * data[i-21] add ebp, ecx ; sum += qlp_coeff[20] * data[i-21] mov ecx, [eax + 76] ; ecx = qlp_coeff[19] imul ecx, [edi - 80] ; ecx = qlp_coeff[19] * data[i-20] add ebp, ecx ; sum += qlp_coeff[19] * data[i-20] mov ecx, [eax + 72] ; ecx = qlp_coeff[18] imul ecx, [edi - 76] ; ecx = qlp_coeff[18] * data[i-19] add ebp, ecx ; sum += qlp_coeff[18] * data[i-19] mov ecx, [eax + 68] ; ecx = qlp_coeff[17] imul ecx, [edi - 72] ; ecx = qlp_coeff[17] * data[i-18] add ebp, ecx ; sum += qlp_coeff[17] * data[i-18] mov ecx, [eax + 64] ; ecx = qlp_coeff[16] imul ecx, [edi - 68] ; ecx = qlp_coeff[16] * data[i-17] add ebp, ecx ; sum += qlp_coeff[16] * data[i-17] mov ecx, [eax + 60] ; ecx = qlp_coeff[15] imul ecx, [edi - 64] ; ecx = qlp_coeff[15] * data[i-16] add ebp, ecx ; sum += qlp_coeff[15] * data[i-16] mov ecx, [eax + 56] ; ecx = qlp_coeff[14] imul ecx, [edi - 60] ; ecx = qlp_coeff[14] * data[i-15] add ebp, ecx ; sum += qlp_coeff[14] * data[i-15] mov ecx, [eax + 52] ; ecx = qlp_coeff[13] imul ecx, [edi - 56] ; ecx = qlp_coeff[13] * data[i-14] add ebp, ecx ; sum += qlp_coeff[13] * data[i-14] mov ecx, [eax + 48] ; ecx = qlp_coeff[12] imul ecx, [edi - 52] ; ecx = qlp_coeff[12] * data[i-13] add ebp, ecx ; sum += qlp_coeff[12] * data[i-13] mov ecx, [eax + 44] ; ecx = qlp_coeff[11] imul ecx, [edi - 48] ; ecx = qlp_coeff[11] * data[i-12] add ebp, ecx ; sum += qlp_coeff[11] * data[i-12] mov ecx, [eax + 40] ; ecx = qlp_coeff[10] imul ecx, [edi - 44] ; ecx = qlp_coeff[10] * data[i-11] add ebp, ecx ; sum += qlp_coeff[10] * data[i-11] mov ecx, [eax + 36] ; ecx = qlp_coeff[ 9] imul ecx, [edi - 40] ; ecx = 
qlp_coeff[ 9] * data[i-10] add ebp, ecx ; sum += qlp_coeff[ 9] * data[i-10] mov ecx, [eax + 32] ; ecx = qlp_coeff[ 8] imul ecx, [edi - 36] ; ecx = qlp_coeff[ 8] * data[i- 9] add ebp, ecx ; sum += qlp_coeff[ 8] * data[i- 9] mov ecx, [eax + 28] ; ecx = qlp_coeff[ 7] imul ecx, [edi - 32] ; ecx = qlp_coeff[ 7] * data[i- 8] add ebp, ecx ; sum += qlp_coeff[ 7] * data[i- 8] mov ecx, [eax + 24] ; ecx = qlp_coeff[ 6] imul ecx, [edi - 28] ; ecx = qlp_coeff[ 6] * data[i- 7] add ebp, ecx ; sum += qlp_coeff[ 6] * data[i- 7] mov ecx, [eax + 20] ; ecx = qlp_coeff[ 5] imul ecx, [edi - 24] ; ecx = qlp_coeff[ 5] * data[i- 6] add ebp, ecx ; sum += qlp_coeff[ 5] * data[i- 6] mov ecx, [eax + 16] ; ecx = qlp_coeff[ 4] imul ecx, [edi - 20] ; ecx = qlp_coeff[ 4] * data[i- 5] add ebp, ecx ; sum += qlp_coeff[ 4] * data[i- 5] mov ecx, [eax + 12] ; ecx = qlp_coeff[ 3] imul ecx, [edi - 16] ; ecx = qlp_coeff[ 3] * data[i- 4] add ebp, ecx ; sum += qlp_coeff[ 3] * data[i- 4] mov ecx, [eax + 8] ; ecx = qlp_coeff[ 2] imul ecx, [edi - 12] ; ecx = qlp_coeff[ 2] * data[i- 3] add ebp, ecx ; sum += qlp_coeff[ 2] * data[i- 3] mov ecx, [eax + 4] ; ecx = qlp_coeff[ 1] imul ecx, [edi - 8] ; ecx = qlp_coeff[ 1] * data[i- 2] add ebp, ecx ; sum += qlp_coeff[ 1] * data[i- 2] mov ecx, [eax] ; ecx = qlp_coeff[ 0] (NOTE: one byte missing from instruction) imul ecx, [edi - 4] ; ecx = qlp_coeff[ 0] * data[i- 1] add ebp, ecx ; sum += qlp_coeff[ 0] * data[i- 1] .jumper_0: mov ecx, [esp + 36] sar ebp, cl ; ebp = (sum >> lp_quantization) add ebp, [esi + edi] ; ebp = residual[i] + (sum >> lp_quantization) mov [edi], ebp ; data[i] = residual[i] + (sum >> lp_quantization) add edi, byte 4 dec ebx jz short .end xor ebp, ebp jmp edx .end: pop edi pop esi pop ebx pop ebp ret ; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for ; the channel and qlp_coeffs must be <= 16. Especially note that this routine ; cannot be used for side-channel coded 16bps channels since the effective bps ; is 17. 
; WATCHOUT: this routine requires that each data array have a buffer of up to
; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each
; channel n, data[n][-1] through data[n][-3] should be accessible and zero.
        ALIGN   16
cident FLAC__lpc_restore_signal_asm_ia32_mmx
        ;[esp + 40]     data[]
        ;[esp + 36]     lp_quantization
        ;[esp + 32]     order
        ;[esp + 28]     qlp_coeff[]
        ;[esp + 24]     data_len
        ;[esp + 20]     residual[]

        ;ASSERT(order > 0)

        push    ebp
        push    ebx
        push    esi
        push    edi

        mov     esi, [esp + 20]         ; esi = residual[]
        mov     edi, [esp + 40]         ; edi = data[]
        mov     eax, [esp + 32]         ; eax = order
        mov     ebx, [esp + 24]         ; ebx = data_len

        test    ebx, ebx
        jz      near .end               ; do nothing if data_len == 0
        cmp     eax, byte 4
        jb      near FLAC__lpc_restore_signal_asm_ia32.begin ; order < 4: use x87 version

        mov     edx, [esp + 28]         ; edx = qlp_coeff[]
        movd    mm6, [esp + 36]         ; mm6 = lp_quantization (shift count for psrad)
        mov     ebp, esp                ; save esp; restored at .mmx_end
        and     esp, 0xfffffff8         ; 8-byte align stack for the word-coeff buffer

        ; copy qlp_coeff[] to the stack as packed 16-bit words
        ; (see the WATCHOUT above: coeffs must fit in 16 bits)
        xor     ecx, ecx
.copy_qlp_loop:
        push    word [edx + 4 * ecx]    ; push low word of qlp_coeff[ecx]
        inc     ecx
        cmp     ecx, eax
        jnz     short .copy_qlp_loop

        ; zero-pad the word buffer so the coeff count is a multiple of 4,
        ; bumping eax (order) to the rounded-up count
        and     ecx, 0x3
        test    ecx, ecx
        je      short .za_end
        sub     ecx, byte 4             ; ecx = -(4 - order%4) = number of pad words, negated
.za_loop:
        push    word 0
        inc     eax
        inc     ecx
        jnz     short .za_loop
.za_end:
        ; mm5 = 4 packed words at the top of the buffer
        ; (presumably qlp_coeff[0..3] — highest stack addresses; TODO confirm)
        movq    mm5, [esp + 2 * eax - 8]
        ; mm4 = last 4 history samples data[-4..-1] packed dwords->words
        movd    mm4, [edi - 16]
        punpckldq mm4, [edi - 12]
        movd    mm0, [edi - 8]
        punpckldq mm0, [edi - 4]
        packssdw mm4, mm0

        cmp     eax, byte 4
        jnbe    short .mmx_4more

        ; order == 4: the whole dot product fits in one pmaddwd
        ALIGN   16
.mmx_4_loop_i:
        movq    mm7, mm4
        pmaddwd mm7, mm5                ; two partial dword sums of coeff*data
        movq    mm0, mm7
        punpckhdq mm7, mm7
        paddd   mm7, mm0                ; horizontal add -> full sum in low dword
        psrad   mm7, mm6                ; sum >> lp_quantization
        movd    mm1, [esi]
        paddd   mm7, mm1                ; + residual[i]
        movd    [edi], mm7              ; data[i] = result
        ; slide the 4-sample word window: drop oldest, insert data[i]
        psllq   mm7, 48
        psrlq   mm4, 16
        por     mm4, mm7
        add     esi, byte 4
        add     edi, byte 4
        dec     ebx
        jnz     .mmx_4_loop_i

        jmp     .mmx_end

.mmx_4more:
        ; order > 4: loop over the coeff buffer in groups of 4;
        ; eax = 16 - 4*order so that (edi + eax) marks the oldest history group
        shl     eax, 2
        neg     eax
        add     eax, byte 16
        ALIGN   16
.mmx_4more_loop_i:
        mov     ecx, edi
        add     ecx, eax                ; ecx = &data[i - (order rounded up)] + 16
        mov     edx, esp                ; edx = packed word-coeff buffer
        movq    mm7, mm4
        pmaddwd mm7, mm5                ; newest 4 samples * first coeff group
        ALIGN   16
.mmx_4more_loop_j:
        ; pack the next 4 history dwords to words and multiply-accumulate
        movd    mm0, [ecx - 16]
        punpckldq mm0, [ecx - 12]
        movd    mm1, [ecx - 8]
        punpckldq mm1, [ecx - 4]
        packssdw mm0, mm1
        pmaddwd mm0, [edx]
        paddd   mm7, mm0
        add     edx, byte 8
        add     ecx, byte 16
        cmp     ecx, edi
        jnz     .mmx_4more_loop_j

        movq    mm0, mm7
        punpckhdq mm7, mm7
        paddd   mm7, mm0                ; horizontal add -> full sum
        psrad   mm7, mm6                ; sum >> lp_quantization
        movd    mm1, [esi]
        paddd   mm7, mm1                ; + residual[i]
        movd    [edi], mm7              ; data[i] = result
        ; slide the 4-sample word window: drop oldest, insert data[i]
        psllq   mm7, 48
        psrlq   mm4, 16
        por     mm4, mm7
        add     esi, byte 4
        add     edi, byte 4
        dec     ebx
        jnz     short .mmx_4more_loop_i
.mmx_end:
        emms                            ; leave MMX state for subsequent x87 code
        mov     esp, ebp                ; restore stack (pops the coeff buffer)
.end:
        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret

; **********************************************************************
;
; void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
; {
;       unsigned i, j;
;       FLAC__int64 sum;
;
;       FLAC__ASSERT(order > 0);
;
;       for(i = 0; i < data_len; i++) {
;               sum = 0;
;               for(j = 0; j < order; j++)
;                       sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
;               residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
;       }
; }
        ALIGN   16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
        ;[esp + 40]     residual[]
        ;[esp + 36]     lp_quantization
        ;[esp + 32]     order
        ;[esp + 28]     qlp_coeff[]
        ;[esp + 24]     data_len
        ;[esp + 20]     data[]

        ;ASSERT(order > 0)
        ;ASSERT(order <= 32)
        ;ASSERT(lp_quantization <= 31)

        push    ebp
        push    ebx
        push    esi
        push    edi

        mov     ebx, [esp + 24]         ; ebx = data_len
        test    ebx, ebx
        jz      near .end               ; do nothing if data_len == 0

.begin:
        mov     eax, [esp + 32]         ; eax = order
        cmp     eax, 1
        jg      short .i_32

        ; special case order == 1: 64-bit product via one-operand imul
        mov     esi, [esp + 40]         ; esi = residual[]
        mov     edi, [esp + 20]         ; edi = data[]
        mov     ecx, [esp + 28]         ; ecx = qlp_coeff[]
        mov     ebp, [ecx]              ; ebp = qlp_coeff[0]
        mov     eax, [edi - 4]          ; eax = data[-1]
        mov     ecx, [esp + 36]         ; cl = lp_quantization
        ALIGN   16
.i_1_loop_i:
        imul    ebp                     ; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
        shrd    eax, edx, cl            ; 0 <= lp_quantization <= 15 (shrd itself handles 0..31)
        neg     eax
        add     eax, [edi]              ; eax = data[i] - (sum >> lp_quantization)
        mov     [esi], eax              ; residual[i] = eax
        mov     eax, [edi]              ; reload data[i]: it is data[i-1] next pass
        add     esi, 4
        add     edi, 4
        dec     ebx
        jnz     .i_1_loop_i

        jmp     .end

        ; helper: returns caller's EIP in eax — 32-bit PIC idiom
.mov_eip_to_eax:
        mov     eax, [esp]
        ret

.i_32:                                  ; eax = order
        ; computed goto: each unrolled term below is 10 bytes of code, so
        ; enter the sequence 10*order bytes before .jumper_0
        neg     eax
        add     eax, eax                ; eax = -2*order
        lea     ebp, [eax + eax * 4 + .jumper_0 - .get_eip0] ; -10*order + delta
        call    .mov_eip_to_eax
.get_eip0:
        add     ebp, eax                ; ebp = .jumper_0 - 10*order (position-independent)
        inc     ebp                     ; compensate for the shorter opcode on the last iteration
        mov     ebx, [esp + 28]         ; ebx = qlp_coeff[]
        mov     edi, [esp + 20]         ; edi = data[]
        sub     [esp + 40], edi         ; residual[] -= data[], so [edi+offset] == &residual[i]
        xor     ecx, ecx                ; ecx = low dword of sum = 0
        xor     esi, esi                ; esi = high dword of sum = 0
        jmp     ebp
        ;eax = --
        ;edx = --
        ; register annotations for the unrolled sequence below:
        ;ecx = 0
        ;esi = 0
        ;
        ;ebx = qlp_coeff[]
        ;edi = data[]
        ;ebp = @address (re-entry point of the computed goto)
        mov     eax, [ebx + 124]        ; eax = qlp_coeff[31]
        imul    dword [edi - 128]       ; edx:eax = qlp_coeff[31] * data[i-32]
        add     ecx, eax
        adc     esi, edx                ; sum += qlp_coeff[31] * data[i-32]
        mov     eax, [ebx + 120]        ; eax = qlp_coeff[30]
        imul    dword [edi - 124]       ; edx:eax = qlp_coeff[30] * data[i-31]
        add     ecx, eax
        adc     esi, edx                ; sum += qlp_coeff[30] * data[i-31]
        ; (same 10-byte pattern repeats for qlp_coeff[29] .. qlp_coeff[1])
        mov     eax, [ebx + 116]
        imul    dword [edi - 120]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 112]
        imul    dword [edi - 116]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 108]
        imul    dword [edi - 112]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 104]
        imul    dword [edi - 108]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 100]
        imul    dword [edi - 104]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 96]
        imul    dword [edi - 100]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 92]
        imul    dword [edi - 96]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 88]
        imul    dword [edi - 92]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 84]
        imul    dword [edi - 88]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 80]
        imul    dword [edi - 84]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 76]
        imul    dword [edi - 80]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 72]
        imul    dword [edi - 76]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 68]
        imul    dword [edi - 72]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 64]
        imul    dword [edi - 68]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 60]
        imul    dword [edi - 64]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 56]
        imul    dword [edi - 60]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 52]
        imul    dword [edi - 56]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 48]
        imul    dword [edi - 52]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 44]
        imul    dword [edi - 48]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 40]
        imul    dword [edi - 44]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 36]
        imul    dword [edi - 40]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 32]
        imul    dword [edi - 36]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 28]
        imul    dword [edi - 32]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 24]
        imul    dword [edi - 28]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 20]
        imul    dword [edi - 24]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 16]
        imul    dword [edi - 20]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 12]
        imul    dword [edi - 16]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 8]
        imul    dword [edi - 12]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 4]
        imul    dword [edi - 8]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx]              ; eax = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
        imul    dword [edi - 4]         ; edx:eax = qlp_coeff[ 0] * data[i- 1]
        add     ecx, eax
        adc     esi, edx                ; sum += qlp_coeff[ 0] * data[i- 1]
.jumper_0:
        mov     edx, ecx                ;esi:edx = sum (move low dword into edx)
        mov     ecx, [esp + 36]         ; cl = lp_quantization
        shrd    edx, esi, cl            ; edx = (FLAC__int32)(sum >> lp_quantization)
        ;eax = --
        ;ecx = --
        ;edx = sum >> lp_q
        ;esi = --
        neg     edx                     ; edx = -(sum >> lp_quantization)
        mov     eax, [esp + 40]         ; eax = residual[] - data[]
        add     edx, [edi]              ; edx = data[i] - (sum >> lp_quantization)
        mov     [edi + eax], edx        ; residual[i] = edx
        add     edi, 4

        dec     dword [esp + 24]        ; data_len--
        jz      short .end
        xor     ecx, ecx                ; sum = 0 for the next sample
        xor     esi, esi
        jmp     ebp

.end:
        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret

; **********************************************************************
;
; void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
; {
;       unsigned i, j;
;       FLAC__int64 sum;
;
;       FLAC__ASSERT(order > 0);
;
;       for(i = 0; i < data_len; i++) {
;               sum = 0;
;               for(j = 0; j < order; j++)
;                       sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
;               data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
;       }
; }
        ALIGN   16
cident FLAC__lpc_restore_signal_wide_asm_ia32
        ;[esp + 40]     data[]
        ;[esp + 36]     lp_quantization
        ;[esp + 32]     order
        ;[esp + 28]     qlp_coeff[]
        ;[esp + 24]     data_len
        ;[esp + 20]     residual[]

        ;ASSERT(order > 0)
        ;ASSERT(order <= 32)
        ;ASSERT(lp_quantization <= 31)

        push    ebp
        push    ebx
        push    esi
        push    edi

        mov     ebx, [esp + 24]         ; ebx = data_len
        test    ebx, ebx
        jz      near .end               ; do nothing if data_len == 0

.begin:
        mov     eax, [esp + 32]         ; eax = order
        ; (eax = order, loaded just above)
        cmp     eax, 1
        jg      short .x87_32

        ; special case order == 1: 64-bit product via one-operand imul
        mov     esi, [esp + 20]         ; esi = residual[]
        mov     edi, [esp + 40]         ; edi = data[]
        mov     ecx, [esp + 28]         ; ecx = qlp_coeff[]
        mov     ebp, [ecx]              ; ebp = qlp_coeff[0]
        mov     eax, [edi - 4]          ; eax = data[-1]
        mov     ecx, [esp + 36]         ; cl = lp_quantization
        ALIGN   16
.x87_1_loop_i:
        imul    ebp                     ; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
        shrd    eax, edx, cl            ; 0 <= lp_quantization <= 15 (shrd itself handles 0..31)
;
        add     eax, [esi]              ; eax = residual[i] + (sum >> lp_quantization)
        mov     [edi], eax              ; data[i] = eax (becomes data[i-1] next pass)
;
        add     esi, 4
        add     edi, 4
        dec     ebx
        jnz     .x87_1_loop_i

        jmp     .end

        ; helper: returns caller's EIP in eax — 32-bit PIC idiom
.mov_eip_to_eax:
        mov     eax, [esp]
        ret

.x87_32:                                ; eax = order
        ; computed goto: each unrolled term below is 10 bytes of code, so
        ; enter the sequence 10*order bytes before .jumper_0
        neg     eax
        add     eax, eax                ; eax = -2*order
        lea     ebp, [eax + eax * 4 + .jumper_0 - .get_eip0] ; -10*order + delta
        call    .mov_eip_to_eax
.get_eip0:
        add     ebp, eax                ; ebp = .jumper_0 - 10*order (position-independent)
        inc     ebp                     ; compensate for the shorter opcode on the last iteration
        mov     ebx, [esp + 28]         ; ebx = qlp_coeff[]
        mov     edi, [esp + 40]         ; edi = data[]
        sub     [esp + 20], edi         ; residual[] -= data[], so [edi+offset] == &residual[i]
        xor     ecx, ecx                ; ecx = low dword of sum = 0
        xor     esi, esi                ; esi = high dword of sum = 0
        jmp     ebp
        ;eax = --
        ;edx = --
        ;ecx = 0
        ;esi = 0
        ;
        ;ebx = qlp_coeff[]
        ;edi = data[]
        ;ebp = @address (re-entry point of the computed goto)
        mov     eax, [ebx + 124]        ; eax = qlp_coeff[31]
        imul    dword [edi - 128]       ; edx:eax = qlp_coeff[31] * data[i-32]
        add     ecx, eax
        adc     esi, edx                ; sum += qlp_coeff[31] * data[i-32]
        mov     eax, [ebx + 120]        ; eax = qlp_coeff[30]
        imul    dword [edi - 124]       ; edx:eax = qlp_coeff[30] * data[i-31]
        add     ecx, eax
        adc     esi, edx                ; sum += qlp_coeff[30] * data[i-31]
        ; (same 10-byte pattern repeats for qlp_coeff[29] .. qlp_coeff[1])
        mov     eax, [ebx + 116]
        imul    dword [edi - 120]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 112]
        imul    dword [edi - 116]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 108]
        imul    dword [edi - 112]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 104]
        imul    dword [edi - 108]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 100]
        imul    dword [edi - 104]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 96]
        imul    dword [edi - 100]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 92]
        imul    dword [edi - 96]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 88]
        imul    dword [edi - 92]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 84]
        imul    dword [edi - 88]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 80]
        imul    dword [edi - 84]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 76]
        imul    dword [edi - 80]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 72]
        imul    dword [edi - 76]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 68]
        imul    dword [edi - 72]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 64]
        imul    dword [edi - 68]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 60]
        imul    dword [edi - 64]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 56]
        imul    dword [edi - 60]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 52]
        imul    dword [edi - 56]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 48]
        imul    dword [edi - 52]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 44]
        imul    dword [edi - 48]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 40]
        imul    dword [edi - 44]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 36]
        imul    dword [edi - 40]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 32]
        imul    dword [edi - 36]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 28]
        imul    dword [edi - 32]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 24]
        imul    dword [edi - 28]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 20]
        imul    dword [edi - 24]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 16]
        imul    dword [edi - 20]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 12]
        imul    dword [edi - 16]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 8]
        imul    dword [edi - 12]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx + 4]
        imul    dword [edi - 8]
        add     ecx, eax
        adc     esi, edx
        mov     eax, [ebx]              ; eax = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
        imul    dword [edi - 4]         ; edx:eax = qlp_coeff[ 0] * data[i- 1]
        add     ecx, eax
        adc     esi, edx                ; sum += qlp_coeff[ 0] * data[i- 1]
.jumper_0:
        mov     edx, ecx                ;esi:edx = sum (move low dword into edx)
        mov     ecx, [esp + 36]         ; cl = lp_quantization
        shrd    edx, esi, cl            ; edx = (FLAC__int32)(sum >> lp_quantization)
        ;eax = --
        ;ecx = --
        ;edx = sum >> lp_q
        ;esi = --
;
        mov     eax, [esp + 20]         ; eax = residual[] - data[]
        add     edx, [edi + eax]        ; edx = residual[i] + (sum >> lp_quantization)
        mov     [edi], edx              ; data[i] = residual[i] + (sum >> lp_quantization)
        add     edi, 4

        dec     dword [esp + 24]        ; data_len--
        jz      short .end
        xor     ecx, ecx                ; sum = 0 for the next sample
        xor     esi, esi
        jmp     ebp

.end:
        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret

; end