axmol/external/jpeg/simd/i386/jcphuff-sse2.asm

663 lines
18 KiB
NASM

;
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding. See jcphuff.c for more details.
%include "jsimdext.inc"
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
; --------------------------------------------------------------------------
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
; jsimd_encode_mcu_AC_refine_prepare_sse2()
%macro LOAD16 0
pxor N0, N0
pxor N1, N1
mov T0, INT [LUT + 0*SIZEOF_INT]
mov T1, INT [LUT + 8*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 0
pinsrw X1, word [BLOCK + T1 * 2], 0
mov T0, INT [LUT + 1*SIZEOF_INT]
mov T1, INT [LUT + 9*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 1
pinsrw X1, word [BLOCK + T1 * 2], 1
mov T0, INT [LUT + 2*SIZEOF_INT]
mov T1, INT [LUT + 10*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 2
pinsrw X1, word [BLOCK + T1 * 2], 2
mov T0, INT [LUT + 3*SIZEOF_INT]
mov T1, INT [LUT + 11*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 3
pinsrw X1, word [BLOCK + T1 * 2], 3
mov T0, INT [LUT + 4*SIZEOF_INT]
mov T1, INT [LUT + 12*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 4
pinsrw X1, word [BLOCK + T1 * 2], 4
mov T0, INT [LUT + 5*SIZEOF_INT]
mov T1, INT [LUT + 13*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 5
pinsrw X1, word [BLOCK + T1 * 2], 5
mov T0, INT [LUT + 6*SIZEOF_INT]
mov T1, INT [LUT + 14*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 6
pinsrw X1, word [BLOCK + T1 * 2], 6
mov T0, INT [LUT + 7*SIZEOF_INT]
mov T1, INT [LUT + 15*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 7
pinsrw X1, word [BLOCK + T1 * 2], 7
%endmacro
%macro LOAD15 0
pxor N0, N0
pxor N1, N1
pxor X1, X1
mov T0, INT [LUT + 0*SIZEOF_INT]
mov T1, INT [LUT + 8*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 0
pinsrw X1, word [BLOCK + T1 * 2], 0
mov T0, INT [LUT + 1*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 1
mov T0, INT [LUT + 2*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 2
mov T0, INT [LUT + 3*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 3
mov T0, INT [LUT + 4*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 4
mov T0, INT [LUT + 5*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 5
mov T0, INT [LUT + 6*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 6
mov T0, INT [LUT + 7*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 7
cmp LENEND, 2
jl %%.ELOAD15
mov T1, INT [LUT + 9*SIZEOF_INT]
pinsrw X1, word [BLOCK + T1 * 2], 1
cmp LENEND, 3
jl %%.ELOAD15
mov T1, INT [LUT + 10*SIZEOF_INT]
pinsrw X1, word [BLOCK + T1 * 2], 2
cmp LENEND, 4
jl %%.ELOAD15
mov T1, INT [LUT + 11*SIZEOF_INT]
pinsrw X1, word [BLOCK + T1 * 2], 3
cmp LENEND, 5
jl %%.ELOAD15
mov T1, INT [LUT + 12*SIZEOF_INT]
pinsrw X1, word [BLOCK + T1 * 2], 4
cmp LENEND, 6
jl %%.ELOAD15
mov T1, INT [LUT + 13*SIZEOF_INT]
pinsrw X1, word [BLOCK + T1 * 2], 5
cmp LENEND, 7
jl %%.ELOAD15
mov T1, INT [LUT + 14*SIZEOF_INT]
pinsrw X1, word [BLOCK + T1 * 2], 6
%%.ELOAD15:
%endmacro
%macro LOAD8 0
pxor N0, N0
mov T0, INT [LUT + 0*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 0
mov T0, INT [LUT + 1*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 1
mov T0, INT [LUT + 2*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 2
mov T0, INT [LUT + 3*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 3
mov T0, INT [LUT + 4*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 4
mov T0, INT [LUT + 5*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 5
mov T0, INT [LUT + 6*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 6
mov T0, INT [LUT + 7*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 7
%endmacro
%macro LOAD7 0
pxor N0, N0
pxor X0, X0
mov T1, INT [LUT + 0*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 0
cmp LENEND, 2
jl %%.ELOAD7
mov T1, INT [LUT + 1*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 1
cmp LENEND, 3
jl %%.ELOAD7
mov T1, INT [LUT + 2*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 2
cmp LENEND, 4
jl %%.ELOAD7
mov T1, INT [LUT + 3*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 3
cmp LENEND, 5
jl %%.ELOAD7
mov T1, INT [LUT + 4*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 4
cmp LENEND, 6
jl %%.ELOAD7
mov T1, INT [LUT + 5*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 5
cmp LENEND, 7
jl %%.ELOAD7
mov T1, INT [LUT + 6*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 6
%%.ELOAD7:
%endmacro
%macro REDUCE0 0
movdqa xmm0, XMMWORD [VALUES + ( 0*2)]
movdqa xmm1, XMMWORD [VALUES + ( 8*2)]
movdqa xmm2, XMMWORD [VALUES + (16*2)]
movdqa xmm3, XMMWORD [VALUES + (24*2)]
movdqa xmm4, XMMWORD [VALUES + (32*2)]
movdqa xmm5, XMMWORD [VALUES + (40*2)]
movdqa xmm6, XMMWORD [VALUES + (48*2)]
pcmpeqw xmm0, ZERO
pcmpeqw xmm1, ZERO
pcmpeqw xmm2, ZERO
pcmpeqw xmm3, ZERO
pcmpeqw xmm4, ZERO
pcmpeqw xmm5, ZERO
pcmpeqw xmm6, ZERO
pcmpeqw xmm7, XMMWORD [VALUES + (56*2)]
packsswb xmm0, xmm1
packsswb xmm2, xmm3
packsswb xmm4, xmm5
packsswb xmm6, xmm7
pmovmskb eax, xmm0
pmovmskb ecx, xmm2
pmovmskb edx, xmm4
pmovmskb esi, xmm6
shl ecx, 16
shl esi, 16
or eax, ecx
or edx, esi
not eax
not edx
mov edi, ZEROBITS
mov INT [edi], eax
mov INT [edi+SIZEOF_INT], edx
%endmacro
;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
; const int *jpeg_natural_order_start,
; int Sl, int Al, JCOEF *values,
; size_t *zerobits)
;
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *values
; eax + 28 = size_t *zerobits
%define ZERO xmm7
%define X0 xmm0
%define X1 xmm1
%define N0 xmm2
%define N1 xmm3
%define AL xmm4
%define K eax
%define LENEND eax
%define LUT ebx
%define T0 ecx
%define T1 edx
%define BLOCK esi
%define VALUES edi
%define LEN ebp
%define ZEROBITS INT [esp + 5 * 4]
align 32
GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
sub esp, 4
push ebx
push ecx
; push edx ; need not be preserved
push esi
push edi
push ebp
mov BLOCK, INT [eax + 8]
mov LUT, INT [eax + 12]
mov VALUES, INT [eax + 24]
movd AL, INT [eax + 20]
mov T0, INT [eax + 28]
mov ZEROBITS, T0
mov LEN, INT [eax + 16]
pxor ZERO, ZERO
mov K, LEN
and K, -16
shr K, 4
jz .ELOOP16
.BLOOP16:
LOAD16
pcmpgtw N0, X0
pcmpgtw N1, X1
paddw X0, N0
paddw X1, N1
pxor X0, N0
pxor X1, N1
psrlw X0, AL
psrlw X1, AL
pxor N0, X0
pxor N1, X1
movdqa XMMWORD [VALUES + (0) * 2], X0
movdqa XMMWORD [VALUES + (8) * 2], X1
movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
add VALUES, 16*2
add LUT, 16*SIZEOF_INT
dec K
jnz .BLOOP16
test LEN, 15
je .PADDING
.ELOOP16:
mov LENEND, LEN
and LENEND, 7
test LEN, 8
jz .TRY7
test LEN, 7
jz .TRY8
LOAD15
pcmpgtw N0, X0
pcmpgtw N1, X1
paddw X0, N0
paddw X1, N1
pxor X0, N0
pxor X1, N1
psrlw X0, AL
psrlw X1, AL
pxor N0, X0
pxor N1, X1
movdqa XMMWORD [VALUES + (0) * 2], X0
movdqa XMMWORD [VALUES + (8) * 2], X1
movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
add VALUES, 16*2
jmp .PADDING
.TRY8:
LOAD8
pcmpgtw N0, X0
paddw X0, N0
pxor X0, N0
psrlw X0, AL
pxor N0, X0
movdqa XMMWORD [VALUES + (0) * 2], X0
movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
add VALUES, 8*2
jmp .PADDING
.TRY7:
LOAD7
pcmpgtw N0, X0
paddw X0, N0
pxor X0, N0
psrlw X0, AL
pxor N0, X0
movdqa XMMWORD [VALUES + (0) * 2], X0
movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
add VALUES, 8*2
.PADDING:
mov K, LEN
add K, 7
and K, -8
shr K, 3
sub K, DCTSIZE2/8
jz .EPADDING
align 16
.ZEROLOOP:
movdqa XMMWORD [VALUES + 0], ZERO
add VALUES, 8*2
inc K
jnz .ZEROLOOP
.EPADDING:
sub VALUES, DCTSIZE2*2
REDUCE0
pop ebp
pop edi
pop esi
; pop edx ; need not be preserved
pop ecx
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LUT
%undef T0
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN
;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; GLOBAL(int)
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
; const int *jpeg_natural_order_start,
; int Sl, int Al, JCOEF *absvalues,
; size_t *bits)
;
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *values
; eax + 28 = size_t *bits
%define ZERO xmm7
%define ONE xmm5
%define X0 xmm0
%define X1 xmm1
%define N0 xmm2
%define N1 xmm3
%define AL xmm4
%define K eax
%define LENEND eax
%define LUT ebx
%define T0 ecx
%define T0w cx
%define T1 edx
%define BLOCK esi
%define VALUES edi
%define KK ebp
%define ZEROBITS INT [esp + 5 * 4]
%define EOB INT [esp + 5 * 4 + 4]
%define LEN INT [esp + 5 * 4 + 8]
align 32
GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
sub esp, 16
push ebx
push ecx
; push edx ; need not be preserved
push esi
push edi
push ebp
pcmpeqw ONE, ONE
psrlw ONE, 15
mov BLOCK, INT [eax + 8]
mov LUT, INT [eax + 12]
mov VALUES, INT [eax + 24]
movd AL, INT [eax + 20]
mov T0, INT [eax + 28]
mov K, INT [eax + 16]
mov INT [T0 + 2 * SIZEOF_INT], -1
mov INT [T0 + 3 * SIZEOF_INT], -1
mov ZEROBITS, T0
mov LEN, K
pxor ZERO, ZERO
and K, -16
mov EOB, 0
xor KK, KK
shr K, 4
jz .ELOOPR16
.BLOOPR16:
LOAD16
pcmpgtw N0, X0
pcmpgtw N1, X1
paddw X0, N0
paddw X1, N1
pxor X0, N0
pxor X1, N1
psrlw X0, AL
psrlw X1, AL
movdqa XMMWORD [VALUES + (0) * 2], X0
movdqa XMMWORD [VALUES + (8) * 2], X1
pcmpeqw X0, ONE
pcmpeqw X1, ONE
packsswb N0, N1
packsswb X0, X1
pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
mov T1, ZEROBITS
not T0
mov word [T1 + 2 * SIZEOF_INT + KK], T0w
pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
jz .CONTINUER16 ; if (idx) {
lea T1, [T1+KK*8]
mov EOB, T1 ; EOB = k + idx;
.CONTINUER16:
add VALUES, 16*2
add LUT, 16*SIZEOF_INT
add KK, 2
dec K
jnz .BLOOPR16
test LEN, 15
je .PADDINGR
.ELOOPR16:
mov LENEND, LEN
test LENEND, 8
jz .TRYR7
test LENEND, 7
jz .TRYR8
and LENEND, 7
LOAD15
pcmpgtw N0, X0
pcmpgtw N1, X1
paddw X0, N0
paddw X1, N1
pxor X0, N0
pxor X1, N1
psrlw X0, AL
psrlw X1, AL
movdqa XMMWORD [VALUES + (0) * 2], X0
movdqa XMMWORD [VALUES + (8) * 2], X1
pcmpeqw X0, ONE
pcmpeqw X1, ONE
packsswb N0, N1
packsswb X0, X1
pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
mov T1, ZEROBITS
not T0
mov word [T1 + 2 * SIZEOF_INT + KK], T0w
pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
jz .CONTINUER15 ; if (idx) {
lea T1, [T1+KK*8]
mov EOB, T1 ; EOB = k + idx;
.CONTINUER15:
add VALUES, 16*2
jmp .PADDINGR
.TRYR8:
LOAD8
pcmpgtw N0, X0
paddw X0, N0
pxor X0, N0
psrlw X0, AL
movdqa XMMWORD [VALUES + (0) * 2], X0
pcmpeqw X0, ONE
packsswb N0, ZERO
packsswb X0, ZERO
pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
mov T1, ZEROBITS
not T0
mov word [T1 + 2 * SIZEOF_INT + KK], T0w
pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
jz .CONTINUER8 ; if (idx) {
lea T1, [T1+KK*8]
mov EOB, T1 ; EOB = k + idx;
.CONTINUER8:
add VALUES, 8*2
jmp .PADDINGR
.TRYR7:
and LENEND, 7
LOAD7
pcmpgtw N0, X0
paddw X0, N0
pxor X0, N0
psrlw X0, AL
movdqa XMMWORD [VALUES + (0) * 2], X0
pcmpeqw X0, ONE
packsswb N0, ZERO
packsswb X0, ZERO
pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
mov T1, ZEROBITS
not T0
mov word [T1 + 2 * SIZEOF_INT + KK], T0w
pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
jz .CONTINUER7 ; if (idx) {
lea T1, [T1+KK*8]
mov EOB, T1 ; EOB = k + idx;
.CONTINUER7:
add VALUES, 8*2
.PADDINGR:
mov K, LEN
add K, 7
and K, -8
shr K, 3
sub K, DCTSIZE2/8
jz .EPADDINGR
align 16
.ZEROLOOPR:
movdqa XMMWORD [VALUES + 0], ZERO
add VALUES, 8*2
inc K
jnz .ZEROLOOPR
.EPADDINGR:
sub VALUES, DCTSIZE2*2
REDUCE0
mov eax, EOB
pop ebp
pop edi
pop esi
; pop edx ; need not be preserved
pop ecx
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef SIGN
%undef LUT
%undef T0
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32