;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; asmblt24.asm
;
; Copy Bitmap to DirectDraw Surface (16bit color to 24bit color)
;
; for MSVC and GCC, use 'COFF' format:
;    nasmw -f win32 asmblt24.asm
;
; for BCC32, use 'OMF' format:
;    nasmw -f obj asmblt24.asm
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;-----------------------------------------------------------------------------
;
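; asmblit(BYTE *pSrc,    // source address
;         BYTE *pDst,    // destination address
;         int nSrcLen,   // bytes from one source line to the start of the next
;         int nDstLen,   // bytes from one destination line to the start of the next
;         int nWidth,    // width of the image to transfer (pixels)
;         int nHeight)   // height of the image to transfer (pixels)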
;
;-----------------------------------------------------------------------------
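;
; (The original notes were mis-encoded Japanese; the text below is an English
;  reconstruction and should be checked against an intact copy of the file.)
;
; Several variants were tried, but the speed difference between them is small.
; Feel free to optimize them further yourself.
;
; Caution:
;
; This code was written with only Visual C++ in mind.
; If you use another compiler, check which registers must be preserved
; across the call.
;
; Only the registers pushed by PUSHREGS (EDI, ESI, ECX, EDX, EBX, EBP) are saved.
;
;
; No 24bit display mode was available for testing, so these routines are unverified.
; They may not work correctly.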
;
;-----------------------------------------------------------------------------

        BITS 32

        SECTION .text USE32 CLASS=CODE

        EXTERN _brightmask50
        EXTERN _brightmask25
        EXTERN _palette_lookup

	GLOBAL _asmblit24
	GLOBAL _asmblit24_double
	GLOBAL _asmblit24_scanline0
	GLOBAL _asmblit24_scanline25
	GLOBAL _asmblit24_scanline50
	GLOBAL _asmblit24_scanline75

	GLOBAL _asmblit24_mmx
	GLOBAL _asmblit24_double_mmx
	GLOBAL _asmblit24_scanline0_mmx
	GLOBAL _asmblit24_scanline25_mmx
	GLOBAL _asmblit24_scanline50_mmx
	GLOBAL _asmblit24_scanline75_mmx

	GLOBAL _asmblit24_sse
	GLOBAL _asmblit24_double_sse
	GLOBAL _asmblit24_scanline0_sse
	GLOBAL _asmblit24_scanline25_sse
	GLOBAL _asmblit24_scanline50_sse
	GLOBAL _asmblit24_scanline75_sse


;-----------------------------------------------------------------------------
; Macros
;-----------------------------------------------------------------------------

; REGS is the number of registers pushed by PUSHREGS. The argument macros
; below add 4*REGS to skip the saved registers, and the "1+" term skips the
; return address, so they are only valid between PUSHREGS and POPREGS while
; esp is otherwise unchanged.
%define REGS    6			; number of pushed registers
%define pSrc    dword [esp+4*(1+REGS)]	; arg1
%define pDst    dword [esp+4*(2+REGS)]	; arg2
%define nSrcLen dword [esp+4*(3+REGS)]	; arg3
%define nDstLen dword [esp+4*(4+REGS)]	; arg4
%define nWidth  dword [esp+4*(5+REGS)]	; arg5
%define nHeight dword [esp+4*(6+REGS)]	; arg6

; Save the six registers the blit routines overwrite (eax is not saved).
; The number of pushes must equal REGS, and POPREGS must pop them in the
; reverse order.
%macro PUSHREGS 0
	push	edi
	push	esi
	push	ecx
	push	edx
	push	ebx
	push	ebp
%endmacro

; Restore the registers saved by PUSHREGS, in the reverse of the push order.
%macro POPREGS 0
	pop	ebp
	pop	ebx
	pop	edx
	pop	ecx
	pop	esi
	pop	edi
%endmacro


;-----------------------------------------------------------------------------
; 16bit to 24bit
;-----------------------------------------------------------------------------

	ALIGN 32
;-----------------------------------------------------------------------------
; _asmblit24(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; 1:1 blit. Every 16-bit source pixel is used as an index into the dword
; table whose address is stored at _palette_lookup; the looked-up value is
; written to the destination, which advances 3 bytes per pixel.
;
; Arguments are on the stack and 'ret' pops nothing (caller cleans up).
; The nWidth, pSrc and pDst argument slots are modified in place.
; Pixels are handled two at a time (nWidth >> 1 pairs per row).
;
; Registers: ebx = lookup table base, ebp = rows left, esi = pairs left,
;            edx = source cursor, edi = destination cursor.
; Clobbers:  eax, flags (everything else is saved by PUSHREGS).
;
; NOTE(review): each store is 4 bytes wide, so the second store of a pair
; covers one byte beyond that pair's 6 bytes.
;-----------------------------------------------------------------------------
_asmblit24:
	PUSHREGS

	mov		ebx, [_palette_lookup]
	mov		ebp, nHeight
	shr		nWidth, 1			; pixel pairs per row

	ALIGN 16
.next_row:
	mov		esi, nWidth
	mov		edi, pDst
	mov		edx, pSrc

	ALIGN 16
.next_pair:
	movzx		eax, word [edx]			; first pixel of the pair
	movzx		ecx, word [edx + 2]		; second pixel of the pair
	add		edx, 4
	mov		eax, [ebx + eax * 4]
	mov		ecx, [ebx + ecx * 4]
	mov		[edi], eax
	mov		[edi + 3], ecx			; overwrites the 4th byte of the store above
	add		edi, 6
	dec		esi
	jnz		short .next_pair

	mov		ecx, nDstLen
	mov		eax, nSrcLen
	add		pDst, ecx			; step both row pointers to the next line
	add		pSrc, eax
	dec		ebp
	jnz		short .next_row

	POPREGS
	ret


;-----------------------------------------------------------------------------
; 16bit to 24bit 2x2 pixel
;-----------------------------------------------------------------------------

	ALIGN 32
;-----------------------------------------------------------------------------
; _asmblit24_double(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; 2x2 blit. Each source pixel is looked up through the dword table whose
; address is stored at _palette_lookup and written twice horizontally; the
; same source row is then converted a second time into the next destination
; row. The destination therefore advances two rows per source row.
;
; Arguments are on the stack and 'ret' pops nothing (caller cleans up).
; The nWidth, pSrc and pDst argument slots are modified in place.
;
; Registers: ebx = lookup table base, ebp = source rows left,
;            esi = pixel pairs left, edx = source cursor, edi = dest cursor.
; Clobbers:  eax, flags.
;
; NOTE(review): stores are 4 bytes wide at 3-byte spacing, so the last store
; of each pair covers one byte beyond the pair's 12 output bytes.
;-----------------------------------------------------------------------------
_asmblit24_double:
	PUSHREGS

	mov		ebx, [_palette_lookup]
	mov		ebp, nHeight
	shr		nWidth, 1			; pixel pairs per row

	ALIGN 16
.src_row:
	mov		esi, nWidth
	mov		edi, pDst
	mov		edx, pSrc

	ALIGN 16
.upper_pair:
	movzx		eax, word [edx]
	movzx		ecx, word [edx + 2]
	add		edx, 4
	mov		eax, [ebx + eax * 4]
	mov		ecx, [ebx + ecx * 4]
	mov		[edi], eax			; first pixel, twice
	mov		[edi + 3], eax
	mov		[edi + 6], ecx			; second pixel, twice
	mov		[edi + 9], ecx
	add		edi, 12
	dec		esi
	jnz		short .upper_pair

	; second destination row: same source row again
	mov		eax, nDstLen
	add		pDst, eax
	mov		esi, nWidth
	mov		edi, pDst
	mov		edx, pSrc

	ALIGN 16
.lower_pair:
	movzx		eax, word [edx]
	movzx		ecx, word [edx + 2]
	add		edx, 4
	mov		eax, [ebx + eax * 4]
	mov		ecx, [ebx + ecx * 4]
	mov		[edi], eax
	mov		[edi + 3], eax
	mov		[edi + 6], ecx
	mov		[edi + 9], ecx
	add		edi, 12
	dec		esi
	jnz		short .lower_pair

	mov		ecx, nDstLen
	mov		eax, nSrcLen
	add		pDst, ecx
	add		pSrc, eax
	dec		ebp
	jnz		near .src_row

	POPREGS
	ret


;-----------------------------------------------------------------------------
; 16bit to 24bit 0% scanlines
;-----------------------------------------------------------------------------

	ALIGN 32
;-----------------------------------------------------------------------------
; _asmblit24_scanline0(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; Horizontally doubled blit that writes only every second destination row:
; nDstLen is doubled up front, so the rows in between are never touched by
; this routine. Pixels are converted through the dword table whose address
; is stored at _palette_lookup.
;
; Arguments are on the stack and 'ret' pops nothing (caller cleans up).
; The nDstLen, nWidth, pSrc and pDst argument slots are modified in place.
;
; Registers: ebx = lookup table base, ebp = source rows left,
;            esi = pixel pairs left, edx = source cursor, edi = dest cursor.
; Clobbers:  eax, flags.
;-----------------------------------------------------------------------------
_asmblit24_scanline0:
	PUSHREGS

	mov		ebx, [_palette_lookup]
	mov		ebp, nHeight
	shr		nWidth, 1			; pixel pairs per row
	shl		nDstLen, 1			; advance two destination rows per source row

	ALIGN 16
.src_row:
	mov		esi, nWidth
	mov		edi, pDst
	mov		edx, pSrc

	ALIGN 16
.pair:
	movzx		eax, word [edx]
	movzx		ecx, word [edx + 2]
	add		edx, 4
	mov		eax, [ebx + eax * 4]
	mov		ecx, [ebx + ecx * 4]
	mov		[edi], eax			; first pixel, twice
	mov		[edi + 3], eax
	mov		[edi + 6], ecx			; second pixel, twice
	mov		[edi + 9], ecx
	add		edi, 12
	dec		esi
	jnz		short .pair

	mov		ecx, nDstLen
	mov		eax, nSrcLen
	add		pDst, ecx
	add		pSrc, eax
	dec		ebp
	jnz		short .src_row

	POPREGS
	ret


;-----------------------------------------------------------------------------
; 16bit to 24bit 25% scanlines
;-----------------------------------------------------------------------------

	ALIGN 32
;-----------------------------------------------------------------------------
; _asmblit24_scanline25(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; 2x2 blit with a darkened second row. Each source row is written once at
; full brightness (pixels doubled horizontally) and once more into the next
; destination row as (pixel >> 2) & brightmask25.
;
; FIX: the darkened row previously used a shift of 1 with brightmask25,
; unlike the MMX/SSE versions of this routine, which shift by 2 before
; applying the same mask. The shift is now 2 so all three variants agree.
;
; Arguments are on the stack and 'ret' pops nothing (caller cleans up).
; The nWidth, nHeight, pSrc and pDst argument slots are modified in place
; (nHeight is the row counter because ebp holds the mask).
;
; Registers: ebx = lookup table base (from _palette_lookup),
;            ebp = low dword of _brightmask25, esi = pixel pairs left,
;            edx = source cursor, edi = destination cursor.
; Clobbers:  eax, flags.
;-----------------------------------------------------------------------------
_asmblit24_scanline25:
	PUSHREGS

	shr		nWidth, 1			; pixel pairs per row
	mov		ebx, [_palette_lookup]
	mov		ebp, [_brightmask25]

	ALIGN 16
L1@1624221250:
	mov		edx, pSrc
	mov		edi, pDst
	mov		esi, nWidth

	; full-brightness row
	ALIGN 16
L2@1624221250:
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff			; eax = first pixel of the pair
	shr		ecx, 16				; ecx = second pixel of the pair
	mov		eax, [ebx + eax * 4]
	mov		ecx, [ebx + ecx * 4]
	mov		[edi], eax
	mov		[edi + 3], eax
	mov		[edi + 6], ecx
	mov		[edi + 9], ecx
	add		edx, 4
	add		edi, 12
	dec		esi
	jnz		short L2@1624221250

	; darkened row: same source row, next destination row
	mov		eax, nDstLen
	mov		edx, pSrc
	add		pDst, eax
	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
L3@1624221250:
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	mov		eax, [ebx + eax * 4]
	mov		ecx, [ebx + ecx * 4]
	shr		eax, 2				; quarter brightness (was 1)
	shr		ecx, 2
	and		eax, ebp			; mask off bits shifted in from the neighbouring channel
	and		ecx, ebp
	mov		[edi], eax
	mov		[edi + 3], eax
	mov		[edi + 6], ecx
	mov		[edi + 9], ecx
	add		edx, 4
	add		edi, 12
	dec		esi
	jnz		short L3@1624221250

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax
	add		pDst, ecx
	dec		nHeight
	jnz		near L1@1624221250

	POPREGS
	ret


;-----------------------------------------------------------------------------
; 16bit to 24bit 50% scanlines
;-----------------------------------------------------------------------------

	ALIGN 32
;-----------------------------------------------------------------------------
; _asmblit24_scanline50(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; 2x2 blit with a darkened second row. Each source row is written once at
; full brightness (pixels doubled horizontally) and once more into the next
; destination row as (pixel >> 1) & brightmask50.
;
; Arguments are on the stack and 'ret' pops nothing (caller cleans up).
; The nWidth, nHeight, pSrc and pDst argument slots are modified in place
; (nHeight is the row counter because ebp holds the mask).
;
; Registers: ebx = lookup table base (from _palette_lookup),
;            ebp = low dword of _brightmask50, esi = pixel pairs left,
;            edx = source cursor, edi = destination cursor.
; Clobbers:  eax, flags.
;-----------------------------------------------------------------------------
_asmblit24_scanline50:
	PUSHREGS

	mov		ebp, [_brightmask50]
	mov		ebx, [_palette_lookup]
	shr		nWidth, 1			; pixel pairs per row

	ALIGN 16
.src_row:
	mov		esi, nWidth
	mov		edi, pDst
	mov		edx, pSrc

	; full-brightness row
	ALIGN 16
.bright_pair:
	movzx		eax, word [edx]
	movzx		ecx, word [edx + 2]
	add		edx, 4
	mov		eax, [ebx + eax * 4]
	mov		ecx, [ebx + ecx * 4]
	mov		[edi], eax
	mov		[edi + 3], eax
	mov		[edi + 6], ecx
	mov		[edi + 9], ecx
	add		edi, 12
	dec		esi
	jnz		short .bright_pair

	; darkened row: same source row, next destination row
	mov		eax, nDstLen
	add		pDst, eax
	mov		esi, nWidth
	mov		edi, pDst
	mov		edx, pSrc

	ALIGN 16
.dim_pair:
	movzx		eax, word [edx]
	movzx		ecx, word [edx + 2]
	add		edx, 4
	mov		eax, [ebx + eax * 4]
	mov		ecx, [ebx + ecx * 4]
	shr		eax, 1				; half brightness
	and		eax, ebp
	shr		ecx, 1
	and		ecx, ebp
	mov		[edi], eax
	mov		[edi + 3], eax
	mov		[edi + 6], ecx
	mov		[edi + 9], ecx
	add		edi, 12
	dec		esi
	jnz		short .dim_pair

	mov		ecx, nDstLen
	mov		eax, nSrcLen
	add		pDst, ecx
	add		pSrc, eax
	dec		nHeight
	jnz		near .src_row

	POPREGS
	ret


;-----------------------------------------------------------------------------
; 16bit to 24bit 75% scanlines
;-----------------------------------------------------------------------------

	ALIGN 32
;-----------------------------------------------------------------------------
; _asmblit24_scanline75(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; 2x2 blit with a darkened second row. Each source row is written once at
; full brightness (pixels doubled horizontally) and once more into the next
; destination row as
;     ((pixel >> 1) & brightmask50) + ((pixel >> 2) & brightmask25)
;
; FIX: the brightmask25 term previously used a shift of 1, unlike the
; MMX/SSE versions of this routine, which shift by 2 before applying
; brightmask25. The shift is now 2 so all three variants agree.
;
; Arguments are on the stack and 'ret' pops nothing (caller cleans up).
; The nWidth, nHeight, pSrc and pDst argument slots are modified in place
; (nHeight is the row counter because ebp is used as a temporary).
;
; Registers: ebx = lookup table base (from _palette_lookup),
;            ebp = temporary for the 25% term, esi = pixel pairs left,
;            edx = source cursor, edi = destination cursor.
; Clobbers:  eax, flags.
;-----------------------------------------------------------------------------
_asmblit24_scanline75:
	PUSHREGS

	shr		nWidth, 1			; pixel pairs per row
	mov		ebx, [_palette_lookup]

	ALIGN 16
L1@1624221750:
	mov		edx, pSrc
	mov		edi, pDst
	mov		esi, nWidth

	; full-brightness row
	ALIGN 16
L2@1624221750:
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff			; eax = first pixel of the pair
	shr		ecx, 16				; ecx = second pixel of the pair
	mov		eax, [ebx + eax * 4]
	mov		ecx, [ebx + ecx * 4]
	mov		[edi], eax
	mov		[edi + 3], eax
	mov		[edi + 6], ecx
	mov		[edi + 9], ecx
	add		edx, 4
	add		edi, 12
	dec		esi
	jnz		short L2@1624221750

	; darkened row: same source row, next destination row
	mov		eax, nDstLen
	mov		edx, pSrc
	add		pDst, eax
	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
L3@1624221750:
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	mov		eax, [ebx + eax * 4]
	mov		ecx, [ebx + ecx * 4]
	; first pixel: 50% term + 25% term
	mov		ebp, eax
	shr		eax, 1
	shr		ebp, 2				; quarter brightness (was 1)
	and		eax, [_brightmask50]
	and		ebp, [_brightmask25]
	add		eax, ebp
	; second pixel: 50% term + 25% term
	mov		ebp, ecx
	shr		ecx, 1
	shr		ebp, 2				; quarter brightness (was 1)
	and		ecx, [_brightmask50]
	and		ebp, [_brightmask25]
	add		ecx, ebp
	mov		[edi], eax
	mov		[edi + 3], eax
	mov		[edi + 6], ecx
	mov		[edi + 9], ecx
	add		edx, 4
	add		edi, 12
	dec		esi
	jnz		short L3@1624221750

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax
	add		pDst, ecx
	dec		nHeight
	jnz		near L1@1624221750

	POPREGS
	ret


;*****************************************************************************

;-----------------------------------------------------------------------------
; 16bit to 24bit with MMX
;-----------------------------------------------------------------------------

	ALIGN 32
;-----------------------------------------------------------------------------
; _asmblit24_mmx(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; 1:1 blit using MMX to pack output. Each inner iteration reads 8 source
; pixels (16 bytes), looks each one up in the dword table whose address is
; stored at _palette_lookup, and writes them as three 8-byte stores
; (24 bytes, 3 bytes per pixel). nWidth >> 3 iterations run per row.
;
; Arguments are on the stack and 'ret' pops nothing (caller cleans up).
; The nWidth, pSrc and pDst argument slots are modified in place.
;
; Registers: ebx = lookup table base, ebp = rows left, esi = groups left,
;            edx = source cursor, edi = destination cursor.
; Clobbers:  eax, mm0-mm3, flags. emms is executed before returning.
;
; NOTE(review): the packing ORs shifted lookups together, so it presumably
; relies on the top byte of every table entry being zero - confirm against
; the code that builds the table.
;-----------------------------------------------------------------------------
_asmblit24_mmx:
	PUSHREGS

	mov		ebp, nHeight
	shr		nWidth, 3			; groups of 8 pixels per row
	mov		ebx, [_palette_lookup]

	ALIGN 16
L1@162411001:
	mov		edx, pSrc
	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
L2@162411001:
	; pixels 0,1
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	psllq		mm1, 24				; pixel 1 -> bytes 3..5
	por		mm0, mm1
	; pixels 2,3
	mov		eax, [edx + 4]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm1, [ebx + eax * 4]
	movd		mm2, [ebx + ecx * 4]
	movq		mm3, mm1			; keep pixel 2 for its third byte
	psllq		mm1, 48				; low two bytes of pixel 2 -> bytes 6..7
	por		mm0, mm1
	movq		[edi], mm0			; out bytes 0..7
	psrlq		mm3, 16				; third byte of pixel 2 -> byte 0
	psllq		mm2, 8				; pixel 3 -> bytes 1..3
	por		mm2, mm3
	; pixels 4,5
	mov		eax, [edx + 8]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	psllq		mm0, 32				; pixel 4 -> bytes 4..6
	por		mm0, mm2
	movq		mm2, mm1			; keep pixel 5 for its upper bytes
	psllq		mm1, 56				; low byte of pixel 5 -> byte 7
	por		mm0, mm1
	movq		[edi + 8], mm0			; out bytes 8..15
	; pixels 6,7
	mov		eax, [edx + 12]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	psllq		mm0, 16				; pixel 6 -> bytes 2..4
	psrlq		mm2, 8				; upper two bytes of pixel 5 -> bytes 0..1
	por		mm0, mm2
	psllq		mm1, 40				; pixel 7 -> bytes 5..7
	por		mm0, mm1
	movq		[edi + 16], mm0			; out bytes 16..23
	add		edx, 16
	add		edi, 24
	dec		esi
	jnz		near L2@162411001

	; advance both row pointers to the next line
	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax
	add		pDst, ecx
	dec		ebp
	jnz		near L1@162411001

	POPREGS
	emms
	ret


;-----------------------------------------------------------------------------
; 16bit to 24bit 2x2 pixel with MMX
;-----------------------------------------------------------------------------

	ALIGN 32
;-----------------------------------------------------------------------------
; _asmblit24_double_mmx(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; 2x2 blit using MMX to pack output. Each inner iteration reads 4 source
; pixels (8 bytes), looks them up in the dword table whose address is stored
; at _palette_lookup, and writes each pixel twice (8 output pixels, 24 bytes)
; as three 8-byte stores. The same source row is then converted again into
; the next destination row.
;
; Arguments are on the stack and 'ret' pops nothing (caller cleans up).
; The nWidth, pSrc and pDst argument slots are modified in place.
;
; Registers: ebx = lookup table base, ebp = source rows left,
;            esi = groups of 4 pixels left, edx = source cursor,
;            edi = destination cursor.
; Clobbers:  eax, mm0-mm3, flags. emms is executed before returning.
;
; NOTE(review): the packing ORs shifted lookups together, so it presumably
; relies on the top byte of every table entry being zero - confirm.
;-----------------------------------------------------------------------------
_asmblit24_double_mmx:
	PUSHREGS

	mov		ebp, nHeight
	shr		nWidth, 2			; groups of 4 source pixels per row
	mov		ebx, [_palette_lookup]

	ALIGN 16
L1@162422001:
	mov		edx, pSrc
	mov		edi, pDst
	mov		esi, nWidth

	; first destination row
	ALIGN 16
L2@162422001:
	; pixels 0,1
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm2, mm0
	movq		mm3, mm1
	psllq		mm2, 24				; second copy of pixel 0 -> bytes 3..5
	por		mm0, mm2
	psllq		mm1, 48				; low two bytes of pixel 1 -> bytes 6..7
	por		mm0, mm1
	movq		[edi], mm0			; out bytes 0..7
	movq		mm2, mm3
	psllq		mm2, 8				; second copy of pixel 1 -> bytes 1..3
	psrlq		mm3, 16				; third byte of pixel 1 -> byte 0
	por		mm2, mm3
	; pixels 2,3
	mov		eax, [edx + 4]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm3, mm0
	psllq		mm0, 32				; pixel 2 -> bytes 4..6
	por		mm0, mm2
	movq		mm2, mm3
	psllq		mm3, 56				; low byte of second copy of pixel 2 -> byte 7
	por		mm0, mm3
	movq		[edi + 8], mm0			; out bytes 8..15
	psrlq		mm2, 8				; upper two bytes of pixel 2 -> bytes 0..1
	movq		mm0, mm1
	psllq		mm0, 16				; pixel 3 -> bytes 2..4
	por		mm0, mm2
	psllq		mm1, 40				; second copy of pixel 3 -> bytes 5..7
	por		mm0, mm1
	movq		[edi + 16], mm0			; out bytes 16..23
	add		edx, 8
	add		edi, 24
	dec		esi
	jnz		near L2@162422001

	; second destination row: same source row again
	mov		eax, nDstLen
	mov		edx, pSrc
	add		pDst, eax
	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
L3@162422001:
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm2, mm0
	movq		mm3, mm1
	psllq		mm2, 24
	por		mm0, mm2
	psllq		mm1, 48
	por		mm0, mm1
	movq		[edi], mm0			; out bytes 0..7
	movq		mm2, mm3
	psllq		mm2, 8
	psrlq		mm3, 16
	por		mm2, mm3
	mov		eax, [edx + 4]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm3, mm0
	psllq		mm0, 32
	por		mm0, mm2
	movq		mm2, mm3
	psllq		mm3, 56
	por		mm0, mm3
	movq		[edi + 8], mm0			; out bytes 8..15
	psrlq		mm2, 8
	movq		mm0, mm1
	psllq		mm0, 16
	por		mm0, mm2
	psllq		mm1, 40
	por		mm0, mm1
	movq		[edi + 16], mm0			; out bytes 16..23
	add		edx, 8
	add		edi, 24
	dec		esi
	jnz		near L3@162422001

	; advance to the next source row and the next destination row pair
	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax
	add		pDst, ecx
	dec		ebp
	jnz		near L1@162422001

	POPREGS
	emms
	ret


;-----------------------------------------------------------------------------
; 16bit to 24bit 0% scanlines with MMX
;-----------------------------------------------------------------------------

	ALIGN 32
;-----------------------------------------------------------------------------
; _asmblit24_scanline0_mmx(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; Horizontally doubled blit that writes only every second destination row:
; nDstLen is doubled up front, so the rows in between are never touched by
; this routine. Each inner iteration converts 4 source pixels through the
; dword table whose address is stored at _palette_lookup and writes each
; twice (24 bytes) as three 8-byte MMX stores.
;
; Arguments are on the stack and 'ret' pops nothing (caller cleans up).
; The nDstLen, nWidth, pSrc and pDst argument slots are modified in place.
;
; Registers: ebx = lookup table base, ebp = source rows left,
;            esi = groups of 4 pixels left, edx = source cursor,
;            edi = destination cursor.
; Clobbers:  eax, mm0-mm3, flags. emms is executed before returning.
;
; NOTE(review): the packing ORs shifted lookups together, so it presumably
; relies on the top byte of every table entry being zero - confirm.
;-----------------------------------------------------------------------------
_asmblit24_scanline0_mmx:
	PUSHREGS

	mov		ebp, nHeight
	shl		nDstLen, 1			; two destination rows per source row
	shr		nWidth, 2			; groups of 4 source pixels per row
	mov		ebx, [_palette_lookup]

	ALIGN 16
L1@162422101:
	mov		edx, pSrc
	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
L2@162422101:
	; pixels 0,1
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm2, mm0
	movq		mm3, mm1
	psllq		mm2, 24				; second copy of pixel 0 -> bytes 3..5
	por		mm0, mm2
	psllq		mm1, 48				; low two bytes of pixel 1 -> bytes 6..7
	por		mm0, mm1
	movq		[edi], mm0			; out bytes 0..7
	movq		mm2, mm3
	psllq		mm2, 8				; second copy of pixel 1 -> bytes 1..3
	psrlq		mm3, 16				; third byte of pixel 1 -> byte 0
	por		mm2, mm3
	; pixels 2,3
	mov		eax, [edx + 4]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm3, mm0
	psllq		mm0, 32				; pixel 2 -> bytes 4..6
	por		mm0, mm2
	movq		mm2, mm3
	psllq		mm3, 56				; low byte of second copy of pixel 2 -> byte 7
	por		mm0, mm3
	movq		[edi + 8], mm0			; out bytes 8..15
	psrlq		mm2, 8				; upper two bytes of pixel 2 -> bytes 0..1
	movq		mm0, mm1
	psllq		mm0, 16				; pixel 3 -> bytes 2..4
	por		mm0, mm2
	psllq		mm1, 40				; second copy of pixel 3 -> bytes 5..7
	por		mm0, mm1
	movq		[edi + 16], mm0			; out bytes 16..23
	add		edx, 8
	add		edi, 24
	dec		esi
	jnz		near L2@162422101

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax
	add		pDst, ecx
	dec		ebp
	jnz		near L1@162422101

	POPREGS
	emms
	ret


;-----------------------------------------------------------------------------
; 16bit to 24bit 25% scanlines with MMX
;-----------------------------------------------------------------------------

	ALIGN 32
;-----------------------------------------------------------------------------
; _asmblit24_scanline25_mmx(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; 2x2 blit with a darkened second row, using MMX to pack output. Each source
; row is written once at full brightness (pixels doubled horizontally) and
; once more into the next destination row with every packed quadword
; transformed as (q >> 2) & brightmask25.
;
; Each inner iteration converts 4 source pixels through the dword table
; whose address is stored at _palette_lookup into 24 output bytes.
;
; Arguments are on the stack and 'ret' pops nothing (caller cleans up).
; The nWidth, pSrc and pDst argument slots are modified in place.
;
; Registers: ebx = lookup table base, ebp = source rows left,
;            esi = groups of 4 pixels left, edx = source cursor,
;            edi = destination cursor, mm6 = 8 bytes loaded from _brightmask25.
; Clobbers:  eax, mm0-mm3, mm6, flags. emms is executed before returning.
;
; NOTE(review): the packing ORs shifted lookups together, so it presumably
; relies on the top byte of every table entry being zero - confirm.
;-----------------------------------------------------------------------------
_asmblit24_scanline25_mmx:
	PUSHREGS

	mov		ebp, nHeight
	shr		nWidth, 2			; groups of 4 source pixels per row
	mov		ebx, [_palette_lookup]
	movq		mm6, [_brightmask25]

	ALIGN 16
L1@1624221251:
	mov		edx, pSrc
	mov		edi, pDst
	mov		esi, nWidth

	; full-brightness row
	ALIGN 16
L2@1624221251:
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm2, mm0
	movq		mm3, mm1
	psllq		mm2, 24
	por		mm0, mm2
	psllq		mm1, 48
	por		mm0, mm1
	movq		[edi], mm0			; out bytes 0..7
	movq		mm2, mm3
	psllq		mm2, 8
	psrlq		mm3, 16
	por		mm2, mm3
	mov		eax, [edx + 4]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm3, mm0
	psllq		mm0, 32
	por		mm0, mm2
	movq		mm2, mm3
	psllq		mm3, 56
	por		mm0, mm3
	movq		[edi + 8], mm0			; out bytes 8..15
	psrlq		mm2, 8
	movq		mm0, mm1
	psllq		mm0, 16
	por		mm0, mm2
	psllq		mm1, 40
	por		mm0, mm1
	movq		[edi + 16], mm0			; out bytes 16..23
	add		edx, 8
	add		edi, 24
	dec		esi
	jnz		near L2@1624221251

	; darkened row: same source row, next destination row
	mov		eax, nDstLen
	mov		edx, pSrc
	add		pDst, eax
	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
L3@1624221251:
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm2, mm0
	movq		mm3, mm1			; undarkened pixel 1 kept for the next quadword
	psllq		mm2, 24
	por		mm0, mm2
	psllq		mm1, 48
	por		mm0, mm1
	psrlq		mm0, 2				; quarter brightness
	pand		mm0, mm6
	movq		[edi], mm0			; out bytes 0..7

	movq		mm2, mm3
	psllq		mm2, 8
	psrlq		mm3, 16
	por		mm2, mm3
	mov		eax, [edx + 4]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm3, mm0
	psllq		mm0, 32
	por		mm0, mm2
	movq		mm2, mm3			; undarkened pixel 2 kept for the next quadword
	psllq		mm3, 56
	por		mm0, mm3
	psrlq		mm0, 2
	pand		mm0, mm6
	movq		[edi + 8], mm0			; out bytes 8..15

	psrlq		mm2, 8
	movq		mm0, mm1
	psllq		mm0, 16
	por		mm0, mm2
	psllq		mm1, 40
	por		mm0, mm1
	psrlq		mm0, 2
	pand		mm0, mm6
	movq		[edi + 16], mm0			; out bytes 16..23

	add		edx, 8
	add		edi, 24
	dec		esi
	jnz		near L3@1624221251

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax
	add		pDst, ecx
	dec		ebp
	jnz		near L1@1624221251

	POPREGS
	emms
	ret


;-----------------------------------------------------------------------------
; 16bit to 24bit 50% scanlines with MMX
;-----------------------------------------------------------------------------

	ALIGN 32
;-----------------------------------------------------------------------------
; _asmblit24_scanline50_mmx(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; 2x2 blit with a darkened second row, using MMX to pack output. Each source
; row is written once at full brightness (pixels doubled horizontally) and
; once more into the next destination row with every packed quadword
; transformed as (q >> 1) & brightmask50.
;
; Each inner iteration converts 4 source pixels through the dword table
; whose address is stored at _palette_lookup into 24 output bytes.
;
; Arguments are on the stack and 'ret' pops nothing (caller cleans up).
; The nWidth, pSrc and pDst argument slots are modified in place.
;
; Registers: ebx = lookup table base, ebp = source rows left,
;            esi = groups of 4 pixels left, edx = source cursor,
;            edi = destination cursor, mm6 = 8 bytes loaded from _brightmask50.
; Clobbers:  eax, mm0-mm3, mm6, flags. emms is executed before returning.
;
; NOTE(review): the packing ORs shifted lookups together, so it presumably
; relies on the top byte of every table entry being zero - confirm.
;-----------------------------------------------------------------------------
_asmblit24_scanline50_mmx:
	PUSHREGS

	mov		ebp, nHeight
	shr		nWidth, 2			; groups of 4 source pixels per row
	mov		ebx, [_palette_lookup]
	movq		mm6, [_brightmask50]

	ALIGN 16
L1@1624221501:
	mov		edx, pSrc
	mov		edi, pDst
	mov		esi, nWidth

	; full-brightness row
	ALIGN 16
L2@1624221501:
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm2, mm0
	movq		mm3, mm1
	psllq		mm2, 24
	por		mm0, mm2
	psllq		mm1, 48
	por		mm0, mm1
	movq		[edi], mm0			; out bytes 0..7
	movq		mm2, mm3
	psllq		mm2, 8
	psrlq		mm3, 16
	por		mm2, mm3
	mov		eax, [edx + 4]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm3, mm0
	psllq		mm0, 32
	por		mm0, mm2
	movq		mm2, mm3
	psllq		mm3, 56
	por		mm0, mm3
	movq		[edi + 8], mm0			; out bytes 8..15
	psrlq		mm2, 8
	movq		mm0, mm1
	psllq		mm0, 16
	por		mm0, mm2
	psllq		mm1, 40
	por		mm0, mm1
	movq		[edi + 16], mm0			; out bytes 16..23
	add		edx, 8
	add		edi, 24
	dec		esi
	jnz		near L2@1624221501
	; darkened row: same source row, next destination row
	mov		eax, nDstLen
	mov		edx, pSrc
	add		pDst, eax
	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
L3@1624221501:
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm2, mm0
	movq		mm3, mm1			; undarkened pixel 1 kept for the next quadword
	psllq		mm2, 24
	por		mm0, mm2
	psllq		mm1, 48
	por		mm0, mm1
	psrlq		mm0, 1				; half brightness
	pand		mm0, mm6
	movq		[edi], mm0			; out bytes 0..7
	movq		mm2, mm3
	psllq		mm2, 8
	psrlq		mm3, 16
	por		mm2, mm3
	mov		eax, [edx + 4]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm3, mm0
	psllq		mm0, 32
	por		mm0, mm2
	movq		mm2, mm3			; undarkened pixel 2 kept for the next quadword
	psllq		mm3, 56
	por		mm0, mm3
	psrlq		mm0, 1
	pand		mm0, mm6
	movq		[edi + 8], mm0			; out bytes 8..15
	psrlq		mm2, 8
	movq		mm0, mm1
	psllq		mm0, 16
	por		mm0, mm2
	psllq		mm1, 40
	por		mm0, mm1
	psrlq		mm0, 1
	pand		mm0, mm6
	movq		[edi + 16], mm0			; out bytes 16..23
	add		edx, 8
	add		edi, 24
	dec		esi
	jnz		near L3@1624221501

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax
	add		pDst, ecx
	dec		ebp
	jnz		near L1@1624221501

	POPREGS
	emms
	ret


;-----------------------------------------------------------------------------
; 16bit to 24bit 75% scanlines with MMX
;-----------------------------------------------------------------------------

	ALIGN 32
;-----------------------------------------------------------------------------
; _asmblit24_scanline75_mmx(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; 2x2 blit with a darkened second row, using MMX to pack output. Each source
; row is written once at full brightness (pixels doubled horizontally) and
; once more into the next destination row with every packed quadword q
; transformed as ((q >> 1) & brightmask50) + ((q >> 2) & brightmask25),
; the addition being done per 16-bit word (paddw).
;
; Each inner iteration converts 4 source pixels through the dword table
; whose address is stored at _palette_lookup into 24 output bytes.
;
; Arguments are on the stack and 'ret' pops nothing (caller cleans up).
; The nWidth, pSrc and pDst argument slots are modified in place.
;
; Registers: ebx = lookup table base, ebp = source rows left,
;            esi = groups of 4 pixels left, edx = source cursor,
;            edi = destination cursor,
;            mm6 / mm7 = 8 bytes loaded from _brightmask50 / _brightmask25.
; Clobbers:  eax, mm0-mm3, mm6, mm7, flags. emms is executed before returning.
;
; NOTE(review): the packing ORs shifted lookups together, so it presumably
; relies on the top byte of every table entry being zero - confirm.
;-----------------------------------------------------------------------------
_asmblit24_scanline75_mmx:
	PUSHREGS

	mov		ebp, nHeight
	shr		nWidth, 2			; groups of 4 source pixels per row
	mov		ebx, [_palette_lookup]
	movq		mm6, [_brightmask50]
	movq		mm7, [_brightmask25]

	ALIGN 16
L1@1624221751:
	mov		edx, pSrc
	mov		edi, pDst
	mov		esi, nWidth

	; full-brightness row
	ALIGN 16
L2@1624221751:
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm2, mm0
	movq		mm3, mm1
	psllq		mm2, 24
	por		mm0, mm2
	psllq		mm1, 48
	por		mm0, mm1
	movq		[edi], mm0			; out bytes 0..7
	movq		mm2, mm3
	psllq		mm2, 8
	psrlq		mm3, 16
	por		mm2, mm3
	mov		eax, [edx + 4]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm3, mm0
	psllq		mm0, 32
	por		mm0, mm2
	movq		mm2, mm3
	psllq		mm3, 56
	por		mm0, mm3
	movq		[edi + 8], mm0			; out bytes 8..15
	psrlq		mm2, 8
	movq		mm0, mm1
	psllq		mm0, 16
	por		mm0, mm2
	psllq		mm1, 40
	por		mm0, mm1
	movq		[edi + 16], mm0			; out bytes 16..23
	add		edx, 8
	add		edi, 24
	dec		esi
	jnz		near L2@1624221751


	; darkened row: same source row, next destination row
	mov		eax, nDstLen
	mov		edx, pSrc
	add		pDst, eax
	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
L3@1624221751:
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm2, mm0
	movq		mm3, mm1			; undarkened pixel 1 kept for the next quadword
	psllq		mm2, 24
	por		mm0, mm2
	psllq		mm1, 48
	por		mm0, mm1
	movq		mm1, mm0
	psrlq		mm0, 1				; 50% term
	pand		mm0, mm6
	psrlq		mm1, 2				; 25% term
	pand		mm1, mm7
	paddw		mm0, mm1
	movq		[edi], mm0			; out bytes 0..7

	movq		mm2, mm3
	psllq		mm2, 8
	psrlq		mm3, 16
	por		mm2, mm3
	mov		eax, [edx + 4]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm3, mm0
	psllq		mm0, 32
	por		mm0, mm2
	movq		mm2, mm3			; undarkened pixel 2 kept for the next quadword
	psllq		mm3, 56
	por		mm0, mm3
	movq		mm3, mm0
	psrlq		mm0, 1
	pand		mm0, mm6
	psrlq		mm3, 2
	pand		mm3, mm7
	paddw		mm0, mm3
	movq		[edi + 8], mm0			; out bytes 8..15

	psrlq		mm2, 8
	movq		mm0, mm1
	psllq		mm0, 16
	por		mm0, mm2
	psllq		mm1, 40
	por		mm0, mm1
	movq		mm1, mm0
	psrlq		mm0, 1
	pand		mm0, mm6
	psrlq		mm1, 2
	pand		mm1, mm7
	paddw		mm0, mm1
	movq		[edi + 16], mm0			; out bytes 16..23
	add		edx, 8
	add		edi, 24
	dec		esi
	jnz		near L3@1624221751

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax
	add		pDst, ecx
	dec		ebp
	jnz		near L1@1624221751

	POPREGS
	emms
	ret


;*****************************************************************************

;-----------------------------------------------------------------------------
; 16bit to 24bit with SSE
;-----------------------------------------------------------------------------

	ALIGN 32
;-----------------------------------------------------------------------------
; _asmblit24_sse(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; 1:1 blit. Same packing as the MMX version (8 source pixels -> 24 output
; bytes per inner iteration, nWidth >> 3 iterations per row), but output is
; written with non-temporal movntq stores and the source is hinted with
; prefetchnta.
;
; FIX: the entry label now ends with a colon. A bare label on its own line
; triggers NASM's orphan-label warning and was inconsistent with the other
; entry points in this file.
;
; Arguments are on the stack and 'ret' pops nothing (caller cleans up).
; The nWidth, pSrc and pDst argument slots are modified in place.
;
; Registers: ebx = lookup table base (from _palette_lookup), ebp = rows left,
;            esi = groups left, edx = source cursor, edi = dest cursor.
; Clobbers:  eax, mm0-mm3, flags. emms is executed before returning.
;
; NOTE(review): the packing ORs shifted lookups together, so it presumably
; relies on the top byte of every table entry being zero - confirm.
; NOTE(review): no sfence follows the movntq stores; confirm whether the
; caller needs the stores to be globally visible on return.
;-----------------------------------------------------------------------------
_asmblit24_sse:
	PUSHREGS

	mov		ebp, nHeight
	shr		nWidth, 3			; groups of 8 pixels per row
	mov		ebx, [_palette_lookup]

	ALIGN 16
L1@162411002:
	mov		edx, pSrc
	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
L2@162411002:
	; pixels 0,1
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	prefetchnta	[edx + 4]
	psllq		mm1, 24				; pixel 1 -> bytes 3..5
	por		mm0, mm1
	; pixels 2,3
	mov		eax, [edx + 4]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm1, [ebx + eax * 4]
	movd		mm2, [ebx + ecx * 4]
	movq		mm3, mm1			; keep pixel 2 for its third byte
	psllq		mm1, 48				; low two bytes of pixel 2 -> bytes 6..7
	por		mm0, mm1
	prefetchnta	[edx + 8]
	movntq		[edi], mm0			; out bytes 0..7
	psrlq		mm3, 16				; third byte of pixel 2 -> byte 0
	psllq		mm2, 8				; pixel 3 -> bytes 1..3
	por		mm2, mm3
	; pixels 4,5
	mov		eax, [edx + 8]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	psllq		mm0, 32				; pixel 4 -> bytes 4..6
	por		mm0, mm2
	movq		mm2, mm1			; keep pixel 5 for its upper bytes
	psllq		mm1, 56				; low byte of pixel 5 -> byte 7
	por		mm0, mm1
	prefetchnta	[edx + 12]
	movntq		[edi + 8], mm0			; out bytes 8..15
	; pixels 6,7
	mov		eax, [edx + 12]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	psllq		mm0, 16				; pixel 6 -> bytes 2..4
	psrlq		mm2, 8				; upper two bytes of pixel 5 -> bytes 0..1
	por		mm0, mm2
	psllq		mm1, 40				; pixel 7 -> bytes 5..7
	por		mm0, mm1
	prefetchnta	[edx + 16]
	movntq		[edi + 16], mm0			; out bytes 16..23
	add		edx, 16
	add		edi, 24
	dec		esi
	jnz		near L2@162411002

	; advance both row pointers to the next line
	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax
	add		pDst, ecx
	dec		ebp
	jnz		near L1@162411002

	POPREGS
	emms
	ret


;-----------------------------------------------------------------------------
; 16bit to 24bit 2x2 pixel with SSE
;-----------------------------------------------------------------------------

	ALIGN 32
;-----------------------------------------------------------------------------
; _asmblit24_double_sse(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; 2x2 blit. Same packing as the MMX version (4 source pixels, each written
; twice -> 24 output bytes per inner iteration; every source row converted
; into two consecutive destination rows), but output is written with
; non-temporal movntq stores and the source is hinted with prefetchnta.
;
; FIX: the entry label now ends with a colon. A bare label on its own line
; triggers NASM's orphan-label warning and was inconsistent with the other
; entry points in this file.
;
; Arguments are on the stack and 'ret' pops nothing (caller cleans up).
; The nWidth, pSrc and pDst argument slots are modified in place.
;
; Registers: ebx = lookup table base (from _palette_lookup),
;            ebp = source rows left, esi = groups of 4 pixels left,
;            edx = source cursor, edi = destination cursor.
; Clobbers:  eax, mm0-mm3, flags. emms is executed before returning.
;
; NOTE(review): the packing ORs shifted lookups together, so it presumably
; relies on the top byte of every table entry being zero - confirm.
;-----------------------------------------------------------------------------
_asmblit24_double_sse:
	PUSHREGS

	mov		ebp, nHeight
	shr		nWidth, 2			; groups of 4 source pixels per row
	mov		ebx, [_palette_lookup]

	ALIGN 16
L1@162422002:
	mov		edx, pSrc
	mov		edi, pDst
	mov		esi, nWidth

	; first destination row
	ALIGN 16
L2@162422002:
	; pixels 0,1
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm2, mm0
	movq		mm3, mm1
	psllq		mm2, 24				; second copy of pixel 0 -> bytes 3..5
	por		mm0, mm2
	psllq		mm1, 48				; low two bytes of pixel 1 -> bytes 6..7
	por		mm0, mm1
	prefetchnta	[edx + 4]
	movntq		[edi], mm0			; out bytes 0..7
	movq		mm2, mm3
	psllq		mm2, 8				; second copy of pixel 1 -> bytes 1..3
	psrlq		mm3, 16				; third byte of pixel 1 -> byte 0
	por		mm2, mm3
	; pixels 2,3
	mov		eax, [edx + 4]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm3, mm0
	psllq		mm0, 32				; pixel 2 -> bytes 4..6
	por		mm0, mm2
	movq		mm2, mm3
	psllq		mm3, 56				; low byte of second copy of pixel 2 -> byte 7
	por		mm0, mm3
	movntq		[edi + 8], mm0			; out bytes 8..15
	psrlq		mm2, 8				; upper two bytes of pixel 2 -> bytes 0..1
	movq		mm0, mm1
	psllq		mm0, 16				; pixel 3 -> bytes 2..4
	por		mm0, mm2
	psllq		mm1, 40				; second copy of pixel 3 -> bytes 5..7
	por		mm0, mm1
	prefetchnta	[edx + 8]
	movntq		[edi + 16], mm0			; out bytes 16..23
	add		edx, 8
	add		edi, 24
	dec		esi
	jnz		near L2@162422002

	; second destination row: same source row again
	mov		eax, nDstLen
	mov		edx, pSrc
	add		pDst, eax
	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
L3@162422002:
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm2, mm0
	movq		mm3, mm1
	psllq		mm2, 24
	por		mm0, mm2
	psllq		mm1, 48
	por		mm0, mm1
	prefetchnta	[edx + 4]
	movntq		[edi], mm0			; out bytes 0..7
	movq		mm2, mm3
	psllq		mm2, 8
	psrlq		mm3, 16
	por		mm2, mm3
	mov		eax, [edx + 4]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm3, mm0
	psllq		mm0, 32
	por		mm0, mm2
	movq		mm2, mm3
	psllq		mm3, 56
	por		mm0, mm3
	movntq		[edi + 8], mm0			; out bytes 8..15
	psrlq		mm2, 8
	movq		mm0, mm1
	psllq		mm0, 16
	por		mm0, mm2
	psllq		mm1, 40
	por		mm0, mm1
	prefetchnta	[edx + 8]
	movntq		[edi + 16], mm0			; out bytes 16..23
	add		edx, 8
	add		edi, 24
	dec		esi
	jnz		near L3@162422002

	; advance to the next source row and the next destination row pair
	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax
	add		pDst, ecx
	dec		ebp
	jnz		near L1@162422002

	POPREGS
	emms
	ret


;-----------------------------------------------------------------------------
; 16bit to 24bit 0% scanlines with SSE
;-----------------------------------------------------------------------------

	ALIGN 32
;-----------------------------------------------------------------------------
; _asmblit24_scanline0_sse(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; Horizontally doubled blit that writes only every second destination row:
; nDstLen is doubled up front, so the rows in between are never touched by
; this routine. Same packing as the MMX version (4 source pixels, each
; written twice -> 24 output bytes per inner iteration), with movntq stores
; and a prefetchnta hint 64 bytes ahead of the source cursor.
;
; FIX: the entry label now ends with a colon. A bare label on its own line
; triggers NASM's orphan-label warning and was inconsistent with the other
; entry points in this file.
;
; Arguments are on the stack and 'ret' pops nothing (caller cleans up).
; The nDstLen, nWidth, pSrc and pDst argument slots are modified in place.
;
; Registers: ebx = lookup table base (from _palette_lookup),
;            ebp = source rows left, esi = groups of 4 pixels left,
;            edx = source cursor, edi = destination cursor.
; Clobbers:  eax, mm0-mm3, flags. emms is executed before returning.
;
; NOTE(review): the packing ORs shifted lookups together, so it presumably
; relies on the top byte of every table entry being zero - confirm.
;-----------------------------------------------------------------------------
_asmblit24_scanline0_sse:
	PUSHREGS

	mov		ebp, nHeight
	shl		nDstLen, 1			; two destination rows per source row
	shr		nWidth, 2			; groups of 4 source pixels per row
	mov		ebx, [_palette_lookup]

	ALIGN 16
L1@162422102:
	mov		edx, pSrc
	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
L2@162422102:
	prefetchnta	[edx + 64]
	; pixels 0,1
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm2, mm0
	movq		mm3, mm1
	psllq		mm2, 24				; second copy of pixel 0 -> bytes 3..5
	por		mm0, mm2
	psllq		mm1, 48				; low two bytes of pixel 1 -> bytes 6..7
	por		mm0, mm1
	movntq		[edi], mm0			; out bytes 0..7

	movq		mm2, mm3
	psllq		mm2, 8				; second copy of pixel 1 -> bytes 1..3
	psrlq		mm3, 16				; third byte of pixel 1 -> byte 0
	por		mm2, mm3
	; pixels 2,3
	mov		eax, [edx + 4]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm3, mm0
	psllq		mm0, 32				; pixel 2 -> bytes 4..6
	por		mm0, mm2
	movq		mm2, mm3
	psllq		mm3, 56				; low byte of second copy of pixel 2 -> byte 7
	por		mm0, mm3
	movntq		[edi + 8], mm0			; out bytes 8..15

	psrlq		mm2, 8				; upper two bytes of pixel 2 -> bytes 0..1
	movq		mm0, mm1
	psllq		mm0, 16				; pixel 3 -> bytes 2..4
	por		mm0, mm2
	psllq		mm1, 40				; second copy of pixel 3 -> bytes 5..7
	por		mm0, mm1
	movntq		[edi + 16], mm0			; out bytes 16..23
	add		edx, 8
	add		edi, 24
	dec		esi
	jnz		near L2@162422102

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax
	add		pDst, ecx
	dec		ebp
	jnz		near L1@162422102

	POPREGS
	emms
	ret


;-----------------------------------------------------------------------------
; 16bit to 24bit 25% scanlines with SSE
;-----------------------------------------------------------------------------

	ALIGN 32
;-----------------------------------------------------------------------------
; _asmblit24_scanline25_sse(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; 2x2 blit with a darkened second row. Each source row is written once at
; full brightness (pixels doubled horizontally) and once more into the next
; destination row with every packed quadword transformed as
; (q >> 2) & brightmask25. Same packing as the MMX version, with movntq
; stores and a prefetchnta hint 64 bytes ahead of the source cursor.
;
; FIX: the entry label now ends with a colon. A bare label on its own line
; triggers NASM's orphan-label warning and was inconsistent with the other
; entry points in this file.
;
; Arguments are on the stack and 'ret' pops nothing (caller cleans up).
; The nWidth, pSrc and pDst argument slots are modified in place.
;
; Registers: ebx = lookup table base (from _palette_lookup),
;            ebp = source rows left, esi = groups of 4 pixels left,
;            edx = source cursor, edi = destination cursor,
;            mm6 = 8 bytes loaded from _brightmask25.
; Clobbers:  eax, mm0-mm3, mm6, flags. emms is executed before returning.
;
; NOTE(review): the packing ORs shifted lookups together, so it presumably
; relies on the top byte of every table entry being zero - confirm.
;-----------------------------------------------------------------------------
_asmblit24_scanline25_sse:
	PUSHREGS

	mov		ebp, nHeight
	shr		nWidth, 2			; groups of 4 source pixels per row
	mov		ebx, [_palette_lookup]
	movq		mm6, [_brightmask25]

	ALIGN 16
L1@1624221252:
	mov		edx, pSrc
	mov		edi, pDst
	mov		esi, nWidth

	; full-brightness row
	ALIGN 16
L2@1624221252:
	prefetchnta	[edx + 64]
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm2, mm0
	movq		mm3, mm1
	psllq		mm2, 24
	por		mm0, mm2
	psllq		mm1, 48
	por		mm0, mm1
	movntq		[edi], mm0			; out bytes 0..7

	movq		mm2, mm3
	psllq		mm2, 8
	psrlq		mm3, 16
	por		mm2, mm3
	mov		eax, [edx + 4]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm3, mm0
	psllq		mm0, 32
	por		mm0, mm2
	movq		mm2, mm3
	psllq		mm3, 56
	por		mm0, mm3
	movntq		[edi + 8], mm0			; out bytes 8..15

	psrlq		mm2, 8
	movq		mm0, mm1
	psllq		mm0, 16
	por		mm0, mm2
	psllq		mm1, 40
	por		mm0, mm1
	movntq		[edi + 16], mm0			; out bytes 16..23

	add		edx, 8
	add		edi, 24
	dec		esi
	jnz		near L2@1624221252

	; darkened row: same source row, next destination row
	mov		eax, nDstLen
	mov		edx, pSrc
	add		pDst, eax
	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
L3@1624221252:
	prefetchnta	[edx + 64]
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm2, mm0
	movq		mm3, mm1			; undarkened pixel 1 kept for the next quadword
	psllq		mm2, 24
	por		mm0, mm2
	psllq		mm1, 48
	por		mm0, mm1
	psrlq		mm0, 2				; quarter brightness
	pand		mm0, mm6
	movntq		[edi], mm0			; out bytes 0..7

	movq		mm2, mm3
	psllq		mm2, 8
	psrlq		mm3, 16
	por		mm2, mm3
	mov		eax, [edx + 4]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm3, mm0
	psllq		mm0, 32
	por		mm0, mm2
	movq		mm2, mm3			; undarkened pixel 2 kept for the next quadword
	psllq		mm3, 56
	por		mm0, mm3
	psrlq		mm0, 2
	pand		mm0, mm6
	movntq		[edi + 8], mm0			; out bytes 8..15

	psrlq		mm2, 8
	movq		mm0, mm1
	psllq		mm0, 16
	por		mm0, mm2
	psllq		mm1, 40
	por		mm0, mm1
	psrlq		mm0, 2
	pand		mm0, mm6
	movntq		[edi + 16], mm0			; out bytes 16..23

	add		edx, 8
	add		edi, 24
	dec		esi
	jnz		near L3@1624221252

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax
	add		pDst, ecx
	dec		ebp
	jnz		near L1@1624221252

	POPREGS
	emms
	ret


;-----------------------------------------------------------------------------
; 16bit to 24bit 50% scanlines with SSE
;-----------------------------------------------------------------------------

	ALIGN 32
;-----------------------------------------------------------------------------
; _asmblit24_scanline50_sse(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; 2x2 blit with a darkened second row. Each source row is written once at
; full brightness (pixels doubled horizontally) and once more into the next
; destination row with every packed quadword transformed as
; (q >> 1) & brightmask50. Same packing as the MMX version, with movntq
; stores and prefetchnta hints on the source.
;
; FIX: the entry label now ends with a colon. A bare label on its own line
; triggers NASM's orphan-label warning and was inconsistent with the other
; entry points in this file.
;
; Arguments are on the stack and 'ret' pops nothing (caller cleans up).
; The nWidth, pSrc and pDst argument slots are modified in place.
;
; Registers: ebx = lookup table base (from _palette_lookup),
;            ebp = source rows left, esi = groups of 4 pixels left,
;            edx = source cursor, edi = destination cursor,
;            mm6 = 8 bytes loaded from _brightmask50.
; Clobbers:  eax, mm0-mm3, mm6, flags. emms is executed before returning.
;
; NOTE(review): the packing ORs shifted lookups together, so it presumably
; relies on the top byte of every table entry being zero - confirm.
;-----------------------------------------------------------------------------
_asmblit24_scanline50_sse:
	PUSHREGS

	mov		ebp, nHeight
	shr		nWidth, 2			; groups of 4 source pixels per row
	mov		ebx, [_palette_lookup]
	movq		mm6, [_brightmask50]

	ALIGN 16
L1@1624221502:
	mov		edx, pSrc
	mov		edi, pDst
	mov		esi, nWidth

	; full-brightness row
	ALIGN 16
L2@1624221502:
	prefetchnta	[edx + 64]
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm2, mm0
	movq		mm3, mm1
	psllq		mm2, 24
	por		mm0, mm2
	psllq		mm1, 48
	por		mm0, mm1
	movntq		[edi], mm0			; out bytes 0..7

	movq		mm2, mm3
	psllq		mm2, 8
	psrlq		mm3, 16
	por		mm2, mm3
	mov		eax, [edx + 4]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm3, mm0
	psllq		mm0, 32
	por		mm0, mm2
	movq		mm2, mm3
	psllq		mm3, 56
	por		mm0, mm3
	movntq		[edi + 8], mm0			; out bytes 8..15

	psrlq		mm2, 8
	movq		mm0, mm1
	psllq		mm0, 16
	por		mm0, mm2
	psllq		mm1, 40
	por		mm0, mm1
	movntq		[edi + 16], mm0			; out bytes 16..23

	add		edx, 8
	add		edi, 24
	dec		esi
	jnz		near L2@1624221502

	; darkened row: same source row, next destination row
	mov		eax, nDstLen
	mov		edx, pSrc
	add		pDst, eax
	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
L3@1624221502:
	prefetchnta	[edx + 64]
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm2, mm0
	movq		mm3, mm1			; undarkened pixel 1 kept for the next quadword
	psllq		mm2, 24
	por		mm0, mm2
	psllq		mm1, 48
	por		mm0, mm1
	psrlq		mm0, 1				; half brightness
	pand		mm0, mm6
	movntq		[edi], mm0			; out bytes 0..7

	movq		mm2, mm3
	psllq		mm2, 8
	psrlq		mm3, 16
	por		mm2, mm3
	mov		eax, [edx + 4]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm3, mm0
	psllq		mm0, 32
	por		mm0, mm2
	movq		mm2, mm3			; undarkened pixel 2 kept for the next quadword
	psllq		mm3, 56
	por		mm0, mm3
	psrlq		mm0, 1
	pand		mm0, mm6
	movntq		[edi + 8], mm0			; out bytes 8..15

	psrlq		mm2, 8
	movq		mm0, mm1
	psllq		mm0, 16
	por		mm0, mm2
	psllq		mm1, 40
	por		mm0, mm1
	psrlq		mm0, 1
	pand		mm0, mm6
	prefetchnta	[edx + 8]
	movntq		[edi + 16], mm0			; out bytes 16..23

	add		edx, 8
	add		edi, 24
	dec		esi
	jnz		near L3@1624221502

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax
	add		pDst, ecx
	dec		ebp
	jnz		near L1@1624221502

	POPREGS
	emms
	ret


;-----------------------------------------------------------------------------
; 16bit to 24bit 75% scanlines with SSE
;-----------------------------------------------------------------------------

	ALIGN 32
;-----------------------------------------------------------------------------
; _asmblit24_scanline75_sse(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; 2x2 blit with a darkened second row. Each source row is written once at
; full brightness (pixels doubled horizontally) and once more into the next
; destination row with every packed quadword q transformed as
; ((q >> 1) & brightmask50) + ((q >> 2) & brightmask25), the addition being
; done per 16-bit word (paddw). Same packing as the MMX version, with movntq
; stores and a prefetchnta hint 64 bytes ahead of the source cursor.
;
; FIX: the entry label now ends with a colon. A bare label on its own line
; triggers NASM's orphan-label warning and was inconsistent with the other
; entry points in this file.
;
; Arguments are on the stack and 'ret' pops nothing (caller cleans up).
; The nWidth, pSrc and pDst argument slots are modified in place.
;
; Registers: ebx = lookup table base (from _palette_lookup),
;            ebp = source rows left, esi = groups of 4 pixels left,
;            edx = source cursor, edi = destination cursor,
;            mm6 / mm7 = 8 bytes loaded from _brightmask50 / _brightmask25.
; Clobbers:  eax, mm0-mm3, mm6, mm7, flags. emms is executed before returning.
;
; NOTE(review): the packing ORs shifted lookups together, so it presumably
; relies on the top byte of every table entry being zero - confirm.
;-----------------------------------------------------------------------------
_asmblit24_scanline75_sse:
	PUSHREGS

	mov		ebp, nHeight
	shr		nWidth, 2			; groups of 4 source pixels per row
	mov		ebx, [_palette_lookup]
	movq		mm6, [_brightmask50]
	movq		mm7, [_brightmask25]

	ALIGN 16
L1@1624221752:
	mov		edx, pSrc
	mov		edi, pDst
	mov		esi, nWidth

	; full-brightness row
	ALIGN 16
L2@1624221752:
	prefetchnta	[edx + 64]
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm2, mm0
	movq		mm3, mm1
	psllq		mm2, 24
	por		mm0, mm2
	psllq		mm1, 48
	por		mm0, mm1
	movntq		[edi], mm0			; out bytes 0..7

	movq		mm2, mm3
	psllq		mm2, 8
	psrlq		mm3, 16
	por		mm2, mm3
	mov		eax, [edx + 4]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm3, mm0
	psllq		mm0, 32
	por		mm0, mm2
	movq		mm2, mm3
	psllq		mm3, 56
	por		mm0, mm3
	movntq		[edi + 8], mm0			; out bytes 8..15

	psrlq		mm2, 8
	movq		mm0, mm1
	psllq		mm0, 16
	por		mm0, mm2
	psllq		mm1, 40
	por		mm0, mm1
	movntq		[edi + 16], mm0			; out bytes 16..23

	add		edx, 8
	add		edi, 24
	dec		esi
	jnz		near L2@1624221752

	; darkened row: same source row, next destination row
	mov		eax, nDstLen
	mov		edx, pSrc
	add		pDst, eax
	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
L3@1624221752:
	prefetchnta	[edx + 64]
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm2, mm0
	movq		mm3, mm1			; undarkened pixel 1 kept for the next quadword
	psllq		mm2, 24
	por		mm0, mm2
	psllq		mm1, 48
	por		mm0, mm1
	movq		mm1, mm0
	psrlq		mm0, 1				; 50% term
	pand		mm0, mm6
	psrlq		mm1, 2				; 25% term
	pand		mm1, mm7
	paddw		mm0, mm1
	movntq		[edi], mm0			; out bytes 0..7

	movq		mm2, mm3
	psllq		mm2, 8
	psrlq		mm3, 16
	por		mm2, mm3
	mov		eax, [edx + 4]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	movq		mm3, mm0
	psllq		mm0, 32
	por		mm0, mm2
	movq		mm2, mm3			; undarkened pixel 2 kept for the next quadword
	psllq		mm3, 56
	por		mm0, mm3
	movq		mm3, mm0
	psrlq		mm0, 1
	pand		mm0, mm6
	psrlq		mm3, 2
	pand		mm3, mm7
	paddw		mm0, mm3
	movntq		[edi + 8], mm0			; out bytes 8..15

	psrlq		mm2, 8
	movq		mm0, mm1
	psllq		mm0, 16
	por		mm0, mm2
	psllq		mm1, 40
	por		mm0, mm1
	movq		mm1, mm0
	psrlq		mm0, 1
	pand		mm0, mm6
	psrlq		mm1, 2
	pand		mm1, mm7
	paddw		mm0, mm1
	movntq		[edi + 16], mm0			; out bytes 16..23
	add		edx, 8
	add		edi, 24
	dec		esi
	jnz		near L3@1624221752

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax
	add		pDst, ecx
	dec		ebp
	jnz		near L1@1624221752

	POPREGS
	emms
	ret
