;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; asmblt32.asm
;
; Copy Bitmap to DirectDraw Surface (16bit color to 32bit color)
;
; for MSVC and GCC, use 'COFF' format:
;    nasmw -f win32 asmblt32.asm
;
; for BCC32, use 'OMF' format:
;    nasmw -f obj asmblt32.asm
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;-----------------------------------------------------------------------------
;
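; asmblit(BYTE *pSrc,    // source address
;         BYTE *pDst,    // destination address
;         int nSrcLen,   // bytes from one source line to the start of the next
;         int nDstLen,   // bytes from one destination line to the start of the next
;         int nWidth,    // width of the image to transfer (pixels)
;         int nHeight)   // height of the image to transfer (pixels)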
;
;-----------------------------------------------------------------------------
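;
; (Translated from the original, mis-encoded Japanese comment.)
; As far as I have tried, there is almost no difference in speed.
; Feel free to try optimizing this yourself.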
;
; 
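;
; This code is written with compilation by VC (Visual C++) in mind.
; If you use another compiler, pay attention to which registers
; have to be saved.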
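;
; The original note said only EDI, ECX, EDX and EBP are saved; the current
; PUSHREGS macro saves EDI, ESI, ECX, EDX, EBX and EBP.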
;
;-----------------------------------------------------------------------------

        ; Assemble everything below as 32-bit code.
        BITS 32

        ; Code section.  USE32/CLASS=CODE are the qualifiers used for the
        ; OMF ('-f obj') build mentioned in the file header.
        SECTION .text USE32 CLASS=CODE

        ; Data defined elsewhere.  The code below loads the dword stored at
        ; _palette_lookup and uses it as the base address of a table of
        ; dwords indexed by a 16-bit pixel value.  _brightmask25 and
        ; _brightmask50 are read as dwords by the integer routines and as
        ; qwords (movq) by the MMX/SSE routines.
        EXTERN _brightmask50
        EXTERN _brightmask25
        EXTERN _palette_lookup

	; Entry points: plain integer versions.
	GLOBAL _asmblit32
	GLOBAL _asmblit32_double
	GLOBAL _asmblit32_scanline0
	GLOBAL _asmblit32_scanline25
	GLOBAL _asmblit32_scanline50
	GLOBAL _asmblit32_scanline75

	; Entry points: MMX versions.
	GLOBAL _asmblit32_mmx
	GLOBAL _asmblit32_double_mmx
	GLOBAL _asmblit32_scanline0_mmx
	GLOBAL _asmblit32_scanline25_mmx
	GLOBAL _asmblit32_scanline50_mmx
	GLOBAL _asmblit32_scanline75_mmx

	; Entry points: SSE versions (movntq / prefetchnta).
	GLOBAL _asmblit32_sse
	GLOBAL _asmblit32_double_sse
	GLOBAL _asmblit32_scanline0_sse
	GLOBAL _asmblit32_scanline25_sse
	GLOBAL _asmblit32_scanline50_sse
	GLOBAL _asmblit32_scanline75_sse


;-----------------------------------------------------------------------------
; Macros
;-----------------------------------------------------------------------------

; Stack-argument accessors.  They are only valid after PUSHREGS has run and
; while nothing else has been pushed: [esp] then holds the REGS saved
; registers, followed by the return address, followed by the arguments.
; REGS must equal the number of pushes in PUSHREGS.
%define REGS    6			; number of pushed registers
%define pSrc    dword [esp+4*(1+REGS)]	; arg1
%define pDst    dword [esp+4*(2+REGS)]	; arg2
%define nSrcLen dword [esp+4*(3+REGS)]	; arg3
%define nDstLen dword [esp+4*(4+REGS)]	; arg4
%define nWidth  dword [esp+4*(5+REGS)]	; arg5
%define nHeight dword [esp+4*(6+REGS)]	; arg6

; Save every general-purpose register the blitters use except EAX.
%macro PUSHREGS 0
	push	edi
	push	esi
	push	ecx
	push	edx
	push	ebx
	push	ebp
%endmacro

; Restore the registers saved by PUSHREGS (reverse order).
%macro POPREGS 0
	pop	ebp
	pop	ebx
	pop	edx
	pop	ecx
	pop	esi
	pop	edi
%endmacro


;-----------------------------------------------------------------------------
; 16bit to 32bit
;-----------------------------------------------------------------------------

	ALIGN 32
;
; _asmblit32(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; For each of nHeight rows, reads nWidth/2 dwords from pSrc (two 16-bit
; pixels per dword), looks each pixel up in the dword table whose address is
; stored in _palette_lookup and writes the two 32-bit results to pDst.
; After each row pSrc advances by nSrcLen bytes and pDst by nDstLen bytes.
; An odd trailing pixel is not converted.  Nothing is written when
; nWidth < 2 or nHeight <= 0 (previously the loop counters wrapped around).
;
; Arguments are read from the stack; the pSrc, pDst and nWidth slots are
; used as working storage.  EAX is clobbered and no return value is set up;
; the registers pushed by PUSHREGS are restored.
;
_asmblit32:
	PUSHREGS

	cmp		nWidth, 2
	jl		near .done		; less than one pixel pair per row
	mov		ebp, nHeight		; ebp = rows left
	test		ebp, ebp
	jle		near .done		; no rows to convert
	shr		nWidth, 1		; nWidth = pixel pairs per row
	mov		ebx, [_palette_lookup]	; ebx = base of the lookup table

	ALIGN 16
.row:
	mov		edx, pSrc		; edx = source cursor
	mov		edi, pDst		; edi = destination cursor
	mov		esi, nWidth		; esi = pixel pairs left in this row

	ALIGN 16
.pair:
	mov		eax, [edx]		; fetch two 16-bit pixels
	mov		ecx, eax
	and		eax, 0x0000ffff		; eax = first pixel
	shr		ecx, 16			; ecx = second pixel
	mov		eax, [ebx + eax * 4]	; table entry for first pixel
	mov		ecx, [ebx + ecx * 4]	; table entry for second pixel
	mov		[edi], eax
	mov		[edi + 4], ecx
	add		edx, 4
	add		edi, 8
	dec		esi
	jnz		short .pair

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax		; next source row
	add		pDst, ecx		; next destination row
	dec		ebp
	jnz		short .row

.done:
	POPREGS
	ret


;-----------------------------------------------------------------------------
; 16bit to 32bit double pixel
;-----------------------------------------------------------------------------

	ALIGN 32
;
; _asmblit32_double(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; Like _asmblit32, but every looked-up pixel is stored twice horizontally
; and every source row is written to two consecutive destination rows, so
; pDst advances by 2 * nDstLen per source row.
; Nothing is written when nWidth < 2 or nHeight <= 0 (previously the loop
; counters wrapped around).
;
; Arguments are read from the stack; the pSrc, pDst and nWidth slots are
; used as working storage.  EAX is clobbered; the registers pushed by
; PUSHREGS are restored.
;
_asmblit32_double:
	PUSHREGS

	cmp		nWidth, 2
	jl		near .done		; less than one pixel pair per row
	mov		ebp, nHeight		; ebp = source rows left
	test		ebp, ebp
	jle		near .done		; no rows to convert
	shr		nWidth, 1		; nWidth = pixel pairs per row
	mov		ebx, [_palette_lookup]	; ebx = base of the lookup table

	ALIGN 16
.row:
	mov		edx, pSrc		; edx = source cursor
	mov		edi, pDst		; edi = destination cursor
	mov		esi, nWidth		; esi = pixel pairs left in this row

	ALIGN 16
.pair:
	mov		eax, [edx]		; fetch two 16-bit pixels
	mov		ecx, eax
	and		eax, 0x0000ffff		; eax = first pixel
	shr		ecx, 16			; ecx = second pixel
	mov		eax, [ebx + eax * 4]
	mov		ecx, [ebx + ecx * 4]
	mov		[edi], eax		; each result is stored twice
	mov		[edi + 4], eax
	mov		[edi + 8], ecx
	mov		[edi + 12], ecx
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .pair

	mov		eax, nDstLen
	mov		edx, pSrc		; rewind to the start of the same source row
	add		pDst, eax		; step to the following destination row
	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
.pair2:
	mov		eax, [edx]		; same conversion again for the second row
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	mov		eax, [ebx + eax * 4]
	mov		ecx, [ebx + ecx * 4]
	mov		[edi], eax
	mov		[edi + 4], eax
	mov		[edi + 8], ecx
	mov		[edi + 12], ecx
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .pair2

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax		; next source row
	add		pDst, ecx		; next destination row pair
	dec		ebp
	jnz		near .row

.done:
	POPREGS
	ret


;-----------------------------------------------------------------------------
; 16bit to 32bit 0% scanline
;-----------------------------------------------------------------------------

	ALIGN 32
;
; _asmblit32_scanline0(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; Every looked-up pixel is stored twice horizontally.  One destination row
; is written per source row and pDst then advances by 2 * nDstLen, so the
; destination rows in between are not written by this routine.
; Nothing is written when nWidth < 2 or nHeight <= 0 (previously the loop
; counters wrapped around).
;
; Arguments are read from the stack; the pSrc, pDst, nDstLen and nWidth
; slots are used as working storage.  EAX is clobbered; the registers
; pushed by PUSHREGS are restored.
;
_asmblit32_scanline0:
	PUSHREGS

	cmp		nWidth, 2
	jl		near .done		; less than one pixel pair per row
	mov		ebp, nHeight		; ebp = source rows left
	test		ebp, ebp
	jle		near .done		; no rows to convert
	shl		nDstLen, 1		; skip every other destination row
	shr		nWidth, 1		; nWidth = pixel pairs per row
	mov		ebx, [_palette_lookup]	; ebx = base of the lookup table

	ALIGN 16
.row:
	mov		edx, pSrc		; edx = source cursor
	mov		edi, pDst		; edi = destination cursor
	mov		esi, nWidth		; esi = pixel pairs left in this row

	ALIGN 16
.pair:
	mov		eax, [edx]		; fetch two 16-bit pixels
	mov		ecx, eax
	and		eax, 0x0000ffff		; eax = first pixel
	shr		ecx, 16			; ecx = second pixel
	mov		eax, [ebx + eax * 4]
	mov		ecx, [ebx + ecx * 4]
	mov		[edi], eax		; each result is stored twice
	mov		[edi + 4], eax
	mov		[edi + 8], ecx
	mov		[edi + 12], ecx
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .pair

	mov		eax, nSrcLen
	mov		ecx, nDstLen		; already doubled above
	add		pSrc, eax
	add		pDst, ecx
	dec		ebp
	jnz		short .row

.done:
	POPREGS
	ret


;-----------------------------------------------------------------------------
; 16bit to 32bit 25% scanline
;-----------------------------------------------------------------------------

	ALIGN 32
;
; _asmblit32_scanline25(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; Every source row produces two destination rows with each pixel stored
; twice horizontally.  The first row holds the looked-up values unchanged;
; in the second row each value is shifted right by 2 and ANDed with the
; dword at _brightmask25.
; Nothing is written when nWidth < 2 or nHeight <= 0 (previously the loop
; counters wrapped around).
;
; Arguments are read from the stack; the pSrc, pDst, nWidth and nHeight
; slots are used as working storage (EBP holds the mask, so the row counter
; lives in the nHeight slot).  EAX is clobbered; the registers pushed by
; PUSHREGS are restored.
;
_asmblit32_scanline25:
	PUSHREGS

	cmp		nWidth, 2
	jl		near .done		; less than one pixel pair per row
	cmp		nHeight, 0
	jle		near .done		; no rows to convert
	shr		nWidth, 1		; nWidth = pixel pairs per row
	mov		ebx, [_palette_lookup]	; ebx = base of the lookup table
	mov		ebp, [_brightmask25]	; ebp = mask applied after >> 2

	ALIGN 16
.row:
	mov		edx, pSrc		; edx = source cursor
	mov		edi, pDst		; edi = destination cursor
	mov		esi, nWidth		; esi = pixel pairs left in this row

	ALIGN 16
.pair:
	mov		eax, [edx]		; fetch two 16-bit pixels
	mov		ecx, eax
	and		eax, 0x0000ffff		; eax = first pixel
	shr		ecx, 16			; ecx = second pixel
	mov		eax, [ebx + eax * 4]
	mov		ecx, [ebx + ecx * 4]
	mov		[edi], eax		; each result is stored twice
	mov		[edi + 4], eax
	mov		[edi + 8], ecx
	mov		[edi + 12], ecx
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .pair

	mov		eax, nDstLen
	mov		edx, pSrc		; rewind to the start of the same source row
	add		pDst, eax		; step to the following destination row

	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
.dim:
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	mov		eax, [ebx + eax * 4]
	mov		ecx, [ebx + ecx * 4]
	shr		eax, 2			; value / 4 ...
	and		eax, ebp		; ... masked with _brightmask25
	shr		ecx, 2
	and		ecx, ebp
	mov		[edi], eax
	mov		[edi + 4], eax
	mov		[edi + 8], ecx
	mov		[edi + 12], ecx
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .dim

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax		; next source row
	add		pDst, ecx		; next destination row pair
	dec		nHeight			; row counter kept in the argument slot
	jnz		near .row

.done:
	POPREGS
	ret


;-----------------------------------------------------------------------------
; 16bit to 32bit 50% scanline
;-----------------------------------------------------------------------------

	ALIGN 32
;
; _asmblit32_scanline50(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; Every source row produces two destination rows with each pixel stored
; twice horizontally.  The first row holds the looked-up values unchanged;
; in the second row each value is shifted right by 1 and ANDed with the
; dword at _brightmask50.
; Nothing is written when nWidth < 2 or nHeight <= 0 (previously the loop
; counters wrapped around).
;
; Arguments are read from the stack; the pSrc, pDst, nWidth and nHeight
; slots are used as working storage (EBP holds the mask, so the row counter
; lives in the nHeight slot).  EAX is clobbered; the registers pushed by
; PUSHREGS are restored.
;
_asmblit32_scanline50:
	PUSHREGS

	cmp		nWidth, 2
	jl		near .done		; less than one pixel pair per row
	cmp		nHeight, 0
	jle		near .done		; no rows to convert
	shr		nWidth, 1		; nWidth = pixel pairs per row
	mov		ebx, [_palette_lookup]	; ebx = base of the lookup table
	mov		ebp, [_brightmask50]	; ebp = mask applied after >> 1

	ALIGN 16
.row:
	mov		edx, pSrc		; edx = source cursor
	mov		edi, pDst		; edi = destination cursor
	mov		esi, nWidth		; esi = pixel pairs left in this row

	ALIGN 16
.pair:
	mov		eax, [edx]		; fetch two 16-bit pixels
	mov		ecx, eax
	and		eax, 0x0000ffff		; eax = first pixel
	shr		ecx, 16			; ecx = second pixel
	mov		eax, [ebx + eax * 4]
	mov		ecx, [ebx + ecx * 4]
	mov		[edi], eax		; each result is stored twice
	mov		[edi + 4], eax
	mov		[edi + 8], ecx
	mov		[edi + 12], ecx
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .pair

	mov		eax, nDstLen
	mov		edx, pSrc		; rewind to the start of the same source row
	add		pDst, eax		; step to the following destination row

	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
.dim:
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	mov		eax, [ebx + eax * 4]
	mov		ecx, [ebx + ecx * 4]
	shr		eax, 1			; value / 2 ...
	and		eax, ebp		; ... masked with _brightmask50
	shr		ecx, 1
	and		ecx, ebp
	mov		[edi], eax
	mov		[edi + 4], eax
	mov		[edi + 8], ecx
	mov		[edi + 12], ecx
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .dim

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax		; next source row
	add		pDst, ecx		; next destination row pair
	dec		nHeight			; row counter kept in the argument slot
	jnz		near .row

.done:
	POPREGS
	ret


;-----------------------------------------------------------------------------
; 16bit to 32bit 75% scanline
;-----------------------------------------------------------------------------

	ALIGN 32
;
; _asmblit32_scanline75(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; Every source row produces two destination rows with each pixel stored
; twice horizontally.  The first row holds the looked-up values unchanged;
; in the second row each value v is replaced by
;     ((v >> 1) & [_brightmask50]) + ((v >> 2) & [_brightmask25]).
; Nothing is written when nWidth < 2 or nHeight <= 0 (previously the loop
; counters wrapped around).
;
; Arguments are read from the stack; the pSrc, pDst, nWidth and nHeight
; slots are used as working storage (EBP is a scratch register here, so the
; row counter lives in the nHeight slot).  EAX is clobbered; the registers
; pushed by PUSHREGS are restored.
;
_asmblit32_scanline75:
	PUSHREGS

	cmp		nWidth, 2
	jl		near .done		; less than one pixel pair per row
	cmp		nHeight, 0
	jle		near .done		; no rows to convert
	shr		nWidth, 1		; nWidth = pixel pairs per row
	mov		ebx, [_palette_lookup]	; ebx = base of the lookup table

	ALIGN 16
.row:
	mov		edx, pSrc		; edx = source cursor
	mov		edi, pDst		; edi = destination cursor
	mov		esi, nWidth		; esi = pixel pairs left in this row

	ALIGN 16
.pair:
	mov		eax, [edx]		; fetch two 16-bit pixels
	mov		ecx, eax
	and		eax, 0x0000ffff		; eax = first pixel
	shr		ecx, 16			; ecx = second pixel
	mov		eax, [ebx + eax * 4]
	mov		ecx, [ebx + ecx * 4]
	mov		[edi], eax		; each result is stored twice
	mov		[edi + 4], eax
	mov		[edi + 8], ecx
	mov		[edi + 12], ecx
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .pair

	mov		eax, nDstLen
	mov		edx, pSrc		; rewind to the start of the same source row
	add		pDst, eax		; step to the following destination row

	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
.dim:
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	mov		eax, [ebx + eax * 4]
	mov		ecx, [ebx + ecx * 4]
	mov		ebp, eax		; first pixel: half + quarter
	shr		eax, 1
	shr		ebp, 2
	and		eax, [_brightmask50]
	and		ebp, [_brightmask25]
	add		eax, ebp
	mov		ebp, ecx		; second pixel: half + quarter
	shr		ecx, 1
	shr		ebp, 2
	and		ecx, [_brightmask50]
	and		ebp, [_brightmask25]
	add		ecx, ebp
	mov		[edi], eax
	mov		[edi + 4], eax
	mov		[edi + 8], ecx
	mov		[edi + 12], ecx
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .dim

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax		; next source row
	add		pDst, ecx		; next destination row pair
	dec		nHeight			; row counter kept in the argument slot
	jnz		near .row

.done:
	POPREGS
	ret


;*****************************************************************************

;-----------------------------------------------------------------------------
; 16bit to 32bit with MMX
;-----------------------------------------------------------------------------

	ALIGN 32
;
; _asmblit32_mmx(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; MMX version of _asmblit32: for each of nHeight rows, converts nWidth/2
; pairs of 16-bit pixels through the dword table whose address is stored in
; _palette_lookup and writes each pair with a single 8-byte movq.
; After each row pSrc advances by nSrcLen bytes and pDst by nDstLen bytes.
; Nothing is written when nWidth < 2 or nHeight <= 0 (previously the loop
; counters wrapped around).
;
; Arguments are read from the stack; the pSrc, pDst and nWidth slots are
; used as working storage.  EAX, MM0 and MM1 are clobbered; EMMS is issued
; before returning; the registers pushed by PUSHREGS are restored.
;
_asmblit32_mmx:
	PUSHREGS

	cmp		nWidth, 2
	jl		near .done		; less than one pixel pair per row
	mov		ebp, nHeight		; ebp = rows left
	test		ebp, ebp
	jle		near .done		; no rows to convert
	shr		nWidth, 1		; nWidth = pixel pairs per row
	mov		ebx, [_palette_lookup]	; ebx = base of the lookup table

	ALIGN 16
.row:
	mov		edx, pSrc		; edx = source cursor
	mov		edi, pDst		; edi = destination cursor
	mov		esi, nWidth		; esi = pixel pairs left in this row

	ALIGN 16
.pair:
	mov		eax, [edx]		; fetch two 16-bit pixels
	mov		ecx, eax
	and		eax, 0x0000ffff		; eax = first pixel
	shr		ecx, 16			; ecx = second pixel
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	punpckldq	mm0, mm1		; mm0 = second:first
	movq		[edi], mm0
	add		edx, 4
	add		edi, 8
	dec		esi
	jnz		short .pair

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax		; next source row
	add		pDst, ecx		; next destination row
	dec		ebp
	jnz		short .row

.done:
	POPREGS
	emms
	ret


;-----------------------------------------------------------------------------
; 16bit to 32bit double pixel with MMX
;-----------------------------------------------------------------------------

	ALIGN 32
;
; _asmblit32_double_mmx(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; MMX version of _asmblit32_double: every looked-up pixel is stored twice
; horizontally and every source row is written to two consecutive
; destination rows, so pDst advances by 2 * nDstLen per source row.
; Nothing is written when nWidth < 2 or nHeight <= 0 (previously the loop
; counters wrapped around).
;
; Arguments are read from the stack; the pSrc, pDst and nWidth slots are
; used as working storage.  EAX, MM0 and MM1 are clobbered; EMMS is issued
; before returning; the registers pushed by PUSHREGS are restored.
;
_asmblit32_double_mmx:
	PUSHREGS

	cmp		nWidth, 2
	jl		near .done		; less than one pixel pair per row
	mov		ebp, nHeight		; ebp = source rows left
	test		ebp, ebp
	jle		near .done		; no rows to convert
	shr		nWidth, 1		; nWidth = pixel pairs per row
	mov		ebx, [_palette_lookup]	; ebx = base of the lookup table

	ALIGN 16
.row:
	mov		edx, pSrc		; edx = source cursor
	mov		edi, pDst		; edi = destination cursor
	mov		esi, nWidth		; esi = pixel pairs left in this row

	ALIGN 16
.pair:
	mov		eax, [edx]		; fetch two 16-bit pixels
	mov		ecx, eax
	and		eax, 0x0000ffff		; eax = first pixel
	shr		ecx, 16			; ecx = second pixel
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	punpckldq	mm0, mm0		; duplicate first result
	punpckldq	mm1, mm1		; duplicate second result
	movq		[edi], mm0
	movq		[edi + 8], mm1
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .pair

	mov		eax, nDstLen
	mov		edx, pSrc		; rewind to the start of the same source row
	add		pDst, eax		; step to the following destination row
	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
.pair2:
	mov		eax, [edx]		; same conversion again for the second row
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	punpckldq	mm0, mm0
	punpckldq	mm1, mm1
	movq		[edi], mm0
	movq		[edi + 8], mm1
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .pair2

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax		; next source row
	add		pDst, ecx		; next destination row pair
	dec		ebp
	jnz		near .row

.done:
	POPREGS
	emms
	ret


;-----------------------------------------------------------------------------
; 16bit to 32bit 0% scanlines with MMX
;-----------------------------------------------------------------------------

	ALIGN 32
;
; _asmblit32_scanline0_mmx(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; MMX version of _asmblit32_scanline0: every looked-up pixel is stored twice
; horizontally; one destination row is written per source row and pDst then
; advances by 2 * nDstLen, so the destination rows in between are not
; written by this routine.
; Nothing is written when nWidth < 2 or nHeight <= 0 (previously the loop
; counters wrapped around).
;
; Arguments are read from the stack; the pSrc, pDst, nDstLen and nWidth
; slots are used as working storage.  EAX, MM0 and MM1 are clobbered; EMMS
; is issued before returning; the registers pushed by PUSHREGS are restored.
;
_asmblit32_scanline0_mmx:
	PUSHREGS

	cmp		nWidth, 2
	jl		near .done		; less than one pixel pair per row
	mov		ebp, nHeight		; ebp = source rows left
	test		ebp, ebp
	jle		near .done		; no rows to convert
	shl		nDstLen, 1		; skip every other destination row
	shr		nWidth, 1		; nWidth = pixel pairs per row
	mov		ebx, [_palette_lookup]	; ebx = base of the lookup table

	ALIGN 16
.row:
	mov		edx, pSrc		; edx = source cursor
	mov		edi, pDst		; edi = destination cursor
	mov		esi, nWidth		; esi = pixel pairs left in this row

	ALIGN 16
.pair:
	mov		eax, [edx]		; fetch two 16-bit pixels
	mov		ecx, eax
	and		eax, 0x0000ffff		; eax = first pixel
	shr		ecx, 16			; ecx = second pixel
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	punpckldq	mm0, mm0		; duplicate first result
	punpckldq	mm1, mm1		; duplicate second result
	movq		[edi], mm0
	movq		[edi + 8], mm1
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .pair

	mov		eax, nSrcLen
	mov		ecx, nDstLen		; already doubled above
	add		pSrc, eax
	add		pDst, ecx
	dec		ebp
	jnz		short .row

.done:
	POPREGS
	emms
	ret


;-----------------------------------------------------------------------------
; 16bit to 32bit 25% scanlines with MMX
;-----------------------------------------------------------------------------

	ALIGN 32
;
; _asmblit32_scanline25_mmx(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; MMX version of _asmblit32_scanline25.  Every source row produces two
; destination rows with each pixel stored twice horizontally.  The first
; row holds the looked-up values unchanged; in the second row each
; duplicated 64-bit value is shifted right by 2 (psrlq) and ANDed with the
; 8 bytes read from _brightmask25.
; Nothing is written when nWidth < 2 or nHeight <= 0 (previously the loop
; counters wrapped around).
;
; Arguments are read from the stack; the pSrc, pDst and nWidth slots are
; used as working storage.  EAX, MM0, MM1, MM4 and MM5 are clobbered; EMMS
; is issued before returning; the registers pushed by PUSHREGS are restored.
;
_asmblit32_scanline25_mmx:
	PUSHREGS

	cmp		nWidth, 2
	jl		near .done		; less than one pixel pair per row
	mov		ebp, nHeight		; ebp = source rows left
	test		ebp, ebp
	jle		near .done		; no rows to convert
	shr		nWidth, 1		; nWidth = pixel pairs per row
	mov		ebx, [_palette_lookup]	; ebx = base of the lookup table
	movq		mm4, [_brightmask25]	; 64-bit read of the mask
	movq		mm5, mm4		; second copy for the other pixel

	ALIGN 16
.row:
	mov		edx, pSrc		; edx = source cursor
	mov		edi, pDst		; edi = destination cursor
	mov		esi, nWidth		; esi = pixel pairs left in this row

	ALIGN 16
.pair:
	mov		eax, [edx]		; fetch two 16-bit pixels
	mov		ecx, eax
	and		eax, 0x0000ffff		; eax = first pixel
	shr		ecx, 16			; ecx = second pixel
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	punpckldq	mm0, mm0		; duplicate first result
	punpckldq	mm1, mm1		; duplicate second result
	movq		[edi], mm0
	movq		[edi + 8], mm1
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .pair

	mov		eax, nDstLen
	mov		edx, pSrc		; rewind to the start of the same source row
	add		pDst, eax		; step to the following destination row
	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
.dim:
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	punpckldq	mm0, mm0
	punpckldq	mm1, mm1
	psrlq		mm0, 2			; value / 4 ...
	psrlq		mm1, 2
	pand		mm0, mm4		; ... masked with _brightmask25
	pand		mm1, mm5
	movq		[edi], mm0
	movq		[edi + 8], mm1
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .dim

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax		; next source row
	add		pDst, ecx		; next destination row pair
	dec		ebp
	jnz		near .row

.done:
	POPREGS
	emms
	ret


;-----------------------------------------------------------------------------
; 16bit to 32bit 50% scanlines with MMX
;-----------------------------------------------------------------------------

	ALIGN 32
;
; _asmblit32_scanline50_mmx(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; MMX version of _asmblit32_scanline50.  Every source row produces two
; destination rows with each pixel stored twice horizontally.  The first
; row holds the looked-up values unchanged; in the second row each
; duplicated 64-bit value is shifted right by 1 (psrlq) and ANDed with the
; 8 bytes read from _brightmask50.
; Nothing is written when nWidth < 2 or nHeight <= 0 (previously the loop
; counters wrapped around).
;
; Arguments are read from the stack; the pSrc, pDst and nWidth slots are
; used as working storage.  EAX, MM0, MM1, MM4 and MM5 are clobbered; EMMS
; is issued before returning; the registers pushed by PUSHREGS are restored.
;
_asmblit32_scanline50_mmx:
	PUSHREGS

	cmp		nWidth, 2
	jl		near .done		; less than one pixel pair per row
	mov		ebp, nHeight		; ebp = source rows left
	test		ebp, ebp
	jle		near .done		; no rows to convert
	shr		nWidth, 1		; nWidth = pixel pairs per row
	mov		ebx, [_palette_lookup]	; ebx = base of the lookup table
	movq		mm4, [_brightmask50]	; 64-bit read of the mask
	movq		mm5, mm4		; second copy for the other pixel

	ALIGN 16
.row:
	mov		edx, pSrc		; edx = source cursor
	mov		edi, pDst		; edi = destination cursor
	mov		esi, nWidth		; esi = pixel pairs left in this row

	ALIGN 16
.pair:
	mov		eax, [edx]		; fetch two 16-bit pixels
	mov		ecx, eax
	and		eax, 0x0000ffff		; eax = first pixel
	shr		ecx, 16			; ecx = second pixel
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	punpckldq	mm0, mm0		; duplicate first result
	punpckldq	mm1, mm1		; duplicate second result
	movq		[edi], mm0
	movq		[edi + 8], mm1
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .pair

	mov		eax, nDstLen
	mov		edx, pSrc		; rewind to the start of the same source row
	add		pDst, eax		; step to the following destination row
	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
.dim:
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	punpckldq	mm0, mm0
	punpckldq	mm1, mm1
	psrlq		mm0, 1			; value / 2 ...
	psrlq		mm1, 1
	pand		mm0, mm4		; ... masked with _brightmask50
	pand		mm1, mm5
	movq		[edi], mm0
	movq		[edi + 8], mm1
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .dim

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax		; next source row
	add		pDst, ecx		; next destination row pair
	dec		ebp
	jnz		near .row

.done:
	POPREGS
	emms
	ret


;-----------------------------------------------------------------------------
; 16bit to 32bit 75% scanlines with MMX
;-----------------------------------------------------------------------------

	ALIGN 32
;
; _asmblit32_scanline75_mmx(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; MMX version of _asmblit32_scanline75.  Every source row produces two
; destination rows with each pixel stored twice horizontally.  The first
; row holds the looked-up values unchanged; in the second row each
; duplicated 64-bit value v is replaced by
;     ((v >> 1) & mask50) + ((v >> 2) & mask25)
; where mask50/mask25 are the 8 bytes read from _brightmask50 and
; _brightmask25 and the addition is done per 16-bit word (paddw).
; Nothing is written when nWidth < 2 or nHeight <= 0 (previously the loop
; counters wrapped around).
;
; Arguments are read from the stack; the pSrc, pDst and nWidth slots are
; used as working storage.  EAX and MM0-MM7 are clobbered; EMMS is issued
; before returning; the registers pushed by PUSHREGS are restored.
;
_asmblit32_scanline75_mmx:
	PUSHREGS

	cmp		nWidth, 2
	jl		near .done		; less than one pixel pair per row
	mov		ebp, nHeight		; ebp = source rows left
	test		ebp, ebp
	jle		near .done		; no rows to convert
	shr		nWidth, 1		; nWidth = pixel pairs per row
	mov		ebx, [_palette_lookup]	; ebx = base of the lookup table
	movq		mm4, [_brightmask50]	; 64-bit reads of both masks
	movq		mm6, [_brightmask25]
	movq		mm5, mm4		; second copies for the other pixel
	movq		mm7, mm6

	ALIGN 16
.row:
	mov		edx, pSrc		; edx = source cursor
	mov		edi, pDst		; edi = destination cursor
	mov		esi, nWidth		; esi = pixel pairs left in this row

	ALIGN 16
.pair:
	mov		eax, [edx]		; fetch two 16-bit pixels
	mov		ecx, eax
	and		eax, 0x0000ffff		; eax = first pixel
	shr		ecx, 16			; ecx = second pixel
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	punpckldq	mm0, mm0		; duplicate first result
	punpckldq	mm1, mm1		; duplicate second result
	movq		[edi], mm0
	movq		[edi + 8], mm1
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .pair

	mov		eax, nDstLen
	mov		edx, pSrc		; rewind to the start of the same source row
	add		pDst, eax		; step to the following destination row
	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
.dim:
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	punpckldq	mm0, mm0
	punpckldq	mm1, mm1
	movq		mm2, mm0		; copies for the quarter term
	movq		mm3, mm1
	psrlq		mm0, 1			; half term
	psrlq		mm1, 1
	psrlq		mm2, 2			; quarter term
	psrlq		mm3, 2
	pand		mm0, mm4
	pand		mm1, mm5
	pand		mm2, mm6
	pand		mm3, mm7
	paddw		mm0, mm2		; half + quarter
	paddw		mm1, mm3
	movq		[edi], mm0
	movq		[edi + 8], mm1
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .dim

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax		; next source row
	add		pDst, ecx		; next destination row pair
	dec		ebp
	jnz		near .row

.done:
	POPREGS
	emms
	ret


;*****************************************************************************

;-----------------------------------------------------------------------------
; 16bit to 32bit with SSE
;-----------------------------------------------------------------------------

	ALIGN 32
;
; _asmblit32_sse(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; SSE version of _asmblit32_mmx: same conversion through the dword table
; whose address is stored in _palette_lookup, but the source is prefetched
; 64 bytes ahead with PREFETCHNTA and the destination is written with
; non-temporal MOVNTQ stores.  An SFENCE is issued after the last store
; because MOVNTQ stores are weakly ordered.
; Nothing is written when nWidth < 2 or nHeight <= 0 (previously the loop
; counters wrapped around).
;
; Arguments are read from the stack; the pSrc, pDst and nWidth slots are
; used as working storage.  EAX, MM0 and MM1 are clobbered; EMMS is issued
; before returning; the registers pushed by PUSHREGS are restored.
;
_asmblit32_sse:
	PUSHREGS

	cmp		nWidth, 2
	jl		near .done		; less than one pixel pair per row
	mov		ebp, nHeight		; ebp = rows left
	test		ebp, ebp
	jle		near .done		; no rows to convert
	shr		nWidth, 1		; nWidth = pixel pairs per row
	mov		ebx, [_palette_lookup]	; ebx = base of the lookup table

	ALIGN 16
.row:
	mov		edx, pSrc		; edx = source cursor
	mov		edi, pDst		; edi = destination cursor
	mov		esi, nWidth		; esi = pixel pairs left in this row

	ALIGN 16
.pair:
	mov		eax, [edx]		; fetch two 16-bit pixels
	mov		ecx, eax
	and		eax, 0x0000ffff		; eax = first pixel
	shr		ecx, 16			; ecx = second pixel
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	punpckldq	mm0, mm1		; mm0 = second:first
	prefetchnta	[edx + 64]		; hint: source data 64 bytes ahead
	movntq		[edi], mm0		; non-temporal store
	add		edx, 4
	add		edi, 8
	dec		esi
	jnz		short .pair

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax		; next source row
	add		pDst, ecx		; next destination row
	dec		ebp
	jnz		short .row

	sfence					; order the non-temporal stores
.done:
	POPREGS
	emms
	ret


;-----------------------------------------------------------------------------
; 16bit to 32bit double pixel with SSE
;-----------------------------------------------------------------------------

	ALIGN 32
;
; _asmblit32_double_sse(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; SSE version of _asmblit32_double_mmx: every looked-up pixel is stored
; twice horizontally and every source row is written to two consecutive
; destination rows.  The source is prefetched 64 bytes ahead with
; PREFETCHNTA and the destination is written with non-temporal MOVNTQ
; stores; an SFENCE is issued after the last store because those stores are
; weakly ordered.
; Nothing is written when nWidth < 2 or nHeight <= 0 (previously the loop
; counters wrapped around).
;
; Arguments are read from the stack; the pSrc, pDst and nWidth slots are
; used as working storage.  EAX, MM0 and MM1 are clobbered; EMMS is issued
; before returning; the registers pushed by PUSHREGS are restored.
;
_asmblit32_double_sse:
	PUSHREGS

	cmp		nWidth, 2
	jl		near .done		; less than one pixel pair per row
	mov		ebp, nHeight		; ebp = source rows left
	test		ebp, ebp
	jle		near .done		; no rows to convert
	shr		nWidth, 1		; nWidth = pixel pairs per row
	mov		ebx, [_palette_lookup]	; ebx = base of the lookup table

	ALIGN 16
.row:
	mov		edx, pSrc		; edx = source cursor
	mov		edi, pDst		; edi = destination cursor
	mov		esi, nWidth		; esi = pixel pairs left in this row

	ALIGN 16
.pair:
	mov		eax, [edx]		; fetch two 16-bit pixels
	mov		ecx, eax
	and		eax, 0x0000ffff		; eax = first pixel
	shr		ecx, 16			; ecx = second pixel
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	punpckldq	mm0, mm0		; duplicate first result
	punpckldq	mm1, mm1		; duplicate second result
	prefetchnta	[edx + 64]		; hint: source data 64 bytes ahead
	movntq		[edi], mm0		; non-temporal stores
	movntq		[edi + 8], mm1
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .pair

	mov		eax, nDstLen
	mov		edx, pSrc		; rewind to the start of the same source row
	add		pDst, eax		; step to the following destination row
	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
.pair2:
	mov		eax, [edx]		; same conversion again for the second row
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	punpckldq	mm0, mm0
	punpckldq	mm1, mm1
	prefetchnta	[edx + 64]
	movntq		[edi], mm0
	movntq		[edi + 8], mm1
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .pair2

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax		; next source row
	add		pDst, ecx		; next destination row pair
	dec		ebp
	jnz		near .row

	sfence					; order the non-temporal stores
.done:
	POPREGS
	emms
	ret


;-----------------------------------------------------------------------------
; 16bit to 32bit 0% scanlines with SSE
;-----------------------------------------------------------------------------

	ALIGN 32
;
; _asmblit32_scanline0_sse(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; SSE version of _asmblit32_scanline0_mmx: every looked-up pixel is stored
; twice horizontally; one destination row is written per source row and
; pDst then advances by 2 * nDstLen, so the destination rows in between
; are not written by this routine.  The source is prefetched 64 bytes ahead
; with PREFETCHNTA and the destination is written with non-temporal MOVNTQ
; stores; an SFENCE is issued after the last store because those stores are
; weakly ordered.
; Nothing is written when nWidth < 2 or nHeight <= 0 (previously the loop
; counters wrapped around).
;
; Arguments are read from the stack; the pSrc, pDst, nDstLen and nWidth
; slots are used as working storage.  EAX, MM0 and MM1 are clobbered; EMMS
; is issued before returning; the registers pushed by PUSHREGS are restored.
;
_asmblit32_scanline0_sse:
	PUSHREGS

	cmp		nWidth, 2
	jl		near .done		; less than one pixel pair per row
	mov		ebp, nHeight		; ebp = source rows left
	test		ebp, ebp
	jle		near .done		; no rows to convert
	shl		nDstLen, 1		; skip every other destination row
	shr		nWidth, 1		; nWidth = pixel pairs per row
	mov		ebx, [_palette_lookup]	; ebx = base of the lookup table

	ALIGN 16
.row:
	mov		edx, pSrc		; edx = source cursor
	mov		edi, pDst		; edi = destination cursor
	mov		esi, nWidth		; esi = pixel pairs left in this row

	ALIGN 16
.pair:
	mov		eax, [edx]		; fetch two 16-bit pixels
	mov		ecx, eax
	and		eax, 0x0000ffff		; eax = first pixel
	shr		ecx, 16			; ecx = second pixel
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	punpckldq	mm0, mm0		; duplicate first result
	punpckldq	mm1, mm1		; duplicate second result
	prefetchnta	[edx + 64]		; hint: source data 64 bytes ahead
	movntq		[edi], mm0		; non-temporal stores
	movntq		[edi + 8], mm1
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .pair

	mov		eax, nSrcLen
	mov		ecx, nDstLen		; already doubled above
	add		pSrc, eax
	add		pDst, ecx
	dec		ebp
	jnz		short .row

	sfence					; order the non-temporal stores
.done:
	POPREGS
	emms
	ret


;-----------------------------------------------------------------------------
; 16bit to 32bit 25% scanlines with SSE
;-----------------------------------------------------------------------------

	ALIGN 32
;
; _asmblit32_scanline25_sse(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; SSE version of _asmblit32_scanline25_mmx.  Every source row produces two
; destination rows with each pixel stored twice horizontally.  The first
; row holds the looked-up values unchanged; in the second row each
; duplicated 64-bit value is shifted right by 2 (psrlq) and ANDed with the
; 8 bytes read from _brightmask25.  The source is prefetched 64 bytes ahead
; with PREFETCHNTA and the destination is written with non-temporal MOVNTQ
; stores; an SFENCE is issued after the last store because those stores are
; weakly ordered.
; Nothing is written when nWidth < 2 or nHeight <= 0 (previously the loop
; counters wrapped around).
;
; Arguments are read from the stack; the pSrc, pDst and nWidth slots are
; used as working storage.  EAX, MM0, MM1, MM4 and MM5 are clobbered; EMMS
; is issued before returning; the registers pushed by PUSHREGS are restored.
;
_asmblit32_scanline25_sse:
	PUSHREGS

	cmp		nWidth, 2
	jl		near .done		; less than one pixel pair per row
	mov		ebp, nHeight		; ebp = source rows left
	test		ebp, ebp
	jle		near .done		; no rows to convert
	shr		nWidth, 1		; nWidth = pixel pairs per row
	mov		ebx, [_palette_lookup]	; ebx = base of the lookup table
	movq		mm4, [_brightmask25]	; 64-bit read of the mask
	movq		mm5, mm4		; second copy for the other pixel

	ALIGN 16
.row:
	mov		edx, pSrc		; edx = source cursor
	mov		edi, pDst		; edi = destination cursor
	mov		esi, nWidth		; esi = pixel pairs left in this row

	ALIGN 16
.pair:
	mov		eax, [edx]		; fetch two 16-bit pixels
	mov		ecx, eax
	and		eax, 0x0000ffff		; eax = first pixel
	shr		ecx, 16			; ecx = second pixel
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	punpckldq	mm0, mm0		; duplicate first result
	punpckldq	mm1, mm1		; duplicate second result
	prefetchnta	[edx + 64]		; hint: source data 64 bytes ahead
	movntq		[edi], mm0		; non-temporal stores
	movntq		[edi + 8], mm1
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .pair

	mov		eax, nDstLen
	mov		edx, pSrc		; rewind to the start of the same source row
	add		pDst, eax		; step to the following destination row
	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
.dim:
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	punpckldq	mm0, mm0
	punpckldq	mm1, mm1
	psrlq		mm0, 2			; value / 4 ...
	psrlq		mm1, 2
	pand		mm0, mm4		; ... masked with _brightmask25
	pand		mm1, mm5
	prefetchnta	[edx + 64]
	movntq		[edi], mm0
	movntq		[edi + 8], mm1
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .dim

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax		; next source row
	add		pDst, ecx		; next destination row pair
	dec		ebp
	jnz		near .row

	sfence					; order the non-temporal stores
.done:
	POPREGS
	emms
	ret


;-----------------------------------------------------------------------------
; 16bit to 32bit 50% scanlines with SSE
;-----------------------------------------------------------------------------

	ALIGN 32
;
; _asmblit32_scanline50_sse(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; SSE version of _asmblit32_scanline50_mmx.  Every source row produces two
; destination rows with each pixel stored twice horizontally.  The first
; row holds the looked-up values unchanged; in the second row each
; duplicated 64-bit value is shifted right by 1 (psrlq) and ANDed with the
; 8 bytes read from _brightmask50.  The source is prefetched 64 bytes ahead
; with PREFETCHNTA and the destination is written with non-temporal MOVNTQ
; stores; an SFENCE is issued after the last store because those stores are
; weakly ordered.
; Nothing is written when nWidth < 2 or nHeight <= 0 (previously the loop
; counters wrapped around).
;
; Arguments are read from the stack; the pSrc, pDst and nWidth slots are
; used as working storage.  EAX, MM0, MM1, MM4 and MM5 are clobbered; EMMS
; is issued before returning; the registers pushed by PUSHREGS are restored.
;
_asmblit32_scanline50_sse:
	PUSHREGS

	cmp		nWidth, 2
	jl		near .done		; less than one pixel pair per row
	mov		ebp, nHeight		; ebp = source rows left
	test		ebp, ebp
	jle		near .done		; no rows to convert
	shr		nWidth, 1		; nWidth = pixel pairs per row
	mov		ebx, [_palette_lookup]	; ebx = base of the lookup table
	movq		mm4, [_brightmask50]	; 64-bit read of the mask
	movq		mm5, mm4		; second copy for the other pixel

	ALIGN 16
.row:
	mov		edx, pSrc		; edx = source cursor
	mov		edi, pDst		; edi = destination cursor
	mov		esi, nWidth		; esi = pixel pairs left in this row

	ALIGN 16
.pair:
	mov		eax, [edx]		; fetch two 16-bit pixels
	mov		ecx, eax
	and		eax, 0x0000ffff		; eax = first pixel
	shr		ecx, 16			; ecx = second pixel
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	punpckldq	mm0, mm0		; duplicate first result
	punpckldq	mm1, mm1		; duplicate second result
	prefetchnta	[edx + 64]		; hint: source data 64 bytes ahead
	movntq		[edi], mm0		; non-temporal stores
	movntq		[edi + 8], mm1
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .pair

	mov		eax, nDstLen
	mov		edx, pSrc		; rewind to the start of the same source row
	add		pDst, eax		; step to the following destination row
	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
.dim:
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	punpckldq	mm0, mm0
	punpckldq	mm1, mm1
	psrlq		mm0, 1			; value / 2 ...
	psrlq		mm1, 1
	pand		mm0, mm4		; ... masked with _brightmask50
	pand		mm1, mm5
	prefetchnta	[edx + 64]
	movntq		[edi], mm0
	movntq		[edi + 8], mm1
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .dim

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax		; next source row
	add		pDst, ecx		; next destination row pair
	dec		ebp
	jnz		near .row

	sfence					; order the non-temporal stores
.done:
	POPREGS
	emms
	ret


;-----------------------------------------------------------------------------
; 16bit to 32bit 75% scanlines with SSE
;-----------------------------------------------------------------------------

	ALIGN 32
;
; _asmblit32_scanline75_sse(pSrc, pDst, nSrcLen, nDstLen, nWidth, nHeight)
;
; SSE version of _asmblit32_scanline75_mmx.  Every source row produces two
; destination rows with each pixel stored twice horizontally.  The first
; row holds the looked-up values unchanged; in the second row each
; duplicated 64-bit value v is replaced by
;     ((v >> 1) & mask50) + ((v >> 2) & mask25)
; where mask50/mask25 are the 8 bytes read from _brightmask50 and
; _brightmask25 and the addition is done per 16-bit word (paddw).
; The source is prefetched 64 bytes ahead with PREFETCHNTA and the
; destination is written with non-temporal MOVNTQ stores; an SFENCE is
; issued after the last store because those stores are weakly ordered.
; Nothing is written when nWidth < 2 or nHeight <= 0 (previously the loop
; counters wrapped around).
;
; Arguments are read from the stack; the pSrc, pDst and nWidth slots are
; used as working storage.  EAX and MM0-MM7 are clobbered; EMMS is issued
; before returning; the registers pushed by PUSHREGS are restored.
;
_asmblit32_scanline75_sse:
	PUSHREGS

	cmp		nWidth, 2
	jl		near .done		; less than one pixel pair per row
	mov		ebp, nHeight		; ebp = source rows left
	test		ebp, ebp
	jle		near .done		; no rows to convert
	shr		nWidth, 1		; nWidth = pixel pairs per row
	mov		ebx, [_palette_lookup]	; ebx = base of the lookup table
	movq		mm4, [_brightmask50]	; 64-bit reads of both masks
	movq		mm6, [_brightmask25]
	movq		mm5, mm4		; second copies for the other pixel
	movq		mm7, mm6

	ALIGN 16
.row:
	mov		edx, pSrc		; edx = source cursor
	mov		edi, pDst		; edi = destination cursor
	mov		esi, nWidth		; esi = pixel pairs left in this row

	ALIGN 16
.pair:
	mov		eax, [edx]		; fetch two 16-bit pixels
	mov		ecx, eax
	and		eax, 0x0000ffff		; eax = first pixel
	shr		ecx, 16			; ecx = second pixel
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	punpckldq	mm0, mm0		; duplicate first result
	punpckldq	mm1, mm1		; duplicate second result
	prefetchnta	[edx + 64]		; hint: source data 64 bytes ahead
	movntq		[edi], mm0		; non-temporal stores
	movntq		[edi + 8], mm1
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .pair

	mov		eax, nDstLen
	mov		edx, pSrc		; rewind to the start of the same source row
	add		pDst, eax		; step to the following destination row
	mov		edi, pDst
	mov		esi, nWidth

	ALIGN 16
.dim:
	mov		eax, [edx]
	mov		ecx, eax
	and		eax, 0x0000ffff
	shr		ecx, 16
	movd		mm0, [ebx + eax * 4]
	movd		mm1, [ebx + ecx * 4]
	punpckldq	mm0, mm0
	punpckldq	mm1, mm1
	movq		mm2, mm0		; copies for the quarter term
	movq		mm3, mm1
	psrlq		mm0, 1			; half term
	psrlq		mm1, 1
	psrlq		mm2, 2			; quarter term
	psrlq		mm3, 2
	pand		mm0, mm4
	pand		mm1, mm5
	pand		mm2, mm6
	pand		mm3, mm7
	paddw		mm0, mm2		; half + quarter
	paddw		mm1, mm3
	prefetchnta	[edx + 64]
	movntq		[edi], mm0
	movntq		[edi + 8], mm1
	add		edx, 4
	add		edi, 16
	dec		esi
	jnz		short .dim

	mov		eax, nSrcLen
	mov		ecx, nDstLen
	add		pSrc, eax		; next source row
	add		pDst, ecx		; next destination row pair
	dec		ebp
	jnz		near .row

	sfence					; order the non-temporal stores
.done:
	POPREGS
	emms
	ret
