;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; asmblt16.asm
;
; Copy Bitmap to DirectDraw Surface (16bit color to 16bit color)
;
; for MSVC and GCC, use 'COFF' format:
;    nasmw -f win32 asmblt16.asm
;
; for BCC32, use 'OMF' format:
;    nasmw -f obj asmblt16.asm
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;-----------------------------------------------------------------------------
;
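; asmblit(BYTE *pSrc,    // source start address
;         BYTE *pDst,    // destination start address
;         int nSrcLen,   // bytes from one source line to the start of the next
;         int nDstLen,   // bytes from one destination line to the start of the next
;         int nWidth,    // width of the image to transfer (pixels)
;         int nHeight)   // height of the image to transfer (pixels)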
;
;-----------------------------------------------------------------------------
;
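; These routines were written as an experiment; the speed difference turned
; out to be almost nil.  Feel free to optimize them yourself.
;
; Note:
;
; Only compilation with VC has been considered.
; If you use another compiler, pay attention to which registers
; have to be saved.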
;
;-----------------------------------------------------------------------------

        ; Generate 32-bit code.
        BITS 32

        ; Code section.  USE32 and CLASS=CODE are segment qualifiers of the
        ; OMF ('obj') output format mentioned in the file header.
        SECTION .text USE32 CLASS=CODE

        ; Symbols defined outside this file.
        ; _brightmask25 / _brightmask50 are read as a dword by the scalar
        ; scanline routines and as a qword by the MMX/SSE scanline routines.
        EXTERN _brightmask25
        EXTERN _brightmask50
        ; Declared but not referenced by any routine visible in this file.
        EXTERN _palette_lookup

	; Exported entry points: plain x86 versions.
	GLOBAL _asmblit16
	GLOBAL _asmblit16_double
	GLOBAL _asmblit16_scanline0
	GLOBAL _asmblit16_scanline25
	GLOBAL _asmblit16_scanline50
	GLOBAL _asmblit16_scanline75

	; Exported entry points: MMX versions.
	GLOBAL _asmblit16_mmx
	GLOBAL _asmblit16_double_mmx
	GLOBAL _asmblit16_scanline0_mmx
	GLOBAL _asmblit16_scanline25_mmx
	GLOBAL _asmblit16_scanline50_mmx
	GLOBAL _asmblit16_scanline75_mmx

	; Exported entry points: versions using SSE-era instructions
	; (prefetchnta, movntq, movntps).
	GLOBAL _asmblit16_sse
	GLOBAL _asmblit16_double_sse
	GLOBAL _asmblit16_scanline0_sse
	GLOBAL _asmblit16_scanline25_sse
	GLOBAL _asmblit16_scanline50_sse
	GLOBAL _asmblit16_scanline75_sse


;-----------------------------------------------------------------------------
; Macros
;-----------------------------------------------------------------------------

; Number of registers saved by PUSHREGS.  The argument accessors below use it
; to step over the saved registers (plus the return address) on the stack.
%define REGS    6

; Stack arguments, addressed relative to ESP as it stands after PUSHREGS.
; They are only valid while ESP is unchanged from that point.
%define pSrc    dword [esp + (REGS+1)*4]        ; 1st argument
%define pDst    dword [esp + (REGS+2)*4]        ; 2nd argument
%define nSrcLen dword [esp + (REGS+3)*4]        ; 3rd argument
%define nDstLen dword [esp + (REGS+4)*4]        ; 4th argument
%define nWidth  dword [esp + (REGS+5)*4]        ; 5th argument
%define nHeight dword [esp + (REGS+6)*4]        ; 6th argument

; Save every general-purpose register the routines use, except eax.
; Must push exactly REGS registers.
%macro PUSHREGS 0
        push    edi
        push    esi
        push    ecx
        push    edx
        push    ebx
        push    ebp
%endmacro

; Restore the registers saved by PUSHREGS (reverse order).
%macro POPREGS 0
        pop     ebp
        pop     ebx
        pop     edx
        pop     ecx
        pop     esi
        pop     edi
%endmacro


;-----------------------------------------------------------------------------
; 16bit to 16bit
;
; Row-by-row copy using REP MOVSD.
;
; Stack arguments (read through the pSrc..nHeight accessors):
;   pSrc, pDst        first byte of the current source / destination row
;   nSrcLen, nDstLen  added to pSrc / pDst after every row
;   nWidth            pixels per row; nWidth/2 dwords are copied, so a
;                     trailing odd pixel is not transferred
;   nHeight           rows to copy; the counter is decremented before it is
;                     tested, so it must be non-zero on entry
;
; The pSrc and pDst stack slots are advanced in place.  Registers saved by
; PUSHREGS are restored; eax is left holding nSrcLen and DF is cleared.
;-----------------------------------------------------------------------------

        ALIGN 32
_asmblit16:
        PUSHREGS

        mov     ebx, nWidth
        mov     edx, nHeight            ; edx = rows remaining
        mov     eax, nSrcLen            ; eax = source row step (bytes)
        mov     ebp, nDstLen            ; ebp = destination row step (bytes)
        cld                             ; MOVSD has to walk upwards
        shr     ebx, 1                  ; ebx = dwords per row (2 pixels each)

        ALIGN 16
.next_row:
        mov     edi, pDst
        mov     esi, pSrc
        mov     ecx, ebx
        rep     movsd                   ; copy one row
        add     pDst, ebp               ; advance the stack copies of the
        add     pSrc, eax               ; row pointers
        dec     edx
        jnz     short .next_row

        POPREGS
        ret


;-----------------------------------------------------------------------------
; 16bit to 16bit double pixel
;
; Every source pixel is written twice horizontally, and every source row is
; written to two consecutive destination rows.
;
; Stack arguments (read through the pSrc..nHeight accessors):
;   pSrc, pDst        first byte of the current source / destination row
;   nSrcLen           added to pSrc after every source row
;   nDstLen           added to pDst after every destination row
;   nWidth            source pixels per row; processed two at a time, so a
;                     trailing odd pixel is dropped and nWidth must be >= 2
;   nHeight           source rows; must be non-zero on entry
;
; The nWidth stack slot is halved in place and pSrc/pDst are advanced in
; place.  Registers saved by PUSHREGS are restored; eax holds scratch data
; on return.
;
; Register use:  esi = rows remaining, ebp = source pointer,
;                edi = destination pointer, ecx = pixel pairs remaining,
;                eax/ebx/edx = scratch.
;-----------------------------------------------------------------------------

        ALIGN 32
_asmblit16_double:
        PUSHREGS

        mov     esi, nHeight
        shr     nWidth, 1               ; stack slot now = pixel pairs per row

        ALIGN 16
.row:
        mov     ebp, pSrc
        mov     edi, pDst
        mov     ecx, nWidth

        ; ---- first destination row ----
        ALIGN 16
.first:
        mov     eax, [ebp]              ; eax = p1:p0
        mov     ebx, eax
        and     eax, 0x0000ffff         ; eax = 0:p0
        mov     edx, eax
        shl     eax, 16                 ; eax = p0:0
        or      edx, eax                ; edx = p0:p0
        and     ebx, 0xffff0000         ; ebx = p1:0
        mov     eax, ebx
        shr     eax, 16                 ; eax = 0:p1
        or      eax, ebx                ; eax = p1:p1
        mov     [edi], edx
        mov     [edi + 4], eax
        add     ebp, 4
        add     edi, 8
        dec     ecx                     ; plain counted loop: the exit must not
        jnz     short .first            ; depend on ZF from the pointer adds

        mov     eax, nDstLen
        add     pDst, eax               ; step to the second destination row

        mov     ebp, pSrc               ; same source row again
        mov     edi, pDst
        mov     ecx, nWidth

        ; ---- second destination row (identical data) ----
        ALIGN 16
.second:
        mov     eax, [ebp]              ; eax = p1:p0
        mov     ebx, eax
        and     eax, 0x0000ffff         ; eax = 0:p0
        mov     edx, eax
        shl     eax, 16                 ; eax = p0:0
        or      edx, eax                ; edx = p0:p0
        and     ebx, 0xffff0000         ; ebx = p1:0
        mov     eax, ebx
        shr     eax, 16                 ; eax = 0:p1
        or      eax, ebx                ; eax = p1:p1
        mov     [edi], edx
        mov     [edi + 4], eax
        add     ebp, 4
        add     edi, 8
        dec     ecx
        jnz     short .second

        mov     eax, nSrcLen
        mov     edx, nDstLen
        add     pSrc, eax
        add     pDst, edx
        dec     esi
        jnz     near .row

        POPREGS
        ret


;-----------------------------------------------------------------------------
; 16bit to 16bit 0% scanlines
;
; Every source pixel is written twice horizontally.  Only one destination
; row is written per source row; pDst then advances by 2*nDstLen, so the
; destination row in between is not touched by this routine.
;
; Stack arguments (read through the pSrc..nHeight accessors):
;   pSrc, pDst        first byte of the current source / destination row
;   nSrcLen           added to pSrc after every source row
;   nDstLen           destination row step; doubled in place on entry
;   nWidth            source pixels per row; processed two at a time, so a
;                     trailing odd pixel is dropped and nWidth must be >= 2
;   nHeight           source rows; must be non-zero on entry
;
; The nWidth and nDstLen stack slots are rewritten and pSrc/pDst are
; advanced in place.  Registers saved by PUSHREGS are restored; eax holds
; scratch data on return.
;
; Register use:  esi = rows remaining, ebp = source pointer,
;                edi = destination pointer, ecx = pixel pairs remaining,
;                eax/ebx/edx = scratch.
;-----------------------------------------------------------------------------

        ALIGN 32
_asmblit16_scanline0:
        PUSHREGS

        mov     esi, nHeight
        shr     nWidth, 1               ; stack slot now = pixel pairs per row
        shl     nDstLen, 1              ; stack slot now = two destination rows

        ALIGN 16
.row:
        mov     ebp, pSrc
        mov     edi, pDst
        mov     ecx, nWidth

        ALIGN 16
.pair:
        mov     eax, [ebp]              ; eax = p1:p0
        mov     ebx, eax
        and     eax, 0x0000ffff         ; eax = 0:p0
        mov     edx, eax
        shl     eax, 16                 ; eax = p0:0
        or      edx, eax                ; edx = p0:p0
        and     ebx, 0xffff0000         ; ebx = p1:0
        mov     eax, ebx
        shr     eax, 16                 ; eax = 0:p1
        or      eax, ebx                ; eax = p1:p1
        mov     [edi], edx
        mov     [edi + 4], eax
        add     ebp, 4
        add     edi, 8
        dec     ecx                     ; plain counted loop: the exit must not
        jnz     short .pair             ; depend on ZF from the pointer adds

        mov     eax, nSrcLen
        mov     edx, nDstLen
        add     pSrc, eax
        add     pDst, edx
        dec     esi
        jnz     short .row

        POPREGS
        ret


;-----------------------------------------------------------------------------
; 16bit to 16bit 25% scanlines
;
; Each source row produces two destination rows:
;   row 1: every pixel doubled horizontally, unchanged;
;   row 2: the same doubled pixels shifted right by 2 bits and ANDed with
;          the dword at _brightmask25.
;
; Stack arguments (read through the pSrc..nHeight accessors):
;   pSrc, pDst        first byte of the current source / destination row
;   nSrcLen           added to pSrc after every source row
;   nDstLen           added to pDst after every destination row
;   nWidth            source pixels per row; processed two at a time, so a
;                     trailing odd pixel is dropped and nWidth must be >= 2
;   nHeight           source rows; counted down in its stack slot, must be
;                     non-zero on entry
;
; The nWidth and nHeight stack slots are rewritten and pSrc/pDst are
; advanced in place.  Registers saved by PUSHREGS are restored; eax holds
; scratch data on return.
;
; Register use:  esi = brightness mask, ebp = source pointer,
;                edi = destination pointer, ecx = pixel pairs remaining,
;                eax/ebx/edx = scratch.
;-----------------------------------------------------------------------------

        ALIGN 32
_asmblit16_scanline25:
        PUSHREGS

        mov     esi, dword [_brightmask25]
        shr     nWidth, 1               ; stack slot now = pixel pairs per row

        ALIGN 16
.row:
        mov     ebp, pSrc
        mov     edi, pDst
        mov     ecx, nWidth

        ; ---- first destination row: doubled pixels as they are ----
        ALIGN 16
.bright:
        mov     eax, [ebp]              ; eax = p1:p0
        mov     ebx, eax
        and     eax, 0x0000ffff         ; eax = 0:p0
        mov     edx, eax
        shl     eax, 16                 ; eax = p0:0
        or      edx, eax                ; edx = p0:p0
        and     ebx, 0xffff0000         ; ebx = p1:0
        mov     eax, ebx
        shr     eax, 16                 ; eax = 0:p1
        or      eax, ebx                ; eax = p1:p1
        mov     [edi], edx
        mov     [edi + 4], eax
        add     ebp, 4
        add     edi, 8
        dec     ecx                     ; plain counted loop: the exit must not
        jnz     short .bright           ; depend on ZF from the pointer adds

        mov     eax, nDstLen
        add     pDst, eax               ; step to the second destination row

        mov     ebp, pSrc               ; same source row again
        mov     edi, pDst
        mov     ecx, nWidth

        ; ---- second destination row: (pixels >> 2) & mask ----
        ALIGN 16
.dim:
        mov     eax, [ebp]              ; eax = p1:p0
        mov     ebx, eax
        and     eax, 0x0000ffff         ; eax = 0:p0
        mov     edx, eax
        shl     eax, 16                 ; eax = p0:0
        or      edx, eax                ; edx = p0:p0
        shr     edx, 2
        and     edx, esi                ; mask off bits shifted across fields
        and     ebx, 0xffff0000         ; ebx = p1:0
        mov     eax, ebx
        shr     eax, 16                 ; eax = 0:p1
        or      eax, ebx                ; eax = p1:p1
        shr     eax, 2
        and     eax, esi
        mov     [edi], edx
        mov     [edi + 4], eax
        add     ebp, 4
        add     edi, 8
        dec     ecx
        jnz     short .dim

        mov     eax, nSrcLen
        mov     edx, nDstLen
        add     pSrc, eax
        add     pDst, edx
        dec     nHeight
        jnz     near .row

        POPREGS
        ret


;-----------------------------------------------------------------------------
; 16bit to 16bit 50% scanlines
;
; Each source row produces two destination rows:
;   row 1: every pixel doubled horizontally, unchanged;
;   row 2: the same doubled pixels shifted right by 1 bit and ANDed with
;          the dword at _brightmask50.
;
; Stack arguments (read through the pSrc..nHeight accessors):
;   pSrc, pDst        first byte of the current source / destination row
;   nSrcLen           added to pSrc after every source row
;   nDstLen           added to pDst after every destination row
;   nWidth            source pixels per row; processed two at a time, so a
;                     trailing odd pixel is dropped and nWidth must be >= 2
;   nHeight           source rows; counted down in its stack slot, must be
;                     non-zero on entry
;
; The nWidth and nHeight stack slots are rewritten and pSrc/pDst are
; advanced in place.  Registers saved by PUSHREGS are restored; eax holds
; scratch data on return.
;
; Register use:  esi = brightness mask, ebp = source pointer,
;                edi = destination pointer, ecx = pixel pairs remaining,
;                eax/ebx/edx = scratch.
;-----------------------------------------------------------------------------

        ALIGN 32
_asmblit16_scanline50:
        PUSHREGS

        mov     esi, dword [_brightmask50]
        shr     nWidth, 1               ; stack slot now = pixel pairs per row

        ALIGN 16
.row:
        mov     ebp, pSrc
        mov     edi, pDst
        mov     ecx, nWidth

        ; ---- first destination row: doubled pixels as they are ----
        ALIGN 16
.bright:
        mov     eax, [ebp]              ; eax = p1:p0
        mov     ebx, eax
        and     eax, 0x0000ffff         ; eax = 0:p0
        mov     edx, eax
        shl     eax, 16                 ; eax = p0:0
        or      edx, eax                ; edx = p0:p0
        and     ebx, 0xffff0000         ; ebx = p1:0
        mov     eax, ebx
        shr     eax, 16                 ; eax = 0:p1
        or      eax, ebx                ; eax = p1:p1
        mov     [edi], edx
        mov     [edi + 4], eax
        add     ebp, 4
        add     edi, 8
        dec     ecx                     ; plain counted loop: the exit must not
        jnz     short .bright           ; depend on ZF from the pointer adds

        mov     eax, nDstLen
        add     pDst, eax               ; step to the second destination row

        mov     ebp, pSrc               ; same source row again
        mov     edi, pDst
        mov     ecx, nWidth

        ; ---- second destination row: (pixels >> 1) & mask ----
        ALIGN 16
.dim:
        mov     eax, [ebp]              ; eax = p1:p0
        mov     ebx, eax
        and     eax, 0x0000ffff         ; eax = 0:p0
        mov     edx, eax
        shl     eax, 16                 ; eax = p0:0
        or      edx, eax                ; edx = p0:p0
        shr     edx, 1
        and     edx, esi                ; mask off bits shifted across fields
        and     ebx, 0xffff0000         ; ebx = p1:0
        mov     eax, ebx
        shr     eax, 16                 ; eax = 0:p1
        or      eax, ebx                ; eax = p1:p1
        shr     eax, 1
        and     eax, esi
        mov     [edi], edx
        mov     [edi + 4], eax
        add     ebp, 4
        add     edi, 8
        dec     ecx
        jnz     short .dim

        mov     eax, nSrcLen
        mov     edx, nDstLen
        add     pSrc, eax
        add     pDst, edx
        dec     nHeight
        jnz     near .row

        POPREGS
        ret


;-----------------------------------------------------------------------------
; 16bit to 16bit 75% scanlines
;
; Each source row produces two destination rows:
;   row 1: every pixel doubled horizontally, unchanged;
;   row 2: for the same doubled pixels,
;          ((pixels >> 1) & _brightmask50) + ((pixels >> 2) & _brightmask25).
;
; Stack arguments (read through the pSrc..nHeight accessors):
;   pSrc, pDst        first byte of the current source / destination row
;   nSrcLen           added to pSrc after every source row
;   nDstLen           added to pDst after every destination row
;   nWidth            source pixels per row; processed two at a time, so a
;                     trailing odd pixel is dropped and nWidth must be >= 2
;   nHeight           source rows; counted down in its stack slot, must be
;                     non-zero on entry
;
; The nHeight stack slot is rewritten and pSrc/pDst are advanced in place.
; Registers saved by PUSHREGS are restored; eax holds scratch data on return.
;
; Register use:  esi = pixel pairs per row, ebp = source pointer,
;                edi = destination pointer, ecx = pixel pairs remaining,
;                eax/ebx/edx = scratch.
;-----------------------------------------------------------------------------

        ALIGN 32
_asmblit16_scanline75:
        PUSHREGS

        mov     esi, nWidth
        shr     esi, 1                  ; esi = pixel pairs per row

        ALIGN 16
.row:
        mov     ebp, pSrc
        mov     edi, pDst
        mov     ecx, esi                ; one iteration per pixel PAIR; using
                                        ; nWidth itself would run past the row

        ; ---- first destination row: doubled pixels as they are ----
        ALIGN 16
.bright:
        mov     eax, [ebp]              ; eax = p1:p0
        mov     ebx, eax
        and     eax, 0x0000ffff         ; eax = 0:p0
        mov     edx, eax
        shl     eax, 16                 ; eax = p0:0
        or      edx, eax                ; edx = p0:p0
        and     ebx, 0xffff0000         ; ebx = p1:0
        mov     eax, ebx
        shr     eax, 16                 ; eax = 0:p1
        or      eax, ebx                ; eax = p1:p1
        mov     [edi], edx
        mov     [edi + 4], eax
        add     ebp, 4
        add     edi, 8
        dec     ecx                     ; plain counted loop: the exit must not
        jnz     short .bright           ; depend on ZF from the pointer adds

        mov     eax, nDstLen
        add     pDst, eax               ; step to the second destination row

        mov     ebp, pSrc               ; same source row again
        mov     edi, pDst
        mov     ecx, esi

        ; ---- second destination row: 50% term + 25% term ----
        ALIGN 16
.dim:
        mov     eax, [ebp]              ; eax = p1:p0
        mov     ebx, eax
        and     eax, 0x0000ffff         ; eax = 0:p0
        mov     edx, eax
        shl     eax, 16                 ; eax = p0:0
        or      eax, edx                ; eax = p0:p0
        mov     edx, eax
        shr     eax, 1
        shr     edx, 2
        and     eax, dword [_brightmask50]
        and     edx, dword [_brightmask25]
        add     edx, eax                ; edx = dimmed p0:p0
        and     ebx, 0xffff0000         ; ebx = p1:0
        mov     eax, ebx
        shr     eax, 16                 ; eax = 0:p1
        or      eax, ebx                ; eax = p1:p1
        mov     ebx, eax
        shr     eax, 1
        shr     ebx, 2
        and     eax, dword [_brightmask50]
        and     ebx, dword [_brightmask25]
        add     eax, ebx                ; eax = dimmed p1:p1
        mov     [edi], edx
        mov     [edi + 4], eax
        add     ebp, 4
        add     edi, 8
        dec     ecx
        jnz     short .dim

        mov     eax, nSrcLen
        mov     edx, nDstLen
        add     pSrc, eax
        add     pDst, edx
        dec     nHeight
        jnz     near .row

        POPREGS
        ret

;*****************************************************************************

;-----------------------------------------------------------------------------
; 16bit to 16bit with MMX
;
; Row-by-row copy, 32 bytes (16 pixels) per inner iteration through
; mm0..mm3.
;
; Stack arguments (read through the pSrc..nHeight accessors):
;   pSrc, pDst        first byte of the current source / destination row
;   nSrcLen, nDstLen  added to pSrc / pDst after every row
;   nWidth            pixels per row; nWidth/16 chunks are copied, so the
;                     remainder is not transferred and nWidth must be >= 16
;   nHeight           rows to copy; must be non-zero on entry
;
; pSrc/pDst are advanced in place.  Registers saved by PUSHREGS are
; restored, EMMS is executed before returning, eax is left holding nSrcLen.
;
; Register use:  eax/ebp = source/destination row step, ebx = chunks per
;                row, esi = rows remaining, edx/edi = source/destination
;                pointer, ecx = chunks remaining.
;-----------------------------------------------------------------------------

        ALIGN 32
_asmblit16_mmx:
        PUSHREGS

        mov     eax, nSrcLen
        mov     ebp, nDstLen
        mov     ebx, nWidth
        mov     esi, nHeight
        shr     ebx, 4                  ; ebx = 16-pixel chunks per row

        ALIGN 16
.row:
        mov     edx, pSrc
        mov     edi, pDst
        mov     ecx, ebx

        ALIGN 16
.chunk:
        movq    mm0, [edx]
        movq    mm1, [edx + 8]
        movq    mm2, [edx + 16]
        movq    mm3, [edx + 24]
        movq    [edi], mm0
        movq    [edi + 8], mm1
        movq    [edi + 16], mm2
        movq    [edi + 24], mm3
        add     edx, 32
        add     edi, 32
        dec     ecx                     ; plain counted loop: the exit must not
        jnz     short .chunk            ; depend on ZF from the pointer adds

        add     pSrc, eax
        add     pDst, ebp
        dec     esi
        jnz     short .row

        POPREGS
        emms                            ; leave the x87/MMX state clean
        ret



;-----------------------------------------------------------------------------
; 16bit to 16bit double pixel with MMX
;
; Every source pixel is written twice horizontally (MOVD loads two pixels,
; PUNPCKLWD with itself turns them into p0,p0,p1,p1), and every source row
; is written to two consecutive destination rows.
;
; Stack arguments (read through the pSrc..nHeight accessors):
;   pSrc, pDst        first byte of the current source / destination row
;   nSrcLen           added to pSrc after every source row
;   nDstLen           added to pDst after every destination row
;   nWidth            source pixels per row; nWidth/16 chunks of 16 pixels
;                     are processed, the remainder is dropped and nWidth
;                     must be >= 16
;   nHeight           source rows; must be non-zero on entry
;
; pSrc/pDst are advanced in place.  Registers saved by PUSHREGS are
; restored, EMMS is executed before returning, eax is left holding nSrcLen.
;
; Register use:  eax/ebp = source/destination row step, ebx = chunks per
;                row, esi = rows remaining, edx/edi = source/destination
;                pointer, ecx = chunks remaining.
;-----------------------------------------------------------------------------

        ALIGN 32
_asmblit16_double_mmx:
        PUSHREGS

        mov     eax, nSrcLen
        mov     ebp, nDstLen
        mov     ebx, nWidth
        mov     esi, nHeight
        shr     ebx, 4                  ; ebx = 16-pixel chunks per row

        ALIGN 16
.row:
        mov     edx, pSrc
        mov     edi, pDst
        mov     ecx, ebx

        ; ---- first destination row ----
        ALIGN 16
.first:
        movd    mm0, [edx]              ; two source pixels per register
        movd    mm1, [edx + 4]
        movd    mm2, [edx + 8]
        movd    mm3, [edx + 12]
        movd    mm4, [edx + 16]
        movd    mm5, [edx + 20]
        movd    mm6, [edx + 24]
        movd    mm7, [edx + 28]
        punpcklwd mm0, mm0              ; p0,p1 -> p0,p0,p1,p1
        punpcklwd mm1, mm1
        punpcklwd mm2, mm2
        punpcklwd mm3, mm3
        punpcklwd mm4, mm4
        punpcklwd mm5, mm5
        punpcklwd mm6, mm6
        punpcklwd mm7, mm7
        movq    [edi], mm0
        movq    [edi + 8], mm1
        movq    [edi + 16], mm2
        movq    [edi + 24], mm3
        movq    [edi + 32], mm4
        movq    [edi + 40], mm5
        movq    [edi + 48], mm6
        movq    [edi + 56], mm7
        add     edx, 32
        add     edi, 64
        dec     ecx                     ; plain counted loop: the exit must not
        jnz     short .first            ; depend on ZF from the pointer adds

        add     pDst, ebp               ; step to the second destination row
        mov     edx, pSrc               ; same source row again
        mov     edi, pDst
        mov     ecx, ebx

        ; ---- second destination row (identical data) ----
        ALIGN 16
.second:
        movd    mm0, [edx]
        movd    mm1, [edx + 4]
        movd    mm2, [edx + 8]
        movd    mm3, [edx + 12]
        movd    mm4, [edx + 16]
        movd    mm5, [edx + 20]
        movd    mm6, [edx + 24]
        movd    mm7, [edx + 28]
        punpcklwd mm0, mm0
        punpcklwd mm1, mm1
        punpcklwd mm2, mm2
        punpcklwd mm3, mm3
        punpcklwd mm4, mm4
        punpcklwd mm5, mm5
        punpcklwd mm6, mm6
        punpcklwd mm7, mm7
        movq    [edi], mm0
        movq    [edi + 8], mm1
        movq    [edi + 16], mm2
        movq    [edi + 24], mm3
        movq    [edi + 32], mm4
        movq    [edi + 40], mm5
        movq    [edi + 48], mm6
        movq    [edi + 56], mm7
        add     edx, 32
        add     edi, 64
        dec     ecx
        jnz     short .second

        add     pSrc, eax
        add     pDst, ebp
        dec     esi
        jnz     near .row

        POPREGS
        emms                            ; leave the x87/MMX state clean
        ret


;-----------------------------------------------------------------------------
; 16bit to 16bit 0% scanlines with MMX
;
; Every source pixel is written twice horizontally.  Only one destination
; row is written per source row; pDst then advances by 2*nDstLen, so the
; destination row in between is not touched by this routine.
;
; Stack arguments (read through the pSrc..nHeight accessors):
;   pSrc, pDst        first byte of the current source / destination row
;   nSrcLen           added to pSrc after every source row
;   nDstLen           destination row step (doubled in a register)
;   nWidth            source pixels per row; nWidth/16 chunks of 16 pixels
;                     are processed, the remainder is dropped and nWidth
;                     must be >= 16
;   nHeight           source rows; must be non-zero on entry
;
; pSrc/pDst are advanced in place.  Registers saved by PUSHREGS are
; restored, EMMS is executed before returning, eax is left holding nSrcLen.
;
; Register use:  eax = source row step, ebp = 2 * destination row step,
;                ebx = chunks per row, esi = rows remaining,
;                edx/edi = source/destination pointer, ecx = chunks remaining.
;-----------------------------------------------------------------------------

        ALIGN 32
_asmblit16_scanline0_mmx:
        PUSHREGS

        mov     eax, nSrcLen
        mov     ebp, nDstLen
        mov     ebx, nWidth
        mov     esi, nHeight
        shl     ebp, 1                  ; skip one destination row each time
        shr     ebx, 4                  ; ebx = 16-pixel chunks per row

        ALIGN 16
.row:
        mov     edx, pSrc
        mov     edi, pDst
        mov     ecx, ebx

        ALIGN 16
.chunk:
        movd    mm0, [edx]              ; two source pixels per register
        movd    mm1, [edx + 4]
        movd    mm2, [edx + 8]
        movd    mm3, [edx + 12]
        movd    mm4, [edx + 16]
        movd    mm5, [edx + 20]
        movd    mm6, [edx + 24]
        movd    mm7, [edx + 28]
        punpcklwd mm0, mm0              ; p0,p1 -> p0,p0,p1,p1
        punpcklwd mm1, mm1
        punpcklwd mm2, mm2
        punpcklwd mm3, mm3
        punpcklwd mm4, mm4
        punpcklwd mm5, mm5
        punpcklwd mm6, mm6
        punpcklwd mm7, mm7
        movq    [edi], mm0
        movq    [edi + 8], mm1
        movq    [edi + 16], mm2
        movq    [edi + 24], mm3
        movq    [edi + 32], mm4
        movq    [edi + 40], mm5
        movq    [edi + 48], mm6
        movq    [edi + 56], mm7
        add     edx, 32
        add     edi, 64
        dec     ecx                     ; plain counted loop: the exit must not
        jnz     short .chunk            ; depend on ZF from the pointer adds

        add     pSrc, eax
        add     pDst, ebp
        dec     esi
        jnz     near .row

        POPREGS
        emms                            ; leave the x87/MMX state clean
        ret


;-----------------------------------------------------------------------------
; 16bit to 16bit 25% scanlines with MMX
;
; Each source row produces two destination rows:
;   row 1: every pixel doubled horizontally, unchanged;
;   row 2: the same doubled pixels shifted right by 2 bits (as a 64-bit
;          quantity) and ANDed with the qword at _brightmask25.
;
; Stack arguments (read through the pSrc..nHeight accessors):
;   pSrc, pDst        first byte of the current source / destination row
;   nSrcLen           added to pSrc after every source row
;   nDstLen           added to pDst after every destination row
;   nWidth            source pixels per row; nWidth/8 chunks of 8 pixels
;                     are processed, the remainder is dropped and nWidth
;                     must be >= 8
;   nHeight           source rows; must be non-zero on entry
;
; pSrc/pDst are advanced in place.  Registers saved by PUSHREGS are
; restored, EMMS is executed before returning, eax is left holding nSrcLen.
;
; Register use:  eax/ebp = source/destination row step, ebx = chunks per
;                row, esi = rows remaining, edx/edi = source/destination
;                pointer, ecx = chunks remaining, mm4..mm7 = mask copies.
;-----------------------------------------------------------------------------

        ALIGN 32
_asmblit16_scanline25_mmx:
        PUSHREGS

        mov     eax, nSrcLen
        mov     ebp, nDstLen
        mov     ebx, nWidth
        mov     esi, nHeight
        movq    mm4, [_brightmask25]    ; 8 bytes are read from the mask
        movq    mm5, mm4
        movq    mm6, mm4
        movq    mm7, mm4
        shr     ebx, 3                  ; ebx = 8-pixel chunks per row

        ALIGN 16
.row:
        mov     edx, pSrc
        mov     edi, pDst
        mov     ecx, ebx

        ; ---- first destination row: doubled pixels as they are ----
        ALIGN 16
.bright:
        movd    mm0, [edx]              ; two source pixels per register
        movd    mm1, [edx + 4]
        movd    mm2, [edx + 8]
        movd    mm3, [edx + 12]
        punpcklwd mm0, mm0              ; p0,p1 -> p0,p0,p1,p1
        punpcklwd mm1, mm1
        punpcklwd mm2, mm2
        punpcklwd mm3, mm3
        movq    [edi], mm0
        movq    [edi + 8], mm1
        movq    [edi + 16], mm2
        movq    [edi + 24], mm3
        add     edx, 16
        add     edi, 32
        dec     ecx                     ; plain counted loop: the exit must not
        jnz     short .bright           ; depend on ZF from the pointer adds

        add     pDst, ebp               ; step to the second destination row
        mov     edx, pSrc               ; same source row again
        mov     edi, pDst
        mov     ecx, ebx

        ; ---- second destination row: (pixels >> 2) & mask ----
        ALIGN 16
.dim:
        movd    mm0, [edx]
        movd    mm1, [edx + 4]
        movd    mm2, [edx + 8]
        movd    mm3, [edx + 12]
        punpcklwd mm0, mm0
        punpcklwd mm1, mm1
        punpcklwd mm2, mm2
        punpcklwd mm3, mm3
        psrlq   mm0, 2
        psrlq   mm1, 2
        psrlq   mm2, 2
        psrlq   mm3, 2
        pand    mm0, mm4                ; mask off bits shifted across fields
        pand    mm1, mm5
        pand    mm2, mm6
        pand    mm3, mm7
        movq    [edi], mm0
        movq    [edi + 8], mm1
        movq    [edi + 16], mm2
        movq    [edi + 24], mm3
        add     edx, 16
        add     edi, 32
        dec     ecx
        jnz     short .dim

        add     pSrc, eax
        add     pDst, ebp
        dec     esi
        jnz     near .row

        POPREGS
        emms                            ; leave the x87/MMX state clean
        ret


;-----------------------------------------------------------------------------
; 16bit to 16bit 50% scanlines with MMX
;
; Each source row produces two destination rows:
;   row 1: every pixel doubled horizontally, unchanged;
;   row 2: the same doubled pixels shifted right by 1 bit (as a 64-bit
;          quantity) and ANDed with the qword at _brightmask50.
;
; Stack arguments (read through the pSrc..nHeight accessors):
;   pSrc, pDst        first byte of the current source / destination row
;   nSrcLen           added to pSrc after every source row
;   nDstLen           added to pDst after every destination row
;   nWidth            source pixels per row; nWidth/8 chunks of 8 pixels
;                     are processed, the remainder is dropped and nWidth
;                     must be >= 8
;   nHeight           source rows; must be non-zero on entry
;
; pSrc/pDst are advanced in place.  Registers saved by PUSHREGS are
; restored, EMMS is executed before returning, eax is left holding nSrcLen.
;
; Register use:  eax/ebp = source/destination row step, ebx = chunks per
;                row, esi = rows remaining, edx/edi = source/destination
;                pointer, ecx = chunks remaining, mm4..mm7 = mask copies.
;-----------------------------------------------------------------------------

        ALIGN 32
_asmblit16_scanline50_mmx:
        PUSHREGS

        mov     eax, nSrcLen
        mov     ebp, nDstLen
        mov     ebx, nWidth
        mov     esi, nHeight
        movq    mm4, [_brightmask50]    ; 8 bytes are read from the mask
        movq    mm5, mm4
        movq    mm6, mm4
        movq    mm7, mm4
        shr     ebx, 3                  ; ebx = 8-pixel chunks per row

        ALIGN 16
.row:
        mov     edx, pSrc
        mov     edi, pDst
        mov     ecx, ebx

        ; ---- first destination row: doubled pixels as they are ----
        ALIGN 16
.bright:
        movd    mm0, [edx]              ; two source pixels per register
        movd    mm1, [edx + 4]
        movd    mm2, [edx + 8]
        movd    mm3, [edx + 12]
        punpcklwd mm0, mm0              ; p0,p1 -> p0,p0,p1,p1
        punpcklwd mm1, mm1
        punpcklwd mm2, mm2
        punpcklwd mm3, mm3
        movq    [edi], mm0
        movq    [edi + 8], mm1
        movq    [edi + 16], mm2
        movq    [edi + 24], mm3
        add     edx, 16
        add     edi, 32
        dec     ecx                     ; plain counted loop: the exit must not
        jnz     short .bright           ; depend on ZF from the pointer adds

        add     pDst, ebp               ; step to the second destination row
        mov     edx, pSrc               ; same source row again
        mov     edi, pDst
        mov     ecx, ebx

        ; ---- second destination row: (pixels >> 1) & mask ----
        ALIGN 16
.dim:
        movd    mm0, [edx]
        movd    mm1, [edx + 4]
        movd    mm2, [edx + 8]
        movd    mm3, [edx + 12]
        punpcklwd mm0, mm0
        punpcklwd mm1, mm1
        punpcklwd mm2, mm2
        punpcklwd mm3, mm3
        psrlq   mm0, 1
        psrlq   mm1, 1
        psrlq   mm2, 1
        psrlq   mm3, 1
        pand    mm0, mm4                ; mask off bits shifted across fields
        pand    mm1, mm5
        pand    mm2, mm6
        pand    mm3, mm7
        movq    [edi], mm0
        movq    [edi + 8], mm1
        movq    [edi + 16], mm2
        movq    [edi + 24], mm3
        add     edx, 16
        add     edi, 32
        dec     ecx
        jnz     short .dim

        add     pSrc, eax
        add     pDst, ebp
        dec     esi
        jnz     near .row

        POPREGS
        emms                            ; leave the x87/MMX state clean
        ret


;-----------------------------------------------------------------------------
; 16bit to 16bit 75% scanlines with MMX
;
; Each source row produces two destination rows:
;   row 1: every pixel doubled horizontally, unchanged;
;   row 2: for the same doubled pixels,
;          ((pixels >> 1) & _brightmask50) + ((pixels >> 2) & _brightmask25),
;          summed per 16-bit pixel.
;
; Stack arguments (read through the pSrc..nHeight accessors):
;   pSrc, pDst        first byte of the current source / destination row
;   nSrcLen           added to pSrc after every source row
;   nDstLen           added to pDst after every destination row
;   nWidth            source pixels per row; nWidth/8 chunks of 8 pixels
;                     are processed, the remainder is dropped and nWidth
;                     must be >= 8
;   nHeight           source rows; must be non-zero on entry
;
; pSrc/pDst are advanced in place.  Registers saved by PUSHREGS are
; restored, EMMS is executed before returning, eax is left holding nSrcLen.
;
; Register use:  eax/ebp = source/destination row step, ebx = 8-pixel
;                chunks per row, esi = rows remaining, edx/edi = source/
;                destination pointer, ecx = chunks remaining,
;                mm4/mm5 = 50% mask, mm6/mm7 = 25% mask.
;-----------------------------------------------------------------------------

        ALIGN 32
_asmblit16_scanline75_mmx:
        PUSHREGS

        mov     eax, nSrcLen
        mov     ebp, nDstLen
        mov     ebx, nWidth
        mov     esi, nHeight
        movq    mm4, [_brightmask50]    ; 8 bytes are read from each mask
        movq    mm6, [_brightmask25]
        movq    mm5, mm4
        movq    mm7, mm6
        shr     ebx, 3                  ; ebx = 8-pixel chunks per row

        ALIGN 16
.row:
        mov     edx, pSrc
        mov     edi, pDst
        mov     ecx, ebx

        ; ---- first destination row: doubled pixels as they are ----
        ALIGN 16
.bright:
        movd    mm0, [edx]              ; two source pixels per register
        movd    mm1, [edx + 4]
        movd    mm2, [edx + 8]
        movd    mm3, [edx + 12]
        punpcklwd mm0, mm0              ; p0,p1 -> p0,p0,p1,p1
        punpcklwd mm1, mm1
        punpcklwd mm2, mm2
        punpcklwd mm3, mm3
        movq    [edi], mm0
        movq    [edi + 8], mm1
        movq    [edi + 16], mm2
        movq    [edi + 24], mm3
        add     edx, 16
        add     edi, 32
        dec     ecx                     ; plain counted loop: the exit must not
        jnz     short .bright           ; depend on ZF from the pointer adds

        add     pDst, ebp               ; step to the second destination row
        mov     edx, pSrc               ; same source row again
        mov     edi, pDst
        mov     ecx, ebx
        shl     ecx, 1                  ; this loop handles 4 pixels per pass

        ; ---- second destination row: 50% term + 25% term ----
        ALIGN 16
.dim:
        movd    mm0, [edx]
        movd    mm1, [edx + 4]
        punpcklwd mm0, mm0
        punpcklwd mm1, mm1
        movq    mm2, mm0
        movq    mm3, mm1
        psrlq   mm0, 1
        psrlq   mm1, 1
        psrlq   mm2, 2
        psrlq   mm3, 2
        pand    mm0, mm4                ; 50% term
        pand    mm1, mm5
        pand    mm2, mm6                ; 25% term
        pand    mm3, mm7
        ; Word-wise add: a colour field that straddles the two bytes of a
        ; pixel needs the carry from the low byte, which a byte-wise add
        ; would discard.
        paddw   mm0, mm2
        paddw   mm1, mm3
        movq    [edi], mm0
        movq    [edi + 8], mm1
        add     edx, 8
        add     edi, 16
        dec     ecx
        jnz     short .dim

        add     pSrc, eax
        add     pDst, ebp
        dec     esi
        jnz     near .row

        POPREGS
        emms                            ; leave the x87/MMX state clean
        ret


;*****************************************************************************

;-----------------------------------------------------------------------------
; 16bit to 16bit with SSE
;
; Row-by-row copy, 32 bytes (16 pixels) per inner iteration through
; xmm0/xmm1, written with non-temporal stores (MOVNTPS).
;
; Build-time switch:
;   SCRBMP_ALIGNED    non-zero selects aligned loads (MOVAPS), otherwise
;                     MOVUPS is used.  It is not defined in the visible part
;                     of this file, so it has to come from elsewhere (e.g.
;                     the assembler command line).
;
; Stack arguments (read through the pSrc..nHeight accessors):
;   pSrc, pDst        first byte of the current source / destination row.
;                     MOVNTPS requires a 16-byte-aligned memory operand, so
;                     every destination row must be 16-byte aligned.
;   nSrcLen, nDstLen  added to pSrc / pDst after every row
;   nWidth            pixels per row; nWidth/16 chunks are copied, so the
;                     remainder is not transferred and nWidth must be >= 16
;   nHeight           rows to copy; must be non-zero on entry
;
; pSrc/pDst are advanced in place.  Registers saved by PUSHREGS are
; restored; eax is left holding nSrcLen.
; NOTE(review): no SFENCE follows the non-temporal stores - confirm the
; caller does not need the data ordered/visible before its next step.
;
; Register use:  eax/ebp = source/destination row step, ebx = chunks per
;                row, esi = rows remaining, edx/edi = source/destination
;                pointer, ecx = chunks remaining.
;-----------------------------------------------------------------------------

        ALIGN 32
_asmblit16_sse:
        PUSHREGS

        mov     eax, nSrcLen
        mov     ebp, nDstLen
        mov     ebx, nWidth
        mov     esi, nHeight
        shr     ebx, 4                  ; ebx = 16-pixel chunks per row

        ALIGN 16
.row:
        mov     edx, pSrc
        mov     edi, pDst
        mov     ecx, ebx

        ALIGN 16
.chunk:
%if SCRBMP_ALIGNED
        movaps  xmm0, [edx]
        movaps  xmm1, [edx + 16]
%else
        movups  xmm0, [edx]
        movups  xmm1, [edx + 16]
%endif
        prefetchnta [edx + 64]          ; hint for source data further ahead
        movntps [edi], xmm0
        movntps [edi + 16], xmm1
        add     edx, 32
        add     edi, 32
        dec     ecx                     ; plain counted loop: the exit must not
        jnz     short .chunk            ; depend on ZF from the pointer adds

        add     pSrc, eax
        add     pDst, ebp
        dec     esi
        jnz     short .row

        POPREGS
        ret


;-----------------------------------------------------------------------------
; 16bit to 16bit 2x2 pixel with SSE
;
; Every source pixel is written twice horizontally (MOVD loads two pixels,
; PUNPCKLWD with itself turns them into p0,p0,p1,p1), and every source row
; is written to two consecutive destination rows.  Source data is
; prefetched with PREFETCHNTA and results are written with non-temporal
; MOVNTQ stores.
;
; Stack arguments (read through the pSrc..nHeight accessors):
;   pSrc, pDst        first byte of the current source / destination row
;   nSrcLen           added to pSrc after every source row
;   nDstLen           added to pDst after every destination row
;   nWidth            source pixels per row; nWidth/16 chunks of 16 pixels
;                     are processed, the remainder is dropped and nWidth
;                     must be >= 16
;   nHeight           source rows; must be non-zero on entry
;
; pSrc/pDst are advanced in place.  Registers saved by PUSHREGS are
; restored, EMMS is executed before returning, eax is left holding nSrcLen.
; NOTE(review): no SFENCE follows the non-temporal stores - confirm the
; caller does not need the data ordered/visible before its next step.
;
; Register use:  eax/ebp = source/destination row step, ebx = chunks per
;                row, esi = rows remaining, edx/edi = source/destination
;                pointer, ecx = chunks remaining.
;-----------------------------------------------------------------------------

        ALIGN 32
_asmblit16_double_sse:
        PUSHREGS

        mov     eax, nSrcLen
        mov     ebp, nDstLen
        mov     ebx, nWidth
        mov     esi, nHeight
        shr     ebx, 4                  ; ebx = 16-pixel chunks per row

        ALIGN 16
.row:
        mov     edx, pSrc
        mov     edi, pDst
        mov     ecx, ebx

        ; ---- first destination row ----
        ALIGN 16
.first:
        movd    mm0, [edx]              ; two source pixels per register
        movd    mm1, [edx + 4]
        movd    mm2, [edx + 8]
        movd    mm3, [edx + 12]
        movd    mm4, [edx + 16]
        movd    mm5, [edx + 20]
        movd    mm6, [edx + 24]
        movd    mm7, [edx + 28]
        prefetchnta [edx + 64]          ; hint for source data further ahead
        punpcklwd mm0, mm0              ; p0,p1 -> p0,p0,p1,p1
        punpcklwd mm1, mm1
        punpcklwd mm2, mm2
        punpcklwd mm3, mm3
        punpcklwd mm4, mm4
        punpcklwd mm5, mm5
        punpcklwd mm6, mm6
        punpcklwd mm7, mm7
        movntq  [edi], mm0
        movntq  [edi + 8], mm1
        movntq  [edi + 16], mm2
        movntq  [edi + 24], mm3
        movntq  [edi + 32], mm4
        movntq  [edi + 40], mm5
        movntq  [edi + 48], mm6
        movntq  [edi + 56], mm7
        add     edx, 32
        add     edi, 64
        dec     ecx                     ; plain counted loop: the exit must not
        jnz     short .first            ; depend on ZF from the pointer adds

        add     pDst, ebp               ; step to the second destination row
        mov     edx, pSrc               ; same source row again
        mov     edi, pDst
        mov     ecx, ebx

        ; ---- second destination row (identical data) ----
        ALIGN 16
.second:
        movd    mm0, [edx]
        movd    mm1, [edx + 4]
        movd    mm2, [edx + 8]
        movd    mm3, [edx + 12]
        movd    mm4, [edx + 16]
        movd    mm5, [edx + 20]
        movd    mm6, [edx + 24]
        movd    mm7, [edx + 28]
        prefetchnta [edx + 64]
        punpcklwd mm0, mm0
        punpcklwd mm1, mm1
        punpcklwd mm2, mm2
        punpcklwd mm3, mm3
        punpcklwd mm4, mm4
        punpcklwd mm5, mm5
        punpcklwd mm6, mm6
        punpcklwd mm7, mm7
        movntq  [edi], mm0
        movntq  [edi + 8], mm1
        movntq  [edi + 16], mm2
        movntq  [edi + 24], mm3
        movntq  [edi + 32], mm4
        movntq  [edi + 40], mm5
        movntq  [edi + 48], mm6
        movntq  [edi + 56], mm7
        add     edx, 32
        add     edi, 64
        dec     ecx
        jnz     short .second

        add     pSrc, eax
        add     pDst, ebp
        dec     esi
        jnz     near .row

        POPREGS
        emms                            ; leave the x87/MMX state clean
        ret


;-----------------------------------------------------------------------------
; 16bit to 16bit 0% scanlines with SSE
;
; Every source pixel is written twice horizontally.  Only one destination
; row is written per source row; pDst then advances by 2*nDstLen, so the
; destination row in between is not touched by this routine.  Source data
; is prefetched with PREFETCHNTA and results are written with non-temporal
; MOVNTQ stores.
;
; Stack arguments (read through the pSrc..nHeight accessors):
;   pSrc, pDst        first byte of the current source / destination row
;   nSrcLen           added to pSrc after every source row
;   nDstLen           destination row step (doubled in a register)
;   nWidth            source pixels per row; nWidth/16 chunks of 16 pixels
;                     are processed, the remainder is dropped and nWidth
;                     must be >= 16
;   nHeight           source rows; must be non-zero on entry
;
; pSrc/pDst are advanced in place.  Registers saved by PUSHREGS are
; restored, EMMS is executed before returning, eax is left holding nSrcLen.
; NOTE(review): no SFENCE follows the non-temporal stores - confirm the
; caller does not need the data ordered/visible before its next step.
;
; Register use:  eax = source row step, ebp = 2 * destination row step,
;                ebx = chunks per row, esi = rows remaining,
;                edx/edi = source/destination pointer, ecx = chunks remaining.
;-----------------------------------------------------------------------------

        ALIGN 32
_asmblit16_scanline0_sse:
        PUSHREGS

        mov     eax, nSrcLen
        mov     ebp, nDstLen
        mov     ebx, nWidth
        mov     esi, nHeight
        shl     ebp, 1                  ; skip one destination row each time
        shr     ebx, 4                  ; ebx = 16-pixel chunks per row

        ALIGN 16
.row:
        mov     edx, pSrc
        mov     edi, pDst
        mov     ecx, ebx

        ALIGN 16
.chunk:
        movd    mm0, [edx]              ; two source pixels per register
        movd    mm1, [edx + 4]
        movd    mm2, [edx + 8]
        movd    mm3, [edx + 12]
        movd    mm4, [edx + 16]
        movd    mm5, [edx + 20]
        movd    mm6, [edx + 24]
        movd    mm7, [edx + 28]
        prefetchnta [edx + 64]          ; hint for source data further ahead
        punpcklwd mm0, mm0              ; p0,p1 -> p0,p0,p1,p1
        punpcklwd mm1, mm1
        punpcklwd mm2, mm2
        punpcklwd mm3, mm3
        punpcklwd mm4, mm4
        punpcklwd mm5, mm5
        punpcklwd mm6, mm6
        punpcklwd mm7, mm7
        movntq  [edi], mm0
        movntq  [edi + 8], mm1
        movntq  [edi + 16], mm2
        movntq  [edi + 24], mm3
        movntq  [edi + 32], mm4
        movntq  [edi + 40], mm5
        movntq  [edi + 48], mm6
        movntq  [edi + 56], mm7
        add     edx, 32
        add     edi, 64
        dec     ecx                     ; plain counted loop: the exit must not
        jnz     short .chunk            ; depend on ZF from the pointer adds

        add     pSrc, eax
        add     pDst, ebp
        dec     esi
        jnz     near .row

        POPREGS
        emms                            ; leave the x87/MMX state clean
        ret


;-----------------------------------------------------------------------------
; 16bit to 16bit 25% scanlines with SSE
;
; Each source row produces two destination rows:
;   row 1: every pixel doubled horizontally, unchanged;
;   row 2: the same doubled pixels shifted right by 2 bits (as a 64-bit
;          quantity) and ANDed with the qword at _brightmask25.
; Source data is prefetched with PREFETCHNTA and results are written with
; non-temporal MOVNTQ stores.
;
; Stack arguments (read through the pSrc..nHeight accessors):
;   pSrc, pDst        first byte of the current source / destination row
;   nSrcLen           added to pSrc after every source row
;   nDstLen           added to pDst after every destination row
;   nWidth            source pixels per row; nWidth/8 chunks of 8 pixels
;                     are processed, the remainder is dropped and nWidth
;                     must be >= 8
;   nHeight           source rows; must be non-zero on entry
;
; pSrc/pDst are advanced in place.  Registers saved by PUSHREGS are
; restored, EMMS is executed before returning, eax is left holding nSrcLen.
; NOTE(review): no SFENCE follows the non-temporal stores - confirm the
; caller does not need the data ordered/visible before its next step.
;
; Register use:  eax/ebp = source/destination row step, ebx = chunks per
;                row, esi = rows remaining, edx/edi = source/destination
;                pointer, ecx = chunks remaining, mm4..mm7 = mask copies.
;-----------------------------------------------------------------------------

        ALIGN 32
_asmblit16_scanline25_sse:
        PUSHREGS

        mov     eax, nSrcLen
        mov     ebp, nDstLen
        mov     ebx, nWidth
        mov     esi, nHeight
        movq    mm4, [_brightmask25]    ; 8 bytes are read from the mask
        movq    mm5, mm4
        movq    mm6, mm4
        movq    mm7, mm4
        shr     ebx, 3                  ; ebx = 8-pixel chunks per row

        ALIGN 16
.row:
        mov     edx, pSrc
        mov     edi, pDst
        mov     ecx, ebx

        ; ---- first destination row: doubled pixels as they are ----
        ALIGN 16
.bright:
        movd    mm0, [edx]              ; two source pixels per register
        movd    mm1, [edx + 4]
        movd    mm2, [edx + 8]
        movd    mm3, [edx + 12]
        prefetchnta [edx + 64]          ; hint for source data further ahead
        punpcklwd mm0, mm0              ; p0,p1 -> p0,p0,p1,p1
        punpcklwd mm1, mm1
        punpcklwd mm2, mm2
        punpcklwd mm3, mm3
        movntq  [edi], mm0
        movntq  [edi + 8], mm1
        movntq  [edi + 16], mm2
        movntq  [edi + 24], mm3
        add     edx, 16
        add     edi, 32
        dec     ecx                     ; plain counted loop: the exit must not
        jnz     short .bright           ; depend on ZF from the pointer adds

        add     pDst, ebp               ; step to the second destination row
        mov     edx, pSrc               ; same source row again
        mov     edi, pDst
        mov     ecx, ebx

        ; ---- second destination row: (pixels >> 2) & mask ----
        ALIGN 16
.dim:
        movd    mm0, [edx]
        movd    mm1, [edx + 4]
        movd    mm2, [edx + 8]
        movd    mm3, [edx + 12]
        prefetchnta [edx + 64]
        punpcklwd mm0, mm0
        punpcklwd mm1, mm1
        punpcklwd mm2, mm2
        punpcklwd mm3, mm3
        psrlq   mm0, 2
        psrlq   mm1, 2
        psrlq   mm2, 2
        psrlq   mm3, 2
        pand    mm0, mm4                ; mask off bits shifted across fields
        pand    mm1, mm5
        pand    mm2, mm6
        pand    mm3, mm7
        movntq  [edi], mm0
        movntq  [edi + 8], mm1
        movntq  [edi + 16], mm2
        movntq  [edi + 24], mm3
        add     edx, 16
        add     edi, 32
        dec     ecx
        jnz     short .dim

        add     pSrc, eax
        add     pDst, ebp
        dec     esi
        jnz     near .row

        POPREGS
        emms                            ; leave the x87/MMX state clean
        ret


;-----------------------------------------------------------------------------
; 16bit to 16bit 50% scanlines with SSE
;
; Each source row produces two destination rows:
;   row 1: every pixel doubled horizontally, unchanged;
;   row 2: the same doubled pixels shifted right by 1 bit (as a 64-bit
;          quantity) and ANDed with the qword at _brightmask50.
; Source data is prefetched with PREFETCHNTA and results are written with
; non-temporal MOVNTQ stores.
;
; Stack arguments (read through the pSrc..nHeight accessors):
;   pSrc, pDst        first byte of the current source / destination row
;   nSrcLen           added to pSrc after every source row
;   nDstLen           added to pDst after every destination row
;   nWidth            source pixels per row; nWidth/8 chunks of 8 pixels
;                     are processed, the remainder is dropped and nWidth
;                     must be >= 8
;   nHeight           source rows; must be non-zero on entry
;
; pSrc/pDst are advanced in place.  Registers saved by PUSHREGS are
; restored, EMMS is executed before returning, eax is left holding nSrcLen.
; NOTE(review): no SFENCE follows the non-temporal stores - confirm the
; caller does not need the data ordered/visible before its next step.
;
; Register use:  eax/ebp = source/destination row step, ebx = chunks per
;                row, esi = rows remaining, edx/edi = source/destination
;                pointer, ecx = chunks remaining, mm4..mm7 = mask copies.
;-----------------------------------------------------------------------------

        ALIGN 32
_asmblit16_scanline50_sse:
        PUSHREGS

        mov     eax, nSrcLen
        mov     ebp, nDstLen
        mov     ebx, nWidth
        mov     esi, nHeight
        movq    mm4, [_brightmask50]    ; 8 bytes are read from the mask
        movq    mm5, mm4
        movq    mm6, mm4
        movq    mm7, mm4
        shr     ebx, 3                  ; ebx = 8-pixel chunks per row

        ALIGN 16
.row:
        mov     edx, pSrc
        mov     edi, pDst
        mov     ecx, ebx

        ; ---- first destination row: doubled pixels as they are ----
        ALIGN 16
.bright:
        movd    mm0, [edx]              ; two source pixels per register
        movd    mm1, [edx + 4]
        movd    mm2, [edx + 8]
        movd    mm3, [edx + 12]
        prefetchnta [edx + 64]          ; hint for source data further ahead
        punpcklwd mm0, mm0              ; p0,p1 -> p0,p0,p1,p1
        punpcklwd mm1, mm1
        punpcklwd mm2, mm2
        punpcklwd mm3, mm3
        movntq  [edi], mm0
        movntq  [edi + 8], mm1
        movntq  [edi + 16], mm2
        movntq  [edi + 24], mm3
        add     edx, 16
        add     edi, 32
        dec     ecx                     ; plain counted loop: the exit must not
        jnz     short .bright           ; depend on ZF from the pointer adds

        add     pDst, ebp               ; step to the second destination row
        mov     edx, pSrc               ; same source row again
        mov     edi, pDst
        mov     ecx, ebx

        ; ---- second destination row: (pixels >> 1) & mask ----
        ALIGN 16
.dim:
        movd    mm0, [edx]
        movd    mm1, [edx + 4]
        movd    mm2, [edx + 8]
        movd    mm3, [edx + 12]
        prefetchnta [edx + 64]
        punpcklwd mm0, mm0
        punpcklwd mm1, mm1
        punpcklwd mm2, mm2
        punpcklwd mm3, mm3
        psrlq   mm0, 1
        psrlq   mm1, 1
        psrlq   mm2, 1
        psrlq   mm3, 1
        pand    mm0, mm4                ; mask off bits shifted across fields
        pand    mm1, mm5
        pand    mm2, mm6
        pand    mm3, mm7
        movntq  [edi], mm0
        movntq  [edi + 8], mm1
        movntq  [edi + 16], mm2
        movntq  [edi + 24], mm3
        add     edx, 16
        add     edi, 32
        dec     ecx
        jnz     short .dim

        add     pSrc, eax
        add     pDst, ebp
        dec     esi
        jnz     near .row

        POPREGS
        emms                            ; leave the x87/MMX state clean
        ret


;-----------------------------------------------------------------------------
; 16bit to 16bit 75% scanlines with SSE
;
; Each source row produces two destination rows:
;   row 1: every pixel doubled horizontally, unchanged;
;   row 2: for the same doubled pixels,
;          ((pixels >> 1) & _brightmask50) + ((pixels >> 2) & _brightmask25),
;          summed per 16-bit pixel.
; Source data is prefetched with PREFETCHNTA and results are written with
; non-temporal MOVNTQ stores.
;
; Stack arguments (read through the pSrc..nHeight accessors):
;   pSrc, pDst        first byte of the current source / destination row
;   nSrcLen           added to pSrc after every source row
;   nDstLen           added to pDst after every destination row
;   nWidth            source pixels per row; nWidth/8 chunks of 8 pixels
;                     are processed, the remainder is dropped and nWidth
;                     must be >= 8
;   nHeight           source rows; must be non-zero on entry
;
; pSrc/pDst are advanced in place.  Registers saved by PUSHREGS are
; restored, EMMS is executed before returning, eax is left holding nSrcLen.
; NOTE(review): no SFENCE follows the non-temporal stores - confirm the
; caller does not need the data ordered/visible before its next step.
;
; Register use:  eax/ebp = source/destination row step, ebx = 8-pixel
;                chunks per row, esi = rows remaining, edx/edi = source/
;                destination pointer, ecx = chunks remaining,
;                mm4/mm5 = 50% mask, mm6/mm7 = 25% mask.
;-----------------------------------------------------------------------------

        ALIGN 32
_asmblit16_scanline75_sse:
        PUSHREGS

        mov     eax, nSrcLen
        mov     ebp, nDstLen
        mov     ebx, nWidth
        mov     esi, nHeight
        movq    mm4, [_brightmask50]    ; 8 bytes are read from each mask
        movq    mm6, [_brightmask25]
        movq    mm5, mm4
        movq    mm7, mm6
        shr     ebx, 3                  ; ebx = 8-pixel chunks per row

        ALIGN 16
.row:
        mov     edx, pSrc
        mov     edi, pDst
        mov     ecx, ebx

        ; ---- first destination row: doubled pixels as they are ----
        ALIGN 16
.bright:
        movd    mm0, [edx]              ; two source pixels per register
        movd    mm1, [edx + 4]
        movd    mm2, [edx + 8]
        movd    mm3, [edx + 12]
        prefetchnta [edx + 64]          ; hint for source data further ahead
        punpcklwd mm0, mm0              ; p0,p1 -> p0,p0,p1,p1
        punpcklwd mm1, mm1
        punpcklwd mm2, mm2
        punpcklwd mm3, mm3
        movntq  [edi], mm0
        movntq  [edi + 8], mm1
        movntq  [edi + 16], mm2
        movntq  [edi + 24], mm3
        add     edx, 16
        add     edi, 32
        dec     ecx                     ; plain counted loop: the exit must not
        jnz     short .bright           ; depend on ZF from the pointer adds

        add     pDst, ebp               ; step to the second destination row
        mov     edx, pSrc               ; same source row again
        mov     edi, pDst
        mov     ecx, ebx
        shl     ecx, 1                  ; this loop handles 4 pixels per pass

        ; ---- second destination row: 50% term + 25% term ----
        ALIGN 16
.dim:
        movd    mm0, [edx]
        movd    mm1, [edx + 4]
        prefetchnta [edx + 64]
        punpcklwd mm0, mm0
        punpcklwd mm1, mm1
        movq    mm2, mm0
        movq    mm3, mm1
        psrlq   mm0, 1
        psrlq   mm1, 1
        psrlq   mm2, 2
        psrlq   mm3, 2
        pand    mm0, mm4                ; 50% term
        pand    mm1, mm5
        pand    mm2, mm6                ; 25% term
        pand    mm3, mm7
        ; Word-wise add: a colour field that straddles the two bytes of a
        ; pixel needs the carry from the low byte, which a byte-wise add
        ; would discard.
        paddw   mm0, mm2
        paddw   mm1, mm3
        movntq  [edi], mm0
        movntq  [edi + 8], mm1
        add     edx, 8
        add     edi, 16
        dec     ecx
        jnz     short .dim

        add     pSrc, eax
        add     pDst, ebp
        dec     esi
        jnz     near .row

        POPREGS
        emms                            ; leave the x87/MMX state clean
        ret
