// Gameboy Advance GPU emulator
// Mic, 2004/2005


#include "zombie.h"
#include <commctrl.h>
#include <string.h>
#include "resource.h"
#include <GL/gl.h>

// Interrupt dispatch styles stored in hbiAction/vbiAction
// (gpu_init defaults both to JUMP; VECTOR is not referenced in this part of the file).
#define JUMP	0
#define VECTOR	1

// Byte offsets into the 256 kB buffer pointed to by VRAM.
// PALETTE_OFFSET, IOREGS_OFFSET and SPRITE_TABLE_OFFSET are used to index VRAM directly;
// the remaining ones are presumably used the same way further down the file.
#define NAME_TABLE_OFFSET	0x10000
#define PALETTE_OFFSET		0x1C000
#define IOREGS_OFFSET		0x1D000
#define SPRITE_TABLE_OFFSET	0x20000
#define BG2X_DIRTY			0x22000
#define BG3X_DIRTY			0x22004

// Fallback definitions for OpenGL tokens that the installed <GL/gl.h> may not provide
#ifndef GL_RGB5_A1
#define GL_RGB5_A1 0x8057
#endif
#ifndef GL_UNSIGNED_SHORT_5_5_5_1
#define GL_UNSIGNED_SHORT_5_5_5_1 0x8034
#endif
#ifndef GL_UNSIGNED_SHORT_1_5_5_5_REV
#define GL_UNSIGNED_SHORT_1_5_5_5_REV 0x8366
#endif


// Dialog procedure for the configuration dialog opened by gpu_config
BOOL CALLBACK DlgProc(HWND hdlg, UINT msg, WPARAM wParam, LPARAM lParam);
// Per-line and per-frame rendering steps, called from gpu_hblank and gpu_advance_line
void draw_scanline();
void finish_screen();
void reset_screen();


// One drawable layer: its index (num) and its priority (prio).
// sort_layers orders these by prio, then num, largest first.
typedef struct
{
	int num,prio;
}tag_bg;

// Background renderer entry point; the int argument is the background number
typedef void(*draw_bg_callback)(int);


// Win32 / OpenGL handles
HANDLE hout;				// STDOUT handle, fetched in gpu_init
HINSTANCE hInst;			// DLL instance, stored by DllMain
HWND hwnd;					// Parent window handle, supplied by the host
HDC dc;						// Device context of hwnd (GetDC in gpu_init)
RECT rect;
HGLRC hrc;					// OpenGL rendering context (setup_gl / kill_gl)
PIXELFORMATDESCRIPTOR pfd;
int pixformat;				// Pixel format index returned by ChoosePixelFormat
GLuint texID;				// Texture object created in setup_gl

// VRAM is the 256 kB emulated memory block; IOREGS and patternTbl point into it.
// vscreen and vscreen2 are two 16-byte aligned screen buffers carved out of vscreenmem.
// currPalette is the palette the background renderers read colors from.
unsigned char *VRAM,*vscreen,*vscreen2,*vscreen3,*vscreenmem,*patternTbl,*IOREGS,*currPalette;
char dataLatch;
short int *colorLUT;		// Points at VRAM[PALETTE_OFFSET]
int glFormat = GL_UNSIGNED_SHORT_5_5_5_1;	// Pixel type passed to glTexImage2D
int filter = GL_LINEAR;		// Texture min/mag filter
int width,scanline,visibleLines,totalLines,nameTblHeight;	// Screen geometry, set in gpu_init
int hbiAction,hbiAddress,vbiAction,vbiAddress;	// Set through gpu_set (GPU_HBLANK_ACTION / GPU_VBLANK_ACTION)
int yscroll;
int vramPtr,addressLatch;
int spriteCnt,spriteLimit,spriteTile,spriteBit6,spriteDbg[4];
int htile,patLine,patLineInv; //,tileShift;
int mosaicX,mosaicY;
int addressShift;
int dummy;					// Scratch; draw_text_bg keeps the tile data base address here
int xtile;					// Current tile column while draw_text_bg walks a row
int newframe;
// Color effect state computed by setup_color_effects; lineaddr is the address of the
// scanline being drawn inside vscreen (stored as an int for the inline assembly).
int ablend,lumachange,eva,evb,evy,lineaddr,blended,blended2;
unsigned int frameInterval;
short int palfx[256];		// Scratch palette: brightness-adjusted or channel-flipped colors
unsigned int hofs,vofs;		// Scroll offsets of the text background being drawn
draw_bg_callback draw_bg_callbacks[3][4],draw_bg;	// Renderer table filled in by gpu_init
unsigned int hTileMask,vTileMask,hScrMask,hScrShift,vScrMask,vScrShift,tileShift;
// Lookup tables indexed by the 2-bit size field that draw_text_bg reads from the
// background control register ({horizontal,vertical} pairs where two-dimensional).
unsigned int textBgMasks[4][2] = {{0x1f,0x1f},{0x3f,0x1f},{0x1f,0x3f},{0x3f,0x3f}};
unsigned int textBgShifts[4] = {5,6,5,6};
unsigned int textScrMasks[4][2] = {{0x0,0x0},{0x100,0x0},{0x0,0x100},{0x100,0x100}};
unsigned int textScrShifts[2][4] = {{0,3,0,3},{0,3,0,4}};
// Same idea for rotation/scaling backgrounds (presumably used by draw_rotscale_bg)
unsigned int rsBgMasks[4][2] = {{0x0f,0x0f},{0x1f,0x1f},{0x3f,0x3f},{0x7f,0x7f}};
unsigned int rsBgShifts[4] = {4,5,6,7};

DWORD lastFrameTime;

bool hblankEnabled,vblankEnabled;	// Master switches for raising hblank/vblank IRQs
bool limitFps = true;				// PLUGIN_SETTING 1
bool enableObj = true;				// PLUGIN_SETTING 2
bool enableLayer[5] = {true,true,true,true,true};
bool forceLayer[5] = {false,false,false,false,false};
bool hasMMX,hasSSE,hasSSE2;			// CPUID feature bits detected in gpu_init
// glOK/useGL gate every OpenGL code path; glIsInit records that setup_gl has been run
bool glOK=false,useGL=false,glIsInit=false;
bool isFlipped[2];					// Per-palette "already converted" flag used by flip_palette

char gpuName[] = "GBA GPU";
char szMMXError[] = "This program requires MMX to run";
char szMemoryError[] = "Unable to allocate sufficient memory";
char *lpszError;					// Last error text, returned by gpu_get(LAST_ERROR)

int frameskip = 0,frames;
int lastbg;							// Number of the most recently drawn background (-1 after reset)
unsigned int BG2X,BG2Y,BG2PA,BG2PB,BG2PC,BG2PD;
unsigned int BG3X,BG3Y,BG3PA,BG3PB,BG3PC,BG3PD;
unsigned int bg2x,bg2y,bg3x,bg3y;
unsigned int x2,y2,deltax,deltay;
unsigned int objx2,objy2,objdx,objdy,objdmx,objdmy;
unsigned int midpoint[2];
unsigned int painv,pbinv,pcinv,pdinv;
// NOTE(review): tileOffset is a single dword, but the inline assembly in draw_text_bg
// stores to [tileOffset+4] and reads [tileOffset+eax*4] with eax = 0 or 1, i.e. it also
// uses the dword that happens to follow this variable in memory. Confirm whether this
// was meant to be a two-element array.
unsigned int xclip,clippedWidth,tileOffset;

//unsigned int rs_inverse[0x8000];
char buffer[80];					// Debug text buffer returned by gpu_get(0x700)
// Sprite dimensions in pixels; the two tables share one 4-bit index
// (presumably shape*4 + size from the sprite attributes - confirm against the sprite renderer).
unsigned int objwidth[16] = {8,16,32,64,
							 16,32,32,64,
							 8,8,16,32,
							 0,0,0,0};
unsigned int objheight[16] = {8,16,32,64,
							 8,8,16,32,
							 16,32,32,64,
							 0,0,0,0};
// Bit mask -> bit index; only entries 1,2,4,8,16 are filled in (by gpu_init), the rest stay 0
int log2_lut[32];

// Quad corners and texture coordinates, presumably for the OpenGL presentation path.
// _S = 240/256 and _T = 160/256: the visible screen area inside the 256x256 texture.
float vertexes[4][2] = {{1.0f,1.0f},{-1.0f,1.0f},{-1.0f,-0.78f},{1.0f,-0.78f}};
#define _S 0.9375f
#define _T 0.625f
float texCoords[4][2] = {{_S,0.0f},{0.0f,0.0f},{0.0f,_T},{_S,_T}};

int colormode = 0;
int colormask[2][3] = {{0x001F,0x03E0,0x7C00}, {0x001F,0x03E0,0x7C00}};

int sizemult = 1;

tag_bg layers[5],sorted_layers[5];	// 4 BG layers plus the OAM layer

// Callbacks obtained from the host (see gpu_reset and gpu_set)
cpu_irq_callback cpu_irq;
cpu_set_io_handler_callback cpu_set_io_handler;
mmu_read_word_callback mmu_read_word;
mmu_irq_callback mmu_irq;
zombie_callback zombie;

__int64 fifteen = 15;
// 0x7BDE = 0 11110 11110 11110: clears the lowest bit of every 5-bit channel, so that a
// following shift right by one gives 0 01111 01111 01111 (0x3DEF) without bits leaking
// between channels.
__int64 masklo = 0x7BDE7BDE7BDE7BDE;
__int64 coladd = 0x35AD35AD35AD35AD;

// 16-byte aligned operands for the SSE2/MMX inline assembly.
// EVA128/EVB128 are filled with the blend coefficients by alpha_blend;
// SHAMT1/SHAMT2 receive channel shift counts in setup_color_effects.
__declspec(align(16)) __int64 EVA128[2] = {0,0};
__declspec(align(16)) __int64 EVB128[2] = {0,0};
// Per-pixel channel masks; the names follow the original author's labels.
// Note the code also uses MASKBLUE (0x001F) as a generic 5-bit mask after shifting.
__declspec(align(16)) __int64 MASKBLUE[2]  = {0x001F001F001F001F,0x001F001F001F001F};
__declspec(align(16)) __int64 MASKGREEN[2] = {0x03E003E003E003E0,0x03E003E003E003E0};
__declspec(align(16)) __int64 MASKRED[2]   = {0x7C007C007C007C00,0x7C007C007C007C00};
__declspec(align(16)) __int64 SHAMT1[2]   = {0,0};
__declspec(align(16)) __int64 SHAMT2[2]   = {0,0};

__declspec(align(16)) __int64 MASKHALF[2]  = {0x7BDE7BDE7BDE7BDE,0x7BDE7BDE7BDE7BDE};
__declspec(align(16)) __int64 MASK50[2]  = {0x0012001200120012,0x000F000F000F000F};

__int64 fubar;		// Scratch for the (commented-out) rdtsc timing code

int plus1=1,minus1=-1;
LARGE_INTEGER perfFreq,perfCnt1,perfCnt2;	// Performance counter state, primed in gpu_reset

palette_dib palDib;	// Palette description handed out by gpu_get(GPU_PALETTE)


// Used to copy the screen buffer to the window.
// A bitmap header followed by the channel bit masks; gpu_reset stores the
// 5-bit channel masks 0x001F, 0x03E0 and 0x7C00 in mask[0..2].
struct bi
{
	BITMAPINFOHEADER bih;
	RGBQUAD mask[4];
}bi;





// Background renderers installed into draw_bg_callbacks by gpu_init;
// the argument is the background number
void draw_text_bg(int);
void draw_rotscale_bg(int);


// DLL entry point.
// Remembers the module instance handle when the DLL is attached to a process
// (gpu_config needs it to locate the dialog resource). Always reports success.
// NOTE(review): the Win32 documentation declares DllMain as returning BOOL, not bool;
// left unchanged here because a matching prototype may exist outside this file.
bool __stdcall DllMain(HINSTANCE hDLL,DWORD reason,LPVOID unused)
{
	switch (reason)
	{
		case DLL_PROCESS_ATTACH:
			hInst = hDLL;
			break;
		default:
			break;
	}

	return true;
}



// Destroys the OpenGL rendering context (if one exists) and marks OpenGL as
// unavailable. Safe to call more than once: the context handle is cleared after
// deletion so a later call cannot delete a stale handle.
void kill_gl()
{
	if (hrc!=NULL)
	{
		// Detach the context from the calling thread before deleting it
		wglMakeCurrent(NULL,NULL);
		wglDeleteContext(hrc);
		hrc = NULL;
	}
	glIsInit = false;
	glOK = false;
}


// Creates an OpenGL context on the window DC and uploads the initial screen texture.
//
// Returns true on success. glOK is only set to true once every step has succeeded,
// so the "useGL && glOK" checks elsewhere never issue GL calls after a failed setup.
// glIsInit is set unconditionally to record that setup has been attempted.
//
// If the texture upload fails the context is left in hrc so that kill_gl can
// dispose of it later.
bool setup_gl()
{
	// Dispose of a context left over from an earlier call instead of leaking it
	if (hrc!=NULL)
	{
		wglMakeCurrent(NULL,NULL);
		wglDeleteContext(hrc);
		hrc = NULL;
	}

	glIsInit = true;
	glOK = false;

	// Setup OpenGL
	pfd.nSize = sizeof(PIXELFORMATDESCRIPTOR);
	pfd.nVersion = 1;
	pfd.dwFlags = PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER | PFD_DRAW_TO_WINDOW;
	pfd.dwLayerMask = PFD_MAIN_PLANE;
	pfd.iPixelType = PFD_TYPE_RGBA;
	pfd.cColorBits = 16;
	pfd.cDepthBits = 16;
	pfd.cAccumBits = 0;
	pfd.cStencilBits = 0;
	if ((pixformat = ChoosePixelFormat(dc,&pfd))==0)
		return false;

	// The result is deliberately not checked: a window's pixel format can only be
	// set once, so this call is expected to fail when OpenGL is re-enabled later.
	SetPixelFormat(dc,pixformat,&pfd);

	hrc = wglCreateContext(dc);
	if (hrc==NULL)
		return false;

	if (wglMakeCurrent(dc,hrc)==FALSE)
	{
		wglDeleteContext(hrc);
		hrc = NULL;
		return false;
	}

	glEnable(GL_TEXTURE_2D);
	glDisable(GL_CULL_FACE);
	glDisable(GL_DEPTH_TEST);
	glGenTextures(1,&texID);
	glBindTexture(GL_TEXTURE_2D,texID);

	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP);
	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP);
	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, filter);
	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, filter);

	// 256x256 16-bit texture; the visible screen occupies only part of it
	glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB5_A1, 256, 256, 0, GL_RGBA, glFormat, vscreen);
	if (glGetError()!=GL_NO_ERROR)
		return false;

	glOK = true;
	return true;
}



// Initializes the GPU plugin.
//
// commlink is the host callback used to query services (stored in `zombie`).
// Sets the screen geometry, detects MMX/SSE/SSE2 via CPUID, allocates the
// emulated memory block and the two screen buffers, fills in the background
// renderer table and fetches the parent window's DC and the STDOUT handle.
//
// Returns false (with lpszError pointing at a message) when the CPU lacks MMX
// or when memory cannot be allocated; in the latter case nothing stays allocated.
bool __stdcall gpu_init(zombie_callback commlink)
{
	int i;
	zombie = commlink;

	scanline = 0;
	width = 240;
	visibleLines = 160;
	totalLines = 228;
	nameTblHeight = 240;
	spriteLimit = 128;
	frameInterval = 1090;

	// Check for available CPU extensions (CPUID function 1, feature flags in EDX)
	__asm
	{
		pusha
		mov eax,1
		cpuid
		mov [i],edx
		popa
	}
	hasMMX  = ((i&0x0800000)!=0);	// EDX bit 23
	hasSSE  = ((i&0x2000000)!=0);	// EDX bit 25
	hasSSE2 = ((i&0x4000000)!=0);	// EDX bit 26

	if (!hasMMX)
	{
		lpszError = szMMXError;
		return false;
	}

	hblankEnabled = true;
	vblankEnabled = true;

	hbiAction = vbiAction = JUMP;
	hbiAddress = vbiAddress = 0x38;

	VRAM = (unsigned char*)LocalAlloc(LPTR,0x40000);		// 256 kB of VRAM

	// Room for two virtual screens plus slack for 16-byte alignment
	vscreenmem = (unsigned char*)LocalAlloc(LPTR,width*visibleLines*2*2 + 16);

	if ((vscreenmem == NULL)||(VRAM==NULL))
	{
		// Release whichever allocation succeeded so a failed init leaks nothing
		if (vscreenmem != NULL)
			LocalFree(vscreenmem);
		if (VRAM != NULL)
			LocalFree(VRAM);
		vscreenmem = NULL;
		VRAM = NULL;

		lpszError = szMemoryError;
		return false;
	}

	// Align the first screen on a 16-byte boundary (required by the movdqa
	// accesses in alpha_blend); the second screen follows immediately.
	vscreen = (unsigned char*)(((unsigned int)vscreenmem&0xFFFFFFF0)+0x10);
	vscreen2 = vscreen + width*visibleLines*2;

	colorLUT = (short*)&VRAM[PALETTE_OFFSET];
	patternTbl = &VRAM[0x10000];

	/*for (i=1; i<0x8000; i++)
		rs_inverse[i] = 0x8000/i;
	rs_inverse[0] = 0x8000/0x100;*/

	// Single-bit mask -> bit index; all other entries keep their initial value 0
	log2_lut[1] = 0;
	log2_lut[2] = 1;
	log2_lut[4] = 2;
	log2_lut[8] = 3;
	log2_lut[16] = 4;

	// Renderer per [table row][background]; NULL marks a combination with no renderer
	draw_bg_callbacks[0][0] = draw_text_bg;
	draw_bg_callbacks[0][1] = draw_text_bg;
	draw_bg_callbacks[0][2] = draw_text_bg;
	draw_bg_callbacks[0][3] = draw_text_bg;
	draw_bg_callbacks[1][0] = draw_text_bg;
	draw_bg_callbacks[1][1] = draw_text_bg;
	draw_bg_callbacks[1][2] = draw_rotscale_bg;
	draw_bg_callbacks[1][3] = NULL;
	draw_bg_callbacks[2][0] = NULL;
	draw_bg_callbacks[2][1] = NULL;
	draw_bg_callbacks[2][2] = draw_rotscale_bg;
	draw_bg_callbacks[2][3] = draw_rotscale_bg;

	// Ask for the handle of the parent window
	hwnd = (HWND)zombie(ASK,ZOMBIE_WINDOW_HANDLE,0,0);
	dc = GetDC(hwnd);


	// Get a handle to STDOUT (for WriteConsole)
	hout = GetStdHandle(STD_OUTPUT_HANDLE);

	return true;
}



// Puts the GPU back into its power-on state.
//
// Re-queries the CPU and MMU interrupt callbacks from the host, clears the
// emulated memory and the first screen buffer, seeds a handful of registers,
// restores the default layer order and restarts the frame bookkeeping.
// Always returns true.
bool __stdcall gpu_reset()
{
	int n;
	unsigned char *entry;

	// Ask for the CPU IRQ callback
	cpu_irq = (cpu_irq_callback)zombie(ASK,CPU_IRQ_CALLBACK,0,0);
	mmu_irq = (mmu_irq_callback)zombie(ASK,MMU_IRQ_CALLBACK,0,0);

	memset(VRAM,0,0x40000);
	memset(vscreen,0,width*visibleLines*2);

	// Store 160 in the first byte of each of the 128 eight-byte sprite table
	// entries (presumably the Y coordinate, which places the sprite below the
	// 160 visible lines).
	entry = &VRAM[SPRITE_TABLE_OFFSET];
	for (n=0; n<128; n++)
	{
		*entry = 160;
		entry += 8;
	}

	VRAM[IOREGS_OFFSET+4] = 0;
	VRAM[IOREGS_OFFSET+6] = 0;
	VRAM[IOREGS_OFFSET+7] = 0;
	VRAM[IOREGS_OFFSET+0x130] = 0xFF;

	IOREGS = &VRAM[IOREGS_OFFSET];

	// Clear the low byte of the four background control registers
	for (n=8; n<=14; n+=2)
		IOREGS[n] = 0;

	// Registers at 0x20/0x24 and 0x30/0x34 are written as 32-bit words:
	// 0x100 in the low halfword of the first, 0x100 in the high halfword of the second
	*(int*)(IOREGS + 0x20) = 0x100;
	*(int*)(IOREGS + 0x24) = 0x1000000;
	*(int*)(IOREGS + 0x30) = 0x100;
	*(int*)(IOREGS + 0x34) = 0x1000000;

	// Default ordering: layer i has priority i
	n = 0;
	while (n<5)
	{
		layers[n].num = n;
		layers[n].prio = n;
		n++;
	}

	// Channel masks for the 15-bit screen format
	*((int*)&bi.mask[0]) = 0x001F; 	// 00000000 000rrrrr
	*((int*)&bi.mask[1]) = 0x03E0; 	// 000000gg ggg00000
	*((int*)&bi.mask[2]) = 0x7C00;	// 0bbbbb00 00000000

	lastbg = -1;
	newframe = 0;
	frames = 0;
	scanline = 0;

	QueryPerformanceFrequency(&perfFreq);
	QueryPerformanceCounter(&perfCnt1);
	return true;
}


// Shuts the plugin down: destroys the OpenGL context, releases the window DC
// obtained in gpu_init and frees the emulated memory and screen buffers.
// All pointers into the freed blocks are cleared so that a repeated call, or a
// stray access after shutdown, does not touch freed memory.
void __stdcall gpu_close()
{
	kill_gl();

	// Every GetDC must be paired with a ReleaseDC
	if (dc!=NULL)
	{
		ReleaseDC(hwnd,dc);
		dc = NULL;
	}

	LocalFree(vscreenmem);
	LocalFree(VRAM);

	// vscreen/vscreen2 point into vscreenmem, the others into VRAM
	vscreenmem = NULL;
	vscreen = NULL;
	vscreen2 = NULL;
	VRAM = NULL;
	IOREGS = NULL;
	patternTbl = NULL;
	colorLUT = NULL;
	currPalette = NULL;
}


// Shows the modal configuration dialog (resource IDD_DIALOG1 of this DLL),
// owned by the given window, and returns the dialog's result value.
int __stdcall gpu_config(HWND hwnd)
{
	int dlgResult;

	dlgResult = DialogBoxParam(hInst,MAKEINTRESOURCE(IDD_DIALOG1),hwnd,DlgProc,NULL);
	return dlgResult;
}


// Generic property query used by the host.
//
// `what` selects the property. Plain values are returned directly; pointer
// results (palette description, memory block, name, error text, debug text)
// are returned cast to int. Unknown selectors yield 0.
int __stdcall gpu_get(int what)
{
	int result = 0;

	switch (what)
	{
		case GPU_CURRENT_SCANLINE:
			result = scanline;
			break;
		case GPU_SCREEN_WIDTH:
			result = width;
			break;
		case GPU_SCREEN_HEIGHT:
			result = visibleLines;
			break;
		case GPU_VBLANK_LINES:
			result = totalLines-visibleLines;
			break;
		case GPU_HBLANK_PIXELS:
			result = 68;
			break;
		case GPU_NUMBER_COLORS:
			result = 256;
			break;

		case GPU_PALETTE:
			// Describe the palette memory as a 32x16, top-down, 16-bit bitmap
			memset(&palDib.bih,0,sizeof(BITMAPINFOHEADER));
			palDib.bih.biSize = sizeof(BITMAPINFOHEADER);
			palDib.bih.biWidth = 32;
			palDib.bih.biHeight = -16;
			palDib.bih.biPlanes = 1;
			palDib.bih.biBitCount = 16;
			palDib.bih.biCompression = BI_BITFIELDS;

			*((int*)&palDib.colors[0]) = 0x001F;	// 00000000 000rrrrr
			*((int*)&palDib.colors[1]) = 0x03E0;	// 000000gg ggg00000
			*((int*)&palDib.colors[2]) = 0x7C00;	// 0bbbbb00 00000000

			palDib.data = (char*)&VRAM[PALETTE_OFFSET];
			result = (int)&palDib;
			break;

		case GPU_MEMORY:
			result = (int)VRAM;
			break;
		case PLUGIN_NAME:
			result = (int)&gpuName;
			break;
		case LAST_ERROR:
			result = (int)lpszError;
			break;

		case 0x700:
			// Debug hook: formats a few renderer variables into the shared text buffer
			wsprintf(buffer,"x=%d, y=%d, pa-pd = %d, %d, %d, %d\n",xtile,hofs,objdx,objdy,objdmx,objdmy);
			result = (int)&buffer[0];
			break;

		default:
			break;
	}

	return result;
}


// Generic property setter used by the host.
//
// `what` selects the property; the meaning of param1/param2 depends on it:
//   CPU_IRQ_CALLBACK / MMU_READ_WORD_CALLBACK  param1 = callback pointer
//   GPU_VBLANK_ACTION / GPU_HBLANK_ACTION      param1 = action, param2 = address
//   PLUGIN_SETTING                             param1 = setting number, param2 = value
//   GPU_FRAME_RATE                             param1 = frames per second
//   ZOMBIE_WINDOW_HANDLE                       param1 = window handle
// Unknown selectors and settings are ignored, as are frame rates <= 0.
void __stdcall gpu_set(int what,int param1,int param2)
{
	switch (what)
	{
		case CPU_IRQ_CALLBACK:
			cpu_irq = (cpu_irq_callback)param1;
			break;
		case MMU_READ_WORD_CALLBACK:
			mmu_read_word = (mmu_read_word_callback)param1;
			break;
		case GPU_VBLANK_ACTION:
			vbiAction = param1;
			vbiAddress = param2;
			break;
		case GPU_HBLANK_ACTION:
			hbiAction = param1;
			hbiAddress = param2;
			break;
		case PLUGIN_SETTING:
			switch (param1)
			{
				case 0:
					frameskip = param2;
					break;
				case 1:
					limitFps = (param2!=0);
					break;
				case 2:
					enableObj = (param2!=0);
					break;
				case 4:
					// Bring OpenGL up when it is being switched on; if that fails make
					// sure glOK is false so the "useGL && glOK" checks skip all GL calls.
					if (!useGL && (param2!=0))
					{
						if (!setup_gl())
							glOK = false;
					}
					useGL = (param2!=0);
					break;
				case 5:
					glFormat = param2;
					break;
				case 6:
					filter = (param2==0)?GL_NEAREST:GL_LINEAR;
					if (useGL && glOK)
					{
						glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, filter);
						glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, filter);
					}
					break;
				case 7:
					if (param2 > 0)
						frameInterval = (0x10000 / (unsigned int)param2) - 2;
					break;
				default:
					break;
			}
			break;

		case GPU_FRAME_RATE:
			// Same guard as setting 7: a rate of zero would divide by zero and a
			// negative one would be reinterpreted as a huge unsigned divisor.
			if (param1 > 0)
				frameInterval = (0x10000 / (unsigned int)param1) - 2;
			break;

		case ZOMBIE_WINDOW_HANDLE:
			hwnd = (HWND)param1;
			break;
		default:
			break;
	}
}


// Called by the host at the start of each horizontal blank.
//
// Sets the hblank flag (bit 1 of the status byte at register offset 4), raises
// the hblank interrupt when it is enabled both in that status byte (bit 4) and
// in the interrupt enable register at 0x200 (bit 1), notifies the MMU and then
// renders the line. draw_scanline is run with `scanline` temporarily advanced
// by one, and only while that advanced value is below 0xA1.
void __stdcall gpu_hblank()
{
	bool wantIrq;

	VRAM[IOREGS_OFFSET+4] |= 0x2;

	wantIrq = (scanline<visibleLines) &&
	          hblankEnabled &&
	          ((VRAM[IOREGS_OFFSET+4] & 0x10) != 0) &&
	          ((IOREGS[0x200] & 0x02) != 0);
	if (wantIrq)
	{
		IOREGS[0x202] |= 2;		// Flag the request in the register at 0x202
		cpu_irq(0x00000018);
	}

	mmu_irq(2);	// Signal a hblank to the MMU

	if (scanline+1 < 0xA1)
	{
		scanline++;
		draw_scanline();
		scanline--;
	}
}



// Called by the host when a scanline (including its hblank) has finished.
//
// Clears the hblank flag, advances the line counter and mirrors it into the
// register byte at offset 6. When the counter reaches visibleLines+1 the vblank
// flag is set, the vblank interrupt is raised if enabled (status bit 3 and bit 0
// of the enable register at 0x200) and the MMU is notified. When it reaches
// totalLines the frame is finished and the counters wrap back to line 0.
void __stdcall gpu_advance_line()
{
	VRAM[IOREGS_OFFSET+4] &= 0xFD;	// Clear hblank bit
	scanline++;
	VRAM[IOREGS_OFFSET+6] = (unsigned char)scanline;

	if (scanline==visibleLines+1)
	{
		VRAM[IOREGS_OFFSET+4] |= 1;	// Set vblank bit

		if (vblankEnabled &&
			((VRAM[IOREGS_OFFSET+4] & 0x8) != 0) &&
			((IOREGS[0x200] & 0x1) != 0))
		{
			IOREGS[0x202] |= 1;
			cpu_irq(0x00000018);
		}

		mmu_irq(1);	// Signal a vblank to the MMU
		return;
	}

	if (scanline==totalLines)
	{
		finish_screen();
		reset_screen();
		VRAM[IOREGS_OFFSET+6] = 0;
		scanline = 0;
		VRAM[IOREGS_OFFSET+4] &= 0xFE;	// Clear vblank bit
	}
}





// Copies the first `items` entries of layers[] into sorted_layers[] and sorts
// them in descending order of prio, with ties broken by descending num.
//
// Implemented as a straight insertion sort. Since (prio, num) defines a total
// order, the resulting array is the same as with a gapped (Shell) variant.
// `items` must not exceed the size of the two arrays (5).
void sort_layers(int items)
{
	int pos,slot;
	tag_bg key,prev;

	for (pos=0; pos<items; pos++)
		sorted_layers[pos] = layers[pos];

	for (pos=1; pos<items; pos++)
	{
		key = sorted_layers[pos];

		// Shift entries right until one that must stay in front of key is found
		for (slot=pos; slot>0; slot--)
		{
			prev = sorted_layers[slot-1];
			if ((key.prio < prev.prio) ||
			    ((key.prio == prev.prio) && (key.num < prev.num)))
				break;
			sorted_layers[slot] = prev;
		}
		sorted_layers[slot] = key;
	}
}


// Converts 16-bit colors from the emulated 0BBBBBGGGGGRRRRR layout to
// RRRRRGGGGGBBBBB0 (bit 0 is left clear).
//
//   dst  destination buffer
//   src  source buffer
//   len  number of BYTES to convert; must be non-negative. Only whole vectors are
//        processed (16 bytes with SSE2, 8 bytes with MMX); a length shorter than
//        one vector converts nothing.
//
// The MMX path ends with emms so that the x87 FPU is usable again afterwards.
void flip_colors(unsigned char *dst,unsigned char *src,int len)
{
	if (hasSSE2)
		__asm
		{
			pusha
			mov esi,[src]
			mov edi,[dst]
			mov ecx,[len]
			movdqa xmm0,[MASKBLUE]
			movdqa xmm1,[MASKGREEN]
			shr ecx,4				// Number of 8-color groups
			jz __bgr0555_to_rgb5551_sse2_done	// Nothing to do; avoids a wrapped counter
__bgr0555_to_rgb5551_sse2:
			movdqu xmm2,[esi]		// Read 8 colors
			movdqa xmm3,xmm2		// Copy
			movdqa xmm4,xmm2		// Copy
			pand xmm2,xmm0			// Bits 0-4
			psrlw xmm3,10
			pand xmm4,xmm1			// Bits 5-9
			psllw xmm2,11			// Bits 0-4 -> bits 11-15
			pand xmm3,xmm0			// Bits 10-14, moved down to 0-4
			psllw xmm4,1			// Bits 5-9 -> bits 6-10
			psllw xmm3,1			// -> bits 1-5
			por xmm2,xmm4
			add esi,16
			por xmm3,xmm2
			movdqu [edi],xmm3
			add edi,16
			dec ecx
			jnz	__bgr0555_to_rgb5551_sse2
__bgr0555_to_rgb5551_sse2_done:
			popa
		}
	else
		__asm
		{
			pusha
			mov esi,[src]
			mov edi,[dst]
			mov ecx,[len]
			movq mm0,[MASKBLUE]
			movq mm1,[MASKGREEN]
			shr ecx,3				// Number of 4-color groups
			jz __bgr0555_to_rgb5551_mmx_done	// Nothing to do; avoids a wrapped counter
__bgr0555_to_rgb5551_mmx:
			movq mm2,[esi]		// Read 4 colors
			movq mm3,mm2		// Copy
			movq mm4,mm2		// Copy
			pand mm2,mm0
			psrlw mm3,10
			pand mm4,mm1
			psllw mm2,11
			pand mm3,mm0
			psllw mm4,1
			psllw mm3,1
			por mm2,mm4
			add esi,8
			por mm3,mm2
			movq [edi],mm3
			add edi,8
			dec ecx
			jnz	__bgr0555_to_rgb5551_mmx
__bgr0555_to_rgb5551_mmx_done:
			popa
			emms				// Leave MMX state, as the other MMX routines in this file do
		}
}

//////////////////////////////////////////////////////////////////////////
// Convert palette `palnum` (256 colors, 512 bytes) from BGR0555 to RGB5551
// into palfx and make palfx the current palette. Does nothing when the
// palette is already marked as flipped.
//
// NOTE(review): when the palette is already flipped, currPalette is NOT
// redirected to palfx, so the caller keeps whatever palette it selected
// before the call. Confirm this is intended.
//////////////////////////////////////////////////////////////////////////
void flip_palette(int palnum)
{
	if (!isFlipped[palnum])
	{
		isFlipped[palnum] = true;

		currPalette = (unsigned char*)&palfx[0];
		flip_colors(currPalette,&VRAM[PALETTE_OFFSET + (palnum<<9)],512);
	}
}



// Prepares palette and blending state for drawing background `bg`.
//
// Reads the color effect mode from bits 6-7 of the register at 0x50:
//   0x40  alpha blending: if `bg` and the previously drawn background (lastbg)
//         are the source/target pair, loads eva/evb and sets ablend to 1
//   0x80  brightness increase: builds palfx with c + (31-c)*evy/16 per channel
//   0xC0  brightness decrease (only if `bg` is selected in register 0x50):
//         builds palfx with c - c*evy/16 per channel
//   0x00  no effect; in OpenGL 5-5-5-1 mode the palette is channel-flipped
// On return currPalette points at the palette to draw with, lumachange tells
// which brightness effect is active and ablend is 1 or 0.
//
// Coefficients above 16 are clamped to 16, matching the hardware definition of
// EVA/EVB/EVY (17..31 behave like 16/16) and keeping the per-channel arithmetic
// within 5 bits.
//
// NOTE(review): the brightness-increase branch is applied to every background,
// while the decrease branch checks the per-background bit in register 0x50.
// The odd indentation suggests a similar check was removed at some point.
void setup_color_effects(int bg)
{
	int srcbg,dstbg;

	// Point the current palette to the regular palette
	currPalette = &VRAM[PALETTE_OFFSET];

	// Set red/blue shift amounts depending on what rendering mode and
	// texture format is used.
	if (useGL && glOK && (glFormat==GL_UNSIGNED_SHORT_5_5_5_1))
	{
		*(int*)&SHAMT1 = 10;
		*(int*)&SHAMT2 = 0;
	} else
	{
		*(int*)&SHAMT1 = 0;
		*(int*)&SHAMT2 = 10;
	}

	// Check if color effects are enabled
	lumachange = 0;
	ablend = IOREGS[0x50]&0xC0;
	if (ablend==0x40)
	{
		srcbg = log2_lut[IOREGS[0x50]&0x1F];
		dstbg = log2_lut[IOREGS[0x51]&0x1F];
		if (((bg==srcbg)&&(lastbg==dstbg)) || ((bg==dstbg)&&(lastbg==srcbg)))
		{
			eva = IOREGS[0x52]&0x1F;
			evb = IOREGS[0x53]&0x1F;
			if (eva > 16) eva = 16;
			if (evb > 16) evb = 16;
			ablend = 1;
			//blendwith[bg] = lastbg;
		}
//////////////////
// Luma increase
//////////////////
	} else if (ablend==0x80)
	{
		lumachange = 1;
		evy = IOREGS[0x54]&0x1F;
		if (evy > 16) evy = 16;
		currPalette = (unsigned char*)&palfx[0];

/////////////////////////////////////////////////////////////////////////////////
// Create another palette in which all colors have been added (31-itself)*EVY/16
/////////////////////////////////////////////////////////////////////////////////
		if (hasSSE2)
			__asm
			{
				pusha
				mov esi,[VRAM]
				movd xmm0,[evy]
				mov edi,[currPalette]
				punpcklwd xmm0,xmm0
				add esi,PALETTE_OFFSET
				movdqa xmm1,[MASKBLUE]	// XMM1 - 001F 001F 001F ....
				punpckldq xmm0,xmm0
				mov ecx,32				// 32 * 8 colors = 256 colors
				punpcklqdq xmm0,xmm0	// XMM0 - evy evy evy ....
	__setup_luma_inc_sse2:
				movdqu xmm2,[esi]		// Read 8 colors
				movdqa xmm3,xmm2		// Copy
				movdqa xmm5,xmm2		// Copy
				pand xmm3,xmm1			// XMM3 - Keep reds
				psrlw xmm5,5
				movdqa xmm4,xmm1
				pand xmm5,xmm1			// XMM5 - Keep greens
				psubw xmm4,xmm3			// XMM4 - 31-red
				movdqa xmm6,xmm1
				pmullw xmm4,xmm0
				psubw xmm6,xmm5			// XMM6 - 31-green
				psrlw xmm4,4
				pmullw xmm6,xmm0
				paddw xmm3,xmm4			// XMM3 - red + (31-red)*EVY/16
				psrlw xmm2,10
				psrlw xmm6,4
				pand xmm2,xmm1			// XMM2 - Keep blues
				psllw xmm3,[SHAMT1]
				paddw xmm5,xmm6			// XMM5 - green + (31-green)*EVY/16
				movdqa xmm7,xmm1
				psllw xmm5,5
				psubw xmm7,xmm2			// XMM7 - 31-blue
				por xmm3,xmm5
				pmullw xmm7,xmm0
				add esi,16				// Advance the source (was a second "add edi")
				psrlw xmm7,4
				paddw xmm2,xmm7			// XMM2 - blue + (31-blue)*EVY/16
				psllw xmm2,[SHAMT2] //10
				por xmm3,xmm2
				movdqu [edi],xmm3
				add edi,16
				dec ecx
				jnz __setup_luma_inc_sse2
				popa
			}
		else
			__asm
			{
				pusha
				mov esi,[VRAM]
				movd mm0,[evy]
				mov edi,[currPalette]
				punpcklwd mm0,mm0
				add esi,PALETTE_OFFSET
				movq mm1,[MASKBLUE]		// mm1 - 001F 001F 001F 001F
				punpckldq mm0,mm0		// mm0 - evy evy evy evy
				mov ecx,64				// 64 * 4 colors = 256 colors
	__setup_luma_inc_mmx:
				movq mm2,[esi]			// Read 4 colors
				movq mm3,mm2			// Copy
				movq mm5,mm2			// Copy
				pand mm3,mm1			// mm3 - Keep reds
				psrlw mm5,5
				movq mm4,mm1
				pand mm5,mm1			// mm5 - Keep greens
				psubw mm4,mm3			// mm4 - 31-red
				movq mm6,mm1
				pmullw mm4,mm0
				psubw mm6,mm5			// mm6 - 31-green
				psrlw mm4,4
				pmullw mm6,mm0
				paddw mm3,mm4			// mm3 - red + (31-red)*EVY/16
				psrlw mm2,10
				psrlw mm6,4
				pand mm2,mm1			// mm2 - Keep blues
				psllw mm3,[SHAMT1]
				paddw mm5,mm6			// mm5 - green + (31-green)*EVY/16
				movq mm7,mm1
				psllw mm5,5
				psubw mm7,mm2			// mm7 - 31-blue
				por mm3,mm5
				pmullw mm7,mm0
				add esi,8				// Advance the source (was a second "add edi")
				psrlw mm7,4
				paddw mm2,mm7			// mm2 - blue + (31-blue)*EVY/16
				psllw mm2,[SHAMT2] //10
				por mm3,mm2
				movq [edi],mm3
				add edi,8
				dec ecx
				jnz __setup_luma_inc_mmx
				popa
				emms
			}

	} else if (ablend==0xC0)
	{
//////////////////
// Luma decrease
//////////////////
		if (IOREGS[0x50]&(1<<bg))
		{
			lumachange = 3;
			evy = IOREGS[0x54]&0x1F;
			if (evy > 16) evy = 16;
			currPalette = (unsigned char*)&palfx[0];
/////////////////////////////////////////////////////////////////////////////////
// Create another palette in which all colors have been subtracted itself*EVY/16
/////////////////////////////////////////////////////////////////////////////////
			if (hasSSE2)
				__asm
				{
					pusha
					mov esi,[VRAM]
					movd xmm0,[evy]
					mov edi,[currPalette]
					punpcklwd xmm0,xmm0
					add esi,PALETTE_OFFSET
					movdqa xmm1,[MASKBLUE]
					punpckldq xmm0,xmm0
					mov ecx,32
					punpcklqdq xmm0,xmm0
		__setup_luma_dec_sse2:
					movdqu xmm2,[esi]	// Read 8 colors
					movdqa xmm3,xmm2	// Copy
					movdqa xmm5,xmm2	// Copy
					pand xmm3,xmm1		// XMM3 - Keep reds
					psrlw xmm5,5		// XMM5 - Shift right by 5
					movdqa xmm4,xmm3	// XMM4 - Reds
					pand xmm5,xmm1		// XMM5 - Keep greens
					pmullw xmm3,xmm0	// XMM3 - Multiply by EVY
					movdqa xmm6,xmm5	// XMM6 - Greens
					psrlw xmm3,4		// XMM3 - Shift right by 4
					pmullw xmm5,xmm0	// XMM5 - Multiply by EVY
					psrlw xmm2,10		// XMM2 - Shift right by 10
					psubw xmm4,xmm3		// XMM4 - Subtract red*EVY/16 from red
					pand xmm2,xmm1		// XMM2 - Keep blues
					psrlw xmm5,4		// XMM5 - Shift right by 4
					psllw xmm4,[SHAMT1]
					psubw xmm6,xmm5		// XMM6 - Subtract green*EVY/16 from green
					movdqa xmm3,xmm2	// XMM3 - Blues
					psllw xmm6,5
					pmullw xmm2,xmm0
					por xmm4,xmm6
					psrlw xmm2,4
					psubw xmm3,xmm2		// XMM3 - Subtract blue*EVY/16 from blue
					add esi,16
					psllw xmm3,[SHAMT2] //10
					por xmm4,xmm3
					movdqu [edi],xmm4
					add edi,16
					dec ecx
					jnz __setup_luma_dec_sse2
					popa
				}
			else
				__asm
				{
					pusha
					mov esi,[VRAM]
					movd mm0,[evy]
					mov edi,[currPalette]
					punpcklwd mm0,mm0
					add esi,PALETTE_OFFSET
					movq mm1,[MASKBLUE]
					punpckldq mm0,mm0
					mov ecx,64
		__setup_luma_dec_mmx:
					movq mm2,[esi]		// Read 4 colors
					movq mm3,mm2		// Copy
					movq mm5,mm2		// Copy
					pand mm3,mm1		// mm3 - Keep reds
					psrlw mm5,5			// mm5 - Shift right by 5
					movq mm4,mm3		// mm4 - Reds
					pand mm5,mm1		// mm5 - Keep greens
					pmullw mm3,mm0		// mm3 - Multiply by EVY
					movq mm6,mm5		// mm6 - Greens
					psrlw mm3,4			// mm3 - Shift right by 4
					pmullw mm5,mm0		// mm5 - Multiply by EVY
					psrlw mm2,10		// mm2 - Shift right by 10
					psubw mm4,mm3		// mm4 - Subtract red*EVY/16 from red
					pand mm2,mm1		// mm2 - Keep blues
					psrlw mm5,4			// mm5 - Shift right by 4
					psllw mm4,[SHAMT1]	// Same channel shift as the SSE2 path (was missing here)
					psubw mm6,mm5		// mm6 - Subtract green*EVY/16 from green
					movq mm3,mm2		// mm3 - Blues
					psllw mm6,5
					pmullw mm2,mm0
					por mm4,mm6
					psrlw mm2,4
					psubw mm3,mm2		// mm3 - Subtract blue*EVY/16 from blue
					add esi,8
					psllw mm3,[SHAMT2] //10
					por mm4,mm3
					movq [edi],mm4
					add edi,8
					dec ecx
					jnz __setup_luma_dec_mmx
					popa
					emms
				}
		}
	} else if (useGL && glOK && (glFormat==GL_UNSIGNED_SHORT_5_5_5_1))
		flip_palette(0);

	// Anything other than an active blend pair means "no alpha blending"
	if (ablend!=1) ablend = 0;
}



// Alpha-blends one scanline (240 pixels, 480 bytes) in place.
//
// Inputs (all globals):
//   lineaddr  address of the line inside vscreen (destination, coefficient evb)
//   vscreen2  second screen; the pixels at the same offset are the source (coefficient eva)
//   eva, evb  blend coefficients in 1/16 units
// For each of the three 5-bit channels the result is
//   min(31, (dst*evb >> 4) + (src*eva >> 4))
// and is written back to the line in vscreen. Bit 15 of each result pixel is 0.
//
// The SSE2 path uses aligned 16-byte loads/stores, so both lines must be 16-byte
// aligned. EVA128/EVB128 are overwritten with the replicated coefficients.
void alpha_blend()
{
	if (hasSSE2)
////////////////////////////////////////////////////////////////
// SSE2 supported, use the SSE2 blender (~16 clocks/pixel per the original author)
////////////////////////////////////////////////////////////////
		__asm
		{
			pusha

			mov edi,[lineaddr]
			mov esi,edi
			sub esi,[vscreen]
			add esi,[vscreen2]			// Point esi to the same line but in vscreen2

			movd xmm0,[eva]				// Copy eva into xmm0
			movd xmm1,[evb]				// Copy evb into xmm1
			punpcklwd xmm0,xmm0			// Unpack xmm0 - 0000 0000 00aa 00aa
			punpcklwd xmm1,xmm1			// Unpack xmm1 - 0000 0000 00bb 00bb
			punpckldq xmm0,xmm0			// Unpack xmm0 - 00aa 00aa 00aa 00aa
			punpckldq xmm1,xmm1			// Unpack xmm1 - 00bb 00bb 00bb 00bb
			punpcklqdq xmm0,xmm0		// Unpack xmm0 - 00aa 00aa 00aa 00aa 00aa 00aa 00aa 00aa
			punpcklqdq xmm1,xmm1		// Unpack xmm1 - 00bb 00bb 00bb 00bb 00bb 00bb 00bb 00bb
			movdqa [EVA128],xmm0		// Save the result into EVA128
			movdqa xmm4,[MASKBLUE]		// 5-bit channel mask (001F in every word)
			movdqa [EVB128],xmm1		// Save the result into EVB128

			mov ecx,30					// 30 iterations * 8 pixels = 240 pixels
__ablend_sse2_loop:
			//rdtsc
			//mov dword ptr [fubar],eax
			movdqa xmm6,[esi]			// Copy 16 bytes (8 pixels) from vscreen2 into xmm6
			movdqa xmm7,[edi]			// Copy 16 bytes (8 pixels) from vscreen into xmm7

			movdqa xmm0,xmm7			// BLU - Copy xmm7 to xmm0, destination pixels
			movdqa xmm1,xmm6			// BLU - Copy xmm6 to xmm1, source pixels

			psrlw xmm0,10				// BLU - shift each pixel to the right by 10 [000b 000b 000b 000b]
			psrlw xmm1,10				// BLU - shift each pixel to the right by 10 [000b 000b 000b 000b]

			pand xmm0,xmm4
			pand xmm1,xmm4

			pmullw xmm0,[EVB128]		// BLU - Multiply dest component with EVB
			pmullw xmm1,[EVA128]		// BLU - Multiply source component with EVA
			psrlw xmm0,4				// BLU - Divide by 16
			psrlw xmm1,4
			paddw xmm0,xmm1				// BLU - Add source and dest components
			movdqa xmm3,xmm4
			movdqa xmm5,xmm0
			pcmpgtw xmm5,xmm3			// BLU - Mask of components exceeding 0x1F
			pand xmm3,xmm5				// BLU - 0x1F for those components
			pandn xmm5,xmm0				// BLU - Keep the components that don't exceed 0x1F
			por xmm5,xmm3				// BLU - Clamped result in xmm5

			movdqa xmm0,xmm7			// GRN - Copy xmm7 to xmm0, destination pixels
			movdqa xmm1,xmm6			// GRN - Copy xmm6 to xmm1, source pixels

			psrlw xmm0,5				// GRN - shift each pixel to the right by 5 [000g 000g 000g 000g]
			psrlw xmm1,5				// GRN - shift each pixel to the right by 5 [000g 000g 000g 000g]

			pand xmm0,xmm4
			pand xmm1,xmm4

			pmullw xmm0,[EVB128]
			pmullw xmm1,[EVA128]
			psrlw xmm0,4
			psrlw xmm1,4
			paddw xmm0,xmm1
			movdqa xmm3,xmm4
			movdqa xmm2,xmm0
			pcmpgtw xmm2,xmm3
			pand xmm3,xmm2
			pandn xmm2,xmm0
			por xmm2,xmm3				// GRN - Clamped result in xmm2
			psllw xmm5,10				// BLU - Shift components left by 10
			psllw xmm2,5				// GRN - Shift components left by 5
			por xmm5,xmm2				// Combine blue and green

			pand xmm7,xmm4				// RED - and xmm7 with the channel mask [000r 000r 000r 000r]
			pand xmm6,xmm4				// RED - and xmm6 with the channel mask [000r 000r 000r 000r]

			pmullw xmm7,[EVB128]
			pmullw xmm6,[EVA128]
			movdqa xmm3,xmm4
			psrlw xmm7,4
			psrlw xmm6,4
			paddw xmm7,xmm6
			movdqa xmm2,xmm7
			pcmpgtw xmm2,xmm3
			pand xmm3,xmm2
			pandn xmm2,xmm7
			por xmm2,xmm3				// RED - Clamped result in xmm2
			por xmm5,xmm2				// Combine with blue and green

			add esi,16

			movdqa [edi],xmm5			// Copy the 8 alpha blended pixels to the destination

			add edi,16

			//rdtsc
			//sub eax,dword ptr [fubar]
			//mov dword ptr [fubar],eax

			dec ecx
			jnz __ablend_sse2_loop
			popa
		}
	else
////////////////////////////////////////////////////////////////
// SSE2 not supported, resort to MMX blender (~32 clocks/pixel per the original author)
////////////////////////////////////////////////////////////////
		__asm
		{
			pusha

			mov edi,[lineaddr]

			movd mm0,[eva]				// Copy eva into mm0
			movd mm1,[evb]				// Copy evb into mm1

			add edi,480					// Point edi to the END of the line (see negative index below)

			punpcklwd mm0,mm0			// Unpack mm0 - 0000 0000 00aa 00aa
			punpcklwd mm1,mm1			// Unpack mm1 - 0000 0000 00bb 00bb

			mov esi,edi

			punpckldq mm0,mm0			// Unpack mm0 - 00aa 00aa 00aa 00aa
			punpckldq mm1,mm1			// Unpack mm1 - 00bb 00bb 00bb 00bb

			sub esi,[vscreen]

			movq [EVA128],mm0			// Save the result into EVA128
			movq mm4,[MASKBLUE]			// Channel mask (001F 001F 001F 001F)
			movq [EVB128],mm1			// Save the result into EVB128

			add esi,[vscreen2]			// Point esi to the same line but in vscreen2

			mov ecx,-60					// Use negative indexing to save 2 instructions in the inner loop

__ablend_mmx_loop:
			movq mm6,[esi+ecx*8]		// Copy 8 bytes (4 pixels) from vscreen2 into mm6
			movq mm7,[edi+ecx*8]		// Copy 8 bytes (4 pixels) from vscreen into mm7

			movq mm0,mm7				// BLU - Copy mm7 to mm0, destination pixels
			movq mm1,mm6				// BLU - Copy mm6 to mm1, source pixels

			psrlw mm0,10				// BLU - Shift each pixel to the right by 10 [000b 000b 000b 000b]
			psrlw mm1,10				// BLU - Shift each pixel to the right by 10 [000b 000b 000b 000b]

			pand mm0,mm4
			pand mm1,mm4

			pmullw mm0,[EVB128]			// BLU - Multiply dest component with EVB
			pmullw mm1,[EVA128]			// BLU - Multiply source component with EVA
			psrlw mm0,4					// BLU - Shift right by 4 (divide by 16)
			psrlw mm1,4
			paddw mm0,mm1				// BLU - Add source and dest components
			movq mm3,mm4
			movq mm5,mm0
			pcmpgtw mm5,mm3				// BLU - Make sure result doesn't exceed 0x1F
			pand mm3,mm5
			pandn mm5,mm0				// BLU - Keep those components that don't exceed 0x1F
			por mm5,mm3					// BLU - OR in the other components which have been set to 0x1F

			movq mm0,mm7				// GRN - Copy mm7 to mm0, destination pixels
			movq mm1,mm6				// GRN - Copy mm6 to mm1, source pixels

			psrlw mm0,5					// GRN - Shift each pixel to the right by 5 [000g 000g 000g 000g]
			psrlw mm1,5					// GRN - Shift each pixel to the right by 5 [000g 000g 000g 000g]

			pand mm0,mm4
			pand mm1,mm4

			pmullw mm0,[EVB128]			// GRN
			pmullw mm1,[EVA128]			// GRN
			psrlw mm0,4
			psrlw mm1,4
			paddw mm0,mm1
			movq mm3,mm4
			movq mm2,mm0
			pcmpgtw mm2,mm3
			pand mm3,mm2
			pandn mm2,mm0
			por mm2,mm3
			psllw mm5,10				// BLU - Shift components left by 10
			psllw mm2,5					// GRN - Shift components left by 5
			por mm5,mm2					// BLU - OR in the green bits

			pand mm7,mm4				// RED - and mm7 with the channel mask [000r 000r 000r 000r]
			pand mm6,mm4				// RED - and mm6 with the channel mask [000r 000r 000r 000r]

			pmullw mm7,[EVB128]			// RED
			pmullw mm6,[EVA128]			// RED
			movq mm3,mm4
			psrlw mm7,4
			psrlw mm6,4
			paddw mm7,mm6
			movq mm2,mm7
			pcmpgtw mm2,mm3
			pand mm3,mm2
			pandn mm2,mm7
			por mm2,mm3
			por mm5,mm2

			movq [edi+ecx*8],mm5		// Copy the 4 alpha blended pixels to the destination

			inc ecx
			jnz __ablend_mmx_loop
			popa
			emms						// Leave MMX state so the x87 FPU can be used again
		}
}



void draw_text_bg(int bg)
{
	int chrBase,scrBase,cntReg,colMode;

	// Get the control register offset of this background
	cntReg = IOREGS_OFFSET + 8 + (bg<<1);

	// Get the horizontal and vertical screen offsets
	hofs = IOREGS[0x10 + (bg<<2)]&0xFF;
	hofs += (IOREGS[0x11 + (bg<<2)]&0x1)<<8;
	vofs = IOREGS[0x12 + (bg<<2)]&0xFF;
	vofs += (IOREGS[0x13 + (bg<<2)]&0x1)<<8;

	// Get the CHR base, SCR base and color mode
	chrBase = (VRAM[cntReg] >> 2) & 3;
	scrBase = VRAM[cntReg+1] & 0x1F;
	colMode = VRAM[cntReg] & 0x80;

	hTileMask = (VRAM[cntReg+1] >> 6) & 3;
	vTileMask = textBgMasks[hTileMask][1];
	tileShift = textBgShifts[hTileMask];
	hScrMask = textScrMasks[hTileMask][0];
	hScrShift = textScrShifts[0][hTileMask];
	vScrMask = textScrMasks[hTileMask][1];
	vScrShift = textScrShifts[1][hTileMask];
	hTileMask = textBgMasks[hTileMask][0];

	setup_color_effects(bg);

	lastbg = bg;


///////////////////////////
// Handle 256x1 color BGs
///////////////////////////
	if (colMode == 0x80)
		__asm
		{
			pusha
			mov eax,[scanline]
			mov ebx,480
			dec eax
			mov edi,[vscreen]
			mul ebx
			add edi,eax				// edi = vscreen+scanline*480

			mov esi,[VRAM]
			mov eax,[scrBase]
			shl eax,11				// Select 2k screen base block
			mov ebx,[scanline]
			dec ebx
			add ebx,[vofs]

			mov edx,ebx
			and ebx,0xFF			// temp fix !!
			mov ecx,[vScrShift]
			and edx,[vScrMask]
			shl edx,cl
			mov ecx,ebx
			shr ebx,3
			shl ebx,6				// ebx = (scanline/8)*64
			add esi,edx
			add esi,eax
			add esi,ebx				// esi = nametable

			and ecx,7				// ecx = (scanline+yscroll)&7  (tile row)
			mov edx,[VRAM]
			mov ebx,ecx
			mov eax,[chrBase]
			xor ebx,7				// "Inverted" tile row (used for vflipping)
			shl eax,14				// Select 16k chr base block
			shl ebx,3
			shl ecx,3				// Multiply by 8 (number of bytes per row)
			add edx,eax				// edx = VRAM + chrBase
			mov [tileOffset+4],ebx
			//add edx,ecx				// edx += tile_row*8
			mov [dummy],edx
			mov [tileOffset],ecx

			mov [lineaddr],edi		// Save destination address for later use

			// Is this the bottom-most layer?
			test dword ptr [newframe],0x3
			jnz __m0_not_first_bg

__m0_first_bg:

			push ebp
			mov ebp,[currPalette]

			mov eax,[hofs]
			mov ebx,eax
			shr eax,3				// eax = xscroll/8
			mov [xtile],eax
			and eax,0x1F			// AND with the number of tiles per row
			and ebx,[hScrMask]
			mov ecx,[hScrShift]
			shl ebx,cl
			lea eax,[ebx+eax*2]
			xor ebx,ebx				// ebx = 0
			mov bx,[esi+eax]		// Get tile data
			inc dword ptr [xtile]	// Next tile..
			mov eax,ebx
			and ebx,0x3FF			// ebx = tile number
			shr eax,11				// Get vflip flag
			shl ebx,6				// ebx *= 8*8
			and eax,1
			add ebx,[dummy] 		// ebx += VRAM + chrBase + tile_row*8
			add ebx,[tileOffset+eax*4]

			mov eax,[hofs]
			//add ebx,8
			mov ecx,8
			and eax,7
			add ebx,eax
			sub ecx,eax				// ecx = 8 - (xscroll&7)  (length of first tile)
			__m0_tile0_draw:
				xor eax,eax			// Clear eax
				mov al,[ebx]		// Load pixel
				mov ax,[ebp + eax*2]	// Get 15-bit color value
				mov [edi],ax		// Store
				add edi,2			// Increase destination pointer
				inc ebx
				dec ecx
			jnz __m0_tile0_draw

			mov ecx,29
			__m0_draw_tiles:
				push ecx
				mov eax,[xtile]
				lea ebx,[eax*8]
				and eax,0x1f
				and ebx,[hScrMask]
				mov ecx,[hScrShift]
				shl ebx,cl
				lea eax,[ebx+eax*2]
				xor ebx,ebx				// ebx = 0
				mov bx,[esi+eax]		// Get tile data
				mov eax,ebx
				mov edx,ebx
				shr eax,11				// Get vflip flag
				inc dword ptr [xtile]
				and eax,1
				and edx,0x400			// Get hflip flag
				and ebx,0x3FF
				shr edx,10
				shl ebx,6
				add ebx,[tileOffset+eax*4]
				add ebx,[dummy] 
				mov eax,1
				lea ebx,[ebx+edx*8]
				sub eax,edx				
				sub ebx,edx				// if (hflip==1) ebx+=7
				sub eax,edx				
				mov edx,eax				// edx = (hflip==1) ? -1 : 1

				mov ecx,8
				__m0_draw:
					xor eax,eax
					mov al,[ebx]
					mov ax,[ebp + eax*2]
					mov [edi],ax
					add edi,2
					add ebx,edx
					dec ecx
				jnz __m0_draw
				pop ecx
				dec ecx
			jnz __m0_draw_tiles

__m0_tile31:
				mov eax,[xtile]
				lea ebx,[eax*8]
				and eax,0x1f
				and ebx,[hScrMask]
				mov ecx,[hScrShift]
				shl ebx,cl
				lea eax,[ebx+eax*2]
				xor ebx,ebx				// ebx = 0
				mov bx,[esi+eax]		// Get tile data
				mov eax,ebx				// Get vflip flag
				and ebx,0x3FF
				shr eax,11
				shl ebx,6
				and eax,1
				add ebx,[dummy] //edx
				add ebx,[tileOffset+eax*4]

				mov ecx,[hofs]
				and ecx,7
				jz __m0_tile31_end
				__m0_tile31_draw:
					xor eax,eax
					mov al,[ebx]
					mov ax,[ebp + eax*2]
					mov [edi],ax
					add edi,2
					inc ebx
					dec ecx
				jnz __m0_tile31_draw
__m0_tile31_end:
			jmp __m0_done

/////////////////////////////////////////
// Handle semi-transparent layers
/////////////////////////////////////////
__m0_not_first_bg:
			push ebp
			mov ebp,[currPalette]

			test [ablend],1
			jnz __m0_prepare_blend

			mov eax,[hofs]
			xor ebx,ebx
			shr eax,3
			mov [xtile],eax
			and eax,0x1F
			mov bx,[esi+eax*2]
			mov eax,ebx
			inc dword ptr [xtile]
			shr eax,11
			and ebx,0x3FF
			and eax,1
			shl ebx,6
			add ebx,[tileOffset+eax*4]
			add ebx,[dummy] 
		
			mov eax,[hofs]
			mov ecx,8
			and eax,7
			add ebx,eax
			sub ecx,eax
				__m0_tile0_draw_trans:
					mov al,[ebx]
					and eax,0xFF
					jz __m0_tile0_invisible
					mov ax,[ebp + eax*2]
					mov [edi],ax
__m0_tile0_invisible:
					add edi,2
					inc ebx
					dec ecx
				jnz __m0_tile0_draw_trans

			mov ecx,29
			__m0_draw_tiles_trans:
				push ecx
				mov eax,[xtile]
				xor ebx,ebx
				and eax,0x1F
				mov bx,[esi+eax*2]
				mov eax,ebx
				inc dword ptr [xtile]
				shr eax,11
				mov edx,ebx
				and eax,1
				and edx,0x400			// get hflip flag
				and ebx,0x3FF
				shr edx,10
				shl ebx,6
				add ebx,[tileOffset+eax*4]
				add ebx,[dummy] 
				mov eax,1
				lea ebx,[ebx+edx*8]
				sub eax,edx				
				sub ebx,edx				// if (hflip==1) ebx+=7
				sub eax,edx				
				mov edx,eax				// edx = (hflip==1) ? -1 : 1

				mov ecx,8
				__m0_draw_trans:
					mov al,[ebx]
					and eax,0xFF
					jz __m0_invisible
						mov ax,[ebp + eax*2]
						mov [edi],ax
					__m0_invisible:
					add edi,2
					add ebx,edx
					dec ecx
				jnz __m0_draw_trans
				pop ecx
				dec ecx
			jnz __m0_draw_tiles_trans

__m0_tile31_trans:
				mov eax,[xtile]
				xor ebx,ebx
				and eax,0x1F
				mov bx,[esi+eax*2]
				mov eax,ebx
				inc dword ptr [xtile]
				shr eax,11
				and ebx,0x3FF
				and eax,1
				shl ebx,6
				add ebx,[tileOffset+eax*4]
				add ebx,[dummy] //edx

				mov ecx,[hofs]
				and ecx,7
				jz __m0_tile31_trans_end
				__m0_tile31_draw_trans:
					mov al,[ebx]
					and eax,0xFF
					jz __m0_tile31_invisible
					mov ax,[ebp + eax*2]
					mov [edi],ax
__m0_tile31_invisible:
					add edi,2
					inc ebx
					dec ecx
				jnz __m0_tile31_draw_trans
__m0_tile31_trans_end:
				jmp __m0_done

/////////////////////////////////////////
// Handle alpha blended layers
/////////////////////////////////////////
__m0_prepare_blend:
			mov eax,[hofs]
			xor ebx,ebx
			shr eax,3
			mov [xtile],eax
			and eax,0x1F
			mov bx,[esi+eax*2]
			mov eax,ebx
			inc dword ptr [xtile]
			shr eax,11
			and ebx,0x3FF
			and eax,1
			shl ebx,6
			add ebx,[tileOffset+eax*4]
			add ebx,[dummy] //edx

			mov eax,[hofs]
			push edx
			mov ecx,8
			mov edx,edi
			and eax,7
			sub edx,[vscreen]
			sub ecx,eax
			add edx,[vscreen2]
			add ebx,eax
				__m0_tile0_draw_trans_bld:
					mov al,[ebx]
					and eax,0xFF
					jz __m0_tile0_invisible_bld
					mov ax,[ebp + eax*2]
					mov [edx],ax
					jmp __m0_tile0_opaque_bld
__m0_tile0_invisible_bld:
					mov ax,[edi]
					mov [edx],ax
__m0_tile0_opaque_bld:
					add edi,2
					add edx,2
					inc ebx
					dec ecx
				jnz __m0_tile0_draw_trans_bld
			pop edx

			mov ecx,29
			__m0_draw_tiles_trans_bld:
				push ecx
				mov eax,[xtile]
				xor ebx,ebx
				and eax,0x1F
				mov bx,[esi+eax*2]
				mov eax,ebx
				inc dword ptr [xtile]
				shr eax,11
				and ebx,0x3FF
				and eax,1
				shl ebx,6
				add ebx,[tileOffset+eax*4]
				add ebx,[dummy] //edx

				mov ecx,-8
				push edx
				add edi,16
				//xor ecx,-1
				mov edx,edi
				//inc ecx
				sub edx,[vscreen]
				add edx,[vscreen2]
				__m0_draw_trans_bld:
					mov al,[ebx]
					and eax,0xFF
					jz __m0_invisible_bld
					mov ax,[ebp + eax*2]
					mov [edx + ecx*2],ax
					jmp __m0_opaque_bld
__m0_invisible_bld:
					mov ax,[edi + ecx*2]
					mov [edx + ecx*2],ax
__m0_opaque_bld:
					//add edi,2
					//add edx,2
					inc ebx
					inc ecx
				jnz __m0_draw_trans_bld
				pop edx
				pop ecx
				dec ecx
			jnz __m0_draw_tiles_trans_bld

__m0_tile31_trans_bld:
				mov eax,[xtile]
				xor ebx,ebx
				and eax,0x1F
				mov bx,[esi+eax*2]
				mov eax,ebx
				inc dword ptr [xtile]
				shr eax,11
				and ebx,0x3FF
				and eax,1
				shl ebx,6
				add ebx,[tileOffset+eax*4]
				add ebx,[dummy] //edx

				mov ecx,[hofs]
				mov edx,edi
				sub edx,[vscreen]
				add edx,[vscreen2]
				and ecx,7
				jz __m0_tile31_trans_bld_end
				__m0_tile31_draw_trans_bld:
					mov al,[ebx]
					and eax,0xFF
					jz __m0_tile31_invisible_bld
					mov ax,[ebp + eax*2]
					mov [edx],ax
					jmp __m0_tile31_opaque_bld
__m0_tile31_invisible_bld:
					mov ax,[edi]
					mov [edx],ax
__m0_tile31_opaque_bld:
					add edi,2
					add edx,2
					inc ebx
					dec ecx
				jnz __m0_tile31_draw_trans_bld
__m0_tile31_trans_bld_end:


__m0_done:
			pop ebp
			popa
		}
	else
///////////////////////////
// Handle 16x16 color BGs
///////////////////////////
		__asm
		{
			pusha
			mov eax,[scanline]
			mov ebx,480
			dec eax
			mov edi,[vscreen]
			mul ebx
			add edi,eax				// edi = vscreen+scanline*480

			mov esi,[VRAM]
			mov eax,[scrBase]
			shl eax,11
			mov ebx,[scanline]
			dec ebx
			add ebx,[vofs]
			mov edx,ebx
			and ebx,0xFF			// temp fix !!
			mov ecx,[vScrShift]
			and edx,[vScrMask]
			shl edx,cl
			mov ecx,ebx
			shr ebx,3
			shl ebx,6				// ebx = (scanline/8)*64
			add esi,edx
			add esi,eax
			add esi,ebx				// esi = nametable

			and ecx,7
			mov edx,[VRAM]
			mov eax,[chrBase]
			shl eax,14
			shl ecx,2
			add edx,eax
			add edx,ecx
			mov [dummy],edx

			//test dword ptr [bg],0xF
			//jnz __m0_not_bg0_16
			push ebp
			//mov ebp,[currPalette]

			mov eax,[hofs]
			mov ebx,eax
			shr eax,3				// eax = xscroll/8
			mov [xtile],eax
			and eax,0x1f //[hTileMask]		// AND with the number of tiles per row
			and ebx,[hScrMask]
			mov ecx,[hScrShift]
			shl ebx,cl
			lea eax,[ebx+eax*2]
			xor ebx,ebx				// ebx = 0
			mov bx,[esi+eax]		// Get tile data
			inc dword ptr [xtile]	// Next tile..
			mov ebp,ebx
			and ebx,0x3FF			// ebx = tile number
			and ebp,0xF000
			shl ebx,5				// ebx *= 8*8
			shr ebp,7
			add ebx,edx				// ebx += VRAM + chrBase + tile_row*8
			add ebp,[currPalette]

			mov eax,[hofs]
			//add ebx,8
			mov ecx,8
			and eax,7
			sub ecx,eax					// ecx = 8 - (xscroll&7)  (length of first tile)
			shr eax,1
			add ebx,eax
			test ecx,1
			jz __m0_16_tile0_even
			test dword ptr [newframe],0x3
			jnz __m0_16_transp_1
				xor eax,eax				// Clear eax
				mov al,[ebx]			// Load pixel
				shr eax,4
				mov ax,[ebp + eax*2]	// Get 15-bit color value
				mov [edi],ax			// Store
				add edi,2				// Increase destination pointer
				inc ebx
				dec ecx
				jmp __m0_16_tile0_even
			__m0_16_transp_1:
				xor eax,eax				// Clear eax
				mov al,[ebx]			// Load pixel
				shr eax,4
				jz __m0_16_transp_2
				mov ax,[ebp + eax*2]	// Get 15-bit color value
				mov [edi],ax			// Store
				__m0_16_transp_2:
				add edi,2				// Increase destination pointer
				inc ebx
				dec ecx

__m0_16_tile0_even:
			test dword ptr [newframe],0x3
			jnz __m0_16_tile0_transp
			__m0_16_tile0_draw:
				cmp ecx,2
				jl __m0_16_tile0_end
				xor eax,eax				// Clear eax
				mov al,[ebx]			// Load pixel
				mov edx,eax
				shr eax,4
				and edx,0xF
				mov ax,[ebp + eax*2]	// Get 15-bit color value
				shl eax,16
				mov ax,[ebp + edx*2]
				mov [edi],eax			// Store
				add edi,4				// Increase destination pointer
				inc ebx
				sub ecx,2
			jmp __m0_16_tile0_draw
			__m0_16_tile0_transp:
				cmp ecx,2
				jl __m0_16_tile0_end
				xor eax,eax				// Clear eax
				mov al,[ebx]			// Load pixel
				mov edx,eax
					and eax,0xF
					jz __m0_16_transp_4
					mov ax,[ebp + eax*2]
					mov [edi],ax
				__m0_16_transp_4:
					shr edx,4
					jz __m0_16_transp_5
					mov ax,[ebp + edx*2]
					mov [edi+2],ax
				__m0_16_transp_5:
				add edi,4				// Increase destination pointer
				inc ebx
				sub ecx,2
			jmp __m0_16_tile0_transp

__m0_16_tile0_end:
/*			and ecx,1
			jz __m0_16_no_spare
			add edi,2
__m0_16_no_spare:*/

			mov ecx,29
			__m0_16_draw_tiles:
				push ecx
				mov eax,[xtile]
				lea ebx,[eax*8]
				//xor ebx,ebx
				and eax,0x1f //[hTileMask]
			and ebx,[hScrMask]
			mov ecx,[hScrShift]
			shl ebx,cl
			lea eax,[ebx+eax*2]
			xor ebx,ebx				// ebx = 0
			mov bx,[esi+eax]		// Get tile data
				inc dword ptr [xtile]
				mov ebp,ebx
				and ebx,0x3FF
				and ebp,0xF000
				shl ebx,5
				shr ebp,7
				add ebx,[dummy] //edx
				add ebp,[currPalette]

				mov ecx,4
				test dword ptr [newframe],0x3
				jnz __m0_16_draw_transp
				__m0_16_draw:
					xor eax,eax
					mov al,[ebx]
					mov edx,eax
					shr eax,4
					and edx,0xF
					mov ax,[ebp + eax*2]
					shl eax,16
					mov ax,[ebp + edx*2]
					mov [edi],eax
					add edi,4
					inc ebx
					dec ecx
				jnz __m0_16_draw
				jmp __m0_16_draw_next
			__m0_16_draw_transp:
					xor eax,eax
					mov al,[ebx]
					mov edx,eax
					and eax,0xF
					jz __m0_16_transp_6
					mov ax,[ebp + eax*2]
					mov [edi],ax
				__m0_16_transp_6:
					shr edx,4
					jz __m0_16_transp_7
					mov ax,[ebp + edx*2]
					mov [edi+2],ax
				__m0_16_transp_7:
					add edi,4
					inc ebx
					dec ecx
				jnz __m0_16_draw_transp

__m0_16_draw_next:
				pop ecx
				dec ecx
			jnz __m0_16_draw_tiles

__m0_16_tile31:
				mov eax,[xtile]
				//xor ebx,ebx
				lea ebx,[eax*8]
				and eax,0x1f //[hTileMask]
			and ebx,[hScrMask]
			mov ecx,[hScrShift]
			shl ebx,cl
			lea eax,[ebx+eax*2]
			xor ebx,ebx				// ebx = 0
			mov bx,[esi+eax]		// Get tile data
				inc dword ptr [xtile]
				mov ebp,ebx
				and ebx,0x3FF
				and ebp,0xF000
				shl ebx,5
				shr ebp,7
				add ebx,[dummy] //edx
				add ebp,[currPalette]

				mov ecx,[hofs]
				test dword ptr [newframe],0x3
				jnz __m0_16_tile31_transp

				and ecx,7
				jz __m0_16_tile31_end
				__m0_16_tile31_draw:
					cmp ecx,2
					jl __m0_16_tile31_end
					xor eax,eax
					mov al,[ebx]
					mov edx,eax
					shr eax,4
					and edx,0xF
					mov ax,[ebp + eax*2]
					shl eax,16
					mov ax,[ebp + edx*2]
					mov [edi],eax
					add edi,4
					inc ebx
					sub ecx,2
				jmp __m0_16_tile31_draw
__m0_16_tile31_end:
			test ecx,1
			jz __m0_done_16 //__m0_16_tile31_even
				xor eax,eax				// Clear eax
				mov al,[ebx]			// Load pixel
				and eax,0xF
				mov ax,[ebp + eax*2]	// Get 15-bit color value
				mov [edi],ax			// Store
__m0_16_tile31_even:

				and ecx,7
				jz __m0_16_tile31t_end
			__m0_16_tile31_transp:
				cmp ecx,2
				jl __m0_16_tile31t_end
				xor eax,eax				// Clear eax
				mov al,[ebx]			// Load pixel
				mov edx,eax
					and eax,0xF
					jz __m0_16_transp_8
					mov ax,[ebp + eax*2]
					mov [edi],ax
				__m0_16_transp_8:
					shr edx,4
					jz __m0_16_transp_9
					mov ax,[ebp + edx*2]
					mov [edi+2],ax
				__m0_16_transp_9:
				add edi,4				// Increase destination pointer
				inc ebx
				sub ecx,2
			jmp __m0_16_tile31_transp
__m0_16_tile31t_end:
			test ecx,1
			jz __m0_done_16 
				xor eax,eax				// Clear eax
				mov al,[ebx]			// Load pixel
				and eax,0xF
				jz __m0_done_16
				mov ax,[ebp + eax*2]	// Get 15-bit color value
				mov [edi],ax			// Store

			jmp __m0_done_16

__m0_done_16:
			pop ebp
			popa
		}



////////////////////////////////
// Perform the alpha blending
////////////////////////////////
		if (ablend)
			alpha_blend();

	newframe++;
}





#define MAX_HTILE 0x1F


// Draws the current scanline of rotation/scaling background `bg` into the
// 16-bit line buffer.
//
// bg: background number. bg == 2 selects the bg2x/bg2y/BG2PA/BG2PC parameter
//     set; any other value selects the bg3x/bg3y/BG3PA/BG3PC set.
//
// Only backgrounds whose control register has bit 7 set (colMode == 0x80) are
// rendered; otherwise no pixels are written. In both cases alpha_blend() is
// called when `ablend` is non-zero and `newframe` is incremented.
//
// How a pixel is stored depends on global state:
//   - (newframe & 3) == 0: every pixel is written, including palette index 0.
//   - otherwise, bit 0 of ablend clear: pixels with index 0 are skipped.
//   - otherwise, bit 0 of ablend set: the line is written to vscreen2 (same
//     offset as in vscreen); index 0 pixels copy the existing vscreen pixel.
//
// Side effects: overwrites hofs, vofs, hTileMask, vTileMask, tileShift,
// lastbg, x2, y2, deltax, deltay and lineaddr.
void draw_rotscale_bg(int bg)
{
	int chrBase,scrBase,cntReg,colMode;

	// Get the control register offset of this background
	cntReg = IOREGS_OFFSET + 8 + (bg<<1);

	// Get the horizontal and vertical screen offsets
	// (hofs is not read by the assembly below; vofs only feeds a value that
	// is computed and then discarded, see the commented-out adds.)
	hofs = IOREGS[0x10 + (bg<<2)]&0xFF;
	hofs += (IOREGS[0x11 + (bg<<2)]&0x1)<<8;
	vofs = IOREGS[0x12 + (bg<<2)]&0xFF;
	vofs += (IOREGS[0x13 + (bg<<2)]&0x1)<<8;

	// Get the CHR base, SCR base and color mode
	chrBase = (VRAM[cntReg] >> 2) & 3;		// bits 2-3 of the low control byte
	scrBase = VRAM[cntReg+1] & 0x1F;		// bits 0-4 of the high control byte
	colMode = VRAM[cntReg] & 0x80;			// bit 7 of the low control byte

	setup_color_effects(bg);

	lastbg = bg;

	// Load the working texture coordinates (x2,y2) and the per-pixel
	// increments (deltax,deltay) for the selected background.
	if (bg == 2)
	{
		x2 = bg2x;
		y2 = bg2y;
		deltax = BG2PA;
		deltay = BG2PC;
	} else
	{
		x2 = bg3x;
		y2 = bg3y;
		deltax = BG3PA;
		deltay = BG3PC;
	}


	// Bits 6-7 of the high control byte index the rsBgMasks/rsBgShifts tables
	// (defined elsewhere). hTileMask is used as a temporary for that index
	// before being replaced by the actual horizontal mask.
	hTileMask = (VRAM[cntReg+1] >> 6) & 3;
	vTileMask = rsBgMasks[hTileMask][1];
	tileShift = rsBgShifts[hTileMask];
	hTileMask = rsBgMasks[hTileMask][0];

///////////////////////////
// Handle 256x1 color BGs
///////////////////////////
	if (colMode == 0x80)
		__asm
		{
			// Register roles for the loops below:
			//   esi = VRAM + scrBase*2048 (tile map base)
			//   edx = VRAM + chrBase*16384 (tile pixel data base)
			//   edi = destination pointer (16-bit pixels)
			//   ebp = scratch / palette pointer (saved with push ebp first,
			//         after the locals scrBase and chrBase have been read)
			pusha
			mov eax,[scanline]
			mov ebx,480
			dec eax
			mov edi,[vscreen]
			mul ebx
			add edi,eax				// edi = vscreen+(scanline-1)*480  (240 pixels * 2 bytes)

			mov esi,[VRAM]
			mov eax,[scrBase]
			shl eax,11				// Select 2k screen base block
			mov ebx,[scanline]
			dec ebx
			add ebx,[vofs]
			and ebx,0xFF			// temp fix !!
			mov ecx,ebx
			shr ebx,3
			shl ebx,5				// ebx = (scanline/8)*32
			add esi,eax
			// The add below is disabled, so the ebx value computed above is
			// never used; ebx is reloaded inside every loop.
			//add esi,ebx				// esi = nametable

			and ecx,7				// ecx = (scanline+yscroll)&7  (tile row)
			mov edx,[VRAM]
			mov eax,[chrBase]
			shl eax,14				// Select 16k chr base block
			shl ecx,3				// Multiply by 8 (number of bytes per row)
			add edx,eax				// edx = VRAM + chrBase
			// Disabled as well: ecx is cleared before each loop, so the tile
			// row computed above is discarded.
			//add edx,ecx				// edx += tile_row*8

			mov [lineaddr],edi		// Save destination address for later use


			// Is this the bottom-most layer?
			test dword ptr [newframe],0x3
			jnz __m1_not_first_bg

__m1_first_bg:

			push ebp
			mov ebp,[currPalette]

			// Opaque pass: 240 pixels, every pixel stored (index 0 included).
			xor ecx,ecx				// ecx = pixel counter (0..239)
__m1_hloop:
			push ecx
			mov ebx,[x2]
			mov ebp,[y2]
			sar ebx,8				// ebx = x2 >> 8 (texture x in pixels)
			sar ebp,8				// ebp = y2 >> 8 (texture y in pixels)
			mov eax,ebx
			mov ecx,ebp
			sar ebx,3				// ebx = tile column
			sar ebp,3				// ebp = tile row
			and eax,7				// eax = x within the tile
			and ecx,7				// ecx = y within the tile
			and ebx,[hTileMask]
			and ebp,[vTileMask]
			push ecx
			mov ecx,[tileShift]
			shl ebp,cl				// ebp = tile row << tileShift
			add ebx,ebp				// ebx = index into the tile map
			pop ecx
			movzx ebx,byte ptr [esi+ebx]	// ebx = tile number (one byte per map entry)
			shl ecx,3				// ecx = (y&7)*8
			shl ebx,6				// ebx = tile number*64 (8x8 bytes per tile)
			mov ebp,[currPalette]	// ebp was used as scratch above; reload the palette pointer
			add ebx,edx
			add eax,ecx				// eax = offset of the pixel inside the tile
			movzx ecx,byte ptr [ebx+eax]	// ecx = palette index
			mov ax,[ebp + ecx*2]	// look up the 16-bit color
			mov [edi],ax
			mov eax,[deltax]
			mov ebx,[deltay]
			add edi,2
			add [x2],eax			// step the texture coordinates to the next screen pixel
			add [y2],ebx
			pop ecx
			inc ecx
			cmp ecx,240
			jne __m1_hloop

			jmp __m1_done

/////////////////////////////////////////
// Handle semi-transparent layers
/////////////////////////////////////////
__m1_not_first_bg:
			push ebp
			mov ebp,[currPalette]

			test [ablend],1
			jnz __m1_prepare_blend

			// Transparent pass: same texel fetch as __m1_hloop, but pixels
			// with palette index 0 leave the destination untouched.
			xor ecx,ecx
__m1_hloop_transp:
			push ecx
			mov ebx,[x2]
			mov ebp,[y2]
			sar ebx,8
			sar ebp,8
			mov eax,ebx
			mov ecx,ebp
			sar ebx,3
			sar ebp,3
			and eax,7
			and ecx,7
			and ebx,[hTileMask]
			and ebp,[vTileMask]
			push ecx
			mov ecx,[tileShift]
			shl ebp,cl
			add ebx,ebp
			pop ecx
			movzx ebx,byte ptr [esi+ebx]
			shl ecx,3
			shl ebx,6
			mov ebp,[currPalette]
			add ebx,edx
			add eax,ecx
			movzx ecx,byte ptr [ebx+eax]
			cmp ecx,0
			jz __m1_transp			// index 0: skip the store
			mov ax,[ebp + ecx*2]
			mov [edi],ax
__m1_transp:
			mov eax,[deltax]
			mov ebx,[deltay]
			add edi,2
			add [x2],eax
			add [y2],ebx
			pop ecx
			inc ecx
			cmp ecx,240
			jne __m1_hloop_transp

			jmp __m1_done

/////////////////////////////////////////
// Handle alpha blended layers
/////////////////////////////////////////
__m1_prepare_blend:
			// ebp = read pointer into the vscreen line,
			// edi = write pointer at the same offset inside vscreen2.
			mov ebp,edi
			sub edi,[vscreen]
			xor ecx,ecx
			add edi,[vscreen2]
__m1_hloop_bld:
	/*push edx
	rdtsc
	mov dword ptr [fubar],eax
	pop edx*/
			push ecx
			push ebp				// save the vscreen read pointer; ebp is scratch below
			mov ebx,[x2]
			mov ebp,[y2]
			sar ebx,8
			sar ebp,8
			mov eax,ebx
			mov ecx,ebp
			sar ebx,3
			sar ebp,3
			and eax,7
			and ecx,7
			and ebx,[hTileMask]
			and ebp,[vTileMask]
			push ecx
			mov ecx,[tileShift]
			shl ebp,cl
			add ebx,ebp
			pop ecx
			movzx ebx,byte ptr [esi+ebx]
			shl ecx,3
			shl ebx,6
			mov ebp,[currPalette]
			add ebx,edx
			add eax,ecx
			movzx ecx,byte ptr [ebx+eax]
			cmp ecx,0
			jz __m1_bld_transp
			mov ax,[ebp + ecx*2]	// non-zero index: store this layer's color in vscreen2
			mov [edi],ax
			pop ebp					// restore the vscreen read pointer
			jmp __m1_bld_opaque
__m1_bld_transp:
			pop ebp					// restore the vscreen read pointer
			mov ax,[ebp]			// index 0: copy the existing vscreen pixel to vscreen2
			mov [edi],ax
__m1_bld_opaque:
			mov eax,[deltax]
			add ebp,2
			mov ebx,[deltay]
			add edi,2
			add [x2],eax
			add [y2],ebx
			pop ecx
			inc ecx
/*push edx
rdtsc
sub eax,dword ptr [fubar]
mov dword ptr [fubar],eax
pop edx*/
			cmp ecx,240
			jne __m1_hloop_bld
__m1_done:
			pop ebp					// matches the push ebp at the start of whichever path ran
			popa
		}

		// The two statements below are not part of the colMode test above;
		// they run on every call.
		if (ablend)
			alpha_blend();

		newframe++;
}




// Draws the sprites that intersect the current scanline into the vscreen
// line buffer.
//
// Runs only when forceLayer[4] is set, or when bit 4 of VRAM[IOREGS_OFFSET+1]
// is set and enableObj is non-zero; otherwise only an emms is executed.
//
// The 128 eight-byte entries at VRAM+SPRITE_TABLE_OFFSET are processed from
// index 127 down to 0, so lower-numbered entries are drawn last (on top).
//
// Rendering paths:
//   - regular sprites, 256-color and 16-color, with optional horizontal flip
//   - rotation/scaling sprites, 256-color only; the 16-color rot/scale label
//     (__sprite_rs_16x16) is empty and draws nothing.
//
// NOTE(review): only the hflip bit (0x10000000) is tested; no vertical-flip
// handling is visible here. The regular-sprite loops always draw the full
// sprite width with no clipping against the 240-pixel line, and sprite.y is
// compared as a plain 0..255 value (no wrap-around). Confirm whether these
// are intentional.
//
// Side effects: sets spriteBit6 and currPalette, uses spriteCnt, spriteTile,
// dummy, xclip, clippedWidth, objdx/objdy/objdmx/objdmy and objx2/objy2 as
// scratch, and uses MMX registers mm0-mm7 (cleared with emms on exit).
// `scanline` is decremented for the duration of the assembly and restored.
void draw_sprites()
{
	// Bit 6 of IOREGS[0]; selects how tile rows are stepped (see __bit6_0 and
	// __sprite_rs_2d below).
	spriteBit6 = IOREGS[0]&0x40;

	if (forceLayer[4] || (((VRAM[IOREGS_OFFSET+1] & 0x10) != 0) && enableObj)) //enableLayer[4]))
	{
		// Sprite colors are looked up 0x200 bytes into the palette area.
		currPalette = &VRAM[PALETTE_OFFSET+0x200];
		if (useGL && glOK && (glFormat==GL_UNSIGNED_SHORT_5_5_5_1))
			flip_palette(1);

		// Work with a zero-based line number inside the assembly.
		scanline--;
		__asm
		{
			pusha
			//dec dword ptr [scanline]
			mov ebx,480
			mov eax,[scanline]
			mul ebx
			add eax,[vscreen]
			mov edi,eax						// edi = vscreen + scanline*480 (start of this line)

			mov ecx,0						// (overwritten at the top of the loop)
			mov dword ptr [spriteCnt],127		// start with the last entry and count down to 0

__sprite_loop:
			// MMX registers are used as scratch storage per sprite:
			//   mm0 = attribute 0 (low word) and 1 (high word)
			//   mm1 = sprite.y            mm2 = attribute 2 dword
			//   mm3 = (line-sprite.y)&7)*8
			//   mm4 = height after the bit-9 doubling
			//   mm6 = height from objheight[] (later reused as a shift count)
			//   mm7 = bit 9 of attribute 0 (later the doubled width)
			mov dword ptr [xclip],0
			mov ecx,[spriteCnt]
			mov edx,[VRAM]
			add edx,SPRITE_TABLE_OFFSET
			mov eax,[edx+ecx*8]				// Get attribute 0 and 1
			mov esi,eax
			mov edx,eax
			and esi,0xC0000000				// ss00 0000 0000 0000 0000 0000 0000 0000
			and edx,0x0000C000				// SS00 0000 0000 0000
			shr esi,30						// 00ss
			shr edx,12						// SS00
			or esi,edx						// esi = SSss  (s=size, S=Shape)
			mov edx,[objheight+esi*4]
			push ecx						// save the sprite index
			movd mm6,edx
			mov ecx,eax
			and ecx,0x200
			shr ecx,9						// ecx = bit 9 of attribute 0 (0 or 1)
			shl edx,cl						// height is doubled when that bit is set
			dec edx

			movd mm0,eax
			and eax,0xFF					// eax = sprite.y
			lea ebx,[eax+edx] 				// ebx = sprite.y+(height-1)
			movd mm7,ecx
			inc edx
			//mov [rsheight],edx
			movd mm4,edx
			shr edx,cl
			pop ecx							// ecx = sprite index again
			dec edx
			cmp eax,dword ptr [scanline] 	// is the entire sprite below this scanline?
			jg __next_sprite
			cmp ebx,dword ptr [scanline] 	// is the entire sprite above this scanline?
			jl __next_sprite

			movd mm1,eax
			//movd eax,mm1				// eax = sprite.y
			movd eax,mm0
			test eax,0x100				// bit 8 of attribute 0 selects the rotation/scaling path
			jnz __sprite_rotoscale
			movd eax,mm1

			// ---- Regular (non rotation/scaling) sprite ----
			push edi 					// save edi
			mov ebx,[scanline]
			sub ebx,eax
			mov edx,ebx
			and ebx,7
			shl ebx,3 					// ebx = ((line-sprite.y)&7)*8  (row offset inside a 256-color tile)
			shr edx,3					// edx = (line-sprite.y)/8
			movd mm3,ebx

			test [spriteBit6],0x40
			jz __bit6_0
			// Bit 6 set: the tile-row stride is the sprite's own width in
			// 32-byte tile units, so spriteTile = tile row * tiles per row.
			push ecx
			movd ecx,mm0				// attribute 0
			and ecx,0x2000				// color bit
			shr ecx,13					// 1 for 256x1, 0 for 16x16
			mov eax,[objwidth+esi*4]
			shr eax,3					// eax = width/8 (tiles per row)
			shl eax,cl
			bsr cx,ax					// cl = index of the highest set bit of that stride
			shl edx,cl
			mov [spriteTile],edx
			//mov [spriteDbg],ecx
			pop ecx
			jmp __bit6
	__bit6_0:
			// Bit 6 clear: fixed stride of 32 tiles per tile row.
			shl edx,5
			mov [spriteTile],edx
	__bit6:

			movd eax,mm0

			mov edx,[objwidth+esi*4]
			mov esi,ebx					// esi = row offset inside the tile
			mov [dummy],edx				// dummy = sprite width in pixels
			mov ebx,eax
			dec edx
			and ebx,0x1FF0000
			shr ebx,16					// sprite.x
			add edx,ebx					// edx = sprite.x + width - 1
			cmp edx,0
			jl __sprite_done
			cmp ebx,240					// skip sprites that start at or beyond x=240
			jge __sprite_done

			add esi,[patternTbl]
			mov edx,[VRAM]
			add edx,SPRITE_TABLE_OFFSET+4
			mov eax,[edx+ecx*8]			// attribute 2
			movd mm2,eax
			and eax,0x3FF				// tile number
			add eax,[spriteTile]
			shl eax,5					// 32 bytes per tile unit

			add esi,eax 				// eax = tile*32 + (line-sprite.y)*8 + patternTbl
			lea edi,[edi+ebx*2]			// edi += sprite.x*2
			mov ebx,[currPalette] //[colorLUT]
			movd eax,mm0
			mov ecx,[dummy]				// ecx = width in pixels

			test eax,0x2000				// color bit clear: 16-color sprite
			jz __sprite_16x16
			test eax,0x10000000			// Check the HFLIP flag
			jnz __sprite_hflip

	// 256-color, not flipped: one byte per pixel, 8 pixels per tile row,
	// index 0 is transparent.
	__sprite_hloop_8:
		push ecx
		mov ecx,8
		__sprite_hloop_inner_8:
				xor eax,eax
				mov al,[esi]
				or eax,eax
				jz __sprite_pixel_invisible
				mov ax,[ebx+eax*2]
				mov [edi],ax
		__sprite_pixel_invisible:
				inc esi
				add edi,2
				dec ecx
			jnz __sprite_hloop_inner_8
		pop ecx
		add esi,56					// 8 bytes consumed + 56 = 64: same row of the next tile
		sub ecx,8
		jnz __sprite_hloop_8

		jmp __sprite_done

// 256-color, horizontally flipped: start one past the last pixel of the row
// in the right-most tile and walk backwards.
__sprite_hflip:
			mov eax,ecx
			shr eax,3
			dec eax
			shl eax,6					// eax = ((width/8)-1)*64
			add esi,eax
			add esi,8 //ecx
	__sprite_hloop_8_hflip:
			push ecx
			mov ecx,8
		__sprite_hloop_8_hflip_inner:
				dec esi
				xor eax,eax
				mov al,[esi]
				or eax,eax
				jz __sprite_pixel_invisible_hflip
				mov ax,[ebx+eax*2]
				mov [edi],ax
		__sprite_pixel_invisible_hflip:
				add edi,2
				dec ecx
			jnz __sprite_hloop_8_hflip_inner
		pop ecx
		sub esi,56					// move to one past the row end in the previous tile
		sub ecx,8
		jnz __sprite_hloop_8_hflip

		jmp __sprite_done

// 16-color sprites: two pixels per byte, 4 bytes per tile row, 32 bytes per
// tile. The palette pointer is offset by bits 12-15 of attribute 2.
__sprite_16x16:
			test eax,0x10000000			// Check the HFLIP flag
			jnz __sprite_hflip_16x16
			movd eax,mm3				// (line-sprite.y)*8
			shr eax,1					// Divide by 2
			sub esi,eax					// Subtract (line-sprite.y)*4 since each tile has half the number of bytes per row
			movd eax,mm2				// Attribute 2
			shr ecx,1					// Divide the length by 2 since 2 pixels are packed into each byte
			and eax,0xF000				// Palette
			shr eax,7					// Palette*32
			add ebx,eax
__sprite_hloop_4:
	push ecx
	mov ecx,4
	__sprite_hloop_inner_4:
			xor eax,eax
			xor edx,edx
			mov al,[esi]				// Read one byte (2 pixels)
			mov edx,eax					// Copy eax
			and eax,0xF					// Get lower pixel
			jz __sprite_pixel_invisible_4_1
			mov ax,[ebx+eax*2]
			mov [edi],ax
	__sprite_pixel_invisible_4_1:
			shr edx,4					// Get upper pixel
			jz __sprite_pixel_invisible_4_2
			mov ax,[ebx+edx*2]
			mov [edi+2],ax
	__sprite_pixel_invisible_4_2:
			inc esi
			add edi,4
			dec ecx
			jnz __sprite_hloop_inner_4
	pop ecx
	add esi,28					// 4 bytes consumed + 28 = 32: same row of the next tile
	sub ecx,4					// ecx counts bytes here (width/2)
	jnz __sprite_hloop_4
			jmp __sprite_done

// 16-color, horizontally flipped. Unlike the unflipped path, ecx keeps the
// width in pixels and is reduced by 8 per tile.
__sprite_hflip_16x16:
			movd eax,mm3				// (line-sprite.y)*8
			shr eax,1					// Divide by 2
			sub esi,eax					// Subtract (line-sprite.y)*4 since each tile has half the number of bytes per row
			movd eax,mm2				// Attribute 2
			and eax,0xF000				// Palette
			shr eax,7					// Palette*32
			add ebx,eax

			mov eax,ecx
			shr eax,3
			dec eax
			shl eax,5
			add esi,eax					// esi += ((width/8)-1)*32
			add esi,4 
	__sprite_hloop_16x16_hflip:
			push ecx
			mov ecx,4
		__sprite_hloop_16x16_hflip_inner:
				dec esi
				xor eax,eax
				xor edx,edx
				mov al,[esi]				// Read one byte (2 pixels)
				mov edx,eax					// Copy eax
				shr eax,4					// Get upper pixel
				jz __sprite_pixel_invisible_4_1_16x16hf
					mov ax,[ebx+eax*2]
					mov [edi],ax
				__sprite_pixel_invisible_4_1_16x16hf:
				and edx,0xF					// Get lower pixel
				jz __sprite_pixel_invisible_4_2_16x16hf
					mov ax,[ebx+edx*2]
					mov [edi+2],ax
				__sprite_pixel_invisible_4_2_16x16hf:
				add edi,4
				dec ecx
		jnz __sprite_hloop_16x16_hflip_inner
		pop ecx
		sub esi,28					// move to one past the row end in the previous tile
		sub ecx,8
	jnz __sprite_hloop_16x16_hflip
	jmp __sprite_done


// ---- Rotation/scaling sprite ----
// On entry: ecx = sprite index, esi = shape/size index, mm0 = attributes 0/1,
// mm1 = sprite.y, mm4 = doubled height, mm6 = base height, mm7 = bit-9 flag.
__sprite_rotoscale:
			mov edx,[VRAM]
			push ecx
			add edx,SPRITE_TABLE_OFFSET+6	// last halfword of each 8-byte entry
			movd ecx,mm7
			mov eax,[objwidth + esi*4]
			shl eax,cl						// width, doubled when bit 9 of attribute 0 is set
			movd mm7,eax					// mm7 now holds that width
			mov [clippedWidth],eax
			movd ecx,mm0
			and ecx,0x3E000000				// bits 25-29 of the attribute 0/1 dword: parameter group
			shr ecx,20						// group number * 32 bytes
			add edx,ecx
			push ebp

			movsx eax,word ptr [edx]		// PA-n
			movsx ebx,word ptr [edx+16]		// PC-n
			mov [objdx],eax					// per-pixel step of the texture x coordinate
			mov [objdy],ebx					// per-pixel step of the texture y coordinate

			movsx eax,word ptr [edx+8]		// PB-n
			movsx ebx,word ptr [edx+24]		// PD-n
			mov [objdmx],eax				// per-line step of the texture x coordinate
			mov [objdmy],ebx				// per-line step of the texture y coordinate

			sub edx,ecx
			pop ebp
			sub edx,2						// edx = VRAM + SPRITE_TABLE_OFFSET + 4
			pop ecx							// ecx = sprite index again
			push edi						// popped at __sprite_done

			mov eax,[edx+ecx*8]			// attribute 2
			movd ebx,mm0
			and eax,0x3FF
			and ebx,0x1FF0000
			mov [spriteTile],eax
			shr ebx,16						// ebx = sprite.x (0..511)
			//cmp edx,0
			//jl __sprite_done
			cmp ebx,240
			jl __sprite_rs_x_ok
			// sprite.x >= 240: only the part that wraps past x=512 is drawn.
			movd eax,mm7 
			add eax,ebx
			sub eax,513
			js __sprite_done				// x+width <= 512: nothing wraps, skip the sprite
			inc eax
			// clippedWidth = (sprite.x + sprite.width) - 512
			mov [clippedWidth],eax
			movd ebx,mm7 
			sub ebx,eax
			mov [xclip],ebx					// number of leading columns that are skipped
			xor ebx,ebx						// draw from screen x = 0
			__sprite_rs_x_ok:

			lea edi,[edi+ebx*2]

			// Start from the centre of the texture: half of the base
			// height/width, in the same units that "sar 8" below turns
			// into pixels.
			movd eax,mm6 //mm4
			mov ebx,[objwidth+esi*4]
			//movd ebx,mm7
			shl eax,7
			shl ebx,7
			mov [objy2],eax
			mov [objx2],ebx

			movd eax,mm1				// sprite.y
			mov ebx,[scanline]
			movd edx,mm4
			sub ebx,eax					// ebx = scanline-sprite.y
			shr edx,1					// edx = height/2
			inc ebx						// ebx = line (1..height)
			sub ebx,edx					// ebx = line offset relative to the sprite's vertical centre

			mov eax,ebx
			imul dword ptr [objdmy]
			add [objy2],eax				// objy2 += line offset * PD

			mov eax,ebx
			imul dword ptr [objdmx]
			add [objx2],eax				// objx2 += line offset * PB

			//mov ebx,[objwidth+esi*4]
			movd ebx,mm7
			xor ecx,ecx
			shr ebx,1
			neg ebx
			add ebx,[xclip]				// ebx = column offset of the first drawn pixel relative to the centre
			mov eax,ebx
			imul dword ptr [objdy]
			add [objy2],eax				// objy2 += column offset * PC
			mov eax,[objdx]
			imul ebx
			add [objx2],eax				// objx2 += column offset * PA

			movd ebx,mm0
			test ebx,0x2000
			jz __sprite_rs_16x16		// 16-color rot/scale sprites: nothing is drawn

			// 256-color rot/scale sprite. ebp becomes the palette pointer
			// until the matching pop ebp.
			push ebp
			mov ecx,[objwidth+esi*4]
			//movd ecx,mm7
			movd mm5,ecx				// mm5 = base width (texture x bound)
			mov eax,ecx
			mov ebp,[currPalette]
			shr eax,3
			mov edx,[objheight+esi*4]	// edx = base height (texture y bound)
			test [spriteBit6],0x40
			jz __sprite_rs_2d
				// Bit 6 set: tile-row stride = (width/8)*2 tile units.
				shl eax,1
				bsr cx,ax
				movd mm6,ecx			// mm6 = shift count for the tile-row stride
				jmp __sprite_rs_1d
	__sprite_rs_2d:
			// Bit 6 clear: fixed stride of 32 tile units per tile row.
			mov ecx,5
			movd mm6,ecx
	__sprite_rs_1d:

			mov ecx,[clippedWidth] 		// ecx = number of screen pixels to process
			mov eax,[spriteTile]
			mov esi,[patternTbl]
			shl eax,5
			add esi,eax					// esi = address of the sprite's first tile

__sprite_rs_hloop:
			push ecx
			mov ebx,[objy2]
			push esi
			sar ebx,8					// ebx = texture y
			js __sprite_rs_transp		// outside the texture: leave the pixel untouched
			cmp ebx,edx
			jge __sprite_rs_transp
			movd ecx,mm6
			mov eax,ebx
			shr ebx,3					// tile row
			and eax,7					// row within the tile
			shl ebx,cl					// tile row * stride (in tile units)
			lea esi,[esi+eax*8]
			mov eax,[objx2]
			shl ebx,5					// ... * 32 bytes per tile unit
			sar eax,8					// eax = texture x
			js __sprite_rs_transp
			movd ecx,mm5
			cmp eax,ecx
			jge __sprite_rs_transp
			mov ecx,eax
			shr eax,3
			and ecx,7
			shl eax,6		// eax = (u/8)*64
			add esi,ecx
			add esi,ebx
			mov al,[esi+eax]			// fetch the palette index
			and eax,255
			jz __sprite_rs_transp		// index 0 is transparent
			mov bx,[ebp+eax*2]
			mov [edi],bx
			mov eax,[objdx]
			mov ecx,[objdy]
			add [objx2],eax
			add edi,2
			add [objy2],ecx
			pop esi
			pop ecx
			dec ecx
			jnz __sprite_rs_hloop
			pop ebp
			jmp __sprite_done
__sprite_rs_transp:
			// Same stepping as above without storing a pixel.
			mov eax,[objdx]
			mov ecx,[objdy]
			add [objx2],eax
			add edi,2
			add [objy2],ecx
			pop esi
			pop ecx
			dec ecx
			jnz __sprite_rs_hloop
			pop ebp
			jmp __sprite_done
/*__sprite_rs_transp2:
			mov word ptr [edi],0x1F
			mov eax,[objdx]
			mov ecx,[objdy]
			add [objx2],eax
			add edi,2
			add [objy2],ecx
			pop esi
			pop ecx
			dec ecx
			jnz __sprite_rs_hloop
			pop ebp
			jmp __sprite_done
__sprite_rs_transp3:
			mov word ptr [edi],0x7FE0
			mov eax,[objdx]
			mov ecx,[objdy]
			add [objx2],eax
			add edi,2
			add [objy2],ecx
			pop esi
			pop ecx
			dec ecx
			jnz __sprite_rs_hloop
			pop ebp
			jmp __sprite_done*/

// Not implemented: falls straight through to __sprite_done.
__sprite_rs_16x16:

// Reached by every path that pushed edi; restores the start-of-line pointer.
__sprite_done:
			pop edi

// Reached directly by sprites rejected before edi was pushed.
__next_sprite:
			dec dword ptr [spriteCnt]
			//cmp dword ptr [spriteCnt],128
			jns __sprite_loop			// loop until the counter goes below 0
__sprites_done:
			//inc dword ptr [scanline]
			emms
			popa
		}
		scanline++;
	} else
		__asm
		{
			emms
		}
}



// Renders one scanline into the virtual screen (vscreen).
//
// `scanline` is used 1-based here: every destination address below is computed
// from (scanline-1)*480, i.e. 240 pixels of 2 bytes each per row.
//
// Steps, in order:
//  1. Refresh the BG2/BG3 rotation/scaling state (reference points bg2x/bg2y/
//     bg3x/bg3y and the PA..PD parameters). This runs even on skipped frames.
//  2. Return early if this frame is being skipped (frames != frameskip).
//  3. Modes 0-2: sort the layers by priority and call the per-mode/per-layer
//     callback from draw_bg_callbacks, then draw_sprites().
//  4. Modes 3-5: draw BG2 from the bitmap in VRAM with inline assembly, then
//     draw_sprites().
//
// Reads and writes many file-level globals; takes no arguments, returns nothing.
void draw_scanline()
{
	int tilebias = 0,i,j;
	bool glRenderer = (useGL && glOK);

	// Read/update rotation/scaling parameters
	if (scanline==1)
	{
		// First line of the frame: reload all reference points from the registers.
		// Only the low 28 bits are kept.
		BG2X = (*(unsigned int *)(IOREGS + 0x28)) & 0x0FFFFFFF;
		BG2Y = (*(unsigned int *)(IOREGS + 0x2C)) & 0x0FFFFFFF;
		BG3X = (*(unsigned int *)(IOREGS + 0x38)) & 0x0FFFFFFF;
		BG3Y = (*(unsigned int *)(IOREGS + 0x3C)) & 0x0FFFFFFF;
		__asm
		{
			// shl 4 / sar 4 sign-extends each 28-bit value to 32 bits
			mov eax,BG2X
			shl eax,4
			sar eax,4
			mov bg2x,eax
			mov eax,BG2Y
			shl eax,4
			sar eax,4
			mov bg2y,eax
			mov eax,BG3X
			shl eax,4
			sar eax,4
			mov bg3x,eax
			mov eax,BG3Y
			shl eax,4
			sar eax,4
			mov bg3y,eax
		}
		// The PA..PD parameters are 16 bits each
		BG2PA = (*(unsigned int *)(IOREGS + 0x20)) & 0xFFFF;
		BG2PB = (*(unsigned int *)(IOREGS + 0x22)) & 0xFFFF;
		BG2PC = (*(unsigned int *)(IOREGS + 0x24)) & 0xFFFF;
		BG2PD = (*(unsigned int *)(IOREGS + 0x26)) & 0xFFFF;
		BG3PA = (*(unsigned int *)(IOREGS + 0x30)) & 0xFFFF;
		BG3PB = (*(unsigned int *)(IOREGS + 0x32)) & 0xFFFF;
		BG3PC = (*(unsigned int *)(IOREGS + 0x34)) & 0xFFFF;
		BG3PD = (*(unsigned int *)(IOREGS + 0x36)) & 0xFFFF;
		__asm
		{
			// shl 16 / sar 16 sign-extends each 16-bit parameter in place
			shl dword ptr BG2PA,16
			sar dword ptr BG2PA,16
			shl dword ptr BG2PB,16
			sar dword ptr BG2PB,16
			shl dword ptr BG2PC,16
			sar dword ptr BG2PC,16
			shl dword ptr BG2PD,16
			sar dword ptr BG2PD,16
			shl dword ptr BG3PA,16
			sar dword ptr BG3PA,16
			shl dword ptr BG3PB,16
			sar dword ptr BG3PB,16
			shl dword ptr BG3PC,16
			sar dword ptr BG3PC,16
			shl dword ptr BG3PD,16
			sar dword ptr BG3PD,16
		}
	} else
	{
		// Any other line: reload the BG2 reference point only if its dirty flag
		// is set, otherwise advance it by PB/PD. The PB/PD used for the advance
		// are the values latched on the previous call; they are re-read below.
		// NOTE(review): the dirty flags are presumably set by whoever writes the
		// BGnX/BGnY registers - that code is not in this function.
		//if ((BG2X != ((*(unsigned int *)(IOREGS + 0x28)) & 0x0FFFFFFF)) ||
		//	(BG2Y != ((*(unsigned int *)(IOREGS + 0x2C)) & 0x0FFFFFFF)))
		if (IOREGS[BG2X_DIRTY]!=0)
		{
			BG2X = (*(unsigned int *)(IOREGS + 0x28)) & 0x0FFFFFFF;
			BG2Y = (*(unsigned int *)(IOREGS + 0x2C)) & 0x0FFFFFFF;
			__asm
			{
				// Sign-extend 28 -> 32 bits
				mov eax,BG2X
				shl eax,4
				sar eax,4
				mov bg2x,eax
				mov eax,BG2Y
				shl eax,4
				sar eax,4
				mov bg2y,eax
			}
		} else
		{
			bg2x += BG2PB;
			bg2y += BG2PD;
		}

		BG2PA = (*(unsigned int *)(IOREGS + 0x20)) & 0xFFFF;
		BG2PB = (*(unsigned int *)(IOREGS + 0x22)) & 0xFFFF;
		BG2PC = (*(unsigned int *)(IOREGS + 0x24)) & 0xFFFF;
		BG2PD = (*(unsigned int *)(IOREGS + 0x26)) & 0xFFFF;
		__asm
		{
			// Sign-extend 16 -> 32 bits
			shl dword ptr BG2PA,16
			sar dword ptr BG2PA,16
			shl dword ptr BG2PB,16
			sar dword ptr BG2PB,16
			shl dword ptr BG2PC,16
			sar dword ptr BG2PC,16
			shl dword ptr BG2PD,16
			sar dword ptr BG2PD,16
		}

		// Same procedure for BG3
		//if ((BG3X != ((*(unsigned int *)(IOREGS + 0x38)) & 0x0FFFFFFF)) ||
		//	(BG3Y != ((*(unsigned int *)(IOREGS + 0x3C)) & 0x0FFFFFFF)))
		if (IOREGS[BG3X_DIRTY]!=0)
		{
			BG3X = (*(unsigned int *)(IOREGS + 0x38)) & 0x0FFFFFFF;
			BG3Y = (*(unsigned int *)(IOREGS + 0x3C)) & 0x0FFFFFFF;
			__asm
			{
				// Sign-extend 28 -> 32 bits
				mov eax,BG3X
				shl eax,4
				sar eax,4
				mov bg3x,eax
				mov eax,BG3Y
				shl eax,4
				sar eax,4
				mov bg3y,eax
			}
		} else
		{
			bg3x += BG3PB;
			bg3y += BG3PD;
		}

		BG3PA = (*(unsigned int *)(IOREGS + 0x30)) & 0xFFFF;
		BG3PB = (*(unsigned int *)(IOREGS + 0x32)) & 0xFFFF;
		BG3PC = (*(unsigned int *)(IOREGS + 0x34)) & 0xFFFF;
		BG3PD = (*(unsigned int *)(IOREGS + 0x36)) & 0xFFFF;
		__asm
		{
			// Sign-extend 16 -> 32 bits
			shl dword ptr BG3PA,16
			sar dword ptr BG3PA,16
			shl dword ptr BG3PB,16
			sar dword ptr BG3PB,16
			shl dword ptr BG3PC,16
			sar dword ptr BG3PC,16
			shl dword ptr BG3PD,16
			sar dword ptr BG3PD,16
		}
	}

	// The dirty flags have been consumed for this line
	IOREGS[BG2X_DIRTY] = 0;
	IOREGS[BG3X_DIRTY] = 0;

	// Have we skipped enough frames?
	if (frames != frameskip)
		return;

	isFlipped[0] = isFlipped[1] = false;

	// Get mosaic values for BG2
	// (only when bit 6 of the byte at I/O offset 0x0C is set; the low and high
	// nibbles of the byte at offset 0x4C give the X and Y values)
	mosaicX = mosaicY = 0;
	if ((VRAM[IOREGS_OFFSET+0x0C] & 0x40) == 0x40)
	{
		mosaicX = VRAM[IOREGS_OFFSET+0x4C] & 0xF;
		mosaicY = (VRAM[IOREGS_OFFSET+0x4C] >> 4) & 0xF;
	}


	// j = video mode (low 3 bits of the first I/O register byte)
	j = (VRAM[IOREGS_OFFSET] & 7);
	currPalette = &VRAM[PALETTE_OFFSET];

	// Handle modes 0-2
	if ((VRAM[IOREGS_OFFSET] & 7) < 3)
	{
		// Sort BGs
		// Each BG's priority is the low 2 bits of the byte at I/O offset 8+2*i.
		// layers[4] gets priority 10, which is outside that 0..3 range.
		for (i=0; i<4; i++)
			layers[i].prio = VRAM[IOREGS_OFFSET + 8 + (i<<1)]&0x3;
		layers[4].prio = 10;
		sort_layers(5);

		lastbg = -1;

		// Draw sorted entries 1..4; entry 0 is skipped.
		// NOTE(review): presumably sort_layers() puts the priority-10 sentinel
		// at index 0 - confirm against sort_layers.
		// A layer is drawn if it is forced on, or if its enable bit in the byte
		// at I/O offset 1 is set and it has not been disabled in the config.
		for (i=1; i<5; i++)
			if (forceLayer[sorted_layers[i].num] ||
				(((VRAM[IOREGS_OFFSET+1] & (1<<sorted_layers[i].num))!=0) && enableLayer[sorted_layers[i].num]))
			{
				draw_bg = draw_bg_callbacks[j][sorted_layers[i].num];
				if (draw_bg != NULL)
					draw_bg(sorted_layers[i].num);
			}
		draw_sprites();
		newframe = 0;
		blended2 = blended;
		blended = 0;
		return;
	}


	// Handle modes 3-5
	//VRAM[IOREGS_OFFSET+1] |= 4;

	currPalette = &VRAM[PALETTE_OFFSET];

	// Mode 4 is palettized; with the GL renderer in 5551 format the palette is
	// passed through flip_palette() first.
	if ((j==4) && glRenderer && (glFormat==GL_UNSIGNED_SHORT_5_5_5_1))
		flip_palette(0);



	// Is BG2 enabled?

	if (forceLayer[2] || (((VRAM[IOREGS_OFFSET+1] & 0x4) != 0) && enableLayer[2]))
	{

		switch (VRAM[IOREGS_OFFSET] & 7)
		{
		case 4:
			// Mode 4: 240 bytes per row, each byte an index into currPalette
			if ((mosaicX==0) && (mosaicY==0))
			{
				// NOTE(review): this fast path is selected when the reference point
				// and PA/PB/PC are all zero, not when the matrix is an identity -
				// confirm that this is the intended "no transformation" test.
				if ((bg2x==0)&&(bg2y==0)&&(BG2PA==0)&&(BG2PB==0)&&(BG2PC==0))
				{
					// Straight copy of one row through the palette
					__asm
					{
						pusha
						mov eax,[scanline]
						mov ebx,480
						dec eax
						mov edi,[vscreen]
						mul ebx
						mov esi,[VRAM]
						add edi,eax				// edi = vscreen+(scanline-1)*480
						movzx ecx,byte ptr [esi+IOREGS_OFFSET]
						shr eax,1
						and ecx,0x10			// ecx = bit 4 (frame buffer selection)
						add esi,eax				// esi = VRAM+(scanline-1)*240
						shl ecx,9				// ecx = 0 or 0x2000
						lea ecx,[ecx+ecx*4]		// ecx = 0 or 0xA000 (offset of the selected frame)
						//mov ebx,[VRAM]
						add esi,ecx

						//add ebx,PALETTE_OFFSET
						mov ebx,[currPalette]
						add esi,240				// The add is done to allow negative indexing
						add edi,480				// ..ditto
						mov ecx,-240
						xor eax,eax
						__bm_pixel_loop:
							mov al,[esi+ecx]	// Read one byte from VRAM
							mov dx,[ebx+eax*2]	// Look it up in the color table
							mov [edi+ecx*2],dx	// Store it in the virtual screen
							inc ecx				// Next pixel..
						jnz __bm_pixel_loop
						popa
					}
				} else
				{
					// Transformed path: step a source position across the row.
					// Per pixel, the source x (edx) advances by BG2PA and the source
					// y (ebp) by BG2PC; both are shifted right by 8 to get the
					// integer texel coordinate.
					__asm
					{
						pusha
						mov eax,[scanline]
						mov ebx,480
						dec eax
						mov edi,[vscreen]
						mov ecx,eax
						mov esi,[VRAM]
						mul ebx
						add edi,eax				// edi = vscreen+(scanline-1)*480
						movzx edx,byte ptr [esi+IOREGS_OFFSET]
						and edx,0x10			// edx = bit 4 (frame buffer selection)
						shl edx,9				// edx = 0 or 0x2000
						lea edx,[edx+edx*4]		// edx = 0 or 0xA000 (offset of the selected frame)
						push ebp
						add esi,edx
						mov ecx,240
						mov eax,bg2y
						mov ebp,eax				// ebp = bg2y (running source y)
						mov eax,[BG2PC]
						mov ebx,eax				// ebx = BG2PC (per-pixel y step)
						mov edx,bg2x			// edx = bg2x (running source x)
						mov ecx,0				// ecx = destination pixel counter
		mode_4_draw:
						push esi				// keep the frame base for the next pixel
						mov eax,edx
						sar eax,8
						js mode_4_blank			// x<0 ?
						cmp eax,240
						jge mode_4_blank		// x>=240 ?
						add esi,eax
						mov eax,ebp				// y
						sar eax,8				// shift out fractional bits
						js mode_4_blank			// y<0 ?
						cmp eax,160
						jge mode_4_blank		// y>=160 ?
						shl eax,8
						add esi,eax
						shr eax,4
						sub esi,eax				// esi += y*256 - y*16 = y*240
						mov al,[esi]
						and eax,255
						mov esi,[currPalette]
						mov ax,[esi+eax*2]
						mov [edi],ax
						add edx,[BG2PA]
						add ebp,ebx
						add edi,2
						inc ecx
						pop esi
						cmp ecx,240
						jne mode_4_draw
						jmp mode_4_done
		mode_4_blank:
						// Source position is outside the bitmap: write the first
						// colorLUT entry instead
						mov esi,[colorLUT]
						mov ax,[esi]
						mov [edi],ax
						add edx,[BG2PA]
						add ebp,ebx
						add edi,2
						inc ecx
						pop esi
						cmp ecx,240
						jne mode_4_draw
		mode_4_done:
						pop ebp
						popa
					}
				}
			} else
				// Mosaic path (untransformed)
				__asm
				{
					pusha
					mov eax,[scanline]
					dec eax
					mov ebx,480
					mov edi,[vscreen]
					mul ebx
					mov esi,[VRAM]
					add edi,eax				// edi = vscreen+(scanline-1)*480
					movzx ecx,byte ptr [esi+IOREGS_OFFSET]
					shr eax,1				// eax = (scanline-1)*240
					push eax
					and ecx,0x10			// ecx = bit 4 (frame buffer selection)
					mov eax,[mosaicY]
					inc eax
					mov ebx,240
					mul ebx
					mov ebx,eax				// ebx = (mosaicY+1)*240
					pop eax
					cdq
					div ebx
					mul ebx					// round the row offset down to a multiple of ebx
					add esi,eax				// esi = VRAM + offset of the first row of the mosaic block
					shl ecx,9				// ecx = 0 or 0x2000
					lea ecx,[ecx+ecx*4]		// ecx = 0 or 0xA000 (offset of the selected frame)
					//mov ebx,[VRAM]
					add esi,ecx

					push ebp
					//add ebx,PALETTE_OFFSET
					mov ebx,[currPalette]
					add edi,480				// allows negative indexing of the destination
					mov ecx,-240
					mov ebp,[mosaicX]		// ebp counts down the repeats of the current source pixel
					xor eax,eax
					__mos_bm_pixel_loop:
						mov al,[esi]		// Read one byte from VRAM
						mov dx,[ebx+eax*2]	// Look it up in the color table
						mov [edi+ecx*2],dx	// Store it in the virtual screen
						dec ebp
						jns __mos_ok
							// Counter went negative: advance the source by mosaicX+1
							// bytes and restart the counter
							inc esi
							mov ebp,[mosaicX]
							add esi,ebp
						__mos_ok:
						inc ecx				// Next pixel..
					jnz __mos_bm_pixel_loop
					pop ebp
					popa
				}
			break;

		case 3:
			// Mode 3: 240 16-bit pixels per row, copied straight from the start of
			// VRAM; the 5551 GL format goes through flip_colors() instead of a
			// plain copy.
			if (useGL && glOK && (glFormat==GL_UNSIGNED_SHORT_5_5_5_1))
				flip_colors(&vscreen[(scanline-1)*480],&VRAM[(scanline-1)*480],480);
			else
				__asm
				{
					pusha
					mov eax,[scanline]
					mov ebx,480
					dec eax
					mov edi,[vscreen]
					mul ebx
					mov esi,[VRAM]
					mov ecx,120				// 120 dwords = 480 bytes = one row
					add edi,eax
					add esi,eax
					rep movsd
					popa
				}
			break;
		case 5:
			// Mode 5: 160x128 bitmap of 16-bit pixels (320 bytes per source row)
			if ((bg2x==0)&&(bg2y==0)&&(BG2PA==0)&&(BG2PB==0)&&(BG2PC==0))
			//if (true)
				// Untransformed: copy 160 pixels and clear the remaining 80;
				// lines past 128 are cleared entirely.
				__asm
				{
					pusha
					mov eax,[scanline]
					cmp eax,128
					jg __blank_line
					mov ebx,480
					dec eax
					mov edi,[vscreen]
					mov ecx,eax
					mov esi,[VRAM]
					mul ebx
					add edi,eax				// edi = vscreen+(scanline-1)*480
					mov eax,320
					mul ecx					// eax = (scanline-1)*320
					movzx edx,byte ptr [esi+IOREGS_OFFSET]
					and edx,0x10			// edx = bit 4 (frame buffer selection)
					add esi,eax
					shl edx,9				// edx = 0 or 0x2000
					lea edx,[edx+edx*4]		// edx = 0 or 0xA000 (offset of the selected frame)
					add esi,edx
					mov ecx,80				// 80 dwords = 160 pixels
					rep movsd
					xor eax,eax
					mov ecx,40				// 40 dwords = the remaining 80 pixels
					rep stosd
					jmp __mode_5_done
__blank_line:
					mov ebx,480
					dec eax
					mov edi,[vscreen]
					mul ebx
					add edi,eax
					mov ecx,120				// clear the whole row
					xor eax,eax
					rep stosd
__mode_5_done:
					popa
				}
			else
				// Transformed: same stepping scheme as the mode 4 transformed path,
				// but reading 16-bit pixels directly (no palette lookup).
				__asm
				{
					pusha
					mov eax,[scanline]
					mov ebx,480
					dec eax
					mov edi,[vscreen]
					mov ecx,eax
					mov esi,[VRAM]
					mul ebx
					add edi,eax				// edi = vscreen+(scanline-1)*480
					mov eax,160
					mul ecx					// result is not used; eax is overwritten below
					movzx edx,byte ptr [esi+IOREGS_OFFSET]
					//mov dummy,eax			// dummy = scanline*160
					mov dummy,ecx			// dummy = scanline-1
					and edx,0x10			// edx = bit 4 (frame buffer selection)
					shl edx,9				// edx = 0 or 0x2000
					lea edx,[edx+edx*4]		// edx = 0 or 0xA000 (offset of the selected frame)
					push ebp
					add esi,edx
					mov ecx,160
					mov eax,bg2y
					//imul ecx
					mov ebp,eax				// ebp = bg2y (running source y)
					mov eax,[BG2PC]
					//imul ecx
					mov ebx,eax				// ebx = BG2PC (per-pixel y step)
					mov edx,bg2x			// edx = bg2x (running source x)
					mov ecx,0				// ecx = destination pixel counter
	mode_5_draw:
					push esi				// keep the frame base for the next pixel
					mov eax,edx				// bg2x
					sar eax,8
					//add eax,ecx
					cmp eax,0
					js mode_5_blank			// x<0 ?
					cmp eax,160
					jge mode_5_blank		// x>=160 ?
					lea esi,[esi+eax*2]
					mov eax,ebp				// y
					sar eax,8				// shift out fractional bits
					//add eax,dummy			// add scanline*320
					js mode_5_blank			// y<0 ?
					//cmp eax,20480
					cmp eax,128
					jge mode_5_blank		// y>=128 ?
					//add esi,[div160 + eax*4]
					//add esi,[mul160 + eax*4]
					shl eax,6
					add esi,eax
					lea esi,[esi+eax*4]		// esi += y*64 + y*256 = y*320

					mov ax,[esi]
					mov [edi],ax
					add edx,[BG2PA]
					add ebp,ebx
					add edi,2
					inc ecx
					pop esi
					cmp ecx,240
					jne mode_5_draw
					jmp mode_5_done
	mode_5_blank:
					// Source position is outside the bitmap: write 0
					xor eax,eax
					mov [edi],ax
					add edx,[BG2PA]
					add ebp,ebx
					add edi,2
					inc ecx
					pop esi
					cmp ecx,240
					jne mode_5_draw
	mode_5_done:
					pop ebp
					popa
				}
			break;
		default:
			//__asm { pusha }
			break;
		}
	}
	//}

	// Sprites are drawn last, after BG2
	draw_sprites();
}




void finish_screen()
{
	int *masks = &colormask[colormode][0];
	bool glRenderer = (useGL && glOK);

	// Tell the shell that another frame has been completed
	zombie(TELL,GPU_FRAME_COMPLETED,0,0);

	newframe = 0;
	blended = 0;

	if (++frames < (frameskip+1))
		return;
	frames = 0;

	vscreen3 = vscreen2;
	if (colormode==0)
		vscreen3 = vscreen;
	else
		// Do color truncation if SSE2 is enabled
		if (hasSSE2)
			__asm
			{
				pusha
				mov esi,[vscreen]
				mov edi,[vscreen2]
				movq xmm1,[masklo]
				movq xmm2,[coladd]
				pxor xmm4,xmm4
				sub edi,16
				punpcklqdq xmm1,xmm1
				punpcklqdq xmm2,xmm2
				mov ecx,4800
		__shift_color:
				movdqa xmm0,[esi]
				movdqa xmm3,xmm2
				movdqa xmm5,xmm0
				add edi,16
				pand xmm0,xmm1
				pcmpgtw xmm5,xmm4
				add esi,16
				pand xmm5,xmm3
				psrlw xmm0,1
				paddw xmm0,xmm5
				movdqa [edi],xmm0
				dec ecx
				jnz __shift_color
				popa
		}

	if (glRenderer)
		glTexSubImage2D(GL_TEXTURE_2D,0,0,0,240,160,GL_RGBA,glFormat,vscreen);

	// Do frame limitation
	if (limitFps)
	{
		QueryPerformanceCounter(&perfCnt2);
		while (((perfCnt2.QuadPart-perfCnt1.QuadPart)<<16)/perfFreq.QuadPart < frameInterval)
		{ QueryPerformanceCounter(&perfCnt2); }
		perfCnt1 = perfCnt2;
	}

	// Check if glTexSubImage2D returned an error
	if (glRenderer)
	{
		if (glGetError()!=GL_NO_ERROR)
		{
			MessageBox(hwnd,"glTexSubImage2D failed\n\nSelect another texture format or\nuse the GDI renderer.","GPU error",MB_OK | MB_ICONERROR);
			frameskip = 1000000;
			return;
		}
	} else
	{
		// Set up the BITMAPINFOHEADER if the GDI renderer is used
		memset(&bi,0,sizeof(BITMAPINFOHEADER));
 		bi.bih.biSize = sizeof(BITMAPINFOHEADER);
 		bi.bih.biWidth = width;
 		bi.bih.biHeight = -visibleLines;
 		bi.bih.biPlanes = 1;
 		bi.bih.biBitCount = 16;
  		bi.bih.biCompression = BI_BITFIELDS;
	}

 	GetClientRect(hwnd,&rect);

	if (glRenderer)
	{
		glViewport(0, 0, rect.right, rect.bottom);
		glEnable(GL_TEXTURE_2D);
		glBegin(GL_QUADS);
			glTexCoord2fv(texCoords[0]);
			glVertex2fv(vertexes[0]);
			glTexCoord2fv(texCoords[1]);
			glVertex2fv(vertexes[1]);
			glTexCoord2fv(texCoords[2]);
			glVertex2fv(vertexes[2]);
			glTexCoord2fv(texCoords[3]);
			glVertex2fv(vertexes[3]);
		glEnd();
		glDisable(GL_TEXTURE_2D);

		if (SwapBuffers(dc)==FALSE)
		{
			MessageBox(hwnd,"SwapBuffers failed","GPU error",MB_OK | MB_ICONERROR);
			frameskip = 1000000;
			return;
		}
	} else
	{
		if (sizemult==1)
			SetDIBitsToDevice(dc,0,0,240,160,0,0,0,160,vscreen3,(BITMAPINFO*)&bi,DIB_RGB_COLORS);
		else
 			StretchDIBits(dc,0,0,rect.right,rect.bottom-20,
 						  0,0,width,visibleLines,vscreen3,(BITMAPINFO*)&bi,DIB_RGB_COLORS,SRCCOPY);

	}
}


// Currently a no-op: the body is empty, so calling it has no effect.
void reset_screen()
{
}



// Configuration dialog callback.
//
// WM_INITDIALOG fills the controls from the current settings (frameskip,
// renderer, texture format, filter, FPS limit, layer enable/force flags,
// colour mode and size multiplier).
// IDOK writes the controls back into those globals, initialises OpenGL on
// demand, reports the size multiplier to the shell and closes the dialog
// with result 0. IDCANCEL closes it with result 1 and changes nothing.
// IDC_COMBO2 selection changes enable/disable the two GL-only combo boxes.
//
// Returns TRUE for messages handled here, FALSE otherwise.
BOOL CALLBACK DlgProc(HWND hdlg, UINT msg, WPARAM wParam, LPARAM lParam) {
	static const char *skipLabels[6] = {"0","1","2","3","4","5"};
	int i;
	LRESULT sel;
	BOOL glSelected;
	HWND hCmb,hChk;

	switch(msg) {
		case WM_INITDIALOG:
			SetWindowLong(hdlg, DWL_USER, lParam);

			// Frame skip: entries "0".."5", index == frameskip value
			hCmb = GetDlgItem(hdlg,IDC_COMBO1);
			for (i=0; i<6; i++)
				SendMessage(hCmb,CB_ADDSTRING,0,(LPARAM)skipLabels[i]);
			SendMessage(hCmb,CB_SETCURSEL,frameskip,0);

			// Renderer
			hCmb = GetDlgItem(hdlg,IDC_COMBO2);
			SendMessage(hCmb,CB_ADDSTRING,0,(LPARAM)"GDI");
			SendMessage(hCmb,CB_ADDSTRING,0,(LPARAM)"OpenGL");
			SendMessage(hCmb,CB_SETCURSEL,(glOK&&useGL)?1:0,0);

			// GL texture format (only enabled when GL is in use)
			hCmb = GetDlgItem(hdlg,IDC_COMBO3);
			SendMessage(hCmb,CB_ADDSTRING,0,(LPARAM)"5551");
			SendMessage(hCmb,CB_ADDSTRING,0,(LPARAM)"1555_REV");
			SendMessage(hCmb,CB_SETCURSEL,(glFormat==GL_UNSIGNED_SHORT_5_5_5_1)?0:1,0);
			EnableWindow(hCmb,useGL);

			// GL texture filter (only enabled when GL is in use)
			hCmb = GetDlgItem(hdlg,IDC_COMBO4);
			SendMessage(hCmb,CB_ADDSTRING,0,(LPARAM)"NEAREST");
			SendMessage(hCmb,CB_ADDSTRING,0,(LPARAM)"LINEAR");
			SendMessage(hCmb,CB_SETCURSEL,(filter==GL_LINEAR)?1:0,0);
			EnableWindow(hCmb,useGL);

			hChk = GetDlgItem(hdlg,IDC_CHECK1);
			SendMessage(hChk, BM_SETCHECK, limitFps?BST_CHECKED:BST_UNCHECKED, 0);

			hChk = GetDlgItem(hdlg,IDC_OBJ);
			SendMessage(hChk, BM_SETCHECK, enableObj?BST_CHECKED:BST_UNCHECKED, 0);

			// The control ids count up from BG3, so id+i maps to layer 3-i
			for (i=0; i<4; i++)
			{
				hChk = GetDlgItem(hdlg,IDC_BG3+i);
				SendMessage(hChk,BM_SETCHECK,enableLayer[3-i]?BST_CHECKED:BST_UNCHECKED,0);
				hChk = GetDlgItem(hdlg,IDC_BG3F+i);
				SendMessage(hChk,BM_SETCHECK,forceLayer[3-i]?BST_CHECKED:BST_UNCHECKED,0);
			}

			// Colour mode radio buttons: colormode 0 = PC, 1 = GBA
			SendMessage(GetDlgItem(hdlg,IDC_COLPC), BM_SETCHECK, 1-colormode, 0);
			SendMessage(GetDlgItem(hdlg,IDC_COLGBA), BM_SETCHECK, colormode, 0);

			// Size multiplier radio buttons
			SendMessage(GetDlgItem(hdlg,IDC_SIZE1), BM_SETCHECK, (sizemult==1)?1:0, 0);
			SendMessage(GetDlgItem(hdlg,IDC_SIZE2), BM_SETCHECK, (sizemult==2)?1:0, 0);

			return TRUE;

		case WM_COMMAND:
			switch(LOWORD(wParam)) {
				case IDOK:
					// CB_GETCURSEL returns CB_ERR (-1) when nothing is selected,
					// e.g. when frameskip was outside 0..5 when the dialog opened.
					// Keep the current value in that case instead of storing -1.
					sel = SendMessage(GetDlgItem(hdlg,IDC_COMBO1),CB_GETCURSEL,0,0);
					if (sel != CB_ERR)
						frameskip = sel;

					hCmb = GetDlgItem(hdlg,IDC_COMBO4);
					filter = (SendMessage(hCmb,CB_GETCURSEL,0,0)==0)?GL_NEAREST:GL_LINEAR;
					if (glIsInit)
					{
						glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, filter);
						glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, filter);
					}

					// Initialise OpenGL the first time it is selected
					hCmb = GetDlgItem(hdlg,IDC_COMBO2);
					useGL = (SendMessage(hCmb,CB_GETCURSEL,0,0)==1);
					if (useGL && !glIsInit)
						glOK = setup_gl();

					hCmb = GetDlgItem(hdlg,IDC_COMBO3);
					glFormat = (SendMessage(hCmb,CB_GETCURSEL,0,0)==0)?GL_UNSIGNED_SHORT_5_5_5_1:GL_UNSIGNED_SHORT_1_5_5_5_REV;

					hChk = GetDlgItem(hdlg,IDC_CHECK1);
					limitFps = (SendMessage(hChk,BM_GETCHECK,0,0)==BST_CHECKED);

					hChk = GetDlgItem(hdlg,IDC_OBJ);
					enableObj = (SendMessage(hChk,BM_GETCHECK,0,0)==BST_CHECKED);

					for (i=0; i<4; i++)
					{
						hChk = GetDlgItem(hdlg,IDC_BG3+i);
						enableLayer[3-i] = (SendMessage(hChk,BM_GETCHECK,0,0)==BST_CHECKED);
						hChk = GetDlgItem(hdlg,IDC_BG3F+i);
						forceLayer[3-i] = (SendMessage(hChk,BM_GETCHECK,0,0)==BST_CHECKED);
					}

					colormode = (SendMessage(GetDlgItem(hdlg,IDC_COLPC),BM_GETCHECK,0,0)==BST_CHECKED) ? 0 : 1;

					// The size multiplier also changes the y coordinate of two
					// of the quad's vertices used by the GL renderer
					if (SendMessage(GetDlgItem(hdlg,IDC_SIZE1),BM_GETCHECK,0,0)==BST_CHECKED)
					{
						sizemult = 1;
						vertexes[2][1] = vertexes[3][1] = -0.78f;
					} else
					{
						sizemult = 2;
						vertexes[2][1] = vertexes[3][1] = -0.9f;
					}
					zombie(TELL,GPU_SIZE_MULTIPLIER,(void*)sizemult,0);

					EndDialog(hdlg, 0);
					return TRUE;

				case IDCANCEL:
					EndDialog(hdlg, 1);
					// The message was handled, so report TRUE like the other cases
					return TRUE;

				case IDC_COMBO2:
					// The format and filter combos only apply to the OpenGL renderer
					if (HIWORD(wParam)==CBN_SELCHANGE)
					{
						glSelected = (SendMessage((HWND)lParam,CB_GETCURSEL,0,0)==1);
						EnableWindow(GetDlgItem(hdlg,IDC_COMBO3),glSelected);
						EnableWindow(GetDlgItem(hdlg,IDC_COMBO4),glSelected);
					}
					return TRUE;

				default:
					break;
			}
			break;
	}

	return FALSE;
}
