/*====================================================================

filename:     trx_ppc_rec.cpp
project:      GCemu
created:      2004-6-18
mail:		  duddie@walla.com

Copyright (c) 2005 Duddie & Tratax

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

====================================================================*/
/*
 *	Tratax PowerPC recompiler
 *	trx_ppc_rec.cpp
 *
 *	2004-7-17 started work based on assembly interpreter core
 *  2004-7-23 can now run by itself 2 small demos (PS and FPU based demos)
 *  2004-7-25 debugged srawi bug until 3:00 am, all cases of instructions
 *            will eventually be hit so better breakpoint or test them
 *  2004-7-30 important lesson learned. VC7 assumes that only EAX, ECX, EDX are freely used.
 *            so when changing EBX in generated code all hell broke loose
 *  2004-8-01 VC's default floating point accuracy is 53 bits. This caused some problems
 *            with selfcheck on floats because the results dont match exactly. 
 *            putting VC's 'floating point consistency' to 'improve consistency' solved the problem
 *  2004-8-03 I'm now directly playing with the FPU control words, rounding mode IS important
 *            especially when converting floats to ints.
 *            Also fixed ps_sum0 .. typos are BADDD mmmkay ?
 *  2004-8-04 Added quick fix support for locked cache
 *  2004-8-05 Added basic benchmark in preparation for optimizations
 *  2004-8-06 DMA engine added, now only TLB is remaining as a big item on 'TODO list'
 *  2004-8-08 On Screen profile report for CPU and IO
 *  2004-8-08 Removed overhead, but still need to properly optimize block to block running
 *            With 80% or more on IO it doesnt seem like a big priority though ...
 *  2004-8-09 And rolled back changes because it is too soon. Block-to-block optimizations
 *            are the very LAST thing
 *  2004-8-09 Added QUICK_MEMREAD and QUICK_MEMWRITE inlining memory read/write directly
 *            also added 'assume stack always in memory'. 
 *            Maybe inlining memory read isnt such a good idea, it increases code size a LOT
 *            Need to clean up and remove it all tomorrow. It wont work with icache and 
 *            texture cache checks (QUICK_MEMWRITE) and its better to improve the memory routines
 *            Time to clean up memory routines more later...
 *  2004-8-10 Memory routines cleaned up
 *            started register caching and constant elimination. Got until branch opcodes
 *  2004-8-12 Register and constant elimination somewhat running but still needs debugging
 *  2004-8-13 More inner block optimization stuff
 *  2004-8-19 replaced horrible gnu style disassembler with one from bochs that understands even sse3
 *  2004-8-24 Started hooking up SSE2 fpu & ps routines
 *  2004-8-25 SSE2 with register caching can now run some stuff (as far as I can tell)
 *  2004-8-26 debug day .. many bugs fixed. SSE2 running with limited caching. Problem is that SSE2 can be used randomly by DLLs
 *            so the cache needs to be flushed every time we leave the compiled block for unknown code
 *  2004-8-28 benchmarking fixed. Beware of buffer overruns in clearing loops. Start work on multiblock mode
 *  2004-8-29 first version of multiblock mode somewhat works with games. needs cleanup and more work
 *  2004-8-31 added register cache tweaks to make it more efficient
 *  2004-9-01 finished SSE register cache tweaks and added idle loop detection. Out of good optimizing ideas for now
 *  2004-9-07 after holidays now starting on clean up of memory interface. Adding proper 64bit memory routines and fix PSQ_Lx PSQ_STx
 *  2004-9-09 added few more opcodes. Mobility Radeon 9700 is seriously screwed in OpenGL
 *	2005-2-05 continued project after long break. aim for correctness first, speed later. BACK TO INTERPRETER AGAIN !
 *
 * TODO:
 * - prove that opcodes are 100% correct in all cases (corner case test!) especially gekko instructions
 *  - add option to list all used opcodes in a run 
 *  - write corner case test program
 *  - figure out way to run blocks of CPU code in lockstep with the GC
 * -
 * -
 *
 * Ideas:
 *------------------------------------------------------------------------------------
 * SSE2 registers statically assigned to FPU/Gekko registers.
 * -make sure that SSE2 is not overwritten outside of CPU without it knowing
 * -benchmark to get most used registers, rest loaded from memory
 *
 *------------------------------------------------------------------------------------
 *
 * Block optimizations:
 * -constant address memory access can be speeded up a LOT. no masking, memory direct store
 *  and direct call IO (also constant values can be pre-byteswapped)
 * -predict memory segment for memory access and selfmodify fix it if predicted wrong
 * -predict GQR values and selfmodify fix it if predicted wrong
 * 
 * Add Assumptions:
 * -IO only through constant memory addresses (lis+addi + load/store)
 * -psq_l loads only from memory (cache ?)
 * -GQRs 'constant'. Set up once and then not changed without psq_l code changed
 * -stmw, lmw only store to memory...
 *
 *
 */

#include "config.h"

#include <windows.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "config.h"
#include "cpu/trx_ppc_cpu.h"
#include "cpu/trx_ppc_rec.h"
#include "trx_ppc_rec_opcodes.h"
#include "trx_ppc_rec_fpu_ps_opcodes_x87.h"
#include "trx_ppc_rec_fpu_ps_opcodes_sse2.h"
#include "asm_x86.h"
#include "hardware/hw_io.h"
#include "w32_display.h"
#include "ppc_disasm.h"
#include "profiler.h"
#include "plugins/gx_plugin.h"

#include "pad.h"
#include "disasm_x86.h"

#include "dsp/gdsp_interpreter.h"

#define CR0_LT (1<<31)
#define CR0_GT (1<<30)
#define CR0_EQ (1<<29)
#define CR0_SO (1<<28)

#define PPC_EXC_UNKNOWN 0
#define PPC_EXC_SYS_RESET 0x100
#define PPC_EXC_DSI 0x00300
#define PPC_EXC_EXT_INT 0x00500
#define PPC_EXC_ALIGNMENT 0x00600
#define PPC_EXC_PROGRAM 0x00700
#define PPC_EXC_NO_FPU 0x00800
#define PPC_EXC_DEC 0x00900
//Reserved 0x00A00
//Reserved 0x00B00
#define PPC_EXC_SC 0x00C00
#define PPC_EXC_PERF_MON 0xF00
#define PPC_EXC_ALTIVEC 0xF20
#define PPC_EXC_ALTIVEC_ASSIST 0x1600
#define PPC_EXC_TAU 0x1700

#define PPC_EXC_PROGRAM_FLOAT (1<<20)
#define PPC_EXC_PROGRAM_ILL   (1<<19)
#define PPC_EXC_PROGRAM_PRIV  (1<<18)
#define PPC_EXC_PROGRAM_TRAP  (1<<17)
#define PPC_EXC_PROGRAM_NEXT  (1<<16)

// output stream of the assembly debug dump.
// stays NULL as long as no dump file has been opened
FILE *debugasm_fp;

// Appends 'str' to the assembly debug dump and flushes right away, so the
// text is on disk even if the emulator dies immediately afterwards.
// Does nothing when no dump file is open or when 'str' is NULL, instead of
// handing a NULL pointer to fputs().
void debugasm_dump(char *str)
{
	if(debugasm_fp == NULL || str == NULL)
	{
		return;
	}
	fputs(str, debugasm_fp);
	fflush(debugasm_fp);
}

#if HOTBLOCK_PROFILE
// per-block call counters used by the hot block report (one 32 bit counter per table slot)
uint32 *hotblock_table;
// address of hotblock_table[0] stored as a plain 32 bit number
uint32	hotblock_table_address;
#endif

// register file / state of the emulated PowerPC as seen by the recompiler
struct TRX_PPC_Registers trxCPUrec;

// NOTE(review): not written in this part of the file; presumably the x86 output
// position at which the current block started - confirm against the translate routine
uint32 block_start_x86_pos;

// NOTE(review): these pointers are not assigned in this part of the file; presumably
// they hold the entry points of the memory access routines used by generated code
void *p_rec_mem_read8;
void *p_rec_mem_read16;
void *p_rec_mem_read32;
void *p_rec_mem_read64;
void *p_rec_mem_write8;
void *p_rec_mem_write16;
void *p_rec_mem_write32;
void *p_rec_mem_write64;

// typed views on trxCPUrec.fpr (ps0) and trxCPUrec.ps1 (ps1), set up by trx_rec_init()
double *trx_rec_ps0_double;
double *trx_rec_ps1_double;
uint64 *trx_rec_ps0_int;
uint64 *trx_rec_ps1_int;

// NOTE(review): name suggests "align blocks on 16 bytes"; its use is outside this part of the file
uint32 blockalign_16 = 1;

// 24 MB table of code addresses; every entry starts out pointing at
// trx_rec_translate_block_and_run (see trx_rec_init / trx_rec_cacheflush)
uint32 *translation_table;
// address of translation_table[0] as a 32 bit number so it can be compiled in directly
uint32 address_of_translation_table;
// executable buffer that receives the generated x86 code
uint8 *translation_memory;
// current write offset into translation_memory
uint32 translation_pos;

// we only need to check for FPU exceptions in the block once
static int fpu_exception_checked;
// 1 = use the SSE2 versions of the FPU / paired single opcodes, 0 = use the x87 versions
int use_sse;

// FPU control word storage for controlling rounding and precision
uint16 fpucontrol_roundzero; // rc=zero
uint16 fpucontrol_default; // whatever it is set at on default

//==============================================================================
// Routines for detecting self modifying code, or just recompiled code being overwritten 
// by something else (possibly overlays or just new code)
// With these routines installed, instruction cache flushes can be safely ignored
// because recompiler cache flushes will happen at the moment when they are needed
//
// IMPORTANT: external devices should also include the checking routine when writing 
// to memory!

// power of 2 divisor for accuracy of code modification. If it is set to 2 it means check per uint32 (instruction size)
// and also means waste 32Mbyte / 4 for checking
// safe to assume check per cache line (32 bytes, 2^5)

// log2 of the number of bytes that share one entry of rec_cache_table
#define REC_CACHE_SHIFT (5)
// number of one-byte entries in rec_cache_table: 32 Mbyte of address space / bytes per entry
#define REC_CACHECHECK_SIZE (32*1024*1024>>REC_CACHE_SHIFT)

// one marker byte per tracked chunk: 0 = not marked, 0xff = marked by rec_cache_mark_valid()
uint8 *rec_cache_table;
// accumulates the markers seen by the snoop routines; non-zero = recompiled code was written to
uint8 codemodified=0;

// Allocates the code modification tracking table and clears it.
// returns 0 on success, -1 when the table could not be allocated
int rec_cache_init(void)
{
	const int table_kbyte = REC_CACHECHECK_SIZE/1024;

	// 1 marker byte for every check location
	rec_cache_table = (uint8 *)malloc(REC_CACHECHECK_SIZE);
	if(rec_cache_table != NULL)
	{
		printf("[Tratax Recompiler] allocated %d kbyte for recompiler cache table\n", table_kbyte);
		// nothing is marked as recompiled yet
		memset(rec_cache_table, 0, REC_CACHECHECK_SIZE);
		return 0;
	}

	printf("[Tratax Recompiler] failed to allocate memory (%d kbyte) for recompiler cache table\n", table_kbyte);
	return -1;
}

// Write snoop for a single address: has to be called by every memory write routine.
// The marker of the chunk that contains 'address' is OR-ed into 'codemodified', so
// overwriting a chunk that holds recompiled code leaves 'codemodified' non-zero.
void rec_cache_snoop(uint32 address)
{
	const uint32 chunk = address>>REC_CACHE_SHIFT;

	codemodified |= rec_cache_table[chunk];
}

// Write snoop for a range: OR-s the markers of every chunk touched by the bytes
// [address, address+size) into 'codemodified'.
//
// address: first byte written
// size:    number of bytes written; 0 still probes the chunk that holds 'address',
//          which is what the routine has always done
//
// The walk is done per chunk index. Stepping the byte address by one chunk size from
// an unaligned start skipped the last chunk whenever the range straddled a chunk
// boundary (e.g. address=30, size=4 touches chunks 0 and 1 but only chunk 0 was probed).
void rec_cache_snoop_area(uint32 address, uint32 size)
{
	uint32 chunk, last_chunk;

	chunk = address>>REC_CACHE_SHIFT;
	last_chunk = chunk;
	if(size != 0)
	{
		// chunk that holds the last byte of the range
		last_chunk = (address + size - 1)>>REC_CACHE_SHIFT;
	}

	for(; chunk <= last_chunk; chunk++)
	{
		codemodified |= rec_cache_table[chunk];
	}
}

// called by the recompiler for code it has translated:
// flags the chunk that contains 'address' so that later writes to it are noticed
void rec_cache_mark_valid(uint32 address)
{
	const uint32 chunk = address>>REC_CACHE_SHIFT;

	rec_cache_table[chunk] = 0xff;
}

// flushcache calls this to clean the marked areas
void rec_cache_mark_all_invalid(void)
{
	// make sure table is cleaned
	memset(rec_cache_table, 0, (24*1024*1024>>REC_CACHE_SHIFT));
}

//==============================================================================
//
// Register caching routines for recompiler usage
// uses: EBP, ESI, EDI, EBX for cache.

// 3 states that PPC registers can be in
enum e_regc_state
{
	REGC_UNCACHED,	// not held in a cache register, value is in trxCPUrec.gpr[]
	REGC_CONSTANT,	// value is known while recompiling, kept in regc_constant[]
	REGC_CACHED	// held in the x86 register recorded in regc_which[]
};

// 2 states that cache registers can be in
enum e_regc_cachestate
{
	REGC_NOTUSED,	// free, regc_alloc() may hand it out
	REGC_USED,	// currently assigned
};

// is a register only used for loads ? or has it been stored to ? (possibly modified!)
enum e_regc_modified
{
	REGC_UNMODIFIED,	// only loaded, no writeback needed
	REGC_MODIFIED		// stored to, has to be written back when flushed
};
// keeps state of PPC registers (e_regc_state), indexed by PPC register number
uint32 regc_state[32];
// keeps track of whether PPC register has been (possibly) modified (e_regc_modified)
uint32 regc_modified[32];
// keeps track of which x86 register each PPC register is cached in (0 when not cached)
uint32 regc_which[32];
// keeps the constant value if the register is constant (0xc0edbabe is used as filler otherwise)
uint32 regc_constant[32];
// LRU stamp per x86 register; indexed by x86 register id, not by cache slot
// NOTE(review): 8 entries presumably cover all x86 general purpose register ids - confirm in asm_x86.h
uint32 regc_lru[8];
// running hit counter, source of the LRU stamps
uint32 regc_lrucount;
// the IDs of the cache registers
uint32 regc_lrumap[4] = {EBX, ESI, EDI, EBP};
// and the state that the cache registers are in (e_regc_cachestate), same order as regc_lrumap
uint32 regc_cachestate[4];

// give cache register 'cachereg' back so that regc_alloc() can hand it out again.
// nothing is written back; ids that are not one of the 4 cache registers are ignored
void regc_free(uint32 cachereg)
{
	uint32 slot = 0;

	while(slot < 4)
	{
		if(regc_lrumap[slot] == cachereg)
		{
			regc_cachestate[slot] = REGC_NOTUSED;
			break;
		}
		slot++;
	}
}
// hand out a cache register: a free one when there is any, otherwise the least
// recently used one (code to write back its current value is generated if needed)
// returns the x86 register id
uint32 regc_alloc(void)
{
	uint32 slot, ppcreg;
	uint32 victim = 0;
	uint32 victim_age = 0;

	// a free cache register is the cheapest choice
	for(slot = 0; slot < 4; slot++)
	{
		if(regc_cachestate[slot] != REGC_NOTUSED)
		{
			continue;
		}
		regc_cachestate[slot] = REGC_USED;
		return regc_lrumap[slot];
	}

	// everything is taken: pick the register whose last hit is longest ago
	for(slot = 0; slot < 4; slot++)
	{
		const uint32 candidate = regc_lrumap[slot];
		const uint32 age = regc_lrucount - regc_lru[candidate];

		if(age > victim_age)
		{
			victim_age = age;
			victim = candidate;
		}
	}

	// throw out every PPC register that currently lives in the victim
	for(ppcreg = 0; ppcreg < 32; ppcreg++)
	{
		if(regc_which[ppcreg] != victim)
		{
			continue;
		}
		if(regc_modified[ppcreg] == REGC_MODIFIED)
		{
			// value was stored to, so it has to go back to the register file
			gen_asm(MOV_MR, (uint32)&trxCPUrec.gpr[ppcreg], victim);
		}
		// drop the cache info
		regc_state[ppcreg] = REGC_UNCACHED;
		regc_which[ppcreg] = 0;
		// the cache register itself stays 'used', it is recycled by the caller
	}
	return victim;
}

// initialise register cache for beginning of block
// cache empty
void regc_start(void)
{
	uint32 i;

	for(i = 0; i < 32; i++)
	{
		regc_state[i] = REGC_UNCACHED;
		regc_which[i] = 0;
		regc_modified[i] = REGC_UNMODIFIED;
		regc_constant[i] = 0xc0edbabe;
	}
	for(i = 0; i < 8; i++)
	{
		regc_lru[i] = 0;
	}
	for(i = 0; i < 4; i++)
	{
		regc_cachestate[i] = REGC_NOTUSED;
	}
	regc_lrucount = 0;
}

// generate the writeback of all cached state (can be done multiple times per block!)
// constants are always stored, cached registers only when they have been modified.
// the cache bookkeeping itself is left untouched.
// returns the number of code bytes that were generated
uint32 regc_end(void)
{
	const uint32 startpos = translation_pos;
	uint32 ppcreg;

	for(ppcreg = 0; ppcreg < 32; ppcreg++)
	{
		if(regc_state[ppcreg] == REGC_UNCACHED)
		{
			// register file already holds the value
			continue;
		}
		if(regc_state[ppcreg] == REGC_CONSTANT)
		{
			// store the known value as an immediate
			gen_asm(MOV_M32I32, (uint32)&trxCPUrec.gpr[ppcreg], regc_constant[ppcreg]);
		}
		else if(regc_modified[ppcreg] == REGC_MODIFIED)
		{
			// store the cache register; registers that were only load()-ed are skipped
			gen_asm(MOV_MR, (uint32)&trxCPUrec.gpr[ppcreg], regc_which[ppcreg]);
		}
	}
	return (translation_pos - startpos);
}

// tells whether PPC register 'ppcreg' currently holds a known constant
// returns 1 if constant, 0 if not
uint32 regc_is_constant(uint32 ppcreg)
{
	return (regc_state[ppcreg] == REGC_CONSTANT) ? 1 : 0;
}

// returns the constant recorded for PPC register 'ppcreg'
// (whatever regc_constant[] holds; the register state is not checked here)
uint32 regc_getconstant(uint32 ppcreg)
{
	const uint32 value = regc_constant[ppcreg];

	return value;
}

// returns the x86 cache register that holds PPC register 'ppcreg', generating
// the load into a newly allocated cache register when it is not cached yet.
// must not be used on constant registers: that prints a message and exits
uint32 regc_getcachereg(uint32 ppcreg)
{
	uint32 cachereg;

	switch(regc_state[ppcreg])
	{
	case REGC_CONSTANT:
		printf("regc_getcachereg() cannot cache constant values!\n");
		exit(0);
		break;
	case REGC_UNCACHED:
		// not cached yet: grab a register and fill it from the register file
		cachereg = regc_alloc();
		regc_state[ppcreg] = REGC_CACHED;
		regc_which[ppcreg] = cachereg;
		regc_modified[ppcreg] = REGC_UNMODIFIED;
		gen_asm(MOV_RM, cachereg, (uint32)&trxCPUrec.gpr[ppcreg]);
		break;
	default:
		break;
	}

	// cached at this point, count it as a hit
	cachereg = regc_which[ppcreg];
	regc_lrucount++;
	regc_lru[cachereg] = regc_lrucount;
	return cachereg;
}

// generate code that brings the value of PPC register 'ppcreg' into x86 register 'dstreg'
// constant:  immediate load, the cache is not touched
// otherwise: copy from the cache register (which is allocated and filled first if needed)
void regc_load(uint32 dstreg, uint32 ppcreg)
{
	uint32 cachereg;

	switch(regc_state[ppcreg])
	{
	case REGC_CONSTANT:
		// value is known, no register traffic required
		gen_asm(MOV_RI32, dstreg, regc_constant[ppcreg]);
		return;
	case REGC_UNCACHED:
		// not cached yet: grab a register and fill it from the register file
		cachereg = regc_alloc();
		regc_state[ppcreg] = REGC_CACHED;
		regc_which[ppcreg] = cachereg;
		regc_modified[ppcreg] = REGC_UNMODIFIED;
		gen_asm(MOV_RM, cachereg, (uint32)&trxCPUrec.gpr[ppcreg]);
		break;
	default:
		break;
	}

	// cached at this point: copy the value and count the hit
	cachereg = regc_which[ppcreg];
	gen_asm(MOV_RR, dstreg, cachereg);
	regc_lrucount++;
	regc_lru[cachereg] = regc_lrucount;
}

// generate code that stores x86 register 'srcreg' into the cached copy of PPC
// register 'ppcreg'. the register ends up cached and marked as modified
void regc_store(uint32 srcreg, uint32 ppcreg)
{
	uint32 cachereg;

	if(regc_state[ppcreg] == REGC_CONSTANT)
	{
		// a runtime value replaces the constant
		regc_constant[ppcreg] = 0xc0edbabe; // filler, makes stale use easier to catch
		regc_state[ppcreg] = REGC_UNCACHED;
	}
	if(regc_state[ppcreg] == REGC_UNCACHED)
	{
		// only a register is needed, the old value is not loaded
		regc_which[ppcreg] = regc_alloc();
		regc_state[ppcreg] = REGC_CACHED;
	}

	// cached at this point: copy into the cache register
	cachereg = regc_which[ppcreg];
	gen_asm(MOV_RR, cachereg, srcreg);
	regc_modified[ppcreg] = REGC_MODIFIED;

	// count the hit
	regc_lrucount++;
	regc_lru[cachereg] = regc_lrucount;
}

// returns the constant recorded for PPC register 'ppcreg'.
// no state check is done; entries that hold no constant read back as the 0xc0edbabe filler
uint32 regc_load_constant(uint32 ppcreg)
{
	const uint32 value = regc_constant[ppcreg];

	return value;
}

// record that PPC register 'ppcreg' now holds the constant 'value'.
// no code is generated here; a cache register that held the old value is released
// without writing it back
void regc_store_constant(uint32 value, uint32 ppcreg)
{
	switch(regc_state[ppcreg])
	{
	case REGC_CACHED:
		// old value is dead, just give the register back
		regc_free(regc_which[ppcreg]);
		// fall through
	case REGC_UNCACHED:
		regc_state[ppcreg] = REGC_CONSTANT;
		regc_which[ppcreg] = 0;
		regc_modified[ppcreg] = REGC_MODIFIED;
		break;
	default:
		// already constant, only the value changes
		break;
	}
	regc_constant[ppcreg] = value;
}
//==============================================================================
//
// recompiler core decoding and execution routines
//

void trx_ppc_gen_group19(void)
{
	switch((trxCPUrec.opcode>>1)&0x3ff)
	{
	case 0:		trx_ppc_gen_mcrf();break;
	case 16:	trx_ppc_gen_bclrx();break;
	case 33:	trx_ppc_gen_crnor();break;
	case 50:	trx_ppc_gen_rfi();break;
	case 150:	break;// isync, we dont care about it
	case 193:	trx_ppc_gen_crxor();break;
	case 257:	trx_ppc_gen_crand();break;
	case 289:	trx_ppc_gen_creqv(); break;
	case 449:	trx_ppc_gen_cror();break;
	case 528:	trx_ppc_gen_bcctrx();break;
	default: 
		printf("[trxCPUrec] unhandled op19: %d\n", (trxCPUrec.opcode>>1)&0x3ff);
		exit(0);
		break;
	}
}

// called from generated code when the FPU availability check of a block fails.
// curpc: pc that is stored in SRR0
// stores the masked MSR in SRR1, clears the MSR and continues at the
// PPC_EXC_NO_FPU vector
void fpu_exception_check_handler(uint32 curpc)
{
	// SRR1 has to be taken from the MSR before the MSR is cleared
	trxCPUrec.spr[PPC_SRR1] = trxCPUrec.msr & 0x87c0ffff;
	trxCPUrec.spr[PPC_SRR0] = curpc;

	trxCPUrec.npc = PPC_EXC_NO_FPU;
	trxCPUrec.msr = 0;
}

// Emits, at most once per block, a check of the FP bit in the MSR in front of the
// first FPU / paired single instruction. Layout of the generated code:
//   eax = msr & MSR_FP
//   jne  skip                  (displacement patched in below)
//   <register cache writeback generated by regc_end()>
//   push pc / call fpu_exception_check_handler / add esp,4
//   mov  eax, block_instr+1 / ret
// skip:
// NOTE(review): fpu_exception_checked is only ever set here; it is presumably reset
// at the start of every block by the translate routine - confirm.
void gen_fpu_exception_check(void)
{
	if(fpu_exception_checked == 0)
	{
		uint32 jumppos, size;
		gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.msr);
		gen_asm(AND_RI32, EAX, MSR_FP);
		//gen_asm(BREAK);
		// patch location later, cause we will be encoding register cache flush of variable size
		gen_asm(JNE_I32, 0);
		//if(msr & NO_FPU){
		// the 32 bit displacement is the last 4 bytes of the jump that was just emitted
		jumppos = translation_pos - 4;
		// bytes of the fixed part of the exception path (push .. ret below);
		// NOTE(review): hand counted, has to be kept in sync with the encodings used by gen_asm
		size = 19; // base size = 19
		// plus the variable sized register cache writeback, which is emitted right here
		size += regc_end();
		*((uint32 *)(translation_memory+jumppos)) = size;
		// exception path: hand the current pc to the handler (one stack argument, removed after the call)
		gen_asm(PUSH_I32, trxCPUrec.pc);
		gen_asm(CALL_M, (uint32)&fpu_exception_check_handler);
		gen_asm(ADD_RI8, ESP, 4);
		// leave the block; EAX is loaded with block_instr+1
		// NOTE(review): presumably the number of instructions counted as executed - confirm with the block runner
		gen_asm(MOV_RI32, EAX, trxCPUrec.block_instr+1);
		gen_asm(RET);

		// only 1 check per block required
		fpu_exception_checked = 1;
	}
}

void trx_ppc_gen_group31(void)
{
	switch((trxCPUrec.opcode>>1)&0x3ff)
	{
	case 535: case 567: case 599: case 631: case 663: 
	case 695: case 727:	case 759: case 983:
		gen_fpu_exception_check();
	}

	switch((trxCPUrec.opcode>>1)&0x3ff)
	{
	case 0:		trx_ppc_gen_cmp();break;
	case 8:		trx_ppc_gen_subfcx();break;
	case 10:	trx_ppc_gen_addcx();break;
	case 11:	trx_ppc_gen_mulhwux();break;
	case 19:	trx_ppc_gen_mfcr();break;
	case 23:	trx_ppc_gen_lwzx();break;
	case 24:	trx_ppc_gen_slwx();break;
	case 26:	trx_ppc_gen_cntlzwx();break;
	case 28:	trx_ppc_gen_andx();break;
	case 32:	trx_ppc_gen_cmpl();break;
	case 40:	trx_ppc_gen_subfx();break;
	case 54:	break; // dcbst ignored for now
	case 55:	trx_ppc_gen_lwzux(); break;
	case 60:	trx_ppc_gen_andcx(); break;
	case 75:	trx_ppc_gen_mulhwx(); break;
	case 83:	trx_ppc_gen_mfmsr();break;
	case 86:	break;// dcbf data cache block flush, we dont care
	case 87:	trx_ppc_gen_lbzx(); break;
	case 104:	trx_ppc_gen_negx(); break;
	case 119:	trx_ppc_gen_lbzux(); break;
	case 124:	trx_ppc_gen_norx();break;
	case 136:	trx_ppc_gen_subfex();break;
	case 138:	trx_ppc_gen_addex();break;
	case 144:	trx_ppc_gen_mtcrf();break;
	case 146:	trx_ppc_gen_mtmsr(); break;
	case 151:	trx_ppc_gen_stwx();break;
	case 183:	trx_ppc_gen_stwux();break;
	case 200:	trx_ppc_gen_subfzex(); break;
	case 202:	trx_ppc_gen_addzex(); break;
	case 210:	trx_ppc_gen_mtsr(); break;
	case 215:	trx_ppc_gen_stbx();break;
	case 234:	trx_ppc_gen_addmex();break;
	case 235:	trx_ppc_gen_mullwx(); break;
	case 242:	trx_ppc_gen_mtsrin(); break;
	case 247:	trx_ppc_gen_stbux(); break;
	case 266:	trx_ppc_gen_addx();break;
	case 278:	break;// dcbt data cache block touch, we dont care
	case 279:	trx_ppc_gen_lhzx();break;
	case 284:	trx_ppc_gen_eqvx(); break;
	case 311:	trx_ppc_gen_lhzux();break;
	case 316:	trx_ppc_gen_xorx();break;
	case 339:	trx_ppc_gen_mfspr();break;
	case 343:	trx_ppc_gen_lhax(); break;
	case 371:	trx_ppc_gen_mftb();break;
	case 407:	trx_ppc_gen_sthx();break;
	case 412:	trx_ppc_gen_orcx();break;
	case 439:	trx_ppc_gen_sthux();break;
	case 444:	trx_ppc_gen_orx();break;
	case 459:	trx_ppc_gen_divwux();break;
	case 467:	trx_ppc_gen_mtspr();break;
	case 470:	break; // dcbi data cache block invalidate, we dont care
	case 476:	trx_ppc_gen_nandx(); break;
	case 491:	trx_ppc_gen_divwx();break;
	case 512:	trx_ppc_gen_mcrxr();break;
	case 534:	trx_ppc_gen_lwbrx(); break;
	case 535:	if(use_sse){trx_ppc_gen_sse2_lfsx();}else{trx_ppc_gen_lfsx();}break;	
	case 536:	trx_ppc_gen_srwx();break;
	case 566:	break; // tlbsync()
	case 567:	if(use_sse){trx_ppc_gen_sse2_lfsux();}else{trx_ppc_gen_lfsux();}break;
	case 595:	trx_ppc_gen_mfsr();break;
	case 597:	trx_ppc_gen_lswi();break;
	case 598:	break; // sync we dont care about it.
	case 599:	if(use_sse){trx_ppc_gen_sse2_lfdx();}else{trx_ppc_gen_lfdx();} break;
	case 631:	if(use_sse){trx_ppc_gen_sse2_lfdux();}else{trx_ppc_gen_lfdux();}break;
	case 659:	trx_ppc_gen_mfsrin();break;
	case 662:	trx_ppc_gen_stwbrx(); break;
	case 663:	if(use_sse){trx_ppc_gen_sse2_stfsx();}else{trx_ppc_gen_stfsx();}break;
	case 695:	if(use_sse){trx_ppc_gen_sse2_stfsux();}else{trx_ppc_gen_stfsux();}break;
	case 725:	trx_ppc_gen_stswi(); break;
	case 727:	if(use_sse){trx_ppc_gen_sse2_stfdx();}else{trx_ppc_gen_stfdx();}break;
	case 759:	if(use_sse){trx_ppc_gen_sse2_stfdux();}else{trx_ppc_gen_stfdux();}break;
	case 790:	trx_ppc_gen_lhbrx();break;
	case 792:	trx_ppc_gen_srawx();break;
	case 824:	trx_ppc_gen_srawix();break;
	case 918:	trx_ppc_gen_sthbrx();break;
	case 922:	trx_ppc_gen_extshx();break;
	case 954:	trx_ppc_gen_extsbx();break;
	case 982:	trx_ppc_gen_icbc();break;
	case 983:	if(use_sse){trx_ppc_gen_sse2_stfiwx();}else{trx_ppc_gen_stfiwx();} break;
	case 1014:	trx_ppc_gen_dcbz(); break;
	default:
		printf("[trxCPUrec] unhandled op31: %d\n", (trxCPUrec.opcode>>1)&0x3ff);
		exit(0);
		break;
	}
}

// decode and generate code for single instruction
void trx_rec_single(void)
{
	// fpu or ps instruction ? then generate fpu exception check if required
	if( (((trxCPUrec.opcode>>26) & 0x3f) == 4) || (((trxCPUrec.opcode>>26) & 0x3f) > 47) )
	{
		gen_fpu_exception_check();

		if(use_sse)
		{
			switch((trxCPUrec.opcode>>26) & 0x3f)
			{
				case 4:		trx_ppc_gen_sse2_gekko();break;
				case 48:	trx_ppc_gen_sse2_lfs();	break;
				case 49:	trx_ppc_gen_sse2_lfsu();	break;
				case 50:	trx_ppc_gen_sse2_lfd();break; 
				case 51:	trx_ppc_gen_sse2_lfdu();break; 
				case 52:	trx_ppc_gen_sse2_stfs();break; 
				case 53:	trx_ppc_gen_sse2_stfsu(); break;
				case 54:	trx_ppc_gen_sse2_stfd(); break;
				case 55:	trx_ppc_gen_sse2_stfdu(); break;
				case 56:	trx_ppc_gen_sse2_psq_l(); break;
				case 57:	trx_ppc_gen_sse2_psq_lu(); break;
				case 59:	trx_ppc_gen_sse2_group59(); break;
				case 60:	trx_ppc_gen_sse2_psq_st(); break;
				case 61:	trx_ppc_gen_sse2_psq_stu(); break;
				case 63:	trx_ppc_gen_sse2_group63(); break;
				default:
					printf("[trxCPUrec] unhandled op: %d\n", (trxCPUrec.opcode>>26));
					exit(0);
					break;
			}
		}
		else
		{
			switch((trxCPUrec.opcode>>26) & 0x3f)
			{
				case 4:		trx_ppc_gen_gekko();break;
				case 48:	trx_ppc_gen_lfs();	break;
				case 49:	trx_ppc_gen_lfsu();	break;
				case 50:	trx_ppc_gen_lfd();break; 
				case 51:	trx_ppc_gen_lfdu();break; 
				case 52:	trx_ppc_gen_stfs();break; 
				case 53:	trx_ppc_gen_stfsu(); break;
				case 54:	trx_ppc_gen_stfd(); break;
				case 55:	trx_ppc_gen_stfdu(); break;
				case 56:	trx_ppc_gen_psq_l(); break;
				case 57:	trx_ppc_gen_psq_lu(); break;
				case 59:	trx_ppc_gen_group59(); break;
				case 60:	trx_ppc_gen_psq_st(); break;
				case 61:	trx_ppc_gen_psq_stu(); break;
				case 63:	trx_ppc_gen_group63(); break;
				default:
					printf("[trxCPUrec] unhandled op: %d\n", (trxCPUrec.opcode>>26));
					exit(0);
					break;
			}
		}
	}
	else
	{
		// decode and execute
		switch((trxCPUrec.opcode>>26) & 0x3f)
		{
		case 7:		trx_ppc_gen_mulli();break;
		case 8:		trx_ppc_gen_subfic();break;
		case 10:	trx_ppc_gen_cmpli();break;
		case 11:	trx_ppc_gen_cmpi();break;
		case 12:	trx_ppc_gen_addic();break;
		case 13:	trx_ppc_gen_addic_();break;
		case 14: 	trx_ppc_gen_addi(); break;
		case 15:	trx_ppc_gen_addis(); break;
		case 16:	trx_ppc_gen_bcx(); break;
		case 17:	trx_ppc_gen_sc(); break;
		case 18:	trx_ppc_gen_bx(); break;
		case 19:	trx_ppc_gen_group19(); break;
		case 20:	trx_ppc_gen_rlwimix();break;
		case 21:	trx_ppc_gen_rlwinmx();break;
		case 23:	trx_ppc_gen_rlwnmx(); break;
		case 24:	trx_ppc_gen_ori();break;
		case 25:	trx_ppc_gen_oris();break;
		case 26:	trx_ppc_gen_xori();break;
		case 27:	trx_ppc_gen_xoris();break;
		case 28:	trx_ppc_gen_andi_();break;
		case 29:	trx_ppc_gen_andis_();break;
		case 31:	trx_ppc_gen_group31(); break;
		case 32:	trx_ppc_gen_lwz(); break;
		case 33:	trx_ppc_gen_lwzu(); break;
		case 34:	trx_ppc_gen_lbz(); break;
		case 35:	trx_ppc_gen_lzbu(); break;
		case 36:	trx_ppc_gen_stw(); break;
		case 37:	trx_ppc_gen_stwu(); break;
		case 38:	trx_ppc_gen_stb(); break;
		case 39:	trx_ppc_gen_stbu(); break;
		case 40:	trx_ppc_gen_lhz();break;
		case 41:	trx_ppc_gen_lhzu();break;					
		case 42:	trx_ppc_gen_lha();break;					
		case 43:	trx_ppc_gen_lhau();break;					
		case 44:	trx_ppc_gen_sth(); break;
		case 45:	trx_ppc_gen_sthu(); break;
		case 46:	trx_ppc_gen_lmw(); break;
		case 47:	trx_ppc_gen_stmw(); break;	
		default:
			printf("[trxCPUrec] unhandled op: %d at: %x\n", (trxCPUrec.opcode>>26), trxCPUrec.pc);
			exit(0);
			break;
		}
	}
}

// size in bytes of the executable buffer that receives the generated x86 code
#define TRANSLATION_MEM_SIZE (16*1024*1024)
// we assume that we do not recompile more than 1024 bytes in a block
// so doing only one check per block to see if something touches the safe zone
// is enough to keep the compiled code from overflowing the buffer
// NOTE(review): the check itself is not in this part of the file
#define TRANSLATION_SAFE_ZONE 1024

// forward declaration: its address is the "not translated yet" filler of translation_table
int trx_rec_translate_block_and_run(void);

//==============================================================================
//
// for benchmarking and profiling
//

// cur_*: value since the last recompiler cache flush, max_*: highest value seen so far
static uint32 benchmark_cur_bytes, benchmark_max_bytes;
static uint32 benchmark_cur_blocks, benchmark_max_blocks;
// NOTE(review): the per block counters are only reported here, they are updated elsewhere
static uint32 benchmark_cur_ppc_instr_per_block, benchmark_max_ppc_instr_per_block;
static uint32 benchmark_cur_x86_instr_per_block, benchmark_max_x86_instr_per_block;
static uint32 benchmark_cur_x86_bytes_per_block, benchmark_max_x86_bytes_per_block;
// running totals, cleared by trx_rec_benchmark_init()
static uint32 benchmark_total_ppc_instr;
static uint32 benchmark_total_x86_instr;
static uint32 benchmark_total_x86_bytes;
static uint32 benchmark_total_blocks;
// number of times trx_rec_cacheflush() ran
static uint32 benchmark_total_flushes;

// reset the benchmark statistics: all maxima and all running totals go back to 0
// (the benchmark_cur_* counters are not touched here)
void trx_rec_benchmark_init(void)
{
	// maxima
	benchmark_max_bytes = benchmark_max_blocks = 0;
	benchmark_max_x86_bytes_per_block = 0;
	benchmark_max_x86_instr_per_block = 0;
	benchmark_max_ppc_instr_per_block = 0;

	// running totals
	benchmark_total_flushes = 0;
	benchmark_total_blocks = 0;
	benchmark_total_x86_bytes = 0;
	benchmark_total_x86_instr = 0;
	benchmark_total_ppc_instr = 0;
}

// Prints the profiling and benchmark report to stdout.
// The maxima that are normally only updated on a recompiler cache flush are brought
// up to date first. Averages are reported as 0 while their divisor is still 0, so
// that a run which recompiled nothing does not divide by zero.
// With HOTBLOCK_PROFILE enabled the (up to) 10 most called blocks are listed as well;
// this consumes the counters of the listed blocks.
void trx_rec_benchmark_dump(void)
{
	double average;
	// affected by recompiler flush
	if(benchmark_cur_bytes > benchmark_max_bytes)benchmark_max_bytes = benchmark_cur_bytes;
	if(benchmark_cur_blocks > benchmark_max_blocks)benchmark_max_blocks = benchmark_cur_blocks;

	printf("[Tratax Recompiler] ==========================================================\n");
	printf("[Tratax Recompiler] PROFILING AND BENCHMARK REPORT\n");
	printf("[Tratax Recompiler] Max blocks: %d\n", benchmark_max_blocks);
	printf("[Tratax Recompiler] Max bytes: %d\n", benchmark_max_bytes);
	printf("[Tratax Recompiler] Max PPC instructions per block: %d\n", benchmark_max_ppc_instr_per_block);
	printf("[Tratax Recompiler] Max x86 instructions per block: %d\n", benchmark_max_x86_instr_per_block);
	printf("[Tratax Recompiler] Max x86 bytes per block: %d\n", benchmark_max_x86_bytes_per_block);
	printf("[Tratax Recompiler] Total recompiler cache flushes: %d\n", benchmark_total_flushes);
	printf("[Tratax Recompiler] Total blocks processed: %d\n", benchmark_total_blocks);
	printf("[Tratax Recompiler] Total PPC instructions processed: %d\n", benchmark_total_ppc_instr);
	printf("[Tratax Recompiler] Total x86 instructions generated: %d\n", benchmark_total_x86_instr);
	printf("[Tratax Recompiler] Total x86 codebytes generated: %d\n", benchmark_total_x86_bytes);

	average = 0.0;
	if(benchmark_total_ppc_instr != 0)
	{
		average = (double)benchmark_total_x86_instr / (double)benchmark_total_ppc_instr;
	}
	printf("[Tratax Recompiler] Average overhead %f x86 instructions per PPC instruction\n", average);

	average = 0.0;
	if(benchmark_total_ppc_instr != 0)
	{
		// a PPC instruction is 4 bytes
		average = (double)benchmark_total_x86_bytes / (double)(benchmark_total_ppc_instr*4);
	}
	printf("[Tratax Recompiler] Average overhead %f x86 codebytes per PPC codebyte\n", average);

	average = 0.0;
	if(benchmark_total_blocks != 0)
	{
		average = (double)benchmark_total_ppc_instr / (double)benchmark_total_blocks;
	}
	printf("[Tratax Recompiler] Average PPC instructions per block: %f\n", average);

#if HOTBLOCK_PROFILE
	{
		uint32 i,j, maxval, maxindex;

		// find most used blocks and print their location and amount used
		for(j = 0; j < 10; j++)
		{
			maxval = 0;
			maxindex = 0;
			for(i = 0; i < ((24*1024*1024)/4); i++)
			{
				if(hotblock_table[i] > maxval)
				{
					maxval = hotblock_table[i];
					maxindex = i;
				}
			}
			if(maxval == 0)break;
			printf("hotblock[%d]: %x called %d times\n", j, maxindex*4, maxval);
			// only show up once! the counter is cleared after the scan, and only for
			// the winner: clearing inside the scan wiped every intermediate maximum
			// and made those blocks disappear from the report
			hotblock_table[maxindex] = 0;
		}
	}
#endif

}

//==============================================================================
//

int trx_rec_init()
{
	static int has_initted = 0;
	int i;

	trx_rec_ps0_double = (double *)trxCPUrec.fpr;
	trx_rec_ps1_double = (double *)trxCPUrec.ps1;
	trx_rec_ps0_int = (uint64 *)trxCPUrec.fpr;
	trx_rec_ps1_int = (uint64 *)trxCPUrec.ps1;

	// can we use optimized SSE2 versions of FPU and PS opcodes ?
	if(has_sse2() && !config_disable_sse2 && !config_cpumode==CPU_SELFCHECKMODE)
	{
		use_sse = 1;
		printf("[Tratax Recompiler] System supports SSE2, using optimized version of code\n");
	}
	else
	{
		use_sse = 0;
		printf("[Tratax Recompiler] Either system does not support SSE2 or SSE2 mode has been disabled\n");
	}

	if(has_initted)return 0;

	trx_rec_benchmark_init();

	// allocate 24 MB of translation table
	translation_table = (uint32 *)malloc(24*1024*1024);
	if(translation_table==NULL)
	{
		printf("[Tratax Recompiler] failed to allocate memory for translation table\n");
		return -1;
	}
	// speedup trick for recompiler. we store the absolute address of the translation table
	// so that we can directly compile it in as opposed to have to go through the pointer
	address_of_translation_table = (uint32)&translation_table[0];

	// fill translation table with pointers to translation routine
	for(i = 0; i < (24*1024*1024)/4; i++)
	{
		translation_table[i] = (uint32)&trx_rec_translate_block_and_run;
	}

	// allocate translation memory
	// !!! IMPORTANT !!! CANNOT MALLOC THIS MEMORY
	// because on WinXP SP2 and CPU's with NX bit malloc()-ed memory cannot 
	// be executed
	translation_memory = (uint8 *)VirtualAlloc(NULL, TRANSLATION_MEM_SIZE, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
	if(translation_memory==NULL)
	{
		printf("[Tratax Recompiler] failed to allocate: %d KB for translation memory\n", TRANSLATION_MEM_SIZE/1024);
		return -2;
	}
	else
	{ 
		printf("[Tratax Recompiler] allocated: %d KB for translation memory\n", TRANSLATION_MEM_SIZE/1024);
	}
	translation_pos = 0;

	if(config_debugasm)
	{
		// open special output file for assembly dumps
		debugasm_fp = fopen("c:/debugasm.txt","wb");
		if(debugasm_fp==NULL)
		{
			printf("[Tratax Recompiler] failed to open c:/debugasm.txt for assembly debug output\n");
			return -3;
		}
	}

#if HOTBLOCK_PROFILE
	// per memory pointer location (4 bytes) we have a 4 byte counter.
	hotblock_table = (uint32 *)malloc(24*1024*1024);
	if(hotblock_table == NULL)
	{
		printf("[Tratax Recompiler] failed to allocate memory for hotblock table\n");
		return -1;
	}
	hotblock_table_address = (uint32)&hotblock_table[0];
	// make sure table is cleaned
	memset(hotblock_table, 0, (24*1024*1024));
#endif

	has_initted = 1;
	return 0;
}

// flushes the recompiler cache.
void trx_rec_cacheflush(void)
{
	uint32 i;
	printf("flushing recompiler cache\n");
	translation_pos = 0;
	// fill translation table with pointers to translation routine
	for(i = 0; i < (24*1024*1024)>>2; i++)
	{
		translation_table[i] = (uint32)&trx_rec_translate_block_and_run;
	}
	// mark everything as non recompiled
	rec_cache_mark_all_invalid();

	// benchmark update and reset
	if(benchmark_cur_bytes > benchmark_max_bytes)benchmark_max_bytes = benchmark_cur_bytes;
	if(benchmark_cur_blocks > benchmark_max_blocks)benchmark_max_blocks = benchmark_cur_blocks;
	benchmark_cur_bytes = 0;
	benchmark_cur_blocks = 0;
	benchmark_total_flushes++;
}

// check if "trxCPUrec.npc" is already recompiled (means translation_table[trxCPUrec.npc] != rec_translate_block_and_run())
// if so, then working back from the return address transform the "call rec_patchmeup" into "jmp offset newfunction"
//
// Naked routine (no compiler prologue/epilogue). It expects to have been reached through a
// 5-byte "call rel32" instruction: the 5 bytes in front of the return address are what get rewritten.
// Uses EAX, ECX and EDX as scratch. It never returns to its own call site: the return address
// is dropped and the RET goes one level further up the stack.
__declspec (naked) void rec_patchmeup(void)
{
	_asm
	{
		// eax = npc masked to the table range; used directly as a byte offset into the
		// table of 4-byte entries (assumes npc is 4-byte aligned)
		mov eax, trxCPUrec.npc
		and eax, MEM_MASK32MB
		// ecx = current table entry for npc
		mov ecx, dword ptr translation_table
		mov ecx, [ecx + eax]
		// entry still points at the translator -> nothing to link to yet
		cmp ecx, trx_rec_translate_block_and_run
		je not_yet_recompiled
		// eax = start of the call instruction that brought us here
		mov eax, [esp]
		sub eax, 5
		// 0xe9 = opcode of "jmp rel32"
		mov byte ptr [eax], 0xe9
		add eax, 1
		// rel32 = target - (address of the displacement field + 4)
		mov edx, ecx
		sub edx, eax
		sub edx, 4
		mov dword ptr [eax], edx
not_yet_recompiled:
		add esp, 4; // kill the ret .. and totally confuse the call stack prediction :)
		ret
	}
}

// called when block hasnt been translated yet.
// translate block, fill in translation table and call the block
//
// Steps:
//   1. flush the whole cache when less than TRANSLATION_SAFE_ZONE bytes are left in translation memory
//   2. optionally align the block start to 16 bytes
//   3. register the block start in translation_table
//   4. emit the "cpuslice_left < 0 -> leave" check, whose jump target is patched to the block's final RET
//   5. recompile instructions until the block ends or MAX_BLOCK_SIZE is reached
//   6. emit the block tail, update the benchmark counters, dump debug output
//   7. call the freshly generated code
//
// NOTE(review): declared int but there is no return statement; a caller would see whatever the
// executed block left in EAX. The callers visible in this file do not look at a return value.
int trx_rec_translate_block_and_run(void)
{
	uint32 *retpatchpoint;
	uint32 value;
	uint32 hit_breakpoint, i;

	// important, re-entrant compiling is not setting trxCPUrec.pc but is setting trxCPUrec.npc .. so sync them here
	trxCPUrec.pc = trxCPUrec.npc;
	// check if our recompile buffer is full and if so just warn and flush the whole buffer
	if((translation_pos + TRANSLATION_SAFE_ZONE) > TRANSLATION_MEM_SIZE)
	{
		printf("recompiler buffer overflow! flushing cache!\n");
		trx_rec_cacheflush();
	}

	// align block to 16 bytes here for CPUs that like it
	if(blockalign_16 == 1)
	{
		translation_pos = (translation_pos+15) & ~0xf;
	}

	static int recblock = 0;
	char str[256], x86str[128];
	uint32 x86start, x86pos;

	if(config_debugasm)
	{
		debugasm_dump("================================================\n");
		sprintf(str,"Recompiled block: %d\n", recblock++);
		debugasm_dump(str);
		// x86start tracks the first generated byte that has not been dumped yet
		x86start = translation_pos; 
	}

	int nr_opcodes = 0;

	trxCPUrec.block_startPC = trxCPUrec.pc;
	block_start_x86_pos = translation_pos;

	// setup benchmark: remember the starting values, the deltas are computed once the block is done
	benchmark_cur_x86_instr_per_block = gen_asm_total;
	benchmark_cur_x86_bytes_per_block = translation_pos;

	// mark start for recompiled block
	translation_table[(trxCPUrec.block_startPC & MEM_MASK32MB)>>2] = (uint32)&translation_memory[translation_pos];


#if HOTBLOCK_PROFILE
// debug point for checking in more detail actual generated code !
//	if(trxCPUrec.pc == 0x8013A5D4)gen_asm(BREAK);
#endif

	//	while(cpuslice_left >= 0)
	gen_asm(CMP_M32I8, (uint32)&cpuslice_left, 0);
	// if negative, exit block. this is the unlikely case so and static prediction for forward branches is not taken
	// this is why I hook up with the exit block RET in the hopes it will help static branch prediction more than it
	// hurts me in 3 bytes more code (remember .. recycled RET)
	gen_asm(JS_I32, 0x0);
	// the 32-bit displacement of the jump just emitted; filled in when the final RET position is known
	retpatchpoint = (uint32 *)&translation_memory[translation_pos - 4];

#if HOTBLOCK_PROFILE
	gen_asm(MOV_RI32, EAX, (uint32)(trxCPUrec.block_startPC & MEM_MASK32MB));
	gen_asm(ADD_MRI32I8, EAX, hotblock_table_address, 1);
#endif
	// compile one block of code
	trxCPUrec.blockend = BLOCKEND_CONT;
	// did not check for fpu exceptions this block yet
	fpu_exception_checked = 0;
	// initialise register cache
	regc_start();
	if(use_sse)psc_start();

	hit_breakpoint = 0;

	for(trxCPUrec.block_instr = 0; (trxCPUrec.blockend != BLOCKEND_STOP) && (trxCPUrec.block_instr < MAX_BLOCK_SIZE) ; trxCPUrec.block_instr++)
	{
		// check for breakpoints! (the list is terminated by a 0 entry)
		for(i=0; cpu_instr_breakpoint_list[i] != 0; i++)
		{
			if(trxCPUrec.pc == cpu_instr_breakpoint_list[i])
			{
				// generate breakpoint here!
				// NOTE(review): pc is not advanced after a hit, so this only terminates early
				// if trx_gen_breakpoint() ends the block - confirm.
				trx_gen_breakpoint();
				hit_breakpoint = 1;
			}
		}

		if(hit_breakpoint == 0)
		{
			trxCPUrec.npc = trxCPUrec.pc + 4;
			trxCPUrec.opcode = mem_iread(trxCPUrec.pc);
			// mark instruction as recompiled
			rec_cache_mark_valid(trxCPUrec.pc & 0x01ffffff);

			if(config_debugasm)
			{
				char buf[64], opStr[16], parmStr[32];
				uint32 target;

				GekkoDisassemble(opStr, parmStr, trxCPUrec.opcode, trxCPUrec.pc, &target);
				sprintf(buf, "%-10s %s", opStr, parmStr);    
				sprintf(str, "%.8X  %.8X  GEKKO: %s\n", trxCPUrec.pc, trxCPUrec.opcode, buf);
				debugasm_dump(str);
			}
			trx_rec_single();
			if(config_debugasm)
			{
				// dump the x86 code generated for this one PPC instruction
				uint32 instr_addr;
				for(x86pos = x86start; x86pos < translation_pos;)
				{
					instr_addr = (uint32)&translation_memory[x86pos];
					x86pos += disasm_x86((uint32)&translation_memory[x86pos], x86str);
					sprintf(str, " -%8.8x %s\n", instr_addr, x86str);
					debugasm_dump(str);
				}
				x86start = translation_pos;
			}
			trxCPUrec.pc = trxCPUrec.npc;
			nr_opcodes++;
		}
	}

	if(config_debugasm)x86start = translation_pos;

	// block finished early ?
	if(trxCPUrec.blockend != BLOCKEND_STOP)
	{
		regc_end();
		if(use_sse)psc_end();

		// account for the executed instructions; the 8-bit immediate form only fits up to 127
		if((trxCPUrec.block_instr) > 127)
		{
			gen_asm(SUB_M32I32, (uint32)&cpuslice_left, trxCPUrec.block_instr);
		}
		else
		{
			gen_asm(SUB_M32I8, (uint32)&cpuslice_left, trxCPUrec.block_instr);
		}
		gen_asm(MOV_M32I32, (uint32)&trxCPUrec.npc, trxCPUrec.npc);
	}
	// finish block: point the slice-check jump at the RET emitted next
	*retpatchpoint = ((uint32)&translation_memory[translation_pos])-((uint32)retpatchpoint)-4;
	gen_asm(RET);

	// update benchmark
	benchmark_cur_bytes = translation_pos;
	benchmark_cur_blocks++; 
	benchmark_cur_ppc_instr_per_block = nr_opcodes;
	benchmark_cur_x86_instr_per_block = gen_asm_total - benchmark_cur_x86_instr_per_block;
	benchmark_cur_x86_bytes_per_block = translation_pos - benchmark_cur_x86_bytes_per_block;

	benchmark_total_ppc_instr += benchmark_cur_ppc_instr_per_block;
	benchmark_total_x86_instr += benchmark_cur_x86_instr_per_block;
	benchmark_total_x86_bytes += benchmark_cur_x86_bytes_per_block;
	benchmark_total_blocks++;

	if(benchmark_cur_ppc_instr_per_block > benchmark_max_ppc_instr_per_block)benchmark_max_ppc_instr_per_block = benchmark_cur_ppc_instr_per_block;
	if(benchmark_cur_x86_instr_per_block > benchmark_max_x86_instr_per_block)benchmark_max_x86_instr_per_block = benchmark_cur_x86_instr_per_block;
	if(benchmark_cur_x86_bytes_per_block > benchmark_max_x86_bytes_per_block)benchmark_max_x86_bytes_per_block = benchmark_cur_x86_bytes_per_block;

	if(config_debugasm)
	{
		// dump the block tail. The address is taken BEFORE x86pos is advanced so that the
		// printed address belongs to the printed instruction (same as the per-opcode dump above).
		for(x86pos = x86start; x86pos < translation_pos;)
		{
			uint32 instr_addr = (uint32)&translation_memory[x86pos];
			x86pos += disasm_x86((uint32)&translation_memory[x86pos], x86str);
			sprintf(str, "%8.8x %s\n", instr_addr, x86str);
			debugasm_dump(str);
		}
		sprintf(str, "PPC instr: %d x86 instr: %d PPC bytes: %d x86 bytes %d\n",benchmark_cur_ppc_instr_per_block, benchmark_cur_x86_instr_per_block, benchmark_cur_ppc_instr_per_block*4, benchmark_cur_x86_bytes_per_block);
		debugasm_dump(str);
	}

	// and run it
	value = translation_table[(trxCPUrec.block_startPC & 0x01ffffff)>>2];
	_asm
	{
		// save EBX, ESI, EDI and EBP around the call into generated code
		push ebx
		push esi
		push edi
		push ebp
		mov eax, value
		call eax
		pop ebp
		pop edi
		pop esi
		pop ebx
	}
}

//__declspec (naked) void trx_rec_runcpu(void)
// Dispatch loop: while cpuslice_left is not negative, look up the translation table entry
// for trxCPUrec.npc and call it. The entry is either a recompiled block or the
// translate-and-run routine that the table is initialised with.
// All general purpose registers are saved and restored around the loop (pushad/popad).
void trx_rec_runcpu(void)
{
	_asm
	{
		pushad
runloop:
		// sign flag set -> cpuslice_left went negative, the slice is used up
		cmp cpuslice_left, 0
		js completed
		mov ecx, translation_table
		// masked npc is used directly as the byte offset of the 4-byte table entry
		mov eax, trxCPUrec.npc
		and eax, MEM_MASK32MB
		mov eax, [eax + ecx]
		call eax
		jmp runloop
completed:
		popad
	}
}

// Main loop of the recompiler core. Starts from a flushed translation cache and then, per iteration:
//   - picks a slice of instructions (at most one scanline worth, capped by the decrementer)
//   - runs recompiled code through trx_rec_runcpu() until the slice is used up
//   - flushes the cache when code was modified
//   - advances the time base and the decrementer by the number of instructions that ran
//   - once per scanline worth of instructions: steps the video interface and checks interrupt lines
//   - delivers pending external / decrementer exceptions when MSR_EE is set
// Returns when cpu_stop_running is set, when a breakpoint was hit, when a stop exception is
// pending or when w32_check_events() reports that we are done.
void trx_rec_run()
{
	uint32 cpu_instr_ran = 0;

	int ops=0;
	int cyclecount = 0;
	int done = 0;

	trxCPUrec.pc = trxCPUrec.npc;

	// start with a clean slate
	cpu_instr_breakpoint_flag = 0;
	cpu_stop_running = 0;
	trx_rec_cacheflush();

	// simply call function at PC address (masked by memory boundaries of course)
	// if it hasnt been translated, the translation function will be called automatically
	while(!done)
	{
		// when we want to stop 
		if(cpu_stop_running == 1)
			return;

		// one 'horizontal scanline' worth of instructions ...
		cpuslice = config_instructions_per_line; 
		// dont delay DEC exceptions too much..
		if(cpuslice > trxCPUrec.spr[PPC_DEC]) cpuslice = trxCPUrec.spr[PPC_DEC];
#if FORCE_ONE_BLOCK_MODE
		cpuslice = 0;
#endif
		cpuslice_left = cpuslice;
		trxCPUrec.npc = trxCPUrec.pc;
		trx_rec_runcpu();

		// if hit breakpoint then immediately exit, dont even bother updating timer for this block
		if(cpu_instr_breakpoint_flag == 1) return;

		// check for code modifications
		if(codemodified)
		{
			trx_rec_cacheflush();
			codemodified = 0;
		}

		// handle interrupt early exits
		if(trxCPUrec.interrupt_signalled)
		{
			trxCPUrec.interrupt_signalled = 0;
			cpuslice_left = trxCPUrec.interrupt_cycles_leftover;
		}

		// blocks subtract from cpuslice_left, so this is slice size minus what is left over
		cpu_instr_ran = cpuslice + (-cpuslice_left);
		ops += cpu_instr_ran;

		// the time base register seems to be updated every 8 cycles or so.
		cyclecount += cpu_instr_ran;
		if(cyclecount > 8)
		{
			uint64 tb;
			// widen BEFORE shifting: shifting a 32-bit value by 32 is undefined and on x86
			// ends up as a shift by 0, which would add TBH to TBL instead of forming TBH:TBL
			tb = (uint64)trxCPUrec.spr[PPC_TBL] + ((uint64)trxCPUrec.spr[PPC_TBH] << 32);
			tb += (cyclecount>>3); 
			// keep the remainder (cyclecount % 8) for the next round
			cyclecount -= ((cyclecount>>3)<<3);
			trxCPUrec.spr[PPC_TBL] = tb;
			trxCPUrec.spr[PPC_TBH] = (tb>>32); 
		}
		if(cpu_instr_ran > trxCPUrec.spr[PPC_DEC])
		{
			// decrementer ran out: reload and raise the decrementer exception
			trxCPUrec.spr[PPC_DEC] = 0xffffffff;
			trxCPUrec.exception_pending = true;
			trxCPUrec.dec_exception = true;
		}
		else
		{
			trxCPUrec.spr[PPC_DEC] -= cpu_instr_ran;
		}

		if(ops > config_instructions_per_line) 
		{
			ops = 0;
			static int exitcheck=0;

			vi_next_scanline();
			// run delayed interrupts based on x-scanlines delay
			if (pi_check_interrupt_lines()) 
			{
				trxCPUrec.exception_pending = true;
				trxCPUrec.ext_exception = true;
			}

			// poll pads and window events only every 11th scanline
			exitcheck++;
			if(exitcheck>10) 
			{
				exitcheck = 0;
				pad_read();
				done = w32_check_events();

				// NOTE(review): ops was reset to 0 just above, so the "(%d ops)" field always shows 0
				syslog(CPU,"@%08x (%d ops) dec: %08x lr: %08x\r", trxCPUrec.pc, ops, trxCPUrec.spr[PPC_DEC], trxCPUrec.lr);
			}
		}
		
		trxCPUrec.pc = trxCPUrec.npc;
		if (trxCPUrec.exception_pending)
		{
			if (trxCPUrec.stop_exception) 
			{
				trxCPUrec.stop_exception = false;
				if (!trxCPUrec.dec_exception && !trxCPUrec.ext_exception) trxCPUrec.exception_pending = false;
				break;
			}
			// exceptions are only delivered while MSR_EE is set; otherwise they stay pending
			if (trxCPUrec.msr & MSR_EE) 
			{
				// external interrupt is checked before the decrementer
				if (trxCPUrec.ext_exception) 
				{
					//printf("Entering exception @%8.8x\n", trxCPUrec.pc);
					trx_ppc_exception(PPC_EXC_EXT_INT, 0);
					trxCPUrec.pc = trxCPUrec.npc;
					trxCPUrec.ext_exception = false;
					if (!trxCPUrec.dec_exception) trxCPUrec.exception_pending = false;
					continue;
				}
				if (trxCPUrec.dec_exception) {
					trxCPUrec.dec_exception = false;
					trxCPUrec.exception_pending = false;
					trx_ppc_exception(PPC_EXC_DEC, 0);
					trxCPUrec.pc = trxCPUrec.npc;
					continue;
				}
			}
		}
	}
	trx_rec_benchmark_dump();
}

// Stop hook of the recompiler core. Currently an empty stub: it does nothing.
void trx_rec_stop()
{
}


//==============================================================================
//
// Memory access routines
// Recompiler optimized 
// 
//
// 8-byte scratch slot used by rec_mem_write64 / rec_mem_read64 to move a value between XMM0
// and the integer registers for byte swapping. There is one shared slot for all calls.
static uint64 bswap64store;

// C-callable wrapper around mem_write_io8(); called from the I/O path of rec_mem_write8,
// which pushes (address, val) on the stack and cleans up afterwards.
void rec_io_write8(uint32 address, uint8 val)
{
	mem_write_io8(address, val);
}

// address in ECX, value in EAX
//
// Stores the low byte of EAX (AL) to the emulated address in ECX. The target depends on the address:
//   below 0xc8000000          -> gMemory + (address & MEM_MASK32MB); the write also marks the
//                                GX cache block dirty and ORs the recompiler cache mark of that
//                                block into codemodified
//   0xc8000000 .. 0xdfffffff  -> rec_io_write8(address, value)
//   0xe0000000 and above      -> pLockedcache + (address & LOCKEDCACHE_MASK)
// Naked routine. Callers must not rely on EAX, ECX or EDX surviving the call.
__declspec (naked) void rec_mem_write8(void)
{
	_asm
	{
		cmp ecx, 0xc8000000
		jae nomem8
		and ecx, MEM_MASK32MB
		mov edx, ecx ; // backup of masked address
		add ecx, gMemory
		mov [ecx], al
		// masked and shifted address in EDX
		shr edx, GX_CACHE_BLOCKSHIFT
		// !!! assumes that GX_CACHE_BLOCKSHIFT == REC_CACHE_SHIFT
		//gx_cache_table[address>>GX_CACHE_BLOCKSHIFT] = 0xff
		mov eax, dword ptr [gx_cache_table]
		mov byte ptr [eax+edx], 0xff
		//codemodified |= rec_cache_table[address>>REC_CACHE_SHIFT];
		mov eax, dword ptr [rec_cache_table]
		mov cl, byte ptr [eax+edx]
		or byte ptr [codemodified], cl
		// 
		ret
nomem8:
		cmp ecx, 0xe0000000
		jb io8
		// locked cache write, no cache bookkeeping
		and ecx, LOCKEDCACHE_MASK
		add ecx, pLockedcache
		mov [ecx], al
		ret
io8:
		// arguments are pushed right to left: value first, then address; the caller cleans the stack
		push eax
		push ecx
		call rec_io_write8
		add esp, 8
		ret
	}
}

// C-callable wrapper around mem_write_io16(); called from the I/O path of rec_mem_write16.
void rec_io_write16(uint32 address, uint16 val)
{
	mem_write_io16(address, val);
}

// address in ECX, value in EAX
//
// Stores the low 16 bits of EAX (AX) to the emulated address in ECX. Values written to
// gMemory or the locked cache have their two bytes swapped first (xchg al, ah); the I/O
// handler receives the value unswapped.
//   below 0xc8000000          -> gMemory + (address & MEM_MASK32MB), plus GX cache / codemodified
//                                bookkeeping for the block that contains the first byte
//   0xc8000000 .. 0xdfffffff  -> rec_io_write16(address, value)
//   0xe0000000 and above      -> pLockedcache + (address & LOCKEDCACHE_MASK)
// Naked routine. Callers must not rely on EAX, ECX or EDX surviving the call.
__declspec (naked) void rec_mem_write16(void)
{
	_asm
	{
		cmp ecx, 0xc8000000
		jae nomem16
		and ecx, MEM_MASK32MB
		mov edx, ecx ; // backup of masked address in edx
		add ecx, gMemory
		xchg al, ah
		mov [ecx], ax
		// masked and shifted address in EDX
		shr edx, GX_CACHE_BLOCKSHIFT
		// !!! assumes that GX_CACHE_BLOCKSHIFT == REC_CACHE_SHIFT
		//gx_cache_table[address>>GX_CACHE_BLOCKSHIFT] = 0xff
		mov eax, dword ptr [gx_cache_table]
		mov byte ptr [eax+edx], 0xff ; // resident evil dies here !?
		//codemodified |= rec_cache_table[address>>REC_CACHE_SHIFT];
		mov eax, dword ptr [rec_cache_table]
		mov cl, byte ptr [eax+edx]
		or byte ptr [codemodified], cl
		// 
		ret
nomem16:
		cmp ecx, 0xe0000000
		jb io16
		// locked cache write, no cache bookkeeping
		and ecx, LOCKEDCACHE_MASK
		add ecx, pLockedcache
		xchg al, ah
		mov [ecx], ax
		ret
io16:
		// value first, then address; the caller cleans the stack
		push eax
		push ecx
		call rec_io_write16
		add esp, 8
		ret
	}
}

// C-callable wrapper around mem_write_io32(); called from the I/O paths of rec_mem_write32
// and rec_mem_write64.
void rec_io_write32(uint32 address, uint32 val)
{
	mem_write_io32(address, val);
}

// address in ECX, value in EAX
//
// Stores EAX to the emulated address in ECX. Values written to gMemory or the locked cache
// are byte-swapped first (bswap); the I/O handler receives the value unswapped.
//   below 0xc8000000          -> gMemory + (address & MEM_MASK32MB), plus GX cache / codemodified
//                                bookkeeping for the block that contains the first byte
//   0xc8000000 .. 0xdfffffff  -> rec_io_write32(address, value)
//   0xe0000000 and above      -> pLockedcache + (address & LOCKEDCACHE_MASK)
// Naked routine. Callers must not rely on EAX, ECX or EDX surviving the call.
__declspec (naked) void rec_mem_write32(void)
{
	_asm
	{
		cmp ecx, 0xc8000000
		jae nomem32
		and ecx, MEM_MASK32MB
		// edx keeps the masked address for the cache table lookups below
		mov edx, ecx
		add ecx, gMemory
		bswap eax
		mov [ecx], eax
		// masked and shifted address in EDX
		shr edx, GX_CACHE_BLOCKSHIFT
		// !!! assumes that GX_CACHE_BLOCKSHIFT == REC_CACHE_SHIFT
		//gx_cache_table[address>>GX_CACHE_BLOCKSHIFT] = 0xff
		mov eax, dword ptr [gx_cache_table]
		mov byte ptr [eax+edx], 0xff ; // kills serious sam !?
		//codemodified |= rec_cache_table[address>>REC_CACHE_SHIFT];
		mov eax, dword ptr [rec_cache_table]
		mov cl, byte ptr [eax+edx] ; // rayman3 dies here !?? illegal memory access or routine broken ?
		or byte ptr [codemodified], cl
		// 
		ret
nomem32:
		cmp ecx, 0xe0000000
		jb io32
		// locked cache write, no cache bookkeeping
		and ecx, LOCKEDCACHE_MASK
		add ecx, pLockedcache
		bswap eax
		mov [ecx], eax
		ret
io32:
		// value first, then address; the caller cleans the stack
		push eax
		push ecx
		call rec_io_write32
		add esp, 8
		ret
	}
}

// address in ECX, value in XMM0
//
// Stores the low 64 bits of XMM0 to the emulated address in ECX. The value is spilled to
// bswap64store, then both 32-bit halves are byte-swapped and written in exchanged order
// (upper half at address, lower half at address+4), i.e. a 64-bit byte swap.
//   below 0xc8000000          -> gMemory + (address & MEM_MASK32MB), plus GX cache / codemodified
//                                bookkeeping for the block that contains the first byte
//   0xc8000000 .. 0xdfffffff  -> two rec_io_write32() calls: upper half at address,
//                                lower half at address+4, both unswapped
//   0xe0000000 and above      -> pLockedcache + (address & LOCKEDCACHE_MASK)
// Naked routine. Callers must not rely on EAX, ECX or EDX surviving the call.
__declspec (naked) void rec_mem_write64(void)
{
	_asm
	{
		cmp ecx, 0xc8000000
		jae nomem64
		and ecx, MEM_MASK32MB
		// masked address is parked on the stack; it is popped into EDX further down
		push ecx
		add ecx, gMemory
		movlpd bswap64store, xmm0
		mov edx, dword ptr [bswap64store+4]
		mov eax, dword ptr [bswap64store]
		bswap edx
		bswap eax
		mov [ecx], edx
		mov [ecx+4], eax
		pop edx
		// masked and shifted address in EDX
		shr edx, GX_CACHE_BLOCKSHIFT
		// !!! assumes that GX_CACHE_BLOCKSHIFT == REC_CACHE_SHIFT
		//gx_cache_table[address>>GX_CACHE_BLOCKSHIFT] = 0xff
		mov eax, dword ptr [gx_cache_table]
		mov byte ptr [eax+edx], 0xff
		//codemodified |= rec_cache_table[address>>REC_CACHE_SHIFT];
		mov eax, dword ptr [rec_cache_table]
		mov cl, byte ptr [eax+edx]
		or byte ptr [codemodified], cl
		// 
		ret
nomem64:
		cmp ecx, 0xe0000000
		jb io64
		// locked cache write, no cache bookkeeping
		and ecx, LOCKEDCACHE_MASK
		add ecx, pLockedcache
		movlpd bswap64store, xmm0
		mov edx, dword ptr [bswap64store+4]
		mov eax, dword ptr [bswap64store]
		bswap edx
		bswap eax
		mov [ecx], edx
		mov [ecx+4], eax
		ret
io64:
		movlpd bswap64store, xmm0
		// eax = upper half (written first), edx = lower half (written second)
		mov eax, dword ptr [bswap64store+4]
		mov edx, dword ptr [bswap64store]

		// keep the lower half on the stack across the first call
		push edx

		push eax
		push ecx
		call rec_io_write32
		// the address is taken back from the argument slot
		// NOTE(review): relies on rec_io_write32 leaving its stack arguments unmodified
		pop ecx
		add esp, 4

		pop eax
		add ecx, 4

		push eax
		push ecx
		call rec_io_write32
		add esp, 8

		ret
	}
}

// C-callable wrapper around mem_read_io8(); called from the I/O path of rec_mem_read8.
uint8 rec_io_read8(uint32 address)
{
	return mem_read_io8(address);
}

// address in ECX, returns value in EAX
//
// Loads one byte from the emulated address in ECX, zero-extended to 32 bits in EAX.
//   below 0xc8000000          -> gMemory + (address & MEM_MASK32MB)
//   0xc8000000 .. 0xdfffffff  -> rec_io_read8(address)
//   0xe0000000 and above      -> pLockedcache + (address & LOCKEDCACHE_MASK)
// Naked routine. ECX is modified on the memory paths; the I/O path calls a C function.
__declspec (naked) void rec_mem_read8(void)
{
	_asm
	{
		cmp ecx, 0xc8000000
		jae nomem8
		and ecx, MEM_MASK32MB
		xor eax, eax
		add ecx, gMemory
		mov al, [ecx]
		ret
nomem8:
		cmp ecx, 0xe0000000
		jb io8
		and ecx, LOCKEDCACHE_MASK
		add ecx, pLockedcache
		xor eax, eax
		mov al, [ecx]
		ret
io8:
		push ecx
		call rec_io_read8
		// removes the argument from the stack
		pop ecx
		// only the low byte of the return value is kept
		and eax, 0xff
		ret
	}
}

// C-callable wrapper around mem_read_io16(); called from the I/O path of rec_mem_read16.
uint16 rec_io_read16(uint32 address)
{
	return mem_read_io16(address);
}

// address in ECX, returns value in EAX
//
// Loads 16 bits from the emulated address in ECX, zero-extended to 32 bits in EAX. Values
// read from gMemory or the locked cache have their two bytes swapped (xchg al, ah); the
// I/O handler's result is returned as is.
//   below 0xc8000000          -> gMemory + (address & MEM_MASK32MB)
//   0xc8000000 .. 0xdfffffff  -> rec_io_read16(address)
//   0xe0000000 and above      -> pLockedcache + (address & LOCKEDCACHE_MASK)
// Naked routine. ECX is modified on the memory paths; the I/O path calls a C function.
__declspec (naked) void rec_mem_read16(void)
{
	_asm
	{
		cmp ecx, 0xc8000000
		jae nomem16
		and ecx, MEM_MASK32MB
		xor eax, eax
		add ecx, gMemory
		mov ax, [ecx]
		xchg al, ah
		ret
nomem16:
		cmp ecx, 0xe0000000
		jb io16
		and ecx, LOCKEDCACHE_MASK
		xor eax, eax
		add ecx, pLockedcache
		mov ax, [ecx]
		xchg al, ah
		ret
io16:
		push ecx
		call rec_io_read16
		// removes the argument from the stack
		pop ecx
		// only the low 16 bits of the return value are kept
		and eax, 0xffff
		ret
	}
}

// C-callable wrapper around mem_read_io32(); called from the I/O paths of rec_mem_read32
// and rec_mem_read64.
uint32 rec_io_read32(uint32 address)
{
	return mem_read_io32(address);
}

// address in ECX, returns value in EAX
//
// Loads 32 bits from the emulated address in ECX into EAX. Values read from gMemory or the
// locked cache are byte-swapped (bswap); the I/O handler's result is returned as is.
//   below 0xc8000000          -> gMemory + (address & MEM_MASK32MB)
//   0xc8000000 .. 0xdfffffff  -> rec_io_read32(address)
//   0xe0000000 and above      -> pLockedcache + (address & LOCKEDCACHE_MASK)
// Naked routine. ECX is modified on the memory paths; the I/O path calls a C function.
__declspec (naked) void rec_mem_read32(void)
{
	_asm
	{
		cmp ecx, 0xc8000000
		jae nomem32
		and ecx, MEM_MASK32MB
		add ecx, gMemory
		mov eax, [ecx]
		bswap eax
		ret
nomem32:
		cmp ecx, 0xe0000000
		jb io32
		and ecx, LOCKEDCACHE_MASK
		add ecx, pLockedcache
		mov eax, [ecx]
		bswap eax
		ret
io32:
		push ecx
		call rec_io_read32
		// removes the argument from the stack
		pop ecx
		ret
	}
}

// address in ECX, returns value in XMM0
//
// Loads 64 bits from the emulated address in ECX into the low half of XMM0. The word at
// address becomes the upper 32 bits of the result and the word at address+4 the lower 32 bits;
// words read from gMemory or the locked cache are byte-swapped, I/O results are used as is.
// The value is assembled in bswap64store and then loaded with movlpd.
//   below 0xc8000000          -> gMemory + (address & MEM_MASK32MB)
//   0xc8000000 .. 0xdfffffff  -> two rec_io_read32() calls (address, then address+4),
//                                preceded by an int 3 debugger trap
//   0xe0000000 and above      -> pLockedcache + (address & LOCKEDCACHE_MASK)
// Naked routine. Callers must not rely on EAX, ECX or EDX surviving the call.
__declspec (naked) void rec_mem_read64(void)
{
	_asm
	{
		cmp ecx, 0xc8000000
		jae nomem64
		and ecx, MEM_MASK32MB
		add ecx, gMemory
		mov edx, [ecx]
		mov eax, [ecx+4]
		bswap edx
		bswap eax
		mov dword ptr [bswap64store+4], edx
		mov dword ptr [bswap64store], eax
		movlpd xmm0, [bswap64store]
		ret
nomem64:
		cmp ecx, 0xe0000000
		jb io64
		and ecx, LOCKEDCACHE_MASK
		add ecx, pLockedcache
		mov edx, [ecx]
		mov eax, [ecx+4]
		bswap edx
		bswap eax
		mov dword ptr [bswap64store+4], edx
		mov dword ptr [bswap64store], eax
		movlpd xmm0, [bswap64store]
		ret
io64:
		// debugger trap kept from the original: 64-bit I/O reads are flagged for inspection
		int 3h
		push ecx
		call rec_io_read32
		// the address is taken back from the argument slot
		// NOTE(review): relies on rec_io_read32 leaving its stack argument unmodified
		pop ecx
		// keep the first word on the stack across the second call
		push eax
		// the second word lives at address+4 (previously the same address was read twice,
		// unlike the matching path in rec_mem_write64 which does advance by 4)
		add ecx, 4
		push ecx
		call rec_io_read32
		pop ecx
		pop edx
		mov dword ptr [bswap64store+4], edx ; // actually the first read !
		mov dword ptr [bswap64store], eax // and the second read
		movlpd xmm0, [bswap64store]
		ret
	}
}


/////////
//
// Self check routines for memory access
//

// C-callable wrapper around slave_mem_write8(); called from rec_slave_mem_write8.
void rec_slave_write8(uint32 address, uint8 val)
{
	slave_mem_write8(address, val);
}

// address in ECX, value in EAX
// Register-to-stack thunk: pushes value and address and calls rec_slave_write8(address, value).
// Naked routine; the two arguments are removed from the stack after the call.
__declspec (naked) void rec_slave_mem_write8(void)
{
	_asm
	{
		push eax
		push ecx
		call rec_slave_write8
		add esp, 8
		ret
	}
}

// C-callable wrapper around slave_mem_write16(); called from rec_slave_mem_write16.
void rec_slave_write16(uint32 address, uint16 val)
{
	slave_mem_write16(address, val);
}

// address in ECX, value in EAX
// Register-to-stack thunk: pushes value and address and calls rec_slave_write16(address, value).
// Naked routine; the two arguments are removed from the stack after the call.
__declspec (naked) void rec_slave_mem_write16(void)
{
	_asm
	{
		push eax
		push ecx
		call rec_slave_write16
		add esp, 8
		ret
	}
}

// C-callable wrapper around slave_mem_write32(); called from rec_slave_mem_write32.
void rec_slave_write32(uint32 address, uint32 val)
{
	slave_mem_write32(address, val);
}

// address in ECX, value in EAX
// Register-to-stack thunk: pushes value and address and calls rec_slave_write32(address, value).
// Naked routine; the two arguments are removed from the stack after the call.
__declspec (naked) void rec_slave_mem_write32(void)
{
	_asm
	{
		push eax
		push ecx
		call rec_slave_write32
		add esp, 8
		ret
	}
}

// C-callable wrapper around slave_mem_read8(); called from rec_slave_mem_read8.
uint8 rec_slave_read8(uint32 address)
{
	return slave_mem_read8(address);
}

// address in ECX, returns value in EAX
// Register-to-stack thunk: calls rec_slave_read8(address) and keeps only the low byte of
// the result, zero-extended in EAX. Naked routine.
__declspec (naked) void rec_slave_mem_read8(void)
{
	_asm
	{
		push ecx
		call rec_slave_read8
		// removes the argument from the stack
		pop ecx
		and eax, 0xff
		ret
	}
}

// C-callable wrapper around slave_mem_read16(); called from rec_slave_mem_read16.
uint16 rec_slave_read16(uint32 address)
{
	return slave_mem_read16(address);
}

// address in ECX, returns value in EAX
// Register-to-stack thunk: calls rec_slave_read16(address) and keeps only the low 16 bits of
// the result, zero-extended in EAX. Naked routine.
__declspec (naked) void rec_slave_mem_read16(void)
{
	_asm
	{
		push ecx
		call rec_slave_read16
		// removes the argument from the stack
		pop ecx
		and eax, 0xffff
		ret
	}
}

// C-callable wrapper around slave_mem_read32(); called from rec_slave_mem_read32.
uint32 rec_slave_read32(uint32 address)
{
	return slave_mem_read32(address);
}

// address in ECX, returns value in EAX
// Register-to-stack thunk: calls rec_slave_read32(address) and returns its result in EAX.
// Naked routine.
__declspec (naked) void rec_slave_mem_read32(void)
{
	_asm
	{
		push ecx
		call rec_slave_read32
		// removes the argument from the stack
		pop ecx
		ret
	}
}