/*====================================================================

filename:     trx_ppc_cpu.cpp
project:      GCemu
created:      2004-6-18
mail:		  duddie@walla.com

Copyright (c) 2005 Duddie & Tratax

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

====================================================================*/
/*
 *	Tratax PowerPC interpreter
 *
 *  Contains code borrowed from PearPC by Sebastian Biallas licensed under GPL 
 *
 *	2004-6-18 started work based on PearPC core
 *  2004-7-06 enough emulated to run small demo in lockstep selfchecking mode
 *  2004-7-07 start replacing C code with (somewhat optimized) assembly equivalents
 *  2004-7-14 start making it run standalone interpreter
 *  2004-7-16 can now run standalone (still using some patchwork from PearPC)
 *  2004-7-24 first big merge with Duddies version
 *  2004-8-04 Start of debugging DMA and locked cache
 *	2005-2-06 Going back in time a bit. Time for correctness checking and this means first improving interpreter to be 100% correct
 *
 * Known limitations/bugs:
 *
 * XER_SO not calculated
 * FPU flags except for compare instructions not calculated
 * FPU precision is always doubles even for single operations
 * PS operates on 2 doubles instead of being 2 floats merged in one double
 *
 * General operation:
 * ------------------
 *
 * "Correctness first, speed will be later!" should be general idea. Dont get lured into early optimization again !
 *
 * Even though it is an interpreter, operations are done in 'block' style like a recompiler to make ports easy
 * 
 * EAX, ECX, EDX used as operation registers for integer, SSE2 registers used for FPU/Gekko
 * LOAD to operation register(s)
 * execute operation
 * STORE from operation register(s)
 *
 * available: EBX, ESI, EDI, EBP for register caching in recompiler implementation, and all of SSE2 of course (static assignment ?)
 *
 * MMU emulation to be added which self-enables when MMU is first accessed.
 *
 */

#include "config.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "hardware/memory_interface.h"
#include "hardware/hw_io.h"
#include "trx_ppc_cpu.h"
#include "ppc_disasm.h"
#include "w32_display.h"
#include "trx_ppc_rec.h"
#include "trx_ppc_int.h"
#include "profiler.h"

#include "pad.h"

#include "plugins/gx_plugin.h"

// register state of the CPU core currently in control; the code in this file points it
// at either the interpreter state (trxCPUint) or the recompiler state (trxCPUrec)
struct TRX_PPC_Registers *CPUcurrmode;

// helper pointers for assembly test version of interpreter
uint32 *prd, *prs, *pra, *prb, *prc;
double *frd, *frs, *fra, *frb, *frc, *fps1;
uint64 *qrd, *qrb;
// constants kept in memory (presumably so the assembly code can reference them by address)
uint32 zero = 0, one = 1;
uint32 cr_gt = CR0_GT, cr_lt = CR0_LT, cr_eq= CR0_EQ, cr_so = CR0_SO;

// these variables are globally modified in both interpreter and recompiler for keeping track of CPU slice
uint32 cpuslice = 0; // keeps amount of cycles to run this time
int cpuslice_left = 0; // keeps amount of cycles left in this slice (continuously decreased and if goes to 0 or below block ends)

// for debugging
uint32 cpu_stop_running;
uint32 cpu_is_running;	// set to 1 while one of the run loops in this file is active, 0 otherwise
uint32 cpu_instr_breakpoint_flag;
// breakpoint addresses, terminated by a 0 entry (see is_breakpoint / add_breakpoint / remove_breakpoint)
uint32 cpu_instr_breakpoint_list[256];

// for selfcheck mode memory-read routines.
uint32 selfmemread_buffer[MAX_BLOCK_SIZE];
uint32 master_selfmemwrite_buffer[MAX_BLOCK_SIZE];
uint32 slave_selfmemwrite_buffer[MAX_BLOCK_SIZE];

// FPSCR flag combinations, named after the fcmpo / fcmpu compare instructions
// NOTE(review): presumably OR-ed into FPSCR when a compare sees a NaN operand - confirm in the users
uint32 fcmpo_nan_flags = (FPSCR_VXSNAN | FPSCR_VXVC);
uint32 fcmpu_nan_flags = (FPSCR_VXSNAN);

// AND masks that each clear one 4-bit group of a 32-bit word;
// entry 0 clears the lowest nibble, entry 7 the highest
uint32 trx_ppc_cmp_and_mask[8] = 
{
	0xfffffff0,
	0xffffff0f,
	0xfffff0ff,
	0xffff0fff,
	0xfff0ffff,
	0xff0fffff,
	0xf0ffffff,
	0x0fffffff
};


// returns true if the given address is marked as a breakpoint
//
// The list is terminated by a 0 entry. The scan is additionally capped at the
// size of the array, so a list without a terminator (completely full) can no
// longer make the loop read past the end of cpu_instr_breakpoint_list.
bool is_breakpoint(uint32 address)
{
	const uint32 capacity = sizeof(cpu_instr_breakpoint_list) / sizeof(cpu_instr_breakpoint_list[0]);
	uint32 i;

	for(i = 0; i < capacity && cpu_instr_breakpoint_list[i] != 0; i++)
	{
		if(cpu_instr_breakpoint_list[i] == address) return true;
	}
	return false;
}

// adds breakpoint for given address 
// returns true if breakpoint could be set, else returns false
//
// The list is terminated by a 0 entry, so the last array slot is reserved for
// the terminator: at most (array size - 1) breakpoints can be stored.
// A fresh terminator is written right behind the new entry, so whatever was
// left in the array behind the old terminator cannot become active again.
bool add_breakpoint(uint32 address)
{
	const uint32 capacity = sizeof(cpu_instr_breakpoint_list) / sizeof(cpu_instr_breakpoint_list[0]);
	uint32 i;

	// find the terminator, but never look further than the last slot that can
	// still be followed by a terminator
	for(i = 0; i < capacity - 1 && cpu_instr_breakpoint_list[i] != 0; i++);

	// can we still fit this breakpoint ?
	if(i < capacity - 1)
	{
		cpu_instr_breakpoint_list[i] = address;
		cpu_instr_breakpoint_list[i + 1] = 0;
		return true;
	}
	return false;
}

// removes breakpoint for given address 
// returns true if breakpoint could be removed, else returns false
//
// The removed slot is refilled with the last entry of the list, so the order of
// the remaining breakpoints is not preserved. Address 0 is the list terminator
// and therefore is never found. All scans are capped at the array size.
bool remove_breakpoint(uint32 address)
{
	const uint32 capacity = sizeof(cpu_instr_breakpoint_list) / sizeof(cpu_instr_breakpoint_list[0]);
	uint32 i, last;

	// look for the entry; hitting the terminator means it is not in the list
	for(i = 0; i < capacity; i++)
	{
		if(cpu_instr_breakpoint_list[i] == 0) return false;
		if(cpu_instr_breakpoint_list[i] == address) break;
	}
	if(i == capacity) return false;

	// found breakpoint: locate the last used entry of the list
	for(last = i; last + 1 < capacity && cpu_instr_breakpoint_list[last + 1] != 0; last++);

	// move the last entry into the gap and shorten the list by one
	cpu_instr_breakpoint_list[i] = cpu_instr_breakpoint_list[last];
	cpu_instr_breakpoint_list[last] = 0;
	return true;
}

// Enters the exception 'type' on the current core: saves the return address in SRR0
// and the masked MSR in SRR1, clears MSR and continues execution at address 'type'.
// 'flags' is only used for PPC_EXC_PROGRAM (selects the saved address, merged into SRR1).
// Returns true when the exception was taken, false (state untouched) for unknown types.
int trx_ppc_exception(uint32 type, uint32 flags)
{
	struct TRX_PPC_Registers *cpu = CPUcurrmode;

	//printf("Entering Exception @%8.8x, type: %d\n", CPUcurrmode->pc, type);
	switch (type)
	{
	// these resume at the next instruction (.284 / .285)
	case PPC_EXC_DEC:
	case PPC_EXC_SC:
		cpu->spr[PPC_SRR0] = cpu->npc;
		cpu->spr[PPC_SRR1] = cpu->msr & 0x87c0ffff;
		break;

	// these resume at the current instruction (.284)
	case PPC_EXC_EXT_INT:
	case PPC_EXC_NO_FPU:
		cpu->spr[PPC_SRR0] = cpu->pc;
		cpu->spr[PPC_SRR1] = cpu->msr & 0x87c0ffff;
		break;

	// program exception: caller picks the resume address through the flags (.283)
	case PPC_EXC_PROGRAM:
		cpu->spr[PPC_SRR0] = (flags & PPC_EXC_PROGRAM_NEXT) ? cpu->npc : cpu->pc;
		cpu->spr[PPC_SRR1] = (cpu->msr & 0x87c0ffff) | flags;
		break;

	default:
		return false;
	}

	cpu->msr = 0;
	cpu->npc = type;
	return true;
}

// signals external device interrupt and will break current CPU running loop before cpuslice is done 
// it will be stopped after the current basic block.
//
// Marks an external exception as pending on the current core and forces the
// remaining cycle budget of the running slice to 0; the unused budget is
// stored in interrupt_cycles_leftover.
void trx_ppc_signal_interrupt(void)
{
	CPUcurrmode->exception_pending = true;
	CPUcurrmode->ext_exception = true;

	CPUcurrmode->interrupt_signalled = true;

	// potential multithreading problems:
	// cpuslice_left is a plain global that the CPU cores also modify; the read and
	// the reset below are two separate, unsynchronized accesses
	CPUcurrmode->interrupt_cycles_leftover = cpuslice_left;
	cpuslice_left = 0;
}

//==============================================================================

uint8 *virtualmemory;

bool trx_cpu_init()
{	
	uint32 i;
	// for games using TLB to map ARAM as virtual memory
	// we dont bother with the TLB emulation and just give it a nice fresh juicy 32 MB of memory to play with
	// this will work as long as no one is mixing virtual memory with DSP code/data (and hopefully no one will!)
	virtualmemory = (uint8 *)malloc(32*1024*1024);

/*
	gCPU.ibatl[0] = 0x00000030;
	gCPU.ibatu[0] = 0x000003ff;
	gCPU.ibatl[1] = 0x00000030;
	gCPU.ibatu[1] = 0x800003ff;
	gCPU.ibatl[2] = 0x00000030;
	gCPU.ibatu[2] = 0xc00003ff;
	gCPU.ibatl[3] = 0x82000300;
	gCPU.ibatu[3] = 0xc7f83300;

	gCPU.ibat_bl17[0] = ~(BATU_BL(gCPU.ibatu[0])<<17);
	gCPU.ibat_bl17[1] = ~(BATU_BL(gCPU.ibatu[1])<<17);
	gCPU.ibat_bl17[2] = ~(BATU_BL(gCPU.ibatu[2])<<17);
	gCPU.ibat_bl17[3] = ~(BATU_BL(gCPU.ibatu[3])<<17);

	gCPU.dbatl[0] = 0x00000030;
	gCPU.dbatu[0] = 0x000003ff;
	gCPU.dbatl[1] = 0x00000030;
	gCPU.dbatu[1] = 0x800003ff;
	gCPU.dbatl[2] = 0x00000030;
	gCPU.dbatu[2] = 0xc00003ff;
	gCPU.dbatl[3] = 0xcc000030;
	gCPU.dbatu[3] = 0xcc0003ff;

	gCPU.dbat_bl17[0] = ~(BATU_BL(gCPU.dbatu[0])<<17);
	gCPU.dbat_bl17[1] = ~(BATU_BL(gCPU.dbatu[1])<<17);
	gCPU.dbat_bl17[2] = ~(BATU_BL(gCPU.dbatu[2])<<17);
	gCPU.dbat_bl17[3] = ~(BATU_BL(gCPU.dbatu[3])<<17);
*/
	if(rec_cache_init())
	{
		printf("[Tratax Recompiler] failed to initialise self modifying code checks\n");
		return false;
	}

	// both interpreter and recompiler need to be initialised for debugger runs
	// just for selfcheck mode it is different.
	trx_rec_init();
	// clear breakpoint list
	//cpu_instr_breakpoint_list[0] = 0x801141d0;
	//cpu_instr_breakpoint_list[1] = 0x801143b8;
	//cpu_instr_breakpoint_list[2] = 0;

	// fill in memory routines
	if(config_cpumode != CPU_SELFCHECKMODE)
	{
		p_rec_mem_read8 = (void *)rec_mem_read8;
		p_rec_mem_read16 = (void *)rec_mem_read16;
		p_rec_mem_read32 = (void *)rec_mem_read32;
		p_rec_mem_write8 = (void *)rec_mem_write8;
		p_rec_mem_write16 = (void *)rec_mem_write16;
		p_rec_mem_write32 = (void *)rec_mem_write32;
	}
	else
	{
		p_rec_mem_read8 = (void *)rec_slave_mem_read8;
		p_rec_mem_read16 = (void *)rec_slave_mem_read16;
		p_rec_mem_read32 = (void *)rec_slave_mem_read32;
		p_rec_mem_write8 = (void *)rec_slave_mem_write8;
		p_rec_mem_write16 = (void *)rec_slave_mem_write16;
		p_rec_mem_write32 = (void *)rec_slave_mem_write32;
	}

	trx_int_ps0_double = (double *)trxCPUint.fpr;
	trx_int_ps1_double = (double *)trxCPUint.ps1;
	trx_int_ps0_int = (uint64 *)trxCPUint.fpr;
	trx_int_ps1_int = (uint64 *)trxCPUint.ps1;
	// fill in memory routines
	if(config_cpumode != CPU_SELFCHECKMODE)
	{
		mem_read8_int = mem_read8;
		mem_read16_int = mem_read16;
		mem_read32_int = mem_read32;
		mem_write8_int = mem_write8;
		mem_write16_int = mem_write16;
		mem_write32_int = mem_write32;
	}
	else
	{
		mem_read8_int = master_mem_read8;
		mem_read16_int = master_mem_read16;
		mem_read32_int = master_mem_read32;
		mem_write8_int = master_mem_write8;
		mem_write16_int = master_mem_write16;
		mem_write32_int = master_mem_write32;		
	}
	
	// most people will use recompiler anyway
	CPUcurrmode = &trxCPUrec;

	memset(CPUcurrmode, 0, sizeof(struct TRX_PPC_Registers));
	// initialize srs (mostly for prom)
	for (i=0; i<16; i++) 
	{
		CPUcurrmode->sr[i] = 0x2aa*i;
	}
	for(i=0;i<32;i++) CPUcurrmode->gpr[i] = 0x8130fffc;
	CPUcurrmode->msr = MSR_IR | MSR_DR | MSR_FP;

	return 0;
}
//////////////////////////////////////////
// selfcheck mode
//
///////////////////////////////////////

// number of entries currently logged in the read / write logs below;
// trx_ppc_selfcheck resets them to 0 before running a block on the respective core
uint32 master_selfmemread_index, master_selfmemwrite_index;
uint32 slave_selfmemread_index, slave_selfmemwrite_index;
// values and addresses of the reads done by the master (interpreter)
uint32 master_selfmemread_data[MAX_BLOCK_SIZE];
uint32 master_selfmemread_address[MAX_BLOCK_SIZE];
// data not available for selfcheck reading! it uses the master data!
// (slave reads are served from master_selfmemread_data, only the address is logged)
uint32 slave_selfmemread_address[MAX_BLOCK_SIZE];
// values and addresses of the writes done by master and slave
uint32 master_selfmemwrite_data[MAX_BLOCK_SIZE];
uint32 master_selfmemwrite_address[MAX_BLOCK_SIZE];
uint32 slave_selfmemwrite_data[MAX_BLOCK_SIZE];
uint32 slave_selfmemwrite_address[MAX_BLOCK_SIZE];

// memory access routines for selfcheck mode

// master 8-bit read: does the real read and appends value + address to the master read log
uint8 master_mem_read8(uint32 address)
{
	const uint8 value = mem_read8(address);
	const uint32 slot = master_selfmemread_index;

	master_selfmemread_address[slot] = address;
	master_selfmemread_data[slot] = value;
	master_selfmemread_index = slot + 1;
	return value;
}

// master 16-bit read: does the real read and appends value + address to the master read log
uint16 master_mem_read16(uint32 address)
{
	const uint16 value = mem_read16(address);
	const uint32 slot = master_selfmemread_index;

	master_selfmemread_address[slot] = address;
	master_selfmemread_data[slot] = value;
	master_selfmemread_index = slot + 1;
	return value;
}

// master 32-bit read: does the real read and appends value + address to the master read log
uint32 master_mem_read32(uint32 address)
{
	const uint32 value = mem_read32(address);
	const uint32 slot = master_selfmemread_index;

	master_selfmemread_address[slot] = address;
	master_selfmemread_data[slot] = value;
	master_selfmemread_index = slot + 1;
	return value;
}
// master 8-bit write: logs address + value in the master write log, then does the real write
void master_mem_write8(uint32 address, uint8 val)
{
	const uint32 slot = master_selfmemwrite_index;

	master_selfmemwrite_address[slot] = address;
	master_selfmemwrite_data[slot] = val;
	mem_write8(address, val);
	master_selfmemwrite_index++;
}
// master 16-bit write: logs address + value in the master write log, then does the real write
void master_mem_write16(uint32 address, uint16 val)
{
	const uint32 slot = master_selfmemwrite_index;

	master_selfmemwrite_address[slot] = address;
	master_selfmemwrite_data[slot] = val;
	mem_write16(address, val);
	master_selfmemwrite_index++;
}
// master 32-bit write: logs address + value in the master write log, then does the real write
void master_mem_write32(uint32 address, uint32 val)
{
	const uint32 slot = master_selfmemwrite_index;

	master_selfmemwrite_address[slot] = address;
	master_selfmemwrite_data[slot] = val;
	mem_write32(address, val);
	master_selfmemwrite_index++;
}

// slave 8-bit read: memory is not touched; logs the address and replays the value
// the master logged at the same position of its read log
uint8 slave_mem_read8(uint32 address)
{
	const uint32 slot = slave_selfmemread_index++;

	slave_selfmemread_address[slot] = address;
	return master_selfmemread_data[slot];
}

// slave 16-bit read: memory is not touched; logs the address and replays the value
// the master logged at the same position of its read log
uint16 slave_mem_read16(uint32 address)
{
	const uint32 slot = slave_selfmemread_index++;

	slave_selfmemread_address[slot] = address;
	return master_selfmemread_data[slot];
}
// slave 32-bit read: memory is not touched; logs the address and replays the value
// the master logged at the same position of its read log
uint32 slave_mem_read32(uint32 address)
{
	const uint32 slot = slave_selfmemread_index++;

	slave_selfmemread_address[slot] = address;
	return master_selfmemread_data[slot];
}
// slave 8-bit write: only recorded in the slave write log, memory is not modified
void slave_mem_write8(uint32 address, uint8 val)
{
	const uint32 slot = slave_selfmemwrite_index++;

	slave_selfmemwrite_data[slot] = val;
	slave_selfmemwrite_address[slot] = address;
}

// slave 16-bit write: only recorded in the slave write log, memory is not modified
void slave_mem_write16(uint32 address, uint16 val)
{
	const uint32 slot = slave_selfmemwrite_index++;

	slave_selfmemwrite_data[slot] = val;
	slave_selfmemwrite_address[slot] = address;
}

// slave 32-bit write: only recorded in the slave write log, memory is not modified
void slave_mem_write32(uint32 address, uint32 val)
{
	const uint32 slot = slave_selfmemwrite_index++;

	slave_selfmemwrite_data[slot] = val;
	slave_selfmemwrite_address[slot] = address;
}

// Compares the register state of the interpreter (trxCPUint) and the recompiler
// (trxCPUrec) in detail and prints every difference that is found.
// For the GPR / FPR / PS1 files the complete file is dumped, with '*' in front of
// the entries that differ and '-' in front of the ones that match.
// Returns 1 when all compared state is identical, 0 otherwise.
int trx_ppc_register_selfcheck(void)
{
	int pass = 1;
	uint32 i;	
	if(memcmp(trxCPUint.gpr, trxCPUrec.gpr, 32*4)!=0)
	{
		pass = 0;
		printf("GPR:\n");
		for(i = 0; i < 32; i++)printf("%s[%2.2d] INT %8.8x REC %8.8x\n", (trxCPUint.gpr[i] != trxCPUrec.gpr[i])?"*":"-", i, trxCPUint.gpr[i], trxCPUrec.gpr[i]);
	}
	if(memcmp(trxCPUint.fpr, trxCPUrec.fpr, 32*8)!=0)
	{
		pass = 0;
		printf("FPR:\n");
		for(i = 0; i < 32; i++)printf("%s[%2.2d] INT %16.16I64x REC %16.16I64x\n", (trxCPUint.fpr[i] != trxCPUrec.fpr[i])?"*":"-", i, trxCPUint.fpr[i], trxCPUrec.fpr[i]);
	}
	if(memcmp(trxCPUint.ps1, trxCPUrec.ps1, 32*8)!=0)
	{
		pass = 0;
		printf("PS1:\n");
		// print the ps1 values themselves (this dump used to show fpr[] next to the ps1 mismatch marker)
		for(i = 0; i < 32; i++)printf("%s[%2.2d] INT %16.16I64x REC %16.16I64x\n", (trxCPUint.ps1[i] != trxCPUrec.ps1[i])?"*":"-", i, trxCPUint.ps1[i], trxCPUrec.ps1[i]);
	}
	if(trxCPUint.cr != trxCPUrec.cr){pass = 0;printf("* INT CR %8.8x REC %8.8x\n", trxCPUint.cr, trxCPUrec.cr);	}
	if(trxCPUint.fpscr != trxCPUrec.fpscr){pass = 0;printf("* INT FPSCR %8.8x REC %8.8x\n", trxCPUint.fpscr, trxCPUrec.fpscr);}
	if(trxCPUint.xer != trxCPUrec.xer){pass = 0;printf("* INT XER %8.8x REC %8.8x\n", trxCPUint.xer, trxCPUrec.xer);}
	if(trxCPUint.xer_ca != trxCPUrec.xer_ca){pass = 0;printf("* INT XER_CA %8.8x REC %8.8x\n", trxCPUint.xer_ca, trxCPUrec.xer_ca);}
	if(trxCPUint.lr != trxCPUrec.lr){pass = 0;printf("* INT LR %8.8x REC %8.8x\n", trxCPUint.lr, trxCPUrec.lr);}
	if(trxCPUint.ctr != trxCPUrec.ctr){pass = 0;printf("* INT CTR %8.8x REC %8.8x\n", trxCPUint.ctr, trxCPUrec.ctr);}
	if(trxCPUint.msr != trxCPUrec.msr){pass = 0;printf("* INT MSR %8.8x REC %8.8x\n", trxCPUint.msr, trxCPUrec.msr);}

	// segment registers and special purpose registers: only mismatching entries are printed
	for(i = 0; i < 16; i++)
	{
		if(trxCPUint.sr[i] != trxCPUrec.sr[i]){pass = 0;printf("* SR[%d] INT: %8.8x REC: %8.8x\n", i, trxCPUint.sr[i], trxCPUrec.sr[i]);}
	}
	for(i = 0; i < 4096; i++)
	{
		if(trxCPUint.spr[i] != trxCPUrec.spr[i]){pass = 0;printf("* SPR[%d] INT: %8.8x REC: %8.8x\n", i, trxCPUint.spr[i], trxCPUrec.spr[i]);}
	}

	if(trxCPUint.pc != trxCPUrec.pc){pass = 0;printf("* INT PC %8.8x REC %8.8x\n", trxCPUint.pc, trxCPUrec.pc);}
	if(trxCPUint.npc != trxCPUrec.npc){pass = 0;printf("* INT NPC %8.8x REC %8.8x\n", trxCPUint.npc, trxCPUrec.npc);}

	// the following fields of the register structure are not compared:
/*	bool   exception_pending;
	bool   dec_exception;
	bool   ext_exception;
	bool   stop_exception;
	uint32 blockend;
	uint32 block_instr;
	uint32 block_startPC;
*/
	return pass;
}
// run interpreter and recompiler at the same time and compare results for easy debugging (basically it finds bugs by itself)
// interpreter is always master and recompiler is using a special mode where it is being slave, and not actually touching memory
void trx_ppc_selfcheck(void)
{
	struct TRX_PPC_Registers CPUlaststate;	
	bool done = false;
	uint32 cpu_instr_ran = 0, blocks_ran = 0, block_startaddres = 0;

	int ops=0;
	int cyclecount = 0;

	int blockpass = 1;

	memcpy(&trxCPUrec, &trxCPUint, sizeof(struct TRX_PPC_Registers));

	while (!done) 
	{
		// keep current state, then run one block on interpreter, one block on recompiler and see if they match
		memcpy(&CPUlaststate, &trxCPUint, sizeof(struct TRX_PPC_Registers));
		block_startaddres = trxCPUint.pc;

		// run only one block
		cpuslice = 0;
		cpuslice_left = cpuslice;
		// reset selfcheck memory buffers
		master_selfmemread_index = 0;
		master_selfmemwrite_index = 0;

		CPUcurrmode = &trxCPUint;
		trx_int_runcpu();
	
		// run only one block
		cpuslice = 0;
		cpuslice_left = cpuslice;

		// reset selfcheck memory buffers
		slave_selfmemread_index = 0;
		slave_selfmemwrite_index = 0;

		CPUcurrmode = &trxCPUrec;
		trx_rec_runcpu();
		// recompiler only updates npc, not pc register
		trxCPUrec.pc = trxCPUrec.npc;

		// switch back to interpreter for exception checks
		CPUcurrmode = &trxCPUint;

		// compare register state !
		blockpass = trx_ppc_register_selfcheck();
		// compare memory accesses
		if(master_selfmemread_index != slave_selfmemread_index)
		{
			printf("read access: INT %d times, REC %d times\n", master_selfmemread_index, slave_selfmemread_index);
			printf("selfcheck on memory read access failed\n");
			blockpass = 0;
		}
		if(master_selfmemwrite_index != master_selfmemwrite_index)
		{
			printf("write access: INT %d times, REC %d times\n", master_selfmemwrite_index, slave_selfmemwrite_index);
			printf("selfcheck on memory write access failed\n");
			blockpass = 0;
		}
		if(memcmp(master_selfmemwrite_data, slave_selfmemwrite_data, master_selfmemwrite_index)!=0)
		{
			int i;
			printf("write data comparison error\n");
			for(i = 0; i < master_selfmemwrite_index; i++)
			{
				printf("%s INT %8.8x REC %8.8x\n", (master_selfmemwrite_data[i] == slave_selfmemwrite_data[i])?"-":"*", master_selfmemwrite_data[i], slave_selfmemwrite_data[i]);   
			}
			blockpass = 0;
		}
		if(memcmp(master_selfmemwrite_address, slave_selfmemwrite_address, master_selfmemwrite_index)!=0)
		{
			int i;
			printf("write address comparison error\n");
			for(i = 0; i < master_selfmemwrite_index; i++)
			{
				printf("%s INT %8.8x REC %8.8x\n", (master_selfmemwrite_address[i] == slave_selfmemwrite_address[i])?"-":"*", master_selfmemwrite_address[i], slave_selfmemwrite_address[i]);   
			}
			blockpass = 0;
		}
		if(memcmp(master_selfmemread_address, slave_selfmemread_address, master_selfmemread_index)!=0)
		{
			int i;
			printf("read address comparison error\n");
			for(i = 0; i < master_selfmemread_index; i++)
			{
				printf("%s INT %8.8x REC %8.8x\n", (master_selfmemread_address[i] == slave_selfmemread_address[i])?"-":"*", master_selfmemread_address[i], slave_selfmemread_address[i]);   
			}
			blockpass = 0;
		}

		if(blockpass == 0)
		{
			int i;
			printf("LAST BLOCK:\n");
			printf("GPR:\n");
			for(i = 0; i < 32; i+=4)printf("[%2.2d] %8.8x [%2.2d] %8.8x [%2.2d] %8.8x [%2.2d] %8.8x\n", i, CPUlaststate.gpr[i], i+1, CPUlaststate.gpr[i+1], i+2, CPUlaststate.gpr[i+2], i+3, CPUlaststate.gpr[i+3]); 
			printf("FPR:\n");
			for(i = 0; i < 32; i+=4)printf("[%2.2d] %16.16I64x [%2.2d] %16.16I64x [%2.2d] %16.16I64x [%2.2d] %16.16I64x\n", i, CPUlaststate.fpr[i], i+1, CPUlaststate.fpr[i+1], i+2, CPUlaststate.fpr[i+2], i+3, CPUlaststate.fpr[i+3]); 
			printf("PS1:\n");
			for(i = 0; i < 32; i+=4)printf("[%2.2d] %16.16I64x [%2.2d] %16.16I64x [%2.2d] %16.16I64x [%2.2d] %16.16I64x\n", i, CPUlaststate.ps1[i], i+1, CPUlaststate.ps1[i+1], i+2, CPUlaststate.ps1[i+2], i+3, CPUlaststate.ps1[i+3]); 
			printf("selfcheck failed at: %x after %d blocks\n", block_startaddres, blocks_ran);
			exit(0);
		}
		blocks_ran++;

		// give progress indication
		if(blocks_ran % 10000 == 0)
		{
			char tmp[255];
			sprintf(tmp, "blocks ran: %d\n", blocks_ran);
			fprintf(stderr, tmp);
		}

		cpu_instr_ran = cpuslice + (-cpuslice_left);

		ops += cpu_instr_ran;

		// ==================================
		// housekeeping after each block. current CPU core is interpreter, so we have to manually update recompiler to keep sync

		// the time base register seems to be updated every 8 cycles or so.
		cyclecount += cpu_instr_ran;
		if(cyclecount > 8)
		{
			uint64 tb;
			tb = (CPUcurrmode->spr[PPC_TBL])+(CPUcurrmode->spr[PPC_TBH]<<32);
			tb += (cyclecount>>3); 
			cyclecount -= ((cyclecount>>3)<<3);
			CPUcurrmode->spr[PPC_TBL] = tb;
			CPUcurrmode->spr[PPC_TBH] = (tb>>32);  
			// sync cores
			trxCPUrec.spr[PPC_TBL] = CPUcurrmode->spr[PPC_TBL];
			trxCPUrec.spr[PPC_TBH] = CPUcurrmode->spr[PPC_TBH];			
		}
		if(cpu_instr_ran > CPUcurrmode->spr[PPC_DEC])
		{
			CPUcurrmode->spr[PPC_DEC] = 0xffffffff;
			CPUcurrmode->exception_pending = true;
			CPUcurrmode->dec_exception = true;
			// sync
			trxCPUrec.spr[PPC_DEC] = CPUcurrmode->spr[PPC_DEC];
			trxCPUrec.exception_pending = CPUcurrmode->exception_pending;
			trxCPUrec.dec_exception = CPUcurrmode->dec_exception;
		}
		else
		{
			CPUcurrmode->spr[PPC_DEC] -= cpu_instr_ran;
			// sync
			trxCPUrec.spr[PPC_DEC] = CPUcurrmode->spr[PPC_DEC];
		}

		if(ops >= config_instructions_per_line) 
		{
			ops = 0;
			static int exitcheck=0;
			vi_next_scanline();
			if (pi_check_interrupt_lines()) 
			{
				CPUcurrmode->exception_pending = true;
				CPUcurrmode->ext_exception = true;
				// sync
				trxCPUrec.spr[PPC_DEC] = CPUcurrmode->spr[PPC_DEC];
				trxCPUrec.exception_pending = CPUcurrmode->exception_pending;
				trxCPUrec.dec_exception = CPUcurrmode->dec_exception;
			}
			exitcheck++;
			if(exitcheck>10) 
			{
				exitcheck = 0;
				pad_read();
				done = w32_check_events();
				syslog(CPU,"@%08x (%d ops) dec: %08x lr: %08x\r", CPUcurrmode->pc, ops, CPUcurrmode->spr[PPC_DEC], CPUcurrmode->lr);
			}
		}
		
		CPUcurrmode->pc = CPUcurrmode->npc;
		if (CPUcurrmode->exception_pending)
		{
			if (CPUcurrmode->stop_exception) 
			{
				CPUcurrmode->stop_exception = false;
				if (!CPUcurrmode->dec_exception && !CPUcurrmode->ext_exception) 
				{
					CPUcurrmode->exception_pending = false;
				}
				// sync
				trxCPUrec.exception_pending = CPUcurrmode->exception_pending;
				trxCPUrec.dec_exception = CPUcurrmode->dec_exception;
				trxCPUrec.stop_exception = CPUcurrmode->stop_exception;
				break;
			}
			if (CPUcurrmode->msr & MSR_EE) 
			{
				if (CPUcurrmode->ext_exception) 
				{
					//printf("Entering exception @%8.8x\n", CPUcurrmode->pc);
					trx_ppc_exception(PPC_EXC_EXT_INT, 0);
					CPUcurrmode->pc = CPUcurrmode->npc;
					CPUcurrmode->ext_exception = false;
					if (!CPUcurrmode->dec_exception)
					{
						CPUcurrmode->exception_pending = false;
					}
					// sync
					trxCPUrec.exception_pending = CPUcurrmode->exception_pending;
					trxCPUrec.ext_exception = CPUcurrmode->ext_exception;
					trxCPUrec.spr[PPC_SRR0] = CPUcurrmode->spr[PPC_SRR0];
					trxCPUrec.spr[PPC_SRR1] = CPUcurrmode->spr[PPC_SRR1];
					trxCPUrec.msr = CPUcurrmode->msr;
					trxCPUrec.npc = CPUcurrmode->npc;
					trxCPUrec.pc = CPUcurrmode->pc;
					continue;
				}
				if (CPUcurrmode->dec_exception) {
					CPUcurrmode->dec_exception = false;
					CPUcurrmode->exception_pending = false;
					trx_ppc_exception(PPC_EXC_DEC, 0);
					CPUcurrmode->pc = CPUcurrmode->npc;
					// sync
					trxCPUrec.exception_pending = CPUcurrmode->exception_pending;
					trxCPUrec.dec_exception = CPUcurrmode->dec_exception;
					trxCPUrec.spr[PPC_SRR0] = CPUcurrmode->spr[PPC_SRR0];
					trxCPUrec.spr[PPC_SRR1] = CPUcurrmode->spr[PPC_SRR1];
					trxCPUrec.msr = CPUcurrmode->msr;
					trxCPUrec.npc = CPUcurrmode->npc;
					trxCPUrec.pc = CPUcurrmode->pc;
					continue;
				}
			}
		}
	}
}

// Single-step entry point for debug mode.
// Stepping is always done by the interpreter: when the recompiler core is the
// current one, its register state is copied over and the interpreter takes control.
// note: timer and line interrupts are not processed !
void trx_ppc_step()
{
	const bool recompiler_active = (CPUcurrmode == &trxCPUrec);

	if(recompiler_active)
	{
		// hand the recompiler state over to the interpreter
		memcpy(&trxCPUint, &trxCPUrec, sizeof(struct TRX_PPC_Registers));
		CPUcurrmode = &trxCPUint;
	}

	trx_int_step();
}

// runs until requested to exit (but will not exit emulator) or hitting breakpoint
void trx_ppc_debug_run()
{
	// recompiler expects CPUcurrmode->npc to be valid
	CPUcurrmode->npc = CPUcurrmode->pc;
	// we want to run debugger always on recompiler
	if(CPUcurrmode == &trxCPUint)
	{
		// sync up the cores!
		memcpy(&trxCPUrec, &trxCPUint, sizeof(struct TRX_PPC_Registers));
		CPUcurrmode = &trxCPUrec;
	}
	cpu_is_running = 1;
	trx_rec_run();
	cpu_is_running = 0;
}

// Main entry point for running the emulated CPU in the mode selected by config_cpumode
// (recompiler, interpreter or selfcheck). Clears all breakpoints before starting and
// keeps cpu_is_running at 1 while the selected run loop is active.
// An unknown mode prints a message and exits the process.
void trx_ppc_run()
{
	// recompiler expects CPUcurrmode->npc to be valid
	CPUcurrmode->npc = CPUcurrmode->pc;
	printf("execution started at %08x\n", CPUcurrmode->pc);

	// sync up the cores, if not running (default) recompiler
	if(config_cpumode != CPU_RECOMPILER)
	{
		memcpy(&trxCPUint, &trxCPUrec, sizeof(struct TRX_PPC_Registers));
	}
	// clear breakpoint list
	// the whole array is wiped, not just the first slot: addresses left behind the
	// terminator would become active again as soon as a new breakpoint is stored in slot 0
	memset(cpu_instr_breakpoint_list, 0, sizeof(cpu_instr_breakpoint_list));

	// we can dynamically switch between modes now
	cpu_is_running = 1;
	switch(config_cpumode)
	{
		case CPU_RECOMPILER:
			CPUcurrmode = &trxCPUrec; printf("using recompiler\n"); trx_rec_run(); break;
		case CPU_INTERPRETER: 
			CPUcurrmode = &trxCPUint; printf("using interpreter\n"); trx_int_run(); break;
		case CPU_SELFCHECKMODE: 
			CPUcurrmode = &trxCPUint; printf("using selfcheck mode\n"); trx_ppc_selfcheck();break;
		default:
			printf("unknown CPU mode \n"); exit(0);
	}
	cpu_is_running = 0;
	printf("exitting trx_ppc_run()\n");
}

//==============================================================================
// DMA and Locked cache
//

// 16 KB backing store for the locked cache; dma_engine() copies between it and gMemory
uint8 lockedcache[16*1024];
// untyped alias of lockedcache (NOTE(review): presumably for generated/assembly code - confirm)
void *pLockedcache = lockedcache;

// base pointer of emulated main memory as used by dma_engine(); not assigned in this part of the file
unsigned char *gMemory;
// defined elsewhere; not used in the visible part of this file
extern uint64 tmp_io, total_io;



// handles DMA transfer commands
//
// ?? does this actually work correctly in selfcheck mode ??
//
void dma_engine(void)
{
	if(CPUcurrmode->spr[PPC_DMAL] & DMAL_T)
	{
		uint32 dmalen, memadr, cacheadr;

		dmalen = ((CPUcurrmode->spr[PPC_DMAU] & 0x1f)<<2)+((CPUcurrmode->spr[PPC_DMAL]>>2)&3);
		if(dmalen == 0)dmalen = 128;

		memadr = CPUcurrmode->spr[PPC_DMAU] & ~0x1f;
		cacheadr = CPUcurrmode->spr[PPC_DMAL] & ~0x1f;

//		printf("[TRXCPU] DMAL_T triggered!\n");
//		printf("DMA cache address: %8.8x\n", cacheadr);
//		printf("DMA mem address: %8.8x\n", memadr);
//		printf("DMA len: %d cachelines\n", dmalen);
//		printf("DMA: %s\n", ((CPUcurrmode->spr[PPC_DMAL]>>4)&1)?"load":"store");
		// do DMA transfer
		if(((CPUcurrmode->spr[PPC_DMAL]>>4)&1)==0)
		{
			// store (cache to mem)
			memcpy(&gMemory[memadr & MEM_MASK32MB], &lockedcache[cacheadr & LOCKEDCACHE_MASK], dmalen*32);
			// invalidate texture cache for this area
			GX_CacheMarkInvalid(memadr & MEM_MASK32MB, dmalen*32);
		}
		else
		{
			// load (mem to cache)
			memcpy(&lockedcache[cacheadr & LOCKEDCACHE_MASK], &gMemory[memadr & MEM_MASK32MB], dmalen*32);
		}
		// clear T bit
		CPUcurrmode->spr[PPC_DMAL] &= ~DMAL_T;
	}
}