/*====================================================================

filename:     trx_ppc_int_fpu_ps_opcodes.cpp
project:      GCemu
created:      2004-6-18
mail:		  duddie@walla.com

Copyright (c) 2005 Duddie & Tratax

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

====================================================================*/
#include "config.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "trx_ppc_int_fpu_ps_opcodes.h"
#include "trx_ppc_cpu.h"
#include "ppc_disasm.h"
#include "cpu/trx_ppc_int.h"

// helper pointers for assembly test version of interpreter
//
// The opcode handlers store the addresses of their operand registers in
// file-scope pointers so that the inline assembly blocks can load them with a
// plain "mov edx, dword ptr <name>".
// ps0r*/ps1r* are used by the paired-single handlers (ps_msub, ps_madd, ...).
// NOTE(review): the scalar handlers use frd/fra/frb/frc/fps1/qrd/qrb, which are
// not declared in this part of the file - presumably they come from an included
// header; confirm. The qps* pointers below are not referenced in this part of
// the file.
static double *ps0rd, *ps0rs, *ps0ra, *ps0rb, *ps0rc;
static double *ps1rd, *ps1rs, *ps1ra, *ps1rb, *ps1rc;

static uint64 *qps0rd, *qps0rb;
static uint64 *qps1rd, *qps1rb;

// forward declaration; the definition is not in this part of the file
void trx_ppc_int_gekko_ill(void);

//==============================================================================
// Floating point control opcodes
//

// used
void trx_ppc_int_mtfsb1x()
{
	int crbD, n1, n2;
	
	crbD = (trxCPUint.opcode >> 21)& 0x1f;
	n1 = (trxCPUint.opcode >> 16)& 0x1f;
	n2 = (trxCPUint.opcode >> 11)& 0x1f;

	if (crbD != 1 && crbD != 2) {
		trxCPUint.fpscr |= 1<<(31-crbD);
	}
}

// used
void trx_ppc_int_mtfsb0x()
{
	int crbD, n1, n2;
	
	crbD = (trxCPUint.opcode >> 21)& 0x1f;
	n1 = (trxCPUint.opcode >> 16)& 0x1f;
	n2 = (trxCPUint.opcode >> 11)& 0x1f;

	if (crbD != 1 && crbD != 2) {
		trxCPUint.fpscr &= ~(1<<(31-crbD));
	}
}

void trx_ppc_int_mffsx()
{
	uint32 rD;

	rD = (trxCPUint.opcode >> 21)& 0x1f;
	trxCPUint.fpr[rD] = trxCPUint.fpscr;
}

void trx_ppc_int_mtfsfx()
{
	uint32 rB,fm, FM;

	fm = ((trxCPUint.opcode)>>17)&0xff;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	FM = ((fm&0x80)?0xf0000000:0)|((fm&0x40)?0x0f000000:0)|((fm&0x20)?0x00f00000:0)|((fm&0x10)?0x000f0000:0)|
	     ((fm&0x08)?0x0000f000:0)|((fm&0x04)?0x00000f00:0)|((fm&0x02)?0x000000f0:0)|((fm&0x01)?0x0000000f:0);

	trxCPUint.fpscr = (trxCPUint.fpr[rB] & FM) | (trxCPUint.fpscr & ~FM);
}

// unknown,unverified FPSCR bits not calculated correctly anyway!
void trx_ppc_int_mcrfs()
{
	uint32 crD, crS, c;
	crD = (trxCPUint.opcode >> 23)& 0x7;
	crS = (trxCPUint.opcode >> 18)& 0x7;
	crD = 7-crD;
	crS = 7-crS;
	
	// clear the bits
	trxCPUint.cr &= trx_ppc_cmp_and_mask[crD];
	// copy from fpscr field rS to cr field rD
	c = (trxCPUint.fpscr >> (crS*4))&0xf;
	trxCPUint.cr |= c<<(crD*4);
	// and clear original bits
	trxCPUint.fpscr &= trx_ppc_cmp_and_mask[crS];
}

//==============================================================================
// Floating point load/store opcodes
//

// used
void trx_ppc_int_lfs()
{
	uint32 rD, rA, EA, r;
	float f;
	sint16 imm;

	if ((trxCPUint.msr & MSR_FP) == 0) 
	{
		trx_ppc_exception(PPC_EXC_NO_FPU, 0);
		return;
	}

	rD = (trxCPUint.opcode >> 21)& 0x1f;
	rA = (trxCPUint.opcode >> 16)& 0x1f;
	imm = trxCPUint.opcode & 0xffff;

	if(rA == 0)	EA = 0 + imm;
	else EA = trxCPUint.gpr[rA] + imm;

	r = mem_read32_int(EA);
	memcpy(&f, &r, 4); // copy to float
	trx_int_ps0_double[rD] = f; // store as double

	if(trxCPUint.spr[PPC_HID2] & HID2_PSE)
	{
		trx_int_ps1_double[rD] = f;
	}	
}

// used
void trx_ppc_int_lfsx()
{
	uint32 rD, rA, rB, EA, r;
	float f;

	if ((trxCPUint.msr & MSR_FP) == 0) 
	{
		trx_ppc_exception(PPC_EXC_NO_FPU, 0);
		return;
	}

	rD = (trxCPUint.opcode >> 21)& 0x1f;
	rA = (trxCPUint.opcode >> 16)& 0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	if(rA == 0)	EA = trxCPUint.gpr[rB];
	else EA = trxCPUint.gpr[rA] + trxCPUint.gpr[rB];

	r = mem_read32_int(EA);
	memcpy(&f, &r, 4); // copy to float
	trx_int_ps0_double[rD] = f; // store as double

	if(trxCPUint.spr[PPC_HID2] & HID2_PSE)
	{
		trx_int_ps1_double[rD] = f;
	}	
}

// used
void trx_ppc_int_lfsux()
{
	uint32 rD, rA, rB, EA, r;
	float f;

	if ((trxCPUint.msr & MSR_FP) == 0) 
	{
		trx_ppc_exception(PPC_EXC_NO_FPU, 0);
		return;
	}

	rD = (trxCPUint.opcode >> 21)& 0x1f;
	rA = (trxCPUint.opcode >> 16)& 0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	EA = trxCPUint.gpr[rA] + trxCPUint.gpr[rB];

	r = mem_read32_int(EA);
	memcpy(&f, &r, 4); // copy to float
	trx_int_ps0_double[rD] = f; // store as double

	if(trxCPUint.spr[PPC_HID2] & HID2_PSE)
	{
		trx_int_ps1_double[rD] = f;
	}	
	trxCPUint.gpr[rA] = EA;
}

// used
void trx_ppc_int_lfsu()
{
	uint32 rD, rA, EA, r;
	float f;
	sint16 imm;

	if ((trxCPUint.msr & MSR_FP) == 0) 
	{
		trx_ppc_exception(PPC_EXC_NO_FPU, 0);
		return;
	}

	rD = (trxCPUint.opcode >> 21)& 0x1f;
	rA = (trxCPUint.opcode >> 16)& 0x1f;
	imm = trxCPUint.opcode & 0xffff;

	EA = trxCPUint.gpr[rA] + imm;

	r = mem_read32_int(EA);
	memcpy(&f, &r, 4); // copy to float
	trx_int_ps0_double[rD] = f; // store as double

	if(trxCPUint.spr[PPC_HID2] & HID2_PSE)
	{
		trx_int_ps1_double[rD] = f;
	}
	trxCPUint.gpr[rA] = EA;
}

// used
void trx_ppc_int_lfd()
{
	uint32 rD, rA, EA;
	sint16 imm;
	uint32 r;
	if ((trxCPUint.msr & MSR_FP) == 0) 
	{
		trx_ppc_exception(PPC_EXC_NO_FPU, 0);
		return;
	}

	rD = (trxCPUint.opcode >> 21)& 0x1f;
	rA = (trxCPUint.opcode >> 16)& 0x1f;
	imm = trxCPUint.opcode & 0xffff;

	if(rA == 0)	EA = 0 + imm;
	else EA = trxCPUint.gpr[rA] + imm;

	r = mem_read32_int(EA);
	trx_int_ps0_int[rD] = r;
	trx_int_ps0_int[rD] <<=32;
	r = mem_read32_int(EA+4);
	trx_int_ps0_int[rD] |= r; // store directly as double
}

// used
void trx_ppc_int_lfdu()
{
	uint32 rD, rA, EA;
	sint16 imm;
	uint32 r;
	if ((trxCPUint.msr & MSR_FP) == 0) 
	{
		trx_ppc_exception(PPC_EXC_NO_FPU, 0);
		return;
	}

	rD = (trxCPUint.opcode >> 21)& 0x1f;
	rA = (trxCPUint.opcode >> 16)& 0x1f;
	imm = trxCPUint.opcode & 0xffff;

	EA = trxCPUint.gpr[rA] + imm;

	r = mem_read32_int(EA);
	trx_int_ps0_int[rD] = r;
	trx_int_ps0_int[rD] <<=32;
	r = mem_read32_int(EA+4);
	trx_int_ps0_int[rD] |= r; // store directly as double
	trxCPUint.gpr[rA] = EA; // and update
}

// used
void trx_ppc_int_lfdx()
{
	uint32 rD, rA, rB, EA;
	uint32 r;

	if ((trxCPUint.msr & MSR_FP) == 0) 
	{
		trx_ppc_exception(PPC_EXC_NO_FPU, 0);
		return;
	}

	rD=((trxCPUint.opcode)>>21)&0x1f;
	rA=((trxCPUint.opcode)>>16)&0x1f;
	rB=((trxCPUint.opcode)>>11)&0x1f;

	if(rA == 0)	EA = trxCPUint.gpr[rB];
	else EA = trxCPUint.gpr[rA] + trxCPUint.gpr[rB];

	r = mem_read32_int(EA);
	trx_int_ps0_int[rD] = r;
	trx_int_ps0_int[rD] <<=32;
	r = mem_read32_int(EA+4);
	trx_int_ps0_int[rD] |= r; // store directly as double
}

// used
void trx_ppc_int_lfdux()
{
	uint32 rD, rA, rB, EA;
	uint32 r;

	if ((trxCPUint.msr & MSR_FP) == 0) 
	{
		trx_ppc_exception(PPC_EXC_NO_FPU, 0);
		return;
	}

	rD=((trxCPUint.opcode)>>21)&0x1f;
	rA=((trxCPUint.opcode)>>16)&0x1f;
	rB=((trxCPUint.opcode)>>11)&0x1f;

	EA = trxCPUint.gpr[rA] + trxCPUint.gpr[rB];

	r = mem_read32_int(EA);
	trx_int_ps0_int[rD] = r;
	trx_int_ps0_int[rD] <<=32;
	r = mem_read32_int(EA+4);
	trx_int_ps0_int[rD] |= r; // store directly as double
	trxCPUint.gpr[rA] = EA;
}

// used
void trx_ppc_int_stfd()
{
	uint32 rS, rA, EA;
	sint16 imm;

	if ((trxCPUint.msr & MSR_FP) == 0) 
	{
		trx_ppc_exception(PPC_EXC_NO_FPU, 0);
		return;
	}
	
	rS = (trxCPUint.opcode >> 21)& 0x1f;
	rA = (trxCPUint.opcode >> 16)& 0x1f;
	imm = trxCPUint.opcode & 0xffff;

	if(rA == 0)	EA = 0 + imm;
	else EA = trxCPUint.gpr[rA] + imm;

	mem_write32_int(EA, (trx_int_ps0_int[rS]>>32));
	mem_write32_int(EA+4, trx_int_ps0_int[rS]);
}

// used
// stfdu: store the raw 64 bits of ps0[rS] to rA + sign-extended displacement
// (two 32-bit writes, high word first), then write the effective address
// back to rA.
void trx_ppc_int_stfdu()
{
	uint32 rS, rA, EA;
	sint16 imm;

	// floating point disabled in the MSR: raise the exception and do nothing else
	if ((trxCPUint.msr & MSR_FP) == 0)
	{
		// second argument added: every other handler in this file passes 0 here
		trx_ppc_exception(PPC_EXC_NO_FPU, 0);
		return;
	}

	rS = (trxCPUint.opcode >> 21)& 0x1f;
	rA = (trxCPUint.opcode >> 16)& 0x1f;
	imm = trxCPUint.opcode & 0xffff;

	EA = trxCPUint.gpr[rA] + imm;

	mem_write32_int(EA, (trx_int_ps0_int[rS]>>32));
	mem_write32_int(EA+4, trx_int_ps0_int[rS]);

	// update form: base register receives the effective address
	trxCPUint.gpr[rA] = EA;
}

// used
// stfdx: store the raw 64 bits of ps0[rS] to (rA|0) + rB as two 32-bit
// writes, high word first.
void trx_ppc_int_stfdx()
{
	uint32 rS, rA, rB, EA;

	// floating point disabled in the MSR: raise the exception and do nothing else
	if ((trxCPUint.msr & MSR_FP) == 0)
	{
		// second argument added: every other handler in this file passes 0 here
		trx_ppc_exception(PPC_EXC_NO_FPU, 0);
		return;
	}

	rS = (trxCPUint.opcode >> 21)& 0x1f;
	rA = (trxCPUint.opcode >> 16)& 0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	// rA == 0 selects "no base register"
	if(rA == 0)	EA = trxCPUint.gpr[rB];
	else EA = trxCPUint.gpr[rA] + trxCPUint.gpr[rB];

	mem_write32_int(EA, (trx_int_ps0_int[rS]>>32));
	mem_write32_int(EA+4, trx_int_ps0_int[rS]);
}

// used
void trx_ppc_int_stfdux()
{
	uint32 rS, rA, rB, EA;

	if ((trxCPUint.msr & MSR_FP) == 0) 
	{
		trx_ppc_exception(PPC_EXC_NO_FPU, 0);
		return;
	}
	
	rS = (trxCPUint.opcode >> 21)& 0x1f;
	rA = (trxCPUint.opcode >> 16)& 0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	EA = trxCPUint.gpr[rA] + trxCPUint.gpr[rB];

	mem_write32_int(EA, (trx_int_ps0_int[rS]>>32));
	mem_write32_int(EA+4, trx_int_ps0_int[rS]);
	trxCPUint.gpr[rA] = EA;
}

// used
void trx_ppc_int_stfs()
{
	uint32 rS, rA, EA, s;
	float f;
	sint16 imm;

	if ((trxCPUint.msr & MSR_FP) == 0) 
	{
		trx_ppc_exception(PPC_EXC_NO_FPU, 0);
		return;
	}
	
	rS = (trxCPUint.opcode >> 21)& 0x1f;
	rA = (trxCPUint.opcode >> 16)& 0x1f;
	imm = trxCPUint.opcode & 0xffff;

	if(rA == 0)	EA = 0 + imm;
	else EA = trxCPUint.gpr[rA] + imm;

	f = trx_int_ps0_double[rS];
	memcpy(&s, &f, 4); // copy to int		
	mem_write32_int(EA, s);
}

// used
void trx_ppc_int_stfsx()
{
	uint32 rS, rA, rB, EA, s;
	float f;

	if ((trxCPUint.msr & MSR_FP) == 0) 
	{
		trx_ppc_exception(PPC_EXC_NO_FPU, 0);
		return;
	}
	
	rS = (trxCPUint.opcode >> 21)& 0x1f;
	rA = (trxCPUint.opcode >> 16)& 0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	if(rA == 0)	EA = trxCPUint.gpr[rB];
	else EA = trxCPUint.gpr[rA] + trxCPUint.gpr[rB];

	f = trx_int_ps0_double[rS];
	memcpy(&s, &f, 4); // copy to int		
	mem_write32_int(EA, s);
}

// used
void trx_ppc_int_stfsux()
{
	uint32 rS, rA, rB, EA, s;
	float f;

	if ((trxCPUint.msr & MSR_FP) == 0) 
	{
		trx_ppc_exception(PPC_EXC_NO_FPU, 0);
		return;
	}
	
	rS = (trxCPUint.opcode >> 21)& 0x1f;
	rA = (trxCPUint.opcode >> 16)& 0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	EA = trxCPUint.gpr[rA] + trxCPUint.gpr[rB];

	f = trx_int_ps0_double[rS];
	memcpy(&s, &f, 4); // copy to int		
	mem_write32_int(EA, s);

	trxCPUint.gpr[rA] = EA;
}

// used
void trx_ppc_int_stfsu()
{
	uint32 rS, rA, EA, s;
	float f;
	sint16 imm;

	if ((trxCPUint.msr & MSR_FP) == 0) 
	{
		trx_ppc_exception(PPC_EXC_NO_FPU, 0);
		return;
	}
	
	rS = (trxCPUint.opcode >> 21)& 0x1f;
	rA = (trxCPUint.opcode >> 16)& 0x1f;
	imm = trxCPUint.opcode & 0xffff;

	EA = trxCPUint.gpr[rA] + imm;

	f = trx_int_ps0_double[rS];
	memcpy(&s, &f, 4); // copy to int		
	mem_write32_int(EA, s);

	trxCPUint.gpr[rA] = EA;
}

//==============================================================================
// Floating point arithmetic opcodes
//

// tested
// fdivsx: ps0[rD] = ps0[rA] / ps0[rB], computed on the x87 stack.
// When HID2.PSE is set the same result is also written to ps1[rD].
// The handler does not check MSR_FP.
// NOTE(review): the result is stored as a 64-bit double without an explicit
// conversion to single precision in this block - confirm that is intended.
void trx_ppc_int_fdivsx()
{
	uint32 rD, rA, rB;

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
		
/*
	Reference C version of the assembly below:
	f = (trx_int_ps0_double[rA] / trx_int_ps0_double[rB]);
	trx_int_ps0_double[rD] = f;
	if(trxCPUint.spr[PPC_HID2] & HID2_PSE)
	{
		trx_int_ps1_double[rD] = f;
	}
*/

	// publish the operand addresses through the shared helper pointers so the
	// assembly can load them into edx
	frd = &trx_int_ps0_double[rD];
	fra = &trx_int_ps0_double[rA];
	frb = &trx_int_ps0_double[rB];
	fps1 = &trx_int_ps1_double[rD];

	if(trxCPUint.spr[PPC_HID2] & HID2_PSE)
	{
		// paired-single enabled: store to ps0 (fst keeps the value on the
		// stack) and then to ps1 (fstp pops it)
		_asm
		{
			mov edx, dword ptr fra
			fld qword ptr [edx]	; // st(0) = frA
			mov edx, dword ptr frb
			fdiv qword ptr [edx]; // st(0) = frA / frB
			mov edx, dword ptr frd
			fst qword ptr [edx]; // ps0[rD] = result, keep it on the stack
			mov edx, dword ptr fps1
			fstp qword ptr [edx]; // ps1[rD] = result, pop
		};
	}
	else
	{
		_asm
		{
			mov edx, dword ptr fra
			fld qword ptr [edx]	; // st(0) = frA
			mov edx, dword ptr frb
			fdiv qword ptr [edx]; // st(0) = frA / frB
			mov edx, dword ptr frd
			fstp qword ptr [edx]; // ps0[rD] = result, pop
		};
	}
}

// 32-bit scratch slot; in this part of the file it is only referenced from
// commented-out code belonging to the frspx handler.
static float float_store;
// used
// frspx: convert ps0[rB] to single precision and back to double, storing the
// result in ps0[rD]. Implemented with SSE2 scalar conversions.
// The handler does not check MSR_FP and does not write ps1[rD].
void trx_ppc_int_frspx()
{
	uint32 rD, rB;

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

//	Reference C version:
//	float f;
//	f = trx_int_ps0_double[rB];
//	trx_int_ps0_double[rD] = f;

	// publish the operand addresses through the shared helper pointers
	frd = &trx_int_ps0_double[rD];
	frb = &trx_int_ps0_double[rB];
	_asm
	{
		mov edx, dword ptr frb
		movlpd xmm0, qword ptr [edx];
		cvtsd2ss xmm0, xmm0		;// double -> single
		cvtss2sd xmm0, xmm0
		mov edx, dword ptr frd
		movlpd qword ptr [edx], xmm0
//		earlier x87 variant, going through float_store:
//		mov edx, dword ptr frb
//		fld qword ptr [edx]
//		fstp dword ptr float_store
//		fld dword ptr float_store
//		mov edx, dword ptr frd
//		fstp qword ptr [edx]
	}	
//	code-generator equivalents kept for reference:
//	gen_asm(FLD64_M, (uint32)&trx_int_ps0_double[rB]);
//	gen_asm(FSTP32_M, (uint32)&float_store);
//	gen_asm(FLD32_M, (uint32)&float_store);
//	gen_asm(FSTP64_M, (uint32)&trx_int_ps0_double[rD]);		

//	ps_load(0, rB);
//	gen_asm(CVTSD2SS_RR, 0, 0);
//	ps_store_low(0, rD);

}

// tested
// fsubsx: ps0[rD] = ps0[rA] - ps0[rB], computed on the x87 stack.
// When HID2.PSE is set the same result is also written to ps1[rD].
// The handler does not check MSR_FP.
// NOTE(review): the result is stored as a 64-bit double without an explicit
// conversion to single precision in this block - confirm that is intended.
void trx_ppc_int_fsubsx()
{
	uint32 rD, rA, rB;

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
/*		
	Reference C version of the assembly below:
	f = (trx_int_ps0_double[rA] - trx_int_ps0_double[rB]);
	trx_int_ps0_double[rD] = f;

	if(trxCPUint.spr[PPC_HID2] & HID2_PSE)
	{
		trx_int_ps1_double[rD] = f;
	}
*/
	// publish the operand addresses through the shared helper pointers
	frd = &trx_int_ps0_double[rD];
	fra = &trx_int_ps0_double[rA];
	frb = &trx_int_ps0_double[rB];
	fps1 = &trx_int_ps1_double[rD];

	if(trxCPUint.spr[PPC_HID2] & HID2_PSE)
	{
		_asm
		{
			mov edx, dword ptr fra
			fld qword ptr [edx]	; // st(0) = frA
			mov edx, dword ptr frb
			fsub qword ptr [edx]; 
			mov edx, dword ptr frd
			fst qword ptr [edx]; // ps0[rD] = result, keep it on the stack
			mov edx, dword ptr fps1
			fstp qword ptr [edx]; // ps1[rD] = result, pop
		};
	}
	else
	{
		_asm
		{
			mov edx, dword ptr fra
			fld qword ptr [edx]	; // st(0) = frA
			mov edx, dword ptr frb
			fsub qword ptr [edx]; 
			mov edx, dword ptr frd
			fstp qword ptr [edx]; // ps0[rD] = result, pop
		};
	}
}

// tested
// fmrx: copy ps0[rB] to ps0[rD] by loading it onto the x87 stack and storing
// it back. The handler does not check MSR_FP.
// NOTE(review): the value travels through the x87 unit rather than being
// copied as an integer (see the commented-out line) - confirm this preserves
// every bit pattern that matters, e.g. signalling NaNs.
void trx_ppc_int_fmrx()
{
	uint32 rD, rB;

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	//trx_int_ps0_int[rD] = trx_int_ps0_int[rB]; // maybe copy as doubles is faster ?

	// publish the operand addresses through the shared helper pointers
	frd = &trx_int_ps0_double[rD];
	frb = &trx_int_ps0_double[rB];
	_asm
	{
		mov edx, dword ptr frb
		fld qword ptr [edx]	; // st(0) = frB
		mov edx, dword ptr frd
		fstp qword ptr [edx]; // ps0[rD] = st(0), pop
	};
}

// used
void trx_ppc_int_fmulx()
{
	uint32 rD, rA, rC;

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rC = ((trxCPUint.opcode)>>6)&0x1f;

	trx_int_ps0_double[rD] = trx_int_ps0_double[rA] * trx_int_ps0_double[rC];	
}
// used
void trx_ppc_int_fsubx()
{
	uint32 rD, rA, rB;

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	trx_int_ps0_double[rD] = trx_int_ps0_double[rA] - trx_int_ps0_double[rB];	
}

// used
void trx_ppc_int_fmaddx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
	rC = ((trxCPUint.opcode)>>6)&0x1f;

	trx_int_ps0_double[rD] = (trx_int_ps0_double[rA] * trx_int_ps0_double[rC]) + trx_int_ps0_double[rB];	
}

// used
void trx_ppc_int_fmsubx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
	rC = ((trxCPUint.opcode)>>6)&0x1f;

	trx_int_ps0_double[rD] = (trx_int_ps0_double[rA] * trx_int_ps0_double[rC]) - trx_int_ps0_double[rB];	
}

// used
void trx_ppc_int_fnmsubx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
	rC = ((trxCPUint.opcode)>>6)&0x1f;

	trx_int_ps0_double[rD] = -((trx_int_ps0_double[rA] * trx_int_ps0_double[rC]) - trx_int_ps0_double[rB]);	
}

// used
void trx_ppc_int_faddx()
{
	uint32 rD, rA, rB;

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	trx_int_ps0_double[rD] = trx_int_ps0_double[rA] + trx_int_ps0_double[rB];	
}
// used
void trx_ppc_int_fdivx()
{
	uint32 rD, rA, rB;

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	trx_int_ps0_double[rD] = trx_int_ps0_double[rA] / trx_int_ps0_double[rB];	
}

// tested
// fmulsx: ps0[rD] = ps0[rA] * ps0[rC], computed on the x87 stack.
// When HID2.PSE is set the same result is also written to ps1[rD].
// The handler does not check MSR_FP.
// NOTE(review): the result is stored as a 64-bit double without an explicit
// conversion to single precision in this block - confirm that is intended.
void trx_ppc_int_fmulsx()
{
	uint32 rD, rA, rC;

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rC = ((trxCPUint.opcode)>>6)&0x1f;

/*
	Reference C version of the assembly below (older variable names):
	f = (trx_int_ps0_double[rA] * trx_int_ps0_double[rC]);
	trx_int_ps0_double[rD] = f;

	if(gCPU.hid[2] & HID2_PSE)
	{
		trx_int_ps1_double[rD] = f;
	}
*/
	// publish the operand addresses through the shared helper pointers
	frd = &trx_int_ps0_double[rD];
	fra = &trx_int_ps0_double[rA];
	frc = &trx_int_ps0_double[rC];
	fps1 = &trx_int_ps1_double[rD];

	if(trxCPUint.spr[PPC_HID2] & HID2_PSE)
	{
		_asm
		{
			mov edx, dword ptr fra
			fld qword ptr [edx]	; // st(0) = frA
			mov edx, dword ptr frc
			fmul qword ptr [edx];
			mov edx, dword ptr frd
			fst qword ptr [edx]; // ps0[rD] = result, keep it on the stack
			mov edx, dword ptr fps1
			fstp qword ptr [edx]; // ps1[rD] = result, pop
		};
	}
	else
	{
		_asm
		{
			mov edx, dword ptr fra
			fld qword ptr [edx]	; // st(0) = frA
			mov edx, dword ptr frc
			fmul qword ptr [edx];
			mov edx, dword ptr frd
			fstp qword ptr [edx]; // ps0[rD] = result, pop
		};
	}
}

// tested
// fnegx: copy the 64-bit pattern of ps0[rB] to ps0[rD] with bit 31 of the
// upper dword (the top bit of the 64-bit value) inverted. Done with integer
// instructions, so no floating point conversion takes place.
// The handler does not check MSR_FP.
void trx_ppc_int_fnegx()
{
	uint32 rD, rB;

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
		
//	Reference C version:
//	trx_int_ps0_int[rD] = trx_int_ps0_int[rB] ^ FPU_SIGN_BIT;

	// publish the operand addresses through the shared helper pointers
	qrd = &trx_int_ps0_int[rD];
	qrb = &trx_int_ps0_int[rB];
	_asm
	{
		mov edx, dword ptr qrb
		mov eax, [edx]
		mov ecx, [edx+4]
		xor ecx, 0x80000000
		mov edx, dword ptr qrd
		mov [edx], eax
		mov [edx+4], ecx
	};
}

// tested
// fabsx: copy the 64-bit pattern of ps0[rB] to ps0[rD] with bit 31 of the
// upper dword (the top bit of the 64-bit value) cleared. Done with integer
// instructions, so no floating point conversion takes place.
// The handler does not check MSR_FP.
void trx_ppc_int_fabsx()
{
	uint32 rD, rB;

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
		
	// Reference C version:
	//trx_int_ps0_int[rD] = trx_int_ps0_int[rB] & ~FPU_SIGN_BIT;

	// publish the operand addresses through the shared helper pointers
	qrd = &trx_int_ps0_int[rD];
	qrb = &trx_int_ps0_int[rB];
	_asm
	{
		mov edx, dword ptr qrb
		mov eax, [edx]
		mov ecx, [edx+4]
		and ecx, 0x7fffffff
		mov edx, dword ptr qrd
		mov [edx], eax
		mov [edx+4], ecx
	};
}

// tested
// fnabsx: copy the 64-bit pattern of ps0[rB] to ps0[rD] with bit 31 of the
// upper dword (the top bit of the 64-bit value) set. Done with integer
// instructions, so no floating point conversion takes place.
// The handler does not check MSR_FP.
void trx_ppc_int_fnabsx()
{
	uint32 rD, rB;

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
		
	// Reference C version:
	//trx_int_ps0_int[rD] = trx_int_ps0_int[rB] | FPU_SIGN_BIT;

	// publish the operand addresses through the shared helper pointers
	qrd = &trx_int_ps0_int[rD];
	qrb = &trx_int_ps0_int[rB];
	_asm
	{
		mov edx, dword ptr qrb
		mov eax, [edx]
		mov ecx, [edx+4]
		or ecx, 0x80000000
		mov edx, dword ptr qrd
		mov [edx], eax
		mov [edx+4], ecx
	};
}

// tested
// fresx: ps0[rD] = 1.0 / ps0[rB], computed with a full x87 division.
// When HID2.PSE is set the same result is also written to ps1[rD].
// The handler does not check MSR_FP.
void trx_ppc_int_fresx()
{
	uint32 rD, rB;

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

/*
	Reference C version of the assembly below (older variable names):
	f = 1.0f / fpu_as_double[frB];
	fpu_as_double[frD] = f;

	if(gCPU.hid[2] & HID2_PSE)
	{
		ps1_double[frD] = f;
	}
*/
	// publish the operand addresses through the shared helper pointers
	frd = &trx_int_ps0_double[rD];
	frb = &trx_int_ps0_double[rB];
	fps1 = &trx_int_ps1_double[rD];

	if(trxCPUint.spr[PPC_HID2] & HID2_PSE)
	{
		_asm
		{
			fld1
			mov edx, dword ptr frb
			fdiv qword ptr [edx];
			mov edx, dword ptr frd
			fst qword ptr [edx]; // ps0[rD] = result, keep it on the stack
			mov edx, dword ptr fps1
			fstp qword ptr [edx]; // ps1[rD] = result, pop
		};
	}
	else
	{
		_asm
		{
			fld1
			mov edx, dword ptr frb
			fdiv qword ptr [edx];
			mov edx, dword ptr frd
			fstp qword ptr [edx]; // ps0[rD] = result, pop
		};
	}
}

// tested
// faddsx: ps0[rD] = ps0[rA] + ps0[rB], computed on the x87 stack.
// When HID2.PSE is set the same result is also written to ps1[rD].
// The handler does not check MSR_FP.
// NOTE(review): the result is stored as a 64-bit double without an explicit
// conversion to single precision in this block - confirm that is intended.
void trx_ppc_int_faddsx()
{
	uint32 rD, rA, rB;

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
/*		
	Reference C version of the assembly below (older variable names):
	f = (trx_int_ps0_double[rA] + trx_int_ps0_double[rB]);
	trx_int_ps0_double[rD] = f;

	if(gCPU.hid[2] & HID2_PSE)
	{
		trx_int_ps1_double[rD] = f;
	}
*/
	// publish the operand addresses through the shared helper pointers
	frd = &trx_int_ps0_double[rD];
	fra = &trx_int_ps0_double[rA];
	frb = &trx_int_ps0_double[rB];
	fps1 = &trx_int_ps1_double[rD];

	if(trxCPUint.spr[PPC_HID2] & HID2_PSE)
	{
		_asm
		{
			mov edx, dword ptr fra
			fld qword ptr [edx]	; // st(0) = frA
			mov edx, dword ptr frb
			fadd qword ptr [edx];
			mov edx, dword ptr frd
			fst qword ptr [edx]; // ps0[rD] = result, keep it on the stack
			mov edx, dword ptr fps1
			fstp qword ptr [edx]; // ps1[rD] = result, pop
		};
	}
	else
	{
		_asm
		{
			mov edx, dword ptr fra
			fld qword ptr [edx]	; // st(0) = frA
			mov edx, dword ptr frb
			fadd qword ptr [edx];
			mov edx, dword ptr frd
			fstp qword ptr [edx]; // ps0[rD] = result, pop
		};
	}
}

// tested
// frsqrtex: ps0[rD] = 1.0 / sqrt(ps0[rB]), computed with a full x87 square
// root and division. The handler does not check MSR_FP and does not write
// ps1[rD].
void trx_ppc_int_frsqrtex()
{
	uint32 rD, rB;
	double one = 1.0f; // numerator, loaded by the assembly below
	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

//	Reference C version:
//	trx_int_ps0_double[rD] = 1.0f / sqrt(trx_int_ps0_double[rB]);

	// publish the operand addresses through the shared helper pointers
	frd = &trx_int_ps0_double[rD];
	frb = &trx_int_ps0_double[rB];
	_asm
	{
		fld qword ptr [one]
		mov edx, dword ptr frb
		fld qword ptr [edx]	; // st(0) = frB, st(1) = 1.0
		fsqrt
		// operand-less fdiv: presumably assembled as the divide-and-pop form,
		// leaving 1.0 / sqrt(frB) in st(0) as in the reference version
		fdiv
		mov edx, dword ptr frd
		fstp qword ptr [edx]; // ps0[rD] = result, pop
	};
}

//==============================================================================
// Floating point comparison opcodes
//

// used
// fcmpu: compare ps0[rA] with ps0[rB] and record the outcome in the CR field
// selected by opcode bits 23..25 and in the FPSCR.
//
// The comparison runs on the x87 stack; the result is picked with conditional
// moves from the constants cr_gt / cr_lt / cr_eq / cr_so (defined outside
// this file). The cmov order matters: the parity-flag move comes last, so an
// unordered result (a NaN operand) ends up as cr_so together with
// fcmpu_nan_flags, whatever the earlier moves selected.
// The handler does not check MSR_FP.
void trx_ppc_int_fcmpu()
{
	uint32 cr, rA, rB, comp, nan;

	cr = (trxCPUint.opcode >> 23)& 0x7;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	// publish the operand addresses through the shared helper pointers
	fra = &trx_int_ps0_double[rA];
	frb = &trx_int_ps0_double[rB];
	_asm
	{
		mov edx, dword ptr frb
		fld qword ptr [edx]	; // push frB
		mov edx, dword ptr fra
		fld qword ptr [edx]	; // push frA: st(0) = frA, st(1) = frB
		xor ecx, ecx
		fucomip st, st(1)
		fstp st(0)
		cmova eax, cr_gt
		cmovb eax, cr_lt
		cmove eax, cr_eq
		cmovp eax, cr_so
		cmovp ecx, fcmpu_nan_flags
		mov comp, eax
		mov nan, ecx
	}
	// nan is 0 unless the compare was unordered (ecx is cleared above)
	trxCPUint.fpscr |= nan;
	// clear FPSCR bits 12..16, then insert the new result
	trxCPUint.fpscr &= ~0x1f000;
	trxCPUint.fpscr |= (comp>>16); // compare bits are in 28-31; the shift by 16 moves them to bits 12-15

	// mirror the field number so it counts nibbles from the least significant end
	cr = 7-cr;
	trxCPUint.cr &= trx_ppc_cmp_and_mask[cr];
	trxCPUint.cr |= (comp >> (28-(cr * 4)));
}

// used
// fcmpo: compare ps0[rA] with ps0[rB] and record the outcome in the CR field
// selected by opcode bits 23..25 and in the FPSCR.
//
// The code is the same as trx_ppc_int_fcmpu except that an unordered result
// ORs fcmpo_nan_flags into the FPSCR instead of fcmpu_nan_flags. The compare
// itself also uses fucomip. The cmov order matters: the parity-flag move
// comes last, so an unordered result (a NaN operand) ends up as cr_so
// whatever the earlier moves selected.
// The handler does not check MSR_FP.
void trx_ppc_int_fcmpo()
{
	uint32 cr, rA, rB, comp, nan;

	cr = (trxCPUint.opcode >> 23)& 0x7;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	// publish the operand addresses through the shared helper pointers
	fra = &trx_int_ps0_double[rA];
	frb = &trx_int_ps0_double[rB];
	_asm
	{
		mov edx, dword ptr frb
		fld qword ptr [edx]	; // push frB
		mov edx, dword ptr fra
		fld qword ptr [edx]	; // push frA: st(0) = frA, st(1) = frB
		xor ecx, ecx
		fucomip st, st(1)
		fstp st(0)
		cmova eax, cr_gt
		cmovb eax, cr_lt
		cmove eax, cr_eq
		cmovp eax, cr_so
		cmovp ecx, fcmpo_nan_flags
		mov comp, eax
		mov nan, ecx
	}
	// nan is 0 unless the compare was unordered (ecx is cleared above)
	trxCPUint.fpscr |= nan;
	// clear FPSCR bits 12..16, then insert the new result
	trxCPUint.fpscr &= ~0x1f000;
	trxCPUint.fpscr |= (comp>>16); // compare bits are in 28-31; the shift by 16 moves them to bits 12-15

	// mirror the field number so it counts nibbles from the least significant end
	cr = 7-cr;
	trxCPUint.cr &= trx_ppc_cmp_and_mask[cr];
	trxCPUint.cr |= (comp >> (28-(cr * 4)));
}

// tested
void trx_ppc_int_fsel()
{
	uint32 rD, rA, rB, rC;

	printf("ERROR: trx_ppc_int_fsel not implemented\n");
	exit(0);

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
	rC = ((trxCPUint.opcode)>>6)&0x1f;

//	ppc_double A;
//	ppc_fpu_unpack_double(A, trxCPUint.fpr[rA]);

//	if (A.type == ppc_fpr_NaN || trx_int_ps0_double[rA] < 0.0f) 
//	{
//		trx_int_ps0_double[rD] = trx_int_ps0_double[rB];
//	} 
//	else 
//	{
//		trx_int_ps0_double[rD] = trx_int_ps0_double[rC];
//	}
}

//==============================================================================
// Floating point conversion and rounding opcodes
//
// x87 control words defined elsewhere; trx_ppc_int_fctiwzx loads
// fpucontrol_roundzero around its conversion and restores fpucontrol_default
// afterwards.
extern uint16 fpucontrol_roundzero; 
extern uint16 fpucontrol_default;

// fctiwzx: convert ps0[rB] to a 32-bit integer and store it in the first
// four bytes of trxCPUint.fpr[rD]. The x87 control word is switched to
// fpucontrol_roundzero for the conversion and set to fpucontrol_default
// afterwards.
// Only a dword is written, so the remaining bytes of fpr[rD] keep their
// previous contents. The handler does not check MSR_FP.
// NOTE(review): the destination is addressed through trxCPUint.fpr rather
// than trx_int_ps0_int as in the other handlers - presumably both name the
// same storage; confirm.
void trx_ppc_int_fctiwzx()
{
	uint32 rD, rB;

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	// publish the operand addresses through the shared helper pointers
	frb = &trx_int_ps0_double[rB];
	qrd = &trxCPUint.fpr[rD];
	_asm
	{
		fldcw fpucontrol_roundzero
		mov edx, frb
		fld qword ptr [edx]
		mov edx, qrd
		fistp dword ptr [edx]
		fldcw fpucontrol_default
	};
}

// this one should use rounding mode as specified in rounding control
// fctiwx: convert ps0[rB] to a 32-bit integer and store it in the first
// four bytes of trxCPUint.fpr[rD]. Unlike trx_ppc_int_fctiwzx, the x87
// control word is left alone, so the conversion uses whatever rounding mode
// is currently active on the host FPU (see the remark above the function
// about the guest rounding control).
// Only a dword is written, so the remaining bytes of fpr[rD] keep their
// previous contents. The handler does not check MSR_FP.
void trx_ppc_int_fctiwx()
{
	uint32 rD, rB;

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	// publish the operand addresses through the shared helper pointers
	frb = &trx_int_ps0_double[rB];
	qrd = &trxCPUint.fpr[rD];
	_asm
	{
		mov edx, frb
		fld qword ptr [edx]
		mov edx, qrd
		fistp dword ptr [edx]
	};
}

//==============================================================================
// Floating point multiply-add opcodes
//

// tested
// fmsubsx: ps0[rD] = (ps0[rA] * ps0[rC]) - ps0[rB], computed on the x87 stack.
// When HID2.PSE is set the same result is also written to ps1[rD].
// The handler does not check MSR_FP.
// NOTE(review): the result is stored as a 64-bit double without an explicit
// conversion to single precision in this block - confirm that is intended.
void trx_ppc_int_fmsubsx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
	rC = ((trxCPUint.opcode)>>6)&0x1f;

/*
	Reference C version of the assembly below (older variable names):
	double f;

	f = ((trx_int_ps0_double[rA] * trx_int_ps0_double[rC]) - trx_int_ps0_double[rB]);
	trx_int_ps0_double[rD] = f;

	if(gCPU.hid[2] & HID2_PSE)
	{
		trx_int_ps1_double[rD] = f;
	}
*/
	// publish the operand addresses through the shared helper pointers
	frd = &trx_int_ps0_double[rD];
	fra = &trx_int_ps0_double[rA];
	frb = &trx_int_ps0_double[rB];
	frc = &trx_int_ps0_double[rC];
	fps1 = &trx_int_ps1_double[rD];

	if(trxCPUint.spr[PPC_HID2] & HID2_PSE)
	{
		_asm
		{
			mov edx, dword ptr fra
			fld qword ptr [edx]	; // st(0) = frA
			mov edx, dword ptr frc
			fmul qword ptr [edx];
			mov edx, dword ptr frb
			fsub qword ptr [edx]; // st(0) = frA * frC - frB
			mov edx, dword ptr frd
			fst  qword ptr [edx]; // ps0[rD] = result, keep it on the stack
			mov edx, dword ptr fps1
			fstp qword ptr [edx]; // ps1[rD] = result, pop
		};
	}
	else
	{
		_asm
		{
			mov edx, dword ptr fra
			fld qword ptr [edx]	; // st(0) = frA
			mov edx, dword ptr frc
			fmul qword ptr [edx];
			mov edx, dword ptr frb
			fsub qword ptr [edx]; // st(0) = frA * frC - frB
			mov edx, dword ptr frd
			fstp qword ptr [edx]; // ps0[rD] = result, pop
		};
	}
}

// tested
// fnmsubsx: ps0[rD] = -((ps0[rA] * ps0[rC]) - ps0[rB]), computed on the x87
// stack. When HID2.PSE is set the same result is also written to ps1[rD].
// The handler does not check MSR_FP.
// NOTE(review): the result is stored as a 64-bit double without an explicit
// conversion to single precision in this block - confirm that is intended.
void trx_ppc_int_fnmsubsx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
	rC = ((trxCPUint.opcode)>>6)&0x1f;

/*
	Reference C version of the assembly below (older variable names):
	double f;

	f = -((trx_int_ps0_double[rA] * trx_int_ps0_double[rC]) - trx_int_ps0_double[rB]);
	trx_int_ps0_double[rD] = f;

	if(gCPU.hid[2] & HID2_PSE)
	{
		trx_int_ps1_double[rD] = f;
	}
*/
	// publish the operand addresses through the shared helper pointers
	frd = &trx_int_ps0_double[rD];
	fra = &trx_int_ps0_double[rA];
	frb = &trx_int_ps0_double[rB];
	frc = &trx_int_ps0_double[rC];
	fps1 = &trx_int_ps1_double[rD];

	if(trxCPUint.spr[PPC_HID2] & HID2_PSE)
	{
		_asm
		{
			mov edx, dword ptr fra
			fld qword ptr [edx]	; // st(0) = frA
			mov edx, dword ptr frc
			fmul qword ptr [edx];
			mov edx, dword ptr frb
			fsub qword ptr [edx]; // st(0) = frA * frC - frB
			fchs
			mov edx, dword ptr frd
			fst  qword ptr [edx]; // ps0[rD] = result, keep it on the stack
			mov edx, dword ptr fps1
			fstp qword ptr [edx]; // ps1[rD] = result, pop
		};
	}
	else
	{
		_asm
		{
			mov edx, dword ptr fra
			fld qword ptr [edx]	; // st(0) = frA
			mov edx, dword ptr frc
			fmul qword ptr [edx];
			mov edx, dword ptr frb
			fsub qword ptr [edx]; // st(0) = frA * frC - frB
			fchs
			mov edx, dword ptr frd
			fstp qword ptr [edx]; // ps0[rD] = result, pop
		};
	}
}

// tested
// fmaddsx: ps0[rD] = (ps0[rA] * ps0[rC]) + ps0[rB], computed on the x87 stack.
// When HID2.PSE is set the same result is also written to ps1[rD].
// The handler does not check MSR_FP.
// NOTE(review): the result is stored as a 64-bit double without an explicit
// conversion to single precision in this block - confirm that is intended.
void trx_ppc_int_fmaddsx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
	rC = ((trxCPUint.opcode)>>6)&0x1f;

/*
	Reference C version of the assembly below (older variable names):
	double f;

	f = (trx_int_ps0_double[rA] * trx_int_ps0_double[rC]) + trx_int_ps0_double[rB];
	trx_int_ps0_double[rD] = f;

	if(gCPU.hid[2] & HID2_PSE)
	{
		trx_int_ps1_double[rD] = f;
	}
*/

	// publish the operand addresses through the shared helper pointers
	frd = &trx_int_ps0_double[rD];
	fra = &trx_int_ps0_double[rA];
	frb = &trx_int_ps0_double[rB];
	frc = &trx_int_ps0_double[rC];
	fps1 = &trx_int_ps1_double[rD];

	if(trxCPUint.spr[PPC_HID2] & HID2_PSE)
	{
		_asm
		{
			mov edx, dword ptr fra
			fld qword ptr [edx]	; // st(0) = frA
			mov edx, dword ptr frc
			fmul qword ptr [edx];
			mov edx, dword ptr frb
			fadd qword ptr [edx]; // st(0) = frA * frC + frB
			mov edx, dword ptr frd
			fst  qword ptr [edx]; // ps0[rD] = result, keep it on the stack
			mov edx, dword ptr fps1
			fstp qword ptr [edx]; // ps1[rD] = result, pop
		};
	}
	else
	{
		_asm
		{
			mov edx, dword ptr fra
			fld qword ptr [edx]	; // st(0) = frA
			mov edx, dword ptr frc
			fmul qword ptr [edx];
			mov edx, dword ptr frb
			fadd qword ptr [edx]; // st(0) = frA * frC + frB
			mov edx, dword ptr frd
			fstp qword ptr [edx]; // ps0[rD] = result, pop
		};
	}
}

// used
// fnmaddsx: ps0[rD] = -((ps0[rA] * ps0[rC]) + ps0[rB]), computed on the x87
// stack. When HID2.PSE is set the same result is also written to ps1[rD].
// The handler does not check MSR_FP.
// NOTE(review): the result is stored as a 64-bit double without an explicit
// conversion to single precision in this block - confirm that is intended.
void trx_ppc_int_fnmaddsx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
	rC = ((trxCPUint.opcode)>>6)&0x1f;

/*
	Reference C version of the assembly below (older variable names):
	double f;

	f = -((trx_int_ps0_double[rA] * trx_int_ps0_double[rC]) + trx_int_ps0_double[rB]);
	trx_int_ps0_double[rD] = f;

	if(gCPU.hid[2] & HID2_PSE)
	{
		trx_int_ps1_double[rD] = f;
	}
*/

	// publish the operand addresses through the shared helper pointers
	frd = &trx_int_ps0_double[rD];
	fra = &trx_int_ps0_double[rA];
	frb = &trx_int_ps0_double[rB];
	frc = &trx_int_ps0_double[rC];
	fps1 = &trx_int_ps1_double[rD];

	if(trxCPUint.spr[PPC_HID2] & HID2_PSE)
	{
		_asm
		{
			mov edx, dword ptr fra
			fld qword ptr [edx]	; // st(0) = frA
			mov edx, dword ptr frc
			fmul qword ptr [edx];
			mov edx, dword ptr frb
			fadd qword ptr [edx]; // st(0) = frA * frC + frB
			fchs
			mov edx, dword ptr frd
			fst  qword ptr [edx]; // ps0[rD] = result, keep it on the stack
			mov edx, dword ptr fps1
			fstp qword ptr [edx]; // ps1[rD] = result, pop
		};
	}
	else
	{
		_asm
		{
			mov edx, dword ptr fra
			fld qword ptr [edx]	; // st(0) = frA
			mov edx, dword ptr frc
			fmul qword ptr [edx];
			mov edx, dword ptr frb
			fadd qword ptr [edx]; // st(0) = frA * frC + frB
			fchs
			mov edx, dword ptr frd
			fstp qword ptr [edx]; // ps0[rD] = result, pop
		};
	}
}

/*
void trx_log_gekko(void)
{
	char buf[64], opStr[16], parmStr[32];
	uint32 target;

//	if (!gCPU.log_gekko)
//		return;

	GekkoDisassemble(opStr, parmStr, trxCPUint.opcode, trxCPUint.pc, &target);
	sprintf(buf, "%-10s %s", opStr, parmStr);    
	printf("%.8X  %.8X  GEKKO: %s\n", trxCPUint.pc, trxCPUint.opcode, buf);
}

static void trx_ppc_gekko_ill(void);
*/

// gekko locked cache allocate
void trx_ppc_int_dcbz_l(void)
{
	uint32 EA, rA, rB;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
	
	if(rA == 0)
	{
		EA = trxCPUint.gpr[rB];
	}
	else
	{
		EA = trxCPUint.gpr[rA] + trxCPUint.gpr[rB];
	}
	if((EA < 0xe0000000) || (EA > 0xe0003ff0))
	{
		printf("[trxCPUint] ERROR !dcbz_l locking line at address: %x\n", EA);
		exit(0);
	}
	// clear the cache line
	memset(&lockedcache[EA & LOCKEDCACHE_MASK], 0, 32);
}

// used
// ps_msub: paired-single multiply-subtract, evaluated per slot on the x87
// stack:
//   ps0[rD] = (ps0[rA] * ps0[rC]) - ps0[rB]
//   ps1[rD] = (ps1[rA] * ps1[rC]) - ps1[rB]
// The ps0 result is stored before the ps1 operands are read.
// The handler checks neither MSR_FP nor HID2.PSE.
void trx_ppc_int_ps_msub(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
	rC = ((trxCPUint.opcode)>>6)&0x1f;
	
//	Reference C version of the assembly below:
//	res0 = (trx_int_ps0_double[rA] * trx_int_ps0_double[rC]) - trx_int_ps0_double[rB];
//	res1 = (trx_int_ps1_double[rA] * trx_int_ps1_double[rC]) - trx_int_ps1_double[rB];
//	trx_int_ps0_double[rD] = res0;
//	trx_int_ps1_double[rD] = res1;

	// publish the operand addresses through the file-scope helper pointers
	ps0ra = &trx_int_ps0_double[rA];
	ps0rb = &trx_int_ps0_double[rB];
	ps0rc = &trx_int_ps0_double[rC];
	ps0rd = &trx_int_ps0_double[rD];
	ps1ra = &trx_int_ps1_double[rA];
	ps1rb = &trx_int_ps1_double[rB];
	ps1rc = &trx_int_ps1_double[rC];
	ps1rd = &trx_int_ps1_double[rD];

	_asm
	{
		mov edx, dword ptr ps0ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps0rc
		fmul qword ptr [edx];
		mov edx, dword ptr ps0rb
		fsub qword ptr [edx]; 
		mov edx, dword ptr ps0rd
		fstp  qword ptr [edx]; // ps0[rD] = result, pop
		mov edx, dword ptr ps1ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps1rc
		fmul qword ptr [edx];
		mov edx, dword ptr ps1rb
		fsub qword ptr [edx]; 
		mov edx, dword ptr ps1rd
		fstp  qword ptr [edx]; // ps1[rD] = result, pop
	};
}
// used
//
// ps_madd: per-slot multiply-add.
//   ps0[rD] = ps0[rA] * ps0[rC] + ps0[rB]
//   ps1[rD] = ps1[rA] * ps1[rC] + ps1[rB]
// Register numbers are 5-bit fields of trxCPUint.opcode:
// rD = bits 21-25, rA = bits 16-20, rB = bits 11-15, rC = bits 6-10.
void trx_ppc_int_ps_madd(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
	rC = ((trxCPUint.opcode)>>6)&0x1f;
	
	// Reference C form of the computation done by the asm below:
	//res0 = (trx_int_ps0_double[rA] * trx_int_ps0_double[rC]) + trx_int_ps0_double[rB];
	//res1 = (trx_int_ps1_double[rA] * trx_int_ps1_double[rC]) + trx_int_ps1_double[rB];
	//trx_int_ps0_double[rD] = res0;
	//trx_int_ps1_double[rD] = res1;

	// Publish the operand addresses through the file-level helper pointers so
	// the inline assembly can reach them by name.
	ps0ra = &trx_int_ps0_double[rA];
	ps0rb = &trx_int_ps0_double[rB];
	ps0rc = &trx_int_ps0_double[rC];
	ps0rd = &trx_int_ps0_double[rD];
	ps1ra = &trx_int_ps1_double[rA];
	ps1rb = &trx_int_ps1_double[rB];
	ps1rc = &trx_int_ps1_double[rC];
	ps1rd = &trx_int_ps1_double[rD];

	// x87 sequence, once per slot: load A, multiply by C, add B, then
	// store-and-pop into D. Slot 0 is written before slot 1 is read.
	_asm
	{
		mov edx, dword ptr ps0ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps0rc
		fmul qword ptr [edx];
		mov edx, dword ptr ps0rb
		fadd qword ptr [edx]; 
		mov edx, dword ptr ps0rd
		fstp  qword ptr [edx]; // store as double 
		mov edx, dword ptr ps1ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps1rc
		fmul qword ptr [edx];
		mov edx, dword ptr ps1rb
		fadd qword ptr [edx]; 
		mov edx, dword ptr ps1rd
		fstp  qword ptr [edx]; // store as double 
	};
}
// not used
//
// ps_nmsub: per-slot negated multiply-subtract.
//   ps0[rD] = -(ps0[rA] * ps0[rC] - ps0[rB])
//   ps1[rD] = -(ps1[rA] * ps1[rC] - ps1[rB])
// Register numbers are 5-bit fields of trxCPUint.opcode:
// rD = bits 21-25, rA = bits 16-20, rB = bits 11-15, rC = bits 6-10.
void trx_ppc_int_ps_nmsub(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
	rC = ((trxCPUint.opcode)>>6)&0x1f;
	
	// Reference C form of the computation done by the asm below:
	//res0 = -((trx_int_ps0_double[rA] * trx_int_ps0_double[rC]) - trx_int_ps0_double[rB]);
	//res1 = -((trx_int_ps1_double[rA] * trx_int_ps1_double[rC]) - trx_int_ps1_double[rB]);
	//trx_int_ps0_double[rD] = res0;
	//trx_int_ps1_double[rD] = res1;

	// Publish the operand addresses through the file-level helper pointers so
	// the inline assembly can reach them by name.
	ps0ra = &trx_int_ps0_double[rA];
	ps0rb = &trx_int_ps0_double[rB];
	ps0rc = &trx_int_ps0_double[rC];
	ps0rd = &trx_int_ps0_double[rD];
	ps1ra = &trx_int_ps1_double[rA];
	ps1rb = &trx_int_ps1_double[rB];
	ps1rc = &trx_int_ps1_double[rC];
	ps1rd = &trx_int_ps1_double[rD];

	// x87 sequence, once per slot: load A, multiply by C, subtract B, flip
	// the sign with fchs, then store-and-pop into D.
	_asm
	{
		mov edx, dword ptr ps0ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps0rc
		fmul qword ptr [edx];
		mov edx, dword ptr ps0rb
		fsub qword ptr [edx];
		fchs
		mov edx, dword ptr ps0rd
		fstp  qword ptr [edx]; // store as double 
		mov edx, dword ptr ps1ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps1rc
		fmul qword ptr [edx];
		mov edx, dword ptr ps1rb
		fsub qword ptr [edx]; 
		fchs
		mov edx, dword ptr ps1rd
		fstp  qword ptr [edx]; // store as double 
	};
}
// not used
//
// ps_nmadd: per-slot negated multiply-add.
//   ps0[rD] = -(ps0[rA] * ps0[rC] + ps0[rB])
//   ps1[rD] = -(ps1[rA] * ps1[rC] + ps1[rB])
// Register numbers are 5-bit fields of trxCPUint.opcode:
// rD = bits 21-25, rA = bits 16-20, rB = bits 11-15, rC = bits 6-10.
void trx_ppc_int_ps_nmadd(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
	rC = ((trxCPUint.opcode)>>6)&0x1f;
	
	// Reference C form of the computation done by the asm below:
	//res0 = -((trx_int_ps0_double[rA] * trx_int_ps0_double[rC]) + trx_int_ps0_double[rB]);
	//res1 = -((trx_int_ps1_double[rA] * trx_int_ps1_double[rC]) + trx_int_ps1_double[rB]);
	//trx_int_ps0_double[rD] = res0;
	//trx_int_ps1_double[rD] = res1;

	// Publish the operand addresses through the file-level helper pointers so
	// the inline assembly can reach them by name.
	ps0ra = &trx_int_ps0_double[rA];
	ps0rb = &trx_int_ps0_double[rB];
	ps0rc = &trx_int_ps0_double[rC];
	ps0rd = &trx_int_ps0_double[rD];
	ps1ra = &trx_int_ps1_double[rA];
	ps1rb = &trx_int_ps1_double[rB];
	ps1rc = &trx_int_ps1_double[rC];
	ps1rd = &trx_int_ps1_double[rD];

	// x87 sequence, once per slot: load A, multiply by C, add B, flip the
	// sign with fchs, then store-and-pop into D.
	_asm
	{
		mov edx, dword ptr ps0ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps0rc
		fmul qword ptr [edx];
		mov edx, dword ptr ps0rb
		fadd qword ptr [edx]; 
		fchs
		mov edx, dword ptr ps0rd
		fstp  qword ptr [edx]; // store as double 
		mov edx, dword ptr ps1ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps1rc
		fmul qword ptr [edx];
		mov edx, dword ptr ps1rb
		fadd qword ptr [edx]; 
		fchs
		mov edx, dword ptr ps1rd
		fstp  qword ptr [edx]; // store as double 
	};
}
// used
//
// ps_neg: copy both slots of rB to rD with the sign bit inverted.
// rD = opcode bits 21-25, rB = opcode bits 11-15.
// The work is done on the raw 64-bit images (trx_int_ps0_int / trx_int_ps1_int)
// with integer instructions, so no floating-point operation is performed.
void trx_ppc_int_ps_neg(void)
{
	int rD, rB;
	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	// Reference C form of the computation done by the asm below:
	//trx_int_ps0_int[rD] = trx_int_ps0_int[rB] ^ FPU_SIGN_BIT;
	//trx_int_ps1_int[rD] = trx_int_ps1_int[rB] ^ FPU_SIGN_BIT;

	// Publish source/destination addresses for the inline assembly.
	qps0rd = &trx_int_ps0_int[rD];
	qps0rb = &trx_int_ps0_int[rB];
	qps1rd = &trx_int_ps1_int[rD];
	qps1rb = &trx_int_ps1_int[rB];
	// Per slot: read the low dword (eax) and high dword (ecx) of the source,
	// toggle bit 31 of the high dword (the top bit of the 64-bit value), then
	// write both dwords to the destination. Both halves are read before
	// either is written, so rD == rB is handled.
	_asm
	{
		mov edx, dword ptr qps0rb
		mov eax, [edx]
		mov ecx, [edx+4]
		xor ecx, 0x80000000
		mov edx, dword ptr qps0rd
		mov [edx], eax
		mov [edx+4], ecx
		mov edx, dword ptr qps1rb
		mov eax, [edx]
		mov ecx, [edx+4]
		xor ecx, 0x80000000
		mov edx, dword ptr qps1rd
		mov [edx], eax
		mov [edx+4], ecx
	};
}
// not used
//
// ps_nabs: copy both slots of rB to rD with the sign bit forced to 1
// (negative absolute value).
// rD = opcode bits 21-25, rB = opcode bits 11-15.
// Operates on the raw 64-bit images with integer instructions only.
void trx_ppc_int_ps_nabs(void)
{
	int rD, rB;
	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	// Reference C form of the computation done by the asm below:
	//trx_int_ps0_int[rD] = trx_int_ps0_int[rB] | FPU_SIGN_BIT;
	//trx_int_ps1_int[rD] = trx_int_ps1_int[rB] | FPU_SIGN_BIT;

	// Publish source/destination addresses for the inline assembly.
	qps0rd = &trx_int_ps0_int[rD];
	qps0rb = &trx_int_ps0_int[rB];
	qps1rd = &trx_int_ps1_int[rD];
	qps1rb = &trx_int_ps1_int[rB];
	// Per slot: read low dword (eax) and high dword (ecx), set bit 31 of the
	// high dword, then write both dwords to the destination.
	_asm
	{
		mov edx, dword ptr qps0rb
		mov eax, [edx]
		mov ecx, [edx+4]
		or ecx, 0x80000000
		mov edx, dword ptr qps0rd
		mov [edx], eax
		mov [edx+4], ecx
		mov edx, dword ptr qps1rb
		mov eax, [edx]
		mov ecx, [edx+4]
		or ecx, 0x80000000
		mov edx, dword ptr qps1rd
		mov [edx], eax
		mov [edx+4], ecx
	};
}
// used
//
// ps_abs: copy both slots of rB to rD with the sign bit cleared
// (absolute value).
// rD = opcode bits 21-25, rB = opcode bits 11-15.
// Operates on the raw 64-bit images with integer instructions only.
void trx_ppc_int_ps_abs(void)
{
	int rD, rB;
	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	// Reference C form of the computation done by the asm below:
	//trx_int_ps0_int[rD] = trx_int_ps0_int[rB] & ~FPU_SIGN_BIT;
	//trx_int_ps1_int[rD] = trx_int_ps1_int[rB] & ~FPU_SIGN_BIT;

	// Publish source/destination addresses for the inline assembly.
	qps0rd = &trx_int_ps0_int[rD];
	qps0rb = &trx_int_ps0_int[rB];
	qps1rd = &trx_int_ps1_int[rD];
	qps1rb = &trx_int_ps1_int[rB];
	// Per slot: read low dword (eax) and high dword (ecx), clear bit 31 of
	// the high dword, then write both dwords to the destination.
	_asm
	{
		mov edx, dword ptr qps0rb
		mov eax, [edx]
		mov ecx, [edx+4]
		and ecx, 0x7fffffff
		mov edx, dword ptr qps0rd
		mov [edx], eax
		mov [edx+4], ecx
		mov edx, dword ptr qps1rb
		mov eax, [edx]
		mov ecx, [edx+4]
		and ecx, 0x7fffffff
		mov edx, dword ptr qps1rd
		mov [edx], eax
		mov [edx+4], ecx
	};
}

// used
//
// ps_mr: move register.
//   ps0[rD] = ps0[rB]
//   ps1[rD] = ps1[rB]
// rD = opcode bits 21-25, rB = opcode bits 11-15.
void trx_ppc_int_ps_mr(void)
{
	int rD, rB;
	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	// Reference C form of the computation done by the asm below:
//	trx_int_ps0_double[rD] = trx_int_ps0_double[rB];
//	trx_int_ps1_double[rD] = trx_int_ps1_double[rB];

	// Publish source/destination addresses for the inline assembly.
	ps0rb = &trx_int_ps0_double[rB];
	ps0rd = &trx_int_ps0_double[rD];
	ps1rb = &trx_int_ps1_double[rB];
	ps1rd = &trx_int_ps1_double[rD];
	// Each slot is copied through the x87 stack (fld then fstp), i.e. the
	// value passes through an FPU register rather than being copied as raw
	// bits.
	_asm
	{
		mov edx, dword ptr ps0rb
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps0rd
		fstp qword ptr [edx];
		mov edx, dword ptr ps1rb
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps1rd
		fstp qword ptr [edx];
	};
}
// used
//
// ps_merge00: build rD from the two slot-0 values.
//   ps0[rD] = ps0[rA]
//   ps1[rD] = ps0[rB]
// rD = opcode bits 21-25, rA = bits 16-20, rB = bits 11-15.
void trx_ppc_int_ps_merge00(void)
{
	int rD, rA, rB;
	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	// Reference C form of the computation done by the asm below:
	//res0 = trx_int_ps0_double[rA];
	//res1 = trx_int_ps0_double[rB];
	//trx_int_ps0_double[rD] = res0;
	//trx_int_ps1_double[rD] = res1;

	// Publish source/destination addresses for the inline assembly.
	ps0ra = &trx_int_ps0_double[rA];
	ps0rb = &trx_int_ps0_double[rB];
	ps0rd = &trx_int_ps0_double[rD];
	ps1rd = &trx_int_ps1_double[rD];
	// Order matters when rD equals rB: ps0[rB] is read and stored into
	// ps1[rD] BEFORE ps0[rD] is overwritten with ps0[rA].
	_asm
	{
		mov edx, dword ptr ps0rb
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps1rd
		fstp qword ptr [edx];
		mov edx, dword ptr ps0ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps0rd
		fstp qword ptr [edx];
	};
}

// used
//
// ps_merge01: slot 0 from rA, slot 1 from rB.
//   ps0[rD] = ps0[rA]
//   ps1[rD] = ps1[rB]
// rD = opcode bits 21-25, rA = bits 16-20, rB = bits 11-15.
void trx_ppc_int_ps_merge01(void)
{
	int rD, rA, rB;
	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	// Reference C form of the computation done by the asm below:
	//res0 = trx_int_ps0_double[rA];
	//res1 = trx_int_ps1_double[rB];
	//trx_int_ps0_double[rD] = res0;
	//trx_int_ps1_double[rD] = res1;

	// Publish source/destination addresses for the inline assembly.
	ps0ra = &trx_int_ps0_double[rA];
	ps1rb = &trx_int_ps1_double[rB];
	ps0rd = &trx_int_ps0_double[rD];
	ps1rd = &trx_int_ps1_double[rD];
	// Each slot only reads and writes its own array, so the two copies are
	// done one after the other without temporaries.
	_asm
	{
		mov edx, dword ptr ps0ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps0rd
		fstp qword ptr [edx];
		mov edx, dword ptr ps1rb
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps1rd
		fstp qword ptr [edx];
	};
}
// used
//
// ps_merge10: swap-style merge.
//   ps0[rD] = ps1[rA]
//   ps1[rD] = ps0[rB]
// rD = opcode bits 21-25, rA = bits 16-20, rB = bits 11-15.
void trx_ppc_int_ps_merge10(void)
{
	int rD, rA, rB;
	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
	
	// Reference C form of the computation done by the asm below:
	//res0 = trx_int_ps1_double[rA];
	//res1 = trx_int_ps0_double[rB];
	//trx_int_ps0_double[rD] = res0;
	//trx_int_ps1_double[rD] = res1;

	// Publish source/destination addresses for the inline assembly.
	ps1ra = &trx_int_ps1_double[rA];
	ps0rb = &trx_int_ps0_double[rB];
	ps0rd = &trx_int_ps0_double[rD];
	ps1rd = &trx_int_ps1_double[rD];
	// Both sources are pushed on the x87 stack before anything is stored, so
	// rD may equal rA and/or rB. The stack pops in reverse order: ps0[rB]
	// (pushed last) goes to ps1[rD] first, then ps1[rA] goes to ps0[rD].
	_asm
	{
		mov edx, dword ptr ps1ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps0rb
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps1rd
		fstp qword ptr [edx];
		mov edx, dword ptr ps0rd
		fstp qword ptr [edx];
	};
}
// used
//
// ps_merge11: build rD from the two slot-1 values.
//   ps0[rD] = ps1[rA]
//   ps1[rD] = ps1[rB]
// rD = opcode bits 21-25, rA = bits 16-20, rB = bits 11-15.
void trx_ppc_int_ps_merge11(void)
{
	int rD, rA, rB;
	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
	
	// Reference C form of the computation done by the asm below:
	//res0 = trx_int_ps1_double[rA];
	//res1 = trx_int_ps1_double[rB];
	//trx_int_ps0_double[rD] = res0;
	//trx_int_ps1_double[rD] = res1;

	// Publish source/destination addresses for the inline assembly.
	ps1ra = &trx_int_ps1_double[rA];
	ps1rb = &trx_int_ps1_double[rB];
	ps0rd = &trx_int_ps0_double[rD];
	ps1rd = &trx_int_ps1_double[rD];
	// Both sources are pushed before any store, so rD may equal rA and/or rB.
	// ps1[rB] (pushed last) is popped into ps1[rD], then ps1[rA] into ps0[rD].
	_asm
	{
		mov edx, dword ptr ps1ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps1rb
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps1rd
		fstp qword ptr [edx];
		mov edx, dword ptr ps0rd
		fstp qword ptr [edx];
	};
}

// used
//
// ps_mul: per-slot multiply.
//   ps0[rD] = ps0[rA] * ps0[rC]
//   ps1[rD] = ps1[rA] * ps1[rC]
// rD = opcode bits 21-25, rA = bits 16-20, rC = bits 6-10.
void trx_ppc_int_ps_mul(void)
{
	int rD, rA, rC;
	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rC = ((trxCPUint.opcode)>>6)&0x1f;
	
	// Reference C form of the computation done by the asm below:
	//res0 = trx_int_ps0_double[rA] * trx_int_ps0_double[rC];
	//res1 = trx_int_ps1_double[rA] * trx_int_ps1_double[rC];
	//trx_int_ps0_double[rD] = res0;
	//trx_int_ps1_double[rD] = res1;

	// Publish the operand addresses for the inline assembly.
	ps0ra = &trx_int_ps0_double[rA];
	ps0rc = &trx_int_ps0_double[rC];
	ps0rd = &trx_int_ps0_double[rD];
	ps1ra = &trx_int_ps1_double[rA];
	ps1rc = &trx_int_ps1_double[rC];
	ps1rd = &trx_int_ps1_double[rD];

	// x87 sequence, once per slot: load A, multiply by C, store-and-pop into D.
	_asm
	{
		mov edx, dword ptr ps0ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps0rc
		fmul qword ptr [edx];
		mov edx, dword ptr ps0rd
		fstp  qword ptr [edx]; // store as double 
		mov edx, dword ptr ps1ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps1rc
		fmul qword ptr [edx];
		mov edx, dword ptr ps1rd
		fstp  qword ptr [edx]; // store as double 
	};
}
// used
//
// ps_muls0: multiply both slots of rA by slot 0 of rC.
//   ps0[rD] = ps0[rA] * ps0[rC]
//   ps1[rD] = ps1[rA] * ps0[rC]
// rD = opcode bits 21-25, rA = bits 16-20, rC = bits 6-10.
void trx_ppc_int_ps_muls0(void)
{
	int rD, rA, rC;
	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rC = ((trxCPUint.opcode)>>6)&0x1f;
	
	// Reference C form of the computation done by the asm below:
	//res0 = trx_int_ps0_double[rA] * trx_int_ps0_double[rC];
	//res1 = trx_int_ps1_double[rA] * trx_int_ps0_double[rC];
	//trx_int_ps0_double[rD] = res0;
	//trx_int_ps1_double[rD] = res1;

	// Publish the operand addresses for the inline assembly.
	ps0ra = &trx_int_ps0_double[rA];
	ps0rc = &trx_int_ps0_double[rC];
	ps0rd = &trx_int_ps0_double[rD];
	ps1ra = &trx_int_ps1_double[rA];
	ps1rd = &trx_int_ps1_double[rD];
	// Both products are computed and kept on the x87 stack before any store,
	// because ps0[rC] is needed for the second product and would be lost if
	// ps0[rD] were written first with rD == rC. The slot-1 product (top of
	// stack) is stored first, then the slot-0 product.
	_asm
	{
		mov edx, dword ptr ps0ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps0rc
		fmul qword ptr [edx];
		mov edx, dword ptr ps1ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps0rc
		fmul qword ptr [edx];
		mov edx, dword ptr ps1rd
		fstp qword ptr [edx];
		mov edx, dword ptr ps0rd
		fstp qword ptr [edx];
	};
}
// used
//
// ps_muls1: multiply both slots of rA by slot 1 of rC.
//   ps0[rD] = ps0[rA] * ps1[rC]
//   ps1[rD] = ps1[rA] * ps1[rC]
// rD = opcode bits 21-25, rA = bits 16-20, rC = bits 6-10.
void trx_ppc_int_ps_muls1(void)
{
	// res0/res1 are only referenced by the commented-out reference code below.
	double res0, res1;
	int rD, rA, rC;
	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rC = ((trxCPUint.opcode)>>6)&0x1f;
	
	// Reference C form of the computation done by the asm below:
//	res0 = trx_int_ps0_double[rA] * trx_int_ps1_double[rC];
//	res1 = trx_int_ps1_double[rA] * trx_int_ps1_double[rC];
//	trx_int_ps0_double[rD] = res0;
//	trx_int_ps1_double[rD] = res1;

	// Publish the operand addresses for the inline assembly.
	ps0ra = &trx_int_ps0_double[rA];
	ps1rc = &trx_int_ps1_double[rC];
	ps0rd = &trx_int_ps0_double[rD];
	ps1ra = &trx_int_ps1_double[rA];
	ps1rd = &trx_int_ps1_double[rD];
	// Both products are computed and kept on the x87 stack before any store.
	// The slot-1 product (top of stack) is stored first, then the slot-0
	// product.
	_asm
	{
		mov edx, dword ptr ps0ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps1rc
		fmul qword ptr [edx];
		mov edx, dword ptr ps1ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps1rc
		fmul qword ptr [edx];
		mov edx, dword ptr ps1rd
		fstp qword ptr [edx];
		mov edx, dword ptr ps0rd
		fstp qword ptr [edx];
	};
}
// used
//
// ps_madds0: multiply-add using slot 0 of rC as the common multiplier.
//   ps0[rD] = ps0[rA] * ps0[rC] + ps0[rB]
//   ps1[rD] = ps1[rA] * ps0[rC] + ps1[rB]
// rD = opcode bits 21-25, rA = bits 16-20, rB = bits 11-15, rC = bits 6-10.
void trx_ppc_int_ps_madds0(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
	rC = ((trxCPUint.opcode)>>6)&0x1f;
	
	// Reference C form of the computation done by the asm below:
//	res0 = (trx_int_ps0_double[rA] * trx_int_ps0_double[rC]) + trx_int_ps0_double[rB];
//	res1 = (trx_int_ps1_double[rA] * trx_int_ps0_double[rC]) + trx_int_ps1_double[rB];
//	trx_int_ps0_double[rD] = res0;
//	trx_int_ps1_double[rD] = res1;

	// Publish the operand addresses for the inline assembly.
	ps0ra = &trx_int_ps0_double[rA];
	ps0rb = &trx_int_ps0_double[rB];
	ps0rc = &trx_int_ps0_double[rC];
	ps0rd = &trx_int_ps0_double[rD];
	ps1ra = &trx_int_ps1_double[rA];
	ps1rb = &trx_int_ps1_double[rB];
	ps1rd = &trx_int_ps1_double[rD];
	// Both results are computed and held on the x87 stack before any store,
	// so ps0[rC] is still intact for the second product even if rD == rC.
	// The slot-1 result (top of stack) is stored first, then slot 0.
	_asm
	{
		mov edx, dword ptr ps0ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps0rc
		fmul qword ptr [edx];
		mov edx, dword ptr ps0rb
		fadd qword ptr [edx]
		mov edx, dword ptr ps1ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps0rc
		fmul qword ptr [edx];
		mov edx, dword ptr ps1rb
		fadd qword ptr [edx]
		mov edx, dword ptr ps1rd
		fstp qword ptr [edx];
		mov edx, dword ptr ps0rd
		fstp qword ptr [edx];
	};
}
// used
//
// ps_madds1: multiply-add using slot 1 of rC as the common multiplier.
//   ps0[rD] = ps0[rA] * ps1[rC] + ps0[rB]
//   ps1[rD] = ps1[rA] * ps1[rC] + ps1[rB]
// rD = opcode bits 21-25, rA = bits 16-20, rB = bits 11-15, rC = bits 6-10.
void trx_ppc_int_ps_madds1(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
	rC = ((trxCPUint.opcode)>>6)&0x1f;
	
	// Reference C form of the computation done by the asm below:
//	res0 = (trx_int_ps0_double[rA] * trx_int_ps1_double[rC]) + trx_int_ps0_double[rB];
//	res1 = (trx_int_ps1_double[rA] * trx_int_ps1_double[rC]) + trx_int_ps1_double[rB];
//	trx_int_ps0_double[rD] = res0;
//	trx_int_ps1_double[rD] = res1;

	// Publish the operand addresses for the inline assembly.
	ps0ra = &trx_int_ps0_double[rA];
	ps0rb = &trx_int_ps0_double[rB];
	ps1rc = &trx_int_ps1_double[rC];
	ps0rd = &trx_int_ps0_double[rD];
	ps1ra = &trx_int_ps1_double[rA];
	ps1rb = &trx_int_ps1_double[rB];
	ps1rd = &trx_int_ps1_double[rD];
	// Both results are computed and held on the x87 stack before any store.
	// The slot-1 result (top of stack) is stored first, then slot 0.
	_asm
	{
		mov edx, dword ptr ps0ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps1rc
		fmul qword ptr [edx];
		mov edx, dword ptr ps0rb
		fadd qword ptr [edx]
		mov edx, dword ptr ps1ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps1rc
		fmul qword ptr [edx];
		mov edx, dword ptr ps1rb
		fadd qword ptr [edx]
		mov edx, dword ptr ps1rd
		fstp qword ptr [edx];
		mov edx, dword ptr ps0rd
		fstp qword ptr [edx];
	};
}

// used
//
// ps_cmpo0: compare ps0[rA] with ps0[rB] and record the outcome in a CR field
// and in FPSCR.
// Fields of trxCPUint.opcode: cr (destination CR field) = bits 23-25,
// rA = bits 16-20, rB = bits 11-15.
// The only difference from ps_cmpu0 in this file is the NaN flag word that is
// ORed into FPSCR (fcmpo_nan_flags instead of fcmpu_nan_flags); the compare
// itself uses the same fucomip instruction.
void trx_ppc_int_ps_cmpo0(void)
{
	uint32 cr, rA, rB, comp, nan;

	cr = (trxCPUint.opcode >> 23)& 0x7;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	// fra/frb are helper pointers declared outside this block.
	fra = &trx_int_ps0_double[rA];
	frb = &trx_int_ps0_double[rB];
	// frb is pushed first so that after the second load st(0) = A, st(1) = B.
	// fucomip compares A with B, sets ZF/PF/CF and pops A; the fstp pops B.
	// ecx is zeroed BEFORE the compare so the xor does not disturb the flags.
	// The cmov chain runs in priority order, later ones overriding earlier:
	//   A > B -> cr_gt, A < B -> cr_lt, A == B -> cr_eq,
	//   unordered (PF set, which also sets CF and ZF) -> cr_so, and only in
	//   that case ecx receives fcmpo_nan_flags.
	_asm
	{
		mov edx, dword ptr frb
		fld qword ptr [edx]	; // load double
		mov edx, dword ptr fra
		fld qword ptr [edx]	; // load double
		xor ecx, ecx
		fucomip st, st(1)
		fstp st(0)
		cmova eax, cr_gt
		cmovb eax, cr_lt
		cmove eax, cr_eq
		cmovp eax, cr_so
		cmovp ecx, fcmpo_nan_flags
		mov comp, eax
		mov nan, ecx
	}
	// FPSCR update: OR in the NaN flags, clear bits 12-16 (mask 0x1f000), then
	// insert the compare result.
	trxCPUint.fpscr |= nan;
	trxCPUint.fpscr &= ~0x1f000;
	trxCPUint.fpscr |= (comp>>16); // compare bits are in 28-31 and should go to 14-12

	// Convert the field number to an index counted from the low end of the
	// word, mask the old field out, and shift the result down into place.
	cr = 7-cr;
	trxCPUint.cr &= trx_ppc_cmp_and_mask[cr];
	trxCPUint.cr |= (comp >> (28-(cr * 4)));
}

// ps_cmpu0: compare ps0[rA] with ps0[rB] and record the outcome in a CR field
// and in FPSCR.
// Fields of trxCPUint.opcode: cr (destination CR field) = bits 23-25,
// rA = bits 16-20, rB = bits 11-15.
// On an unordered result fcmpu_nan_flags is ORed into FPSCR.
void trx_ppc_int_ps_cmpu0(void)
{
	uint32 cr, rA, rB, comp, nan;

	cr = (trxCPUint.opcode >> 23)& 0x7;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	// fra/frb are helper pointers declared outside this block.
	fra = &trx_int_ps0_double[rA];
	frb = &trx_int_ps0_double[rB];
	// After the two loads st(0) = A and st(1) = B. fucomip compares A with B,
	// sets ZF/PF/CF and pops A; the fstp pops B. ecx is zeroed before the
	// compare so the xor does not disturb the flags.
	// cmov chain (later overrides earlier): A > B -> cr_gt, A < B -> cr_lt,
	// A == B -> cr_eq, unordered -> cr_so plus ecx = fcmpu_nan_flags.
	_asm
	{
		mov edx, dword ptr frb
		fld qword ptr [edx]	; // load double
		mov edx, dword ptr fra
		fld qword ptr [edx]	; // load double
		xor ecx, ecx
		fucomip st, st(1)
		fstp st(0)
		cmova eax, cr_gt
		cmovb eax, cr_lt
		cmove eax, cr_eq
		cmovp eax, cr_so
		cmovp ecx, fcmpu_nan_flags
		mov comp, eax
		mov nan, ecx
	}
	// FPSCR update: OR in the NaN flags, clear bits 12-16 (mask 0x1f000), then
	// insert the compare result.
	trxCPUint.fpscr |= nan;
	trxCPUint.fpscr &= ~0x1f000;
	trxCPUint.fpscr |= (comp>>16); // compare bits are in 28-31 and should go to 14-12

	// Mask the old CR field out and shift the result down into place.
	cr = 7-cr;
	trxCPUint.cr &= trx_ppc_cmp_and_mask[cr];
	trxCPUint.cr |= (comp >> (28-(cr * 4)));
}

// ps_cmpo1: compare ps1[rA] with ps1[rB] and record the outcome in a CR field
// and in FPSCR.
// Fields of trxCPUint.opcode: cr (destination CR field) = bits 23-25,
// rA = bits 16-20, rB = bits 11-15.
// On an unordered result fcmpo_nan_flags is ORed into FPSCR.
void trx_ppc_int_ps_cmpo1(void)
{
	uint32 cr, rA, rB, comp, nan;

	cr = (trxCPUint.opcode >> 23)& 0x7;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	// fra/frb are helper pointers declared outside this block.
	fra = &trx_int_ps1_double[rA];
	frb = &trx_int_ps1_double[rB];
	// After the two loads st(0) = A and st(1) = B. fucomip compares A with B,
	// sets ZF/PF/CF and pops A; the fstp pops B. ecx is zeroed before the
	// compare so the xor does not disturb the flags.
	// cmov chain (later overrides earlier): A > B -> cr_gt, A < B -> cr_lt,
	// A == B -> cr_eq, unordered -> cr_so plus ecx = fcmpo_nan_flags.
	_asm
	{
		mov edx, dword ptr frb
		fld qword ptr [edx]	; // load double
		mov edx, dword ptr fra
		fld qword ptr [edx]	; // load double
		xor ecx, ecx
		fucomip st, st(1)
		fstp st(0)
		cmova eax, cr_gt
		cmovb eax, cr_lt
		cmove eax, cr_eq
		cmovp eax, cr_so
		cmovp ecx, fcmpo_nan_flags
		mov comp, eax
		mov nan, ecx
	}
	// FPSCR update: OR in the NaN flags, clear bits 12-16 (mask 0x1f000), then
	// insert the compare result.
	trxCPUint.fpscr |= nan;
	trxCPUint.fpscr &= ~0x1f000;
	trxCPUint.fpscr |= (comp>>16); // compare bits are in 28-31 and should go to 14-12

	// Mask the old CR field out and shift the result down into place.
	cr = 7-cr;
	trxCPUint.cr &= trx_ppc_cmp_and_mask[cr];
	trxCPUint.cr |= (comp >> (28-(cr * 4)));
}

// ps_cmpu1: compare ps1[rA] with ps1[rB] and record the outcome in a CR field
// and in FPSCR.
// Fields of trxCPUint.opcode: cr (destination CR field) = bits 23-25,
// rA = bits 16-20, rB = bits 11-15.
// On an unordered result fcmpu_nan_flags is ORed into FPSCR.
void trx_ppc_int_ps_cmpu1(void)
{
	uint32 cr, rA, rB, comp, nan;

	cr = (trxCPUint.opcode >> 23)& 0x7;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;

	// fra/frb are helper pointers declared outside this block.
	fra = &trx_int_ps1_double[rA];
	frb = &trx_int_ps1_double[rB];
	// After the two loads st(0) = A and st(1) = B. fucomip compares A with B,
	// sets ZF/PF/CF and pops A; the fstp pops B. ecx is zeroed before the
	// compare so the xor does not disturb the flags.
	// cmov chain (later overrides earlier): A > B -> cr_gt, A < B -> cr_lt,
	// A == B -> cr_eq, unordered -> cr_so plus ecx = fcmpu_nan_flags.
	_asm
	{
		mov edx, dword ptr frb
		fld qword ptr [edx]	; // load double
		mov edx, dword ptr fra
		fld qword ptr [edx]	; // load double
		xor ecx, ecx
		fucomip st, st(1)
		fstp st(0)
		cmova eax, cr_gt
		cmovb eax, cr_lt
		cmove eax, cr_eq
		cmovp eax, cr_so
		cmovp ecx, fcmpu_nan_flags
		mov comp, eax
		mov nan, ecx
	}
	// FPSCR update: OR in the NaN flags, clear bits 12-16 (mask 0x1f000), then
	// insert the compare result.
	trxCPUint.fpscr |= nan;
	trxCPUint.fpscr &= ~0x1f000;
	trxCPUint.fpscr |= (comp>>16); // compare bits are in 28-31 and should go to 14-12

	// Mask the old CR field out and shift the result down into place.
	cr = 7-cr;
	trxCPUint.cr &= trx_ppc_cmp_and_mask[cr];
	trxCPUint.cr |= (comp >> (28-(cr * 4)));
}

// used
//
// ps_sum0: cross-slot sum placed in slot 0.
//   ps0[rD] = ps0[rA] + ps1[rB]
//   ps1[rD] = ps1[rC]
// rD = opcode bits 21-25, rA = bits 16-20, rB = bits 11-15, rC = bits 6-10.
void trx_ppc_int_ps_sum0(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
	rC = ((trxCPUint.opcode)>>6)&0x1f;
	
	// Reference C form of the computation done by the asm below:
	//res0 = trx_int_ps0_double[rA] + trx_int_ps1_double[rB];
	//res1 = trx_int_ps1_double[rC];
	//trx_int_ps0_double[rD] = res0;
	//trx_int_ps1_double[rD] = res1;

	// Publish the operand addresses for the inline assembly.
	ps0ra = &trx_int_ps0_double[rA];
	ps1rb = &trx_int_ps1_double[rB];
	ps0rd = &trx_int_ps0_double[rD];
	ps1rc = &trx_int_ps1_double[rC];
	ps1rd = &trx_int_ps1_double[rD];
	// The sum is stored to ps0[rD] first; ps1[rB] has already been consumed
	// by then, and the following copy only touches the ps1 array, so no
	// temporaries are needed.
	_asm
	{
		mov edx, dword ptr ps0ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps1rb
		fadd qword ptr [edx];
		mov edx, dword ptr ps0rd
		fstp qword ptr [edx];
		mov edx, dword ptr ps1rc
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps1rd
		fstp qword ptr [edx];
	};
}
// used
//
// ps_sum1: cross-slot sum placed in slot 1.
//   ps0[rD] = ps0[rC]
//   ps1[rD] = ps0[rA] + ps1[rB]
// rD = opcode bits 21-25, rA = bits 16-20, rB = bits 11-15, rC = bits 6-10.
void trx_ppc_int_ps_sum1(void)
{
	// Stack temporaries written and read by the inline assembly.
	double res0, res1;
	int rD, rA, rB, rC;
	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
	rC = ((trxCPUint.opcode)>>6)&0x1f;
	
	// Reference C form of the computation done by the asm below:
	//res0 = trx_int_ps0_double[rC];
	//res1 = trx_int_ps0_double[rA] + trx_int_ps1_double[rB];
	//trx_int_ps0_double[rD] = res0;
	//trx_int_ps1_double[rD] = res1;

	// Publish the operand addresses for the inline assembly.
	ps0ra = &trx_int_ps0_double[rA];
	ps1rb = &trx_int_ps1_double[rB];
	ps0rd = &trx_int_ps0_double[rD];
	ps0rc = &trx_int_ps0_double[rC];
	ps1rd = &trx_int_ps1_double[rD];
	// Both results are first parked in res0/res1 and only then copied to rD,
	// so ps0[rA] is read before ps0[rD] is overwritten (relevant when
	// rD == rA).
	_asm
	{
		mov edx, dword ptr ps0rc
		fld qword ptr [edx]	; 
		fstp  qword ptr [res0]; // store as double 
		mov edx, dword ptr ps0ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps1rb
		fadd qword ptr [edx];
		fstp  qword ptr [res1]; // store as double 
		fld qword ptr [res0]
		mov edx, dword ptr ps0rd
		fstp qword ptr [edx];
		fld qword ptr [res1]
		mov edx, dword ptr ps1rd
		fstp qword ptr [edx];
	};
}
// used
//
// ps_div: per-slot divide.
//   ps0[rD] = ps0[rA] / ps0[rB]
//   ps1[rD] = ps1[rA] / ps1[rB]
// rD = opcode bits 21-25, rA = bits 16-20, rB = bits 11-15.
// No explicit check for a zero divisor is made here; the result is whatever
// the x87 fdiv instruction produces.
void trx_ppc_int_ps_div(void)
{
	int rD, rA, rB;
	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
	
	// Reference C form of the computation done by the asm below:
	//res0 = trx_int_ps0_double[rA] / trx_int_ps0_double[rB];
	//res1 = trx_int_ps1_double[rA] / trx_int_ps1_double[rB];
	//trx_int_ps0_double[rD] = res0;
	//trx_int_ps1_double[rD] = res1;

	// Publish the operand addresses for the inline assembly.
	ps0ra = &trx_int_ps0_double[rA];
	ps0rb = &trx_int_ps0_double[rB];
	ps0rd = &trx_int_ps0_double[rD];
	ps1ra = &trx_int_ps1_double[rA];
	ps1rb = &trx_int_ps1_double[rB];
	ps1rd = &trx_int_ps1_double[rD];
	// x87 sequence, once per slot: load A, divide by B, store-and-pop into D.
	_asm
	{
		mov edx, dword ptr ps0ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps0rb
		fdiv qword ptr [edx];
		mov edx, dword ptr ps0rd
		fstp  qword ptr [edx]; // store as double 
		mov edx, dword ptr ps1ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps1rb
		fdiv qword ptr [edx];
		mov edx, dword ptr ps1rd
		fstp  qword ptr [edx]; // store as double 
	};
}
// not used
//
// ps_sub: per-slot subtract.
//   ps0[rD] = ps0[rA] - ps0[rB]
//   ps1[rD] = ps1[rA] - ps1[rB]
// rD = opcode bits 21-25, rA = bits 16-20, rB = bits 11-15.
void trx_ppc_int_ps_sub(void)
{
	int rD, rA, rB;
	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
	
	// Reference C form of the computation done by the asm below:
//	res0 = trx_int_ps0_double[rA] - trx_int_ps0_double[rB];
//	res1 = trx_int_ps1_double[rA] - trx_int_ps1_double[rB];
//	trx_int_ps0_double[rD] = res0;
//	trx_int_ps1_double[rD] = res1;

	// Publish the operand addresses for the inline assembly.
	ps0ra = &trx_int_ps0_double[rA];
	ps0rb = &trx_int_ps0_double[rB];
	ps0rd = &trx_int_ps0_double[rD];
	ps1ra = &trx_int_ps1_double[rA];
	ps1rb = &trx_int_ps1_double[rB];
	ps1rd = &trx_int_ps1_double[rD];
	// x87 sequence, once per slot: load A, subtract B, store-and-pop into D.
	_asm
	{
		mov edx, dword ptr ps0ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps0rb
		fsub qword ptr [edx];
		mov edx, dword ptr ps0rd
		fstp  qword ptr [edx]; // store as double 
		mov edx, dword ptr ps1ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps1rb
		fsub qword ptr [edx];
		mov edx, dword ptr ps1rd
		fstp  qword ptr [edx]; // store as double 
	};
}
// not used
//
// ps_add: per-slot add.
//   ps0[rD] = ps0[rA] + ps0[rB]
//   ps1[rD] = ps1[rA] + ps1[rB]
// rD = opcode bits 21-25, rA = bits 16-20, rB = bits 11-15.
void trx_ppc_int_ps_add(void)
{
	int rD, rA, rB;
	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
	
	// Reference C form of the computation done by the asm below:
//	res0 = trx_int_ps0_double[rA] + trx_int_ps0_double[rB];
//	res1 = trx_int_ps1_double[rA] + trx_int_ps1_double[rB];
//	trx_int_ps0_double[rD] = res0;
//	trx_int_ps1_double[rD] = res1;

	// Publish the operand addresses for the inline assembly.
	ps0ra = &trx_int_ps0_double[rA];
	ps0rb = &trx_int_ps0_double[rB];
	ps0rd = &trx_int_ps0_double[rD];
	ps1ra = &trx_int_ps1_double[rA];
	ps1rb = &trx_int_ps1_double[rB];
	ps1rd = &trx_int_ps1_double[rD];
	// x87 sequence, once per slot: load A, add B, store-and-pop into D.
	_asm
	{
		mov edx, dword ptr ps0ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps0rb
		fadd qword ptr [edx];
		mov edx, dword ptr ps0rd
		fstp  qword ptr [edx]; // store as double 
		mov edx, dword ptr ps1ra
		fld qword ptr [edx]	; 
		mov edx, dword ptr ps1rb
		fadd qword ptr [edx];
		mov edx, dword ptr ps1rd
		fstp  qword ptr [edx]; // store as double 
	};
}

// used
//
// ps_sel: per-slot select.
//   psN[rD] = (psN[rA] < 0.0 or psN[rA] is NaN) ? psN[rB] : psN[rC]
// rD = opcode bits 21-25, rA = bits 16-20, rB = bits 11-15, rC = bits 6-10.
//
// x87 stack layout used by both asm blocks, after fucomip has popped A:
//   st(0) = 0.0 (scratch, receives the selected value)
//   st(1) = B
//   st(2) = C
// The "A >= 0" case must therefore pick st(2). It previously picked st(1),
// which made the instruction return rB for every input.
void trx_ppc_int_ps_sel(void)
{
	int rD, rA, rB, rC;

	rD = ((trxCPUint.opcode)>>21)&0x1f;
	rA = ((trxCPUint.opcode)>>16)&0x1f;
	rB = ((trxCPUint.opcode)>>11)&0x1f;
	rC = ((trxCPUint.opcode)>>6)&0x1f;
	
	//if(trx_rec_ps0_double[rA] < 0.0f || NAN)	trx_rec_ps0_double[rD] = trx_rec_ps0_double[rB];
	//else trx_rec_ps0_double[rD] = trx_rec_ps0_double[rC];
	//if(trx_rec_ps1_double[rA] < 0.0f || NAN)	trx_rec_ps1_double[rD] = trx_rec_ps1_double[rB];
	//else trx_rec_ps1_double[rD] = trx_rec_ps1_double[rC];

	// slot 0 (fra/frb/frc/frd are helper pointers declared outside this block)
	fra = &trx_int_ps0_double[rA];
	frb = &trx_int_ps0_double[rB];
	frc = &trx_int_ps0_double[rC];
	frd = &trx_int_ps0_double[rD];
	_asm
	{
		mov edx, dword ptr frc
		fld qword ptr [edx]	; // load C
		mov edx, dword ptr frb
		fld qword ptr [edx]	; // load B
		fldz
		mov edx, dword ptr fra
		fld qword ptr [edx]	; // load A
		fucomip st, st(1)	; // compare A with 0.0, pop A
		fcmovb st(0), st(1)	; // A < 0   -> B
		fcmovu st(0), st(1)	; // A is NaN -> B
		fcmovnb st(0), st(2)	; // A >= 0  -> C
		mov edx, dword ptr frd
		fstp qword ptr [edx]	; // store result
		fstp st(0)
		fstp st(0)
	}

	// slot 1
	fra = &trx_int_ps1_double[rA];
	frb = &trx_int_ps1_double[rB];
	frc = &trx_int_ps1_double[rC];
	frd = &trx_int_ps1_double[rD];
	_asm
	{
		mov edx, dword ptr frc
		fld qword ptr [edx]	; // load C
		mov edx, dword ptr frb
		fld qword ptr [edx]	; // load B
		fldz
		mov edx, dword ptr fra
		fld qword ptr [edx]	; // load A
		fucomip st, st(1)	; // compare A with 0.0, pop A
		fcmovb st(0), st(1)	; // A < 0   -> B
		fcmovu st(0), st(1)	; // A is NaN -> B
		fcmovnb st(0), st(2)	; // A >= 0  -> C
		mov edx, dword ptr frd
		fstp qword ptr [edx]	; // store result
		fstp st(0)
		fstp st(0)
	}
}

// used
//
// ps_rsqrte: per-slot reciprocal square root, computed at full double
// precision with sqrt() from <math.h>.
//   ps0[rD] = 1 / sqrt(ps0[rB])
//   ps1[rD] = 1 / sqrt(ps1[rB])
// rD = opcode bits 21-25, rB = opcode bits 11-15.
void trx_ppc_int_ps_rsqrte(void)
{
	const int dst = ((trxCPUint.opcode)>>21)&0x1f;
	const int src = ((trxCPUint.opcode)>>11)&0x1f;

	// Evaluate both slots before writing either one back.
	const double slot0 = 1.0f / sqrt(trx_int_ps0_double[src]);
	const double slot1 = 1.0f / sqrt(trx_int_ps1_double[src]);

	trx_int_ps0_double[dst] = slot0;
	trx_int_ps1_double[dst] = slot1;
}

void trx_ppc_int_gekko(void)
{
//	log_gekko();
	// fpu unavailable check already done

	switch((trxCPUint.opcode >> 1) & 0x1f)
	{
        case 0:
			switch((trxCPUint.opcode >> 6) & 3)
			{
				case 0: trx_ppc_int_ps_cmpu0(); break;
				case 1: trx_ppc_int_ps_cmpo0(); break;
				case 2: trx_ppc_int_ps_cmpu1(); break;
				case 3: trx_ppc_int_ps_cmpo1(); break;
				default:
					printf("cmp unhandled!: %d\n",(trxCPUint.opcode >> 6) & 3);
					exit(0);
					break;
			}
			break;
        case 6:
	        if(trxCPUint.opcode & 0x40) trx_ppc_int_psq_lux();    /* psq_lux */
	        else trx_ppc_int_psq_lx();              /* psq_lx */
            break;
        case 7:
	        if(trxCPUint.opcode & 0x40) trx_ppc_int_psq_stux();   /* psq_stux */
            else trx_ppc_int_psq_stx();             /* psq_stx */
            break;
        case 8:
          switch((trxCPUint.opcode >> 6) & 0x1f)
          {
            case 1:
				trx_ppc_int_ps_neg();
				break;
	          case 2:
				trx_ppc_int_ps_mr();
				break;
	          case 4:
				trx_ppc_int_ps_nabs();
				break;
              case 8:
				trx_ppc_int_ps_abs();
				break;
            default:
              trx_ppc_int_gekko_ill();
			  break;
          }
          break;
        case 10:
			trx_ppc_int_ps_sum0();
			break;
        case 11:
			trx_ppc_int_ps_sum1();
			break;
       case 12:
			trx_ppc_int_ps_muls0();
			break;
        case 13:
			trx_ppc_int_ps_muls1();
			break;
        case 14:
			trx_ppc_int_ps_madds0();
			break;
        case 15:
			trx_ppc_int_ps_madds1();
			break;
        case 16:
          switch((trxCPUint.opcode >> 6) & 0x1f)
          {
            case 16:
              trx_ppc_int_ps_merge00();            /* ps_merge00 */
              break;
            case 17:
              trx_ppc_int_ps_merge01();            /* ps_merge01 */
              break;
            case 18:
              trx_ppc_int_ps_merge10();            /* ps_merge10 */
              break;
            case 19:
              trx_ppc_int_ps_merge11();            /* ps_merge11 */
              break;
            default:
				trx_ppc_int_gekko_ill();
				break;
          }
          break;
        case 18:
			trx_ppc_int_ps_div();
			break;
        case 20:
			trx_ppc_int_ps_sub();
			break;
        case 21:
			trx_ppc_int_ps_add();
			break;
		case 22:
			trx_ppc_int_dcbz_l();
			break;
        case 23:
			trx_ppc_int_ps_sel();
			break;
//        case 24:
//          trx_ppc_ps_db();                     /* ps_res */
//          break;
	      case 25:
			trx_ppc_int_ps_mul();
			break;
        case 26:
	        trx_ppc_int_ps_rsqrte(); 
			break;
        case 28:
			trx_ppc_int_ps_msub();
            break;
        case 29:
			trx_ppc_int_ps_madd();
            break;
        case 30:
			trx_ppc_int_ps_nmsub();
            break;
        case 31:
			trx_ppc_int_ps_nmadd();
            break;
        default:
			trx_ppc_int_gekko_ill();
			break;
      }
}
//
// Paired Single Load and Store Instructions
// ------------------------------------------

// dequantization factor, indexed by a 6-bit scale value (0..63).
// Reading the index as a 6-bit two's-complement number s, entry[index] = 2^-s:
//   index  0..31 -> 2^0, 2^-1, ... 2^-31
//   index 32..63 -> 2^32, 2^31, ... 2^1
// All shifts are done on unsigned operands: a signed "1 << 31" overflows int
// and made the two 2^31 based entries (index 31 and 33) negative.
static const float dq_factor[] =
{
	1.0/(1U <<  0),
	1.0/(1U <<  1),
	1.0/(1U <<  2),
	1.0/(1U <<  3),
	1.0/(1U <<  4),
	1.0/(1U <<  5),
	1.0/(1U <<  6),
	1.0/(1U <<  7),
	1.0/(1U <<  8),
	1.0/(1U <<  9),
	1.0/(1U << 10),
	1.0/(1U << 11),
	1.0/(1U << 12),
	1.0/(1U << 13),
	1.0/(1U << 14),
	1.0/(1U << 15),
	1.0/(1U << 16),
	1.0/(1U << 17),
	1.0/(1U << 18),
	1.0/(1U << 19),
	1.0/(1U << 20),
	1.0/(1U << 21),
	1.0/(1U << 22),
	1.0/(1U << 23),
	1.0/(1U << 24),
	1.0/(1U << 25),
	1.0/(1U << 26),
	1.0/(1U << 27),
	1.0/(1U << 28),
	1.0/(1U << 29),
	1.0/(1U << 30),
	1.0/(1U << 31),

	(1ULL << 32),
	(1U << 31),
	(1U << 30),
	(1U << 29),
	(1U << 28),
	(1U << 27),
	(1U << 26),
	(1U << 25),
	(1U << 24),
	(1U << 23),
	(1U << 22),
	(1U << 21),
	(1U << 20),
	(1U << 19),
	(1U << 18),
	(1U << 17),
	(1U << 16),
	(1U << 15),
	(1U << 14),
	(1U << 13),
	(1U << 12),
	(1U << 11),
	(1U << 10),
	(1U <<  9),
	(1U <<  8),
	(1U <<  7),
	(1U <<  6),
	(1U <<  5),
	(1U <<  4),
	(1U <<  3),
	(1U <<  2),
	(1U <<  1),
};

// Load one (optionally quantized) element from emulated memory and store it,
// converted to double, in *dest.
//   EA     - base address of the paired load
//   offset - element index (0 or 1 in the callers visible here); scaled by the
//            element size of the selected type
//   type   - 0: 32-bit float, 4: unsigned 8-bit, 5: unsigned 16-bit,
//            6: signed 8-bit, 7: signed 16-bit
//   scale  - index into dq_factor; ignored for type 0
// Any other type prints an error and terminates the process.
static void trx_int_gekko_qload(uint32 EA, uint32 offset, uint32 type, uint32 scale, double *dest)
{
	double raw;

	switch (type)
	{
		case 0: // direct to float: reinterpret the 32 bits, no scaling
		{
			uint32 bits = mem_read32_int(EA+(offset<<2));
			float as_float;
			memcpy(&as_float, &bits,4);
			*dest = as_float;
//			printf("QLOAD0@%x: load %x result %f scale %f\n", EA+(offset<<2), bits, *dest, 0.0f);
			return;
		}
		case 4: // unsigned byte to float (scaled)
			raw = (uint8)mem_read8_int(EA+offset);
			break;
		case 5: // unsigned word to float (scaled)
			raw = (uint16)mem_read16_int(EA+(offset<<1));
			break;
		case 6: // signed byte to float (scaled)
			raw = (sint8)mem_read8_int(EA+offset);
			break;
		case 7: // signed word to float (scaled)
			raw = (sint16)mem_read16_int(EA+(offset<<1));
			break;
		default: // can't happen !
			printf("[trxCPUint: error! default in qload\n");
			exit(0);
			return;
	}

	// integer types share the same dequantization step
	*dest = raw;
	*dest *= dq_factor[scale];
}

// quantization factor, indexed by a 6-bit scale value (0..63).
// Reading the index as a 6-bit two's-complement number s, entry[index] = 2^s:
//   index  0..31 -> 2^0, 2^1, ... 2^31
//   index 32..63 -> 2^-32, 2^-31, ... 2^-1
// All shifts are done on unsigned operands: a signed "1 << 31" overflows int
// and made the two 2^31 based entries (index 31 and 33) negative.
static const float q_factor[] =
{
	(1U <<  0),
	(1U <<  1),
	(1U <<  2),
	(1U <<  3),
	(1U <<  4),
	(1U <<  5),
	(1U <<  6),
	(1U <<  7),
	(1U <<  8),
	(1U <<  9),

	(1U << 10),
	(1U << 11),
	(1U << 12),
	(1U << 13),
	(1U << 14),
	(1U << 15),
	(1U << 16),
	(1U << 17),
	(1U << 18),
	(1U << 19),

	(1U << 20),
	(1U << 21),
	(1U << 22),
	(1U << 23),
	(1U << 24),
	(1U << 25),
	(1U << 26),
	(1U << 27),
	(1U << 28),
	(1U << 29),
	(1U << 30),
	(1U << 31),

	1.0/(1ULL << 32),
	1.0/(1U << 31),
	1.0/(1U << 30),

	1.0/(1U << 29),
	1.0/(1U << 28),
	1.0/(1U << 27),
	1.0/(1U << 26),
	1.0/(1U << 25),
	1.0/(1U << 24),
	1.0/(1U << 23),
	1.0/(1U << 22),
	1.0/(1U << 21),
	1.0/(1U << 20),

	1.0/(1U << 19),
	1.0/(1U << 18),
	1.0/(1U << 17),
	1.0/(1U << 16),
	1.0/(1U << 15),
	1.0/(1U << 14),
	1.0/(1U << 13),
	1.0/(1U << 12),
	1.0/(1U << 11),
	1.0/(1U << 10),

	1.0/(1U <<  9),
	1.0/(1U <<  8),
	1.0/(1U <<  7),
	1.0/(1U <<  6),
	1.0/(1U <<  5),
	1.0/(1U <<  4),
	1.0/(1U <<  3),
	1.0/(1U <<  2),
	1.0/(1U <<  1),
};

static void trx_int_gekko_qstore(uint32 EA, uint32 offset, uint32 type, uint32 scale, double *dest)
{
	double d;
	float f;
	sint32 s32;
	uint32 u32;
	sint8 s8;
	sint16 s16;
	uint8 u8;
	uint16 u16;

	d = *dest;
	switch (type)
	{
		case 0: // store float direct
			f = d;
			memcpy(&u32, &f,4); 
			mem_write32_int(EA+(offset<<2), u32);
//			printf("QSTORE0@%x: store %x in %f scale %f\n",EA+(offset<<2), u32, d, 0.0f);
			break;
		case 4: // float to unsigned byte (scaled)
			f = d * q_factor[scale];
			s32 = f;
			if(s32 > 255)s32 = 255;
			if(s32 < 0)s32 = 0;
			u8 = s32;
			mem_write8_int(EA+offset, u8);
//			printf("QSTORE4@%x: store %x in %f scale %f\n", EA+offset, u8, d, q_factor[scale]);
			break;
		case 5: // float to unsigned word (scaled)
			f = d * q_factor[scale];
			s32 = f;
			if(s32 > 65535)s32 = 65535;
			if(s32 < 0)s32 = 0;
			u16 = s32;
			mem_write16_int(EA+(offset<<1), u16);
//			printf("QSTORE5@%x: store %x in %f scale %f\n", u16, d, q_factor[scale]);
			break;
		case 6: // float to signed byte (scaled)
			f = d * q_factor[scale];
			s32 = f;
			if(s32 > 127)s32 = 127;
			if(s32 < -128)s32 = -128;
			s8 = s32;
			mem_write8_int(EA+offset, s8);
//			printf("QSTORE6@%x: store %x in %f scale %f\n", EA+offset, s8, d, q_factor[scale]);
			break;
		case 7: // float to signed word (scaled)
			f = d * q_factor[scale];
			s32 = f;
			if(s32 > 32767)s32 = 32767;
			if(s32 < -32768)s32 = -32768;
			s16 = s32;
			mem_write16_int(EA+(offset<<1), s16);
//			printf("QSTORE7@%x: store %x in %f scale %f\n", s16, d, q_factor[scale]);
			break;
		default: // cant happen !
			printf("[trxCPUint: error! default in qload\n");
			exit(0);
			break;
	}
}

// psq_l handler.
// Effective address: sign-extended 12-bit displacement from the opcode, plus
// gpr[rA] when rA is non-zero.
// Load type and scale are taken from bits 16-18 and 24-29 of
// spr[PPC_GQR0 + i], with i from opcode bits 12-14.
// ps0 of rD is always loaded (element 0). When the w flag (opcode bit 15) is
// clear ps1 is loaded from element 1, otherwise ps1 is set to 1.0.
void trx_ppc_int_psq_l(void)
{
	const uint32 op = trxCPUint.opcode;
	const uint32 fd = (op >> 21) & 0x1f;
	const uint32 ra = (op >> 16) & 0x1f;
	const uint32 single = (op >> 15) & 1;
	const uint32 gqr = trxCPUint.spr[PPC_GQR0 + ((op >> 12) & 0x7)];
	const uint32 ldType = (gqr >> 16) & 7;
	const uint32 ldScale = (gqr >> 24) & 0x3f;

	// the displacement is only 12 bits wide; bit 11 is its sign
	const uint32 disp = op & 0xfff;
	uint32 EA = (disp & 0x800) ? (disp | 0xfffff000) : disp;

	if (ra != 0)
	{
		EA += trxCPUint.gpr[ra];
	}

	trx_int_gekko_qload(EA, 0, ldType, ldScale, &trx_int_ps0_double[fd]);
	if (single)
	{
		trx_int_ps1_double[fd] = 1.0f;
	}
	else
	{
		trx_int_gekko_qload(EA, 1, ldType, ldScale, &trx_int_ps1_double[fd]);
	}
}

// psq_lu handler (load with update).
// Effective address: gpr[rA] plus the sign-extended 12-bit displacement;
// gpr[rA] is added unconditionally, including for rA == 0.
// Load type and scale come from bits 16-18 and 24-29 of spr[PPC_GQR0 + i].
// After the load(s) the effective address is written back to gpr[rA].
void trx_ppc_int_psq_lu(void)
{
	const uint32 op = trxCPUint.opcode;
	const uint32 fd = (op >> 21) & 0x1f;
	const uint32 ra = (op >> 16) & 0x1f;
	const uint32 single = (op >> 15) & 1;
	const uint32 gqr = trxCPUint.spr[PPC_GQR0 + ((op >> 12) & 0x7)];
	const uint32 ldType = (gqr >> 16) & 7;
	const uint32 ldScale = (gqr >> 24) & 0x3f;

	// 12-bit displacement, sign bit is bit 11
	const uint32 disp = op & 0xfff;
	const uint32 EA = ((disp & 0x800) ? (disp | 0xfffff000) : disp) + trxCPUint.gpr[ra];

	// ps0 always comes from memory; ps1 only when the w flag is clear
	trx_int_gekko_qload(EA, 0, ldType, ldScale, &trx_int_ps0_double[fd]);
	if (single)
	{
		trx_int_ps1_double[fd] = 1.0f;
	}
	else
	{
		trx_int_gekko_qload(EA, 1, ldType, ldScale, &trx_int_ps1_double[fd]);
	}

	trxCPUint.gpr[ra] = EA;
}

// psq_lx handler (indexed load).
// Effective address: gpr[rB], plus gpr[rA] when rA is non-zero.
// In this form the GQR index i sits in opcode bits 7-9 and the w flag in
// bit 10. Load type and scale come from bits 16-18 and 24-29 of
// spr[PPC_GQR0 + i].
void trx_ppc_int_psq_lx(void)
{
	const uint32 op = trxCPUint.opcode;
	const uint32 fd = (op >> 21) & 0x1f;
	const uint32 ra = (op >> 16) & 0x1f;
	const uint32 rb = (op >> 11) & 0x1f;
	const uint32 single = (op >> 10) & 1;
	const uint32 gqr = trxCPUint.spr[PPC_GQR0 + ((op >> 7) & 0x7)];
	const uint32 ldType = (gqr >> 16) & 7;
	const uint32 ldScale = (gqr >> 24) & 0x3f;

	uint32 EA = trxCPUint.gpr[rb];
	if (ra != 0)
	{
		EA += trxCPUint.gpr[ra];
	}

	// ps0 always comes from memory; ps1 is forced to 1.0 when w is set
	trx_int_gekko_qload(EA, 0, ldType, ldScale, &trx_int_ps0_double[fd]);
	if (single)
	{
		trx_int_ps1_double[fd] = 1.0f;
	}
	else
	{
		trx_int_gekko_qload(EA, 1, ldType, ldScale, &trx_int_ps1_double[fd]);
	}
}
// psq_lux handler (indexed load with update).
// Effective address: gpr[rA] + gpr[rB], with no special case for rA == 0.
// GQR index i is in opcode bits 7-9, the w flag in bit 10. Load type and
// scale come from bits 16-18 and 24-29 of spr[PPC_GQR0 + i].
// After the load(s) the effective address is written back to gpr[rA].
void trx_ppc_int_psq_lux(void)
{
	const uint32 op = trxCPUint.opcode;
	const uint32 fd = (op >> 21) & 0x1f;
	const uint32 ra = (op >> 16) & 0x1f;
	const uint32 rb = (op >> 11) & 0x1f;
	const uint32 single = (op >> 10) & 1;
	const uint32 gqr = trxCPUint.spr[PPC_GQR0 + ((op >> 7) & 0x7)];
	const uint32 ldType = (gqr >> 16) & 7;
	const uint32 ldScale = (gqr >> 24) & 0x3f;

	const uint32 EA = trxCPUint.gpr[ra] + trxCPUint.gpr[rb];

	trx_int_gekko_qload(EA, 0, ldType, ldScale, &trx_int_ps0_double[fd]);
	if (single)
	{
		// single-element form: ps1 is not read from memory
		trx_int_ps1_double[fd] = 1.0f;
	}
	else
	{
		trx_int_gekko_qload(EA, 1, ldType, ldScale, &trx_int_ps1_double[fd]);
	}

	trxCPUint.gpr[ra] = EA;
}

// psq_st handler.
// Effective address: sign-extended 12-bit displacement from the opcode, plus
// gpr[rA] when rA is non-zero.
// Store type and scale are taken from bits 0-2 and 8-13 of
// spr[PPC_GQR0 + i], with i from opcode bits 12-14.
// ps0 of rS is always stored as element 0; ps1 is stored as element 1 only
// when the w flag (opcode bit 15) is clear.
void trx_ppc_int_psq_st(void)
{
	const uint32 op = trxCPUint.opcode;
	const uint32 fs = (op >> 21) & 0x1f;
	const uint32 ra = (op >> 16) & 0x1f;
	const uint32 single = (op >> 15) & 1;
	const uint32 gqr = trxCPUint.spr[PPC_GQR0 + ((op >> 12) & 0x7)];
	const uint32 stType = gqr & 7;
	const uint32 stScale = (gqr >> 8) & 0x3f;

	// the displacement is only 12 bits wide; bit 11 is its sign
	const uint32 disp = op & 0xfff;
	uint32 EA = (disp & 0x800) ? (disp | 0xfffff000) : disp;

	if (ra != 0)
	{
		EA += trxCPUint.gpr[ra];
	}

	trx_int_gekko_qstore(EA, 0, stType, stScale, &trx_int_ps0_double[fs]);
	if (!single)
	{
		trx_int_gekko_qstore(EA, 1, stType, stScale, &trx_int_ps1_double[fs]);
	}
}

// psq_stu handler (store with update).
// Effective address: sign-extended 12-bit displacement, plus gpr[rA] when rA
// is non-zero. Store type and scale come from bits 0-2 and 8-13 of
// spr[PPC_GQR0 + i].
// After the store(s) the effective address is written back to gpr[rA]; this
// write-back also happens for rA == 0, where the address is the bare
// displacement.
void trx_ppc_int_psq_stu(void)
{
	const uint32 op = trxCPUint.opcode;
	const uint32 fs = (op >> 21) & 0x1f;
	const uint32 ra = (op >> 16) & 0x1f;
	const uint32 single = (op >> 15) & 1;
	const uint32 gqr = trxCPUint.spr[PPC_GQR0 + ((op >> 12) & 0x7)];
	const uint32 stType = gqr & 7;
	const uint32 stScale = (gqr >> 8) & 0x3f;

	// 12-bit displacement, sign bit is bit 11
	const uint32 disp = op & 0xfff;
	uint32 EA = (disp & 0x800) ? (disp | 0xfffff000) : disp;

	if (ra != 0)
	{
		EA += trxCPUint.gpr[ra];
	}

	// ps0 is always written; ps1 only in the two-element form
	trx_int_gekko_qstore(EA, 0, stType, stScale, &trx_int_ps0_double[fs]);
	if (!single)
	{
		trx_int_gekko_qstore(EA, 1, stType, stScale, &trx_int_ps1_double[fs]);
	}

	trxCPUint.gpr[ra] = EA;
}

// psq_stx handler (indexed store).
// Effective address: gpr[rB], plus gpr[rA] when rA is non-zero.
// GQR index i is in opcode bits 7-9, the w flag in bit 10. Store type and
// scale come from bits 0-2 and 8-13 of spr[PPC_GQR0 + i].
void trx_ppc_int_psq_stx(void)
{
	const uint32 op = trxCPUint.opcode;
	const uint32 fs = (op >> 21) & 0x1f;
	const uint32 ra = (op >> 16) & 0x1f;
	const uint32 rb = (op >> 11) & 0x1f;
	const uint32 single = (op >> 10) & 1;
	const uint32 gqr = trxCPUint.spr[PPC_GQR0 + ((op >> 7) & 0x7)];
	const uint32 stType = gqr & 7;
	const uint32 stScale = (gqr >> 8) & 0x3f;

	uint32 EA = trxCPUint.gpr[rb];
	if (ra != 0)
	{
		EA += trxCPUint.gpr[ra];
	}

	// ps0 is always written; ps1 only when w is clear
	trx_int_gekko_qstore(EA, 0, stType, stScale, &trx_int_ps0_double[fs]);
	if (!single)
	{
		trx_int_gekko_qstore(EA, 1, stType, stScale, &trx_int_ps1_double[fs]);
	}
}

// psq_stux handler (indexed store with update).
// Effective address: gpr[rB] + gpr[rA], with no special case for rA == 0.
// GQR index i is in opcode bits 7-9, the w flag in bit 10. Store type and
// scale come from bits 0-2 and 8-13 of spr[PPC_GQR0 + i].
// After the store(s) the effective address is written back to gpr[rA].
void trx_ppc_int_psq_stux(void)
{
	const uint32 op = trxCPUint.opcode;
	const uint32 fs = (op >> 21) & 0x1f;
	const uint32 ra = (op >> 16) & 0x1f;
	const uint32 rb = (op >> 11) & 0x1f;
	const uint32 single = (op >> 10) & 1;
	const uint32 gqr = trxCPUint.spr[PPC_GQR0 + ((op >> 7) & 0x7)];
	const uint32 stType = gqr & 7;
	const uint32 stScale = (gqr >> 8) & 0x3f;

	const uint32 EA = trxCPUint.gpr[rb] + trxCPUint.gpr[ra];

	trx_int_gekko_qstore(EA, 0, stType, stScale, &trx_int_ps0_double[fs]);
	if (!single)
	{
		trx_int_gekko_qstore(EA, 1, stType, stScale, &trx_int_ps1_double[fs]);
	}

	trxCPUint.gpr[ra] = EA;
}

// Reports an unknown Gekko opcode and terminates the emulator.
// Prints bits 1-5 of the opcode, then the program counter, the raw opcode
// word and the disassembly produced by GekkoDisassemble, and exits with a
// failure status so the abort is visible to whatever launched the process.
// NOTE(review): the prototype near the top of this file declares this
// function without 'static'; confirm which linkage is intended.
static void trx_ppc_int_gekko_ill(void)
{
	char opStr[16], parmStr[32];
	uint32 target;

	printf("GEKKO UNKNOWN OPCODE: %d\n", (trxCPUint.opcode >> 1) & 0x1f);

	GekkoDisassemble(opStr, parmStr, trxCPUint.opcode, trxCPUint.pc, &target);
	// mnemonic is left-aligned in a 10 character column, followed by its operands;
	// printed directly instead of going through a fixed-size scratch buffer
	printf("%.8X  %.8X  %-10s %s\n", trxCPUint.pc, trxCPUint.opcode, opStr, parmStr);
	exit(EXIT_FAILURE);
}