/*====================================================================

filename:     trx_ppc_rec_fpu_ps_opcodes_sse2.cpp
project:      GCemu
created:      2004-6-18
mail:		  duddie@walla.com

Copyright (c) 2005 Duddie & Tratax

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

====================================================================*/
/*
 *	Tratax PowerPC recompiler
 *
 *  SSE2 Opcode implementations for floating point and paired single
 *
 *  2004-8-19 started SSE2 implementation
 *
 *
 * General idea:
 * Use 8 XMM registers in packed double mode to emulate FPU and PS opcodes 
 * Use XMM0 and XMM1 for calculations and the others for register cache
 * Use single double opcodes to emulate FPU opcodes, packed double opcodes for ps opcodes
 *
 *
 */
#include <windows.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "cpu/trx_ppc_cpu.h"
#include "trx_ppc_rec.h"
#include "asm_x86.h"
#include "trx_ppc_rec_fpu_ps_opcodes_sse2.h"
#include "ppc_disasm.h"

#pragma warning (disable:4311)

// The value 1.0 held as a double (the float literal is widened on initialisation).
static double double_one = 1.0f;
// AND masks indexed by nibble position: entry n has every bit set except
// bits [4n+3 .. 4n], i.e. it clears one 4-bit field of a 32-bit word.
static uint32 trx_ppc_cmp_and_mask[8] = 
{
	0xfffffff0,
	0xffffff0f,
	0xfffff0ff,
	0xffff0fff,
	0xfff0ffff,
	0xff0fffff,
	0xf0ffffff,
	0x0fffffff
};

// Backing store for the 32 paired-single registers; 16-byte aligned so the
// generated MOVAPD loads/stores in this file can address entries directly.
__declspec(align(16)) paired_single trx_ps[32];

// Sign bit of the first 64-bit element only; second element is zero.
__declspec(align(16)) paired_uint64 fnegmask = { (0x8000000000000000ULL), 0ULL};
// Everything except the sign bit of the first element; second element is all ones.
__declspec(align(16)) paired_uint64 fabsmask = { (0x7fffffffffffffffULL), (0xffffffffffffffffULL)};

// Sign bit of both 64-bit elements.
__declspec(align(16)) paired_uint64 psnegmask = { (0x8000000000000000ULL), (0x8000000000000000ULL)};
// Everything except the sign bit, for both 64-bit elements.
__declspec(align(16)) paired_uint64 psabsmask = { (0x7fffffffffffffffULL), (0x7fffffffffffffffULL)};
// Both elements initialised to 1.0.
__declspec(align(16)) paired_single ps_one = { 1.0f, 1.0f};

// Scratch variables whose addresses are baked into generated code.
static float scale_value;
// NOTE(review): the initialiser is an integer constant converted to float
// (about 4.29e9), not a bit pattern - confirm this is the intended sentinel.
static float scale_watch=0xffffffff;
// 64-bit staging slot used to move values between x87/integer code and XMM0.
static double double_store;
// 32-bit staging slot used for single<->double conversion through the x87 unit.
static float float_store;
static float float_store_low, float_store_high;
static uint32 int32_store;

static void trx_ppc_gekko_ill(void)
{
	printf("GEKKO UNKNOWN OPCODE: %d\n", (trxCPUrec.opcode >> 1) & 0x1f);
	char buf[64], opStr[16], parmStr[32];
	uint32 target;

	GekkoDisassemble(opStr, parmStr, trxCPUrec.opcode, trxCPUrec.pc, &target);
	sprintf(buf, "%-10s %s", opStr, parmStr);    
	printf("%.8X  %.8X  %s\n", trxCPUrec.pc, trxCPUrec.opcode, buf);
	exit(0);
}

void trx_ppc_gen_sse2_gekko(void)
{
	switch((trxCPUrec.opcode >> 1) & 0x1f)
	{
        case 0:
			switch((trxCPUrec.opcode >> 6) & 3)
			{
				case 0: trx_ppc_gen_sse2_ps_cmpu0(); break;
				case 1: trx_ppc_gen_sse2_ps_cmpo0(); break;
				case 2: trx_ppc_gen_sse2_ps_cmpu1(); break;
				case 3: trx_ppc_gen_sse2_ps_cmpo1(); break;
				default:
					printf("cmp unhandled!: %d\n",(trxCPUrec.opcode >> 6) & 3);
					exit(0);
					break;
			}
			break;
        case 6:  
			if(trxCPUrec.opcode & 0x40) trx_ppc_gen_sse2_psq_lux();
	        else trx_ppc_gen_sse2_psq_lx();   
	        break;
        case 7:
			if(trxCPUrec.opcode & 0x40) trx_ppc_gen_sse2_psq_stux();
            else trx_ppc_gen_sse2_psq_stx();
            break;
        case 8:
          switch((trxCPUrec.opcode >> 6) & 0x1f)
          {
            case 1:	trx_ppc_gen_sse2_ps_neg(); break;
	        case 2:	trx_ppc_gen_sse2_ps_mr(); break;
	        case 4:	trx_ppc_gen_sse2_ps_nabs(); break;
            case 8:	trx_ppc_gen_sse2_ps_abs(); break;
            default:trx_ppc_gekko_ill(); break;
          }
          break;
        case 10: trx_ppc_gen_sse2_ps_sum0();	break;
        case 11: trx_ppc_gen_sse2_ps_sum1();	break;
        case 12: trx_ppc_gen_sse2_ps_muls0();break;
        case 13: trx_ppc_gen_sse2_ps_muls1();break;
        case 14: trx_ppc_gen_sse2_ps_madds0();break;
        case 15: trx_ppc_gen_sse2_ps_madds1();break;
        case 16:
		  switch((trxCPUrec.opcode >> 6) & 0x1f)
          {
            case 16: trx_ppc_gen_sse2_ps_merge00(); break;
            case 17: trx_ppc_gen_sse2_ps_merge01(); break;
            case 18: trx_ppc_gen_sse2_ps_merge10(); break;
            case 19: trx_ppc_gen_sse2_ps_merge11(); break;
            default: trx_ppc_gekko_ill(); break;
          }
          break;
        case 18: trx_ppc_gen_sse2_ps_div(); break;
        case 20: trx_ppc_gen_sse2_ps_sub(); break;
        case 21: trx_ppc_gen_sse2_ps_add(); break;
		case 22: trx_ppc_gen_sse2_dcbz_l(); break;
        case 23: trx_ppc_gen_sse2_ps_sel(); break;
        case 25: trx_ppc_gen_sse2_ps_mul(); break;
        case 26: trx_ppc_gen_sse2_ps_rsqrte(); break;
        case 28: trx_ppc_gen_sse2_ps_msub(); break;
        case 29: trx_ppc_gen_sse2_ps_madd(); break;
        case 30: trx_ppc_gen_sse2_ps_nmsub(); break;
        case 31: trx_ppc_gen_sse2_ps_nmadd(); break;
        default: trx_ppc_gekko_ill();break;
      }
}

// main opcode 59
void trx_ppc_gen_sse2_group59()
{
	uint32 ext = ((trxCPUrec.opcode>>1)&0x3ff);

	switch (ext & 0x1f) 
	{
		case 18: trx_ppc_gen_sse2_fdivsx(); break;
		case 20: trx_ppc_gen_sse2_fsubsx(); break;
		case 21: trx_ppc_gen_sse2_faddsx(); break;
//		case 22: ppc_alt_gen_fsqrtsx(); return;
		case 24: trx_ppc_gen_sse2_fresx(); return;
		case 25: trx_ppc_gen_sse2_fmulsx(); break;
		case 28: trx_ppc_gen_sse2_fmsubsx(); return;
		case 29: trx_ppc_gen_sse2_fmaddsx(); break;
		case 30: trx_ppc_gen_sse2_fnmsubsx(); break;
		case 31: trx_ppc_gen_sse2_fnmaddsx(); break;
		default:
			printf("[trxCPUrec] unhandled op59: %d\n", ext & 0x1f);
			exit(0);
			break;
	}
}
// main opcode 63, floating point instructions
void trx_ppc_gen_sse2_group63()
{
	uint32 ext = ((trxCPUrec.opcode>>1)&0x3ff);
	if (ext & 16) 
	{
		switch (ext & 0x1f) 
		{
		case 18: trx_ppc_gen_sse2_fdivx(); return;
		case 20: trx_ppc_gen_sse2_fsubx(); return;
		case 21: trx_ppc_gen_sse2_faddx(); return;
		//case 22: ppc_alt_fsqrtx(); return;
		case 23: trx_ppc_gen_sse2_fselx(); return;
		case 25: trx_ppc_gen_sse2_fmulx(); return;
		case 26: trx_ppc_gen_sse2_frsqrtex(); return;
		case 28: trx_ppc_gen_sse2_fmsubx(); return;
		case 29: trx_ppc_gen_sse2_fmaddx(); return;
		case 30: trx_ppc_gen_sse2_fnmsubx(); return;
		case 31: trx_ppc_gen_sse2_fnmaddx(); return;
		}
		printf("[trxCPUrec] unhandled op63: %d\n", ext & 0x1f);
		exit(0);
	} else {
		switch (ext) 
		{
		case 0: trx_ppc_gen_sse2_fcmpu(); return;
		case 12: trx_ppc_gen_sse2_frspx(); return;
		case 14: trx_ppc_gen_sse2_fctiwx(); return;
		case 15: trx_ppc_gen_sse2_fctiwzx(); return;
		//--
		case 32: trx_ppc_gen_sse2_fcmpo(); return;
		case 38: trx_ppc_gen_sse2_mtfsb1x(); return;
		case 40: trx_ppc_gen_sse2_fnegx(); return;
		case 64: trx_ppc_gen_sse2_mcrfs(); return;
		case 70: trx_ppc_gen_sse2_mtfsb0x(); return;
		case 72: trx_ppc_gen_sse2_fmrx(); return;
		//case 134: ppc_opc_mtfsfix(); return;
		case 136: trx_ppc_gen_sse2_fnabsx(); return;
		case 264: trx_ppc_gen_sse2_fabsx(); return;
		case 583: trx_ppc_gen_sse2_mffsx(); return;
		case 711: trx_ppc_gen_sse2_mtfsfx(); return;
		}
		printf("[trxCPUrec] unhandled op63: %d\n", ext);
		exit(0);
	}
}

//==============================================================================
// misc functions
//

// return 0 if system does not support sse2, 1 if it does
//
// Probes by executing a single SSE2 instruction inside a structured-exception
// guard. Only an illegal-instruction exception is reported as "no SSE2"; any
// other exception caught by the handler falls through and 1 is returned.
int has_sse2(void)
{
	_try
	{
		// SSE2 probe instruction; it overwrites xmm0 with zero
		_asm xorpd xmm0, xmm0
	}
	_except(EXCEPTION_EXECUTE_HANDLER)
	{
		// the CPU rejected the instruction: SSE2 is unavailable
		if(_exception_code()==STATUS_ILLEGAL_INSTRUCTION)return 0;
	}
	return 1;
}

//==============================================================================
// SSE register caching functions
//
// Set to 1 to bypass the register cache: every psc_* access then goes
// straight to the trx_ps[] backing store.
#define PSC_DISABLED 0

// First XMM register used as a cache slot, and how many consecutive
// registers are used (XMM2..XMM7 with the values below).
#define PSC_CACHEREGSTART 2
#define PSC_CACHEREGCOUNT 6

// Map a cache slot index (0..PSC_CACHEREGCOUNT-1) to its XMM register number.
// Argument and expansion are fully parenthesised so the macro is safe inside
// larger expressions and with compound arguments.
#define PSC_CACHEMAP(x) ((x)+PSC_CACHEREGSTART)
// 2 states that a PS register can be in: held only in trx_ps[] memory, or
// currently mirrored in one of the cache XMM registers
enum e_psc_sse_state
{
	PSC_UNCACHED,
	PSC_CACHED
};

// 2 states that cache registers can be in: free, or assigned to a PS register
enum e_psc_sse_cachestate
{
	PSC_NOTUSED,
	PSC_USED,
};

// whether the cached copy differs from memory and must be written back
enum e_psc_sse_modified
{
	PSC_UNMODIFIED,
	PSC_MODIFIED
};

// keeps state of PS registers (e_psc_sse_state), indexed by PS register number
uint32 psc_state[32];
// keeps track if the register was modified (had store operation executed)
uint32 psc_modified[32];
// keeps track which SSE register PS register is cached in
// (an XMM register number; reset to 0 when the register is uncached)
uint32 psc_which[32];
// keeps last used status: value of psc_lrucount at the most recent access,
// indexed by cache slot (XMM number minus PSC_CACHEREGSTART)
uint32 psc_lru[PSC_CACHEREGCOUNT];
// running access counter; incremented on every cache access
uint32 psc_lrucount;
// and the state that the cache registers are in (e_psc_sse_cachestate)
uint32 psc_cachestate[PSC_CACHEREGCOUNT];

// Obtain a cache XMM register for a PS register.
// A free slot is used when one exists. Otherwise the slot whose last access
// is furthest in the past is recycled: if its current owner was modified, a
// write-back to trx_ps[] is emitted, and the owner is marked uncached.
// Returns the XMM register number.
// NOTE(review): if no slot has a non-zero age the victim stays 0, which is
// not a cache register - presumably unreachable when all slots are in use.
uint32 psc_alloc(void)
{
	uint32 slot, ps, victim, oldest_age, age;

	// pass 1: take the first unused slot
	for(slot = 0; slot < PSC_CACHEREGCOUNT; slot++)
	{
		if(psc_cachestate[slot] != PSC_NOTUSED)
		{
			continue;
		}
		psc_cachestate[slot] = PSC_USED;
		return PSC_CACHEMAP(slot);
	}

	// pass 2: everything is occupied, pick the least recently used slot
	victim = 0;
	oldest_age = 0;
	for(slot = 0; slot < PSC_CACHEREGCOUNT; slot++)
	{
		age = psc_lrucount - psc_lru[slot];
		if(age > oldest_age)
		{
			oldest_age = age;
			victim = PSC_CACHEMAP(slot);
		}
	}

	// pass 3: detach whichever PS register currently lives in the victim
	for(ps = 0; ps < 32; ps++)
	{
		if(psc_which[ps] != victim)
		{
			continue;
		}
		if(psc_modified[ps] == PSC_MODIFIED)
		{
			// dirty: write the value back before the register is reused
			gen_asm(MOVAPD_MR, (uint32)&trx_ps[ps], victim);
		}
		psc_state[ps] = PSC_UNCACHED;
		psc_which[ps] = 0;
		// the slot itself stays PSC_USED because it is being recycled
	}
	return victim;
}

// initialise register cache for beginning of block
// cache empty
void psc_start(void)
{
	uint32 i;

	for(i = 0; i < 32; i++)
	{
		psc_state[i] = PSC_UNCACHED;
		psc_which[i] = 0;
		psc_modified[i] = PSC_UNMODIFIED;
	}
	for(i = 0; i < PSC_CACHEREGCOUNT; i++)
	{
		psc_lru[i] = 0;
		psc_cachestate[i] = PSC_NOTUSED;
	}
	psc_lrucount = 0;
}

// Emit write-backs for every cached and modified PS register at an
// end-of-block marker. The cache bookkeeping is left untouched, so this can
// be emitted several times within one block.
// Returns the number of bytes of code that were generated.
uint32 psc_end(void)
{
	uint32 ps;
	const uint32 start = translation_pos;

	for(ps = 0; ps < 32; ps++)
	{
		if(psc_state[ps] == PSC_CACHED && psc_modified[ps] == PSC_MODIFIED)
		{
			// store the cached value back to its memory slot
			gen_asm(MOVAPD_MR, (uint32)&trx_ps[ps], psc_which[ps]);
		}
	}

	return (translation_pos - start);
}

// Emit code that copies PS register 'regnr' into XMM register 'destxmm'.
// If the register is not cached yet, a cache register is allocated and
// filled from trx_ps[] first; the copy then always comes from the cache.
void psc_load(uint32 destxmm, uint32 regnr)
{
#if PSC_DISABLED
	gen_asm(MOVAPD_RM, destxmm, (uint32)&trx_ps[regnr]);
#else
	uint32 slot;

	if(psc_state[regnr] == PSC_UNCACHED)
	{
		// miss: bring the value into a cache register
		slot = psc_alloc();
		psc_state[regnr] = PSC_CACHED;
		psc_which[regnr] = slot;
		psc_modified[regnr] = PSC_UNMODIFIED;
		gen_asm(MOVAPD_RM, slot, (uint32)&trx_ps[regnr]);
	}

	// the register is cached from here on
	slot = psc_which[regnr];
	gen_asm(MOVAPD_RR, destxmm, slot);

	// record the access for LRU replacement
	psc_lrucount += 1;
	psc_lru[slot - PSC_CACHEREGSTART] = psc_lrucount;
#endif
}

// Make sure PS register 'regnr' is held in a cache register and return that
// XMM register number, so it can be used directly as a source operand.
// With the cache disabled the value is loaded into XMM2 and 2 is returned.
uint32 psc_getcachereg(uint32 regnr)
{
#if PSC_DISABLED
	gen_asm(MOVAPD_RM, 2, (uint32)&trx_ps[regnr]);
	return 2;
#else
	uint32 slot;

	if(psc_state[regnr] == PSC_UNCACHED)
	{
		// miss: bring the value into a cache register
		slot = psc_alloc();
		psc_state[regnr] = PSC_CACHED;
		psc_which[regnr] = slot;
		psc_modified[regnr] = PSC_UNMODIFIED;
		gen_asm(MOVAPD_RM, slot, (uint32)&trx_ps[regnr]);
	}

	// record the access for LRU replacement
	slot = psc_which[regnr];
	psc_lrucount += 1;
	psc_lru[slot - PSC_CACHEREGSTART] = psc_lrucount;
	return slot;
#endif
}

// Emit code that writes both halves of XMM register 'srcxmm' to PS register
// 'regnr'. The value goes into the register's cache slot (allocated on
// demand) and is marked modified; memory is updated later on flush.
void psc_store(uint32 srcxmm, uint32 regnr)
{
#if PSC_DISABLED
	gen_asm(MOVAPD_MR, (uint32)&trx_ps[regnr], srcxmm);
#else
	uint32 slot;

	if(psc_state[regnr] == PSC_UNCACHED)
	{
		// no need to load the old value: it is overwritten completely
		slot = psc_alloc();
		psc_state[regnr] = PSC_CACHED;
		psc_which[regnr] = slot;
	}

	slot = psc_which[regnr];
	gen_asm(MOVAPD_RR, slot, srcxmm);
	psc_modified[regnr] = PSC_MODIFIED;

	// record the access for LRU replacement
	psc_lrucount += 1;
	psc_lru[slot - PSC_CACHEREGSTART] = psc_lrucount;
#endif
}

// Emit code that writes only the low half of XMM register 'srcxmm' to PS
// register 'regnr', leaving the high half as it was. When the register is
// not cached, its high half is first fetched from memory into the new slot.
void psc_store_low(uint32 srcxmm, uint32 regnr)
{
#if PSC_DISABLED
	gen_asm(MOVLPD_MR, (uint32)&trx_ps[regnr].low, srcxmm);
#else
	uint32 slot;

	if(psc_state[regnr] == PSC_UNCACHED)
	{
		slot = psc_alloc();
		psc_state[regnr] = PSC_CACHED;
		psc_which[regnr] = slot;
		// the untouched high half must be valid in the cache register
		gen_asm(MOVHPD_RM, slot, (uint32)&trx_ps[regnr].high);
	}

	// replace just the low element of the cached value
	slot = psc_which[regnr];
	gen_asm(MOVSD_RR, slot, srcxmm);
	psc_modified[regnr] = PSC_MODIFIED;

	// record the access for LRU replacement
	psc_lrucount += 1;
	psc_lru[slot - PSC_CACHEREGSTART] = psc_lrucount;
#endif
}

// Emit code that writes only the high half of XMM register 'srcxmm' to PS
// register 'regnr', leaving the low half as it was. When the register is
// not cached, its low half is first fetched from memory into the new slot.
void psc_store_high(uint32 srcxmm, uint32 regnr)
{
#if PSC_DISABLED
	gen_asm(MOVHPD_MR, (uint32)&trx_ps[regnr].high, srcxmm);
#else
	uint32 slot;

	if(psc_state[regnr] == PSC_UNCACHED)
	{
		slot = psc_alloc();
		psc_state[regnr] = PSC_CACHED;
		psc_which[regnr] = slot;
		// the untouched low half must be valid in the cache register
		gen_asm(MOVLPD_RM, slot, (uint32)&trx_ps[regnr].low);
	}

	// shuffle selector 2: keep the slot's low element, take the source's high element
	slot = psc_which[regnr];
	gen_asm(SHUFPD_RR, slot, srcxmm, 2);
	psc_modified[regnr] = PSC_MODIFIED;

	// record the access for LRU replacement
	psc_lrucount += 1;
	psc_lru[slot - PSC_CACHEREGSTART] = psc_lrucount;
#endif
}

// Evict PS register 'regnr' from the cache: emit a write-back if it was
// modified, release its cache slot and mark the register uncached.
// Does nothing when the register is not cached.
void psc_flush(uint32 regnr)
{
#if PSC_DISABLED
	return;
#else
	uint32 slot;

	if(psc_state[regnr] != PSC_CACHED)
	{
		return;
	}

	slot = psc_which[regnr];
	if(psc_modified[regnr] == PSC_MODIFIED)
	{
		// dirty: memory copy is stale, write it back
		gen_asm(MOVAPD_MR, (uint32)&trx_ps[regnr], slot);
	}

	// drop the cache association
	psc_cachestate[slot - PSC_CACHEREGSTART] = PSC_NOTUSED;
	psc_state[regnr] = PSC_UNCACHED;
	psc_which[regnr] = 0;
#endif
}

void psc_flushall(void)
{
#if PSC_DISABLED
	return;
#endif
	uint32 i;
	// find cached register and force writeback
	for(i = 0; i < 32; i++)
	{
		if(psc_state[i] == PSC_CACHED)
		{
			// writeback cached value
			if(psc_modified[i] == PSC_MODIFIED)
			{
				gen_asm(MOVAPD_MR, (uint32)&trx_ps[i], psc_which[i]);
			}
			psc_cachestate[psc_which[i]-PSC_CACHEREGSTART] = PSC_NOTUSED;
			psc_state[i] = PSC_UNCACHED;
			psc_which[i] = 0;
		}
	}
}

//==============================================================================
// Floating point control opcodes
//

// used
void trx_ppc_gen_sse2_mtfsb1x()
{
	int crb, n1, n2;
	
	crb = (trxCPUrec.opcode >> 21)& 0x1f;
	n1 = (trxCPUrec.opcode >> 16)& 0x1f;
	n2 = (trxCPUrec.opcode >> 11)& 0x1f;

	if (crb != 1 && crb != 2) 
	{
//		trxCPUrec.fpscr |= 1<<(31-crb);
		gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.fpscr);
		gen_asm(OR_RI32, EAX, (1<<(31-crb)));
		gen_asm(MOV_MR, (uint32)&trxCPUrec.fpscr, EAX);
	}
}

// used
void trx_ppc_gen_sse2_mtfsb0x()
{
	int crb, n1, n2;
	
	crb = (trxCPUrec.opcode >> 21)& 0x1f;
	n1 = (trxCPUrec.opcode >> 16)& 0x1f;
	n2 = (trxCPUrec.opcode >> 11)& 0x1f;

	if (crb != 1 && crb != 2) 
	{
		//trxCPUrec.fpscr &= ~(1<<(31-crbD));
		gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.fpscr);
		gen_asm(AND_RI32, EAX, ~(1<<(31-crb)));
		gen_asm(MOV_MR, (uint32)&trxCPUrec.fpscr, EAX);
	}
}

void trx_ppc_gen_sse2_mffsx()
{
	uint32 rD;

	rD = (trxCPUrec.opcode >> 21)& 0x1f;
	
	// trxCPUrec.fpr[rD] = trxCPUrec.fpscr;
//	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.fpscr);
//	gen_asm(MOV_MR, (uint32)&trx_ps[rD].low, EAX);
	// kill low order bits
//	gen_asm(XOR_RR, EAX, EAX);
//	gen_asm(MOV_MR, (uint32)(&trx_ps[rD].low)+4, EAX);

	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.fpscr);
	gen_asm(MOV_MR, (uint32)&double_store, EAX);
	// kill low order bits
	gen_asm(XOR_RR, EAX, EAX);
	gen_asm(MOV_MR, (uint32)(&double_store)+4, EAX);
	gen_asm(MOVLPD_RM, 0, (uint32)&double_store);
	psc_store_low(0, rD);
}

void trx_ppc_gen_sse2_mtfsfx()
{
	uint32 rB,fm, FM;

	fm = ((trxCPUrec.opcode)>>17)&0xff;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	FM = ((fm&0x80)?0xf0000000:0)|((fm&0x40)?0x0f000000:0)|((fm&0x20)?0x00f00000:0)|((fm&0x10)?0x000f0000:0)|
	     ((fm&0x08)?0x0000f000:0)|((fm&0x04)?0x00000f00:0)|((fm&0x02)?0x000000f0:0)|((fm&0x01)?0x0000000f:0);

	//trxCPUrec.fpscr = (trxCPUrec.fpr[rB] & FM) | (trxCPUrec.fpscr & ~FM);
	//gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.fpscr);
	//gen_asm(AND_RI32, EAX, ~FM);
	//gen_asm(MOV_RM, ECX, (uint32)&trx_ps[rB].low);
	//gen_asm(AND_RI32, ECX, FM);
	//gen_asm(OR_RR, EAX, ECX);
	//gen_asm(MOV_MR, (uint32)&trxCPUrec.fpscr, EAX);

	psc_load(0, rB);
	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.fpscr);
	gen_asm(AND_RI32, EAX, ~FM);
	gen_asm(MOVLPD_MR, (uint32)&double_store, 0);
	gen_asm(MOV_RM, ECX, (uint32)&double_store);
	gen_asm(AND_RI32, ECX, FM);
	gen_asm(OR_RR, EAX, ECX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.fpscr, EAX);
}

// unknown,unverified FPSCR bits not calculated correctly anyway!
void trx_ppc_gen_sse2_mcrfs()
{
	uint32 crD, crS, c, src_mask, dst_mask;
	crD = (trxCPUrec.opcode >> 23)& 0x7;
	crS = (trxCPUrec.opcode >> 18)& 0x7;
	crD = 7-crD;
	crS = 7-crS;
	
	dst_mask = trx_ppc_cmp_and_mask[crD];
	src_mask = trx_ppc_cmp_and_mask[crS];
	
	//trxCPUrec.cr &= trx_ppc_cmp_and_mask[crD];
	gen_asm(MOV_RM, ECX, (uint32)&trxCPUrec.cr);
	gen_asm(AND_RI32, ECX, dst_mask);
	//c = (trxCPUrec.fpscr >> (crS*4))&0xf;
	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.fpscr);
	gen_asm(SHR_RI8, EAX, crS*4);
	gen_asm(AND_RI32, EAX, 0xf);
	//trxCPUrec.cr |= c<<(crD*4);
	gen_asm(SHL_RI8, EAX, crD*4);
	gen_asm(OR_RR, ECX, EAX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.cr, ECX);
	//trxCPUrec.fpscr &= trx_ppc_cmp_and_mask[crS];
	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.fpscr);
	gen_asm(AND_RI32, EAX, src_mask);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.fpscr, EAX);
}

//==============================================================================
// Floating point load/store opcodes
//
// used
void trx_ppc_gen_sse2_lfs()
{
	uint32 rD, rA;
	sint16 imm;

	// FPU exceptions check already generated outside
	rD = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	imm = trxCPUrec.opcode & 0xffff;

	if(rA == 0)
	{
	//	printf("[Tratax recompiler] lfs constant access at %8.8x\n", trxCPUrec.pc);
		//EA = 0 + imm;
		gen_asm(MOV_RI32, ECX, imm);
	}
	else
	{
		if(regc_is_constant(rA))
		{
//			printf("[Tratax recompiler] lfs constant access at %8.8x\n", trxCPUrec.pc);
		}
		//EA = trxCPUrec.gpr[rA] + imm;
		regc_load(ECX, rA);
		gen_asm(ADD_RI32, ECX, imm);
	}	
	//r = mem_read32(EA);
	psc_flushall();
	gen_asm(CALL_M, (uint32)&rec_mem_read32);
	gen_asm(MOV_MR, (uint32)&float_store, EAX);
	gen_asm(FLD32_M, (uint32)&float_store);
	gen_asm(FSTP64_M, (uint32)&double_store);
	gen_asm(MOVLPD_RM, 0, (uint32)&double_store);

	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		//trx_ps1_double[rD] = f;
		gen_asm(SHUFPD_RR, 0, 0, 0);
		psc_store(0, rD);
	}
	else
	{
		psc_store_low(0, rD);
	}
}

// used
void trx_ppc_gen_sse2_lfsx()
{
	uint32 rD, rA, rB;

	// FPU exceptions check already generated outside
	rD = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	if(rA == 0)
	{
		if(regc_is_constant(rB))
		{
//			printf("[Tratax recompiler] lfsx constant access at %8.8x\n", trxCPUrec.pc);
		}
		//EA = trxCPUrec.gpr[rB];
		regc_load(ECX, rB);
	}
	else
	{
		if(regc_is_constant(rA) && regc_is_constant(rB))
		{
//			printf("[Tratax recompiler] lfsx constant access at %8.8x\n", trxCPUrec.pc);
		}
		regc_load(EAX, rA);
		regc_load(ECX, rB);
		//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB];
		gen_asm(ADD_RR, ECX, EAX);
	}
	psc_flushall();
	gen_asm(CALL_M, (uint32)&rec_mem_read32);
	gen_asm(MOV_MR, (uint32)&float_store, EAX);
	gen_asm(FLD32_M, (uint32)&float_store);
	gen_asm(FSTP64_M, (uint32)&double_store);
	gen_asm(MOVLPD_RM, 0, (uint32)&double_store);

	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		//trx_ps1_double[rD] = f;
		gen_asm(SHUFPD_RR, 0, 0, 0);
		psc_store(0, rD);
	}
	else
	{
		psc_store_low(0, rD);
	}
}

// used
void trx_ppc_gen_sse2_lfsux()
{
	uint32 rD, rA, rB;

	// FPU exceptions check already generated outside
	rD = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	if(regc_is_constant(rA) && regc_is_constant(rB))
	{
//		printf("[Tratax recompiler] lfsux constant access at %8.8x\n", trxCPUrec.pc);
	}
	regc_load(EAX, rA);
	regc_load(ECX, rB);
	//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB];
	gen_asm(ADD_RR, ECX, EAX);
	regc_store(ECX, rA);
	//	trxCPUrec.gpr[rD] = mem_read32(EA);
	psc_flushall();
	gen_asm(CALL_M, (uint32)&rec_mem_read32);
	gen_asm(MOV_MR, (uint32)&float_store, EAX);
	gen_asm(FLD32_M, (uint32)&float_store);
	gen_asm(FSTP64_M, (uint32)&double_store);
	gen_asm(MOVLPD_RM, 0, (uint32)&double_store);

	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		//trx_ps1_double[rD] = f;
		gen_asm(SHUFPD_RR, 0, 0, 0);
		psc_store(0, rD);
	}
	else
	{
		psc_store_low(0, rD);
	}
}

// used
void trx_ppc_gen_sse2_lfsu()
{
	uint32 rD, rA;
	sint16 imm;

	// FPU exceptions check already generated outside
	rD = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	imm = trxCPUrec.opcode & 0xffff;

	if(regc_is_constant(rA))
	{
//		printf("[Tratax recompiler] lfsu constant access at %8.8x\n", trxCPUrec.pc);
	}
	//EA = trxCPUrec.gpr[rA] + imm;
	regc_load(ECX, rA);
	gen_asm(ADD_RI32, ECX, imm);
	regc_store(ECX, rA);
	//r = mem_read32(EA);
	psc_flushall();
	gen_asm(CALL_M, (uint32)&rec_mem_read32);
	gen_asm(MOV_MR, (uint32)&float_store, EAX);
	gen_asm(FLD32_M, (uint32)&float_store);
	gen_asm(FSTP64_M, (uint32)&double_store);
	gen_asm(MOVLPD_RM, 0, (uint32)&double_store);

	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		//trx_ps1_double[rD] = f;
		gen_asm(SHUFPD_RR, 0, 0, 0);
		psc_store(0, rD);
	}
	else
	{
		psc_store_low(0, rD);
	}
}

// used
void trx_ppc_gen_sse2_lfd()
{
	uint32 rD, rA;
	sint16 imm;

	rD = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	imm = trxCPUrec.opcode & 0xffff;

	if(rA == 0)
	{
//		printf("[Tratax recompiler] lfd constant access at %8.8x\n", trxCPUrec.pc);
		//EA = 0 + imm;
		gen_asm(MOV_RI32, ECX, imm);
	}
	else
	{
		if(regc_is_constant(rA))
		{
//			printf("[Tratax recompiler] lfd constant access at %8.8x\n", trxCPUrec.pc);
		}
		//EA = trxCPUrec.gpr[rA] + imm;
		regc_load(ECX, rA);
		gen_asm(ADD_RI32, ECX, imm);
	}	
	gen_asm(PUSH_R, ECX);
	psc_flushall();
	gen_asm(CALL_M, (uint32)&rec_mem_read64);
	gen_asm(POP_R, ECX);
	
	psc_store_low(0, rD);
}

// used
void trx_ppc_gen_sse2_lfdu()
{
	uint32 rD, rA;
	sint16 imm;

	rD = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	imm = trxCPUrec.opcode & 0xffff;

	if(regc_is_constant(rA))
	{
//		printf("[Tratax recompiler] lfdu constant access at %8.8x\n", trxCPUrec.pc);
	}
	//EA = trxCPUrec.gpr[rA] + imm;
	regc_load(ECX, rA);
	gen_asm(ADD_RI32, ECX, imm);
	regc_store(ECX, rA);
	gen_asm(PUSH_R, ECX);
	psc_flushall();
	gen_asm(CALL_M, (uint32)&rec_mem_read64);
	gen_asm(POP_R, ECX);
	psc_store_low(0, rD);
}

// used
void trx_ppc_gen_sse2_lfdx()
{
	uint32 rD, rA, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	if(rA == 0)
	{
		if(regc_is_constant(rB))
		{
//			printf("[Tratax recompiler] lfdx constant access at %8.8x\n", trxCPUrec.pc);
		}
		//EA = trxCPUrec.gpr[rB];
		regc_load(ECX, rB);
	}
	else
	{
		if(regc_is_constant(rA) && regc_is_constant(rB))
		{
//			printf("[Tratax recompiler] lfdx constant access at %8.8x\n", trxCPUrec.pc);
		}
		regc_load(EAX, rA);
		regc_load(ECX, rB);
		//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB];
		gen_asm(ADD_RR, ECX, EAX);
	}
	gen_asm(PUSH_R, ECX);
	psc_flushall();
	gen_asm(CALL_M, (uint32)&rec_mem_read64);
	gen_asm(POP_R, ECX);
	psc_store_low(0, rD);
}

// used
void trx_ppc_gen_sse2_lfdux()
{
	uint32 rD, rA, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	if(regc_is_constant(rA) && regc_is_constant(rB))
	{
//		printf("[Tratax recompiler] lfdux constant access at %8.8x\n", trxCPUrec.pc);
	}
	regc_load(EAX, rA);
	regc_load(ECX, rB);
	//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB];
	gen_asm(ADD_RR, ECX, EAX);
	regc_store(ECX, rA);
	gen_asm(PUSH_R, ECX);
	psc_flushall();
	gen_asm(CALL_M, (uint32)&rec_mem_read64);
	gen_asm(POP_R, ECX);
	psc_store_low(0, rD);
}

// used
void trx_ppc_gen_sse2_stfd()
{
	uint32 rS, rA;
	sint16 imm;
	
	rS = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	imm = trxCPUrec.opcode & 0xffff;

	if(rA == 0)
	{
//		printf("[Tratax recompiler] stfd constant access at %8.8x\n", trxCPUrec.pc);
		//EA = 0 + imm;
		gen_asm(MOV_RI32, ECX, imm);
	}
	else
	{
		if(regc_is_constant(rA))
		{
//			printf("[Tratax recompiler] stfd constant access at %8.8x\n", trxCPUrec.pc);
		}
		//EA = trxCPUrec.gpr[rA] + imm;
		regc_load(ECX, rA);
		gen_asm(ADD_RI32, ECX, imm);
	}
	psc_load(0, rS);
	psc_flushall();
	gen_asm(CALL_M, (uint32)&rec_mem_write64);
/*
	gen_asm(MOVLPD_MR, (uint32)&double_store, 0);
	gen_asm(MOV_RM, EAX, (uint32)(&double_store)+4);
	gen_asm(PUSH_R, ECX);
	gen_asm(CALL_M, (uint32)&rec_mem_write32);
	gen_asm(POP_R, ECX);
	gen_asm(MOV_RM, EAX, (uint32)&double_store);
	gen_asm(ADD_RI32, ECX, 4);
	gen_asm(CALL_M, (uint32)&rec_mem_write32);
*/
}

// used
void trx_ppc_gen_sse2_stfdu()
{
	uint32 rS, rA;
	sint16 imm;
	
	rS = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	imm = trxCPUrec.opcode & 0xffff;

	if(regc_is_constant(rA))
	{
//		printf("[Tratax recompiler] stfdu constant access at %8.8x\n", trxCPUrec.pc);
	}
	//EA = trxCPUrec.gpr[rA] + imm;
	regc_load(ECX, rA);
	gen_asm(ADD_RI32, ECX, imm);
	regc_store(ECX, rA);

	psc_load(0, rS);
	psc_flushall();
	gen_asm(CALL_M, (uint32)&rec_mem_write64);
/*
	gen_asm(MOVLPD_MR, (uint32)&double_store, 0);
	gen_asm(MOV_RM, EAX, (uint32)(&double_store)+4);
	gen_asm(PUSH_R, ECX);
	gen_asm(CALL_M, (uint32)&rec_mem_write32);
	gen_asm(POP_R, ECX);
	gen_asm(MOV_RM, EAX, (uint32)&double_store);
	gen_asm(ADD_RI32, ECX, 4);
	gen_asm(CALL_M, (uint32)&rec_mem_write32);
*/
}

// used
void trx_ppc_gen_sse2_stfdx()
{
	uint32 rS, rA, rB;
	
	rS = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	if(rA == 0)
	{
		if(regc_is_constant(rB))
		{
//			printf("[Tratax recompiler] stfdx constant access at %8.8x\n", trxCPUrec.pc);
		}
		//EA = trxCPUrec.gpr[rB];
		regc_load(ECX, rB);
	}
	else
	{
		if(regc_is_constant(rA) && regc_is_constant(rB))
		{
//			printf("[Tratax recompiler] stfdx constant access at %8.8x\n", trxCPUrec.pc);
		}
		regc_load(EAX, rA);
		regc_load(ECX, rB);
		//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB];
		gen_asm(ADD_RR, ECX, EAX);
	}
	psc_load(0, rS);
	psc_flushall();
	gen_asm(CALL_M, (uint32)&rec_mem_write64);
/*
	gen_asm(MOVLPD_MR, (uint32)&double_store, 0);
	gen_asm(MOV_RM, EAX, (uint32)(&double_store)+4);
	gen_asm(PUSH_R, ECX);
	gen_asm(CALL_M, (uint32)&rec_mem_write32);
	gen_asm(POP_R, ECX);
	gen_asm(MOV_RM, EAX, (uint32)&double_store);
	gen_asm(ADD_RI32, ECX, 4);
	gen_asm(CALL_M, (uint32)&rec_mem_write32);
*/
}

// used
void trx_ppc_gen_sse2_stfdux()
{
	uint32 rS, rA, rB;
	
	rS = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	if(regc_is_constant(rA) && regc_is_constant(rB))
	{
//		printf("[Tratax recompiler] stfdux constant access at %8.8x\n", trxCPUrec.pc);
	}
	regc_load(EAX, rA);
	regc_load(ECX, rB);
	//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB];
	gen_asm(ADD_RR, ECX, EAX);
	regc_store(ECX, rA);

	psc_load(0, rS);
	psc_flushall();
	gen_asm(CALL_M, (uint32)&rec_mem_write64);
/*
	gen_asm(MOVLPD_MR, (uint32)&double_store, 0);
	gen_asm(MOV_RM, EAX, (uint32)(&double_store)+4);
	gen_asm(PUSH_R, ECX);
	gen_asm(CALL_M, (uint32)&rec_mem_write32);
	gen_asm(POP_R, ECX);
	gen_asm(MOV_RM, EAX, (uint32)&double_store);
	gen_asm(ADD_RI32, ECX, 4);
	gen_asm(CALL_M, (uint32)&rec_mem_write32);
*/
}

// used
void trx_ppc_gen_sse2_stfs()
{
	uint32 rS, rA;
	sint16 imm;
	
	rS = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	imm = trxCPUrec.opcode & 0xffff;

	if(rA == 0)
	{
//		printf("[Tratax recompiler] stfs constant access at %8.8x\n", trxCPUrec.pc);
		//EA = 0 + imm;
		gen_asm(MOV_RI32, ECX, imm);
	}
	else
	{
		if(regc_is_constant(rA))
		{
//			printf("[Tratax recompiler] stfs constant access at %8.8x\n", trxCPUrec.pc);
		}
		//EA = trxCPUrec.gpr[rA] + imm;
		regc_load(ECX, rA);
		gen_asm(ADD_RI32, ECX, imm);
	}	
	psc_load(0, rS);
	gen_asm(MOVLPD_MR, (uint32)&double_store, 0);
	gen_asm(FLD64_M, (uint32)&double_store);
	gen_asm(FSTP32_M, (uint32)&float_store);
	gen_asm(MOV_RM, EAX, (uint32)&float_store);
	psc_flushall();
	gen_asm(CALL_M, (uint32)&rec_mem_write32);
}

// used
void trx_ppc_gen_sse2_stfsx()
{
	uint32 rS, rA, rB;
	
	rS = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	if(rA == 0)
	{
		if(regc_is_constant(rB))
		{
//			printf("[Tratax recompiler] stfsx constant access at %8.8x\n", trxCPUrec.pc);
		}
		//EA = trxCPUrec.gpr[rB];
		regc_load(ECX, rB);
	}
	else
	{
		if(regc_is_constant(rA) && regc_is_constant(rB))
		{
//			printf("[Tratax recompiler] stfsx constant access at %8.8x\n", trxCPUrec.pc);
		}
		regc_load(EAX, rA);
		regc_load(ECX, rB);
		//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB];
		gen_asm(ADD_RR, ECX, EAX);
	}
	psc_load(0, rS);
	gen_asm(MOVLPD_MR, (uint32)&double_store, 0);
	gen_asm(FLD64_M, (uint32)&double_store);
	gen_asm(FSTP32_M, (uint32)&float_store);
	gen_asm(MOV_RM, EAX, (uint32)&float_store);
	psc_flushall();
	gen_asm(CALL_M, (uint32)&rec_mem_write32);
}

// used
void trx_ppc_gen_sse2_stfsux()
{
	uint32 rS, rA, rB;
	
	rS = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	if(regc_is_constant(rA) && regc_is_constant(rB))
	{
//		printf("[Tratax recompiler] stfsux constant access at %8.8x\n", trxCPUrec.pc);
	}
	regc_load(EAX, rA);
	regc_load(ECX, rB);
	//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB];
	gen_asm(ADD_RR, ECX, EAX);
	regc_store(ECX, rA);
	// store
	psc_load(0, rS);
	gen_asm(MOVLPD_MR, (uint32)&double_store, 0);
	gen_asm(FLD64_M, (uint32)&double_store);
	gen_asm(FSTP32_M, (uint32)&float_store);
	gen_asm(MOV_RM, EAX, (uint32)&float_store);
	psc_flushall();
	gen_asm(CALL_M, (uint32)&rec_mem_write32);
}

// used
void trx_ppc_gen_sse2_stfsu()
{
	uint32 rS, rA;
	sint16 imm;
	
	rS = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	imm = trxCPUrec.opcode & 0xffff;

	if(regc_is_constant(rA))
	{
//		printf("[Tratax recompiler] stfsu constant access at %8.8x\n", trxCPUrec.pc);
	}
	//EA = trxCPUrec.gpr[rA] + imm;
	regc_load(ECX, rA);
	gen_asm(ADD_RI32, ECX, imm);
	regc_store(ECX, rA);
	// store
	psc_load(0, rS);
	gen_asm(MOVLPD_MR, (uint32)&double_store, 0);
	gen_asm(FLD64_M, (uint32)&double_store);
	gen_asm(FSTP32_M, (uint32)&float_store);
	gen_asm(MOV_RM, EAX, (uint32)&float_store);
	psc_flushall();
	gen_asm(CALL_M, (uint32)&rec_mem_write32);
}

// used
void trx_ppc_gen_sse2_stfiwx()
{
	uint32 rS, rA, rB;

	rS = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	if(rA == 0)
	{
		if(regc_is_constant(rB))
		{
//			printf("[Tratax recompiler] stfiwx constant access at %8.8x\n", trxCPUrec.pc);
		}
		//EA = trxCPUrec.gpr[rB];
		regc_load(ECX, rB);
	}
	else
	{
		if(regc_is_constant(rA) && regc_is_constant(rB))
		{
//			printf("[Tratax recompiler] stfiwx constant access at %8.8x\n", trxCPUrec.pc);
		}
		regc_load(EAX, rA);
		regc_load(ECX, rB);
		//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB];
		gen_asm(ADD_RR, ECX, EAX);
	}
	psc_flush(rS);
	gen_asm(MOV_RM, EAX, (uint32)&trx_ps[rS].low);
	psc_flushall();
	gen_asm(CALL_M, (uint32)&rec_mem_write32);
}
//==============================================================================
// Floating point arithmetic opcodes
//

// tested
void trx_ppc_gen_sse2_fdivsx()
{
	uint32 rD, rA, rB, rega, regb;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
		
/*
	f = (trx_ps0_double[rA] / trx_ps0_double[rB]);
	trx_ps0_double[rD] = f;
	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		trx_ps1_double[rD] = f;
	}
*/
	psc_load(0, rA);
	gen_asm(DIVSD_RR, 0, psc_getcachereg(rB));
	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		// copy result also to high part of 0
		gen_asm(SHUFPD_RR, 0, 0, 0);
		psc_store(0, rD);
	}
	else
	{
		psc_store_low(0, rD);
	}
}

// tested
void trx_ppc_gen_sse2_fdivx()
{
	uint32 rD, rA, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
		
/*
	trx_ps0_double[rD] = (trx_ps0_double[rA] / trx_ps0_double[rB]);
*/
	psc_load(0, rA);
	gen_asm(DIVSD_RR, 0, psc_getcachereg(rB));
	psc_store_low(0, rD);
}

// tested
void trx_ppc_gen_sse2_fsubsx()
{
	uint32 rD, rA, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
/*		
	f = (trx_ps0_double[rA] - trx_ps0_double[rB]);
	trx_ps0_double[rD] = f;

	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		trx_ps1_double[rD] = f;
	}
*/
	psc_load(0, rA);
	gen_asm(SUBSD_RR, 0, psc_getcachereg(rB));
	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		// copy result also to high part of 0
		gen_asm(SHUFPD_RR, 0, 0, 0);
		psc_store(0, rD);
	}
	else
	{
		psc_store_low(0, rD);
	}
}

// tested
void trx_ppc_gen_sse2_fmrx()
{
	uint32 rD, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	//trx_ps0_int[rD] = trx_ps0_int[rB]; // maybe copy as doubles is faster ?
	psc_load(0, rB);
	psc_store_low(0, rD);
}
// used
void trx_ppc_gen_sse2_fmulx()
{
	uint32 rD, rA, rC;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;

	//trx_ps0_double[rD] = trx_ps0_double[rA] * trx_ps0_double[rC];	
	psc_load(0, rA);
	gen_asm(MULSD_RR, 0, psc_getcachereg(rC));
	psc_store_low(0, rD);
}
// used
void trx_ppc_gen_sse2_fsubx()
{
	uint32 rD, rA, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	//trx_ps0_double[rD] = trx_ps0_double[rA] - trx_ps0_double[rB];	
	psc_load(0, rA);
	gen_asm(SUBSD_RR, 0, psc_getcachereg(rB));
	psc_store_low(0, rD);
}

// tested
void trx_ppc_gen_sse2_fmulsx()
{
	uint32 rD, rA, rC;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;

/*
	f = (trx_ps0_double[rA] * trx_ps0_double[rC]);
	trx_ps0_double[rD] = f;

	if(gCPU.hid[2] & HID2_PSE)
	{
		trx_ps1_double[rD] = f;
	}
*/
	psc_load(0, rA);
	gen_asm(MULSD_RR, 0, psc_getcachereg(rC));
	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		// copy result also to high part of 0
		gen_asm(SHUFPD_RR, 0, 0, 0);
		psc_store(0, rD);
	}
	else
	{
		psc_store_low(0, rD);
	}
}

// tested
void trx_ppc_gen_sse2_fnegx()
{
	uint32 rD, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
		
//	trx_ps0_int[rD] = trx_ps0_int[rB] ^ FPU_SIGN_BIT;

	psc_load(0, rB);
	gen_asm(XORPD_RM, 0, (uint32)&fnegmask);
	psc_store_low(0, rD);
}
// used
void trx_ppc_gen_sse2_frspx()
{
	uint32 rD, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	//float f;
	//f = trx_ps0_double[rB];
	//trx_ps0_double[rD] = f;

	// We are supposed to convert Double to Single, but since we internally keep 2 doubles 
	// just truncate to single precision and then convert back to double precision
	psc_load(0, rB);
	gen_asm(CVTSD2SS_RR, 0, 0);
	gen_asm(CVTSS2SD_RR, 0, 0);
	psc_store_low(0, rD);
}

// tested
void trx_ppc_gen_sse2_fabsx()
{
	uint32 rD, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
		
	//trx_ps0_int[rD] = trx_ps0_int[rB] & ~FPU_SIGN_BIT;
	psc_load(0, rB);
	gen_asm(ANDPD_RM, 0, (uint32)&fabsmask);
	psc_store_low(0, rD);
}

// tested
void trx_ppc_gen_sse2_fnabsx()
{
	uint32 rD, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
		
	//trx_ps0_int[rD] = trx_ps0_int[rB] | FPU_SIGN_BIT;
	psc_load(0, rB);
	gen_asm(ORPD_RM, 0, (uint32)&fnegmask);
	psc_store_low(0, rD);
}

// tested
void trx_ppc_gen_sse2_fresx()
{
	uint32 rD, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

/*
	fpu_as_double[frD] = 1.0f / fpu_as_double[frB];

	if(gCPU.hid[2] & HID2_PSE)
	{
		ps1_double[frD] = f;
	}
*/
	psc_load(0, rB);
	gen_asm(MOVLPD_RM, 0, (uint32)&double_one);
	gen_asm(DIVSD_RR, 0, psc_getcachereg(rB));
	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		// copy result also to high part of 0
		gen_asm(SHUFPD_RR, 0, 0, 0);
		psc_store(0, rD);
	}
	else
	{
		psc_store_low(0, rD);
	}
}

// tested
void trx_ppc_gen_sse2_faddsx()
{
	uint32 rD, rA, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
/*		
	f = (trx_ps0_double[rA] + trx_ps0_double[rB]);
	trx_ps0_double[rD] = f;

	if(gCPU.hid[2] & HID2_PSE)
	{
		trx_ps1_double[rD] = f;
	}
*/
	psc_load(0, rA);
	gen_asm(ADDSD_RR, 0, psc_getcachereg(rB));
	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		// copy result also to high part of 0
		gen_asm(SHUFPD_RR, 0, 0, 0);
		psc_store(0, rD);
	}
	else
	{
		psc_store_low(0, rD);
	}
}

// tested
void trx_ppc_gen_sse2_faddx()
{
	uint32 rD, rA, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
/*		
	trx_ps0_double[rD] = (trx_ps0_double[rA] + trx_ps0_double[rB]);
*/
	psc_load(0, rA);
	gen_asm(ADDSD_RR, 0, psc_getcachereg(rB));
	psc_store_low(0, rD);
}

// tested
void trx_ppc_gen_sse2_frsqrtex()
{
	uint32 rD, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

//	trx_ps0_double[rD] = 1.0f / sqrt(trx_ps0_double[rB]);
	psc_load(1, rB);
	gen_asm(SQRTSD_RR, 1, psc_getcachereg(rB)); // result in 1.low
	gen_asm(MOVLPD_RM, 0, (uint32)&double_one);
	gen_asm(DIVSD_RR, 0, 1);
	psc_store_low(0, rD);
}

//==============================================================================
// Floating point comparison opcodes
//

// used
void trx_ppc_gen_sse2_fcmpu()
{
	uint32 cr, rA, rB;
	uint32 shift, mask;

	cr = (trxCPUrec.opcode >> 23)& 0x7;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	cr = 7-cr;
	shift = 28-(cr*4); 
	mask = trx_ppc_cmp_and_mask[cr];

	gen_asm(XOR_RR, ECX, ECX);
	gen_asm(UCOMISD_RR, psc_getcachereg(rA), psc_getcachereg(rB)); 
	gen_asm(CMOVA_M, EAX, (uint32)&cr_gt);
	gen_asm(CMOVB_M, EAX, (uint32)&cr_lt);
	gen_asm(CMOVE_M, EAX, (uint32)&cr_eq);
	gen_asm(CMOVP_M, EAX, (uint32)&cr_so);	
	gen_asm(CMOVP_M, ECX, (uint32)&fcmpu_nan_flags);
	//if(rA == NaN || rB == Nan)
	//{
	//	trxCPUrec.fpscr |= FPSCR_VXSNAN;
	//}
	//trxCPUrec.fpscr &= ~0x1f000;
	//trxCPUrec.fpscr |= (cmp << 12); -> thats when the bits are in 0-3 now they are in 28-31
	//so we should do >>28 <<12 which is >>16
	gen_asm(MOV_RM, EDX, (uint32)&trxCPUrec.fpscr);
	gen_asm(OR_RR, EDX, ECX); // apply flags if needed
	gen_asm(AND_RI32, EDX, ~0x1f000);
	gen_asm(MOV_RR, ECX, EAX);
	gen_asm(SHR_RI8, ECX, 16);
	gen_asm(OR_RR, ECX, EDX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.fpscr, ECX);
	// store in designated flags
	gen_asm(SHR_RI8, EAX, shift);
	gen_asm(MOV_RM, ECX, (uint32)&trxCPUrec.cr);
	gen_asm(AND_RI32, ECX, mask);
	gen_asm(OR_RR, ECX, EAX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.cr, ECX);
}

// used
void trx_ppc_gen_sse2_fcmpo()
{
	uint32 cr, rA, rB;
	uint32 shift, mask;

	cr = (trxCPUrec.opcode >> 23)& 0x7;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	cr = 7-cr;
	shift = 28-(cr*4); 
	mask = trx_ppc_cmp_and_mask[cr];

	gen_asm(XOR_RR, ECX, ECX);
	gen_asm(UCOMISD_RR, psc_getcachereg(rA), psc_getcachereg(rB));
	gen_asm(CMOVA_M, EAX, (uint32)&cr_gt);
	gen_asm(CMOVB_M, EAX, (uint32)&cr_lt);
	gen_asm(CMOVE_M, EAX, (uint32)&cr_eq);
	gen_asm(CMOVP_M, EAX, (uint32)&cr_so);
	gen_asm(CMOVP_M, ECX, (uint32)&fcmpo_nan_flags);
	//if(rA == NaN || rB == Nan)
	//{
	//	trxCPUrec.fpscr |= FPSCR_VXSNAN;
	//	trxCPUrec.fpscr |= FPSCR_VXVC;
	//}
	//trxCPUrec.fpscr &= ~0x1f000;
	//trxCPUrec.fpscr |= (cmp << 12); -> thats when the bits are in 0-3 now they are in 28-31
	//so we should do >>28 <<12 which is >>16
	gen_asm(MOV_RM, EDX, (uint32)&trxCPUrec.fpscr);
	gen_asm(OR_RR, EDX, ECX); // apply flags if needed
	gen_asm(AND_RI32, EDX, ~0x1f000);
	gen_asm(MOV_RR, ECX, EAX);
	gen_asm(SHR_RI8, ECX, 16);
	gen_asm(OR_RR, ECX, EDX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.fpscr, ECX);
	// store in designated flags
	gen_asm(SHR_RI8, EAX, shift);
	gen_asm(MOV_RM, ECX, (uint32)&trxCPUrec.cr);
	gen_asm(AND_RI32, ECX, mask);
	gen_asm(OR_RR, ECX, EAX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.cr, ECX);
}

// tested
void trx_ppc_gen_sse2_fselx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;

//	if (A.type == ppc_fpr_NaN || trx_ps0_double[rA] < 0.0f) 
//		trx_ps0_double[rD] = trx_ps0_double[rB];
//	else 
//		trx_ps0_double[rD] = trx_ps0_double[rC];
	psc_flush(rA);
	psc_flush(rB);
	psc_flush(rC);
	psc_flush(rD);
	gen_asm(FLD64_M, (uint32)&trxCPUrec.fpr[rC]);
	gen_asm(FLD64_M, (uint32)&trxCPUrec.fpr[rB]);
	gen_asm(FLDZ);
	gen_asm(FLD64_M, (uint32)&trxCPUrec.fpr[rA]);
	gen_asm(FUCOMIP, 1); // cmp a, 0
	gen_asm(FCMOVB_M, 1); // < 0
	gen_asm(FCMOVU_M, 1); // NaN
	gen_asm(FCMOVNB_M, 2); // >= 0
	gen_asm(FSTP64_M, (uint32)&trxCPUrec.fpr[rD]);
	gen_asm(FPOP);
	gen_asm(FPOP);
}

//==============================================================================
// Floating point conversion and rounding opcodes
//

void trx_ppc_gen_sse2_fctiwzx()
{
	uint32 rD, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	psc_load(0, rB);
	gen_asm(CVTTSD2SI_RR, EAX, 0);
	psc_flush(rD);
    gen_asm(MOV_MR, (uint32)&trx_ps[rD].low, EAX);
}

void trx_ppc_gen_sse2_fctiwx()
{
	uint32 rD, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	psc_load(0, rB);
	gen_asm(CVTSD2SI_RR, EAX, 0);
	psc_flush(rD);
    gen_asm(MOV_MR, (uint32)&trx_ps[rD].low, EAX);
}

//==============================================================================
// Floating point multiply-add opcodes
//
// tested
void trx_ppc_gen_sse2_fmsubsx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;

/*
	double f;

	f = ((trx_ps0_double[rA] * trx_ps0_double[rC]) - trx_ps0_double[rB]);
	trx_ps0_double[rD] = f;

	if(gCPU.hid[2] & HID2_PSE)
	{
		trx_ps1_double[rD] = f;
	}
*/
	psc_load(0, rA);
	gen_asm(MULSD_RR, 0, psc_getcachereg(rC));
	gen_asm(SUBSD_RR, 0, psc_getcachereg(rB));
	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		// copy result also to high part of 0
		gen_asm(SHUFPD_RR, 0, 0, 0);
		psc_store(0, rD);
	}
	else
	{
		psc_store_low(0, rD);
	}
}

// tested
void trx_ppc_gen_sse2_fnmsubsx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;

/*
	double f;

	f = -((trx_ps0_double[rA] * trx_ps0_double[rC]) - trx_ps0_double[rB]);
	trx_ps0_double[rD] = f;

	if(gCPU.hid[2] & HID2_PSE)
	{
		trx_ps1_double[rD] = f;
	}
*/
	psc_load(0, rA);
	gen_asm(MULSD_RR, 0, psc_getcachereg(rC));
	gen_asm(SUBSD_RR, 0, psc_getcachereg(rB));
	gen_asm(XORPD_RM, 0, (uint32)&fnegmask);
	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		// copy result also to high part of 0
		gen_asm(SHUFPD_RR, 0, 0, 0);
		psc_store(0, rD);
	}
	else
	{
		psc_store_low(0, rD);
	}
}

// tested
void trx_ppc_gen_sse2_fnmsubx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;

/*
	trx_ps0_double[rD] = -((trx_ps0_double[rA] * trx_ps0_double[rC]) - trx_ps0_double[rB]);
*/
	psc_load(0, rA);
	gen_asm(MULSD_RR, 0, psc_getcachereg(rC));
	gen_asm(SUBSD_RR, 0, psc_getcachereg(rB));
	gen_asm(XORPD_RM, 0, (uint32)&fnegmask);
	psc_store_low(0, rD);
}

// tested
void trx_ppc_gen_sse2_fmsubx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;

/*
	trx_ps0_double[rD] = (trx_ps0_double[rA] * trx_ps0_double[rC]) - trx_ps0_double[rB];
*/
	psc_load(0, rA);
	gen_asm(MULSD_RR, 0, psc_getcachereg(rC));
	gen_asm(SUBSD_RR, 0, psc_getcachereg(rB));
	psc_store_low(0, rD);
}

// tested
void trx_ppc_gen_sse2_fmaddsx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;

/*
	double f;

	f = (trx_ps0_double[rA] * trx_ps0_double[rC]) + trx_ps0_double[rB];
	trx_ps0_double[rD] = f;

	if(gCPU.hid[2] & HID2_PSE)
	{
		trx_ps1_double[rD] = f;
	}
*/
	psc_load(0, rA);
	gen_asm(MULSD_RR, 0, psc_getcachereg(rC));
	gen_asm(ADDSD_RR, 0, psc_getcachereg(rB));
	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		// copy result also to high part of 0
		gen_asm(SHUFPD_RR, 0, 0, 0);
		psc_store(0, rD);
	}
	else
	{
		psc_store_low(0, rD);
	}
}

// used
void trx_ppc_gen_sse2_fnmaddsx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;

/*
	double f;

	f = -((trx_ps0_double[rA] * trx_ps0_double[rC]) + trx_ps0_double[rB]);
	trx_ps0_double[rD] = f;

	if(gCPU.hid[2] & HID2_PSE)
	{
		trx_ps1_double[rD] = f;
	}
*/
	psc_load(0, rA);
	gen_asm(MULSD_RR, 0, psc_getcachereg(rC));
	gen_asm(ADDSD_RR, 0, psc_getcachereg(rB));
	gen_asm(XORPD_RM, 0, (uint32)&fnegmask);
	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		// copy result also to high part of 0
		gen_asm(SHUFPD_RR, 0, 0, 0);
		psc_store(0, rD);
	}
	else
	{
		psc_store_low(0, rD);
	}
}

// tested
void trx_ppc_gen_sse2_fmaddx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;

/*
	trx_ps0_double[rD] = (trx_ps0_double[rA] * trx_ps0_double[rC]) + trx_ps0_double[rB];
*/
	psc_load(0, rA);
	gen_asm(MULSD_RR, 0, psc_getcachereg(rC));
	gen_asm(ADDSD_RR, 0, psc_getcachereg(rB));
	psc_store_low(0, rD);
}

// used
void trx_ppc_gen_sse2_fnmaddx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;

/*
	trx_ps0_double[rD] = -((trx_ps0_double[rA] * trx_ps0_double[rC]) + trx_ps0_double[rB]);
*/
	psc_load(0, rA);
	gen_asm(MULSD_RR, 0, psc_getcachereg(rC));
	gen_asm(ADDSD_RR, 0, psc_getcachereg(rB));
	gen_asm(XORPD_RM, 0, (uint32)&fnegmask);
	psc_store_low(0, rD);
}

//==============================================================================
// Paired Single Load and Store Instructions
//
// Dequantization factors, indexed by the 6-bit load-scale field of a GQR
// ((GQR >> 24) & 0x3f). The field is a two's-complement exponent:
//   index  0..31 -> 2^-index        (scale  0 ..  31)
//   index 32..63 -> 2^(64 - index)  (scale -32 .. -1)
// All shifts are done on 1ULL: (1 << 31) overflows a signed int and used to
// give entries 31 and 33 the wrong sign.
static const float dq_factor[] =
{
	1.0f / (1ULL <<  0),
	1.0f / (1ULL <<  1),
	1.0f / (1ULL <<  2),
	1.0f / (1ULL <<  3),
	1.0f / (1ULL <<  4),
	1.0f / (1ULL <<  5),
	1.0f / (1ULL <<  6),
	1.0f / (1ULL <<  7),
	1.0f / (1ULL <<  8),
	1.0f / (1ULL <<  9),
	1.0f / (1ULL << 10),
	1.0f / (1ULL << 11),
	1.0f / (1ULL << 12),
	1.0f / (1ULL << 13),
	1.0f / (1ULL << 14),
	1.0f / (1ULL << 15),
	1.0f / (1ULL << 16),
	1.0f / (1ULL << 17),
	1.0f / (1ULL << 18),
	1.0f / (1ULL << 19),
	1.0f / (1ULL << 20),
	1.0f / (1ULL << 21),
	1.0f / (1ULL << 22),
	1.0f / (1ULL << 23),
	1.0f / (1ULL << 24),
	1.0f / (1ULL << 25),
	1.0f / (1ULL << 26),
	1.0f / (1ULL << 27),
	1.0f / (1ULL << 28),
	1.0f / (1ULL << 29),
	1.0f / (1ULL << 30),
	1.0f / (1ULL << 31),

	(float)(1ULL << 32),
	(float)(1ULL << 31),
	(float)(1ULL << 30),
	(float)(1ULL << 29),
	(float)(1ULL << 28),
	(float)(1ULL << 27),
	(float)(1ULL << 26),
	(float)(1ULL << 25),
	(float)(1ULL << 24),
	(float)(1ULL << 23),
	(float)(1ULL << 22),
	(float)(1ULL << 21),
	(float)(1ULL << 20),
	(float)(1ULL << 19),
	(float)(1ULL << 18),
	(float)(1ULL << 17),
	(float)(1ULL << 16),
	(float)(1ULL << 15),
	(float)(1ULL << 14),
	(float)(1ULL << 13),
	(float)(1ULL << 12),
	(float)(1ULL << 11),
	(float)(1ULL << 10),
	(float)(1ULL <<  9),
	(float)(1ULL <<  8),
	(float)(1ULL <<  7),
	(float)(1ULL <<  6),
	(float)(1ULL <<  5),
	(float)(1ULL <<  4),
	(float)(1ULL <<  3),
	(float)(1ULL <<  2),
	(float)(1ULL <<  1),
};


// ECX is assumed to contain EA, scale_value contains scale
// returns result in XMM0

// load unscaled float
// Reads a 32-bit word through rec_mem_read32 (EA expected in ECX, see the
// note above), treats the returned EAX as float bits and widens it to a
// double in the low lane of xmm0. scale_value is not used for this type.
__declspec (naked) void qload_sse2_type0(void)
{
	_asm
	{
		call rec_mem_read32
		// bounce the raw bits through float_store so they can be loaded as a float
		mov dword ptr float_store, eax
		cvtss2sd xmm0, dword ptr float_store
		ret
	}
}
// load scaled unsigned byte
// Reads one byte through rec_mem_read8, zero-extends it, converts it to a
// double and multiplies by scale_value. Result in the low lane of xmm0;
// xmm1 is used as scratch.
__declspec (naked) void qload_sse2_type4(void)
{
	_asm
	{
		call rec_mem_read8
		// unsigned: zero-extend AL
		movzx eax, al
		cvtsi2sd xmm0, eax
		// scale_value is stored as a float; widen it before multiplying
		cvtss2sd xmm1, dword ptr scale_value
		mulsd xmm0, xmm1
		ret
	}
}
// load scaled unsigned word
// Reads one halfword through rec_mem_read16, zero-extends it, converts it to
// a double and multiplies by scale_value. Result in the low lane of xmm0;
// xmm1 is used as scratch.
__declspec (naked) void qload_sse2_type5(void)
{
	_asm
	{
		call rec_mem_read16
		// unsigned: zero-extend AX
		movzx eax, ax
		cvtsi2sd xmm0, eax
		// scale_value is stored as a float; widen it before multiplying
		cvtss2sd xmm1, dword ptr scale_value
		mulsd xmm0, xmm1
		ret
	}
}
// load scaled signed byte
// Reads one byte through rec_mem_read8, sign-extends it, converts it to a
// double and multiplies by scale_value. Result in the low lane of xmm0;
// xmm1 is used as scratch.
__declspec (naked) void qload_sse2_type6(void)
{
	_asm
	{
		call rec_mem_read8
		// signed: sign-extend AL
		movsx eax, al
		cvtsi2sd xmm0, eax
		// scale_value is stored as a float; widen it before multiplying
		cvtss2sd xmm1, dword ptr scale_value
		mulsd xmm0, xmm1
		ret
	}
}
// load scaled signed word
// Reads one halfword through rec_mem_read16, sign-extends it, converts it to
// a double and multiplies by scale_value. Result in the low lane of xmm0;
// xmm1 is used as scratch.
__declspec (naked) void qload_sse2_type7(void)
{
	_asm
	{
		call rec_mem_read16
		// signed: sign-extend AX
		movsx eax, ax
		cvtsi2sd xmm0, eax
		// scale_value is stored as a float; widen it before multiplying
		cvtss2sd xmm1, dword ptr scale_value
		mulsd xmm0, xmm1
		ret
	}
}

// ECX is assumed to contain EA, scale_value contains scale
// returns result in XMM0 (PAIRED result!)

// load unscaled floats
// Paired unscaled-float load: widens the two floats in the low half of xmm0
// to doubles and swaps the lanes.
// NOTE(review): relies on rec_mem_read64 leaving the two floats in xmm0 -
// confirm against its definition. scale_value is not used for this type.
__declspec (naked) void qloadp_sse2_type0(void)
{
	_asm
	{
		call rec_mem_read64
		// two packed floats -> two packed doubles
		cvtps2pd xmm0, xmm0
		shufpd xmm0, xmm0, 1 ; // swap values
		ret
	}
}
// load scaled unsigned byte
// Paired load of two scaled unsigned bytes. Reads a halfword through
// rec_mem_read16, converts both bytes to doubles and multiplies each by
// scale_value. Result: xmm0.low = first byte (ps0), xmm0.high = second byte
// (ps1); xmm1 and EDX are used as scratch.
//
// The byte in AH is taken as the first element and AL as the second. This
// matches the halfword/word variants (upper half -> low lane) and
// qstorep_sse2_type4, which writes ps0 into AH; previously the two bytes
// ended up in swapped lanes.
// NOTE(review): assumes rec_mem_read16 returns the halfword in guest order
// in AX, as the other load helpers imply - confirm against its definition.
__declspec (naked) void qloadp_sse2_type4(void)
{
	_asm
	{
		call rec_mem_read16
		// edx = first byte (ps0), eax = second byte (ps1); AH must be read
		// before EAX is overwritten
		movzx edx, ah
		movzx eax, al
		cvtsi2sd xmm0, eax
		shufpd xmm0, xmm0, 0 ; // copy to high
		cvtsi2sd xmm0, edx
		// broadcast the scale to both lanes and apply it
		cvtss2sd xmm1, dword ptr scale_value
		shufpd xmm1, xmm1, 0 ; // copy to high
		mulpd xmm0, xmm1
		ret
	}
}
// load scaled unsigned word
// Paired load of two scaled unsigned halfwords. Reads a word through
// rec_mem_read32; the upper 16 bits become xmm0.low and the lower 16 bits
// become xmm0.high, both converted to doubles and multiplied by scale_value.
// xmm1 and EDX are used as scratch.
__declspec (naked) void qloadp_sse2_type5(void)
{
	_asm
	{
		call rec_mem_read32
		// edx = upper halfword, eax = lower halfword (both zero-extended)
		mov edx, eax
		shr edx, 16
		movzx eax, ax
		cvtsi2sd xmm0, eax
		shufpd xmm0, xmm0, 0 ; // copy to high
		cvtsi2sd xmm0, edx
		// broadcast the scale to both lanes and apply it
		cvtss2sd xmm1, dword ptr scale_value
		shufpd xmm1, xmm1, 0 ; // copy to high
		mulpd xmm0, xmm1
		ret
	}
}
// load scaled signed byte
// Paired load of two scaled signed bytes. Reads a halfword through
// rec_mem_read16, sign-extends both bytes, converts them to doubles and
// multiplies each by scale_value. Result: xmm0.low = first byte (ps0),
// xmm0.high = second byte (ps1); xmm1 and EDX are used as scratch.
//
// The byte in AH is taken as the first element and AL as the second. This
// matches the halfword/word variants (upper half -> low lane) and
// qstorep_sse2_type6, which writes ps0 into AH; previously the two bytes
// ended up in swapped lanes.
// NOTE(review): assumes rec_mem_read16 returns the halfword in guest order
// in AX, as the other load helpers imply - confirm against its definition.
__declspec (naked) void qloadp_sse2_type6(void)
{
	_asm
	{
		call rec_mem_read16
		// edx = first byte (ps0), eax = second byte (ps1); AH must be read
		// before EAX is overwritten
		movsx edx, ah
		movsx eax, al
		cvtsi2sd xmm0, eax
		shufpd xmm0, xmm0, 0 ; // copy to high
		cvtsi2sd xmm0, edx
		// broadcast the scale to both lanes and apply it
		cvtss2sd xmm1, dword ptr scale_value
		shufpd xmm1, xmm1, 0 ; // copy to high
		mulpd xmm0, xmm1
		ret
	}
}
// load scaled signed word
// Paired load of two scaled signed halfwords. Reads a word through
// rec_mem_read32; the upper 16 bits become xmm0.low and the lower 16 bits
// become xmm0.high, both sign-extended, converted to doubles and multiplied
// by scale_value. xmm1 and EDX are used as scratch.
__declspec (naked) void qloadp_sse2_type7(void)
{
	_asm
	{
		call rec_mem_read32
		// edx = upper halfword, eax = lower halfword (both sign-extended)
		mov edx, eax
		sar edx, 16
		movsx eax, ax
		cvtsi2sd xmm0, eax
		shufpd xmm0, xmm0, 0 ; // copy to high
		cvtsi2sd xmm0, edx
		// broadcast the scale to both lanes and apply it
		cvtss2sd xmm1, dword ptr scale_value
		shufpd xmm1, xmm1, 0 ; // copy to high
		mulpd xmm0, xmm1
		ret
	}
}
// Quantization factors, indexed by the 6-bit store-scale field of a GQR
// ((GQR >> 8) & 0x3f). The field is a two's-complement exponent:
//   index  0..31 -> 2^index           (scale  0 ..  31)
//   index 32..63 -> 2^-(64 - index)   (scale -32 .. -1)
// All shifts are done on 1ULL: (1 << 31) overflows a signed int and used to
// give entries 31 and 33 the wrong sign.
static const float q_factor[] =
{
	(float)(1ULL <<  0),
	(float)(1ULL <<  1),
	(float)(1ULL <<  2),
	(float)(1ULL <<  3),
	(float)(1ULL <<  4),
	(float)(1ULL <<  5),
	(float)(1ULL <<  6),
	(float)(1ULL <<  7),
	(float)(1ULL <<  8),
	(float)(1ULL <<  9),

	(float)(1ULL << 10),
	(float)(1ULL << 11),
	(float)(1ULL << 12),
	(float)(1ULL << 13),
	(float)(1ULL << 14),
	(float)(1ULL << 15),
	(float)(1ULL << 16),
	(float)(1ULL << 17),
	(float)(1ULL << 18),
	(float)(1ULL << 19),

	(float)(1ULL << 20),
	(float)(1ULL << 21),
	(float)(1ULL << 22),
	(float)(1ULL << 23),
	(float)(1ULL << 24),
	(float)(1ULL << 25),
	(float)(1ULL << 26),
	(float)(1ULL << 27),
	(float)(1ULL << 28),
	(float)(1ULL << 29),
	(float)(1ULL << 30),
	(float)(1ULL << 31),

	1.0f / (1ULL << 32),
	1.0f / (1ULL << 31),
	1.0f / (1ULL << 30),

	1.0f / (1ULL << 29),
	1.0f / (1ULL << 28),
	1.0f / (1ULL << 27),
	1.0f / (1ULL << 26),
	1.0f / (1ULL << 25),
	1.0f / (1ULL << 24),
	1.0f / (1ULL << 23),
	1.0f / (1ULL << 22),
	1.0f / (1ULL << 21),
	1.0f / (1ULL << 20),

	1.0f / (1ULL << 19),
	1.0f / (1ULL << 18),
	1.0f / (1ULL << 17),
	1.0f / (1ULL << 16),
	1.0f / (1ULL << 15),
	1.0f / (1ULL << 14),
	1.0f / (1ULL << 13),
	1.0f / (1ULL << 12),
	1.0f / (1ULL << 11),
	1.0f / (1ULL << 10),

	1.0f / (1ULL <<  9),
	1.0f / (1ULL <<  8),
	1.0f / (1ULL <<  7),
	1.0f / (1ULL <<  6),
	1.0f / (1ULL <<  5),
	1.0f / (1ULL <<  4),
	1.0f / (1ULL <<  3),
	1.0f / (1ULL <<  2),
	1.0f / (1ULL <<  1),
};

// ECX is assumed to contain EA, scale_value contains scale value
// xmm0 is assumed to contain double to be stored
// returns ECX incremented by size
// -- rounding is done towards zero for integers --
// store unscaled float
// Narrows the double in the low lane of xmm0 to a float and passes its bit
// pattern in EAX to rec_mem_write32 (EA expected in ECX, see the note above).
// scale_value is not used for this type.
__declspec (naked) void qstore_sse2_type0(void)
{
	_asm
	{
		cvtsd2ss xmm0, xmm0
		// movss stores exactly the 32-bit float; movlps would write 64 bits
		// and spill 4 bytes past float_store
		movss dword ptr float_store, xmm0
		mov eax, dword ptr float_store
		call rec_mem_write32
		ret
	}
}
// store scaled unsigned byte
// Scales the double in the low lane of xmm0 by scale_value, truncates it to
// an integer, clamps it to 0..255 and passes it in EAX to rec_mem_write8.
// xmm1 and EDX are used as scratch.
// NOTE(review): cvttsd2si returns 0x80000000 for NaN or values outside the
// int32 range (with the invalid exception masked), which the signed compare
// below clamps to the lower bound even for large positive inputs.
__declspec (naked) void qstore_sse2_type4(void)
{
	_asm
	{
		// scale_value is stored as a float; widen it before multiplying
		cvtss2sd xmm1, dword ptr scale_value
		mulsd xmm0, xmm1
		cvttsd2si eax, xmm0
		// clip it in range
		mov edx, 0
		cmp eax, edx
		cmovl eax, edx // clip to 0 if below 0
		mov edx, 255
		cmp eax, edx
		cmovg eax, edx // clip to 255 if above 255
		// store
		call rec_mem_write8
		ret
	}
}
// store scaled unsigned word
// Scales the double in the low lane of xmm0 by scale_value, truncates it to
// an integer, clamps it to 0..65535 and passes it in EAX to rec_mem_write16.
// xmm1 and EDX are used as scratch.
__declspec (naked) void qstore_sse2_type5(void)
{
	_asm
	{
		// scale_value is stored as a float; widen it before multiplying
		cvtss2sd xmm1, dword ptr scale_value
		mulsd xmm0, xmm1
		cvttsd2si eax,xmm0
		// clip it in range
		mov edx, 0
		cmp eax, edx
		cmovl eax, edx // clip to 0 if below 0
		mov edx, 65535
		cmp eax, edx
		cmovg eax, edx // clip to 65535 if above 65535
		// store
		call rec_mem_write16
		ret
	}
}
// store scaled signed byte
// Scales the double in the low lane of xmm0 by scale_value, truncates it to
// an integer, clamps it to -128..127 and passes it in EAX to rec_mem_write8.
// xmm1 and EDX are used as scratch.
__declspec (naked) void qstore_sse2_type6(void)
{
	_asm
	{
		// scale_value is stored as a float; widen it before multiplying
		cvtss2sd xmm1, dword ptr scale_value
		mulsd xmm0, xmm1
		cvttsd2si eax, xmm0
		// clip it in range
		mov edx, -128
		cmp eax, edx
		cmovl eax, edx // clip to -128 if below -128
		mov edx, 127
		cmp eax, edx
		cmovg eax, edx // clip to 127 if above 127
		// store
		call rec_mem_write8
		ret
	}
}
// store scaled signed word
// Scales the double in the low lane of xmm0 by scale_value, truncates it to
// an integer, clamps it to -32768..32767 and passes it in EAX to
// rec_mem_write16. xmm1 and EDX are used as scratch.
// NOTE(review): for negative values EAX is passed sign-extended; presumably
// rec_mem_write16 only looks at AX - confirm.
__declspec (naked) void qstore_sse2_type7(void)
{
	_asm
	{
		// scale_value is stored as a float; widen it before multiplying
		cvtss2sd xmm1, dword ptr scale_value
		mulsd xmm0, xmm1
		cvttsd2si eax, xmm0
		// clip it in range
		mov edx, -32768
		cmp eax, edx
		cmovl eax, edx // clip to -32768 if below -32768
		mov edx, 32767
		cmp eax, edx
		cmovg eax, edx // clip to 32767 if above 32767
		// store
		call rec_mem_write16
		ret
	}
}

// ECX is assumed to contain EA, scale_value contains scale value
// xmm0 is assumed to contain pair to be stored
// -- rounding is done towards zero for integers --
// store unscaled float

// Paired unscaled-float store: narrows both doubles in xmm0 to floats, swaps
// the two floats and calls rec_mem_write64.
// NOTE(review): relies on rec_mem_write64 taking its data from xmm0 -
// confirm against its definition. scale_value is not used for this type.
__declspec (naked) void qstorep_sse2_type0(void)
{
	_asm
	{
		// two packed doubles -> two packed floats in the low half of xmm0
		cvtpd2ps xmm0, xmm0 ; 
		// selector 1 puts float 1 into position 0 and float 0 into position 1
		shufps xmm0, xmm0, 1
		call rec_mem_write64
		ret
	}
}
// store scaled unsigned byte
// Paired store of two scaled unsigned bytes. Both lanes of xmm0 are
// multiplied by scale_value, truncated and clamped to 0..255; the low lane
// ends up in AH and the high lane in AL before rec_mem_write16 is called.
// xmm1 and EDX are used as scratch; one stack slot is used temporarily.
__declspec (naked) void qstorep_sse2_type4(void)
{
	_asm
	{
		// broadcast the scale to both lanes and apply it
		cvtss2sd xmm1, dword ptr scale_value
		shufpd xmm1, xmm1, 0 ; // copy low to high
		mulpd xmm0, xmm1
		// first element (low lane)
		cvttsd2si eax, xmm0
		// clip it in range
		mov edx, 0
		cmp eax, edx
		cmovl eax, edx // clip to 0 if below 0
		mov edx, 255
		cmp eax, edx
		cmovg eax, edx // clip to 255 if above 255
		// store low part
		push eax

		// second element (high lane, brought down by the swap)
		shufpd xmm0, xmm0, 1 ; // swap 
		cvttsd2si eax, xmm0
		// clip it in range
		mov edx, 0
		cmp eax, edx
		cmovl eax, edx // clip to 0 if below 0
		mov edx, 255
		cmp eax, edx
		cmovg eax, edx // clip to 255 if above 255

		// edx = first element saved above
		pop edx
		// replace top part
		mov ah, dl

		call rec_mem_write16
		ret
	}
}
// store scaled unsigned word
// Paired store of two scaled unsigned halfwords. Both lanes of xmm0 are
// multiplied by scale_value, truncated and clamped to 0..65535; the low lane
// ends up in the upper 16 bits of EAX and the high lane in the lower 16 bits
// before rec_mem_write32 is called.
// xmm1 and EDX are used as scratch; one stack slot is used temporarily.
__declspec (naked) void qstorep_sse2_type5(void)
{
	_asm
	{
		// broadcast the scale to both lanes and apply it
		cvtss2sd xmm1, dword ptr scale_value
		shufpd xmm1, xmm1, 0 ; // copy low to high
		mulpd xmm0, xmm1
		// first element (low lane)
		cvttsd2si eax, xmm0
		// clip it in range
		mov edx, 0
		cmp eax, edx
		cmovl eax, edx // clip to 0 if below 0
		mov edx, 65535
		cmp eax, edx
		cmovg eax, edx // clip to 65535 if above 65535
		// store low part
		push eax

		// second element (high lane, brought down by the swap)
		shufpd xmm0, xmm0, 1 ; // swap 
		// destination is EAX: the operands used to be reversed here, so the
		// second element was never actually converted
		cvttsd2si eax, xmm0
		// clip it in range
		mov edx, 0
		cmp eax, edx
		cmovl eax, edx // clip to 0 if below 0
		mov edx, 65535
		cmp eax, edx
		cmovg eax, edx // clip to 65535 if above 65535

		// edx = first element saved above, moved into the upper halfword
		pop edx
		shl edx, 16
		// replace top part
		or eax, edx

		call rec_mem_write32
		ret
	}
}
// store scaled signed byte
// Paired store of two scaled signed bytes. Both lanes of xmm0 are multiplied
// by scale_value, truncated and clamped to -128..127; the low lane ends up
// in AH and the high lane in AL before rec_mem_write16 is called.
// xmm1 and EDX are used as scratch; one stack slot is used temporarily.
__declspec (naked) void qstorep_sse2_type6(void)
{
	_asm
	{
		// broadcast the scale to both lanes and apply it
		cvtss2sd xmm1, dword ptr scale_value
		shufpd xmm1, xmm1, 0 ; // copy low to high
		mulpd xmm0, xmm1
		// first element (low lane)
		cvttsd2si eax, xmm0
		// clip it in range
		mov edx, -128
		cmp eax, edx
		cmovl eax, edx // clip to -128 if below -128
		mov edx, 127
		cmp eax, edx
		cmovg eax, edx // clip to 127 if above 127
		// store low part
		push eax

		// second element (high lane, brought down by the swap)
		shufpd xmm0, xmm0, 1 ; // swap 
		cvttsd2si eax, xmm0
		// clip it in range
		mov edx, -128
		cmp eax, edx
		cmovl eax, edx // clip to -128 if below -128
		mov edx, 127
		cmp eax, edx
		cmovg eax, edx // clip to 127 if above 127

		// edx = first element saved above
		pop edx
		// replace top part
		mov ah, dl
		call rec_mem_write16
		ret
	}
}
// store scaled signed word
// Paired store of two scaled signed halfwords. Both lanes of xmm0 are
// multiplied by scale_value, truncated and clamped to -32768..32767; the low
// lane ends up in the upper 16 bits of EAX and the high lane in the lower 16
// bits before rec_mem_write32 is called.
// xmm1 and EDX are used as scratch; one stack slot is used temporarily.
__declspec (naked) void qstorep_sse2_type7(void)
{
	_asm
	{
		// broadcast the scale to both lanes and apply it
		cvtss2sd xmm1, dword ptr scale_value
		shufpd xmm1, xmm1, 0 ; // copy low to high
		mulpd xmm0, xmm1
		// first element (low lane)
		cvttsd2si eax, xmm0
		// clip it in range
		mov edx, -32768
		cmp eax, edx
		cmovl eax, edx // clip to -32768 if below -32768
		mov edx, 32767
		cmp eax, edx
		cmovg eax, edx // clip to 32767 if above 32767
		// store low part
		push eax

		// second element (high lane, brought down by the swap)
		shufpd xmm0, xmm0, 1 ; // swap 
		cvttsd2si eax, xmm0
		// clip it in range
		mov edx, -32768
		cmp eax, edx
		cmovl eax, edx // clip to -32768 if below -32768
		mov edx, 32767
		cmp eax, edx
		cmovg eax, edx // clip to 32767 if above 32767

		// a negative second element is sign-extended in EAX; keep only its
		// low 16 bits so the OR below cannot overwrite the first element
		movzx eax, ax

		// edx = first element saved above, moved into the upper halfword
		pop edx
		shl edx, 16
		// replace top part
		or eax, edx
		call rec_mem_write32
		ret
	}
}

// Single-element load handlers, indexed by the 3-bit load-type field of a
// GQR ((GQR >> 16) & 7). Types 0..3 all use the unscaled-float handler;
// 4..7 are u8, u16, s8 and s16.
void *qload_sse2_functions[8] =
{
	qload_sse2_type0,
	qload_sse2_type0,
	qload_sse2_type0,
	qload_sse2_type0,
	qload_sse2_type4,
	qload_sse2_type5,
	qload_sse2_type6,
	qload_sse2_type7,
};

// Paired load handlers, indexed by the 3-bit load-type field of a GQR
// ((GQR >> 16) & 7). Types 0..3 all use the unscaled-float handler;
// 4..7 are u8, u16, s8 and s16.
void *qloadp_sse2_functions[8] =
{
	qloadp_sse2_type0,
	qloadp_sse2_type0,
	qloadp_sse2_type0,
	qloadp_sse2_type0,
	qloadp_sse2_type4,
	qloadp_sse2_type5,
	qloadp_sse2_type6,
	qloadp_sse2_type7,
};
// Single-element store handlers, indexed by the 3-bit store-type field of a
// GQR (GQR & 7). Types 0..3 all use the unscaled-float handler;
// 4..7 are u8, u16, s8 and s16.
void *qstore_sse2_functions[8] =
{
	qstore_sse2_type0,
	qstore_sse2_type0,
	qstore_sse2_type0,
	qstore_sse2_type0,
	qstore_sse2_type4,
	qstore_sse2_type5,
	qstore_sse2_type6,
	qstore_sse2_type7,
};
// Paired store handlers, indexed by the 3-bit store-type field of a GQR
// (GQR & 7). Types 0..3 all use the unscaled-float handler;
// 4..7 are u8, u16, s8 and s16.
void *qstorep_sse2_functions[8] =
{
	qstorep_sse2_type0,
	qstorep_sse2_type0,
	qstorep_sse2_type0,
	qstorep_sse2_type0,
	qstorep_sse2_type4,
	qstorep_sse2_type5,
	qstorep_sse2_type6,
	qstorep_sse2_type7,
};

void trx_ppc_gen_sse2_psq_l(void)
{
	uint32 rA, rD, type, scale, i, w;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	i = ((trxCPUrec.opcode)>>12)&0x7;
	w = ((trxCPUrec.opcode)>>15)&1;

	type = (trxCPUrec.spr[PPC_GQR0+i]>>16)&7;
	scale = (trxCPUrec.spr[PPC_GQR0+i]>>24)&0x3f;

	// scale_value = dq_factor[trxCPUrec.spr[PPC_GQR0+i]>>24)&0x3f]
	// EAX = (uint32)&qload_functions[trxCPUrec.spr[PPC_GQR0+i]>>16)&7];
	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.spr[PPC_GQR0+i]);
	gen_asm(MOV_RR, EDX, EAX);
	gen_asm(SHR_RI8, EDX, 22);
	gen_asm(AND_RI32, EDX, 0x3f<<2);// indexing 32 bit values!
	gen_asm(MOV_RMRI32, EDX, EDX, (uint32)&dq_factor);
	gen_asm(MOV_MR, (uint32)&scale_value, EDX);
	gen_asm(SHR_RI8, EAX, 14);
	gen_asm(AND_RI32, EAX, 7<<2); //indexing 32bit values!

	uint32 EA = trxCPUrec.opcode & 0xfff; // pay attention ! only 12 bits !
    if(EA & 0x800) EA |= 0xfffff000;	

	if(rA == 0)
	{
		//EA = 0 + imm;
		gen_asm(MOV_RI32, ECX, EA);
	}
	else
	{
		//EA = trxCPUrec.gpr[rA] + imm;
		regc_load(ECX, rA);
		gen_asm(ADD_RI32, ECX, EA);
	}

	psc_flushall();
	if(w == 0)
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qloadp_sse2_functions);
		gen_asm(CALL_R, EAX);
	}
	else
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qload_sse2_functions);
		gen_asm(CALL_R, EAX);
		gen_asm(MOVHPD_RM, 0, (uint32)&double_one);
	}
	psc_store(0, rD);
}

void trx_ppc_gen_sse2_psq_lx(void)
{
	uint32 rA, rD, rB, type, scale, i, w;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	i = ((trxCPUrec.opcode)>>7)&0x7;
	w = ((trxCPUrec.opcode)>>10)&1;

	type = (trxCPUrec.spr[PPC_GQR0+i]>>16)&7;
	//scale = (trxCPUrec.spr[PPC_GQR0+i]>>24)&0x3f;

	//if(type !=0)gen_asm(BREAK);

	// scale_value = dq_factor[trxCPUrec.spr[PPC_GQR0+i]>>24)&0x3f]
	// EAX = (uint32)&qload_functions[trxCPUrec.spr[PPC_GQR0+i]>>16)&7];
	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.spr[PPC_GQR0+i]);
	gen_asm(MOV_RR, EDX, EAX);
	gen_asm(SHR_RI8, EDX, 22);
	gen_asm(AND_RI32, EDX, 0x3f<<2);// indexing 32 bit values!
	gen_asm(MOV_RMRI32, EDX, EDX, (uint32)&dq_factor);
	gen_asm(MOV_MR, (uint32)&scale_value, EDX);
	gen_asm(SHR_RI8, EAX, 14);
	gen_asm(AND_RI32, EAX, 7<<2); //indexing 32bit values!
	
	if(rA == 0)
	{
		//EA = trxCPUrec.gpr[rB]
		regc_load(ECX, rB);
	}
	else
	{
		//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB]
		regc_load(ECX, rA);
		regc_load(EDX, rB);
		gen_asm(ADD_RR, ECX, EDX);
	}

	psc_flushall();
	if(w == 0)
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qloadp_sse2_functions);
		gen_asm(CALL_R, EAX);
	}
	else
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qload_sse2_functions);
		gen_asm(CALL_R, EAX);
		gen_asm(MOVHPD_RM, 0, (uint32)&double_one);
	}
	psc_store(0, rD);
}

void trx_ppc_gen_sse2_psq_lu(void)
{
	uint32 rA, rD, type, scale, i, w;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	i = ((trxCPUrec.opcode)>>12)&0x7;
	w = ((trxCPUrec.opcode)>>15)&1;

	type = (trxCPUrec.spr[PPC_GQR0+i]>>16)&7;
	//scale = (trxCPUrec.spr[PPC_GQR0+i]>>24)&0x3f;

	//if(type !=0)gen_asm(BREAK);

	// scale_value = dq_factor[trxCPUrec.spr[PPC_GQR0+i]>>24)&0x3f]
	// EAX = (uint32)&qload_functions[trxCPUrec.spr[PPC_GQR0+i]>>16)&7];
	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.spr[PPC_GQR0+i]);
	gen_asm(MOV_RR, EDX, EAX);
	gen_asm(SHR_RI8, EDX, 22);
	gen_asm(AND_RI32, EDX, 0x3f<<2);// indexing 32 bit values!
	gen_asm(MOV_RMRI32, EDX, EDX, (uint32)&dq_factor);
	gen_asm(MOV_MR, (uint32)&scale_value, EDX);
	gen_asm(SHR_RI8, EAX, 14);
	gen_asm(AND_RI32, EAX, 7<<2); //indexing 32bit values!

	uint32 EA = trxCPUrec.opcode & 0xfff; // pay attention ! only 12 bits !
    if(EA & 0x800) EA |= 0xfffff000;	
	
	regc_load(ECX, rA);
	gen_asm(ADD_RI32, ECX, EA);
	// and update
	regc_store(ECX, rA);

	psc_flushall();
	if(w == 0)
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qloadp_sse2_functions);
		gen_asm(CALL_R, EAX);
	}
	else
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qload_sse2_functions);
		gen_asm(CALL_R, EAX);
		gen_asm(MOVHPD_RM, 0, (uint32)&double_one);
	}
	psc_store(0, rD);
}

void trx_ppc_gen_sse2_psq_lux(void)
{
	uint32 rA, rD, rB, type, scale, i, w;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	i = ((trxCPUrec.opcode)>>7)&0x7;
	w = ((trxCPUrec.opcode)>>10)&1;

	type = (trxCPUrec.spr[PPC_GQR0+i]>>16)&7;
	//scale = (trxCPUrec.spr[PPC_GQR0+i]>>24)&0x3f;

	//if(type !=0)gen_asm(BREAK);

	// scale_value = dq_factor[trxCPUrec.spr[PPC_GQR0+i]>>24)&0x3f]
	// EAX = (uint32)&qload_functions[trxCPUrec.spr[PPC_GQR0+i]>>16)&7];
	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.spr[PPC_GQR0+i]);
	gen_asm(MOV_RR, EDX, EAX);
	gen_asm(SHR_RI8, EDX, 22);
	gen_asm(AND_RI32, EDX, 0x3f<<2);// indexing 32 bit values!
	gen_asm(MOV_RMRI32, EDX, EDX, (uint32)&dq_factor);
	gen_asm(MOV_MR, (uint32)&scale_value, EDX);
	gen_asm(SHR_RI8, EAX, 14);
	gen_asm(AND_RI32, EAX, 7<<2); //indexing 32bit values!

	//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB]
	regc_load(ECX, rA);
	regc_load(EDX, rB);
	gen_asm(ADD_RR, ECX, EDX);
	// and update
	regc_store(ECX, rA);

	psc_flushall();
	if(w == 0)
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qloadp_sse2_functions);
		gen_asm(CALL_R, EAX);
	}
	else
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qload_sse2_functions);
		gen_asm(CALL_R, EAX);
		gen_asm(MOVHPD_RM, 0, (uint32)&double_one);
	}
	psc_store(0, rD);
}

void trx_ppc_gen_sse2_psq_st(void)
{
	uint32 rA, rS, type, scale, i, w;

	rS = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	i = ((trxCPUrec.opcode)>>12)&0x7;
	w = ((trxCPUrec.opcode)>>15)&1;

	type = (trxCPUrec.spr[PPC_GQR0+i])&7;
	scale = (trxCPUrec.spr[PPC_GQR0+i]>>8)&0x3f;

	// scale_value = q_factor[trxCPUrec.spr[PPC_GQR0+i]>>8)&0x3f]
	// EAX = (uint32)&qload_functions[trxCPUrec.spr[PPC_GQR0+i])&7];
	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.spr[PPC_GQR0+i]);
	gen_asm(MOV_RR, EDX, EAX);
	gen_asm(SHR_RI8, EDX, 6);
	gen_asm(AND_RI32, EDX, 0x3f<<2);// indexing 32 bit values!
	gen_asm(MOV_RMRI32, EDX, EDX, (uint32)&q_factor);
	gen_asm(MOV_MR, (uint32)&scale_value, EDX);
	gen_asm(SHL_RI8, EAX, 2);
	gen_asm(AND_RI32, EAX, 7<<2); //indexing 32bit values!

	uint32 EA = trxCPUrec.opcode & 0xfff; // pay attention ! only 12 bits !
    if(EA & 0x800) EA |= 0xfffff000;	

	if(rA == 0)
	{
		gen_asm(MOV_RI32, ECX, EA);
	}
	else
	{
		regc_load(ECX, rA);
		gen_asm(ADD_RI32, ECX, EA);
	}	
	
	psc_load(0, rS);
	psc_flushall();
	if(w == 0)
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qstorep_sse2_functions);
		gen_asm(CALL_R, EAX);
	}
	else
	{
		//trx_gekko_qstore(EA, 0, type, scale, &trx_ps0_double[rS]);
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qstore_sse2_functions);
		gen_asm(CALL_R, EAX);
	}
}

void trx_ppc_gen_sse2_psq_stu(void)
{
	uint32 rA, rS, type, scale, i, w;

	rS = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	i = ((trxCPUrec.opcode)>>12)&0x7;
	w = ((trxCPUrec.opcode)>>15)&1;

	type = (trxCPUrec.spr[PPC_GQR0+i])&7;
	//scale = (trxCPUrec.spr[PPC_GQR0+i]>>8)&0x3f;

	//if(type!= 0)gen_asm(BREAK);
	// scale_value = q_factor[trxCPUrec.spr[PPC_GQR0+i]>>8)&0x3f]
	// EAX = (uint32)&qload_functions[trxCPUrec.spr[PPC_GQR0+i])&7];
	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.spr[PPC_GQR0+i]);
	gen_asm(MOV_RR, EDX, EAX);
	gen_asm(SHR_RI8, EDX, 6);
	gen_asm(AND_RI32, EDX, 0x3f<<2);// indexing 32 bit values!
	gen_asm(MOV_RMRI32, EDX, EDX, (uint32)&q_factor);
	gen_asm(MOV_MR, (uint32)&scale_value, EDX);
	gen_asm(SHL_RI8, EAX, 2);
	gen_asm(AND_RI32, EAX, 7<<2); //indexing 32bit values!

	uint32 EA = trxCPUrec.opcode & 0xfff; // pay attention ! only 12 bits !
    if(EA & 0x800) EA |= 0xfffff000;	
	
	regc_load(ECX, rA);
	gen_asm(ADD_RI32, ECX, EA);
	// and update
	regc_store(ECX, rA);
	
	psc_load(0, rS);
	psc_flushall();
	if(w == 0)
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qstorep_sse2_functions);
		gen_asm(CALL_R, EAX);
	}
	else
	{
		//trx_gekko_qstore(EA, 0, type, scale, &trx_ps0_double[rS]);
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qstore_sse2_functions);
		gen_asm(CALL_R, EAX);
	}
}

void trx_ppc_gen_sse2_psq_stx(void)
{
	uint32 rA, rS, rB, type, scale, i, w;

	rS = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	i = ((trxCPUrec.opcode)>>7)&0x7;
	w = ((trxCPUrec.opcode)>>10)&1;

	type = (trxCPUrec.spr[PPC_GQR0+i])&7;
	//scale = (trxCPUrec.spr[PPC_GQR0+i]>>8)&0x3f;

	//if(type != 0)gen_asm(BREAK);
	// scale_value = q_factor[trxCPUrec.spr[PPC_GQR0+i]>>8)&0x3f]
	// EAX = (uint32)&qload_functions[trxCPUrec.spr[PPC_GQR0+i])&7];
	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.spr[PPC_GQR0+i]);
	gen_asm(MOV_RR, EDX, EAX);
	gen_asm(SHR_RI8, EDX, 6);
	gen_asm(AND_RI32, EDX, 0x3f<<2);// indexing 32 bit values!
	gen_asm(MOV_RMRI32, EDX, EDX, (uint32)&q_factor);
	gen_asm(MOV_MR, (uint32)&scale_value, EDX);
	gen_asm(SHL_RI8, EAX, 2);
	gen_asm(AND_RI32, EAX, 7<<2); //indexing 32bit values!

	if(rA == 0)
	{
		//EA = trxCPUrec.gpr[rB]
		regc_load(ECX, rB);
	}
	else
	{
		//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB]
		regc_load(EDX, rA);
		regc_load(ECX, rB);
		gen_asm(ADD_RR, ECX, EDX);
	}
	
	psc_load(0, rS);
	psc_flushall();
	if(w == 0)
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qstorep_sse2_functions);
		gen_asm(CALL_R, EAX);
	}
	else
	{
		//trx_gekko_qstore(EA, 0, type, scale, &trx_ps0_double[rS]);
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qstore_sse2_functions);
		gen_asm(CALL_R, EAX);
	}
}

void trx_ppc_gen_sse2_psq_stux(void)
{
	uint32 rA, rS, rB, type, scale, i, w;

	rS = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	i = ((trxCPUrec.opcode)>>7)&0x7;
	w = ((trxCPUrec.opcode)>>10)&1;

	type = (trxCPUrec.spr[PPC_GQR0+i])&7;
	//scale = (trxCPUrec.spr[PPC_GQR0+i]>>8)&0x3f;

	// scale_value = q_factor[trxCPUrec.spr[PPC_GQR0+i]>>8)&0x3f]
	// EAX = (uint32)&qload_functions[trxCPUrec.spr[PPC_GQR0+i])&7];
	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.spr[PPC_GQR0+i]);
	gen_asm(MOV_RR, EDX, EAX);
	gen_asm(SHR_RI8, EDX, 6);
	gen_asm(AND_RI32, EDX, 0x3f<<2);// indexing 32 bit values!
	gen_asm(MOV_RMRI32, EDX, EDX, (uint32)&q_factor);
	gen_asm(MOV_MR, (uint32)&scale_value, EDX);
	gen_asm(SHL_RI8, EAX, 2);
	gen_asm(AND_RI32, EAX, 7<<2); //indexing 32bit values!

	//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB]
	regc_load(EDX, rA);
	regc_load(ECX, rB);
	gen_asm(ADD_RR, ECX, EDX);
	// and update
	regc_store(ECX, rA);
	
	psc_load(0, rS);
	psc_flushall();
	if(w == 0)
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qstorep_sse2_functions);
		gen_asm(CALL_R, EAX);
	}
	else
	{
		//trx_gekko_qstore(EA, 0, type, scale, &trx_ps0_double[rS]);
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qstore_sse2_functions);
		gen_asm(CALL_R, EAX);
	}
}

//==============================================================================
// Paired Single SIMD opcodes
//
// used
void trx_ppc_gen_sse2_ps_msub(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;
	
//	res0 = (trx_ps0_double[rA] * trx_ps0_double[rC]) - trx_ps0_double[rB];
//	res1 = (trx_ps1_double[rA] * trx_ps1_double[rC]) - trx_ps1_double[rB];
//	trx_ps0_double[rD] = res0;
//	trx_ps1_double[rD] = res1;
	psc_load(0, rA);
	gen_asm(MULPD_RR, 0, psc_getcachereg(rC));
	gen_asm(SUBPD_RR, 0, psc_getcachereg(rB));
	psc_store(0, rD);
}
// used
void trx_ppc_gen_sse2_ps_madd(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;
	
	//res0 = (trx_ps0_double[rA] * trx_ps0_double[rC]) + trx_ps0_double[rB];
	//res1 = (trx_ps1_double[rA] * trx_ps1_double[rC]) + trx_ps1_double[rB];
	//trx_ps0_double[rD] = res0;
	//trx_ps1_double[rD] = res1;

	psc_load(0, rA);
	gen_asm(MULPD_RR, 0, psc_getcachereg(rC));
	gen_asm(ADDPD_RR, 0, psc_getcachereg(rB));
	psc_store(0, rD);
}
// not used
void trx_ppc_gen_sse2_ps_nmsub(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;
	
	//res0 = -((trx_ps0_double[rA] * trx_ps0_double[rC]) - trx_ps0_double[rB]);
	//res1 = -((trx_ps1_double[rA] * trx_ps1_double[rC]) - trx_ps1_double[rB]);
	//trx_ps0_double[rD] = res0;
	//trx_ps1_double[rD] = res1;
	psc_load(0, rA);
	gen_asm(MULPD_RR, 0, psc_getcachereg(rC));
	gen_asm(SUBPD_RR, 0, psc_getcachereg(rB));
	gen_asm(XORPD_RM, 0, (uint32)&psnegmask);
	psc_store(0, rD);
}
// not used
void trx_ppc_gen_sse2_ps_nmadd(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;
	
	//res0 = -((trx_ps0_double[rA] * trx_ps0_double[rC]) + trx_ps0_double[rB]);
	//res1 = -((trx_ps1_double[rA] * trx_ps1_double[rC]) + trx_ps1_double[rB]);
	//trx_ps0_double[rD] = res0;
	//trx_ps1_double[rD] = res1;
	psc_load(0, rA);
	gen_asm(MULPD_RR, 0, psc_getcachereg(rC));
	gen_asm(ADDPD_RR, 0, psc_getcachereg(rB));
	gen_asm(XORPD_RM, 0, (uint32)&psnegmask);
	psc_store(0, rD);
}
// used
void trx_ppc_gen_sse2_ps_neg(void)
{
	int rD, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	//trx_ps0_int[rD] = trx_ps0_int[rB] ^ FPU_SIGN_BIT;
	//trx_ps1_int[rD] = trx_ps1_int[rB] ^ FPU_SIGN_BIT;
	psc_load(0, rB);
	gen_asm(XORPD_RM, 0, (uint32)&psnegmask);
	psc_store(0, rD);
}
// not used
void trx_ppc_gen_sse2_ps_nabs(void)
{
	int rD, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	//trx_ps0_int[rD] = trx_ps0_int[rB] | FPU_SIGN_BIT;
	//trx_ps1_int[rD] = trx_ps1_int[rB] | FPU_SIGN_BIT;

	psc_load(0, rB);
	gen_asm(ORPD_RM, 0, (uint32)&psnegmask);
	psc_store(0, rD);
}
// used
void trx_ppc_gen_sse2_ps_abs(void)
{
	int rD, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	//trx_ps0_int[rD] = trx_ps0_int[rB] & ~FPU_SIGN_BIT;
	//trx_ps1_int[rD] = trx_ps1_int[rB] & ~FPU_SIGN_BIT;

	psc_load(0, rB);
	gen_asm(ANDPD_RM, 0, (uint32)&psabsmask);
	psc_store(0, rD);
}

// used
void trx_ppc_gen_sse2_ps_mr(void)
{
	int rD, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

//	trx_ps0_double[rD] = trx_ps0_double[rB];
//	trx_ps1_double[rD] = trx_ps1_double[rB];
	psc_load(0, rB);
	psc_store(0, rD);
}
// used
void trx_ppc_gen_sse2_ps_merge00(void)
{
	int rD, rA, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	//res0 = trx_ps0_double[rA];
	//res1 = trx_ps0_double[rB];
	//trx_ps0_double[rD] = res0;
	//trx_ps1_double[rD] = res1;
	psc_load(0, rA);
	gen_asm(SHUFPD_RR, 0, psc_getcachereg(rB), 0);
	psc_store(0, rD);
}

// used
void trx_ppc_gen_sse2_ps_merge01(void)
{
	int rD, rA, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	//res0 = trx_ps0_double[rA];
	//res1 = trx_ps1_double[rB];
	//trx_ps0_double[rD] = res0;
	//trx_ps1_double[rD] = res1;
	psc_load(0, rA);
	gen_asm(SHUFPD_RR, 0, psc_getcachereg(rB), 2);
	psc_store(0, rD);
}
// used
void trx_ppc_gen_sse2_ps_merge10(void)
{
	int rD, rA, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	
	//res0 = trx_ps1_double[rA];
	//res1 = trx_ps0_double[rB];
	//trx_ps0_double[rD] = res0;
	//trx_ps1_double[rD] = res1;
	psc_load(0, rA);
	gen_asm(SHUFPD_RR, 0, psc_getcachereg(rB), 1);
	psc_store(0, rD);
}
// used
void trx_ppc_gen_sse2_ps_merge11(void)
{
	int rD, rA, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	
	//res0 = trx_ps1_double[rA];
	//res1 = trx_ps1_double[rB];
	//trx_ps0_double[rD] = res0;
	//trx_ps1_double[rD] = res1;
	psc_load(0, rA);
	gen_asm(SHUFPD_RR, 0, psc_getcachereg(rB), 3);
	psc_store(0, rD);
}

// used
void trx_ppc_gen_sse2_ps_mul(void)
{
	int rD, rA, rC;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;
	
	//res0 = trx_ps0_double[rA] * trx_ps0_double[rC];
	//res1 = trx_ps1_double[rA] * trx_ps1_double[rC];
	//trx_ps0_double[rD] = res0;
	//trx_ps1_double[rD] = res1;
	psc_load(0, rA);
	gen_asm(MULPD_RR, 0, psc_getcachereg(rC));
	psc_store(0, rD);
}

// used
void trx_ppc_gen_sse2_ps_muls0(void)
{
	int rD, rA, rC;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;
	
	//res0 = trx_ps0_double[rA] * trx_ps0_double[rC];
	//res1 = trx_ps1_double[rA] * trx_ps0_double[rC];
	//trx_ps0_double[rD] = res0;
	//trx_ps1_double[rD] = res1;
	psc_load(0, rA);
	psc_load(1, rC);
	gen_asm(SHUFPD_RR, 1, 1, 0); // copy lower part of C to higher part
	gen_asm(MULPD_RR, 0, 1);
	psc_store(0, rD);
}
// not used
void trx_ppc_gen_sse2_ps_muls1(void)
{
	int rD, rA, rC;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;
	
	//res0 = trx_ps0_double[rA] * trx_ps1_double[rC];
	//res1 = trx_ps1_double[rA] * trx_ps1_double[rC];
	//trx_ps0_double[rD] = res0;
	//trx_ps1_double[rD] = res1;
	psc_load(0, rA);
	psc_load(1, rC);
	gen_asm(SHUFPD_RR, 1, 1, 3); // copy higher part to lower
	gen_asm(MULPD_RR, 0, 1);
	psc_store(0, rD);
}
// used
void trx_ppc_gen_sse2_ps_madds0(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;
	
//	res0 = (trx_ps0_double[rA] * trx_ps0_double[rC]) + trx_ps0_double[rB];
//	res1 = (trx_ps1_double[rA] * trx_ps0_double[rC]) + trx_ps1_double[rB];
//	trx_ps0_double[rD] = res0;
//	trx_ps1_double[rD] = res1;
	psc_load(0, rA);
	psc_load(1, rC);
	gen_asm(SHUFPD_RR, 1, 1, 0); // copy lower part to higher
	gen_asm(MULPD_RR, 0, 1);
	psc_load(1, rB);
	gen_asm(ADDPD_RR, 0, 1);
	psc_store(0, rD);
}
// used
void trx_ppc_gen_sse2_ps_madds1(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;
	
//	res0 = (trx_ps0_double[rA] * trx_ps1_double[rC]) + trx_ps0_double[rB];
//	res1 = (trx_ps1_double[rA] * trx_ps1_double[rC]) + trx_ps1_double[rB];
//	trx_ps0_double[rD] = res0;
//	trx_ps1_double[rD] = res1;
	psc_load(0, rA);
	psc_load(1, rC);
	gen_asm(SHUFPD_RR, 1, 1, 3); // copy higher part to lower
	gen_asm(MULPD_RR, 0, 1);
	psc_load(1, rB);
	gen_asm(ADDPD_RR, 0, 1);
	psc_store(0, rD);
}

// used
void trx_ppc_gen_sse2_ps_cmpo0()
{
	uint32 cr, rA, rB;
	uint32 shift, mask;

	cr = (trxCPUrec.opcode >> 23)& 0x7;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	cr = 7-cr;
	shift = 28-(cr*4); 
	mask = trx_ppc_cmp_and_mask[cr];

	gen_asm(XOR_RR, ECX, ECX);
	gen_asm(UCOMISD_RR, psc_getcachereg(rA), psc_getcachereg(rB));
	gen_asm(CMOVA_M, EAX, (uint32)&cr_gt);
	gen_asm(CMOVB_M, EAX, (uint32)&cr_lt);
	gen_asm(CMOVE_M, EAX, (uint32)&cr_eq);
	gen_asm(CMOVP_M, EAX, (uint32)&cr_so);
	gen_asm(CMOVP_M, ECX, (uint32)&fcmpo_nan_flags);
	//if(rA == NaN || rB == Nan)
	//{
	//	trxCPUrec.fpscr |= FPSCR_VXSNAN;
	//	trxCPUrec.fpscr |= FPSCR_VXVC;
	//}
	//trxCPUrec.fpscr &= ~0x1f000;
	//trxCPUrec.fpscr |= (cmp << 12); -> thats when the bits are in 0-3 now they are in 28-31
	//so we should do >>28 <<12 which is >>16
	gen_asm(MOV_RM, EDX, (uint32)&trxCPUrec.fpscr);
	gen_asm(OR_RR, EDX, ECX); // apply flags if needed
	gen_asm(AND_RI32, EDX, ~0x1f000);
	gen_asm(MOV_RR, ECX, EAX);
	gen_asm(SHR_RI8, ECX, 16);
	gen_asm(OR_RR, ECX, EDX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.fpscr, ECX);
	// store in designated flags
	gen_asm(SHR_RI8, EAX, shift);
	gen_asm(MOV_RM, ECX, (uint32)&trxCPUrec.cr);
	gen_asm(AND_RI32, ECX, mask);
	gen_asm(OR_RR, ECX, EAX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.cr, ECX);
}

// used
void trx_ppc_gen_sse2_ps_cmpo1()
{
	uint32 cr, rA, rB;
	uint32 shift, mask;

	cr = (trxCPUrec.opcode >> 23)& 0x7;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	cr = 7-cr;
	shift = 28-(cr*4); 
	mask = trx_ppc_cmp_and_mask[cr];

	psc_load(0, rA);
	psc_load(1, rB);
	gen_asm(XOR_RR, ECX, ECX);
	gen_asm(SHUFPD_RR, 0, 0, 3); // copy higher part to lower
	gen_asm(SHUFPD_RR, 1, 1, 3); // copy higher part to lower
	gen_asm(UCOMISD_RR, 0, 1);
	gen_asm(CMOVA_M, EAX, (uint32)&cr_gt);
	gen_asm(CMOVB_M, EAX, (uint32)&cr_lt);
	gen_asm(CMOVE_M, EAX, (uint32)&cr_eq);
	gen_asm(CMOVP_M, EAX, (uint32)&cr_so);
	gen_asm(CMOVP_M, ECX, (uint32)&fcmpo_nan_flags);
	//if(rA == NaN || rB == Nan)
	//{
	//	trxCPUrec.fpscr |= FPSCR_VXSNAN;
	//	trxCPUrec.fpscr |= FPSCR_VXVC;
	//}
	//trxCPUrec.fpscr &= ~0x1f000;
	//trxCPUrec.fpscr |= (cmp << 12); -> thats when the bits are in 0-3 now they are in 28-31
	//so we should do >>28 <<12 which is >>16
	gen_asm(MOV_RM, EDX, (uint32)&trxCPUrec.fpscr);
	gen_asm(OR_RR, EDX, ECX); // apply flags if needed
	gen_asm(AND_RI32, EDX, ~0x1f000);
	gen_asm(MOV_RR, ECX, EAX);
	gen_asm(SHR_RI8, ECX, 16);
	gen_asm(OR_RR, ECX, EDX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.fpscr, ECX);
	// store in designated flags
	gen_asm(SHR_RI8, EAX, shift);
	gen_asm(MOV_RM, ECX, (uint32)&trxCPUrec.cr);
	gen_asm(AND_RI32, ECX, mask);
	gen_asm(OR_RR, ECX, EAX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.cr, ECX);
}

// used
void trx_ppc_gen_sse2_ps_cmpu0()
{
	uint32 cr, rA, rB;
	uint32 shift, mask;

	cr = (trxCPUrec.opcode >> 23)& 0x7;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	cr = 7-cr;
	shift = 28-(cr*4); 
	mask = trx_ppc_cmp_and_mask[cr];

	gen_asm(XOR_RR, ECX, ECX);
	gen_asm(UCOMISD_RR, psc_getcachereg(rA), psc_getcachereg(rB)); 
	gen_asm(CMOVA_M, EAX, (uint32)&cr_gt);
	gen_asm(CMOVB_M, EAX, (uint32)&cr_lt);
	gen_asm(CMOVE_M, EAX, (uint32)&cr_eq);
	gen_asm(CMOVP_M, EAX, (uint32)&cr_so);	
	gen_asm(CMOVP_M, ECX, (uint32)&fcmpu_nan_flags);
	//if(rA == NaN || rB == Nan)
	//{
	//	trxCPUrec.fpscr |= FPSCR_VXSNAN;
	//}
	//trxCPUrec.fpscr &= ~0x1f000;
	//trxCPUrec.fpscr |= (cmp << 12); -> thats when the bits are in 0-3 now they are in 28-31
	//so we should do >>28 <<12 which is >>16
	gen_asm(MOV_RM, EDX, (uint32)&trxCPUrec.fpscr);
	gen_asm(OR_RR, EDX, ECX); // apply flags if needed
	gen_asm(AND_RI32, EDX, ~0x1f000);
	gen_asm(MOV_RR, ECX, EAX);
	gen_asm(SHR_RI8, ECX, 16);
	gen_asm(OR_RR, ECX, EDX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.fpscr, ECX);
	// store in designated flags
	gen_asm(SHR_RI8, EAX, shift);
	gen_asm(MOV_RM, ECX, (uint32)&trxCPUrec.cr);
	gen_asm(AND_RI32, ECX, mask);
	gen_asm(OR_RR, ECX, EAX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.cr, ECX);
}

// used
void trx_ppc_gen_sse2_ps_cmpu1()
{
	uint32 cr, rA, rB;
	uint32 shift, mask;

	cr = (trxCPUrec.opcode >> 23)& 0x7;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	cr = 7-cr;
	shift = 28-(cr*4); 
	mask = trx_ppc_cmp_and_mask[cr];

	psc_load(0, rA);
	psc_load(1, rB);
	gen_asm(XOR_RR, ECX, ECX);
	gen_asm(SHUFPD_RR, 0, 0, 3); // copy higher part to lower
	gen_asm(SHUFPD_RR, 1, 1, 3); // copy higher part to lower
	gen_asm(UCOMISD_RR, 0, 1); 
	gen_asm(CMOVA_M, EAX, (uint32)&cr_gt);
	gen_asm(CMOVB_M, EAX, (uint32)&cr_lt);
	gen_asm(CMOVE_M, EAX, (uint32)&cr_eq);
	gen_asm(CMOVP_M, EAX, (uint32)&cr_so);	
	gen_asm(CMOVP_M, ECX, (uint32)&fcmpu_nan_flags);
	//if(rA == NaN || rB == Nan)
	//{
	//	trxCPUrec.fpscr |= FPSCR_VXSNAN;
	//}
	//trxCPUrec.fpscr &= ~0x1f000;
	//trxCPUrec.fpscr |= (cmp << 12); -> thats when the bits are in 0-3 now they are in 28-31
	//so we should do >>28 <<12 which is >>16
	gen_asm(MOV_RM, EDX, (uint32)&trxCPUrec.fpscr);
	gen_asm(OR_RR, EDX, ECX); // apply flags if needed
	gen_asm(AND_RI32, EDX, ~0x1f000);
	gen_asm(MOV_RR, ECX, EAX);
	gen_asm(SHR_RI8, ECX, 16);
	gen_asm(OR_RR, ECX, EDX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.fpscr, ECX);
	// store in designated flags
	gen_asm(SHR_RI8, EAX, shift);
	gen_asm(MOV_RM, ECX, (uint32)&trxCPUrec.cr);
	gen_asm(AND_RI32, ECX, mask);
	gen_asm(OR_RR, ECX, EAX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.cr, ECX);
}

// used
void trx_ppc_gen_sse2_ps_sum0(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;
	
	//res0 = trx_ps0_double[rA] + trx_ps1_double[rB];
	//res1 = trx_ps1_double[rC];
	//trx_ps0_double[rD] = res0;
	//trx_ps1_double[rD] = res1;
	psc_load(0, rA);
	psc_load(1, rB);
	gen_asm(SHUFPD_RR, 1, 1, 3); // copy higher part to lower
	gen_asm(ADDSD_RR, 0, 1);
	gen_asm(SHUFPD_RR, 0, psc_getcachereg(rC), 2); // just copy high part of C to D
	psc_store(0, rD);
}

// used
void trx_ppc_gen_sse2_ps_sum1(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;
	
	//res0 = trx_ps0_double[rC];
	//res1 = trx_ps0_double[rA] + trx_ps1_double[rB];
	//trx_ps0_double[rD] = res0;
	//trx_ps1_double[rD] = res1;
	psc_load(0, rA);
	gen_asm(SHUFPD_RR, 0, 0, 0); // copy lower part to higher
	gen_asm(ADDPD_RR, 0, psc_getcachereg(rB));
	psc_load(1, rC);
	gen_asm(SHUFPD_RR, 1, 0, 2); // merge res1 into res0
	psc_store(1, rD);
}

// used
void trx_ppc_gen_sse2_ps_div(void)
{
	int rD, rA, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	
	//res0 = trx_ps0_double[rA] / trx_ps0_double[rB];
	//res1 = trx_ps1_double[rA] / trx_ps1_double[rB];
	//trx_ps0_double[rD] = res0;
	//trx_ps1_double[rD] = res1;
	psc_load(0, rA);
	gen_asm(DIVPD_RR, 0, psc_getcachereg(rB));
	psc_store(0, rD);
}
// used
void trx_ppc_gen_sse2_ps_sub(void)
{
	int rD, rA, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	
//	res0 = trx_ps0_double[rA] - trx_ps0_double[rB];
//	res1 = trx_ps1_double[rA] - trx_ps1_double[rB];
//	trx_ps0_double[rD] = res0;
//	trx_ps1_double[rD] = res1;
	psc_load(0, rA);
	gen_asm(SUBPD_RR, 0, psc_getcachereg(rB));
	psc_store(0, rD);
}
// used
void trx_ppc_gen_sse2_ps_add(void)
{
	int rD, rA, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	
//	res0 = trx_ps0_double[rA] + trx_ps0_double[rB];
//	res1 = trx_ps1_double[rA] + trx_ps1_double[rB];
//	trx_ps0_double[rD] = res0;
//	trx_ps1_double[rD] = res1;
	psc_load(0, rA);
	gen_asm(ADDPD_RR, 0, psc_getcachereg(rB));
	psc_store(0, rD);
}

// used
void trx_ppc_gen_sse2_ps_sel(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;
	
	//if(trx_ps0_double[rA] < 0.0f || NAN)	trx_ps0_double[rD] = trx_ps0_double[rB];
	//else trx_ps0_double[rD] = trx_ps0_double[rC];
	//if(trx_ps1_double[rA] < 0.0f || NAN)	trx_ps1_double[rD] = trx_ps1_double[rB];
	//else trx_ps1_double[rD] = trx_ps1_double[rC];
	psc_flush(rA);
	psc_flush(rB);
	psc_flush(rC);
	psc_flush(rD);
	gen_asm(FLD64_M, (uint32)&trx_ps[rC].low);
	gen_asm(FLD64_M, (uint32)&trx_ps[rB].low);
	gen_asm(FLDZ);
	gen_asm(FLD64_M, (uint32)&trx_ps[rA].low);
	gen_asm(FUCOMIP, 1); // cmp a, 0
	gen_asm(FCMOVB_M, 1); // < 0
	gen_asm(FCMOVU_M, 1); // NaN
	gen_asm(FCMOVNB_M, 2); // >= 0
	gen_asm(FSTP64_M, (uint32)&trx_ps[rD].low);
	gen_asm(FPOP);
	gen_asm(FPOP);

	gen_asm(FLD64_M, (uint32)&trx_ps[rC].high);
	gen_asm(FLD64_M, (uint32)&trx_ps[rB].high);
	gen_asm(FLDZ);
	gen_asm(FLD64_M, (uint32)&trx_ps[rA].high);
	gen_asm(FUCOMIP, 1); // cmp a, 0
	gen_asm(FCMOVB_M, 1); // < 0
	gen_asm(FCMOVU_M, 1); // NaN
	gen_asm(FCMOVNB_M, 2); // >= 0
	gen_asm(FSTP64_M, (uint32)&trx_ps[rD].high);
	gen_asm(FPOP);
	gen_asm(FPOP);
}
// used
void trx_ppc_gen_sse2_ps_rsqrte(void)
{
	int rD, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	
//	res0 = 1.0f / sqrt(trx_ps0_double[rB]);
//	res1 = 1.0f / sqrt(trx_ps1_double[rB]);
//	trx_ps0_double[rD] = res0;
//	trx_ps1_double[rD] = res1;
	psc_load(1, rB);
	gen_asm(SQRTPD_RR, 1, 1);
	gen_asm(MOVAPD_RM, 0, (uint32)&ps_one);
	gen_asm(DIVPD_RR, 0, 1);
	psc_store(0, rD);
}

// locked cache support
// gekko locked cache allocate
void trx_ppc_gen_sse2_dcbz_l(void)
{
	uint32 rA, rB;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	if(rA == 0)
	{
		//EA = trxCPUrec.gpr[rB]
		regc_load(ECX, rB);
	}
	else
	{
		//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB]
		regc_load(EAX, rA);
		regc_load(ECX, rB);
		gen_asm(ADD_RR, ECX, EAX);
	}

	// interpreter is doing checks if the locked cache is in correct 
	// memory area. we dont care, we assume it is right
	// clear the cache line
	//memset(&lockedcache[EA & LOCKEDCACHE_MASK], 0, 32);
	gen_asm(XOR_RR, EAX, EAX); // zero
	gen_asm(AND_RI32, ECX, LOCKEDCACHE_MASK);
	gen_asm(ADD_RI32, ECX, (uint32)&lockedcache[0]);
	gen_asm(MOV_MRRI32, EAX, ECX, 0);
	gen_asm(MOV_MRRI32, EAX, ECX, 4);
	gen_asm(MOV_MRRI32, EAX, ECX, 8);
	gen_asm(MOV_MRRI32, EAX, ECX, 12);
	gen_asm(MOV_MRRI32, EAX, ECX, 16);
	gen_asm(MOV_MRRI32, EAX, ECX, 20);
	gen_asm(MOV_MRRI32, EAX, ECX, 24);
	gen_asm(MOV_MRRI32, EAX, ECX, 28);
}

