//
// GX FIFO emulation.
//
// Big unknown for WPAR GX FIFO emulation:
// what exactly triggers the end of a display list? Candidates seen so far:
//
// - 0x4500.0002 PE DrawDone token? -> works for Ribbit King
// - NOP command (0x00) -> needed by many games to get them to work
//
//
//
//

#include "config.h"

#include <stdio.h>
#include <windows.h>

#include "gxD3D.h"
#include "system/types.h"
#include "gx_fifo_interface.h"
#include "graphics_processor/xf_regs.h"
#include "graphics_processor/bp_regs.h"
#include "graphics_processor/cp_regs.h"
#include "graphics_processor/vertex_processor.h"
#include "debug/tracers.h"
#include "graphics_processor/display_message.h"
//#include "profiler.h"
#include "d3d_fun.h"

// Reverse the byte order of a 32-bit value (big endian <-> little endian).
//
// Written with shifts and masks instead of an inline-assembly BSWAP so that
// the function has an explicit return value and does not depend on the
// compiler treating the contents of EAX as the implicit result.
static inline uint32 byteswap32(uint32 data)
{
	return ((data & 0x000000ffu) << 24) |
	       ((data & 0x0000ff00u) <<  8) |
	       ((data & 0x00ff0000u) >>  8) |
	       ((data & 0xff000000u) >> 24);
}

// Exchange the low and high byte of a 16-bit value.
static inline uint16 byteswap16(uint16 data)
{
	uint16 low_moved_up = (uint16)(data << 8);
	uint16 high_moved_down = (uint16)(data >> 8);

	return (uint16)(low_moved_up | high_moved_down);
}


// Initialisation hook for the GX FIFO emulation.
// Currently a no-op: the body is empty.
void gx_init(void)
{
}

// Shutdown hook for the GX FIFO emulation.
// Currently a no-op: the body is empty.
void gx_close(void)
{
}

// Set to true by gx_fifo_finished() and reported by pe_check_interrupt().
// Nothing in this file resets it back to false.
bool pe_finished = false;


// States of the FIFO decoder in gx_write_fifo256(). The numeric value of each
// state is also the index into load_stage_size[], so the order of the
// enumerators must stay in sync with that table.
enum fifo_decode_stages
{
	LOAD_OPCODE = 0,	// waiting for the next command byte
	LOAD_BP_REG,		// waiting for the 32-bit operand of a 0x61 command
	LOAD_CP_REG_SEL,	// waiting for the CP register selector byte
	LOAD_CP_REG,		// waiting for the 32-bit CP register value
	LOAD_XF_REG_SEL,	// waiting for the 32-bit XF header (length in the high half, address in the low half)
	LOAD_XF_REG_SEL_2,	// 16-bit XF selector; no stage transition in this file selects this state
	LOAD_XF_REG,		// waiting for the next 32-bit XF register value
	LOAD_VERTEX_NUM,	// waiting for the 16-bit vertex count of a draw command
	LOAD_VERTEX,		// receiving vertex data one byte at a time
	LOAD_DL_ADDRESS,	// waiting for the 32-bit display list address
	LOAD_DL_SIZE		// waiting for the 32-bit display list size
};

// Number of bytes each decoder stage consumes per step, indexed by
// enum fifo_decode_stages. gx_write_fifo256() only runs a stage when at least
// this many bytes are buffered.
uint8 load_stage_size[]=
{
	1, // LOAD_OPCODE = 0,
	4, // LOAD_BP_REG,
	1, // LOAD_CP_REG_SEL,
	4, // LOAD_CP_REG,
	4, // LOAD_XF_REG_SEL,
	2, // LOAD_XF_REG_SEL_2,
	4, // LOAD_XF_REG,
	2, // LOAD_VERTEX_NUM,
	1, // LOAD_VERTEX,
	4, // LOAD_DL_ADDRESS,
	4, // LOAD_DL_SIZE
};

// Statistics logged and reset by gx_fifo_finished(). The code that would
// increment them in gx_write_fifo256() is commented out.
int command_count = 0;
int command_size = 0;

// schedule_finish: set before gx_fifo_finished() is called to request that the
// frame is presented; cleared again by gx_fifo_finished().
int schedule_finish;
// render_count: incremented for rendered primitives/vertices by the parsers and
// reset when a frame finish is scheduled.
int render_count = 0;

// Ring buffer of GetTickCount() timestamps, one per gx_render_finished() call.
// FPS_TABLE_SIZE must be a power of two because FPS_TABLE_MASK is used to wrap
// the write position.
#define FPS_TABLE_SIZE	0x40
#define FPS_TABLE_MASK	(FPS_TABLE_SIZE - 1)
uint32	fps_ticks[FPS_TABLE_SIZE];
uint32	fps_tick_pos;

// Address and size operands of the most recent "call display list" command.
uint32 gx_dl_addr, gx_dl_size;

// State of the primitive currently being decoded.
static uint8	vertex_attribute_table;	// low three bits of the draw opcode
static uint32 vertex_count = 0;		// vertices of the current primitive (counted down by the FIFO parser)
static uint32 vertex_elements;		// result of vx_prepare_table() in the FIFO parser; compared against the per-vertex byte position
static uint32 vertex_size;		// result of vx_prepare_table() in the display list parser; used as per-vertex byte stride
static uint32 vx_vertex_byte_pos;	// bytes received so far for the current vertex
static vx_vertex_ptr_t	vx_vertex_ptr_temp;	// write cursor into the vertex staging buffer

// Execute a display list stored in emulated memory.
//
// addr - address of the first command byte; it is masked with 0x01ffffff
//        before being used as an index into gx_memory
// size - length of the display list in bytes
//
// The list is decoded in one pass. Multi-byte operands are stored big endian
// and are byte swapped after loading. Supported commands:
//   0x00        NOP
//   0x08        load CP register  (8-bit selector, 32-bit value)
//   0x10        load XF registers (16-bit count-1, 16-bit first address, values)
//   0x48        invalidate vertex cache (no operands)
//   0x61        load BP register  (32-bit value, register number in top byte)
//   0x80..0xbf  draw primitive    (low 3 bits = VAT, 16-bit vertex count, vertices)
// Every other opcode is reported through syslog_error() and then, as before,
// decoded as if it were a draw command.
//
// NOTE(review): operands are not checked against `size`, so a command that is
// truncated by the end of the list reads past it.
void gx_parse_display_list(uint32 addr, uint32 size)
{
	uint32	i, j;
	uint8	data8;
	uint16	data16;
	uint32	data32;
	uint32	xf_count, xf_sel;

	//syslog_warn(GX,"DISPLAY LIST NOT IMPLEMENTED at %08x size %08x\n", addr, size);

	addr &= 0x01ffffff;

	for(i = 0 ; i < size ; )
	{
		uint8 cmd = gx_memory[addr + i];
		i++;
		switch(cmd)
		{
		case 0x00:	// NOP
			break;
		case 0x48:
			// INVALIDATE VERTEX CACHE: takes no operands (same handling as in
			// the FIFO parser); without this case it was decoded as a draw
			syslog(GX,"GP: Invalidate Vertex Cache\n");
			break;
		case 0x61:
			// LOAD BP REG
			data32 = *(uint32 *)(&gx_memory[addr + i]);
			data32 = byteswap32(data32);
			i += 4;
			gp_bp_write_reg32(data32 >> 24, data32 & 0x00ffffff);
			break;
		case 0x08:
			// LOAD CP REG
			data8 = gx_memory[addr + i];
			i++;
			data32 = *(uint32 *)(&gx_memory[addr + i]);
			data32 = byteswap32(data32);
			i += 4;
			gp_cp_write_reg(data8, data32);
			break;
		case 0x10:
			// LOAD XF REGS
			// first halfword is (number of registers - 1)
			data16 = *(uint16 *)(&gx_memory[addr + i]);
			xf_count = byteswap16(data16) + 1;
			i += 2;
			// second halfword is the address of the first register; it is
			// used as-is, exactly like the FIFO parser does
			data16 = *(uint16 *)(&gx_memory[addr + i]);
			xf_sel = byteswap16(data16);
			i += 2;
			for(j = 0 ; j < xf_count ; j++)
			{
				data32 = *(uint32 *)(&gx_memory[addr + i]);
				data32 = byteswap32(data32);
				i += 4;
				gp_xf_write_reg(xf_sel, data32);
				xf_sel++;
			}
			break;
		default:
			// draw opcodes occupy 0x80..0xbf (0xb8..0xbf = points)
			if (cmd < 0x80 || cmd >= 0xc0)
			{
				// ikaruga crash !!
				syslog_error(GX, "%02x %08x\n", cmd, *(uint32 *)&gx_memory[addr + i]);
			}
			data16 = *(uint16 *)(&gx_memory[addr + i]);
			data16 = byteswap16(data16);
			i+=2;
			vertex_count = data16;
			vx_begin_count(vertex_count);
			vertex_attribute_table = cmd & 0x7;
			vertex_size = vx_prepare_table(vertex_attribute_table);
			syslog(GX,"DL Draw Quads VAT: %x Count: %d Size: %d\n", vertex_attribute_table, vertex_count, vertex_size);
			// same primitive mapping as the FIFO parser
			switch(cmd & 0xf8)
			{
			case 0x80:
				vx_begin(VX_QUADS);
				break;
			case 0x88:
				vx_begin(VX_QUAD_STRIP);
				break;
			case 0x90:
				vx_begin(VX_TRIANGLES);
				break;
			case 0x98:
				vx_begin(VX_TRIANGLE_STRIP);
				break;
			case 0xa0:
				vx_begin(VX_TRIANGLE_FAN);
				break;
			case 0xa8:
				vx_begin(VX_LINES);
				break;
			case 0xb0:
				vx_begin(VX_LINE_STRIP);
				break;
			case 0xb8:
				vx_begin(VX_POINTS);
				break;
			default:
				syslog_error(VX,"%02x %08x\n", cmd, *(uint32 *)&gx_memory[addr + i]);
			}
			for(j = 0 ; j < vertex_count ; j++)
			{
#if USE_VERTEX_PTR
				// vertices are consumed in place, straight from emulated memory
				vx_vertex_ptr.ubp = gx_memory + addr + i;
#else
#error Display Lists work only with vertex pointers
#endif
				i += vertex_size;
				vx_process_commands();
				render_count++;
			}
			vx_end();
			break;
		}
	}
}

// Called once per presented frame: records a timestamp for the frame-rate
// estimate, optionally prints profiler statistics, then ends the current D3D
// scene and begins the next one.
void gx_render_finished(void)
{
#if WITH_PROFILER
	static uint32 profile_ticks = 0, cpu_perc = 0, io_perc = 0;

	// refresh the CPU/IO percentages every 60 calls
	profile_ticks++;
	if(profile_ticks == 60)
	{
		// keep the previous percentages when nothing was sampled, instead of
		// dividing by zero
		if (total_total != 0)
		{
			cpu_perc = ((total_cpu - total_io)*100)/total_total;
			io_perc = (total_io*100)/total_total;
		}
		total_total = 0;
		total_cpu = 0;
		total_io = 0;
		profile_ticks = 0;
	}
#endif
	fps_ticks[fps_tick_pos] = GetTickCount();
	// The slot after the one just written holds the oldest timestamp in the
	// ring. FPS_TABLE_SIZE timestamps span FPS_TABLE_SIZE - 1 frame intervals,
	// so that is the divisor for the average frame time in milliseconds.
	double fps = ((fps_ticks[fps_tick_pos & FPS_TABLE_MASK] - fps_ticks[(fps_tick_pos + 1) & FPS_TABLE_MASK]));
	fps /= (FPS_TABLE_SIZE - 1);
	fps_tick_pos = (fps_tick_pos + 1) & FPS_TABLE_MASK;
#if WITH_PROFILER
	dmsg_printf("FPS: %.2f CPU: %2.2d IO: %2.2d OVERHEAD: %2.2d", 1000.0f/fps, cpu_perc, io_perc, (100-cpu_perc-io_perc));
#else
	//dmsg_printf("FPS: %.2f", 1000.0f/fps);
#endif
	d3d_endscene();
	d3d_beginscene();
}

// Marks the pixel engine as finished, logs and clears the command statistics
// and, when a finish was scheduled, presents the frame.
void gx_fifo_finished(void)
{
	syslog(GX,"lets make PE finish because DL is finished\n");
	pe_finished = true;

	syslog(GX,"number of commands in fifo: %d size: %d\n", command_count, command_size);
	command_size = 0;
	command_count = 0;

	// nothing to present unless a finish was requested
	if (!schedule_finish)
	{
		return;
	}
	schedule_finish = 0;

#if WITH_PBUFFER
	// leave the pbuffer and copy it to the screen before presenting
	pb_select_pbuffer(false);
	pb_copy_to_screen();
#endif
	gx_render_finished();
#if WITH_PBUFFER
	// render the next frame into the pbuffer again
	pb_select_pbuffer(true);
#endif
}

// Decoder state of gx_write_fifo256(); it persists between calls so that a
// command may span several 32-byte bursts.
static uint8 nop_counter = 0;	// not referenced anywhere else in this file
static uint8 opcode = 0xff;	// last decoded opcode (VAT bits stripped, or 0x61)
static int cp_reg_sel, xf_reg_sel, xf_reg_load_len;	// pending CP selector / next XF address / XF values still to load minus one
static uint8 load_stage = 0;	// current enum fifo_decode_stages value (0 == LOAD_OPCODE)
static uint32 vertex_pos;	// reset when a vertex count is read; otherwise only used by the non-USE_VERTEX_PTR path

// internal command buffer
static uint8 gx_buff[256];	// unprocessed leftover bytes followed by the newest burst
static uint32 gx_buff_bytesleft = 0;	// number of unprocessed bytes starting at gx_ptr
static void *gx_ptr;	// read cursor into gx_buff

//////////////////////////////////////////////////////////////////////////////////////////////
//
// GX RENDERING LIST PARSER
//
//
// GX has an internal buffer to which incoming bursts of data are appended.
// It processes the data in this buffer until it is either finished, empty, or there is not
// enough data to complete the current command.
//
// We add data at the end of the buffer, and we process the buffer by walking through it with
// a void pointer (cast as needed) until there is no more data to process.
//
// The only correct way to feed the GPU is through WPAR, which bursts 32 bytes of data to the GPU.
// NOTE: just keep in mind that the WPAR data is in big endian format !!
void gx_write_fifo256(uint8 *data256)
{
	uint32 data32;
	uint16 data16;
	uint8 data8;

#if 0
	syslog(GX,"FIFO: %2.2x %2.2x %2.2x %2.2x %2.2x %2.2x %2.2x %2.2x\n", data256[0], data256[1], data256[2], data256[3], data256[4], data256[5], data256[6], data256[7]);
	syslog(GX,"FIFO: %2.2x %2.2x %2.2x %2.2x %2.2x %2.2x %2.2x %2.2x\n", data256[8], data256[9], data256[10], data256[11], data256[12], data256[13], data256[14], data256[15]);
	syslog(GX,"FIFO: %2.2x %2.2x %2.2x %2.2x %2.2x %2.2x %2.2x %2.2x\n", data256[16], data256[17], data256[18], data256[19], data256[20], data256[21], data256[22], data256[23]);
	syslog(GX,"FIFO: %2.2x %2.2x %2.2x %2.2x %2.2x %2.2x %2.2x %2.2x\n", data256[24], data256[25], data256[26], data256[27], data256[28], data256[29], data256[30], data256[31]);
#endif

	// move partial bytes to front of buffer
	if(gx_buff_bytesleft > 0)
	{
		memcpy(&gx_buff[0], gx_ptr, gx_buff_bytesleft);
	}
	// add this burst to the buffer so we can complete any partial commands
	memcpy(&gx_buff[gx_buff_bytesleft], data256, 32); // always 32byte bursts (256 bits)
	gx_buff_bytesleft += 32; // 32 bytes of data to process..

	// as long as we still have data to process this command ..
	gx_ptr = (void *)gx_buff;

	for(; gx_buff_bytesleft >= load_stage_size[load_stage] ;)  
	{
//		command_count++;
//		command_size += load_stage_size[load_stage];

		switch(load_stage)
		{
		case LOAD_OPCODE:
			 gx_buff_bytesleft -= 1;
			_asm
			{
				mov ecx, gx_ptr
				mov al, [ecx]
				mov data8, al
				add ecx, 1
				mov gx_ptr, ecx
			}

			// opcode decode stage, 0 is special 'NOP' case
			// after NOP the GPU is finished  ?
			if (data8 == 0)
			{
//				gx_fifo_finished();
			}
			else
			{
				// GP opcode	oooo ovvv	o - opcode, v - VAT
				// except special case 0x61 Load BP Reg - SU_ByPassCmd
				if (data8 == 0x61)
				{
					opcode = 0x61;
					vertex_attribute_table = 0x00;
					load_stage = LOAD_BP_REG;
				}
				else
				{
					opcode = data8 & 0xf8;
					vertex_attribute_table = data8 & 0x7;
					// GP opcode	oooo ovvv	o - opcode, v - VAT
					switch(opcode)
					{
					case 0x00: 
						syslog(GX,"NOP\n");
						break;
					case 0x08:
						if (vertex_attribute_table != 0)
						{
							syslog_error(GX,"GP: this should not happen %s %d\n", __FILE__, __LINE__);
						}
						load_stage = LOAD_CP_REG_SEL;
						break;
					case 0x10:
						if (vertex_attribute_table != 0)
						{
							syslog_error(GX,"GP: this should not happen %s %d\n", __FILE__, __LINE__);
						}
						load_stage = LOAD_XF_REG_SEL;
						break;
					case 0x40:
						// Call DL
						load_stage = LOAD_DL_ADDRESS;
						break;
					case 0x48:
						if (vertex_attribute_table != 0)
						{
							syslog_error(GX,"GP: this should not happen %s %d\n", __FILE__, __LINE__);
						}
						syslog(GX,"GP: Invalidate Vertex Cache\n");
						break;
					case 0x80:
						syslog(GX,"GP: Draw Quads VAT: %d\n", vertex_attribute_table);
						load_stage = LOAD_VERTEX_NUM;
						vx_begin(VX_QUADS);
						break;
					case 0x88:
						syslog(GX,"GP: Draw Quad Strip VAT: %d\n", vertex_attribute_table);
						load_stage = LOAD_VERTEX_NUM;
						vx_begin(VX_QUAD_STRIP);
						break;
					case 0x90:
						syslog(GX,"GP: Draw Triangles VAT: %d\n", vertex_attribute_table);
						load_stage = LOAD_VERTEX_NUM;
						vx_begin(VX_TRIANGLES);
						break;
					case 0x98:
						syslog(GX,"GP: Draw Triangle Strip VAT: %d\n", vertex_attribute_table);
						load_stage = LOAD_VERTEX_NUM;
						vx_begin(VX_TRIANGLE_STRIP);
						break;
					case 0xa0:
						syslog(GX,"GP: Draw Triangle Fan VAT: %d\n", vertex_attribute_table);
						load_stage = LOAD_VERTEX_NUM;
						vx_begin(VX_TRIANGLE_FAN);
						break;
					case 0xa8:
						syslog(GX,"GP: Draw Lines VAT: %d\n", vertex_attribute_table);
						load_stage = LOAD_VERTEX_NUM;
						vx_begin(VX_LINES);
						break;
					case 0xb0:
						syslog(GX,"GP: Draw Line  Strip VAT: %d\n", vertex_attribute_table);
						load_stage = LOAD_VERTEX_NUM;
						vx_begin(VX_LINE_STRIP);
						break;
					case 0xb8:
						syslog(GX,"GP: Draw Points VAT: %d\n", vertex_attribute_table);
						load_stage = LOAD_VERTEX_NUM;
						vx_begin(VX_POINTS);
						break;
					default:
						syslog_error(GX,"GP: opcode not implemented: %02x%\n", opcode);
						break;
					}
				}
			}
			break;
		case LOAD_BP_REG:
			 gx_buff_bytesleft -= 4;
			// source data is in big endian!
			_asm
			{
				mov ecx, gx_ptr
				mov eax, [ecx]
				bswap eax
				mov data32, eax
				add ecx, 4
				mov gx_ptr, ecx
			}

			gp_bp_write_reg32(data32 >> 24, data32 & 0x00ffffff);

			
			// we could finish render on GXCopyDisplay (why not?)
			// TODO: check if it is copying to active display area in YUV or rendering to texture!
			// this will prevent unneeded GL flushes!
			if ((data32 >> 24) == 0x52)
			{
				//fprintf(stderr, "%08x %x\n", data32, (data32 >> 2) & 0x7);
				if(data32 & 0x10000)
				{
					//clear area
					//glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);		// Clear The Screen And The Depth Buffer	
				}
				if (((data32 >> 2) & 0x7) == 0)
				{
					// destination format == 0 (YCbCr)
					//== 0x004003)
//					if (render_count)
					{
						schedule_finish = 1;
						gx_fifo_finished();
                        render_count = 0;
					}
				}
			}

			//
			// PE done marker (marks end of display list?) 
			//
/*			if (data32 == 0x45000002)
			{
				// but we will flush on DrawDone :)
				if (render_count)
				{
					schedule_finish = 1;
					gx_fifo_finished();
 				}
				else
				{
					syslog(GX,"Empty List\n");
				}
				render_count = 0;
			}
*/			load_stage = LOAD_OPCODE;
			break;
		case LOAD_CP_REG_SEL:
			 gx_buff_bytesleft -= 1;
			_asm
			{
				mov ecx, gx_ptr
				mov al, [ecx]
				mov data8, al
				add ecx, 1
				mov gx_ptr, ecx
			}
			cp_reg_sel = data8;
			load_stage = LOAD_CP_REG;
			break;
		case LOAD_CP_REG:
			 gx_buff_bytesleft -= 4;
			// source data is in big endian!
			_asm
			{
				mov ecx, gx_ptr
				mov eax, [ecx]
				bswap eax
				mov data32, eax
				add ecx, 4
				mov gx_ptr, ecx
			}
			gp_cp_write_reg(cp_reg_sel, data32);
			load_stage = LOAD_OPCODE;
			break;
		case LOAD_XF_REG_SEL:
			 gx_buff_bytesleft -= 4;
			// source data is in big endian!
			_asm
			{
				mov ecx, gx_ptr
				mov eax, [ecx]
				bswap eax
				mov data32, eax
				add ecx, 4
				mov gx_ptr, ecx
			}
			xf_reg_sel = data32 & 0xffff;
			xf_reg_load_len = data32 >> 16;
			load_stage = LOAD_XF_REG;
			break;
		case LOAD_XF_REG_SEL_2:
			 gx_buff_bytesleft -= 2;
			// source data is in big endian!
			_asm
			{
				mov ecx, gx_ptr
				mov ax, [ecx]
				xchg al, ah
				mov data16, ax
				add ecx, 2
				mov gx_ptr, ecx
			}
			xf_reg_sel = data16 & 0xffff;
			load_stage = LOAD_XF_REG;
			break;
		case LOAD_XF_REG:
			 gx_buff_bytesleft -= 4;
			// source data is in big endian!
			_asm
			{
				mov ecx, gx_ptr
				mov eax, [ecx]
				bswap eax
				mov data32, eax
				add ecx, 4
				mov gx_ptr, ecx
			}
			gp_xf_write_reg(xf_reg_sel, data32);
			xf_reg_sel++;
			xf_reg_load_len--;
			if (xf_reg_load_len < 0) 
				load_stage = LOAD_OPCODE;
			break;
		case LOAD_VERTEX_NUM:
			 gx_buff_bytesleft -= 2;
			// source data is in big endian!
			_asm
			{
				mov ecx, gx_ptr
				mov ax, [ecx]
				xchg al, ah
				mov data16, ax
				add ecx, 2
				mov gx_ptr, ecx
			}
#if USE_VERTEX_PTR
			vx_vertex_ptr.ubp = vx_vertex_data_ub;
			vx_vertex_ptr_temp.ubp = vx_vertex_data_ub;
			vx_vertex_byte_pos = 0;
#else
			vx_vertex_data = vx_vertex_data_static;
#endif
			vertex_count = data16;
			vertex_count = data16;
			if (vertex_count == 0)
			{
				// sometimes there is no single vertex, strange but true :)
				// an empty end of primitives
				syslog(GX,"GX: Empty\n");
				vx_end();
				load_stage = LOAD_OPCODE;
			}
			else
			{
				vx_begin_count(vertex_count);
				vertex_elements = vx_prepare_table(vertex_attribute_table);
				syslog(GX,"GP: Vertex Count: %d of %d elements\n", vertex_count, vertex_elements);
	//			syslog(GX,"GP: Vertex Descriptor: %08x %08x\n", gp_cp_regs[0x50], gp_cp_regs[0x60]);
	//			syslog(GX,"POS: %02x")
				vertex_pos = 0;
				load_stage = LOAD_VERTEX;
			}
			break;
		case LOAD_VERTEX:
			// $OPTIMIZATION HINT$: loading 1 byte at a time is very very slow, it could load much more each time
			//
			 gx_buff_bytesleft -= 1;
			//syslog(GX,"VERTEX pos %d %08x.%d (%02x)\n", vertex_pos, data, size, vx_vertex_data_size[vertex_pos]);
	#if USE_VERTEX_PTR
			// feed big endian data byte by byte
			_asm
			{
				mov ecx, gx_ptr
				mov al, [ecx]
				mov data8, al
				add ecx, 1
				mov gx_ptr, ecx
			}

			vx_vertex_ptr_temp.ubp[0] = data8;
			vx_vertex_ptr_temp.ubp += 1;
			vx_vertex_byte_pos += 1;

			if (vx_vertex_byte_pos >= vertex_elements)
			{
				if (vx_vertex_byte_pos > vertex_elements)
				{
					syslog(VX,"THIS SHOULD NEVER HAPPEN\n");
				}
				vx_process_commands();

				vx_vertex_ptr_temp.ubp = vx_vertex_data_ub;
				vx_vertex_byte_pos = 0;
				vertex_count--;
				if (vertex_count == 0)
				{
					render_count++;
					vx_end();
					load_stage = LOAD_OPCODE;
				}
			}
	#else
			if ((vx_vertex_data_size[vertex_pos] & 0xf) != 1)
			{
				uint32 cp_len = (vx_vertex_data_size[vertex_pos] >> 4) & 0xf;
				uint32 cp_size = vx_vertex_data_size[vertex_pos] & 0xf;
				if (1 != cp_len)
				{
					vx_process_commands();
					syslog_warn(VX,"Cannot upload FIFO at: %d %02x %d\n", vertex_pos, vx_vertex_data_size[vertex_pos], );
					syslog_warn(VX,"Assuming casting input\n");
					vx_vertex_data[vertex_pos].i = data;
					vertex_pos++;
				}
				else
				{
					syslog_error(VI,"FIFO conversion *NOT IMPLEMENTED* %d->%d\n", 1 , cp_size);
					break;
				}
			}
			else
			{
				vx_vertex_data[vertex_pos].i = data;
				vertex_pos++;
			}
			if (vertex_pos >= vertex_elements)
			{
				if (vertex_pos > vertex_elements)
				{
					syslog_error(VI,"THIS SHOULD NEVER HAPPEN\n");
				}
				vx_process_commands();
				vertex_pos = 0;
				vertex_count--;
				if (vertex_count == 0)
				{
					render_count++;
					glEnd();
					load_stage = LOAD_OPCODE;
				}
			}
	#endif // USE_VERTEX_PTR
			break;

		case LOAD_DL_ADDRESS:
			 gx_buff_bytesleft -= 4;
			// source data is in big endian!
			_asm
			{
				mov ecx, gx_ptr
				mov eax, [ecx]
				bswap eax
				mov data32, eax
				add ecx, 4
				mov gx_ptr, ecx
			}
			gx_dl_addr = data32; 
			load_stage = LOAD_DL_SIZE;
			break;

		case LOAD_DL_SIZE:
			 gx_buff_bytesleft -= 4;
			// source data is in big endian!
			_asm
			{
				mov ecx, gx_ptr
				mov eax, [ecx]
				bswap eax
				mov data32, eax
				add ecx, 4
				mov gx_ptr, ecx
			}
			gx_dl_size = data32; 
			load_stage = LOAD_OPCODE;
			gx_parse_display_list(gx_dl_addr, gx_dl_size);
			//glEnd();
			break;
		default:
			syslog(GX,"GP: this should never happen (fifo load stage %d) %s line: %d\n", load_stage, __FILE__, __LINE__);
			exit(1);
			break;
		}
	}
}



bool pe_check_interrupt(void)
{
	return pe_finished;
}
