/***************************************************************************
                      gx_fifo_interface.c  -  description
                             -------------------
    begin                : 
    copyright            : (C) 2005 by Duddie
    email                : duddie@walla.com
 ***************************************************************************/

/***************************************************************************
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version. See also the license.txt file for *
 *   additional informations.                                              *
 *                                                                         *
 ***************************************************************************/

//*************************************************************************//
// History of changes:
//
// 2005/05/30 - Pete
// - added lines and points
//
// 2005/05/28 - Pete
// - removed Duddie's "frame finish" hack, we are now doing real EFB/XFB 
//
//*************************************************************************//

#include "stdafx.h"
#include "gx_fifo_interface.h"
#include "vertex_processor.h"
#include "cp_regs.h"
#include "bp_regs.h"
#include "xf_regs.h"
#include "ogl_generic.h"
#include <stdio.h>
#include <string.h>

#define _IN_GXFIFO
#include "externals.h"

///////////////////////////////////////////////////////////////////////////
// DEFINES + GLOBALS

// States of the FIFO command decoder. The numeric value of each state is
// also its index into load_stage_size[], so the two must stay in step.
enum fifo_decode_stages
{
    LOAD_OPCODE       = 0,   // next byte is a command opcode
    LOAD_BP_REG       = 1,   // 32-bit BP register word (follows opcode 0x61)
    LOAD_CP_REG_SEL   = 2,   // 8-bit CP register selector (follows opcode 0x08)
    LOAD_CP_REG       = 3,   // 32-bit CP register value
    LOAD_XF_REG_SEL   = 4,   // XF length/selector (follows opcode 0x10)
    LOAD_XF_REG_SEL_2 = 5,   // 16-bit XF selector when the header arrives as two halfwords
    LOAD_XF_REG       = 6,   // 32-bit XF register value(s)
    LOAD_VERTEX_NUM   = 7,   // 16-bit vertex count of a draw command
    LOAD_VERTEX       = 8,   // raw vertex bytes
    LOAD_DL_ADDRESS   = 9,   // 32-bit display list address (follows opcode 0x40)
    LOAD_DL_SIZE      = 10   // 32-bit display list size
};

// Number of bytes gx_write_fifo() needs in its buffer before it can
// execute a given decode stage; indexed by enum fifo_decode_stages.
uint8 load_stage_size[]=
{
    /* LOAD_OPCODE       */ 1,
    /* LOAD_BP_REG       */ 4,
    /* LOAD_CP_REG_SEL   */ 1,
    /* LOAD_CP_REG       */ 4,
    /* LOAD_XF_REG_SEL   */ 4,
    /* LOAD_XF_REG_SEL_2 */ 2,
    /* LOAD_XF_REG       */ 4,
    /* LOAD_VERTEX_NUM   */ 2,
    /* LOAD_VERTEX       */ 1,
    /* LOAD_DL_ADDRESS   */ 4,
    /* LOAD_DL_SIZE      */ 4
};

// Render statistics counter. In this file it is bumped once per vertex by
// gx_parse_display_list() and once per completed primitive by the FIFO writers.
int render_count    = 0;

// Address and size of the most recent "call display list" command (opcode 0x40),
// as received through gx_write_fifo().
uint32 gx_dl_addr   = 0;
uint32 gx_dl_size   = 0;

// --- state of the draw command currently being decoded ---
// low three bits of the draw opcode (vertex attribute table index)
static uint8           vertex_attribute_table;
// vertices still to be received (FIFO path) / to be parsed (display list path)
static uint32          vertex_count = 0;
// value returned by vx_prepare_table() on the FIFO path; compared against
// vx_vertex_byte_pos, i.e. used as the number of bytes per vertex
static uint32          vertex_elements;
// value returned by vx_prepare_table() on the display list path; used as the
// byte stride between consecutive vertices
static uint32          vertex_size;
// number of bytes of the current vertex collected so far
static uint32          vx_vertex_byte_pos;
// write cursor into vx_vertex_data_ub while a vertex is being collected
static vx_vertex_ptr_t vx_vertex_ptr_temp;

// --- decoder state shared by all gx_write_fifo*() entry points ---
// counts consecutive zero writes seen in the LOAD_OPCODE stage (8/16/32 bit writers)
static uint8  nop_counter = 0;
// last decoded opcode with the VAT bits masked off (0x61 is stored unmasked)
static uint8  opcode      = 0xff;
// current decode stage, one of enum fifo_decode_stages
static uint8  load_stage  = 0;
// CP register selected by the last LOAD_CP_REG_SEL stage
static int    cp_reg_sel;
// next XF register to be written
static int    xf_reg_sel;
// XF writes still outstanding minus one (the load ends when this drops below 0)
static int    xf_reg_load_len;
// reset to 0 whenever a new vertex batch starts; not otherwise used in the visible code
static uint32 vertex_pos;

// internal command buffer used by gx_write_fifo()
static uint8  gx_buff[4096];
// number of unprocessed bytes currently held in gx_buff
static uint32 gx_buff_bytesleft = 0;
// read cursor into gx_buff (first unprocessed byte)
static void * gx_ptr;

///////////////////////////////////////////////////////////////////////////
// inline helpers

// Reverses the byte order of a 32-bit value (big endian <-> little endian).
//
// Written in plain C: the previous MSVC-only __asm version had no return
// statement and relied on the result being left in EAX, which only builds
// on 32-bit x86 MSVC.
static __inline uint32 byteswap32(uint32 data)
{
 return ((data >> 24) & 0x000000ffu) |
        ((data >>  8) & 0x0000ff00u) |
        ((data <<  8) & 0x00ff0000u) |
        ((data << 24) & 0xff000000u);
}

// Exchanges the two bytes of a 16-bit value (big endian <-> little endian).
static __inline uint16 byteswap16(uint16 data)
{
 uint16 low_to_high = (uint16)(data << 8);   // old low byte, now in bits 8..15
 uint16 high_to_low = (uint16)(data >> 8);   // old high byte, now in bits 0..7

 return (uint16)(low_to_high | high_to_low);
}

///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////

void gx_parse_display_list(uint32 addr, uint32 size)
{
 uint32  i, j;
 uint8   data8;
 uint16  data16;
 uint32  data32;
 uint32  xf_count, xf_sel;
 uint8   cmd;

 addr &= 0x01ffffff;

 for(i=0;i<size;)
  {
   cmd = gx_memory[addr+i];
        
   i++;
   
   switch(cmd)
    {//------------------------------------------------//
     case 0x00:                                        // NOP
      break;
     //------------------------------------------------//
     case 0x08:                                        // CP

      data8 = gx_memory[addr + i];
      i++;
      data32 = *(uint32 *)(&gx_memory[addr + i]);
      data32 = byteswap32(data32);
      i += 4;
      gp_cp_write_reg(data8, data32);
      break;
     //------------------------------------------------//
     case 0x10:                                        // XF

      data16 = *(uint16 *)(&gx_memory[addr + i]);
      xf_count = byteswap16(data16) + 1;
      i += 2;
      data16 = *(uint16 *)(&gx_memory[addr + i]);
      xf_sel = byteswap16(data16) + 1;
      i += 2;
      for(j = 0 ; j < xf_count ; j++)
       {
        data32 = *(uint32 *)(&gx_memory[addr + i]);
        data32 = byteswap32(data32);
        i += 4;
        gp_xf_write_reg(xf_sel, data32);
        xf_sel++;
       }
      break;
     //------------------------------------------------//
     case 0x20:                                        // IDX A,B,C,D
     case 0x28:
     case 0x30:
     case 0x38:
//PETE:
#ifdef AUX_ILLEGALCALLS 
      auxprintf("INDEX DL %x\n",cmd);
#endif
      break;
     //------------------------------------------------//
     case 0x40:                                        // CALL DL
#ifdef AUX_ILLEGALCALLS 
      auxprintf("CALL DL FROM DL%x\n",cmd);
#endif
      break;
     //------------------------------------------------// INVALIDATE VTX CACHE
     case 0x48:
      //auxprintf("IVV\n");
      break;
     //------------------------------------------------//
     case 0x61:                                        // LOAD BP REG

      data32 = *(uint32 *)(&gx_memory[addr + i]);
      data32 = byteswap32(data32);
      i += 4;
      gp_bp_write_reg32(data32 >> 24, data32 & 0x00ffffff);
      break;
     //------------------------------------------------//
     default:

      data16 = *(uint16 *)(&gx_memory[addr + i]);
      data16 = byteswap16(data16);
      i+=2;
      vertex_count = data16;
      vertex_attribute_table = cmd & 0x7;
      vertex_size = vx_prepare_table(vertex_attribute_table);

      // syslog(GX,"DL Draw Quads VAT: %x Count: %d Size: %d\n", vertex_attribute_table, vertex_count, vertex_size);

      switch(cmd & 0xf8)
       {
        case 0x80:
         vx_begin(GL_QUADS);
         break;

        case 0x90:
         vx_begin(GL_TRIANGLES);
         break;
 
        case 0x98:
         vx_begin(GL_TRIANGLE_STRIP);
         break;

        case 0xa0:
         vx_begin(GL_TRIANGLE_FAN);
         break;
                  
        case 0xa8:
         vx_begin(GL_LINES);
         break;

        case 0xb0:
         vx_begin(GL_LINE_STRIP);
         break;

        case 0xb8:
         vx_begin(GL_POINTS);
         break;

        default:
#ifdef AUX_ILLEGALCALLS 
      auxprintf("ILLEGAL DL CMD %02x %08x\n", cmd, *(uint32 *)&gx_memory[addr + i]);
#endif
//       vx_begin(GL_POLYGON);
         break;
       }

      for(j = 0 ; j < vertex_count ; j++)
       {
        vx_vertex_ptr.ubp = gx_memory + addr + i;
        i += vertex_size;
        vx_process_commands();
        render_count++;
       }
      vx_end();


      break;
     //------------------------------------------------//
    }
  }
}

///////////////////////////////////////////////////////////////////////////
//
// GX RENDERING LIST PARSER
//
//
// GX has an internal buffer to which incoming bursts of data are appended.
// It processes the data in this buffer until it is either finished, empty, or there is not
// enough data to complete the current command
//
// We add data at the end of the buffer, and we process the buffer by cruising through it on 
// a casted void pointer until there is no more data to process
//
// The only correct way to feed the GPU is through WPAR which bursts 32bytes of data to the GPU
// NOTE: just keep in mind that the WPAR data is in big endian format !!
//
///////////////////////////////////////////////////////////////////////////

void gx_write_fifo(uint8 *databurst, uint32 len)
{
 uint32 data32;
 uint16 data16;
 uint8  data8;
    
 // move partial bytes to front of buffer
 if(gx_buff_bytesleft>0)
  {
   memcpy(&gx_buff[0],gx_ptr, gx_buff_bytesleft);
  }

 // add this burst to the buffer so we can complete any partial commands
 memcpy(&gx_buff[gx_buff_bytesleft], databurst, len);     // always 32byte bursts (256 bits)
 gx_buff_bytesleft += len;                              // 32 bytes of data to process..

 // as long as we still have data to process this command ..
 gx_ptr=(void *)gx_buff;

 for(;gx_buff_bytesleft>=load_stage_size[load_stage];)  
  {
//if(GetAsyncKeyState(VK_SHIFT)&32768)
// auxprintf("st %d\n",load_stage);


   switch(load_stage)                                 
    {//------------------------------------------------//
     case LOAD_OPCODE:

      gx_buff_bytesleft -= 1;
      __asm
        {
         mov ecx, gx_ptr
         mov al,  [ecx]
         mov data8, al
         add ecx, 1
         mov gx_ptr, ecx
        }

      // opcode decode stage, 0 is special 'NOP' case
      // after NOP the GPU is finished  ?

      if (data8 == 0)
       {
       }
      else
       {
        // GP opcode    oooo ovvv   o - opcode, v - VAT
        // except special case 0x61 Load BP Reg - SU_ByPassCmd

        if(data8==0x61)
         {
          opcode=0x61;
          vertex_attribute_table=0x00;
          load_stage=LOAD_BP_REG;
         }
        else
         {
          opcode = data8 & 0xf8;
          vertex_attribute_table = data8 & 0x7;
 
          // GP opcode    oooo ovvv   o - opcode, v - VAT

          switch(opcode)                               
           {//-----------------------------------------//
            case 0x00: 
             // syslog(GX,"NOP\n");
             break;
            //-----------------------------------------//
            case 0x08:
             //if (vertex_attribute_table != 0) syslog_error(GX,"GP: this should not happen %s %d\n", __FILE__, __LINE__);
             load_stage = LOAD_CP_REG_SEL;
             break;
            //-----------------------------------------//
            case 0x10:
             //if (vertex_attribute_table != 0) syslog_error(GX,"GP: this should not happen %s %d\n", __FILE__, __LINE__);
             load_stage = LOAD_XF_REG_SEL;
             break;
            //-----------------------------------------//
            case 0x40:
             load_stage = LOAD_DL_ADDRESS;             // Call DL
             break;
            //-----------------------------------------//
            case 0x48:
             //if (vertex_attribute_table != 0) syslog_error(GX,"GP: this should not happen %s %d\n", __FILE__, __LINE__);
             //syslog(GX,"GP: Invalidate Vertex Cache\n");
             break;
            //-----------------------------------------//
            case 0x80:
             //syslog(GX,"GP: Draw Quads VAT: %d\n", vertex_attribute_table);
             load_stage = LOAD_VERTEX_NUM;
             vx_begin(GL_QUADS);
             break;
            //-----------------------------------------//
            case 0x88:
             //syslog(GX,"GP: Draw Quad Strip VAT: %d\n", vertex_attribute_table);
             load_stage = LOAD_VERTEX_NUM;
             vx_begin(GL_QUAD_STRIP);
             break;
            //-----------------------------------------//
            case 0x90:
             //syslog(GX,"GP: Draw Triangles VAT: %d\n", vertex_attribute_table);
             load_stage = LOAD_VERTEX_NUM;
             vx_begin(GL_TRIANGLES);
             break;
            //-----------------------------------------//
            case 0x98:
             //syslog(GX,"GP: Draw Triangle Strip VAT: %d\n", vertex_attribute_table);
             load_stage = LOAD_VERTEX_NUM;
             vx_begin(GL_TRIANGLE_STRIP);
             break;
            //-----------------------------------------//
            case 0xa0:
             //syslog(GX,"GP: Draw Triangle Fan VAT: %d\n", vertex_attribute_table);
             load_stage = LOAD_VERTEX_NUM;
             vx_begin(GL_TRIANGLE_FAN);
             break;
            //-----------------------------------------//
            case 0xa8:
             //syslog(GX,"GP: Draw Lines VAT: %d\n", vertex_attribute_table);
             load_stage = LOAD_VERTEX_NUM;
             vx_begin(GL_LINES);
             break;
            //-----------------------------------------//
            case 0xb0:
             //syslog(GX,"GP: Draw Line  Strip VAT: %d\n", vertex_attribute_table);
             load_stage = LOAD_VERTEX_NUM;
             vx_begin(GL_LINE_STRIP);
             break;
            //-----------------------------------------//
            case 0xb8:
             //syslog(GX,"GP: Draw Points VAT: %d\n", vertex_attribute_table);
             load_stage = LOAD_VERTEX_NUM;
             vx_begin(GL_POINTS);
             break;
            //-----------------------------------------//
            default:
             //syslog_error(GX,"GP: opcode not implemented: %02x%\n", opcode);
             break;
           }
         }
       }
      break;
     //------------------------------------------------//
     case LOAD_BP_REG:

      gx_buff_bytesleft -= 4;
      // source data is in big endian!
      __asm
       {
        mov ecx, gx_ptr
        mov eax, [ecx]
        bswap eax
        mov data32, eax
        add ecx, 4
        mov gx_ptr, ecx
       }

      gp_bp_write_reg32(data32 >> 24, data32 & 0x00ffffff);

      load_stage = LOAD_OPCODE;
      break;
     //------------------------------------------------//
     case LOAD_CP_REG_SEL:

      gx_buff_bytesleft -= 1;
      __asm
       {
        mov ecx, gx_ptr
        mov al, [ecx]
        mov data8, al
        add ecx, 1
        mov gx_ptr, ecx
       }
      cp_reg_sel = data8;
      load_stage = LOAD_CP_REG;
      break;
     //------------------------------------------------//
     case LOAD_CP_REG:
 
      gx_buff_bytesleft -= 4;
      // source data is in big endian!
      __asm
       {
        mov ecx, gx_ptr
        mov eax, [ecx]
        bswap eax
        mov data32, eax
        add ecx, 4
        mov gx_ptr, ecx
       }
  
      gp_cp_write_reg(cp_reg_sel, data32);
      load_stage = LOAD_OPCODE;
      break;
     //------------------------------------------------//
     case LOAD_XF_REG_SEL:

      gx_buff_bytesleft -= 4;
      // source data is in big endian!
      __asm
       {
        mov ecx, gx_ptr
        mov eax, [ecx]
        bswap eax
        mov data32, eax
        add ecx, 4
        mov gx_ptr, ecx
       }

      xf_reg_sel = data32 & 0xffff;
      xf_reg_load_len = data32 >> 16;
      load_stage = LOAD_XF_REG;
      break;
     //------------------------------------------------//
     case LOAD_XF_REG_SEL_2:

      gx_buff_bytesleft -= 2;
      // source data is in big endian!
      __asm
       {
        mov ecx, gx_ptr
        mov ax, [ecx]
        xchg al, ah
        mov data16, ax
        add ecx, 2
        mov gx_ptr, ecx
       }
      xf_reg_sel = data16 & 0xffff;
      load_stage = LOAD_XF_REG;
      break;
     //------------------------------------------------//
     case LOAD_XF_REG:

      gx_buff_bytesleft -= 4;
      // source data is in big endian!
      __asm
       {
        mov ecx, gx_ptr
        mov eax, [ecx]
        bswap eax
        mov data32, eax
        add ecx, 4
        mov gx_ptr, ecx
       }
      gp_xf_write_reg(xf_reg_sel, data32);
      xf_reg_sel++;
      xf_reg_load_len--;
      if(xf_reg_load_len<0) load_stage=LOAD_OPCODE;
      break;
     //------------------------------------------------//
     case LOAD_VERTEX_NUM:

      gx_buff_bytesleft -= 2;
      // source data is in big endian!
      __asm
       {
        mov ecx, gx_ptr
        mov ax, [ecx]
        xchg al, ah
        mov data16, ax
        add ecx, 2
        mov gx_ptr, ecx
       }
      vx_vertex_ptr.ubp = vx_vertex_data_ub;
      vx_vertex_ptr_temp.ubp = vx_vertex_data_ub;
      vx_vertex_byte_pos = 0;
      vertex_count = data16;

      if(vertex_count==0)
       {
        // sometimes there is no single vertex, strange but true :)
        // an empty end of primitives
        // syslog(GX,"GX: Empty\n");
        vx_end();
        load_stage = LOAD_OPCODE;
       }
      else
       {
        vertex_elements = vx_prepare_table(vertex_attribute_table);
        // syslog(GX,"GP: Vertex Count: %d of %d elements\n", vertex_count, vertex_elements);
        // syslog(GX,"GP: Vertex Descriptor: %08x %08x\n", gp_cp_regs[0x50], gp_cp_regs[0x60]);
        // syslog(GX,"POS: %02x")
        vertex_pos = 0;
        load_stage = LOAD_VERTEX;
       }
      break;
     //------------------------------------------------//
     case LOAD_VERTEX:
      // $OPTIMIZATION HINT$: loading 1 byte at a time is very very slow, it could load much more each time

      gx_buff_bytesleft -= 1;
      //syslog(GX,"VERTEX pos %d %08x.%d (%02x)\n", vertex_pos, data, size, vx_vertex_data_size[vertex_pos]);

      // feed big endian data byte by byte
      __asm
       {
        mov ecx, gx_ptr
        mov al, [ecx]
        mov data8, al
        add ecx, 1
        mov gx_ptr, ecx
       }

      vx_vertex_ptr_temp.ubp[0] = data8;
      vx_vertex_ptr_temp.ubp += 1;
      vx_vertex_byte_pos += 1;

      if(vx_vertex_byte_pos>=vertex_elements)
       {
        // if (vx_vertex_byte_pos > vertex_elements) syslog(VX,"THIS SHOULD NEVER HAPPEN\n");
        vx_process_commands();

        vx_vertex_ptr_temp.ubp = vx_vertex_data_ub;
        vx_vertex_byte_pos = 0;
        vertex_count--;

        if(vertex_count==0)
         {
          render_count++;
          vx_end();
          load_stage = LOAD_OPCODE;
         }
       }
      break;
     //------------------------------------------------//
     case LOAD_DL_ADDRESS:

      gx_buff_bytesleft -= 4;
      // source data is in big endian!
      __asm
       {
        mov ecx, gx_ptr
        mov eax, [ecx]
        bswap eax
        mov data32, eax
        add ecx, 4
        mov gx_ptr, ecx
       }
      gx_dl_addr = data32; 
      load_stage = LOAD_DL_SIZE;
      break;
     //------------------------------------------------//
     case LOAD_DL_SIZE:

      gx_buff_bytesleft -= 4;
      // source data is in big endian!
      __asm
       {
        mov ecx, gx_ptr
        mov eax, [ecx]
        bswap eax
        mov data32, eax
        add ecx, 4
        mov gx_ptr, ecx
       }
      gx_dl_size = data32; 
      load_stage = LOAD_OPCODE;
      gx_parse_display_list(gx_dl_addr, gx_dl_size);
      break;
     //------------------------------------------------//
     default:
      //syslog(GX,"GP: this should never happen (fifo load stage %d) %s line: %d\n", load_stage, __FILE__, __LINE__);
      gpu_exit(1,6);
      break;
     //------------------------------------------------//
    }
  }
}

///////////////////////////////////////////////////////////////////////////

// Handles a single byte written to the FIFO.
//
// data - the byte written (only meaningful in the stages that accept
//        8-bit writes)
//
// Accepted stages: LOAD_OPCODE (decodes the command), LOAD_CP_REG_SEL
// (stores the CP register selector) and LOAD_VERTEX (appends one byte to
// the vertex being collected). A byte arriving in LOAD_XF_REG or
// LOAD_VERTEX_NUM ends in gpu_exit(); in every other known stage it is
// silently dropped. An unknown stage ends in gpu_exit(1,9).

void gx_write_fifo8(uint32 data)
{
 int begins_primitive = 0;                             // opcode opens a draw command
 int gl_primitive     = 0;                             // mode passed to vx_begin() in that case

 //----------------------------------------------------//
 // every stage except opcode decoding
 if(load_stage != LOAD_OPCODE)
  {
   switch(load_stage)
    {
     case LOAD_CP_REG_SEL:
      cp_reg_sel = data;
      load_stage = LOAD_CP_REG;
      return;

     case LOAD_VERTEX:
      *vx_vertex_ptr_temp.ubp = data;
      vx_vertex_ptr_temp.ubp++;
      vx_vertex_byte_pos++;

      // vertex not complete yet
      if(vx_vertex_byte_pos < vertex_elements) return;

      vx_process_commands();

      vx_vertex_ptr_temp.ubp = vx_vertex_data_ub;
      vx_vertex_byte_pos = 0;
      vertex_count--;

      // more vertices to come
      if(vertex_count != 0) return;

      render_count++;
      vx_end();
      load_stage = LOAD_OPCODE;
      return;

     case LOAD_XF_REG:                                 // XF data must not arrive as 8 bit
      gpu_exit(1,7);
      return;

     case LOAD_VERTEX_NUM:                             // vertex count must not arrive as 8 bit
      gpu_exit(1,8);
      return;

     case LOAD_BP_REG:                                 // wrong width for these stages:
     case LOAD_CP_REG:                                 // the byte is dropped
     case LOAD_XF_REG_SEL:
     case LOAD_XF_REG_SEL_2:
     case LOAD_DL_ADDRESS:
     case LOAD_DL_SIZE:
      return;

     default:                                          // unknown decode stage
      gpu_exit(1,9);
      return;
    }
  }

 //----------------------------------------------------//
 // opcode decode stage
 if(data == 0)
  {
   nop_counter++;
   return;
  }

 nop_counter = 0;

 // special case 0x61 Load BP Reg - SU_ByPassCmd
 if(data == 0x61)
  {
   opcode = 0x61;
   vertex_attribute_table = 0x00;
   load_stage = LOAD_BP_REG;
   return;
  }

 // GP opcode    oooo ovvv   o - opcode, v - VAT
 opcode = data & 0xf8;
 vertex_attribute_table = data & 0x7;

 switch(opcode)
  {
   case 0x08: load_stage = LOAD_CP_REG_SEL; break;     // load CP register
   case 0x10: load_stage = LOAD_XF_REG_SEL; break;     // load XF register(s)
   case 0x40: load_stage = LOAD_DL_ADDRESS; break;     // call display list

   case 0x80: begins_primitive = 1; gl_primitive = GL_QUADS;          break;
   case 0x88: begins_primitive = 1; gl_primitive = GL_QUAD_STRIP;     break;
   case 0x90: begins_primitive = 1; gl_primitive = GL_TRIANGLES;      break;
   case 0x98: begins_primitive = 1; gl_primitive = GL_TRIANGLE_STRIP; break;
   case 0xa0: begins_primitive = 1; gl_primitive = GL_TRIANGLE_FAN;   break;
   case 0xa8: begins_primitive = 1; gl_primitive = GL_LINES;          break;
   case 0xb0: begins_primitive = 1; gl_primitive = GL_LINE_STRIP;     break;
   case 0xb8: begins_primitive = 1; gl_primitive = GL_POINTS;         break;

   default:                                            // 0x00 NOP, 0x48 invalidate vertex
    break;                                             // cache, unknown opcodes: no action
  }

 if(begins_primitive)
  {
   load_stage = LOAD_VERTEX_NUM;
   vx_begin(gl_primitive);
  }
}

///////////////////////////////////////////////////////////////////////////

// Handles a 16-bit value written to the FIFO.
//
// data - the halfword written, as a host-order number
//
// Accepted stages: LOAD_OPCODE (decodes the command), LOAD_XF_REG_SEL and
// LOAD_XF_REG_SEL_2 (XF header split into two halfwords: length first,
// register second), LOAD_VERTEX_NUM (vertex count) and LOAD_VERTEX (appends
// two bytes to the vertex being collected). A halfword arriving in
// LOAD_XF_REG ends in gpu_exit(1,10); in every other known stage it is
// silently dropped. An unknown stage ends in gpu_exit(1,11).

void gx_write_fifo16(uint32 data)
{
 switch(load_stage)                                    
  {//--------------------------------------------------//
   case LOAD_OPCODE:
    // opcode decode stage
 
    if(data==0)
     {
      nop_counter++;
      break;
     }

    nop_counter = 0;

    // special case 0x61 Load BP Reg - SU_ByPassCmd
    if(data==0x61)
     {
      opcode = 0x61;
      vertex_attribute_table = 0x00;
      load_stage = LOAD_BP_REG;
      break;
     }

    // GP opcode    oooo ovvv   o - opcode, v - VAT
    opcode = data & 0xf8;
    vertex_attribute_table = data & 0x7;

    switch(opcode)                                 
     {
      case 0x00:                                       // NOP with VAT bits set
       break;
      case 0x08:                                       // load CP register
       load_stage = LOAD_CP_REG_SEL;
       break;
      case 0x10:                                       // load XF register(s)
       load_stage = LOAD_XF_REG_SEL;
       break;
      case 0x40:                                       // call display list
       load_stage = LOAD_DL_ADDRESS;
       break;
      case 0x48:                                       // invalidate vertex cache: nothing to do
       break;
      case 0x80:                                       // draw quads
       load_stage = LOAD_VERTEX_NUM;
       vx_begin(GL_QUADS);
       break;
      case 0x88:                                       // draw quad strip
       load_stage = LOAD_VERTEX_NUM;
       vx_begin(GL_QUAD_STRIP);
       break;
      case 0x90:                                       // draw triangles
       load_stage = LOAD_VERTEX_NUM;
       vx_begin(GL_TRIANGLES);
       break;
      case 0x98:                                       // draw triangle strip
       load_stage = LOAD_VERTEX_NUM;
       vx_begin(GL_TRIANGLE_STRIP);
       break;
      case 0xa0:                                       // draw triangle fan
       load_stage = LOAD_VERTEX_NUM;
       vx_begin(GL_TRIANGLE_FAN);
       break;
      case 0xa8:                                       // draw lines
       load_stage = LOAD_VERTEX_NUM;
       vx_begin(GL_LINES);
       break;
      case 0xb0:                                       // draw line strip
       load_stage = LOAD_VERTEX_NUM;
       vx_begin(GL_LINE_STRIP);
       break;
      case 0xb8:                                       // draw points
       load_stage = LOAD_VERTEX_NUM;
       vx_begin(GL_POINTS);
       break;
      default:                                         // opcode not implemented: skipped
       break;
     }
    break;
   //--------------------------------------------------//
   case LOAD_BP_REG:                                   // wrong width for these stages:
   case LOAD_CP_REG_SEL:                               // the halfword is dropped
   case LOAD_CP_REG:
   case LOAD_DL_ADDRESS:
   case LOAD_DL_SIZE:
    break;
   //--------------------------------------------------//
   case LOAD_XF_REG_SEL:
    // XF header arriving as two halfwords: this one is the length ..
    xf_reg_load_len = data;
    load_stage = LOAD_XF_REG_SEL_2;
    break;
   //--------------------------------------------------//
   case LOAD_XF_REG_SEL_2:
    // .. and this one the first register
    xf_reg_sel = data & 0xffff;
    load_stage = LOAD_XF_REG;
    break;
   //--------------------------------------------------//
   case LOAD_XF_REG:
    // XF data must not arrive as 16 bit
    gpu_exit(1,10);
    break;
   //--------------------------------------------------//
   case LOAD_VERTEX_NUM:
    vx_vertex_ptr.ubp = vx_vertex_data_ub;
    vx_vertex_ptr_temp.ubp = vx_vertex_data_ub;
    vx_vertex_byte_pos = 0;
    vertex_count = data;
   
    if(vertex_count==0)
     {
      // sometimes there is no single vertex, strange but true :)
      // an empty end of primitives
      vx_end();
      load_stage = LOAD_OPCODE;
     }
    else
     {
      vertex_elements = vx_prepare_table(vertex_attribute_table);
      vertex_pos = 0;
      load_stage = LOAD_VERTEX;
     }
    break;
   //--------------------------------------------------//
   case LOAD_VERTEX:

    // The staging buffer holds the vertex in stream (big endian) order, so
    // the high byte goes first. Two byte stores are used instead of a store
    // through the 16-bit pointer: the cursor may sit on an odd address after
    // 8-bit writes, and the result no longer depends on the host byte order.
    vx_vertex_ptr_temp.ubp[0] = (uint8)(data >> 8);
    vx_vertex_ptr_temp.ubp[1] = (uint8)data;

    vx_vertex_ptr_temp.ubp += 2;
    vx_vertex_byte_pos += 2;

    if(vx_vertex_byte_pos>=vertex_elements)
     {
      // one complete vertex collected
      vx_process_commands();

      vx_vertex_ptr_temp.ubp = vx_vertex_data_ub;
      vx_vertex_byte_pos = 0;
      vertex_count--;
  
      if(vertex_count==0)
       {
        render_count++;
        vx_end();
        load_stage = LOAD_OPCODE;
       }
     }
    break;
   //--------------------------------------------------//
   default:
    // unknown decode stage
    gpu_exit(1,11);
    break;
   //--------------------------------------------------//
  }
}

///////////////////////////////////////////////////////////////////////////

// Feeds one 32-bit word written to the GX FIFO into the command decoder.
//
// The decoder is a state machine driven by the global 'load_stage':
//  - LOAD_OPCODE            : 'data' is interpreted as a command word
//  - LOAD_BP_REG            : 'data' carries BP register index + 24-bit value
//  - LOAD_CP_REG            : 'data' is the value for CP register 'cp_reg_sel'
//  - LOAD_XF_REG_SEL        : 'data' carries XF start register + length field
//  - LOAD_XF_REG            : 'data' is the next XF register value
//  - LOAD_VERTEX            : 'data' is 4 more bytes of the current vertex
//  - LOAD_DL_ADDRESS / SIZE : 'data' is the display list address / size
//
// Stages that get nothing useful from a 32-bit write (LOAD_CP_REG_SEL,
// LOAD_XF_REG_SEL_2) ignore the word and leave 'load_stage' untouched;
// LOAD_VERTEX_NUM and unknown stages call gpu_exit().
//
// data: the raw 32-bit value written to the FIFO.
void gx_write_fifo32(uint32 data)
{
 int draw_primitive = 0;   // GL primitive chosen by a draw opcode
 int have_primitive = 0;   // set only when the opcode is a draw command

 switch(load_stage)
  {
   case LOAD_OPCODE:
    // an all-zero word is only counted, nothing is decoded
    if(data == 0)
     {
      nop_counter++;
      break;
     }

    nop_counter = 0;

    // 0x61 (Load BP Reg - SU_ByPassCmd) is matched against the whole
    // word, it does not follow the opcode/VAT bit split used below
    if(data == 0x61)
     {
      opcode = 0x61;
      vertex_attribute_table = 0x00;
      load_stage = LOAD_BP_REG;
      break;
     }

    // GP opcode layout: oooo ovvv   (o - opcode, v - VAT index)
    opcode = data & 0xf8;
    vertex_attribute_table = data & 0x7;

    switch(opcode)
     {
      // ---- commands that are not draw calls ----
      case 0x00:                         // NOP
       break;
      case 0x08:                         // CP register load
       load_stage = LOAD_CP_REG_SEL;
       break;
      case 0x10:                         // XF register load
       load_stage = LOAD_XF_REG_SEL;
       break;
      case 0x40:                         // Call DL
       load_stage = LOAD_DL_ADDRESS;
       break;
      case 0x48:                         // Invalidate Vertex Cache: no action
       break;

      // ---- draw commands: only pick the GL primitive here ----
      case 0x80: draw_primitive = GL_QUADS;          have_primitive = 1; break;
      case 0x88: draw_primitive = GL_QUAD_STRIP;     have_primitive = 1; break;
      case 0x90: draw_primitive = GL_TRIANGLES;      have_primitive = 1; break;
      case 0x98: draw_primitive = GL_TRIANGLE_STRIP; have_primitive = 1; break;
      case 0xa0: draw_primitive = GL_TRIANGLE_FAN;   have_primitive = 1; break;
      case 0xa8: draw_primitive = GL_LINES;          have_primitive = 1; break;
      case 0xb0: draw_primitive = GL_LINE_STRIP;     have_primitive = 1; break;
      case 0xb8: draw_primitive = GL_POINTS;         have_primitive = 1; break;

      default:                           // opcode not implemented: ignored
       break;
     }

    // shared tail of all draw commands: next word has to be the vertex
    // count, then the primitive is opened
    if(have_primitive)
     {
      load_stage = LOAD_VERTEX_NUM;
      vx_begin(draw_primitive);
     }
    break;

   case LOAD_BP_REG:
    // top 8 bits: register index, low 24 bits: register value
    gp_bp_write_reg32(data >> 24, data & 0x00ffffff);
    load_stage = LOAD_OPCODE;
    break;

   case LOAD_CP_REG_SEL:
    // NOTE(review): a 32-bit write is dropped here and the decoder stays
    // in this stage - confirm this is intended
    break;

   case LOAD_CP_REG:
    gp_cp_write_reg(cp_reg_sel, data);
    load_stage = LOAD_OPCODE;
    break;

   case LOAD_XF_REG_SEL:
    // low 16 bits: first XF register, remaining high bits: length field
    xf_reg_sel = data & 0xffff;
    xf_reg_load_len = data >> 16;
    load_stage = LOAD_XF_REG;
    break;

   case LOAD_XF_REG_SEL_2:
    // NOTE(review): a 32-bit write is dropped here and the decoder stays
    // in this stage - confirm this is intended
    break;

   case LOAD_XF_REG:
    gp_xf_write_reg(xf_reg_sel, data);
    xf_reg_sel++;
    // the transfer is over once the length counter drops below zero
    // NOTE(review): needs xf_reg_load_len to be a signed type - confirm
    if(--xf_reg_load_len < 0)
     {
      load_stage = LOAD_OPCODE;
     }
    break;

   case LOAD_VERTEX_NUM:
    // a 32-bit write is not accepted as vertex count
    gpu_exit(1,12);
    break;

   case LOAD_VERTEX:
    // append the byte-swapped word to the vertex assembly buffer
    *vx_vertex_ptr_temp.ulp = byteswap32(data);
    vx_vertex_ptr_temp.ubp += 4;
    vx_vertex_byte_pos += 4;

    // vertex still incomplete: wait for more data
    if(vx_vertex_byte_pos < vertex_elements)
     {
      break;
     }

    // full vertex collected: process it and rewind the assembly buffer
    vx_process_commands();
    vx_vertex_ptr_temp.ubp = vx_vertex_data_ub;
    vx_vertex_byte_pos = 0;

    vertex_count--;
    if(vertex_count == 0)
     {
      // that was the last vertex of the primitive
      render_count++;
      vx_end();
      load_stage = LOAD_OPCODE;
     }
    break;

   case LOAD_DL_ADDRESS:
    gx_dl_addr = data;
    load_stage = LOAD_DL_SIZE;
    break;

   case LOAD_DL_SIZE:
    gx_dl_size = data;
    // the stage is reset to opcode decoding before the list gets parsed
    load_stage = LOAD_OPCODE;
    gx_parse_display_list(gx_dl_addr, gx_dl_size);
    break;

   default:
    // unknown fifo load stage
    gpu_exit(1,13);
    break;
  }
}

///////////////////////////////////////////////////////////////////////////

