/*
Copyright (C) 2006 StrmnNrmn

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

*/

#include "stdafx.h"
#include "PSPColour.h"

#include "SysPSP/DaedalusVFPU.h"
#include <pspvfpu.h>

#include "DaedMathUtil.h"
#include "Vector4.h"

namespace
{
#ifdef DAEDALUS_PSP_USE_VFPU

//*****************************************************************************
// Per-component scale factor (255 in every lane) used by the VFPU conversion
// routines in this file to map float colour components onto the 0..255 range.
// It is declared 16-byte aligned because those routines load it with lv.q and
// rely on it being aligned.
//*****************************************************************************
const v4 __attribute__((aligned(16))) SCALE( 255.0f, 255.0f, 255.0f, 255.0f );

//*****************************************************************************
// VFPU implementation of the clamped float-vector -> packed colour conversion.
// Multiplies the four components of *col_in by 255, clamps each to [0, 255],
// converts them to integers and packs them with c32::Make (x, y, z, w order).
//
// col_in may be unaligned (it is read with ulv.q).
//
// Around 354,000 ticks/million
//*****************************************************************************
u32 Vector2ColourClampedVFPU(const v4 * col_in)
{
	// The asm below uses R000..R002 as scratch.
	// NOTE(review): presumably this call reserves VFPU matrix 0 as a temporary
	// for that purpose - confirm against the pspvfpu documentation.
	pspvfpu_use_matrices(gDaedalusVFPUContext, 0, VMAT0);

	// Receives the four converted components, one integer per lane.
	u32		out_ints[4];

	__asm__ volatile (

		"ulv.q		R000, 0  + %1\n"		// Load col_in into R000 (unaligned load)
		"lv.q		R001, %2\n"				// Load SCALE into R001 (we know it's aligned)
		"vzero.q	R002\n"					// Load 0,0,0,0 into R002

		"vmul.q		R000, R000, R001\n"		// R000 = R000 * [255,255,255,255]
		"vmax.q		R000, R000, R002\n"		// R000 = max(R000, 0)		- lower clamp
		"vmin.q		R000, R000, R001\n"		// R000 = min(R000, 255)	- upper clamp (R001 still holds 255s)

		// NOTE(review): the rounding behaviour of vf2in may differ from the
		// truncating s32() cast used by the CPU version - confirm if exact
		// parity between the two paths matters.
		"vf2in.q	R000, R000, 0\n"		// R000 = (s32)(R000) << 0		- or is scale applied before? could use << 8 to scale to 0..255? Would need to be careful of 1.0 overflowing to 256
		"usv.q		R000, %0\n"				// Save out value (unaligned store into out_ints)

		: "=m" (out_ints) : "m" (*col_in), "m" (SCALE) : "memory" );

	return c32::Make( out_ints[0], out_ints[1], out_ints[2], out_ints[3] );
} 

//*****************************************************************************
// VFPU implementation of the unclamped float-vector -> packed colour
// conversion. Multiplies the four components of *col_in by 255, converts them
// to integers and packs them with c32::Make. No range clamping is performed.
//
// Vector2ColourUnclamped() in this file always uses the CPU version instead,
// so this routine is not called from here.
//
// Around 320,000 ticks/million
//*****************************************************************************
u32 Vector2ColourUnclampedVFPU(const v4 * col_in)
{
	// The asm below uses R000 and R001 as scratch.
	// NOTE(review): presumably this call reserves VFPU matrix 0 as a temporary
	// for that purpose - confirm against the pspvfpu documentation.
	pspvfpu_use_matrices(gDaedalusVFPUContext, 0, VMAT0);

	// Receives the four converted components, one integer per lane.
	u32		out_ints[4];

	__asm__ volatile (

		"ulv.q		R000, 0  + %1\n"		// Load col_in into R000 (unaligned load)
		"lv.q		R001, %2\n"				// Load SCALE into R001 (we know it's aligned)

		"vmul.q		R000, R000, R001\n"		// R000 = R000 * [255,255,255,255]

		"vf2in.q	R000, R000, 0\n"		// R000 = (s32)(R000) << 0		- or is scale applied before? could use << 8 to scale to 0..255? Would need to be careful of 1.0 overflowing to 256
		"usv.q		R000, %0\n"				// Save out value (unaligned store into out_ints)

		: "=m" (out_ints) : "m" (*col_in), "m" (SCALE) : "memory" );

	return c32::Make( out_ints[0], out_ints[1], out_ints[2], out_ints[3] );
} 

#endif // DAEDALUS_PSP_USE_VFPU

//*****************************************************************************
// Scalar implementation of the clamped float-vector -> packed colour
// conversion. Each component is scaled by 255, truncated to an integer,
// clamped to 0..255 and the four results are packed with c32::Make
// (x -> r, y -> g, z -> b, w -> a).
//
// NOTE(review): the clamp happens after the float -> s32 conversion, so inputs
// whose scaled value does not fit in an s32 are not handled portably; callers
// presumably pass components near the 0..1 range.
//
// Around 463,000 ticks/million
//*****************************************************************************
u32	Vector2ColourClampedCPU( const v4 * col_in )
{
	const s32 red   = Clamp<s32>( s32( col_in->x * 255.0f ), 0, 255 );
	const s32 green = Clamp<s32>( s32( col_in->y * 255.0f ), 0, 255 );
	const s32 blue  = Clamp<s32>( s32( col_in->z * 255.0f ), 0, 255 );
	const s32 alpha = Clamp<s32>( s32( col_in->w * 255.0f ), 0, 255 );

	return c32::Make( red, green, blue, alpha );
}

//*****************************************************************************
// Scalar implementation of the unclamped float-vector -> packed colour
// conversion. Each component is scaled by 255 and truncated to an integer;
// the results are packed with c32::Make without any range clamping.
//
// Around 15,000 ticks/million (! - much faster than VFPU version)
//*****************************************************************************
u32	Vector2ColourUnclampedCPU( const v4 * col_in )
{
	const s32 red   = s32( col_in->x * 255.0f );
	const s32 green = s32( col_in->y * 255.0f );
	const s32 blue  = s32( col_in->z * 255.0f );
	const s32 alpha = s32( col_in->w * 255.0f );

	return c32::Make( red, green, blue, alpha );
}

//*****************************************************************************
// Converts a float colour vector to a packed colour, clamping each component
// to 0..255. Uses the VFPU routine when DAEDALUS_PSP_USE_VFPU is defined and
// the scalar routine otherwise.
//*****************************************************************************
u32	Vector2ColourClamped( const v4 & colour )
{
	const v4 * const source = &colour;

#ifdef DAEDALUS_PSP_USE_VFPU
	return Vector2ColourClampedVFPU( source );
#else
	return Vector2ColourClampedCPU( source );
#endif
}

//*****************************************************************************
// Converts a float colour vector to a packed colour without clamping.
// The scalar routine is used unconditionally: it is always faster than the
// VFPU version.
//*****************************************************************************
u32	Vector2ColourUnclamped( const v4 & colour )
{
	const v4 * const source = &colour;

	return Vector2ColourUnclampedCPU( source );
}

//*****************************************************************************
// Saturating add of two 8-bit colour components: returns a + b limited to
// the 0..255 range.
//*****************************************************************************
u8 AddComponent( u8 a, u8 b )
{
	// Widen first so the sum cannot wrap before it is clamped.
	const s32 sum = s32( a ) + s32( b );

	return static_cast< u8 >( Clamp< s32 >( sum, 0, 255 ) );
}

//*****************************************************************************
// Saturating subtract of two 8-bit colour components: returns a - b limited
// to the 0..255 range.
//*****************************************************************************
u8 SubComponent( u8 a, u8 b )
{
	// Widen to a signed type so a negative difference is representable
	// before it is clamped.
	const s32 difference = s32( a ) - s32( b );

	return static_cast< u8 >( Clamp< s32 >( difference, 0, 255 ) );
}

//*****************************************************************************
// Multiplies two 8-bit colour components and rescales the product back to
// 8 bits by shifting right by 8 (i.e. dividing by 256, not 255). As a
// consequence 255 * 255 yields 254 rather than 255.
//*****************************************************************************
u8 ModulateComponent( u8 a, u8 b )
{
	const u32 product = u32( a ) * u32( b );

	return static_cast< u8 >( product >> 8 );		// >> 8 to return to 0..255
}

//*****************************************************************************
// Linearly interpolates between two 8-bit colour components:
// returns a + (b - a) * factor, computed in float and converted back to u8.
// A factor of 0 gives a; a factor of 1 gives b.
//*****************************************************************************
u8	InterpolateComponent( u8 a, u8 b, float factor )
{
	const float from  = float( a );
	const float delta = float( b ) - from;

	return static_cast< u8 >( from + delta * factor );
}

}

//*****************************************************************************
// Predefined colour constants. Arguments are ( red, green, blue, alpha );
// every constant uses an alpha of 255.
//*****************************************************************************
const c32 c32::White( 255,255,255, 255 );
const c32 c32::Black( 0,0,0, 255 );
const c32 c32::Red( 255,0,0, 255 );
const c32 c32::Green( 0,255,0, 255 );
const c32 c32::Blue( 0,0,255, 255 );
const c32 c32::Magenta( 255,0,255, 255 );

//*****************************************************************************
// Constructs a packed colour from a float colour vector. Each component is
// scaled by 255 and clamped to 0..255 (see Vector2ColourClamped).
//*****************************************************************************
c32::c32( const v4 & colour )
:	mColour( Vector2ColourClamped( colour ) )
{
}

//*****************************************************************************
// Returns this colour as a float vector, with each channel divided by 255
// (r -> x, g -> y, b -> z, a -> w).
//*****************************************************************************
v4	c32::GetColourV4() const
{
	const float red   = GetR() / 255.0f;
	const float green = GetG() / 255.0f;
	const float blue  = GetB() / 255.0f;
	const float alpha = GetA() / 255.0f;

	return v4( red, green, blue, alpha );
}

//*****************************************************************************
// Returns the saturating per-channel sum of this colour and 'colour'
// (all four channels, including alpha).
//*****************************************************************************
c32	c32::Add( c32 colour ) const
{
	return c32( AddComponent( GetR(), colour.GetR() ),
				AddComponent( GetG(), colour.GetG() ),
				AddComponent( GetB(), colour.GetB() ),
				AddComponent( GetA(), colour.GetA() ) );
}

//*****************************************************************************
// Returns the saturating sum of the red, green and blue channels of this
// colour and 'colour'. The alpha channel of this colour is kept unchanged.
//*****************************************************************************
c32	c32::AddRGB( c32 colour ) const
{
	const u8 alpha = GetA();	// passed through untouched

	return c32( AddComponent( GetR(), colour.GetR() ),
				AddComponent( GetG(), colour.GetG() ),
				AddComponent( GetB(), colour.GetB() ),
				alpha );
}

//*****************************************************************************
// Returns this colour with the alpha of 'colour' added (saturating) to its
// own alpha. The red, green and blue channels are kept unchanged.
//*****************************************************************************
c32	c32::AddA( c32 colour ) const
{
	// Colour channels are passed through untouched.
	const u8 red   = GetR();
	const u8 green = GetG();
	const u8 blue  = GetB();

	return c32( red, green, blue, AddComponent( GetA(), colour.GetA() ) );
}

//*****************************************************************************
// Returns the saturating per-channel difference (this - colour) for all four
// channels, including alpha.
//*****************************************************************************
c32	c32::Sub( c32 colour ) const
{
	return c32( SubComponent( GetR(), colour.GetR() ),
				SubComponent( GetG(), colour.GetG() ),
				SubComponent( GetB(), colour.GetB() ),
				SubComponent( GetA(), colour.GetA() ) );
}

//*****************************************************************************
// Returns the saturating difference (this - colour) of the red, green and
// blue channels. The alpha channel of this colour is kept unchanged.
//*****************************************************************************
c32	c32::SubRGB( c32 colour ) const
{
	const u8 alpha = GetA();	// passed through untouched

	return c32( SubComponent( GetR(), colour.GetR() ),
				SubComponent( GetG(), colour.GetG() ),
				SubComponent( GetB(), colour.GetB() ),
				alpha );
}

//*****************************************************************************
// Returns this colour with the alpha of 'colour' subtracted (saturating)
// from its own alpha. The red, green and blue channels are kept unchanged.
//*****************************************************************************
c32	c32::SubA( c32 colour ) const
{
	// Colour channels are passed through untouched.
	const u8 red   = GetR();
	const u8 green = GetG();
	const u8 blue  = GetB();

	return c32( red, green, blue, SubComponent( GetA(), colour.GetA() ) );
}

//*****************************************************************************
// Returns the per-channel product of this colour and 'colour' for all four
// channels, each rescaled to 8 bits by ModulateComponent.
//*****************************************************************************
c32	c32::Modulate( c32 colour ) const
{
	return c32( ModulateComponent( GetR(), colour.GetR() ),
				ModulateComponent( GetG(), colour.GetG() ),
				ModulateComponent( GetB(), colour.GetB() ),
				ModulateComponent( GetA(), colour.GetA() ) );
}

//*****************************************************************************
// Returns the product of the red, green and blue channels of this colour and
// 'colour'. The alpha channel of this colour is kept unchanged.
//*****************************************************************************
c32	c32::ModulateRGB( c32 colour ) const
{
	const u8 alpha = GetA();	// passed through untouched

	return c32( ModulateComponent( GetR(), colour.GetR() ),
				ModulateComponent( GetG(), colour.GetG() ),
				ModulateComponent( GetB(), colour.GetB() ),
				alpha );
}

//*****************************************************************************
// Returns this colour with its alpha multiplied by the alpha of 'colour'.
// The red, green and blue channels are kept unchanged.
//*****************************************************************************
c32	c32::ModulateA( c32 colour ) const
{
	// Colour channels are passed through untouched.
	const u8 red   = GetR();
	const u8 green = GetG();
	const u8 blue  = GetB();

	return c32( red, green, blue, ModulateComponent( GetA(), colour.GetA() ) );
}

//*****************************************************************************
// Linearly interpolates every channel from this colour towards 'colour' using
// a single scalar factor (0 gives this colour, 1 gives 'colour').
//*****************************************************************************
c32	c32::Interpolate( c32 colour, float factor ) const
{
	return c32( InterpolateComponent( GetR(), colour.GetR(), factor ),
				InterpolateComponent( GetG(), colour.GetG(), factor ),
				InterpolateComponent( GetB(), colour.GetB(), factor ),
				InterpolateComponent( GetA(), colour.GetA(), factor ) );
}

//*****************************************************************************
// Linearly interpolates from this colour towards 'colour' with a separate
// factor per channel. Each channel of 'factor' is divided by 255 to give the
// interpolation weight for the matching channel.
//*****************************************************************************
c32	c32::Interpolate( c32 colour, c32 factor ) const
{
	// Per-channel weights derived from the factor colour.
	const float weight_r = factor.GetR() / 255.0f;
	const float weight_g = factor.GetG() / 255.0f;
	const float weight_b = factor.GetB() / 255.0f;
	const float weight_a = factor.GetA() / 255.0f;

	return c32( InterpolateComponent( GetR(), colour.GetR(), weight_r ),
				InterpolateComponent( GetG(), colour.GetG(), weight_g ),
				InterpolateComponent( GetB(), colour.GetB(), weight_b ),
				InterpolateComponent( GetA(), colour.GetA(), weight_a ) );
}

//*****************************************************************************
// Returns a colour whose four channels are all set to this colour's alpha.
//*****************************************************************************
c32 c32::ReplicateAlpha() const
{
	const u8 alpha = GetA();

	return c32( alpha, alpha, alpha, alpha );
}

