	
	// Entry points made visible to the linker; both are defined further down in this file.
	.global		_TransformVerticesWithLighting
	.global		_TransformVerticesWithColour
	
.text
	.set		push			// Save the current assembler options (restored by .set pop at the end of the file)
	.set		noreorder		// Branch delay slots in this file are filled by hand, so the assembler must not reorder
	
############################
#	a0 - world matrix				- must be aligned to 16 bytes
#	a1 - world*projection matrix	- must be aligned to 16 bytes
#	a2 - Fiddled vertices			- stride 16
#	a3 - Output vertices			- must be aligned to 16 bytes, stride 64
#	t0 - num vertices
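#	t1 - pointer to ambient colour	- 16 bytes, read with an unaligned load (ulv.q)
#	t2 - pointer to lights			- must be aligned to 16 bytes, stride 32 (normal at +0, colour at +16)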
#	t3 - num_lights
_TransformVerticesWithLighting:
	// For each input vertex: unpack the packed 16-bit position, write its
	// world-space and projected positions to the output, then unpack, transform
	// and normalise the vertex normal and write a lit colour (ambient plus one
	// clamped dot-product term per light, final result clamped to 0..1).
	//
	// Output layout per vertex (stride 64): +0 world position, +16 projected
	// position, +32 colour. Bytes +48..+63 are not written by this routine.
	//
	// Integer registers modified: a2, a3, t0, t1, t4.

	lv.q		R000, 0($a0)		// Load mat world
	lv.q		R001, 16($a0)
	lv.q		R002, 32($a0)
	lv.q		R003, 48($a0)

	lv.q		R100, 0($a1)		// Load mat project
	lv.q		R101, 16($a1)
	lv.q		R102, 32($a1)
	lv.q		R103, 48($a1)

	sll			$t0, $t0, 4			// count = count * 16
	addu		$t0, $a2, $t0		// end_ptr = start_ptr + count * 16

	ulv.q		R300, 0($t1)		// Load ambient into R300 (last use of t1 as the ambient pointer)

	// The light list is identical for every vertex, so work out where it ends
	// once here instead of once per vertex. t1 is free now that ambient is loaded.
	sll			$t1, $t3, 5			// t1 = num_lights*32
	addu		$t1, $t2, $t1		// last_light = p_lights + num_lights*32

	beq			$a2, $t0, finished_lighting	// Zero vertices: nothing to do
	nop

next_vertex_lighting:
	# Load and transform this vertex position
	lv.s		S200, 0($a2)				// load word [y,x,?,z]
	lv.s		S210, 4($a2)				//		should align this to 16 bytes so we can do a single load?
	vs2i.p		R200, R200					// Expand the four packed 16-bit values to 32-bit ints
	vi2f.q		R200, R200, 16				// int -> float
	vmov.q		R200, R200[y,x,w,1]			// Unfiddle to [x,y,z,1]

	vtfm4.q		R201, M000, R200			// World transform
	vtfm4.q		R202, M100, R200			// Projection transform

	sv.q		R201, 0($a3)				// Store world transform
	sv.q		R202, 16($a3)				// Store projection transform

	# Convert the normal to float and transform
	lv.s		S200, 12($a2)				// load normal word [?,z,y,x]
	.word		0xd0398080 | (8<<8) | (40)	// vc2i.s		R200, S200					// R200 = [?,z,y,x]
	vi2f.q		R200, R200, 0				// int -> float (unscaled; the normal is normalised below)
	vmov.q		R201, R200[w,z,y,0]			// Unfiddle to [x,y,z,0]; R201 (world position) was already stored, so it is reused
	vtfm4.q		R200, M000, R201			// Transform with the world matrix (w = 0, so no translation)

	vdot.t		S201, R200, R200			// S201 = x*x + y*y + z*z
	vrsq.s		S201, S201					// S201 = 1/sqrt(x*x + y*y + z*z)
	vscl.q		R200, R200, S201			// R200 = v.normalise().

	# Lighting calculation
	# M000: World Matrix
	# M100: Projection Matrix
	# R200: Material normal
	# R201: Accumulated colour
	# R202: ?
	# R203: ?
	# R300: Ambient
	# R301: Light normal
	# R302: Light colour
	# R303: Scratch
	# t1 = last_light (computed once, before the vertex loop)
	# t2 = p_lights
	# t3 = num_lights
	# t4 = cur_light

	vmov.q		R201, R300					// Colour = ambient

	beq			$t2, $t1, done_lighting		// p_lights == last_light: no lights, ambient only
	or			$t4, $t2, $0				// (delay slot) cur_light = p_lights

next_light:
	lv.q		R301, 0($t4)		// Load normal into R301
	lv.q		R302, 16($t4)		// Load colour into R302

	vdot.t		S303[0:1], R200, R301		// x = clamp(dot(normal,(x,y,z,0)),0,1)
	vscl.q		R303, R302, S303			// r,g,b,a = r*x, g*x, b*x, a*x

	addiu		$t4, $t4, 32		// Skip to the next light
	bne			$t4, $t1, next_light
	vadd.q		R201, R201, R303			// (delay slot) col += r,g,b,a

done_lighting:
	vmov.q		R200[0:1,0:1,0:1,0:1], R201			// Clamp 0..1
	sv.q		R200, 32($a3)						// Store colour

	# Continue with the next vertex
	addiu		$a2, $a2, 16		// Next input vertex
	bne			$a2, $t0, next_vertex_lighting
	addiu		$a3, $a3, 64		// (delay slot) Next output vertex

finished_lighting:
	jr			$ra
	nop
	

############################
#	a0 - world matrix				- must be aligned to 16 bytes
#	a1 - world*projection matrix	- must be aligned to 16 bytes
#	a2 - Fiddled vertices			- stride 16
#	a3 - Output vertices			- must be aligned to 16 bytes, stride 64
#	t0 - num vertices
_TransformVerticesWithColour:
	// For each input vertex: unpack the packed 16-bit position, write its
	// world-space position (+0) and projected position (+16) to the output,
	// then unpack the 8-bit-per-channel colour to floats and write it (+32).

	// t0: vertex count -> address one past the last input vertex
	sll			$t0, $t0, 4
	addu		$t0, $a2, $t0

	// M100 = world*projection matrix, one row per 16 bytes
	lv.q		R100, 0($a1)
	lv.q		R101, 16($a1)
	lv.q		R102, 32($a1)
	lv.q		R103, 48($a1)

	// M000 = world matrix, one row per 16 bytes
	lv.q		R000, 0($a0)
	lv.q		R001, 16($a0)
	lv.q		R002, 32($a0)
	lv.q		R003, 48($a0)

	// S203 = 1/256. The vuc2i/vi2f pair in the loop turns a 0xff channel into
	// 256.0, so this scale brings it back to 1.0. S203 is not written again.
	vfim.s		S203, 0.00390625

	beq			$a2, $t0, colour_all_done		// Zero vertices: nothing to do
	nop

colour_vertex_loop:
	// Position: two words holding packed 16-bit values, in [y,x,?,z] order
	lv.s		S200, 0($a2)
	lv.s		S210, 4($a2)
	vs2i.p		R200, R200						// 16-bit values -> 32-bit ints
	vi2f.q		R200, R200, 16					// ints -> floats
	vmov.q		R200, R200[y,x,w,1]				// reorder to [x,y,z,1] (cannot be folded into vtfm4.q)

	vtfm4.q		R201, M000, R200				// world-space position ...
	sv.q		R201, 0($a3)					// ... goes to output +0
	vtfm4.q		R202, M100, R200				// projected position ...
	sv.q		R202, 16($a3)					// ... goes to output +16

	// Colour: one word holding [a,b,g,r] as unsigned bytes
	lv.s		S200, 12($a2)
	.word		0xd0388080 | (8<<8) | (40)		// vuc2i.s	R200, S200		-> R200 = [a,b,g,r] as ints
	vi2f.q		R200, R200, 23					// ints -> floats (0xff becomes 256.0)
	vscl.q		R200, R200[w,z,y,x], S203		// reverse to [r,g,b,a] and scale each by 1/256
	sv.q		R200, 32($a3)					// colour goes to output +32

	// Advance both pointers; the output bump sits in the branch delay slot
	addiu		$a2, $a2, 16
	bne			$a2, $t0, colour_vertex_loop
	addiu		$a3, $a3, 64

colour_all_done:
	jr			$ra
	nop
	
	.set pop		// Restore the assembler options saved by .set push at the top of the file

