What's so slow here? (Triangle fill routine)

Blitz3D Forums/Blitz3D Programming/What's so slow here? (Triangle fill routine)

bytecode77

(Posted 2007) [#1]

hi!

I made a triangle fill routine here. I've seen so many fill routines that were really fast... why is mine so slow? Is there any obvious mistake?

I mean, I've seen this sample below running at 100 FPS, and now it only reaches 20 FPS. Where is the giant speed leak?
I know, everything I'm programming is slow :(...

thank you for any help!

texture:

; Demo: two software-rendered textured triangles that share the edge (50,50)-(mouse).
Graphics 1024, 768, 32, 2
SetBuffer BackBuffer()

;Texture
; The triangles below use texture coordinates up to 16, so index 16 must be
; addressable. Row/column 16 repeat texel 15 (clamped read), which keeps the
; lookup in FillTriLine inside the array instead of running past the end.
Dim Tex(16, 16)
img = LoadImage("Texture.png")
If img = 0 Then RuntimeError "Unable to load Texture.png"
texBuf = ImageBuffer(img)	; fetched once instead of once per pixel
LockBuffer texBuf
For x = 0 To 16
	sx = x
	If sx > 15 Then sx = 15
	For y = 0 To 16
		sy = y
		If sy > 15 Then sy = 15
		Tex(x, y) = ReadPixelFast(sx, sy, texBuf)
	Next
Next
UnlockBuffer texBuf
FreeImage img

While Not KeyHit(1)
	mx = MouseX()
	my = MouseY()
	
	Cls
	; FillTriLine writes with WritePixelFast, so the buffer stays locked around both fills.
	LockBuffer()
	; Arguments are x, y, u, v for each of the three vertices.
	FillTri(50, 50, 0, 0, 970, 100, 16, 0, mx, my, 16, 16)
	FillTri(50, 50, 0, 0, 100, 600, 0, 16, mx, my, 16, 16)
	UnlockBuffer()
	
	Text 10, 10, "FPS: " + GetFPS()
	Flip 0
Wend
End

Global FPS, FPS_temp, FPS_time
; Returns the most recent frames-per-second reading. Call exactly once per frame.
; FPS      - last published reading
; FPS_temp - frames counted in the current measuring window
; FPS_time - MilliSecs() value at which the current window started
Function GetFPS()
	now = MilliSecs()
	; Start the first window at the current time rather than at tick 0, so the
	; first reading is not derived from the raw system tick count.
	If FPS_time = 0 Then FPS_time = now
	FPS_temp = FPS_temp + 1
	elapsed = now - FPS_time
	If elapsed > 500 Then
		; Scale by the real window length; the window is only "more than 500 ms",
		; so multiplying by 2 over-reports when frames are slow.
		FPS = (FPS_temp * 1000) / elapsed
		FPS_temp = 0
		FPS_time = now
	EndIf
	Return FPS
End Function

;---
; Fills a textured triangle one scanline at a time.
; Each vertex is passed as a screen position (x, y) and a texture coordinate (u, v).
; The vertices are ordered by ascending y, then every row is interpolated along
; the long edge p1-p3 and the currently active short edge and handed to FillTriLine.
Function FillTri(p1x, p1y, p1u, p1v, p2x, p2y, p2u, p2v, p3x, p3y, p3u, p3v)
	; --- order the vertices so that p1y <= p2y <= p3y ---
	If p1y > p2y Then
		keep = p1x : p1x = p2x : p2x = keep
		keep = p1y : p1y = p2y : p2y = keep
		keep = p1u : p1u = p2u : p2u = keep
		keep = p1v : p1v = p2v : p2v = keep
	EndIf
	If p2y > p3y Then
		keep = p2x : p2x = p3x : p3x = keep
		keep = p2y : p2y = p3y : p3y = keep
		keep = p2u : p2u = p3u : p3u = keep
		keep = p2v : p2v = p3v : p3v = keep
	EndIf
	; Can only trigger when the previous step moved a smaller y into p2.
	If p1y > p2y Then
		keep = p1x : p1x = p2x : p2x = keep
		keep = p1y : p1y = p2y : p2y = keep
		keep = p1u : p1u = p2u : p2u = keep
		keep = p1v : p1v = p2v : p2v = keep
	EndIf

	; --- upper part: rows p1y..p2y between edges p1-p3 and p1-p2 ---
	If p1y <> p2y Then
		For row = p1y To p2y
			ax = vgliIntp(row, p1y, p3y, p1x, p3x)
			au# = vgliIntp(row, p1y, p3y, p1u, p3u)
			av# = vgliIntp(row, p1y, p3y, p1v, p3v)
			bx = vgliIntp(row, p1y, p2y, p1x, p2x)
			bu# = vgliIntp(row, p1y, p2y, p1u, p2u)
			bv# = vgliIntp(row, p1y, p2y, p1v, p2v)
			FillTriLine(ax, au#, av#, bx, bu#, bv#, row)
		Next
	EndIf

	; --- lower part: rows p2y..p3y between edges p1-p3 and p2-p3 ---
	If p2y <> p3y Then
		For row = p2y To p3y
			ax = vgliIntp(row, p1y, p3y, p1x, p3x)
			au# = vgliIntp(row, p1y, p3y, p1u, p3u)
			av# = vgliIntp(row, p1y, p3y, p1v, p3v)
			bx = vgliIntp(row, p2y, p3y, p2x, p3x)
			bu# = vgliIntp(row, p2y, p3y, p2u, p3u)
			bv# = vgliIntp(row, p2y, p3y, p2v, p3v)
			FillTriLine(ax, au#, av#, bx, bu#, bv#, row)
		Next
	EndIf
End Function

; Draws one horizontal textured span on row y.
; (p1x, p1u#, p1v#) and (p2x, p2u#, p2v#) are the two span ends with their
; texture coordinates; they may be given in either order. A span whose ends
; share the same x is skipped. The target buffer must already be locked by
; the caller because pixels are written with WritePixelFast.
Function FillTriLine(p1x, p1u#, p1v#, p2x, p2u#, p2v#, y)
	If p1x = p2x Then Return
	; Make p1 the left end.
	If p1x > p2x Then
		tx = p1x
		tu# = p1u#
		tv# = p1v#
		p1x = p2x
		p1u# = p2u#
		p1v# = p2v#
		p2x = tx
		p2u# = tu#
		p2v# = tv#
	EndIf
	For x = p1x To p2x
		u = vgliIntp(x, p1x, p2x, p1u#, p2u#)
		v = vgliIntp(x, p1x, p2x, p1v#, p2v#)
		; Valid texel indices are 0..15 (16x16 texture). Vertex coordinates of 16
		; would otherwise index past the end of Tex, so clamp to the last texel.
		If u < 0 Then u = 0
		If u > 15 Then u = 15
		If v < 0 Then v = 0
		If v > 15 Then v = 15
		WritePixelFast x, y, Tex(u, v)
	Next
End Function

; Linearly maps value# from the input range [vMin#, vMax#] onto [retMin#, retMax#].
; Values outside the input range are extrapolated along the same line.
; A zero-width input range has no slope; retMin# is returned instead of
; dividing by zero.
Function vgliIntp#(value#, vMin#, vMax#, retMin#, retMax#)
	If vMax# = vMin# Then Return retMin#
	Return retMin# + (value# - vMin#) * (retMax# - retMin#) / (vMax# - vMin#)
End Function

_33	(Posted 2007) [#2]

Well, if you'd do a load texture, you could load it in VRAM instead.

bytecode77

(Posted 2007) [#3]

vram?
this is a software rendered polygon.

without textures it looks like that

; Untextured variant of the demo: two solid triangles sharing the edge (50,50)-(mouse).
Graphics 1024, 768, 32, 2
SetBuffer BackBuffer()

; Runs until KeyHit(1) reports a key press.
While Not KeyHit(1)
	cursorX = MouseX()
	cursorY = MouseY()

	Cls
	; FillTriLine writes with WritePixelFast, so the buffer is locked around both fills.
	LockBuffer()
	FillTri(50, 50, 970, 100, cursorX, cursorY)
	FillTri(50, 50, 100, 600, cursorX, cursorY)
	UnlockBuffer()

	fpsLabel$ = "FPS: " + GetFPS()
	Text 10, 10, fpsLabel$
	Flip 0
Wend
End

Global FPS, FPS_temp, FPS_time
; Returns the most recent frames-per-second reading. Call exactly once per frame.
; FPS      - last published reading
; FPS_temp - frames counted in the current measuring window
; FPS_time - MilliSecs() value at which the current window started
Function GetFPS()
	now = MilliSecs()
	; Start the first window at the current time rather than at tick 0, so the
	; first reading is not derived from the raw system tick count.
	If FPS_time = 0 Then FPS_time = now
	FPS_temp = FPS_temp + 1
	elapsed = now - FPS_time
	If elapsed > 500 Then
		; Scale by the real window length; the window is only "more than 500 ms",
		; so multiplying by 2 over-reports when frames are slow.
		FPS = (FPS_temp * 1000) / elapsed
		FPS_temp = 0
		FPS_time = now
	EndIf
	Return FPS
End Function

; Fills a solid triangle one scanline at a time.
; The three vertices are ordered by ascending y; each row's two x limits are
; interpolated along the triangle edges and passed to FillTriLine.
Function FillTri(p1x, p1y, p2x, p2y, p3x, p3y)
	; --- order the vertices so that p1y <= p2y <= p3y ---
	If p1y > p2y Then
		keep = p1x : p1x = p2x : p2x = keep
		keep = p1y : p1y = p2y : p2y = keep
	EndIf
	If p2y > p3y Then
		keep = p2x : p2x = p3x : p3x = keep
		keep = p2y : p2y = p3y : p3y = keep
	EndIf
	; Can only trigger when the previous step moved a smaller y into p2.
	If p1y > p2y Then
		keep = p1x : p1x = p2x : p2x = keep
		keep = p1y : p1y = p2y : p2y = keep
	EndIf

	; --- upper part: rows p1y..p2y between edges p1-p3 and p1-p2 ---
	If p1y <> p2y Then
		For row = p1y To p2y
			longEdgeX = vgliIntp(row, p1y, p3y, p1x, p3x)
			shortEdgeX = vgliIntp(row, p1y, p2y, p1x, p2x)
			FillTriLine(longEdgeX, shortEdgeX, row)
		Next
	EndIf

	; --- lower part: rows p2y..p3y ---
	If p2y <> p3y Then
		; x of the long edge p1-p3 at the height of p2 (stored as an integer);
		; the lower part interpolates from this point down to p3.
		splitX = vgliIntp(p2y, p1y, p3y, p1x, p3x)
		For row = p2y To p3y
			longEdgeX = vgliIntp(row, p2y, p3y, splitX, p3x)
			shortEdgeX = vgliIntp(row, p2y, p3y, p2x, p3x)
			FillTriLine(longEdgeX, shortEdgeX, row)
		Next
	EndIf
End Function

; Draws the run of pixels from p1x to p2x (inclusive, either order) on row y
; in the solid colour $0000FF. A run whose two ends coincide is skipped.
; Pixels are written with WritePixelFast, so the caller locks the buffer first.
Function FillTriLine(p1x, p2x, y)
	If p1x = p2x Then Return
	; Determine the left and right end of the run.
	leftX = p1x
	rightX = p2x
	If leftX > rightX Then
		leftX = p2x
		rightX = p1x
	EndIf
	px = leftX
	While px <= rightX
		WritePixelFast px, y, $0000FF
		px = px + 1
	Wend
End Function

; Linearly maps value# from the input range [vMin#, vMax#] onto [retMin#, retMax#].
; Values outside the input range are extrapolated along the same line.
; A zero-width input range has no slope; retMin# is returned instead of
; dividing by zero.
Function vgliIntp#(value#, vMin#, vMax#, retMin#, retMax#)
	If vMax# = vMin# Then Return retMin#
	Return retMin# + (value# - vMin#) * (retMax# - retMin#) / (vMax# - vMin#)
End Function

Without textures, the performance is excellent.
The only difference with textures is that I have to interpolate the UV coordinates as well as the XY coordinates...

_33	(Posted 2007) [#4]

Well, I would use the VRAM to do these operations, simply because VRAM is usually about 10X faster than regular RAM.

bytecode77

(Posted 2007) [#5]

What? VRAM? But I don't have a Graphics3D mode on! This is 2D. How would <you> use VRAM in this example?

big10p

(Posted 2007) [#6]

The code doesn't even run here. I get an "array out of bounds" error in debug mode. Also, I advise you to specify the destination buffer with WritePixelFast; otherwise I don't think it'll work on my machine.

_33	(Posted 2007) [#7]

Instead of using a Dim, work with a bank. Try to avoid code that looks like "reference(reference(reference))" in big loops. If possible, resolve the reference once and store it in a simple reference% variable, as with ImageBuffer(img), etc. That alone should yield some speed boost.

This piece of code will take a nice chunk of CPU:

For x = p1x To p2x
	u = vgliIntp(x, p1x, p2x, p1u#, p2u#)
	v = vgliIntp(x, p1x, p2x, p1v#, p2v#)
	WritePixelFast x, y, Tex(u, v)
Next

If you use a bank for your u v map, it will be faster. Use poke to fill it at the beginning, and peek to read the u v entry. Access the position of the u v data using a displacement value, calculated as (v * 16 + u) * 4. Multiply by 4 to get the byte offset of the value, as each stored value is 4 bytes.

disp% = (v * 16 + u) * 4
texture_RGBA% = PeekInt (uv_table%, disp%)

or, in your case, a more fancy code:

disp% = (vgliIntp(x, p1x, p2x, p1v#, p2v#) * 16 + vgliIntp(x, p1x, p2x, p1u#, p2u#) )  * 4
texture_RGBA% = PeekInt (uv_table%, disp%)

Note, that multiplying by 16 is actually for jumping to the next line of your texture. if the texture is 32 pixels wide, then this value should be 32, or a variable containing the texture width.

Try to have a version of vgliIntp that returns an Int value. Avoid divisions, as they are slower than multiplications. To gain calculation speed, it is preferable to multiply by 0.1 rather than divide by 10.

*** NOTE: And, while I'm thinking about your case, why not just peek at the imagebuffer?

Cheers.

bytecode77

(Posted 2007) [#8]

hi!

thanks for the fast reply!
I replaced it with banks, but it isn't any faster. It is as slow as before.
:(

_33	(Posted 2007) [#9]

But usually these types of functions are preferably written in assembly language.

ShadowTurtle

(Posted 2007) [#10]

I think you will have to write a DLL for this work, Devils Child. *kotz*
I've known this since the last slowdown in my own project :/

bytecode77

(Posted 2007) [#11]

; Demo: two software-rendered textured triangles, texture held in a bank.
Graphics 1024, 768, 32, 2
SetBuffer BackBuffer()

;Texture
; 16 x 16 texels, 4 bytes each, stored row by row: offset = (row * 16 + column) * 4.
; FillTriLine reads the bank as (v * 16 + u) * 4, so the image row (y) has to
; be the major index here; filling it as (x * 16 + y) transposes the texture.
; NOTE(review): texels are stored with PokeFloat and read back with PeekFloat
; in FillTriLine. PokeInt/PeekInt would avoid the int/float round trip, but
; both sides have to be changed together.
Global Tex = CreateBank(16 * 16 * 4)
img = LoadImage("Texture.png")
If img = 0 Then RuntimeError "Unable to load Texture.png"
texBuf = ImageBuffer(img)	; fetched once instead of once per pixel
LockBuffer texBuf
For y = 0 To 15
	For x = 0 To 15
		PokeFloat Tex, (y * 16 + x) * 4, ReadPixelFast(x, y, texBuf)
	Next
Next
UnlockBuffer texBuf
FreeImage img

While Not KeyHit(1)
	mx = MouseX()
	my = MouseY()
	
	Cls
	; FillTriLine writes with WritePixelFast, so the buffer stays locked around both fills.
	LockBuffer()
	; Arguments are x, y, u, v for each of the three vertices.
	FillTri(120, 120, 0, 0, 970, 100, 16, 0, mx, my, 16, 16)
	FillTri(120, 120, 0, 0, 100, 600, 0, 16, mx, my, 16, 16)
	UnlockBuffer()
	
	Text 10, 10, "FPS: " + GetFPS()
	Flip 0
Wend
End

Global FPS, FPS_temp, FPS_time
; Returns the most recent frames-per-second reading. Call exactly once per frame.
; FPS      - last published reading
; FPS_temp - frames counted in the current measuring window
; FPS_time - MilliSecs() value at which the current window started
Function GetFPS()
	now = MilliSecs()
	; Start the first window at the current time rather than at tick 0, so the
	; first reading is not derived from the raw system tick count.
	If FPS_time = 0 Then FPS_time = now
	FPS_temp = FPS_temp + 1
	elapsed = now - FPS_time
	If elapsed > 500 Then
		; Scale by the real window length; the window is only "more than 500 ms",
		; so multiplying by 2 over-reports when frames are slow.
		FPS = (FPS_temp * 1000) / elapsed
		FPS_temp = 0
		FPS_time = now
	EndIf
	Return FPS
End Function

; Fills a textured triangle one scanline at a time using 16.16 fixed-point
; texture coordinates.
; Each vertex is passed as a screen position (x, y) and an integer texture
; coordinate (u, v); u and v are shifted left by 16 bits on entry, and
; FillTriLine shifts them back when it looks up the texel.
Function FillTri(p1x, p1y, p1u, p1v, p2x, p2y, p2u, p2v, p3x, p3y, p3u, p3v)
	; --- convert texture coordinates to 16.16 fixed point ---
	p1u = p1u Shl 16 : p1v = p1v Shl 16
	p2u = p2u Shl 16 : p2v = p2v Shl 16
	p3u = p3u Shl 16 : p3v = p3v Shl 16

	; --- order the vertices so that p1y <= p2y <= p3y ---
	If p1y > p2y Then
		keep = p1x : p1x = p2x : p2x = keep
		keep = p1y : p1y = p2y : p2y = keep
		keep = p1u : p1u = p2u : p2u = keep
		keep = p1v : p1v = p2v : p2v = keep
	EndIf
	If p2y > p3y Then
		keep = p2x : p2x = p3x : p3x = keep
		keep = p2y : p2y = p3y : p3y = keep
		keep = p2u : p2u = p3u : p3u = keep
		keep = p2v : p2v = p3v : p3v = keep
	EndIf
	; Can only trigger when the previous step moved a smaller y into p2.
	If p1y > p2y Then
		keep = p1x : p1x = p2x : p2x = keep
		keep = p1y : p1y = p2y : p2y = keep
		keep = p1u : p1u = p2u : p2u = keep
		keep = p1v : p1v = p2v : p2v = keep
	EndIf

	; --- upper part: rows p1y..p2y between edges p1-p3 and p1-p2 ---
	If p1y <> p2y Then
		For row = p1y To p2y
			ax = vgliIntp(row, p1y, p3y, p1x, p3x)
			au = vgliIntp(row, p1y, p3y, p1u, p3u)
			av = vgliIntp(row, p1y, p3y, p1v, p3v)
			bx = vgliIntp(row, p1y, p2y, p1x, p2x)
			bu = vgliIntp(row, p1y, p2y, p1u, p2u)
			bv = vgliIntp(row, p1y, p2y, p1v, p2v)
			FillTriLine(ax, au, av, bx, bu, bv, row)
		Next
	EndIf

	; --- lower part: rows p2y..p3y between edges p1-p3 and p2-p3 ---
	If p2y <> p3y Then
		For row = p2y To p3y
			ax = vgliIntp(row, p1y, p3y, p1x, p3x)
			au = vgliIntp(row, p1y, p3y, p1u, p3u)
			av = vgliIntp(row, p1y, p3y, p1v, p3v)
			bx = vgliIntp(row, p2y, p3y, p2x, p3x)
			bu = vgliIntp(row, p2y, p3y, p2u, p3u)
			bv = vgliIntp(row, p2y, p3y, p2v, p3v)
			FillTriLine(ax, au, av, bx, bu, bv, row)
		Next
	EndIf
End Function

; Draws one horizontal textured span on row y, reading texels from the Tex bank.
; (p1x, p1u, p1v) and (p2x, p2u, p2v) are the two span ends; u and v are 16.16
; fixed-point texture coordinates. The ends may be given in either order, and
; a span whose ends share the same x is skipped. Pixels are written with
; WritePixelFast, so the caller locks the buffer first.
Function FillTriLine(p1x, p1u, p1v, p2x, p2u, p2v, y)
	If p1x = p2x Then Return
	; Make p1 the left end.
	If p1x > p2x Then
		tx = p1x
		tu = p1u
		tv = p1v
		p1x = p2x
		p1u = p2u
		p1v = p2v
		p2x = tx
		p2u = tu
		p2v = tv
	EndIf
	; Fixed-point texture step per pixel (integer division).
	spanWidth = p2x - p1x
	iu = (p2u - p1u) / spanWidth
	iv = (p2v - p1v) / spanWidth
	For x = p1x To p2x
		u = (p1u + (x - p1x) * iu) Shr 16
		v = (p1v + (x - p1x) * iv) Shr 16
		; The bank holds 16 x 16 texels (1024 bytes). A coordinate of 16 would
		; read the next row (u) or past the end of the bank (v), so clamp both
		; to the last valid texel.
		If u < 0 Then u = 0
		If u > 15 Then u = 15
		If v < 0 Then v = 0
		If v > 15 Then v = 15
		WritePixelFast x, y, PeekFloat(Tex, (v * 16 + u) * 4)
	Next
End Function

; Linearly maps value# from the input range [vMin#, vMax#] onto [retMin#, retMax#].
; Values outside the input range are extrapolated along the same line.
; A zero-width input range has no slope; retMin# is returned instead of
; dividing by zero.
Function vgliIntp#(value#, vMin#, vMax#, retMin#, retMax#)
	If vMax# = vMin# Then Return retMin#
	Return retMin# + (value# - vMin#) * (retMax# - retMin#) / (vMax# - vMin#)
End Function

Thank you for your help. I used Shr and Shl (fixed-point integers) instead of floats and replaced the calls to speed-critical functions with their contents (inlined them).

thanks for help :)

_33	(Posted 2007) [#12]

So, is it faster? ;P

EDIT: tested on my gear. You passed from 20-30 FPS to 85-130 FPS when fully extended. Good!

It's my belief that a good, tight, efficient programming scheme in any language can beat a poor, inefficient programming scheme in assembly / machine language.

Oh, and you could also do this, but it becomes quite unreadable, for a gain of about 8 fps (on my system)

	WritePixelFast x, y, PeekFloat(Tex, (((p1v + (x - p1x) * iv) Shr 16) * 16 + (p1u + (x - p1x) * iu) Shr 16) * 4)

Cheers.

DJWoodgate

(Posted 2007) [#13]

I was playing around with the array based approach and trying to minimise work in the writepixel loop. Not really working properly yet as far as indexing goes, I have increased the array size to stop it throwing hissy fits in debug mode. It gives a few extra fps anyway.

; Demo: two software-rendered textured triangles that share the edge (50,50)-(mouse).
Graphics 1024, 768, 32, 2
SetBuffer BackBuffer()

;Texture
; The array is one element larger than the 16x16 texture in each direction
; because the triangles use texture coordinates up to 16. Row/column 16 repeat
; texel 15 (clamped read) so the far edge shows texture instead of the
; array's initial zeros.
Dim Tex(16, 16)
img = LoadImage("Texture.png")
If img = 0 Then RuntimeError "Unable to load Texture.png"
texBuf = ImageBuffer(img)	; fetched once instead of once per pixel
LockBuffer texBuf
For x = 0 To 16
	sx = x
	If sx > 15 Then sx = 15
	For y = 0 To 16
		sy = y
		If sy > 15 Then sy = 15
		Tex(x, y) = ReadPixelFast(sx, sy, texBuf)
	Next
Next
UnlockBuffer texBuf
FreeImage img
ClsColor 200,100,100
While Not KeyHit(1)
	mx = MouseX()
	my = MouseY()
	
	Cls
	; FillTriLine writes with WritePixelFast, so the buffer stays locked around both fills.
	LockBuffer()
	; Arguments are x, y, u, v for each of the three vertices.
	FillTri(50, 50, 0, 0, 970, 100, 16, 0, mx, my, 16, 16)
	FillTri(50, 50, 0, 0, 100, 600, 0, 16, mx, my, 16, 16)
	UnlockBuffer()
	
	Text 10, 10, "FPS: " + GetFPS()
	Flip 0
Wend
End

Global FPS, FPS_temp, FPS_time
; Returns the most recent frames-per-second reading. Call exactly once per frame.
; FPS      - last published reading
; FPS_temp - frames counted in the current measuring window
; FPS_time - MilliSecs() value at which the current window started
Function GetFPS()
	now = MilliSecs()
	; Start the first window at the current time rather than at tick 0, so the
	; first reading is not derived from the raw system tick count.
	If FPS_time = 0 Then FPS_time = now
	FPS_temp = FPS_temp + 1
	elapsed = now - FPS_time
	If elapsed > 500 Then
		; Scale by the real window length; the window is only "more than 500 ms",
		; so multiplying by 2 over-reports when frames are slow.
		FPS = (FPS_temp * 1000) / elapsed
		FPS_temp = 0
		FPS_time = now
	EndIf
	Return FPS
End Function

;---
; Scanline triangle filler with 16.16 fixed-point texture coordinates.
; Takes three vertices, each as screen x, y and integer texture u, v.
; u and v are shifted left by 16 bits up front; FillTriLine shifts them back
; when indexing the texture.
Function FillTri(p1x, p1y, p1u, p1v, p2x, p2y, p2u, p2v, p3x, p3y, p3u, p3v)
	; Texture coordinates -> 16.16 fixed point.
	p1u = p1u Shl 16 : p1v = p1v Shl 16
	p2u = p2u Shl 16 : p2v = p2v Shl 16
	p3u = p3u Shl 16 : p3v = p3v Shl 16

	; Sort top-to-bottom: afterwards p1y <= p2y <= p3y.
	If p1y > p2y Then
		tmp = p1x : p1x = p2x : p2x = tmp
		tmp = p1y : p1y = p2y : p2y = tmp
		tmp = p1u : p1u = p2u : p2u = tmp
		tmp = p1v : p1v = p2v : p2v = tmp
	EndIf
	If p2y > p3y Then
		tmp = p2x : p2x = p3x : p3x = tmp
		tmp = p2y : p2y = p3y : p3y = tmp
		tmp = p2u : p2u = p3u : p3u = tmp
		tmp = p2v : p2v = p3v : p3v = tmp
	EndIf
	; Only reachable with a true condition if the second step swapped.
	If p1y > p2y Then
		tmp = p1x : p1x = p2x : p2x = tmp
		tmp = p1y : p1y = p2y : p2y = tmp
		tmp = p1u : p1u = p2u : p2u = tmp
		tmp = p1v : p1v = p2v : p2v = tmp
	EndIf

	; Top section: long edge p1-p3 against short edge p1-p2.
	If p1y <> p2y Then
		For scan = p1y To p2y
			longX = vgliIntp(scan, p1y, p3y, p1x, p3x)
			longU = vgliIntp(scan, p1y, p3y, p1u, p3u)
			longV = vgliIntp(scan, p1y, p3y, p1v, p3v)
			shortX = vgliIntp(scan, p1y, p2y, p1x, p2x)
			shortU = vgliIntp(scan, p1y, p2y, p1u, p2u)
			shortV = vgliIntp(scan, p1y, p2y, p1v, p2v)
			FillTriLine(longX, longU, longV, shortX, shortU, shortV, scan)
		Next
	EndIf

	; Bottom section: long edge p1-p3 against short edge p2-p3.
	If p2y <> p3y Then
		For scan = p2y To p3y
			longX = vgliIntp(scan, p1y, p3y, p1x, p3x)
			longU = vgliIntp(scan, p1y, p3y, p1u, p3u)
			longV = vgliIntp(scan, p1y, p3y, p1v, p3v)
			shortX = vgliIntp(scan, p2y, p3y, p2x, p3x)
			shortU = vgliIntp(scan, p2y, p3y, p2u, p3u)
			shortV = vgliIntp(scan, p2y, p3y, p2v, p3v)
			FillTriLine(longX, longU, longV, shortX, shortU, shortV, scan)
		Next
	EndIf
End Function

; Draws one horizontal textured span on row y, reading texels from the Tex array.
; (p1x, p1u, p1v) and (p2x, p2u, p2v) are the two span ends; u and v are 16.16
; fixed-point texture coordinates. The ends may be given in either order, and
; a span whose ends share the same x is skipped. Pixels are written with
; WritePixelFast, so the caller locks the buffer first.
Function FillTriLine(p1x, p1u, p1v, p2x, p2u, p2v, y)
	If p1x = p2x Then Return
	; Make p1 the left end.
	If p1x > p2x Then
		tx = p1x
		tu = p1u
		tv = p1v
		p1x = p2x
		p1u = p2u
		p1v = p2v
		p2x = tx
		p2u = tu
		p2v = tv
	EndIf

	; Signed fixed-point texture step per pixel (integer division).
	ustride = (p2u - p1u) / (p2x - p1x)
	vstride = (p2v - p1v) / (p2x - p1x)
	u = p1u
	v = p1v
	; Adding a signed stride covers all four direction combinations, so no
	; separate branch per sign (subtracting the absolute value) is needed.
	For x = p1x To p2x
		WritePixelFast x, y, Tex(u Shr 16, v Shr 16)
		u = u + ustride
		v = v + vstride
	Next
End Function

; Linearly maps value# from the input range [vMin#, vMax#] onto [retMin#, retMax#].
; Values outside the input range are extrapolated along the same line.
; A zero-width input range has no slope; retMin# is returned instead of
; dividing by zero.
Function vgliIntp#(value#, vMin#, vMax#, retMin#, retMax#)
	If vMax# = vMin# Then Return retMin#
	Return retMin# + (value# - vMin#) * (retMax# - retMin#) / (vMax# - vMin#)
End Function

bytecode77

(Posted 2007) [#14]

hey nice speedup, thanks a lot :)

_33	(Posted 2007) [#15]

Yep, this bumps the speed up by a solid 20% from the previous version (considering the same coordinates).