diff --git a/src/3DMath.cpp b/src/3DMath.cpp
index 42e675b8..bda882c8 100644
--- a/src/3DMath.cpp
+++ b/src/3DMath.cpp
@@ -3,26 +3,19 @@
 
 void MultMatrix(float m0[4][4], float m1[4][4], float dest[4][4])
 {
-    int i;
-    for (i = 0; i < 4; i++)
-    {
-        dest[0][i] = m0[0][i]*m1[0][0] + m0[1][i]*m1[0][1] + m0[2][i]*m1[0][2] + m0[3][i]*m1[0][3];
-        dest[1][i] = m0[0][i]*m1[1][0] + m0[1][i]*m1[1][1] + m0[2][i]*m1[1][2] + m0[3][i]*m1[1][3];
-        dest[2][i] = m0[0][i]*m1[2][0] + m0[1][i]*m1[2][1] + m0[2][i]*m1[2][2] + m0[3][i]*m1[2][3];
-        dest[3][i] = m0[3][i]*m1[3][3] + m0[2][i]*m1[3][2] + m0[1][i]*m1[3][1] + m0[0][i]*m1[3][0];
-    }
-}
-
-void MultMatrix2(float m0[4][4], float m1[4][4])
-{
-    float dst[4][4];
-    MultMatrix(m0, m1, dst);
-    memcpy( m0, dst, sizeof(float) * 16 );
+	int i;	// NOTE: dest must not alias m0 or m1; both are re-read after dest elements have been written
+	for (i = 0; i < 4; i++)	// each pass fills element i of all four dest rows: dest[r][i] = sum over k of m0[k][i]*m1[r][k]
+	{
+		dest[0][i] = m0[0][i]*m1[0][0] + m0[1][i]*m1[0][1] + m0[2][i]*m1[0][2] + m0[3][i]*m1[0][3];
+		dest[1][i] = m0[0][i]*m1[1][0] + m0[1][i]*m1[1][1] + m0[2][i]*m1[1][2] + m0[3][i]*m1[1][3];
+		dest[2][i] = m0[0][i]*m1[2][0] + m0[1][i]*m1[2][1] + m0[2][i]*m1[2][2] + m0[3][i]*m1[2][3];
+		dest[3][i] = m0[3][i]*m1[3][3] + m0[2][i]*m1[3][2] + m0[1][i]*m1[3][1] + m0[0][i]*m1[3][0];	// same four products as the rows above, added in reverse order
+	}
 }
 
 void TransformVectorNormalize(float vec[3], float mtx[4][4])
 {
-    float len;
+	float len;
 
 	float vres[3];
 	vres[0] = mtx[0][0] * vec[0]
@@ -35,12 +28,84 @@ void TransformVectorNormalize(float vec[3], float mtx[4][4])
 		   + mtx[1][2] * vec[1]
 		   + mtx[2][2] * vec[2];
 	memcpy(vec, vres, sizeof(float)*3);
-    len = vec[0]*vec[0] + vec[1]*vec[1] + vec[2]*vec[2];
-    if (len != 0.0)
-    {
-        len = sqrtf(len);
-        vec[0] /= len;
-        vec[1] /= len;
-        vec[2] /= len;
-    }
+	len = vec[0]*vec[0] + vec[1]*vec[1] + vec[2]*vec[2];
+	if (len != 0.0)
+	{
+		len = sqrtf(len);
+		vec[0] /= len;
+		vec[1] /= len;
+		vec[2] /= len;
+	}
+}
+
+void Normalize(float v[3])	// Scales v in place to unit length; a zero vector is left unchanged
+{
+#ifdef WIN32_ASM	// x87 inline-assembly path
+	__asm {
+		mov		esi, dword ptr [v]
+										//	ST(6)			ST(5)			ST(4)			ST(3)			ST(2)			ST(1)			ST
+		fld		dword ptr [esi+08h]		//																									v2
+		fld		dword ptr [esi+04h]		//																					v2				v1
+		fld		dword ptr [esi]			//																	v2				v1				v0
+		fld1							//													v2				v1				v0				1.0
+		fld		ST(3)					//									v2				v1				v0				1.0				v2
+		fmul	ST, ST					//									v2				v1				v0				1.0				v2*v2
+		fld		ST(3)					//					v2				v1				v0				1.0				v2*v2			v1
+		fmul	ST, ST					//					v2				v1				v0				1.0				v2*v2			v1*v1
+		fld		ST(3)					//	v2				v1				v0				1.0				v2*v2			v1*v1			v0
+		fmul	ST, ST					//	v2				v1				v0				1.0				v2*v2			v1*v1			v0*v0
+		fadd							//					v2				v1				v0				1.0				v2*v2			v1*v1+v0*v0
+		fadd							//									v2				v1				v0				1.0				v2*v2+v1*v1+v0*v0
+		ftst							// Compare ST to 0
+		fstsw	ax						// Store FPU status word in ax
+		sahf							// Transfer ax to flags register
+		jz		End						// Skip if length is zero
+		fsqrt							//									v2				v1				v0				1.0				len
+		fdiv							//													v2				v1				v0				1.0/len
+		fmul	ST(3), ST				//													v2*(1.0/len)	v1				v0				1.0/len
+		fmul	ST(2), ST				//													v2*(1.0/len)	v1*(1.0/len)	v0				1.0/len
+		fmul							//																	v2*(1.0/len)	v1*(1.0/len)	v0*(1.0/len)
+		fstp	dword ptr [esi]			//																					v2*(1.0/len)	v1*(1.0/len)
+		fstp	dword ptr [esi+04h]		//																									v2*(1.0/len)
+		fstp	dword ptr [esi+08h]		// (register stack is empty again)
+End:
+		finit							// Reset the FPU; on the zero-length path this also discards the five values still on the register stack
+	}
+#else // WIN32_ASM
+	float len;
+
+	len = v[0]*v[0] + v[1]*v[1] + v[2]*v[2];	// squared length first; the root is only taken when it is non-zero
+	if (len != 0.0)	{	// a zero vector is left untouched, so there is no division by zero
+		len = sqrtf( len );
+		v[0] /= len;
+		v[1] /= len;
+		v[2] /= len;
+	}
+#endif // WIN32_ASM
+}
+
+
+float DotProduct(const float v0[3], const float v1[3])	// Returns v0[0]*v1[0] + v0[1]*v1[1] + v0[2]*v1[2]
+{
+	float	dot;
+#ifdef WIN32_ASM
+	__asm {
+		mov		esi, dword ptr [v0]	// esi = v0
+		mov		edi, dword ptr [v1]	// edi = v1
+		lea		ebx, [dot]	// ebx = &dot
+
+		fld		dword ptr [esi]	// push v0[0]
+		fmul	dword ptr [edi]	// ST = v0[0]*v1[0]
+		fld		dword ptr [esi+04h]	// push v0[1]
+		fmul	dword ptr [edi+04h]	// ST = v0[1]*v1[1]
+		fld		dword ptr [esi+08h]	// push v0[2]
+		fmul	dword ptr [edi+08h]	// ST = v0[2]*v1[2]
+		fadd	// add the two topmost products and pop
+		fadd	// add the remaining product and pop: ST = dot product
+		fstp	dword ptr [ebx]	// dot = ST; the register stack is back at its entry depth
+	}
+#else // WIN32_ASM
+	dot = v0[0]*v1[0] + v0[1]*v1[1] + v0[2]*v1[2];
+#endif // WIN32_ASM
+	return dot;
 }
diff --git a/src/3DMath.h b/src/3DMath.h
index 5902ccf5..00815b6b 100644
--- a/src/3DMath.h
+++ b/src/3DMath.h
@@ -4,8 +4,16 @@
 #include <string.h>
 
 void MultMatrix( float m0[4][4], float m1[4][4], float dest[4][4]);
-void MultMatrix2(float m0[4][4], float m1[4][4] );
 void TransformVectorNormalize(float vec[3], float mtx[4][4]);
+void Normalize(float v[3]);	// no longer inline here; defined in 3DMath.cpp and in 3DMathNeon.cpp
+float DotProduct(const float v0[3], const float v1[3]);	// no longer inline here; defined in 3DMath.cpp and in 3DMathNeon.cpp
+
+inline void MultMatrix2(float m0[4][4], float m1[4][4])	// In-place product: m0 receives MultMatrix(m0, m1)
+{
+	float product[4][4];	// scratch result, copied back over m0 once it is complete
+	MultMatrix(m0, m1, product);
+	memcpy(m0, product, sizeof(product));
+}
 
 inline void CopyMatrix( float m0[4][4], float m1[4][4] )
 {
@@ -94,76 +102,4 @@ inline void Transpose3x3Matrix( float mtx[4][4] )
 #endif // WIN32_ASM
 }
 
-inline void Normalize(float v[3])
-{
-#ifdef WIN32_ASM
-	__asm {
-		mov		esi, dword ptr [v]
-										//	ST(6)			ST(5)			ST(4)			ST(3)			ST(2)			ST(1)			ST
-		fld		dword ptr [esi+08h]		//																									v2
-		fld		dword ptr [esi+04h]		//																					v2				v1
-		fld		dword ptr [esi]			//																	v2				v1				v0
-		fld1							//													v2				v1				v0				1.0
-		fld		ST(3)					//									v2				v1				v0				1.0				v2
-		fmul	ST, ST					//									v2				v1				v0				1.0				v2*v2
-		fld		ST(3)					//					v2				v1				v0				1.0				v2*v2			v1
-		fmul	ST, ST					//					v2				v1				v0				1.0				v2*v2			v1*v1
-		fld		ST(3)					//	v2				v1				v0				1.0				v2*v2			v1*v1			v0
-		fmul	ST, ST					//	v2				v1				v0				1.0				v2*v2			v1*v1			v0*v0
-		fadd							//					v2				v1				v0				1.0				v2*v2			v1*v1+v0*v0
-		fadd							//									v2				v1				v0				1.0				v2*v2+v1*v1+v0*v0
-		ftst							// Compare ST to 0
-		fstsw	ax						// Store FPU status word in ax
-		sahf							// Transfer ax to flags register
-		jz		End						// Skip if length is zero
-		fsqrt							//									v2				v1				v0				1.0				len
-		fdiv							//													v2				v1				v0				1.0/len
-		fmul	ST(3), ST				//													v2*(1.0/len)	v1				v0				1.0/len
-		fmul	ST(2), ST				//													v2*(1.0/len)	v1*(1.0/len)	v0				1.0/len
-		fmul							//																	v2*(1.0/len)	v1*(1.0/len)	v0*(1.0/len)
-		fstp	dword ptr [esi]			//																					v2*(1.0/len)	v1*(1.0/len)
-		fstp	dword ptr [esi+04h]		//																									v2*(1.0/len)
-		fstp	dword ptr [esi+08h]		//
-End:
-		finit
-	}
-#else // WIN32_ASM
-	float len;
-
-	len = v[0]*v[0] + v[1]*v[1] + v[2]*v[2];
-	if (len != 0.0)	{
-		len = sqrtf( len );
-		v[0] /= len;
-		v[1] /= len;
-		v[2] /= len;
-	}
-#endif // WIN32_ASM
-}
-
-
-inline float DotProduct(const float v0[3], const float v1[3])
-{
-	float	dot;
-#ifdef WIN32_ASM
-	__asm {
-		mov		esi, dword ptr [v0]
-		mov		edi, dword ptr [v1]
-		lea		ebx, [dot]
-
-		fld		dword ptr [esi]
-		fmul	dword ptr [edi]
-		fld		dword ptr [esi+04h]
-		fmul	dword ptr [edi+04h]
-		fld		dword ptr [esi+08h]
-		fmul	dword ptr [edi+08h]
-		fadd
-		fadd
-		fstp	dword ptr [ebx]
-	}
-#else // WIN32_ASM
-	dot = v0[0]*v1[0] + v0[1]*v1[1] + v0[2]*v1[2];
-#endif // WIN32_ASM
-	return dot;
-}
-
 #endif
diff --git a/src/3DMathNeon.cpp b/src/3DMathNeon.cpp
new file mode 100644
index 00000000..5db7d47d
--- /dev/null
+++ b/src/3DMathNeon.cpp
@@ -0,0 +1,124 @@
+#include "3DMath.h"
+
+void MultMatrix( float m0[4][4], float m1[4][4], float dest[4][4])	// NEON version: dest[r][0..3] = sum over k of m0[k][0..3] * m1[r][k]
+{
+	asm volatile (
+	"vld1.32 		{d0, d1}, [%1]!			\n\t"	//q0 = m1[0][0..3]
+	"vld1.32 		{d2, d3}, [%1]!	    	\n\t"	//q1 = m1[1][0..3]
+	"vld1.32 		{d4, d5}, [%1]!	    	\n\t"	//q2 = m1[2][0..3]
+	"vld1.32 		{d6, d7}, [%1]	    	\n\t"	//q3 = m1[3][0..3]
+	"vld1.32 		{d16, d17}, [%0]!		\n\t"	//q8 = m0[0][0..3]
+	"vld1.32 		{d18, d19}, [%0]!   	\n\t"	//q9 = m0[1][0..3]
+	"vld1.32 		{d20, d21}, [%0]!   	\n\t"	//q10 = m0[2][0..3]
+	"vld1.32 		{d22, d23}, [%0]    	\n\t"	//q11 = m0[3][0..3]
+
+	"vmul.f32 		q12, q8, d0[0] 			\n\t"	//q12 = q8 * d0[0]
+	"vmul.f32 		q13, q8, d2[0] 		    \n\t"	//q13 = q8 * d2[0]
+	"vmul.f32 		q14, q8, d4[0] 		    \n\t"	//q14 = q8 * d4[0]
+	"vmul.f32 		q15, q8, d6[0]	 		\n\t"	//q15 = q8 * d6[0]
+	"vmla.f32 		q12, q9, d0[1] 			\n\t"	//q12 += q9 * d0[1]
+	"vmla.f32 		q13, q9, d2[1] 		    \n\t"	//q13 += q9 * d2[1]
+	"vmla.f32 		q14, q9, d4[1] 		    \n\t"	//q14 += q9 * d4[1]
+	"vmla.f32 		q15, q9, d6[1] 		    \n\t"	//q15 += q9 * d6[1]
+	"vmla.f32 		q12, q10, d1[0] 		\n\t"	//q12 += q10 * d1[0]
+	"vmla.f32 		q13, q10, d3[0] 		\n\t"	//q13 += q10 * d3[0]
+	"vmla.f32 		q14, q10, d5[0] 		\n\t"	//q14 += q10 * d5[0]
+	"vmla.f32 		q15, q10, d7[0] 		\n\t"	//q15 += q10 * d7[0]
+	"vmla.f32 		q12, q11, d1[1] 		\n\t"	//q12 += q11 * d1[1]
+	"vmla.f32 		q13, q11, d3[1] 		\n\t"	//q13 += q11 * d3[1]
+	"vmla.f32 		q14, q11, d5[1] 		\n\t"	//q14 += q11 * d5[1]
+	"vmla.f32 		q15, q11, d7[1]	 	    \n\t"	//q15 += q11 * d7[1]
+
+	"vst1.32 		{d24, d25}, [%2]! 		\n\t"	//dest[0][0..3] = q12
+	"vst1.32 		{d26, d27}, [%2]! 	    \n\t"	//dest[1][0..3] = q13
+	"vst1.32 		{d28, d29}, [%2]! 	    \n\t"	//dest[2][0..3] = q14
+	"vst1.32 		{d30, d31}, [%2] 	    \n\t"	//dest[3][0..3] = q15
+
+	:"+r"(m0), "+r"(m1), "+r"(dest):	// read-write operands: the post-increment loads/stores advance all three pointers
+	: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+	"d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+	"d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
+	"memory"
+	);
+}
+
+void TransformVectorNormalize(float vec[3], float mtx[4][4])	// NEON version: vec = normalize(vec[0]*mtx[0] + vec[1]*mtx[1] + vec[2]*mtx[2]), written back in place
+{
+	asm volatile (
+	"vld1.32 		{d0}, [%1]  			\n\t"	//d0 = {vec[0], vec[1]}
+	"flds    		s2, [%1, #8]  			\n\t"	//s2 (d1[0]) = vec[2]
+	"vld1.32 		{d18, d19}, [%0]!		\n\t"	//q9 = mtx[0][0..3]
+	"vld1.32 		{d20, d21}, [%0]!	    \n\t"	//q10 = mtx[1][0..3]
+	"vld1.32 		{d22, d23}, [%0]	    \n\t"	//q11 = mtx[2][0..3]
+
+	"vmul.f32 		q2, q9, d0[0]			\n\t"	//q2 = q9 * vec[0]
+	"vmla.f32 		q2, q10, d0[1]			\n\t"	//q2 += q10 * vec[1]
+	"vmla.f32 		q2, q11, d1[0]			\n\t"	//q2 += q11 * vec[2]
+
+	"vmul.f32 		d0, d4, d4				\n\t"	//d0 = {x*x, y*y} of the transformed vector
+	"vpadd.f32 		d0, d0, d0				\n\t"	//d0 = {x*x + y*y, x*x + y*y}
+	"vmla.f32 		d0, d5, d5				\n\t"	//d0[0] += z*z (only lane 0 is consumed below)
+
+	"vmov.f32 		d1, d0					\n\t"	//d1 = squared length
+	"vrsqrte.f32 	d0, d0					\n\t"	//d0 = ~ 1.0 / sqrt(d0); no zero-length guard here, unlike the C version
+	"vmul.f32 		d2, d0, d1				\n\t"	//d2 = d0 * d1
+	"vrsqrts.f32 	d3, d2, d0				\n\t"	//d3 = (3 - d0 * d2) / 2
+	"vmul.f32 		d0, d0, d3				\n\t"	//d0 = d0 * d3 (first refinement of the estimate)
+	"vmul.f32 		d2, d0, d1				\n\t"	//d2 = d0 * d1
+	"vrsqrts.f32 	d3, d2, d0				\n\t"	//d3 = (3 - d0 * d2) / 2
+	"vmul.f32 		d0, d0, d3				\n\t"	//d0 = d0 * d3 (second refinement of the estimate)
+
+	"vmul.f32 		q2, q2, d0[0]			\n\t"	//q2 = q2 * d0[0], i.e. scale by 1/length
+
+	"vst1.32 		{d4}, [%1] 	    	    \n\t"	//vec[0], vec[1] = d4
+	"fsts   		s10, [%1, #8] 	    	\n\t"	//vec[2] = s10 (d5[0])
+	: "+r"(mtx): "r"(vec)	// mtx is advanced by the post-increment loads; vec is only used as an address
+	: "d0","d1","d2","d3","d4","d5","d18","d19","d20","d21","d22", "d23", "memory"	// d4/d5 (q2, s10) are written above and must be listed
+	);
+}
+
+void Normalize(float v[3])	// NEON version: scales v in place by an iteratively refined 1/sqrt(v.v)
+{
+	asm volatile (
+	"vld1.32 		{d4}, [%0]!	    		\n\t"	//d4={x,y}; %0 advances by 8 bytes
+	"flds    		s10, [%0]   	    	\n\t"	//d5[0] = z
+	"sub    		%0, %0, #8   	    	\n\t"	//rewind %0 to &v[0]
+	"vmul.f32 		d0, d4, d4				\n\t"	//d0= d4*d4
+	"vpadd.f32 		d0, d0, d0				\n\t"	//d0 = d[0] + d[1]
+	"vmla.f32 		d0, d5, d5				\n\t"	//d0 = d0 + d5*d5 (only lane 0 is consumed below)
+
+	"vmov.f32 		d1, d0					\n\t"	//d1 = d0
+	"vrsqrte.f32 	d0, d0					\n\t"	//d0 = ~ 1.0 / sqrt(d0); no zero-length guard here, unlike the C version
+	"vmul.f32 		d2, d0, d1				\n\t"	//d2 = d0 * d1
+	"vrsqrts.f32 	d3, d2, d0				\n\t"	//d3 = (3 - d0 * d2) / 2
+	"vmul.f32 		d0, d0, d3				\n\t"	//d0 = d0 * d3
+	"vmul.f32 		d2, d0, d1				\n\t"	//d2 = d0 * d1
+	"vrsqrts.f32 	d3, d2, d0				\n\t"	//d3 = (3 - d0 * d2) / 2
+	"vmul.f32 		d0, d0, d3				\n\t"	//d0 = d0 * d3
+
+	"vmul.f32 		q2, q2, d0[0]			\n\t"	//q2 = q2 * d0[0], i.e. scale by 1/length
+	"vst1.32 		{d4}, [%0]!  			\n\t"	//v[0], v[1] = d4; %0 advances by 8 bytes
+	"fsts    		s10, [%0]     			\n\t"	//v[2] = s10 (d5[0])
+
+	:"+r"(v) :
+	: "d0", "d1", "d2", "d3", "d4", "d5", "memory"
+	);
+}
+float DotProduct(const float v0[3], const float v1[3])	// NEON version: returns v0[0]*v1[0] + v0[1]*v1[1] + v0[2]*v1[2]
+{
+	float dot;
+	asm volatile (
+	"vld1.32 		{d8}, [%1]!			\n\t"	//d8={x0,y0}; %1 advances by 8 bytes
+	"vld1.32 		{d10}, [%2]!		\n\t"	//d10={x1,y1}; %2 advances by 8 bytes
+	"flds 			s18, [%1, #0]	    \n\t"	//d9[0]={z0}
+	"flds 			s22, [%2, #0]	    \n\t"	//d11[0]={z1}
+	"vmul.f32 		d12, d8, d10		\n\t"	//d12 = d8*d10 = {x0*x1, y0*y1}
+	"vpadd.f32 		d12, d12, d12		\n\t"	//d12 = {d12[0] + d12[1], same}
+	"vmla.f32 		d12, d9, d11		\n\t"	//d12 += d9*d11; lane 0 now holds the dot product
+	"fmrs	        %0, s24	    		\n\t"	//dot = s24 (d12[0])
+	: "=r"(dot), "+r"(v0), "+r"(v1):
+	: "d8", "d9", "d10", "d11", "d12", "memory"	// "memory": the loads read *v0 and *v1, which are not operands
+
+	);
+	return dot;
+}
diff --git a/src/gSP.cpp b/src/gSP.cpp
index bacca97c..d5768d51 100644
--- a/src/gSP.cpp
+++ b/src/gSP.cpp
@@ -381,7 +381,8 @@ void gSPProcessVertex4(u32 v)
 
 	gSPClipVertex4(v);
 }
-#endif
+
+#endif //__VEC4_OPT
 
 static void gSPTransformVertex_default(float vtx[4], float mtx[4][4])
 {
@@ -2427,15 +2428,30 @@ void gSPObjRendermode(u32 _mode)
 	gSP.objRendermode = _mode;
 }
 
+
+#ifdef __NEON_OPT	// prototypes only; the definitions are added in gSPNeon.cpp
+void gSPTransformVertex4NEON(u32 v, float mtx[4][4]);
+void gSPTransformNormal4NEON(u32 v, float mtx[4][4]);
+void gSPBillboardVertex4NEON(u32 v);
+#endif //__NEON_OPT
+
 #ifdef __VEC4_OPT
-void (*gSPTransformVertex4)(u32 v, float mtx[4][4]) =
-		gSPTransformVertex4_default;
-void (*gSPTransformNormal4)(u32 v, float mtx[4][4]) =
-		gSPTransformNormal4_default;
+#ifndef __NEON_OPT	// without NEON: the function pointers start out bound to the *_default implementations
+void (*gSPTransformVertex4)(u32 v, float mtx[4][4]) = gSPTransformVertex4_default;
+void (*gSPTransformNormal4)(u32 v, float mtx[4][4]) = gSPTransformNormal4_default;
+void (*gSPBillboardVertex4)(u32 v) = gSPBillboardVertex4_default;
+#else	// __NEON_OPT: bind the same three pointers to the NEON functions declared above
+void (*gSPTransformVertex4)(u32 v, float mtx[4][4]) = gSPTransformVertex4NEON;
+void (*gSPTransformNormal4)(u32 v, float mtx[4][4]) = gSPTransformNormal4NEON;
+void (*gSPBillboardVertex4)(u32 v) = gSPBillboardVertex4NEON;
+#endif	// !__NEON_OPT
+
 void (*gSPLightVertex4)(u32 v) = gSPLightVertex4_default;
 void (*gSPPointLightVertex4)(u32 v, float _vPos[4][3]) = gSPPointLightVertex4_default;
-void (*gSPBillboardVertex4)(u32 v) = gSPBillboardVertex4_default;
+
 #endif
+
+
 void (*gSPTransformVertex)(float vtx[4], float mtx[4][4]) =
 		gSPTransformVertex_default;
 void (*gSPLightVertex)(SPVertex & _vtx) = gSPLightVertex_default;
@@ -2445,6 +2461,7 @@ void (*gSPBillboardVertex)(u32 v, u32 i) = gSPBillboardVertex_default;
 void gSPSetupFunctions()
 {
 	if (GBI.getMicrocodeType() != F3DEX2CBFD) {
+
 #ifdef __VEC4_OPT
 		gSPLightVertex4 = gSPLightVertex4_default;
 		gSPPointLightVertex4 = gSPPointLightVertex4_default;
diff --git a/src/gSPNeon.cpp b/src/gSPNeon.cpp
new file mode 100644
index 00000000..b27125a3
--- /dev/null
+++ b/src/gSPNeon.cpp
@@ -0,0 +1,202 @@
+#include <stdio.h>
+#include <math.h>
+#include <algorithm>
+#include <assert.h>
+#include "N64.h"
+#include "GLideN64.h"
+#include "Debug.h"
+#include "Types.h"
+#include "RSP.h"
+#include "GBI.h"
+#include "gSP.h"
+#include "gDP.h"
+#include "3DMath.h"
+#include "OpenGL.h"
+#include "CRC.h"
+#include <string.h>
+#include "convert.h"
+#include "S2DEX.h"
+#include "VI.h"
+#include "FrameBuffer.h"
+#include "DepthBuffer.h"
+#include "Config.h"
+#include "Log.h"
+
+void gSPTransformVertex4NEON(u32 v, float mtx[4][4])	// For 4 SPVertex records starting at vertex v: position = mtx[3] + x*mtx[0] + y*mtx[1] + z*mtx[2], stored back in place
+{
+	OGLRender & render = video().getRender();
+	SPVertex & vtx = render.getVertex(v);
+	void *ptr = &vtx.x;
+
+	asm volatile (
+	"vld1.32 		{d0, d1}, [%1]		  	\n\t"	//q0 = {x,y,z,w} of the 1st vertex
+	"add 		    %1, %1, %2   		  	\n\t"	//step %1 by sizeof(SPVertex)
+	"vld1.32 		{d2, d3}, [%1]	    	\n\t"	//q1 = {x,y,z,w} of the 2nd vertex
+	"add 		    %1, %1, %2 	    	  	\n\t"	//step %1 by sizeof(SPVertex)
+	"vld1.32 		{d4, d5}, [%1]	        \n\t"	//q2 = {x,y,z,w} of the 3rd vertex
+	"add 		    %1, %1, %2 		      	\n\t"	//step %1 by sizeof(SPVertex)
+	"vld1.32 		{d6, d7}, [%1]	        \n\t"	//q3 = {x,y,z,w} of the 4th vertex
+	"sub 		    %1, %1, %3   		  	\n\t"	//rewind %1 by 3*sizeof(SPVertex), back to the 1st vertex
+
+	"vld1.32 		{d18, d19}, [%0]!		\n\t"	//q9 = mtx[0][0..3]
+	"vld1.32 		{d20, d21}, [%0]!       \n\t"	//q10 = mtx[1][0..3]
+	"vld1.32 		{d22, d23}, [%0]!       \n\t"	//q11 = mtx[2][0..3]
+	"vld1.32 		{d24, d25}, [%0]        \n\t"	//q12 = mtx[3][0..3]
+
+	"vmov.f32 		q13, q12    			\n\t"	//q13 = q12 (every accumulator starts as mtx[3]; the loaded w lanes are never used)
+	"vmov.f32 		q14, q12    			\n\t"	//q14 = q12
+	"vmov.f32 		q15, q12    			\n\t"	//q15 = q12
+
+	"vmla.f32 		q12, q9, d0[0]			\n\t"	//q12 += q9*d0[0]
+	"vmla.f32 		q13, q9, d2[0]			\n\t"	//q13 += q9*d2[0]
+	"vmla.f32 		q14, q9, d4[0]			\n\t"	//q14 += q9*d4[0]
+	"vmla.f32 		q15, q9, d6[0]			\n\t"	//q15 += q9*d6[0]
+	"vmla.f32 		q12, q10, d0[1]			\n\t"	//q12 += q10*d0[1]
+	"vmla.f32 		q13, q10, d2[1]			\n\t"	//q13 += q10*d2[1]
+	"vmla.f32 		q14, q10, d4[1]			\n\t"	//q14 += q10*d4[1]
+	"vmla.f32 		q15, q10, d6[1]			\n\t"	//q15 += q10*d6[1]
+	"vmla.f32 		q12, q11, d1[0]			\n\t"	//q12 += q11*d1[0]
+	"vmla.f32 		q13, q11, d3[0]			\n\t"	//q13 += q11*d3[0]
+	"vmla.f32 		q14, q11, d5[0]			\n\t"	//q14 += q11*d5[0]
+	"vmla.f32 		q15, q11, d7[0]			\n\t"	//q15 += q11*d7[0]
+
+	"vst1.32 		{d24, d25}, [%1] 		\n\t"	//1st vertex position = q12
+	"add 		    %1, %1, %2 		      	\n\t"	//step %1 by sizeof(SPVertex)
+	"vst1.32 		{d26, d27}, [%1] 	    \n\t"	//2nd vertex position = q13
+	"add 		    %1, %1, %2 	    	  	\n\t"	//step %1 by sizeof(SPVertex)
+	"vst1.32 		{d28, d29}, [%1] 	    \n\t"	//3rd vertex position = q14
+	"add 		    %1, %1, %2   		  	\n\t"	//step %1 by sizeof(SPVertex)
+	"vst1.32 		{d30, d31}, [%1]     	\n\t"	//4th vertex position = q15
+
+	: "+&r"(mtx), "+&r"(ptr)
+	: "I"(sizeof(SPVertex)), "I"(3 * sizeof(SPVertex))
+	: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+	  "d18","d19", "d20", "d21", "d22", "d23", "d24",
+	  "d25", "d26", "d27", "d28", "d29", "d30", "d31", "memory"
+	);
+}
+
+//4x Transform normal and normalize: for 4 SPVertex records starting at vertex v, n = normalize(nx*mtx[0] + ny*mtx[1] + nz*mtx[2])
+void gSPTransformNormal4NEON(u32 v, float mtx[4][4])
+{
+	OGLRender & render = video().getRender();
+	SPVertex & vtx = render.getVertex(v);
+	void *ptr = &vtx.nx;
+
+	asm volatile (
+	"vld1.32 		{d0, d1}, [%1]		  	\n\t"	//q0 = {nx,ny,nz,pad} of the 1st vertex
+	"add 		    %1, %1, %2  		  	\n\t"	//step %1 by sizeof(SPVertex)
+	"vld1.32 		{d2, d3}, [%1]	    	\n\t"	//q1 = {nx,ny,nz,pad} of the 2nd vertex
+	"add 		    %1, %1, %2  		  	\n\t"	//step %1 by sizeof(SPVertex)
+	"vld1.32 		{d4, d5}, [%1]	        \n\t"	//q2 = {nx,ny,nz,pad} of the 3rd vertex
+	"add 		    %1, %1, %2  		  	\n\t"	//step %1 by sizeof(SPVertex)
+	"vld1.32 		{d6, d7}, [%1]	        \n\t"	//q3 = {nx,ny,nz,pad} of the 4th vertex
+	"sub 		    %1, %1, %3  		  	\n\t"	//rewind %1 by 3*sizeof(SPVertex), back to the 1st vertex
+
+	"vld1.32 		{d18, d19}, [%0]!		\n\t"	//q9 = m
+	"vld1.32 		{d20, d21}, [%0]!	    \n\t"	//q10 = m+16
+	"vld1.32 		{d22, d23}, [%0]    	\n\t"	//q11 = m+32
+
+	"vmul.f32 		q12, q9, d0[0]			\n\t"	//q12 = q9*d0[0]
+	"vmul.f32 		q13, q9, d2[0]			\n\t"	//q13 = q9*d2[0]
+	"vmul.f32 		q14, q9, d4[0]			\n\t"	//q14 = q9*d4[0]
+	"vmul.f32 		q15, q9, d6[0]			\n\t"	//q15 = q9*d6[0]
+
+	"vmla.f32 		q12, q10, d0[1]			\n\t"	//q12 += q10*d0[1]
+	"vmla.f32 		q13, q10, d2[1]			\n\t"	//q13 += q10*d2[1]
+	"vmla.f32 		q14, q10, d4[1]			\n\t"	//q14 += q10*d4[1]
+	"vmla.f32 		q15, q10, d6[1]			\n\t"	//q15 += q10*d6[1]
+
+	"vmla.f32 		q12, q11, d1[0]			\n\t"	//q12 += q11*d1[0]
+	"vmla.f32 		q13, q11, d3[0]			\n\t"	//q13 += q11*d3[0]
+	"vmla.f32 		q14, q11, d5[0]			\n\t"	//q14 += q11*d5[0]
+	"vmla.f32 		q15, q11, d7[0]			\n\t"	//q15 += q11*d7[0]
+
+	"vmul.f32 		q0, q12, q12			\n\t"	//q0 = q12*q12
+	"vmul.f32 		q1, q13, q13			\n\t"	//q1 = q13*q13
+	"vmul.f32 		q2, q14, q14			\n\t"	//q2 = q14*q14
+	"vmul.f32 		q3, q15, q15			\n\t"	//q3 = q15*q15
+
+	"vpadd.f32 		d0, d0  				\n\t"	//d0[0] = d0[0] + d0[1]
+	"vpadd.f32 		d2, d2  				\n\t"	//d2[0] = d2[0] + d2[1]
+	"vpadd.f32 		d4, d4  				\n\t"	//d4[0] = d4[0] + d4[1]
+	"vpadd.f32 		d6, d6  				\n\t"	//d6[0] = d6[0] + d6[1]
+
+	"vmov.f32    	s1, s2  				\n\t"	//d0[1] = d1[0]
+	"vmov.f32 	    s5, s6  				\n\t"	//d2[1] = d3[0]
+	"vmov.f32 	    s9, s10  				\n\t"	//d4[1] = d5[0]
+	"vmov.f32    	s13, s14  				\n\t"	//d6[1] = d7[0]
+
+	"vpadd.f32 		d0, d0, d2  			\n\t"	//d0 = {d0[0] + d0[1], d2[0] + d2[1]}
+	"vpadd.f32 		d1, d4, d6  			\n\t"	//d1 = {d4[0] + d4[1], d6[0] + d6[1]}; q0 now holds the 4 squared lengths
+
+	"vmov.f32 		q1, q0					\n\t"	//q1 = q0
+	"vrsqrte.f32 	q0, q0					\n\t"	//q0 = ~ 1.0 / sqrt(q0); no zero-length guard here
+	"vmul.f32 		q2, q0, q1				\n\t"	//q2 = q0 * q1
+	"vrsqrts.f32 	q3, q2, q0				\n\t"	//q3 = (3 - q0 * q2) / 2
+	"vmul.f32 		q0, q0, q3				\n\t"	//q0 = q0 * q3
+	"vmul.f32 		q2, q0, q1				\n\t"	//q2 = q0 * q1
+	"vrsqrts.f32 	q3, q2, q0				\n\t"	//q3 = (3 - q0 * q2) / 2
+	"vmul.f32 		q0, q0, q3				\n\t"	//q0 = q0 * q3
+
+	"vmul.f32 		q3, q15, d1[1]			\n\t"	//q3 = q15*d1[1]
+	"vmul.f32 		q2, q14, d1[0]			\n\t"	//q2 = q14*d1[0]
+	"vmul.f32 		q1, q13, d0[1]			\n\t"	//q1 = q13*d0[1]
+	"vmul.f32 		q0, q12, d0[0]			\n\t"	//q0 = q12*d0[0] (done last because it overwrites the scale factors in d0/d1)
+
+	"vst1.32 		{d0, d1}, [%1]  	    \n\t"	//d0={nx,ny,nz,pad}
+	"add 		    %1, %1, %2   		  	\n\t"	//step %1 by sizeof(SPVertex)
+	"vst1.32 		{d2, d3}, [%1]  	    \n\t"	//d2={nx,ny,nz,pad}
+	"add 		    %1, %1, %2  		  	\n\t"	//step %1 by sizeof(SPVertex)
+	"vst1.32 		{d4, d5}, [%1]  	    \n\t"	//d4={nx,ny,nz,pad}
+	"add 		    %1, %1, %2  		  	\n\t"	//step %1 by sizeof(SPVertex)
+	"vst1.32 		{d6, d7}, [%1]        	\n\t"	//d6={nx,ny,nz,pad}
+
+	: "+&r"(mtx), "+&r"(ptr)
+	: "I"(sizeof(SPVertex)), "I"(3 * sizeof(SPVertex))
+	: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+		"d16","d17", "d18","d19", "d20", "d21", "d22",
+		"d23", "d24", "d25", "d26", "d27", "d28", "d29",
+		"d30", "d31", "memory"
+	);
+}
+
+void gSPBillboardVertex4NEON(u32 v)	// Adds the 4 floats at vertex 0's x to the same 4 floats of each of 4 SPVertex records starting at vertex v
+{
+	int i = 0;
+
+	OGLRender & render = video().getRender();
+	SPVertex & vtx0 = render.getVertex(v);
+	SPVertex & vtx1 = render.getVertex(i);
+
+	void *ptr0 = (void*)&vtx0.x;
+	void *ptr1 = (void*)&vtx1.x;
+	asm volatile (
+
+	"vld1.32 		{d0, d1}, [%0]		  	\n\t"	//q0 = {x,y,z,w} of the 1st vertex
+	"add 		    %0, %0, %2 		  	    \n\t"	//step %0 by sizeof(SPVertex)
+	"vld1.32 		{d2, d3}, [%0]	    	\n\t"	//q1 = {x,y,z,w} of the 2nd vertex
+	"add 		    %0, %0, %2 		      	\n\t"	//step %0 by sizeof(SPVertex)
+	"vld1.32 		{d4, d5}, [%0]	        \n\t"	//q2 = {x,y,z,w} of the 3rd vertex
+	"add 		    %0, %0, %2 	    	  	\n\t"	//step %0 by sizeof(SPVertex)
+	"vld1.32 		{d6, d7}, [%0]	        \n\t"	//q3 = {x,y,z,w} of the 4th vertex
+	"sub 		    %0, %0, %3   		  	\n\t"	//rewind %0 by 3*sizeof(SPVertex), back to the 1st vertex
+
+	"vld1.32 		{d16, d17}, [%1]		\n\t"	//q8 = {x,y,z,w} of vertex 0 (vtx1)
+	"vadd.f32 		q0, q0, q8 			    \n\t"	//q0 += q8
+	"vadd.f32 		q1, q1, q8 			    \n\t"	//q1 += q8
+	"vadd.f32 		q2, q2, q8 			    \n\t"	//q2 += q8
+	"vadd.f32 		q3, q3, q8 			    \n\t"	//q3 += q8
+	"vst1.32 		{d0, d1}, [%0] 		    \n\t"	//1st vertex = q0
+	"add 		    %0, %0, %2  		  	\n\t"	//step %0 by sizeof(SPVertex)
+	"vst1.32 		{d2, d3}, [%0]          \n\t"	//2nd vertex = q1
+	"add 		    %0, %0, %2 		  	    \n\t"	//step %0 by sizeof(SPVertex)
+	"vst1.32 		{d4, d5}, [%0]          \n\t"	//3rd vertex = q2
+	"add 		    %0, %0, %2 		  	    \n\t"	//step %0 by sizeof(SPVertex)
+	"vst1.32 		{d6, d7}, [%0]          \n\t"	//4th vertex = q3
+	: "+&r"(ptr0), "+&r"(ptr1)
+	: "I"(sizeof(SPVertex)), "I"(3 * sizeof(SPVertex))
+	: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+	  "d16", "d17", "memory"
+	);
+}