diff --git a/src/3DMath.cpp b/src/3DMath.cpp
index 42e675b8..bda882c8 100644
--- a/src/3DMath.cpp
+++ b/src/3DMath.cpp
@@ -3,26 +3,19 @@
 
 void MultMatrix(float m0[4][4], float m1[4][4], float dest[4][4])
 {
-    int i;
-    for (i = 0; i < 4; i++)
-    {
-        dest[0][i] = m0[0][i]*m1[0][0] + m0[1][i]*m1[0][1] + m0[2][i]*m1[0][2] + m0[3][i]*m1[0][3];
-        dest[1][i] = m0[0][i]*m1[1][0] + m0[1][i]*m1[1][1] + m0[2][i]*m1[1][2] + m0[3][i]*m1[1][3];
-        dest[2][i] = m0[0][i]*m1[2][0] + m0[1][i]*m1[2][1] + m0[2][i]*m1[2][2] + m0[3][i]*m1[2][3];
-        dest[3][i] = m0[3][i]*m1[3][3] + m0[2][i]*m1[3][2] + m0[1][i]*m1[3][1] + m0[0][i]*m1[3][0];
-    }
-}
-
-void MultMatrix2(float m0[4][4], float m1[4][4])
-{
-    float dst[4][4];
-    MultMatrix(m0, m1, dst);
-    memcpy( m0, dst, sizeof(float) * 16 );
+	int i;
+	for (i = 0; i < 4; i++)
+	{
+		dest[0][i] = m0[0][i]*m1[0][0] + m0[1][i]*m1[0][1] + m0[2][i]*m1[0][2] + m0[3][i]*m1[0][3];
+		dest[1][i] = m0[0][i]*m1[1][0] + m0[1][i]*m1[1][1] + m0[2][i]*m1[1][2] + m0[3][i]*m1[1][3];
+		dest[2][i] = m0[0][i]*m1[2][0] + m0[1][i]*m1[2][1] + m0[2][i]*m1[2][2] + m0[3][i]*m1[2][3];
+		dest[3][i] = m0[3][i]*m1[3][3] + m0[2][i]*m1[3][2] + m0[1][i]*m1[3][1] + m0[0][i]*m1[3][0];
+	}
 }
 
 void TransformVectorNormalize(float vec[3], float mtx[4][4])
 {
-    float len;
+	float len;
 	float vres[3];
 
 	vres[0] = mtx[0][0] * vec[0]
@@ -35,12 +28,86 @@ void TransformVectorNormalize(float vec[3], float mtx[4][4])
 		+ mtx[1][2] * vec[1]
 		+ mtx[2][2] * vec[2];
 	memcpy(vec, vres, sizeof(float)*3);
-    len = vec[0]*vec[0] + vec[1]*vec[1] + vec[2]*vec[2];
-    if (len != 0.0)
-    {
-        len = sqrtf(len);
-        vec[0] /= len;
-        vec[1] /= len;
-        vec[2] /= len;
-    }
+	len = vec[0]*vec[0] + vec[1]*vec[1] + vec[2]*vec[2];
+	if (len != 0.0)
+	{
+		len = sqrtf(len);
+		vec[0] /= len;
+		vec[1] /= len;
+		vec[2] /= len;
+	}
+}
+
+// Scales v to unit length in place; a vector whose squared length is zero is left unchanged.
+void Normalize(float v[3])
+{
+#ifdef WIN32_ASM
+	__asm {
+		mov		esi, dword ptr [v]
+		//	ST(6)	ST(5)	ST(4)	ST(3)	ST(2)	ST(1)	ST
+		fld		dword ptr [esi+08h]		// v2
+		fld		dword ptr [esi+04h]		// v2 v1
+		fld		dword ptr [esi]			// v2 v1 v0
+		fld1							// v2 v1 v0 1.0
+		fld		ST(3)					// v2 v1 v0 1.0 v2
+		fmul	ST, ST					// v2 v1 v0 1.0 v2*v2
+		fld		ST(3)					// v2 v1 v0 1.0 v2*v2 v1
+		fmul	ST, ST					// v2 v1 v0 1.0 v2*v2 v1*v1
+		fld		ST(3)					// v2 v1 v0 1.0 v2*v2 v1*v1 v0
+		fmul	ST, ST					// v2 v1 v0 1.0 v2*v2 v1*v1 v0*v0
+		fadd							// v2 v1 v0 1.0 v2*v2 v1*v1+v0*v0
+		fadd							// v2 v1 v0 1.0 v2*v2+v1*v1+v0*v0
+		ftst							// Compare ST to 0
+		fstsw	ax						// Store FPU status word in ax
+		sahf							// Transfer ax to flags register
+		jz		End						// Skip if length is zero
+		fsqrt							// v2 v1 v0 1.0 len
+		fdiv							// v2 v1 v0 1.0/len
+		fmul	ST(3), ST				// v2*(1.0/len) v1 v0 1.0/len
+		fmul	ST(2), ST				// v2*(1.0/len) v1*(1.0/len) v0 1.0/len
+		fmul							// v2*(1.0/len) v1*(1.0/len) v0*(1.0/len)
+		fstp	dword ptr [esi]			// v2*(1.0/len) v1*(1.0/len)
+		fstp	dword ptr [esi+04h]		// v2*(1.0/len)
+		fstp	dword ptr [esi+08h]		//
+End:
+		finit
+	}
+#else // WIN32_ASM
+	float len;
+
+	len = v[0]*v[0] + v[1]*v[1] + v[2]*v[2];
+	if (len != 0.0) {
+		len = sqrtf( len );
+		v[0] /= len;
+		v[1] /= len;
+		v[2] /= len;
+	}
+#endif // WIN32_ASM
+}
+
+
+// Returns v0[0]*v1[0] + v0[1]*v1[1] + v0[2]*v1[2].
+float DotProduct(const float v0[3], const float v1[3])
+{
+	float	dot;
+#ifdef WIN32_ASM
+	__asm {
+		mov		esi, dword ptr [v0]
+		mov		edi, dword ptr [v1]
+		lea		ebx, [dot]
+
+		fld		dword ptr [esi]
+		fmul	dword ptr [edi]
+		fld		dword ptr [esi+04h]
+		fmul	dword ptr [edi+04h]
+		fld		dword ptr [esi+08h]
+		fmul	dword ptr [edi+08h]
+		fadd
+		fadd
+		fstp	dword ptr [ebx]
+	}
+#else // WIN32_ASM
+	dot = v0[0]*v1[0] + v0[1]*v1[1] + v0[2]*v1[2];
+#endif // WIN32_ASM
+	return dot;
 }
diff --git a/src/3DMath.h b/src/3DMath.h
index 5902ccf5..00815b6b 100644
--- a/src/3DMath.h
+++ b/src/3DMath.h
@@ -4,8 +4,17 @@
 #include
 
 void MultMatrix( float m0[4][4], float m1[4][4], float dest[4][4]);
-void MultMatrix2(float m0[4][4], float m1[4][4] );
 void TransformVectorNormalize(float vec[3], float mtx[4][4]);
+void Normalize(float v[3]);
+float DotProduct(const float v0[3], const float v1[3]);
+
+// In-place variant: computes MultMatrix(m0, m1) into a temporary, then copies it over m0.
+inline void MultMatrix2(float m0[4][4], float m1[4][4])
+{
+    float dst[4][4];
+    MultMatrix(m0, m1, dst);
+    memcpy( m0, dst, sizeof(float) * 16 );
+}
 
 inline void CopyMatrix( float m0[4][4], float m1[4][4] )
 {
@@ -94,76 +103,4 @@ inline void Transpose3x3Matrix( float mtx[4][4] )
 #endif // WIN32_ASM
 }
 
-inline void Normalize(float v[3])
-{
-#ifdef WIN32_ASM
-	__asm {
-		mov		esi, dword ptr [v]
-		//	ST(6)	ST(5)	ST(4)	ST(3)	ST(2)	ST(1)	ST
-		fld		dword ptr [esi+08h]		// v2
-		fld		dword ptr [esi+04h]		// v2 v1
-		fld		dword ptr [esi]			// v2 v1 v0
-		fld1							// v2 v1 v0 1.0
-		fld		ST(3)					// v2 v1 v0 1.0 v2
-		fmul	ST, ST					// v2 v1 v0 1.0 v2*v2
-		fld		ST(3)					// v2 v1 v0 1.0 v2*v2 v1
-		fmul	ST, ST					// v2 v1 v0 1.0 v2*v2 v1*v1
-		fld		ST(3)					// v2 v1 v0 1.0 v2*v2 v1*v1 v0
-		fmul	ST, ST					// v2 v1 v0 1.0 v2*v2 v1*v1 v0*v0
-		fadd							// v2 v1 v0 1.0 v2*v2 v1*v1+v0*v0
-		fadd							// v2 v1 v0 1.0 v2*v2+v1*v1+v0*v0
-		ftst							// Compare ST to 0
-		fstsw	ax						// Store FPU status word in ax
-		sahf							// Transfer ax to flags register
-		jz		End						// Skip if length is zero
-		fsqrt							// v2 v1 v0 1.0 len
-		fdiv							// v2 v1 v0 1.0/len
-		fmul	ST(3), ST				// v2*(1.0/len) v1 v0 1.0/len
-		fmul	ST(2), ST				// v2*(1.0/len) v1*(1.0/len) v0 1.0/len
-		fmul							// v2*(1.0/len) v1*(1.0/len) v0*(1.0/len)
-		fstp	dword ptr [esi]			// v2*(1.0/len) v1*(1.0/len)
-		fstp	dword ptr [esi+04h]		// v2*(1.0/len)
-		fstp	dword ptr [esi+08h]		//
-End:
-		finit
-	}
-#else // WIN32_ASM
-	float len;
-
-	len = v[0]*v[0] + v[1]*v[1] + v[2]*v[2];
-	if (len != 0.0) {
-		len = sqrtf( len );
-		v[0] /= len;
-		v[1] /= len;
-		v[2] /= len;
-	}
-#endif // WIN32_ASM
-}
-
-
-inline float DotProduct(const float v0[3], const float v1[3])
-{
-	float	dot;
-#ifdef WIN32_ASM
-	__asm {
-		mov		esi, dword ptr [v0]
-		mov		edi, dword ptr [v1]
-		lea		ebx, [dot]
-
-		fld		dword ptr [esi]
-		fmul	dword ptr [edi]
-		fld		dword ptr [esi+04h]
-		fmul	dword ptr [edi+04h]
-		fld		dword ptr [esi+08h]
-		fmul	dword ptr [edi+08h]
-		fadd
-		fadd
-		fstp	dword ptr [ebx]
-	}
-#else // WIN32_ASM
-	dot = v0[0]*v1[0] + v0[1]*v1[1] + v0[2]*v1[2];
-#endif // WIN32_ASM
-	return dot;
-}
-
 #endif
diff --git a/src/3DMathNeon.cpp b/src/3DMathNeon.cpp
new file mode 100644
index 00000000..5db7d47d
--- /dev/null
+++ b/src/3DMathNeon.cpp
@@ -0,0 +1,138 @@
+#include "3DMath.h"
+
+// NEON MultMatrix: dest[j] = sum over k of m0[k] * m1[j][k], with each matrix row
+// handled as a 4-float vector (same formula as the scalar MultMatrix in 3DMath.cpp).
+// All eight input rows are loaded before the first store to dest.
+void MultMatrix( float m0[4][4], float m1[4][4], float dest[4][4])
+{
+    asm volatile (
+    "vld1.32        {d0, d1}, [%1]!         \n\t"   //q0  = m1[0]
+    "vld1.32        {d2, d3}, [%1]!         \n\t"   //q1  = m1[1]
+    "vld1.32        {d4, d5}, [%1]!         \n\t"   //q2  = m1[2]
+    "vld1.32        {d6, d7}, [%1]          \n\t"   //q3  = m1[3]
+    "vld1.32        {d16, d17}, [%0]!       \n\t"   //q8  = m0[0]
+    "vld1.32        {d18, d19}, [%0]!       \n\t"   //q9  = m0[1]
+    "vld1.32        {d20, d21}, [%0]!       \n\t"   //q10 = m0[2]
+    "vld1.32        {d22, d23}, [%0]        \n\t"   //q11 = m0[3]
+
+    "vmul.f32       q12, q8, d0[0]          \n\t"   //q12  = q8 * m1[0][0]
+    "vmul.f32       q13, q8, d2[0]          \n\t"   //q13  = q8 * m1[1][0]
+    "vmul.f32       q14, q8, d4[0]          \n\t"   //q14  = q8 * m1[2][0]
+    "vmul.f32       q15, q8, d6[0]          \n\t"   //q15  = q8 * m1[3][0]
+    "vmla.f32       q12, q9, d0[1]          \n\t"   //q12 += q9 * m1[0][1]
+    "vmla.f32       q13, q9, d2[1]          \n\t"   //q13 += q9 * m1[1][1]
+    "vmla.f32       q14, q9, d4[1]          \n\t"   //q14 += q9 * m1[2][1]
+    "vmla.f32       q15, q9, d6[1]          \n\t"   //q15 += q9 * m1[3][1]
+    "vmla.f32       q12, q10, d1[0]         \n\t"   //q12 += q10 * m1[0][2]
+    "vmla.f32       q13, q10, d3[0]         \n\t"   //q13 += q10 * m1[1][2]
+    "vmla.f32       q14, q10, d5[0]         \n\t"   //q14 += q10 * m1[2][2]
+    "vmla.f32       q15, q10, d7[0]         \n\t"   //q15 += q10 * m1[3][2]
+    "vmla.f32       q12, q11, d1[1]         \n\t"   //q12 += q11 * m1[0][3]
+    "vmla.f32       q13, q11, d3[1]         \n\t"   //q13 += q11 * m1[1][3]
+    "vmla.f32       q14, q11, d5[1]         \n\t"   //q14 += q11 * m1[2][3]
+    "vmla.f32       q15, q11, d7[1]         \n\t"   //q15 += q11 * m1[3][3]
+
+    "vst1.32        {d24, d25}, [%2]!       \n\t"   //dest[0] = q12
+    "vst1.32        {d26, d27}, [%2]!       \n\t"   //dest[1] = q13
+    "vst1.32        {d28, d29}, [%2]!       \n\t"   //dest[2] = q14
+    "vst1.32        {d30, d31}, [%2]        \n\t"   //dest[3] = q15
+
+    :"+r"(m0), "+r"(m1), "+r"(dest):
+    : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+    "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+    "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
+    "memory"
+    );
+}
+
+// NEON TransformVectorNormalize: vec = vec[0]*mtx[0] + vec[1]*mtx[1] + vec[2]*mtx[2]
+// (first three components), then scaled by an approximate 1/sqrt(len^2) obtained from
+// vrsqrte plus two vrsqrts refinement steps. A zero-length result is stored as zero.
+void TransformVectorNormalize(float vec[3], float mtx[4][4])
+{
+    asm volatile (
+    "vld1.32        {d0}, [%1]              \n\t"   //d0    = {vec[0], vec[1]}
+    "flds           s2, [%1, #8]            \n\t"   //d1[0] = vec[2]
+    "vld1.32        {d18, d19}, [%0]!       \n\t"   //q9  = mtx[0]
+    "vld1.32        {d20, d21}, [%0]!       \n\t"   //q10 = mtx[1]
+    "vld1.32        {d22, d23}, [%0]        \n\t"   //q11 = mtx[2]
+
+    "vmul.f32       q2, q9, d0[0]           \n\t"   //q2  = mtx[0] * vec[0]
+    "vmla.f32       q2, q10, d0[1]          \n\t"   //q2 += mtx[1] * vec[1]
+    "vmla.f32       q2, q11, d1[0]          \n\t"   //q2 += mtx[2] * vec[2]
+
+    "vmul.f32       d0, d4, d4              \n\t"   //d0 = {x*x, y*y}
+    "vpadd.f32      d0, d0, d0              \n\t"   //d0[0] = x*x + y*y
+    "vmla.f32       d0, d5, d5              \n\t"   //d0[0] += z*z
+
+    "vmov.f32       d1, d0                  \n\t"   //d1 = len^2
+    "vrsqrte.f32    d0, d0                  \n\t"   //d0 = ~ 1.0 / sqrt(d0)
+    "vmul.f32       d2, d0, d1              \n\t"   //d2 = d0 * d1
+    "vrsqrts.f32    d3, d2, d0              \n\t"   //d3 = (3 - d0 * d2) / 2
+    "vmul.f32       d0, d0, d3              \n\t"   //d0 = d0 * d3
+    "vmul.f32       d2, d0, d1              \n\t"   //d2 = d0 * d1
+    "vrsqrts.f32    d3, d2, d0              \n\t"   //d3 = (3 - d0 * d2) / 2
+    "vmul.f32       d0, d0, d3              \n\t"   //d0 = d0 * d3
+    "vceq.f32       d2, d1, #0              \n\t"   //d2 = all ones where len^2 == 0
+    "vbic           d0, d0, d2              \n\t"   //scale = 0 there (scalar code skips the divide)
+
+    "vmul.f32       q2, q2, d0[0]           \n\t"   //q2 = q2 * scale
+
+    "vst1.32        {d4}, [%1]              \n\t"   //vec[0], vec[1] = q2[0], q2[1]
+    "fsts           s10, [%1, #8]           \n\t"   //vec[2] = q2[2]
+    : "+&r"(mtx): "r"(vec)
+    : "d0","d1","d2","d3","d4","d5","d18","d19","d20","d21","d22", "d23", "memory"
+    );
+}
+
+// NEON Normalize: scales v by an approximate 1/sqrt(len^2) (vrsqrte plus two vrsqrts
+// refinement steps). A zero-length v is stored back as zero, matching the scalar guard.
+void Normalize(float v[3])
+{
+    asm volatile (
+    "vld1.32        {d4}, [%0]!             \n\t"   //d4 = {x, y}
+    "flds           s10, [%0]               \n\t"   //d5[0] = z
+    "sub            %0, %0, #8              \n\t"   //rewind pointer to v[0]
+    "vmul.f32       d0, d4, d4              \n\t"   //d0 = {x*x, y*y}
+    "vpadd.f32      d0, d0, d0              \n\t"   //d0[0] = x*x + y*y
+    "vmla.f32       d0, d5, d5              \n\t"   //d0[0] += z*z
+
+    "vmov.f32       d1, d0                  \n\t"   //d1 = len^2
+    "vrsqrte.f32    d0, d0                  \n\t"   //d0 = ~ 1.0 / sqrt(d0)
+    "vmul.f32       d2, d0, d1              \n\t"   //d2 = d0 * d1
+    "vrsqrts.f32    d3, d2, d0              \n\t"   //d3 = (3 - d0 * d2) / 2
+    "vmul.f32       d0, d0, d3              \n\t"   //d0 = d0 * d3
+    "vmul.f32       d2, d0, d1              \n\t"   //d2 = d0 * d1
+    "vrsqrts.f32    d3, d2, d0              \n\t"   //d3 = (3 - d0 * d2) / 2
+    "vmul.f32       d0, d0, d3              \n\t"   //d0 = d0 * d3
+    "vceq.f32       d2, d1, #0              \n\t"   //d2 = all ones where len^2 == 0
+    "vbic           d0, d0, d2              \n\t"   //scale = 0 there (scalar code skips the divide)
+
+    "vmul.f32       q2, q2, d0[0]           \n\t"   //q2 = q2 * scale
+    "vst1.32        {d4}, [%0]!             \n\t"   //v[0], v[1] = q2[0], q2[1]
+    "fsts           s10, [%0]               \n\t"   //v[2] = q2[2]
+
+    :"+r"(v) :
+    : "d0", "d1", "d2", "d3", "d4", "d5", "memory"
+    );
+}
+
+// NEON DotProduct: returns v0[0]*v1[0] + v0[1]*v1[1] + v0[2]*v1[2].
+// Only d0-d4 are used as scratch; "memory" is clobbered because the asm reads *v0 and *v1.
+float DotProduct(const float v0[3], const float v1[3])
+{
+    float dot;
+    asm volatile (
+    "vld1.32        {d0}, [%1]!             \n\t"   //d0 = {x0, y0}
+    "vld1.32        {d2}, [%2]!             \n\t"   //d2 = {x1, y1}
+    "flds           s2, [%1, #0]            \n\t"   //d1[0] = z0
+    "flds           s6, [%2, #0]            \n\t"   //d3[0] = z1
+    "vmul.f32       d4, d0, d2              \n\t"   //d4 = {x0*x1, y0*y1}
+    "vpadd.f32      d4, d4, d4              \n\t"   //d4[0] = x0*x1 + y0*y1
+    "vmla.f32       d4, d1, d3              \n\t"   //d4[0] += z0*z1
+    "fmrs           %0, s8                  \n\t"   //dot = d4[0]
+    : "=r"(dot), "+r"(v0), "+r"(v1):
+    : "d0", "d1", "d2", "d3", "d4", "memory"
+    );
+    return dot;
+}
diff --git a/src/gSP.cpp b/src/gSP.cpp
index bacca97c..d5768d51 100644
--- a/src/gSP.cpp
+++ b/src/gSP.cpp
@@ -381,7 +381,8 @@ void gSPProcessVertex4(u32 v)
 
 	gSPClipVertex4(v);
 }
-#endif
+
+#endif //__VEC4_OPT
 
 static void gSPTransformVertex_default(float vtx[4], float mtx[4][4])
 {
@@ -2427,15 +2428,30 @@ void gSPObjRendermode(u32 _mode)
 	gSP.objRendermode = _mode;
 }
 
+
+#ifdef __NEON_OPT
+void gSPTransformVertex4NEON(u32 v, float mtx[4][4]);
+void gSPTransformNormal4NEON(u32 v, float mtx[4][4]);
+void gSPBillboardVertex4NEON(u32 v);
+#endif //__NEON_OPT
+
 #ifdef __VEC4_OPT
-void (*gSPTransformVertex4)(u32 v, float mtx[4][4]) =
-		gSPTransformVertex4_default;
-void (*gSPTransformNormal4)(u32 v, float mtx[4][4]) =
-		gSPTransformNormal4_default;
+#ifndef __NEON_OPT
+void (*gSPTransformVertex4)(u32 v, float mtx[4][4]) = gSPTransformVertex4_default;
+void (*gSPTransformNormal4)(u32 v, float mtx[4][4]) = gSPTransformNormal4_default;
+void (*gSPBillboardVertex4)(u32 v) = gSPBillboardVertex4_default;
+#else //__NEON_OPT
+void (*gSPTransformVertex4)(u32 v, float mtx[4][4]) = gSPTransformVertex4NEON;
+void (*gSPTransformNormal4)(u32 v, float mtx[4][4]) = gSPTransformNormal4NEON;
+void (*gSPBillboardVertex4)(u32 v) = gSPBillboardVertex4NEON;
+#endif //__NEON_OPT
+
 void (*gSPLightVertex4)(u32 v) = gSPLightVertex4_default;
 void (*gSPPointLightVertex4)(u32 v, float _vPos[4][3]) = gSPPointLightVertex4_default;
-void (*gSPBillboardVertex4)(u32 v) = gSPBillboardVertex4_default;
+
 #endif
+
+
 void (*gSPTransformVertex)(float vtx[4], float mtx[4][4]) =
 		gSPTransformVertex_default;
 void (*gSPLightVertex)(SPVertex & _vtx) = gSPLightVertex_default;
@@ -2445,6 +2461,7 @@ void (*gSPBillboardVertex)(u32 v, u32 i) = gSPBillboardVertex_default;
 void gSPSetupFunctions()
 {
 	if (GBI.getMicrocodeType() != F3DEX2CBFD) {
+
 #ifdef __VEC4_OPT
 		gSPLightVertex4 = gSPLightVertex4_default;
 		gSPPointLightVertex4 = gSPPointLightVertex4_default;
diff --git a/src/gSPNeon.cpp b/src/gSPNeon.cpp
new file mode 100644
index 00000000..b27125a3
--- /dev/null
+++ b/src/gSPNeon.cpp
@@ -0,0 +1,212 @@
+// NOTE(review): the bare "#include" lines below carry no header name in this copy of
+// the patch; restore the intended system headers before building.
+#include
+#include
+#include
+#include
+#include "N64.h"
+#include "GLideN64.h"
+#include "Debug.h"
+#include "Types.h"
+#include "RSP.h"
+#include "GBI.h"
+#include "gSP.h"
+#include "gDP.h"
+#include "3DMath.h"
+#include "OpenGL.h"
+#include "CRC.h"
+#include
+#include "convert.h"
+#include "S2DEX.h"
+#include "VI.h"
+#include "FrameBuffer.h"
+#include "DepthBuffer.h"
+#include "Config.h"
+#include "Log.h"
+
+// Transforms the positions of four SPVertex entries, sizeof(SPVertex) apart, starting at v:
+// {x,y,z,w} = mtx[3] + x*mtx[0] + y*mtx[1] + z*mtx[2]. The loaded w is not used.
+void gSPTransformVertex4NEON(u32 v, float mtx[4][4])
+{
+    OGLRender & render = video().getRender();
+    SPVertex & vtx = render.getVertex(v);
+    void *ptr = &vtx.x;
+
+    asm volatile (
+    "vld1.32        {d0, d1}, [%1]          \n\t"   //q0 = vertex 0 {x,y,z,w}
+    "add            %1, %1, %2              \n\t"   //ptr += sizeof(SPVertex)
+    "vld1.32        {d2, d3}, [%1]          \n\t"   //q1 = vertex 1 {x,y,z,w}
+    "add            %1, %1, %2              \n\t"   //ptr += sizeof(SPVertex)
+    "vld1.32        {d4, d5}, [%1]          \n\t"   //q2 = vertex 2 {x,y,z,w}
+    "add            %1, %1, %2              \n\t"   //ptr += sizeof(SPVertex)
+    "vld1.32        {d6, d7}, [%1]          \n\t"   //q3 = vertex 3 {x,y,z,w}
+    "sub            %1, %1, %3              \n\t"   //ptr back to vertex 0
+
+    "vld1.32        {d18, d19}, [%0]!       \n\t"   //q9  = mtx[0]
+    "vld1.32        {d20, d21}, [%0]!       \n\t"   //q10 = mtx[1]
+    "vld1.32        {d22, d23}, [%0]!       \n\t"   //q11 = mtx[2]
+    "vld1.32        {d24, d25}, [%0]        \n\t"   //q12 = mtx[3]
+
+    "vmov.f32       q13, q12                \n\t"   //q13 = mtx[3]
+    "vmov.f32       q14, q12                \n\t"   //q14 = mtx[3]
+    "vmov.f32       q15, q12                \n\t"   //q15 = mtx[3]
+
+    "vmla.f32       q12, q9, d0[0]          \n\t"   //q12 += mtx[0] * x0
+    "vmla.f32       q13, q9, d2[0]          \n\t"   //q13 += mtx[0] * x1
+    "vmla.f32       q14, q9, d4[0]          \n\t"   //q14 += mtx[0] * x2
+    "vmla.f32       q15, q9, d6[0]          \n\t"   //q15 += mtx[0] * x3
+    "vmla.f32       q12, q10, d0[1]         \n\t"   //q12 += mtx[1] * y0
+    "vmla.f32       q13, q10, d2[1]         \n\t"   //q13 += mtx[1] * y1
+    "vmla.f32       q14, q10, d4[1]         \n\t"   //q14 += mtx[1] * y2
+    "vmla.f32       q15, q10, d6[1]         \n\t"   //q15 += mtx[1] * y3
+    "vmla.f32       q12, q11, d1[0]         \n\t"   //q12 += mtx[2] * z0
+    "vmla.f32       q13, q11, d3[0]         \n\t"   //q13 += mtx[2] * z1
+    "vmla.f32       q14, q11, d5[0]         \n\t"   //q14 += mtx[2] * z2
+    "vmla.f32       q15, q11, d7[0]         \n\t"   //q15 += mtx[2] * z3
+
+    "vst1.32        {d24, d25}, [%1]        \n\t"   //vertex 0 = q12
+    "add            %1, %1, %2              \n\t"   //ptr += sizeof(SPVertex)
+    "vst1.32        {d26, d27}, [%1]        \n\t"   //vertex 1 = q13
+    "add            %1, %1, %2              \n\t"   //ptr += sizeof(SPVertex)
+    "vst1.32        {d28, d29}, [%1]        \n\t"   //vertex 2 = q14
+    "add            %1, %1, %2              \n\t"   //ptr += sizeof(SPVertex)
+    "vst1.32        {d30, d31}, [%1]        \n\t"   //vertex 3 = q15
+
+    : "+&r"(mtx), "+&r"(ptr)
+    : "I"(sizeof(SPVertex)), "I"(3 * sizeof(SPVertex))
+    : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+      "d18","d19", "d20", "d21", "d22", "d23", "d24",
+      "d25", "d26", "d27", "d28", "d29", "d30", "d31", "memory"
+    );
+}
+
+// Transforms the normals of four SPVertex entries by mtx[0..2] and scales each by an
+// approximate 1/sqrt(len^2); a zero-length result is stored as zero instead of NaN.
+// Each load/store moves 4 floats starting at nx, so the float after nz is rewritten too.
+void gSPTransformNormal4NEON(u32 v, float mtx[4][4])
+{
+    OGLRender & render = video().getRender();
+    SPVertex & vtx = render.getVertex(v);
+    void *ptr = &vtx.nx;
+
+    asm volatile (
+    "vld1.32        {d0, d1}, [%1]          \n\t"   //q0 = vertex 0 {nx,ny,nz,next}
+    "add            %1, %1, %2              \n\t"   //ptr += sizeof(SPVertex)
+    "vld1.32        {d2, d3}, [%1]          \n\t"   //q1 = vertex 1 {nx,ny,nz,next}
+    "add            %1, %1, %2              \n\t"   //ptr += sizeof(SPVertex)
+    "vld1.32        {d4, d5}, [%1]          \n\t"   //q2 = vertex 2 {nx,ny,nz,next}
+    "add            %1, %1, %2              \n\t"   //ptr += sizeof(SPVertex)
+    "vld1.32        {d6, d7}, [%1]          \n\t"   //q3 = vertex 3 {nx,ny,nz,next}
+    "sub            %1, %1, %3              \n\t"   //ptr back to vertex 0
+
+    "vld1.32        {d18, d19}, [%0]!       \n\t"   //q9  = mtx[0]
+    "vld1.32        {d20, d21}, [%0]!       \n\t"   //q10 = mtx[1]
+    "vld1.32        {d22, d23}, [%0]        \n\t"   //q11 = mtx[2]
+
+    "vmul.f32       q12, q9, d0[0]          \n\t"   //q12 = mtx[0] * nx0
+    "vmul.f32       q13, q9, d2[0]          \n\t"   //q13 = mtx[0] * nx1
+    "vmul.f32       q14, q9, d4[0]          \n\t"   //q14 = mtx[0] * nx2
+    "vmul.f32       q15, q9, d6[0]          \n\t"   //q15 = mtx[0] * nx3
+
+    "vmla.f32       q12, q10, d0[1]         \n\t"   //q12 += mtx[1] * ny0
+    "vmla.f32       q13, q10, d2[1]         \n\t"   //q13 += mtx[1] * ny1
+    "vmla.f32       q14, q10, d4[1]         \n\t"   //q14 += mtx[1] * ny2
+    "vmla.f32       q15, q10, d6[1]         \n\t"   //q15 += mtx[1] * ny3
+
+    "vmla.f32       q12, q11, d1[0]         \n\t"   //q12 += mtx[2] * nz0
+    "vmla.f32       q13, q11, d3[0]         \n\t"   //q13 += mtx[2] * nz1
+    "vmla.f32       q14, q11, d5[0]         \n\t"   //q14 += mtx[2] * nz2
+    "vmla.f32       q15, q11, d7[0]         \n\t"   //q15 += mtx[2] * nz3
+
+    "vmul.f32       q0, q12, q12            \n\t"   //q0 = q12*q12
+    "vmul.f32       q1, q13, q13            \n\t"   //q1 = q13*q13
+    "vmul.f32       q2, q14, q14            \n\t"   //q2 = q14*q14
+    "vmul.f32       q3, q15, q15            \n\t"   //q3 = q15*q15
+
+    "vpadd.f32      d0, d0                  \n\t"   //d0[0] = d0[0] + d0[1]
+    "vpadd.f32      d2, d2                  \n\t"   //d2[0] = d2[0] + d2[1]
+    "vpadd.f32      d4, d4                  \n\t"   //d4[0] = d4[0] + d4[1]
+    "vpadd.f32      d6, d6                  \n\t"   //d6[0] = d6[0] + d6[1]
+
+    "vmov.f32       s1, s2                  \n\t"   //d0[1] = d1[0]
+    "vmov.f32       s5, s6                  \n\t"   //d2[1] = d3[0]
+    "vmov.f32       s9, s10                 \n\t"   //d4[1] = d5[0]
+    "vmov.f32       s13, s14                \n\t"   //d6[1] = d7[0]
+
+    "vpadd.f32      d0, d0, d2              \n\t"   //d0 = {d0[0] + d0[1], d2[0] + d2[1]}
+    "vpadd.f32      d1, d4, d6              \n\t"   //d1 = {d4[0] + d4[1], d6[0] + d6[1]}
+
+    "vmov.f32       q1, q0                  \n\t"   //q1 = four squared lengths
+    "vrsqrte.f32    q0, q0                  \n\t"   //q0 = ~ 1.0 / sqrt(q0)
+    "vmul.f32       q2, q0, q1              \n\t"   //q2 = q0 * q1
+    "vrsqrts.f32    q3, q2, q0              \n\t"   //q3 = (3 - q0 * q2) / 2
+    "vmul.f32       q0, q0, q3              \n\t"   //q0 = q0 * q3
+    "vmul.f32       q2, q0, q1              \n\t"   //q2 = q0 * q1
+    "vrsqrts.f32    q3, q2, q0              \n\t"   //q3 = (3 - q0 * q2) / 2
+    "vmul.f32       q0, q0, q3              \n\t"   //q0 = q0 * q3
+    "vceq.f32       q2, q1, #0              \n\t"   //q2 = all ones where len^2 == 0
+    "vbic           q0, q0, q2              \n\t"   //scale = 0 there, so no NaN is stored
+
+    "vmul.f32       q3, q15, d1[1]          \n\t"   //q3 = q15 * scale3
+    "vmul.f32       q2, q14, d1[0]          \n\t"   //q2 = q14 * scale2
+    "vmul.f32       q1, q13, d0[1]          \n\t"   //q1 = q13 * scale1
+    "vmul.f32       q0, q12, d0[0]          \n\t"   //q0 = q12 * scale0
+
+    "vst1.32        {d0, d1}, [%1]          \n\t"   //vertex 0 = q0
+    "add            %1, %1, %2              \n\t"   //ptr += sizeof(SPVertex)
+    "vst1.32        {d2, d3}, [%1]          \n\t"   //vertex 1 = q1
+    "add            %1, %1, %2              \n\t"   //ptr += sizeof(SPVertex)
+    "vst1.32        {d4, d5}, [%1]          \n\t"   //vertex 2 = q2
+    "add            %1, %1, %2              \n\t"   //ptr += sizeof(SPVertex)
+    "vst1.32        {d6, d7}, [%1]          \n\t"   //vertex 3 = q3
+
+    : "+&r"(mtx), "+&r"(ptr)
+    : "I"(sizeof(SPVertex)), "I"(3 * sizeof(SPVertex))
+    : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+      "d16","d17", "d18","d19", "d20", "d21", "d22",
+      "d23", "d24", "d25", "d26", "d27", "d28", "d29",
+      "d30", "d31", "memory"
+    );
+}
+
+// Adds the {x,y,z,w} of vertex 0 to the positions of four SPVertex entries,
+// sizeof(SPVertex) apart, starting at v.
+void gSPBillboardVertex4NEON(u32 v)
+{
+    int i = 0;
+
+    OGLRender & render = video().getRender();
+    SPVertex & vtx0 = render.getVertex(v);
+    SPVertex & vtx1 = render.getVertex(i);
+
+    void *ptr0 = (void*)&vtx0.x;
+    void *ptr1 = (void*)&vtx1.x;
+    asm volatile (
+
+    "vld1.32        {d0, d1}, [%0]          \n\t"   //q0 = vertex v+0 {x,y,z,w}
+    "add            %0, %0, %2              \n\t"   //ptr0 += sizeof(SPVertex)
+    "vld1.32        {d2, d3}, [%0]          \n\t"   //q1 = vertex v+1 {x,y,z,w}
+    "add            %0, %0, %2              \n\t"   //ptr0 += sizeof(SPVertex)
+    "vld1.32        {d4, d5}, [%0]          \n\t"   //q2 = vertex v+2 {x,y,z,w}
+    "add            %0, %0, %2              \n\t"   //ptr0 += sizeof(SPVertex)
+    "vld1.32        {d6, d7}, [%0]          \n\t"   //q3 = vertex v+3 {x,y,z,w}
+    "sub            %0, %0, %3              \n\t"   //ptr0 back to vertex v+0
+
+    "vld1.32        {d16, d17}, [%1]        \n\t"   //q8 = vertex 0 {x,y,z,w}
+    "vadd.f32       q0, q0, q8              \n\t"   //q0 += q8
+    "vadd.f32       q1, q1, q8              \n\t"   //q1 += q8
+    "vadd.f32       q2, q2, q8              \n\t"   //q2 += q8
+    "vadd.f32       q3, q3, q8              \n\t"   //q3 += q8
+    "vst1.32        {d0, d1}, [%0]          \n\t"   //vertex v+0 = q0
+    "add            %0, %0, %2              \n\t"   //ptr0 += sizeof(SPVertex)
+    "vst1.32        {d2, d3}, [%0]          \n\t"   //vertex v+1 = q1
+    "add            %0, %0, %2              \n\t"   //ptr0 += sizeof(SPVertex)
+    "vst1.32        {d4, d5}, [%0]          \n\t"   //vertex v+2 = q2
+    "add            %0, %0, %2              \n\t"   //ptr0 += sizeof(SPVertex)
+    "vst1.32        {d6, d7}, [%0]          \n\t"   //vertex v+3 = q3
+    : "+&r"(ptr0), "+&r"(ptr1)
+    : "I"(sizeof(SPVertex)), "I"(3 * sizeof(SPVertex))
+    : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+      "d16", "d17", "memory"
+    );
+}