1
0
mirror of https://github.com/blawar/GLideN64.git synced 2024-07-02 09:03:37 +00:00

arm neon: remove DotProduct

Compared to the plain C function, the NEON DotProduct runs slower.
-O0 factor 0,86
-O1 factor 1,60
-O2 factor 1,59
-O3 factor 1,57
Six values and three multiply/add operations are not enough workload to fill
at least two quad registers and hide the NEON latency.
This commit is contained in:
gizmo98 2017-03-24 18:36:38 +01:00
parent 5bfac0a664
commit f907706dae
3 changed files with 25 additions and 46 deletions

View File

@ -87,29 +87,3 @@ End:
}
#endif // WIN32_ASM
}
// Computes the dot product of two 3-component float vectors.
//
// v0, v1: pointers to at least three consecutive floats each.
// Returns v0[0]*v1[0] + v0[1]*v1[1] + v0[2]*v1[2].
//
// When WIN32_ASM is defined the products are formed and summed on the x87
// FPU stack; otherwise a plain C expression is used.
float DotProduct(const float v0[3], const float v1[3])
{
#ifdef WIN32_ASM
	float result;
	__asm {
		mov		esi, dword ptr [v0]			// esi = v0
		mov		edi, dword ptr [v1]			// edi = v1
		lea		ebx, [result]				// ebx = &result

		fld		dword ptr [esi]				// st0 = v0[0]
		fmul	dword ptr [edi]				// st0 = v0[0]*v1[0]
		fld		dword ptr [esi + 4]			// st0 = v0[1]
		fmul	dword ptr [edi + 4]			// st0 = v0[1]*v1[1]
		fld		dword ptr [esi + 8]			// st0 = v0[2]
		fmul	dword ptr [edi + 8]			// st0 = v0[2]*v1[2]

		fadd								// fold the top two products
		fadd								// add the remaining product
		fstp	dword ptr [ebx]				// result = sum, FPU stack left empty
	}
	return result;
#else // WIN32_ASM
	const float result = v0[0]*v1[0] + v0[1]*v1[1] + v0[2]*v1[2];
	return result;
#endif // WIN32_ASM
}

View File

@ -58,4 +58,29 @@ inline void CopyMatrix( float m0[4][4], float m1[4][4] )
#endif // WIN32_ASM
}
// Dot product of two 3-component float vectors (header-inline version).
//
// v0, v1: pointers to at least three consecutive floats each.
// Returns v0[0]*v1[0] + v0[1]*v1[1] + v0[2]*v1[2].
//
// With WIN32_ASM defined the sum is built on the x87 FPU stack; without it
// the plain C expression below is used.
inline float DotProduct(const float v0[3], const float v1[3])
{
#ifdef WIN32_ASM
	float sum;
	__asm {
		mov		esi, dword ptr [v0]			// esi -> first vector
		mov		edi, dword ptr [v1]			// edi -> second vector
		lea		ebx, [sum]					// ebx -> output slot

		fld		dword ptr [esi]				// push v0[0]
		fmul	dword ptr [edi]				//   * v1[0]
		fld		dword ptr [esi + 4]			// push v0[1]
		fmul	dword ptr [edi + 4]			//   * v1[1]
		fld		dword ptr [esi + 8]			// push v0[2]
		fmul	dword ptr [edi + 8]			//   * v1[2]

		fadd								// combine the two most recent products
		fadd								// combine with the first product
		fstp	dword ptr [ebx]				// store and pop; stack is balanced
	}
	return sum;
#else // WIN32_ASM
	const float sum = v0[0]*v1[0] + v0[1]*v1[1] + v0[2]*v1[2];
	return sum;
#endif // WIN32_ASM
}
#endif

View File

@ -139,23 +139,3 @@ void Normalize(float v[3])
: "d0", "d1", "d2", "d3", "d4", "d5", "memory"
);
}
float DotProduct(const float v0[3], const float v1[3])
{
float dot;
asm volatile (
"vld1.32 {d8}, [%1]! \n\t" //d8={x0,y0}
"vld1.32 {d10}, [%2]! \n\t" //d10={x1,y1}
"flds s18, [%1, #0] \n\t" //d9[0]={z0}
"flds s22, [%2, #0] \n\t" //d11[0]={z1}
"vmul.f32 d12, d8, d10 \n\t" //d12= d8*d10
"vpadd.f32 d12, d12, d12 \n\t" //d12 = d12[0] + d12[1]
"vmla.f32 d12, d9, d11 \n\t" //d12 += d9*d11
"fmrs %0, s24 \n\t" //r0 = s0
: "=r"(dot), "+r"(v0), "+r"(v1):
: "d8", "d9", "d10", "d11", "d12"
);
return dot;
}