mirror of
https://github.com/blawar/GLideN64.git
synced 2024-07-02 09:03:37 +00:00
arm neon: remove DotProduct
Compared to the plain C function, the NEON DotProduct runs slower. Measured factors: -O0: 0.86, -O1: 1.60, -O2: 1.59, -O3: 1.57. Six values and three multiply/add operations are not enough workload to fill at least two quad registers and hide the NEON latency.
This commit is contained in:
parent
5bfac0a664
commit
f907706dae
|
@ -87,29 +87,3 @@ End:
|
|||
}
|
||||
#endif // WIN32_ASM
|
||||
}
|
||||
|
||||
|
||||
// Dot product of two 3-component float vectors.
//
//   v0, v1  - each points at three consecutive floats
//   returns - v0[0]*v1[0] + v0[1]*v1[1] + v0[2]*v1[2]
//
// When WIN32_ASM is defined the sum is evaluated on the x87 stack via
// MSVC-style inline assembly; otherwise the plain C expression is used.
float DotProduct(const float v0[3], const float v1[3])
{
    float result;

#ifdef WIN32_ASM
    __asm {
        mov     eax, dword ptr [v0]         // eax = v0
        mov     edx, dword ptr [v1]         // edx = v1

        fld     dword ptr [eax]             // push v0[0]
        fmul    dword ptr [edx]             // st0 = v0[0]*v1[0]
        fld     dword ptr [eax+04h]         // push v0[1]
        fmul    dword ptr [edx+04h]         // st0 = v0[1]*v1[1]
        fld     dword ptr [eax+08h]         // push v0[2]
        fmul    dword ptr [edx+08h]         // st0 = v0[2]*v1[2]
        faddp   st(1), st(0)                // st0 = v0[1]*v1[1] + v0[2]*v1[2]
        faddp   st(1), st(0)                // st0 = v0[0]*v1[0] + previous sum
        fstp    dword ptr [result]          // pop the total into result
    }
#else // WIN32_ASM
    result = v0[0]*v1[0] + v0[1]*v1[1] + v0[2]*v1[2];
#endif // WIN32_ASM

    return result;
}
|
||||
|
|
25
src/3DMath.h
25
src/3DMath.h
|
@ -58,4 +58,29 @@ inline void CopyMatrix( float m0[4][4], float m1[4][4] )
|
|||
#endif // WIN32_ASM
|
||||
}
|
||||
|
||||
// Inline 3-component dot product.
//
//   v0, v1  - each points at three consecutive floats
//   returns - v0[0]*v1[0] + v0[1]*v1[1] + v0[2]*v1[2]
//
// With WIN32_ASM defined the products are formed and summed on the x87
// register stack (MSVC inline assembly); otherwise the C expression is used.
inline float DotProduct(const float v0[3], const float v1[3])
{
    float sum;

#ifdef WIN32_ASM
    __asm {
        mov     eax, dword ptr [v0]         // eax = v0
        mov     edx, dword ptr [v1]         // edx = v1

        fld     dword ptr [eax]             // push v0[0]
        fmul    dword ptr [edx]             // st0 = v0[0]*v1[0]
        fld     dword ptr [eax+04h]         // push v0[1]
        fmul    dword ptr [edx+04h]         // st0 = v0[1]*v1[1]
        fld     dword ptr [eax+08h]         // push v0[2]
        fmul    dword ptr [edx+08h]         // st0 = v0[2]*v1[2]
        faddp   st(1), st(0)                // fold the z product into the y product
        faddp   st(1), st(0)                // fold that sum into the x product
        fstp    dword ptr [sum]             // pop the total into sum
    }
#else // WIN32_ASM
    sum = v0[0]*v1[0] + v0[1]*v1[1] + v0[2]*v1[2];
#endif // WIN32_ASM

    return sum;
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -139,23 +139,3 @@ void Normalize(float v[3])
|
|||
: "d0", "d1", "d2", "d3", "d4", "d5", "memory"
|
||||
);
|
||||
}
|
||||
|
||||
float DotProduct(const float v0[3], const float v1[3])
|
||||
{
|
||||
float dot;
|
||||
asm volatile (
|
||||
"vld1.32 {d8}, [%1]! \n\t" //d8={x0,y0}
|
||||
"vld1.32 {d10}, [%2]! \n\t" //d10={x1,y1}
|
||||
"flds s18, [%1, #0] \n\t" //d9[0]={z0}
|
||||
"flds s22, [%2, #0] \n\t" //d11[0]={z1}
|
||||
"vmul.f32 d12, d8, d10 \n\t" //d12= d8*d10
|
||||
"vpadd.f32 d12, d12, d12 \n\t" //d12 = d12[0] + d12[1]
|
||||
"vmla.f32 d12, d9, d11 \n\t" //d12 += d9*d11
|
||||
"fmrs %0, s24 \n\t" //r0 = s0
|
||||
: "=r"(dot), "+r"(v0), "+r"(v1):
|
||||
: "d8", "d9", "d10", "d11", "d12"
|
||||
|
||||
);
|
||||
return dot;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user