1
0
mirror of https://github.com/blawar/GLideN64.git synced 2024-07-07 03:13:49 +00:00
GLideN64/src/Neon/gSPNeon.cpp
gizmo98 afc41907d5 arm neon: replace neon asm with neon intrinsics
-adds aarch64/arm64 support
-easier to maintain
2018-03-27 13:14:18 +07:00

277 lines
10 KiB
C++

#include <stdio.h>
#include <math.h>
#include <algorithm>
#include <assert.h>
#include "N64.h"
#include "GLideN64.h"
#include "Types.h"
#include "RSP.h"
#include "GBI.h"
#include "gSP.h"
#include "gDP.h"
#include "3DMath.h"
#include "CRC.h"
#include <string.h>
#include "convert.h"
#include "uCodes/S2DEX.h"
#include "VI.h"
#include "FrameBuffer.h"
#include "DepthBuffer.h"
#include "Config.h"
#include "Log.h"
#include "DisplayWindow.h"
#include <arm_neon.h>
using namespace std;
// Transforms the four consecutive vertices v .. v+3 in place.
// For each vertex, the four floats starting at SPVertex::x are replaced by
//     mtx[3] + mtx[0] * f0 + mtx[1] * f1 + mtx[2] * f2
// where f0..f2 are the first three of those floats (presumably x, y, z);
// the fourth input float is loaded but does not take part in the result.
void gSPTransformVertex4NEON(u32 v, float mtx[4][4])
{
    GraphicsDrawer & drawer = dwnd().getDrawer();
    SPVertex * const verts[4] = {
        &drawer.getVertex(v),
        &drawer.getVertex(v + 1),
        &drawer.getVertex(v + 2),
        &drawer.getVertex(v + 3)
    };

    // Read every vertex before anything is written back.
    float32x4_t src[4];
    for (int i = 0; i < 4; ++i)
        src[i] = vld1q_f32(&verts[i]->x);

    // One quad register per matrix row.
    const float32x4_t row0 = vld1q_f32(mtx[0]);
    const float32x4_t row1 = vld1q_f32(mtx[1]);
    const float32x4_t row2 = vld1q_f32(mtx[2]);
    const float32x4_t row3 = vld1q_f32(mtx[3]);

    // Start from row 3 and accumulate each remaining row scaled by the
    // matching vertex component.
    float32x4_t dst[4];
    for (int i = 0; i < 4; ++i) {
        float32x4_t acc = vmlaq_n_f32(row3, row0, vgetq_lane_f32(src[i], 0));
        acc = vmlaq_n_f32(acc, row1, vgetq_lane_f32(src[i], 1));
        acc = vmlaq_n_f32(acc, row2, vgetq_lane_f32(src[i], 2));
        dst[i] = acc;
    }

    // Write the transformed data back over the inputs.
    for (int i = 0; i < 4; ++i)
        vst1q_f32(&verts[i]->x, dst[i]);
}
// Adds the four floats starting at SPVertex::x of vertex 0 to the same four
// floats of each of the vertices v .. v+3, in place.
void gSPBillboardVertex4NEON(u32 v)
{
    GraphicsDrawer & drawer = dwnd().getDrawer();

    // Offset taken from vertex 0; it is read before any vertex is modified.
    const float32x4_t offset = vld1q_f32(&drawer.getVertex(0).x);

    float * const targets[4] = {
        &drawer.getVertex(v).x,
        &drawer.getVertex(v + 1).x,
        &drawer.getVertex(v + 2).x,
        &drawer.getVertex(v + 3).x
    };

    // Load and offset all four vertices first ...
    float32x4_t moved[4];
    for (int i = 0; i < 4; ++i)
        moved[i] = vaddq_f32(vld1q_f32(targets[i]), offset);

    // ... then store the results.
    for (int i = 0; i < 4; ++i)
        vst1q_f32(targets[i], moved[i]);
}
// Transforms vtx in place by the 4x4 matrix mtx:
//     vtx <- mtx[3] + mtx[0] * vtx[0] + mtx[1] * vtx[1] + mtx[2] * vtx[2]
// All four floats of vtx are overwritten; the incoming vtx[3] is loaded but
// does not contribute to the result.
void gSPTransformVector_NEON(float vtx[4], float mtx[4][4])
{
    const float32x4_t in = vld1q_f32(vtx);

    // Seed the accumulator with row 3, then add rows 0..2 scaled by the
    // corresponding input component.
    float32x4_t acc = vld1q_f32(mtx[3]);
    acc = vmlaq_n_f32(acc, vld1q_f32(mtx[0]), vgetq_lane_f32(in, 0));
    acc = vmlaq_n_f32(acc, vld1q_f32(mtx[1]), vgetq_lane_f32(in, 1));
    acc = vmlaq_n_f32(acc, vld1q_f32(mtx[2]), vgetq_lane_f32(in, 2));

    vst1q_f32(vtx, acc);
}
// Multiplies vec in place by the upper-left 3x3 part of mtx, using the rows
// of mtx as the dot-product operands:
//     vec[r] <- mtx[r][0] * vec[0] + mtx[r][1] * vec[1] + mtx[r][2] * vec[2]
// for r = 0..2.
void gSPInverseTransformVector_NEON(float vec[3], float mtx[4][4])
{
    // vld4q_f32 de-interleaves the 16 floats with a stride of four, so
    // cols.val[c] holds column c of the matrix: lane r == mtx[r][c].
    const float32x4x4_t cols = vld4q_f32(mtx[0]);

    // Lane r accumulates the dot product of matrix row r with vec.
    float32x4_t acc = vmulq_n_f32(cols.val[0], vec[0]);
    acc = vmlaq_n_f32(acc, cols.val[1], vec[1]);
    acc = vmlaq_n_f32(acc, cols.val[2], vec[2]);

    // Only the first three lanes are written back; lane 3 is discarded.
    vec[0] = vgetq_lane_f32(acc, 0);
    vec[1] = vgetq_lane_f32(acc, 1);
    vec[2] = vgetq_lane_f32(acc, 2);
}
// For seven direction/colour pairs at once, accumulates into _vtx:
//     _vtx[c] += sum over i = 0..6 of max(0, dot(v0, v1[i])) * lights[i][c]
// for c = 0..2. The seven entries are processed as groups of 4 + 2 + 1.
void DotProductMax7FullNeon( float v0[3], float v1[7][3], float lights[7][3], float _vtx[3])
{
    // De-interleaving loads: val[0] / val[1] / val[2] hold the first / second /
    // third component of every loaded entry.
    const float32x4x3_t dirA = vld3q_f32(v1[0]);    // entries 0..3
    const float32x2x3_t dirB = vld3_f32(v1[4]);     // entries 4..5
    float32x2x3_t dirC = vld3_dup_f32(v1[6]);       // entry 6, copied to both lanes
    // Zero the duplicate in lane 1 so entry 6 is counted only once.
    for (int c = 0; c < 3; ++c)
        dirC.val[c] = vset_lane_f32(0.0f, dirC.val[c], 1);

    const float32x4x3_t colA = vld3q_f32(lights[0]);
    const float32x2x3_t colB = vld3_f32(lights[4]);
    const float32x2x3_t colC = vld3_dup_f32(lights[6]);

    // Dot products of v0 with every direction, one entry per lane.
    float32x4_t dotA = vmulq_n_f32(dirA.val[0], v0[0]);
    float32x2_t dotB = vmul_n_f32(dirB.val[0], v0[0]);
    float32x2_t dotC = vmul_n_f32(dirC.val[0], v0[0]);
    dotA = vmlaq_n_f32(dotA, dirA.val[1], v0[1]);
    dotB = vmla_n_f32(dotB, dirB.val[1], v0[1]);
    dotC = vmla_n_f32(dotC, dirC.val[1], v0[1]);
    dotA = vmlaq_n_f32(dotA, dirA.val[2], v0[2]);
    dotB = vmla_n_f32(dotB, dirB.val[2], v0[2]);
    dotC = vmla_n_f32(dotC, dirC.val[2], v0[2]);

    // Clamp negative dot products to zero.
    dotA = vmaxq_f32(dotA, vmovq_n_f32(0.0f));
    dotB = vmax_f32(dotB, vmov_n_f32(0.0f));
    dotC = vmax_f32(dotC, vmov_n_f32(0.0f));

    for (int c = 0; c < 3; ++c) {
        // Scale channel c of every colour by its clamped dot product.
        const float32x4_t scaledA = vmulq_f32(colA.val[c], dotA);
        float32x2_t scaledBC = vmul_f32(colB.val[c], dotB);
        scaledBC = vmla_f32(scaledBC, colC.val[c], dotC);

        // Horizontal sum: fold 4 lanes to 2, add the 2-lane part, fold to 1.
        float32x2_t sum = vadd_f32(vget_high_f32(scaledA), vget_low_f32(scaledA));
        sum = vadd_f32(sum, scaledBC);
        sum = vpadd_f32(sum, sum);

        _vtx[c] += vget_lane_f32(sum, 0);
    }
}
// For four direction/colour pairs at once, accumulates into vtx:
//     vtx[c] += sum over i = 0..3 of max(0, dot(v0, v1[i])) * lights[i][c]
// for c = 0..2.
void DotProductMax4FullNeon( float v0[3], float v1[4][3], float lights[4][3], float vtx[3])
{
    // De-interleaving loads: val[k] holds component k of all four entries.
    const float32x4x3_t dirs = vld3q_f32(v1[0]);
    const float32x4x3_t cols = vld3q_f32(lights[0]);

    // Four dot products, one per lane, clamped to be non-negative.
    float32x4_t dots = vmulq_n_f32(dirs.val[0], v0[0]);
    dots = vmlaq_n_f32(dots, dirs.val[1], v0[1]);
    dots = vmlaq_n_f32(dots, dirs.val[2], v0[2]);
    dots = vmaxq_f32(dots, vmovq_n_f32(0.0f));

    for (int c = 0; c < 3; ++c) {
        // Scale channel c of each colour, then sum the four lanes.
        const float32x4_t scaled = vmulq_f32(cols.val[c], dots);
        float32x2_t sum = vadd_f32(vget_high_f32(scaled), vget_low_f32(scaled));
        sum = vpadd_f32(sum, sum);

        vtx[c] += vget_lane_f32(sum, 0);
    }
}
// Lights the vnum vertices spVtx[v] .. spVtx[v + vnum - 1].
//
// When isHWLightingAllowed() returns false the colour is computed here:
//   - r/g/b start from gSP.lights.rgb[gSP.numLights] (the entry after the
//     last light; presumably the ambient colour),
//   - every light i in [0, gSP.numLights) adds
//     max(0, dot(normal, gSP.lights.i_xyz[i])) * gSP.lights.rgb[i],
//   - each channel is then clamped to at most 1.0, and HWLight is set to 0.
// Lights are consumed in batches of 7, then 4, then one at a time, so the
// NEON helpers are used whenever enough lights remain.
//
// Otherwise the three floats starting at vtx.r are passed through
// TransformVectorNormalize with the current modelview matrix and HWLight is
// set to gSP.numLights.
void gSPLightVertex_NEON(u32 vnum, u32 v, SPVertex * spVtx)
{
    if (!isHWLightingAllowed()) {
        // The light count does not change while the vertices are processed.
        const s32 numLights = gSP.numLights;

        for (u32 j = 0; j < vnum; ++j) {
            SPVertex & vtx = spVtx[v + j];
            vtx.r = gSP.lights.rgb[numLights][R];
            vtx.g = gSP.lights.rgb[numLights][G];
            vtx.b = gSP.lights.rgb[numLights][B];
            vtx.HWLight = 0;

            // Index of the next light that has not been accumulated yet.
            s32 light = 0;

            // Seven lights per call while at least seven remain.
            for (; numLights - light >= 7; light += 7) {
                DotProductMax7FullNeon(&vtx.nx,
                    reinterpret_cast<float (*)[3]>(gSP.lights.i_xyz[light]),
                    reinterpret_cast<float (*)[3]>(gSP.lights.rgb[light]),
                    &vtx.r);
            }

            // Four lights per call while at least four remain.
            for (; numLights - light >= 4; light += 4) {
                DotProductMax4FullNeon(&vtx.nx,
                    reinterpret_cast<float (*)[3]>(gSP.lights.i_xyz[light]),
                    reinterpret_cast<float (*)[3]>(gSP.lights.rgb[light]),
                    &vtx.r);
            }

            // Scalar path for the remaining lights.
            for (; light < numLights; ++light) {
                const f32 intensity = DotProduct(&vtx.nx, gSP.lights.i_xyz[light]);
                if (intensity > 0.0f) {
                    vtx.r += gSP.lights.rgb[light][R] * intensity;
                    vtx.g += gSP.lights.rgb[light][G] * intensity;
                    vtx.b += gSP.lights.rgb[light][B] * intensity;
                }
            }

            vtx.r = std::min(1.0f, vtx.r);
            vtx.g = std::min(1.0f, vtx.g);
            vtx.b = std::min(1.0f, vtx.b);
        }
    } else {
        for (u32 j = 0; j < vnum; ++j) {
            SPVertex & vtx = spVtx[v + j];
            TransformVectorNormalize(&vtx.r, gSP.matrix.modelView[gSP.matrix.modelViewi]);
            vtx.HWLight = gSP.numLights;
        }
    }
}