Update MathUtilNeon64.inl

Add ARM64 (AArch64) NEON support.
This commit is contained in:
wangqm0513 2014-11-12 14:15:29 +08:00
parent 706f49cb59
commit e8d24cb49b
1 changed files with 167 additions and 131 deletions

View File

@ -46,173 +46,209 @@ public:
// Adds `scalar` to each of the 16 floats of `m` and writes the sums to `dst`.
//
//   m      - source matrix, 16 contiguous floats
//   scalar - value added to every element
//   dst    - destination, 16 contiguous floats; may be the same array as `m`
//            because every load is issued before the store
//
// Only v0-v4 are used: the low halves of v8-v15 are callee-saved on AArch64,
// so naming them as clobbers would force extra saves/restores in the caller.
inline void MathUtilNeon64::addMatrix(const float* m, float scalar, float* dst)
{
    asm volatile(
        "ld1  {v0.4s, v1.4s, v2.4s, v3.4s}, [%1]  \n\t" // v0..v3 = m[0..15]
        "ld1r {v4.4s}, [%2]                       \n\t" // v4 = (s, s, s, s)

        "fadd v0.4s, v0.4s, v4.4s                 \n\t" // m[0..3]   + s
        "fadd v1.4s, v1.4s, v4.4s                 \n\t" // m[4..7]   + s
        "fadd v2.4s, v2.4s, v4.4s                 \n\t" // m[8..11]  + s
        "fadd v3.4s, v3.4s, v4.4s                 \n\t" // m[12..15] + s

        "st1  {v0.4s, v1.4s, v2.4s, v3.4s}, [%0]  \n\t" // dst[0..15]
        :
        : "r"(dst), "r"(m), "r"(&scalar)
        : "v0", "v1", "v2", "v3", "v4", "memory"
    );
}
// Element-wise sum of two 16-float matrices: dst[i] = m1[i] + m2[i].
//
//   m1, m2 - source matrices, 16 contiguous floats each
//   dst    - destination, 16 contiguous floats; may be the same array as
//            `m1` or `m2` because both loads are issued before the store
//
// Only v0-v7 are used: the low halves of v8-v15 are callee-saved on AArch64,
// so naming them as clobbers would force extra saves/restores in the caller.
inline void MathUtilNeon64::addMatrix(const float* m1, const float* m2, float* dst)
{
    asm volatile(
        "ld1  {v0.4s, v1.4s, v2.4s, v3.4s}, [%1]  \n\t" // v0..v3 = m1[0..15]
        "ld1  {v4.4s, v5.4s, v6.4s, v7.4s}, [%2]  \n\t" // v4..v7 = m2[0..15]

        "fadd v0.4s, v0.4s, v4.4s                 \n\t" // m1[0..3]   + m2[0..3]
        "fadd v1.4s, v1.4s, v5.4s                 \n\t" // m1[4..7]   + m2[4..7]
        "fadd v2.4s, v2.4s, v6.4s                 \n\t" // m1[8..11]  + m2[8..11]
        "fadd v3.4s, v3.4s, v7.4s                 \n\t" // m1[12..15] + m2[12..15]

        "st1  {v0.4s, v1.4s, v2.4s, v3.4s}, [%0]  \n\t" // dst[0..15]
        :
        : "r"(dst), "r"(m1), "r"(m2)
        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory"
    );
}
// Element-wise difference of two 16-float matrices: dst[i] = m1[i] - m2[i].
//
//   m1  - minuend matrix, 16 contiguous floats
//   m2  - subtrahend matrix, 16 contiguous floats
//   dst - destination, 16 contiguous floats; may be the same array as `m1`
//         or `m2` because both loads are issued before the store
//
// Only v0-v7 are used: the low halves of v8-v15 are callee-saved on AArch64,
// so naming them as clobbers would force extra saves/restores in the caller.
inline void MathUtilNeon64::subtractMatrix(const float* m1, const float* m2, float* dst)
{
    asm volatile(
        "ld1  {v0.4s, v1.4s, v2.4s, v3.4s}, [%1]  \n\t" // v0..v3 = m1[0..15]
        "ld1  {v4.4s, v5.4s, v6.4s, v7.4s}, [%2]  \n\t" // v4..v7 = m2[0..15]

        "fsub v0.4s, v0.4s, v4.4s                 \n\t" // m1[0..3]   - m2[0..3]
        "fsub v1.4s, v1.4s, v5.4s                 \n\t" // m1[4..7]   - m2[4..7]
        "fsub v2.4s, v2.4s, v6.4s                 \n\t" // m1[8..11]  - m2[8..11]
        "fsub v3.4s, v3.4s, v7.4s                 \n\t" // m1[12..15] - m2[12..15]

        "st1  {v0.4s, v1.4s, v2.4s, v3.4s}, [%0]  \n\t" // dst[0..15]
        :
        : "r"(dst), "r"(m1), "r"(m2)
        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory"
    );
}
// Multiplies each of the 16 floats of `m` by `scalar` and writes the products
// to `dst`.
//
//   m      - source matrix, 16 contiguous floats
//   scalar - factor applied to every element
//   dst    - destination, 16 contiguous floats; may be the same array as `m`
//            because every load is issued before the store
//
// The scalar is broadcast with ld1r so the whole of v4 is written (a single
// lane load would leave the other lanes, and a dependency on them, in place).
// Only v0-v4 are used: the low halves of v8-v15 are callee-saved on AArch64.
inline void MathUtilNeon64::multiplyMatrix(const float* m, float scalar, float* dst)
{
    asm volatile(
        "ld1r {v4.4s}, [%2]                       \n\t" // v4 = (s, s, s, s)
        "ld1  {v0.4s, v1.4s, v2.4s, v3.4s}, [%1]  \n\t" // v0..v3 = m[0..15]

        "fmul v0.4s, v0.4s, v4.4s                 \n\t" // m[0..3]   * s
        "fmul v1.4s, v1.4s, v4.4s                 \n\t" // m[4..7]   * s
        "fmul v2.4s, v2.4s, v4.4s                 \n\t" // m[8..11]  * s
        "fmul v3.4s, v3.4s, v4.4s                 \n\t" // m[12..15] * s

        "st1  {v0.4s, v1.4s, v2.4s, v3.4s}, [%0]  \n\t" // dst[0..15]
        :
        : "r"(dst), "r"(m), "r"(&scalar)
        : "v0", "v1", "v2", "v3", "v4", "memory"
    );
}
// 4x4 matrix product over 16-float arrays:
//   dst[4*j + i] = sum over k of m1[4*k + i] * m2[4*j + k]
//
//   m1, m2 - source matrices, 16 contiguous floats each
//   dst    - destination, 16 contiguous floats; may be the same array as
//            `m1` or `m2` because both loads are issued before the store
//
// Register layout:
//   v16..v19 - m1 loaded in order, i.e. v(16+k) = m1[4k .. 4k+3]
//   v0..v3   - m2 de-interleaved by ld4, i.e. vK.s[j] = m2[4*j + K]
//   v20..v23 - accumulators, v(20+j) = dst[4j .. 4j+3]
// Scratch registers are taken from v0-v3 / v16-v23 because the low halves of
// v8-v15 are callee-saved on AArch64 and would cost saves/restores.
inline void MathUtilNeon64::multiplyMatrix(const float* m1, const float* m2, float* dst)
{
    asm volatile(
        "ld1  {v16.4s, v17.4s, v18.4s, v19.4s}, [%1]  \n\t" // m1[0..15]
        "ld4  {v0.4s, v1.4s, v2.4s, v3.4s}, [%2]      \n\t" // m2, de-interleaved

        "fmul v20.4s, v16.4s, v0.s[0]                 \n\t" // dst[0..3]    = m1[0..3]   * m2[0]
        "fmul v21.4s, v16.4s, v0.s[1]                 \n\t" // dst[4..7]    = m1[0..3]   * m2[4]
        "fmul v22.4s, v16.4s, v0.s[2]                 \n\t" // dst[8..11]   = m1[0..3]   * m2[8]
        "fmul v23.4s, v16.4s, v0.s[3]                 \n\t" // dst[12..15]  = m1[0..3]   * m2[12]

        "fmla v20.4s, v17.4s, v1.s[0]                 \n\t" // dst[0..3]   += m1[4..7]   * m2[1]
        "fmla v21.4s, v17.4s, v1.s[1]                 \n\t" // dst[4..7]   += m1[4..7]   * m2[5]
        "fmla v22.4s, v17.4s, v1.s[2]                 \n\t" // dst[8..11]  += m1[4..7]   * m2[9]
        "fmla v23.4s, v17.4s, v1.s[3]                 \n\t" // dst[12..15] += m1[4..7]   * m2[13]

        "fmla v20.4s, v18.4s, v2.s[0]                 \n\t" // dst[0..3]   += m1[8..11]  * m2[2]
        "fmla v21.4s, v18.4s, v2.s[1]                 \n\t" // dst[4..7]   += m1[8..11]  * m2[6]
        "fmla v22.4s, v18.4s, v2.s[2]                 \n\t" // dst[8..11]  += m1[8..11]  * m2[10]
        "fmla v23.4s, v18.4s, v2.s[3]                 \n\t" // dst[12..15] += m1[8..11]  * m2[14]

        "fmla v20.4s, v19.4s, v3.s[0]                 \n\t" // dst[0..3]   += m1[12..15] * m2[3]
        "fmla v21.4s, v19.4s, v3.s[1]                 \n\t" // dst[4..7]   += m1[12..15] * m2[7]
        "fmla v22.4s, v19.4s, v3.s[2]                 \n\t" // dst[8..11]  += m1[12..15] * m2[11]
        "fmla v23.4s, v19.4s, v3.s[3]                 \n\t" // dst[12..15] += m1[12..15] * m2[15]

        "st1  {v20.4s, v21.4s, v22.4s, v23.4s}, [%0]  \n\t" // dst[0..15]
        : // no outputs
        : "r"(dst), "r"(m1), "r"(m2) // pointer values are never modified
        : "memory", "v0", "v1", "v2", "v3",
          "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
    );
}
// Writes the sign-flipped copy of a 16-float matrix: dst[i] = -m[i].
//
//   m   - source matrix, 16 contiguous floats
//   dst - destination, 16 contiguous floats; may be the same array as `m`
//         because the load is issued before the store
inline void MathUtilNeon64::negateMatrix(const float* m, float* dst)
{
    asm volatile(
        "ld1  {v0.4s, v1.4s, v2.4s, v3.4s}, [%1]  \n\t" // v0..v3 = m[0..15]

        "fneg v0.4s, v0.4s                        \n\t" // -m[0..3]
        "fneg v1.4s, v1.4s                        \n\t" // -m[4..7]
        "fneg v2.4s, v2.4s                        \n\t" // -m[8..11]
        "fneg v3.4s, v3.4s                        \n\t" // -m[12..15]

        "st1  {v0.4s, v1.4s, v2.4s, v3.4s}, [%0]  \n\t" // dst[0..15]
        :
        : "r"(dst), "r"(m)
        : "v0", "v1", "v2", "v3", "memory"
    );
}
// Transposes a 4x4 matrix stored as 16 floats: dst[4*i + j] = m[4*j + i].
//
//   m   - source matrix, 16 contiguous floats
//   dst - destination, 16 contiguous floats; may be the same array as `m`
//         because the load is issued before the store
//
// The four groups of four floats are loaded in order and the transpose is
// done by the interleaving store: st4 writes lane 0 of v0..v3, then lane 1,
// and so on.
inline void MathUtilNeon64::transposeMatrix(const float* m, float* dst)
{
    asm volatile(
        "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1]  \n\t" // vK = m[4K .. 4K+3]
        "st4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0]  \n\t" // dst = m0,m4,m8,m12, m1,m5,...
        :
        : "r"(dst), "r"(m)
        : "v0", "v1", "v2", "v3", "memory"
    );
}
// Transforms the vector (x, y, z, w) by the 16-float matrix `m` and stores the
// first three components of the result:
//   dst[i] = x*m[i] + y*m[4+i] + z*m[8+i] + w*m[12+i]   for i = 0, 1, 2
// The fourth component is computed in the register but deliberately not
// stored, so `dst` only needs room for three floats.
//
//   m          - matrix, 16 contiguous floats
//   x, y, z, w - vector components
//   dst        - destination, at least 3 floats
//
// `dst` is a read-write ("+r") operand because the post-indexed store advances
// it; modifying an input-only operand is not permitted in extended asm.
// Scratch registers are v0-v5: the low halves of v8-v15 are callee-saved on
// AArch64 and would cost saves/restores.
inline void MathUtilNeon64::transformVec4(const float* m, float x, float y, float z, float w, float* dst)
{
    asm volatile(
        "ld1  {v0.s}[0], [%1]                     \n\t" // v0 = (x, -, -, -)
        "ld1  {v0.s}[1], [%2]                     \n\t" // v0 = (x, y, -, -)
        "ld1  {v0.s}[2], [%3]                     \n\t" // v0 = (x, y, z, -)
        "ld1  {v0.s}[3], [%4]                     \n\t" // v0 = (x, y, z, w)
        "ld1  {v1.4s, v2.4s, v3.4s, v4.4s}, [%5]  \n\t" // v1..v4 = m[0..15]

        "fmul v5.4s, v1.4s, v0.s[0]               \n\t" // r  = m[0..3]   * x
        "fmla v5.4s, v2.4s, v0.s[1]               \n\t" // r += m[4..7]   * y
        "fmla v5.4s, v3.4s, v0.s[2]               \n\t" // r += m[8..11]  * z
        "fmla v5.4s, v4.4s, v0.s[3]               \n\t" // r += m[12..15] * w

        "st1  {v5.2s}, [%0], #8                   \n\t" // dst[0..1], then dst += 2 floats
        "st1  {v5.s}[2], [%0]                     \n\t" // dst[2]
        : "+r"(dst)
        : "r"(&x), "r"(&y), "r"(&z), "r"(&w), "r"(m)
        : "v0", "v1", "v2", "v3", "v4", "v5", "memory"
    );
}
// Transforms the 4-component vector `v` by the 16-float matrix `m`:
//   dst[i] = v[0]*m[i] + v[1]*m[4+i] + v[2]*m[8+i] + v[3]*m[12+i]   for i = 0..3
//
//   m   - matrix, 16 contiguous floats
//   v   - source vector, 4 contiguous floats
//   dst - destination, 4 contiguous floats; may be the same array as `v`
//         because both loads are issued before the store
//
// Scratch registers are v0-v5: the low halves of v8-v15 are callee-saved on
// AArch64 and would cost saves/restores.
inline void MathUtilNeon64::transformVec4(const float* m, const float* v, float* dst)
{
    asm volatile(
        "ld1  {v0.4s}, [%1]                       \n\t" // v0 = (x, y, z, w)
        "ld1  {v1.4s, v2.4s, v3.4s, v4.4s}, [%2]  \n\t" // v1..v4 = m[0..15]

        "fmul v5.4s, v1.4s, v0.s[0]               \n\t" // r  = m[0..3]   * x
        "fmla v5.4s, v2.4s, v0.s[1]               \n\t" // r += m[4..7]   * y
        "fmla v5.4s, v3.4s, v0.s[2]               \n\t" // r += m[8..11]  * z
        "fmla v5.4s, v4.4s, v0.s[3]               \n\t" // r += m[12..15] * w

        "st1  {v5.4s}, [%0]                       \n\t" // dst[0..3]
        :
        : "r"(dst), "r"(v), "r"(m)
        : "v0", "v1", "v2", "v3", "v4", "v5", "memory"
    );
}
// Computes the 3-component cross product dst = v1 x v2.
//
//   v1, v2 - source vectors, 3 contiguous floats each
//   dst    - destination, 3 floats; may be the same array as `v1` or `v2`
//            because all three components are computed before any is stored
//
// Implemented in plain C++ on purpose. The inline-assembly version this
// replaces was carried over from 32-bit ARM, where s8..s11 name the lanes of
// q2; on AArch64 those names refer to v8..v11 instead, so the subtractions
// never reached the product register and the stored result was wrong. It also
// wrote v8-v11 without declaring them and post-incremented input operands.
inline void MathUtilNeon64::crossVec3(const float* v1, const float* v2, float* dst)
{
    const float x = (v1[1] * v2[2]) - (v1[2] * v2[1]);
    const float y = (v1[2] * v2[0]) - (v1[0] * v2[2]);
    const float z = (v1[0] * v2[1]) - (v1[1] * v2[0]);

    dst[0] = x;
    dst[1] = y;
    dst[2] = z;
}
NS_CC_MATH_END NS_CC_MATH_END