mirror of https://github.com/axmolengine/axmol.git
98 lines
3.1 KiB
C
98 lines
3.1 KiB
C
/*
NEON math library for the iPhone / iPod touch

Copyright (c) 2009 Justin Saunders

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising
from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must
not claim that you wrote the original software. If you use this
software in a product, an acknowledgment in the product documentation
would be appreciated but is not required.

2. Altered source versions must be plainly marked as such, and must
not be misrepresented as being the original software.

3. This notice may not be removed or altered from any source distribution.
*/
|
|
|
|
#include "kazmath/neon_matrix_impl.h"
|
|
|
|
#if defined(__ARM_NEON__)
|
|
|
|
/**
 * Multiplies two 4x4 float matrices, each stored as 16 contiguous floats.
 *
 * Element-wise, the result is
 *     output[4*j + i] = b[i]*a[4*j] + b[4+i]*a[4*j+1]
 *                     + b[8+i]*a[4*j+2] + b[12+i]*a[4*j+3]
 * i.e. B * A when the arrays are read as column-major matrices (A * B when
 * they are read as row-major).
 *
 * Both inputs are read completely before anything is written, so `output`
 * may alias `a` or `b`.
 *
 * @param a      16 floats; supplies the per-lane scale factors.
 * @param b      16 floats; supplies the 4-float vectors being scaled.
 * @param output receives the 16-float product.
 */
void NEON_Matrix4Mul(const float* a, const float* b, float* output )
{
#if defined(__arm__)
    // 32-bit ARM: the mnemonics and q/d register names below exist only in
    // the ARMv7 (AArch32) NEON instruction set.
    __asm__ volatile
    (
    // Load A & B, staying clear of q4-q7, which must be preserved
    "vldmia %1, { q0-q3 } \n\t"   // q0-q3  = a
    "vldmia %2, { q8-q11 }\n\t"   // q8-q11 = b

    // result = first column of B x first row of A
    "vmul.f32 q12, q8, d0[0]\n\t"
    "vmul.f32 q13, q8, d2[0]\n\t"
    "vmul.f32 q14, q8, d4[0]\n\t"
    "vmul.f32 q15, q8, d6[0]\n\t"

    // result += second column of B x second row of A
    "vmla.f32 q12, q9, d0[1]\n\t"
    "vmla.f32 q13, q9, d2[1]\n\t"
    "vmla.f32 q14, q9, d4[1]\n\t"
    "vmla.f32 q15, q9, d6[1]\n\t"

    // result += third column of B x third row of A
    "vmla.f32 q12, q10, d1[0]\n\t"
    "vmla.f32 q13, q10, d3[0]\n\t"
    "vmla.f32 q14, q10, d5[0]\n\t"
    "vmla.f32 q15, q10, d7[0]\n\t"

    // result += last column of B x last row of A
    "vmla.f32 q12, q11, d1[1]\n\t"
    "vmla.f32 q13, q11, d3[1]\n\t"
    "vmla.f32 q14, q11, d5[1]\n\t"
    "vmla.f32 q15, q11, d7[1]\n\t"

    // output = result registers
    "vstmia %0, { q12-q15 }"
    : // no output operands: the store goes through the pointer in %0
    : "r" (output), "r" (a), "r" (b) // input - note *value* of pointer doesn't change
    : "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" //clobber
    );
#else
    // Scalar fallback for builds that reach this function on a target other
    // than 32-bit ARM (e.g. a toolchain that defines __ARM_NEON__ for a
    // 64-bit ARM target, where the instructions above cannot be assembled).
    // Same accumulation order as the NEON path; the result is staged in a
    // temporary so that `output` may still alias `a` or `b`.
    float result[16];
    int col, row, i;

    for (col = 0; col < 4; ++col) {
        const float *scale = a + 4 * col;

        for (row = 0; row < 4; ++row) {
            float acc = b[row] * scale[0];
            acc += b[4 + row] * scale[1];
            acc += b[8 + row] * scale[2];
            acc += b[12 + row] * scale[3];
            result[4 * col + row] = acc;
        }
    }

    for (i = 0; i < 16; ++i) {
        output[i] = result[i];
    }
#endif
}
|
|
|
|
/**
 * Multiplies a 4x4 float matrix (16 contiguous floats) by a 4-float vector.
 *
 * Element-wise, the result is
 *     output[i] = m[i]*v[0] + m[4+i]*v[1] + m[8+i]*v[2] + m[12+i]*v[3]
 * i.e. M * v when `m` is read as a column-major matrix.
 *
 * Both inputs are read completely before anything is written, so `output`
 * may alias `v` (or part of `m`).
 *
 * @param m      16 floats; each group of four is scaled by one element of v.
 * @param v      4 floats.
 * @param output receives the 4-float result.
 */
void NEON_Matrix4Vector4Mul(const float* m, const float* v, float* output)
{
#if defined(__arm__)
    // 32-bit ARM: the mnemonics and q/d register names below exist only in
    // the ARMv7 (AArch32) NEON instruction set.
    __asm__ volatile
    (
    // Load m & v - avoiding q4-q7 which need to be preserved - q0 = result
    "vldmia %1, { q8-q11 } \n\t" // q8-q11 = m
    "vldmia %2, { q1 } \n\t" // q1 = v

    // result = first column of A x V.x
    "vmul.f32 q0, q8, d2[0]\n\t"

    // result += second column of A x V.y
    "vmla.f32 q0, q9, d2[1]\n\t"

    // result += third column of A x V.z
    "vmla.f32 q0, q10, d3[0]\n\t"

    // result += last column of A x V.w
    "vmla.f32 q0, q11, d3[1]\n\t"

    // output = result registers
    "vstmia %0, { q0 }"

    : // no output operands: the store goes through the pointer in %0
    : "r" (output), "r" (m), "r" (v) // input - note *value* of pointer doesn't change
    : "memory", "q0", "q1", "q8", "q9", "q10", "q11" //clobber
    );
#else
    // Scalar fallback for builds that reach this function on a target other
    // than 32-bit ARM (e.g. a toolchain that defines __ARM_NEON__ for a
    // 64-bit ARM target, where the instructions above cannot be assembled).
    // Same accumulation order as the NEON path; the result is staged in a
    // temporary so that `output` may still alias an input.
    float result[4];
    int row;

    for (row = 0; row < 4; ++row) {
        float acc = m[row] * v[0];
        acc += m[4 + row] * v[1];
        acc += m[8 + row] * v[2];
        acc += m[12 + row] * v[3];
        result[row] = acc;
    }

    for (row = 0; row < 4; ++row) {
        output[row] = result[row];
    }
#endif
}
|
|
|
|
#endif
|