Merge pull request #8158 from dabingnn/v3_neon

V3 neon
This commit is contained in:
minggo 2014-09-25 18:02:46 +08:00
commit 615b367277
7 changed files with 735 additions and 244 deletions

View File

@ -227,6 +227,10 @@ LOCAL_CPPFLAGS := -Wno-deprecated-declarations -Wno-extern-c-compat
LOCAL_EXPORT_CFLAGS := -DUSE_FILE32API
LOCAL_EXPORT_CPPFLAGS := -Wno-deprecated-declarations -Wno-extern-c-compat
ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
LOCAL_ARM_NEON := true
endif
include $(BUILD_STATIC_LIBRARY)
#==============================================================
@ -234,7 +238,6 @@ include $(BUILD_STATIC_LIBRARY)
include $(CLEAR_VARS)
LOCAL_MODULE := cocos2dx_static
LOCAL_MODULE_FILENAME := libcocos2d
LOCAL_STATIC_LIBRARIES := cocostudio_static

View File

@ -21,12 +21,62 @@ This file was modified to fit the cocos2d-x project
#include "MathUtil.h"
#include "base/ccMacros.h"
#if (CC_TARGET_PLATFORM == CC_PLATFORM_ANDROID)
#include <cpu-features.h>
#endif
//#define USE_NEON32 : neon 32 code will be used
//#define USE_NEON64 : neon 64 code will be used
//#define INCLUDE_NEON32 : neon 32 code included
//#define INCLUDE_NEON64 : neon 64 code included
//#define USE_SSE : SSE code used
//#define INCLUDE_SSE : SSE code included
// Backend selection.
//   USE_*     : the backend is chosen unconditionally at compile time.
//   INCLUDE_* : the backend's .inl file is compiled in; whether it is used is
//               decided by the dispatch functions further down in this file.
#if (CC_TARGET_PLATFORM == CC_PLATFORM_IOS)
// iOS: the NEON flavour is fixed by the target architecture, so both USE_* and
// INCLUDE_* are set together.
#if defined (__arm64__)
#define USE_NEON64
#define INCLUDE_NEON64
#elif defined (__ARM_NEON__)
#define USE_NEON32
#define INCLUDE_NEON32
#else
#endif
#elif (CC_TARGET_PLATFORM == CC_PLATFORM_ANDROID)
// Android: only INCLUDE_* is set. With INCLUDE_NEON32 the dispatchers call
// isNeon32Enabled() at run time before taking the NEON path. USE_NEON64 is
// never defined here, so the dispatchers do not select MathUtilNeon64 on Android.
// NOTE(review): Android toolchains may define __aarch64__ rather than
// __arm64__ for 64-bit ARM - confirm this branch is reachable.
#if defined (__arm64__)
#define INCLUDE_NEON64
#elif defined (__ARM_NEON__)
#define INCLUDE_NEON32
#else
#endif
#else
#endif
// SSE: the macros are defined and MathUtilSSE.inl is included below.
// NOTE(review): the float* dispatchers visible in this file do not test
// USE_SSE - presumably the SSE overloads are reached elsewhere; confirm.
#if defined (__SSE__)
#define USE_SSE
#define INCLUDE_SSE
#endif
#ifdef INCLUDE_NEON32
#include "MathUtilNeon.inl"
#endif
#ifdef INCLUDE_NEON64
#include "MathUtilNeon64.inl"
#endif
#ifdef INCLUDE_SSE
#include "MathUtilSSE.inl"
#endif
#include "MathUtil.inl"
NS_CC_MATH_BEGIN
void MathUtil::smooth(float* x, float target, float elapsedTime, float responseTime)
{
GP_ASSERT(x);
if (elapsedTime > 0)
{
*x += (target - *x) * elapsedTime / (elapsedTime + responseTime);
@ -44,4 +94,179 @@ void MathUtil::smooth(float* x, float target, float elapsedTime, float riseTime,
}
}
/**
 * Tells whether the 32-bit NEON implementation may be used.
 *
 * - USE_NEON32 builds: always true.
 * - Android builds that only INCLUDE_NEON32: the CPU is probed once through
 *   the NDK cpu-features API and the answer is cached for later calls.
 * - Everything else: false.
 */
bool MathUtil::isNeon32Enabled()
{
#if defined(USE_NEON32)
    return true;
#elif defined(INCLUDE_NEON32) && (CC_TARGET_PLATFORM == CC_PLATFORM_ANDROID)
    // Function-local static: the probe runs on the first call only.
    static const bool neonAvailable =
        (android_getCpuFamily() == ANDROID_CPU_FAMILY_ARM) &&
        ((android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) != 0);
    return neonAvailable;
#else
    return false;
#endif
}
/**
 * Tells whether the 64-bit NEON implementation is selected.
 * This is a pure compile-time decision: true only when USE_NEON64 is defined.
 */
bool MathUtil::isNeon64Enabled()
{
#if defined(USE_NEON64)
    const bool enabled = true;
#else
    const bool enabled = false;
#endif
    return enabled;
}
/**
 * Adds a scalar to a matrix, forwarding to the backend chosen for this build
 * (NEON32, NEON64, run-time checked NEON32, or the plain C++ fallback).
 *
 * @param m      Source matrix.
 * @param scalar Value to add.
 * @param dst    Destination matrix.
 */
void MathUtil::addMatrix(const float* m, float scalar, float* dst)
{
#if defined(USE_NEON32)
    MathUtilNeon::addMatrix(m, scalar, dst);
#elif defined(USE_NEON64)
    MathUtilNeon64::addMatrix(m, scalar, dst);
#elif defined(INCLUDE_NEON32)
    // NEON code is compiled in but the CPU has to confirm support at run time.
    if (isNeon32Enabled())
    {
        MathUtilNeon::addMatrix(m, scalar, dst);
        return;
    }
    MathUtilC::addMatrix(m, scalar, dst);
#else
    MathUtilC::addMatrix(m, scalar, dst);
#endif
}
/**
 * Adds two matrices, forwarding to the backend chosen for this build
 * (NEON32, NEON64, run-time checked NEON32, or the plain C++ fallback).
 *
 * @param m1  First operand.
 * @param m2  Second operand.
 * @param dst Destination matrix.
 */
void MathUtil::addMatrix(const float* m1, const float* m2, float* dst)
{
#if defined(USE_NEON32)
    MathUtilNeon::addMatrix(m1, m2, dst);
#elif defined(USE_NEON64)
    MathUtilNeon64::addMatrix(m1, m2, dst);
#elif defined(INCLUDE_NEON32)
    // NEON code is compiled in but the CPU has to confirm support at run time.
    if (isNeon32Enabled())
    {
        MathUtilNeon::addMatrix(m1, m2, dst);
        return;
    }
    MathUtilC::addMatrix(m1, m2, dst);
#else
    MathUtilC::addMatrix(m1, m2, dst);
#endif
}
/**
 * Subtracts m2 from m1, forwarding to the backend chosen for this build
 * (NEON32, NEON64, run-time checked NEON32, or the plain C++ fallback).
 *
 * @param m1  Minuend matrix.
 * @param m2  Subtrahend matrix.
 * @param dst Destination matrix.
 */
void MathUtil::subtractMatrix(const float* m1, const float* m2, float* dst)
{
#if defined(USE_NEON32)
    MathUtilNeon::subtractMatrix(m1, m2, dst);
#elif defined(USE_NEON64)
    MathUtilNeon64::subtractMatrix(m1, m2, dst);
#elif defined(INCLUDE_NEON32)
    // NEON code is compiled in but the CPU has to confirm support at run time.
    if (isNeon32Enabled())
    {
        MathUtilNeon::subtractMatrix(m1, m2, dst);
        return;
    }
    MathUtilC::subtractMatrix(m1, m2, dst);
#else
    MathUtilC::subtractMatrix(m1, m2, dst);
#endif
}
/**
 * Multiplies a matrix by a scalar, forwarding to the backend chosen for this
 * build (NEON32, NEON64, run-time checked NEON32, or the plain C++ fallback).
 *
 * @param m      Source matrix.
 * @param scalar Factor applied to the matrix.
 * @param dst    Destination matrix.
 */
void MathUtil::multiplyMatrix(const float* m, float scalar, float* dst)
{
#if defined(USE_NEON32)
    MathUtilNeon::multiplyMatrix(m, scalar, dst);
#elif defined(USE_NEON64)
    MathUtilNeon64::multiplyMatrix(m, scalar, dst);
#elif defined(INCLUDE_NEON32)
    // NEON code is compiled in but the CPU has to confirm support at run time.
    if (isNeon32Enabled())
    {
        MathUtilNeon::multiplyMatrix(m, scalar, dst);
        return;
    }
    MathUtilC::multiplyMatrix(m, scalar, dst);
#else
    MathUtilC::multiplyMatrix(m, scalar, dst);
#endif
}
/**
 * Multiplies two matrices, forwarding to the backend chosen for this build
 * (NEON32, NEON64, run-time checked NEON32, or the plain C++ fallback).
 *
 * @param m1  Left operand.
 * @param m2  Right operand.
 * @param dst Destination matrix.
 */
void MathUtil::multiplyMatrix(const float* m1, const float* m2, float* dst)
{
#if defined(USE_NEON32)
    MathUtilNeon::multiplyMatrix(m1, m2, dst);
#elif defined(USE_NEON64)
    MathUtilNeon64::multiplyMatrix(m1, m2, dst);
#elif defined(INCLUDE_NEON32)
    // NEON code is compiled in but the CPU has to confirm support at run time.
    if (isNeon32Enabled())
    {
        MathUtilNeon::multiplyMatrix(m1, m2, dst);
        return;
    }
    MathUtilC::multiplyMatrix(m1, m2, dst);
#else
    MathUtilC::multiplyMatrix(m1, m2, dst);
#endif
}
/**
 * Negates a matrix, forwarding to the backend chosen for this build
 * (NEON32, NEON64, run-time checked NEON32, or the plain C++ fallback).
 *
 * @param m   Source matrix.
 * @param dst Destination matrix.
 */
void MathUtil::negateMatrix(const float* m, float* dst)
{
#if defined(USE_NEON32)
    MathUtilNeon::negateMatrix(m, dst);
#elif defined(USE_NEON64)
    MathUtilNeon64::negateMatrix(m, dst);
#elif defined(INCLUDE_NEON32)
    // NEON code is compiled in but the CPU has to confirm support at run time.
    if (isNeon32Enabled())
    {
        MathUtilNeon::negateMatrix(m, dst);
        return;
    }
    MathUtilC::negateMatrix(m, dst);
#else
    MathUtilC::negateMatrix(m, dst);
#endif
}
/**
 * Transposes a matrix, forwarding to the backend chosen for this build
 * (NEON32, NEON64, run-time checked NEON32, or the plain C++ fallback).
 *
 * @param m   Source matrix.
 * @param dst Destination matrix.
 */
void MathUtil::transposeMatrix(const float* m, float* dst)
{
#if defined(USE_NEON32)
    MathUtilNeon::transposeMatrix(m, dst);
#elif defined(USE_NEON64)
    MathUtilNeon64::transposeMatrix(m, dst);
#elif defined(INCLUDE_NEON32)
    // NEON code is compiled in but the CPU has to confirm support at run time.
    if (isNeon32Enabled())
    {
        MathUtilNeon::transposeMatrix(m, dst);
        return;
    }
    MathUtilC::transposeMatrix(m, dst);
#else
    MathUtilC::transposeMatrix(m, dst);
#endif
}
/**
 * Transforms the vector (x, y, z, w) by matrix m, forwarding to the backend
 * chosen for this build (NEON32, NEON64, run-time checked NEON32, or the
 * plain C++ fallback).
 *
 * @param m          Transformation matrix.
 * @param x, y, z, w Components of the vector to transform.
 * @param dst        Receives the result.
 */
void MathUtil::transformVec4(const float* m, float x, float y, float z, float w, float* dst)
{
#if defined(USE_NEON32)
    MathUtilNeon::transformVec4(m, x, y, z, w, dst);
#elif defined(USE_NEON64)
    MathUtilNeon64::transformVec4(m, x, y, z, w, dst);
#elif defined(INCLUDE_NEON32)
    // NEON code is compiled in but the CPU has to confirm support at run time.
    if (isNeon32Enabled())
    {
        MathUtilNeon::transformVec4(m, x, y, z, w, dst);
        return;
    }
    MathUtilC::transformVec4(m, x, y, z, w, dst);
#else
    MathUtilC::transformVec4(m, x, y, z, w, dst);
#endif
}
/**
 * Transforms the 4-component vector v by matrix m, forwarding to the backend
 * chosen for this build (NEON32, NEON64, run-time checked NEON32, or the
 * plain C++ fallback).
 *
 * @param m   Transformation matrix.
 * @param v   Vector to transform.
 * @param dst Receives the result.
 */
void MathUtil::transformVec4(const float* m, const float* v, float* dst)
{
#if defined(USE_NEON32)
    MathUtilNeon::transformVec4(m, v, dst);
#elif defined(USE_NEON64)
    MathUtilNeon64::transformVec4(m, v, dst);
#elif defined(INCLUDE_NEON32)
    // NEON code is compiled in but the CPU has to confirm support at run time.
    if (isNeon32Enabled())
    {
        MathUtilNeon::transformVec4(m, v, dst);
        return;
    }
    MathUtilC::transformVec4(m, v, dst);
#else
    MathUtilC::transformVec4(m, v, dst);
#endif
}
/**
 * Computes the cross product of two 3-component vectors, forwarding to the
 * backend chosen for this build (NEON32, NEON64, run-time checked NEON32, or
 * the plain C++ fallback).
 *
 * @param v1  First vector.
 * @param v2  Second vector.
 * @param dst Receives the result.
 */
void MathUtil::crossVec3(const float* v1, const float* v2, float* dst)
{
#if defined(USE_NEON32)
    MathUtilNeon::crossVec3(v1, v2, dst);
#elif defined(USE_NEON64)
    MathUtilNeon64::crossVec3(v1, v2, dst);
#elif defined(INCLUDE_NEON32)
    // NEON code is compiled in but the CPU has to confirm support at run time.
    if (isNeon32Enabled())
    {
        MathUtilNeon::crossVec3(v1, v2, dst);
        return;
    }
    MathUtilC::crossVec3(v1, v2, dst);
#else
    MathUtilC::crossVec3(v1, v2, dst);
#endif
}
NS_CC_MATH_END

View File

@ -69,59 +69,52 @@ public:
* @param fallTime response time for falling slope (in the same units as elapsedTime).
*/
static void smooth(float* x, float target, float elapsedTime, float riseTime, float fallTime);
private:
//Indicates whether NEON is enabled
static bool isNeon32Enabled();
static bool isNeon64Enabled();
private:
#ifdef __SSE__
inline static void addMatrix(const __m128 m[4], float scalar, __m128 dst[4]);
static void addMatrix(const __m128 m[4], float scalar, __m128 dst[4]);
inline static void addMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4]);
static void addMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4]);
inline static void subtractMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4]);
static void subtractMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4]);
inline static void multiplyMatrix(const __m128 m[4], float scalar, __m128 dst[4]);
static void multiplyMatrix(const __m128 m[4], float scalar, __m128 dst[4]);
inline static void multiplyMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4]);
static void multiplyMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4]);
inline static void negateMatrix(const __m128 m[4], __m128 dst[4]);
static void negateMatrix(const __m128 m[4], __m128 dst[4]);
inline static void transposeMatrix(const __m128 m[4], __m128 dst[4]);
static void transposeMatrix(const __m128 m[4], __m128 dst[4]);
inline static void transformVec4(const __m128 m[4], const __m128& v, __m128& dst);
static void transformVec4(const __m128 m[4], const __m128& v, __m128& dst);
#endif
inline static void addMatrix(const float* m, float scalar, float* dst);
static void addMatrix(const float* m, float scalar, float* dst);
inline static void addMatrix(const float* m1, const float* m2, float* dst);
static void addMatrix(const float* m1, const float* m2, float* dst);
inline static void subtractMatrix(const float* m1, const float* m2, float* dst);
static void subtractMatrix(const float* m1, const float* m2, float* dst);
inline static void multiplyMatrix(const float* m, float scalar, float* dst);
static void multiplyMatrix(const float* m, float scalar, float* dst);
inline static void multiplyMatrix(const float* m1, const float* m2, float* dst);
static void multiplyMatrix(const float* m1, const float* m2, float* dst);
inline static void negateMatrix(const float* m, float* dst);
static void negateMatrix(const float* m, float* dst);
inline static void transposeMatrix(const float* m, float* dst);
static void transposeMatrix(const float* m, float* dst);
inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);
static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);
inline static void transformVec4(const float* m, const float* v, float* dst);
static void transformVec4(const float* m, const float* v, float* dst);
inline static void crossVec3(const float* v1, const float* v2, float* dst);
static void crossVec3(const float* v1, const float* v2, float* dst);
MathUtil();
};
NS_CC_MATH_END
#define MATRIX_SIZE ( sizeof(float) * 16)
#ifdef USE_NEON
#include "MathUtilNeon.inl"
#else
#include "MathUtil.inl"
#if defined(__SSE__)
#include "MathUtilSSE.inl"
#endif
#endif
#endif

View File

@ -20,7 +20,31 @@
NS_CC_MATH_BEGIN
inline void MathUtil::addMatrix(const float* m, float scalar, float* dst)
/**
 * Plain C++ implementations of the matrix and vector helpers.
 * MathUtil forwards to these when no SIMD backend is selected.
 * All members are static; matrices and vectors are passed as raw float arrays.
 */
class MathUtilC
{
public:
    /** Matrix plus scalar. */
    static inline void addMatrix(const float* m, float scalar, float* dst);

    /** Matrix plus matrix. */
    static inline void addMatrix(const float* m1, const float* m2, float* dst);

    /** Matrix minus matrix. */
    static inline void subtractMatrix(const float* m1, const float* m2, float* dst);

    /** Matrix times scalar. */
    static inline void multiplyMatrix(const float* m, float scalar, float* dst);

    /** Matrix times matrix. */
    static inline void multiplyMatrix(const float* m1, const float* m2, float* dst);

    /** Negated matrix. */
    static inline void negateMatrix(const float* m, float* dst);

    /** Transposed matrix. */
    static inline void transposeMatrix(const float* m, float* dst);

    /** Transform of the vector given as separate components. */
    static inline void transformVec4(const float* m, float x, float y, float z, float w, float* dst);

    /** Transform of the vector given as an array. */
    static inline void transformVec4(const float* m, const float* v, float* dst);

    /** Cross product of two 3-component vectors. */
    static inline void crossVec3(const float* v1, const float* v2, float* dst);
};
inline void MathUtilC::addMatrix(const float* m, float scalar, float* dst)
{
dst[0] = m[0] + scalar;
dst[1] = m[1] + scalar;
@ -40,7 +64,7 @@ inline void MathUtil::addMatrix(const float* m, float scalar, float* dst)
dst[15] = m[15] + scalar;
}
inline void MathUtil::addMatrix(const float* m1, const float* m2, float* dst)
inline void MathUtilC::addMatrix(const float* m1, const float* m2, float* dst)
{
dst[0] = m1[0] + m2[0];
dst[1] = m1[1] + m2[1];
@ -60,7 +84,7 @@ inline void MathUtil::addMatrix(const float* m1, const float* m2, float* dst)
dst[15] = m1[15] + m2[15];
}
inline void MathUtil::subtractMatrix(const float* m1, const float* m2, float* dst)
inline void MathUtilC::subtractMatrix(const float* m1, const float* m2, float* dst)
{
dst[0] = m1[0] - m2[0];
dst[1] = m1[1] - m2[1];
@ -80,7 +104,7 @@ inline void MathUtil::subtractMatrix(const float* m1, const float* m2, float* ds
dst[15] = m1[15] - m2[15];
}
inline void MathUtil::multiplyMatrix(const float* m, float scalar, float* dst)
inline void MathUtilC::multiplyMatrix(const float* m, float scalar, float* dst)
{
dst[0] = m[0] * scalar;
dst[1] = m[1] * scalar;
@ -100,35 +124,35 @@ inline void MathUtil::multiplyMatrix(const float* m, float scalar, float* dst)
dst[15] = m[15] * scalar;
}
inline void MathUtil::multiplyMatrix(const float* m1, const float* m2, float* dst)
/**
 * Multiplies two 4x4 matrices stored as 16 consecutive floats.
 *
 * Element (col * 4 + row) of the result is the sum over k of
 * m1[k * 4 + row] * m2[col * 4 + k].
 *
 * @param m1  Left operand.
 * @param m2  Right operand.
 * @param dst Receives the product; may be the same array as m1 or m2.
 */
inline void MathUtilC::multiplyMatrix(const float* m1, const float* m2, float* dst)
{
    // Build the product in scratch storage so that dst may alias an input.
    float result[16];

    for (int col = 0; col < 4; ++col)
    {
        const float* rhs = m2 + col * 4;
        for (int row = 0; row < 4; ++row)
        {
            result[col * 4 + row] = m1[row] * rhs[0] + m1[row + 4] * rhs[1] + m1[row + 8] * rhs[2] + m1[row + 12] * rhs[3];
        }
    }

    memcpy(dst, result, MATRIX_SIZE);
}
inline void MathUtil::negateMatrix(const float* m, float* dst)
inline void MathUtilC::negateMatrix(const float* m, float* dst)
{
dst[0] = -m[0];
dst[1] = -m[1];
@ -148,7 +172,7 @@ inline void MathUtil::negateMatrix(const float* m, float* dst)
dst[15] = -m[15];
}
inline void MathUtil::transposeMatrix(const float* m, float* dst)
inline void MathUtilC::transposeMatrix(const float* m, float* dst)
{
float t[16] = {
m[0], m[4], m[8], m[12],
@ -159,33 +183,33 @@ inline void MathUtil::transposeMatrix(const float* m, float* dst)
memcpy(dst, t, MATRIX_SIZE);
}
inline void MathUtil::transformVec4(const float* m, float x, float y, float z, float w, float* dst)
/**
 * Transforms the vector (x, y, z, w) by the 4x4 matrix m.
 * Only the first three components of the result are written.
 *
 * @param m          Matrix stored as 16 consecutive floats.
 * @param x, y, z, w Components of the vector to transform.
 * @param dst        Receives dst[0], dst[1] and dst[2].
 */
inline void MathUtilC::transformVec4(const float* m, float x, float y, float z, float w, float* dst)
{
    for (int i = 0; i < 3; ++i)
    {
        dst[i] = x * m[i] + y * m[i + 4] + z * m[i + 8] + w * m[i + 12];
    }
}
inline void MathUtil::transformVec4(const float* m, const float* v, float* dst)
/**
 * Transforms the 4-component vector v by the 4x4 matrix m.
 *
 * @param m   Matrix stored as 16 consecutive floats.
 * @param v   Vector to transform (4 floats).
 * @param dst Receives the 4 result components; may be the same array as v.
 */
inline void MathUtilC::transformVec4(const float* m, const float* v, float* dst)
{
    // Finish every read of v before writing, so that v == dst is supported.
    float out[4];
    for (int i = 0; i < 4; ++i)
    {
        out[i] = v[0] * m[i] + v[1] * m[i + 4] + v[2] * m[i + 8] + v[3] * m[i + 12];
    }

    dst[0] = out[0];
    dst[1] = out[1];
    dst[2] = out[2];
    dst[3] = out[3];
}
inline void MathUtil::crossVec3(const float* v1, const float* v2, float* dst)
inline void MathUtilC::crossVec3(const float* v1, const float* v2, float* dst)
{
float x = (v1[1] * v2[2]) - (v1[2] * v2[1]);
float y = (v1[2] * v2[0]) - (v1[0] * v2[2]);
float z = (v1[0] * v2[1]) - (v1[1] * v2[0]);
dst[0] = x;
dst[1] = y;
dst[2] = z;

View File

@ -17,232 +17,255 @@
This file was modified to fit the cocos2d-x project
*/
NS_CC_MATH_BEGIN
inline void MathUtil::addMatrix(const float* m, float scalar, float* dst)
/**
 * ARM NEON (32-bit) implementations of the matrix and vector helpers.
 * MathUtil forwards to these when the NEON32 backend is selected.
 * All members are static; matrices and vectors are passed as raw float arrays.
 */
class MathUtilNeon
{
public:
    /** Matrix plus scalar. */
    static inline void addMatrix(const float* m, float scalar, float* dst);

    /** Matrix plus matrix. */
    static inline void addMatrix(const float* m1, const float* m2, float* dst);

    /** Matrix minus matrix. */
    static inline void subtractMatrix(const float* m1, const float* m2, float* dst);

    /** Matrix times scalar. */
    static inline void multiplyMatrix(const float* m, float scalar, float* dst);

    /** Matrix times matrix. */
    static inline void multiplyMatrix(const float* m1, const float* m2, float* dst);

    /** Negated matrix. */
    static inline void negateMatrix(const float* m, float* dst);

    /** Transposed matrix. */
    static inline void transposeMatrix(const float* m, float* dst);

    /** Transform of the vector given as separate components. */
    static inline void transformVec4(const float* m, float x, float y, float z, float w, float* dst);

    /** Transform of the vector given as an array. */
    static inline void transformVec4(const float* m, const float* v, float* dst);

    /** Cross product of two 3-component vectors. */
    static inline void crossVec3(const float* v1, const float* v2, float* dst);
};
/**
 * NEON: dst[i] = m[i] + scalar for the 16 floats of a matrix.
 *
 * @param m      Source matrix (16 floats).
 * @param scalar Value added to every element.
 * @param dst    Destination matrix (16 floats).
 */
inline void MathUtilNeon::addMatrix(const float* m, float scalar, float* dst)
{
    // dst and m are advanced by the post-increment ("!") addressing, so they
    // are read-write ("+r") operands: input-only operands must not be modified.
    asm volatile(
        "vld1.32 {q0, q1}, [%1]!    \n\t" // M[m0-m7]
        "vld1.32 {q2, q3}, [%1]     \n\t" // M[m8-m15]
        "vld1.32 {d8[0]}, [%2]      \n\t" // s16 = s
        "vmov.f32 s17, s16          \n\t" // broadcast s across q4
        "vmov.f32 s18, s16          \n\t"
        "vmov.f32 s19, s16          \n\t"

        "vadd.f32 q8, q0, q4        \n\t" // DST->M[m0-m3] = M[m0-m3] + s
        "vadd.f32 q9, q1, q4        \n\t" // DST->M[m4-m7] = M[m4-m7] + s
        "vadd.f32 q10, q2, q4       \n\t" // DST->M[m8-m11] = M[m8-m11] + s
        "vadd.f32 q11, q3, q4       \n\t" // DST->M[m12-m15] = M[m12-m15] + s

        "vst1.32 {q8, q9}, [%0]!    \n\t" // DST->M[m0-m7]
        "vst1.32 {q10, q11}, [%0]   \n\t" // DST->M[m8-m15]
        : "+r"(dst), "+r"(m)
        : "r"(&scalar)
        : "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "memory"
    );
}
inline void MathUtil::addMatrix(const float* m1, const float* m2, float* dst)
/**
 * NEON: dst[i] = m1[i] + m2[i] for the 16 floats of a matrix.
 *
 * @param m1  First operand (16 floats).
 * @param m2  Second operand (16 floats).
 * @param dst Destination matrix (16 floats).
 */
inline void MathUtilNeon::addMatrix(const float* m1, const float* m2, float* dst)
{
    // All three pointers are advanced by post-increment ("!") addressing, so
    // they are read-write ("+r") operands: input-only operands must not be modified.
    asm volatile(
        "vld1.32 {q0, q1}, [%1]!    \n\t" // M1[m0-m7]
        "vld1.32 {q2, q3}, [%1]     \n\t" // M1[m8-m15]
        "vld1.32 {q8, q9}, [%2]!    \n\t" // M2[m0-m7]
        "vld1.32 {q10, q11}, [%2]   \n\t" // M2[m8-m15]

        "vadd.f32 q12, q0, q8       \n\t" // DST->M[m0-m3] = M1[m0-m3] + M2[m0-m3]
        "vadd.f32 q13, q1, q9       \n\t" // DST->M[m4-m7] = M1[m4-m7] + M2[m4-m7]
        "vadd.f32 q14, q2, q10      \n\t" // DST->M[m8-m11] = M1[m8-m11] + M2[m8-m11]
        "vadd.f32 q15, q3, q11      \n\t" // DST->M[m12-m15] = M1[m12-m15] + M2[m12-m15]

        "vst1.32 {q12, q13}, [%0]!  \n\t" // DST->M[m0-m7]
        "vst1.32 {q14, q15}, [%0]   \n\t" // DST->M[m8-m15]
        : "+r"(dst), "+r"(m1), "+r"(m2)
        :
        : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "memory"
    );
}
inline void MathUtil::subtractMatrix(const float* m1, const float* m2, float* dst)
/**
 * NEON: dst[i] = m1[i] - m2[i] for the 16 floats of a matrix.
 *
 * @param m1  Minuend matrix (16 floats).
 * @param m2  Subtrahend matrix (16 floats).
 * @param dst Destination matrix (16 floats).
 */
inline void MathUtilNeon::subtractMatrix(const float* m1, const float* m2, float* dst)
{
    // All three pointers are advanced by post-increment ("!") addressing, so
    // they are read-write ("+r") operands: input-only operands must not be modified.
    asm volatile(
        "vld1.32 {q0, q1}, [%1]!    \n\t" // M1[m0-m7]
        "vld1.32 {q2, q3}, [%1]     \n\t" // M1[m8-m15]
        "vld1.32 {q8, q9}, [%2]!    \n\t" // M2[m0-m7]
        "vld1.32 {q10, q11}, [%2]   \n\t" // M2[m8-m15]

        "vsub.f32 q12, q0, q8       \n\t" // DST->M[m0-m3] = M1[m0-m3] - M2[m0-m3]
        "vsub.f32 q13, q1, q9       \n\t" // DST->M[m4-m7] = M1[m4-m7] - M2[m4-m7]
        "vsub.f32 q14, q2, q10      \n\t" // DST->M[m8-m11] = M1[m8-m11] - M2[m8-m11]
        "vsub.f32 q15, q3, q11      \n\t" // DST->M[m12-m15] = M1[m12-m15] - M2[m12-m15]

        "vst1.32 {q12, q13}, [%0]!  \n\t" // DST->M[m0-m7]
        "vst1.32 {q14, q15}, [%0]   \n\t" // DST->M[m8-m15]
        : "+r"(dst), "+r"(m1), "+r"(m2)
        :
        : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "memory"
    );
}
inline void MathUtil::multiplyMatrix(const float* m, float scalar, float* dst)
/**
 * NEON: dst[i] = m[i] * scalar for the 16 floats of a matrix.
 *
 * @param m      Source matrix (16 floats).
 * @param scalar Factor applied to every element.
 * @param dst    Destination matrix (16 floats).
 */
inline void MathUtilNeon::multiplyMatrix(const float* m, float scalar, float* dst)
{
    // dst and m are advanced by the post-increment ("!") addressing, so they
    // are read-write ("+r") operands: input-only operands must not be modified.
    asm volatile(
        "vld1.32 {d0[0]}, [%2]      \n\t" // s
        "vld1.32 {q4-q5}, [%1]!     \n\t" // M[m0-m7]
        "vld1.32 {q6-q7}, [%1]      \n\t" // M[m8-m15]

        "vmul.f32 q8, q4, d0[0]     \n\t" // DST->M[m0-m3] = M[m0-m3] * s
        "vmul.f32 q9, q5, d0[0]     \n\t" // DST->M[m4-m7] = M[m4-m7] * s
        "vmul.f32 q10, q6, d0[0]    \n\t" // DST->M[m8-m11] = M[m8-m11] * s
        "vmul.f32 q11, q7, d0[0]    \n\t" // DST->M[m12-m15] = M[m12-m15] * s

        "vst1.32 {q8-q9}, [%0]!     \n\t" // DST->M[m0-m7]
        "vst1.32 {q10-q11}, [%0]    \n\t" // DST->M[m8-m15]
        : "+r"(dst), "+r"(m)
        : "r"(&scalar)
        : "q0", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "memory"
    );
}
inline void MathUtil::multiplyMatrix(const float* m1, const float* m2, float* dst)
/**
 * NEON: 4x4 matrix product, dst = m1 * m2, with matrices stored as 16
 * consecutive floats. Each group of four destination floats is built as
 * M1[m0-m3] * a + M1[m4-m7] * b + M1[m8-m11] * c + M1[m12-m15] * d, where
 * (a, b, c, d) is the matching group of four floats from m2.
 *
 * Both inputs are fully loaded before anything is stored.
 *
 * @param m1  Left operand (16 floats).
 * @param m2  Right operand (16 floats).
 * @param dst Destination matrix (16 floats).
 */
inline void MathUtilNeon::multiplyMatrix(const float* m1, const float* m2, float* dst)
{
    // All three pointers are advanced by post-increment ("!") addressing, so
    // they are read-write ("+r") operands: input-only operands must not be modified.
    asm volatile(
        "vld1.32 {d16 - d19}, [%1]! \n\t" // q8-q9   = M1[m0-m7]
        "vld1.32 {d20 - d23}, [%1]  \n\t" // q10-q11 = M1[m8-m15]
        "vld1.32 {d0 - d3}, [%2]!   \n\t" // q0-q1   = M2[m0-m7]
        "vld1.32 {d4 - d7}, [%2]    \n\t" // q2-q3   = M2[m8-m15]

        "vmul.f32 q12, q8, d0[0]    \n\t" // DST->M[m0-m3]   = M1[m0-m3] * M2[m0]
        "vmul.f32 q13, q8, d2[0]    \n\t" // DST->M[m4-m7]   = M1[m0-m3] * M2[m4]
        "vmul.f32 q14, q8, d4[0]    \n\t" // DST->M[m8-m11]  = M1[m0-m3] * M2[m8]
        "vmul.f32 q15, q8, d6[0]    \n\t" // DST->M[m12-m15] = M1[m0-m3] * M2[m12]

        "vmla.f32 q12, q9, d0[1]    \n\t" // DST->M[m0-m3]   += M1[m4-m7] * M2[m1]
        "vmla.f32 q13, q9, d2[1]    \n\t" // DST->M[m4-m7]   += M1[m4-m7] * M2[m5]
        "vmla.f32 q14, q9, d4[1]    \n\t" // DST->M[m8-m11]  += M1[m4-m7] * M2[m9]
        "vmla.f32 q15, q9, d6[1]    \n\t" // DST->M[m12-m15] += M1[m4-m7] * M2[m13]

        "vmla.f32 q12, q10, d1[0]   \n\t" // DST->M[m0-m3]   += M1[m8-m11] * M2[m2]
        "vmla.f32 q13, q10, d3[0]   \n\t" // DST->M[m4-m7]   += M1[m8-m11] * M2[m6]
        "vmla.f32 q14, q10, d5[0]   \n\t" // DST->M[m8-m11]  += M1[m8-m11] * M2[m10]
        "vmla.f32 q15, q10, d7[0]   \n\t" // DST->M[m12-m15] += M1[m8-m11] * M2[m14]

        "vmla.f32 q12, q11, d1[1]   \n\t" // DST->M[m0-m3]   += M1[m12-m15] * M2[m3]
        "vmla.f32 q13, q11, d3[1]   \n\t" // DST->M[m4-m7]   += M1[m12-m15] * M2[m7]
        "vmla.f32 q14, q11, d5[1]   \n\t" // DST->M[m8-m11]  += M1[m12-m15] * M2[m11]
        "vmla.f32 q15, q11, d7[1]   \n\t" // DST->M[m12-m15] += M1[m12-m15] * M2[m15]

        "vst1.32 {d24 - d27}, [%0]! \n\t" // DST->M[m0-m7]
        "vst1.32 {d28 - d31}, [%0]  \n\t" // DST->M[m8-m15]
        : "+r"(dst), "+r"(m1), "+r"(m2)
        :
        : "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
    );
}
inline void MathUtil::negateMatrix(const float* m, float* dst)
/**
 * NEON: dst[i] = -m[i] for the 16 floats of a matrix.
 *
 * @param m   Source matrix (16 floats).
 * @param dst Destination matrix (16 floats).
 */
inline void MathUtilNeon::negateMatrix(const float* m, float* dst)
{
    // dst and m are advanced by the post-increment ("!") addressing, so they
    // are read-write ("+r") operands: input-only operands must not be modified.
    asm volatile(
        "vld1.32 {q0-q1}, [%1]!     \n\t" // load m0-m7
        "vld1.32 {q2-q3}, [%1]      \n\t" // load m8-m15

        "vneg.f32 q4, q0            \n\t" // negate m0-m3
        "vneg.f32 q5, q1            \n\t" // negate m4-m7
        "vneg.f32 q6, q2            \n\t" // negate m8-m11
        "vneg.f32 q7, q3            \n\t" // negate m12-m15

        "vst1.32 {q4-q5}, [%0]!     \n\t" // store m0-m7
        "vst1.32 {q6-q7}, [%0]      \n\t" // store m8-m15
        : "+r"(dst), "+r"(m)
        :
        : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory"
    );
}
inline void MathUtil::transposeMatrix(const float* m, float* dst)
/**
 * NEON: transposes a 4x4 matrix stored as 16 consecutive floats.
 * Each vld4 loads four consecutive source floats into the same lane of
 * q0..q3, so after four loads q0..q3 hold the transposed matrix.
 *
 * @param m   Source matrix (16 floats).
 * @param dst Destination matrix (16 floats).
 */
inline void MathUtilNeon::transposeMatrix(const float* m, float* dst)
{
    // dst and m are advanced by the post-increment ("!") addressing, so they
    // are read-write ("+r") operands: input-only operands must not be modified.
    asm volatile(
        "vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%1]!    \n\t" // DST->M[m0, m4, m8, m12] = M[m0-m3]
        "vld4.32 {d0[1], d2[1], d4[1], d6[1]}, [%1]!    \n\t" // DST->M[m1, m5, m9, m13] = M[m4-m7]
        "vld4.32 {d1[0], d3[0], d5[0], d7[0]}, [%1]!    \n\t" // DST->M[m2, m6, m10, m14] = M[m8-m11]
        "vld4.32 {d1[1], d3[1], d5[1], d7[1]}, [%1]     \n\t" // DST->M[m3, m7, m11, m15] = M[m12-m15]

        "vst1.32 {q0-q1}, [%0]!                         \n\t" // DST->M[m0-m7]
        "vst1.32 {q2-q3}, [%0]                          \n\t" // DST->M[m8-m15]
        : "+r"(dst), "+r"(m)
        :
        : "q0", "q1", "q2", "q3", "memory"
    );
}
inline void MathUtil::transformVec4(const float* m, float x, float y, float z, float w, float* dst)
/**
 * NEON: transforms the vector (x, y, z, w) by the 4x4 matrix m.
 * Only the first three components of the result are stored to dst.
 *
 * @param m          Matrix stored as 16 consecutive floats.
 * @param x, y, z, w Components of the vector to transform.
 * @param dst        Receives dst[0], dst[1] and dst[2].
 */
inline void MathUtilNeon::transformVec4(const float* m, float x, float y, float z, float w, float* dst)
{
    // dst and m are advanced by the post-increment ("!") addressing, so they
    // are read-write ("+r") operands: input-only operands must not be modified.
    // Named operands are used because making m an output changes its position.
    asm volatile(
        "vld1.32 {d0[0]}, [%[px]]       \n\t" // V[x]
        "vld1.32 {d0[1]}, [%[py]]       \n\t" // V[y]
        "vld1.32 {d1[0]}, [%[pz]]       \n\t" // V[z]
        "vld1.32 {d1[1]}, [%[pw]]       \n\t" // V[w]
        "vld1.32 {d18 - d21}, [%[m]]!   \n\t" // M[m0-m7]
        "vld1.32 {d22 - d25}, [%[m]]    \n\t" // M[m8-m15]

        "vmul.f32 q13, q9, d0[0]        \n\t" // DST->V = M[m0-m3] * V[x]
        "vmla.f32 q13, q10, d0[1]       \n\t" // DST->V += M[m4-m7] * V[y]
        "vmla.f32 q13, q11, d1[0]       \n\t" // DST->V += M[m8-m11] * V[z]
        "vmla.f32 q13, q12, d1[1]       \n\t" // DST->V += M[m12-m15] * V[w]

        "vst1.32 {d26}, [%[dst]]!       \n\t" // DST->V[x, y]
        "vst1.32 {d27[0]}, [%[dst]]     \n\t" // DST->V[z]
        : [dst] "+r"(dst), [m] "+r"(m)
        : [px] "r"(&x), [py] "r"(&y), [pz] "r"(&z), [pw] "r"(&w)
        : "q0", "q9", "q10", "q11", "q12", "q13", "memory"
    );
}
inline void MathUtil::transformVec4(const float* m, const float* v, float* dst)
/**
 * NEON: transforms the 4-component vector v by the 4x4 matrix m.
 * v is fully loaded before dst is written.
 *
 * @param m   Matrix stored as 16 consecutive floats.
 * @param v   Vector to transform (4 floats).
 * @param dst Receives the 4 result components.
 */
inline void MathUtilNeon::transformVec4(const float* m, const float* v, float* dst)
{
    // m is advanced by the post-increment ("!") addressing, so it is a
    // read-write ("+r") operand: input-only operands must not be modified.
    // Named operands are used because making m an output changes its position.
    asm volatile(
        "vld1.32 {d0, d1}, [%[v]]       \n\t" // V[x, y, z, w]
        "vld1.32 {d18 - d21}, [%[m]]!   \n\t" // M[m0-m7]
        "vld1.32 {d22 - d25}, [%[m]]    \n\t" // M[m8-m15]

        "vmul.f32 q13, q9, d0[0]        \n\t" // DST->V = M[m0-m3] * V[x]
        "vmla.f32 q13, q10, d0[1]       \n\t" // DST->V += M[m4-m7] * V[y]
        "vmla.f32 q13, q11, d1[0]       \n\t" // DST->V += M[m8-m11] * V[z]
        "vmla.f32 q13, q12, d1[1]       \n\t" // DST->V += M[m12-m15] * V[w]

        "vst1.32 {d26, d27}, [%[dst]]   \n\t" // DST->V
        : [m] "+r"(m)
        : [dst] "r"(dst), [v] "r"(v)
        : "q0", "q9", "q10", "q11", "q12", "q13", "memory"
    );
}
inline void MathUtil::crossVec3(const float* v1, const float* v2, float* dst)
/**
 * NEON: cross product of two 3-component vectors.
 *
 *   dst[0] = v1y * v2z - v1z * v2y
 *   dst[1] = v1z * v2x - v1x * v2z
 *   dst[2] = v1x * v2y - v1y * v2x
 *
 * @param v1  First vector (3 floats).
 * @param v2  Second vector (3 floats).
 * @param dst Receives the 3 result components.
 */
inline void MathUtilNeon::crossVec3(const float* v1, const float* v2, float* dst)
{
    // dst is advanced by the post-increment ("!") addressing, so it is a
    // read-write ("+r") operand: input-only operands must not be modified.
    asm volatile(
        "vld1.32 {d1[1]}, [%1]      \n\t" // s3 = v1x
        "vld1.32 {d0}, [%2]         \n\t" // s0, s1 = v1y, v1z
        "vmov.f32 s2, s1            \n\t" // q0 = (v1y, v1z, v1z, v1x)

        "vld1.32 {d2[1]}, [%3]      \n\t" // s5 = v2x
        "vld1.32 {d3}, [%4]         \n\t" // s6, s7 = v2y, v2z
        "vmov.f32 s4, s7            \n\t" // q1 = (v2z, v2x, v2y, v2z)

        "vmul.f32 d4, d0, d2        \n\t" // x = v1y * v2z, y = v1z * v2x
        "vmls.f32 d4, d1, d3        \n\t" // x -= v1z * v2y, y -= v1x * v2z

        "vmul.f32 d5, d3, d1[1]     \n\t" // z = v1x * v2y
        "vmls.f32 d5, d0, d2[1]     \n\t" // z -= v1y * v2x

        "vst1.32 {d4}, [%0]!        \n\t" // V[x, y]
        "vst1.32 {d5[0]}, [%0]      \n\t" // V[z]
        : "+r"(dst)
        : "r"(v1), "r"((v1 + 1)), "r"(v2), "r"((v2 + 1))
        : "q0", "q1", "q2", "memory"
    );
}
NS_CC_MATH_END

View File

@ -0,0 +1,218 @@
/**
Copyright 2013 BlackBerry Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Original file from GamePlay3D: http://gameplay3d.org
This file was modified to fit the cocos2d-x project
*/
NS_CC_MATH_BEGIN
// Static 4x4-matrix and vector helpers named for the 64-bit NEON code path.
// Every matrix argument is a flat array of 16 floats. The definitions that
// follow this class in the file are written in plain scalar C++ (no NEON
// instructions appear in them).
class MathUtilNeon64
{
public:
// dst[i] = m[i] + scalar for each of the 16 elements.
inline static void addMatrix(const float* m, float scalar, float* dst);
// dst[i] = m1[i] + m2[i] for each of the 16 elements.
inline static void addMatrix(const float* m1, const float* m2, float* dst);
// dst[i] = m1[i] - m2[i] for each of the 16 elements.
inline static void subtractMatrix(const float* m1, const float* m2, float* dst);
// dst[i] = m[i] * scalar for each of the 16 elements.
inline static void multiplyMatrix(const float* m, float scalar, float* dst);
// 4x4 product: dst[c*4 + r] = sum over k of m1[k*4 + r] * m2[c*4 + k].
// The result is built in a temporary, so dst may be the same array as m1 or m2.
inline static void multiplyMatrix(const float* m1, const float* m2, float* dst);
// dst[i] = -m[i] for each of the 16 elements.
inline static void negateMatrix(const float* m, float* dst);
// dst[r*4 + c] = m[c*4 + r]; built in a temporary before being copied to dst.
inline static void transposeMatrix(const float* m, float* dst);
// Transforms (x, y, z, w) by m. Only dst[0], dst[1] and dst[2] are written;
// the fourth component is not computed.
inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);
// Transforms the 4-float vector v by m and writes all four components to dst.
// All components are computed before any store, so v may be the same array as dst.
inline static void transformVec4(const float* m, const float* v, float* dst);
// dst = v1 x v2 for 3-float vectors; computed fully before dst is written.
inline static void crossVec3(const float* v1, const float* v2, float* dst);
};
/**
 * Adds a scalar to every element of a 16-float matrix.
 *
 * @param m      Source matrix (16 floats).
 * @param scalar Value added to each element.
 * @param dst    Receives the 16 sums, written in ascending index order.
 */
inline void MathUtilNeon64::addMatrix(const float* m, float scalar, float* dst)
{
    for (int i = 0; i < 16; ++i)
    {
        dst[i] = m[i] + scalar;
    }
}
/**
 * Element-wise sum of two 16-float matrices.
 *
 * @param m1  Left operand (16 floats).
 * @param m2  Right operand (16 floats).
 * @param dst Receives m1[i] + m2[i], written in ascending index order.
 */
inline void MathUtilNeon64::addMatrix(const float* m1, const float* m2, float* dst)
{
    for (int i = 0; i < 16; ++i)
    {
        dst[i] = m1[i] + m2[i];
    }
}
/**
 * Element-wise difference of two 16-float matrices.
 *
 * @param m1  Minuend (16 floats).
 * @param m2  Subtrahend (16 floats).
 * @param dst Receives m1[i] - m2[i], written in ascending index order.
 */
inline void MathUtilNeon64::subtractMatrix(const float* m1, const float* m2, float* dst)
{
    for (int i = 0; i < 16; ++i)
    {
        dst[i] = m1[i] - m2[i];
    }
}
/**
 * Scales every element of a 16-float matrix by a scalar.
 *
 * @param m      Source matrix (16 floats).
 * @param scalar Factor applied to each element.
 * @param dst    Receives the 16 products, written in ascending index order.
 */
inline void MathUtilNeon64::multiplyMatrix(const float* m, float scalar, float* dst)
{
    for (int i = 0; i < 16; ++i)
    {
        dst[i] = m[i] * scalar;
    }
}
/**
 * Multiplies two 4x4 matrices stored as flat 16-float arrays:
 * dst[c*4 + r] = sum over k of m1[k*4 + r] * m2[c*4 + k].
 *
 * @param m1  Left operand (16 floats).
 * @param m2  Right operand (16 floats).
 * @param dst Receives the product; may be the same array as m1 or m2.
 */
inline void MathUtilNeon64::multiplyMatrix(const float* m1, const float* m2, float* dst)
{
    // Build the result in a scratch buffer so that dst may alias an input.
    float product[16];
    for (int col = 0; col < 4; ++col)
    {
        const float* rhs = m2 + col * 4;
        for (int row = 0; row < 4; ++row)
        {
            // Terms are summed left to right, in k = 0..3 order.
            product[col * 4 + row] = m1[row]      * rhs[0]
                                   + m1[row + 4]  * rhs[1]
                                   + m1[row + 8]  * rhs[2]
                                   + m1[row + 12] * rhs[3];
        }
    }
    memcpy(dst, product, MATRIX_SIZE);
}
/**
 * Negates every element of a 16-float matrix.
 *
 * @param m   Source matrix (16 floats).
 * @param dst Receives -m[i], written in ascending index order.
 */
inline void MathUtilNeon64::negateMatrix(const float* m, float* dst)
{
    for (int i = 0; i < 16; ++i)
    {
        dst[i] = -m[i];
    }
}
/**
 * Transposes a 4x4 matrix stored as a flat 16-float array:
 * dst[r*4 + c] = m[c*4 + r].
 *
 * @param m   Source matrix (16 floats).
 * @param dst Receives the transpose; the result is assembled in a temporary
 *            first, so every element of m is read before dst is written.
 */
inline void MathUtilNeon64::transposeMatrix(const float* m, float* dst)
{
    float t[16];
    for (int r = 0; r < 4; ++r)
    {
        for (int c = 0; c < 4; ++c)
        {
            t[r * 4 + c] = m[c * 4 + r];
        }
    }
    memcpy(dst, t, MATRIX_SIZE);
}
/**
 * Transforms the vector (x, y, z, w) by the 16-float matrix m and stores the
 * first three components of the result.
 *
 * @param m   Matrix (16 floats).
 * @param x   First vector component.
 * @param y   Second vector component.
 * @param z   Third vector component.
 * @param w   Fourth vector component.
 * @param dst Receives three floats; dst[3] is never written.
 */
inline void MathUtilNeon64::transformVec4(const float* m, float x, float y, float z, float w, float* dst)
{
    // Only three output components are produced.
    for (int i = 0; i < 3; ++i)
    {
        dst[i] = x * m[i] + y * m[i + 4] + z * m[i + 8] + w * m[i + 12];
    }
}
/**
 * Transforms the 4-float vector v by the 16-float matrix m.
 *
 * @param m   Matrix (16 floats).
 * @param v   Input vector (4 floats).
 * @param dst Receives the 4-float result; may be the same array as v.
 */
inline void MathUtilNeon64::transformVec4(const float* m, const float* v, float* dst)
{
    // Compute every component before storing, so v == dst is safe.
    float out[4];
    for (int i = 0; i < 4; ++i)
    {
        out[i] = v[0] * m[i] + v[1] * m[i + 4] + v[2] * m[i + 8] + v[3] * m[i + 12];
    }
    for (int i = 0; i < 4; ++i)
    {
        dst[i] = out[i];
    }
}
/**
 * Cross product of two 3-float vectors: dst = v1 x v2.
 *
 * @param v1  Left operand (3 floats).
 * @param v2  Right operand (3 floats).
 * @param dst Receives the 3-float result; both inputs are fully read before
 *            dst is written.
 */
inline void MathUtilNeon64::crossVec3(const float* v1, const float* v2, float* dst)
{
    // Snapshot the inputs first so that dst may alias v1 or v2.
    const float ax = v1[0], ay = v1[1], az = v1[2];
    const float bx = v2[0], by = v2[1], bz = v2[2];

    const float cx = (ay * bz) - (az * by);
    const float cy = (az * bx) - (ax * bz);
    const float cz = (ax * by) - (ay * bx);

    dst[0] = cx;
    dst[1] = cy;
    dst[2] = cz;
}
NS_CC_MATH_END

View File

@ -1,6 +1,8 @@
NS_CC_MATH_BEGIN
inline void MathUtil::addMatrix(const __m128 m[4], float scalar, __m128 dst[4])
#ifdef __SSE__
void MathUtil::addMatrix(const __m128 m[4], float scalar, __m128 dst[4])
{
__m128 s = _mm_set1_ps(scalar);
dst[0] = _mm_add_ps(m[0], s);
@ -9,7 +11,7 @@ inline void MathUtil::addMatrix(const __m128 m[4], float scalar, __m128 dst[4])
dst[3] = _mm_add_ps(m[3], s);
}
inline void MathUtil::addMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4])
void MathUtil::addMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4])
{
dst[0] = _mm_add_ps(m1[0], m2[0]);
dst[1] = _mm_add_ps(m1[1], m2[1]);
@ -17,7 +19,7 @@ inline void MathUtil::addMatrix(const __m128 m1[4], const __m128 m2[4], __m128 d
dst[3] = _mm_add_ps(m1[3], m2[3]);
}
inline void MathUtil::subtractMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4])
void MathUtil::subtractMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4])
{
dst[0] = _mm_sub_ps(m1[0], m2[0]);
dst[1] = _mm_sub_ps(m1[1], m2[1]);
@ -25,7 +27,7 @@ inline void MathUtil::subtractMatrix(const __m128 m1[4], const __m128 m2[4], __m
dst[3] = _mm_sub_ps(m1[3], m2[3]);
}
inline void MathUtil::multiplyMatrix(const __m128 m[4], float scalar, __m128 dst[4])
void MathUtil::multiplyMatrix(const __m128 m[4], float scalar, __m128 dst[4])
{
__m128 s = _mm_set1_ps(scalar);
dst[0] = _mm_mul_ps(m[0], s);
@ -34,7 +36,7 @@ inline void MathUtil::multiplyMatrix(const __m128 m[4], float scalar, __m128 dst
dst[3] = _mm_mul_ps(m[3], s);
}
inline void MathUtil::multiplyMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4])
void MathUtil::multiplyMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4])
{
__m128 dst0, dst1, dst2, dst3;
{
@ -114,7 +116,7 @@ inline void MathUtil::multiplyMatrix(const __m128 m1[4], const __m128 m2[4], __m
dst[3] = dst3;
}
inline void MathUtil::negateMatrix(const __m128 m[4], __m128 dst[4])
void MathUtil::negateMatrix(const __m128 m[4], __m128 dst[4])
{
__m128 z = _mm_setzero_ps();
dst[0] = _mm_sub_ps(z, m[0]);
@ -123,7 +125,7 @@ inline void MathUtil::negateMatrix(const __m128 m[4], __m128 dst[4])
dst[3] = _mm_sub_ps(z, m[3]);
}
inline void MathUtil::transposeMatrix(const __m128 m[4], __m128 dst[4])
void MathUtil::transposeMatrix(const __m128 m[4], __m128 dst[4])
{
__m128 tmp0 = _mm_shuffle_ps(m[0], m[1], 0x44);
__m128 tmp2 = _mm_shuffle_ps(m[0], m[1], 0xEE);
@ -136,7 +138,7 @@ inline void MathUtil::transposeMatrix(const __m128 m[4], __m128 dst[4])
dst[3] = _mm_shuffle_ps(tmp2, tmp3, 0xDD);
}
inline void MathUtil::transformVec4(const __m128 m[4], const __m128& v, __m128& dst)
void MathUtil::transformVec4(const __m128 m[4], const __m128& v, __m128& dst)
{
__m128 col1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
__m128 col2 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
@ -144,9 +146,12 @@ inline void MathUtil::transformVec4(const __m128 m[4], const __m128& v, __m128&
__m128 col4 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
dst = _mm_add_ps(
_mm_add_ps(_mm_mul_ps(m[0], col1), _mm_mul_ps(m[1], col2)),
_mm_add_ps(_mm_mul_ps(m[2], col3), _mm_mul_ps(m[3], col4))
);
_mm_add_ps(_mm_mul_ps(m[0], col1), _mm_mul_ps(m[1], col2)),
_mm_add_ps(_mm_mul_ps(m[2], col3), _mm_mul_ps(m[3], col4))
);
}
#endif
NS_CC_MATH_END