Refactor math simd (#2070)

This commit is contained in:
halx99 2024-08-04 09:46:36 +08:00 committed by GitHub
parent 695ccc0357
commit 8fd2a551e8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
26 changed files with 1293 additions and 1591 deletions

View File

@ -895,6 +895,7 @@ function setup_cmake($skipOS = $false, $scope = 'local') {
else {
& "$cmake_pkg_path" '--skip-license' '--prefix=/usr/local' 1>$null 2>$null
}
if (!$?) { Remove-Item $cmake_pkg_path -Force }
}
$cmake_prog, $_ = find_prog -name 'cmake' -path $cmake_bin -silent $true

View File

@ -20,10 +20,16 @@ function(_1kfetch_init)
set(_1kfetch_manifest "${_1kfetch_manifest}" CACHE STRING "" FORCE)
endif()
if(NOT EXISTS ${PWSH_PROG}) # try again
unset(PWSH_PROG CACHE)
find_program(PWSH_PROG NAMES pwsh powershell NO_PACKAGE_ROOT_PATH NO_CMAKE_PATH NO_CMAKE_ENVIRONMENT_PATH NO_CMAKE_SYSTEM_PATH NO_CMAKE_FIND_ROOT_PATH)
endif()
execute_process(COMMAND ${PWSH_PROG} ${CMAKE_CURRENT_FUNCTION_LIST_DIR}/resolv-uri.ps1
-name "1kdist"
-manifest ${_1kfetch_manifest}
OUTPUT_VARIABLE _1kdist_url
RESULT_VARIABLE _1kdist_error
)
if(_1kdist_url)
@ -33,7 +39,7 @@ function(_1kfetch_init)
set(_1kdist_base_url "${_1kdist_base_url}/${_1kdist_ver}" PARENT_SCOPE)
set(_1kdist_ver ${_1kdist_ver} PARENT_SCOPE)
else()
message(WARNING "Resolve 1kdist uri fail, the _1kfetch_dist will not work")
message(WARNING "Resolve 1kdist uri fail, ${_1kdist_error}, the _1kfetch_dist will not work")
endif()
endfunction()

2
3rdparty/README.md vendored
View File

@ -248,7 +248,7 @@
## yasio
- [![Upstream](https://img.shields.io/github/v/release/yasio/yasio?label=Upstream)](https://github.com/yasio/yasio)
- Version: 4.2.3
- Version: 4.2.4
- License: MIT WITH Anti-996
## zlib

View File

@ -60,14 +60,14 @@ YASIO_NI_API void yasio_init_globals(void(YASIO_INTEROP_DECL* pfn)(int level, co
YASIO_NI_API void yasio_cleanup_globals() { io_service::cleanup_globals(); }
struct yasio_io_event {
int kind; //
int channel;
void* thandle;
int kind; // event kind
int channel; // channel index
void* thandle; // transport
union {
void* msg;
int status; //
void* hmsg; // io_packet*
int ec; // error code
};
void* user;
void* user; // user data
};
YASIO_NI_API void* yasio_create_service(int channel_count, void(YASIO_INTEROP_DECL* event_cb)(yasio_io_event* event), void* user)
@ -82,9 +82,9 @@ YASIO_NI_API void* yasio_create_service(int channel_count, void(YASIO_INTEROP_DE
event.thandle = e->transport();
event.user = user;
if (event.kind == yasio::YEK_ON_PACKET)
event.msg = !is_packet_empty(pkt) ? &pkt : nullptr;
event.hmsg = !is_packet_empty(pkt) ? &pkt : nullptr;
else
event.status = e->status();
event.ec = e->status();
event_cb(&event);
});
return service;
@ -157,8 +157,12 @@ YASIO_NI_API void yasio_set_option(void* service_ptr, int opt, const char* pszAr
std::array<cxx17::string_view, YASIO_MAX_OPTION_ARGC> args;
int argc = 0;
yasio::split_if(&strArgs.front(), ';', [&](char* s, char* e) {
*e = '\0'; // to c style string
args[argc++] = cxx17::string_view(s, e - s);
if (e) {
*e = '\0'; // to c style string
args[argc++] = cxx17::string_view(s, e - s);
} else {
args[argc++] = cxx17::string_view{s};
}
return (argc < YASIO_MAX_OPTION_ARGC);
});

View File

@ -205,7 +205,7 @@ SOFTWARE.
/*
** The yasio version macros
*/
#define YASIO_VERSION_NUM 0x040203
#define YASIO_VERSION_NUM 0x040204
/*
** The macros used by io_service.

View File

@ -20,11 +20,11 @@
#include <sys/stat.h>
#include <sys/types.h>
#include <fcntl.h>
#if defined(__GLIBC__) && (__GLIBC__ == 2 && __GLIBC_MINOR__ < 8)
# include <asm/unistd.h>
#else // __GLIBC__ == 2 && __GLIBC_MINOR__ < 8
#if defined(__GLIBC__) && (__GLIBC__ == 2 && __GLIBC_MINOR__ < 8) && !defined(__UCLIBC__)
# include <asm/unistd.h> // for syscall without API: eventfd
#else
# include <sys/eventfd.h>
#endif // __GLIBC__ == 2 && __GLIBC_MINOR__ < 8
#endif
#include <unistd.h>
@ -105,7 +105,7 @@ private:
// Open the descriptors. Throws on error.
inline void open_descriptors()
{
#if defined(__GLIBC__) && (__GLIBC__ == 2 && __GLIBC_MINOR__ < 8)
#if defined(__GLIBC__) && (__GLIBC__ == 2 && __GLIBC_MINOR__ < 8) && !defined(__UCLIBC__)
write_descriptor_ = read_descriptor_ = syscall(__NR_eventfd, 0);
if (read_descriptor_ != -1)
{

View File

@ -40,6 +40,9 @@ inline void yasio__print(std::string&& message) { ::write(::fileno(stdout), mess
# include <android/log.h>
# include <jni.h>
# define YASIO_LOG_TAG(tag, format, ...) __android_log_print(ANDROID_LOG_INFO, "yasio", (tag format), ##__VA_ARGS__)
#elif defined(__OHOS__)
# include <hilog/log.h>
# define YASIO_LOG_TAG(tag, format, ...) OH_LOG_INFO(LOG_APP, (tag format "\n"), ##__VA_ARGS__)
#else
# define YASIO_LOG_TAG(tag, format, ...) printf((tag format "\n"), ##__VA_ARGS__)
#endif

View File

@ -209,7 +209,7 @@ int xxsocket::pserve(const endpoint& ep)
if (!this->reopen(ep.af()))
return -1;
set_optval(SOL_SOCKET, SO_REUSEADDR, 1);
this->reuse_address(true);
int n = this->bind(ep);
if (n != 0)

View File

@ -52,6 +52,7 @@ default is: `navigator.hardwareConcurrency`
- AX_WASM_SHELL_FILE: specify the wasm shell file, by default use `${_AX_ROOT}/core/platform/wasm/shell_minimal.html`
- AX_WASM_ENABLE_DEVTOOLS: whether enable web devtools aka `pause`, `resume`, `step` buttons in webpage, default: `TRUE`
- AX_WASM_INITIAL_MEMORY: set the wasm initial memory size, default `1024MB`
- AX_WASM_ISA_SIMD: specify the wasm simd intrinsics type, default `none`, supports `sse`, `neon`; note that `wasm-simd` is not supported by axmol yet
## The options for axmol apps

View File

@ -186,22 +186,21 @@ endfunction()
if(EMSCRIPTEN)
set(AX_WASM_THREADS "4" CACHE STRING "Wasm threads count")
set(_AX_WASM_THREADS_INT 0)
set(_threads_hint "")
if (AX_WASM_THREADS STREQUAL "auto") # not empty string or not 0
# Enable pthread support globally
set(_threads_hint "(auto)")
include(ProcessorCount)
set(_AX_WASM_THREADS_INT 0)
ProcessorCount(_AX_WASM_THREADS_INT)
elseif(AX_WASM_THREADS MATCHES "^([0-9]+)$" OR AX_WASM_THREADS STREQUAL "navigator.hardwareConcurrency")
set(_AX_WASM_THREADS_INT ${AX_WASM_THREADS})
set(AX_WASM_THREADS "${_AX_WASM_THREADS_INT}" CACHE STRING "Wasm threads count" FORCE)
endif()
message(STATUS "AX_WASM_THREADS=${AX_WASM_THREADS}")
message(STATUS "_AX_WASM_THREADS_INT=${_AX_WASM_THREADS_INT}")
message(STATUS "AX_WASM_THREADS=${AX_WASM_THREADS}${_threads_hint}")
if (_AX_WASM_THREADS_INT)
if(AX_WASM_THREADS MATCHES "^([0-9]+)$" OR AX_WASM_THREADS STREQUAL "navigator.hardwareConcurrency")
list(APPEND _ax_compile_options -pthread)
add_link_options(-pthread -sPTHREAD_POOL_SIZE=${_AX_WASM_THREADS_INT})
add_link_options(-pthread -sPTHREAD_POOL_SIZE=${AX_WASM_THREADS})
endif()
set(AX_WASM_INITIAL_MEMORY "1024MB" CACHE STRING "")

View File

@ -400,9 +400,43 @@ if(WINDOWS)
endif()
endif()
# AX_USE_SSE
if (AX_ISA_SIMD MATCHES "sse")
target_compile_definitions(${_AX_CORE_LIB} PUBLIC AX_USE_SSE=1)
# axmol math simd intrinsics support
set(_simdc_defines)
set(_simdc_options)
if (NOT WASM) # native platforms auto detect from cmake or preprocessor check
if (AX_ISA_SIMD MATCHES "sse")
list(APPEND _simdc_defines AX_SSE_INTRINSICS=1)
if (AX_ISA_SIMD MATCHES "sse4")
list(APPEND _simdc_defines __SSE4_1__=1)
if (LINUX)
list(APPEND _simdc_options -msse4.1)
endif()
endif()
endif()
else() # wasm requires user specify SIMD intrinsics manually
set(AX_WASM_ISA_SIMD "none" CACHE STRING "")
string(TOLOWER ${AX_WASM_ISA_SIMD} AX_WASM_ISA_SIMD)
if(AX_WASM_ISA_SIMD MATCHES "sse")
message(AUTHOR_WARNING "Using SSE intrinsics for WASM ...")
list(APPEND _simdc_defines AX_SSE_INTRINSICS=1 __SSE__=1 __SSE2__=1)
list(APPEND _simdc_options -msse -msse2)
if(AX_ISA_LEVEL GREATER_EQUAL 2)
list(APPEND _simdc_defines __SSE4_1__=1)
list(APPEND _simdc_options -msse4.1)
endif()
list(APPEND _simdc_options -msimd128)
elseif(AX_WASM_ISA_SIMD MATCHES "neon")
message(AUTHOR_WARNING "Using NEON intrinsics for WASM ...")
list(APPEND _simdc_defines AX_NEON_INTRINSICS=1)
list(APPEND _simdc_options -mfpu=neon -msimd128)
endif()
endif()
if(_simdc_defines)
target_compile_definitions(${_AX_CORE_LIB} PUBLIC ${_simdc_defines})
if(_simdc_options)
target_compile_options(${_AX_CORE_LIB} PUBLIC ${_simdc_options})
endif()
endif()
# engine extensions

View File

@ -71,7 +71,7 @@ bool Configuration::init()
#if AX_ENABLE_PROFILERS
_valueDict["axmol.compiled_with_profiler"] = Value(true);
#else
_valueDict["axmol.compiled_with_profiler"] = Value(false);
_valueDict["axmol.compiled_with_profiler"] = Value(false);
#endif
#if AX_ENABLE_GL_STATE_CACHE == 0
@ -83,7 +83,17 @@ bool Configuration::init()
#if _AX_DEBUG
_valueDict["axmol.build_type"] = Value("DEBUG");
#else
_valueDict["axmol.build_type"] = Value("RELEASE");
_valueDict["axmol.build_type"] = Value("RELEASE");
#endif
#if defined(AX_SSE_INTRINSICS)
# if defined(__SSE4_1__)
_valueDict["axmol.simd"] = Value("SSE41");
# else
_valueDict["axmol.simd"] = Value("SSE2");
# endif
#elif defined(AX_NEON_INTRINSICS)
_valueDict["axmol.simd"] = Value("NEON");
#endif
return true;

View File

@ -398,7 +398,7 @@ bool Console::listenOnTCP(int port)
if (sock.pserve(ep) != 0)
{
int ec = xxsocket::get_last_errno();
AXLOGW("Console: open server failed, ec:{}", ec);
AXLOGW("Console: open server failed, ec:{}, {}", ec, xxsocket::strerror(ec));
return false;
}

View File

@ -17,7 +17,7 @@
Original file from GamePlay3D: http://gameplay3d.org
This file was modified to fit the cocos2d-x project
This file was modified to fit the axmol project
*/
#include "math/Mat4.h"
@ -459,11 +459,7 @@ void Mat4::add(float scalar)
void Mat4::add(float scalar, Mat4* dst)
{
GP_ASSERT(dst);
#ifdef AX_USE_SSE
MathUtil::addMatrix(col, scalar, dst->col);
#else
MathUtil::addMatrix(m, scalar, dst->m);
#endif
}
void Mat4::add(const Mat4& mat)
@ -474,11 +470,7 @@ void Mat4::add(const Mat4& mat)
void Mat4::add(const Mat4& m1, const Mat4& m2, Mat4* dst)
{
GP_ASSERT(dst);
#ifdef AX_USE_SSE
MathUtil::addMatrix(m1.col, m2.col, dst->col);
#else
MathUtil::addMatrix(m1.m, m2.m, dst->m);
#endif
}
bool Mat4::decompose(Vec3* scale, Quaternion* rotation, Vec3* translation) const
@ -751,11 +743,7 @@ void Mat4::multiply(float scalar, Mat4* dst) const
void Mat4::multiply(const Mat4& m, float scalar, Mat4* dst)
{
GP_ASSERT(dst);
#ifdef AX_USE_SSE
MathUtil::multiplyMatrix(m.col, scalar, dst->col);
#else
MathUtil::multiplyMatrix(m.m, scalar, dst->m);
#endif
}
void Mat4::multiply(const Mat4& mat)
@ -766,20 +754,12 @@ void Mat4::multiply(const Mat4& mat)
void Mat4::multiply(const Mat4& m1, const Mat4& m2, Mat4* dst)
{
GP_ASSERT(dst);
#ifdef AX_USE_SSE
MathUtil::multiplyMatrix(m1.col, m2.col, dst->col);
#else
MathUtil::multiplyMatrix(m1.m, m2.m, dst->m);
#endif
}
void Mat4::negate()
{
#ifdef AX_USE_SSE
MathUtil::negateMatrix(col, col);
#else
MathUtil::negateMatrix(m, m);
#endif
}
Mat4 Mat4::getNegated() const
@ -945,11 +925,7 @@ void Mat4::subtract(const Mat4& mat)
void Mat4::subtract(const Mat4& m1, const Mat4& m2, Mat4* dst)
{
GP_ASSERT(dst);
#ifdef AX_USE_SSE
MathUtil::subtractMatrix(m1.col, m2.col, dst->col);
#else
MathUtil::subtractMatrix(m1.m, m2.m, dst->m);
#endif
}
void Mat4::transformVector(Vec3* vector) const
@ -967,7 +943,7 @@ void Mat4::transformVector(float x, float y, float z, float w, Vec3* dst) const
{
GP_ASSERT(dst);
MathUtil::transformVec4(m, x, y, z, w, (float*)dst);
MathUtil::transformVec4(m, x, y, z, w, reinterpret_cast<float*>(dst));
}
void Mat4::transformVector(Vec4* vector) const
@ -979,14 +955,7 @@ void Mat4::transformVector(Vec4* vector) const
void Mat4::transformVector(const Vec4& vector, Vec4* dst) const
{
GP_ASSERT(dst);
#ifdef AX_USE_SSE
alignas(16) Vec4 inVal{vector};
alignas(16) Vec4 outVal;
MathUtil::transformVec4(col, reinterpret_cast<const __m128&>(inVal), reinterpret_cast<__m128&>(outVal));
*dst = outVal;
#else
MathUtil::transformVec4(m, (const float*)&vector, (float*)dst);
#endif
MathUtil::transformVec4(m, reinterpret_cast<const float*>(&vector), reinterpret_cast<float*>(dst));
}
void Mat4::translate(float x, float y, float z)
@ -1013,11 +982,7 @@ void Mat4::translate(const Vec3& t, Mat4* dst) const
void Mat4::transpose()
{
#ifdef AX_USE_SSE
MathUtil::transposeMatrix(col, col);
#else
MathUtil::transposeMatrix(m, m);
#endif
}
Mat4 Mat4::getTransposed() const

View File

@ -18,7 +18,7 @@
Original file from GamePlay3D: http://gameplay3d.org
This file was modified to fit the cocos2d-x project
This file was modified to fit the axmol project
*/
#ifndef MATH_MAT4_H
@ -29,10 +29,6 @@
#include "math/Vec3.h"
#include "math/Vec4.h"
#ifdef AX_USE_SSE
# include <xmmintrin.h>
#endif
/**
* @addtogroup base
* @{
@ -73,7 +69,7 @@ NS_AX_MATH_BEGIN
*
* @see Transform
*/
#ifdef AX_USE_SSE
#if defined(AX_SSE_INTRINSICS) || defined(AX_NEON_INTRINSICS)
class AX_DLL alignas(16) Mat4
#else
class AX_DLL Mat4
@ -95,10 +91,10 @@ public:
/**
* Stores the columns of this 4x4 matrix.
* */
#ifdef AX_USE_SSE
#if defined(AX_SSE_INTRINSICS) || defined(AX_NEON_INTRINSICS)
union
{
__m128 col[4];
_xm128_t col[4];
float m[16];
};
#else

View File

@ -1,5 +1,6 @@
/**
Copyright 2013 BlackBerry Inc.
Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -15,7 +16,7 @@
Original file from GamePlay3D: http://gameplay3d.org
This file was modified to fit the cocos2d-x project
This file was modified to fit the axmol project
*/
#include "math/Mat4.h"

View File

@ -1,5 +1,6 @@
/****************************************************************************
Copyright (c) 2017-2018 Xiamen Yaji Software Co., Ltd.
Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
https://axmol.dev/
@ -22,46 +23,47 @@
THE SOFTWARE.
****************************************************************************/
#ifndef __CCMATHBASE_H__
#define __CCMATHBASE_H__
#ifndef __AXMATHBASE_H__
#define __AXMATHBASE_H__
#include <memory>
#include <string.h>
#include "platform/PlatformMacros.h"
/**
* @addtogroup base
* @{
*/
/**Util macro for conversion from degrees to radians.*/
#define MATH_DEG_TO_RAD(x) ((x)*0.0174532925f)
#define MATH_DEG_TO_RAD(x) ((x) * 0.0174532925f)
/**Util macro for conversion from radians to degrees.*/
#define MATH_RAD_TO_DEG(x) ((x)*57.29577951f)
#define MATH_RAD_TO_DEG(x) ((x) * 57.29577951f)
/**
@{ Util macro for const float such as epsilon, small float and float precision tolerance.
*/
#define MATH_FLOAT_SMALL 1.0e-37f
#define MATH_TOLERANCE 2e-37f
#define MATH_PIOVER2 1.57079632679489661923f
#define MATH_EPSILON 0.000001f
#define MATH_TOLERANCE 2e-37f
#define MATH_PIOVER2 1.57079632679489661923f
#define MATH_EPSILON 0.000001f
/**@}*/
//#define MATH_PIOVER4 0.785398163397448309616f
//#define MATH_PIX2 6.28318530717958647693f
//#define MATH_E 2.71828182845904523536f
//#define MATH_LOG10E 0.4342944819032518f
//#define MATH_LOG2E 1.442695040888963387f
//#define MATH_PI 3.14159265358979323846f
//#define MATH_RANDOM_MINUS1_1() ((2.0f*((float)rand()/RAND_MAX))-1.0f) // Returns a random float between -1
// and 1. #define MATH_RANDOM_0_1() ((float)rand()/RAND_MAX) // Returns a random float
// between 0 and 1. #define MATH_CLAMP(x, lo, hi) ((x < lo) ? lo : ((x > hi) ? hi : x)) #ifndef M_1_PI #define
// M_1_PI 0.31830988618379067154
// #define MATH_PIOVER4 0.785398163397448309616f
// #define MATH_PIX2 6.28318530717958647693f
// #define MATH_E 2.71828182845904523536f
// #define MATH_LOG10E 0.4342944819032518f
// #define MATH_LOG2E 1.442695040888963387f
// #define MATH_PI 3.14159265358979323846f
// #define MATH_RANDOM_MINUS1_1() ((2.0f*((float)rand()/RAND_MAX))-1.0f) // Returns a random float between -1
// and 1. #define MATH_RANDOM_0_1() ((float)rand()/RAND_MAX) // Returns a random float
// between 0 and 1. #define MATH_CLAMP(x, lo, hi) ((x < lo) ? lo : ((x > hi) ? hi : x)) #ifndef M_1_PI #define
// M_1_PI 0.31830988618379067154
#ifdef __cplusplus
# define NS_AX_MATH_BEGIN \
namespace ax \
namespace ax \
{
# define NS_AX_MATH_END }
# define NS_AX_MATH_END }
# define USING_NS_AX_MATH using namespace ax
#else
# define NS_AX_MATH_BEGIN

View File

@ -17,7 +17,7 @@ limitations under the License.
Original file from GamePlay3D: http://gameplay3d.org
This file was modified to fit the cocos2d-x project
This file was modified to fit the axmol project
*/
#include "math/MathUtil.h"
@ -28,50 +28,10 @@ This file was modified to fit the cocos2d-x project
# include <cpu-features.h>
#endif
//#define USE_NEON32 : neon 32 code will be used
//#define USE_NEON64 : neon 64 code will be used
//#define INCLUDE_NEON32 : neon 32 code included
//#define INCLUDE_NEON64 : neon 64 code included
//#define USE_SSE : SSE code used
//#define INCLUDE_SSE : SSE code included
#if (AX_TARGET_PLATFORM == AX_PLATFORM_IOS)
# if defined(__arm64__)
# define USE_NEON64 1
# define INCLUDE_NEON64 1
# elif defined(__ARM_NEON__)
# define USE_NEON32 1
# define INCLUDE_NEON32 1
# endif
#elif (AX_TARGET_PLATFORM == AX_PLATFORM_OSX)
# if defined(__arm64__) || defined(__aarch64__)
# define USE_NEON64 1
# define INCLUDE_NEON64 1
# endif
#elif (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID)
# if defined(__arm64__) || defined(__aarch64__)
# define USE_NEON64 1
# define INCLUDE_NEON64 1
# elif defined(__ARM_NEON__)
# define INCLUDE_NEON32 1
# endif
#endif
#if defined(AX_USE_SSE)
# define USE_SSE 1
# define INCLUDE_SSE 1
#endif
#ifdef INCLUDE_NEON32
# include "math/MathUtilNeon.inl"
#endif
#ifdef INCLUDE_NEON64
# include "math/MathUtilNeon64.inl"
#endif
#ifdef INCLUDE_SSE
#if defined(AX_SSE_INTRINSICS)
# include "math/MathUtilSSE.inl"
#elif defined(AX_NEON_INTRINSICS)
# include "math/MathUtilNeon.inl"
#endif
#include "math/MathUtil.inl"
@ -106,9 +66,8 @@ float MathUtil::lerp(float from, float to, float alpha)
bool MathUtil::isNeon32Enabled()
{
#ifdef USE_NEON32
return true;
#elif (defined(INCLUDE_NEON32) && (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID))
#if defined(AX_NEON_INTRINSICS) && !AX_64BITS
# if AX_NEON_INTRINSICS == 1 && AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID
class AnrdoidNeonChecker
{
public:
@ -127,15 +86,9 @@ bool MathUtil::isNeon32Enabled()
};
static AnrdoidNeonChecker checker;
return checker.isNeonEnabled();
#else
return false;
#endif
}
bool MathUtil::isNeon64Enabled()
{
#ifdef USE_NEON64
# else
return true;
# endif
#else
return false;
#endif
@ -143,15 +96,17 @@ bool MathUtil::isNeon64Enabled()
void MathUtil::addMatrix(const float* m, float scalar, float* dst)
{
#ifdef USE_NEON32
MathUtilNeon::addMatrix(m, scalar, dst);
#elif defined(USE_NEON64)
MathUtilNeon64::addMatrix(m, scalar, dst);
#elif defined(INCLUDE_NEON32)
#if defined(AX_SSE_INTRINSICS)
MathUtilSSE::addMatrix(reinterpret_cast<const _xm128_t*>(m), scalar, reinterpret_cast<_xm128_t*>(dst));
#elif defined(AX_NEON_INTRINSICS)
# if AX_64BITS || AX_NEON_INTRINSICS > 1
MathUtilNeon::addMatrix(reinterpret_cast<const _xm128_t*>(m), scalar, reinterpret_cast<_xm128_t*>(dst));
# else
if (isNeon32Enabled())
MathUtilNeon::addMatrix(m, scalar, dst);
MathUtilNeon::addMatrix(reinterpret_cast<const _xm128_t*>(m), scalar, reinterpret_cast<_xm128_t*>(dst));
else
MathUtilC::addMatrix(m, scalar, dst);
# endif
#else
MathUtilC::addMatrix(m, scalar, dst);
#endif
@ -159,15 +114,20 @@ void MathUtil::addMatrix(const float* m, float scalar, float* dst)
void MathUtil::addMatrix(const float* m1, const float* m2, float* dst)
{
#ifdef USE_NEON32
MathUtilNeon::addMatrix(m1, m2, dst);
#elif defined(USE_NEON64)
MathUtilNeon64::addMatrix(m1, m2, dst);
#elif defined(INCLUDE_NEON32)
#if defined(AX_SSE_INTRINSICS)
MathUtilSSE::addMatrix(reinterpret_cast<const _xm128_t*>(m1), reinterpret_cast<const _xm128_t*>(m2),
reinterpret_cast<_xm128_t*>(dst));
#elif defined(AX_NEON_INTRINSICS)
# if AX_64BITS || AX_NEON_INTRINSICS > 1
MathUtilNeon::addMatrix(reinterpret_cast<const _xm128_t*>(m1), reinterpret_cast<const _xm128_t*>(m2),
reinterpret_cast<_xm128_t*>(dst));
# else
if (isNeon32Enabled())
MathUtilNeon::addMatrix(m1, m2, dst);
MathUtilNeon::addMatrix(reinterpret_cast<const _xm128_t*>(m1), reinterpret_cast<const _xm128_t*>(m2),
reinterpret_cast<_xm128_t*>(dst));
else
MathUtilC::addMatrix(m1, m2, dst);
# endif
#else
MathUtilC::addMatrix(m1, m2, dst);
#endif
@ -175,15 +135,20 @@ void MathUtil::addMatrix(const float* m1, const float* m2, float* dst)
void MathUtil::subtractMatrix(const float* m1, const float* m2, float* dst)
{
#ifdef USE_NEON32
MathUtilNeon::subtractMatrix(m1, m2, dst);
#elif defined(USE_NEON64)
MathUtilNeon64::subtractMatrix(m1, m2, dst);
#elif defined(INCLUDE_NEON32)
#if defined(AX_SSE_INTRINSICS)
MathUtilSSE::subtractMatrix(reinterpret_cast<const _xm128_t*>(m1), reinterpret_cast<const _xm128_t*>(m2),
reinterpret_cast<_xm128_t*>(dst));
#elif defined(AX_NEON_INTRINSICS)
# if AX_64BITS || AX_NEON_INTRINSICS > 1
MathUtilNeon::subtractMatrix(reinterpret_cast<const _xm128_t*>(m1), reinterpret_cast<const _xm128_t*>(m2),
reinterpret_cast<_xm128_t*>(dst));
# else
if (isNeon32Enabled())
MathUtilNeon::subtractMatrix(m1, m2, dst);
MathUtilNeon::subtractMatrix(reinterpret_cast<const _xm128_t*>(m1), reinterpret_cast<const _xm128_t*>(m2),
reinterpret_cast<_xm128_t*>(dst));
else
MathUtilC::subtractMatrix(m1, m2, dst);
# endif
#else
MathUtilC::subtractMatrix(m1, m2, dst);
#endif
@ -191,15 +156,17 @@ void MathUtil::subtractMatrix(const float* m1, const float* m2, float* dst)
void MathUtil::multiplyMatrix(const float* m, float scalar, float* dst)
{
#ifdef USE_NEON32
MathUtilNeon::multiplyMatrix(m, scalar, dst);
#elif defined(USE_NEON64)
MathUtilNeon64::multiplyMatrix(m, scalar, dst);
#elif defined(INCLUDE_NEON32)
#if defined(AX_SSE_INTRINSICS)
MathUtilSSE::multiplyMatrix(reinterpret_cast<const _xm128_t*>(m), scalar, reinterpret_cast<_xm128_t*>(dst));
#elif defined(AX_NEON_INTRINSICS)
# if AX_64BITS || AX_NEON_INTRINSICS > 1
MathUtilNeon::multiplyMatrix(reinterpret_cast<const _xm128_t*>(m), scalar, reinterpret_cast<_xm128_t*>(dst));
# else
if (isNeon32Enabled())
MathUtilNeon::multiplyMatrix(m, scalar, dst);
MathUtilNeon::multiplyMatrix(reinterpret_cast<const _xm128_t*>(m), scalar, reinterpret_cast<_xm128_t*>(dst));
else
MathUtilC::multiplyMatrix(m, scalar, dst);
# endif
#else
MathUtilC::multiplyMatrix(m, scalar, dst);
#endif
@ -207,15 +174,20 @@ void MathUtil::multiplyMatrix(const float* m, float scalar, float* dst)
void MathUtil::multiplyMatrix(const float* m1, const float* m2, float* dst)
{
#ifdef USE_NEON32
MathUtilNeon::multiplyMatrix(m1, m2, dst);
#elif defined(USE_NEON64)
MathUtilNeon64::multiplyMatrix(m1, m2, dst);
#elif defined(INCLUDE_NEON32)
#if defined(AX_SSE_INTRINSICS)
MathUtilSSE::multiplyMatrix(reinterpret_cast<const _xm128_t*>(m1), reinterpret_cast<const _xm128_t*>(m2),
reinterpret_cast<_xm128_t*>(dst));
#elif defined(AX_NEON_INTRINSICS)
# if AX_64BITS || AX_NEON_INTRINSICS > 1
MathUtilNeon::multiplyMatrix(reinterpret_cast<const _xm128_t*>(m1), reinterpret_cast<const _xm128_t*>(m2),
reinterpret_cast<_xm128_t*>(dst));
# else
if (isNeon32Enabled())
MathUtilNeon::multiplyMatrix(m1, m2, dst);
MathUtilNeon::multiplyMatrix(reinterpret_cast<const _xm128_t*>(m1), reinterpret_cast<const _xm128_t*>(m2),
reinterpret_cast<_xm128_t*>(dst));
else
MathUtilC::multiplyMatrix(m1, m2, dst);
# endif
#else
MathUtilC::multiplyMatrix(m1, m2, dst);
#endif
@ -223,15 +195,17 @@ void MathUtil::multiplyMatrix(const float* m1, const float* m2, float* dst)
void MathUtil::negateMatrix(const float* m, float* dst)
{
#ifdef USE_NEON32
MathUtilNeon::negateMatrix(m, dst);
#elif defined(USE_NEON64)
MathUtilNeon64::negateMatrix(m, dst);
#elif defined(INCLUDE_NEON32)
#if defined(AX_SSE_INTRINSICS)
MathUtilSSE::negateMatrix(reinterpret_cast<const _xm128_t*>(m), reinterpret_cast<_xm128_t*>(dst));
#elif defined(AX_NEON_INTRINSICS)
# if AX_64BITS || AX_NEON_INTRINSICS > 1
MathUtilNeon::negateMatrix(reinterpret_cast<const _xm128_t*>(m), reinterpret_cast<_xm128_t*>(dst));
# else
if (isNeon32Enabled())
MathUtilNeon::negateMatrix(m, dst);
MathUtilNeon::negateMatrix(reinterpret_cast<const _xm128_t*>(m), reinterpret_cast<_xm128_t*>(dst));
else
MathUtilC::negateMatrix(m, dst);
# endif
#else
MathUtilC::negateMatrix(m, dst);
#endif
@ -239,47 +213,53 @@ void MathUtil::negateMatrix(const float* m, float* dst)
void MathUtil::transposeMatrix(const float* m, float* dst)
{
#ifdef USE_NEON32
MathUtilNeon::transposeMatrix(m, dst);
#elif defined(USE_NEON64)
MathUtilNeon64::transposeMatrix(m, dst);
#elif defined(INCLUDE_NEON32)
#if defined(AX_SSE_INTRINSICS)
MathUtilSSE::transposeMatrix(reinterpret_cast<const _xm128_t*>(m), reinterpret_cast<_xm128_t*>(dst));
#elif defined(AX_NEON_INTRINSICS)
# if AX_64BITS || AX_NEON_INTRINSICS > 1
MathUtilNeon::transposeMatrix(reinterpret_cast<const _xm128_t*>(m), reinterpret_cast<_xm128_t*>(dst));
# else
if (isNeon32Enabled())
MathUtilNeon::transposeMatrix(m, dst);
MathUtilNeon::transposeMatrix(reinterpret_cast<const _xm128_t*>(m), reinterpret_cast<_xm128_t*>(dst));
else
MathUtilC::transposeMatrix(m, dst);
# endif
#else
MathUtilC::transposeMatrix(m, dst);
#endif
}
void MathUtil::transformVec4(const float* m, float x, float y, float z, float w, float* dst)
void MathUtil::transformVec4(const float* m, float x, float y, float z, float w, float* dst /*vec3*/)
{
#ifdef USE_NEON32
MathUtilNeon::transformVec4(m, x, y, z, w, dst);
#elif defined(USE_NEON64)
MathUtilNeon64::transformVec4(m, x, y, z, w, dst);
#elif defined(INCLUDE_NEON32)
#if defined(AX_SSE_INTRINSICS)
MathUtilSSE::transformVec4(reinterpret_cast<const _xm128_t*>(m), x, y, z, w, dst);
#elif defined(AX_NEON_INTRINSICS)
# if AX_64BITS || AX_NEON_INTRINSICS > 1
MathUtilNeon::transformVec4(reinterpret_cast<const _xm128_t*>(m), x, y, z, w, dst);
# else
if (isNeon32Enabled())
MathUtilNeon::transformVec4(m, x, y, z, w, dst);
MathUtilNeon::transformVec4(reinterpret_cast<const _xm128_t*>(m), x, y, z, w, dst);
else
MathUtilC::transformVec4(m, x, y, z, w, dst);
# endif
#else
MathUtilC::transformVec4(m, x, y, z, w, dst);
#endif
}
void MathUtil::transformVec4(const float* m, const float* v, float* dst)
void MathUtil::transformVec4(const float* m, const float* v, float* dst /*vec4*/)
{
#ifdef USE_NEON32
MathUtilNeon::transformVec4(m, v, dst);
#elif defined(USE_NEON64)
MathUtilNeon64::transformVec4(m, v, dst);
#elif defined(INCLUDE_NEON32)
#if defined(AX_SSE_INTRINSICS)
MathUtilSSE::transformVec4(reinterpret_cast<const _xm128_t*>(m), v, dst);
#elif defined(AX_NEON_INTRINSICS)
# if AX_64BITS || AX_NEON_INTRINSICS > 1
MathUtilNeon::transformVec4(reinterpret_cast<const _xm128_t*>(m), v, dst);
# else
if (isNeon32Enabled())
MathUtilNeon::transformVec4(m, v, dst);
MathUtilNeon::transformVec4(reinterpret_cast<const _xm128_t*>(m), v, dst);
else
MathUtilC::transformVec4(m, v, dst);
# endif
#else
MathUtilC::transformVec4(m, v, dst);
#endif
@ -287,15 +267,17 @@ void MathUtil::transformVec4(const float* m, const float* v, float* dst)
void MathUtil::crossVec3(const float* v1, const float* v2, float* dst)
{
#ifdef USE_NEON32
#if defined(AX_SSE_INTRINSICS)
MathUtilSSE::crossVec3(v1, v2, dst);
#elif defined(AX_NEON_INTRINSICS)
# if AX_64BITS || AX_NEON_INTRINSICS > 1
MathUtilNeon::crossVec3(v1, v2, dst);
#elif defined(USE_NEON64)
MathUtilNeon64::crossVec3(v1, v2, dst);
#elif defined(INCLUDE_NEON32)
# else
if (isNeon32Enabled())
MathUtilNeon::crossVec3(v1, v2, dst);
else
MathUtilC::crossVec3(v1, v2, dst);
# endif
#else
MathUtilC::crossVec3(v1, v2, dst);
#endif
@ -308,24 +290,28 @@ void MathUtil::transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_
static_assert(offsetof(V3F_C4B_T2F, vertices) == 0);
static_assert(offsetof(V3F_C4B_T2F, colors) == 12);
static_assert(offsetof(V3F_C4B_T2F, texCoords) == 16);
#ifdef USE_NEON32
#if defined(AX_SSE_INTRINSICS)
MathUtilSSE::transformVertices(dst, src, count, transform);
#elif defined(AX_NEON_INTRINSICS)
# if AX_64BITS || AX_NEON_INTRINSICS > 1
MathUtilNeon::transformVertices(dst, src, count, transform);
#elif defined(USE_NEON64)
MathUtilNeon64::transformVertices(dst, src, count, transform);
#elif defined(INCLUDE_NEON32)
# else
if (isNeon32Enabled())
MathUtilNeon::transformVertices(dst, src, count, transform);
else
MathUtilC::transformVertices(dst, src, count, transform);
# endif
#else
MathUtilC::transformVertices(dst, src, count, transform);
#endif
}
void MathUtil::transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset) {
#if defined(USE_NEON64)
MathUtilNeon64::transformIndices(dst, src, count, offset);
void MathUtil::transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset)
{
#if defined(AX_SSE_INTRINSICS)
MathUtilSSE::transformIndices(dst, src, count, offset);
#elif defined(AX_NEON_INTRINSICS) && AX_64BITS
MathUtilNeon::transformIndices(dst, src, count, offset);
#else
MathUtilC::transformIndices(dst, src, count, offset);
#endif

View File

@ -18,16 +18,12 @@
Original file from GamePlay3D: http://gameplay3d.org
This file was modified to fit the cocos2d-x project
This file was modified to fit the axmol project
*/
#ifndef MATHUTIL_H_
#define MATHUTIL_H_
#ifdef AX_USE_SSE
# include <xmmintrin.h>
#endif
#include "math/MathBase.h"
@ -42,7 +38,7 @@ NS_AX_END
NS_AX_MATH_BEGIN
class Mat4;
class Vec4;
/**
* Defines a math utility class.
@ -100,26 +96,8 @@ public:
private:
// Indicates that if neon is enabled
static bool isNeon32Enabled();
static bool isNeon64Enabled();
private:
#ifdef AX_USE_SSE
static void addMatrix(const __m128 m[4], float scalar, __m128 dst[4]);
static void addMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4]);
static void subtractMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4]);
static void multiplyMatrix(const __m128 m[4], float scalar, __m128 dst[4]);
static void multiplyMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4]);
static void negateMatrix(const __m128 m[4], __m128 dst[4]);
static void transposeMatrix(const __m128 m[4], __m128 dst[4]);
static void transformVec4(const __m128 m[4], const __m128& v, __m128& dst);
#endif
static void addMatrix(const float* m, float scalar, float* dst);
static void addMatrix(const float* m1, const float* m2, float* dst);
@ -134,9 +112,9 @@ private:
static void transposeMatrix(const float* m, float* dst);
static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);
static void transformVec4(const float* m, float x, float y, float z, float w, float* dst/*vec3*/);
static void transformVec4(const float* m, const float* v, float* dst);
static void transformVec4(const float* m, const float* v, float* dst/*vec4*/);
static void crossVec3(const float* v1, const float* v2, float* dst);

View File

@ -16,7 +16,7 @@
Original file from GamePlay3D: http://gameplay3d.org
This file was modified to fit the cocos2d-x project
This file was modified to fit the axmol project
*/
NS_AX_MATH_BEGIN
@ -24,221 +24,201 @@ NS_AX_MATH_BEGIN
// NOTE(review): this span is commit-diff residue — the old in-class
// declarations are interleaved with the new in-class definitions, so each
// method appears both declared and defined below; as-is it would not compile.
// MathUtilC: plain scalar (non-SIMD) implementations of 4x4 column-major
// matrix and vertex helpers, used when no SIMD backend is selected.
class MathUtilC
{
public:
inline static void addMatrix(const float* m, float scalar, float* dst);
inline static void addMatrix(const float* m1, const float* m2, float* dst);
inline static void subtractMatrix(const float* m1, const float* m2, float* dst);
inline static void multiplyMatrix(const float* m, float scalar, float* dst);
inline static void multiplyMatrix(const float* m1, const float* m2, float* dst);
// dst = m + scalar, element-wise over all 16 entries; dst may alias m.
inline static void addMatrix(const float* m, float scalar, float* dst)
{
dst[0] = m[0] + scalar;
dst[1] = m[1] + scalar;
dst[2] = m[2] + scalar;
dst[3] = m[3] + scalar;
dst[4] = m[4] + scalar;
dst[5] = m[5] + scalar;
dst[6] = m[6] + scalar;
dst[7] = m[7] + scalar;
dst[8] = m[8] + scalar;
dst[9] = m[9] + scalar;
dst[10] = m[10] + scalar;
dst[11] = m[11] + scalar;
dst[12] = m[12] + scalar;
dst[13] = m[13] + scalar;
dst[14] = m[14] + scalar;
dst[15] = m[15] + scalar;
}
inline static void negateMatrix(const float* m, float* dst);
inline static void transposeMatrix(const float* m, float* dst);
// dst = m1 + m2, element-wise; dst may alias either operand.
inline static void addMatrix(const float* m1, const float* m2, float* dst)
{
dst[0] = m1[0] + m2[0];
dst[1] = m1[1] + m2[1];
dst[2] = m1[2] + m2[2];
dst[3] = m1[3] + m2[3];
dst[4] = m1[4] + m2[4];
dst[5] = m1[5] + m2[5];
dst[6] = m1[6] + m2[6];
dst[7] = m1[7] + m2[7];
dst[8] = m1[8] + m2[8];
dst[9] = m1[9] + m2[9];
dst[10] = m1[10] + m2[10];
dst[11] = m1[11] + m2[11];
dst[12] = m1[12] + m2[12];
dst[13] = m1[13] + m2[13];
dst[14] = m1[14] + m2[14];
dst[15] = m1[15] + m2[15];
}
inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);
inline static void transformVec4(const float* m, const float* v, float* dst);
inline static void crossVec3(const float* v1, const float* v2, float* dst);
// dst = m1 - m2, element-wise; dst may alias either operand.
inline static void subtractMatrix(const float* m1, const float* m2, float* dst)
{
dst[0] = m1[0] - m2[0];
dst[1] = m1[1] - m2[1];
dst[2] = m1[2] - m2[2];
dst[3] = m1[3] - m2[3];
dst[4] = m1[4] - m2[4];
dst[5] = m1[5] - m2[5];
dst[6] = m1[6] - m2[6];
dst[7] = m1[7] - m2[7];
dst[8] = m1[8] - m2[8];
dst[9] = m1[9] - m2[9];
dst[10] = m1[10] - m2[10];
dst[11] = m1[11] - m2[11];
dst[12] = m1[12] - m2[12];
dst[13] = m1[13] - m2[13];
dst[14] = m1[14] - m2[14];
dst[15] = m1[15] - m2[15];
}
inline static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform);
inline static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset);
// dst = m * scalar, element-wise.
inline static void multiplyMatrix(const float* m, float scalar, float* dst)
{
dst[0] = m[0] * scalar;
dst[1] = m[1] * scalar;
dst[2] = m[2] * scalar;
dst[3] = m[3] * scalar;
dst[4] = m[4] * scalar;
dst[5] = m[5] * scalar;
dst[6] = m[6] * scalar;
dst[7] = m[7] * scalar;
dst[8] = m[8] * scalar;
dst[9] = m[9] * scalar;
dst[10] = m[10] * scalar;
dst[11] = m[11] * scalar;
dst[12] = m[12] * scalar;
dst[13] = m[13] * scalar;
dst[14] = m[14] * scalar;
dst[15] = m[15] * scalar;
}
// dst = m1 * m2 (column-major 4x4 product); the temporary makes it safe
// for dst to alias m1 or m2.
inline static void multiplyMatrix(const float* m1, const float* m2, float* dst)
{
// Support the case where m1 or m2 is the same array as dst.
float product[16];
product[0] = m1[0] * m2[0] + m1[4] * m2[1] + m1[8] * m2[2] + m1[12] * m2[3];
product[1] = m1[1] * m2[0] + m1[5] * m2[1] + m1[9] * m2[2] + m1[13] * m2[3];
product[2] = m1[2] * m2[0] + m1[6] * m2[1] + m1[10] * m2[2] + m1[14] * m2[3];
product[3] = m1[3] * m2[0] + m1[7] * m2[1] + m1[11] * m2[2] + m1[15] * m2[3];
product[4] = m1[0] * m2[4] + m1[4] * m2[5] + m1[8] * m2[6] + m1[12] * m2[7];
product[5] = m1[1] * m2[4] + m1[5] * m2[5] + m1[9] * m2[6] + m1[13] * m2[7];
product[6] = m1[2] * m2[4] + m1[6] * m2[5] + m1[10] * m2[6] + m1[14] * m2[7];
product[7] = m1[3] * m2[4] + m1[7] * m2[5] + m1[11] * m2[6] + m1[15] * m2[7];
product[8] = m1[0] * m2[8] + m1[4] * m2[9] + m1[8] * m2[10] + m1[12] * m2[11];
product[9] = m1[1] * m2[8] + m1[5] * m2[9] + m1[9] * m2[10] + m1[13] * m2[11];
product[10] = m1[2] * m2[8] + m1[6] * m2[9] + m1[10] * m2[10] + m1[14] * m2[11];
product[11] = m1[3] * m2[8] + m1[7] * m2[9] + m1[11] * m2[10] + m1[15] * m2[11];
product[12] = m1[0] * m2[12] + m1[4] * m2[13] + m1[8] * m2[14] + m1[12] * m2[15];
product[13] = m1[1] * m2[12] + m1[5] * m2[13] + m1[9] * m2[14] + m1[13] * m2[15];
product[14] = m1[2] * m2[12] + m1[6] * m2[13] + m1[10] * m2[14] + m1[14] * m2[15];
product[15] = m1[3] * m2[12] + m1[7] * m2[13] + m1[11] * m2[14] + m1[15] * m2[15];
memcpy(dst, product, MATRIX_SIZE);
}
// dst = -m, element-wise.
inline static void negateMatrix(const float* m, float* dst)
{
dst[0] = -m[0];
dst[1] = -m[1];
dst[2] = -m[2];
dst[3] = -m[3];
dst[4] = -m[4];
dst[5] = -m[5];
dst[6] = -m[6];
dst[7] = -m[7];
dst[8] = -m[8];
dst[9] = -m[9];
dst[10] = -m[10];
dst[11] = -m[11];
dst[12] = -m[12];
dst[13] = -m[13];
dst[14] = -m[14];
dst[15] = -m[15];
}
// dst = transpose(m); the temporary makes dst == m safe.
inline static void transposeMatrix(const float* m, float* dst)
{
float t[16] = {m[0], m[4], m[8], m[12], m[1], m[5], m[9], m[13],
m[2], m[6], m[10], m[14], m[3], m[7], m[11], m[15]};
memcpy(dst, t, MATRIX_SIZE);
}
// dst(vec3) = m * (x, y, z, w); only dst[0..2] are written.
inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst)
{
dst[0] = x * m[0] + y * m[4] + z * m[8] + w * m[12];
dst[1] = x * m[1] + y * m[5] + z * m[9] + w * m[13];
dst[2] = x * m[2] + y * m[6] + z * m[10] + w * m[14];
}
// dst(vec4) = m * v(vec4); computed into temporaries so v == dst is safe.
inline static void transformVec4(const float* m, const float* v, float* dst)
{
// Handle case where v == dst.
float x = v[0] * m[0] + v[1] * m[4] + v[2] * m[8] + v[3] * m[12];
float y = v[0] * m[1] + v[1] * m[5] + v[2] * m[9] + v[3] * m[13];
float z = v[0] * m[2] + v[1] * m[6] + v[2] * m[10] + v[3] * m[14];
float w = v[0] * m[3] + v[1] * m[7] + v[2] * m[11] + v[3] * m[15];
dst[0] = x;
dst[1] = y;
dst[2] = z;
dst[3] = w;
}
// dst = v1 x v2 (3-component cross product); temporaries make aliasing safe.
inline static void crossVec3(const float* v1, const float* v2, float* dst)
{
float x = (v1[1] * v2[2]) - (v1[2] * v2[1]);
float y = (v1[2] * v2[0]) - (v1[0] * v2[2]);
float z = (v1[0] * v2[1]) - (v1[1] * v2[0]);
dst[0] = x;
dst[1] = y;
dst[2] = z;
}
// Transforms `count` vertex positions by `transform`; colors and texCoords
// are copied through unchanged (one memcpy, relying on them being adjacent
// members of V3F_C4B_T2F).
inline static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform)
{
auto end = dst + count;
auto& t = transform; // NOTE(review): binds a reference — despite the original "make copy" wording
auto m = t.m;
while (dst < end)
{
auto pos = src->vertices;
dst->vertices.x = pos.x * m[0] + pos.y * m[4] + pos.z * m[8] + m[12];
dst->vertices.y = pos.x * m[1] + pos.y * m[5] + pos.z * m[9] + m[13];
dst->vertices.z = pos.x * m[2] + pos.y * m[6] + pos.z * m[10] + m[14];
memcpy(&dst->colors, &src->colors, sizeof(V3F_C4B_T2F::colors) + sizeof(V3F_C4B_T2F::texCoords));
++dst;
++src;
}
}
// dst[i] = src[i] + offset for `count` 16-bit indices (wraps modulo 2^16).
inline static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset)
{
auto end = dst + count;
while (dst < end)
{
*dst = *src + offset;
++dst;
++src;
}
}
};
// Adds `scalar` to every one of the 16 entries of the 4x4 matrix `m`.
// dst may alias m.
inline void MathUtilC::addMatrix(const float* m, float scalar, float* dst)
{
    for (int i = 0; i < 16; ++i)
        dst[i] = m[i] + scalar;
}
// Element-wise sum of two 4x4 matrices; dst may alias either operand.
inline void MathUtilC::addMatrix(const float* m1, const float* m2, float* dst)
{
    for (int i = 0; i < 16; ++i)
        dst[i] = m1[i] + m2[i];
}
// Element-wise difference m1 - m2 of two 4x4 matrices; dst may alias either.
inline void MathUtilC::subtractMatrix(const float* m1, const float* m2, float* dst)
{
    for (int i = 0; i < 16; ++i)
        dst[i] = m1[i] - m2[i];
}
// Scales every entry of the 4x4 matrix `m` by `scalar`; dst may alias m.
inline void MathUtilC::multiplyMatrix(const float* m, float scalar, float* dst)
{
    for (int i = 0; i < 16; ++i)
        dst[i] = m[i] * scalar;
}
// Column-major 4x4 matrix product dst = m1 * m2.
// The result is accumulated into a temporary so dst may alias m1 or m2.
// Accumulation order per element matches the straight-line original
// (k = 0..3, left to right), so results are bit-identical.
inline void MathUtilC::multiplyMatrix(const float* m1, const float* m2, float* dst)
{
    float product[16];
    for (int col = 0; col < 4; ++col)
    {
        for (int row = 0; row < 4; ++row)
        {
            float sum = m1[row] * m2[col * 4];
            sum += m1[4 + row] * m2[col * 4 + 1];
            sum += m1[8 + row] * m2[col * 4 + 2];
            sum += m1[12 + row] * m2[col * 4 + 3];
            product[col * 4 + row] = sum;
        }
    }
    memcpy(dst, product, MATRIX_SIZE);
}
// Negates every entry of the 4x4 matrix `m`; dst may alias m.
inline void MathUtilC::negateMatrix(const float* m, float* dst)
{
    for (int i = 0; i < 16; ++i)
        dst[i] = -m[i];
}
// Transposes the 4x4 matrix `m` into dst; a temporary makes dst == m safe.
inline void MathUtilC::transposeMatrix(const float* m, float* dst)
{
    float t[16];
    for (int row = 0; row < 4; ++row)
        for (int col = 0; col < 4; ++col)
            t[row * 4 + col] = m[col * 4 + row];
    memcpy(dst, t, MATRIX_SIZE);
}
// dst(vec3) = m * (x, y, z, w) for a column-major m; only dst[0..2] are written.
inline void MathUtilC::transformVec4(const float* m, float x, float y, float z, float w, float* dst)
{
    for (int i = 0; i < 3; ++i)
        dst[i] = x * m[i] + y * m[i + 4] + z * m[i + 8] + w * m[i + 12];
}
// dst(vec4) = m * v(vec4) for a column-major m.
// Results go through a temporary first, so v == dst is safe.
inline void MathUtilC::transformVec4(const float* m, const float* v, float* dst)
{
    float r[4];
    for (int i = 0; i < 4; ++i)
        r[i] = v[0] * m[i] + v[1] * m[i + 4] + v[2] * m[i + 8] + v[3] * m[i + 12];
    dst[0] = r[0];
    dst[1] = r[1];
    dst[2] = r[2];
    dst[3] = r[3];
}
// 3-component cross product dst = v1 x v2; temporaries make aliasing with
// either input safe.
inline void MathUtilC::crossVec3(const float* v1, const float* v2, float* dst)
{
    const float rx = v1[1] * v2[2] - v1[2] * v2[1];
    const float ry = v1[2] * v2[0] - v1[0] * v2[2];
    const float rz = v1[0] * v2[1] - v1[1] * v2[0];
    dst[0] = rx;
    dst[1] = ry;
    dst[2] = rz;
}
// Transforms `count` vertex positions by `transform`, passing colors and
// texCoords through unchanged. The single memcpy relies on `colors` and
// `texCoords` being adjacent members of V3F_C4B_T2F.
inline void MathUtilC::transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform)
{
    auto t = transform; // local copy helps the compiler's aliasing analysis
    auto m = t.m;
    for (auto end = dst + count; dst != end; ++dst, ++src)
    {
        const auto pos = src->vertices;
        dst->vertices.x = pos.x * m[0] + pos.y * m[4] + pos.z * m[8] + m[12];
        dst->vertices.y = pos.x * m[1] + pos.y * m[5] + pos.z * m[9] + m[13];
        dst->vertices.z = pos.x * m[2] + pos.y * m[6] + pos.z * m[10] + m[14];
        memcpy(&dst->colors, &src->colors, sizeof(dst->colors) + sizeof(dst->texCoords));
    }
}
// Adds a constant `offset` to `count` 16-bit indices (arithmetic wraps
// modulo 2^16 on store, exactly as in the pointer-walking original).
inline void MathUtilC::transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset)
{
    for (size_t i = 0; i < count; ++i)
        dst[i] = src[i] + offset;
}
NS_AX_MATH_END

View File

@ -16,356 +16,374 @@
Original file from GamePlay3D: http://gameplay3d.org
This file was modified to fit the cocos2d-x project
This file was modified to fit the axmol project
*/
#include <arm_neon.h>
NS_AX_MATH_BEGIN
class MathUtilNeon
// NOTE(review): this span is commit-diff residue — the old in-class
// declarations (float* overloads) are interleaved with the new intrinsics
// definitions, so it would not compile as-is.
// MathUtilNeon: ARM NEON intrinsics backend for the 4x4 matrix / vertex
// helpers. _xm128_t is assigned from float32x4_t values below, so it is
// presumably an alias of float32x4_t — confirm in the math base header.
struct MathUtilNeon
{
public:
inline static void addMatrix(const float* m, float scalar, float* dst);
inline static void addMatrix(const float* m1, const float* m2, float* dst);
inline static void subtractMatrix(const float* m1, const float* m2, float* dst);
inline static void multiplyMatrix(const float* m, float scalar, float* dst);
inline static void multiplyMatrix(const float* m1, const float* m2, float* dst);
// Emscripten's NEON compatibility layer lacks this fused multiply-add-by-lane
// intrinsic, so emulate it with a separate multiply and add.
#if defined(__EMSCRIPTEN__)
# define vmlaq_lane_f32(a, b, c, lane) vaddq_f32(a, vmulq_lane_f32(b, c, lane))
#endif
inline static void negateMatrix(const float* m, float* dst);
inline static void transposeMatrix(const float* m, float* dst);
// dst = m + scalar: splat the scalar across a vector, add to each column.
inline static void addMatrix(const _xm128_t* m, float scalar, _xm128_t* dst)
{
float32x4_t s = vdupq_n_f32(scalar);
dst[0] = vaddq_f32(m[0], s);
dst[1] = vaddq_f32(m[1], s);
dst[2] = vaddq_f32(m[2], s);
dst[3] = vaddq_f32(m[3], s);
}
inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);
inline static void transformVec4(const float* m, const float* v, float* dst);
inline static void crossVec3(const float* v1, const float* v2, float* dst);
// dst = m1 + m2, one vector add per column.
inline static void addMatrix(const _xm128_t* m1, const _xm128_t* m2, _xm128_t* dst)
{
dst[0] = vaddq_f32(m1[0], m2[0]);
dst[1] = vaddq_f32(m1[1], m2[1]);
dst[2] = vaddq_f32(m1[2], m2[2]);
dst[3] = vaddq_f32(m1[3], m2[3]);
}
inline static void transformVertices(ax::V3F_C4B_T2F* dst, const ax::V3F_C4B_T2F* src, size_t count, const ax::Mat4& transform);
// dst = m1 - m2, one vector subtract per column.
inline static void subtractMatrix(const _xm128_t* m1, const _xm128_t* m2, _xm128_t* dst)
{
dst[0] = vsubq_f32(m1[0], m2[0]);
dst[1] = vsubq_f32(m1[1], m2[1]);
dst[2] = vsubq_f32(m1[2], m2[2]);
dst[3] = vsubq_f32(m1[3], m2[3]);
}
// dst = m * scalar.
inline static void multiplyMatrix(const _xm128_t* m, float scalar, _xm128_t* dst)
{
_xm128_t s = vdupq_n_f32(scalar);
UTILS_UNROLL
for (int i = 0; i < 4; ++i)
{
dst[i] = vmulq_f32(m[i], s);
}
}
// Column-major product dst = m1 * m2: each result column is a linear
// combination of m1's columns weighted by one column of m2. The temporary
// product[] makes it safe for dst to alias m1 or m2.
inline static void multiplyMatrix(const _xm128_t* m1, const _xm128_t* m2, _xm128_t* dst)
{
float32x4_t product[4];
float32x4_t val;
UTILS_UNROLL
for (int i = 0; i < 4; ++i)
{
val = vmulq_n_f32(m1[0], vgetq_lane_f32(m2[i], 0));
val = vmlaq_n_f32(val, m1[1], vgetq_lane_f32(m2[i], 1));
val = vmlaq_n_f32(val, m1[2], vgetq_lane_f32(m2[i], 2));
val = vmlaq_n_f32(val, m1[3], vgetq_lane_f32(m2[i], 3));
product[i] = val;
}
memcpy(dst, product, sizeof(product));
}
// dst = -m, one vector negate per column.
inline static void negateMatrix(const _xm128_t* m, _xm128_t* dst)
{
UTILS_UNROLL
for (int i = 0; i < 4; ++i)
{
dst[i] = vnegq_f32(m[i]);
}
}
// 4x4 transpose via two rounds of vector interleaves (vzipq).
inline static void transposeMatrix(const _xm128_t* m, _xm128_t* dst)
{
auto tmp0 = vzipq_f32(m[0], m[2]);
auto tmp1 = vzipq_f32(m[1], m[3]);
auto tmp2 = vzipq_f32(tmp0.val[0], tmp1.val[0]);
auto tmp3 = vzipq_f32(tmp0.val[1], tmp1.val[1]);
dst[0] = tmp2.val[0];
dst[1] = tmp2.val[1];
dst[2] = tmp3.val[0];
dst[3] = tmp3.val[1];
}
// dst(vec3) = m * (x, y, z, w); stores only three floats (x/y pair + z lane).
inline static void transformVec4(const _xm128_t* m, float x, float y, float z, float w, float* dst/*vec3*/)
{
auto v0 = vmulq_n_f32(m[0], x);
auto v1 = vmulq_n_f32(m[1], y);
auto v2 = vmulq_n_f32(m[2], z);
auto v3 = vmulq_n_f32(m[3], w);
auto prod = vaddq_f32(v0, vaddq_f32(v1, vaddq_f32(v2, v3)));
vst1_f32(dst, vget_low_f32(prod));
vst1_lane_f32(dst + 2, vget_high_f32(prod), 0);
}
// dst(vec4) = m * v(vec4); full 4-float store.
inline static void transformVec4(const _xm128_t* m, const float* v /*vec4*/, float* dst /*vec4*/)
{
auto v0 = vmulq_n_f32(m[0], v[0]);
auto v1 = vmulq_n_f32(m[1], v[1]);
auto v2 = vmulq_n_f32(m[2], v[2]);
auto v3 = vmulq_n_f32(m[3], v[3]);
auto prod = vaddq_f32(v0, vaddq_f32(v1, vaddq_f32(v2, v3)));
vst1q_f32(dst, prod);
}
// dst = v1 x v2 (3-component cross product).
inline static void crossVec3(const float* v1, const float* v2, float* dst)
{
// refer to:
// https://developer.arm.com/documentation/den0018/a/NEON-Code-Examples-with-Mixed-Operations/Cross-product/Single-cross-product
// Vector a is stored in memory such that ai is at the lower address and
// ak is at the higher address. Vector b is also stored in the same way.
float32x4_t vec_a = vcombine_f32(vld1_f32(v1 + 1), vld1_f32(v1)); // Q register = [aj, ai, ak, aj]
float32x4_t vec_b = vcombine_f32(vld1_f32(v2 + 1), vld1_f32(v2)); // Q register = [bj, bi, bk, bj]
float32x4_t vec_a_rot = vextq_f32(vec_a, vec_a, 1);
float32x4_t vec_b_rot = vextq_f32(vec_b, vec_b, 1);
float32x4_t prod = vmulq_f32(vec_a, vec_b_rot);
// prod = [ ajbj, aibj, akbi, ajbk ]
prod = vmlsq_f32(prod, vec_a_rot, vec_b);
// prod = [ ajbj-ajbj, aibj-ajbi, akbi-aibk, ajbk-akbj ]
vst1_f32(dst, vget_low_f32(prod)); // Store the lower two elements to address r
vst1_lane_f32(dst + 2, vget_high_f32(prod), 0); // Store the 3rd element
}
// AArch64-only path: vld1q_f32_x4 / vmlaq_laneq_f32 are A64 intrinsics.
#if AX_64BITS
// Position-transforms count vertices, 4 at a time where possible. Loading
// 4 floats per vertex pulls the color bytes into lane 3, which is restored
// into the result before the store.
inline static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform)
{
auto end = dst + count;
// Load matrix
float32x4x4_t m = vld1q_f32_x4(transform.m);
// Process 4 vertices at a time if there's enough data
auto end4 = dst + count / 4 * 4;
while (dst < end4)
{
// Do this for each vertex
// dst->vertices.x = pos.x * m[0] + pos.y * m[4] + pos.z * m[8] + m[12];
// dst->vertices.y = pos.x * m[1] + pos.y * m[5] + pos.z * m[9] + m[13];
// dst->vertices.z = pos.x * m[2] + pos.y * m[6] + pos.z * m[10] + m[14];
// First, load each vertex, multiply x by column 0 and add to column 3
// Note: since we're reading 4 floats it will load color bytes into v.w
float32x4_t v0 = vld1q_f32(&src[0].vertices.x);
float32x4_t r0 = vmlaq_laneq_f32(m.val[3], m.val[0], v0, 0);
float32x4_t v1 = vld1q_f32(&src[1].vertices.x);
float32x4_t r1 = vmlaq_laneq_f32(m.val[3], m.val[0], v1, 0);
float32x4_t v2 = vld1q_f32(&src[2].vertices.x);
float32x4_t r2 = vmlaq_laneq_f32(m.val[3], m.val[0], v2, 0);
float32x4_t v3 = vld1q_f32(&src[3].vertices.x);
float32x4_t r3 = vmlaq_laneq_f32(m.val[3], m.val[0], v3, 0);
// Load texCoords
float32x2_t uv0 = vld1_f32(&src[0].texCoords.u);
float32x2_t uv1 = vld1_f32(&src[1].texCoords.u);
float32x2_t uv2 = vld1_f32(&src[2].texCoords.u);
float32x2_t uv3 = vld1_f32(&src[3].texCoords.u);
// Multiply y by column 1 and add to result
r0 = vmlaq_laneq_f32(r0, m.val[1], v0, 1);
r1 = vmlaq_laneq_f32(r1, m.val[1], v1, 1);
r2 = vmlaq_laneq_f32(r2, m.val[1], v2, 1);
r3 = vmlaq_laneq_f32(r3, m.val[1], v3, 1);
// Multiply z by column 2 and add to result
r0 = vmlaq_laneq_f32(r0, m.val[2], v0, 2);
r1 = vmlaq_laneq_f32(r1, m.val[2], v1, 2);
r2 = vmlaq_laneq_f32(r2, m.val[2], v2, 2);
r3 = vmlaq_laneq_f32(r3, m.val[2], v3, 2);
// Set w to loaded color
r0 = vsetq_lane_f32(vgetq_lane_f32(v0, 3), r0, 3);
r1 = vsetq_lane_f32(vgetq_lane_f32(v1, 3), r1, 3);
r2 = vsetq_lane_f32(vgetq_lane_f32(v2, 3), r2, 3);
r3 = vsetq_lane_f32(vgetq_lane_f32(v3, 3), r3, 3);
// Store result
vst1q_f32(&dst[0].vertices.x, r0);
vst1_f32(&dst[0].texCoords.u, uv0);
vst1q_f32(&dst[1].vertices.x, r1);
vst1_f32(&dst[1].texCoords.u, uv1);
vst1q_f32(&dst[2].vertices.x, r2);
vst1_f32(&dst[2].texCoords.u, uv2);
vst1q_f32(&dst[3].vertices.x, r3);
vst1_f32(&dst[3].texCoords.u, uv3);
dst += 4;
src += 4;
}
// Process remaining vertices one by one
while (dst < end)
{
float32x4_t v = vld1q_f32(&src->vertices.x);
float32x4_t r = vmlaq_laneq_f32(m.val[3], m.val[0], v, 0);
r = vmlaq_laneq_f32(r, m.val[1], v, 1);
r = vmlaq_laneq_f32(r, m.val[2], v, 2);
r = vsetq_lane_f32(vgetq_lane_f32(v, 3), r, 3);
float32x2_t uv = vld1_f32(&src->texCoords.u);
vst1q_f32(&dst->vertices.x, r);
vst1_f32(&dst->texCoords.u, uv);
++dst;
++src;
}
}
// Adds `offset` to count 16-bit indices, in batches of 32, then 8, then 1.
inline static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset)
{
auto end = dst + count;
auto off = vdupq_n_u16(offset);
// Too few indices for any vector batch — go straight to the scalar tail.
if (count < 8)
goto LEFTOVER;
// Process 32 indices at a time if there's enough data
while (count >= 32)
{
// Load 32 indices
uint16x8x4_t v = vld1q_u16_x4(src);
// Add offset
v.val[0] = vaddq_u16(v.val[0], off);
v.val[1] = vaddq_u16(v.val[1], off);
v.val[2] = vaddq_u16(v.val[2], off);
v.val[3] = vaddq_u16(v.val[3], off);
// Store result
vst1q_u16_x4(dst, v);
dst += 32;
src += 32;
count -= 32;
}
// Process 8 indices at a time if there's enough data
while (count >= 8)
{
uint16x8_t v = vld1q_u16(src);
v = vaddq_u16(v, off);
vst1q_u16(dst, v);
dst += 8;
src += 8;
count -= 8;
}
LEFTOVER:
// Process remaining indices one by one
while (count > 0)
{
*dst = *src + offset;
++dst;
++src;
--count;
}
}
#else
// 32-bit NEON path: no laneq intrinsics, so positions are loaded as two
// float32x2_t pairs (color bytes ride along in zw lane 1).
// NOTE(review): no transformIndices is defined on this branch of the #if.
inline static void transformVertices(ax::V3F_C4B_T2F* dst,
const ax::V3F_C4B_T2F* src,
size_t count,
const ax::Mat4& transform)
{
auto end = dst + count;
// Load matrix
float32x4_t mc0 = vld1q_f32(transform.m);
float32x4_t mc1 = vld1q_f32(transform.m + 4);
float32x4_t mc2 = vld1q_f32(transform.m + 8);
float32x4_t mc3 = vld1q_f32(transform.m + 12);
// Process 4 vertices at a time
auto end4 = dst + count / 4 * 4;
while (dst < end4)
{
// Load 4 vertices. Note that color will also get loaded into w
float32x2_t xy0 = vld1_f32(&src[0].vertices.x);
float32x2_t zw0 = vld1_f32(&src[0].vertices.z);
float32x2_t uv0 = vld1_f32(&src[0].texCoords.u);
float32x2_t xy1 = vld1_f32(&src[1].vertices.x);
float32x2_t zw1 = vld1_f32(&src[1].vertices.z);
float32x2_t uv1 = vld1_f32(&src[1].texCoords.u);
float32x2_t xy2 = vld1_f32(&src[2].vertices.x);
float32x2_t zw2 = vld1_f32(&src[2].vertices.z);
float32x2_t uv2 = vld1_f32(&src[2].texCoords.u);
float32x2_t xy3 = vld1_f32(&src[3].vertices.x);
float32x2_t zw3 = vld1_f32(&src[3].vertices.z);
float32x2_t uv3 = vld1_f32(&src[3].texCoords.u);
// Multiply x by column 0
float32x4_t r0 = vmulq_lane_f32(mc0, xy0, 0);
float32x4_t r1 = vmulq_lane_f32(mc0, xy1, 0);
float32x4_t r2 = vmulq_lane_f32(mc0, xy2, 0);
float32x4_t r3 = vmulq_lane_f32(mc0, xy3, 0);
// Multiply y by column 1 and add to result
r0 = vmlaq_lane_f32(r0, mc1, xy0, 1);
r1 = vmlaq_lane_f32(r1, mc1, xy1, 1);
r2 = vmlaq_lane_f32(r2, mc1, xy2, 1);
r3 = vmlaq_lane_f32(r3, mc1, xy3, 1);
// Multiply z by column 2 and add to result
r0 = vmlaq_lane_f32(r0, mc2, zw0, 0);
r1 = vmlaq_lane_f32(r1, mc2, zw1, 0);
r2 = vmlaq_lane_f32(r2, mc2, zw2, 0);
r3 = vmlaq_lane_f32(r3, mc2, zw3, 0);
// Add column 3
r0 = vaddq_f32(r0, mc3);
r1 = vaddq_f32(r1, mc3);
r2 = vaddq_f32(r2, mc3);
r3 = vaddq_f32(r3, mc3);
// Set color
r0 = vsetq_lane_f32(vget_lane_f32(zw0, 1), r0, 3);
r1 = vsetq_lane_f32(vget_lane_f32(zw1, 1), r1, 3);
r2 = vsetq_lane_f32(vget_lane_f32(zw2, 1), r2, 3);
r3 = vsetq_lane_f32(vget_lane_f32(zw3, 1), r3, 3);
// Store result
vst1q_f32(&dst[0].vertices.x, r0);
vst1_f32(&dst[0].texCoords.u, uv0);
vst1q_f32(&dst[1].vertices.x, r1);
vst1_f32(&dst[1].texCoords.u, uv1);
vst1q_f32(&dst[2].vertices.x, r2);
vst1_f32(&dst[2].texCoords.u, uv2);
vst1q_f32(&dst[3].vertices.x, r3);
vst1_f32(&dst[3].texCoords.u, uv3);
dst += 4;
src += 4;
}
// Process remaining vertices
while (dst < end)
{
// Load vertex
float32x2_t xy = vld1_f32(&src->vertices.x);
float32x2_t zw = vld1_f32(&src->vertices.z);
float32x2_t uv = vld1_f32(&src->texCoords.u);
// Multiply x by column 0
float32x4_t r = vmulq_lane_f32(mc0, xy, 0);
// Multiply y by column 1 and add to result
r = vmlaq_lane_f32(r, mc1, xy, 1);
// Multiply z by column 2 and add to result
r = vmlaq_lane_f32(r, mc2, zw, 0);
// Add column 3
r = vaddq_f32(r, mc3);
// Set color
r = vsetq_lane_f32(vget_lane_f32(zw, 1), r, 3);
// Store result
vst1q_f32(&dst->vertices.x, r);
vst1_f32(&dst->texCoords.u, uv);
++dst;
++src;
}
}
#endif
};
// ARMv7 (AArch32) NEON inline assembly: dst = m + scalar over all 16 floats.
// Operands: %0 = dst, %1 = m, %2 = &scalar. The scalar is loaded into s16
// and replicated across s17-s19 so q4 holds (s, s, s, s).
inline void MathUtilNeon::addMatrix(const float* m, float scalar, float* dst)
{
asm volatile(
"vld1.32 {q0, q1}, [%1]! \n\t" // M[m0-m7]
"vld1.32 {q2, q3}, [%1] \n\t" // M[m8-m15]
"vld1.32 {d8[0]}, [%2] \n\t" // s
"vmov.f32 s17, s16 \n\t" // s
"vmov.f32 s18, s16 \n\t" // s
"vmov.f32 s19, s16 \n\t" // s
"vadd.f32 q8, q0, q4 \n\t" // DST->M[m0-m3] = M[m0-m3] + s
"vadd.f32 q9, q1, q4 \n\t" // DST->M[m4-m7] = M[m4-m7] + s
"vadd.f32 q10, q2, q4 \n\t" // DST->M[m8-m11] = M[m8-m11] + s
"vadd.f32 q11, q3, q4 \n\t" // DST->M[m12-m15] = M[m12-m15] + s
"vst1.32 {q8, q9}, [%0]! \n\t" // DST->M[m0-m7]
"vst1.32 {q10, q11}, [%0] \n\t" // DST->M[m8-m15]
:
: "r"(dst), "r"(m), "r"(&scalar)
: "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "memory"
);
}
// ARMv7 NEON inline assembly: element-wise dst = m1 + m2 for 4x4 matrices.
// Operands: %0 = dst, %1 = m1, %2 = m2.
inline void MathUtilNeon::addMatrix(const float* m1, const float* m2, float* dst)
{
asm volatile(
"vld1.32 {q0, q1}, [%1]! \n\t" // M1[m0-m7]
"vld1.32 {q2, q3}, [%1] \n\t" // M1[m8-m15]
"vld1.32 {q8, q9}, [%2]! \n\t" // M2[m0-m7]
"vld1.32 {q10, q11}, [%2] \n\t" // M2[m8-m15]
"vadd.f32 q12, q0, q8 \n\t" // DST->M[m0-m3] = M1[m0-m3] + M2[m0-m3]
"vadd.f32 q13, q1, q9 \n\t" // DST->M[m4-m7] = M1[m4-m7] + M2[m4-m7]
"vadd.f32 q14, q2, q10 \n\t" // DST->M[m8-m11] = M1[m8-m11] + M2[m8-m11]
"vadd.f32 q15, q3, q11 \n\t" // DST->M[m12-m15] = M1[m12-m15] + M2[m12-m15]
"vst1.32 {q12, q13}, [%0]! \n\t" // DST->M[m0-m7]
"vst1.32 {q14, q15}, [%0] \n\t" // DST->M[m8-m15]
:
: "r"(dst), "r"(m1), "r"(m2)
: "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "memory"
);
}
// ARMv7 NEON inline assembly: element-wise dst = m1 - m2 for 4x4 matrices.
// Operands: %0 = dst, %1 = m1, %2 = m2.
inline void MathUtilNeon::subtractMatrix(const float* m1, const float* m2, float* dst)
{
asm volatile(
"vld1.32 {q0, q1}, [%1]! \n\t" // M1[m0-m7]
"vld1.32 {q2, q3}, [%1] \n\t" // M1[m8-m15]
"vld1.32 {q8, q9}, [%2]! \n\t" // M2[m0-m7]
"vld1.32 {q10, q11}, [%2] \n\t" // M2[m8-m15]
"vsub.f32 q12, q0, q8 \n\t" // DST->M[m0-m3] = M1[m0-m3] - M2[m0-m3]
"vsub.f32 q13, q1, q9 \n\t" // DST->M[m4-m7] = M1[m4-m7] - M2[m4-m7]
"vsub.f32 q14, q2, q10 \n\t" // DST->M[m8-m11] = M1[m8-m11] - M2[m8-m11]
"vsub.f32 q15, q3, q11 \n\t" // DST->M[m12-m15] = M1[m12-m15] - M2[m12-m15]
"vst1.32 {q12, q13}, [%0]! \n\t" // DST->M[m0-m7]
"vst1.32 {q14, q15}, [%0] \n\t" // DST->M[m8-m15]
:
: "r"(dst), "r"(m1), "r"(m2)
: "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "memory"
);
}
// ARMv7 NEON inline assembly: dst = m * scalar over all 16 floats.
// Operands: %0 = dst, %1 = m, %2 = &scalar.
// (The per-line comments were previously rotated against the instructions;
// corrected below: d0[0] holds the scalar, q4-q7 hold the matrix.)
inline void MathUtilNeon::multiplyMatrix(const float* m, float scalar, float* dst)
{
asm volatile(
"vld1.32 {d0[0]}, [%2] \n\t" // s
"vld1.32 {q4-q5}, [%1]! \n\t" // M[m0-m7]
"vld1.32 {q6-q7}, [%1] \n\t" // M[m8-m15]
"vmul.f32 q8, q4, d0[0] \n\t" // DST->M[m0-m3] = M[m0-m3] * s
"vmul.f32 q9, q5, d0[0] \n\t" // DST->M[m4-m7] = M[m4-m7] * s
"vmul.f32 q10, q6, d0[0] \n\t" // DST->M[m8-m11] = M[m8-m11] * s
"vmul.f32 q11, q7, d0[0] \n\t" // DST->M[m12-m15] = M[m12-m15] * s
"vst1.32 {q8-q9}, [%0]! \n\t" // DST->M[m0-m7]
"vst1.32 {q10-q11}, [%0] \n\t" // DST->M[m8-m15]
:
: "r"(dst), "r"(m), "r"(&scalar)
: "q0", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "memory"
);
}
// ARMv7 NEON inline assembly: column-major product dst = m1 * m2.
// q8-q11 = columns of M1 (d16-d23), q0-q3 = columns of M2 (d0-d7); each
// result column q12-q15 accumulates M1's columns scaled by M2's lanes.
// (Several per-line comments previously named the wrong M1 column; fixed.)
inline void MathUtilNeon::multiplyMatrix(const float* m1, const float* m2, float* dst)
{
asm volatile(
"vld1.32 {d16 - d19}, [%1]! \n\t" // M1[m0-m7]
"vld1.32 {d20 - d23}, [%1] \n\t" // M1[m8-m15]
"vld1.32 {d0 - d3}, [%2]! \n\t" // M2[m0-m7]
"vld1.32 {d4 - d7}, [%2] \n\t" // M2[m8-m15]
"vmul.f32 q12, q8, d0[0] \n\t" // DST->M[m0-m3] = M1[m0-m3] * M2[m0]
"vmul.f32 q13, q8, d2[0] \n\t" // DST->M[m4-m7] = M1[m0-m3] * M2[m4]
"vmul.f32 q14, q8, d4[0] \n\t" // DST->M[m8-m11] = M1[m0-m3] * M2[m8]
"vmul.f32 q15, q8, d6[0] \n\t" // DST->M[m12-m15] = M1[m0-m3] * M2[m12]
"vmla.f32 q12, q9, d0[1] \n\t" // DST->M[m0-m3] += M1[m4-m7] * M2[m1]
"vmla.f32 q13, q9, d2[1] \n\t" // DST->M[m4-m7] += M1[m4-m7] * M2[m5]
"vmla.f32 q14, q9, d4[1] \n\t" // DST->M[m8-m11] += M1[m4-m7] * M2[m9]
"vmla.f32 q15, q9, d6[1] \n\t" // DST->M[m12-m15] += M1[m4-m7] * M2[m13]
"vmla.f32 q12, q10, d1[0] \n\t" // DST->M[m0-m3] += M1[m8-m11] * M2[m2]
"vmla.f32 q13, q10, d3[0] \n\t" // DST->M[m4-m7] += M1[m8-m11] * M2[m6]
"vmla.f32 q14, q10, d5[0] \n\t" // DST->M[m8-m11] += M1[m8-m11] * M2[m10]
"vmla.f32 q15, q10, d7[0] \n\t" // DST->M[m12-m15] += M1[m8-m11] * M2[m14]
"vmla.f32 q12, q11, d1[1] \n\t" // DST->M[m0-m3] += M1[m12-m15] * M2[m3]
"vmla.f32 q13, q11, d3[1] \n\t" // DST->M[m4-m7] += M1[m12-m15] * M2[m7]
"vmla.f32 q14, q11, d5[1] \n\t" // DST->M[m8-m11] += M1[m12-m15] * M2[m11]
"vmla.f32 q15, q11, d7[1] \n\t" // DST->M[m12-m15] += M1[m12-m15] * M2[m15]
"vst1.32 {d24 - d27}, [%0]! \n\t" // DST->M[m0-m7]
"vst1.32 {d28 - d31}, [%0] \n\t" // DST->M[m8-m15]
: // output
: "r"(dst), "r"(m1), "r"(m2) // input - note *value* of pointer doesn't change.
: "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
// ARMv7 NEON inline assembly: dst = -m over all 16 floats.
// Operands: %0 = dst, %1 = m.
inline void MathUtilNeon::negateMatrix(const float* m, float* dst)
{
asm volatile(
"vld1.32 {q0-q1}, [%1]! \n\t" // load m0-m7
"vld1.32 {q2-q3}, [%1] \n\t" // load m8-m15
"vneg.f32 q4, q0 \n\t" // negate m0-m3
"vneg.f32 q5, q1 \n\t" // negate m4-m7
"vneg.f32 q6, q2 \n\t" // negate m8-m11
"vneg.f32 q7, q3 \n\t" // negate m12-m15
"vst1.32 {q4-q5}, [%0]! \n\t" // store m0-m7
"vst1.32 {q6-q7}, [%0] \n\t" // store m8-m15
:
: "r"(dst), "r"(m)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory"
);
}
// ARMv7 NEON inline assembly: dst = transpose(m), gathering the transpose
// directly with interleaving vld4.32 lane loads. Operands: %0 = dst, %1 = m.
// (The lane comments previously repeated "m12"; corrected to m13/m14/m15.)
inline void MathUtilNeon::transposeMatrix(const float* m, float* dst)
{
asm volatile(
"vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%1]! \n\t" // DST->M[m0, m4, m8, m12] = M[m0-m3]
"vld4.32 {d0[1], d2[1], d4[1], d6[1]}, [%1]! \n\t" // DST->M[m1, m5, m9, m13] = M[m4-m7]
"vld4.32 {d1[0], d3[0], d5[0], d7[0]}, [%1]! \n\t" // DST->M[m2, m6, m10, m14] = M[m8-m11]
"vld4.32 {d1[1], d3[1], d5[1], d7[1]}, [%1] \n\t" // DST->M[m3, m7, m11, m15] = M[m12-m15]
"vst1.32 {q0-q1}, [%0]! \n\t" // DST->M[m0-m7]
"vst1.32 {q2-q3}, [%0] \n\t" // DST->M[m8-m15]
:
: "r"(dst), "r"(m)
: "q0", "q1", "q2", "q3", "memory"
);
}
// ARMv7 NEON inline assembly: dst(vec3) = m * (x, y, z, w).
// Only three result floats are stored (d26 pair + d27 lane 0).
// Operands: %0 = dst, %1..%4 = &x..&w, %5 = m.
inline void MathUtilNeon::transformVec4(const float* m, float x, float y, float z, float w, float* dst)
{
asm volatile(
"vld1.32 {d0[0]}, [%1] \n\t" // V[x]
"vld1.32 {d0[1]}, [%2] \n\t" // V[y]
"vld1.32 {d1[0]}, [%3] \n\t" // V[z]
"vld1.32 {d1[1]}, [%4] \n\t" // V[w]
"vld1.32 {d18 - d21}, [%5]! \n\t" // M[m0-m7]
"vld1.32 {d22 - d25}, [%5] \n\t" // M[m8-m15]
"vmul.f32 q13, q9, d0[0] \n\t" // DST->V = M[m0-m3] * V[x]
"vmla.f32 q13, q10, d0[1] \n\t" // DST->V += M[m4-m7] * V[y]
"vmla.f32 q13, q11, d1[0] \n\t" // DST->V += M[m8-m11] * V[z]
"vmla.f32 q13, q12, d1[1] \n\t" // DST->V += M[m12-m15] * V[w]
"vst1.32 {d26}, [%0]! \n\t" // DST->V[x, y]
"vst1.32 {d27[0]}, [%0] \n\t" // DST->V[z]
:
: "r"(dst), "r"(&x), "r"(&y), "r"(&z), "r"(&w), "r"(m)
: "q0", "q9", "q10","q11", "q12", "q13", "memory"
);
}
// ARMv7 NEON inline assembly: dst(vec4) = m * v(vec4), full 4-float store.
// Operands: %0 = dst, %1 = v, %2 = m.
// (The vmla comments previously read "=" instead of "+="; corrected.)
inline void MathUtilNeon::transformVec4(const float* m, const float* v, float* dst)
{
asm volatile
(
"vld1.32 {d0, d1}, [%1] \n\t" // V[x, y, z, w]
"vld1.32 {d18 - d21}, [%2]! \n\t" // M[m0-m7]
"vld1.32 {d22 - d25}, [%2] \n\t" // M[m8-m15]
"vmul.f32 q13, q9, d0[0] \n\t" // DST->V = M[m0-m3] * V[x]
"vmla.f32 q13, q10, d0[1] \n\t" // DST->V += M[m4-m7] * V[y]
"vmla.f32 q13, q11, d1[0] \n\t" // DST->V += M[m8-m11] * V[z]
"vmla.f32 q13, q12, d1[1] \n\t" // DST->V += M[m12-m15] * V[w]
"vst1.32 {d26, d27}, [%0] \n\t" // DST->V
:
: "r"(dst), "r"(v), "r"(m)
: "q0", "q9", "q10","q11", "q12", "q13", "memory"
);
}
// ARMv7 NEON inline assembly: dst = v1 x v2 (3-component cross product).
// Operands: %0 = dst, %1 = v1, %2 = v1+1, %3 = v2, %4 = v2+1; the shuffled
// loads build q0 = (v1y, v1z, v1z, v1x) and q1 = (v2z, v2x, v2y, v2z).
// (Typos in the arithmetic comments — '-' for '*', 'vx' — corrected.)
inline void MathUtilNeon::crossVec3(const float* v1, const float* v2, float* dst)
{
asm volatile(
"vld1.32 {d1[1]}, [%1] \n\t" //
"vld1.32 {d0}, [%2] \n\t" //
"vmov.f32 s2, s1 \n\t" // q0 = (v1y, v1z, v1z, v1x)
"vld1.32 {d2[1]}, [%3] \n\t" //
"vld1.32 {d3}, [%4] \n\t" //
"vmov.f32 s4, s7 \n\t" // q1 = (v2z, v2x, v2y, v2z)
"vmul.f32 d4, d0, d2 \n\t" // x = v1y * v2z, y = v1z * v2x
"vmls.f32 d4, d1, d3 \n\t" // x -= v1z * v2y, y -= v1x * v2z
"vmul.f32 d5, d3, d1[1] \n\t" // z = v1x * v2y
"vmls.f32 d5, d0, d2[1] \n\t" // z -= v1y * v2x
"vst1.32 {d4}, [%0]! \n\t" // V[x, y]
"vst1.32 {d5[0]}, [%0] \n\t" // V[z]
:
: "r"(dst), "r"(v1), "r"((v1+1)), "r"(v2), "r"((v2+1))
: "q0", "q1", "q2", "memory"
);
}
// Transforms the position of `count` V3F_C4B_T2F vertices by the column-major
// matrix `transform` (NEON intrinsics, ARMv7). Texture coordinates are copied
// through unchanged; the 4-byte color is carried in the w lane of the position
// load/store so it is preserved as raw bits.
// NOTE(review): the 4-float store at &vertices.x assumes colors immediately
// follow the vec3 position in V3F_C4B_T2F's layout — confirm against the type.
inline void MathUtilNeon::transformVertices(ax::V3F_C4B_T2F* dst, const ax::V3F_C4B_T2F* src, size_t count, const ax::Mat4& transform)
{
auto end = dst + count;
// Load matrix (one 4-float column per register)
float32x4_t mc0 = vld1q_f32(transform.m);
float32x4_t mc1 = vld1q_f32(transform.m + 4);
float32x4_t mc2 = vld1q_f32(transform.m + 8);
float32x4_t mc3 = vld1q_f32(transform.m + 12);
// Process 4 vertices at a time
auto end4 = dst + count / 4 * 4;
while (dst < end4)
{
// Load 4 vertices. Note that color will also get loaded into w (zw pair)
float32x2_t xy0 = vld1_f32(&src[0].vertices.x);
float32x2_t zw0 = vld1_f32(&src[0].vertices.z);
float32x2_t uv0 = vld1_f32(&src[0].texCoords.u);
float32x2_t xy1 = vld1_f32(&src[1].vertices.x);
float32x2_t zw1 = vld1_f32(&src[1].vertices.z);
float32x2_t uv1 = vld1_f32(&src[1].texCoords.u);
float32x2_t xy2 = vld1_f32(&src[2].vertices.x);
float32x2_t zw2 = vld1_f32(&src[2].vertices.z);
float32x2_t uv2 = vld1_f32(&src[2].texCoords.u);
float32x2_t xy3 = vld1_f32(&src[3].vertices.x);
float32x2_t zw3 = vld1_f32(&src[3].vertices.z);
float32x2_t uv3 = vld1_f32(&src[3].texCoords.u);
// Multiply x by column 0
float32x4_t r0 = vmulq_lane_f32(mc0, xy0, 0);
float32x4_t r1 = vmulq_lane_f32(mc0, xy1, 0);
float32x4_t r2 = vmulq_lane_f32(mc0, xy2, 0);
float32x4_t r3 = vmulq_lane_f32(mc0, xy3, 0);
// Multiply y by column 1 and add to result
r0 = vmlaq_lane_f32(r0, mc1, xy0, 1);
r1 = vmlaq_lane_f32(r1, mc1, xy1, 1);
r2 = vmlaq_lane_f32(r2, mc1, xy2, 1);
r3 = vmlaq_lane_f32(r3, mc1, xy3, 1);
// Multiply z by column 2 and add to result
r0 = vmlaq_lane_f32(r0, mc2, zw0, 0);
r1 = vmlaq_lane_f32(r1, mc2, zw1, 0);
r2 = vmlaq_lane_f32(r2, mc2, zw2, 0);
r3 = vmlaq_lane_f32(r3, mc2, zw3, 0);
// Add column 3 (translation; implicit w == 1)
r0 = vaddq_f32(r0, mc3);
r1 = vaddq_f32(r1, mc3);
r2 = vaddq_f32(r2, mc3);
r3 = vaddq_f32(r3, mc3);
// Set color: restore the raw color bits into lane 3 before the 4-float store
r0 = vsetq_lane_f32(vget_lane_f32(zw0, 1), r0, 3);
r1 = vsetq_lane_f32(vget_lane_f32(zw1, 1), r1, 3);
r2 = vsetq_lane_f32(vget_lane_f32(zw2, 1), r2, 3);
r3 = vsetq_lane_f32(vget_lane_f32(zw3, 1), r3, 3);
// Store result (position + color in one store, then texcoords)
vst1q_f32(&dst[0].vertices.x, r0);
vst1_f32(&dst[0].texCoords.u, uv0);
vst1q_f32(&dst[1].vertices.x, r1);
vst1_f32(&dst[1].texCoords.u, uv1);
vst1q_f32(&dst[2].vertices.x, r2);
vst1_f32(&dst[2].texCoords.u, uv2);
vst1q_f32(&dst[3].vertices.x, r3);
vst1_f32(&dst[3].texCoords.u, uv3);
dst += 4;
src += 4;
}
// Process remaining vertices (same math, one at a time)
while (dst < end)
{
// Load vertex
float32x2_t xy = vld1_f32(&src->vertices.x);
float32x2_t zw = vld1_f32(&src->vertices.z);
float32x2_t uv = vld1_f32(&src->texCoords.u);
// Multiply x by column 0
float32x4_t r = vmulq_lane_f32(mc0, xy, 0);
// Multiply y by column 1 and add to result
r = vmlaq_lane_f32(r, mc1, xy, 1);
// Multiply z by column 2 and add to result
r = vmlaq_lane_f32(r, mc2, zw, 0);
// Add column 3
r = vaddq_f32(r, mc3);
// Set color
r = vsetq_lane_f32(vget_lane_f32(zw, 1), r, 3);
// Store result
vst1q_f32(&dst->vertices.x, r);
vst1_f32(&dst->texCoords.u, uv);
++dst;
++src;
}
}
NS_AX_MATH_END

View File

@ -1,398 +0,0 @@
/**
Copyright 2013 BlackBerry Inc.
Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Original file from GamePlay3D: http://gameplay3d.org
This file was modified to fit the cocos2d-x project
*/
#include <arm_neon.h>
#include "base/Types.h"
NS_AX_MATH_BEGIN
// AArch64 NEON implementations of the MathUtil kernels. All matrices are
// column-major 4x4 stored as 16 consecutive floats; vectors are raw float
// arrays. Header-only: every member is an inline static function dispatched
// to by MathUtil at compile time.
class MathUtilNeon64
{
public:
// dst = m + scalar (element-wise)
inline static void addMatrix(const float* m, float scalar, float* dst);
// dst = m1 + m2 (element-wise)
inline static void addMatrix(const float* m1, const float* m2, float* dst);
// dst = m1 - m2 (element-wise)
inline static void subtractMatrix(const float* m1, const float* m2, float* dst);
// dst = m * scalar (element-wise)
inline static void multiplyMatrix(const float* m, float scalar, float* dst);
// dst = m1 * m2 (4x4 matrix product)
inline static void multiplyMatrix(const float* m1, const float* m2, float* dst);
// dst = -m
inline static void negateMatrix(const float* m, float* dst);
// dst = transpose(m); dst may alias m
inline static void transposeMatrix(const float* m, float* dst);
// dst(vec3) = m * (x, y, z, w)
inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);
// dst(vec4) = m * v(vec4)
inline static void transformVec4(const float* m, const float* v, float* dst);
// dst(vec3) = v1 x v2
inline static void crossVec3(const float* v1, const float* v2, float* dst);
// Transform vertex positions by `transform`, preserving colors/texcoords
inline static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform);
// dst[i] = src[i] + offset for `count` 16-bit indices
inline static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset);
};
// dst = m + scalar, element-wise over all 16 floats (AArch64 inline asm).
// ld4/st4 de-interleave/re-interleave lanes, which is harmless for a pure
// element-wise operation.
inline void MathUtilNeon64::addMatrix(const float* m, float scalar, float* dst)
{
asm volatile(
"ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1] \n\t" // M[m0-m7] M[m8-m15]
"ld1r {v4.4s}, [%2] \n\t" // v4 = (s, s, s, s)
"fadd v8.4s, v0.4s, v4.4s \n\t" // DST->M[m0-m3] = M[m0-m3] + s
"fadd v9.4s, v1.4s, v4.4s \n\t" // DST->M[m4-m7] = M[m4-m7] + s
"fadd v10.4s, v2.4s, v4.4s \n\t" // DST->M[m8-m11] = M[m8-m11] + s
"fadd v11.4s, v3.4s, v4.4s \n\t" // DST->M[m12-m15] = M[m12-m15] + s
"st4 {v8.4s, v9.4s, v10.4s, v11.4s}, [%0] \n\t" // store DST->M[m0-m15]
:
: "r"(dst), "r"(m), "r"(&scalar)
: "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "memory"
);
}
// dst = m1 + m2, element-wise over all 16 floats (AArch64 inline asm).
inline void MathUtilNeon64::addMatrix(const float* m1, const float* m2, float* dst)
{
asm volatile(
"ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1] \n\t" // M1[m0-m7] M1[m8-m15]
"ld4 {v8.4s, v9.4s, v10.4s, v11.4s}, [%2] \n\t" // M2[m0-m7] M2[m8-m15]
"fadd v12.4s, v0.4s, v8.4s \n\t" // DST->M[m0-m3] = M1[m0-m3] + M2[m0-m3]
"fadd v13.4s, v1.4s, v9.4s \n\t" // DST->M[m4-m7] = M1[m4-m7] + M2[m4-m7]
"fadd v14.4s, v2.4s, v10.4s \n\t" // DST->M[m8-m11] = M1[m8-m11] + M2[m8-m11]
"fadd v15.4s, v3.4s, v11.4s \n\t" // DST->M[m12-m15] = M1[m12-m15] + M2[m12-m15]
"st4 {v12.4s, v13.4s, v14.4s, v15.4s}, [%0] \n\t" // DST->M[m0-m7] DST->M[m8-m15]
:
: "r"(dst), "r"(m1), "r"(m2)
: "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
);
}
// dst = m1 - m2, element-wise over all 16 floats (AArch64 inline asm).
inline void MathUtilNeon64::subtractMatrix(const float* m1, const float* m2, float* dst)
{
asm volatile(
"ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1] \n\t" // M1[m0-m7] M1[m8-m15]
"ld4 {v8.4s, v9.4s, v10.4s, v11.4s}, [%2] \n\t" // M2[m0-m7] M2[m8-m15]
"fsub v12.4s, v0.4s, v8.4s \n\t" // DST->M[m0-m3] = M1[m0-m3] - M2[m0-m3]
"fsub v13.4s, v1.4s, v9.4s \n\t" // DST->M[m4-m7] = M1[m4-m7] - M2[m4-m7]
"fsub v14.4s, v2.4s, v10.4s \n\t" // DST->M[m8-m11] = M1[m8-m11] - M2[m8-m11]
"fsub v15.4s, v3.4s, v11.4s \n\t" // DST->M[m12-m15] = M1[m12-m15] - M2[m12-m15]
"st4 {v12.4s, v13.4s, v14.4s, v15.4s}, [%0] \n\t" // DST->M[m0-m7] DST->M[m8-m15]
:
: "r"(dst), "r"(m1), "r"(m2)
: "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
);
}
// dst = m * scalar, element-wise over all 16 floats (AArch64 inline asm).
inline void MathUtilNeon64::multiplyMatrix(const float* m, float scalar, float* dst)
{
asm volatile(
"ld1 {v0.s}[0], [%2] \n\t" // v0.s[0] = s
"ld4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%1] \n\t" // M[m0-m7] M[m8-m15]
"fmul v8.4s, v4.4s, v0.s[0] \n\t" // DST->M[m0-m3] = M[m0-m3] * s
"fmul v9.4s, v5.4s, v0.s[0] \n\t" // DST->M[m4-m7] = M[m4-m7] * s
"fmul v10.4s, v6.4s, v0.s[0] \n\t" // DST->M[m8-m11] = M[m8-m11] * s
"fmul v11.4s, v7.4s, v0.s[0] \n\t" // DST->M[m12-m15] = M[m12-m15] * s
"st4 {v8.4s, v9.4s, v10.4s, v11.4s}, [%0] \n\t" // DST->M[m0-m7] DST->M[m8-m15]
:
: "r"(dst), "r"(m), "r"(&scalar)
: "v0", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory"
);
}
// dst = m1 * m2 (column-major 4x4 matrix product, AArch64 inline asm).
// Each column of dst is a linear combination of the four columns of m1
// weighted by the corresponding column of m2. Safe if dst aliases m1/m2:
// both inputs are fully loaded before the single store.
inline void MathUtilNeon64::multiplyMatrix(const float* m1, const float* m2, float* dst)
{
asm volatile(
"ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%1] \n\t" // v8-v11 = columns of M1 (sequential load)
"ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%2] \n\t" // v0-v3 = de-interleaved rows of M2
"fmul v12.4s, v8.4s, v0.s[0] \n\t" // DST->M[m0-m3] = M1[m0-m3] * M2[m0]
"fmul v13.4s, v8.4s, v0.s[1] \n\t" // DST->M[m4-m7] = M1[m4-m7] * M2[m4]
"fmul v14.4s, v8.4s, v0.s[2] \n\t" // DST->M[m8-m11] = M1[m8-m11] * M2[m8]
"fmul v15.4s, v8.4s, v0.s[3] \n\t" // DST->M[m12-m15] = M1[m12-m15] * M2[m12]
"fmla v12.4s, v9.4s, v1.s[0] \n\t" // DST->M[m0-m3] += M1[m0-m3] * M2[m1]
"fmla v13.4s, v9.4s, v1.s[1] \n\t" // DST->M[m4-m7] += M1[m4-m7] * M2[m5]
"fmla v14.4s, v9.4s, v1.s[2] \n\t" // DST->M[m8-m11] += M1[m8-m11] * M2[m9]
"fmla v15.4s, v9.4s, v1.s[3] \n\t" // DST->M[m12-m15] += M1[m12-m15] * M2[m13]
"fmla v12.4s, v10.4s, v2.s[0] \n\t" // DST->M[m0-m3] += M1[m0-m3] * M2[m2]
"fmla v13.4s, v10.4s, v2.s[1] \n\t" // DST->M[m4-m7] += M1[m4-m7] * M2[m6]
"fmla v14.4s, v10.4s, v2.s[2] \n\t" // DST->M[m8-m11] += M1[m8-m11] * M2[m10]
"fmla v15.4s, v10.4s, v2.s[3] \n\t" // DST->M[m12-m15] += M1[m12-m15] * M2[m14]
"fmla v12.4s, v11.4s, v3.s[0] \n\t" // DST->M[m0-m3] += M1[m0-m3] * M2[m3]
"fmla v13.4s, v11.4s, v3.s[1] \n\t" // DST->M[m4-m7] += M1[m4-m7] * M2[m7]
"fmla v14.4s, v11.4s, v3.s[2] \n\t" // DST->M[m8-m11] += M1[m8-m11] * M2[m11]
"fmla v15.4s, v11.4s, v3.s[3] \n\t" // DST->M[m12-m15] += M1[m12-m15] * M2[m15]
"st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%0] \n\t" // DST->M[m0-m7]// DST->M[m8-m15]
: // output
: "r"(dst), "r"(m1), "r"(m2) // input - note *value* of pointer doesn't change.
: "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
);
}
// dst = -m, element-wise over all 16 floats (AArch64 inline asm).
inline void MathUtilNeon64::negateMatrix(const float* m, float* dst)
{
asm volatile(
"ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1] \n\t" // load m0-m7 load m8-m15
"fneg v4.4s, v0.4s \n\t" // negate m0-m3
"fneg v5.4s, v1.4s \n\t" // negate m4-m7
"fneg v6.4s, v2.4s \n\t" // negate m8-m11
"fneg v7.4s, v3.4s \n\t" // negate m12-m15
"st4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%0] \n\t" // store m0-m7 store m8-m15
:
: "r"(dst), "r"(m)
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory"
);
}
// dst = transpose(m) (AArch64 inline asm). ld4 de-interleaves the 16 floats
// so that v0..v3 already hold the transposed columns; a sequential st1 then
// writes them out. dst may alias m (single load, then single store).
inline void MathUtilNeon64::transposeMatrix(const float* m, float* dst)
{
asm volatile(
"ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1] \n\t" // DST->M[m0, m4, m8, m12] = M[m0-m3]
//DST->M[m1, m5, m9, m13] = M[m4-m7], etc.
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n\t"
:
: "r"(dst), "r"(m)
: "v0", "v1", "v2", "v3", "memory"
);
}
// Transforms (x, y, z, w) by the column-major matrix m and stores only the
// first THREE result components to dst (vec3 output, AArch64 inline asm).
// dst is an output operand ("+r") because the first store post-increments it.
inline void MathUtilNeon64::transformVec4(const float* m, float x, float y, float z, float w, float* dst)
{
asm volatile(
"ld1 {v0.s}[0], [%1] \n\t" // V[x]
"ld1 {v0.s}[1], [%2] \n\t" // V[y]
"ld1 {v0.s}[2], [%3] \n\t" // V[z]
"ld1 {v0.s}[3], [%4] \n\t" // V[w]
"ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%5] \n\t" // M[m0-m7] M[m8-m15]
"fmul v13.4s, v9.4s, v0.s[0] \n\t" // DST->V = M[m0-m3] * V[x]
"fmla v13.4s, v10.4s, v0.s[1] \n\t" // DST->V += M[m4-m7] * V[y]
"fmla v13.4s, v11.4s, v0.s[2] \n\t" // DST->V += M[m8-m11] * V[z]
"fmla v13.4s, v12.4s, v0.s[3] \n\t" // DST->V += M[m12-m15] * V[w]
//"st1 {v13.4s}, [%0] \n\t" // DST->V[x, y] // DST->V[z]
"st1 {v13.2s}, [%0], 8 \n\t" // store DST->V[x, y], advance dst by 8 bytes
"st1 {v13.s}[2], [%0] \n\t" // store DST->V[z]
: "+r"(dst)
: "r"(&x), "r"(&y), "r"(&z), "r"(&w), "r"(m)
: "v0", "v9", "v10","v11", "v12", "v13", "memory"
);
}
// Transforms the 4-component vector v by the column-major matrix m and stores
// all four result components to dst (AArch64 inline asm). v is fully loaded
// before dst is written, so dst may alias v.
inline void MathUtilNeon64::transformVec4(const float* m, const float* v, float* dst)
{
asm volatile
(
"ld1 {v0.4s}, [%1] \n\t" // V[x, y, z, w]
"ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%2] \n\t" // M[m0-m7] M[m8-m15]
"fmul v13.4s, v9.4s, v0.s[0] \n\t" // DST->V = M[m0-m3] * V[x]
"fmla v13.4s, v10.4s, v0.s[1] \n\t" // DST->V += M[m4-m7] * V[y]
"fmla v13.4s, v11.4s, v0.s[2] \n\t" // DST->V += M[m8-m11] * V[z]
"fmla v13.4s, v12.4s, v0.s[3] \n\t" // DST->V += M[m12-m15] * V[w]
"st1 {v13.4s}, [%0] \n\t" // DST->V
:
: "r"(dst), "r"(v), "r"(m)
: "v0", "v9", "v10","v11", "v12", "v13", "memory"
);
}
// Computes the 3-component cross product dst = v1 x v2 (AArch64 inline asm).
// Builds rotated copies of both operands lane-by-lane, forms the two partial
// products, subtracts, then rotates the result back before storing.
// dst is an output operand ("+r") because the first store post-increments it.
inline void MathUtilNeon64::crossVec3(const float* v1, const float* v2, float* dst)
{
asm volatile(
"ld1 {v0.2s}, [%2] \n\t" // v0 = (v1y, v1z, -, -)
"ld1 {v0.s}[2], [%1] \n\t" // v0 = (v1y, v1z, v1x, -)
"mov v0.s[3], v0.s[0] \n\t" // q0 = (v1y, v1z, v1x, v1y) -- lane 3 copies lane 0
"ld1 {v1.4s}, [%3] \n\t" // v1 = (v2x, v2y, v2z, v2w)
"mov v1.s[3], v1.s[0] \n\t" // q1 = (v2x, v2y, v2z, v2x)
"fmul v2.4s, v0.4s, v1.4s \n\t" // first partial products
"mov v0.s[0], v0.s[1] \n\t" // rotate v0 lanes for the second product
"mov v0.s[1], v0.s[2] \n\t"
"mov v0.s[2], v0.s[3] \n\t"
"mov v1.s[3], v1.s[2] \n\t"
"fmul v0.4s, v0.4s, v1.4s \n\t" // second partial products
"mov v0.s[3], v0.s[1] \n\t" // re-align lanes before subtraction
"mov v0.s[1], v0.s[2] \n\t"
"mov v0.s[2], v0.s[0] \n\t"
"fsub v2.4s, v0.4s, v2.4s \n\t" // difference of the partial products
"mov v2.s[0], v2.s[1] \n\t" // shift result into lanes 0..2
"mov v2.s[1], v2.s[2] \n\t"
"mov v2.s[2], v2.s[3] \n\t"
"st1 {v2.2s}, [%0], 8 \n\t" // store V[x, y], advance dst by 8 bytes
"st1 {v2.s}[2], [%0] \n\t" // store V[z]
: "+r"(dst)
: "r"(v1), "r"((v1+1)), "r"(v2), "r"((v2+1))
: "v0", "v1", "v2", "memory"
);
}
// Transforms the position of `count` V3F_C4B_T2F vertices by the column-major
// matrix `transform` (AArch64 NEON intrinsics). Texture coordinates are copied
// through unchanged; the 4-byte color is loaded as the w lane of the position
// and restored before the store, so it is preserved as raw bits.
// NOTE(review): the 4-float load/store at &vertices.x assumes colors follow
// the vec3 position in V3F_C4B_T2F's layout — confirm against the type.
inline void MathUtilNeon64::transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform)
{
auto end = dst + count;
// Load matrix (four columns at once)
float32x4x4_t m = vld1q_f32_x4(transform.m);
// Process 4 vertices at a time if there's enough data
auto end4 = dst + count / 4 * 4;
while (dst < end4)
{
// Do this for each vertex
// dst->vertices.x = pos.x * m[0] + pos.y * m[4] + pos.z * m[8] + m[12];
// dst->vertices.y = pos.x * m[1] + pos.y * m[5] + pos.z * m[9] + m[13];
// dst->vertices.z = pos.x * m[2] + pos.y * m[6] + pos.z * m[10] + m[14];
// First, load each vertex, multiply x by column 0 and add to column 3
// Note: since we're reading 4 floats it will load color bytes into v.w
float32x4_t v0 = vld1q_f32(&src[0].vertices.x);
float32x4_t r0 = vmlaq_laneq_f32(m.val[3], m.val[0], v0, 0);
float32x4_t v1 = vld1q_f32(&src[1].vertices.x);
float32x4_t r1 = vmlaq_laneq_f32(m.val[3], m.val[0], v1, 0);
float32x4_t v2 = vld1q_f32(&src[2].vertices.x);
float32x4_t r2 = vmlaq_laneq_f32(m.val[3], m.val[0], v2, 0);
float32x4_t v3 = vld1q_f32(&src[3].vertices.x);
float32x4_t r3 = vmlaq_laneq_f32(m.val[3], m.val[0], v3, 0);
// Load texCoords
float32x2_t uv0 = vld1_f32(&src[0].texCoords.u);
float32x2_t uv1 = vld1_f32(&src[1].texCoords.u);
float32x2_t uv2 = vld1_f32(&src[2].texCoords.u);
float32x2_t uv3 = vld1_f32(&src[3].texCoords.u);
// Multiply y by column 1 and add to result
r0 = vmlaq_laneq_f32(r0, m.val[1], v0, 1);
r1 = vmlaq_laneq_f32(r1, m.val[1], v1, 1);
r2 = vmlaq_laneq_f32(r2, m.val[1], v2, 1);
r3 = vmlaq_laneq_f32(r3, m.val[1], v3, 1);
// Multiply z by column 2 and add to result
r0 = vmlaq_laneq_f32(r0, m.val[2], v0, 2);
r1 = vmlaq_laneq_f32(r1, m.val[2], v1, 2);
r2 = vmlaq_laneq_f32(r2, m.val[2], v2, 2);
r3 = vmlaq_laneq_f32(r3, m.val[2], v3, 2);
// Set w to loaded color (restores the raw color bits for the store below)
r0 = vsetq_lane_f32(vgetq_lane_f32(v0, 3), r0, 3);
r1 = vsetq_lane_f32(vgetq_lane_f32(v1, 3), r1, 3);
r2 = vsetq_lane_f32(vgetq_lane_f32(v2, 3), r2, 3);
r3 = vsetq_lane_f32(vgetq_lane_f32(v3, 3), r3, 3);
// Store result (position + color in one store, then texcoords)
vst1q_f32(&dst[0].vertices.x, r0);
vst1_f32(&dst[0].texCoords.u, uv0);
vst1q_f32(&dst[1].vertices.x, r1);
vst1_f32(&dst[1].texCoords.u, uv1);
vst1q_f32(&dst[2].vertices.x, r2);
vst1_f32(&dst[2].texCoords.u, uv2);
vst1q_f32(&dst[3].vertices.x, r3);
vst1_f32(&dst[3].texCoords.u, uv3);
dst += 4;
src += 4;
}
// Process remaining vertices one by one (same math as the unrolled loop)
while (dst < end)
{
float32x4_t v = vld1q_f32(&src->vertices.x);
float32x4_t r = vmlaq_laneq_f32(m.val[3], m.val[0], v, 0);
r = vmlaq_laneq_f32(r, m.val[1], v, 1);
r = vmlaq_laneq_f32(r, m.val[2], v, 2);
r = vsetq_lane_f32(vgetq_lane_f32(v, 3), r, 3);
float32x2_t uv = vld1_f32(&src->texCoords.u);
vst1q_f32(&dst->vertices.x, r);
vst1_f32(&dst->texCoords.u, uv);
++dst;
++src;
}
}
// dst[i] = src[i] + offset for `count` 16-bit indices (AArch64 NEON
// intrinsics). Uses 32-wide and 8-wide vector passes, then a scalar tail.
// Addition wraps modulo 2^16, same as the scalar fallback.
inline void MathUtilNeon64::transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset)
{
auto end = dst + count;
auto off = vdupq_n_u16(offset);
if (count < 8)
goto LEFTOVER;
// Process 32 indices at a time if there's enough data
while (count >= 32)
{
// Load 32 indices
uint16x8x4_t v = vld1q_u16_x4(src);
// Add offset
v.val[0] = vaddq_u16(v.val[0], off);
v.val[1] = vaddq_u16(v.val[1], off);
v.val[2] = vaddq_u16(v.val[2], off);
v.val[3] = vaddq_u16(v.val[3], off);
// Store result
vst1q_u16_x4(dst, v);
dst += 32;
src += 32;
count -= 32;
}
// Process 8 indices at a time if there's enough data
while (count >= 8)
{
uint16x8_t v = vld1q_u16(src);
v = vaddq_u16(v, off);
vst1q_u16(dst, v);
dst += 8;
src += 8;
count -= 8;
}
LEFTOVER:
// Process remaining indices one by one
while (count > 0)
{
*dst = *src + offset;
++dst;
++src;
--count;
}
}
NS_AX_MATH_END

View File

@ -1,157 +1,276 @@
/****************************************************************************
Copyright (c) 2010-2012 cocos2d-x.org
Copyright (c) 2013-2017 Chukong Technologies
Copyright (c) 2017-2018 Xiamen Yaji Software Co., Ltd.
Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
https://axmol.dev/
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
****************************************************************************/
NS_AX_MATH_BEGIN
#ifdef AX_USE_SSE
#ifdef AX_SSE_INTRINSICS
void MathUtil::addMatrix(const __m128 m[4], float scalar, __m128 dst[4])
struct MathUtilSSE
{
__m128 s = _mm_set1_ps(scalar);
dst[0] = _mm_add_ps(m[0], s);
dst[1] = _mm_add_ps(m[1], s);
dst[2] = _mm_add_ps(m[2], s);
dst[3] = _mm_add_ps(m[3], s);
}
void MathUtil::addMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4])
{
dst[0] = _mm_add_ps(m1[0], m2[0]);
dst[1] = _mm_add_ps(m1[1], m2[1]);
dst[2] = _mm_add_ps(m1[2], m2[2]);
dst[3] = _mm_add_ps(m1[3], m2[3]);
}
// Adds `scalar` to every element of the 4x4 matrix m (one __m128 per column),
// writing the sums to dst. dst may alias m.
static void addMatrix(const __m128 m[4], float scalar, __m128 dst[4])
{
    const __m128 sv = _mm_set1_ps(scalar);
    for (int c = 0; c < 4; ++c)
        dst[c] = _mm_add_ps(m[c], sv);
}
// dst = m1 - m2, element-wise over the four __m128 matrix columns.
// dst may alias either input.
void MathUtil::subtractMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4])
{
dst[0] = _mm_sub_ps(m1[0], m2[0]);
dst[1] = _mm_sub_ps(m1[1], m2[1]);
dst[2] = _mm_sub_ps(m1[2], m2[2]);
dst[3] = _mm_sub_ps(m1[3], m2[3]);
}
// Element-wise sum of two 4x4 matrices: dst = m1 + m2, one __m128 column at
// a time. dst may alias either input.
static void addMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4])
{
    for (int c = 0; c < 4; ++c)
        dst[c] = _mm_add_ps(m1[c], m2[c]);
}
// dst = m * scalar, element-wise over the four __m128 matrix columns.
void MathUtil::multiplyMatrix(const __m128 m[4], float scalar, __m128 dst[4])
{
// Broadcast the scalar once, then scale each column.
__m128 s = _mm_set1_ps(scalar);
dst[0] = _mm_mul_ps(m[0], s);
dst[1] = _mm_mul_ps(m[1], s);
dst[2] = _mm_mul_ps(m[2], s);
dst[3] = _mm_mul_ps(m[3], s);
}
// Element-wise difference of two 4x4 matrices: dst = m1 - m2, one __m128
// column at a time. dst may alias either input.
static void subtractMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4])
{
    for (int c = 0; c < 4; ++c)
        dst[c] = _mm_sub_ps(m1[c], m2[c]);
}
// dst = m1 * m2 (column-major 4x4 matrix product, SSE intrinsics).
// Each output column k is sum over j of m1[j] * m2[k][j]; the per-column
// results are kept in locals and written to dst only at the end, so dst may
// alias m1 or m2.
void MathUtil::multiplyMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4])
{
__m128 dst0, dst1, dst2, dst3;
{
// Output column 0: broadcast each element of m2's column 0 ...
__m128 e0 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(0, 0, 0, 0));
__m128 e1 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(1, 1, 1, 1));
__m128 e2 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(2, 2, 2, 2));
__m128 e3 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(3, 3, 3, 3));
// ... weight m1's columns by them ...
__m128 v0 = _mm_mul_ps(m1[0], e0);
__m128 v1 = _mm_mul_ps(m1[1], e1);
__m128 v2 = _mm_mul_ps(m1[2], e2);
__m128 v3 = _mm_mul_ps(m1[3], e3);
// ... and sum the four weighted columns pairwise.
__m128 a0 = _mm_add_ps(v0, v1);
__m128 a1 = _mm_add_ps(v2, v3);
__m128 a2 = _mm_add_ps(a0, a1);
dst0 = a2;
}
{
// Output column 1 (same pattern with m2's column 1).
__m128 e0 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(0, 0, 0, 0));
__m128 e1 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(1, 1, 1, 1));
__m128 e2 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(2, 2, 2, 2));
__m128 e3 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(3, 3, 3, 3));
__m128 v0 = _mm_mul_ps(m1[0], e0);
__m128 v1 = _mm_mul_ps(m1[1], e1);
__m128 v2 = _mm_mul_ps(m1[2], e2);
__m128 v3 = _mm_mul_ps(m1[3], e3);
__m128 a0 = _mm_add_ps(v0, v1);
__m128 a1 = _mm_add_ps(v2, v3);
__m128 a2 = _mm_add_ps(a0, a1);
dst1 = a2;
}
{
// Output column 2 (same pattern with m2's column 2).
__m128 e0 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(0, 0, 0, 0));
__m128 e1 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(1, 1, 1, 1));
__m128 e2 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(2, 2, 2, 2));
__m128 e3 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(3, 3, 3, 3));
__m128 v0 = _mm_mul_ps(m1[0], e0);
__m128 v1 = _mm_mul_ps(m1[1], e1);
__m128 v2 = _mm_mul_ps(m1[2], e2);
__m128 v3 = _mm_mul_ps(m1[3], e3);
__m128 a0 = _mm_add_ps(v0, v1);
__m128 a1 = _mm_add_ps(v2, v3);
__m128 a2 = _mm_add_ps(a0, a1);
dst2 = a2;
}
{
// Output column 3 (same pattern with m2's column 3).
__m128 e0 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(0, 0, 0, 0));
__m128 e1 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(1, 1, 1, 1));
__m128 e2 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(2, 2, 2, 2));
__m128 e3 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(3, 3, 3, 3));
__m128 v0 = _mm_mul_ps(m1[0], e0);
__m128 v1 = _mm_mul_ps(m1[1], e1);
__m128 v2 = _mm_mul_ps(m1[2], e2);
__m128 v3 = _mm_mul_ps(m1[3], e3);
__m128 a0 = _mm_add_ps(v0, v1);
__m128 a1 = _mm_add_ps(v2, v3);
__m128 a2 = _mm_add_ps(a0, a1);
dst3 = a2;
}
dst[0] = dst0;
dst[1] = dst1;
dst[2] = dst2;
dst[3] = dst3;
}
// Scales every element of the 4x4 matrix m by `scalar`, writing the products
// to dst. dst may alias m.
static void multiplyMatrix(const __m128 m[4], float scalar, __m128 dst[4])
{
    const __m128 sv = _mm_set1_ps(scalar);
    for (int c = 0; c < 4; ++c)
        dst[c] = _mm_mul_ps(m[c], sv);
}
// dst = -m, computed as 0 - m per column (keeps IEEE semantics of
// subtraction rather than a sign-bit flip).
void MathUtil::negateMatrix(const __m128 m[4], __m128 dst[4])
{
__m128 z = _mm_setzero_ps();
dst[0] = _mm_sub_ps(z, m[0]);
dst[1] = _mm_sub_ps(z, m[1]);
dst[2] = _mm_sub_ps(z, m[2]);
dst[3] = _mm_sub_ps(z, m[3]);
}
static void multiplyMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4])
{
__m128 dst0, dst1, dst2, dst3;
{
__m128 e0 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(0, 0, 0, 0));
__m128 e1 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(1, 1, 1, 1));
__m128 e2 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(2, 2, 2, 2));
__m128 e3 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(3, 3, 3, 3));
void MathUtil::transposeMatrix(const __m128 m[4], __m128 dst[4])
{
__m128 tmp0 = _mm_shuffle_ps(m[0], m[1], 0x44);
__m128 tmp2 = _mm_shuffle_ps(m[0], m[1], 0xEE);
__m128 tmp1 = _mm_shuffle_ps(m[2], m[3], 0x44);
__m128 tmp3 = _mm_shuffle_ps(m[2], m[3], 0xEE);
dst[0] = _mm_shuffle_ps(tmp0, tmp1, 0x88);
dst[1] = _mm_shuffle_ps(tmp0, tmp1, 0xDD);
dst[2] = _mm_shuffle_ps(tmp2, tmp3, 0x88);
dst[3] = _mm_shuffle_ps(tmp2, tmp3, 0xDD);
}
__m128 v0 = _mm_mul_ps(m1[0], e0);
__m128 v1 = _mm_mul_ps(m1[1], e1);
__m128 v2 = _mm_mul_ps(m1[2], e2);
__m128 v3 = _mm_mul_ps(m1[3], e3);
void MathUtil::transformVec4(const __m128 m[4], const __m128& v, __m128& dst)
{
__m128 col1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
__m128 col2 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
__m128 col3 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
__m128 col4 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
dst = _mm_add_ps(
_mm_add_ps(_mm_mul_ps(m[0], col1), _mm_mul_ps(m[1], col2)),
_mm_add_ps(_mm_mul_ps(m[2], col3), _mm_mul_ps(m[3], col4))
);
}
__m128 a0 = _mm_add_ps(v0, v1);
__m128 a1 = _mm_add_ps(v2, v3);
__m128 a2 = _mm_add_ps(a0, a1);
dst0 = a2;
}
{
__m128 e0 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(0, 0, 0, 0));
__m128 e1 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(1, 1, 1, 1));
__m128 e2 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(2, 2, 2, 2));
__m128 e3 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(3, 3, 3, 3));
__m128 v0 = _mm_mul_ps(m1[0], e0);
__m128 v1 = _mm_mul_ps(m1[1], e1);
__m128 v2 = _mm_mul_ps(m1[2], e2);
__m128 v3 = _mm_mul_ps(m1[3], e3);
__m128 a0 = _mm_add_ps(v0, v1);
__m128 a1 = _mm_add_ps(v2, v3);
__m128 a2 = _mm_add_ps(a0, a1);
dst1 = a2;
}
{
__m128 e0 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(0, 0, 0, 0));
__m128 e1 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(1, 1, 1, 1));
__m128 e2 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(2, 2, 2, 2));
__m128 e3 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(3, 3, 3, 3));
__m128 v0 = _mm_mul_ps(m1[0], e0);
__m128 v1 = _mm_mul_ps(m1[1], e1);
__m128 v2 = _mm_mul_ps(m1[2], e2);
__m128 v3 = _mm_mul_ps(m1[3], e3);
__m128 a0 = _mm_add_ps(v0, v1);
__m128 a1 = _mm_add_ps(v2, v3);
__m128 a2 = _mm_add_ps(a0, a1);
dst2 = a2;
}
{
__m128 e0 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(0, 0, 0, 0));
__m128 e1 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(1, 1, 1, 1));
__m128 e2 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(2, 2, 2, 2));
__m128 e3 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(3, 3, 3, 3));
__m128 v0 = _mm_mul_ps(m1[0], e0);
__m128 v1 = _mm_mul_ps(m1[1], e1);
__m128 v2 = _mm_mul_ps(m1[2], e2);
__m128 v3 = _mm_mul_ps(m1[3], e3);
__m128 a0 = _mm_add_ps(v0, v1);
__m128 a1 = _mm_add_ps(v2, v3);
__m128 a2 = _mm_add_ps(a0, a1);
dst3 = a2;
}
dst[0] = dst0;
dst[1] = dst1;
dst[2] = dst2;
dst[3] = dst3;
}
// Negates every element of the 4x4 matrix m into dst, computed as 0 - m per
// column (keeps IEEE subtraction semantics rather than a sign-bit flip).
static void negateMatrix(const __m128 m[4], __m128 dst[4])
{
    const __m128 zero = _mm_setzero_ps();
    for (int c = 0; c < 4; ++c)
        dst[c] = _mm_sub_ps(zero, m[c]);
}
// Transposes the 4x4 matrix m into dst via the classic two-stage shuffle.
// All of m is read before dst is written, so dst may alias m.
static void transposeMatrix(const __m128 m[4], __m128 dst[4])
{
    // Stage 1: pair up the low/high halves of (m0,m1) and (m2,m3).
    const __m128 lo01 = _mm_shuffle_ps(m[0], m[1], _MM_SHUFFLE(1, 0, 1, 0)); // 0x44
    const __m128 hi01 = _mm_shuffle_ps(m[0], m[1], _MM_SHUFFLE(3, 2, 3, 2)); // 0xEE
    const __m128 lo23 = _mm_shuffle_ps(m[2], m[3], _MM_SHUFFLE(1, 0, 1, 0));
    const __m128 hi23 = _mm_shuffle_ps(m[2], m[3], _MM_SHUFFLE(3, 2, 3, 2));
    // Stage 2: pick even/odd lanes to form the transposed columns.
    dst[0] = _mm_shuffle_ps(lo01, lo23, _MM_SHUFFLE(2, 0, 2, 0)); // 0x88
    dst[1] = _mm_shuffle_ps(lo01, lo23, _MM_SHUFFLE(3, 1, 3, 1)); // 0xDD
    dst[2] = _mm_shuffle_ps(hi01, hi23, _MM_SHUFFLE(2, 0, 2, 0));
    dst[3] = _mm_shuffle_ps(hi01, hi23, _MM_SHUFFLE(3, 1, 3, 1));
}
// Transforms (x, y, z, w) by the column-major matrix m and stores only the
// first THREE components of the result to dst (vec3 output).
// result = m[0]*x + m[1]*y + m[2]*z + m[3]*w, summed pairwise.
static void transformVec4(const __m128 m[4], float x, float y, float z, float w, float* dst /*vec3*/)
{
    const __m128 bx = _mm_set1_ps(x);
    const __m128 by = _mm_set1_ps(y);
    const __m128 bz = _mm_set1_ps(z);
    const __m128 bw = _mm_set1_ps(w);
    // Pairwise association matches the reference implementation exactly.
    const __m128 sumXY = _mm_add_ps(_mm_mul_ps(m[0], bx), _mm_mul_ps(m[1], by));
    const __m128 sumZW = _mm_add_ps(_mm_mul_ps(m[2], bz), _mm_mul_ps(m[3], bw));
    const __m128 res   = _mm_add_ps(sumXY, sumZW);
    // Store x,y as one 64-bit chunk, then lane 2 (z) on its own.
    _mm_storel_pi((__m64*)dst, res);
#if defined(__SSE4_1__)
    *reinterpret_cast<int*>(dst + 2) = _mm_extract_ps(res, 2);
#else
    dst[2] = _mm_cvtss_f32(_mm_movehl_ps(res, res));
#endif
}
// Transforms the 4-component vector v by the column-major matrix m, storing
// all four result components to dst. v is read fully before dst is written,
// so dst may alias v.
static void transformVec4(const __m128 m[4], const float* v /*vec4*/, float* dst /*vec4*/)
{
    const __m128 bx = _mm_set1_ps(v[0]);
    const __m128 by = _mm_set1_ps(v[1]);
    const __m128 bz = _mm_set1_ps(v[2]);
    const __m128 bw = _mm_set1_ps(v[3]);
    // Pairwise association matches the reference implementation exactly.
    const __m128 sumXY = _mm_add_ps(_mm_mul_ps(m[0], bx), _mm_mul_ps(m[1], by));
    const __m128 sumZW = _mm_add_ps(_mm_mul_ps(m[2], bz), _mm_mul_ps(m[3], bw));
    _mm_storeu_ps(dst, _mm_add_ps(sumXY, sumZW));
}
// Computes the 3-component cross product dst = v1 x v2 using the
// one-shuffle-saving identity: cross(a,b) = yzx(a * yzx(b) - yzx(a) * b).
// Both inputs are gathered before dst is written, so dst may alias them.
static void crossVec3(const float* v1, const float* v2, float* dst)
{
    const __m128 a = _mm_set_ps(0.0f, v1[2], v1[1], v1[0]);
    const __m128 b = _mm_set_ps(0.0f, v2[2], v2[1], v2[0]);
    const __m128 aRot = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // (ay, az, ax, 0)
    const __m128 bRot = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); // (by, bz, bx, 0)
    __m128 cr = _mm_sub_ps(_mm_mul_ps(a, bRot), _mm_mul_ps(aRot, b));
    cr = _mm_shuffle_ps(cr, cr, _MM_SHUFFLE(3, 0, 2, 1));              // rotate into (x, y, z, 0)
    // Store x,y as one 64-bit chunk, then lane 2 (z) on its own.
    _mm_storel_pi((__m64*)dst, cr);
#if defined(__SSE4_1__)
    *reinterpret_cast<int*>(dst + 2) = _mm_extract_ps(cr, 2);
#else
    dst[2] = _mm_cvtss_f32(_mm_movehl_ps(cr, cr));
#endif
}
// Transforms the position of `count` V3F_C4B_T2F vertices by the column-major
// matrix `transform` (SSE intrinsics), treating each position as (x, y, z, 1).
// NOTE(review): the 4-float store at &vertices writes one float past the vec3
// into the color field; the memcpy below then restores colors and texcoords —
// this assumes colors/texCoords are laid out contiguously after the position.
static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform)
{
auto& m = transform.col;
for (size_t i = 0; i < count; ++i)
{
auto& vert = src[i].vertices;
// v = (x, y, z, 1); dst position = m[0]*x + m[1]*y + m[2]*z + m[3]*1
__m128 v = _mm_set_ps(1.0f, vert.z, vert.y, vert.x);
v = _mm_add_ps(
_mm_add_ps(_mm_mul_ps(m[0], _mm_shuffle_ps(v, v, 0)), _mm_mul_ps(m[1], _mm_shuffle_ps(v, v, 0x55))),
_mm_add_ps(_mm_mul_ps(m[2], _mm_shuffle_ps(v, v, 0xaa)), _mm_mul_ps(m[3], _mm_shuffle_ps(v, v, 0xff))));
_mm_storeu_ps((float*)&dst[i].vertices, v);
// Copy tex coords and colors
// dst[i].texCoords = src[i].texCoords;
// dst[i].colors = src[i].colors;
memcpy(&dst[i].colors, &src[i].colors, sizeof(V3F_C4B_T2F::colors) + sizeof(V3F_C4B_T2F::texCoords));
}
}
// dst[i] = src[i] + offset for `count` 16-bit indices. Eight indices are
// processed per SSE2 step; any tail is handled scalar. Addition wraps
// modulo 2^16, matching the scalar fallback.
static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset)
{
    const __m128i voff = _mm_set1_epi16(offset);
    const size_t simdCount = count - count % 8;
    size_t i = 0;
    for (; i < simdCount; i += 8)
    {
        const __m128i block = _mm_loadu_si128((const __m128i*)(src + i)); // Load 8 values.
        _mm_storeu_si128((__m128i*)(dst + i), _mm_add_epi16(block, voff)); // Add and store.
    }
    for (; i < count; ++i)  // Scalar tail for the last count % 8 indices.
        dst[i] = src[i] + offset;
}
};
#endif
NS_AX_MATH_END

View File

@ -163,5 +163,26 @@ Linux: Desktop GL/Vulkan
# endif
#endif
// ## SIMD detections
#if !defined(AX_NEON_INTRINSICS)
# if (AX_TARGET_PLATFORM != AX_PLATFORM_WASM)
# if defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM) || defined(__ARM_NEON__)
# define AX_NEON_INTRINSICS 1
# endif
# endif
#endif
#ifdef AX_SSE_INTRINSICS
// axmol math ISA require SSE2 at latest
# include <emmintrin.h>
# if defined(__SSE4_1__)
# include <smmintrin.h>
# endif
using _xm128_t = __m128;
#elif defined(AX_NEON_INTRINSICS)
# include <arm_neon.h>
using _xm128_t = float32x4_t;
#endif
/// @endcond
#endif // __BASE_AX_PLATFORM_CONFIG_H__

View File

@ -89,12 +89,12 @@ Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
* @since v0.99.5
*/
#if (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID)
# if !defined(AX_ENABLE_CACHE_TEXTURE_DATA)
# define AX_ENABLE_CACHE_TEXTURE_DATA 1
# endif
# if !defined(AX_ENABLE_CACHE_TEXTURE_DATA)
# define AX_ENABLE_CACHE_TEXTURE_DATA 1
# endif
#else
# undef AX_ENABLE_CACHE_TEXTURE_DATA
# define AX_ENABLE_CACHE_TEXTURE_DATA 0
# undef AX_ENABLE_CACHE_TEXTURE_DATA
# define AX_ENABLE_CACHE_TEXTURE_DATA 0
#endif
/** @def AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST
@ -102,12 +102,12 @@ Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
*
*/
#if (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID && !AX_ENABLE_CACHE_TEXTURE_DATA)
# if !defined(AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST)
# define AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST 1
# endif
# if !defined(AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST)
# define AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST 1
# endif
#else
# undef AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST
# define AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST 0
# undef AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST
# define AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST 0
#endif
#if (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID) || (AX_TARGET_PLATFORM == AX_PLATFORM_WIN32)
@ -188,14 +188,20 @@ protected: \
varType varName; \
\
public: \
virtual inline varType get##funName() const { return varName; }
virtual inline varType get##funName() const \
{ \
return varName; \
}
#define AX_SYNTHESIZE_READONLY_PASS_BY_REF(varType, varName, funName) \
protected: \
varType varName; \
\
public: \
virtual inline const varType& get##funName() const { return varName; }
virtual inline const varType& get##funName() const \
{ \
return varName; \
}
/** @def AX_SYNTHESIZE
* It is used to declare a protected variable.
@ -209,36 +215,51 @@ public: \
* The variables and methods declared after AX_SYNTHESIZE are all public.
* If you need protected or private, please declare.
*/
#define AX_SYNTHESIZE(varType, varName, funName) \
protected: \
varType varName; \
\
public: \
virtual inline varType get##funName() const { return varName; } \
virtual inline void set##funName(varType var) { varName = var; }
#define AX_SYNTHESIZE(varType, varName, funName) \
protected: \
varType varName; \
\
public: \
virtual inline varType get##funName() const \
{ \
return varName; \
} \
virtual inline void set##funName(varType var) \
{ \
varName = var; \
}
#define AX_SYNTHESIZE_PASS_BY_REF(varType, varName, funName) \
protected: \
varType varName; \
\
public: \
virtual inline const varType& get##funName() const { return varName; } \
virtual inline void set##funName(const varType& var) { varName = var; }
#define AX_SYNTHESIZE_PASS_BY_REF(varType, varName, funName) \
protected: \
varType varName; \
\
public: \
virtual inline const varType& get##funName() const \
{ \
return varName; \
} \
virtual inline void set##funName(const varType& var) \
{ \
varName = var; \
}
#define AX_SYNTHESIZE_RETAIN(varType, varName, funName) \
private: \
varType varName; \
\
public: \
virtual inline varType get##funName() const { return varName; } \
virtual inline void set##funName(varType var) \
{ \
if (varName != var) \
{ \
AX_SAFE_RETAIN(var); \
AX_SAFE_RELEASE(varName); \
varName = var; \
} \
#define AX_SYNTHESIZE_RETAIN(varType, varName, funName) \
private: \
varType varName; \
\
public: \
virtual inline varType get##funName() const \
{ \
return varName; \
} \
virtual inline void set##funName(varType var) \
{ \
if (varName != var) \
{ \
AX_SAFE_RETAIN(var); \
AX_SAFE_RELEASE(varName); \
varName = var; \
} \
}
#define AX_SAFE_DELETE(p) \
@ -252,7 +273,7 @@ public: \
{ \
if (p) \
{ \
delete[](p); \
delete[] (p); \
(p) = nullptr; \
} \
} while (0)
@ -318,7 +339,7 @@ public: \
} while (0)
#elif _AX_DEBUG == 1
# define AXLOG(format, ...) ax::print(format, ##__VA_ARGS__)
# define AXLOG(format, ...) ax::print(format, ##__VA_ARGS__)
# define AXLOGERROR(format, ...) ax::print(format, ##__VA_ARGS__)
# define AXLOGINFO(format, ...) \
do \
@ -327,10 +348,10 @@ public: \
# define AXLOGWARN(...) __AXLOGWITHFUNCTION(__VA_ARGS__)
#elif _AX_DEBUG > 1
# define AXLOG(format, ...) ax::print(format, ##__VA_ARGS__)
# define AXLOG(format, ...) ax::print(format, ##__VA_ARGS__)
# define AXLOGERROR(format, ...) ax::print(format, ##__VA_ARGS__)
# define AXLOGINFO(format, ...) ax::print(format, ##__VA_ARGS__)
# define AXLOGWARN(...) __AXLOGWITHFUNCTION(__VA_ARGS__)
# define AXLOGINFO(format, ...) ax::print(format, ##__VA_ARGS__)
# define AXLOGWARN(...) __AXLOGWITHFUNCTION(__VA_ARGS__)
#endif // _AX_DEBUG
/** Lua engine debug */
@ -349,8 +370,8 @@ public: \
*/
#if defined(__GNUC__) && ((__GNUC__ >= 5) || ((__GNUG__ == 4) && (__GNUC_MINOR__ >= 4))) || \
(defined(__clang__) && (__clang_major__ >= 3)) || (_MSC_VER >= 1800)
# define AX_DISALLOW_COPY_AND_ASSIGN(TypeName) \
TypeName(const TypeName&) = delete; \
# define AX_DISALLOW_COPY_AND_ASSIGN(TypeName) \
TypeName(const TypeName&) = delete; \
TypeName& operator=(const TypeName&) = delete;
#else
# define AX_DISALLOW_COPY_AND_ASSIGN(TypeName) \
@ -444,15 +465,25 @@ public: \
*/
#if __has_builtin(__builtin_expect)
# ifdef __cplusplus
# define UTILS_LIKELY(exp) (__builtin_expect(!!(exp), true))
# define UTILS_LIKELY(exp) (__builtin_expect(!!(exp), true))
# define UTILS_UNLIKELY(exp) (__builtin_expect(!!(exp), false))
# else
# define UTILS_LIKELY(exp) (__builtin_expect(!!(exp), 1))
# define UTILS_LIKELY(exp) (__builtin_expect(!!(exp), 1))
# define UTILS_UNLIKELY(exp) (__builtin_expect(!!(exp), 0))
# endif
#else
# define UTILS_LIKELY(exp) (!!(exp))
# define UTILS_LIKELY(exp) (!!(exp))
# define UTILS_UNLIKELY(exp) (!!(exp))
#endif
#if defined(_MSC_VER)
// MSVC does not support loop unrolling hints
# define UTILS_UNROLL
# define UTILS_NOUNROLL
#else
// C++11 allows pragmas to be specified as part of defines using the _Pragma syntax.
# define UTILS_UNROLL _Pragma("unroll")
# define UTILS_NOUNROLL _Pragma("nounroll")
#endif
#endif // __AX_PLATFORM_MACROS_H__

View File

@ -26,57 +26,33 @@
#include <doctest.h>
#include "base/Config.h"
#include "base/Types.h"
#include "math/MathBase.h"
#include "TestUtils.h"
#if (AX_TARGET_PLATFORM == AX_PLATFORM_IOS)
#if defined(__arm64__)
#define USE_NEON64 1
#define INCLUDE_NEON64 1
#elif defined(__ARM_NEON__)
#define USE_NEON32 1
#define INCLUDE_NEON32 1
#endif
#elif (AX_TARGET_PLATFORM == AX_PLATFORM_OSX)
#if defined(__arm64__) || defined(__aarch64__)
#define USE_NEON64 1
#define INCLUDE_NEON64 1
#endif
#elif (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID)
#if defined(__arm64__) || defined(__aarch64__)
#define USE_NEON64 1
#define INCLUDE_NEON64 1
#elif defined(__ARM_NEON__)
#define INCLUDE_NEON32 1
#endif
#endif
#define INCLUDE_SSE
#define USE_SSE
#if defined(USE_NEON32) || defined(USE_NEON64) // || defined(USE_SSE)
#define SKIP_SIMD_TEST doctest::skip(false)
#if defined(AX_SSE_INTRINSICS) || defined(AX_NEON_INTRINSICS)
# define SKIP_SIMD_TEST doctest::skip(false)
#else
#define SKIP_SIMD_TEST doctest::skip(true)
# define SKIP_SIMD_TEST doctest::skip(true)
#endif
USING_NS_AX;
namespace UnitTest {
namespace UnitTest
{
#ifdef INCLUDE_NEON32
#include "math/MathUtilNeon.inl"
#endif
#ifdef INCLUDE_NEON64
#include "math/MathUtilNeon64.inl"
#endif
#ifdef INCLUDE_SSE
// #include "math/MathUtilSSE.inl"
#ifdef AX_NEON_INTRINSICS
# include "math/MathUtilNeon.inl"
#elif defined(AX_SSE_INTRINSICS)
# include "math/MathUtilSSE.inl"
#endif
#include "math/MathUtil.inl"
} // namespace UnitTest
static void __checkMathUtilResult(std::string_view description, const float* a1, const float* a2, int size)
{
// Check whether the result of the optimized instruction is the same as which is implemented in C
@ -87,11 +63,10 @@ static void __checkMathUtilResult(std::string_view description, const float* a1,
}
}
TEST_SUITE("math/MathUtil") {
TEST_SUITE("math/MathUtil")
{
using namespace UnitTest::ax;
static void checkVerticesAreEqual(const V3F_C4B_T2F* v1, const V3F_C4B_T2F* v2, size_t count)
{
for (size_t i = 0; i < count; ++i)
@ -102,84 +77,94 @@ TEST_SUITE("math/MathUtil") {
}
}
TEST_CASE("transformVertices") {
TEST_CASE("transformVertices")
{
auto count = 5;
std::vector<V3F_C4B_T2F> src(count);
std::vector<V3F_C4B_T2F> expected(count);
std::vector<V3F_C4B_T2F> dst(count);
for (int i = 0; i < count; ++i) {
for (int i = 0; i < count; ++i)
{
src[i].vertices.set(float(i), float(i + 1), float(i + 2));
src[i].colors.set(uint8_t(i + 3), uint8_t(i + 4), uint8_t(i + 5), uint8_t(i + 6));
src[i].texCoords.set(float(i + 7), float(i + 8));
expected[i] = src[i];
expected[i] = src[i];
expected[i].vertices.x = src[i].vertices.y * 4;
expected[i].vertices.y = src[i].vertices.x * -5;
expected[i].vertices.z = src[i].vertices.z * 6;
}
Mat4 transform(
0, 4, 0, 0,
-5, 0, 0, 0,
0, 0, 6, 0,
1, 2, 3, 1
);
Mat4 transform(0, 4, 0, 0, -5, 0, 0, 0, 0, 0, 6, 0, 1, 2, 3, 1);
SUBCASE("MathUtilC") {
SUBCASE("MathUtilC")
{
MathUtilC::transformVertices(dst.data(), src.data(), count, transform);
checkVerticesAreEqual(expected.data(), dst.data(), count);
}
#if INCLUDE_NEON32
SUBCASE("MathUtilNeon") {
MathUtilNeon::transformVertices(dst.data(), src.data(), count, transform);
checkVerticesAreEqual(expected.data(), dst.data(), count);
}
#endif
#if INCLUDE_NEON64
SUBCASE("MathUtilNeon64") {
MathUtilNeon64::transformVertices(dst.data(), src.data(), count, transform);
checkVerticesAreEqual(expected.data(), dst.data(), count);
}
#endif
#ifdef AX_NEON_INTRINSICS
SUBCASE("MathUtilNeon")
{
MathUtilNeon::transformVertices(dst.data(), src.data(), count, transform);
checkVerticesAreEqual(expected.data(), dst.data(), count);
}
#elif defined(AX_SSE_INTRINSICS)
SUBCASE("MathUtilSSE")
{
MathUtilSSE::transformVertices(dst.data(), src.data(), count, transform);
checkVerticesAreEqual(expected.data(), dst.data(), count);
}
#endif
}
TEST_CASE("transformIndices") {
TEST_CASE("transformIndices")
{
auto count = 43;
std::vector<uint16_t> src(count);
std::vector<uint16_t> expected(count);
for (int i = 0; i < count; ++i) {
src[i] = i;
for (int i = 0; i < count; ++i)
{
src[i] = i;
expected[i] = i + 5;
}
uint16_t offset = 5;
SUBCASE("MathUtilC") {
SUBCASE("MathUtilC")
{
std::vector<uint16_t> dst(count);
MathUtilC::transformIndices(dst.data(), src.data(), count, offset);
for (int i = 0; i < count; ++i)
CHECK_EQ(expected[i], dst[i]);
}
#if INCLUDE_NEON64
SUBCASE("MathUtilNeon64") {
std::vector<uint16_t> dst(count);
MathUtilNeon64::transformIndices(dst.data(), src.data(), count, offset);
for (int i = 0; i < count; ++i)
CHECK_EQ(expected[i], dst[i]);
}
#endif
#if defined(AX_NEON_INTRINSICS) && AX_64BITS
SUBCASE("MathUtilNeon")
{
std::vector<uint16_t> dst(count);
MathUtilNeon::transformIndices(dst.data(), src.data(), count, offset);
for (int i = 0; i < count; ++i)
CHECK_EQ(expected[i], dst[i]);
}
#elif defined(AX_SSE_INTRINSICS)
SUBCASE("MathUtilSSE")
{
std::vector<uint16_t> dst(count);
MathUtilSSE::transformIndices(dst.data(), src.data(), count, offset);
for (int i = 0; i < count; ++i)
CHECK_EQ(expected[i], dst[i]);
}
#endif
}
}
TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) {
TEST_CASE("old_tests") {
TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST)
{
TEST_CASE("old_tests")
{
// I know the next line looks ugly, but it's a way to test MathUtil. :)
using namespace UnitTest::ax;
@ -213,20 +198,18 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) {
// inline static void addMatrix(const float* m, float scalar, float* dst);
MathUtilC::addMatrix(inMat41, scalar, outMat4C);
#ifdef INCLUDE_NEON32
MathUtilNeon::addMatrix(inMat41, scalar, outMat4Opt);
#endif
#ifdef AX_NEON_INTRINSICS
MathUtilNeon::addMatrix(reinterpret_cast<const _xm128_t*>(inMat41), scalar,
reinterpret_cast<_xm128_t*>(outMat4Opt));
#endif
#ifdef INCLUDE_NEON64
MathUtilNeon64::addMatrix(inMat41, scalar, outMat4Opt);
#endif
#ifdef INCLUDE_SSE
// FIXME:
#endif
#ifdef AX_SSE_INTRINSICS
MathUtilSSE::addMatrix(reinterpret_cast<const _xm128_t*>(inMat41), scalar,
reinterpret_cast<_xm128_t*>(outMat4Opt));
#endif
__checkMathUtilResult("inline static void addMatrix(const float* m, float scalar, float* dst);", outMat4C,
outMat4Opt, MAT4_SIZE);
outMat4Opt, MAT4_SIZE);
// Clean
memset(outMat4C, 0, sizeof(outMat4C));
memset(outMat4Opt, 0, sizeof(outMat4Opt));
@ -234,20 +217,16 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) {
// inline static void addMatrix(const float* m1, const float* m2, float* dst);
MathUtilC::addMatrix(inMat41, inMat42, outMat4C);
#ifdef INCLUDE_NEON32
MathUtilNeon::addMatrix(inMat41, inMat42, outMat4Opt);
#endif
#ifdef INCLUDE_NEON64
MathUtilNeon64::addMatrix(inMat41, inMat42, outMat4Opt);
#endif
#ifdef INCLUDE_SSE
// FIXME:
#endif
#ifdef AX_NEON_INTRINSICS
MathUtilNeon::addMatrix(reinterpret_cast<const _xm128_t*>(inMat41), reinterpret_cast<const _xm128_t*>(inMat42),
reinterpret_cast<_xm128_t*>(outMat4Opt));
#elif defined(AX_SSE_INTRINSICS)
MathUtilSSE::addMatrix(reinterpret_cast<const _xm128_t*>(inMat41), reinterpret_cast<const _xm128_t*>(inMat42),
reinterpret_cast<_xm128_t*>(outMat4Opt));
#endif
__checkMathUtilResult("inline static void addMatrix(const float* m1, const float* m2, float* dst);", outMat4C,
outMat4Opt, MAT4_SIZE);
outMat4Opt, MAT4_SIZE);
// Clean
memset(outMat4C, 0, sizeof(outMat4C));
memset(outMat4Opt, 0, sizeof(outMat4Opt));
@ -255,20 +234,18 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) {
// inline static void subtractMatrix(const float* m1, const float* m2, float* dst);
MathUtilC::subtractMatrix(inMat41, inMat42, outMat4C);
#ifdef INCLUDE_NEON32
MathUtilNeon::subtractMatrix(inMat41, inMat42, outMat4Opt);
#endif
#ifdef AX_NEON_INTRINSICS
MathUtilNeon::subtractMatrix(reinterpret_cast<const _xm128_t*>(inMat41),
reinterpret_cast<const _xm128_t*>(inMat42),
reinterpret_cast<_xm128_t*>(outMat4Opt));
#elif defined(AX_SSE_INTRINSICS)
MathUtilSSE::subtractMatrix(reinterpret_cast<const _xm128_t*>(inMat41),
reinterpret_cast<const _xm128_t*>(inMat42),
reinterpret_cast<_xm128_t*>(outMat4Opt));
#endif
#ifdef INCLUDE_NEON64
MathUtilNeon64::subtractMatrix(inMat41, inMat42, outMat4Opt);
#endif
#ifdef INCLUDE_SSE
// FIXME:
#endif
__checkMathUtilResult("inline static void subtractMatrix(const float* m1, const float* m2, float* dst);", outMat4C,
outMat4Opt, MAT4_SIZE);
__checkMathUtilResult("inline static void subtractMatrix(const float* m1, const float* m2, float* dst);",
outMat4C, outMat4Opt, MAT4_SIZE);
// Clean
memset(outMat4C, 0, sizeof(outMat4C));
memset(outMat4Opt, 0, sizeof(outMat4Opt));
@ -276,20 +253,16 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) {
// inline static void multiplyMatrix(const float* m, float scalar, float* dst);
MathUtilC::multiplyMatrix(inMat41, scalar, outMat4C);
#ifdef INCLUDE_NEON32
MathUtilNeon::multiplyMatrix(inMat41, scalar, outMat4Opt);
#endif
#ifdef INCLUDE_NEON64
MathUtilNeon64::multiplyMatrix(inMat41, scalar, outMat4Opt);
#endif
#ifdef INCLUDE_SSE
// FIXME:
#endif
#ifdef AX_NEON_INTRINSICS
MathUtilNeon::multiplyMatrix(reinterpret_cast<const _xm128_t*>(inMat41), scalar,
reinterpret_cast<_xm128_t*>(outMat4Opt));
#elif defined(AX_SSE_INTRINSICS)
MathUtilSSE::multiplyMatrix(reinterpret_cast<const _xm128_t*>(inMat41), scalar,
reinterpret_cast<_xm128_t*>(outMat4Opt));
#endif
__checkMathUtilResult("inline static void multiplyMatrix(const float* m, float scalar, float* dst);", outMat4C,
outMat4Opt, MAT4_SIZE);
outMat4Opt, MAT4_SIZE);
// Clean
memset(outMat4C, 0, sizeof(outMat4C));
memset(outMat4Opt, 0, sizeof(outMat4Opt));
@ -297,20 +270,18 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) {
// inline static void multiplyMatrix(const float* m1, const float* m2, float* dst);
MathUtilC::multiplyMatrix(inMat41, inMat42, outMat4C);
#ifdef INCLUDE_NEON32
MathUtilNeon::multiplyMatrix(inMat41, inMat42, outMat4Opt);
#endif
#ifdef AX_NEON_INTRINSICS
MathUtilNeon::multiplyMatrix(reinterpret_cast<const _xm128_t*>(inMat41),
reinterpret_cast<const _xm128_t*>(inMat42),
reinterpret_cast<_xm128_t*>(outMat4Opt));
#elif defined(AX_SSE_INTRINSICS)
MathUtilSSE::multiplyMatrix(reinterpret_cast<const _xm128_t*>(inMat41),
reinterpret_cast<const _xm128_t*>(inMat42),
reinterpret_cast<_xm128_t*>(outMat4Opt));
#endif
#ifdef INCLUDE_NEON64
MathUtilNeon64::multiplyMatrix(inMat41, inMat42, outMat4Opt);
#endif
#ifdef INCLUDE_SSE
// FIXME:
#endif
__checkMathUtilResult("inline static void multiplyMatrix(const float* m1, const float* m2, float* dst);", outMat4C,
outMat4Opt, MAT4_SIZE);
__checkMathUtilResult("inline static void multiplyMatrix(const float* m1, const float* m2, float* dst);",
outMat4C, outMat4Opt, MAT4_SIZE);
// Clean
memset(outMat4C, 0, sizeof(outMat4C));
memset(outMat4Opt, 0, sizeof(outMat4Opt));
@ -318,20 +289,14 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) {
// inline static void negateMatrix(const float* m, float* dst);
MathUtilC::negateMatrix(inMat41, outMat4C);
#ifdef INCLUDE_NEON32
MathUtilNeon::negateMatrix(inMat41, outMat4Opt);
#endif
#ifdef INCLUDE_NEON64
MathUtilNeon64::negateMatrix(inMat41, outMat4Opt);
#endif
#ifdef INCLUDE_SSE
// FIXME:
#endif
#ifdef AX_NEON_INTRINSICS
MathUtilNeon::negateMatrix(reinterpret_cast<const _xm128_t*>(inMat41), reinterpret_cast<_xm128_t*>(outMat4Opt));
#elif defined(AX_SSE_INTRINSICS)
MathUtilSSE::negateMatrix(reinterpret_cast<const _xm128_t*>(inMat41), reinterpret_cast<_xm128_t*>(outMat4Opt));
#endif
__checkMathUtilResult("inline static void negateMatrix(const float* m, float* dst);", outMat4C, outMat4Opt,
MAT4_SIZE);
MAT4_SIZE);
// Clean
memset(outMat4C, 0, sizeof(outMat4C));
memset(outMat4Opt, 0, sizeof(outMat4Opt));
@ -339,20 +304,16 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) {
// inline static void transposeMatrix(const float* m, float* dst);
MathUtilC::transposeMatrix(inMat41, outMat4C);
#ifdef INCLUDE_NEON32
MathUtilNeon::transposeMatrix(inMat41, outMat4Opt);
#endif
#ifdef INCLUDE_NEON64
MathUtilNeon64::transposeMatrix(inMat41, outMat4Opt);
#endif
#ifdef INCLUDE_SSE
// FIXME:
#endif
#ifdef AX_NEON_INTRINSICS
MathUtilNeon::transposeMatrix(reinterpret_cast<const _xm128_t*>(inMat41),
reinterpret_cast<_xm128_t*>(outMat4Opt));
#elif defined(AX_SSE_INTRINSICS)
MathUtilSSE::transposeMatrix(reinterpret_cast<const _xm128_t*>(inMat41),
reinterpret_cast<_xm128_t*>(outMat4Opt));
#endif
__checkMathUtilResult("inline static void transposeMatrix(const float* m, float* dst);", outMat4C, outMat4Opt,
MAT4_SIZE);
MAT4_SIZE);
// Clean
memset(outMat4C, 0, sizeof(outMat4C));
memset(outMat4Opt, 0, sizeof(outMat4Opt));
@ -360,21 +321,16 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) {
// inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);
MathUtilC::transformVec4(inMat41, x, y, z, w, outVec4C);
#ifdef INCLUDE_NEON32
MathUtilNeon::transformVec4(inMat41, x, y, z, w, outVec4Opt);
#endif
#ifdef INCLUDE_NEON64
MathUtilNeon64::transformVec4(inMat41, x, y, z, w, outVec4Opt);
#endif
#ifdef INCLUDE_SSE
// FIXME:
#endif
#ifdef AX_NEON_INTRINSICS
MathUtilNeon::transformVec4(reinterpret_cast<const _xm128_t*>(inMat41), x, y, z, w, outVec4Opt);
#elif defined(AX_SSE_INTRINSICS)
// FIXME:
MathUtilSSE::transformVec4(reinterpret_cast<const _xm128_t*>(inMat41), x, y, z, w, outVec4Opt);
#endif
__checkMathUtilResult(
"inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);", outVec4C,
outVec4Opt, VEC4_SIZE);
"inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);",
outVec4C, outVec4Opt, VEC4_SIZE);
// Clean
memset(outVec4C, 0, sizeof(outVec4C));
memset(outVec4Opt, 0, sizeof(outVec4Opt));
@ -382,20 +338,15 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) {
// inline static void transformVec4(const float* m, const float* v, float* dst);
MathUtilC::transformVec4(inMat41, inVec4, outVec4C);
#ifdef INCLUDE_NEON32
MathUtilNeon::transformVec4(inMat41, inVec4, outVec4Opt);
#endif
#ifdef INCLUDE_NEON64
MathUtilNeon64::transformVec4(inMat41, inVec4, outVec4Opt);
#endif
#ifdef INCLUDE_SSE
// FIXME:
#endif
#ifdef AX_NEON_INTRINSICS
MathUtilNeon::transformVec4(reinterpret_cast<const _xm128_t*>(inMat41), reinterpret_cast<const float*>(inVec4),
reinterpret_cast<float*>(outVec4Opt));
#elif defined(AX_SSE_INTRINSICS)
MathUtilSSE::transformVec4(reinterpret_cast<const _xm128_t*>(inMat41), reinterpret_cast<const float*>(inVec4), reinterpret_cast<float*>(outVec4Opt));
#endif
__checkMathUtilResult("inline static void transformVec4(const float* m, const float* v, float* dst);", outVec4C,
outVec4Opt, VEC4_SIZE);
outVec4Opt, VEC4_SIZE);
// Clean
memset(outVec4C, 0, sizeof(outVec4C));
memset(outVec4Opt, 0, sizeof(outVec4Opt));
@ -403,20 +354,14 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) {
// inline static void crossVec3(const float* v1, const float* v2, float* dst);
MathUtilC::crossVec3(inVec4, inVec42, outVec4C);
#ifdef INCLUDE_NEON32
MathUtilNeon::crossVec3(inVec4, inVec42, outVec4Opt);
#endif
#ifdef INCLUDE_NEON64
MathUtilNeon64::crossVec3(inVec4, inVec42, outVec4Opt);
#endif
#ifdef INCLUDE_SSE
// FIXME:
#endif
#ifdef AX_NEON_INTRINSICS
MathUtilNeon::crossVec3(inVec4, inVec42, outVec4Opt);
#elif defined(AX_SSE_INTRINSICS)
MathUtilSSE::crossVec3(inVec4, inVec42, outVec4Opt);
#endif
__checkMathUtilResult("inline static void crossVec3(const float* v1, const float* v2, float* dst);", outVec4C,
outVec4Opt, VEC4_SIZE);
outVec4Opt, VEC4_SIZE);
// Clean
memset(outVec4C, 0, sizeof(outVec4C));
memset(outVec4Opt, 0, sizeof(outVec4Opt));