axmol/core/math/MathUtilNeon.inl

390 lines
14 KiB
Plaintext
Raw Normal View History

2019-11-23 20:27:39 +08:00
/**
Copyright 2013 BlackBerry Inc.
Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
2019-11-23 20:27:39 +08:00
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Original file from GamePlay3D: http://gameplay3d.org
2024-08-04 09:46:36 +08:00
This file was modified to fit the axmol project
2019-11-23 20:27:39 +08:00
*/
#include <arm_neon.h>
NS_AX_MATH_BEGIN
2019-11-23 20:27:39 +08:00
2024-08-04 09:46:36 +08:00
struct MathUtilNeon
2019-11-23 20:27:39 +08:00
{
2024-08-04 09:46:36 +08:00
#if defined(__EMSCRIPTEN__)
# define vmlaq_lane_f32(a, b, c, lane) vaddq_f32(a, vmulq_lane_f32(b, c, lane))
#endif
2024-08-04 09:46:36 +08:00
inline static void addMatrix(const _xm128_t* m, float scalar, _xm128_t* dst)
{
float32x4_t s = vdupq_n_f32(scalar);
dst[0] = vaddq_f32(m[0], s);
dst[1] = vaddq_f32(m[1], s);
dst[2] = vaddq_f32(m[2], s);
dst[3] = vaddq_f32(m[3], s);
}
2024-08-04 09:46:36 +08:00
inline static void addMatrix(const _xm128_t* m1, const _xm128_t* m2, _xm128_t* dst)
{
dst[0] = vaddq_f32(m1[0], m2[0]);
dst[1] = vaddq_f32(m1[1], m2[1]);
dst[2] = vaddq_f32(m1[2], m2[2]);
dst[3] = vaddq_f32(m1[3], m2[3]);
}
2024-08-04 09:46:36 +08:00
inline static void subtractMatrix(const _xm128_t* m1, const _xm128_t* m2, _xm128_t* dst)
{
dst[0] = vsubq_f32(m1[0], m2[0]);
dst[1] = vsubq_f32(m1[1], m2[1]);
dst[2] = vsubq_f32(m1[2], m2[2]);
dst[3] = vsubq_f32(m1[3], m2[3]);
}
2019-11-23 20:27:39 +08:00
2024-08-04 09:46:36 +08:00
inline static void multiplyMatrix(const _xm128_t* m, float scalar, _xm128_t* dst)
{
_xm128_t s = vdupq_n_f32(scalar);
Release 2.1.5 (#2076) * Fix unexpected libpng used * Fix string format incorrect for tests * Fix #1751, use coroutine control AutoTest flow * Update CHANGELOG.md * Added OpenType font (.otf) to the noCompress list. (#2077) * Update 1k & copyright notice in some sources * Move doctest to axmol 3rdparty * Fix ci * Update 1kdist to v90 * Update 1kiss.ps1 * DrawNodeV2 0.95.1 (#2079) * Rename remaining legacy engine related spells and improve code style * Update 3rdparty README.md * Fix checkReallySupportsASTC does not work on ios device reported by @BIGCATDOG in https://github.com/axmolengine/axmol/issues/2078 * Fix ci * FastRNG: add missing include for AXASSERT (#2081) * Delete unused files * Improve FileUtils - Rename FileUtils::createDirectory to FileUtils::createDirectories - Use splitpath_cb to optimize FileUtils::createDirectories - Rename FileUtils::getFileShortName to FileUtils::getPathBaseName - Rename FileUtils::getFileExtension to FileUtils::getPathExtension - Add FileUtils::getPathDirName - Add FileUtils::getPathBaseNameNoExtension - Mark all renamed FileUtils stubs old name deprecated - Mark all FileUtils offthread APIs deprecated * Update box2d to v2.4.2 * Disable /sdl checks explicitly for winuwp For axmol deprecated policy, we need disable /sdl checks explicitly to avoid compiler traits invoking deprecated functions as error * Update cppwinrt to 2.0.240405.15 * Update simdjson to 3.10.0 * Fix box2d testbed compile error * Improve file path to url * Fix FileUtils::createDirectories unix logic * axmol-cmdline: remove arch suffix for host build output directory * Update CHANGELOG.md * Update lua bindings --------- Co-authored-by: Dani Alias <danielgutierrezalias@gmail.com> Co-authored-by: aismann <icesoft@freenet.de> Co-authored-by: smilediver <smilediver@outlook.com>
2024-08-11 21:11:35 +08:00
AX_UNROLL
2024-08-04 09:46:36 +08:00
for (int i = 0; i < 4; ++i)
{
dst[i] = vmulq_f32(m[i], s);
}
}
2024-08-04 09:46:36 +08:00
inline static void multiplyMatrix(const _xm128_t* m1, const _xm128_t* m2, _xm128_t* dst)
{
float32x4_t product[4];
float32x4_t val;
Release 2.1.5 (#2076) * Fix unexpected libpng used * Fix string format incorrect for tests * Fix #1751, use coroutine control AutoTest flow * Update CHANGELOG.md * Added OpenType font (.otf) to the noCompress list. (#2077) * Update 1k & copyright notice in some sources * Move doctest to axmol 3rdparty * Fix ci * Update 1kdist to v90 * Update 1kiss.ps1 * DrawNodeV2 0.95.1 (#2079) * Rename remaining legacy engine related spells and improve code style * Update 3rdparty README.md * Fix checkReallySupportsASTC does not work on ios device reported by @BIGCATDOG in https://github.com/axmolengine/axmol/issues/2078 * Fix ci * FastRNG: add missing include for AXASSERT (#2081) * Delete unused files * Improve FileUtils - Rename FileUtils::createDirectory to FileUtils::createDirectories - Use splitpath_cb to optimize FileUtils::createDirectories - Rename FileUtils::getFileShortName to FileUtils::getPathBaseName - Rename FileUtils::getFileExtension to FileUtils::getPathExtension - Add FileUtils::getPathDirName - Add FileUtils::getPathBaseNameNoExtension - Mark all renamed FileUtils stubs old name deprecated - Mark all FileUtils offthread APIs deprecated * Update box2d to v2.4.2 * Disable /sdl checks explicitly for winuwp For axmol deprecated policy, we need disable /sdl checks explicitly to avoid compiler traits invoking deprecated functions as error * Update cppwinrt to 2.0.240405.15 * Update simdjson to 3.10.0 * Fix box2d testbed compile error * Improve file path to url * Fix FileUtils::createDirectories unix logic * axmol-cmdline: remove arch suffix for host build output directory * Update CHANGELOG.md * Update lua bindings --------- Co-authored-by: Dani Alias <danielgutierrezalias@gmail.com> Co-authored-by: aismann <icesoft@freenet.de> Co-authored-by: smilediver <smilediver@outlook.com>
2024-08-11 21:11:35 +08:00
AX_UNROLL
2024-08-04 09:46:36 +08:00
for (int i = 0; i < 4; ++i)
{
val = vmulq_n_f32(m1[0], vgetq_lane_f32(m2[i], 0));
val = vmlaq_n_f32(val, m1[1], vgetq_lane_f32(m2[i], 1));
val = vmlaq_n_f32(val, m1[2], vgetq_lane_f32(m2[i], 2));
val = vmlaq_n_f32(val, m1[3], vgetq_lane_f32(m2[i], 3));
product[i] = val;
}
memcpy(dst, product, sizeof(product));
}
2024-08-04 09:46:36 +08:00
inline static void negateMatrix(const _xm128_t* m, _xm128_t* dst)
{
Release 2.1.5 (#2076) * Fix unexpected libpng used * Fix string format incorrect for tests * Fix #1751, use coroutine control AutoTest flow * Update CHANGELOG.md * Added OpenType font (.otf) to the noCompress list. (#2077) * Update 1k & copyright notice in some sources * Move doctest to axmol 3rdparty * Fix ci * Update 1kdist to v90 * Update 1kiss.ps1 * DrawNodeV2 0.95.1 (#2079) * Rename remaining legacy engine related spells and improve code style * Update 3rdparty README.md * Fix checkReallySupportsASTC does not work on ios device reported by @BIGCATDOG in https://github.com/axmolengine/axmol/issues/2078 * Fix ci * FastRNG: add missing include for AXASSERT (#2081) * Delete unused files * Improve FileUtils - Rename FileUtils::createDirectory to FileUtils::createDirectories - Use splitpath_cb to optimize FileUtils::createDirectories - Rename FileUtils::getFileShortName to FileUtils::getPathBaseName - Rename FileUtils::getFileExtension to FileUtils::getPathExtension - Add FileUtils::getPathDirName - Add FileUtils::getPathBaseNameNoExtension - Mark all renamed FileUtils stubs old name deprecated - Mark all FileUtils offthread APIs deprecated * Update box2d to v2.4.2 * Disable /sdl checks explicitly for winuwp For axmol deprecated policy, we need disable /sdl checks explicitly to avoid compiler traits invoking deprecated functions as error * Update cppwinrt to 2.0.240405.15 * Update simdjson to 3.10.0 * Fix box2d testbed compile error * Improve file path to url * Fix FileUtils::createDirectories unix logic * axmol-cmdline: remove arch suffix for host build output directory * Update CHANGELOG.md * Update lua bindings --------- Co-authored-by: Dani Alias <danielgutierrezalias@gmail.com> Co-authored-by: aismann <icesoft@freenet.de> Co-authored-by: smilediver <smilediver@outlook.com>
2024-08-11 21:11:35 +08:00
AX_UNROLL
2024-08-04 09:46:36 +08:00
for (int i = 0; i < 4; ++i)
{
dst[i] = vnegq_f32(m[i]);
}
}
2024-08-04 09:46:36 +08:00
inline static void transposeMatrix(const _xm128_t* m, _xm128_t* dst)
{
2024-08-04 09:46:36 +08:00
auto tmp0 = vzipq_f32(m[0], m[2]);
auto tmp1 = vzipq_f32(m[1], m[3]);
auto tmp2 = vzipq_f32(tmp0.val[0], tmp1.val[0]);
auto tmp3 = vzipq_f32(tmp0.val[1], tmp1.val[1]);
dst[0] = tmp2.val[0];
dst[1] = tmp2.val[1];
dst[2] = tmp3.val[0];
dst[3] = tmp3.val[1];
}
2024-08-04 09:46:36 +08:00
inline static void transformVec4(const _xm128_t* m, float x, float y, float z, float w, float* dst/*vec3*/)
{
auto v0 = vmulq_n_f32(m[0], x);
auto v1 = vmulq_n_f32(m[1], y);
auto v2 = vmulq_n_f32(m[2], z);
auto v3 = vmulq_n_f32(m[3], w);
auto prod = vaddq_f32(v0, vaddq_f32(v1, vaddq_f32(v2, v3)));
vst1_f32(dst, vget_low_f32(prod));
vst1_lane_f32(dst + 2, vget_high_f32(prod), 0);
}
inline static void transformVec4(const _xm128_t* m, const float* v /*vec4*/, float* dst /*vec4*/)
{
auto v0 = vmulq_n_f32(m[0], v[0]);
auto v1 = vmulq_n_f32(m[1], v[1]);
auto v2 = vmulq_n_f32(m[2], v[2]);
auto v3 = vmulq_n_f32(m[3], v[3]);
auto prod = vaddq_f32(v0, vaddq_f32(v1, vaddq_f32(v2, v3)));
vst1q_f32(dst, prod);
}
inline static void crossVec3(const float* v1, const float* v2, float* dst)
{
// refer to:
// https://developer.arm.com/documentation/den0018/a/NEON-Code-Examples-with-Mixed-Operations/Cross-product/Single-cross-product
// Vector a is stored in memory such that ai is at the lower address and
// ak is at the higher address. Vector b is also stored in the same way.
float32x4_t vec_a = vcombine_f32(vld1_f32(v1 + 1), vld1_f32(v1)); // Q register = [aj, ai, ak, aj]
float32x4_t vec_b = vcombine_f32(vld1_f32(v2 + 1), vld1_f32(v2)); // Q register = [bj, bi, bk, bj]
float32x4_t vec_a_rot = vextq_f32(vec_a, vec_a, 1);
float32x4_t vec_b_rot = vextq_f32(vec_b, vec_b, 1);
float32x4_t prod = vmulq_f32(vec_a, vec_b_rot);
// prod = [ ajbj, aibj, akbi, ajbk ]
prod = vmlsq_f32(prod, vec_a_rot, vec_b);
// prod = [ ajbj-ajbj, aibj-ajbi, akbi-aibk, ajbk-akbj ]
vst1_f32(dst, vget_low_f32(prod)); // Store the lower two elements to address r
vst1_lane_f32(dst + 2, vget_high_f32(prod), 0); // Store the 3rd element
}
#if AX_64BITS
inline static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform)
{
auto end = dst + count;
// Load matrix
float32x4x4_t m = vld1q_f32_x4(transform.m);
// Process 4 vertices at a time if there's enough data
auto end4 = dst + count / 4 * 4;
while (dst < end4)
{
// Do this for each vertex
// dst->vertices.x = pos.x * m[0] + pos.y * m[4] + pos.z * m[8] + m[12];
// dst->vertices.y = pos.x * m[1] + pos.y * m[5] + pos.z * m[9] + m[13];
// dst->vertices.z = pos.x * m[2] + pos.y * m[6] + pos.z * m[10] + m[14];
// First, load each vertex, multiply x by column 0 and add to column 3
// Note: since we're reading 4 floats it will load color bytes into v.w
float32x4_t v0 = vld1q_f32(&src[0].vertices.x);
float32x4_t r0 = vmlaq_laneq_f32(m.val[3], m.val[0], v0, 0);
float32x4_t v1 = vld1q_f32(&src[1].vertices.x);
float32x4_t r1 = vmlaq_laneq_f32(m.val[3], m.val[0], v1, 0);
float32x4_t v2 = vld1q_f32(&src[2].vertices.x);
float32x4_t r2 = vmlaq_laneq_f32(m.val[3], m.val[0], v2, 0);
float32x4_t v3 = vld1q_f32(&src[3].vertices.x);
float32x4_t r3 = vmlaq_laneq_f32(m.val[3], m.val[0], v3, 0);
// Load texCoords
float32x2_t uv0 = vld1_f32(&src[0].texCoords.u);
float32x2_t uv1 = vld1_f32(&src[1].texCoords.u);
float32x2_t uv2 = vld1_f32(&src[2].texCoords.u);
float32x2_t uv3 = vld1_f32(&src[3].texCoords.u);
// Multiply y by column 1 and add to result
r0 = vmlaq_laneq_f32(r0, m.val[1], v0, 1);
r1 = vmlaq_laneq_f32(r1, m.val[1], v1, 1);
r2 = vmlaq_laneq_f32(r2, m.val[1], v2, 1);
r3 = vmlaq_laneq_f32(r3, m.val[1], v3, 1);
// Multiply z by column 2 and add to result
r0 = vmlaq_laneq_f32(r0, m.val[2], v0, 2);
r1 = vmlaq_laneq_f32(r1, m.val[2], v1, 2);
r2 = vmlaq_laneq_f32(r2, m.val[2], v2, 2);
r3 = vmlaq_laneq_f32(r3, m.val[2], v3, 2);
// Set w to loaded color
r0 = vsetq_lane_f32(vgetq_lane_f32(v0, 3), r0, 3);
r1 = vsetq_lane_f32(vgetq_lane_f32(v1, 3), r1, 3);
r2 = vsetq_lane_f32(vgetq_lane_f32(v2, 3), r2, 3);
r3 = vsetq_lane_f32(vgetq_lane_f32(v3, 3), r3, 3);
// Store result
vst1q_f32(&dst[0].vertices.x, r0);
vst1_f32(&dst[0].texCoords.u, uv0);
vst1q_f32(&dst[1].vertices.x, r1);
vst1_f32(&dst[1].texCoords.u, uv1);
vst1q_f32(&dst[2].vertices.x, r2);
vst1_f32(&dst[2].texCoords.u, uv2);
vst1q_f32(&dst[3].vertices.x, r3);
vst1_f32(&dst[3].texCoords.u, uv3);
dst += 4;
src += 4;
}
// Process remaining vertices one by one
while (dst < end)
{
float32x4_t v = vld1q_f32(&src->vertices.x);
float32x4_t r = vmlaq_laneq_f32(m.val[3], m.val[0], v, 0);
r = vmlaq_laneq_f32(r, m.val[1], v, 1);
r = vmlaq_laneq_f32(r, m.val[2], v, 2);
r = vsetq_lane_f32(vgetq_lane_f32(v, 3), r, 3);
float32x2_t uv = vld1_f32(&src->texCoords.u);
vst1q_f32(&dst->vertices.x, r);
vst1_f32(&dst->texCoords.u, uv);
++dst;
++src;
}
}
inline static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset)
{
auto end = dst + count;
auto off = vdupq_n_u16(offset);
if (count < 8)
goto LEFTOVER;
// Process 32 indices at a time if there's enough data
while (count >= 32)
{
// Load 32 indices
uint16x8x4_t v = vld1q_u16_x4(src);
// Add offset
v.val[0] = vaddq_u16(v.val[0], off);
v.val[1] = vaddq_u16(v.val[1], off);
v.val[2] = vaddq_u16(v.val[2], off);
v.val[3] = vaddq_u16(v.val[3], off);
// Store result
vst1q_u16_x4(dst, v);
dst += 32;
src += 32;
count -= 32;
}
// Process 8 indices at a time if there's enough data
while (count >= 8)
{
uint16x8_t v = vld1q_u16(src);
v = vaddq_u16(v, off);
vst1q_u16(dst, v);
dst += 8;
src += 8;
count -= 8;
}
LEFTOVER:
// Process remaining indices one by one
while (count > 0)
{
*dst = *src + offset;
++dst;
++src;
--count;
}
}
#else
inline static void transformVertices(ax::V3F_C4B_T2F* dst,
const ax::V3F_C4B_T2F* src,
size_t count,
const ax::Mat4& transform)
{
auto end = dst + count;
// Load matrix
float32x4_t mc0 = vld1q_f32(transform.m);
float32x4_t mc1 = vld1q_f32(transform.m + 4);
float32x4_t mc2 = vld1q_f32(transform.m + 8);
float32x4_t mc3 = vld1q_f32(transform.m + 12);
// Process 4 vertices at a time
auto end4 = dst + count / 4 * 4;
while (dst < end4)
{
// Load 4 vertices. Note that color will also get loaded into w
float32x2_t xy0 = vld1_f32(&src[0].vertices.x);
float32x2_t zw0 = vld1_f32(&src[0].vertices.z);
float32x2_t uv0 = vld1_f32(&src[0].texCoords.u);
float32x2_t xy1 = vld1_f32(&src[1].vertices.x);
float32x2_t zw1 = vld1_f32(&src[1].vertices.z);
float32x2_t uv1 = vld1_f32(&src[1].texCoords.u);
float32x2_t xy2 = vld1_f32(&src[2].vertices.x);
float32x2_t zw2 = vld1_f32(&src[2].vertices.z);
float32x2_t uv2 = vld1_f32(&src[2].texCoords.u);
float32x2_t xy3 = vld1_f32(&src[3].vertices.x);
float32x2_t zw3 = vld1_f32(&src[3].vertices.z);
float32x2_t uv3 = vld1_f32(&src[3].texCoords.u);
// Multiply x by column 0
float32x4_t r0 = vmulq_lane_f32(mc0, xy0, 0);
float32x4_t r1 = vmulq_lane_f32(mc0, xy1, 0);
float32x4_t r2 = vmulq_lane_f32(mc0, xy2, 0);
float32x4_t r3 = vmulq_lane_f32(mc0, xy3, 0);
// Multiply y by column 1 and add to result
r0 = vmlaq_lane_f32(r0, mc1, xy0, 1);
r1 = vmlaq_lane_f32(r1, mc1, xy1, 1);
r2 = vmlaq_lane_f32(r2, mc1, xy2, 1);
r3 = vmlaq_lane_f32(r3, mc1, xy3, 1);
// Multiply z by column 2 and add to result
r0 = vmlaq_lane_f32(r0, mc2, zw0, 0);
r1 = vmlaq_lane_f32(r1, mc2, zw1, 0);
r2 = vmlaq_lane_f32(r2, mc2, zw2, 0);
r3 = vmlaq_lane_f32(r3, mc2, zw3, 0);
// Add column 3
r0 = vaddq_f32(r0, mc3);
r1 = vaddq_f32(r1, mc3);
r2 = vaddq_f32(r2, mc3);
r3 = vaddq_f32(r3, mc3);
// Set color
r0 = vsetq_lane_f32(vget_lane_f32(zw0, 1), r0, 3);
r1 = vsetq_lane_f32(vget_lane_f32(zw1, 1), r1, 3);
r2 = vsetq_lane_f32(vget_lane_f32(zw2, 1), r2, 3);
r3 = vsetq_lane_f32(vget_lane_f32(zw3, 1), r3, 3);
// Store result
vst1q_f32(&dst[0].vertices.x, r0);
vst1_f32(&dst[0].texCoords.u, uv0);
vst1q_f32(&dst[1].vertices.x, r1);
vst1_f32(&dst[1].texCoords.u, uv1);
vst1q_f32(&dst[2].vertices.x, r2);
vst1_f32(&dst[2].texCoords.u, uv2);
vst1q_f32(&dst[3].vertices.x, r3);
vst1_f32(&dst[3].texCoords.u, uv3);
dst += 4;
src += 4;
}
// Process remaining vertices
while (dst < end)
{
// Load vertex
float32x2_t xy = vld1_f32(&src->vertices.x);
float32x2_t zw = vld1_f32(&src->vertices.z);
float32x2_t uv = vld1_f32(&src->texCoords.u);
// Multiply x by column 0
float32x4_t r = vmulq_lane_f32(mc0, xy, 0);
// Multiply y by column 1 and add to result
r = vmlaq_lane_f32(r, mc1, xy, 1);
// Multiply z by column 2 and add to result
r = vmlaq_lane_f32(r, mc2, zw, 0);
// Add column 3
r = vaddq_f32(r, mc3);
// Set color
r = vsetq_lane_f32(vget_lane_f32(zw, 1), r, 3);
// Store result
vst1q_f32(&dst->vertices.x, r);
vst1_f32(&dst->texCoords.u, uv);
++dst;
++src;
}
}
#endif
};
NS_AX_MATH_END