/****************************************************************************
 Copyright (c) 2010-2012 cocos2d-x.org
 Copyright (c) 2013-2017 Chukong Technologies
 Copyright (c) 2017-2018 Xiamen Yaji Software Co., Ltd.
 Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).

 https://axmol.dev/

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 ****************************************************************************/

NS_AX_MATH_BEGIN

#ifdef AX_SSE_INTRINSICS

struct MathUtilSSE
{
    // Adds a scalar to every element of a 4x4 matrix (stored as four __m128 columns).
    static void addMatrix(const __m128 m[4], float scalar, __m128 dst[4])
    {
        __m128 s = _mm_set1_ps(scalar);
        dst[0]   = _mm_add_ps(m[0], s);
        dst[1]   = _mm_add_ps(m[1], s);
        dst[2]   = _mm_add_ps(m[2], s);
        dst[3]   = _mm_add_ps(m[3], s);
    }

    static void addMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4])
    {
        dst[0] = _mm_add_ps(m1[0], m2[0]);
        dst[1] = _mm_add_ps(m1[1], m2[1]);
        dst[2] = _mm_add_ps(m1[2], m2[2]);
        dst[3] = _mm_add_ps(m1[3], m2[3]);
    }

    static void subtractMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4])
    {
        dst[0] = _mm_sub_ps(m1[0], m2[0]);
        dst[1] = _mm_sub_ps(m1[1], m2[1]);
        dst[2] = _mm_sub_ps(m1[2], m2[2]);
        dst[3] = _mm_sub_ps(m1[3], m2[3]);
    }

    static void multiplyMatrix(const __m128 m[4], float scalar, __m128 dst[4])
    {
        __m128 s = _mm_set1_ps(scalar);
        dst[0]   = _mm_mul_ps(m[0], s);
        dst[1]   = _mm_mul_ps(m[1], s);
        dst[2]   = _mm_mul_ps(m[2], s);
        dst[3]   = _mm_mul_ps(m[3], s);
    }

    // Column-major matrix product dst = m1 * m2: each element of a column of m2
    // is broadcast and accumulated against the corresponding column of m1.
    static void multiplyMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4])
    {
        __m128 dst0, dst1, dst2, dst3;
        {
            __m128 e0 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(0, 0, 0, 0));
            __m128 e1 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(1, 1, 1, 1));
            __m128 e2 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(2, 2, 2, 2));
            __m128 e3 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(3, 3, 3, 3));

            __m128 v0 = _mm_mul_ps(m1[0], e0);
            __m128 v1 = _mm_mul_ps(m1[1], e1);
            __m128 v2 = _mm_mul_ps(m1[2], e2);
            __m128 v3 = _mm_mul_ps(m1[3], e3);

            __m128 a0 = _mm_add_ps(v0, v1);
            __m128 a1 = _mm_add_ps(v2, v3);
            __m128 a2 = _mm_add_ps(a0, a1);

            dst0 = a2;
        }

        {
            __m128 e0 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(0, 0, 0, 0));
            __m128 e1 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(1, 1, 1, 1));
            __m128 e2 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(2, 2, 2, 2));
            __m128 e3 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(3, 3, 3, 3));

            __m128 v0 = _mm_mul_ps(m1[0], e0);
            __m128 v1 = _mm_mul_ps(m1[1], e1);
            __m128 v2 = _mm_mul_ps(m1[2], e2);
            __m128 v3 = _mm_mul_ps(m1[3], e3);

            __m128 a0 = _mm_add_ps(v0, v1);
            __m128 a1 = _mm_add_ps(v2, v3);
            __m128 a2 = _mm_add_ps(a0, a1);

            dst1 = a2;
        }
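        // Columns 2 and 3 of m2 are handled with the same
        // broadcast / multiply / accumulate pattern as columns 0 and 1 above.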
        {
            __m128 e0 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(0, 0, 0, 0));
            __m128 e1 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(1, 1, 1, 1));
            __m128 e2 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(2, 2, 2, 2));
            __m128 e3 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(3, 3, 3, 3));

            __m128 v0 = _mm_mul_ps(m1[0], e0);
            __m128 v1 = _mm_mul_ps(m1[1], e1);
            __m128 v2 = _mm_mul_ps(m1[2], e2);
            __m128 v3 = _mm_mul_ps(m1[3], e3);

            __m128 a0 = _mm_add_ps(v0, v1);
            __m128 a1 = _mm_add_ps(v2, v3);
            __m128 a2 = _mm_add_ps(a0, a1);

            dst2 = a2;
        }

        {
            __m128 e0 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(0, 0, 0, 0));
            __m128 e1 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(1, 1, 1, 1));
            __m128 e2 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(2, 2, 2, 2));
            __m128 e3 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(3, 3, 3, 3));

            __m128 v0 = _mm_mul_ps(m1[0], e0);
            __m128 v1 = _mm_mul_ps(m1[1], e1);
            __m128 v2 = _mm_mul_ps(m1[2], e2);
            __m128 v3 = _mm_mul_ps(m1[3], e3);

            __m128 a0 = _mm_add_ps(v0, v1);
            __m128 a1 = _mm_add_ps(v2, v3);
            __m128 a2 = _mm_add_ps(a0, a1);

            dst3 = a2;
        }

        dst[0] = dst0;
        dst[1] = dst1;
        dst[2] = dst2;
        dst[3] = dst3;
    }

    static void negateMatrix(const __m128 m[4], __m128 dst[4])
    {
        __m128 z = _mm_setzero_ps();
        dst[0]   = _mm_sub_ps(z, m[0]);
        dst[1]   = _mm_sub_ps(z, m[1]);
        dst[2]   = _mm_sub_ps(z, m[2]);
        dst[3]   = _mm_sub_ps(z, m[3]);
    }

    // 4x4 transpose via the classic two-stage shuffle (the same sequence as _MM_TRANSPOSE4_PS).
    static void transposeMatrix(const __m128 m[4], __m128 dst[4])
    {
        __m128 tmp0 = _mm_shuffle_ps(m[0], m[1], 0x44);
        __m128 tmp2 = _mm_shuffle_ps(m[0], m[1], 0xEE);
        __m128 tmp1 = _mm_shuffle_ps(m[2], m[3], 0x44);
        __m128 tmp3 = _mm_shuffle_ps(m[2], m[3], 0xEE);
        dst[0]      = _mm_shuffle_ps(tmp0, tmp1, 0x88);
        dst[1]      = _mm_shuffle_ps(tmp0, tmp1, 0xDD);
        dst[2]      = _mm_shuffle_ps(tmp2, tmp3, 0x88);
        dst[3]      = _mm_shuffle_ps(tmp2, tmp3, 0xDD);
    }

    static void transformVec4(const __m128 m[4], float x, float y, float z, float w, float* dst /*vec3*/)
    {
        //__m128 res = _mm_set_ps(w, z, y, x);
        //__m128 xx = _mm_shuffle_ps(res, res, _MM_SHUFFLE(0, 0, 0, 0));
        //__m128 yy = _mm_shuffle_ps(res, res, _MM_SHUFFLE(1, 1, 1, 1));
        //__m128 zz = _mm_shuffle_ps(res, res, _MM_SHUFFLE(2, 2, 2, 2));
        //__m128 ww = _mm_shuffle_ps(res, res, _MM_SHUFFLE(3, 3, 3, 3));
        __m128 xx = _mm_set1_ps(x);
        __m128 yy = _mm_set1_ps(y);
        __m128 zz = _mm_set1_ps(z);
        __m128 ww = _mm_set1_ps(w);

        auto res = _mm_add_ps(_mm_add_ps(_mm_mul_ps(m[0], xx), _mm_mul_ps(m[1], yy)),
                              _mm_add_ps(_mm_mul_ps(m[2], zz), _mm_mul_ps(m[3], ww)));

        // Store x and y, then extract z; only three floats are written to dst.
        _mm_storel_pi((__m64*)dst, res);
#    if defined(__SSE4_1__)
        *reinterpret_cast<int*>(dst + 2) = _mm_extract_ps(res, 2);
#    else
        dst[2] = _mm_cvtss_f32(_mm_movehl_ps(res, res));
#    endif
    }

    static void transformVec4(const __m128 m[4], const float* v /*vec4*/, float* dst /*vec4*/)
    {
        //__m128 res = _mm_loadu_ps(v);
        //__m128 xx = _mm_shuffle_ps(res, res, _MM_SHUFFLE(0, 0, 0, 0));
        //__m128 yy = _mm_shuffle_ps(res, res, _MM_SHUFFLE(1, 1, 1, 1));
        //__m128 zz = _mm_shuffle_ps(res, res, _MM_SHUFFLE(2, 2, 2, 2));
        //__m128 ww = _mm_shuffle_ps(res, res, _MM_SHUFFLE(3, 3, 3, 3));
        __m128 xx = _mm_set1_ps(v[0]);
        __m128 yy = _mm_set1_ps(v[1]);
        __m128 zz = _mm_set1_ps(v[2]);
        __m128 ww = _mm_set1_ps(v[3]);

        auto res = _mm_add_ps(_mm_add_ps(_mm_mul_ps(m[0], xx), _mm_mul_ps(m[1], yy)),
                              _mm_add_ps(_mm_mul_ps(m[2], zz), _mm_mul_ps(m[3], ww)));
        _mm_storeu_ps(dst, res);
    }

    static void crossVec3(const float* v1, const float* v2, float* dst)
    {
        __m128 a = _mm_set_ps(0.0f, v1[2], v1[1], v1[0]);
        __m128 b = _mm_set_ps(0.0f, v2[2], v2[1], v2[0]);

        __m128 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
        __m128 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
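        // (a * b.yzx - a.yzx * b) yields cross(a, b).zxy, so the final yzx
        // shuffle below rotates the result back into x, y, z order.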
        __m128 res = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b));
        res        = _mm_shuffle_ps(res, res, _MM_SHUFFLE(3, 0, 2, 1));

        _mm_storel_pi((__m64*)dst, res);
#    if defined(__SSE4_1__)
        *reinterpret_cast<int*>(dst + 2) = _mm_extract_ps(res, 2);
#    else
        dst[2] = _mm_cvtss_f32(_mm_movehl_ps(res, res));
#    endif
    }

    static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform)
    {
        auto& m = transform.col;
        for (size_t i = 0; i < count; ++i)
        {
            auto& vert = src[i].vertices;
            __m128 v   = _mm_set_ps(1.0f, vert.z, vert.y, vert.x);
            v          = _mm_add_ps(
                _mm_add_ps(_mm_mul_ps(m[0], _mm_shuffle_ps(v, v, 0)), _mm_mul_ps(m[1], _mm_shuffle_ps(v, v, 0x55))),
                _mm_add_ps(_mm_mul_ps(m[2], _mm_shuffle_ps(v, v, 0xaa)), _mm_mul_ps(m[3], _mm_shuffle_ps(v, v, 0xff))));
            // The 16-byte store spills 4 bytes past 'vertices' into 'colors';
            // the memcpy below rewrites that region from the source vertex.
            _mm_storeu_ps((float*)&dst[i].vertices, v);

            // Copy tex coords and colors
            // dst[i].texCoords = src[i].texCoords;
            // dst[i].colors = src[i].colors;
            memcpy(&dst[i].colors, &src[i].colors, sizeof(V3F_C4B_T2F::colors) + sizeof(V3F_C4B_T2F::texCoords));
        }
    }

    static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset)
    {
        __m128i offset_vector = _mm_set1_epi16(offset);
        size_t remainder      = count % 8;
        size_t rounded_count  = count - remainder;

        for (size_t i = 0; i < rounded_count; i += 8)
        {
            __m128i current_values = _mm_loadu_si128((__m128i*)(src + i));          // Load 8 values.
            current_values         = _mm_add_epi16(current_values, offset_vector);  // Add offset to them.
            _mm_storeu_si128((__m128i*)(dst + i), current_values);                  // Store the result.
        }

        // If count is not divisible by 8, add offset for the remainder elements one by one.
        for (size_t i = 0; i < remainder; ++i)
        {
            dst[rounded_count + i] = src[rounded_count + i] + offset;
        }
    }
};

#endif

NS_AX_MATH_END
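/*
 Usage sketch (illustrative only, not part of the engine API): transforming a
 single vec4 with the helpers above. It assumes NS_AX_MATH places this code in
 the ax namespace and that Mat4 exposes its four columns as __m128 through
 'col', as transformVertices() relies on.

     float in[4]  = {1.0f, 2.0f, 3.0f, 1.0f};
     float out[4] = {};
     ax::Mat4 t   = ax::Mat4::IDENTITY;
     ax::MathUtilSSE::transformVec4(t.col, in, out);  // out = t * in
*/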