mirror of https://github.com/axmolengine/axmol.git
187 lines
5.2 KiB
C++
187 lines
5.2 KiB
C++
// SPDX-License-Identifier: Apache-2.0
|
|
// ----------------------------------------------------------------------------
|
|
// Copyright 2021 Arm Limited
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
|
// use this file except in compliance with the License. You may obtain a copy
|
|
// of the License at:
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
// License for the specific language governing permissions and limitations
|
|
// under the License.
|
|
// ----------------------------------------------------------------------------
|
|
|
|
/**
|
|
* @brief Intrinsics for Armv7 NEON.
|
|
*
|
|
* This module implements a few Armv7-compatible intrinsics identical to Armv8
|
|
* ones. Thus, astcenc can be compiled using Armv7 architecture.
|
|
*/
|
|
|
|
#ifndef ASTC_VECMATHLIB_NEON_ARMV7_4_H_INCLUDED
|
|
#define ASTC_VECMATHLIB_NEON_ARMV7_4_H_INCLUDED
|
|
|
|
#ifndef ASTCENC_SIMD_INLINE
|
|
#error "Include astcenc_vecmathlib.h, do not include directly"
|
|
#endif
|
|
|
|
#include <algorithm>
|
|
#include <cfenv>
|
|
|
|
|
|
// arm-linux-gnueabi-gcc contains the following functions by using
|
|
// #pragma GCC target ("fpu=neon-fp-armv8"), while clang does not.
|
|
#if defined(__clang__)
|
|
|
|
/**
|
|
* @brief Return the max vector of two vectors.
|
|
*
|
|
* If one vector element is numeric and the other is a quiet NaN,
|
|
* the result placed in the vector is the numerical value.
|
|
*/
|
|
ASTCENC_SIMD_INLINE float32x4_t vmaxnmq_f32(float32x4_t a, float32x4_t b)
|
|
{
|
|
uint32x4_t amask = vceqq_f32(a, a);
|
|
uint32x4_t bmask = vceqq_f32(b, b);
|
|
a = vbslq_f32(amask, a, b);
|
|
b = vbslq_f32(bmask, b, a);
|
|
return vmaxq_f32(a, b);
|
|
}
|
|
|
|
/**
|
|
* @brief Return the min vector of two vectors.
|
|
*
|
|
* If one vector element is numeric and the other is a quiet NaN,
|
|
* the result placed in the vector is the numerical value.
|
|
*/
|
|
ASTCENC_SIMD_INLINE float32x4_t vminnmq_f32(float32x4_t a, float32x4_t b)
|
|
{
|
|
uint32x4_t amask = vceqq_f32(a, a);
|
|
uint32x4_t bmask = vceqq_f32(b, b);
|
|
a = vbslq_f32(amask, a, b);
|
|
b = vbslq_f32(bmask, b, a);
|
|
return vminq_f32(a, b);
|
|
}
|
|
|
|
/**
|
|
* @brief Return a float rounded to the nearest integer value.
|
|
*/
|
|
ASTCENC_SIMD_INLINE float32x4_t vrndnq_f32(float32x4_t a)
|
|
{
|
|
assert(std::fegetround() == FE_TONEAREST);
|
|
float a0 = std::nearbyintf(vgetq_lane_f32(a, 0));
|
|
float a1 = std::nearbyintf(vgetq_lane_f32(a, 1));
|
|
float a2 = std::nearbyintf(vgetq_lane_f32(a, 2));
|
|
float a3 = std::nearbyintf(vgetq_lane_f32(a, 3));
|
|
float32x4_t c { a0, a1, a2, a3 };
|
|
return c;
|
|
}
|
|
|
|
#endif
|
|
|
|
/**
|
|
* @brief Return the horizontal maximum of a vector.
|
|
*/
|
|
ASTCENC_SIMD_INLINE float vmaxvq_f32(float32x4_t a)
|
|
{
|
|
float a0 = vgetq_lane_f32(a, 0);
|
|
float a1 = vgetq_lane_f32(a, 1);
|
|
float a2 = vgetq_lane_f32(a, 2);
|
|
float a3 = vgetq_lane_f32(a, 3);
|
|
return std::max(std::max(a0, a1), std::max(a2, a3));
|
|
}
|
|
|
|
/**
|
|
* @brief Return the horizontal maximum of a vector.
|
|
*/
|
|
ASTCENC_SIMD_INLINE float vminvq_f32(float32x4_t a)
|
|
{
|
|
float a0 = vgetq_lane_f32(a, 0);
|
|
float a1 = vgetq_lane_f32(a, 1);
|
|
float a2 = vgetq_lane_f32(a, 2);
|
|
float a3 = vgetq_lane_f32(a, 3);
|
|
return std::min(std::min(a0, a1), std::min(a2, a3));
|
|
}
|
|
|
|
/**
|
|
* @brief Return the horizontal maximum of a vector.
|
|
*/
|
|
ASTCENC_SIMD_INLINE int32_t vmaxvq_s32(int32x4_t a)
|
|
{
|
|
int32_t a0 = vgetq_lane_s32(a, 0);
|
|
int32_t a1 = vgetq_lane_s32(a, 1);
|
|
int32_t a2 = vgetq_lane_s32(a, 2);
|
|
int32_t a3 = vgetq_lane_s32(a, 3);
|
|
return std::max(std::max(a0, a1), std::max(a2, a3));
|
|
}
|
|
|
|
/**
|
|
* @brief Return the horizontal maximum of a vector.
|
|
*/
|
|
ASTCENC_SIMD_INLINE int32_t vminvq_s32(int32x4_t a)
|
|
{
|
|
int32_t a0 = vgetq_lane_s32(a, 0);
|
|
int32_t a1 = vgetq_lane_s32(a, 1);
|
|
int32_t a2 = vgetq_lane_s32(a, 2);
|
|
int32_t a3 = vgetq_lane_s32(a, 3);
|
|
return std::min(std::min(a0, a1), std::min(a2, a3));
|
|
}
|
|
|
|
/**
|
|
* @brief Return the sqrt of the lanes in the vector.
|
|
*/
|
|
ASTCENC_SIMD_INLINE float32x4_t vsqrtq_f32(float32x4_t a)
|
|
{
|
|
float a0 = std::sqrt(vgetq_lane_f32(a, 0));
|
|
float a1 = std::sqrt(vgetq_lane_f32(a, 1));
|
|
float a2 = std::sqrt(vgetq_lane_f32(a, 2));
|
|
float a3 = std::sqrt(vgetq_lane_f32(a, 3));
|
|
float32x4_t c { a0, a1, a2, a3 };
|
|
return c;
|
|
}
|
|
|
|
/**
|
|
* @brief Vector by vector division.
|
|
*/
|
|
ASTCENC_SIMD_INLINE float32x4_t vdivq_f32(float32x4_t a, float32x4_t b)
|
|
{
|
|
float a0 = vgetq_lane_f32(a, 0), b0 = vgetq_lane_f32(b, 0);
|
|
float a1 = vgetq_lane_f32(a, 1), b1 = vgetq_lane_f32(b, 1);
|
|
float a2 = vgetq_lane_f32(a, 2), b2 = vgetq_lane_f32(b, 2);
|
|
float a3 = vgetq_lane_f32(a, 3), b3 = vgetq_lane_f32(b, 3);
|
|
float32x4_t c { a0 / b0, a1 / b1, a2 / b2, a3 / b3 };
|
|
return c;
|
|
}
|
|
|
|
/**
|
|
* @brief Table vector lookup.
|
|
*/
|
|
ASTCENC_SIMD_INLINE int8x16_t vqtbl1q_s8(int8x16_t t, uint8x16_t idx)
|
|
{
|
|
int8x8x2_t tab;
|
|
tab.val[0] = vget_low_s8(t);
|
|
tab.val[1] = vget_high_s8(t);
|
|
int8x16_t id = vreinterpretq_s8_u8(idx);
|
|
return vcombine_s8(
|
|
vtbl2_s8(tab, vget_low_s8(id)),
|
|
vtbl2_s8(tab, vget_high_s8(id)));
|
|
}
|
|
|
|
/**
|
|
* @brief Horizontal integer addition.
|
|
*/
|
|
ASTCENC_SIMD_INLINE uint32_t vaddvq_u32(uint32x4_t a)
|
|
{
|
|
uint32_t a0 = vgetq_lane_u32(a, 0);
|
|
uint32_t a1 = vgetq_lane_u32(a, 1);
|
|
uint32_t a2 = vgetq_lane_u32(a, 2);
|
|
uint32_t a3 = vgetq_lane_u32(a, 3);
|
|
return a0 + a1 + a2 + a3;
|
|
}
|
|
|
|
#endif
|