// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2021 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//	 http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief Intrinsics for Armv7 NEON.
 *
 * This module implements Armv7-compatible versions of a few intrinsics that are
 * otherwise only available on Armv8, so astcenc can be compiled for Armv7.
 */

#ifndef ASTC_VECMATHLIB_NEON_ARMV7_4_H_INCLUDED
#define ASTC_VECMATHLIB_NEON_ARMV7_4_H_INCLUDED

#ifndef ASTCENC_SIMD_INLINE
	#error "Include astcenc_vecmathlib.h, do not include directly"
#endif

#include <algorithm>
#include <cfenv>


// arm-linux-gnueabi-gcc provides the following functions when
// #pragma GCC target ("fpu=neon-fp-armv8") is used, while clang does not.
#if defined(__clang__)

/**
 * @brief Return the lane-wise maximum of two vectors, preferring numbers.
 *
 * A lane that compares unequal to itself (i.e. holds a NaN) is replaced by
 * the matching lane of the other operand before the maximum is taken, so a
 * numeric value wins over a quiet NaN.
 */
ASTCENC_SIMD_INLINE float32x4_t vmaxnmq_f32(float32x4_t a, float32x4_t b)
{
	// Self-comparison is false only for NaN lanes
	const uint32x4_t a_is_number = vceqq_f32(a, a);
	const uint32x4_t b_is_number = vceqq_f32(b, b);

	// Patch NaN lanes of a with b, then NaN lanes of b with the patched a
	const float32x4_t lhs = vbslq_f32(a_is_number, a, b);
	const float32x4_t rhs = vbslq_f32(b_is_number, b, lhs);

	return vmaxq_f32(lhs, rhs);
}

/**
 * @brief Return the lane-wise minimum of two vectors, preferring numbers.
 *
 * A lane that compares unequal to itself (i.e. holds a NaN) is replaced by
 * the matching lane of the other operand before the minimum is taken, so a
 * numeric value wins over a quiet NaN.
 */
ASTCENC_SIMD_INLINE float32x4_t vminnmq_f32(float32x4_t a, float32x4_t b)
{
	// Self-comparison is false only for NaN lanes
	const uint32x4_t a_is_number = vceqq_f32(a, a);
	const uint32x4_t b_is_number = vceqq_f32(b, b);

	// Patch NaN lanes of a with b, then NaN lanes of b with the patched a
	const float32x4_t lhs = vbslq_f32(a_is_number, a, b);
	const float32x4_t rhs = vbslq_f32(b_is_number, b, lhs);

	return vminq_f32(lhs, rhs);
}

/**
 * @brief Return a vector with each lane rounded to an integral float value.
 *
 * Each lane is rounded with std::nearbyintf, which honours the current
 * floating-point rounding mode; the mode is asserted to be FE_TONEAREST.
 */
ASTCENC_SIMD_INLINE float32x4_t vrndnq_f32(float32x4_t a)
{
	// The scalar fallback only matches round-to-nearest in this mode
	assert(std::fegetround() == FE_TONEAREST);

	float32x4_t rounded {
		std::nearbyintf(vgetq_lane_f32(a, 0)),
		std::nearbyintf(vgetq_lane_f32(a, 1)),
		std::nearbyintf(vgetq_lane_f32(a, 2)),
		std::nearbyintf(vgetq_lane_f32(a, 3))
	};
	return rounded;
}

#endif

/**
 * @brief Return the largest of the four float lanes in a vector.
 *
 * The lanes are reduced pairwise: (0, 1) and (2, 3) first, then the two
 * partial results.
 */
ASTCENC_SIMD_INLINE float vmaxvq_f32(float32x4_t a)
{
	const float low_pair = std::max(vgetq_lane_f32(a, 0), vgetq_lane_f32(a, 1));
	const float high_pair = std::max(vgetq_lane_f32(a, 2), vgetq_lane_f32(a, 3));
	return std::max(low_pair, high_pair);
}

/**
 * @brief Return the smallest of the four float lanes in a vector.
 *
 * The lanes are reduced pairwise: (0, 1) and (2, 3) first, then the two
 * partial results.
 */
ASTCENC_SIMD_INLINE float vminvq_f32(float32x4_t a)
{
	const float low_pair = std::min(vgetq_lane_f32(a, 0), vgetq_lane_f32(a, 1));
	const float high_pair = std::min(vgetq_lane_f32(a, 2), vgetq_lane_f32(a, 3));
	return std::min(low_pair, high_pair);
}

/**
 * @brief Return the largest of the four signed 32-bit lanes in a vector.
 *
 * The lanes are reduced pairwise: (0, 1) and (2, 3) first, then the two
 * partial results.
 */
ASTCENC_SIMD_INLINE int32_t vmaxvq_s32(int32x4_t a)
{
	const int32_t low_pair = std::max(vgetq_lane_s32(a, 0), vgetq_lane_s32(a, 1));
	const int32_t high_pair = std::max(vgetq_lane_s32(a, 2), vgetq_lane_s32(a, 3));
	return std::max(low_pair, high_pair);
}

/**
 * @brief Return the smallest of the four signed 32-bit lanes in a vector.
 *
 * The lanes are reduced pairwise: (0, 1) and (2, 3) first, then the two
 * partial results.
 */
ASTCENC_SIMD_INLINE int32_t vminvq_s32(int32x4_t a)
{
	const int32_t low_pair = std::min(vgetq_lane_s32(a, 0), vgetq_lane_s32(a, 1));
	const int32_t high_pair = std::min(vgetq_lane_s32(a, 2), vgetq_lane_s32(a, 3));
	return std::min(low_pair, high_pair);
}

/**
 * @brief Return a vector holding the square root of each lane of the input.
 *
 * Each lane is extracted and passed through std::sqrt individually.
 */
ASTCENC_SIMD_INLINE float32x4_t vsqrtq_f32(float32x4_t a)
{
	const float lane0 = vgetq_lane_f32(a, 0);
	const float lane1 = vgetq_lane_f32(a, 1);
	const float lane2 = vgetq_lane_f32(a, 2);
	const float lane3 = vgetq_lane_f32(a, 3);

	float32x4_t roots {
		std::sqrt(lane0),
		std::sqrt(lane1),
		std::sqrt(lane2),
		std::sqrt(lane3)
	};
	return roots;
}

/**
 * @brief Lane-wise division of one vector by another.
 *
 * Each result lane is the matching lane of @c a divided by the matching lane
 * of @c b, computed with scalar float division.
 */
ASTCENC_SIMD_INLINE float32x4_t vdivq_f32(float32x4_t a, float32x4_t b)
{
	const float q0 = vgetq_lane_f32(a, 0) / vgetq_lane_f32(b, 0);
	const float q1 = vgetq_lane_f32(a, 1) / vgetq_lane_f32(b, 1);
	const float q2 = vgetq_lane_f32(a, 2) / vgetq_lane_f32(b, 2);
	const float q3 = vgetq_lane_f32(a, 3) / vgetq_lane_f32(b, 3);

	float32x4_t quotient { q0, q1, q2, q3 };
	return quotient;
}

/**
 * @brief 16-entry byte table lookup.
 *
 * The 128-bit table @c t is split into two 64-bit halves and looked up with
 * vtbl2_s8, once for the low eight indices and once for the high eight; the
 * two 64-bit results are then joined back into one 128-bit vector.
 */
ASTCENC_SIMD_INLINE int8x16_t vqtbl1q_s8(int8x16_t t, uint8x16_t idx)
{
	// vtbl2_s8 takes signed index vectors, so reinterpret the bits
	const int8x16_t indices = vreinterpretq_s8_u8(idx);

	int8x8x2_t halves;
	halves.val[0] = vget_low_s8(t);
	halves.val[1] = vget_high_s8(t);

	const int8x8_t low_result = vtbl2_s8(halves, vget_low_s8(indices));
	const int8x8_t high_result = vtbl2_s8(halves, vget_high_s8(indices));
	return vcombine_s8(low_result, high_result);
}

/**
 * @brief Return the sum of the four unsigned 32-bit lanes in a vector.
 *
 * The sum is accumulated in a uint32_t, so it wraps modulo 2^32.
 */
ASTCENC_SIMD_INLINE uint32_t vaddvq_u32(uint32x4_t a)
{
	uint32_t total = vgetq_lane_u32(a, 0);
	total += vgetq_lane_u32(a, 1);
	total += vgetq_lane_u32(a, 2);
	total += vgetq_lane_u32(a, 3);
	return total;
}

#endif