axmol/extensions/Effekseer/Effekseer/Effekseer.SIMD.h


#pragma once
#include "Effekseer.h"
#ifndef __EFFEKSEER_SIMD_BASE_H__
#define __EFFEKSEER_SIMD_BASE_H__
#include <cstdint>
#include <cmath>
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
// ARMv7/ARM64 NEON
#define EFK_SIMD_NEON
#if defined(_M_ARM64) || defined(__aarch64__)
#define EFK_SIMD_NEON_ARM64
#endif
#include <arm_neon.h>
#elif (defined(_M_AMD64) || defined(_M_X64)) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(__SSE2__)
// x86/x86-64 SSE2/AVX2
#define EFK_SIMD_SSE2
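// Each level in the chain below also defines every lower level, so later code can test a single
// EFK_SIMD_* macro without re-checking compiler flags.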
#if defined(__AVX2__)
#define EFK_SIMD_AVX2
#endif
#if defined(__AVX__) || defined(EFK_SIMD_AVX2)
#define EFK_SIMD_AVX
#endif
#if defined(__SSE4_2__) || defined(EFK_SIMD_AVX)
#define EFK_SIMD_SSE4_2
#endif
#if defined(__SSE4_1__) || defined(EFK_SIMD_SSE4_2)
#define EFK_SIMD_SSE4_1
#endif
#if defined(__SSSE3__) || defined(EFK_SIMD_SSE4_1)
#define EFK_SIMD_SSSE3
#endif
#if defined(__SSE3__) || defined(EFK_SIMD_SSSE3)
#define EFK_SIMD_SSE3
#endif
#if defined(EFK_SIMD_AVX) || defined(EFK_SIMD_AVX2)
#include <immintrin.h>
#elif defined(EFK_SIMD_SSE4_2)
#include <nmmintrin.h>
#elif defined(EFK_SIMD_SSE4_1)
#include <smmintrin.h>
#elif defined(EFK_SIMD_SSSE3)
#include <tmmintrin.h>
#elif defined(EFK_SIMD_SSE3)
#include <pmmintrin.h>
#elif defined(EFK_SIMD_SSE2)
#include <emmintrin.h>
#endif
#else
// C++ Generic Implementation (Pseudo SIMD)
#define EFK_SIMD_GEN
#endif
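// Default tolerance shared by NearEqual() and IsZero() across all backends.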
const float DefaultEpsilon = 1e-6f;
#endif // __EFFEKSEER_SIMD_BASE_H__
#ifndef __EFFEKSEER_SIMD_FLOAT4_GEN_H__
#define __EFFEKSEER_SIMD_FLOAT4_GEN_H__
#if defined(EFK_SIMD_GEN)
#include <cstring>
#include <algorithm>
namespace Effekseer
{
namespace SIMD
{
inline float Sqrt(float x)
{
return std::sqrt(x);
}
inline float Rsqrt(float x)
{
return 1.0f / std::sqrt(x);
}
struct Int4;
/**
@brief SIMD vector class (generic scalar implementation)
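@note Scalar fallback used when no SIMD instruction set is detected (EFK_SIMD_GEN). The union
exposes the same 128-bit value as floats, signed integers, or raw bits, and the interface
matches the NEON and SSE specializations below. A minimal usage sketch (src, dst and scale
are placeholder variables):
  Float4 v = Float4::Load4(src);
  v = Float4::MulAdd(v, v, Float4(scale)); // v + v * scale
  Float4::Store4(dst, v);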
*/
struct alignas(16) Float4
{
union {
float vf[4];
int32_t vi[4];
uint32_t vu[4];
};
Float4() = default;
Float4(const Float4& rhs) = default;
Float4(float x, float y, float z, float w) { vf[0] = x; vf[1] = y; vf[2] = z; vf[3] = w; }
Float4(float i) { vf[0] = i; vf[1] = i; vf[2] = i; vf[3] = i; }
float GetX() const { return vf[0]; }
float GetY() const { return vf[1]; }
float GetZ() const { return vf[2]; }
float GetW() const { return vf[3]; }
void SetX(float o) { vf[0] = o; }
void SetY(float o) { vf[1] = o; }
void SetZ(float o) { vf[2] = o; }
void SetW(float o) { vf[3] = o; }
template <size_t LANE>
Float4 Dup() { return Float4(vf[LANE], vf[LANE], vf[LANE], vf[LANE]); }
Int4 Convert4i() const;
Int4 Cast4i() const;
Float4& operator+=(const Float4& rhs)
{
for (size_t i = 0; i < 4; i++)
{
vf[i] += rhs.vf[i];
}
return *this;
}
Float4& operator-=(const Float4& rhs)
{
for (size_t i = 0; i < 4; i++)
{
vf[i] -= rhs.vf[i];
}
return *this;
}
Float4& operator*=(const Float4& rhs)
{
for (size_t i = 0; i < 4; i++)
{
vf[i] *= rhs.vf[i];
}
return *this;
}
Float4& operator*=(float rhs)
{
for (size_t i = 0; i < 4; i++)
{
vf[i] *= rhs;
}
return *this;
}
Float4& operator/=(const Float4& rhs)
{
for (size_t i = 0; i < 4; i++)
{
vf[i] /= rhs.vf[i];
}
return *this;
}
Float4& operator/=(float rhs)
{
for (size_t i = 0; i < 4; i++)
{
vf[i] /= rhs;
}
return *this;
}
static Float4 Load2(const void* mem);
static void Store2(void* mem, const Float4& i);
static Float4 Load3(const void* mem);
static void Store3(void* mem, const Float4& i);
static Float4 Load4(const void* mem);
static void Store4(void* mem, const Float4& i);
static Float4 SetZero();
static Float4 SetInt(int32_t x, int32_t y, int32_t z, int32_t w);
static Float4 SetUInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w);
static Float4 Sqrt(const Float4& in);
static Float4 Rsqrt(const Float4& in);
static Float4 Abs(const Float4& in);
static Float4 Min(const Float4& lhs, const Float4& rhs);
static Float4 Max(const Float4& lhs, const Float4& rhs);
static Float4 Floor(const Float4& in);
static Float4 Ceil(const Float4& in);
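// MulAdd returns a + b * c and MulSub returns a - b * c; the accumulator is always the first argument.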
static Float4 MulAdd(const Float4& a, const Float4& b, const Float4& c);
static Float4 MulSub(const Float4& a, const Float4& b, const Float4& c);
template<size_t LANE>
static Float4 MulLane(const Float4& lhs, const Float4& rhs);
template<size_t LANE>
static Float4 MulAddLane(const Float4& a, const Float4& b, const Float4& c);
template<size_t LANE>
static Float4 MulSubLane(const Float4& a, const Float4& b, const Float4& c);
template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
static Float4 Swizzle(const Float4& in);
static Float4 Dot3(const Float4& lhs, const Float4& rhs);
static Float4 Cross3(const Float4& lhs, const Float4& rhs);
template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
static Float4 Mask();
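// MoveMask packs a comparison mask into a 4-bit integer: bit n is set when lane n is selected.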
static uint32_t MoveMask(const Float4& in);
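// Select blends per lane: (mask & sel1) | (~mask & sel2), so all-ones lanes of the mask take sel1.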
static Float4 Select(const Float4& mask, const Float4& sel1, const Float4& sel2);
static Float4 Equal(const Float4& lhs, const Float4& rhs);
static Float4 NotEqual(const Float4& lhs, const Float4& rhs);
static Float4 LessThan(const Float4& lhs, const Float4& rhs);
static Float4 LessEqual(const Float4& lhs, const Float4& rhs);
static Float4 GreaterThan(const Float4& lhs, const Float4& rhs);
static Float4 GreaterEqual(const Float4& lhs, const Float4& rhs);
static Float4 NearEqual(const Float4& lhs, const Float4& rhs, float epsilon = DefaultEpsilon);
static Float4 IsZero(const Float4& in, float epsilon = DefaultEpsilon);
static void Transpose(Float4& s0, Float4& s1, Float4& s2, Float4& s3);
};
inline Float4 operator+(const Float4& lhs, const Float4& rhs)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vf[i] = lhs.vf[i] + rhs.vf[i];
}
return ret;
}
inline Float4 operator-(const Float4& lhs, const Float4& rhs)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vf[i] = lhs.vf[i] - rhs.vf[i];
}
return ret;
}
inline Float4 operator*(const Float4& lhs, const Float4& rhs)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vf[i] = lhs.vf[i] * rhs.vf[i];
}
return ret;
}
inline Float4 operator*(const Float4& lhs, float rhs)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vf[i] = lhs.vf[i] * rhs;
}
return ret;
}
inline Float4 operator/(const Float4& lhs, const Float4& rhs)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vf[i] = lhs.vf[i] / rhs.vf[i];
}
return ret;
}
inline Float4 operator/(const Float4& lhs, float rhs)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vf[i] = lhs.vf[i] / rhs;
}
return ret;
}
inline Float4 operator&(const Float4& lhs, const Float4& rhs)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = lhs.vu[i] & rhs.vu[i];
}
return ret;
}
inline Float4 operator|(const Float4& lhs, const Float4& rhs)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = lhs.vu[i] | rhs.vu[i];
}
return ret;
}
inline Float4 operator^(const Float4& lhs, const Float4& rhs)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = lhs.vu[i] ^ rhs.vu[i];
}
return ret;
}
inline bool operator==(const Float4& lhs, const Float4& rhs)
{
bool ret = true;
for (size_t i = 0; i < 4; i++)
{
ret &= lhs.vf[i] == rhs.vf[i];
}
return ret;
}
inline bool operator!=(const Float4& lhs, const Float4& rhs)
{
return !(lhs == rhs);
}
inline Float4 Float4::Load2(const void* mem)
{
Float4 ret;
memcpy(ret.vf, mem, sizeof(float) * 2);
// This code causes bugs in asmjs
// ret.vf[0] = *((float*)mem + 0);
// ret.vf[1] = *((float*)mem + 1);
return ret;
}
inline void Float4::Store2(void* mem, const Float4& i)
{
memcpy(mem, i.vf, sizeof(float) * 2);
// This code causes bugs in asmjs
// *((float*)mem + 0) = i.vf[0];
// *((float*)mem + 1) = i.vf[1];
}
inline Float4 Float4::Load3(const void* mem)
{
Float4 ret;
memcpy(ret.vf, mem, sizeof(float) * 3);
// This code causes bugs in asmjs
// ret.vf[0] = *((float*)mem + 0);
// ret.vf[1] = *((float*)mem + 1);
// ret.vf[2] = *((float*)mem + 2);
return ret;
}
inline void Float4::Store3(void* mem, const Float4& i)
{
memcpy(mem, i.vf, sizeof(float) * 3);
// This code causes bugs in asmjs
// *((float*)mem + 0) = i.vf[0];
// *((float*)mem + 1) = i.vf[1];
// *((float*)mem + 2) = i.vf[2];
}
inline Float4 Float4::Load4(const void* mem)
{
Float4 ret;
memcpy(ret.vf, mem, sizeof(float) * 4);
// This code causes bugs in emscripten
// ret.vf[0] = *((float*)mem + 0);
// ret.vf[1] = *((float*)mem + 1);
// ret.vf[2] = *((float*)mem + 2);
// ret.vf[3] = *((float*)mem + 3);
return ret;
}
inline void Float4::Store4(void* mem, const Float4& i)
{
memcpy(mem, i.vf, sizeof(float) * 4);
// This code causes bugs in asmjs
// *((float*)mem + 0) = i.vf[0];
// *((float*)mem + 1) = i.vf[1];
// *((float*)mem + 2) = i.vf[2];
// *((float*)mem + 3) = i.vf[3];
}
inline Float4 Float4::SetZero()
{
Float4 ret;
ret.vf[0] = 0.0f;
ret.vf[1] = 0.0f;
ret.vf[2] = 0.0f;
ret.vf[3] = 0.0f;
return ret;
}
inline Float4 Float4::SetInt(int32_t x, int32_t y, int32_t z, int32_t w)
{
Float4 ret;
ret.vu[0] = (uint32_t)x;
ret.vu[1] = (uint32_t)y;
ret.vu[2] = (uint32_t)z;
ret.vu[3] = (uint32_t)w;
return ret;
}
inline Float4 Float4::SetUInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w)
{
Float4 ret;
ret.vu[0] = (uint32_t)x;
ret.vu[1] = (uint32_t)y;
ret.vu[2] = (uint32_t)z;
ret.vu[3] = (uint32_t)w;
return ret;
}
inline Float4 Float4::Sqrt(const Float4& in)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vf[i] = std::sqrt(in.vf[i]);
}
return ret;
}
inline Float4 Float4::Rsqrt(const Float4& in)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vf[i] = 1.0f / std::sqrt(in.vf[i]);
}
return ret;
}
inline Float4 Float4::Abs(const Float4& in)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vf[i] = std::abs(in.vf[i]);
}
return ret;
}
inline Float4 Float4::Min(const Float4& lhs, const Float4& rhs)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vf[i] = std::fmin(lhs.vf[i], rhs.vf[i]);
}
return ret;
}
inline Float4 Float4::Max(const Float4& lhs, const Float4& rhs)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vf[i] = std::fmax(lhs.vf[i], rhs.vf[i]);
}
return ret;
}
inline Float4 Float4::Floor(const Float4& in)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vf[i] = std::floor(in.vf[i]);
}
return ret;
}
inline Float4 Float4::Ceil(const Float4& in)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vf[i] = std::ceil(in.vf[i]);
}
return ret;
}
inline Float4 Float4::MulAdd(const Float4& a, const Float4& b, const Float4& c)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vf[i] = a.vf[i] + b.vf[i] * c.vf[i];
}
return ret;
}
inline Float4 Float4::MulSub(const Float4& a, const Float4& b, const Float4& c)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vf[i] = a.vf[i] - b.vf[i] * c.vf[i];
}
return ret;
}
inline Float4 Float4::Dot3(const Float4& lhs, const Float4& rhs)
{
Float4 muled = lhs * rhs;
return Float4{muled.vf[0] + muled.vf[1] + muled.vf[2], 0.0f, 0.0f, 0.0f};
}
inline Float4 Float4::Cross3(const Float4& lhs, const Float4& rhs)
{
return Float4::Swizzle<1,2,0,3>(lhs) * Float4::Swizzle<2,0,1,3>(rhs) -
Float4::Swizzle<2,0,1,3>(lhs) * Float4::Swizzle<1,2,0,3>(rhs);
}
template<size_t LANE>
Float4 Float4::MulLane(const Float4& lhs, const Float4& rhs)
{
static_assert(LANE < 4, "LANE is must be less than 4.");
return lhs * rhs.vf[LANE];
}
template<size_t LANE>
Float4 Float4::MulAddLane(const Float4& a, const Float4& b, const Float4& c)
{
static_assert(LANE < 4, "LANE is must be less than 4.");
return a + b * c.vf[LANE];
}
template<size_t LANE>
Float4 Float4::MulSubLane(const Float4& a, const Float4& b, const Float4& c)
{
static_assert(LANE < 4, "LANE is must be less than 4.");
return a - b * c.vf[LANE];
}
template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
Float4 Float4::Swizzle(const Float4& in)
{
static_assert(indexX < 4, "indexX is must be less than 4.");
static_assert(indexY < 4, "indexY is must be less than 4.");
static_assert(indexZ < 4, "indexZ is must be less than 4.");
static_assert(indexW < 4, "indexW is must be less than 4.");
return Float4{in.vf[indexX], in.vf[indexY], in.vf[indexZ], in.vf[indexW]};
}
template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
Float4 Float4::Mask()
{
static_assert(X < 2, "X must be 0 or 1.");
static_assert(Y < 2, "Y must be 0 or 1.");
static_assert(Z < 2, "Z must be 0 or 1.");
static_assert(W < 2, "W must be 0 or 1.");
Float4 ret;
ret.vu[0] = 0xffffffff * X;
ret.vu[1] = 0xffffffff * Y;
ret.vu[2] = 0xffffffff * Z;
ret.vu[3] = 0xffffffff * W;
return ret;
}
inline uint32_t Float4::MoveMask(const Float4& in)
{
return (in.vu[0] & 0x1) | (in.vu[1] & 0x2) | (in.vu[2] & 0x4) | (in.vu[3] & 0x8);
}
inline Float4 Float4::Select(const Float4& mask, const Float4& sel1, const Float4& sel2)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = (mask.vu[i] & sel1.vu[i]) | (~mask.vu[i] & sel2.vu[i]);
}
return ret;
}
inline Float4 Float4::Equal(const Float4& lhs, const Float4& rhs)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = (lhs.vf[i] == rhs.vf[i]) ? 0xffffffff : 0;
}
return ret;
}
inline Float4 Float4::NotEqual(const Float4& lhs, const Float4& rhs)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = (lhs.vf[i] != rhs.vf[i]) ? 0xffffffff : 0;
}
return ret;
}
inline Float4 Float4::LessThan(const Float4& lhs, const Float4& rhs)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = (lhs.vf[i] < rhs.vf[i]) ? 0xffffffff : 0;
}
return ret;
}
inline Float4 Float4::LessEqual(const Float4& lhs, const Float4& rhs)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = (lhs.vf[i] <= rhs.vf[i]) ? 0xffffffff : 0;
}
return ret;
}
inline Float4 Float4::GreaterThan(const Float4& lhs, const Float4& rhs)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = (lhs.vf[i] > rhs.vf[i]) ? 0xffffffff : 0;
}
return ret;
}
inline Float4 Float4::GreaterEqual(const Float4& lhs, const Float4& rhs)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = (lhs.vf[i] >= rhs.vf[i]) ? 0xffffffff : 0;
}
return ret;
}
inline Float4 Float4::NearEqual(const Float4& lhs, const Float4& rhs, float epsilon)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = (std::abs(lhs.vf[i] - rhs.vf[i]) <= epsilon) ? 0xffffffff : 0;
}
return ret;
}
inline Float4 Float4::IsZero(const Float4& in, float epsilon)
{
Float4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = (std::abs(in.vf[i]) <= epsilon) ? 0xffffffff : 0;
}
return ret;
}
inline void Float4::Transpose(Float4& s0, Float4& s1, Float4& s2, Float4& s3)
{
std::swap(s0.vf[1], s1.vf[0]);
std::swap(s0.vf[2], s2.vf[0]);
std::swap(s0.vf[3], s3.vf[0]);
std::swap(s1.vf[2], s2.vf[1]);
std::swap(s2.vf[3], s3.vf[2]);
std::swap(s1.vf[3], s3.vf[1]);
}
} // namespace SIMD
} // namespace Effekseer
#endif // defined(EFK_SIMD_GEN)
#endif // __EFFEKSEER_SIMD_FLOAT4_GEN_H__
#ifndef __EFFEKSEER_SIMD_FLOAT4_NEON_H__
#define __EFFEKSEER_SIMD_FLOAT4_NEON_H__
#if defined(EFK_SIMD_NEON)
namespace Effekseer
{
namespace SIMD
{
inline float Sqrt(float x)
{
return std::sqrt(x);
}
inline float Rsqrt(float x)
{
return 1.0f / std::sqrt(x);
}
struct Int4;
/**
@brief SIMD vector class (NEON implementation)
*/
struct alignas(16) Float4
{
float32x4_t s;
Float4() = default;
Float4(const Float4& rhs) = default;
Float4(float32x4_t rhs) { s = rhs; }
Float4(uint32x4_t rhs) { s = vreinterpretq_f32_u32(rhs); }
Float4(float x, float y, float z, float w) { const float f[4] = {x, y, z, w}; s = vld1q_f32(f); }
Float4(float i) { s = vdupq_n_f32(i); }
float GetX() const { return vgetq_lane_f32(s, 0); }
float GetY() const { return vgetq_lane_f32(s, 1); }
float GetZ() const { return vgetq_lane_f32(s, 2); }
float GetW() const { return vgetq_lane_f32(s, 3); }
void SetX(float i) { s = vsetq_lane_f32(i, s, 0); }
void SetY(float i) { s = vsetq_lane_f32(i, s, 1); }
void SetZ(float i) { s = vsetq_lane_f32(i, s, 2); }
void SetW(float i) { s = vsetq_lane_f32(i, s, 3); }
template <size_t LANE>
Float4 Dup();
Int4 Convert4i() const;
Int4 Cast4i() const;
Float4& operator+=(const Float4& rhs);
Float4& operator-=(const Float4& rhs);
Float4& operator*=(const Float4& rhs);
Float4& operator*=(float rhs);
Float4& operator/=(const Float4& rhs);
Float4& operator/=(float rhs);
static Float4 Load2(const void* mem);
static void Store2(void* mem, const Float4& i);
static Float4 Load3(const void* mem);
static void Store3(void* mem, const Float4& i);
static Float4 Load4(const void* mem);
static void Store4(void* mem, const Float4& i);
static Float4 SetZero();
static Float4 SetInt(int32_t x, int32_t y, int32_t z, int32_t w);
static Float4 SetUInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w);
static Float4 Sqrt(const Float4& in);
static Float4 Rsqrt(const Float4& in);
static Float4 Abs(const Float4& in);
static Float4 Min(const Float4& lhs, const Float4& rhs);
static Float4 Max(const Float4& lhs, const Float4& rhs);
static Float4 Floor(const Float4& in);
static Float4 Ceil(const Float4& in);
static Float4 MulAdd(const Float4& a, const Float4& b, const Float4& c);
static Float4 MulSub(const Float4& a, const Float4& b, const Float4& c);
template<size_t LANE>
static Float4 MulLane(const Float4& lhs, const Float4& rhs);
template<size_t LANE>
static Float4 MulAddLane(const Float4& a, const Float4& b, const Float4& c);
template<size_t LANE>
static Float4 MulSubLane(const Float4& a, const Float4& b, const Float4& c);
template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
static Float4 Swizzle(const Float4& v);
static Float4 Dot3(const Float4& lhs, const Float4& rhs);
static Float4 Cross3(const Float4& lhs, const Float4& rhs);
template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
static Float4 Mask();
static uint32_t MoveMask(const Float4& in);
static Float4 Select(const Float4& mask, const Float4& sel1, const Float4& sel2);
static Float4 Equal(const Float4& lhs, const Float4& rhs);
static Float4 NotEqual(const Float4& lhs, const Float4& rhs);
static Float4 LessThan(const Float4& lhs, const Float4& rhs);
static Float4 LessEqual(const Float4& lhs, const Float4& rhs);
static Float4 GreaterThan(const Float4& lhs, const Float4& rhs);
static Float4 GreaterEqual(const Float4& lhs, const Float4& rhs);
static Float4 NearEqual(const Float4& lhs, const Float4& rhs, float epsilon = DefaultEpsilon);
static Float4 IsZero(const Float4& in, float epsilon = DefaultEpsilon);
static void Transpose(Float4& s0, Float4& s1, Float4& s2, Float4& s3);
private:
static Float4 SwizzleYZX(const Float4& in);
static Float4 SwizzleZXY(const Float4& in);
};
template <size_t LANE>
Float4 Float4::Dup()
{
return (LANE < 2) ?
vdupq_lane_f32(vget_low_f32(s), LANE & 1) :
vdupq_lane_f32(vget_high_f32(s), LANE & 1);
}
inline Float4 operator+(const Float4& lhs, const Float4& rhs)
{
return vaddq_f32(lhs.s, rhs.s);
}
inline Float4 operator-(const Float4& lhs, const Float4& rhs)
{
return vsubq_f32(lhs.s, rhs.s);
}
inline Float4 operator*(const Float4& lhs, const Float4& rhs)
{
return vmulq_f32(lhs.s, rhs.s);
}
inline Float4 operator*(const Float4& lhs, float rhs)
{
return vmulq_n_f32(lhs.s, rhs);
}
inline Float4 operator/(const Float4& lhs, const Float4& rhs)
{
#if defined(_M_ARM64) || __aarch64__
return vdivq_f32(lhs.s, rhs.s);
#else
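// armv7 has no vector divide: estimate 1/rhs with vrecpeq_f32, refine twice with
// Newton-Raphson steps (vrecpsq_f32), then multiply by lhs.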
float32x4_t recp = vrecpeq_f32(rhs.s);
float32x4_t s = vrecpsq_f32(recp, rhs.s);
recp = vmulq_f32(s, recp);
s = vrecpsq_f32(recp, rhs.s);
recp = vmulq_f32(s, recp);
return vmulq_f32(lhs.s, recp);
#endif
}
inline Float4 operator/(const Float4& lhs, float rhs)
{
return lhs * (1.0f / rhs);
}
inline Float4 operator&(const Float4& lhs, const Float4& rhs)
{
uint32x4_t lhsi = vreinterpretq_u32_f32(lhs.s);
uint32x4_t rhsi = vreinterpretq_u32_f32(rhs.s);
return vreinterpretq_f32_u32(vandq_u32(lhsi, rhsi));
}
inline Float4 operator|(const Float4& lhs, const Float4& rhs)
{
uint32x4_t lhsi = vreinterpretq_u32_f32(lhs.s);
uint32x4_t rhsi = vreinterpretq_u32_f32(rhs.s);
return vreinterpretq_f32_u32(vorrq_u32(lhsi, rhsi));
}
inline Float4 operator^(const Float4& lhs, const Float4& rhs)
{
uint32x4_t lhsi = vreinterpretq_u32_f32(lhs.s);
uint32x4_t rhsi = vreinterpretq_u32_f32(rhs.s);
return vreinterpretq_f32_u32(veorq_u32(lhsi, rhsi));
}
inline bool operator==(const Float4& lhs, const Float4& rhs)
{
return Float4::MoveMask(Float4::Equal(lhs, rhs)) == 0xf;
}
inline bool operator!=(const Float4& lhs, const Float4& rhs)
{
return Float4::MoveMask(Float4::Equal(lhs, rhs)) != 0xf;
}
inline Float4& Float4::operator+=(const Float4& rhs) { return *this = *this + rhs; }
inline Float4& Float4::operator-=(const Float4& rhs) { return *this = *this - rhs; }
inline Float4& Float4::operator*=(const Float4& rhs) { return *this = *this * rhs; }
inline Float4& Float4::operator*=(float rhs) { return *this = *this * rhs; }
inline Float4& Float4::operator/=(const Float4& rhs) { return *this = *this / rhs; }
inline Float4& Float4::operator/=(float rhs) { return *this = *this / rhs; }
inline Float4 Float4::Load2(const void* mem)
{
float32x2_t low = vld1_f32((const float*)mem);
float32x2_t high = vdup_n_f32(0.0f);
return vcombine_f32(low, high);
}
inline void Float4::Store2(void* mem, const Float4& i)
{
vst1_f32((float*)mem, vget_low_f32(i.s));
}
inline Float4 Float4::Load3(const void* mem)
{
float32x2_t low = vld1_f32((const float*)mem);
float32x2_t high = vld1_lane_f32((const float*)mem + 2, vdup_n_f32(0.0f), 0);
return vcombine_f32(low, high);
}
inline void Float4::Store3(void* mem, const Float4& i)
{
vst1_f32((float*)mem, vget_low_f32(i.s));
vst1q_lane_f32((float*)mem + 2, i.s, 2);
}
inline Float4 Float4::Load4(const void* mem)
{
return vld1q_f32((const float*)mem);
}
inline void Float4::Store4(void* mem, const Float4& i)
{
vst1q_f32((float*)mem, i.s);
}
inline Float4 Float4::SetZero()
{
return vdupq_n_f32(0.0f);
}
inline Float4 Float4::SetInt(int32_t x, int32_t y, int32_t z, int32_t w)
{
const int32_t i[4] = {x, y, z, w};
return vreinterpretq_f32_s32(vld1q_s32(i));
}
inline Float4 Float4::SetUInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w)
{
const uint32_t i[4] = {x, y, z, w};
return vreinterpretq_f32_u32(vld1q_u32(i));
}
inline Float4 Float4::Sqrt(const Float4& in)
{
#if defined(_M_ARM64) || __aarch64__
return vsqrtq_f32(in.s);
#else
return Float4(1.0f) / Float4::Rsqrt(in);
#endif
}
inline Float4 Float4::Rsqrt(const Float4& in)
{
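// vrsqrteq_f32 gives a rough 1/sqrt estimate; one vrsqrtsq_f32 Newton-Raphson step refines it.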
float32x4_t s0 = vrsqrteq_f32(in.s);
float32x4_t p0 = vmulq_f32(in.s, s0);
float32x4_t r0 = vrsqrtsq_f32(p0, s0);
float32x4_t s1 = vmulq_f32(s0, r0);
return s1;
}
inline Float4 Float4::Abs(const Float4& in)
{
return vabsq_f32(in.s);
}
inline Float4 Float4::Min(const Float4& lhs, const Float4& rhs)
{
return vminq_f32(lhs.s, rhs.s);
}
inline Float4 Float4::Max(const Float4& lhs, const Float4& rhs)
{
return vmaxq_f32(lhs.s, rhs.s);
}
inline Float4 Float4::Floor(const Float4& in)
{
#if defined(_M_ARM64) || __aarch64__
return vrndmq_f32(in.s);
#else
int32x4_t in_i = vcvtq_s32_f32(in.s);
float32x4_t result = vcvtq_f32_s32(in_i);
// Where truncation rounded up (result > in), the all-ones mask reads as -1; converting it to float and adding pulls the value back down.
uint32x4_t larger = vcgtq_f32(result, in.s);
float32x4_t adjust = vcvtq_f32_s32(vreinterpretq_s32_u32(larger));
return vaddq_f32(result, adjust);
#endif
}
inline Float4 Float4::Ceil(const Float4& in)
{
#if defined(_M_ARM64) || __aarch64__
return vrndpq_f32(in.s);
#else
int32x4_t in_i = vcvtq_s32_f32(in.s);
float32x4_t result = vcvtq_f32_s32(in_i);
// Where truncation rounded down (result < in), the all-ones mask reads as -1; subtracting its float value bumps the result up by one.
uint32x4_t smaller = vcltq_f32(result, in.s);
float32x4_t adjust = vcvtq_f32_s32(vreinterpretq_s32_u32(smaller));
return vsubq_f32(result, adjust);
#endif
}
inline Float4 Float4::MulAdd(const Float4& a, const Float4& b, const Float4& c)
{
return vmlaq_f32(a.s, b.s, c.s);
}
inline Float4 Float4::MulSub(const Float4& a, const Float4& b, const Float4& c)
{
return vmlsq_f32(a.s, b.s, c.s);
}
template<size_t LANE>
inline Float4 Float4::MulLane(const Float4& lhs, const Float4& rhs)
{
static_assert(LANE < 4, "LANE is must be less than 4.");
float32x2_t rhs2 = (LANE < 2) ? vget_low_f32(rhs.s) : vget_high_f32(rhs.s);
return vmulq_lane_f32(lhs.s, rhs2, LANE & 1);
}
template<size_t LANE>
inline Float4 Float4::MulAddLane(const Float4& a, const Float4& b, const Float4& c)
{
static_assert(LANE < 4, "LANE is must be less than 4.");
float32x2_t c2 = (LANE < 2) ? vget_low_f32(c.s) : vget_high_f32(c.s);
return vmlaq_lane_f32(a.s, b.s, c2, LANE & 1);
}
template<size_t LANE>
inline Float4 Float4::MulSubLane(const Float4& a, const Float4& b, const Float4& c)
{
static_assert(LANE < 4, "LANE is must be less than 4.");
float32x2_t c2 = (LANE < 2) ? vget_low_f32(c.s) : vget_high_f32(c.s);
return vmlsq_lane_f32(a.s, b.s, c2, LANE & 1);
}
//template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
//inline Float4 Float4::Swizzle(const Float4& v)
//{
// static_assert(indexX < 4, "indexX is must be less than 4.");
// static_assert(indexY < 4, "indexY is must be less than 4.");
// static_assert(indexZ < 4, "indexZ is must be less than 4.");
// static_assert(indexW < 4, "indexW is must be less than 4.");
//}
inline Float4 Float4::Dot3(const Float4& lhs, const Float4& rhs)
{
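// Pairwise add of the low half gives {x+y, x+y}; adding the high half {z, w} leaves the 3-component dot product in lane 0.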
float32x4_t mul = vmulq_f32(lhs.s, rhs.s);
float32x2_t xy = vpadd_f32(vget_low_f32(mul), vget_low_f32(mul));
float32x2_t dot = vadd_f32(xy, vget_high_f32(mul));
return vcombine_f32(dot, vdup_n_f32(0.0f));
}
inline Float4 Float4::Cross3(const Float4& lhs, const Float4& rhs)
{
return MulSub(SwizzleYZX(lhs.s) * SwizzleZXY(rhs.s), SwizzleZXY(lhs.s), SwizzleYZX(rhs.s));
}
template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
inline Float4 Float4::Mask()
{
static_assert(X < 2, "X must be 0 or 1.");
static_assert(Y < 2, "Y must be 0 or 1.");
static_assert(Z < 2, "Z must be 0 or 1.");
static_assert(W < 2, "W must be 0 or 1.");
const uint32_t in[4] = {0xffffffff * X, 0xffffffff * Y, 0xffffffff * Z, 0xffffffff * W};
return vld1q_f32((const float*)in);
}
inline uint32_t Float4::MoveMask(const Float4& in)
{
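// Narrow each 32-bit lane mask to 16 bits, then keep bit n of lane n to form the 4-bit result.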
uint16x4_t u16x4 = vmovn_u32(vreinterpretq_u32_f32(in.s));
uint16_t u16[4];
vst1_u16(u16, u16x4);
return (u16[0] & 1) | (u16[1] & 2) | (u16[2] & 4) | (u16[3] & 8);
}
inline Float4 Float4::Select(const Float4& mask, const Float4& sel1, const Float4& sel2)
{
uint32x4_t maski = vreinterpretq_u32_f32(mask.s);
return vbslq_f32(maski, sel1.s, sel2.s);
}
inline Float4 Float4::Equal(const Float4& lhs, const Float4& rhs)
{
return vceqq_f32(lhs.s, rhs.s);
}
inline Float4 Float4::NotEqual(const Float4& lhs, const Float4& rhs)
{
return vmvnq_u32(vceqq_f32(lhs.s, rhs.s));
}
inline Float4 Float4::LessThan(const Float4& lhs, const Float4& rhs)
{
return vcltq_f32(lhs.s, rhs.s);
}
inline Float4 Float4::LessEqual(const Float4& lhs, const Float4& rhs)
{
return vcleq_f32(lhs.s, rhs.s);
}
inline Float4 Float4::GreaterThan(const Float4& lhs, const Float4& rhs)
{
return vcgtq_f32(lhs.s, rhs.s);
}
inline Float4 Float4::GreaterEqual(const Float4& lhs, const Float4& rhs)
{
return vcgeq_f32(lhs.s, rhs.s);
}
inline Float4 Float4::NearEqual(const Float4& lhs, const Float4& rhs, float epsilon)
{
return LessEqual(Abs(lhs - rhs), Float4(epsilon));
}
inline Float4 Float4::IsZero(const Float4& in, float epsilon)
{
return LessEqual(Abs(in), Float4(epsilon));
}
inline void Float4::Transpose(Float4& s0, Float4& s1, Float4& s2, Float4& s3)
{
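// 4x4 transpose built from two rounds of vzipq_f32 interleaves.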
float32x4x2_t t0 = vzipq_f32(s0.s, s2.s);
float32x4x2_t t1 = vzipq_f32(s1.s, s3.s);
float32x4x2_t t2 = vzipq_f32(t0.val[0], t1.val[0]);
float32x4x2_t t3 = vzipq_f32(t0.val[1], t1.val[1]);
s0 = t2.val[0];
s1 = t2.val[1];
s2 = t3.val[0];
s3 = t3.val[1];
}
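// The helpers below rotate the vector with vextq_f32 and patch one lane to produce the
// (y,z,x) and (z,x,y) orderings used by Cross3.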
inline Float4 Float4::SwizzleYZX(const Float4& in)
{
float32x4_t ex = vextq_f32(in.s, in.s, 1);
return vsetq_lane_f32(vgetq_lane_f32(ex, 3), ex, 2);
}
inline Float4 Float4::SwizzleZXY(const Float4& in)
{
float32x4_t ex = vextq_f32(in.s, in.s, 3);
return vsetq_lane_f32(vgetq_lane_f32(ex, 3), ex, 0);
}
} // namespace SIMD
} // namespace Effekseer
#endif
#endif // __EFFEKSEER_SIMD_FLOAT4_NEON_H__
#ifndef __EFFEKSEER_SIMD_FLOAT4_SSE_H__
#define __EFFEKSEER_SIMD_FLOAT4_SSE_H__
#if defined(EFK_SIMD_SSE2)
namespace Effekseer
{
namespace SIMD
{
inline float Sqrt(float x)
{
_mm_store_ss(&x, _mm_sqrt_ss(_mm_load_ss(&x)));
return x;
}
inline float Rsqrt(float x)
{
_mm_store_ss(&x, _mm_rsqrt_ss(_mm_load_ss(&x)));
return x;
}
struct Int4;
/**
@brief SIMD vector class (SSE implementation)
*/
struct alignas(16) Float4
{
__m128 s;
Float4() = default;
Float4(const Float4& rhs) = default;
Float4(__m128 rhs) { s = rhs; }
Float4(__m128i rhs) { s = _mm_castsi128_ps(rhs); }
Float4(float x, float y, float z, float w) { s = _mm_setr_ps(x, y, z, w); }
Float4(float i) { s = _mm_set_ps1(i); }
float GetX() const { return _mm_cvtss_f32(s); }
float GetY() const { return _mm_cvtss_f32(Swizzle<1,1,1,1>(s).s); }
float GetZ() const { return _mm_cvtss_f32(Swizzle<2,2,2,2>(s).s); }
float GetW() const { return _mm_cvtss_f32(Swizzle<3,3,3,3>(s).s); }
void SetX(float i) { s = _mm_move_ss(s, _mm_set_ss(i)); }
void SetY(float i) { s = Swizzle<1,0,2,3>(_mm_move_ss(Swizzle<1,0,2,3>(s).s, _mm_set_ss(i))).s; }
void SetZ(float i) { s = Swizzle<2,1,0,3>(_mm_move_ss(Swizzle<2,1,0,3>(s).s, _mm_set_ss(i))).s; }
void SetW(float i) { s = Swizzle<3,1,2,0>(_mm_move_ss(Swizzle<3,1,2,0>(s).s, _mm_set_ss(i))).s; }
template <size_t LANE>
Float4 Dup() { return Swizzle<LANE,LANE,LANE,LANE>(s); }
Int4 Convert4i() const;
Int4 Cast4i() const;
Float4& operator+=(const Float4& rhs);
Float4& operator-=(const Float4& rhs);
Float4& operator*=(const Float4& rhs);
Float4& operator*=(float rhs);
Float4& operator/=(const Float4& rhs);
Float4& operator/=(float rhs);
static Float4 Load2(const void* mem);
static void Store2(void* mem, const Float4& i);
static Float4 Load3(const void* mem);
static void Store3(void* mem, const Float4& i);
static Float4 Load4(const void* mem);
static void Store4(void* mem, const Float4& i);
static Float4 SetZero();
static Float4 SetInt(int32_t x, int32_t y, int32_t z, int32_t w);
static Float4 SetUInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w);
static Float4 Sqrt(const Float4& in);
static Float4 Rsqrt(const Float4& in);
static Float4 Abs(const Float4& in);
static Float4 Min(const Float4& lhs, const Float4& rhs);
static Float4 Max(const Float4& lhs, const Float4& rhs);
static Float4 Floor(const Float4& in);
static Float4 Ceil(const Float4& in);
static Float4 MulAdd(const Float4& a, const Float4& b, const Float4& c);
static Float4 MulSub(const Float4& a, const Float4& b, const Float4& c);
template<size_t LANE>
static Float4 MulLane(const Float4& lhs, const Float4& rhs);
template<size_t LANE>
static Float4 MulAddLane(const Float4& a, const Float4& b, const Float4& c);
template<size_t LANE>
static Float4 MulSubLane(const Float4& a, const Float4& b, const Float4& c);
template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
static Float4 Swizzle(const Float4& v);
static Float4 Dot3(const Float4& lhs, const Float4& rhs);
static Float4 Cross3(const Float4& lhs, const Float4& rhs);
template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
static Float4 Mask();
static uint32_t MoveMask(const Float4& in);
static Float4 Select(const Float4& mask, const Float4& sel1, const Float4& sel2);
static Float4 Equal(const Float4& lhs, const Float4& rhs);
static Float4 NotEqual(const Float4& lhs, const Float4& rhs);
static Float4 LessThan(const Float4& lhs, const Float4& rhs);
static Float4 LessEqual(const Float4& lhs, const Float4& rhs);
static Float4 GreaterThan(const Float4& lhs, const Float4& rhs);
static Float4 GreaterEqual(const Float4& lhs, const Float4& rhs);
static Float4 NearEqual(const Float4& lhs, const Float4& rhs, float epsilon = DefaultEpsilon);
static Float4 IsZero(const Float4& in, float epsilon = DefaultEpsilon);
static void Transpose(Float4& s0, Float4& s1, Float4& s2, Float4& s3);
};
inline Float4 operator+(const Float4& lhs, const Float4& rhs)
{
return Float4{_mm_add_ps(lhs.s, rhs.s)};
}
inline Float4 operator-(const Float4& lhs, const Float4& rhs)
{
return Float4{_mm_sub_ps(lhs.s, rhs.s)};
}
inline Float4 operator*(const Float4& lhs, const Float4& rhs)
{
return Float4{_mm_mul_ps(lhs.s, rhs.s)};
}
inline Float4 operator*(const Float4& lhs, float rhs)
{
return Float4{_mm_mul_ps(lhs.s, _mm_set1_ps(rhs))};
}
inline Float4 operator/(const Float4& lhs, const Float4& rhs)
{
return Float4{_mm_div_ps(lhs.s, rhs.s)};
}
inline Float4 operator/(const Float4& lhs, float rhs)
{
return Float4{_mm_div_ps(lhs.s, _mm_set1_ps(rhs))};
}
inline Float4 operator&(const Float4& lhs, const Float4& rhs)
{
return Float4{_mm_and_ps(lhs.s, rhs.s)};
}
inline Float4 operator|(const Float4& lhs, const Float4& rhs)
{
return Float4{_mm_or_ps(lhs.s, rhs.s)};
}
inline Float4 operator^(const Float4& lhs, const Float4& rhs)
{
return Float4{_mm_xor_ps(lhs.s, rhs.s)};
}
inline bool operator==(const Float4& lhs, const Float4& rhs)
{
return Float4::MoveMask(Float4::Equal(lhs, rhs)) == 0xf;
}
inline bool operator!=(const Float4& lhs, const Float4& rhs)
{
return Float4::MoveMask(Float4::Equal(lhs, rhs)) != 0xf;
}
inline Float4& Float4::operator+=(const Float4& rhs) { return *this = *this + rhs; }
inline Float4& Float4::operator-=(const Float4& rhs) { return *this = *this - rhs; }
inline Float4& Float4::operator*=(const Float4& rhs) { return *this = *this * rhs; }
inline Float4& Float4::operator*=(float rhs) { return *this = *this * rhs; }
inline Float4& Float4::operator/=(const Float4& rhs) { return *this = *this / rhs; }
inline Float4& Float4::operator/=(float rhs) { return *this = *this / rhs; }
inline Float4 Float4::Load2(const void* mem)
{
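// _mm_unpacklo_ps interleaves the two scalar loads into {x, y, 0, 0}.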
__m128 x = _mm_load_ss((const float*)mem + 0);
__m128 y = _mm_load_ss((const float*)mem + 1);
return _mm_unpacklo_ps(x, y);
}
inline void Float4::Store2(void* mem, const Float4& i)
{
Float4 t1 = Swizzle<1,1,1,1>(i.s);
_mm_store_ss((float*)mem + 0, i.s);
_mm_store_ss((float*)mem + 1, t1.s);
}
inline Float4 Float4::Load3(const void* mem)
{
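// Interleave x and y, then _mm_movelh_ps appends the low half of z to form {x, y, z, 0}.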
__m128 x = _mm_load_ss((const float*)mem + 0);
__m128 y = _mm_load_ss((const float*)mem + 1);
__m128 z = _mm_load_ss((const float*)mem + 2);
__m128 xy = _mm_unpacklo_ps(x, y);
return _mm_movelh_ps(xy, z);
}
inline void Float4::Store3(void* mem, const Float4& i)
{
Float4 t1 = Swizzle<1,1,1,1>(i.s);
Float4 t2 = Swizzle<2,2,2,2>(i.s);
_mm_store_ss((float*)mem + 0, i.s);
_mm_store_ss((float*)mem + 1, t1.s);
_mm_store_ss((float*)mem + 2, t2.s);
}
inline Float4 Float4::Load4(const void* mem)
{
return _mm_loadu_ps((const float*)mem);
}
inline void Float4::Store4(void* mem, const Float4& i)
{
_mm_storeu_ps((float*)mem, i.s);
}
inline Float4 Float4::SetZero()
{
return _mm_setzero_ps();
}
inline Float4 Float4::SetInt(int32_t x, int32_t y, int32_t z, int32_t w)
{
return Float4{_mm_setr_epi32((int)x, (int)y, (int)z, (int)w)};
}
inline Float4 Float4::SetUInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w)
{
return Float4{_mm_setr_epi32((int)x, (int)y, (int)z, (int)w)};
}
inline Float4 Float4::Sqrt(const Float4& in)
{
return Float4{_mm_sqrt_ps(in.s)};
}
inline Float4 Float4::Rsqrt(const Float4& in)
{
return Float4{_mm_rsqrt_ps(in.s)};
}
inline Float4 Float4::Abs(const Float4& in)
{
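// -0.0f has only the sign bit set; andnot clears that bit, yielding the absolute value.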
return _mm_andnot_ps(_mm_set1_ps(-0.0f), in.s);
}
inline Float4 Float4::Min(const Float4& lhs, const Float4& rhs)
{
return Float4{_mm_min_ps(lhs.s, rhs.s)};
}
inline Float4 Float4::Max(const Float4& lhs, const Float4& rhs)
{
return Float4{_mm_max_ps(lhs.s, rhs.s)};
}
inline Float4 Float4::Floor(const Float4& in)
{
#if defined(EFK_SIMD_SSE4_2)
return _mm_floor_ps(in.s);
#else
__m128i in_i = _mm_cvttps_epi32(in.s);
__m128 result = _mm_cvtepi32_ps(in_i);
__m128 larger = _mm_cmpgt_ps(result, in.s);
larger = _mm_cvtepi32_ps(_mm_castps_si128(larger));
return _mm_add_ps(result, larger);
#endif
}
inline Float4 Float4::Ceil(const Float4& in)
{
#if defined(EFK_SIMD_SSE4_2)
return _mm_ceil_ps(in.s);
#else
__m128i in_i = _mm_cvttps_epi32(in.s);
__m128 result = _mm_cvtepi32_ps(in_i);
__m128 smaller = _mm_cmplt_ps(result, in.s);
smaller = _mm_cvtepi32_ps(_mm_castps_si128(smaller));
return _mm_sub_ps(result, smaller);
#endif
}
inline Float4 Float4::MulAdd(const Float4& a, const Float4& b, const Float4& c)
{
#if defined(EFK_SIMD_AVX2)
return Float4{_mm_fmadd_ps(b.s, c.s, a.s)};
#else
return Float4{_mm_add_ps(a.s, _mm_mul_ps(b.s, c.s))};
#endif
}
inline Float4 Float4::MulSub(const Float4& a, const Float4& b, const Float4& c)
{
#if defined(EFK_SIMD_AVX2)
return Float4{_mm_fnmadd_ps(b.s, c.s, a.s)};
#else
return Float4{_mm_sub_ps(a.s, _mm_mul_ps(b.s, c.s))};
#endif
}
template<size_t LANE>
Float4 Float4::MulLane(const Float4& lhs, const Float4& rhs)
{
static_assert(LANE < 4, "LANE is must be less than 4.");
return _mm_mul_ps(lhs.s, Swizzle<LANE,LANE,LANE,LANE>(rhs).s);
}
template<size_t LANE>
Float4 Float4::MulAddLane(const Float4& a, const Float4& b, const Float4& c)
{
static_assert(LANE < 4, "LANE is must be less than 4.");
#if defined(EFK_SIMD_AVX2)
return _mm_fmadd_ps(b.s, Swizzle<LANE,LANE,LANE,LANE>(c).s, a.s);
#else
return _mm_add_ps(a.s, _mm_mul_ps(b.s, Swizzle<LANE,LANE,LANE,LANE>(c).s));
#endif
}
template<size_t LANE>
Float4 Float4::MulSubLane(const Float4& a, const Float4& b, const Float4& c)
{
static_assert(LANE < 4, "LANE is must be less than 4.");
#if defined(EFK_SIMD_AVX2)
return _mm_fnmadd_ps(b.s, Swizzle<LANE,LANE,LANE,LANE>(c).s, a.s);
#else
return _mm_sub_ps(a.s, _mm_mul_ps(b.s, Swizzle<LANE,LANE,LANE,LANE>(c).s));
#endif
}
template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
Float4 Float4::Swizzle(const Float4& v)
{
static_assert(indexX < 4, "indexX is must be less than 4.");
static_assert(indexY < 4, "indexY is must be less than 4.");
static_assert(indexZ < 4, "indexZ is must be less than 4.");
static_assert(indexW < 4, "indexW is must be less than 4.");
#if defined(EFK_SIMD_AVX)
return _mm_permute_ps(v.s, _MM_SHUFFLE(indexW, indexZ, indexY, indexX));
#else
return _mm_shuffle_ps(v.s, v.s, _MM_SHUFFLE(indexW, indexZ, indexY, indexX));
#endif
}
inline Float4 Float4::Dot3(const Float4& lhs, const Float4& rhs)
{
Float4 muled = lhs * rhs;
return _mm_add_ss(_mm_add_ss(muled.s, Float4::Swizzle<1,1,1,1>(muled).s), Float4::Swizzle<2,2,2,2>(muled).s);
}
inline Float4 Float4::Cross3(const Float4& lhs, const Float4& rhs)
{
return Float4::Swizzle<1,2,0,3>(lhs) * Float4::Swizzle<2,0,1,3>(rhs) -
Float4::Swizzle<2,0,1,3>(lhs) * Float4::Swizzle<1,2,0,3>(rhs);
}
template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
inline Float4 Float4::Mask()
{
static_assert(X < 2, "X must be 0 or 1.");
static_assert(Y < 2, "Y must be 0 or 1.");
static_assert(Z < 2, "Z must be 0 or 1.");
static_assert(W < 2, "W must be 0 or 1.");
return _mm_setr_epi32(
(int)(0xffffffff * X),
(int)(0xffffffff * Y),
(int)(0xffffffff * Z),
(int)(0xffffffff * W));
}
inline uint32_t Float4::MoveMask(const Float4& in)
{
return (uint32_t)_mm_movemask_ps(in.s);
}
inline Float4 Float4::Select(const Float4& mask, const Float4& sel1, const Float4& sel2)
{
return _mm_or_ps(_mm_and_ps(mask.s, sel1.s), _mm_andnot_ps(mask.s, sel2.s));
}
inline Float4 Float4::Equal(const Float4& lhs, const Float4& rhs)
{
return Float4{_mm_cmpeq_ps(lhs.s, rhs.s)};
}
inline Float4 Float4::NotEqual(const Float4& lhs, const Float4& rhs)
{
return Float4{_mm_cmpneq_ps(lhs.s, rhs.s)};
}
inline Float4 Float4::LessThan(const Float4& lhs, const Float4& rhs)
{
return Float4{_mm_cmplt_ps(lhs.s, rhs.s)};
}
inline Float4 Float4::LessEqual(const Float4& lhs, const Float4& rhs)
{
return Float4{_mm_cmple_ps(lhs.s, rhs.s)};
}
inline Float4 Float4::GreaterThan(const Float4& lhs, const Float4& rhs)
{
return Float4{_mm_cmpgt_ps(lhs.s, rhs.s)};
}
inline Float4 Float4::GreaterEqual(const Float4& lhs, const Float4& rhs)
{
return Float4{_mm_cmpge_ps(lhs.s, rhs.s)};
}
inline Float4 Float4::NearEqual(const Float4& lhs, const Float4& rhs, float epsilon)
{
return LessEqual(Abs(lhs - rhs), Float4(epsilon));
}
inline Float4 Float4::IsZero(const Float4& in, float epsilon)
{
return LessEqual(Abs(in), Float4(epsilon));
}
inline void Float4::Transpose(Float4& s0, Float4& s1, Float4& s2, Float4& s3)
{
_MM_TRANSPOSE4_PS(s0.s, s1.s, s2.s, s3.s);
}
} // namespace SIMD
} // namespace Effekseer
#endif
#endif // __EFFEKSEER_SIMD_FLOAT4_SSE_H__
#ifndef __EFFEKSEER_SIMD_INT4_GEN_H__
#define __EFFEKSEER_SIMD_INT4_GEN_H__
#if defined(EFK_SIMD_GEN)
#include <cstring>
#include <algorithm>
namespace Effekseer
{
namespace SIMD
{
struct Float4;
/**
@brief SIMD integer vector class (generic scalar implementation)
*/
struct alignas(16) Int4
{
union {
float vf[4];
int32_t vi[4];
uint32_t vu[4];
};
Int4() = default;
Int4(const Int4& rhs) = default;
Int4(int32_t x, int32_t y, int32_t z, int32_t w) { vi[0] = x; vi[1] = y; vi[2] = z; vi[3] = w; }
Int4(int32_t i) { vi[0] = i; vi[1] = i; vi[2] = i; vi[3] = i; }
int32_t GetX() const { return vi[0]; }
int32_t GetY() const { return vi[1]; }
int32_t GetZ() const { return vi[2]; }
int32_t GetW() const { return vi[3]; }
void SetX(int32_t o) { vi[0] = o; }
void SetY(int32_t o) { vi[1] = o; }
void SetZ(int32_t o) { vi[2] = o; }
void SetW(int32_t o) { vi[3] = o; }
Float4 Convert4f() const;
Float4 Cast4f() const;
Int4& operator+=(const Int4& rhs)
{
for (size_t i = 0; i < 4; i++)
{
vi[i] += rhs.vi[i];
}
return *this;
}
Int4& operator-=(const Int4& rhs)
{
for (size_t i = 0; i < 4; i++)
{
vi[i] -= rhs.vi[i];
}
return *this;
}
Int4& operator*=(const Int4& rhs)
{
for (size_t i = 0; i < 4; i++)
{
vi[i] *= rhs.vi[i];
}
return *this;
}
Int4& operator*=(int32_t rhs)
{
for (size_t i = 0; i < 4; i++)
{
vi[i] *= rhs;
}
return *this;
}
Int4& operator/=(const Int4& rhs)
{
for (size_t i = 0; i < 4; i++)
{
vi[i] /= rhs.vi[i];
}
return *this;
}
Int4& operator/=(int32_t rhs)
{
for (size_t i = 0; i < 4; i++)
{
vi[i] /= rhs;
}
return *this;
}
static Int4 Load2(const void* mem);
static void Store2(void* mem, const Int4& i);
static Int4 Load3(const void* mem);
static void Store3(void* mem, const Int4& i);
static Int4 Load4(const void* mem);
static void Store4(void* mem, const Int4& i);
static Int4 SetZero();
static Int4 Abs(const Int4& in);
static Int4 Min(const Int4& lhs, const Int4& rhs);
static Int4 Max(const Int4& lhs, const Int4& rhs);
static Int4 MulAdd(const Int4& a, const Int4& b, const Int4& c);
static Int4 MulSub(const Int4& a, const Int4& b, const Int4& c);
template<size_t LANE>
static Int4 MulLane(const Int4& lhs, const Int4& rhs);
template<size_t LANE>
static Int4 MulAddLane(const Int4& a, const Int4& b, const Int4& c);
template<size_t LANE>
static Int4 MulSubLane(const Int4& a, const Int4& b, const Int4& c);
template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
static Int4 Swizzle(const Int4& in);
template <int COUNT>
static Int4 ShiftL(const Int4& in);
template <int COUNT>
static Int4 ShiftR(const Int4& in);
template <int COUNT>
static Int4 ShiftRA(const Int4& in);
template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
static Int4 Mask();
static uint32_t MoveMask(const Int4& in);
static Int4 Equal(const Int4& lhs, const Int4& rhs);
static Int4 NotEqual(const Int4& lhs, const Int4& rhs);
static Int4 LessThan(const Int4& lhs, const Int4& rhs);
static Int4 LessEqual(const Int4& lhs, const Int4& rhs);
static Int4 GreaterThan(const Int4& lhs, const Int4& rhs);
static Int4 GreaterEqual(const Int4& lhs, const Int4& rhs);
static Int4 NearEqual(const Int4& lhs, const Int4& rhs, float epsilon = DefaultEpsilon);
static Int4 IsZero(const Int4& in, float epsilon = DefaultEpsilon);
static void Transpose(Int4& s0, Int4& s1, Int4& s2, Int4& s3);
};
inline Int4 operator+(const Int4& lhs, const Int4& rhs)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vi[i] = lhs.vi[i] + rhs.vi[i];
}
return ret;
}
inline Int4 operator-(const Int4& lhs, const Int4& rhs)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vi[i] = lhs.vi[i] - rhs.vi[i];
}
return ret;
}
inline Int4 operator*(const Int4& lhs, const Int4& rhs)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vi[i] = lhs.vi[i] * rhs.vi[i];
}
return ret;
}
inline Int4 operator*(const Int4& lhs, int32_t rhs)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vi[i] = lhs.vi[i] * rhs;
}
return ret;
}
inline Int4 operator/(const Int4& lhs, const Int4& rhs)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vi[i] = lhs.vi[i] / rhs.vi[i];
}
return ret;
}
inline Int4 operator/(const Int4& lhs, int32_t rhs)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vi[i] = lhs.vi[i] / rhs;
}
return ret;
}
inline Int4 operator&(const Int4& lhs, const Int4& rhs)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = lhs.vu[i] & rhs.vu[i];
}
return ret;
}
inline Int4 operator|(const Int4& lhs, const Int4& rhs)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = lhs.vu[i] | rhs.vu[i];
}
return ret;
}
inline Int4 operator^(const Int4& lhs, const Int4& rhs)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = lhs.vu[i] ^ rhs.vu[i];
}
return ret;
}
inline bool operator==(const Int4& lhs, const Int4& rhs)
{
bool ret = true;
for (size_t i = 0; i < 4; i++)
{
ret &= lhs.vi[i] == rhs.vi[i];
}
return ret;
}
inline bool operator!=(const Int4& lhs, const Int4& rhs)
{
return !(lhs == rhs);
}
inline Int4 Int4::Load2(const void* mem)
{
Int4 ret;
memcpy(ret.vi, mem, sizeof(int32_t) * 2);
// This code causes bugs in asmjs
// ret.vi[0] = *((float*)mem + 0);
// ret.vi[1] = *((float*)mem + 1);
return ret;
}
inline void Int4::Store2(void* mem, const Int4& i)
{
memcpy(mem, i.vi, sizeof(int32_t) * 2);
// This code causes bugs in asmjs
// *((float*)mem + 0) = i.vi[0];
// *((float*)mem + 1) = i.vi[1];
}
inline Int4 Int4::Load3(const void* mem)
{
Int4 ret;
memcpy(ret.vi, mem, sizeof(int32_t) * 3);
// This code causes bugs in asmjs
// ret.vi[0] = *((float*)mem + 0);
// ret.vi[1] = *((float*)mem + 1);
// ret.vi[2] = *((float*)mem + 2);
return ret;
}
inline void Int4::Store3(void* mem, const Int4& i)
{
memcpy(mem, i.vi, sizeof(int32_t) * 3);
// This code causes bugs in asmjs
// *((float*)mem + 0) = i.vi[0];
// *((float*)mem + 1) = i.vi[1];
// *((float*)mem + 2) = i.vi[2];
}
inline Int4 Int4::Load4(const void* mem)
{
Int4 ret;
memcpy(ret.vi, mem, sizeof(int32_t) * 4);
// This code causes bugs in emscripten
// ret.vi[0] = *((float*)mem + 0);
// ret.vi[1] = *((float*)mem + 1);
// ret.vi[2] = *((float*)mem + 2);
// ret.vi[3] = *((float*)mem + 3);
return ret;
}
inline void Int4::Store4(void* mem, const Int4& i)
{
memcpy(mem, i.vi, sizeof(int32_t) * 4);
// This code causes bugs in asmjs
// *((float*)mem + 0) = i.vi[0];
// *((float*)mem + 1) = i.vi[1];
// *((float*)mem + 2) = i.vi[2];
// *((float*)mem + 3) = i.vi[3];
}
inline Int4 Int4::SetZero()
{
Int4 ret;
ret.vi[0] = 0;
ret.vi[1] = 0;
ret.vi[2] = 0;
ret.vi[3] = 0;
return ret;
}
inline Int4 Int4::Abs(const Int4& in)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vi[i] = std::abs(in.vi[i]);
}
return ret;
}
inline Int4 Int4::Min(const Int4& lhs, const Int4& rhs)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vi[i] = (lhs.vi[i] < rhs.vi[i]) ? lhs.vi[i] : rhs.vi[i];
}
return ret;
}
inline Int4 Int4::Max(const Int4& lhs, const Int4& rhs)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vi[i] = (lhs.vi[i] > rhs.vi[i]) ? lhs.vi[i] : rhs.vi[i];
}
return ret;
}
inline Int4 Int4::MulAdd(const Int4& a, const Int4& b, const Int4& c)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vi[i] = a.vi[i] + b.vi[i] * c.vi[i];
}
return ret;
}
inline Int4 Int4::MulSub(const Int4& a, const Int4& b, const Int4& c)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vi[i] = a.vi[i] - b.vi[i] * c.vi[i];
}
return ret;
}
template<size_t LANE>
Int4 Int4::MulLane(const Int4& lhs, const Int4& rhs)
{
static_assert(LANE < 4, "LANE is must be less than 4.");
return lhs * rhs.vi[LANE];
}
template<size_t LANE>
Int4 Int4::MulAddLane(const Int4& a, const Int4& b, const Int4& c)
{
static_assert(LANE < 4, "LANE is must be less than 4.");
return a + b * c.vi[LANE];
}
template<size_t LANE>
Int4 Int4::MulSubLane(const Int4& a, const Int4& b, const Int4& c)
{
static_assert(LANE < 4, "LANE is must be less than 4.");
return a - b * c.vi[LANE];
}
template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
Int4 Int4::Swizzle(const Int4& in)
{
static_assert(indexX < 4, "indexX is must be less than 4.");
static_assert(indexY < 4, "indexY is must be less than 4.");
static_assert(indexZ < 4, "indexZ is must be less than 4.");
static_assert(indexW < 4, "indexW is must be less than 4.");
return Int4{in.vi[indexX], in.vi[indexY], in.vi[indexZ], in.vi[indexW]};
}
template <int COUNT>
inline Int4 Int4::ShiftL(const Int4& lhs)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = lhs.vu[i] << COUNT;
}
return ret;
}
template <int COUNT>
inline Int4 Int4::ShiftR(const Int4& lhs)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = lhs.vu[i] >> COUNT;
}
return ret;
}
template <int COUNT>
inline Int4 Int4::ShiftRA(const Int4& lhs)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vi[i] = lhs.vi[i] >> COUNT;
}
return ret;
}
template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
Int4 Int4::Mask()
{
static_assert(X < 2, "X must be 0 or 1.");
static_assert(Y < 2, "Y must be 0 or 1.");
static_assert(Z < 2, "Z must be 0 or 1.");
static_assert(W < 2, "W must be 0 or 1.");
Int4 ret;
ret.vu[0] = 0xffffffff * X;
ret.vu[1] = 0xffffffff * Y;
ret.vu[2] = 0xffffffff * Z;
ret.vu[3] = 0xffffffff * W;
return ret;
}
inline uint32_t Int4::MoveMask(const Int4& in)
{
return (in.vu[0] & 0x1) | (in.vu[1] & 0x2) | (in.vu[2] & 0x4) | (in.vu[3] & 0x8);
}
inline Int4 Int4::Equal(const Int4& lhs, const Int4& rhs)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = (lhs.vi[i] == rhs.vi[i]) ? 0xffffffff : 0;
}
return ret;
}
inline Int4 Int4::NotEqual(const Int4& lhs, const Int4& rhs)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = (lhs.vi[i] != rhs.vi[i]) ? 0xffffffff : 0;
}
return ret;
}
inline Int4 Int4::LessThan(const Int4& lhs, const Int4& rhs)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = (lhs.vi[i] < rhs.vi[i]) ? 0xffffffff : 0;
}
return ret;
}
inline Int4 Int4::LessEqual(const Int4& lhs, const Int4& rhs)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = (lhs.vi[i] <= rhs.vi[i]) ? 0xffffffff : 0;
}
return ret;
}
inline Int4 Int4::GreaterThan(const Int4& lhs, const Int4& rhs)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = (lhs.vi[i] > rhs.vi[i]) ? 0xffffffff : 0;
}
return ret;
}
inline Int4 Int4::GreaterEqual(const Int4& lhs, const Int4& rhs)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = (lhs.vi[i] >= rhs.vi[i]) ? 0xffffffff : 0;
}
return ret;
}
inline Int4 Int4::NearEqual(const Int4& lhs, const Int4& rhs, float epsilon)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = (std::abs(lhs.vi[i] - rhs.vi[i]) <= epsilon) ? 0xffffffff : 0;
}
return ret;
}
inline Int4 Int4::IsZero(const Int4& in, float epsilon)
{
Int4 ret;
for (size_t i = 0; i < 4; i++)
{
ret.vu[i] = (std::abs(in.vi[i]) <= epsilon) ? 0xffffffff : 0;
}
return ret;
}
inline void Int4::Transpose(Int4& s0, Int4& s1, Int4& s2, Int4& s3)
{
std::swap(s0.vi[1], s1.vi[0]);
std::swap(s0.vi[2], s2.vi[0]);
std::swap(s0.vi[3], s3.vi[0]);
std::swap(s1.vi[2], s2.vi[1]);
std::swap(s2.vi[3], s3.vi[2]);
std::swap(s1.vi[3], s3.vi[1]);
}
} // namespace SIMD
} // namespace Effekseer
#endif
#endif // __EFFEKSEER_SIMD_INT4_GEN_H__
#ifndef __EFFEKSEER_SIMD_INT4_NEON_H__
#define __EFFEKSEER_SIMD_INT4_NEON_H__
#if defined(EFK_SIMD_NEON)
namespace Effekseer
{
namespace SIMD
{
struct Float4;
/**
@brief SIMD integer vector class (NEON implementation)
*/
struct alignas(16) Int4
{
int32x4_t s;
Int4() = default;
Int4(const Int4& rhs) = default;
Int4(int32x4_t rhs) { s = rhs; }
Int4(int32_t x, int32_t y, int32_t z, int32_t w) { const int32_t v[4] = {x, y, z, w}; s = vld1q_s32(v); }
Int4(int32_t i) { s = vdupq_n_s32(i); }
int32_t GetX() const { return vgetq_lane_s32(s, 0); }
int32_t GetY() const { return vgetq_lane_s32(s, 1); }
int32_t GetZ() const { return vgetq_lane_s32(s, 2); }
int32_t GetW() const { return vgetq_lane_s32(s, 3); }
void SetX(int32_t i) { s = vsetq_lane_s32(i, s, 0); }
void SetY(int32_t i) { s = vsetq_lane_s32(i, s, 1); }
void SetZ(int32_t i) { s = vsetq_lane_s32(i, s, 2); }
void SetW(int32_t i) { s = vsetq_lane_s32(i, s, 3); }
Float4 Convert4f() const;
Float4 Cast4f() const;
Int4& operator+=(const Int4& rhs);
Int4& operator-=(const Int4& rhs);
Int4& operator*=(const Int4& rhs);
Int4& operator*=(int32_t rhs);
Int4& operator/=(const Int4& rhs);
Int4& operator/=(int32_t rhs);
static Int4 Load2(const void* mem);
static void Store2(void* mem, const Int4& i);
static Int4 Load3(const void* mem);
static void Store3(void* mem, const Int4& i);
static Int4 Load4(const void* mem);
static void Store4(void* mem, const Int4& i);
static Int4 SetZero();
static Int4 Abs(const Int4& in);
static Int4 Min(const Int4& lhs, const Int4& rhs);
static Int4 Max(const Int4& lhs, const Int4& rhs);
static Int4 MulAdd(const Int4& a, const Int4& b, const Int4& c);
static Int4 MulSub(const Int4& a, const Int4& b, const Int4& c);
template<size_t LANE>
static Int4 MulLane(const Int4& lhs, const Int4& rhs);
template<size_t LANE>
static Int4 MulAddLane(const Int4& a, const Int4& b, const Int4& c);
template<size_t LANE>
static Int4 MulSubLane(const Int4& a, const Int4& b, const Int4& c);
template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
static Int4 Swizzle(const Int4& v);
template <int COUNT>
static Int4 ShiftL(const Int4& in);
template <int COUNT>
static Int4 ShiftR(const Int4& in);
template <int COUNT>
static Int4 ShiftRA(const Int4& in);
template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
static Int4 Mask();
static uint32_t MoveMask(const Int4& in);
static Int4 Equal(const Int4& lhs, const Int4& rhs);
static Int4 NotEqual(const Int4& lhs, const Int4& rhs);
static Int4 LessThan(const Int4& lhs, const Int4& rhs);
static Int4 LessEqual(const Int4& lhs, const Int4& rhs);
static Int4 GreaterThan(const Int4& lhs, const Int4& rhs);
static Int4 GreaterEqual(const Int4& lhs, const Int4& rhs);
static Int4 NearEqual(const Int4& lhs, const Int4& rhs, int32_t epsilon = DefaultEpsilon);
static Int4 IsZero(const Int4& in, int32_t epsilon = DefaultEpsilon);
static void Transpose(Int4& s0, Int4& s1, Int4& s2, Int4& s3);
private:
static Int4 SwizzleYZX(const Int4& in);
static Int4 SwizzleZXY(const Int4& in);
};
inline Int4 operator+(const Int4& lhs, const Int4& rhs)
{
return vaddq_s32(lhs.s, rhs.s);
}
inline Int4 operator-(const Int4& lhs, const Int4& rhs)
{
return vsubq_s32(lhs.s, rhs.s);
}
inline Int4 operator*(const Int4& lhs, const Int4& rhs)
{
return vmulq_s32(lhs.s, rhs.s);
}
inline Int4 operator*(const Int4& lhs, int32_t rhs)
{
return vmulq_n_s32(lhs.s, rhs);
}
inline Int4 operator/(const Int4& lhs, const Int4& rhs)
{
// NEON has no integer division instruction, so divide element-wise.
return Int4(
lhs.GetX() / rhs.GetX(),
lhs.GetY() / rhs.GetY(),
lhs.GetZ() / rhs.GetZ(),
lhs.GetW() / rhs.GetW());
}
inline Int4 operator/(const Int4& lhs, int32_t rhs)
{
// Integer division cannot use a float reciprocal; divide each lane directly.
return Int4(lhs.GetX() / rhs, lhs.GetY() / rhs, lhs.GetZ() / rhs, lhs.GetW() / rhs);
}
inline Int4 operator&(const Int4& lhs, const Int4& rhs)
{
uint32x4_t lhsi = vreinterpretq_u32_s32(lhs.s);
uint32x4_t rhsi = vreinterpretq_u32_s32(rhs.s);
return vreinterpretq_s32_u32(vandq_u32(lhsi, rhsi));
}
inline Int4 operator|(const Int4& lhs, const Int4& rhs)
{
uint32x4_t lhsi = vreinterpretq_u32_s32(lhs.s);
uint32x4_t rhsi = vreinterpretq_u32_s32(rhs.s);
return vreinterpretq_s32_u32(vorrq_u32(lhsi, rhsi));
}
inline bool operator==(const Int4& lhs, const Int4& rhs)
{
return Int4::MoveMask(Int4::Equal(lhs, rhs)) == 0xf;
}
inline bool operator!=(const Int4& lhs, const Int4& rhs)
{
return Int4::MoveMask(Int4::Equal(lhs, rhs)) != 0xf;
}
inline Int4& Int4::operator+=(const Int4& rhs) { return *this = *this + rhs; }
inline Int4& Int4::operator-=(const Int4& rhs) { return *this = *this - rhs; }
inline Int4& Int4::operator*=(const Int4& rhs) { return *this = *this * rhs; }
inline Int4& Int4::operator*=(int32_t rhs) { return *this = *this * rhs; }
inline Int4& Int4::operator/=(const Int4& rhs) { return *this = *this / rhs; }
inline Int4& Int4::operator/=(int32_t rhs) { return *this = *this / rhs; }
inline Int4 Int4::Load2(const void* mem)
{
int32x2_t low = vld1_s32((const int32_t*)mem);
int32x2_t high = vdup_n_s32(0);
return vcombine_s32(low, high);
}
inline void Int4::Store2(void* mem, const Int4& i)
{
vst1_s32((int32_t*)mem, vget_low_s32(i.s));
}
inline Int4 Int4::Load3(const void* mem)
{
int32x2_t low = vld1_s32((const int32_t*)mem);
int32x2_t high = vld1_lane_s32((const int32_t*)mem + 2, vdup_n_s32(0), 0);
return vcombine_s32(low, high);
}
inline void Int4::Store3(void* mem, const Int4& i)
{
vst1_s32((int32_t*)mem, vget_low_s32(i.s));
vst1q_lane_s32((int32_t*)mem + 2, i.s, 2);
}
inline Int4 Int4::Load4(const void* mem)
{
return vld1q_s32((const int32_t*)mem);
}
inline void Int4::Store4(void* mem, const Int4& i)
{
vst1q_s32((int32_t*)mem, i.s);
}
inline Int4 Int4::SetZero()
{
return vdupq_n_s32(0);
}
inline Int4 Int4::Abs(const Int4& in)
{
return vabsq_s32(in.s);
}
inline Int4 Int4::Min(const Int4& lhs, const Int4& rhs)
{
return vminq_s32(lhs.s, rhs.s);
}
inline Int4 Int4::Max(const Int4& lhs, const Int4& rhs)
{
return vmaxq_s32(lhs.s, rhs.s);
}
inline Int4 Int4::MulAdd(const Int4& a, const Int4& b, const Int4& c)
{
return vmlaq_s32(a.s, b.s, c.s);
}
inline Int4 Int4::MulSub(const Int4& a, const Int4& b, const Int4& c)
{
return vmlsq_s32(a.s, b.s, c.s);
}
template<size_t LANE>
inline Int4 Int4::MulLane(const Int4& lhs, const Int4& rhs)
{
static_assert(LANE < 4, "LANE must be less than 4.");
int32x2_t rhs2 = (LANE < 2) ? vget_low_s32(rhs.s) : vget_high_s32(rhs.s);
return vmulq_lane_s32(lhs.s, rhs2, LANE & 1);
}
template<size_t LANE>
inline Int4 Int4::MulAddLane(const Int4& a, const Int4& b, const Int4& c)
{
static_assert(LANE < 4, "LANE must be less than 4.");
int32x2_t c2 = (LANE < 2) ? vget_low_s32(c.s) : vget_high_s32(c.s);
return vmlaq_lane_s32(a.s, b.s, c2, LANE & 1);
}
template<size_t LANE>
inline Int4 Int4::MulSubLane(const Int4& a, const Int4& b, const Int4& c)
{
static_assert(LANE < 4, "LANE must be less than 4.");
int32x2_t c2 = (LANE < 2) ? vget_low_s32(c.s) : vget_high_s32(c.s);
return vmlsq_lane_s32(a.s, b.s, c2, LANE & 1);
}
//template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
//inline Int4 Int4::Swizzle(const Int4& v)
//{
// static_assert(indexX < 4, "indexX is must be less than 4.");
// static_assert(indexY < 4, "indexY is must be less than 4.");
// static_assert(indexZ < 4, "indexZ is must be less than 4.");
// static_assert(indexW < 4, "indexW is must be less than 4.");
//}
template <int COUNT>
inline Int4 Int4::ShiftL(const Int4& lhs)
{
return vreinterpretq_s32_u32(vshlq_n_u32(vreinterpretq_u32_s32(lhs.s), COUNT));
}
template <int COUNT>
inline Int4 Int4::ShiftR(const Int4& lhs)
{
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(lhs.s), COUNT));
}
template <int COUNT>
inline Int4 Int4::ShiftRA(const Int4& lhs)
{
return vshrq_n_s32(lhs.s, COUNT);
}
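// Builds a per-lane select mask: each template argument is expected to be 0 or 1
// and expands to all-zero or all-one bits in the corresponding lane.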
template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
inline Int4 Int4::Mask()
{
static_assert(X < 2, "X must be 0 or 1.");
static_assert(Y < 2, "Y must be 0 or 1.");
static_assert(Z < 2, "Z must be 0 or 1.");
static_assert(W < 2, "W must be 0 or 1.");
const uint32_t in[4] = {0xffffffff * X, 0xffffffff * Y, 0xffffffff * Z, 0xffffffff * W};
return vld1q_u32(in);
}
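// Collects one bit per lane (lane i -> bit i): each 32-bit comparison result is
// narrowed to 16 bits and its low bits are masked, mirroring the result layout
// of _mm_movemask_ps in the SSE implementation.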
inline uint32_t Int4::MoveMask(const Int4& in)
{
uint16x4_t u16x4 = vmovn_u32(vreinterpretq_u32_s32(in.s));
uint16_t u16[4];
vst1_u16(u16, u16x4);
return (u16[0] & 1) | (u16[1] & 2) | (u16[2] & 4) | (u16[3] & 8);
}
inline Int4 Int4::Equal(const Int4& lhs, const Int4& rhs)
{
return vreinterpretq_s32_u32(vceqq_s32(lhs.s, rhs.s));
}
inline Int4 Int4::NotEqual(const Int4& lhs, const Int4& rhs)
{
return vreinterpretq_s32_u32(vmvnq_u32(vceqq_s32(lhs.s, rhs.s)));
}
inline Int4 Int4::LessThan(const Int4& lhs, const Int4& rhs)
{
return vreinterpretq_s32_u32(vcltq_s32(lhs.s, rhs.s));
}
inline Int4 Int4::LessEqual(const Int4& lhs, const Int4& rhs)
{
return vreinterpretq_s32_u32(vcleq_s32(lhs.s, rhs.s));
}
inline Int4 Int4::GreaterThan(const Int4& lhs, const Int4& rhs)
{
return vreinterpretq_s32_u32(vcgtq_s32(lhs.s, rhs.s));
}
inline Int4 Int4::GreaterEqual(const Int4& lhs, const Int4& rhs)
{
return vreinterpretq_s32_u32(vcgeq_s32(lhs.s, rhs.s));
}
inline Int4 Int4::NearEqual(const Int4& lhs, const Int4& rhs, int32_t epsilon)
{
return LessEqual(Abs(lhs - rhs), Int4(epsilon));
}
inline Int4 Int4::IsZero(const Int4& in, int32_t epsilon)
{
return LessEqual(Abs(in), Int4(epsilon));
}
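// 4x4 transpose done with two rounds of vzipq_s32 interleaves.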
inline void Int4::Transpose(Int4& s0, Int4& s1, Int4& s2, Int4& s3)
{
int32x4x2_t t0 = vzipq_s32(s0.s, s2.s);
int32x4x2_t t1 = vzipq_s32(s1.s, s3.s);
int32x4x2_t t2 = vzipq_s32(t0.val[0], t1.val[0]);
int32x4x2_t t3 = vzipq_s32(t0.val[1], t1.val[1]);
s0 = t2.val[0];
s1 = t2.val[1];
s2 = t3.val[0];
s3 = t3.val[1];
}
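// Private lane rotations: vextq_s32 rotates the vector and a single lane is then
// patched. Only the x/y/z lanes are meaningful; the w lane is left unspecified.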
inline Int4 Int4::SwizzleYZX(const Int4& in)
{
int32x4_t ex = vextq_s32(in.s, in.s, 1);
return vsetq_lane_s32(vgetq_lane_s32(ex, 3), ex, 2);
}
inline Int4 Int4::SwizzleZXY(const Int4& in)
{
int32x4_t ex = vextq_s32(in.s, in.s, 3);
return vsetq_lane_s32(vgetq_lane_s32(ex, 3), ex, 0);
}
} // namespace SIMD
} // namespace Effekseer
#endif
#endif // __EFFEKSEER_SIMD_INT4_NEON_H__
#ifndef __EFFEKSEER_SIMD_INT4_SSE_H__
#define __EFFEKSEER_SIMD_INT4_SSE_H__
#if defined(EFK_SIMD_SSE2)
namespace Effekseer
{
namespace SIMD
{
struct Float4;
/**
@brief simd class for sse
*/
struct alignas(16) Int4
{
__m128i s;
Int4() = default;
Int4(const Int4& rhs) = default;
Int4(__m128i rhs) { s = rhs; }
Int4(__m128 rhs) { s = _mm_castps_si128(rhs); }
Int4(int32_t x, int32_t y, int32_t z, int32_t w) { s = _mm_setr_epi32((int)x, (int)y, (int)z, (int)w); }
Int4(int32_t i) { s = _mm_set1_epi32((int)i); }
int32_t GetX() const { return _mm_cvtsi128_si32(s); }
int32_t GetY() const { return _mm_cvtsi128_si32(Swizzle<1,1,1,1>(s).s); }
int32_t GetZ() const { return _mm_cvtsi128_si32(Swizzle<2,2,2,2>(s).s); }
int32_t GetW() const { return _mm_cvtsi128_si32(Swizzle<3,3,3,3>(s).s); }
void SetX(int32_t i) { s = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s), _mm_castsi128_ps(_mm_cvtsi32_si128(i)))); }
void SetY(int32_t i) { s = Swizzle<1,0,2,3>(_mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(Swizzle<1,0,2,3>(s).s), _mm_castsi128_ps(_mm_cvtsi32_si128(i))))).s; }
void SetZ(int32_t i) { s = Swizzle<2,1,0,3>(_mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(Swizzle<2,1,0,3>(s).s), _mm_castsi128_ps(_mm_cvtsi32_si128(i))))).s; }
void SetW(int32_t i) { s = Swizzle<3,1,2,0>(_mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(Swizzle<3,1,2,0>(s).s), _mm_castsi128_ps(_mm_cvtsi32_si128(i))))).s; }
Float4 Convert4f() const;
Float4 Cast4f() const;
Int4& operator+=(const Int4& rhs);
Int4& operator-=(const Int4& rhs);
Int4& operator*=(const Int4& rhs);
Int4& operator*=(int32_t rhs);
Int4& operator/=(const Int4& rhs);
Int4& operator/=(int32_t rhs);
static Int4 Load2(const void* mem);
static void Store2(void* mem, const Int4& i);
static Int4 Load3(const void* mem);
static void Store3(void* mem, const Int4& i);
static Int4 Load4(const void* mem);
static void Store4(void* mem, const Int4& i);
static Int4 SetZero();
static Int4 Abs(const Int4& in);
static Int4 Min(const Int4& lhs, const Int4& rhs);
static Int4 Max(const Int4& lhs, const Int4& rhs);
static Int4 MulAdd(const Int4& a, const Int4& b, const Int4& c);
static Int4 MulSub(const Int4& a, const Int4& b, const Int4& c);
template<size_t LANE>
static Int4 MulLane(const Int4& lhs, const Int4& rhs);
template<size_t LANE>
static Int4 MulAddLane(const Int4& a, const Int4& b, const Int4& c);
template<size_t LANE>
static Int4 MulSubLane(const Int4& a, const Int4& b, const Int4& c);
template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
static Int4 Swizzle(const Int4& v);
template <int COUNT>
static Int4 ShiftL(const Int4& in);
template <int COUNT>
static Int4 ShiftR(const Int4& in);
template <int COUNT>
static Int4 ShiftRA(const Int4& in);
template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
static Int4 Mask();
static uint32_t MoveMask(const Int4& in);
static Int4 Equal(const Int4& lhs, const Int4& rhs);
static Int4 NotEqual(const Int4& lhs, const Int4& rhs);
static Int4 LessThan(const Int4& lhs, const Int4& rhs);
static Int4 LessEqual(const Int4& lhs, const Int4& rhs);
static Int4 GreaterThan(const Int4& lhs, const Int4& rhs);
static Int4 GreaterEqual(const Int4& lhs, const Int4& rhs);
};
inline Int4 operator+(const Int4& lhs, const Int4& rhs)
{
return Int4{_mm_add_epi32(lhs.s, rhs.s)};
}
inline Int4 operator-(const Int4& lhs, const Int4& rhs)
{
return Int4{_mm_sub_epi32(lhs.s, rhs.s)};
}
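// 32-bit lane multiply: SSE4.1 provides _mm_mullo_epi32 directly. The SSE2
// fallback multiplies the even and odd lanes with _mm_mul_epu32 (the low 32 bits
// of each product are identical for signed and unsigned inputs) and repacks them.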
inline Int4 operator*(const Int4& lhs, const Int4& rhs)
{
#if defined(EFK_SIMD_SSE4_1)
return _mm_mullo_epi32(lhs.s, rhs.s);
#else
__m128i tmp1 = _mm_mul_epu32(lhs.s, rhs.s);
__m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(lhs.s, 4), _mm_srli_si128(rhs.s, 4));
return _mm_unpacklo_epi32(
_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0,0,2,0)),
_mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0,0,2,0)));
#endif
}
inline Int4 operator*(const Int4& lhs, int32_t rhs)
{
#if defined(EFK_SIMD_SSE4_1)
return _mm_mullo_epi32(lhs.s, _mm_set1_epi32(rhs));
#else
__m128i tmp1 = _mm_mul_epu32(lhs.s, _mm_set1_epi32(rhs));
__m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(lhs.s, 4), _mm_set1_epi32(rhs));
return _mm_unpacklo_epi32(
_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0,0,2,0)),
_mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0,0,2,0)));
#endif
}
inline Int4 operator/(const Int4& lhs, const Int4& rhs)
{
return Int4(
lhs.GetX() / rhs.GetX(),
lhs.GetY() / rhs.GetY(),
lhs.GetZ() / rhs.GetZ(),
lhs.GetW() / rhs.GetW());
}
inline Int4 operator/(const Int4& lhs, int32_t rhs)
{
return Int4(
lhs.GetX() / rhs,
lhs.GetY() / rhs,
lhs.GetZ() / rhs,
lhs.GetW() / rhs);
}
inline Int4 operator&(const Int4& lhs, const Int4& rhs)
{
return Int4{_mm_and_si128(lhs.s, rhs.s)};
}
inline Int4 operator|(const Int4& lhs, const Int4& rhs)
{
return Int4{_mm_or_si128(lhs.s, rhs.s)};
}
inline Int4 operator^(const Int4& lhs, const Int4& rhs)
{
return Int4{_mm_xor_si128(lhs.s, rhs.s)};
}
inline bool operator==(const Int4& lhs, const Int4& rhs)
{
return Int4::MoveMask(Int4::Equal(lhs, rhs)) == 0xf;
}
inline bool operator!=(const Int4& lhs, const Int4& rhs)
{
return Int4::MoveMask(Int4::Equal(lhs, rhs)) != 0xf;
}
inline Int4& Int4::operator+=(const Int4& rhs) { return *this = *this + rhs; }
inline Int4& Int4::operator-=(const Int4& rhs) { return *this = *this - rhs; }
inline Int4& Int4::operator*=(const Int4& rhs) { return *this = *this * rhs; }
inline Int4& Int4::operator*=(int32_t rhs) { return *this = *this * rhs; }
inline Int4& Int4::operator/=(const Int4& rhs) { return *this = *this / rhs; }
inline Int4& Int4::operator/=(int32_t rhs) { return *this = *this / rhs; }
inline Int4 Int4::Load2(const void* mem)
{
__m128 x = _mm_load_ss((const float*)mem + 0);
__m128 y = _mm_load_ss((const float*)mem + 1);
return _mm_castps_si128(_mm_unpacklo_ps(x, y));
}
inline void Int4::Store2(void* mem, const Int4& i)
{
Int4 t1 = Swizzle<1,1,1,1>(i);
_mm_store_ss((float*)mem + 0, _mm_castsi128_ps(i.s));
_mm_store_ss((float*)mem + 1, _mm_castsi128_ps(t1.s));
}
inline Int4 Int4::Load3(const void* mem)
{
__m128 x = _mm_load_ss((const float*)mem + 0);
__m128 y = _mm_load_ss((const float*)mem + 1);
__m128 z = _mm_load_ss((const float*)mem + 2);
__m128 xy = _mm_unpacklo_ps(x, y);
return _mm_castps_si128(_mm_movelh_ps(xy, z));
}
inline void Int4::Store3(void* mem, const Int4& i)
{
Int4 t1 = Swizzle<1,1,1,1>(i);
Int4 t2 = Swizzle<2,2,2,2>(i);
_mm_store_ss((float*)mem + 0, _mm_castsi128_ps(i.s));
_mm_store_ss((float*)mem + 1, _mm_castsi128_ps(t1.s));
_mm_store_ss((float*)mem + 2, _mm_castsi128_ps(t2.s));
}
inline Int4 Int4::Load4(const void* mem)
{
return _mm_loadu_si128((const __m128i*)mem);
}
inline void Int4::Store4(void* mem, const Int4& i)
{
_mm_storeu_si128((__m128i*)mem, i.s);
}
inline Int4 Int4::SetZero()
{
return _mm_setzero_si128();
}
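// SSSE3 has _mm_abs_epi32; the SSE2 fallback uses the sign-mask trick
// (x ^ sign) - sign, where sign is x arithmetically shifted right by 31.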
inline Int4 Int4::Abs(const Int4& in)
{
#if defined(EFK_SIMD_SSSE3)
return _mm_abs_epi32(in.s);
#else
__m128i sign = _mm_srai_epi32(in.s, 31);
return _mm_sub_epi32(_mm_xor_si128(in.s, sign), sign);
#endif
}
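// Pre-SSE4.1 there is no packed signed 32-bit min/max, so the fallback blends
// the two inputs with a compare mask.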
inline Int4 Int4::Min(const Int4& lhs, const Int4& rhs)
{
#if defined(EFK_SIMD_SSE4_1)
return _mm_min_epi32(lhs.s, rhs.s);
#else
__m128i mask = _mm_cmplt_epi32(lhs.s, rhs.s);
return _mm_or_si128(_mm_and_si128(mask, lhs.s), _mm_andnot_si128(mask, rhs.s));
#endif
}
inline Int4 Int4::Max(const Int4& lhs, const Int4& rhs)
{
#if defined(EFK_SIMD_SSE4_1)
return _mm_max_epi32(lhs.s, rhs.s);
#else
__m128i mask = _mm_cmpgt_epi32(lhs.s, rhs.s);
return _mm_or_si128(_mm_and_si128(mask, lhs.s), _mm_andnot_si128(mask, rhs.s));
#endif
}
inline Int4 Int4::MulAdd(const Int4& a, const Int4& b, const Int4& c)
{
return a + b * c;
}
inline Int4 Int4::MulSub(const Int4& a, const Int4& b, const Int4& c)
{
return a - b * c;
}
template<size_t LANE>
Int4 Int4::MulLane(const Int4& lhs, const Int4& rhs)
{
static_assert(LANE < 4, "LANE must be less than 4.");
return lhs * Int4::Swizzle<LANE,LANE,LANE,LANE>(rhs);
}
template<size_t LANE>
Int4 Int4::MulAddLane(const Int4& a, const Int4& b, const Int4& c)
{
static_assert(LANE < 4, "LANE must be less than 4.");
return a + b * Int4::Swizzle<LANE,LANE,LANE,LANE>(c);
}
template<size_t LANE>
Int4 Int4::MulSubLane(const Int4& a, const Int4& b, const Int4& c)
{
static_assert(LANE < 4, "LANE must be less than 4.");
return a - b * Int4::Swizzle<LANE,LANE,LANE,LANE>(c);
}
template <uint32_t indexX, uint32_t indexY, uint32_t indexZ, uint32_t indexW>
Int4 Int4::Swizzle(const Int4& v)
{
static_assert(indexX < 4, "indexX must be less than 4.");
static_assert(indexY < 4, "indexY must be less than 4.");
static_assert(indexZ < 4, "indexZ must be less than 4.");
static_assert(indexW < 4, "indexW must be less than 4.");
return Int4{_mm_shuffle_epi32(v.s, _MM_SHUFFLE(indexW, indexZ, indexY, indexX))};
}
template <int COUNT>
inline Int4 Int4::ShiftL(const Int4& lhs)
{
return _mm_slli_epi32(lhs.s, COUNT);
}
template <int COUNT>
inline Int4 Int4::ShiftR(const Int4& lhs)
{
return _mm_srli_epi32(lhs.s, COUNT);
}
template <int COUNT>
inline Int4 Int4::ShiftRA(const Int4& lhs)
{
return _mm_srai_epi32(lhs.s, COUNT);
}
template <uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
inline Int4 Int4::Mask()
{
static_assert(X < 2, "X must be 0 or 1.");
static_assert(Y < 2, "Y must be 0 or 1.");
static_assert(Z < 2, "Z must be 0 or 1.");
static_assert(W < 2, "W must be 0 or 1.");
return _mm_setr_epi32(
(int)(0xffffffff * X),
(int)(0xffffffff * Y),
(int)(0xffffffff * Z),
(int)(0xffffffff * W));
}
inline uint32_t Int4::MoveMask(const Int4& in)
{
return (uint32_t)_mm_movemask_ps(_mm_castsi128_ps(in.s));
}
inline Int4 Int4::Equal(const Int4& lhs, const Int4& rhs)
{
return Int4{_mm_cmpeq_epi32(lhs.s, rhs.s)};
}
inline Int4 Int4::NotEqual(const Int4& lhs, const Int4& rhs)
{
return Int4{_mm_andnot_si128(_mm_cmpeq_epi32(lhs.s, rhs.s), _mm_set1_epi32(-1))};
}
inline Int4 Int4::LessThan(const Int4& lhs, const Int4& rhs)
{
return Int4{_mm_cmplt_epi32(lhs.s, rhs.s)};
}
inline Int4 Int4::LessEqual(const Int4& lhs, const Int4& rhs)
{
return Int4{_mm_andnot_si128(_mm_cmpgt_epi32(lhs.s, rhs.s), _mm_set1_epi32(-1))};
}
inline Int4 Int4::GreaterThan(const Int4& lhs, const Int4& rhs)
{
return Int4{_mm_cmpgt_epi32(lhs.s, rhs.s)};
}
inline Int4 Int4::GreaterEqual(const Int4& lhs, const Int4& rhs)
{
return Int4{_mm_andnot_si128(_mm_cmplt_epi32(lhs.s, rhs.s), _mm_set1_epi32(-1))};
}
} // namespace SIMD
} // namespace Effekseer
#endif
#endif // __EFFEKSEER_SIMD_INT4_SSE_H__
#ifndef __EFFEKSEER_SIMD_BRIDGE_GEN_H__
#define __EFFEKSEER_SIMD_BRIDGE_GEN_H__
#if defined(EFK_SIMD_GEN)
namespace Effekseer
{
namespace SIMD
{
inline Int4 Float4::Convert4i() const { return Int4((int32_t)vf[0], (int32_t)vf[1], (int32_t)vf[2], (int32_t)vf[3]); }
inline Int4 Float4::Cast4i() const { return Int4(vu[0], vu[1], vu[2], vu[3]); }
inline Float4 Int4::Convert4f() const { return Float4((float)vi[0], (float)vi[1], (float)vi[2], (float)vi[3]); }
inline Float4 Int4::Cast4f() const { return Float4(vf[0], vf[1], vf[2], vf[3]); }
} // namespace SIMD
} // namespace Effekseer
#endif
#endif // __EFFEKSEER_SIMD_BRIDGE_GEN_H__
#ifndef __EFFEKSEER_SIMD_BRIDGE_NEON_H__
#define __EFFEKSEER_SIMD_BRIDGE_NEON_H__
#if defined(EFK_SIMD_NEON)
namespace Effekseer
{
namespace SIMD
{
inline Int4 Float4::Convert4i() const { return vcvtq_s32_f32(s); }
inline Int4 Float4::Cast4i() const { return vreinterpretq_s32_f32(s); }
inline Float4 Int4::Convert4f() const { return vcvtq_f32_s32(s); }
inline Float4 Int4::Cast4f() const { return vreinterpretq_f32_s32(s); }
} // namespace SIMD
} // namespace Effekseer
#endif
#endif // __EFFEKSEER_SIMD_BRIDGE_NEON_H__
#ifndef __EFFEKSEER_SIMD_BRIDGE_SSE_H__
#define __EFFEKSEER_SIMD_BRIDGE_SSE_H__
#if defined(EFK_SIMD_SSE2)
namespace Effekseer
{
namespace SIMD
{
inline Int4 Float4::Convert4i() const { return _mm_cvttps_epi32(s); }
inline Int4 Float4::Cast4i() const { return _mm_castps_si128(s); }
inline Float4 Int4::Convert4f() const { return _mm_cvtepi32_ps(s); }
inline Float4 Int4::Cast4f() const { return _mm_castsi128_ps(s); }
} // namespace SIMD
} // namespace Effekseer
#endif
#endif // __EFFEKSEER_SIMD_BRIDGE_SSE_H__
#ifndef __EFFEKSEER_SIMD_VEC2F_H__
#define __EFFEKSEER_SIMD_VEC2F_H__
namespace Effekseer
{
struct Vector2D;
struct vector2d;
namespace SIMD
{
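// 2D vector stored in a Float4; the scalar constructors pad the unused lanes
// with (z, w) = (0, 1), and comparisons only look at the x/y lanes.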
struct Vec2f
{
Float4 s;
explicit Vec2f() = default;
Vec2f(const Vec2f& vec) = default;
Vec2f(float x, float y): s(x, y, 0.0f, 1.0f) {}
Vec2f(const std::array<float, 2>& v): s(v[0], v[1], 0.0f, 1.0f) {}
Vec2f(const Float4& vec): s(vec) {}
Vec2f(const Vector2D& vec);
Vec2f(const vector2d& vec);
float GetX() const { return s.GetX(); }
float GetY() const { return s.GetY(); }
void SetX(float o) { s.SetX(o); }
void SetY(float o) { s.SetY(o); }
Vec2f& operator+=(const Vec2f& o) { s += o.s; return *this; }
Vec2f& operator-=(const Vec2f& o) { s -= o.s; return *this; }
Vec2f& operator*=(const Vec2f& o) { s *= o.s; return *this; }
Vec2f& operator*=(float o) { s *= o; return *this; }
Vec2f& operator/=(const Vec2f& o) { s /= o.s; return *this; }
Vec2f& operator/=(float o) { s /= o; return *this; }
float LengthSq() const;
float Length() const;
bool IsZero(float range = DefaultEpsilon) const;
Vec2f Normalize() const;
static Vec2f Load(const void* mem);
static void Store(void* mem, const Vec2f& i);
static Vec2f Sqrt(const Vec2f& i);
static Vec2f Rsqrt(const Vec2f& i);
static Vec2f Abs(const Vec2f& i);
static Vec2f Min(const Vec2f& lhs, const Vec2f& rhs);
static Vec2f Max(const Vec2f& lhs, const Vec2f& rhs);
static bool Equal(const Vec2f& lhs, const Vec2f& rhs, float epsilon);
};
inline Vec2f operator+(const Vec2f& lhs, const Vec2f& rhs)
{
return Vec2f{lhs.s + rhs.s};
}
inline Vec2f operator-(const Vec2f& lhs, const Vec2f& rhs)
{
return Vec2f{lhs.s - rhs.s};
}
inline Vec2f operator*(const Vec2f& lhs, const Vec2f& rhs)
{
return Vec2f{lhs.s * rhs.s};
}
inline Vec2f operator*(const Vec2f& lhs, float rhs)
{
return Vec2f{lhs.s * rhs};
}
inline Vec2f operator/(const Vec2f& lhs, const Vec2f& rhs)
{
return Vec2f{lhs.s / rhs.s};
}
inline Vec2f operator/(const Vec2f& lhs, float rhs)
{
return Vec2f{lhs.s / rhs};
}
inline bool operator==(const Vec2f& lhs, const Vec2f& rhs)
{
return (Float4::MoveMask(Float4::Equal(lhs.s, rhs.s)) & 0x03) == 0x3;
}
inline bool operator!=(const Vec2f& lhs, const Vec2f& rhs)
{
return (Float4::MoveMask(Float4::Equal(lhs.s, rhs.s)) & 0x03) != 0x3;
}
inline Vec2f Vec2f::Load(const void* mem)
{
return Float4::Load2(mem);
}
inline void Vec2f::Store(void* mem, const Vec2f& i)
{
Float4::Store2(mem, i.s);
}
inline Vec2f Vec2f::Sqrt(const Vec2f& i)
{
return Vec2f{Float4::Sqrt(i.s)};
}
inline Vec2f Vec2f::Rsqrt(const Vec2f& i)
{
return Vec2f{Float4::Rsqrt(i.s)};
}
inline Vec2f Vec2f::Abs(const Vec2f& i)
{
return Vec2f{Float4::Abs(i.s)};
}
inline Vec2f Vec2f::Min(const Vec2f& lhs, const Vec2f& rhs)
{
return Vec2f{Float4::Min(lhs.s, rhs.s)};
}
inline Vec2f Vec2f::Max(const Vec2f& lhs, const Vec2f& rhs)
{
return Vec2f{Float4::Max(lhs.s, rhs.s)};
}
inline bool Vec2f::Equal(const Vec2f& lhs, const Vec2f& rhs, float epsilon)
{
return (Float4::MoveMask(Float4::NearEqual(lhs.s, rhs.s, epsilon)) & 0x3) == 0x3;
}
inline float Vec2f::LengthSq() const
{
auto o = s * s;
return o.GetX() + o.GetY();
}
inline float Vec2f::Length() const
{
return Effekseer::SIMD::Sqrt(LengthSq());
}
inline bool Vec2f::IsZero(float range) const
{
return LengthSq() < range * range;
}
inline Vec2f Vec2f::Normalize() const
{
return *this * Effekseer::SIMD::Rsqrt(LengthSq());
}
} // namespace SIMD
} // namespace Effekseer
#endif // __EFFEKSEER_SIMD_VEC2F_H__
#ifndef __EFFEKSEER_SIMD_VEC3F_H__
#define __EFFEKSEER_SIMD_VEC3F_H__
#include <functional>
namespace Effekseer
{
struct Vector3D;
struct vector3d;
namespace SIMD
{
struct Mat43f;
struct Mat44f;
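// 3D vector stored in a Float4; the w lane is padding (the scalar constructor
// sets it to 1) and is ignored by comparisons and the length functions.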
struct Vec3f
{
Float4 s;
explicit Vec3f() = default;
Vec3f(const Vec3f& vec) = default;
Vec3f(float x, float y, float z)
: s(x, y, z, 1.0f)
{
}
Vec3f(const Float4& vec)
: s(vec)
{
}
Vec3f(const Vector3D& vec);
Vec3f(const vector3d& vec);
Vec3f(const std::array<float, 3>& vec);
float GetX() const
{
return s.GetX();
}
float GetY() const
{
return s.GetY();
}
float GetZ() const
{
return s.GetZ();
}
void SetX(float o)
{
s.SetX(o);
}
void SetY(float o)
{
s.SetY(o);
}
void SetZ(float o)
{
s.SetZ(o);
}
Vec3f& operator+=(const Vec3f& o)
{
s += o.s;
return *this;
}
Vec3f& operator-=(const Vec3f& o)
{
s -= o.s;
return *this;
}
Vec3f& operator*=(const Vec3f& o)
{
s *= o.s;
return *this;
}
Vec3f& operator*=(float o)
{
s *= o;
return *this;
}
Vec3f& operator/=(const Vec3f& o)
{
s /= o.s;
return *this;
}
Vec3f& operator/=(float o)
{
s /= o;
return *this;
}
float GetSquaredLength() const;
float GetLength() const;
bool IsZero(float epsilon = DefaultEpsilon) const;
Vec3f Normalize() const;
Vec3f NormalizePrecisely() const;
Vec3f NormalizeFast() const;
static Vec3f Load(const void* mem);
static void Store(void* mem, const Vec3f& i);
static Vec3f Sqrt(const Vec3f& i);
static Vec3f Rsqrt(const Vec3f& i);
static Vec3f Abs(const Vec3f& i);
static Vec3f Min(const Vec3f& lhs, const Vec3f& rhs);
static Vec3f Max(const Vec3f& lhs, const Vec3f& rhs);
static float Dot(const Vec3f& lhs, const Vec3f& rhs);
static Vec3f Cross(const Vec3f& lhs, const Vec3f& rhs);
static bool Equal(const Vec3f& lhs, const Vec3f& rhs, float epsilon = DefaultEpsilon);
static Vec3f Transform(const Vec3f& lhs, const Mat43f& rhs);
static Vec3f Transform(const Vec3f& lhs, const Mat44f& rhs);
};
inline Vec3f operator-(const Vec3f& i)
{
return Vec3f(-i.GetX(), -i.GetY(), -i.GetZ());
}
inline Vec3f operator+(const Vec3f& lhs, const Vec3f& rhs)
{
return Vec3f{lhs.s + rhs.s};
}
inline Vec3f operator-(const Vec3f& lhs, const Vec3f& rhs)
{
return Vec3f{lhs.s - rhs.s};
}
inline Vec3f operator*(const Vec3f& lhs, const Vec3f& rhs)
{
return Vec3f{lhs.s * rhs.s};
}
inline Vec3f operator*(const Vec3f& lhs, float rhs)
{
return Vec3f{lhs.s * rhs};
}
inline Vec3f operator/(const Vec3f& lhs, const Vec3f& rhs)
{
return Vec3f{lhs.s / rhs.s};
}
inline Vec3f operator/(const Vec3f& lhs, float rhs)
{
return Vec3f{lhs.s / rhs};
}
inline bool operator==(const Vec3f& lhs, const Vec3f& rhs)
{
return (Float4::MoveMask(Float4::Equal(lhs.s, rhs.s)) & 0x07) == 0x7;
}
inline bool operator!=(const Vec3f& lhs, const Vec3f& rhs)
{
return (Float4::MoveMask(Float4::Equal(lhs.s, rhs.s)) & 0x07) != 0x7;
}
inline Vec3f Vec3f::Load(const void* mem)
{
return Float4::Load3(mem);
}
inline void Vec3f::Store(void* mem, const Vec3f& i)
{
Float4::Store3(mem, i.s);
}
inline Vec3f Vec3f::Sqrt(const Vec3f& i)
{
return Vec3f{Float4::Sqrt(i.s)};
}
inline Vec3f Vec3f::Rsqrt(const Vec3f& i)
{
return Vec3f{Float4::Rsqrt(i.s)};
}
inline Vec3f Vec3f::Abs(const Vec3f& i)
{
return Vec3f{Float4::Abs(i.s)};
}
inline Vec3f Vec3f::Min(const Vec3f& lhs, const Vec3f& rhs)
{
return Vec3f{Float4::Min(lhs.s, rhs.s)};
}
inline Vec3f Vec3f::Max(const Vec3f& lhs, const Vec3f& rhs)
{
return Vec3f{Float4::Max(lhs.s, rhs.s)};
}
inline float Vec3f::Dot(const Vec3f& lhs, const Vec3f& rhs)
{
return Float4::Dot3(lhs.s, rhs.s).GetX();
}
inline Vec3f Vec3f::Cross(const Vec3f& lhs, const Vec3f& rhs)
{
return Float4::Cross3(lhs.s, rhs.s);
}
inline bool Vec3f::Equal(const Vec3f& lhs, const Vec3f& rhs, float epsilon)
{
return (Float4::MoveMask(Float4::NearEqual(lhs.s, rhs.s, epsilon)) & 0x7) == 0x7;
}
inline float Vec3f::GetSquaredLength() const
{
auto o = s * s;
return o.GetX() + o.GetY() + o.GetZ();
}
inline float Vec3f::GetLength() const
{
return Effekseer::SIMD::Sqrt(GetSquaredLength());
}
inline bool Vec3f::IsZero(float epsilon) const
{
return (Float4::MoveMask(Float4::IsZero(s, epsilon)) & 0x7) == 0x7;
}
inline Vec3f Vec3f::Normalize() const
{
return *this * Effekseer::SIMD::Rsqrt(GetSquaredLength());
}
inline Vec3f Vec3f::NormalizePrecisely() const
{
return *this / Effekseer::SIMD::Sqrt(GetSquaredLength());
}
inline Vec3f Vec3f::NormalizeFast() const
{
return *this * Effekseer::SIMD::Rsqrt(GetSquaredLength());
}
} // namespace SIMD
} // namespace Effekseer
namespace std
{
template <>
struct hash<Effekseer::SIMD::Vec3f>
{
size_t operator()(const Effekseer::SIMD::Vec3f& _Keyval) const noexcept
{
return std::hash<float>()(_Keyval.GetX()) + std::hash<float>()(_Keyval.GetY()) + std::hash<float>()(_Keyval.GetZ());
}
};
} // namespace std
#endif // __EFFEKSEER_SIMD_VEC3F_H__
#ifndef __EFFEKSEER_SIMD_VEC4F_H__
#define __EFFEKSEER_SIMD_VEC4F_H__
namespace Effekseer
{
namespace SIMD
{
struct Vec4f
{
Float4 s;
Vec4f() = default;
Vec4f(const Vec4f& vec) = default;
Vec4f(const Float4& vec): s(vec) {}
float GetX() const { return s.GetX(); }
float GetY() const { return s.GetY(); }
float GetZ() const { return s.GetZ(); }
float GetW() const { return s.GetW(); }
void SetX(float o) { s.SetX(o); }
void SetY(float o) { s.SetY(o); }
void SetZ(float o) { s.SetZ(o); }
void SetW(float o) { s.SetW(o); }
Vec4f& operator+=(const Vec4f& o)
{
this->s = this->s + o.s;
return *this;
}
Vec4f& operator-=(const Vec4f& o)
{
this->s = this->s - o.s;
return *this;
}
Vec4f& operator*=(const Vec4f& o)
{
this->s = this->s * o.s;
return *this;
}
Vec4f& operator/=(const Vec4f& o)
{
this->s = this->s / o.s;
return *this;
}
static Vec4f Sqrt(const Vec4f& i);
static Vec4f Rsqrt(const Vec4f& i);
static Vec4f Abs(const Vec4f& i);
static Vec4f Min(const Vec4f& lhs, const Vec4f& rhs);
static Vec4f Max(const Vec4f& lhs, const Vec4f& rhs);
static bool Equal(const Vec4f& lhs, const Vec4f& rhs, float epsilon);
static Vec4f Transform(const Vec4f& lhs, const Mat43f& rhs);
static Vec4f Transform(const Vec4f& lhs, const Mat44f& rhs);
};
inline Vec4f operator+(const Vec4f& lhs, const Vec4f& rhs) { return Vec4f{lhs.s + rhs.s}; }
inline Vec4f operator-(const Vec4f& lhs, const Vec4f& rhs) { return Vec4f{lhs.s - rhs.s}; }
inline Vec4f operator*(const Vec4f& lhs, const Vec4f& rhs) { return Vec4f{lhs.s * rhs.s}; }
inline Vec4f operator/(const Vec4f& lhs, const Vec4f& rhs) { return Vec4f{lhs.s / rhs.s}; }
inline bool operator==(const Vec4f& lhs, const Vec4f& rhs)
{
return Float4::MoveMask(Float4::Equal(lhs.s, rhs.s)) == 0xf;
}
inline bool operator!=(const Vec4f& lhs, const Vec4f& rhs)
{
return Float4::MoveMask(Float4::Equal(lhs.s, rhs.s)) != 0xf;
}
inline Vec4f Vec4f::Sqrt(const Vec4f& i)
{
return Vec4f{Float4::Sqrt(i.s)};
}
inline Vec4f Vec4f::Rsqrt(const Vec4f& i)
{
return Vec4f{Float4::Rsqrt(i.s)};
}
inline Vec4f Vec4f::Abs(const Vec4f& i)
{
return Vec4f{Float4::Abs(i.s)};
}
inline Vec4f Vec4f::Min(const Vec4f& lhs, const Vec4f& rhs)
{
return Vec4f{Float4::Min(lhs.s, rhs.s)};
}
inline Vec4f Vec4f::Max(const Vec4f& lhs, const Vec4f& rhs)
{
return Vec4f{Float4::Max(lhs.s, rhs.s)};
}
inline bool Vec4f::Equal(const Vec4f& lhs, const Vec4f& rhs, float epsilon)
{
return (Float4::MoveMask(Float4::NearEqual(lhs.s, rhs.s, epsilon)) & 0xf) == 0xf;
}
} // namespace SIMD
} // namespace Effekseer
#endif // __EFFEKSEER_SIMD_VEC4F_H__
#ifndef __EFFEKSEER_SIMD_MAT43F_H__
#define __EFFEKSEER_SIMD_MAT43F_H__
namespace Effekseer
{
struct Matrix43;
namespace SIMD
{
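// 4x3 matrix stored as three Float4 columns: X = (m11, m21, m31, m41),
// Y = (m12, m22, m32, m42), Z = (m13, m23, m33, m43). The translation entries
// (m41, m42, m43) therefore live in the w lanes.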
struct Mat43f
{
Float4 X;
Float4 Y;
Float4 Z;
Mat43f() = default;
Mat43f(const Mat43f& rhs) = default;
Mat43f(float m11, float m12, float m13,
float m21, float m22, float m23,
float m31, float m32, float m33,
float m41, float m42, float m43);
Mat43f(const Matrix43& mat);
bool IsValid() const;
Mat43f Get3x3SubMatrix() const;
Vec3f GetScale() const;
Mat43f GetRotation() const;
Vec3f GetTranslation() const;
void GetSRT(Vec3f& s, Mat43f& r, Vec3f& t) const;
void SetTranslation(const Vec3f& t);
Mat43f& operator*=(const Mat43f& rhs);
Mat43f& operator*=(float rhs);
static const Mat43f Identity;
static bool Equal(const Mat43f& lhs, const Mat43f& rhs, float epsilon = DefaultEpsilon);
static Mat43f SRT(const Vec3f& s, const Mat43f& r, const Vec3f& t);
static Mat43f Scaling(float x, float y, float z);
static Mat43f Scaling(const Vec3f& scale);
static Mat43f RotationX(float angle);
static Mat43f RotationY(float angle);
static Mat43f RotationZ(float angle);
static Mat43f RotationXYZ(float rx, float ry, float rz);
static Mat43f RotationZXY(float rz, float rx, float ry);
static Mat43f RotationAxis(const Vec3f& axis, float angle);
static Mat43f RotationAxis(const Vec3f& axis, float s, float c);
static Mat43f Translation(float x, float y, float z);
static Mat43f Translation(const Vec3f& pos);
};
inline Mat43f::Mat43f(
float m11, float m12, float m13,
float m21, float m22, float m23,
float m31, float m32, float m33,
float m41, float m42, float m43)
: X(m11, m21, m31, m41)
, Y(m12, m22, m32, m42)
, Z(m13, m23, m33, m43)
{
}
inline bool operator==(const Mat43f& lhs, const Mat43f& rhs)
{
return lhs.X == rhs.X && lhs.Y == rhs.Y && lhs.Z == rhs.Z;
}
inline bool operator!=(const Mat43f& lhs, const Mat43f& rhs)
{
return lhs.X != rhs.X || lhs.Y != rhs.Y || lhs.Z != rhs.Z;
}
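// Column-wise 4x3 * 4x3 product. The mask keeps only the w lane of each rhs
// column so the rhs translation passes straight through, which is equivalent to
// treating the missing fourth column as (0, 0, 0, 1).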
inline Mat43f operator*(const Mat43f& lhs, const Mat43f& rhs)
{
const Float4 mask = Float4::SetUInt(0, 0, 0, 0xffffffff);
Mat43f res;
res.X = mask & rhs.X;
res.X = Float4::MulAddLane<0>(res.X, lhs.X, rhs.X);
res.X = Float4::MulAddLane<1>(res.X, lhs.Y, rhs.X);
res.X = Float4::MulAddLane<2>(res.X, lhs.Z, rhs.X);
res.Y = mask & rhs.Y;
res.Y = Float4::MulAddLane<0>(res.Y, lhs.X, rhs.Y);
res.Y = Float4::MulAddLane<1>(res.Y, lhs.Y, rhs.Y);
res.Y = Float4::MulAddLane<2>(res.Y, lhs.Z, rhs.Y);
res.Z = mask & rhs.Z;
res.Z = Float4::MulAddLane<0>(res.Z, lhs.X, rhs.Z);
res.Z = Float4::MulAddLane<1>(res.Z, lhs.Y, rhs.Z);
res.Z = Float4::MulAddLane<2>(res.Z, lhs.Z, rhs.Z);
return res;
}
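// Transforms a point: transposes the columns into rows, then accumulates
// row * lane-broadcast products starting from the translation row (s3).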
inline Vec3f Vec3f::Transform(const Vec3f& lhs, const Mat43f& rhs)
{
Float4 s0 = rhs.X;
Float4 s1 = rhs.Y;
Float4 s2 = rhs.Z;
Float4 s3 = Float4::SetZero();
Float4::Transpose(s0, s1, s2, s3);
Float4 res = Float4::MulAddLane<0>(s3, s0, lhs.s);
res = Float4::MulAddLane<1>(res, s1, lhs.s);
res = Float4::MulAddLane<2>(res, s2, lhs.s);
return Vec3f{res};
}
inline Vec4f Vec4f::Transform(const Vec4f& lhs, const Mat43f& rhs)
{
Float4 s0 = rhs.X;
Float4 s1 = rhs.Y;
Float4 s2 = rhs.Z;
Float4 s3 = Float4(0.0f, 0.0f, 0.0f, 1.0f);
Float4::Transpose(s0, s1, s2, s3);
Float4 res = Float4::MulLane<0>(s0, lhs.s);
res = Float4::MulAddLane<1>(res, s1, lhs.s);
res = Float4::MulAddLane<2>(res, s2, lhs.s);
res = Float4::MulAddLane<3>(res, s3, lhs.s);
return res;
}
inline Mat43f& Mat43f::operator*=(const Mat43f& rhs)
{
*this = *this * rhs;
return *this;
}
inline Mat43f& Mat43f::operator*=(float rhs)
{
X *= rhs;
Y *= rhs;
Z *= rhs;
return *this;
}
} // namespace SIMD
} // namespace Effekseer
#endif // __EFFEKSEER_SIMD_MAT43F_H__
#ifndef __EFFEKSEER_SIMD_MAT44F_H__
#define __EFFEKSEER_SIMD_MAT44F_H__
namespace Effekseer
{
struct Matrix44;
namespace SIMD
{
struct Mat44f
{
Float4 X;
Float4 Y;
Float4 Z;
Float4 W;
Mat44f() = default;
Mat44f(const Mat44f& rhs) = default;
Mat44f(float m11, float m12, float m13, float m14,
float m21, float m22, float m23, float m24,
float m31, float m32, float m33, float m34,
float m41, float m42, float m43, float m44);
Mat44f(const Mat43f& mat);
Mat44f(const Matrix44& mat);
bool IsValid() const;
Vec3f GetScale() const;
Mat44f GetRotation() const;
Vec3f GetTranslation() const;
void GetSRT(Vec3f& s, Mat44f& r, Vec3f& t) const;
void SetTranslation(const Vec3f& t);
Mat44f Transpose() const;
Mat44f& operator*=(const Mat44f& rhs);
Mat44f& operator*=(float rhs);
static const Mat44f Identity;
static bool Equal(const Mat44f& lhs, const Mat44f& rhs, float epsilon = DefaultEpsilon);
static Mat44f SRT(const Vec3f& s, const Mat44f& r, const Vec3f& t);
static Mat44f Scaling(float x, float y, float z);
static Mat44f Scaling(const Vec3f& scale);
static Mat44f RotationX(float angle);
static Mat44f RotationY(float angle);
static Mat44f RotationZ(float angle);
static Mat44f RotationXYZ(float rx, float ry, float rz);
static Mat44f RotationZXY(float rz, float rx, float ry);
static Mat44f RotationAxis(const Vec3f& axis, float angle);
static Mat44f RotationAxis(const Vec3f& axis, float s, float c);
static Mat44f Translation(float x, float y, float z);
static Mat44f Translation(const Vec3f& pos);
};
inline Mat44f::Mat44f(
float m11, float m12, float m13, float m14,
float m21, float m22, float m23, float m24,
float m31, float m32, float m33, float m34,
float m41, float m42, float m43, float m44)
: X(m11, m21, m31, m41)
, Y(m12, m22, m32, m42)
, Z(m13, m23, m33, m43)
, W(m14, m24, m34, m44)
{
}
inline Mat44f::Mat44f(const Mat43f& mat)
: X(mat.X)
, Y(mat.Y)
, Z(mat.Z)
, W(0.0f, 0.0f, 0.0f, 1.0f)
{
}
inline bool operator==(const Mat44f& lhs, const Mat44f& rhs)
{
return lhs.X == rhs.X && lhs.Y == rhs.Y && lhs.Z == rhs.Z && lhs.W == rhs.W;
}
inline bool operator!=(const Mat44f& lhs, const Mat44f& rhs)
{
return lhs.X != rhs.X || lhs.Y != rhs.Y || lhs.Z != rhs.Z || lhs.W != rhs.W;
}
inline Mat44f operator*(const Mat44f& lhs, const Mat44f& rhs)
{
Mat44f res;
res.X = Float4::MulLane<0>(lhs.X, rhs.X);
res.X = Float4::MulAddLane<1>(res.X, lhs.Y, rhs.X);
res.X = Float4::MulAddLane<2>(res.X, lhs.Z, rhs.X);
res.X = Float4::MulAddLane<3>(res.X, lhs.W, rhs.X);
res.Y = Float4::MulLane<0>(lhs.X, rhs.Y);
res.Y = Float4::MulAddLane<1>(res.Y, lhs.Y, rhs.Y);
res.Y = Float4::MulAddLane<2>(res.Y, lhs.Z, rhs.Y);
res.Y = Float4::MulAddLane<3>(res.Y, lhs.W, rhs.Y);
res.Z = Float4::MulLane<0>(lhs.X, rhs.Z);
res.Z = Float4::MulAddLane<1>(res.Z, lhs.Y, rhs.Z);
res.Z = Float4::MulAddLane<2>(res.Z, lhs.Z, rhs.Z);
res.Z = Float4::MulAddLane<3>(res.Z, lhs.W, rhs.Z);
res.W = Float4::MulLane<0>(lhs.X, rhs.W);
res.W = Float4::MulAddLane<1>(res.W, lhs.Y, rhs.W);
res.W = Float4::MulAddLane<2>(res.W, lhs.Z, rhs.W);
res.W = Float4::MulAddLane<3>(res.W, lhs.W, rhs.W);
return res;
}
inline Vec3f Vec3f::Transform(const Vec3f& lhs, const Mat44f& rhs)
{
Float4 s0 = rhs.X;
Float4 s1 = rhs.Y;
Float4 s2 = rhs.Z;
Float4 s3 = rhs.W;
Float4::Transpose(s0, s1, s2, s3);
Float4 res = Float4::MulAddLane<0>(s3, s0, lhs.s);
res = Float4::MulAddLane<1>(res, s1, lhs.s);
res = Float4::MulAddLane<2>(res, s2, lhs.s);
return Vec3f{res};
}
inline Vec4f Vec4f::Transform(const Vec4f& lhs, const Mat44f& rhs)
{
Float4 s0 = rhs.X;
Float4 s1 = rhs.Y;
Float4 s2 = rhs.Z;
Float4 s3 = rhs.W;
Float4::Transpose(s0, s1, s2, s3);
Float4 res = Float4::MulLane<0>(s0, lhs.s);
res = Float4::MulAddLane<1>(res, s1, lhs.s);
res = Float4::MulAddLane<2>(res, s2, lhs.s);
res = Float4::MulAddLane<3>(res, s3, lhs.s);
return res;
}
inline Mat44f& Mat44f::operator*=(const Mat44f& rhs)
{
*this = *this * rhs;
return *this;
}
inline Mat44f& Mat44f::operator*=(float rhs)
{
X *= rhs;
Y *= rhs;
Z *= rhs;
W *= rhs;
return *this;
}
} // namespace SIMD
} // namespace Effekseer
#endif // __EFFEKSEER_SIMD_MAT44F_H__
#ifndef __EFFEKSEER_SIMD_QUATERNIONF_H__
#define __EFFEKSEER_SIMD_QUATERNIONF_H__
namespace Effekseer
{
namespace SIMD
{
struct Quaternionf
{
Float4 s;
Quaternionf() = default;
Quaternionf(float x, float y, float z, float w)
: s(x, y, z, w)
{
}
Quaternionf(Float4 s)
: s(s)
{
}
float GetX() const
{
return s.GetX();
}
float GetY() const
{
return s.GetY();
}
float GetZ() const
{
return s.GetZ();
}
float GetW() const
{
return s.GetW();
}
void SetX(float o)
{
s.SetX(o);
}
void SetY(float o)
{
s.SetY(o);
}
void SetZ(float o)
{
s.SetZ(o);
}
void SetW(float o)
{
s.SetW(o);
}
Quaternionf Inverse() const
{
return Quaternionf{-GetX(), -GetY(), -GetZ(), GetW()};
}
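// Standard matrix-to-quaternion conversion: use the trace when it is positive,
// otherwise branch on the largest diagonal element so the divisor stays well
// away from zero.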
static Quaternionf FromMatrix(const Mat44f& mat)
{
const auto tr = mat.X.GetX() + mat.Y.GetY() + mat.Z.GetZ();
if (tr > 0)
{
const auto qw = sqrtf(tr + 1.0f) / 2.0f;
const auto qx = (mat.Z.GetY() - mat.Y.GetZ()) / (4.0f * qw);
const auto qy = (mat.X.GetZ() - mat.Z.GetX()) / (4.0f * qw);
const auto qz = (mat.Y.GetX() - mat.X.GetY()) / (4.0f * qw);
return Quaternionf{qx, qy, qz, qw};
}
else if (mat.X.GetX() > mat.Y.GetY() && mat.X.GetX() > mat.Z.GetZ())
{
const auto qx = sqrtf(mat.X.GetX() - mat.Y.GetY() - mat.Z.GetZ() + 1.0f) / 2.0f;
const auto qw = (mat.Z.GetY() - mat.Y.GetZ()) / (4.0f * qx);
const auto qy = (mat.X.GetY() + mat.Y.GetX()) / (4.0f * qx);
const auto qz = (mat.X.GetZ() + mat.Z.GetX()) / (4.0f * qx);
return Quaternionf{qx, qy, qz, qw};
}
else if (mat.Y.GetY() > mat.Z.GetZ())
{
const auto qy = sqrtf(mat.Y.GetY() - mat.X.GetX() - mat.Z.GetZ() + 1.0f) / 2.0f;
const auto qw = (mat.X.GetZ() - mat.Z.GetX()) / (4.0f * qy);
const auto qx = (mat.X.GetY() + mat.Y.GetX()) / (4.0f * qy);
const auto qz = (mat.Y.GetZ() + mat.Z.GetY()) / (4.0f * qy);
return Quaternionf{qx, qy, qz, qw};
}
else
{
const auto qz = sqrtf(mat.Z.GetZ() - mat.X.GetX() - mat.Y.GetY() + 1.0f) / 2.0f;
const auto qw = (mat.Y.GetX() - mat.X.GetY()) / (4.0f * qz);
const auto qx = (mat.X.GetZ() + mat.Z.GetX()) / (4.0f * qz);
const auto qy = (mat.Y.GetZ() + mat.Z.GetY()) / (4.0f * qz);
return Quaternionf{qx, qy, qz, qw};
}
}
static Quaternionf FromMatrix(const Mat43f& mat)
{
const auto tr = mat.X.GetX() + mat.Y.GetY() + mat.Z.GetZ();
if (tr > 0)
{
const auto qw = sqrtf(tr + 1.0f) / 2.0f;
const auto qx = (mat.Z.GetY() - mat.Y.GetZ()) / (4.0f * qw);
const auto qy = (mat.X.GetZ() - mat.Z.GetX()) / (4.0f * qw);
const auto qz = (mat.Y.GetX() - mat.X.GetY()) / (4.0f * qw);
return Quaternionf{qx, qy, qz, qw};
}
else if (mat.X.GetX() > mat.Y.GetY() && mat.X.GetX() > mat.Z.GetZ())
{
const auto qx = sqrtf(mat.X.GetX() - mat.Y.GetY() - mat.Z.GetZ() + 1.0f) / 2.0f;
const auto qw = (mat.Z.GetY() - mat.Y.GetZ()) / (4.0f * qx);
const auto qy = (mat.X.GetY() + mat.Y.GetX()) / (4.0f * qx);
const auto qz = (mat.X.GetZ() + mat.Z.GetX()) / (4.0f * qx);
return Quaternionf{qx, qy, qz, qw};
}
else if (mat.Y.GetY() > mat.Z.GetZ())
{
const auto qy = sqrtf(mat.Y.GetY() - mat.X.GetX() - mat.Z.GetZ() + 1.0f) / 2.0f;
const auto qw = (mat.X.GetZ() - mat.Z.GetX()) / (4.0f * qy);
const auto qx = (mat.X.GetY() + mat.Y.GetX()) / (4.0f * qy);
const auto qz = (mat.Y.GetZ() + mat.Z.GetY()) / (4.0f * qy);
return Quaternionf{qx, qy, qz, qw};
}
else
{
const auto qz = sqrtf(mat.Z.GetZ() - mat.X.GetX() - mat.Y.GetY() + 1.0f) / 2.0f;
const auto qw = (mat.Y.GetX() - mat.X.GetY()) / (4.0f * qz);
const auto qx = (mat.X.GetZ() + mat.Z.GetX()) / (4.0f * qz);
const auto qy = (mat.Y.GetZ() + mat.Z.GetY()) / (4.0f * qz);
return Quaternionf{qx, qy, qz, qw};
}
}
Mat43f ToMatrix() const
{
const auto qx = GetX();
const auto qy = GetY();
const auto qz = GetZ();
const auto qw = GetW();
const auto qxx = qx * qx;
const auto qyy = qy * qy;
const auto qzz = qz * qz;
const auto qww = qw * qw;
const auto qxy = qx * qy;
const auto qxz = qx * qz;
const auto qyz = qy * qz;
const auto qxw = qx * qw;
const auto qyw = qy * qw;
const auto qzw = qz * qw;
Mat43f ret;
ret.X = SIMD::Float4{(qxx - qyy - qzz + qww), 2.0f * (qxy - qzw), 2.0f * (qxz + qyw), 0};
ret.Y = SIMD::Float4{2.0f * (qxy + qzw), (-qxx + qyy - qzz + qww), 2.0f * (qyz - qxw), 0};
ret.Z = SIMD::Float4{2.0f * (qxz - qyw), 2.0f * (qyz + qxw), (-qxx - qyy + qzz + qww), 0};
return ret;
}
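// Spherical linear interpolation. Negating q2 when the dot product is negative
// selects the shorter arc; for a very small angle the function falls back to
// returning q1 to avoid dividing by sin(alpha) near zero.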
static Quaternionf Slerp(const Quaternionf& q1, const Quaternionf& q2, float t)
{
const auto qq = q1.s * q2.s;
auto cosa = qq.GetX() + qq.GetY() + qq.GetZ() + qq.GetW();
if (cosa < 0.0f)
{
return Slerp(q1, Quaternionf{-q2.GetX(), -q2.GetY(), -q2.GetZ(), -q2.GetW()}, t);
}
cosa = Min(1.0f, cosa);
const auto alpha = acos(cosa);
const auto smallValue = 0.00001f;
if (alpha < smallValue)
{
return q1;
}
return Quaternionf{q1.s * sin((1.0f - t) * alpha) / sin(alpha) + q2.s * sin(t * alpha) / sin(alpha)};
}
static Vec3f Transform(const Vec3f& v, const Quaternionf& q)
{
const auto qx = q.GetX();
const auto qy = q.GetY();
const auto qz = q.GetZ();
const auto qw = q.GetW();
const auto qxx = qx * qx;
const auto qyy = qy * qy;
const auto qzz = qz * qz;
const auto qww = qw * qw;
const auto qxy = qx * qy;
const auto qxz = qx * qz;
const auto qyz = qy * qz;
const auto qxw = qx * qw;
const auto qyw = qy * qw;
const auto qzw = qz * qw;
const auto x = (qxx - qyy - qzz + qww) * v.GetX() + 2.0f * (qxy - qzw) * v.GetY() + 2.0f * (qxz + qyw) * v.GetZ();
const auto y = 2.0f * (qxy + qzw) * v.GetX() + (-qxx + qyy - qzz + qww) * v.GetY() + 2.0f * (qyz - qxw) * v.GetZ();
const auto z = 2.0f * (qxz - qyw) * v.GetX() + 2.0f * (qyz + qxw) * v.GetY() + (-qxx - qyy + qzz + qww) * v.GetZ();
return Vec3f{x, y, z};
}
};
inline Quaternionf operator*(const Quaternionf& lhs, const Quaternionf& rhs)
{
// TODO optimize
auto x = lhs.GetW() * rhs.GetX() - lhs.GetZ() * rhs.GetY() + lhs.GetY() * rhs.GetZ() + lhs.GetX() * rhs.GetW();
auto y = lhs.GetZ() * rhs.GetX() + lhs.GetW() * rhs.GetY() - lhs.GetX() * rhs.GetZ() + lhs.GetY() * rhs.GetW();
auto z = -lhs.GetY() * rhs.GetX() + lhs.GetX() * rhs.GetY() + lhs.GetW() * rhs.GetZ() + lhs.GetZ() * rhs.GetW();
auto w = -lhs.GetX() * rhs.GetX() - lhs.GetY() * rhs.GetY() - lhs.GetZ() * rhs.GetZ() + lhs.GetW() * rhs.GetW();
return Quaternionf{x, y, z, w};
}
} // namespace SIMD
} // namespace Effekseer
#endif // __EFFEKSEER_SIMD_QUATERNIONF_H__
#ifndef __EFFEKSEER_SIMD_UTILS_H__
#define __EFFEKSEER_SIMD_UTILS_H__
#include <stdlib.h>
namespace Effekseer
{
namespace SIMD
{
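// Mix-in that overrides single-object operator new/delete so heap allocations of
// derived types honor the requested alignment (needed for Float4 members on
// toolchains where plain new only guarantees 8- or 16-byte alignment). Array new
// is not overridden. Illustrative use, with a hypothetical type name:
//   struct MyAlignedNode : public AlignedAllocationPolicy<16> { Float4 value; };
//   auto* node = new MyAlignedNode(); // 16-byte aligned allocation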
template <size_t align>
class AlignedAllocationPolicy {
public:
static void* operator new(size_t size) {
#if defined(__EMSCRIPTEN__) && __EMSCRIPTEN_minor__ < 38
return malloc(size);
#elif defined(_WIN32)
return _mm_malloc(size, align);
#else
void *ptr = nullptr;
posix_memalign(&ptr, align, size);
return ptr;
#endif
}
static void operator delete(void* ptr) {
#if defined(__EMSCRIPTEN__) && __EMSCRIPTEN_minor__ < 38
free(ptr);
#elif defined(_WIN32)
_mm_free(ptr);
#else
return free(ptr);
#endif
}
};
inline Vector2D ToStruct(const Vec2f& o)
{
Vector2D ret;
Vec2f::Store(&ret, o);
return ret;
}
inline Vector3D ToStruct(const Vec3f& o)
{
Vector3D ret;
Vec3f::Store(&ret, o);
return ret;
}
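// Converts the column-wise SIMD matrix back to the row-major Matrix43 by
// transposing and storing three floats per row (Store3 drops the unused lane).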
inline Matrix43 ToStruct(const Mat43f& o)
{
Float4 tx = o.X;
Float4 ty = o.Y;
Float4 tz = o.Z;
Float4 tw = Float4::SetZero();
Float4::Transpose(tx, ty, tz, tw);
Matrix43 ret;
Float4::Store3(ret.Value[0], tx);
Float4::Store3(ret.Value[1], ty);
Float4::Store3(ret.Value[2], tz);
Float4::Store3(ret.Value[3], tw);
return ret;
}
inline Matrix44 ToStruct(const Mat44f& o)
{
Float4 tx = o.X;
Float4 ty = o.Y;
Float4 tz = o.Z;
Float4 tw = o.W;
Float4::Transpose(tx, ty, tz, tw);
Matrix44 ret;
Float4::Store4(ret.Values[0], tx);
Float4::Store4(ret.Values[1], ty);
Float4::Store4(ret.Values[2], tz);
Float4::Store4(ret.Values[3], tw);
return ret;
}
} // namespace SIMD
} // namespace Effekseer
#endif // __EFFEKSEER_SIMD_UTILS_H__