mirror of https://github.com/axmolengine/axmol.git

Update 3rdparty (#2252)

This commit is contained in:
parent c37bcf8977
commit a4a75644fb

Changed: 1k/1kiss.ps1 (29)
@@ -194,7 +194,9 @@ $1k = [_1kiss]::new()
 # * : any
 # x.y.z~x2.y2.z2 : range
 $manifest = @{
-msvc = '14.39+'; # cl.exe @link.exe 14.39 VS2022 17.9.x
+# cl.exe @link.exe 14.39 VS2022 17.9.x
+# exactly match format: '14.42.*'
+msvc = '14.39+';
 vs = '12.0+';
 ndk = 'r23c';
 xcode = '13.0.0+'; # range
@@ -203,7 +205,7 @@ $manifest = @{
 # clang-cl msvc14.40 require 17.0.0+
 llvm = '17.0.6+';
 gcc = '9.0.0+';
-cmake = '3.23.0+';
+cmake = '3.23.0~3.31.1+';
 ninja = '1.10.0+';
 python = '3.8.0+';
 jdk = '17.0.10+'; # jdk17+ works for android cmdlinetools 7.0+
@@ -458,19 +460,24 @@ if ($1k.isfile($manifest_file)) {
 # 1kdist
 $sentry_file = Join-Path $myRoot '.gitee'
 $mirror = if ($1k.isfile($sentry_file)) { 'gitee' } else { 'github' }
-$mirror_url_base = @{'github' = 'https://github.com/'; 'gitee' = 'https://gitee.com/' }[$mirror]
-$1kdist_url_base = $mirror_url_base
 $mirror_conf_file = $1k.realpath("$myRoot/../manifest.json")
 $mirror_current = $null
 $devtools_url_base = $null
 $1kdist_ver = $null

 if ($1k.isfile($mirror_conf_file)) {
 $mirror_conf = ConvertFrom-Json (Get-Content $mirror_conf_file -raw)
 $mirror_current = $mirror_conf.mirrors.$mirror
+$mirror_url_base = "https://$($mirror_current.host)/"
+$1kdist_url_base = $mirror_url_base

 $1kdist_url_base += $mirror_current.'1kdist'
 $devtools_url_base += "$1kdist_url_base/devtools"
 $1kdist_ver = $mirror_conf.versions.'1kdist'
 $1kdist_url_base += "/$1kdist_ver"
+} else {
+$mirror_url_base = 'https://github.com/'
+$1kdist_url_base = $mirror_url_base
 }

 function 1kdist_url($filename) {
@@ -773,8 +780,9 @@ function find_vs() {
 $required_vs_ver = $manifest['vs']
 if (!$required_vs_ver) { $required_vs_ver = '12.0+' }

-$require_comps = @('Microsoft.Component.MSBuild', 'Microsoft.VisualStudio.Component.VC.Tools.x86.x64')
-$vs_installs = ConvertFrom-Json "$(&$VSWHERE_EXE -version $required_vs_ver.TrimEnd('+') -format 'json' -requires $require_comps)"
+# refer: https://learn.microsoft.com/en-us/visualstudio/install/workload-and-component-ids?view=vs-2022
+$require_comps = @('Microsoft.VisualStudio.Component.VC.Tools.x86.x64', 'Microsoft.VisualStudio.Product.BuildTools')
+$vs_installs = ConvertFrom-Json "$(&$VSWHERE_EXE -version $required_vs_ver.TrimEnd('+') -format 'json' -requires $require_comps -requiresAny)"
 $ErrorActionPreference = $eap

 if ($vs_installs) {
@@ -789,7 +797,7 @@ function find_vs() {
 }
 $Global:VS_INST = $vs_inst_latest
 } else {
-throw "No suitable visual studio installed, required: $required_vs_ver"
+Write-Warning "Visual studio not found, your build may not work, required: $required_vs_ver"
 }
 }
 }
@@ -1275,16 +1283,19 @@ function setup_msvc() {
 if (!$cl_prog) {
 if ($Global:VS_INST) {
 $vs_path = $Global:VS_INST.installationPath
-Import-Module "$vs_path\Common7\Tools\Microsoft.VisualStudio.DevShell.dll"
 $dev_cmd_args = "-arch=$target_cpu -host_arch=x64 -no_logo"
+
+# if explicit version specified, use it
 if (!$manifest['msvc'].EndsWith('+')) { $dev_cmd_args += " -vcvars_ver=$cl_ver" }
+
+Import-Module "$vs_path\Common7\Tools\Microsoft.VisualStudio.DevShell.dll"
 Enter-VsDevShell -VsInstanceId $Global:VS_INST.instanceId -SkipAutomaticLocation -DevCmdArguments $dev_cmd_args

 $cl_prog, $cl_ver = find_prog -name 'msvc' -cmd 'cl' -silent $true -usefv $true
 $1k.println("Using msvc: $cl_prog, version: $cl_ver")
 }
 else {
-throw "Visual Studio not installed!"
+Write-Warning "MSVC not found, your build may not work, required: $cl_ver"
 }
 }

@@ -12,7 +12,7 @@ mkdir -p $cacheDir

 pwsh_ver=$1
 if [ "$pwsh_ver" = "" ] ; then
-pwsh_ver='7.4.5'
+pwsh_ver='7.4.6'
 fi

 pwsh_min_ver=$2
@@ -6,11 +6,12 @@ param(

 if(Test-Path $manifest_file -PathType Leaf) {
 $mirror = if (!(Test-Path (Join-Path $PSScriptRoot '.gitee') -PathType Leaf)) {'github'} else {'gitee'}
-$url_base = @{'github' = 'https://github.com/'; 'gitee' = 'https://gitee.com/' }[$mirror]

 $manifest_map = ConvertFrom-Json (Get-Content $manifest_file -raw)
 $ver = $manifest_map.versions.PSObject.Properties[$name].Value
-$url_path = $manifest_map.mirrors.PSObject.Properties[$mirror].Value.PSObject.Properties[$name].Value
+$mirror_current = $manifest_map.mirrors.PSObject.Properties[$mirror].Value.PSObject.Properties
+$url_base = "https://$($mirror_current['host'].Value)/"
+$url_path = $mirror_current[$name].Value

 Write-Host "$url_base$url_path#$ver" -NoNewline
 }
@@ -6,7 +6,7 @@

 ## astcenc
 - [![Upstream](https://img.shields.io/github/v/release/ARM-software/astc-encoder?label=Upstream)](https://github.com/ARM-software/astc-encoder)
-- Version: 4.8.0
+- Version: 5.1.0
 - License: Apache-2.0

 ## Box2D
@@ -22,7 +22,7 @@

 ## c-ares
 - [![Upstream](https://img.shields.io/github/v/release/c-ares/c-ares?label=Upstream)](https://github.com/c-ares/c-ares)
-- Version: 1.34.2
+- Version: 1.34.3
 - License: MIT

 ## Chipmunk2D
@@ -47,7 +47,7 @@

 ## curl
 - [![Upstream](https://img.shields.io/github/v/release/curl/curl?label=Upstream)](https://github.com/curl/curl)
-- Version: 8.10.1
+- Version: 8.11.0
 - License: Curl (MIT/X)

 ## doctest
@@ -124,7 +124,7 @@

 - luajit
 - Upstream: https://github.com/LuaJIT/LuaJIT
-- Version: 2.1-97813fb
+- Version: 2.1-fe71d0f
 - License: MIT

 - tolua
@@ -778,12 +778,12 @@ void compute_error_squared_rgba(
 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 {
 vmask mask = lane_ids < vint(texel_count);
-vint texel_idxs(texel_indexes + i);
+const uint8_t* texel_idxs = texel_indexes + i;

-vfloat data_r = gatherf(blk.data_r, texel_idxs);
-vfloat data_g = gatherf(blk.data_g, texel_idxs);
-vfloat data_b = gatherf(blk.data_b, texel_idxs);
-vfloat data_a = gatherf(blk.data_a, texel_idxs);
+vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
+vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);
+vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);
+vfloat data_a = gatherf_byte_inds<vfloat>(blk.data_a, texel_idxs);

 vfloat uncor_param = (data_r * l_uncor_bs0)
 + (data_g * l_uncor_bs1)
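This hunk switches the per-channel loads from 32-bit index gathers (`gatherf` over a `vint` of indices) to `gatherf_byte_inds`, which takes the raw `uint8_t` texel indices directly. For reference only, here is a minimal scalar sketch of what the helper computes; the template-on-width shape below is an illustrative assumption, not the library's actual SIMD implementations (the AVX2 specialisation appears later in this diff).

```cpp
#include <cstdint>

// Illustrative scalar model of gatherf_byte_inds: gather N floats from
// "base" at byte-sized indices, one per SIMD lane.
template <int N>
void gatherf_byte_inds_ref(const float* base, const uint8_t* indices, float out[N])
{
    for (int lane = 0; lane < N; lane++)
    {
        // Indices fit in a byte because ASTC blocks have fewer than 256 texels.
        out[lane] = base[indices[lane]];
    }
}
```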
@@ -892,11 +892,11 @@ void compute_error_squared_rgb(
 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 {
 vmask mask = lane_ids < vint(texel_count);
-vint texel_idxs(texel_indexes + i);
+const uint8_t* texel_idxs = texel_indexes + i;

-vfloat data_r = gatherf(blk.data_r, texel_idxs);
-vfloat data_g = gatherf(blk.data_g, texel_idxs);
-vfloat data_b = gatherf(blk.data_b, texel_idxs);
+vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
+vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);
+vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);

 vfloat uncor_param = (data_r * l_uncor_bs0)
 + (data_g * l_uncor_bs1)
@@ -98,19 +98,14 @@ void unpack_weights(
 if (!is_dual_plane)
 {
 // Build full 64-entry weight lookup table
-vint4 tab0 = vint4::load(scb.weights + 0);
-vint4 tab1 = vint4::load(scb.weights + 16);
-vint4 tab2 = vint4::load(scb.weights + 32);
-vint4 tab3 = vint4::load(scb.weights + 48);
-
-vint tab0p, tab1p, tab2p, tab3p;
-vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p);
+vtable_64x8 table;
+vtable_prepare(table, scb.weights);

 for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
 {
 vint summed_value(8);
 vint weight_count(di.texel_weight_count + i);
-int max_weight_count = hmax(weight_count).lane<0>();
+int max_weight_count = hmax_s(weight_count);

 promise(max_weight_count > 0);
 for (int j = 0; j < max_weight_count; j++)
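The reworked vtable API used above names tables by shape (`vtable_16x8`, `vtable_32x8`, `vtable_64x8`), fills them once from a byte array with `vtable_prepare(table, data)`, and queries them with `vtable_lookup_32bit(table, indices)`. As a rough orientation only, a scalar sketch of the 64-entry flavour; the `_ref` names and the flat-array layout are assumptions for illustration and do not reflect the real per-ISA register layout shown later in this diff.

```cpp
#include <cstdint>

// Illustrative scalar model of a 64-entry, 8-bit lookup table that is
// prepared once and then indexed with per-lane 32-bit indices.
struct vtable_64x8_ref { uint8_t entries[64]; };

inline void vtable_prepare_ref(vtable_64x8_ref& table, const uint8_t* data)
{
    for (int i = 0; i < 64; i++)
    {
        table.entries[i] = data[i];
    }
}

inline int vtable_lookup_32bit_ref(const vtable_64x8_ref& table, int idx)
{
    // idx is assumed to be in range [0, 63] for each lane.
    return table.entries[idx];
}
```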
@@ -118,7 +113,7 @@ void unpack_weights(
 vint texel_weights(di.texel_weights_tr[j] + i);
 vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);

-summed_value += vtable_8bt_32bi(tab0p, tab1p, tab2p, tab3p, texel_weights) * texel_weights_int;
+summed_value += vtable_lookup_32bit(table, texel_weights) * texel_weights_int;
 }

 store(lsr<4>(summed_value), weights_plane1 + i);
@@ -128,16 +123,12 @@
 {
 // Build a 32-entry weight lookup table per plane
 // Plane 1
-vint4 tab0_plane1 = vint4::load(scb.weights + 0);
-vint4 tab1_plane1 = vint4::load(scb.weights + 16);
-vint tab0_plane1p, tab1_plane1p;
-vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p);
+vtable_32x8 tab_plane1;
+vtable_prepare(tab_plane1, scb.weights);

 // Plane 2
-vint4 tab0_plane2 = vint4::load(scb.weights + 32);
-vint4 tab1_plane2 = vint4::load(scb.weights + 48);
-vint tab0_plane2p, tab1_plane2p;
-vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p);
+vtable_32x8 tab_plane2;
+vtable_prepare(tab_plane2, scb.weights + 32);

 for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
 {
@@ -145,7 +136,7 @@ void unpack_weights(
 vint sum_plane2(8);

 vint weight_count(di.texel_weight_count + i);
-int max_weight_count = hmax(weight_count).lane<0>();
+int max_weight_count = hmax_s(weight_count);

 promise(max_weight_count > 0);
 for (int j = 0; j < max_weight_count; j++)
@@ -153,8 +144,8 @@ void unpack_weights(
 vint texel_weights(di.texel_weights_tr[j] + i);
 vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);

-sum_plane1 += vtable_8bt_32bi(tab0_plane1p, tab1_plane1p, texel_weights) * texel_weights_int;
-sum_plane2 += vtable_8bt_32bi(tab0_plane2p, tab1_plane2p, texel_weights) * texel_weights_int;
+sum_plane1 += vtable_lookup_32bit(tab_plane1, texel_weights) * texel_weights_int;
+sum_plane2 += vtable_lookup_32bit(tab_plane2, texel_weights) * texel_weights_int;
 }

 store(lsr<4>(sum_plane1), weights_plane1 + i);
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2023 Arm Limited
+// Copyright 2011-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -425,8 +425,8 @@ static unsigned int get_partition_ordering_by_mismatch_bits(
 }

 // Create a running sum from the histogram array
-// Cells store previous values only; i.e. exclude self after sum
-unsigned int sum = 0;
+// Indices store previous values only; i.e. exclude self after sum
+uint16_t sum = 0;
 for (unsigned int i = 0; i < texel_count; i++)
 {
 uint16_t cnt = mscount[i];
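The comment in this hunk describes an exclusive prefix sum: each histogram bin ends up holding the total of all earlier bins, and the accumulator now stays in 16-bit arithmetic. A small stand-alone illustration of that pattern (not the project's code, just the idea the comment states):

```cpp
#include <cstdint>

// Exclusive prefix sum over a small histogram: after the loop, counts[i]
// holds the sum of every bin before i ("exclude self after sum").
inline void exclusive_prefix_sum_ref(uint16_t* counts, unsigned int n)
{
    uint16_t sum = 0;
    for (unsigned int i = 0; i < n; i++)
    {
        uint16_t cnt = counts[i];
        counts[i] = sum;   // store the running total of the previous bins
        sum += cnt;
    }
}
```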
@@ -41,16 +41,16 @@ static vfloat bilinear_infill_vla(
 unsigned int index
 ) {
 // Load the bilinear filter texel weight indexes in the decimated grid
-vint weight_idx0 = vint(di.texel_weights_tr[0] + index);
-vint weight_idx1 = vint(di.texel_weights_tr[1] + index);
-vint weight_idx2 = vint(di.texel_weights_tr[2] + index);
-vint weight_idx3 = vint(di.texel_weights_tr[3] + index);
+const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index;
+const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index;
+const uint8_t* weight_idx2 = di.texel_weights_tr[2] + index;
+const uint8_t* weight_idx3 = di.texel_weights_tr[3] + index;

 // Load the bilinear filter weights from the decimated grid
-vfloat weight_val0 = gatherf(weights, weight_idx0);
-vfloat weight_val1 = gatherf(weights, weight_idx1);
-vfloat weight_val2 = gatherf(weights, weight_idx2);
-vfloat weight_val3 = gatherf(weights, weight_idx3);
+vfloat weight_val0 = gatherf_byte_inds<vfloat>(weights, weight_idx0);
+vfloat weight_val1 = gatherf_byte_inds<vfloat>(weights, weight_idx1);
+vfloat weight_val2 = gatherf_byte_inds<vfloat>(weights, weight_idx2);
+vfloat weight_val3 = gatherf_byte_inds<vfloat>(weights, weight_idx3);

 // Load the weight contribution factors for each decimated weight
 vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
@@ -81,12 +81,12 @@ static vfloat bilinear_infill_vla_2(
 unsigned int index
 ) {
 // Load the bilinear filter texel weight indexes in the decimated grid
-vint weight_idx0 = vint(di.texel_weights_tr[0] + index);
-vint weight_idx1 = vint(di.texel_weights_tr[1] + index);
+const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index;
+const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index;

 // Load the bilinear filter weights from the decimated grid
-vfloat weight_val0 = gatherf(weights, weight_idx0);
-vfloat weight_val1 = gatherf(weights, weight_idx1);
+vfloat weight_val0 = gatherf_byte_inds<vfloat>(weights, weight_idx0);
+vfloat weight_val1 = gatherf_byte_inds<vfloat>(weights, weight_idx1);

 // Load the weight contribution factors for each decimated weight
 vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
@@ -853,12 +853,6 @@ void compute_ideal_weights_for_decimation(
 promise(texel_count > 0);
 promise(weight_count > 0);

-// Ensure that the end of the output arrays that are used for SIMD paths later are filled so we
-// can safely run SIMD elsewhere without a loop tail. Note that this is always safe as weight
-// arrays always contain space for 64 elements
-unsigned int prev_weight_count_simd = round_down_to_simd_multiple_vla(weight_count - 1);
-storea(vfloat::zero(), dec_weight_ideal_value + prev_weight_count_simd);
-
 // If we have a 1:1 mapping just shortcut the computation. Transfer enough to also copy the
 // zero-initialized SIMD over-fetch region
 if (is_direct)
@@ -889,23 +883,23 @@ void compute_ideal_weights_for_decimation(

 // Accumulate error weighting of all the texels using this weight
 vint weight_texel_count(di.weight_texel_count + i);
-unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
+unsigned int max_texel_count = hmax_s(weight_texel_count);
 promise(max_texel_count > 0);

 for (unsigned int j = 0; j < max_texel_count; j++)
 {
-vint texel(di.weight_texels_tr[j] + i);
+const uint8_t* texel = di.weight_texels_tr[j] + i;
 vfloat weight = loada(di.weights_texel_contribs_tr[j] + i);

 if (!constant_wes)
 {
-weight_error_scale = gatherf(ei.weight_error_scale, texel);
+weight_error_scale = gatherf_byte_inds<vfloat>(ei.weight_error_scale, texel);
 }

 vfloat contrib_weight = weight * weight_error_scale;

 weight_weight += contrib_weight;
-initial_weight += gatherf(ei.weights, texel) * contrib_weight;
+initial_weight += gatherf_byte_inds<vfloat>(ei.weights, texel) * contrib_weight;
 }

 storea(initial_weight / weight_weight, dec_weight_ideal_value + i);
@@ -947,22 +941,22 @@ void compute_ideal_weights_for_decimation(

 // Accumulate error weighting of all the texels using this weight
 vint weight_texel_count(di.weight_texel_count + i);
-unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
+unsigned int max_texel_count = hmax_s(weight_texel_count);
 promise(max_texel_count > 0);

 for (unsigned int j = 0; j < max_texel_count; j++)
 {
-vint texel(di.weight_texels_tr[j] + i);
+const uint8_t* texel = di.weight_texels_tr[j] + i;
 vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i);

 if (!constant_wes)
 {
-weight_error_scale = gatherf(ei.weight_error_scale, texel);
+weight_error_scale = gatherf_byte_inds<vfloat>(ei.weight_error_scale, texel);
 }

 vfloat scale = weight_error_scale * contrib_weight;
-vfloat old_weight = gatherf(infilled_weights, texel);
-vfloat ideal_weight = gatherf(ei.weights, texel);
+vfloat old_weight = gatherf_byte_inds<vfloat>(infilled_weights, texel);
+vfloat ideal_weight = gatherf_byte_inds<vfloat>(ei.weights, texel);

 error_change0 += contrib_weight * scale;
 error_change1 += (old_weight - ideal_weight) * scale;
|
||||||
// safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements
|
// safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements
|
||||||
if (get_quant_level(quant_level) <= 16)
|
if (get_quant_level(quant_level) <= 16)
|
||||||
{
|
{
|
||||||
vint4 tab0 = vint4::load(qat.quant_to_unquant);
|
vtable_16x8 table;
|
||||||
vint tab0p;
|
vtable_prepare(table, qat.quant_to_unquant);
|
||||||
vtable_prepare(tab0, tab0p);
|
|
||||||
|
|
||||||
for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
|
for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
|
||||||
{
|
{
|
||||||
|
@@ -1038,8 +1031,8 @@ void compute_quantized_weights_for_decimation(
 vint weightl = float_to_int(ix1);
 vint weighth = min(weightl + vint(1), steps_m1);

-vint ixli = vtable_8bt_32bi(tab0p, weightl);
-vint ixhi = vtable_8bt_32bi(tab0p, weighth);
+vint ixli = vtable_lookup_32bit(table, weightl);
+vint ixhi = vtable_lookup_32bit(table, weighth);

 vfloat ixl = int_to_float(ixli);
 vfloat ixh = int_to_float(ixhi);
@@ -1050,16 +1043,13 @@ void compute_quantized_weights_for_decimation(

 // Invert the weight-scaling that was done initially
 storea(ixl * rscalev + low_boundv, weight_set_out + i);
-vint scn = pack_low_bytes(weight);
-store_nbytes(scn, quantized_weight_set + i);
+pack_and_store_low_bytes(weight, quantized_weight_set + i);
 }
 }
 else
 {
-vint4 tab0 = vint4::load(qat.quant_to_unquant + 0);
-vint4 tab1 = vint4::load(qat.quant_to_unquant + 16);
-vint tab0p, tab1p;
-vtable_prepare(tab0, tab1, tab0p, tab1p);
+vtable_32x8 table;
+vtable_prepare(table, qat.quant_to_unquant);

 for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
 {
@@ -1072,8 +1062,8 @@ void compute_quantized_weights_for_decimation(
 vint weightl = float_to_int(ix1);
 vint weighth = min(weightl + vint(1), steps_m1);

-vint ixli = vtable_8bt_32bi(tab0p, tab1p, weightl);
-vint ixhi = vtable_8bt_32bi(tab0p, tab1p, weighth);
+vint ixli = vtable_lookup_32bit(table, weightl);
+vint ixhi = vtable_lookup_32bit(table, weighth);

 vfloat ixl = int_to_float(ixli);
 vfloat ixh = int_to_float(ixhi);
@@ -1084,8 +1074,7 @@ void compute_quantized_weights_for_decimation(

 // Invert the weight-scaling that was done initially
 storea(ixl * rscalev + low_boundv, weight_set_out + i);
-vint scn = pack_low_bytes(weight);
-store_nbytes(scn, quantized_weight_set + i);
+pack_and_store_low_bytes(weight, quantized_weight_set + i);
 }
 }
 }
@@ -58,8 +58,10 @@
 #ifndef ASTCENC_AVX
 #if defined(__AVX2__)
 #define ASTCENC_AVX 2
+#define ASTCENC_X86_GATHERS 1
 #elif defined(__AVX__)
 #define ASTCENC_AVX 1
+#define ASTCENC_X86_GATHERS 1
 #else
 #define ASTCENC_AVX 0
 #endif
@@ -73,10 +75,25 @@
 #endif
 #endif

+#ifndef ASTCENC_SVE
+#if defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
+#define ASTCENC_SVE 8
+// Auto-detected SVE can only assume vector width of 4 is available, but
+// must also allow for hardware being longer and so all use of intrinsics
+// must explicitly use predicate masks to limit to 4-wide.
+#else
+#define ASTCENC_SVE 4
+#endif
+#else
+#define ASTCENC_SVE 0
+#endif
+#endif
+
 // Force vector-sized SIMD alignment
-#if ASTCENC_AVX
+#if ASTCENC_AVX || ASTCENC_SVE == 8
 #define ASTCENC_VECALIGN 32
-#elif ASTCENC_SSE || ASTCENC_NEON
+#elif ASTCENC_SSE || ASTCENC_NEON || ASTCENC_SVE == 4
 #define ASTCENC_VECALIGN 16
 // Use default alignment for non-SIMD builds
 #else
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2022 Arm Limited
+// Copyright 2011-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -123,21 +123,21 @@ static void compute_error_squared_rgb_single_partition(
 vint lane_ids = vint::lane_id();
 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 {
-vint tix(texel_indexes + i);
+const uint8_t* tix = texel_indexes + i;

 vmask mask = lane_ids < vint(texel_count);
 lane_ids += vint(ASTCENC_SIMD_WIDTH);

 // Compute the error that arises from just ditching alpha
-vfloat data_a = gatherf(blk.data_a, tix);
+vfloat data_a = gatherf_byte_inds<vfloat>(blk.data_a, tix);
 vfloat alpha_diff = data_a - default_a;
 alpha_diff = alpha_diff * alpha_diff;

 haccumulate(a_drop_errv, alpha_diff, mask);

-vfloat data_r = gatherf(blk.data_r, tix);
-vfloat data_g = gatherf(blk.data_g, tix);
-vfloat data_b = gatherf(blk.data_b, tix);
+vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, tix);
+vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, tix);
+vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, tix);

 // Compute uncorrelated error
 vfloat param = data_r * uncor_bs0
@@ -1306,8 +1306,8 @@ unsigned int compute_ideal_endpoint_formats(
 // Pick best mode from the SIMD result, using lowest matching index to ensure invariance
 vmask lanes_min_error = vbest_ep_error == hmin(vbest_ep_error);
 vbest_error_index = select(vint(0x7FFFFFFF), vbest_error_index, lanes_min_error);
-vbest_error_index = hmin(vbest_error_index);
-int best_error_index = vbest_error_index.lane<0>();
+int best_error_index = hmin_s(vbest_error_index);

 best_error_weights[i] = best_error_index;

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2019-2022 Arm Limited
+// Copyright 2019-2024 Arm Limited
 // Copyright 2008 Jose Fonseca
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
@@ -42,11 +42,12 @@
 *
 * With the current implementation ISA support is provided for:
 *
-* * 1-wide for scalar reference.
-* * 4-wide for Armv8-A NEON.
-* * 4-wide for x86-64 SSE2.
-* * 4-wide for x86-64 SSE4.1.
-* * 8-wide for x86-64 AVX2.
+* * 1-wide for scalar reference
+* * 4-wide for Armv8-A NEON
+* * 4-wide for x86-64 SSE2
+* * 4-wide for x86-64 SSE4.1
+* * 8-wide for Armv8-A SVE
+* * 8-wide for x86-64 AVX2
 */

 #ifndef ASTC_VECMATHLIB_H_INCLUDED
@@ -54,7 +55,14 @@

 #if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
 #include <immintrin.h>
-#elif ASTCENC_NEON != 0
+#endif
+
+#if ASTCENC_SVE != 0
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+#endif
+
+#if ASTCENC_NEON != 0
 #include <arm_neon.h>
 #endif

@@ -69,8 +77,10 @@
 #define ASTCENC_NO_INLINE __attribute__ ((noinline))
 #endif

+template<typename T> T gatherf_byte_inds(const float* base, const uint8_t* indices);
+
 #if ASTCENC_AVX >= 2
-/* If we have AVX2 expose 8-wide VLA. */
+// If we have AVX2 expose 8-wide VLA.
 #include "astcenc_vecmathlib_sse_4.h"
 #include "astcenc_vecmathlib_common_4.h"
 #include "astcenc_vecmathlib_avx2_8.h"
@@ -88,11 +98,15 @@
 using vint = vint8;
 using vmask = vmask8;

+using vtable_16x8 = vtable8_16x8;
+using vtable_32x8 = vtable8_32x8;
+using vtable_64x8 = vtable8_64x8;
+
 constexpr auto loada = vfloat8::loada;
 constexpr auto load1 = vfloat8::load1;

 #elif ASTCENC_SSE >= 20
-/* If we have SSE expose 4-wide VLA, and 4-wide fixed width. */
+// If we have SSE expose 4-wide VLA, and 4-wide fixed width.
 #include "astcenc_vecmathlib_sse_4.h"
 #include "astcenc_vecmathlib_common_4.h"

@@ -103,11 +117,46 @@
 using vint = vint4;
 using vmask = vmask4;

+using vtable_16x8 = vtable4_16x8;
+using vtable_32x8 = vtable4_32x8;
+using vtable_64x8 = vtable4_64x8;
+
 constexpr auto loada = vfloat4::loada;
 constexpr auto load1 = vfloat4::load1;

+#elif ASTCENC_SVE == 8
+// Check the compiler is configured with fixed-length 256-bit SVE.
+#if !defined(__ARM_FEATURE_SVE_BITS) || (__ARM_FEATURE_SVE_BITS != 256)
+#error "__ARM_FEATURE_SVE_BITS is not set to 256 bits"
+#endif
+
+// If we have SVE configured as 8-wide, expose 8-wide VLA.
+#include "astcenc_vecmathlib_neon_4.h"
+#include "astcenc_vecmathlib_common_4.h"
+#include "astcenc_vecmathlib_sve_8.h"
+
+#define ASTCENC_SIMD_WIDTH 8
+
+using vfloat = vfloat8;
+
+#if defined(ASTCENC_NO_INVARIANCE)
+using vfloatacc = vfloat8;
+#else
+using vfloatacc = vfloat4;
+#endif
+
+using vint = vint8;
+using vmask = vmask8;
+
+using vtable_16x8 = vtable8_16x8;
+using vtable_32x8 = vtable8_32x8;
+using vtable_64x8 = vtable8_64x8;
+
+constexpr auto loada = vfloat8::loada;
+constexpr auto load1 = vfloat8::load1;
+
 #elif ASTCENC_NEON > 0
-/* If we have NEON expose 4-wide VLA. */
+// If we have NEON expose 4-wide VLA.
 #include "astcenc_vecmathlib_neon_4.h"
 #include "astcenc_vecmathlib_common_4.h"

@@ -118,6 +167,10 @@
 using vint = vint4;
 using vmask = vmask4;

+using vtable_16x8 = vtable4_16x8;
+using vtable_32x8 = vtable4_32x8;
+using vtable_64x8 = vtable4_64x8;
+
 constexpr auto loada = vfloat4::loada;
 constexpr auto load1 = vfloat4::load1;

@@ -150,6 +203,10 @@
 using vint = vint4;
 using vmask = vmask4;

+using vtable_16x8 = vtable4_16x8;
+using vtable_32x8 = vtable4_32x8;
+using vtable_64x8 = vtable4_64x8;
+
 constexpr auto loada = vfloat4::loada;
 constexpr auto load1 = vfloat4::load1;
 #endif
@@ -239,8 +296,8 @@ ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
 ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
 {
 vfloat z = atan(abs(y / x));
-vmask xmask = vmask(float_as_int(x).m);
-return change_sign(select_msb(z, vfloat(astc::PI) - z, xmask), y);
+vmask xmask = x < vfloat::zero();
+return change_sign(select(z, vfloat(astc::PI) - z, xmask), y);
 }

 /*
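The atan2 change replaces the MSB/sign-bit mask with an explicit `x < 0` comparison, and the removed `select_msb` with the ordinary `select`. The underlying quadrant correction is the standard identity; a scalar illustration (not the library's approximate vector code) of what the mask is selecting:

```cpp
#include <cmath>

// Scalar sketch of the quadrant fix-up the vector atan2 applies:
// compute atan(|y/x|), mirror to pi - z when x is negative, then copy y's sign.
inline float atan2_ref(float y, float x)
{
    float z = std::atan(std::fabs(y / x));
    if (x < 0.0f)
    {
        z = 3.14159265358979323846f - z;
    }
    return std::copysign(z, y);
}
```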
@@ -54,7 +54,7 @@ struct vfloat8
 ASTCENC_SIMD_INLINE vfloat8() = default;

 /**
-* @brief Construct from 4 values loaded from an unaligned address.
+* @brief Construct from 8 values loaded from an unaligned address.
 *
 * Consider using loada() which is better with vectors if data is aligned
 * to vector length.
@@ -74,18 +74,6 @@ struct vfloat8
 m = _mm256_set1_ps(a);
 }

-/**
-* @brief Construct from 8 scalar values.
-*
-* The value of @c a is stored to lane 0 (LSB) in the SIMD register.
-*/
-ASTCENC_SIMD_INLINE explicit vfloat8(
-float a, float b, float c, float d,
-float e, float f, float g, float h)
-{
-m = _mm256_set_ps(h, g, f, e, d, c, b, a);
-}
-
 /**
 * @brief Construct from an existing SIMD register.
 */
@@ -94,20 +82,6 @@ struct vfloat8
 m = a;
 }

-/**
-* @brief Get the scalar value of a single lane.
-*/
-template <int l> ASTCENC_SIMD_INLINE float lane() const
-{
-#if !defined(__clang__) && defined(_MSC_VER)
-return m.m256_f32[l];
-#else
-union { __m256 m; float f[8]; } cvt;
-cvt.m = m;
-return cvt.f[l];
-#endif
-}
-
 /**
 * @brief Factory that returns a vector of zeros.
 */
@@ -132,14 +106,6 @@ struct vfloat8
 return vfloat8(_mm256_load_ps(p));
 }

-/**
-* @brief Factory that returns a vector containing the lane IDs.
-*/
-static ASTCENC_SIMD_INLINE vfloat8 lane_id()
-{
-return vfloat8(_mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0));
-}
-
 /**
 * @brief The vector ...
 */
@@ -183,25 +149,13 @@ struct vint8
 /**
 * @brief Construct from 1 scalar value replicated across all lanes.
 *
-* Consider using vfloat4::zero() for constexpr zeros.
+* Consider using zero() for constexpr zeros.
 */
 ASTCENC_SIMD_INLINE explicit vint8(int a)
 {
 m = _mm256_set1_epi32(a);
 }

-/**
-* @brief Construct from 8 scalar values.
-*
-* The value of @c a is stored to lane 0 (LSB) in the SIMD register.
-*/
-ASTCENC_SIMD_INLINE explicit vint8(
-int a, int b, int c, int d,
-int e, int f, int g, int h)
-{
-m = _mm256_set_epi32(h, g, f, e, d, c, b, a);
-}
-
 /**
 * @brief Construct from an existing SIMD register.
 */
@@ -210,20 +164,6 @@ struct vint8
 m = a;
 }

-/**
-* @brief Get the scalar from a single lane.
-*/
-template <int l> ASTCENC_SIMD_INLINE int lane() const
-{
-#if !defined(__clang__) && defined(_MSC_VER)
-return m.m256i_i32[l];
-#else
-union { __m256i m; int f[8]; } cvt;
-cvt.m = m;
-return cvt.f[l];
-#endif
-}
-
 /**
 * @brief Factory that returns a vector of zeros.
 */
@@ -518,31 +458,45 @@ ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b)
 */
 ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
 {
-__m128i m = _mm_min_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
-m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
-m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
-m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));
+// Build min within groups of 2, then 4, then 8
+__m256i m = _mm256_min_epi32(a.m, _mm256_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1)));
+m = _mm256_min_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2)));
+m = _mm256_min_epi32(m, _mm256_permute2x128_si256(m, m, 0x01));

-__m256i r = astcenc_mm256_set_m128i(m, m);
-vint8 vmin(r);
+vint8 vmin(m);
 return vmin;
 }

+/**
+* @brief Return the horizontal minimum of a vector.
+*/
+ASTCENC_SIMD_INLINE int hmin_s(vint8 a)
+{
+return _mm256_cvtsi256_si32(hmin(a).m);
+}
+
 /**
 * @brief Return the horizontal maximum of a vector.
 */
 ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
 {
-__m128i m = _mm_max_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
-m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
-m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
-m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));
+// Build max within groups of 2, then 4, then 8
+__m256i m = _mm256_max_epi32(a.m, _mm256_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1)));
+m = _mm256_max_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2)));
+m = _mm256_max_epi32(m, _mm256_permute2x128_si256(m, m, 0x01));

-__m256i r = astcenc_mm256_set_m128i(m, m);
-vint8 vmax(r);
+vint8 vmax(m);
 return vmax;
 }

+/**
+* @brief Return the horizontal maximum of a vector.
+*/
+ASTCENC_SIMD_INLINE int hmax_s(vint8 a)
+{
+return _mm256_cvtsi256_si32(hmax(a).m);
+}
+
 /**
 * @brief Store a vector to a 16B aligned memory address.
 */
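The new `hmin_s` / `hmax_s` helpers return the horizontal reduction directly as a scalar, which is what callers such as `unpack_weights` now use instead of `hmin(...)`/`hmax(...)` followed by `.lane<0>()`. For orientation, a scalar-equivalent sketch of what these helpers compute over an 8-lane integer vector (illustrative only, not the intrinsics-based code above):

```cpp
#include <algorithm>
#include <cstdint>

// Scalar model of hmin_s / hmax_s: reduce all eight lanes to a single int.
inline int hmin_s_ref(const int32_t lanes[8])
{
    return *std::min_element(lanes, lanes + 8);
}

inline int hmax_s_ref(const int32_t lanes[8])
{
    return *std::max_element(lanes, lanes + 8);
}
```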
@@ -570,18 +524,10 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
 _mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0));
 }

-/**
-* @brief Gather N (vector width) indices from the array.
-*/
-ASTCENC_SIMD_INLINE vint8 gatheri(const int* base, vint8 indices)
-{
-return vint8(_mm256_i32gather_epi32(base, indices.m, 4));
-}
-
 /**
 * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
 */
-ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v)
+ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint8 v, uint8_t* p)
 {
 __m256i shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 28, 24, 20, 16,
@@ -593,7 +539,8 @@ ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v)
 __m128i b = _mm_unpacklo_epi32(a0, a1);

 __m256i r = astcenc_mm256_set_m128i(b, b);
-return vint8(r);
+
+store_nbytes(vint8(r), p);
 }

 /**
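These two hunks fold the old `pack_low_bytes` + `store_nbytes` pair into a single `pack_and_store_low_bytes` that writes the packed bytes straight to memory. A scalar sketch of the semantics, for reference only (the real function uses the shuffle shown above):

```cpp
#include <cstdint>

// Scalar model of pack_and_store_low_bytes: take the low 8 bits of each
// 32-bit lane and store them as consecutive bytes at p.
inline void pack_and_store_low_bytes_ref(const int32_t lanes[8], uint8_t* p)
{
    for (int i = 0; i < 8; i++)
    {
        p[i] = static_cast<uint8_t>(lanes[i] & 0xFF);
    }
}
```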
@@ -606,7 +553,7 @@ ASTCENC_SIMD_INLINE vint8 select(vint8 a, vint8 b, vmask8 cond)
 }

 // ============================================================================
-// vfloat4 operators and functions
+// vfloat8 operators and functions
 // ============================================================================

 /**
@@ -674,7 +621,6 @@ ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, float b)
 return vfloat8(_mm256_div_ps(a.m, _mm256_set1_ps(b)));
 }

-
 /**
 * @brief Overload: scalar by vector division.
 */
@@ -683,7 +629,6 @@ ASTCENC_SIMD_INLINE vfloat8 operator/(float a, vfloat8 b)
 return vfloat8(_mm256_div_ps(_mm256_set1_ps(a), b.m));
 }

-
 /**
 * @brief Overload: vector by vector equality.
 */
@@ -786,19 +731,6 @@ ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a)
 return a;
 }

-/**
-* @brief Return a clamped value between 0.0f and max.
-*
-* It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
-* be returned for that lane.
-*/
-ASTCENC_SIMD_INLINE vfloat8 clampz(float max, vfloat8 a)
-{
-a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
-a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
-return a;
-}
-
 /**
 * @brief Return a clamped value between 0.0f and 1.0f.
 *
@@ -857,7 +789,7 @@ ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
 */
 ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
 {
-return hmin(a).lane<0>();
+return _mm256_cvtss_f32(hmin(a).m);
 }

 /**
@@ -887,7 +819,7 @@ ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
 */
 ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
 {
-return hmax(a).lane<0>();
+return _mm256_cvtss_f32(hmax(a).m);
 }

 /**
@@ -909,14 +841,6 @@ ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond)
 return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
 }

-/**
-* @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
-*/
-ASTCENC_SIMD_INLINE vfloat8 select_msb(vfloat8 a, vfloat8 b, vmask8 cond)
-{
-return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
-}
-
 /**
 * @brief Accumulate lane-wise sums for a vector, folded 4-wide.
 *
@@ -979,6 +903,33 @@ ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices)
 return vfloat8(_mm256_i32gather_ps(base, indices.m, 4));
 }

+/**
+* @brief Load a vector of gathered results from an array using byte indices from memory
+*/
+template<>
+ASTCENC_SIMD_INLINE vfloat8 gatherf_byte_inds<vfloat8>(const float* base, const uint8_t* indices)
+{
+#if ASTCENC_X86_GATHERS == 0
+// Perform manual gather using scalar loads in two separate dependency chains,
+// then merge late. MSVC translates this 1:1, which is OK. Clang turns it
+// into a bunch of memory-operand inserts on 128-bit halves then merges late,
+// which performs significantly worse in tests.
+__m256 m0 = _mm256_broadcast_ss(base + indices[0]);
+__m256 m1 = _mm256_broadcast_ss(base + indices[1]);
+m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[2]), 1 << 2);
+m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[3]), 1 << 3);
+m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[4]), 1 << 4);
+m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[5]), 1 << 5);
+m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[6]), 1 << 6);
+m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[7]), 1 << 7);
+
+return vfloat8(_mm256_blend_ps(m0, m1, 0xaa));
+#else
+vint8 inds(indices);
+return gatherf(base, inds);
+#endif
+}
+
 /**
 * @brief Store a vector to an unaligned memory address.
 */
@@ -1045,98 +996,140 @@ ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
 return vfloat8(_mm256_castsi256_ps(a.m));
 }

-/**
-* @brief Prepare a vtable lookup table for use with the native SIMD size.
-*/
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p)
-{
-// AVX2 duplicates the table within each 128-bit lane
-__m128i t0n = t0.m;
-t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));
-}
+/*
+* Table structure for a 16x 8-bit entry table.
+*/
+struct vtable8_16x8 {
+vint8 t0;
+};

-/**
-* @brief Prepare a vtable lookup table for use with the native SIMD size.
-*/
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p)
-{
-// AVX2 duplicates the table within each 128-bit lane
-__m128i t0n = t0.m;
-t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));
+/*
+* Table structure for a 32x 8-bit entry table.
+*/
+struct vtable8_32x8 {
+vint8 t0;
+vint8 t1;
+};

-__m128i t1n = _mm_xor_si128(t0.m, t1.m);
-t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n));
-}
+/*
+* Table structure for a 64x 8-bit entry table.
+*/
+struct vtable8_64x8 {
+vint8 t0;
+vint8 t1;
+vint8 t2;
+vint8 t3;
+};

 /**
-* @brief Prepare a vtable lookup table for use with the native SIMD size.
+* @brief Prepare a vtable lookup table for 16x 8-bit entry table.
 */
 ASTCENC_SIMD_INLINE void vtable_prepare(
-vint4 t0, vint4 t1, vint4 t2, vint4 t3,
-vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p)
-{
-// AVX2 duplicates the table within each 128-bit lane
-__m128i t0n = t0.m;
-t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));
+vtable8_16x8& table,
+const uint8_t* data
+) {
+// AVX2 tables duplicate table entries in each 128-bit half-register
+vint4 d0 = vint4::load(data);

-__m128i t1n = _mm_xor_si128(t0.m, t1.m);
-t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n));
-
-__m128i t2n = _mm_xor_si128(t1.m, t2.m);
-t2p = vint8(astcenc_mm256_set_m128i(t2n, t2n));
-
-__m128i t3n = _mm_xor_si128(t2.m, t3.m);
-t3p = vint8(astcenc_mm256_set_m128i(t3n, t3n));
+table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m));
 }

 /**
-* @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
+* @brief Prepare a vtable lookup table for 32x 8-bit entry table.
 */
-ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx)
-{
+ASTCENC_SIMD_INLINE void vtable_prepare(
+vtable8_32x8& table,
+const uint8_t* data
+) {
+// AVX2 tables duplicate table entries in each 128-bit half-register
+vint4 d0 = vint4::load(data);
+vint4 d1 = vint4::load(data + 16);
+
+table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m));
+table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m));
+
+// XOR chain the high rows to allow table emulation
+table.t1 = table.t1 ^ table.t0;
+}
+
+/**
+* @brief Prepare a vtable lookup table 64x 8-bit entry table.
+*/
+ASTCENC_SIMD_INLINE void vtable_prepare(
+vtable8_64x8& table,
+const uint8_t* data
+) {
+// AVX2 tables duplicate table entries in each 128-bit half-register
+vint4 d0 = vint4::load(data);
+vint4 d1 = vint4::load(data + 16);
+vint4 d2 = vint4::load(data + 32);
+vint4 d3 = vint4::load(data + 48);
+
+table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m));
+table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m));
+table.t2 = vint8(astcenc_mm256_set_m128i(d2.m, d2.m));
+table.t3 = vint8(astcenc_mm256_set_m128i(d3.m, d3.m));
+
+// XOR chain the high rows to allow table emulation
+table.t3 = table.t3 ^ table.t2;
+table.t2 = table.t2 ^ table.t1;
+table.t1 = table.t1 ^ table.t0;
+}
+
+/**
+* @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
+*/
+ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
+const vtable8_16x8& tbl,
+vint8 idx
+) {
 // Set index byte MSB to 1 for unused bytes so shuffle returns zero
 __m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

-__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
+__m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
 return vint8(result);
 }

 /**
-* @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
+* @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
 */
-ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx)
-{
+ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
+const vtable8_32x8& tbl,
+vint8 idx
+) {
 // Set index byte MSB to 1 for unused bytes so shuffle returns zero
 __m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

-__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
+__m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
 idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

-__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
+__m256i result2 = _mm256_shuffle_epi8(tbl.t1.m, idxx);
 result = _mm256_xor_si256(result, result2);
 return vint8(result);
 }

 /**
-* @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
+* @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
 */
-ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx)
-{
+ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
+const vtable8_64x8& tbl,
+vint8 idx
+) {
 // Set index byte MSB to 1 for unused bytes so shuffle returns zero
|
// Set index byte MSB to 1 for unused bytes so shuffle returns zero
|
||||||
__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));
|
__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));
|
||||||
|
|
||||||
__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
|
__m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
|
||||||
idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
|
idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
|
||||||
|
|
||||||
__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
|
__m256i result2 = _mm256_shuffle_epi8(tbl.t1.m, idxx);
|
||||||
result = _mm256_xor_si256(result, result2);
|
result = _mm256_xor_si256(result, result2);
|
||||||
idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
|
idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
|
||||||
|
|
||||||
result2 = _mm256_shuffle_epi8(t2.m, idxx);
|
result2 = _mm256_shuffle_epi8(tbl.t2.m, idxx);
|
||||||
result = _mm256_xor_si256(result, result2);
|
result = _mm256_xor_si256(result, result2);
|
||||||
idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
|
idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
|
||||||
|
|
||||||
result2 = _mm256_shuffle_epi8(t3.m, idxx);
|
result2 = _mm256_shuffle_epi8(tbl.t3.m, idxx);
|
||||||
result = _mm256_xor_si256(result, result2);
|
result = _mm256_xor_si256(result, result2);
|
||||||
|
|
||||||
return vint8(result);
|
return vint8(result);
|
||||||
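The XOR chaining in the prepare functions is what lets a single-register byte shuffle emulate a wider table: pshufb-style lookups wrap indices modulo 16 instead of returning zero for indices in [16, 64), so each 16-byte row is pre-XORed with the row below it, and the per-row lookup results are XORed back together so the unwanted contributions cancel. A minimal scalar model of the idea follows (plain standalone C++, not part of the diff; names and values are illustrative):

#include <array>
#include <cassert>
#include <cstdint>

// Scalar model of a 16-entry shuffle lookup: in-range indices wrap modulo 16,
// "expired" (negative after the per-row subtract) indices return zero.
static uint8_t chunk_lookup(const std::array<uint8_t, 16>& chunk, int idx)
{
	return (idx < 0) ? 0 : chunk[idx & 15];
}

int main()
{
	std::array<uint8_t, 64> table;
	for (int i = 0; i < 64; i++)
	{
		table[i] = static_cast<uint8_t>(i * 3 + 1);
	}

	// "Prepare": split into 16-byte rows, XOR each row with the row below it
	std::array<std::array<uint8_t, 16>, 4> rows;
	for (int r = 0; r < 4; r++)
	{
		for (int i = 0; i < 16; i++)
		{
			uint8_t value = table[r * 16 + i];
			rows[r][i] = (r == 0) ? value
			                      : static_cast<uint8_t>(value ^ table[(r - 1) * 16 + i]);
		}
	}

	// "Lookup": XOR the per-row lookups; rows before the target telescope and
	// cancel, rows after the target contribute zero
	for (int idx = 0; idx < 64; idx++)
	{
		uint8_t result = 0;
		for (int r = 0; r < 4; r++)
		{
			result ^= chunk_lookup(rows[r], idx - r * 16);
		}
		assert(result == table[idx]);
	}
	return 0;
}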
@@ -1146,7 +1139,7 @@ ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3
  * @brief Return a vector of interleaved RGBA data.
  *
  * Input vectors have the value stored in the bottom 8 bits of each lane,
  * with high bits set to zero.
  *
  * Output vector stores a single RGBA texel packed in each lane.
  */
@@ -32,26 +32,6 @@
 
 #include <cstdio>
 
-// ============================================================================
-// vmask4 operators and functions
-// ============================================================================
-
-/**
- * @brief True if any lanes are enabled, false otherwise.
- */
-ASTCENC_SIMD_INLINE bool any(vmask4 a)
-{
-	return mask(a) != 0;
-}
-
-/**
- * @brief True if all lanes are enabled, false otherwise.
- */
-ASTCENC_SIMD_INLINE bool all(vmask4 a)
-{
-	return mask(a) == 0xF;
-}
-
 // ============================================================================
 // vint4 operators and functions
 // ============================================================================
@@ -129,6 +109,22 @@ ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
 	return a.lane<0>() + a.lane<1>() + a.lane<2>();
 }
 
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hmin_s(vint4 a)
+{
+	return hmin(a).lane<0>();
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hmax_s(vint4 a)
+{
+	return hmax(a).lane<0>();
+}
+
 // ============================================================================
 // vfloat4 operators and functions
 // ============================================================================
@@ -222,18 +218,6 @@ ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
 	return min(max(a, minv), maxv);
 }
 
-/**
- * @brief Return the clamped value between 0.0f and max.
- *
- * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
- * be returned for that lane.
- */
-ASTCENC_SIMD_INLINE vfloat4 clampz(float maxv, vfloat4 a)
-{
-	// Do not reorder - second operand will return if either is NaN
-	return min(max(a, vfloat4::zero()), maxv);
-}
-
 /**
  * @brief Return the clamped value between 0.0f and 1.0f.
  *
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2019-2023 Arm Limited
+// Copyright 2019-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -115,7 +115,7 @@ struct vfloat4
 	 */
 	static ASTCENC_SIMD_INLINE vfloat4 zero()
 	{
-		return vfloat4(vdupq_n_f32(0.0f));
+		return vfloat4(0.0f);
 	}
 
 	/**
@@ -134,15 +134,6 @@ struct vfloat4
 		return vfloat4(vld1q_f32(p));
 	}
 
-	/**
-	 * @brief Factory that returns a vector containing the lane IDs.
-	 */
-	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
-	{
-		alignas(16) float data[4] { 0.0f, 1.0f, 2.0f, 3.0f };
-		return vfloat4(vld1q_f32(data));
-	}
-
 	/**
 	 * @brief Return a swizzled float 2.
 	 */
@@ -203,16 +194,21 @@ struct vint4
 	 */
 	ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
 	{
-		// Cast is safe - NEON loads are allowed to be unaligned
-		uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
-		uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8)));
-		m = vreinterpretq_s32_u32(vmovl_u16(t16));
+#if ASTCENC_SVE == 0
+		// Cast is safe - NEON loads are allowed to be unaligned
+		uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
+		uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8)));
+		m = vreinterpretq_s32_u32(vmovl_u16(t16));
+#else
+		svint32_t data = svld1ub_s32(svptrue_pat_b32(SV_VL4), p);
+		m = svget_neonq(data);
+#endif
 	}
 
 	/**
 	 * @brief Construct from 1 scalar value replicated across all lanes.
 	 *
-	 * Consider using vfloat4::zero() for constexpr zeros.
+	 * Consider using zero() for constexpr zeros.
 	 */
 	ASTCENC_SIMD_INLINE explicit vint4(int a)
 	{
@@ -420,6 +416,22 @@ ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
 	return vaddvq_u32(vshlq_u32(tmp, shift));
 }
 
+/**
+ * @brief True if any lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool any(vmask4 a)
+{
+	return vmaxvq_u32(a.m) != 0;
+}
+
+/**
+ * @brief True if all lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool all(vmask4 a)
+{
+	return vminvq_u32(a.m) != 0;
+}
+
 // ============================================================================
 // vint4 operators and functions
 // ============================================================================
@@ -612,31 +624,17 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
 }
 
 /**
- * @brief Gather N (vector width) indices from the array.
+ * @brief Pack and store low 8 bits of each vector lane.
  */
-ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
-{
-	alignas(16) int idx[4];
-	storea(indices, idx);
-	alignas(16) int vals[4];
-	vals[0] = base[idx[0]];
-	vals[1] = base[idx[1]];
-	vals[2] = base[idx[2]];
-	vals[3] = base[idx[3]];
-	return vint4(vals);
-}
-
-/**
- * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
- */
-ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
+ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* data)
 {
 	alignas(16) uint8_t shuf[16] {
 		0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 	};
 	uint8x16_t idx = vld1q_u8(shuf);
 	int8x16_t av = vreinterpretq_s8_s32(a.m);
-	return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx)));
+	a = vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx)));
+	store_nbytes(a, data);
 }
 
 /**
@@ -814,21 +812,12 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
 	return vfloat4(vbslq_f32(cond.m, b.m, a.m));
 }
 
-/**
- * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
- */
-ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
-{
-	static const uint32x4_t msb = vdupq_n_u32(0x80000000u);
-	uint32x4_t mask = vcgeq_u32(cond.m, msb);
-	return vfloat4(vbslq_f32(mask, b.m, a.m));
-}
-
 /**
  * @brief Load a vector of gathered results from an array;
  */
 ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
 {
+#if ASTCENC_SVE == 0
 	alignas(16) int idx[4];
 	storea(indices, idx);
 	alignas(16) float vals[4];
@@ -837,8 +826,32 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
 	vals[2] = base[idx[2]];
 	vals[3] = base[idx[3]];
 	return vfloat4(vals);
+#else
+	svint32_t offsets = svset_neonq_s32(svundef_s32(), indices.m);
+	svfloat32_t data = svld1_gather_s32index_f32(svptrue_pat_b32(SV_VL4), base, offsets);
+	return vfloat4(svget_neonq_f32(data));
+#endif
 }
 
+/**
+ * @brief Load a vector of gathered results from an array using byte indices from memory
+ */
+template<>
+ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
+{
+#if ASTCENC_SVE == 0
+	alignas(16) float vals[4];
+	vals[0] = base[indices[0]];
+	vals[1] = base[indices[1]];
+	vals[2] = base[indices[2]];
+	vals[3] = base[indices[3]];
+	return vfloat4(vals);
+#else
+	svint32_t offsets = svld1ub_s32(svptrue_pat_b32(SV_VL4), indices);
+	svfloat32_t data = svld1_gather_s32index_f32(svptrue_pat_b32(SV_VL4), base, offsets);
+	return vfloat4(svget_neonq_f32(data));
+#endif
+}
 /**
  * @brief Store a vector to an unaligned memory address.
  */
@@ -950,87 +963,105 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
 	return vfloat4(vreinterpretq_f32_s32(v.m));
 }
 
-/**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+/*
+ * Table structure for a 16x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
-{
-	t0p = t0;
-}
+struct vtable4_16x8 {
+	uint8x16_t t0;
+};
 
-/**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+/*
+ * Table structure for a 32x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
-{
-	t0p = t0;
-	t1p = t1;
-}
+struct vtable4_32x8 {
+	uint8x16x2_t t01;
+};
+
+/*
+ * Table structure for a 64x 8-bit entry table.
+ */
+struct vtable4_64x8 {
+	uint8x16x4_t t0123;
+};
 
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 16x 8-bit entry table.
  */
 ASTCENC_SIMD_INLINE void vtable_prepare(
-	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
-	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
-{
-	t0p = t0;
-	t1p = t1;
-	t2p = t2;
-	t3p = t3;
+	vtable4_16x8& table,
+	const uint8_t* data
+) {
+	table.t0 = vld1q_u8(data);
 }
 
 /**
- * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
+ * @brief Prepare a vtable lookup table for 32x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
-{
-	int8x16_t table {
-		vreinterpretq_s8_s32(t0.m)
-	};
-
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_32x8& table,
+	const uint8_t* data
+) {
+	table.t01 = uint8x16x2_t {
+		vld1q_u8(data),
+		vld1q_u8(data + 16)
+	};
+}
+
+/**
+ * @brief Prepare a vtable lookup table 64x 8-bit entry table.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_64x8& table,
+	const uint8_t* data
+) {
+	table.t0123 = uint8x16x4_t {
+		vld1q_u8(data),
+		vld1q_u8(data + 16),
+		vld1q_u8(data + 32),
+		vld1q_u8(data + 48)
+	};
+}
+
+/**
+ * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_16x8& tbl,
+	vint4 idx
+) {
 	// Set index byte above max index for unused bytes so table lookup returns zero
 	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
 	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
 
-	return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(table, idx_bytes)));
+	return vint4(vreinterpretq_s32_u8(vqtbl1q_u8(tbl.t0, idx_bytes)));
 }
 
 /**
- * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
-{
-	int8x16x2_t table {
-		vreinterpretq_s8_s32(t0.m),
-		vreinterpretq_s8_s32(t1.m)
-	};
-
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_32x8& tbl,
+	vint4 idx
+) {
 	// Set index byte above max index for unused bytes so table lookup returns zero
 	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
 	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
 
-	return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(table, idx_bytes)));
+	return vint4(vreinterpretq_s32_u8(vqtbl2q_u8(tbl.t01, idx_bytes)));
 }
 
 /**
- * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
-{
-	int8x16x4_t table {
-		vreinterpretq_s8_s32(t0.m),
-		vreinterpretq_s8_s32(t1.m),
-		vreinterpretq_s8_s32(t2.m),
-		vreinterpretq_s8_s32(t3.m)
-	};
-
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_64x8& tbl,
+	vint4 idx
+) {
 	// Set index byte above max index for unused bytes so table lookup returns zero
 	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
 	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
 
-	return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(table, idx_bytes)));
+	return vint4(vreinterpretq_s32_u8(vqtbl4q_u8(tbl.t0123, idx_bytes)));
 }
 
 /**
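The NEON versions of any() and all() added above reduce the 32-bit lane mask with a horizontal max/min instead of first building a movemask. A small scalar model of that reduction follows (standalone C++, illustrative only, not part of the diff):

#include <algorithm>
#include <cstdint>
#include <cstdio>

// A mask lane is all-ones (0xFFFFFFFF) when true and zero when false, so
// "any lane set" is max(lanes) != 0 and "all lanes set" is min(lanes) != 0,
// which is what vmaxvq_u32 / vminvq_u32 compute in the diff above.
static bool any_lanes(const uint32_t lanes[4])
{
	return *std::max_element(lanes, lanes + 4) != 0;
}

static bool all_lanes(const uint32_t lanes[4])
{
	return *std::min_element(lanes, lanes + 4) != 0;
}

int main()
{
	uint32_t none[4] = { 0, 0, 0, 0 };
	uint32_t some[4] = { 0, 0xFFFFFFFFu, 0, 0 };
	uint32_t full[4] = { 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu };

	std::printf("none: any=%d all=%d\n", any_lanes(none), all_lanes(none));
	std::printf("some: any=%d all=%d\n", any_lanes(some), all_lanes(some));
	std::printf("full: any=%d all=%d\n", any_lanes(full), all_lanes(full));
	return 0;
}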
@@ -139,14 +139,6 @@ struct vfloat4
 		return vfloat4(p);
 	}
 
-	/**
-	 * @brief Factory that returns a vector containing the lane IDs.
-	 */
-	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
-	{
-		return vfloat4(0.0f, 1.0f, 2.0f, 3.0f);
-	}
-
 	/**
 	 * @brief Return a swizzled float 2.
 	 */
@@ -233,7 +225,7 @@ struct vint4
 	/**
 	 * @brief Construct from 4 scalar values replicated across all lanes.
 	 *
-	 * Consider using vint4::zero() for constexpr zeros.
+	 * Consider using zero() for constexpr zeros.
 	 */
 	ASTCENC_SIMD_INLINE explicit vint4(int a)
 	{
@@ -354,7 +346,7 @@ struct vmask4
 	/**
 	 * @brief Get the scalar value of a single lane.
 	 */
-	template <int l> ASTCENC_SIMD_INLINE float lane() const
+	template <int l> ASTCENC_SIMD_INLINE bool lane() const
 	{
 		return m[l] != 0;
 	}
@@ -426,6 +418,22 @@ ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
 	       ((a.m[3] >> 28) & 0x8);
 }
 
+/**
+ * @brief True if any lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool any(vmask4 a)
+{
+	return mask(a) != 0;
+}
+
+/**
+ * @brief True if all lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool all(vmask4 a)
+{
+	return mask(a) == 0xF;
+}
+
 // ============================================================================
 // vint4 operators and functions
 // ============================================================================
@@ -684,21 +692,10 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
 	std::memcpy(p, a.m, sizeof(uint8_t) * 4);
 }
 
-/**
- * @brief Gather N (vector width) indices from the array.
- */
-ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
-{
-	return vint4(base[indices.m[0]],
-	             base[indices.m[1]],
-	             base[indices.m[2]],
-	             base[indices.m[3]]);
-}
-
 /**
  * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
  */
-ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
+ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* p)
 {
 	int b0 = a.m[0] & 0xFF;
 	int b1 = a.m[1] & 0xFF;
@@ -706,7 +703,8 @@ ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
 	int b3 = a.m[3] & 0xFF;
 
 	int b = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
-	return vint4(b, 0, 0, 0);
+	a = vint4(b, 0, 0, 0);
+	store_nbytes(a, p);
 }
 
 /**
@@ -934,17 +932,6 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
 	               (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
 }
 
-/**
- * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
- */
-ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
-{
-	return vfloat4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
-	               (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
-	               (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
-	               (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
-}
-
 /**
  * @brief Load a vector of gathered results from an array;
  */
@@ -956,6 +943,18 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
 	               base[indices.m[3]]);
 }
 
+/**
+ * @brief Load a vector of gathered results from an array using byte indices from memory
+ */
+template<>
+ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
+{
+	return vfloat4(base[indices[0]],
+	               base[indices[1]],
+	               base[indices[2]],
+	               base[indices[3]]);
+}
+
 /**
  * @brief Store a vector to an unaligned memory address.
  */
@@ -1080,84 +1079,94 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 a)
 	return r;
 }
 
-/**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+/*
+ * Table structure for a 16x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
-{
-	t0p = t0;
-}
+struct vtable4_16x8 {
+	const uint8_t* data;
+};
 
-/**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
- */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
-{
-	t0p = t0;
-	t1p = t1;
-}
+/*
+ * Table structure for a 32x 8-bit entry table.
+ */
+struct vtable4_32x8 {
+	const uint8_t* data;
+};
+
+/*
+ * Table structure for a 64x 8-bit entry table.
+ */
+struct vtable4_64x8 {
+	const uint8_t* data;
+};
 
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 16x 8-bit entry table.
  */
 ASTCENC_SIMD_INLINE void vtable_prepare(
-	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
-	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
-{
-	t0p = t0;
-	t1p = t1;
-	t2p = t2;
-	t3p = t3;
+	vtable4_16x8& table,
+	const uint8_t* data
+) {
+	table.data = data;
 }
 
 /**
- * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
+ * @brief Prepare a vtable lookup table for 32x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
-{
-	uint8_t table[16];
-
-	std::memcpy(table + 0, t0.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
-}
-
-/**
- * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
- */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
-{
-	uint8_t table[32];
-
-	std::memcpy(table + 0, t0.m, 4 * sizeof(int));
-	std::memcpy(table + 16, t1.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_32x8& table,
+	const uint8_t* data
+) {
+	table.data = data;
 }
 
 /**
- * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
+ * @brief Prepare a vtable lookup table 64x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
-{
-	uint8_t table[64];
-
-	std::memcpy(table + 0, t0.m, 4 * sizeof(int));
-	std::memcpy(table + 16, t1.m, 4 * sizeof(int));
-	std::memcpy(table + 32, t2.m, 4 * sizeof(int));
-	std::memcpy(table + 48, t3.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_64x8& table,
+	const uint8_t* data
+) {
+	table.data = data;
+}
+
+/**
+ * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_16x8& table,
+	vint4 idx
+) {
+	return vint4(table.data[idx.lane<0>()],
+	             table.data[idx.lane<1>()],
+	             table.data[idx.lane<2>()],
+	             table.data[idx.lane<3>()]);
+}
+
+/**
+ * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_32x8& table,
+	vint4 idx
+) {
+	return vint4(table.data[idx.lane<0>()],
+	             table.data[idx.lane<1>()],
+	             table.data[idx.lane<2>()],
+	             table.data[idx.lane<3>()]);
+}
+
+/**
+ * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_64x8& table,
+	vint4 idx
+) {
+	return vint4(table.data[idx.lane<0>()],
+	             table.data[idx.lane<1>()],
+	             table.data[idx.lane<2>()],
+	             table.data[idx.lane<3>()]);
 }
 
 /**
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2019-2023 Arm Limited
+// Copyright 2019-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -142,14 +142,6 @@ struct vfloat4
 		return vfloat4(_mm_load_ps(p));
 	}
 
-	/**
-	 * @brief Factory that returns a vector containing the lane IDs.
-	 */
-	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
-	{
-		return vfloat4(_mm_set_ps(3, 2, 1, 0));
-	}
-
 	/**
 	 * @brief Return a swizzled float 2.
 	 */
@@ -229,7 +221,7 @@ struct vint4
 	/**
 	 * @brief Construct from 1 scalar value replicated across all lanes.
 	 *
-	 * Consider using vfloat4::zero() for constexpr zeros.
+	 * Consider using zero() for constexpr zeros.
 	 */
 	ASTCENC_SIMD_INLINE explicit vint4(int a)
 	{
@@ -436,6 +428,22 @@ ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
 	return static_cast<unsigned int>(_mm_movemask_ps(a.m));
 }
 
+/**
+ * @brief True if any lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool any(vmask4 a)
+{
+	return mask(a) != 0;
+}
+
+/**
+ * @brief True if all lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool all(vmask4 a)
+{
+	return mask(a) == 0xF;
+}
+
 // ============================================================================
 // vint4 operators and functions
 // ============================================================================
@@ -598,9 +606,9 @@ ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
  */
 ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
 {
-	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 3, 2))));
-	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 1))));
-	return vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 0)));
+	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1))));
+	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(1, 0, 3, 2))));
+	return a;
 }
 
 /*
@@ -608,9 +616,9 @@ ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
  */
 ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
 {
-	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 3, 2))));
-	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 1))));
-	return vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 0)));
+	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1))));
+	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(1, 0, 3, 2))));
+	return a;
 }
 
||||||
/**
|
/**
|
||||||
|
@ -663,32 +671,20 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
|
||||||
_mm_store_ss(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m));
|
_mm_store_ss(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief Gather N (vector width) indices from the array.
|
|
||||||
*/
|
|
||||||
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
|
|
||||||
{
|
|
||||||
#if ASTCENC_AVX >= 2
|
|
||||||
return vint4(_mm_i32gather_epi32(base, indices.m, 4));
|
|
||||||
#else
|
|
||||||
alignas(16) int idx[4];
|
|
||||||
storea(indices, idx);
|
|
||||||
return vint4(base[idx[0]], base[idx[1]], base[idx[2]], base[idx[3]]);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
|
* @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
|
||||||
*/
|
*/
|
||||||
ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
|
ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* p)
|
||||||
{
|
{
|
||||||
#if ASTCENC_SSE >= 41
|
#if ASTCENC_SSE >= 41
|
||||||
__m128i shuf = _mm_set_epi8(0,0,0,0, 0,0,0,0, 0,0,0,0, 12,8,4,0);
|
__m128i shuf = _mm_set_epi8(0,0,0,0, 0,0,0,0, 0,0,0,0, 12,8,4,0);
|
||||||
return vint4(_mm_shuffle_epi8(a.m, shuf));
|
a = vint4(_mm_shuffle_epi8(a.m, shuf));
|
||||||
|
store_nbytes(a, p);
|
||||||
#else
|
#else
|
||||||
__m128i va = _mm_unpacklo_epi8(a.m, _mm_shuffle_epi32(a.m, _MM_SHUFFLE(1,1,1,1)));
|
__m128i va = _mm_unpacklo_epi8(a.m, _mm_shuffle_epi32(a.m, _MM_SHUFFLE(1,1,1,1)));
|
||||||
__m128i vb = _mm_unpackhi_epi8(a.m, _mm_shuffle_epi32(a.m, _MM_SHUFFLE(3,3,3,3)));
|
__m128i vb = _mm_unpackhi_epi8(a.m, _mm_shuffle_epi32(a.m, _MM_SHUFFLE(3,3,3,3)));
|
||||||
return vint4(_mm_unpacklo_epi16(va, vb));
|
a = vint4(_mm_unpacklo_epi16(va, vb));
|
||||||
|
store_nbytes(a, p);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -899,25 +895,12 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
|
|
||||||
*/
|
|
||||||
ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
|
|
||||||
{
|
|
||||||
#if ASTCENC_SSE >= 41
|
|
||||||
return vfloat4(_mm_blendv_ps(a.m, b.m, cond.m));
|
|
||||||
#else
|
|
||||||
__m128 d = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(cond.m), 31));
|
|
||||||
return vfloat4(_mm_or_ps(_mm_and_ps(d, b.m), _mm_andnot_ps(d, a.m)));
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Load a vector of gathered results from an array;
|
* @brief Load a vector of gathered results from an array;
|
||||||
*/
|
*/
|
||||||
ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
|
ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
|
||||||
{
|
{
|
||||||
#if ASTCENC_AVX >= 2
|
#if ASTCENC_AVX >= 2 && ASTCENC_X86_GATHERS != 0
|
||||||
return vfloat4(_mm_i32gather_ps(base, indices.m, 4));
|
return vfloat4(_mm_i32gather_ps(base, indices.m, 4));
|
||||||
#else
|
#else
|
||||||
alignas(16) int idx[4];
|
alignas(16) int idx[4];
|
||||||
|
@ -926,6 +909,23 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Load a vector of gathered results from an array using byte indices from memory
|
||||||
|
*/
|
||||||
|
template<>
|
||||||
|
ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
|
||||||
|
{
|
||||||
|
// Experimentally, in this particular use case (byte indices in memory),
|
||||||
|
// using 4 separate scalar loads is appreciably faster than using gathers
|
||||||
|
// even if they're available, on every x86 uArch tried, so always do the
|
||||||
|
// separate loads even when ASTCENC_X86_GATHERS is enabled.
|
||||||
|
//
|
||||||
|
// Tested on:
|
||||||
|
// - Intel Skylake-X, Coffee Lake, Crestmont, Redwood Cove
|
||||||
|
// - AMD Zen 2, Zen 4
|
||||||
|
return vfloat4(base[indices[0]], base[indices[1]], base[indices[2]], base[indices[3]]);
|
||||||
|
}
|
||||||
|
|
||||||
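The new gatherf_byte_inds comment records a measured design choice: when the indices are bytes already sitting in memory, four plain scalar loads beat a hardware gather even on CPUs where AVX2 gathers are available. A standalone sketch of the two alternatives being compared follows (illustrative only; the gather path assumes AVX2 support at compile time):

#include <cstdint>
#include <cstring>
#if defined(__AVX2__)
	#include <immintrin.h>
#endif

// The approach kept in the diff: four plain scalar loads via byte indices.
static void gather4_scalar(const float* base, const uint8_t* indices, float out[4])
{
	out[0] = base[indices[0]];
	out[1] = base[indices[1]];
	out[2] = base[indices[2]];
	out[3] = base[indices[3]];
}

#if defined(__AVX2__)
// The alternative it was measured against: widen the byte indices to 32 bits
// and issue a hardware gather. Functionally equivalent, but reported slower
// for this access pattern on the CPUs listed in the comment above.
static void gather4_hw(const float* base, const uint8_t* indices, float out[4])
{
	int32_t packed;
	std::memcpy(&packed, indices, 4);
	__m128i idx8 = _mm_cvtsi32_si128(packed);
	__m128i idx32 = _mm_cvtepu8_epi32(idx8);
	__m128 v = _mm_i32gather_ps(base, idx32, 4);
	_mm_storeu_ps(out, v);
}
#endif

int main()
{
	float base[8] = { 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f };
	uint8_t idx[4] = { 6, 1, 7, 2 };
	float a[4];
	gather4_scalar(base, idx, a);
#if defined(__AVX2__)
	float b[4];
	gather4_hw(base, idx, b);
#endif
	return 0;
}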
 /**
  * @brief Store a vector to an unaligned memory address.
  */
@@ -1054,136 +1054,173 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
 	return vfloat4(_mm_castsi128_ps(v.m));
 }
 
-/**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+/*
+ * Table structure for a 16x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
-{
-	t0p = t0;
-}
-
-/**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
- */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
-{
+struct vtable4_16x8 {
 #if ASTCENC_SSE >= 41
-	t0p = t0;
-	t1p = t0 ^ t1;
+	vint4 t0;
 #else
-	t0p = t0;
-	t1p = t1;
+	const uint8_t* data;
 #endif
-}
+};
+
+/*
+ * Table structure for a 32x 8-bit entry table.
+ */
+struct vtable4_32x8 {
+#if ASTCENC_SSE >= 41
+	vint4 t0;
+	vint4 t1;
+#else
+	const uint8_t* data;
+#endif
+};
+
+/*
+ * Table structure for a 64x 8-bit entry table.
+ */
+struct vtable4_64x8 {
+#if ASTCENC_SSE >= 41
+	vint4 t0;
+	vint4 t1;
+	vint4 t2;
+	vint4 t3;
+#else
+	const uint8_t* data;
+#endif
+};
 
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 16x 8-bit entry table.
  */
 ASTCENC_SIMD_INLINE void vtable_prepare(
-	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
-	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
-{
+	vtable4_16x8& table,
+	const uint8_t* data
+) {
 #if ASTCENC_SSE >= 41
-	t0p = t0;
-	t1p = t0 ^ t1;
-	t2p = t1 ^ t2;
-	t3p = t2 ^ t3;
+	table.t0 = vint4::load(data);
 #else
-	t0p = t0;
-	t1p = t1;
-	t2p = t2;
-	t3p = t3;
+	table.data = data;
 #endif
 }
 
 /**
- * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
+ * @brief Prepare a vtable lookup table for 32x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
-{
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_32x8& table,
+	const uint8_t* data
+) {
 #if ASTCENC_SSE >= 41
-	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
-	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
-
-	__m128i result = _mm_shuffle_epi8(t0.m, idxx);
-	return vint4(result);
+	table.t0 = vint4::load(data);
+	table.t1 = vint4::load(data + 16);
+
+	table.t1 = table.t1 ^ table.t0;
 #else
-	uint8_t table[16];
-
-	std::memcpy(table + 0, &t0.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
+	table.data = data;
 #endif
 }
 
 /**
- * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
+ * @brief Prepare a vtable lookup table 64x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
-{
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_64x8& table,
+	const uint8_t* data
+) {
 #if ASTCENC_SSE >= 41
-	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
-	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
-
-	__m128i result = _mm_shuffle_epi8(t0.m, idxx);
-	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
-
-	__m128i result2 = _mm_shuffle_epi8(t1.m, idxx);
-	result = _mm_xor_si128(result, result2);
-
-	return vint4(result);
+	table.t0 = vint4::load(data);
+	table.t1 = vint4::load(data + 16);
+	table.t2 = vint4::load(data + 32);
+	table.t3 = vint4::load(data + 48);
+
+	table.t3 = table.t3 ^ table.t2;
+	table.t2 = table.t2 ^ table.t1;
+	table.t1 = table.t1 ^ table.t0;
 #else
-	uint8_t table[32];
-
-	std::memcpy(table + 0, &t0.m, 4 * sizeof(int));
-	std::memcpy(table + 16, &t1.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
+	table.data = data;
 #endif
 }
 
 /**
- * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
-{
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_16x8& tbl,
+	vint4 idx
+) {
 #if ASTCENC_SSE >= 41
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m128i result = _mm_shuffle_epi8(t0.m, idxx);
-	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
-
-	__m128i result2 = _mm_shuffle_epi8(t1.m, idxx);
-	result = _mm_xor_si128(result, result2);
-	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
-
-	result2 = _mm_shuffle_epi8(t2.m, idxx);
-	result = _mm_xor_si128(result, result2);
-	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
-
-	result2 = _mm_shuffle_epi8(t3.m, idxx);
-	result = _mm_xor_si128(result, result2);
-
-	return vint4(result);
+	__m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx);
+	return vint4(result);
+#else
+	return vint4(tbl.data[idx.lane<0>()],
+	             tbl.data[idx.lane<1>()],
+	             tbl.data[idx.lane<2>()],
+	             tbl.data[idx.lane<3>()]);
+#endif
+}
+
+/**
+ * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_32x8& tbl,
+	vint4 idx
+) {
+#if ASTCENC_SSE >= 41
+	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
+	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
+
+	__m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx);
+	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
+
+	__m128i result2 = _mm_shuffle_epi8(tbl.t1.m, idxx);
+	result = _mm_xor_si128(result, result2);
+
+	return vint4(result);
+#else
+	return vint4(tbl.data[idx.lane<0>()],
+	             tbl.data[idx.lane<1>()],
+	             tbl.data[idx.lane<2>()],
+	             tbl.data[idx.lane<3>()]);
+#endif
+}
+
+/**
+ * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_64x8& tbl,
+	vint4 idx
+) {
+#if ASTCENC_SSE >= 41
+	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
+	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
+
+	__m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx);
+	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
+
+	__m128i result2 = _mm_shuffle_epi8(tbl.t1.m, idxx);
+	result = _mm_xor_si128(result, result2);
+	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
+
+	result2 = _mm_shuffle_epi8(tbl.t2.m, idxx);
+	result = _mm_xor_si128(result, result2);
+	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
+
+	result2 = _mm_shuffle_epi8(tbl.t3.m, idxx);
+	result = _mm_xor_si128(result, result2);
+
+	return vint4(result);
+#else
+	return vint4(tbl.data[idx.lane<0>()],
+	             tbl.data[idx.lane<1>()],
+	             tbl.data[idx.lane<2>()],
+	             tbl.data[idx.lane<3>()]);
 #endif
 }
 
@@ -1307,7 +1344,11 @@ ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b)
  */
 ASTCENC_SIMD_INLINE int popcount(uint64_t v)
 {
+#if !defined(__x86_64__) && !defined(_M_AMD64)
+	return static_cast<int>(__builtin_popcountll(v));
+#else
 	return static_cast<int>(_mm_popcnt_u64(v));
+#endif
 }
 
 #endif // ASTCENC_POPCNT >= 1
 
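The popcount change guards the x86-64-only _mm_popcnt_u64 intrinsic so that 32-bit and non-x86 builds fall back to the compiler builtin instead. A standalone sketch of the same dispatch pattern follows (illustrative only; the fallback path assumes a GCC/Clang-style compiler that provides __builtin_popcountll):

#include <cstdint>
#include <cstdio>
#if defined(__x86_64__) || defined(_M_AMD64)
	#include <nmmintrin.h>
#endif

// Count set bits, using the 64-bit POPCNT instruction where the target
// guarantees it, and the compiler builtin everywhere else.
static int popcount64(uint64_t v)
{
#if defined(__x86_64__) || defined(_M_AMD64)
	return static_cast<int>(_mm_popcnt_u64(v));
#else
	return static_cast<int>(__builtin_popcountll(v));
#endif
}

int main()
{
	std::printf("%d\n", popcount64(0xF0F0F0F0F0F0F0F0ull)); // prints 32
	return 0;
}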
File diff suppressed because it is too large
@@ -43,6 +43,7 @@
 #include <stdio.h>
 #include <cassert>
 #include <cstring>
+#include <cfloat>
 
 static constexpr unsigned int ANGULAR_STEPS { 32 };
 
@@ -104,14 +105,17 @@ static void compute_angular_offsets(
 	// Precompute isample; arrays are always allocated 64 elements long
 	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
 	{
-		// Add 2^23 and interpreting bits extracts round-to-nearest int
-		vfloat sample = loada(dec_weight_ideal_value + i) * (SINCOS_STEPS - 1.0f) + vfloat(12582912.0f);
-		vint isample = float_as_int(sample) & vint((SINCOS_STEPS - 1));
+		// Ideal weight can be outside [0, 1] range, so clamp to fit table
+		vfloat ideal_weight = clampzo(loada(dec_weight_ideal_value + i));
+
+		// Convert a weight to a sincos table index
+		vfloat sample = ideal_weight * (SINCOS_STEPS - 1.0f);
+		vint isample = float_to_int_rtn(sample);
 		storea(isample, isamplev + i);
 	}
 
 	// Arrays are multiple of SIMD width (ANGULAR_STEPS), safe to overshoot max
-	vfloat mult = vfloat(1.0f / (2.0f * astc::PI));
+	vfloat mult(1.0f / (2.0f * astc::PI));
 
 	for (unsigned int i = 0; i < max_angular_steps; i += ASTCENC_SIMD_WIDTH)
 	{
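The new compute_angular_offsets code first clamps the ideal weight into [0, 1] and then performs an explicit round-to-nearest float-to-int conversion, instead of the old add-2^23-and-mask bit trick, so an out-of-range weight can no longer index past the sincos table. A scalar model of the new index computation follows (standalone C++; the 64-entry table size is an illustrative assumption, the real constant lives in astcenc_weight_align.cpp):

#include <algorithm>
#include <cassert>
#include <cmath>

// Illustrative stand-in for the sincos table size used by the encoder
static constexpr int kSincosSteps = 64;

// Scalar model of the new path: clamp to [0, 1], scale to the table range,
// then round to the nearest integer index.
static int weight_to_table_index(float ideal_weight)
{
	float clamped = std::min(std::max(ideal_weight, 0.0f), 1.0f);
	float sample = clamped * (kSincosSteps - 1.0f);
	return static_cast<int>(std::lround(sample));
}

int main()
{
	assert(weight_to_table_index(-0.25f) == 0);                // clamped low
	assert(weight_to_table_index(0.25f) == 16);                // 15.75 rounds up
	assert(weight_to_table_index(1.75f) == kSincosSteps - 1);  // clamped high
	return 0;
}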
@@ -164,18 +168,41 @@ static void compute_lowest_and_highest_weight(
 	promise(weight_count > 0);
 	promise(max_angular_steps > 0);
 
-	vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f);
+	vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f);
+
+	// Compute minimum/maximum weights in the weight array. Our remapping
+	// is monotonic, so the min/max rounded weights relate to the min/max
+	// unrounded weights in a straightforward way.
+	vfloat min_weight(FLT_MAX);
+	vfloat max_weight(-FLT_MAX);
+
+	vint lane_id = vint::lane_id();
+	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
+	{
+		vmask active = lane_id < vint(weight_count);
+		lane_id += vint(ASTCENC_SIMD_WIDTH);
+
+		vfloat weights = loada(dec_weight_ideal_value + i);
+		min_weight = min(min_weight, select(min_weight, weights, active));
+		max_weight = max(max_weight, select(max_weight, weights, active));
+	}
+
+	min_weight = hmin(min_weight);
+	max_weight = hmax(max_weight);
 
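The pre-pass added above computes the minimum and maximum ideal weight once, using a lane mask so the overshoot lanes in the final partial vector cannot contaminate the result. A scalar model of that masked reduction follows (standalone C++; the width of 4 is an illustrative assumption):

#include <algorithm>
#include <cassert>
#include <cfloat>
#include <vector>

// Scalar model of the masked min/max pre-pass: the array is padded to a
// multiple of the SIMD width, so inactive tail lanes are "selected away" by
// keeping the running value instead of the padding.
static void minmax_masked(const std::vector<float>& padded, int count,
                          float& min_out, float& max_out)
{
	constexpr int width = 4;  // illustrative SIMD width
	min_out = FLT_MAX;
	max_out = -FLT_MAX;

	for (int i = 0; i < count; i += width)
	{
		for (int lane = 0; lane < width; lane++)
		{
			bool active = (i + lane) < count;  // mirrors the vmask in the diff
			float w = padded[i + lane];
			min_out = std::min(min_out, active ? w : min_out);
			max_out = std::max(max_out, active ? w : max_out);
		}
	}
}

int main()
{
	// Six real weights padded to eight; the 9.9f pad values must be ignored
	std::vector<float> weights { 0.3f, 0.7f, 0.1f, 0.5f, 0.9f, 0.4f, 9.9f, 9.9f };
	float lo, hi;
	minmax_masked(weights, 6, lo, hi);
	assert(lo == 0.1f && hi == 0.9f);
	return 0;
}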
 	// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
 	for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
 	{
-		vfloat minidx(128.0f);
-		vfloat maxidx(-128.0f);
 		vfloat errval = vfloat::zero();
 		vfloat cut_low_weight_err = vfloat::zero();
 		vfloat cut_high_weight_err = vfloat::zero();
 		vfloat offset = loada(offsets + sp);
 
+		// We know the min and max weight values, so we can figure out
+		// the corresponding indices before we enter the loop.
+		vfloat minidx = round(min_weight * rcp_stepsize - offset);
+		vfloat maxidx = round(max_weight * rcp_stepsize - offset);
+
 		for (unsigned int j = 0; j < weight_count; j++)
 		{
 			vfloat sval = load1(dec_weight_ideal_value + j) * rcp_stepsize - offset;
@@ -183,22 +210,12 @@ static void compute_lowest_and_highest_weight(
 			vfloat diff = sval - svalrte;
 			errval += diff * diff;
 
-			// Reset tracker on min hit
-			vmask mask = svalrte < minidx;
-			minidx = select(minidx, svalrte, mask);
-			cut_low_weight_err = select(cut_low_weight_err, vfloat::zero(), mask);
-
-			// Accumulate on min hit
-			mask = svalrte == minidx;
+			// Accumulate errors for minimum index
+			vmask mask = svalrte == minidx;
 			vfloat accum = cut_low_weight_err + vfloat(1.0f) - vfloat(2.0f) * diff;
 			cut_low_weight_err = select(cut_low_weight_err, accum, mask);
 
-			// Reset tracker on max hit
-			mask = svalrte > maxidx;
-			maxidx = select(maxidx, svalrte, mask);
-			cut_high_weight_err = select(cut_high_weight_err, vfloat::zero(), mask);
-
-			// Accumulate on max hit
+			// Accumulate errors for maximum index
 			mask = svalrte == maxidx;
 			accum = cut_high_weight_err + vfloat(1.0f) + vfloat(2.0f) * diff;
 			cut_high_weight_err = select(cut_high_weight_err, accum, mask);
@@ -1,6 +1,6 @@
 {
   "versions": {
-    "1kdist": "v93",
+    "1kdist": "v95",
     "oboe": "1.9.0",
     "kcp": "v1.7-f2aa30e",
     "lz4": "v1.10.0",
@@ -9,6 +9,7 @@
   },
   "mirrors": {
     "github": {
+      "host": "github.com",
       "1kdist": "simdsoft/1kiss/releases/download",
       "oboe": "google/oboe.git",
       "kcp": "skywind3000/kcp.git",
@@ -19,6 +20,7 @@
     },
     "gitee":
     {
+      "host": "gitee.com",
      "1kdist": "simdsoft/1kiss/releases/download",
      "oboe": "simdsoft/oboe.git",
      "kcp": "simdsoft/kcp.git",