diff --git a/1k/1kiss.ps1 b/1k/1kiss.ps1 index 7875d48052..8e734cf2aa 100644 --- a/1k/1kiss.ps1 +++ b/1k/1kiss.ps1 @@ -194,7 +194,9 @@ $1k = [_1kiss]::new() # * : any # x.y.z~x2.y2.z2 : range $manifest = @{ - msvc = '14.39+'; # cl.exe @link.exe 14.39 VS2022 17.9.x + # cl.exe @link.exe 14.39 VS2022 17.9.x + # exactly match format: '14.42.*' + msvc = '14.39+'; vs = '12.0+'; ndk = 'r23c'; xcode = '13.0.0+'; # range @@ -203,7 +205,7 @@ $manifest = @{ # clang-cl msvc14.40 require 17.0.0+ llvm = '17.0.6+'; gcc = '9.0.0+'; - cmake = '3.23.0+'; + cmake = '3.23.0~3.31.1+'; ninja = '1.10.0+'; python = '3.8.0+'; jdk = '17.0.10+'; # jdk17+ works for android cmdlinetools 7.0+ @@ -458,19 +460,24 @@ if ($1k.isfile($manifest_file)) { # 1kdist $sentry_file = Join-Path $myRoot '.gitee' $mirror = if ($1k.isfile($sentry_file)) { 'gitee' } else { 'github' } -$mirror_url_base = @{'github' = 'https://github.com/'; 'gitee' = 'https://gitee.com/' }[$mirror] -$1kdist_url_base = $mirror_url_base $mirror_conf_file = $1k.realpath("$myRoot/../manifest.json") $mirror_current = $null $devtools_url_base = $null $1kdist_ver = $null + if ($1k.isfile($mirror_conf_file)) { $mirror_conf = ConvertFrom-Json (Get-Content $mirror_conf_file -raw) $mirror_current = $mirror_conf.mirrors.$mirror + $mirror_url_base = "https://$($mirror_current.host)/" + $1kdist_url_base = $mirror_url_base + $1kdist_url_base += $mirror_current.'1kdist' $devtools_url_base += "$1kdist_url_base/devtools" $1kdist_ver = $mirror_conf.versions.'1kdist' $1kdist_url_base += "/$1kdist_ver" +} else { + $mirror_url_base = 'https://github.com/' + $1kdist_url_base = $mirror_url_base } function 1kdist_url($filename) { @@ -773,8 +780,9 @@ function find_vs() { $required_vs_ver = $manifest['vs'] if (!$required_vs_ver) { $required_vs_ver = '12.0+' } - $require_comps = @('Microsoft.Component.MSBuild', 'Microsoft.VisualStudio.Component.VC.Tools.x86.x64') - $vs_installs = ConvertFrom-Json "$(&$VSWHERE_EXE -version $required_vs_ver.TrimEnd('+') -format 'json' -requires $require_comps)" + # refer: https://learn.microsoft.com/en-us/visualstudio/install/workload-and-component-ids?view=vs-2022 + $require_comps = @('Microsoft.VisualStudio.Component.VC.Tools.x86.x64', 'Microsoft.VisualStudio.Product.BuildTools') + $vs_installs = ConvertFrom-Json "$(&$VSWHERE_EXE -version $required_vs_ver.TrimEnd('+') -format 'json' -requires $require_comps -requiresAny)" $ErrorActionPreference = $eap if ($vs_installs) { @@ -789,7 +797,7 @@ function find_vs() { } $Global:VS_INST = $vs_inst_latest } else { - throw "No suitable visual studio installed, required: $required_vs_ver" + Write-Warning "Visual studio not found, your build may not work, required: $required_vs_ver" } } } @@ -1275,16 +1283,19 @@ function setup_msvc() { if (!$cl_prog) { if ($Global:VS_INST) { $vs_path = $Global:VS_INST.installationPath - Import-Module "$vs_path\Common7\Tools\Microsoft.VisualStudio.DevShell.dll" $dev_cmd_args = "-arch=$target_cpu -host_arch=x64 -no_logo" + + # if explicit version specified, use it if (!$manifest['msvc'].EndsWith('+')) { $dev_cmd_args += " -vcvars_ver=$cl_ver" } + + Import-Module "$vs_path\Common7\Tools\Microsoft.VisualStudio.DevShell.dll" Enter-VsDevShell -VsInstanceId $Global:VS_INST.instanceId -SkipAutomaticLocation -DevCmdArguments $dev_cmd_args $cl_prog, $cl_ver = find_prog -name 'msvc' -cmd 'cl' -silent $true -usefv $true $1k.println("Using msvc: $cl_prog, version: $cl_ver") } else { - throw "Visual Studio not installed!" 
+ Write-Warning "MSVC not found, your build may not work, required: $cl_ver" } } diff --git a/1k/install-pwsh.sh b/1k/install-pwsh.sh index f88923602f..e9cd29f294 100755 --- a/1k/install-pwsh.sh +++ b/1k/install-pwsh.sh @@ -12,7 +12,7 @@ mkdir -p $cacheDir pwsh_ver=$1 if [ "$pwsh_ver" = "" ] ; then - pwsh_ver='7.4.5' + pwsh_ver='7.4.6' fi pwsh_min_ver=$2 diff --git a/1k/resolv-uri.ps1 b/1k/resolv-uri.ps1 index b873d0280c..9a956b8098 100644 --- a/1k/resolv-uri.ps1 +++ b/1k/resolv-uri.ps1 @@ -6,11 +6,12 @@ param( if(Test-Path $manifest_file -PathType Leaf) { $mirror = if (!(Test-Path (Join-Path $PSScriptRoot '.gitee') -PathType Leaf)) {'github'} else {'gitee'} - $url_base = @{'github' = 'https://github.com/'; 'gitee' = 'https://gitee.com/' }[$mirror] $manifest_map = ConvertFrom-Json (Get-Content $manifest_file -raw) $ver = $manifest_map.versions.PSObject.Properties[$name].Value - $url_path = $manifest_map.mirrors.PSObject.Properties[$mirror].Value.PSObject.Properties[$name].Value + $mirror_current = $manifest_map.mirrors.PSObject.Properties[$mirror].Value.PSObject.Properties + $url_base = "https://$($mirror_current['host'].Value)/" + $url_path = $mirror_current[$name].Value Write-Host "$url_base$url_path#$ver" -NoNewline } diff --git a/3rdparty/README.md b/3rdparty/README.md index d2d401354e..2f5ba0cb8c 100644 --- a/3rdparty/README.md +++ b/3rdparty/README.md @@ -6,7 +6,7 @@ ## astcenc - [![Upstream](https://img.shields.io/github/v/release/ARM-software/astc-encoder?label=Upstream)](https://github.com/ARM-software/astc-encoder) -- Version: 4.8.0 +- Version: 5.1.0 - License: Apache-2.0 ## Box2D @@ -22,7 +22,7 @@ ## c-ares - [![Upstream](https://img.shields.io/github/v/release/c-ares/c-ares?label=Upstream)](https://github.com/c-ares/c-ares) -- Version: 1.34.2 +- Version: 1.34.3 - License: MIT ## Chipmunk2D @@ -47,7 +47,7 @@ ## curl - [![Upstream](https://img.shields.io/github/v/release/curl/curl?label=Upstream)](https://github.com/curl/curl) -- Version: 8.10.1 +- Version: 8.11.0 - License: Curl (MIT/X) ## doctest @@ -124,7 +124,7 @@ - luajit - Upstream: https://github.com/LuaJIT/LuaJIT - - Version: 2.1-97813fb + - Version: 2.1-fe71d0f - License: MIT - tolua diff --git a/3rdparty/astcenc/astcenc_averages_and_directions.cpp b/3rdparty/astcenc/astcenc_averages_and_directions.cpp index dcff0d224b..8e2f8d8c46 100644 --- a/3rdparty/astcenc/astcenc_averages_and_directions.cpp +++ b/3rdparty/astcenc/astcenc_averages_and_directions.cpp @@ -778,12 +778,12 @@ void compute_error_squared_rgba( for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) { vmask mask = lane_ids < vint(texel_count); - vint texel_idxs(texel_indexes + i); + const uint8_t* texel_idxs = texel_indexes + i; - vfloat data_r = gatherf(blk.data_r, texel_idxs); - vfloat data_g = gatherf(blk.data_g, texel_idxs); - vfloat data_b = gatherf(blk.data_b, texel_idxs); - vfloat data_a = gatherf(blk.data_a, texel_idxs); + vfloat data_r = gatherf_byte_inds(blk.data_r, texel_idxs); + vfloat data_g = gatherf_byte_inds(blk.data_g, texel_idxs); + vfloat data_b = gatherf_byte_inds(blk.data_b, texel_idxs); + vfloat data_a = gatherf_byte_inds(blk.data_a, texel_idxs); vfloat uncor_param = (data_r * l_uncor_bs0) + (data_g * l_uncor_bs1) @@ -892,11 +892,11 @@ void compute_error_squared_rgb( for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) { vmask mask = lane_ids < vint(texel_count); - vint texel_idxs(texel_indexes + i); + const uint8_t* texel_idxs = texel_indexes + i; - vfloat data_r = gatherf(blk.data_r, texel_idxs); - vfloat 
data_g = gatherf(blk.data_g, texel_idxs); - vfloat data_b = gatherf(blk.data_b, texel_idxs); + vfloat data_r = gatherf_byte_inds(blk.data_r, texel_idxs); + vfloat data_g = gatherf_byte_inds(blk.data_g, texel_idxs); + vfloat data_b = gatherf_byte_inds(blk.data_b, texel_idxs); vfloat uncor_param = (data_r * l_uncor_bs0) + (data_g * l_uncor_bs1) diff --git a/3rdparty/astcenc/astcenc_decompress_symbolic.cpp b/3rdparty/astcenc/astcenc_decompress_symbolic.cpp index 7463f7e20b..e7791eef6d 100644 --- a/3rdparty/astcenc/astcenc_decompress_symbolic.cpp +++ b/3rdparty/astcenc/astcenc_decompress_symbolic.cpp @@ -98,19 +98,14 @@ void unpack_weights( if (!is_dual_plane) { // Build full 64-entry weight lookup table - vint4 tab0 = vint4::load(scb.weights + 0); - vint4 tab1 = vint4::load(scb.weights + 16); - vint4 tab2 = vint4::load(scb.weights + 32); - vint4 tab3 = vint4::load(scb.weights + 48); - - vint tab0p, tab1p, tab2p, tab3p; - vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p); + vtable_64x8 table; + vtable_prepare(table, scb.weights); for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH) { vint summed_value(8); vint weight_count(di.texel_weight_count + i); - int max_weight_count = hmax(weight_count).lane<0>(); + int max_weight_count = hmax_s(weight_count); promise(max_weight_count > 0); for (int j = 0; j < max_weight_count; j++) @@ -118,7 +113,7 @@ void unpack_weights( vint texel_weights(di.texel_weights_tr[j] + i); vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i); - summed_value += vtable_8bt_32bi(tab0p, tab1p, tab2p, tab3p, texel_weights) * texel_weights_int; + summed_value += vtable_lookup_32bit(table, texel_weights) * texel_weights_int; } store(lsr<4>(summed_value), weights_plane1 + i); @@ -128,16 +123,12 @@ void unpack_weights( { // Build a 32-entry weight lookup table per plane // Plane 1 - vint4 tab0_plane1 = vint4::load(scb.weights + 0); - vint4 tab1_plane1 = vint4::load(scb.weights + 16); - vint tab0_plane1p, tab1_plane1p; - vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p); + vtable_32x8 tab_plane1; + vtable_prepare(tab_plane1, scb.weights); // Plane 2 - vint4 tab0_plane2 = vint4::load(scb.weights + 32); - vint4 tab1_plane2 = vint4::load(scb.weights + 48); - vint tab0_plane2p, tab1_plane2p; - vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p); + vtable_32x8 tab_plane2; + vtable_prepare(tab_plane2, scb.weights + 32); for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH) { @@ -145,7 +136,7 @@ void unpack_weights( vint sum_plane2(8); vint weight_count(di.texel_weight_count + i); - int max_weight_count = hmax(weight_count).lane<0>(); + int max_weight_count = hmax_s(weight_count); promise(max_weight_count > 0); for (int j = 0; j < max_weight_count; j++) @@ -153,8 +144,8 @@ void unpack_weights( vint texel_weights(di.texel_weights_tr[j] + i); vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i); - sum_plane1 += vtable_8bt_32bi(tab0_plane1p, tab1_plane1p, texel_weights) * texel_weights_int; - sum_plane2 += vtable_8bt_32bi(tab0_plane2p, tab1_plane2p, texel_weights) * texel_weights_int; + sum_plane1 += vtable_lookup_32bit(tab_plane1, texel_weights) * texel_weights_int; + sum_plane2 += vtable_lookup_32bit(tab_plane2, texel_weights) * texel_weights_int; } store(lsr<4>(sum_plane1), weights_plane1 + i); diff --git a/3rdparty/astcenc/astcenc_find_best_partitioning.cpp b/3rdparty/astcenc/astcenc_find_best_partitioning.cpp index bfbcc35e94..f2e432826d 100644 --- 
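The unpack_weights refactor above swaps the old per-call-site vint4 tab0..tab3 plumbing for an opaque per-ISA table object that is prepared once and then queried with 32-bit lane indices. A minimal usage sketch of the new API, not part of the diff; the function and buffer names are illustrative, and it assumes the astcenc SIMD configuration macros are set by the build as in the library sources:

#include "astcenc_vecmathlib.h"   // vtable_64x8, vtable_prepare, vtable_lookup_32bit, vint, store

// Sketch only: look up one SIMD batch of 8-bit table entries.
static inline void lookup_weight_batch(
	const uint8_t* weights,   // 64 packed 8-bit table entries
	const int* indices,       // ASTCENC_SIMD_WIDTH indices, each in [0, 63]
	int* out                  // ASTCENC_SIMD_WIDTH results
) {
	vtable_64x8 table;
	vtable_prepare(table, weights);              // build once, ISA-specific layout

	vint idx(indices);                           // load one batch of indices
	store(vtable_lookup_32bit(table, idx), out); // entries zero-extended into 32-bit lanes
}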
a/3rdparty/astcenc/astcenc_find_best_partitioning.cpp +++ b/3rdparty/astcenc/astcenc_find_best_partitioning.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2023 Arm Limited +// Copyright 2011-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -425,8 +425,8 @@ static unsigned int get_partition_ordering_by_mismatch_bits( } // Create a running sum from the histogram array - // Cells store previous values only; i.e. exclude self after sum - unsigned int sum = 0; + // Indices store previous values only; i.e. exclude self after sum + uint16_t sum = 0; for (unsigned int i = 0; i < texel_count; i++) { uint16_t cnt = mscount[i]; diff --git a/3rdparty/astcenc/astcenc_ideal_endpoints_and_weights.cpp b/3rdparty/astcenc/astcenc_ideal_endpoints_and_weights.cpp index 051782fd0f..8e6ee2f4bb 100644 --- a/3rdparty/astcenc/astcenc_ideal_endpoints_and_weights.cpp +++ b/3rdparty/astcenc/astcenc_ideal_endpoints_and_weights.cpp @@ -41,16 +41,16 @@ static vfloat bilinear_infill_vla( unsigned int index ) { // Load the bilinear filter texel weight indexes in the decimated grid - vint weight_idx0 = vint(di.texel_weights_tr[0] + index); - vint weight_idx1 = vint(di.texel_weights_tr[1] + index); - vint weight_idx2 = vint(di.texel_weights_tr[2] + index); - vint weight_idx3 = vint(di.texel_weights_tr[3] + index); + const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index; + const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index; + const uint8_t* weight_idx2 = di.texel_weights_tr[2] + index; + const uint8_t* weight_idx3 = di.texel_weights_tr[3] + index; // Load the bilinear filter weights from the decimated grid - vfloat weight_val0 = gatherf(weights, weight_idx0); - vfloat weight_val1 = gatherf(weights, weight_idx1); - vfloat weight_val2 = gatherf(weights, weight_idx2); - vfloat weight_val3 = gatherf(weights, weight_idx3); + vfloat weight_val0 = gatherf_byte_inds(weights, weight_idx0); + vfloat weight_val1 = gatherf_byte_inds(weights, weight_idx1); + vfloat weight_val2 = gatherf_byte_inds(weights, weight_idx2); + vfloat weight_val3 = gatherf_byte_inds(weights, weight_idx3); // Load the weight contribution factors for each decimated weight vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index); @@ -81,12 +81,12 @@ static vfloat bilinear_infill_vla_2( unsigned int index ) { // Load the bilinear filter texel weight indexes in the decimated grid - vint weight_idx0 = vint(di.texel_weights_tr[0] + index); - vint weight_idx1 = vint(di.texel_weights_tr[1] + index); + const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index; + const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index; // Load the bilinear filter weights from the decimated grid - vfloat weight_val0 = gatherf(weights, weight_idx0); - vfloat weight_val1 = gatherf(weights, weight_idx1); + vfloat weight_val0 = gatherf_byte_inds(weights, weight_idx0); + vfloat weight_val1 = gatherf_byte_inds(weights, weight_idx1); // Load the weight contribution factors for each decimated weight vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index); @@ -853,12 +853,6 @@ void compute_ideal_weights_for_decimation( promise(texel_count > 0); promise(weight_count > 0); - // Ensure that the end of the output arrays that are used for SIMD paths later are filled so we - // can safely run SIMD 
elsewhere without a loop tail. Note that this is always safe as weight - // arrays always contain space for 64 elements - unsigned int prev_weight_count_simd = round_down_to_simd_multiple_vla(weight_count - 1); - storea(vfloat::zero(), dec_weight_ideal_value + prev_weight_count_simd); - // If we have a 1:1 mapping just shortcut the computation. Transfer enough to also copy the // zero-initialized SIMD over-fetch region if (is_direct) @@ -889,23 +883,23 @@ void compute_ideal_weights_for_decimation( // Accumulate error weighting of all the texels using this weight vint weight_texel_count(di.weight_texel_count + i); - unsigned int max_texel_count = hmax(weight_texel_count).lane<0>(); + unsigned int max_texel_count = hmax_s(weight_texel_count); promise(max_texel_count > 0); for (unsigned int j = 0; j < max_texel_count; j++) { - vint texel(di.weight_texels_tr[j] + i); + const uint8_t* texel = di.weight_texels_tr[j] + i; vfloat weight = loada(di.weights_texel_contribs_tr[j] + i); if (!constant_wes) { - weight_error_scale = gatherf(ei.weight_error_scale, texel); + weight_error_scale = gatherf_byte_inds(ei.weight_error_scale, texel); } vfloat contrib_weight = weight * weight_error_scale; weight_weight += contrib_weight; - initial_weight += gatherf(ei.weights, texel) * contrib_weight; + initial_weight += gatherf_byte_inds(ei.weights, texel) * contrib_weight; } storea(initial_weight / weight_weight, dec_weight_ideal_value + i); @@ -947,22 +941,22 @@ void compute_ideal_weights_for_decimation( // Accumulate error weighting of all the texels using this weight vint weight_texel_count(di.weight_texel_count + i); - unsigned int max_texel_count = hmax(weight_texel_count).lane<0>(); + unsigned int max_texel_count = hmax_s(weight_texel_count); promise(max_texel_count > 0); for (unsigned int j = 0; j < max_texel_count; j++) { - vint texel(di.weight_texels_tr[j] + i); + const uint8_t* texel = di.weight_texels_tr[j] + i; vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i); if (!constant_wes) { - weight_error_scale = gatherf(ei.weight_error_scale, texel); + weight_error_scale = gatherf_byte_inds(ei.weight_error_scale, texel); } vfloat scale = weight_error_scale * contrib_weight; - vfloat old_weight = gatherf(infilled_weights, texel); - vfloat ideal_weight = gatherf(ei.weights, texel); + vfloat old_weight = gatherf_byte_inds(infilled_weights, texel); + vfloat ideal_weight = gatherf_byte_inds(ei.weights, texel); error_change0 += contrib_weight * scale; error_change1 += (old_weight - ideal_weight) * scale; @@ -1023,9 +1017,8 @@ void compute_quantized_weights_for_decimation( // safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements if (get_quant_level(quant_level) <= 16) { - vint4 tab0 = vint4::load(qat.quant_to_unquant); - vint tab0p; - vtable_prepare(tab0, tab0p); + vtable_16x8 table; + vtable_prepare(table, qat.quant_to_unquant); for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) { @@ -1038,8 +1031,8 @@ void compute_quantized_weights_for_decimation( vint weightl = float_to_int(ix1); vint weighth = min(weightl + vint(1), steps_m1); - vint ixli = vtable_8bt_32bi(tab0p, weightl); - vint ixhi = vtable_8bt_32bi(tab0p, weighth); + vint ixli = vtable_lookup_32bit(table, weightl); + vint ixhi = vtable_lookup_32bit(table, weighth); vfloat ixl = int_to_float(ixli); vfloat ixh = int_to_float(ixhi); @@ -1050,16 +1043,13 @@ void compute_quantized_weights_for_decimation( // Invert the weight-scaling that was done initially storea(ixl * rscalev + low_boundv, weight_set_out + 
i); - vint scn = pack_low_bytes(weight); - store_nbytes(scn, quantized_weight_set + i); + pack_and_store_low_bytes(weight, quantized_weight_set + i); } } else { - vint4 tab0 = vint4::load(qat.quant_to_unquant + 0); - vint4 tab1 = vint4::load(qat.quant_to_unquant + 16); - vint tab0p, tab1p; - vtable_prepare(tab0, tab1, tab0p, tab1p); + vtable_32x8 table; + vtable_prepare(table, qat.quant_to_unquant); for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) { @@ -1072,8 +1062,8 @@ void compute_quantized_weights_for_decimation( vint weightl = float_to_int(ix1); vint weighth = min(weightl + vint(1), steps_m1); - vint ixli = vtable_8bt_32bi(tab0p, tab1p, weightl); - vint ixhi = vtable_8bt_32bi(tab0p, tab1p, weighth); + vint ixli = vtable_lookup_32bit(table, weightl); + vint ixhi = vtable_lookup_32bit(table, weighth); vfloat ixl = int_to_float(ixli); vfloat ixh = int_to_float(ixhi); @@ -1084,8 +1074,7 @@ void compute_quantized_weights_for_decimation( // Invert the weight-scaling that was done initially storea(ixl * rscalev + low_boundv, weight_set_out + i); - vint scn = pack_low_bytes(weight); - store_nbytes(scn, quantized_weight_set + i); + pack_and_store_low_bytes(weight, quantized_weight_set + i); } } } diff --git a/3rdparty/astcenc/astcenc_mathlib.h b/3rdparty/astcenc/astcenc_mathlib.h index 562d6597f2..1d73bf1d23 100644 --- a/3rdparty/astcenc/astcenc_mathlib.h +++ b/3rdparty/astcenc/astcenc_mathlib.h @@ -58,8 +58,10 @@ #ifndef ASTCENC_AVX #if defined(__AVX2__) #define ASTCENC_AVX 2 + #define ASTCENC_X86_GATHERS 1 #elif defined(__AVX__) #define ASTCENC_AVX 1 + #define ASTCENC_X86_GATHERS 1 #else #define ASTCENC_AVX 0 #endif @@ -73,10 +75,25 @@ #endif #endif +#ifndef ASTCENC_SVE + #if defined(__ARM_FEATURE_SVE) + #if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256 + #define ASTCENC_SVE 8 + // Auto-detected SVE can only assume vector width of 4 is available, but + // must also allow for hardware being longer and so all use of intrinsics + // must explicitly use predicate masks to limit to 4-wide. + #else + #define ASTCENC_SVE 4 + #endif + #else + #define ASTCENC_SVE 0 + #endif +#endif + // Force vector-sized SIMD alignment -#if ASTCENC_AVX +#if ASTCENC_AVX || ASTCENC_SVE == 8 #define ASTCENC_VECALIGN 32 -#elif ASTCENC_SSE || ASTCENC_NEON +#elif ASTCENC_SSE || ASTCENC_NEON || ASTCENC_SVE == 4 #define ASTCENC_VECALIGN 16 // Use default alignment for non-SIMD builds #else diff --git a/3rdparty/astcenc/astcenc_pick_best_endpoint_format.cpp b/3rdparty/astcenc/astcenc_pick_best_endpoint_format.cpp index f25140d4c7..bf872a9249 100644 --- a/3rdparty/astcenc/astcenc_pick_best_endpoint_format.cpp +++ b/3rdparty/astcenc/astcenc_pick_best_endpoint_format.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2022 Arm Limited +// Copyright 2011-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. 
You may obtain a copy @@ -123,21 +123,21 @@ static void compute_error_squared_rgb_single_partition( vint lane_ids = vint::lane_id(); for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) { - vint tix(texel_indexes + i); + const uint8_t* tix = texel_indexes + i; vmask mask = lane_ids < vint(texel_count); lane_ids += vint(ASTCENC_SIMD_WIDTH); // Compute the error that arises from just ditching alpha - vfloat data_a = gatherf(blk.data_a, tix); + vfloat data_a = gatherf_byte_inds(blk.data_a, tix); vfloat alpha_diff = data_a - default_a; alpha_diff = alpha_diff * alpha_diff; haccumulate(a_drop_errv, alpha_diff, mask); - vfloat data_r = gatherf(blk.data_r, tix); - vfloat data_g = gatherf(blk.data_g, tix); - vfloat data_b = gatherf(blk.data_b, tix); + vfloat data_r = gatherf_byte_inds(blk.data_r, tix); + vfloat data_g = gatherf_byte_inds(blk.data_g, tix); + vfloat data_b = gatherf_byte_inds(blk.data_b, tix); // Compute uncorrelated error vfloat param = data_r * uncor_bs0 @@ -1306,8 +1306,8 @@ unsigned int compute_ideal_endpoint_formats( // Pick best mode from the SIMD result, using lowest matching index to ensure invariance vmask lanes_min_error = vbest_ep_error == hmin(vbest_ep_error); vbest_error_index = select(vint(0x7FFFFFFF), vbest_error_index, lanes_min_error); - vbest_error_index = hmin(vbest_error_index); - int best_error_index = vbest_error_index.lane<0>(); + + int best_error_index = hmin_s(vbest_error_index); best_error_weights[i] = best_error_index; diff --git a/3rdparty/astcenc/astcenc_vecmathlib.h b/3rdparty/astcenc/astcenc_vecmathlib.h index d48f1d73ea..e6ae97cc4b 100644 --- a/3rdparty/astcenc/astcenc_vecmathlib.h +++ b/3rdparty/astcenc/astcenc_vecmathlib.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2019-2022 Arm Limited +// Copyright 2019-2024 Arm Limited // Copyright 2008 Jose Fonseca // // Licensed under the Apache License, Version 2.0 (the "License"); you may not @@ -42,11 +42,12 @@ * * With the current implementation ISA support is provided for: * - * * 1-wide for scalar reference. - * * 4-wide for Armv8-A NEON. - * * 4-wide for x86-64 SSE2. - * * 4-wide for x86-64 SSE4.1. - * * 8-wide for x86-64 AVX2. + * * 1-wide for scalar reference + * * 4-wide for Armv8-A NEON + * * 4-wide for x86-64 SSE2 + * * 4-wide for x86-64 SSE4.1 + * * 8-wide for Armv8-A SVE + * * 8-wide for x86-64 AVX2 */ #ifndef ASTC_VECMATHLIB_H_INCLUDED @@ -54,7 +55,14 @@ #if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 #include -#elif ASTCENC_NEON != 0 +#endif + +#if ASTCENC_SVE != 0 + #include + #include +#endif + +#if ASTCENC_NEON != 0 #include #endif @@ -69,8 +77,10 @@ #define ASTCENC_NO_INLINE __attribute__ ((noinline)) #endif +template T gatherf_byte_inds(const float* base, const uint8_t* indices); + #if ASTCENC_AVX >= 2 - /* If we have AVX2 expose 8-wide VLA. */ + // If we have AVX2 expose 8-wide VLA. #include "astcenc_vecmathlib_sse_4.h" #include "astcenc_vecmathlib_common_4.h" #include "astcenc_vecmathlib_avx2_8.h" @@ -88,11 +98,15 @@ using vint = vint8; using vmask = vmask8; + using vtable_16x8 = vtable8_16x8; + using vtable_32x8 = vtable8_32x8; + using vtable_64x8 = vtable8_64x8; + constexpr auto loada = vfloat8::loada; constexpr auto load1 = vfloat8::load1; #elif ASTCENC_SSE >= 20 - /* If we have SSE expose 4-wide VLA, and 4-wide fixed width. */ + // If we have SSE expose 4-wide VLA, and 4-wide fixed width. 
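The gatherf_byte_inds declaration added above is a function template that each backend header specializes, so call sites gather floats directly through byte indices held in memory and select the vector type explicitly. A small usage sketch; the wrapper name is illustrative and it assumes at least ASTCENC_SIMD_WIDTH valid byte indices:

// Gather one batch of floats through 8-bit indices stored in memory.
static inline vfloat gather_batch(const float* base, const uint8_t* idx)
{
	// The explicit <vfloat> argument picks the specialization for the
	// configured ISA (scalar reference, SSE, AVX2, NEON or SVE).
	return gatherf_byte_inds<vfloat>(base, idx);
}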
#include "astcenc_vecmathlib_sse_4.h" #include "astcenc_vecmathlib_common_4.h" @@ -103,11 +117,46 @@ using vint = vint4; using vmask = vmask4; + using vtable_16x8 = vtable4_16x8; + using vtable_32x8 = vtable4_32x8; + using vtable_64x8 = vtable4_64x8; + constexpr auto loada = vfloat4::loada; constexpr auto load1 = vfloat4::load1; +#elif ASTCENC_SVE == 8 + // Check the compiler is configured with fixed-length 256-bit SVE. + #if !defined(__ARM_FEATURE_SVE_BITS) || (__ARM_FEATURE_SVE_BITS != 256) + #error "__ARM_FEATURE_SVE_BITS is not set to 256 bits" + #endif + + // If we have SVE configured as 8-wide, expose 8-wide VLA. + #include "astcenc_vecmathlib_neon_4.h" + #include "astcenc_vecmathlib_common_4.h" + #include "astcenc_vecmathlib_sve_8.h" + + #define ASTCENC_SIMD_WIDTH 8 + + using vfloat = vfloat8; + + #if defined(ASTCENC_NO_INVARIANCE) + using vfloatacc = vfloat8; + #else + using vfloatacc = vfloat4; + #endif + + using vint = vint8; + using vmask = vmask8; + + using vtable_16x8 = vtable8_16x8; + using vtable_32x8 = vtable8_32x8; + using vtable_64x8 = vtable8_64x8; + + constexpr auto loada = vfloat8::loada; + constexpr auto load1 = vfloat8::load1; + #elif ASTCENC_NEON > 0 - /* If we have NEON expose 4-wide VLA. */ + // If we have NEON expose 4-wide VLA. #include "astcenc_vecmathlib_neon_4.h" #include "astcenc_vecmathlib_common_4.h" @@ -118,6 +167,10 @@ using vint = vint4; using vmask = vmask4; + using vtable_16x8 = vtable4_16x8; + using vtable_32x8 = vtable4_32x8; + using vtable_64x8 = vtable4_64x8; + constexpr auto loada = vfloat4::loada; constexpr auto load1 = vfloat4::load1; @@ -150,6 +203,10 @@ using vint = vint4; using vmask = vmask4; + using vtable_16x8 = vtable4_16x8; + using vtable_32x8 = vtable4_32x8; + using vtable_64x8 = vtable4_64x8; + constexpr auto loada = vfloat4::loada; constexpr auto load1 = vfloat4::load1; #endif @@ -239,8 +296,8 @@ ASTCENC_SIMD_INLINE vfloat atan(vfloat x) ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x) { vfloat z = atan(abs(y / x)); - vmask xmask = vmask(float_as_int(x).m); - return change_sign(select_msb(z, vfloat(astc::PI) - z, xmask), y); + vmask xmask = x < vfloat::zero(); + return change_sign(select(z, vfloat(astc::PI) - z, xmask), y); } /* diff --git a/3rdparty/astcenc/astcenc_vecmathlib_avx2_8.h b/3rdparty/astcenc/astcenc_vecmathlib_avx2_8.h index 3ca25e35e1..b400b313ba 100644 --- a/3rdparty/astcenc/astcenc_vecmathlib_avx2_8.h +++ b/3rdparty/astcenc/astcenc_vecmathlib_avx2_8.h @@ -54,7 +54,7 @@ struct vfloat8 ASTCENC_SIMD_INLINE vfloat8() = default; /** - * @brief Construct from 4 values loaded from an unaligned address. + * @brief Construct from 8 values loaded from an unaligned address. * * Consider using loada() which is better with vectors if data is aligned * to vector length. @@ -74,18 +74,6 @@ struct vfloat8 m = _mm256_set1_ps(a); } - /** - * @brief Construct from 8 scalar values. - * - * The value of @c a is stored to lane 0 (LSB) in the SIMD register. - */ - ASTCENC_SIMD_INLINE explicit vfloat8( - float a, float b, float c, float d, - float e, float f, float g, float h) - { - m = _mm256_set_ps(h, g, f, e, d, c, b, a); - } - /** * @brief Construct from an existing SIMD register. */ @@ -94,20 +82,6 @@ struct vfloat8 m = a; } - /** - * @brief Get the scalar value of a single lane. 
- */ - template ASTCENC_SIMD_INLINE float lane() const - { - #if !defined(__clang__) && defined(_MSC_VER) - return m.m256_f32[l]; - #else - union { __m256 m; float f[8]; } cvt; - cvt.m = m; - return cvt.f[l]; - #endif - } - /** * @brief Factory that returns a vector of zeros. */ @@ -132,14 +106,6 @@ struct vfloat8 return vfloat8(_mm256_load_ps(p)); } - /** - * @brief Factory that returns a vector containing the lane IDs. - */ - static ASTCENC_SIMD_INLINE vfloat8 lane_id() - { - return vfloat8(_mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0)); - } - /** * @brief The vector ... */ @@ -183,25 +149,13 @@ struct vint8 /** * @brief Construct from 1 scalar value replicated across all lanes. * - * Consider using vfloat4::zero() for constexpr zeros. + * Consider using zero() for constexpr zeros. */ ASTCENC_SIMD_INLINE explicit vint8(int a) { m = _mm256_set1_epi32(a); } - /** - * @brief Construct from 8 scalar values. - * - * The value of @c a is stored to lane 0 (LSB) in the SIMD register. - */ - ASTCENC_SIMD_INLINE explicit vint8( - int a, int b, int c, int d, - int e, int f, int g, int h) - { - m = _mm256_set_epi32(h, g, f, e, d, c, b, a); - } - /** * @brief Construct from an existing SIMD register. */ @@ -210,20 +164,6 @@ struct vint8 m = a; } - /** - * @brief Get the scalar from a single lane. - */ - template ASTCENC_SIMD_INLINE int lane() const - { - #if !defined(__clang__) && defined(_MSC_VER) - return m.m256i_i32[l]; - #else - union { __m256i m; int f[8]; } cvt; - cvt.m = m; - return cvt.f[l]; - #endif - } - /** * @brief Factory that returns a vector of zeros. */ @@ -518,31 +458,45 @@ ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b) */ ASTCENC_SIMD_INLINE vint8 hmin(vint8 a) { - __m128i m = _mm_min_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1)); - m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2))); - m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1))); - m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0)); + // Build min within groups of 2, then 4, then 8 + __m256i m = _mm256_min_epi32(a.m, _mm256_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1))); + m = _mm256_min_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2))); + m = _mm256_min_epi32(m, _mm256_permute2x128_si256(m, m, 0x01)); - __m256i r = astcenc_mm256_set_m128i(m, m); - vint8 vmin(r); + vint8 vmin(m); return vmin; } +/** + * @brief Return the horizontal minimum of a vector. + */ +ASTCENC_SIMD_INLINE int hmin_s(vint8 a) +{ + return _mm256_cvtsi256_si32(hmin(a).m); +} + /** * @brief Return the horizontal maximum of a vector. */ ASTCENC_SIMD_INLINE vint8 hmax(vint8 a) { - __m128i m = _mm_max_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1)); - m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2))); - m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1))); - m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0)); + // Build max within groups of 2, then 4, then 8 + __m256i m = _mm256_max_epi32(a.m, _mm256_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1))); + m = _mm256_max_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2))); + m = _mm256_max_epi32(m, _mm256_permute2x128_si256(m, m, 0x01)); - __m256i r = astcenc_mm256_set_m128i(m, m); - vint8 vmax(r); + vint8 vmax(m); return vmax; } +/** + * @brief Return the horizontal maximum of a vector. + */ +ASTCENC_SIMD_INLINE int hmax_s(vint8 a) +{ + return _mm256_cvtsi256_si32(hmax(a).m); +} + /** * @brief Store a vector to a 16B aligned memory address. 
*/ @@ -570,18 +524,10 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p) _mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0)); } -/** - * @brief Gather N (vector width) indices from the array. - */ -ASTCENC_SIMD_INLINE vint8 gatheri(const int* base, vint8 indices) -{ - return vint8(_mm256_i32gather_epi32(base, indices.m, 4)); -} - /** * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector. */ -ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v) +ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint8 v, uint8_t* p) { __m256i shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, @@ -593,7 +539,8 @@ ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v) __m128i b = _mm_unpacklo_epi32(a0, a1); __m256i r = astcenc_mm256_set_m128i(b, b); - return vint8(r); + + store_nbytes(vint8(r), p); } /** @@ -606,7 +553,7 @@ ASTCENC_SIMD_INLINE vint8 select(vint8 a, vint8 b, vmask8 cond) } // ============================================================================ -// vfloat4 operators and functions +// vfloat8 operators and functions // ============================================================================ /** @@ -674,7 +621,6 @@ ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, float b) return vfloat8(_mm256_div_ps(a.m, _mm256_set1_ps(b))); } - /** * @brief Overload: scalar by vector division. */ @@ -683,7 +629,6 @@ ASTCENC_SIMD_INLINE vfloat8 operator/(float a, vfloat8 b) return vfloat8(_mm256_div_ps(_mm256_set1_ps(a), b.m)); } - /** * @brief Overload: vector by vector equality. */ @@ -786,19 +731,6 @@ ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a) return a; } -/** - * @brief Return a clamped value between 0.0f and max. - * - * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will - * be returned for that lane. - */ -ASTCENC_SIMD_INLINE vfloat8 clampz(float max, vfloat8 a) -{ - a.m = _mm256_max_ps(a.m, _mm256_setzero_ps()); - a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max)); - return a; -} - /** * @brief Return a clamped value between 0.0f and 1.0f. * @@ -857,7 +789,7 @@ ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a) */ ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a) { - return hmin(a).lane<0>(); + return _mm256_cvtss_f32(hmin(a).m); } /** @@ -887,7 +819,7 @@ ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a) */ ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a) { - return hmax(a).lane<0>(); + return _mm256_cvtss_f32(hmax(a).m); } /** @@ -909,14 +841,6 @@ ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond) return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m)); } -/** - * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. - */ -ASTCENC_SIMD_INLINE vfloat8 select_msb(vfloat8 a, vfloat8 b, vmask8 cond) -{ - return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m)); -} - /** * @brief Accumulate lane-wise sums for a vector, folded 4-wide. * @@ -979,6 +903,33 @@ ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices) return vfloat8(_mm256_i32gather_ps(base, indices.m, 4)); } +/** + * @brief Load a vector of gathered results from an array using byte indices from memory + */ +template<> +ASTCENC_SIMD_INLINE vfloat8 gatherf_byte_inds(const float* base, const uint8_t* indices) +{ +#if ASTCENC_X86_GATHERS == 0 + // Perform manual gather using scalar loads in two separate dependency chains, + // then merge late. MSVC translates this 1:1, which is OK. 
Clang turns it + // into a bunch of memory-operand inserts on 128-bit halves then merges late, + // which performs significantly worse in tests. + __m256 m0 = _mm256_broadcast_ss(base + indices[0]); + __m256 m1 = _mm256_broadcast_ss(base + indices[1]); + m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[2]), 1 << 2); + m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[3]), 1 << 3); + m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[4]), 1 << 4); + m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[5]), 1 << 5); + m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[6]), 1 << 6); + m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[7]), 1 << 7); + + return vfloat8(_mm256_blend_ps(m0, m1, 0xaa)); +#else + vint8 inds(indices); + return gatherf(base, inds); +#endif +} + /** * @brief Store a vector to an unaligned memory address. */ @@ -1045,98 +996,140 @@ ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a) return vfloat8(_mm256_castsi256_ps(a.m)); } -/** - * @brief Prepare a vtable lookup table for use with the native SIMD size. +/* + * Table structure for a 16x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p) -{ - // AVX2 duplicates the table within each 128-bit lane - __m128i t0n = t0.m; - t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n)); -} +struct vtable8_16x8 { + vint8 t0; +}; -/** - * @brief Prepare a vtable lookup table for use with the native SIMD size. +/* + * Table structure for a 32x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p) -{ - // AVX2 duplicates the table within each 128-bit lane - __m128i t0n = t0.m; - t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n)); +struct vtable8_32x8 { + vint8 t0; + vint8 t1; +}; - __m128i t1n = _mm_xor_si128(t0.m, t1.m); - t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n)); -} +/* + * Table structure for a 64x 8-bit entry table. + */ +struct vtable8_64x8 { + vint8 t0; + vint8 t1; + vint8 t2; + vint8 t3; +}; /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 16x 8-bit entry table. */ ASTCENC_SIMD_INLINE void vtable_prepare( - vint4 t0, vint4 t1, vint4 t2, vint4 t3, - vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p) -{ - // AVX2 duplicates the table within each 128-bit lane - __m128i t0n = t0.m; - t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n)); + vtable8_16x8& table, + const uint8_t* data +) { + // AVX2 tables duplicate table entries in each 128-bit half-register + vint4 d0 = vint4::load(data); - __m128i t1n = _mm_xor_si128(t0.m, t1.m); - t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n)); - - __m128i t2n = _mm_xor_si128(t1.m, t2.m); - t2p = vint8(astcenc_mm256_set_m128i(t2n, t2n)); - - __m128i t3n = _mm_xor_si128(t2.m, t3.m); - t3p = vint8(astcenc_mm256_set_m128i(t3n, t3n)); + table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m)); } /** - * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. + * @brief Prepare a vtable lookup table for 32x 8-bit entry table. 
*/ -ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx) -{ +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable8_32x8& table, + const uint8_t* data +) { + // AVX2 tables duplicate table entries in each 128-bit half-register + vint4 d0 = vint4::load(data); + vint4 d1 = vint4::load(data + 16); + + table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m)); + table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m)); + + // XOR chain the high rows to allow table emulation + table.t1 = table.t1 ^ table.t0; +} + +/** + * @brief Prepare a vtable lookup table 64x 8-bit entry table. + */ +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable8_64x8& table, + const uint8_t* data +) { + // AVX2 tables duplicate table entries in each 128-bit half-register + vint4 d0 = vint4::load(data); + vint4 d1 = vint4::load(data + 16); + vint4 d2 = vint4::load(data + 32); + vint4 d3 = vint4::load(data + 48); + + table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m)); + table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m)); + table.t2 = vint8(astcenc_mm256_set_m128i(d2.m, d2.m)); + table.t3 = vint8(astcenc_mm256_set_m128i(d3.m, d3.m)); + + // XOR chain the high rows to allow table emulation + table.t3 = table.t3 ^ table.t2; + table.t2 = table.t2 ^ table.t1; + table.t1 = table.t1 ^ table.t0; +} + +/** + * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. + */ +ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( + const vtable8_16x8& tbl, + vint8 idx +) { // Set index byte MSB to 1 for unused bytes so shuffle returns zero __m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast(0xFFFFFF00))); - __m256i result = _mm256_shuffle_epi8(t0.m, idxx); + __m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx); return vint8(result); } /** - * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx) -{ +ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( + const vtable8_32x8& tbl, + vint8 idx +) { // Set index byte MSB to 1 for unused bytes so shuffle returns zero __m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast(0xFFFFFF00))); - __m256i result = _mm256_shuffle_epi8(t0.m, idxx); + __m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx); idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16)); - __m256i result2 = _mm256_shuffle_epi8(t1.m, idxx); + __m256i result2 = _mm256_shuffle_epi8(tbl.t1.m, idxx); result = _mm256_xor_si256(result, result2); return vint8(result); } /** - * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. 
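The XOR chaining in the prepare/lookup pair above relies on two properties of _mm256_shuffle_epi8: an index byte with its MSB set yields zero, and an in-range index uses only its low four bits within each 128-bit half. A scalar model of the 32-entry case, purely illustrative and separate from the SIMD code:

#include <cstdint>

// Scalar stand-in for one byte of _mm256_shuffle_epi8: MSB-set indices give 0,
// in-range indices select using only their low 4 bits.
static uint8_t shuffle_byte(const uint8_t* row, int idx)
{
	return (idx & 0x80) ? 0 : row[idx & 0xF];
}

// Emulated 32-entry lookup: row 1 is stored pre-XORed with row 0, exactly as
// the vtable8_32x8 vtable_prepare() above does.
static uint8_t lookup32_model(const uint8_t* table, int i)   // 0 <= i < 32
{
	uint8_t row0[16], row1x[16];
	for (int j = 0; j < 16; j++)
	{
		row0[j]  = table[j];
		row1x[j] = table[16 + j] ^ table[j];
	}

	uint8_t r = shuffle_byte(row0, i);       // t0[i & 15]
	r ^= shuffle_byte(row1x, i - 16);        // 0 if i < 16, else t1[i-16] ^ t0[i-16]
	return r;                                // equals table[i] in both cases
}

The 64-entry variant applies the same identity two more times, subtracting 16 from the index before each additional row, which is why rows t2 and t3 are also XOR-chained during preparation.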
*/ -ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx) -{ +ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( + const vtable8_64x8& tbl, + vint8 idx +) { // Set index byte MSB to 1 for unused bytes so shuffle returns zero __m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast(0xFFFFFF00))); - __m256i result = _mm256_shuffle_epi8(t0.m, idxx); + __m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx); idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16)); - __m256i result2 = _mm256_shuffle_epi8(t1.m, idxx); + __m256i result2 = _mm256_shuffle_epi8(tbl.t1.m, idxx); result = _mm256_xor_si256(result, result2); idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16)); - result2 = _mm256_shuffle_epi8(t2.m, idxx); + result2 = _mm256_shuffle_epi8(tbl.t2.m, idxx); result = _mm256_xor_si256(result, result2); idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16)); - result2 = _mm256_shuffle_epi8(t3.m, idxx); + result2 = _mm256_shuffle_epi8(tbl.t3.m, idxx); result = _mm256_xor_si256(result, result2); return vint8(result); @@ -1146,7 +1139,7 @@ ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3 * @brief Return a vector of interleaved RGBA data. * * Input vectors have the value stored in the bottom 8 bits of each lane, - * with high bits set to zero. + * with high bits set to zero. * * Output vector stores a single RGBA texel packed in each lane. */ diff --git a/3rdparty/astcenc/astcenc_vecmathlib_common_4.h b/3rdparty/astcenc/astcenc_vecmathlib_common_4.h index 1e04367c1f..db4f13a6cd 100644 --- a/3rdparty/astcenc/astcenc_vecmathlib_common_4.h +++ b/3rdparty/astcenc/astcenc_vecmathlib_common_4.h @@ -32,26 +32,6 @@ #include -// ============================================================================ -// vmask4 operators and functions -// ============================================================================ - -/** - * @brief True if any lanes are enabled, false otherwise. - */ -ASTCENC_SIMD_INLINE bool any(vmask4 a) -{ - return mask(a) != 0; -} - -/** - * @brief True if all lanes are enabled, false otherwise. - */ -ASTCENC_SIMD_INLINE bool all(vmask4 a) -{ - return mask(a) == 0xF; -} - // ============================================================================ // vint4 operators and functions // ============================================================================ @@ -129,6 +109,22 @@ ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a) return a.lane<0>() + a.lane<1>() + a.lane<2>(); } +/** + * @brief Return the horizontal minimum of a vector. + */ +ASTCENC_SIMD_INLINE int hmin_s(vint4 a) +{ + return hmin(a).lane<0>(); +} + +/** + * @brief Return the horizontal maximum of a vector. + */ +ASTCENC_SIMD_INLINE int hmax_s(vint4 a) +{ + return hmax(a).lane<0>(); +} + // ============================================================================ // vfloat4 operators and functions // ============================================================================ @@ -222,18 +218,6 @@ ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a) return min(max(a, minv), maxv); } -/** - * @brief Return the clamped value between 0.0f and max. - * - * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will - * be returned for that lane. - */ -ASTCENC_SIMD_INLINE vfloat4 clampz(float maxv, vfloat4 a) -{ - // Do not reorder - second operand will return if either is NaN - return min(max(a, vfloat4::zero()), maxv); -} - /** * @brief Return the clamped value between 0.0f and 1.0f. 
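The hmin_s/hmax_s helpers added here mirror the vint8 versions earlier in the diff, so callers can drop the reduce-then-extract idiom. A trivial sketch of the updated caller pattern, with an illustrative wrapper name:

// Reduce a vector of per-lane counts straight to a scalar maximum.
static inline int reduce_max(vint counts)
{
	// Previously written as: hmax(counts).lane<0>()
	return hmax_s(counts);
}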
* diff --git a/3rdparty/astcenc/astcenc_vecmathlib_neon_4.h b/3rdparty/astcenc/astcenc_vecmathlib_neon_4.h index 42545e7562..7cfd0a2f6d 100644 --- a/3rdparty/astcenc/astcenc_vecmathlib_neon_4.h +++ b/3rdparty/astcenc/astcenc_vecmathlib_neon_4.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2019-2023 Arm Limited +// Copyright 2019-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -115,7 +115,7 @@ struct vfloat4 */ static ASTCENC_SIMD_INLINE vfloat4 zero() { - return vfloat4(vdupq_n_f32(0.0f)); + return vfloat4(0.0f); } /** @@ -134,15 +134,6 @@ struct vfloat4 return vfloat4(vld1q_f32(p)); } - /** - * @brief Factory that returns a vector containing the lane IDs. - */ - static ASTCENC_SIMD_INLINE vfloat4 lane_id() - { - alignas(16) float data[4] { 0.0f, 1.0f, 2.0f, 3.0f }; - return vfloat4(vld1q_f32(data)); - } - /** * @brief Return a swizzled float 2. */ @@ -203,16 +194,21 @@ struct vint4 */ ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p) { - // Cast is safe - NEON loads are allowed to be unaligned - uint32x2_t t8 = vld1_dup_u32(reinterpret_cast(p)); - uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8))); - m = vreinterpretq_s32_u32(vmovl_u16(t16)); +#if ASTCENC_SVE == 0 + // Cast is safe - NEON loads are allowed to be unaligned + uint32x2_t t8 = vld1_dup_u32(reinterpret_cast(p)); + uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8))); + m = vreinterpretq_s32_u32(vmovl_u16(t16)); +#else + svint32_t data = svld1ub_s32(svptrue_pat_b32(SV_VL4), p); + m = svget_neonq(data); +#endif } /** * @brief Construct from 1 scalar value replicated across all lanes. * - * Consider using vfloat4::zero() for constexpr zeros. + * Consider using zero() for constexpr zeros. */ ASTCENC_SIMD_INLINE explicit vint4(int a) { @@ -420,6 +416,22 @@ ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a) return vaddvq_u32(vshlq_u32(tmp, shift)); } +/** + * @brief True if any lanes are enabled, false otherwise. + */ +ASTCENC_SIMD_INLINE bool any(vmask4 a) +{ + return vmaxvq_u32(a.m) != 0; +} + +/** + * @brief True if all lanes are enabled, false otherwise. + */ +ASTCENC_SIMD_INLINE bool all(vmask4 a) +{ + return vminvq_u32(a.m) != 0; +} + // ============================================================================ // vint4 operators and functions // ============================================================================ @@ -612,31 +624,17 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p) } /** - * @brief Gather N (vector width) indices from the array. + * @brief Pack and store low 8 bits of each vector lane. */ -ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices) -{ - alignas(16) int idx[4]; - storea(indices, idx); - alignas(16) int vals[4]; - vals[0] = base[idx[0]]; - vals[1] = base[idx[1]]; - vals[2] = base[idx[2]]; - vals[3] = base[idx[3]]; - return vint4(vals); -} - -/** - * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector. 
- */ -ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a) +ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* data) { alignas(16) uint8_t shuf[16] { 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; uint8x16_t idx = vld1q_u8(shuf); int8x16_t av = vreinterpretq_s8_s32(a.m); - return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx))); + a = vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx))); + store_nbytes(a, data); } /** @@ -814,21 +812,12 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond) return vfloat4(vbslq_f32(cond.m, b.m, a.m)); } -/** - * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. - */ -ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond) -{ - static const uint32x4_t msb = vdupq_n_u32(0x80000000u); - uint32x4_t mask = vcgeq_u32(cond.m, msb); - return vfloat4(vbslq_f32(mask, b.m, a.m)); -} - /** * @brief Load a vector of gathered results from an array; */ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices) { +#if ASTCENC_SVE == 0 alignas(16) int idx[4]; storea(indices, idx); alignas(16) float vals[4]; @@ -837,8 +826,32 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices) vals[2] = base[idx[2]]; vals[3] = base[idx[3]]; return vfloat4(vals); +#else + svint32_t offsets = svset_neonq_s32(svundef_s32(), indices.m); + svfloat32_t data = svld1_gather_s32index_f32(svptrue_pat_b32(SV_VL4), base, offsets); + return vfloat4(svget_neonq_f32(data)); +#endif } +/** + * @brief Load a vector of gathered results from an array using byte indices from memory + */ +template<> +ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds(const float* base, const uint8_t* indices) +{ +#if ASTCENC_SVE == 0 + alignas(16) float vals[4]; + vals[0] = base[indices[0]]; + vals[1] = base[indices[1]]; + vals[2] = base[indices[2]]; + vals[3] = base[indices[3]]; + return vfloat4(vals); +#else + svint32_t offsets = svld1ub_s32(svptrue_pat_b32(SV_VL4), indices); + svfloat32_t data = svld1_gather_s32index_f32(svptrue_pat_b32(SV_VL4), base, offsets); + return vfloat4(svget_neonq_f32(data)); +#endif +} /** * @brief Store a vector to an unaligned memory address. */ @@ -950,87 +963,105 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v) return vfloat4(vreinterpretq_f32_s32(v.m)); } -/** - * @brief Prepare a vtable lookup table for use with the native SIMD size. +/* + * Table structure for a 16x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p) -{ - t0p = t0; -} +struct vtable4_16x8 { + uint8x16_t t0; +}; - -/** - * @brief Prepare a vtable lookup table for use with the native SIMD size. +/* + * Table structure for a 32x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p) -{ - t0p = t0; - t1p = t1; -} +struct vtable4_32x8 { + uint8x16x2_t t01; +}; + +/* + * Table structure for a 64x 8-bit entry table. + */ +struct vtable4_64x8 { + uint8x16x4_t t0123; +}; /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 16x 8-bit entry table. */ ASTCENC_SIMD_INLINE void vtable_prepare( - vint4 t0, vint4 t1, vint4 t2, vint4 t3, - vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p) -{ - t0p = t0; - t1p = t1; - t2p = t2; - t3p = t3; + vtable4_16x8& table, + const uint8_t* data +) { + table.t0 = vld1q_u8(data); } /** - * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. + * @brief Prepare a vtable lookup table for 32x 8-bit entry table. 
*/ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) -{ - int8x16_t table { - vreinterpretq_s8_s32(t0.m) +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_32x8& table, + const uint8_t* data +) { + table.t01 = uint8x16x2_t { + vld1q_u8(data), + vld1q_u8(data + 16) }; +} +/** + * @brief Prepare a vtable lookup table 64x 8-bit entry table. + */ +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_64x8& table, + const uint8_t* data +) { + table.t0123 = uint8x16x4_t { + vld1q_u8(data), + vld1q_u8(data + 16), + vld1q_u8(data + 32), + vld1q_u8(data + 48) + }; +} + +/** + * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. + */ +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( + const vtable4_16x8& tbl, + vint4 idx +) { // Set index byte above max index for unused bytes so table lookup returns zero int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(table, idx_bytes))); + return vint4(vreinterpretq_s32_u8(vqtbl1q_u8(tbl.t0, idx_bytes))); } /** - * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) -{ - int8x16x2_t table { - vreinterpretq_s8_s32(t0.m), - vreinterpretq_s8_s32(t1.m) - }; - +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( + const vtable4_32x8& tbl, + vint4 idx +) { // Set index byte above max index for unused bytes so table lookup returns zero int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(table, idx_bytes))); + return vint4(vreinterpretq_s32_u8(vqtbl2q_u8(tbl.t01, idx_bytes))); } /** - * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx) -{ - int8x16x4_t table { - vreinterpretq_s8_s32(t0.m), - vreinterpretq_s8_s32(t1.m), - vreinterpretq_s8_s32(t2.m), - vreinterpretq_s8_s32(t3.m) - }; - +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( + const vtable4_64x8& tbl, + vint4 idx +) { // Set index byte above max index for unused bytes so table lookup returns zero int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(table, idx_bytes))); + return vint4(vreinterpretq_s32_u8(vqtbl4q_u8(tbl.t0123, idx_bytes))); } /** diff --git a/3rdparty/astcenc/astcenc_vecmathlib_none_4.h b/3rdparty/astcenc/astcenc_vecmathlib_none_4.h index be7348eff1..862f592a42 100644 --- a/3rdparty/astcenc/astcenc_vecmathlib_none_4.h +++ b/3rdparty/astcenc/astcenc_vecmathlib_none_4.h @@ -139,14 +139,6 @@ struct vfloat4 return vfloat4(p); } - /** - * @brief Factory that returns a vector containing the lane IDs. - */ - static ASTCENC_SIMD_INLINE vfloat4 lane_id() - { - return vfloat4(0.0f, 1.0f, 2.0f, 3.0f); - } - /** * @brief Return a swizzled float 2. */ @@ -233,7 +225,7 @@ struct vint4 /** * @brief Construct from 4 scalar values replicated across all lanes. * - * Consider using vint4::zero() for constexpr zeros. + * Consider using zero() for constexpr zeros. 
*/ ASTCENC_SIMD_INLINE explicit vint4(int a) { @@ -354,7 +346,7 @@ struct vmask4 /** * @brief Get the scalar value of a single lane. */ - template ASTCENC_SIMD_INLINE float lane() const + template ASTCENC_SIMD_INLINE bool lane() const { return m[l] != 0; } @@ -426,6 +418,22 @@ ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a) ((a.m[3] >> 28) & 0x8); } +/** + * @brief True if any lanes are enabled, false otherwise. + */ +ASTCENC_SIMD_INLINE bool any(vmask4 a) +{ + return mask(a) != 0; +} + +/** + * @brief True if all lanes are enabled, false otherwise. + */ +ASTCENC_SIMD_INLINE bool all(vmask4 a) +{ + return mask(a) == 0xF; +} + // ============================================================================ // vint4 operators and functions // ============================================================================ @@ -684,21 +692,10 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p) std::memcpy(p, a.m, sizeof(uint8_t) * 4); } -/** - * @brief Gather N (vector width) indices from the array. - */ -ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices) -{ - return vint4(base[indices.m[0]], - base[indices.m[1]], - base[indices.m[2]], - base[indices.m[3]]); -} - /** * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector. */ -ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a) +ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* p) { int b0 = a.m[0] & 0xFF; int b1 = a.m[1] & 0xFF; @@ -706,7 +703,8 @@ ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a) int b3 = a.m[3] & 0xFF; int b = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24); - return vint4(b, 0, 0, 0); + a = vint4(b, 0, 0, 0); + store_nbytes(a, p); } /** @@ -934,17 +932,6 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond) (cond.m[3] & static_cast(0x80000000)) ? b.m[3] : a.m[3]); } -/** - * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. - */ -ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond) -{ - return vfloat4((cond.m[0] & static_cast(0x80000000)) ? b.m[0] : a.m[0], - (cond.m[1] & static_cast(0x80000000)) ? b.m[1] : a.m[1], - (cond.m[2] & static_cast(0x80000000)) ? b.m[2] : a.m[2], - (cond.m[3] & static_cast(0x80000000)) ? b.m[3] : a.m[3]); -} - /** * @brief Load a vector of gathered results from an array; */ @@ -956,6 +943,18 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices) base[indices.m[3]]); } +/** + * @brief Load a vector of gathered results from an array using byte indices from memory + */ +template<> +ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds(const float* base, const uint8_t* indices) +{ + return vfloat4(base[indices[0]], + base[indices[1]], + base[indices[2]], + base[indices[3]]); +} + /** * @brief Store a vector to an unaligned memory address. */ @@ -1080,84 +1079,94 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 a) return r; } -/** - * @brief Prepare a vtable lookup table for use with the native SIMD size. +/* + * Table structure for a 16x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p) -{ - t0p = t0; -} +struct vtable4_16x8 { + const uint8_t* data; +}; + +/* + * Table structure for a 32x 8-bit entry table. + */ +struct vtable4_32x8 { + const uint8_t* data; +}; + +/* + * Table structure for a 64x 8-bit entry table. + */ +struct vtable4_64x8 { + const uint8_t* data; +}; /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. 
- */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p) -{ - t0p = t0; - t1p = t1; -} - -/** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 16x 8-bit entry table. */ ASTCENC_SIMD_INLINE void vtable_prepare( - vint4 t0, vint4 t1, vint4 t2, vint4 t3, - vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p) -{ - t0p = t0; - t1p = t1; - t2p = t2; - t3p = t3; + vtable4_16x8& table, + const uint8_t* data +) { + table.data = data; } /** - * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. + * @brief Prepare a vtable lookup table for 32x 8-bit entry table. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) -{ - uint8_t table[16]; - - std::memcpy(table + 0, t0.m, 4 * sizeof(int)); - - return vint4(table[idx.lane<0>()], - table[idx.lane<1>()], - table[idx.lane<2>()], - table[idx.lane<3>()]); -} - - -/** - * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. - */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) -{ - uint8_t table[32]; - - std::memcpy(table + 0, t0.m, 4 * sizeof(int)); - std::memcpy(table + 16, t1.m, 4 * sizeof(int)); - - return vint4(table[idx.lane<0>()], - table[idx.lane<1>()], - table[idx.lane<2>()], - table[idx.lane<3>()]); +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_32x8& table, + const uint8_t* data +) { + table.data = data; } /** - * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes. + * @brief Prepare a vtable lookup table 64x 8-bit entry table. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx) -{ - uint8_t table[64]; +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_64x8& table, + const uint8_t* data +) { + table.data = data; +} - std::memcpy(table + 0, t0.m, 4 * sizeof(int)); - std::memcpy(table + 16, t1.m, 4 * sizeof(int)); - std::memcpy(table + 32, t2.m, 4 * sizeof(int)); - std::memcpy(table + 48, t3.m, 4 * sizeof(int)); +/** + * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. + */ +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( + const vtable4_16x8& table, + vint4 idx +) { + return vint4(table.data[idx.lane<0>()], + table.data[idx.lane<1>()], + table.data[idx.lane<2>()], + table.data[idx.lane<3>()]); +} - return vint4(table[idx.lane<0>()], - table[idx.lane<1>()], - table[idx.lane<2>()], - table[idx.lane<3>()]); +/** + * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. + */ +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( + const vtable4_32x8& table, + vint4 idx +) { + return vint4(table.data[idx.lane<0>()], + table.data[idx.lane<1>()], + table.data[idx.lane<2>()], + table.data[idx.lane<3>()]); +} + +/** + * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. 
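For reference, a usage sketch of the reworked vtable API (prepare once from a flat byte array, then look up repeatedly); the table name and contents here are illustrative, not taken from the diff.

alignas(16) static const uint8_t unquant_table[32] = { /* 32 entries */ };

vtable4_32x8 table;
vtable_prepare(table, unquant_table);

vint4 idx(0, 7, 16, 31);                       // 32-bit indices in [0, 31]
vint4 vals = vtable_lookup_32bit(table, idx);  // per-lane unquant_table[idx]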
+ */ +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( + const vtable4_64x8& table, + vint4 idx +) { + return vint4(table.data[idx.lane<0>()], + table.data[idx.lane<1>()], + table.data[idx.lane<2>()], + table.data[idx.lane<3>()]); } /** diff --git a/3rdparty/astcenc/astcenc_vecmathlib_sse_4.h b/3rdparty/astcenc/astcenc_vecmathlib_sse_4.h index b69655f904..938ead66eb 100644 --- a/3rdparty/astcenc/astcenc_vecmathlib_sse_4.h +++ b/3rdparty/astcenc/astcenc_vecmathlib_sse_4.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2019-2023 Arm Limited +// Copyright 2019-2024 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -142,14 +142,6 @@ struct vfloat4 return vfloat4(_mm_load_ps(p)); } - /** - * @brief Factory that returns a vector containing the lane IDs. - */ - static ASTCENC_SIMD_INLINE vfloat4 lane_id() - { - return vfloat4(_mm_set_ps(3, 2, 1, 0)); - } - /** * @brief Return a swizzled float 2. */ @@ -229,7 +221,7 @@ struct vint4 /** * @brief Construct from 1 scalar value replicated across all lanes. * - * Consider using vfloat4::zero() for constexpr zeros. + * Consider using zero() for constexpr zeros. */ ASTCENC_SIMD_INLINE explicit vint4(int a) { @@ -436,6 +428,22 @@ ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a) return static_cast(_mm_movemask_ps(a.m)); } +/** + * @brief True if any lanes are enabled, false otherwise. + */ +ASTCENC_SIMD_INLINE bool any(vmask4 a) +{ + return mask(a) != 0; +} + +/** + * @brief True if all lanes are enabled, false otherwise. + */ +ASTCENC_SIMD_INLINE bool all(vmask4 a) +{ + return mask(a) == 0xF; +} + // ============================================================================ // vint4 operators and functions // ============================================================================ @@ -598,9 +606,9 @@ ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b) */ ASTCENC_SIMD_INLINE vint4 hmin(vint4 a) { - a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 3, 2)))); - a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 1)))); - return vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 0))); + a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1)))); + a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(1, 0, 3, 2)))); + return a; } /* @@ -608,9 +616,9 @@ ASTCENC_SIMD_INLINE vint4 hmin(vint4 a) */ ASTCENC_SIMD_INLINE vint4 hmax(vint4 a) { - a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 3, 2)))); - a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 1)))); - return vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 0))); + a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1)))); + a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(1, 0, 3, 2)))); + return a; } /** @@ -663,32 +671,20 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p) _mm_store_ss(reinterpret_cast(p), _mm_castsi128_ps(a.m)); } -/** - * @brief Gather N (vector width) indices from the array. - */ -ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices) -{ -#if ASTCENC_AVX >= 2 - return vint4(_mm_i32gather_epi32(base, indices.m, 4)); -#else - alignas(16) int idx[4]; - storea(indices, idx); - return vint4(base[idx[0]], base[idx[1]], base[idx[2]], base[idx[3]]); -#endif -} - /** * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector. 
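The revised SSE hmin()/hmax() drop the final broadcast shuffle: after the first min each lane holds the minimum of its pair, and after the second every lane already holds the overall minimum. A hypothetical scalar model of the new shuffle pattern:

#include <algorithm>

static inline void hmin4_ref(int v[4])
{
    const int s1[4] = { v[1], v[0], v[3], v[2] };          // _MM_SHUFFLE(2, 3, 0, 1)
    for (int i = 0; i < 4; i++) v[i] = std::min(v[i], s1[i]);

    const int s2[4] = { v[2], v[3], v[0], v[1] };          // _MM_SHUFFLE(1, 0, 3, 2)
    for (int i = 0; i < 4; i++) v[i] = std::min(v[i], s2[i]);
}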
*/ -ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a) +ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* p) { #if ASTCENC_SSE >= 41 __m128i shuf = _mm_set_epi8(0,0,0,0, 0,0,0,0, 0,0,0,0, 12,8,4,0); - return vint4(_mm_shuffle_epi8(a.m, shuf)); + a = vint4(_mm_shuffle_epi8(a.m, shuf)); + store_nbytes(a, p); #else __m128i va = _mm_unpacklo_epi8(a.m, _mm_shuffle_epi32(a.m, _MM_SHUFFLE(1,1,1,1))); __m128i vb = _mm_unpackhi_epi8(a.m, _mm_shuffle_epi32(a.m, _MM_SHUFFLE(3,3,3,3))); - return vint4(_mm_unpacklo_epi16(va, vb)); + a = vint4(_mm_unpacklo_epi16(va, vb)); + store_nbytes(a, p); #endif } @@ -899,25 +895,12 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond) #endif } -/** - * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. - */ -ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond) -{ -#if ASTCENC_SSE >= 41 - return vfloat4(_mm_blendv_ps(a.m, b.m, cond.m)); -#else - __m128 d = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(cond.m), 31)); - return vfloat4(_mm_or_ps(_mm_and_ps(d, b.m), _mm_andnot_ps(d, a.m))); -#endif -} - /** * @brief Load a vector of gathered results from an array; */ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices) { -#if ASTCENC_AVX >= 2 +#if ASTCENC_AVX >= 2 && ASTCENC_X86_GATHERS != 0 return vfloat4(_mm_i32gather_ps(base, indices.m, 4)); #else alignas(16) int idx[4]; @@ -926,6 +909,23 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices) #endif } +/** + * @brief Load a vector of gathered results from an array using byte indices from memory + */ +template<> +ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds(const float* base, const uint8_t* indices) +{ + // Experimentally, in this particular use case (byte indices in memory), + // using 4 separate scalar loads is appreciably faster than using gathers + // even if they're available, on every x86 uArch tried, so always do the + // separate loads even when ASTCENC_X86_GATHERS is enabled. + // + // Tested on: + // - Intel Skylake-X, Coffee Lake, Crestmont, Redwood Cove + // - AMD Zen 2, Zen 4 + return vfloat4(base[indices[0]], base[indices[1]], base[indices[2]], base[indices[3]]); +} + /** * @brief Store a vector to an unaligned memory address. */ @@ -1054,136 +1054,173 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v) return vfloat4(_mm_castsi128_ps(v.m)); } -/** - * @brief Prepare a vtable lookup table for use with the native SIMD size. +/* + * Table structure for a 16x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p) -{ - t0p = t0; -} - -/** - * @brief Prepare a vtable lookup table for use with the native SIMD size. - */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p) -{ +struct vtable4_16x8 { #if ASTCENC_SSE >= 41 - t0p = t0; - t1p = t0 ^ t1; + vint4 t0; #else - t0p = t0; - t1p = t1; + const uint8_t* data; #endif -} +}; + +/* + * Table structure for a 32x 8-bit entry table. + */ +struct vtable4_32x8 { +#if ASTCENC_SSE >= 41 + vint4 t0; + vint4 t1; +#else + const uint8_t* data; +#endif +}; + +/* + * Table structure for a 64x 8-bit entry table. + */ +struct vtable4_64x8 { +#if ASTCENC_SSE >= 41 + vint4 t0; + vint4 t1; + vint4 t2; + vint4 t3; +#else + const uint8_t* data; +#endif +}; /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 16x 8-bit entry table. 
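In the SSSE3 path above, the shuffle mask picks source bytes 0, 4, 8 and 12 (the low byte of each little-endian 32-bit lane) into the bottom of the register, and the result is now stored directly instead of being returned. A scalar reference (not part of the diff) for what ends up at p:

#include <cstdint>

static inline void pack_low_bytes_ref(const int v[4], uint8_t p[4])
{
    p[0] = static_cast<uint8_t>(v[0] & 0xFF);
    p[1] = static_cast<uint8_t>(v[1] & 0xFF);
    p[2] = static_cast<uint8_t>(v[2] & 0xFF);
    p[3] = static_cast<uint8_t>(v[3] & 0xFF);
}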
*/ ASTCENC_SIMD_INLINE void vtable_prepare( - vint4 t0, vint4 t1, vint4 t2, vint4 t3, - vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p) -{ + vtable4_16x8& table, + const uint8_t* data +) { #if ASTCENC_SSE >= 41 - t0p = t0; - t1p = t0 ^ t1; - t2p = t1 ^ t2; - t3p = t2 ^ t3; + table.t0 = vint4::load(data); #else - t0p = t0; - t1p = t1; - t2p = t2; - t3p = t3; + table.data = data; #endif } /** - * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. + * @brief Prepare a vtable lookup table for 32x 8-bit entry table. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) -{ +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_32x8& table, + const uint8_t* data +) { #if ASTCENC_SSE >= 41 - // Set index byte MSB to 1 for unused bytes so shuffle returns zero - __m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast(0xFFFFFF00))); + table.t0 = vint4::load(data); + table.t1 = vint4::load(data + 16); - __m128i result = _mm_shuffle_epi8(t0.m, idxx); - return vint4(result); + table.t1 = table.t1 ^ table.t0; #else - uint8_t table[16]; - - std::memcpy(table + 0, &t0.m, 4 * sizeof(int)); - - return vint4(table[idx.lane<0>()], - table[idx.lane<1>()], - table[idx.lane<2>()], - table[idx.lane<3>()]); + table.data = data; #endif } /** - * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. + * @brief Prepare a vtable lookup table 64x 8-bit entry table. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) -{ +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_64x8& table, + const uint8_t* data +) { #if ASTCENC_SSE >= 41 - // Set index byte MSB to 1 for unused bytes so shuffle returns zero - __m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast(0xFFFFFF00))); + table.t0 = vint4::load(data); + table.t1 = vint4::load(data + 16); + table.t2 = vint4::load(data + 32); + table.t3 = vint4::load(data + 48); - __m128i result = _mm_shuffle_epi8(t0.m, idxx); - idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16)); - - __m128i result2 = _mm_shuffle_epi8(t1.m, idxx); - result = _mm_xor_si128(result, result2); - - return vint4(result); + table.t3 = table.t3 ^ table.t2; + table.t2 = table.t2 ^ table.t1; + table.t1 = table.t1 ^ table.t0; #else - uint8_t table[32]; - - std::memcpy(table + 0, &t0.m, 4 * sizeof(int)); - std::memcpy(table + 16, &t1.m, 4 * sizeof(int)); - - return vint4(table[idx.lane<0>()], - table[idx.lane<1>()], - table[idx.lane<2>()], - table[idx.lane<3>()]); + table.data = data; #endif } /** - * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx) -{ +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( + const vtable4_16x8& tbl, + vint4 idx +) { #if ASTCENC_SSE >= 41 // Set index byte MSB to 1 for unused bytes so shuffle returns zero __m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast(0xFFFFFF00))); - __m128i result = _mm_shuffle_epi8(t0.m, idxx); + __m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx); + return vint4(result); +#else + return vint4(tbl.data[idx.lane<0>()], + tbl.data[idx.lane<1>()], + tbl.data[idx.lane<2>()], + tbl.data[idx.lane<3>()]); +#endif +} + +/** + * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. 
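The SSE4.1 preparation above stores the second block as t0 ^ t1 so the lookup can combine partial PSHUFB results with XOR instead of a select: PSHUFB returns zero for index bytes with the MSB set, so indices below 16 only pick up t0, while indices 16..31 pick up t0[idx & 15] from the first shuffle and (t0 ^ t1)[idx - 16] from the second, which cancels to t1[idx - 16]. A hypothetical scalar model for one index byte of the 32-entry path:

#include <cstdint>

static inline uint8_t vtable32_lane_ref(const uint8_t prep0[16],   // = t0
                                        const uint8_t prep1[16],   // = t0 ^ t1
                                        uint8_t idx)               // 0..31
{
    // PSHUFB: MSB set -> 0, otherwise the low 4 bits select a byte
    uint8_t r0 = (idx & 0x80) ? 0 : prep0[idx & 0x0F];
    uint8_t idx2 = static_cast<uint8_t>(idx - 16);
    uint8_t r1 = (idx2 & 0x80) ? 0 : prep1[idx2 & 0x0F];
    return r0 ^ r1;
}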
+ */ +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( + const vtable4_32x8& tbl, + vint4 idx +) { +#if ASTCENC_SSE >= 41 + // Set index byte MSB to 1 for unused bytes so shuffle returns zero + __m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast(0xFFFFFF00))); + + __m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx); idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16)); - __m128i result2 = _mm_shuffle_epi8(t1.m, idxx); - result = _mm_xor_si128(result, result2); - idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16)); - - result2 = _mm_shuffle_epi8(t2.m, idxx); - result = _mm_xor_si128(result, result2); - idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16)); - - result2 = _mm_shuffle_epi8(t3.m, idxx); + __m128i result2 = _mm_shuffle_epi8(tbl.t1.m, idxx); result = _mm_xor_si128(result, result2); return vint4(result); #else - uint8_t table[64]; + return vint4(tbl.data[idx.lane<0>()], + tbl.data[idx.lane<1>()], + tbl.data[idx.lane<2>()], + tbl.data[idx.lane<3>()]); +#endif +} - std::memcpy(table + 0, &t0.m, 4 * sizeof(int)); - std::memcpy(table + 16, &t1.m, 4 * sizeof(int)); - std::memcpy(table + 32, &t2.m, 4 * sizeof(int)); - std::memcpy(table + 48, &t3.m, 4 * sizeof(int)); +/** + * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. + */ +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( + const vtable4_64x8& tbl, + vint4 idx +) { +#if ASTCENC_SSE >= 41 + // Set index byte MSB to 1 for unused bytes so shuffle returns zero + __m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast(0xFFFFFF00))); - return vint4(table[idx.lane<0>()], - table[idx.lane<1>()], - table[idx.lane<2>()], - table[idx.lane<3>()]); + __m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx); + idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16)); + + __m128i result2 = _mm_shuffle_epi8(tbl.t1.m, idxx); + result = _mm_xor_si128(result, result2); + idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16)); + + result2 = _mm_shuffle_epi8(tbl.t2.m, idxx); + result = _mm_xor_si128(result, result2); + idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16)); + + result2 = _mm_shuffle_epi8(tbl.t3.m, idxx); + result = _mm_xor_si128(result, result2); + + return vint4(result); +#else + return vint4(tbl.data[idx.lane<0>()], + tbl.data[idx.lane<1>()], + tbl.data[idx.lane<2>()], + tbl.data[idx.lane<3>()]); #endif } @@ -1307,7 +1344,11 @@ ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b) */ ASTCENC_SIMD_INLINE int popcount(uint64_t v) { +#if !defined(__x86_64__) && !defined(_M_AMD64) + return static_cast(__builtin_popcountll(v)); +#else return static_cast(_mm_popcnt_u64(v)); +#endif } #endif // ASTCENC_POPCNT >= 1 diff --git a/3rdparty/astcenc/astcenc_vecmathlib_sve_8.h b/3rdparty/astcenc/astcenc_vecmathlib_sve_8.h new file mode 100644 index 0000000000..1e98df02b1 --- /dev/null +++ b/3rdparty/astcenc/astcenc_vecmathlib_sve_8.h @@ -0,0 +1,1092 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2019-2024 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
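The new SVE backend below fixes the vector length to 256 bits through arm_sve_vector_bits(256), which is only accepted when the translation unit is compiled for a matching fixed vector length (e.g. -msve-vector-bits=256, which defines __ARM_FEATURE_SVE_BITS). A minimal guard sketch, shown as an assumption about the build setup rather than part of this diff:

#if !defined(__ARM_FEATURE_SVE_BITS) || __ARM_FEATURE_SVE_BITS != 256
    #error "This header assumes a fixed 256-bit SVE build (-msve-vector-bits=256)"
#endif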
+// ---------------------------------------------------------------------------- + +/** + * @brief 8x32-bit vectors, implemented using SVE. + * + * This module implements 8-wide 32-bit float, int, and mask vectors for Arm + * SVE. + * + * There is a baseline level of functionality provided by all vector widths and + * implementations. This is implemented using identical function signatures, + * modulo data type, so we can use them as substitutable implementations in VLA + * code. + */ + +#ifndef ASTC_VECMATHLIB_SVE_8_H_INCLUDED +#define ASTC_VECMATHLIB_SVE_8_H_INCLUDED + +#ifndef ASTCENC_SIMD_INLINE + #error "Include astcenc_vecmathlib.h, do not include directly" +#endif + +#include + +typedef svbool_t svbool_8_t __attribute__((arm_sve_vector_bits(256))); +typedef svuint8_t svuint8_8_t __attribute__((arm_sve_vector_bits(256))); +typedef svuint16_t svuint16_8_t __attribute__((arm_sve_vector_bits(256))); +typedef svuint32_t svuint32_8_t __attribute__((arm_sve_vector_bits(256))); +typedef svint32_t svint32_8_t __attribute__((arm_sve_vector_bits(256))); +typedef svfloat32_t svfloat32_8_t __attribute__((arm_sve_vector_bits(256))); + +// ============================================================================ +// vfloat8 data type +// ============================================================================ + +/** + * @brief Data type for 8-wide floats. + */ +struct vfloat8 +{ + /** + * @brief Construct from zero-initialized value. + */ + ASTCENC_SIMD_INLINE vfloat8() = default; + + /** + * @brief Construct from 8 values loaded from an unaligned address. + * + * Consider using loada() which is better with vectors if data is aligned + * to vector length. + */ + ASTCENC_SIMD_INLINE explicit vfloat8(const float *p) + { + m = svld1_f32(svptrue_b32(), p); + } + + /** + * @brief Construct from 1 scalar value replicated across all lanes. + * + * Consider using zero() for constexpr zeros. + */ + ASTCENC_SIMD_INLINE explicit vfloat8(float a) + { + m = svdup_f32(a); + } + + /** + * @brief Construct from an existing SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vfloat8(svfloat32_8_t a) + { + m = a; + } + + /** + * @brief Factory that returns a vector of zeros. + */ + static ASTCENC_SIMD_INLINE vfloat8 zero() + { + return vfloat8(0.0f); + } + + /** + * @brief Factory that returns a replicated scalar loaded from memory. + */ + static ASTCENC_SIMD_INLINE vfloat8 load1(const float* p) + { + return vfloat8(*p); + } + + /** + * @brief Factory that returns a vector loaded from 32B aligned memory. + */ + static ASTCENC_SIMD_INLINE vfloat8 loada(const float* p) + { + return vfloat8(p); + } + + /** + * @brief The vector ... + */ + svfloat32_8_t m; +}; + +// ============================================================================ +// vint8 data type +// ============================================================================ + +/** + * @brief Data type for 8-wide ints. + */ +struct vint8 +{ + /** + * @brief Construct from zero-initialized value. + */ + ASTCENC_SIMD_INLINE vint8() = default; + + /** + * @brief Construct from 8 values loaded from an unaligned address. + * + * Consider using loada() which is better with vectors if data is aligned + * to vector length. + */ + ASTCENC_SIMD_INLINE explicit vint8(const int *p) + { + m = svld1_s32(svptrue_b32(), p); + } + + /** + * @brief Construct from 8 uint8_t loaded from an unaligned address. 
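A short usage sketch for the vfloat8 load factories above (the data values are illustrative):

alignas(32) float data[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };

vfloat8 a = vfloat8::loada(data);   // lanes 0..7 from aligned memory
vfloat8 b = vfloat8::load1(data);   // every lane replicates data[0]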
+ */ + ASTCENC_SIMD_INLINE explicit vint8(const uint8_t *p) + { + // Load 8-bit values and expand to 32-bits + m = svld1ub_s32(svptrue_b32(), p); + } + + /** + * @brief Construct from 1 scalar value replicated across all lanes. + * + * Consider using zero() for constexpr zeros. + */ + ASTCENC_SIMD_INLINE explicit vint8(int a) + { + m = svdup_s32(a); + } + + /** + * @brief Construct from an existing SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vint8(svint32_8_t a) + { + m = a; + } + + /** + * @brief Factory that returns a vector of zeros. + */ + static ASTCENC_SIMD_INLINE vint8 zero() + { + return vint8(0.0f); + } + + /** + * @brief Factory that returns a replicated scalar loaded from memory. + */ + static ASTCENC_SIMD_INLINE vint8 load1(const int* p) + { + return vint8(*p); + } + + /** + * @brief Factory that returns a vector loaded from unaligned memory. + */ + static ASTCENC_SIMD_INLINE vint8 load(const uint8_t* p) + { + svuint8_8_t data = svld1_u8(svptrue_b8(), p); + return vint8(svreinterpret_s32_u8(data)); + } + + /** + * @brief Factory that returns a vector loaded from 32B aligned memory. + */ + static ASTCENC_SIMD_INLINE vint8 loada(const int* p) + { + return vint8(p); + } + + /** + * @brief Factory that returns a vector containing the lane IDs. + */ + static ASTCENC_SIMD_INLINE vint8 lane_id() + { + return vint8(svindex_s32(0, 1)); + } + + /** + * @brief The vector ... + */ + svint32_8_t m; +}; + +// ============================================================================ +// vmask8 data type +// ============================================================================ + +/** + * @brief Data type for 8-wide control plane masks. + */ +struct vmask8 +{ + /** + * @brief Construct from an existing SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vmask8(svbool_8_t a) + { + m = a; + } + + /** + * @brief Construct from 1 scalar value. + */ + ASTCENC_SIMD_INLINE explicit vmask8(bool a) + { + m = svdup_b32(a); + } + + /** + * @brief The vector ... + */ + svbool_8_t m; +}; + +// ============================================================================ +// vmask8 operators and functions +// ============================================================================ + +/** + * @brief Overload: mask union (or). + */ +ASTCENC_SIMD_INLINE vmask8 operator|(vmask8 a, vmask8 b) +{ + return vmask8(svorr_z(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: mask intersect (and). + */ +ASTCENC_SIMD_INLINE vmask8 operator&(vmask8 a, vmask8 b) +{ + return vmask8(svand_z(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: mask difference (xor). + */ +ASTCENC_SIMD_INLINE vmask8 operator^(vmask8 a, vmask8 b) +{ + return vmask8(sveor_z(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: mask invert (not). + */ +ASTCENC_SIMD_INLINE vmask8 operator~(vmask8 a) +{ + return vmask8(svnot_z(svptrue_b32(), a.m)); +} + +/** + * @brief Return a 8-bit mask code indicating mask status. + * + * bit0 = lane 0 + */ +ASTCENC_SIMD_INLINE unsigned int mask(vmask8 a) +{ + alignas(32) const int shifta[8] { 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 }; + svint32_8_t template_vals = svld1_s32(svptrue_b32(), shifta); + svint32_8_t active_vals = svsel_s32(a.m, template_vals, svdup_s32(0)); + return static_cast(svaddv_s32(svptrue_b32(), active_vals)); +} + +/** + * @brief True if any lanes are enabled, false otherwise. + */ +ASTCENC_SIMD_INLINE bool any(vmask8 a) +{ + return svptest_any(svptrue_b32(), a.m); +} + +/** + * @brief True if all lanes are enabled, false otherwise. 
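A scalar model of the predicate-to-bitmask conversion above, where bit i of the result is set exactly when lane i of the predicate is active (hypothetical helper, not part of the diff):

static inline unsigned int mask8_ref(const bool lanes[8])
{
    unsigned int r = 0;
    for (int i = 0; i < 8; i++)
    {
        if (lanes[i])
        {
            r |= 1u << i;
        }
    }
    return r;
}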
+ */ +ASTCENC_SIMD_INLINE bool all(vmask8 a) +{ + return !svptest_any(svptrue_b32(), (~a).m); +} + +// ============================================================================ +// vint8 operators and functions +// ============================================================================ +/** + * @brief Overload: vector by vector addition. + */ +ASTCENC_SIMD_INLINE vint8 operator+(vint8 a, vint8 b) +{ + return vint8(svadd_s32_x(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by vector incremental addition. + */ +ASTCENC_SIMD_INLINE vint8& operator+=(vint8& a, const vint8& b) +{ + a = a + b; + return a; +} + +/** + * @brief Overload: vector by vector subtraction. + */ +ASTCENC_SIMD_INLINE vint8 operator-(vint8 a, vint8 b) +{ + return vint8(svsub_s32_x(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by vector multiplication. + */ +ASTCENC_SIMD_INLINE vint8 operator*(vint8 a, vint8 b) +{ + return vint8(svmul_s32_x(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector bit invert. + */ +ASTCENC_SIMD_INLINE vint8 operator~(vint8 a) +{ + return vint8(svnot_s32_x(svptrue_b32(), a.m)); +} + +/** + * @brief Overload: vector by vector bitwise or. + */ +ASTCENC_SIMD_INLINE vint8 operator|(vint8 a, vint8 b) +{ + return vint8(svorr_s32_x(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by vector bitwise and. + */ +ASTCENC_SIMD_INLINE vint8 operator&(vint8 a, vint8 b) +{ + return vint8(svand_s32_x(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by vector bitwise xor. + */ +ASTCENC_SIMD_INLINE vint8 operator^(vint8 a, vint8 b) +{ + return vint8(sveor_s32_x(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by vector equality. + */ +ASTCENC_SIMD_INLINE vmask8 operator==(vint8 a, vint8 b) +{ + return vmask8(svcmpeq_s32(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by vector inequality. + */ +ASTCENC_SIMD_INLINE vmask8 operator!=(vint8 a, vint8 b) +{ + return vmask8(svcmpne_s32(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by vector less than. + */ +ASTCENC_SIMD_INLINE vmask8 operator<(vint8 a, vint8 b) +{ + return vmask8(svcmplt_s32(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by vector greater than. + */ +ASTCENC_SIMD_INLINE vmask8 operator>(vint8 a, vint8 b) +{ + return vmask8(svcmpgt_s32(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Logical shift left. + */ +template ASTCENC_SIMD_INLINE vint8 lsl(vint8 a) +{ + return vint8(svlsl_n_s32_x(svptrue_b32(), a.m, s)); +} + +/** + * @brief Arithmetic shift right. + */ +template ASTCENC_SIMD_INLINE vint8 asr(vint8 a) +{ + return vint8(svasr_n_s32_x(svptrue_b32(), a.m, s)); +} + +/** + * @brief Logical shift right. + */ +template ASTCENC_SIMD_INLINE vint8 lsr(vint8 a) +{ + svuint32_8_t r = svreinterpret_u32_s32(a.m); + r = svlsr_n_u32_x(svptrue_b32(), r, s); + return vint8(svreinterpret_s32_u32(r)); +} + +/** + * @brief Return the min vector of two vectors. + */ +ASTCENC_SIMD_INLINE vint8 min(vint8 a, vint8 b) +{ + return vint8(svmin_s32_x(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Return the max vector of two vectors. + */ +ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b) +{ + return vint8(svmax_s32_x(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Return the horizontal minimum of a vector. + */ +ASTCENC_SIMD_INLINE vint8 hmin(vint8 a) +{ + return vint8(svminv_s32(svptrue_b32(), a.m)); +} + +/** + * @brief Return the horizontal minimum of a vector. 
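Since lsr() reinterprets the lanes as unsigned before shifting, it differs from asr() for negative inputs, e.g.:

vint8 v(-4);           // every lane 0xFFFFFFFC
vint8 a = asr<1>(v);   // arithmetic shift: every lane -2
vint8 l = lsr<1>(v);   // logical shift: every lane 0x7FFFFFFE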
+ */ +ASTCENC_SIMD_INLINE int hmin_s(vint8 a) +{ + return svminv_s32(svptrue_b32(), a.m); +} + +/** + * @brief Return the horizontal maximum of a vector. + */ +ASTCENC_SIMD_INLINE vint8 hmax(vint8 a) +{ + return vint8(svmaxv_s32(svptrue_b32(), a.m)); +} + +/** + * @brief Return the horizontal maximum of a vector. + */ +ASTCENC_SIMD_INLINE int hmax_s(vint8 a) +{ + return svmaxv_s32(svptrue_b32(), a.m); +} + +/** + * @brief Store a vector to a 16B aligned memory address. + */ +ASTCENC_SIMD_INLINE void storea(vint8 a, int* p) +{ + svst1_s32(svptrue_b32(), p, a.m); +} + +/** + * @brief Store a vector to an unaligned memory address. + */ +ASTCENC_SIMD_INLINE void store(vint8 a, int* p) +{ + svst1_s32(svptrue_b32(), p, a.m); +} + +/** + * @brief Store lowest N (vector width) bytes into an unaligned address. + */ +ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p) +{ + svuint8_8_t r = svreinterpret_u8_s32(a.m); + svst1_u8(svptrue_pat_b8(SV_VL8), p, r); +} + +/** + * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector. + */ +ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint8 v, uint8_t* p) +{ + svuint32_8_t data = svreinterpret_u32_s32(v.m); + svst1b_u32(svptrue_b32(), p, data); +} + +/** + * @brief Return lanes from @c b if @c cond is set, else @c a. + */ +ASTCENC_SIMD_INLINE vint8 select(vint8 a, vint8 b, vmask8 cond) +{ + return vint8(svsel_s32(cond.m, b.m, a.m)); +} + +// ============================================================================ +// vfloat8 operators and functions +// ============================================================================ + +/** + * @brief Overload: vector by vector addition. + */ +ASTCENC_SIMD_INLINE vfloat8 operator+(vfloat8 a, vfloat8 b) +{ + return vfloat8(svadd_f32_x(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by vector incremental addition. + */ +ASTCENC_SIMD_INLINE vfloat8& operator+=(vfloat8& a, const vfloat8& b) +{ + a = a + b; + return a; +} + +/** + * @brief Overload: vector by vector subtraction. + */ +ASTCENC_SIMD_INLINE vfloat8 operator-(vfloat8 a, vfloat8 b) +{ + return vfloat8(svsub_f32_x(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by vector multiplication. + */ +ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, vfloat8 b) +{ + return vfloat8(svmul_f32_x(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by scalar multiplication. + */ +ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, float b) +{ + return vfloat8(svmul_f32_x(svptrue_b32(), a.m, svdup_f32(b))); +} + +/** + * @brief Overload: scalar by vector multiplication. + */ +ASTCENC_SIMD_INLINE vfloat8 operator*(float a, vfloat8 b) +{ + return vfloat8(svmul_f32_x(svptrue_b32(), svdup_f32(a), b.m)); +} + +/** + * @brief Overload: vector by vector division. + */ +ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, vfloat8 b) +{ + return vfloat8(svdiv_f32_x(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by scalar division. + */ +ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, float b) +{ + return vfloat8(svdiv_f32_x(svptrue_b32(), a.m, svdup_f32(b))); +} + +/** + * @brief Overload: scalar by vector division. + */ +ASTCENC_SIMD_INLINE vfloat8 operator/(float a, vfloat8 b) +{ + return vfloat8(svdiv_f32_x(svptrue_b32(), svdup_f32(a), b.m)); +} + +/** + * @brief Overload: vector by vector equality. + */ +ASTCENC_SIMD_INLINE vmask8 operator==(vfloat8 a, vfloat8 b) +{ + return vmask8(svcmpeq_f32(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by vector inequality. 
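The select() argument order is easy to invert: lanes come from a by default and from b only where the mask is set, e.g.:

vint8 fallback(0);
vint8 preferred(255);
vmask8 use_pref = preferred > fallback;            // all lanes enabled
vint8 r = select(fallback, preferred, use_pref);   // every lane 255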
+ */ +ASTCENC_SIMD_INLINE vmask8 operator!=(vfloat8 a, vfloat8 b) +{ + return vmask8(svcmpne_f32(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by vector less than. + */ +ASTCENC_SIMD_INLINE vmask8 operator<(vfloat8 a, vfloat8 b) +{ + return vmask8(svcmplt_f32(svptrue_b32(), a.m, b.m));; +} + +/** + * @brief Overload: vector by vector greater than. + */ +ASTCENC_SIMD_INLINE vmask8 operator>(vfloat8 a, vfloat8 b) +{ + return vmask8(svcmpgt_f32(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by vector less than or equal. + */ +ASTCENC_SIMD_INLINE vmask8 operator<=(vfloat8 a, vfloat8 b) +{ + return vmask8(svcmple_f32(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Overload: vector by vector greater than or equal. + */ +ASTCENC_SIMD_INLINE vmask8 operator>=(vfloat8 a, vfloat8 b) +{ + return vmask8(svcmpge_f32(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Return the min vector of two vectors. + * + * If either lane value is NaN, the other lane will be returned. + */ +ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, vfloat8 b) +{ + return vfloat8(svminnm_f32_x(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Return the min vector of a vector and a scalar. + * + * If either lane value is NaN, the other lane will be returned. + */ +ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, float b) +{ + return min(a, vfloat8(b)); +} + +/** + * @brief Return the max vector of two vectors. + * + * If either lane value is NaN, the other lane will be returned. + */ +ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, vfloat8 b) +{ + return vfloat8(svmaxnm_f32_x(svptrue_b32(), a.m, b.m)); +} + +/** + * @brief Return the max vector of a vector and a scalar. + * + * If either lane value is NaN, the other lane will be returned. + */ +ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, float b) +{ + return max(a, vfloat8(b)); +} + +/** + * @brief Return the clamped value between min and max. + * + * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN + * then @c min will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat8 clamp(float minv, float maxv, vfloat8 a) +{ + return min(max(a, minv), maxv); +} + +/** + * @brief Return a clamped value between 0.0f and 1.0f. + * + * If @c a is NaN then zero will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat8 clampzo(vfloat8 a) +{ + return clamp(0.0f, 1.0f, a); +} + +/** + * @brief Return the absolute value of the float vector. + */ +ASTCENC_SIMD_INLINE vfloat8 abs(vfloat8 a) +{ + return vfloat8(svabs_f32_x(svptrue_b32(), a.m)); +} + +/** + * @brief Return a float rounded to the nearest integer value. + */ +ASTCENC_SIMD_INLINE vfloat8 round(vfloat8 a) +{ + return vfloat8(svrintn_f32_x(svptrue_b32(), a.m)); +} + +/** + * @brief Return the horizontal minimum of a vector. + */ +ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a) +{ + return vfloat8(svminnmv_f32(svptrue_b32(), a.m)); +} + +/** + * @brief Return the horizontal minimum of a vector. + */ +ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a) +{ + return svminnmv_f32(svptrue_b32(), a.m); +} + +/** + * @brief Return the horizontal maximum of a vector. + */ +ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a) +{ + return vfloat8(svmaxnmv_f32(svptrue_b32(), a.m)); +} + +/** + * @brief Return the horizontal maximum of a vector. + */ +ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a) +{ + return svmaxnmv_f32(svptrue_b32(), a.m); +} + +/** + * @brief Return the horizontal sum of a vector. 
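Note that svrintn (FRINTN) rounds to nearest with ties-to-even, so half-way cases do not always round up:

vfloat8 r0 = round(vfloat8(2.5f));    // every lane 2.0f
vfloat8 r1 = round(vfloat8(3.5f));    // every lane 4.0f
vfloat8 r2 = round(vfloat8(-1.5f));   // every lane -2.0f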
+ */ +ASTCENC_SIMD_INLINE float hadd_s(vfloat8 a) +{ + // Can't use svaddv - it's not invariant + vfloat4 lo(svget_neonq_f32(a.m)); + vfloat4 hi(svget_neonq_f32(svext_f32(a.m, a.m, 4))); + return hadd_s(lo) + hadd_s(hi); +} + +/** + * @brief Return lanes from @c b if @c cond is set, else @c a. + */ +ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond) +{ + return vfloat8(svsel_f32(cond.m, b.m, a.m)); +} + +/** + * @brief Accumulate lane-wise sums for a vector, folded 4-wide. + * + * This is invariant with 4-wide implementations. + */ +ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a) +{ + vfloat4 lo(svget_neonq_f32(a.m)); + haccumulate(accum, lo); + + vfloat4 hi(svget_neonq_f32(svext_f32(a.m, a.m, 4))); + haccumulate(accum, hi); +} + +/** + * @brief Accumulate lane-wise sums for a vector. + * + * This is NOT invariant with 4-wide implementations. + */ +ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a) +{ + accum += a; +} + +/** + * @brief Accumulate masked lane-wise sums for a vector, folded 4-wide. + * + * This is invariant with 4-wide implementations. + */ +ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a, vmask8 m) +{ + a = select(vfloat8::zero(), a, m); + haccumulate(accum, a); +} + +/** + * @brief Accumulate masked lane-wise sums for a vector. + * + * This is NOT invariant with 4-wide implementations. + */ +ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a, vmask8 m) +{ + accum.m = svadd_f32_m(m.m, accum.m, a.m); +} + +/** + * @brief Return the sqrt of the lanes in the vector. + */ +ASTCENC_SIMD_INLINE vfloat8 sqrt(vfloat8 a) +{ + return vfloat8(svsqrt_f32_x(svptrue_b32(), a.m)); +} + +/** + * @brief Load a vector of gathered results from an array; + */ +ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices) +{ + return vfloat8(svld1_gather_s32index_f32(svptrue_b32(), base, indices.m)); +} + +/** + * @brief Load a vector of gathered results from an array using byte indices from memory + */ +template<> +ASTCENC_SIMD_INLINE vfloat8 gatherf_byte_inds(const float* base, const uint8_t* indices) +{ + svint32_t offsets = svld1ub_s32(svptrue_b32(), indices); + return vfloat8(svld1_gather_s32index_f32(svptrue_b32(), base, offsets)); +} + +/** + * @brief Store a vector to an unaligned memory address. + */ +ASTCENC_SIMD_INLINE void store(vfloat8 a, float* p) +{ + svst1_f32(svptrue_b32(), p, a.m); +} + +/** + * @brief Store a vector to a 32B aligned memory address. + */ +ASTCENC_SIMD_INLINE void storea(vfloat8 a, float* p) +{ + svst1_f32(svptrue_b32(), p, a.m); +} + +/** + * @brief Return a integer value for a float vector, using truncation. + */ +ASTCENC_SIMD_INLINE vint8 float_to_int(vfloat8 a) +{ + return vint8(svcvt_s32_f32_x(svptrue_b32(), a.m)); +} + +/** + * @brief Return a integer value for a float vector, using round-to-nearest. + */ +ASTCENC_SIMD_INLINE vint8 float_to_int_rtn(vfloat8 a) +{ + a = a + vfloat8(0.5f); + return vint8(svcvt_s32_f32_x(svptrue_b32(), a.m)); +} + +/** + * @brief Return a float value for an integer vector. + */ +ASTCENC_SIMD_INLINE vfloat8 int_to_float(vint8 a) +{ + return vfloat8(svcvt_f32_s32_x(svptrue_b32(), a.m)); +} + +/** + * @brief Return a float value as an integer bit pattern (i.e. no conversion). + * + * It is a common trick to convert floats into integer bit patterns, perform + * some bit hackery based on knowledge they are IEEE 754 layout, and then + * convert them back again. This is the first half of that flip. 
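The folded sum above fixes the association to (lanes 0..3) + (lanes 4..7) rather than relying on svaddv, whose reduction order is what makes it non-invariant. A reference formulation, assuming the vfloat4 pointer constructor available in the 4-wide headers:

static inline float hadd8_ref(const float* v)
{
    float lo = hadd_s(vfloat4(v));      // sum of lanes 0..3
    float hi = hadd_s(vfloat4(v + 4));  // sum of lanes 4..7
    return lo + hi;
}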
+ */ +ASTCENC_SIMD_INLINE vint8 float_as_int(vfloat8 a) +{ + return vint8(svreinterpret_s32_f32(a.m)); +} + +/** + * @brief Return a integer value as a float bit pattern (i.e. no conversion). + * + * It is a common trick to convert floats into integer bit patterns, perform + * some bit hackery based on knowledge they are IEEE 754 layout, and then + * convert them back again. This is the second half of that flip. + */ +ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a) +{ + return vfloat8(svreinterpret_f32_s32(a.m)); +} + +/* + * Table structure for a 16x 8-bit entry table. + */ +struct vtable8_16x8 { + svuint8_8_t t0; +}; + +/* + * Table structure for a 32x 8-bit entry table. + */ +struct vtable8_32x8 { + svuint8_8_t t0; +}; + +/* + * Table structure for a 64x 8-bit entry table. + */ +struct vtable8_64x8 { + svuint8_8_t t0; + svuint8_8_t t1; +}; + +/** + * @brief Prepare a vtable lookup table for 16x 8-bit entry table. + */ +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable8_16x8& table, + const uint8_t* data +) { + // Top half of register will be zeros + table.t0 = svld1_u8(svptrue_pat_b8(SV_VL16), data); +} + +/** + * @brief Prepare a vtable lookup table for 32x 8-bit entry table. + */ +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable8_32x8& table, + const uint8_t* data +) { + table.t0 = svld1_u8(svptrue_b8(), data); +} + +/** + * @brief Prepare a vtable lookup table 64x 8-bit entry table. + */ +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable8_64x8& table, + const uint8_t* data +) { + table.t0 = svld1_u8(svptrue_b8(), data); + table.t1 = svld1_u8(svptrue_b8(), data + 32); +} + +/** + * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. + */ +ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( + const vtable8_16x8& tbl, + vint8 idx +) { + // Set index byte above max index for unused bytes so table lookup returns zero + svint32_8_t idx_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00)); + svuint8_8_t idx_bytes = svreinterpret_u8_s32(idx_masked); + + svuint8_8_t result = svtbl_u8(tbl.t0, idx_bytes); + return vint8(svreinterpret_s32_u8(result)); +} + +/** + * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. + */ +ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( + const vtable8_32x8& tbl, + vint8 idx +) { + // Set index byte above max index for unused bytes so table lookup returns zero + svint32_8_t idx_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00)); + svuint8_8_t idx_bytes = svreinterpret_u8_s32(idx_masked); + + svuint8_8_t result = svtbl_u8(tbl.t0, idx_bytes); + return vint8(svreinterpret_s32_u8(result)); +} + +/** + * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. + * + * Future: SVE2 can directly do svtbl2_u8() for a two register table. + */ +ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( + const vtable8_64x8& tbl, + vint8 idx +) { + // Set index byte above max index for unused bytes so table lookup returns zero + svint32_8_t idxm = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00)); + + svuint8_8_t idxm8 = svreinterpret_u8_s32(idxm); + svuint8_8_t t0_lookup = svtbl_u8(tbl.t0, idxm8); + + idxm8 = svsub_u8_x(svptrue_b8(), idxm8, svdup_u8(32)); + svuint8_8_t t1_lookup = svtbl_u8(tbl.t1, idxm8); + + svuint8_8_t result = svorr_u8_x(svptrue_b32(), t0_lookup, t1_lookup); + return vint8(svreinterpret_s32_u8(result)); +} + +/** + * @brief Return a vector of interleaved RGBA data. + * + * Input vectors have the value stored in the bottom 8 bits of each lane, + * with high bits set to zero. 
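As a concrete instance of the bit-pattern flip described above, clearing the IEEE 754 sign bit yields an absolute value; this is purely illustrative, since abs() already does it natively via svabs:

static inline vfloat8 abs_via_bits(vfloat8 a)
{
    vint8 bits = float_as_int(a);
    bits = bits & vint8(0x7FFFFFFF);   // clear the sign bit of every lane
    return int_as_float(bits);
}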
+ * + * Output vector stores a single RGBA texel packed in each lane. + */ +ASTCENC_SIMD_INLINE vint8 interleave_rgba8(vint8 r, vint8 g, vint8 b, vint8 a) +{ + return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a); +} + +/** + * @brief Store a vector, skipping masked lanes. + * + * All masked lanes must be at the end of vector, after all non-masked lanes. + */ +ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint8 data, vmask8 mask) +{ + svst1_s32(mask.m, reinterpret_cast(base), data.m); +} + +/** + * @brief Debug function to print a vector of ints. + */ +ASTCENC_SIMD_INLINE void print(vint8 a) +{ + alignas(32) int v[8]; + storea(a, v); + printf("v8_i32:\n %8d %8d %8d %8d %8d %8d %8d %8d\n", + v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); +} + +/** + * @brief Debug function to print a vector of ints. + */ +ASTCENC_SIMD_INLINE void printx(vint8 a) +{ + alignas(32) int v[8]; + storea(a, v); + printf("v8_i32:\n %08x %08x %08x %08x %08x %08x %08x %08x\n", + v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); +} + +/** + * @brief Debug function to print a vector of floats. + */ +ASTCENC_SIMD_INLINE void print(vfloat8 a) +{ + alignas(32) float v[8]; + storea(a, v); + printf("v8_f32:\n %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n", + static_cast(v[0]), static_cast(v[1]), + static_cast(v[2]), static_cast(v[3]), + static_cast(v[4]), static_cast(v[5]), + static_cast(v[6]), static_cast(v[7])); +} + +/** + * @brief Debug function to print a vector of masks. + */ +ASTCENC_SIMD_INLINE void print(vmask8 a) +{ + print(select(vint8(0), vint8(1), a)); +} + +#endif // #ifndef ASTC_VECMATHLIB_SVE_8_H_INCLUDED diff --git a/3rdparty/astcenc/astcenc_weight_align.cpp b/3rdparty/astcenc/astcenc_weight_align.cpp index 4e993e7397..b20541644a 100644 --- a/3rdparty/astcenc/astcenc_weight_align.cpp +++ b/3rdparty/astcenc/astcenc_weight_align.cpp @@ -43,6 +43,7 @@ #include #include #include +#include static constexpr unsigned int ANGULAR_STEPS { 32 }; @@ -104,14 +105,17 @@ static void compute_angular_offsets( // Precompute isample; arrays are always allocated 64 elements long for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) { - // Add 2^23 and interpreting bits extracts round-to-nearest int - vfloat sample = loada(dec_weight_ideal_value + i) * (SINCOS_STEPS - 1.0f) + vfloat(12582912.0f); - vint isample = float_as_int(sample) & vint((SINCOS_STEPS - 1)); + // Ideal weight can be outside [0, 1] range, so clamp to fit table + vfloat ideal_weight = clampzo(loada(dec_weight_ideal_value + i)); + + // Convert a weight to a sincos table index + vfloat sample = ideal_weight * (SINCOS_STEPS - 1.0f); + vint isample = float_to_int_rtn(sample); storea(isample, isamplev + i); } // Arrays are multiple of SIMD width (ANGULAR_STEPS), safe to overshoot max - vfloat mult = vfloat(1.0f / (2.0f * astc::PI)); + vfloat mult(1.0f / (2.0f * astc::PI)); for (unsigned int i = 0; i < max_angular_steps; i += ASTCENC_SIMD_WIDTH) { @@ -164,18 +168,41 @@ static void compute_lowest_and_highest_weight( promise(weight_count > 0); promise(max_angular_steps > 0); - vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f); + vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f); + + // Compute minimum/maximum weights in the weight array. Our remapping + // is monotonic, so the min/max rounded weights relate to the min/max + // unrounded weights in a straightforward way. 
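A small property-check sketch of why that hoist is valid: the remap w -> round(w * rcp_stepsize - offset) is non-decreasing in w (rcp_stepsize is positive), so the rounded index of the smallest/largest weight is the smallest/largest rounded index over the whole array. Hypothetical helper, using std::round as a stand-in for the vector round():

#include <algorithm>
#include <cmath>

static inline bool min_index_hoist_holds(const float* w, unsigned int count,
                                         float rcp_stepsize, float offset)
{
    float min_w = w[0];
    float lowest_idx = std::round(w[0] * rcp_stepsize - offset);
    for (unsigned int i = 1; i < count; i++)
    {
        min_w = std::min(min_w, w[i]);
        lowest_idx = std::min(lowest_idx, std::round(w[i] * rcp_stepsize - offset));
    }
    return lowest_idx == std::round(min_w * rcp_stepsize - offset);
}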
+ vfloat min_weight(FLT_MAX); + vfloat max_weight(-FLT_MAX); + + vint lane_id = vint::lane_id(); + for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) + { + vmask active = lane_id < vint(weight_count); + lane_id += vint(ASTCENC_SIMD_WIDTH); + + vfloat weights = loada(dec_weight_ideal_value + i); + min_weight = min(min_weight, select(min_weight, weights, active)); + max_weight = max(max_weight, select(max_weight, weights, active)); + } + + min_weight = hmin(min_weight); + max_weight = hmax(max_weight); // Arrays are ANGULAR_STEPS long, so always safe to run full vectors for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH) { - vfloat minidx(128.0f); - vfloat maxidx(-128.0f); vfloat errval = vfloat::zero(); vfloat cut_low_weight_err = vfloat::zero(); vfloat cut_high_weight_err = vfloat::zero(); vfloat offset = loada(offsets + sp); + // We know the min and max weight values, so we can figure out + // the corresponding indices before we enter the loop. + vfloat minidx = round(min_weight * rcp_stepsize - offset); + vfloat maxidx = round(max_weight * rcp_stepsize - offset); + for (unsigned int j = 0; j < weight_count; j++) { vfloat sval = load1(dec_weight_ideal_value + j) * rcp_stepsize - offset; @@ -183,22 +210,12 @@ static void compute_lowest_and_highest_weight( vfloat diff = sval - svalrte; errval += diff * diff; - // Reset tracker on min hit - vmask mask = svalrte < minidx; - minidx = select(minidx, svalrte, mask); - cut_low_weight_err = select(cut_low_weight_err, vfloat::zero(), mask); - - // Accumulate on min hit - mask = svalrte == minidx; + // Accumulate errors for minimum index + vmask mask = svalrte == minidx; vfloat accum = cut_low_weight_err + vfloat(1.0f) - vfloat(2.0f) * diff; cut_low_weight_err = select(cut_low_weight_err, accum, mask); - // Reset tracker on max hit - mask = svalrte > maxidx; - maxidx = select(maxidx, svalrte, mask); - cut_high_weight_err = select(cut_high_weight_err, vfloat::zero(), mask); - - // Accumulate on max hit + // Accumulate errors for maximum index mask = svalrte == maxidx; accum = cut_high_weight_err + vfloat(1.0f) + vfloat(2.0f) * diff; cut_high_weight_err = select(cut_high_weight_err, accum, mask); diff --git a/manifest.json b/manifest.json index 6b75da9796..86f9b366e9 100644 --- a/manifest.json +++ b/manifest.json @@ -1,6 +1,6 @@ { "versions": { - "1kdist": "v93", + "1kdist": "v95", "oboe": "1.9.0", "kcp": "v1.7-f2aa30e", "lz4": "v1.10.0", @@ -9,6 +9,7 @@ }, "mirrors": { "github": { + "host": "github.com", "1kdist": "simdsoft/1kiss/releases/download", "oboe": "google/oboe.git", "kcp": "skywind3000/kcp.git", @@ -19,6 +20,7 @@ }, "gitee": { + "host": "gitee.com", "1kdist": "simdsoft/1kiss/releases/download", "oboe": "simdsoft/oboe.git", "kcp": "simdsoft/kcp.git",
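Returning to the compute_lowest_and_highest_weight() hunk above: with minidx and maxidx known before the loop, the per-hit "reset tracker" branches disappear and only the error terms are accumulated. A hypothetical scalar model of the simplified inner loop for one angular step (std::round stands in for the vector round()):

#include <cmath>

static inline void step_errors_ref(const float* weights, unsigned int count,
                                   float rcp_stepsize, float offset,
                                   float minidx, float maxidx,
                                   float& errval, float& cut_low, float& cut_high)
{
    errval = cut_low = cut_high = 0.0f;
    for (unsigned int j = 0; j < count; j++)
    {
        float sval = weights[j] * rcp_stepsize - offset;
        float svalrte = std::round(sval);
        float diff = sval - svalrte;
        errval += diff * diff;

        if (svalrte == minidx) cut_low  += 1.0f - 2.0f * diff;
        if (svalrte == maxidx) cut_high += 1.0f + 2.0f * diff;
    }
}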