mirror of https://github.com/axmolengine/axmol.git
712 lines
21 KiB
C++
712 lines
21 KiB
C++
|
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief Functions for finding dominant direction of a set of colors.
 */
|
||
|
#if !defined(ASTCENC_DECOMPRESS_ONLY)
|
||
|
|
||
|
#include "astcenc_internal.h"
|
||
|
|
||
|
#include <cassert>
|
||
|
|
||
|
/* See header for documentation. */
|
||
|
void compute_avgs_and_dirs_4_comp(
|
||
|
const partition_info& pi,
|
||
|
const image_block& blk,
|
||
|
const error_weight_block& ewb,
|
||
|
partition_metrics pm[BLOCK_MAX_PARTITIONS]
|
||
|
) {
|
||
|
int partition_count = pi.partition_count;
|
||
|
promise(partition_count > 0);
|
||
|
|
||
|
for (int partition = 0; partition < partition_count; partition++)
|
||
|
{
|
||
|
const uint8_t *weights = pi.texels_of_partition[partition];
|
||
|
|
||
|
vfloat4 error_sum = vfloat4::zero();
|
||
|
vfloat4 base_sum = vfloat4::zero();
|
||
|
vfloat4 rgba_min(1e38f);
|
||
|
vfloat4 rgba_max(-1e38f);
|
||
|
float partition_weight = 0.0f;
|
||
|
|
||
|
int texel_count = pi.partition_texel_count[partition];
|
||
|
promise(texel_count > 0);
|
||
|
|
||
|
for (int i = 0; i < texel_count; i++)
|
||
|
{
|
||
|
int iwt = weights[i];
|
||
|
float weight = ewb.texel_weight[iwt];
|
||
|
vfloat4 texel_datum = blk.texel(iwt);
|
||
|
vfloat4 error_weight = ewb.error_weights[iwt];
|
||
|
|
||
|
if (weight > 1e-10f)
|
||
|
{
|
||
|
rgba_min = min(texel_datum, rgba_min);
|
||
|
rgba_max = max(texel_datum, rgba_max);
|
||
|
}
|
||
|
|
||
|
partition_weight += weight;
|
||
|
base_sum += texel_datum * weight;
|
||
|
error_sum += error_weight;
|
||
|
}
|
||
|
|
||
|
error_sum = error_sum / texel_count;
|
||
|
vfloat4 csf = normalize(sqrt(error_sum)) * 2.0f;
|
||
|
|
||
|
vfloat4 average = base_sum * (1.0f / astc::max(partition_weight, 1e-7f));
|
||
|
|
||
|
pm[partition].error_weight = error_sum;
|
||
|
pm[partition].avg = average * csf;
|
||
|
pm[partition].color_scale = csf;
|
||
|
pm[partition].icolor_scale = 1.0f / max(csf, 1e-7f);
|
||
|
vfloat4 range = max(rgba_max - rgba_min, 1e-10f);
|
||
|
pm[partition].range_sq = range * range;
|
||
|
|
||
|
vfloat4 sum_xp = vfloat4::zero();
|
||
|
vfloat4 sum_yp = vfloat4::zero();
|
||
|
vfloat4 sum_zp = vfloat4::zero();
|
||
|
vfloat4 sum_wp = vfloat4::zero();
|
||
|
|
||
|
for (int i = 0; i < texel_count; i++)
|
||
|
{
|
||
|
int iwt = weights[i];
|
||
|
float weight = ewb.texel_weight[iwt];
|
||
|
vfloat4 texel_datum = blk.texel(iwt);
|
||
|
texel_datum = (texel_datum - average) * weight;
|
||
|
|
||
|
vfloat4 zero = vfloat4::zero();
|
||
|
|
||
|
vmask4 tdm0 = vfloat4(texel_datum.lane<0>()) > zero;
|
||
|
sum_xp += select(zero, texel_datum, tdm0);
|
||
|
|
||
|
vmask4 tdm1 = vfloat4(texel_datum.lane<1>()) > zero;
|
||
|
sum_yp += select(zero, texel_datum, tdm1);
|
||
|
|
||
|
vmask4 tdm2 = vfloat4(texel_datum.lane<2>()) > zero;
|
||
|
sum_zp += select(zero, texel_datum, tdm2);
|
||
|
|
||
|
vmask4 tdm3 = vfloat4(texel_datum.lane<3>()) > zero;
|
||
|
sum_wp += select(zero, texel_datum, tdm3);
|
||
|
}
|
||
|
|
||
|
float prod_xp = dot_s(sum_xp, sum_xp);
|
||
|
float prod_yp = dot_s(sum_yp, sum_yp);
|
||
|
float prod_zp = dot_s(sum_zp, sum_zp);
|
||
|
float prod_wp = dot_s(sum_wp, sum_wp);
|
||
|
|
||
|
vfloat4 best_vector = sum_xp;
|
||
|
float best_sum = prod_xp;
|
||
|
|
||
|
if (prod_yp > best_sum)
|
||
|
{
|
||
|
best_vector = sum_yp;
|
||
|
best_sum = prod_yp;
|
||
|
}
|
||
|
|
||
|
if (prod_zp > best_sum)
|
||
|
{
|
||
|
best_vector = sum_zp;
|
||
|
best_sum = prod_zp;
|
||
|
}
|
||
|
|
||
|
if (prod_wp > best_sum)
|
||
|
{
|
||
|
best_vector = sum_wp;
|
||
|
}
|
||
|
|
||
|
pm[partition].dir = best_vector;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* See header for documentation. */
|
||
|
void compute_avgs_and_dirs_3_comp(
|
||
|
const partition_info& pi,
|
||
|
const image_block& blk,
|
||
|
const error_weight_block& ewb,
|
||
|
unsigned int omitted_component,
|
||
|
partition_metrics pm[BLOCK_MAX_PARTITIONS]
|
||
|
) {
|
||
|
const float *texel_weights = ewb.texel_weight_rgb;
|
||
|
|
||
|
const float* data_vr = blk.data_r;
|
||
|
const float* data_vg = blk.data_g;
|
||
|
const float* data_vb = blk.data_b;
|
||
|
|
||
|
const float* error_vr = ewb.texel_weight_r;
|
||
|
const float* error_vg = ewb.texel_weight_g;
|
||
|
const float* error_vb = ewb.texel_weight_b;
|
||
|
|
||
|
if (omitted_component == 0)
|
||
|
{
|
||
|
texel_weights = ewb.texel_weight_gba;
|
||
|
|
||
|
data_vr = blk.data_g;
|
||
|
data_vg = blk.data_b;
|
||
|
data_vb = blk.data_a;
|
||
|
|
||
|
error_vr = ewb.texel_weight_g;
|
||
|
error_vg = ewb.texel_weight_b;
|
||
|
error_vb = ewb.texel_weight_a;
|
||
|
}
|
||
|
else if (omitted_component == 1)
|
||
|
{
|
||
|
texel_weights = ewb.texel_weight_rba;
|
||
|
|
||
|
data_vg = blk.data_b;
|
||
|
data_vb = blk.data_a;
|
||
|
|
||
|
error_vg = ewb.texel_weight_b;
|
||
|
error_vb = ewb.texel_weight_a;
|
||
|
}
|
||
|
else if (omitted_component == 2)
|
||
|
{
|
||
|
texel_weights = ewb.texel_weight_rga;
|
||
|
|
||
|
data_vb = blk.data_a;
|
||
|
|
||
|
error_vb = ewb.texel_weight_a;
|
||
|
}
|
||
|
|
||
|
int partition_count = pi.partition_count;
|
||
|
promise(partition_count > 0);
|
||
|
|
||
|
for (int partition = 0; partition < partition_count; partition++)
|
||
|
{
|
||
|
const uint8_t *weights = pi.texels_of_partition[partition];
|
||
|
|
||
|
vfloat4 error_sum = vfloat4::zero();
|
||
|
vfloat4 base_sum = vfloat4::zero();
|
||
|
vfloat4 rgb_min(1e38f);
|
||
|
vfloat4 rgb_max(-1e38f);
|
||
|
float partition_weight = 0.0f;
|
||
|
|
||
|
int texel_count = pi.partition_texel_count[partition];
|
||
|
promise(texel_count > 0);
|
||
|
|
||
|
for (int i = 0; i < texel_count; i++)
|
||
|
{
|
||
|
int iwt = weights[i];
|
||
|
float weight = texel_weights[iwt];
|
||
|
|
||
|
vfloat4 texel_datum(data_vr[iwt],
|
||
|
data_vg[iwt],
|
||
|
data_vb[iwt],
|
||
|
0.0f);
|
||
|
|
||
|
vfloat4 error_weight(error_vr[iwt],
|
||
|
error_vg[iwt],
|
||
|
error_vb[iwt],
|
||
|
0.0f);
|
||
|
|
||
|
if (weight > 1e-10f)
|
||
|
{
|
||
|
rgb_min = min(texel_datum, rgb_min);
|
||
|
rgb_max = max(texel_datum, rgb_max);
|
||
|
}
|
||
|
|
||
|
partition_weight += weight;
|
||
|
base_sum += texel_datum * weight;
|
||
|
error_sum += error_weight;
|
||
|
}
|
||
|
|
||
|
error_sum = error_sum / texel_count;
|
||
|
vfloat4 csf = normalize(sqrt(error_sum)) * 1.73205080f;
|
||
|
|
||
|
vfloat4 average = base_sum * (1.0f / astc::max(partition_weight, 1e-7f));
|
||
|
|
||
|
pm[partition].error_weight = error_sum;
|
||
|
pm[partition].avg = average * csf;
|
||
|
pm[partition].color_scale = csf;
|
||
|
pm[partition].icolor_scale = 1.0f / max(csf, 1e-7f);
|
||
|
vfloat4 range = max(rgb_max - rgb_min, 1e-10f);
|
||
|
pm[partition].range_sq = range * range;
|
||
|
|
||
|
vfloat4 sum_xp = vfloat4::zero();
|
||
|
vfloat4 sum_yp = vfloat4::zero();
|
||
|
vfloat4 sum_zp = vfloat4::zero();
|
||
|
|
||
|
for (int i = 0; i < texel_count; i++)
|
||
|
{
|
||
|
int iwt = weights[i];
|
||
|
float weight = texel_weights[iwt];
|
||
|
vfloat4 texel_datum = vfloat3(data_vr[iwt],
|
||
|
data_vg[iwt],
|
||
|
data_vb[iwt]);
|
||
|
texel_datum = (texel_datum - average) * weight;
|
||
|
|
||
|
vfloat4 zero = vfloat4::zero();
|
||
|
|
||
|
vmask4 tdm0 = vfloat4(texel_datum.lane<0>()) > zero;
|
||
|
sum_xp += select(zero, texel_datum, tdm0);
|
||
|
|
||
|
vmask4 tdm1 = vfloat4(texel_datum.lane<1>()) > zero;
|
||
|
sum_yp += select(zero, texel_datum, tdm1);
|
||
|
|
||
|
vmask4 tdm2 = vfloat4(texel_datum.lane<2>()) > zero;
|
||
|
sum_zp += select(zero, texel_datum, tdm2);
|
||
|
}
|
||
|
|
||
|
float prod_xp = dot3_s(sum_xp, sum_xp);
|
||
|
float prod_yp = dot3_s(sum_yp, sum_yp);
|
||
|
float prod_zp = dot3_s(sum_zp, sum_zp);
|
||
|
|
||
|
vfloat4 best_vector = sum_xp;
|
||
|
float best_sum = prod_xp;
|
||
|
|
||
|
if (prod_yp > best_sum)
|
||
|
{
|
||
|
best_vector = sum_yp;
|
||
|
best_sum = prod_yp;
|
||
|
}
|
||
|
|
||
|
if (prod_zp > best_sum)
|
||
|
{
|
||
|
best_vector = sum_zp;
|
||
|
}
|
||
|
|
||
|
pm[partition].dir = best_vector;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* See header for documentation. */
|
||
|
void compute_avgs_and_dirs_2_comp(
|
||
|
const partition_info& pt,
|
||
|
const image_block& blk,
|
||
|
const error_weight_block& ewb,
|
||
|
unsigned int component1,
|
||
|
unsigned int component2,
|
||
|
partition_metrics pm[BLOCK_MAX_PARTITIONS]
|
||
|
) {
|
||
|
const float *texel_weights;
|
||
|
|
||
|
const float* data_vr = nullptr;
|
||
|
const float* data_vg = nullptr;
|
||
|
|
||
|
const float* error_vr = nullptr;
|
||
|
const float* error_vg = nullptr;
|
||
|
|
||
|
if (component1 == 0 && component2 == 1)
|
||
|
{
|
||
|
texel_weights = ewb.texel_weight_rg;
|
||
|
|
||
|
data_vr = blk.data_r;
|
||
|
data_vg = blk.data_g;
|
||
|
|
||
|
error_vr = ewb.texel_weight_r;
|
||
|
error_vg = ewb.texel_weight_g;
|
||
|
}
|
||
|
else if (component1 == 0 && component2 == 2)
|
||
|
{
|
||
|
texel_weights = ewb.texel_weight_rb;
|
||
|
|
||
|
data_vr = blk.data_r;
|
||
|
data_vg = blk.data_b;
|
||
|
|
||
|
error_vr = ewb.texel_weight_r;
|
||
|
error_vg = ewb.texel_weight_b;
|
||
|
}
|
||
|
else // (component1 == 1 && component2 == 2)
|
||
|
{
|
||
|
assert(component1 == 1 && component2 == 2);
|
||
|
texel_weights = ewb.texel_weight_gb;
|
||
|
|
||
|
data_vr = blk.data_g;
|
||
|
data_vg = blk.data_b;
|
||
|
|
||
|
|
||
|
error_vr = ewb.texel_weight_g;
|
||
|
error_vg = ewb.texel_weight_b;
|
||
|
}
|
||
|
|
||
|
unsigned int partition_count = pt.partition_count;
|
||
|
promise(partition_count > 0);
|
||
|
|
||
|
for (unsigned int partition = 0; partition < partition_count; partition++)
|
||
|
{
|
||
|
const uint8_t *weights = pt.texels_of_partition[partition];
|
||
|
|
||
|
vfloat4 error_sum = vfloat4::zero();
|
||
|
vfloat4 base_sum = vfloat4::zero();
|
||
|
float partition_weight = 0.0f;
|
||
|
|
||
|
unsigned int texel_count = pt.partition_texel_count[partition];
|
||
|
promise(texel_count > 0);
|
||
|
|
||
|
for (unsigned int i = 0; i < texel_count; i++)
|
||
|
{
|
||
|
unsigned int iwt = weights[i];
|
||
|
float weight = texel_weights[iwt];
|
||
|
vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]) * weight;
|
||
|
|
||
|
vfloat4 error_weight = vfloat2(error_vr[iwt], error_vg[iwt]);
|
||
|
|
||
|
partition_weight += weight;
|
||
|
base_sum += texel_datum;
|
||
|
error_sum += error_weight;
|
||
|
}
|
||
|
|
||
|
error_sum = error_sum / texel_count;
|
||
|
vfloat4 csf = normalize(sqrt(error_sum)) * 1.41421356f;
|
||
|
vfloat4 average = base_sum * (1.0f / astc::max(partition_weight, 1e-7f));
|
||
|
|
||
|
|
||
|
pm[partition].error_weight = error_sum;
|
||
|
pm[partition].avg = average * csf;
|
||
|
pm[partition].color_scale = csf;
|
||
|
pm[partition].icolor_scale = 1.0f / max(csf, 1e-7f);
|
||
|
|
||
|
vfloat4 sum_xp = vfloat4::zero();
|
||
|
vfloat4 sum_yp = vfloat4::zero();
|
||
|
|
||
|
for (unsigned int i = 0; i < texel_count; i++)
|
||
|
{
|
||
|
unsigned int iwt = weights[i];
|
||
|
float weight = texel_weights[iwt];
|
||
|
vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]);
|
||
|
texel_datum = (texel_datum - average) * weight;
|
||
|
|
||
|
vfloat4 zero = vfloat4::zero();
|
||
|
|
||
|
vmask4 tdm0 = vfloat4(texel_datum.lane<0>()) > zero;
|
||
|
sum_xp += select(zero, texel_datum, tdm0);
|
||
|
|
||
|
vmask4 tdm1 = vfloat4(texel_datum.lane<1>()) > zero;
|
||
|
sum_yp += select(zero, texel_datum, tdm1);
|
||
|
}
|
||
|
|
||
|
float prod_xp = dot_s(sum_xp, sum_xp);
|
||
|
float prod_yp = dot_s(sum_yp, sum_yp);
|
||
|
|
||
|
vfloat4 best_vector = sum_xp;
|
||
|
float best_sum = prod_xp;
|
||
|
|
||
|
if (prod_yp > best_sum)
|
||
|
{
|
||
|
best_vector = sum_yp;
|
||
|
}
|
||
|
|
||
|
pm[partition].dir = best_vector;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* See header for documentation. */
|
||
|
void compute_error_squared_rgba(
|
||
|
const partition_info& pi,
|
||
|
const image_block& blk,
|
||
|
const error_weight_block& ewb,
|
||
|
const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],
|
||
|
const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],
|
||
|
float uncor_lengths[BLOCK_MAX_PARTITIONS],
|
||
|
float samec_lengths[BLOCK_MAX_PARTITIONS],
|
||
|
float& uncor_error,
|
||
|
float& samec_error
|
||
|
) {
|
||
|
unsigned int partition_count = pi.partition_count;
|
||
|
promise(partition_count > 0);
|
||
|
|
||
|
uncor_error = 0.0f;
|
||
|
samec_error = 0.0f;
|
||
|
|
||
|
for (unsigned int partition = 0; partition < partition_count; partition++)
|
||
|
{
|
||
|
const uint8_t *weights = pi.texels_of_partition[partition];
|
||
|
|
||
|
float uncor_loparam = 1e10f;
|
||
|
float uncor_hiparam = -1e10f;
|
||
|
|
||
|
float samec_loparam = 1e10f;
|
||
|
float samec_hiparam = -1e10f;
|
||
|
|
||
|
processed_line4 l_uncor = uncor_plines[partition];
|
||
|
processed_line4 l_samec = samec_plines[partition];
|
||
|
|
||
|
unsigned int texel_count = pi.partition_texel_count[partition];
|
||
|
promise(texel_count > 0);
|
||
|
|
||
|
// Vectorize some useful scalar inputs
|
||
|
vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
|
||
|
vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
|
||
|
vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
|
||
|
vfloat l_uncor_bs3(l_uncor.bs.lane<3>());
|
||
|
|
||
|
vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
|
||
|
vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
|
||
|
vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
|
||
|
vfloat l_uncor_amod3(l_uncor.amod.lane<3>());
|
||
|
|
||
|
vfloat l_uncor_bis0(l_uncor.bis.lane<0>());
|
||
|
vfloat l_uncor_bis1(l_uncor.bis.lane<1>());
|
||
|
vfloat l_uncor_bis2(l_uncor.bis.lane<2>());
|
||
|
vfloat l_uncor_bis3(l_uncor.bis.lane<3>());
|
||
|
|
||
|
vfloat l_samec_bs0(l_samec.bs.lane<0>());
|
||
|
vfloat l_samec_bs1(l_samec.bs.lane<1>());
|
||
|
vfloat l_samec_bs2(l_samec.bs.lane<2>());
|
||
|
vfloat l_samec_bs3(l_samec.bs.lane<3>());
|
||
|
|
||
|
assert(all(l_samec.amod == vfloat4(0.0f)));
|
||
|
|
||
|
vfloat l_samec_bis0(l_samec.bis.lane<0>());
|
||
|
vfloat l_samec_bis1(l_samec.bis.lane<1>());
|
||
|
vfloat l_samec_bis2(l_samec.bis.lane<2>());
|
||
|
vfloat l_samec_bis3(l_samec.bis.lane<3>());
|
||
|
|
||
|
vfloat uncor_loparamv(1e10f);
|
||
|
vfloat uncor_hiparamv(-1e10f);
|
||
|
vfloat4 uncor_errorsumv = vfloat4::zero();
|
||
|
|
||
|
vfloat samec_loparamv(1e10f);
|
||
|
vfloat samec_hiparamv(-1e10f);
|
||
|
vfloat4 samec_errorsumv = vfloat4::zero();
|
||
|
|
||
|
// This implementation over-shoots, but this is safe as we initialize the weights array
|
||
|
// to extend the last value. This means min/max are not impacted, but we need to mask
|
||
|
// out the dummy values when we compute the line weighting.
|
||
|
vint lane_ids = vint::lane_id();
|
||
|
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
|
||
|
{
|
||
|
vmask mask = lane_ids < vint(texel_count);
|
||
|
vint texel_idxs(&(weights[i]));
|
||
|
|
||
|
vfloat data_r = gatherf(blk.data_r, texel_idxs);
|
||
|
vfloat data_g = gatherf(blk.data_g, texel_idxs);
|
||
|
vfloat data_b = gatherf(blk.data_b, texel_idxs);
|
||
|
vfloat data_a = gatherf(blk.data_a, texel_idxs);
|
||
|
|
||
|
vfloat ew_r = gatherf(ewb.texel_weight_r, texel_idxs);
|
||
|
vfloat ew_g = gatherf(ewb.texel_weight_g, texel_idxs);
|
||
|
vfloat ew_b = gatherf(ewb.texel_weight_b, texel_idxs);
|
||
|
vfloat ew_a = gatherf(ewb.texel_weight_a, texel_idxs);
|
||
|
|
||
|
vfloat uncor_param = (data_r * l_uncor_bs0)
|
||
|
+ (data_g * l_uncor_bs1)
|
||
|
+ (data_b * l_uncor_bs2)
|
||
|
+ (data_a * l_uncor_bs3);
|
||
|
|
||
|
uncor_loparamv = min(uncor_param, uncor_loparamv);
|
||
|
uncor_hiparamv = max(uncor_param, uncor_hiparamv);
|
||
|
|
||
|
vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
|
||
|
+ (uncor_param * l_uncor_bis0);
|
||
|
vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
|
||
|
+ (uncor_param * l_uncor_bis1);
|
||
|
vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
|
||
|
+ (uncor_param * l_uncor_bis2);
|
||
|
vfloat uncor_dist3 = (l_uncor_amod3 - data_a)
|
||
|
+ (uncor_param * l_uncor_bis3);
|
||
|
|
||
|
vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
|
||
|
+ (ew_g * uncor_dist1 * uncor_dist1)
|
||
|
+ (ew_b * uncor_dist2 * uncor_dist2)
|
||
|
+ (ew_a * uncor_dist3 * uncor_dist3);
|
||
|
|
||
|
uncor_err = select(vfloat::zero(), uncor_err, mask);
|
||
|
haccumulate(uncor_errorsumv, uncor_err);
|
||
|
|
||
|
// Process samechroma data
|
||
|
vfloat samec_param = (data_r * l_samec_bs0)
|
||
|
+ (data_g * l_samec_bs1)
|
||
|
+ (data_b * l_samec_bs2)
|
||
|
+ (data_a * l_samec_bs3);
|
||
|
|
||
|
samec_loparamv = min(samec_param, samec_loparamv);
|
||
|
samec_hiparamv = max(samec_param, samec_hiparamv);
|
||
|
|
||
|
vfloat samec_dist0 = samec_param * l_samec_bis0 - data_r;
|
||
|
vfloat samec_dist1 = samec_param * l_samec_bis1 - data_g;
|
||
|
vfloat samec_dist2 = samec_param * l_samec_bis2 - data_b;
|
||
|
vfloat samec_dist3 = samec_param * l_samec_bis3 - data_a;
|
||
|
|
||
|
vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
|
||
|
+ (ew_g * samec_dist1 * samec_dist1)
|
||
|
+ (ew_b * samec_dist2 * samec_dist2)
|
||
|
+ (ew_a * samec_dist3 * samec_dist3);
|
||
|
|
||
|
samec_err = select(vfloat::zero(), samec_err, mask);
|
||
|
haccumulate(samec_errorsumv, samec_err);
|
||
|
|
||
|
lane_ids = lane_ids + vint(ASTCENC_SIMD_WIDTH);
|
||
|
}
|
||
|
|
||
|
uncor_loparam = hmin_s(uncor_loparamv);
|
||
|
uncor_hiparam = hmax_s(uncor_hiparamv);
|
||
|
|
||
|
samec_loparam = hmin_s(samec_loparamv);
|
||
|
samec_hiparam = hmax_s(samec_hiparamv);
|
||
|
|
||
|
// Resolve the final scalar accumulator sum
|
||
|
haccumulate(uncor_error, uncor_errorsumv);
|
||
|
haccumulate(samec_error, samec_errorsumv);
|
||
|
|
||
|
float uncor_linelen = uncor_hiparam - uncor_loparam;
|
||
|
float samec_linelen = samec_hiparam - samec_loparam;
|
||
|
|
||
|
// Turn very small numbers and NaNs into a small number
|
||
|
uncor_lengths[partition] = astc::max(uncor_linelen, 1e-7f);
|
||
|
samec_lengths[partition] = astc::max(samec_linelen, 1e-7f);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* See header for documentation. */
|
||
|
void compute_error_squared_rgb(
|
||
|
const partition_info& pi,
|
||
|
const image_block& blk,
|
||
|
const error_weight_block& ewb,
|
||
|
partition_lines3 plines[BLOCK_MAX_PARTITIONS],
|
||
|
float& uncor_error,
|
||
|
float& samec_error
|
||
|
) {
|
||
|
unsigned int partition_count = pi.partition_count;
|
||
|
promise(partition_count > 0);
|
||
|
|
||
|
uncor_error = 0.0f;
|
||
|
samec_error = 0.0f;
|
||
|
|
||
|
for (unsigned int partition = 0; partition < partition_count; partition++)
|
||
|
{
|
||
|
partition_lines3& pl = plines[partition];
|
||
|
const uint8_t *weights = pi.texels_of_partition[partition];
|
||
|
unsigned int texel_count = pi.partition_texel_count[partition];
|
||
|
promise(texel_count > 0);
|
||
|
|
||
|
float uncor_loparam = 1e10f;
|
||
|
float uncor_hiparam = -1e10f;
|
||
|
|
||
|
float samec_loparam = 1e10f;
|
||
|
float samec_hiparam = -1e10f;
|
||
|
|
||
|
processed_line3 l_uncor = pl.uncor_pline;
|
||
|
processed_line3 l_samec = pl.samec_pline;
|
||
|
|
||
|
// This implementation is an example vectorization of this function.
|
||
|
// It works for - the codec is a 2-4% faster than not vectorizing - but
|
||
|
// the benefit is limited by the use of gathers and register pressure
|
||
|
|
||
|
// Vectorize some useful scalar inputs
|
||
|
vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
|
||
|
vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
|
||
|
vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
|
||
|
|
||
|
vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
|
||
|
vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
|
||
|
vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
|
||
|
|
||
|
vfloat l_uncor_bis0(l_uncor.bis.lane<0>());
|
||
|
vfloat l_uncor_bis1(l_uncor.bis.lane<1>());
|
||
|
vfloat l_uncor_bis2(l_uncor.bis.lane<2>());
|
||
|
|
||
|
vfloat l_samec_bs0(l_samec.bs.lane<0>());
|
||
|
vfloat l_samec_bs1(l_samec.bs.lane<1>());
|
||
|
vfloat l_samec_bs2(l_samec.bs.lane<2>());
|
||
|
|
||
|
assert(all(l_samec.amod == vfloat4(0.0f)));
|
||
|
|
||
|
vfloat l_samec_bis0(l_samec.bis.lane<0>());
|
||
|
vfloat l_samec_bis1(l_samec.bis.lane<1>());
|
||
|
vfloat l_samec_bis2(l_samec.bis.lane<2>());
|
||
|
|
||
|
vfloat uncor_loparamv(1e10f);
|
||
|
vfloat uncor_hiparamv(-1e10f);
|
||
|
vfloat4 uncor_errorsumv = vfloat4::zero();
|
||
|
|
||
|
vfloat samec_loparamv(1e10f);
|
||
|
vfloat samec_hiparamv(-1e10f);
|
||
|
vfloat4 samec_errorsumv = vfloat4::zero();
|
||
|
|
||
|
// This implementation over-shoots, but this is safe as we initialize the weights array
|
||
|
// to extend the last value. This means min/max are not impacted, but we need to mask
|
||
|
// out the dummy values when we compute the line weighting.
|
||
|
vint lane_ids = vint::lane_id();
|
||
|
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
|
||
|
{
|
||
|
vmask mask = lane_ids < vint(texel_count);
|
||
|
vint texel_idxs(&(weights[i]));
|
||
|
|
||
|
vfloat data_r = gatherf(blk.data_r, texel_idxs);
|
||
|
vfloat data_g = gatherf(blk.data_g, texel_idxs);
|
||
|
vfloat data_b = gatherf(blk.data_b, texel_idxs);
|
||
|
|
||
|
vfloat ew_r = gatherf(ewb.texel_weight_r, texel_idxs);
|
||
|
vfloat ew_g = gatherf(ewb.texel_weight_g, texel_idxs);
|
||
|
vfloat ew_b = gatherf(ewb.texel_weight_b, texel_idxs);
|
||
|
|
||
|
vfloat uncor_param = (data_r * l_uncor_bs0)
|
||
|
+ (data_g * l_uncor_bs1)
|
||
|
+ (data_b * l_uncor_bs2);
|
||
|
|
||
|
uncor_loparamv = min(uncor_param, uncor_loparamv);
|
||
|
uncor_hiparamv = max(uncor_param, uncor_hiparamv);
|
||
|
|
||
|
vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
|
||
|
+ (uncor_param * l_uncor_bis0);
|
||
|
vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
|
||
|
+ (uncor_param * l_uncor_bis1);
|
||
|
vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
|
||
|
+ (uncor_param * l_uncor_bis2);
|
||
|
|
||
|
vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
|
||
|
+ (ew_g * uncor_dist1 * uncor_dist1)
|
||
|
+ (ew_b * uncor_dist2 * uncor_dist2);
|
||
|
|
||
|
uncor_err = select(vfloat::zero(), uncor_err, mask);
|
||
|
haccumulate(uncor_errorsumv, uncor_err);
|
||
|
|
||
|
// Process samechroma data
|
||
|
vfloat samec_param = (data_r * l_samec_bs0)
|
||
|
+ (data_g * l_samec_bs1)
|
||
|
+ (data_b * l_samec_bs2);
|
||
|
|
||
|
samec_loparamv = min(samec_param, samec_loparamv);
|
||
|
samec_hiparamv = max(samec_param, samec_hiparamv);
|
||
|
|
||
|
|
||
|
vfloat samec_dist0 = samec_param * l_samec_bis0 - data_r;
|
||
|
vfloat samec_dist1 = samec_param * l_samec_bis1 - data_g;
|
||
|
vfloat samec_dist2 = samec_param * l_samec_bis2 - data_b;
|
||
|
|
||
|
vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
|
||
|
+ (ew_g * samec_dist1 * samec_dist1)
|
||
|
+ (ew_b * samec_dist2 * samec_dist2);
|
||
|
|
||
|
samec_err = select(vfloat::zero(), samec_err, mask);
|
||
|
haccumulate(samec_errorsumv, samec_err);
|
||
|
|
||
|
lane_ids = lane_ids + vint(ASTCENC_SIMD_WIDTH);
|
||
|
}
|
||
|
|
||
|
uncor_loparam = hmin_s(uncor_loparamv);
|
||
|
uncor_hiparam = hmax_s(uncor_hiparamv);
|
||
|
|
||
|
samec_loparam = hmin_s(samec_loparamv);
|
||
|
samec_hiparam = hmax_s(samec_hiparamv);
|
||
|
|
||
|
// Resolve the final scalar accumulator sum
|
||
|
haccumulate(uncor_error, uncor_errorsumv);
|
||
|
haccumulate(samec_error, samec_errorsumv);
|
||
|
|
||
|
float uncor_linelen = uncor_hiparam - uncor_loparam;
|
||
|
float samec_linelen = samec_hiparam - samec_loparam;
|
||
|
|
||
|
// Turn very small numbers and NaNs into a small number
|
||
|
pl.uncor_line_len = astc::max(uncor_linelen, 1e-7f);
|
||
|
pl.samec_line_len = astc::max(samec_linelen, 1e-7f);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#endif
|