axmol/external/astc/astc_weight_align.cpp

541 lines
18 KiB
C++

// ----------------------------------------------------------------------------
// This confidential and proprietary software may be used only as authorised
// by a licensing agreement from Arm Limited.
// (C) COPYRIGHT 2011-2020 Arm Limited, ALL RIGHTS RESERVED
// The entire notice above must be reproduced on all authorised copies and
// copies may only be made to the extent permitted by a licensing agreement
// from Arm Limited.
// ----------------------------------------------------------------------------
/**
* @brief Functions for angular-sum algorithm for weight alignment.
*
* This algorithm works as follows:
* - we compute a complex number P as (cos s*i, sin s*i) for each weight,
* where i is the input value and s is a scaling factor based on the spacing
* between the weights.
* - we then add together complex numbers for all the weights.
* - we then compute the length and angle of the resulting sum.
*
* This should produce the following results:
* - perfect alignment results in a vector whose length is equal to the sum of
* lengths of all inputs
* - even distribution results in a vector of length 0.
* - all samples identical results in perfect alignment for every scaling.
*
* For each scaling factor within a given set, we compute an alignment factor
* from 0 to 1. This should then result in some scalings standing out as having
* particularly good alignment factors; we can use this to produce a set of
* candidate scale/shift values for various quantization levels; we should then
* actually try them and see what happens.
*
* Assuming N quantization steps, the scaling factor becomes s=2*PI*(N-1); we
* should probably have about 1 scaling factor for every 1/4 quantization step
* (perhaps 1/8 for low levels of quantization).
*/
#include "astc_codec_internals.h"
#include <stdio.h>
// Candidate "steppings" (reciprocal step sizes) examined by the angular
// search. The table is finer at the low end: steps of 1/8 in [1, 2),
// steps of 1/4 in [2, 8) and steps of 1/2 in [8, 36).
static const float angular_steppings[] = {
	// 1/8 resolution
	1.0f, 1.125f, 1.25f, 1.375f, 1.5f, 1.625f, 1.75f, 1.875f,

	// 1/4 resolution
	2.0f, 2.25f, 2.5f, 2.75f, 3.0f, 3.25f, 3.5f, 3.75f,
	4.0f, 4.25f, 4.5f, 4.75f, 5.0f, 5.25f, 5.5f, 5.75f,
	6.0f, 6.25f, 6.5f, 6.75f, 7.0f, 7.25f, 7.5f, 7.75f,

	// 1/2 resolution
	8.0f, 8.5f, 9.0f, 9.5f, 10.0f, 10.5f, 11.0f, 11.5f,
	12.0f, 12.5f, 13.0f, 13.5f, 14.0f, 14.5f, 15.0f, 15.5f,
	16.0f, 16.5f, 17.0f, 17.5f, 18.0f, 18.5f, 19.0f, 19.5f,
	20.0f, 20.5f, 21.0f, 21.5f, 22.0f, 22.5f, 23.0f, 23.5f,
	24.0f, 24.5f, 25.0f, 25.5f, 26.0f, 26.5f, 27.0f, 27.5f,
	28.0f, 28.5f, 29.0f, 29.5f, 30.0f, 30.5f, 31.0f, 31.5f,
	32.0f, 32.5f, 33.0f, 33.5f, 34.0f, 34.5f, 35.0f, 35.5f
};

// Number of entries in angular_steppings.
#define ANGULAR_STEPS ((int)(sizeof(angular_steppings)/sizeof(angular_steppings[0])))

// Values derived from angular_steppings; all of the tables below are
// filled in by prepare_angular_tables().
static float stepsizes[ANGULAR_STEPS];      // 1 / stepping
static float stepsizes_sqr[ANGULAR_STEPS];  // (1 / stepping)^2

// For each of the 13 quantization levels, how many leading entries of the
// stepping table have to be examined.
static int max_angular_steps_needed_for_quant_level[13];

// Sine/cosine values are tabulated for 64 possible weight values rather
// than calling sin() and cos() per sample; this costs a little quality.
#define SINCOS_STEPS 64

static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
// Populate the file-scope lookup tables used by the angular search:
//  - stepsizes / stepsizes_sqr: reciprocal (and squared reciprocal) of
//    every entry in angular_steppings;
//  - sin_table / cos_table: sine and cosine of
//    2*PI * stepping * row / (SINCOS_STEPS - 1) for each table row;
//  - max_angular_steps_needed_for_quant_level: per quantization level, the
//    number of leading steppings that need to be searched.
void prepare_angular_tables(void)
{
	// yes, the next-to-last entry is supposed to have the value 33. This because under
	// ASTC, the 32-weight mode leaves a double-sized hole in the middle of the
	// weight space, so we are better off matching 33 weights than 32.
	static const int steps_of_level[] = { 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 33, 36 };
	const int level_count = (int)(sizeof(steps_of_level) / sizeof(steps_of_level[0]));

	// Scratch table indexed by a quantization step count. The loop below
	// writes entries 2..36, which covers every value in steps_of_level.
	int max_angular_steps_needed_for_quant_steps[40];

	for (int step = 0; step < ANGULAR_STEPS; step++)
	{
		const float stepping = angular_steppings[step];

		stepsizes[step] = 1.0f / stepping;
		stepsizes_sqr[step] = stepsizes[step] * stepsizes[step];

		// Later steppings with the same integer part overwrite earlier
		// ones, so each slot ends up describing the last such stepping.
		const int quant_steps = static_cast < int >(floor(stepping)) + 1;
		max_angular_steps_needed_for_quant_steps[quant_steps] = MIN(step + 1, ANGULAR_STEPS - 1);
	}

	for (int row = 0; row < SINCOS_STEPS; row++)
	{
		for (int step = 0; step < ANGULAR_STEPS; step++)
		{
			sin_table[row][step] = static_cast < float >(sin((2.0f * M_PI / (SINCOS_STEPS - 1.0f)) * angular_steppings[step] * row));
			cos_table[row][step] = static_cast < float >(cos((2.0f * M_PI / (SINCOS_STEPS - 1.0f)) * angular_steppings[step] * row));
		}
	}

	for (int level = 0; level < level_count; level++)
	{
		max_angular_steps_needed_for_quant_level[level] = max_angular_steps_needed_for_quant_steps[steps_of_level[level]];
	}
}
// Overlay of a float with a signed and an unsigned 32-bit integer. Code in
// this file stores a value through 'f' and then reads its raw bit pattern
// through 'u' (assumes float is a 32-bit type - TODO confirm for all targets).
union if32
{
float f; // the value viewed as a float
int32_t s; // the same storage viewed as a signed 32-bit integer
uint32_t u; // the same storage viewed as an unsigned 32-bit integer
};
// function to compute angular sums; then, from the
// angular sums, compute alignment factor and offset.
/* static inline */
void compute_angular_offsets(int samplecount, const float *samples, const float *sample_weights, int max_angular_steps, float *offsets)
{
int i, j;
float anglesum_x[ANGULAR_STEPS];
float anglesum_y[ANGULAR_STEPS];
for (i = 0; i < max_angular_steps; i++)
{
anglesum_x[i] = 0;
anglesum_y[i] = 0;
}
// compute the angle-sums.
for (i = 0; i < samplecount; i++)
{
float sample = samples[i];
float sample_weight = sample_weights[i];
if32 p;
p.f = (sample * (SINCOS_STEPS - 1.0f)) + 12582912.0f;
unsigned int isample = p.u & 0x3F;
const float *sinptr = sin_table[isample];
const float *cosptr = cos_table[isample];
for (j = 0; j < max_angular_steps; j++)
{
float cp = cosptr[j];
float sp = sinptr[j];
anglesum_x[j] += cp * sample_weight;
anglesum_y[j] += sp * sample_weight;
}
}
// post-process the angle-sums
for (i = 0; i < max_angular_steps; i++)
{
float angle = atan2(anglesum_y[i], anglesum_x[i]); // positive angle -> positive offset
offsets[i] = angle * (stepsizes[i] * (1.0f / (2.0f * (float)M_PI)));
}
}
// For each candidate step size (index sp) and its offset, quantize every
// sample and report:
//   lowest_weight[sp], highest_weight[sp]
//       the smallest and largest integer weight produced, after the lookup
//       table below has clamped it to the range [-12, 43];
//   error[sp]
//       the sum over all samples of sample_weights[i] * dif * dif, where dif
//       is the difference between the scaled sample and its rounded value;
//   cut_low_weight_error[sp]
//       the additional error from forcing the samples that landed on the
//       lowest weight one step up;
//   cut_high_weight_error[sp]
//       the additional error from forcing the samples that landed on the
//       highest weight one step down.
// All three error outputs are multiplied by stepsizes_sqr[sp] before the
// function returns. Only entries [0, max_angular_steps) of the output arrays
// are written.
void compute_lowest_and_highest_weight(int samplecount, const float *samples, const float *sample_weights,
int max_angular_steps, const float *offsets,
int8_t * lowest_weight, int8_t * highest_weight,
float *error, float *cut_low_weight_error, float *cut_high_weight_error)
{
// Per-weight accumulators, indexed by (weight + 12). For each weight slot:
//   ..._either_way collects the sum of sample weights,
//   ..._down       collects the sum of (dif * sample weight).
// They are zeroed here once and re-zeroed at the end of every pass.
float error_from_forcing_weight_down[60];
float error_from_forcing_weight_either_way[60];
for (int i = 0; i < 60; i++)
{
error_from_forcing_weight_down[i] = 0;
error_from_forcing_weight_either_way[i] = 0;
}
// Lookup table yielding (weight + 12), indexed by the low 7 bits of the
// float bit pattern produced by the magic-constant add below. Read as a
// 7-bit two's complement number those bits are the rounded weight:
//   entries   0..43  -> 12..55 (weights 0..43),
//   entries  44..63  -> 55     (weights above 43 are clamped to 43),
//   entries  64..116 -> 0      (weights of -12 and below are clamped to -12),
//   entries 117..127 -> 1..11  (weights -11..-1).
static const unsigned int idxtab[128] = {
12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30, 31, 32, 33, 34, 35,
36, 37, 38, 39, 40, 41, 42, 43,
44, 45, 46, 47, 48, 49, 50, 51,
52, 53, 54, 55, 55, 55, 55, 55,
55, 55, 55, 55, 55, 55, 55, 55,
55, 55, 55, 55, 55, 55, 55, 55,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 2, 3,
4, 5, 6, 7, 8, 9, 10, 11
};
for (int sp = 0; sp < max_angular_steps; sp++)
{
// Start min at the largest and max at the smallest possible table value
// so that the first sample processed replaces both.
unsigned int minidx_bias12 = 55;
unsigned int maxidx_bias12 = 0;
float errval = 0.0f;
// angular_steppings[] holds the reciprocals of stepsizes[], so multiplying
// by it converts a sample into units of the step size.
float rcp_stepsize = angular_steppings[sp];
float offset = offsets[sp];
float scaled_offset = rcp_stepsize * offset;
// Main loop, manually unrolled to handle two samples per iteration; an
// odd trailing sample is handled separately afterwards.
for (int i = 0; i < samplecount - 1; i += 2)
{
float wt1 = sample_weights[i];
float wt2 = sample_weights[i + 1];
if32 p1, p2;
float sval1 = (samples[i] * rcp_stepsize) - scaled_offset;
float sval2 = (samples[i + 1] * rcp_stepsize) - scaled_offset;
// Adding 12582912.0f (1.5 * 2^23) rounds the value to an integer held in
// the low mantissa bits; subtracting it again yields that integer as a
// float (assumes IEEE-754 single precision).
p1.f = sval1 + 12582912.0f; // FP representation abuse to avoid floor() and float->int conversion
p2.f = sval2 + 12582912.0f; // FP representation abuse to avoid floor() and float->int conversion
float isval1 = p1.f - 12582912.0f;
float isval2 = p2.f - 12582912.0f;
// dif is the rounding residual of the scaled sample.
float dif1 = sval1 - isval1;
float dif2 = sval2 - isval2;
errval += (dif1 * wt1) * dif1;
errval += (dif2 * wt2) * dif2;
// table lookups that really perform a minmax function.
unsigned int idx1_bias12 = idxtab[p1.u & 0x7F];
unsigned int idx2_bias12 = idxtab[p2.u & 0x7F];
if (idx1_bias12 < minidx_bias12)
minidx_bias12 = idx1_bias12;
if (idx1_bias12 > maxidx_bias12)
maxidx_bias12 = idx1_bias12;
if (idx2_bias12 < minidx_bias12)
minidx_bias12 = idx2_bias12;
if (idx2_bias12 > maxidx_bias12)
maxidx_bias12 = idx2_bias12;
error_from_forcing_weight_either_way[idx1_bias12] += wt1;
error_from_forcing_weight_down[idx1_bias12] += (dif1 * wt1);
error_from_forcing_weight_either_way[idx2_bias12] += wt2;
error_from_forcing_weight_down[idx2_bias12] += (dif2 * wt2);
}
// Odd sample count: process the final sample with the same steps as above.
if (samplecount & 1)
{
int i = samplecount - 1;
float wt = sample_weights[i];
if32 p;
float sval = (samples[i] * rcp_stepsize) - scaled_offset;
p.f = sval + 12582912.0f; // FP representation abuse to avoid floor() and float->int conversion
float isval = p.f - 12582912.0f;
float dif = sval - isval;
errval += (dif * wt) * dif;
unsigned int idx_bias12 = idxtab[p.u & 0x7F];
if (idx_bias12 < minidx_bias12)
minidx_bias12 = idx_bias12;
if (idx_bias12 > maxidx_bias12)
maxidx_bias12 = idx_bias12;
error_from_forcing_weight_either_way[idx_bias12] += wt;
error_from_forcing_weight_down[idx_bias12] += dif * wt;
}
// Remove the +12 bias to recover the signed weight values.
lowest_weight[sp] = (int)minidx_bias12 - 12;
highest_weight[sp] = (int)maxidx_bias12 - 12;
error[sp] = errval;
// the cut_(lowest/highest)_weight_error indicate the error that results from
// forcing samples that should have had the (lowest/highest) weight value
// one step (up/down).
// Moving a sample one step up changes its error from wt*dif^2 to
// wt*(dif-1)^2, a difference of wt - 2*wt*dif; moving it one step down
// gives wt*(dif+1)^2, a difference of wt + 2*wt*dif.
cut_low_weight_error[sp] = error_from_forcing_weight_either_way[minidx_bias12] - 2.0f * error_from_forcing_weight_down[minidx_bias12];
cut_high_weight_error[sp] = error_from_forcing_weight_either_way[maxidx_bias12] + 2.0f * error_from_forcing_weight_down[maxidx_bias12];
// clear out the error-from-forcing values we actually used in this pass
// so that these are clean for the next pass.
// The start index is rounded down to a multiple of 4 and entries are
// cleared four at a time; the highest slot touched is 55, inside the
// 60-entry accumulator arrays.
for (unsigned int ui = minidx_bias12 & ~0x3; ui <= maxidx_bias12; ui += 4)
{
error_from_forcing_weight_either_way[ui] = 0;
error_from_forcing_weight_down[ui] = 0;
error_from_forcing_weight_either_way[ui + 1] = 0;
error_from_forcing_weight_down[ui + 1] = 0;
error_from_forcing_weight_either_way[ui + 2] = 0;
error_from_forcing_weight_down[ui + 2] = 0;
error_from_forcing_weight_either_way[ui + 3] = 0;
error_from_forcing_weight_down[ui + 3] = 0;
}
}
// The errors above were accumulated in units of the step size; multiply by
// the squared step size to rescale them.
for (int sp = 0; sp < max_angular_steps; sp++)
{
float errscale = stepsizes_sqr[sp];
error[sp] *= errscale;
cut_low_weight_error[sp] *= errscale;
cut_high_weight_error[sp] *= errscale;
}
}
// Record a candidate (error, stepping index, cut-low flag) in the given
// slot if its error is strictly lower than the one currently stored there.
static void keep_better_candidate(float *best_errors, int *best_scale, uint8_t *cut_low_weight,
		int slot, float candidate_error, int scale, uint8_t cut_low)
{
	if (best_errors[slot] > candidate_error)
	{
		best_errors[slot] = candidate_error;
		best_scale[slot] = scale;
		cut_low_weight[slot] = cut_low;
	}
}

// main function for running the angular algorithm.
//
// For every quantization level from 0 up to and including
// max_quantization_level, pick the stepping/offset pair with the lowest
// error and write the resulting endpoint pair to low_value[level] and
// high_value[level]. If no stepping was recorded for a level, an error
// message is printed and the process exits with status 1.
void compute_angular_endpoints_for_quantization_levels(int samplecount, const float *samples, const float *sample_weights, int max_quantization_level, float low_value[12], float high_value[12])
{
	static const int quantization_steps_for_level[13] = { 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 33, 36 };
	static const int ql_weights[12] = { 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 33 };

	// The search itself runs one level above the requested maximum
	// (temporary increase - needs refinement).
	const int search_level = max_quantization_level + 1;
	const int max_quantization_steps = quantization_steps_for_level[search_level];
	const int max_angular_steps = max_angular_steps_needed_for_quant_level[search_level];

	float offsets[ANGULAR_STEPS];
	compute_angular_offsets(samplecount, samples, sample_weights, max_angular_steps, offsets);

	// the +4 offsets are to allow for vectorization within compute_lowest_and_highest_weight().
	int8_t lowest_weight[ANGULAR_STEPS + 4];
	int8_t highest_weight[ANGULAR_STEPS + 4];
	float error[ANGULAR_STEPS + 4];
	float cut_low_weight_error[ANGULAR_STEPS + 4];
	float cut_high_weight_error[ANGULAR_STEPS + 4];
	compute_lowest_and_highest_weight(samplecount, samples, sample_weights, max_angular_steps, offsets, lowest_weight, highest_weight, error, cut_low_weight_error, cut_high_weight_error);

#ifdef DEBUG_PRINT_DIAGNOSTICS
	if (print_diagnostics)
	{
		printf("%s : max-angular-steps=%d \n", __func__, max_angular_steps);
		printf("Samplecount=%d, max_quantization_level=%d\n", samplecount, search_level);
		for (int s = 0; s < samplecount; s++)
			printf("Sample %d : %f (weight %f)\n", s, samples[s], sample_weights[s]);
		for (int step = 0; step < max_angular_steps; step++)
		{
			printf("%d: offset=%f error=%f lowest=%d highest=%d cl=%f ch=%f\n", step, offsets[step], error[step], lowest_weight[step], highest_weight[step], cut_low_weight_error[step], cut_high_weight_error[step]);
		}
		printf("\n");
	}
#endif

	// Best candidate found so far for each weight count (array index).
	float best_errors[40];
	int best_scale[40];
	uint8_t cut_low_weight[40];

	const int slot_count = max_quantization_steps + 4;
	for (int slot = 0; slot < slot_count; slot++)
	{
		best_errors[slot] = 1e30f;
		best_scale[slot] = -1; // Indicates no solution found
		cut_low_weight[slot] = 0;
	}

	// for each stepping, offer its error terms to the matching weight counts.
	for (int step = 0; step < max_angular_steps; step++)
	{
		int weight_span = highest_weight[step] - lowest_weight[step] + 1;
		if (weight_span >= slot_count)
		{
			continue;
		}
		if (weight_span < 2)
		{
			weight_span = 2;
		}

		const float error_cut_low = error[step] + cut_low_weight_error[step];
		const float error_cut_high = error[step] + cut_high_weight_error[step];
		const float error_cut_low_high = error[step] + cut_low_weight_error[step] + cut_high_weight_error[step];

		// Using the full span as-is.
		keep_better_candidate(best_errors, best_scale, cut_low_weight, weight_span, error[step], step, 0);
		// One weight fewer: drop the lowest weight first, then try dropping the highest.
		keep_better_candidate(best_errors, best_scale, cut_low_weight, weight_span - 1, error_cut_low, step, 1);
		keep_better_candidate(best_errors, best_scale, cut_low_weight, weight_span - 1, error_cut_high, step, 0);
		// Two weights fewer: drop both the lowest and the highest weight.
		keep_better_candidate(best_errors, best_scale, cut_low_weight, weight_span - 2, error_cut_low_high, step, 1);
	}

	// if we got a better error-value for a low sample count than for a high one,
	// use the low sample count error value for the higher sample count as well.
	for (int steps = 3; steps <= max_quantization_steps; steps++)
	{
		if (best_errors[steps] > best_errors[steps - 1])
		{
			best_errors[steps] = best_errors[steps - 1];
			best_scale[steps] = best_scale[steps - 1];
			cut_low_weight[steps] = cut_low_weight[steps - 1];
		}
	}

	// Emit endpoints for the levels the caller actually asked for.
	for (int level = 0; level <= max_quantization_level; level++)
	{
		const int q = ql_weights[level];
		const int bsi = best_scale[q];

		// Did we find anything?
		if (bsi < 0)
		{
			printf("ERROR: Unable to find an encoding within the specified error limits. Please revise the error limit values and try again.\n");
			exit(1);
		}

		const float stepsize = stepsizes[bsi];
		const float offset = offsets[bsi];
		const int lwi = lowest_weight[bsi] + cut_low_weight[q];
		const int hwi = lwi + q - 1;

		low_value[level] = offset + lwi * stepsize;
		high_value[level] = offset + hwi * stepsize;
	}
}
// Compute ideal angular endpoints for every single-plane block mode of a
// block size descriptor.
//
// First, each usable decimation mode has its endpoints computed for all
// quantization levels up to that mode's single-plane maximum. Then each
// single-plane block mode whose percentile does not exceed mode_cutoff picks
// the entry for its decimation mode and quantization mode and stores it in
// low_value[] / high_value[]. Entries for other block modes are not written.
void compute_angular_endpoints_1plane(float mode_cutoff, const block_size_descriptor * bsd,
		const float *decimated_quantized_weights, const float *decimated_weights,
		float low_value[MAX_WEIGHT_MODES], float high_value[MAX_WEIGHT_MODES])
{
	float low_values[MAX_DECIMATION_MODES][12];
	float high_values[MAX_DECIMATION_MODES][12];

	for (int dm = 0; dm < MAX_DECIMATION_MODES; dm++)
	{
		const int samplecount = bsd->decimation_mode_samples[dm];
		const int quant_mode = bsd->decimation_mode_maxprec_1plane[dm];
		const float percentile = bsd->decimation_mode_percentile[dm];
		const int permit_encode = bsd->permit_encode[dm];

		// Only decimation modes that are permitted, have samples, have a
		// valid precision and are within the percentile cutoff are processed.
		if (permit_encode != 0 && samplecount >= 1 && quant_mode >= 0 && !(percentile > mode_cutoff))
		{
			const float *mode_quantized_weights = decimated_quantized_weights + dm * MAX_WEIGHTS_PER_BLOCK;
			const float *mode_weights = decimated_weights + dm * MAX_WEIGHTS_PER_BLOCK;

			compute_angular_endpoints_for_quantization_levels(samplecount, mode_quantized_weights, mode_weights,
					quant_mode, low_values[dm], high_values[dm]);
		}
	}

	for (int mode = 0; mode < MAX_WEIGHT_MODES; mode++)
	{
		if (bsd->block_modes[mode].is_dual_plane == 0 && !(bsd->block_modes[mode].percentile > mode_cutoff))
		{
			const int quant_mode = bsd->block_modes[mode].quantization_mode;
			const int decim_mode = bsd->block_modes[mode].decimation_mode;

			low_value[mode] = low_values[decim_mode][quant_mode];
			high_value[mode] = high_values[decim_mode][quant_mode];
		}
	}
}
// Dual-plane counterpart of compute_angular_endpoints_1plane().
//
// The weight arrays hold two consecutive MAX_WEIGHTS_PER_BLOCK-sized slices
// per decimation mode, one for each plane. Endpoints are computed separately
// for the two slices of each usable decimation mode; then every dual-plane
// block mode whose percentile does not exceed mode_cutoff picks its entries
// and stores them in low_value1/high_value1 (first plane) and
// low_value2/high_value2 (second plane). Entries for other block modes are
// not written.
void compute_angular_endpoints_2planes(float mode_cutoff,
		const block_size_descriptor * bsd,
		const float *decimated_quantized_weights,
		const float *decimated_weights,
		float low_value1[MAX_WEIGHT_MODES], float high_value1[MAX_WEIGHT_MODES], float low_value2[MAX_WEIGHT_MODES], float high_value2[MAX_WEIGHT_MODES])
{
	float low_values1[MAX_DECIMATION_MODES][12];
	float high_values1[MAX_DECIMATION_MODES][12];
	float low_values2[MAX_DECIMATION_MODES][12];
	float high_values2[MAX_DECIMATION_MODES][12];

	for (int dm = 0; dm < MAX_DECIMATION_MODES; dm++)
	{
		const int samplecount = bsd->decimation_mode_samples[dm];
		const int quant_mode = bsd->decimation_mode_maxprec_2planes[dm];
		const float percentile = bsd->decimation_mode_percentile[dm];
		const int permit_encode = bsd->permit_encode[dm];

		// Only decimation modes that are permitted, have samples, have a
		// valid precision and are within the percentile cutoff are processed.
		if (permit_encode != 0 && samplecount >= 1 && quant_mode >= 0 && !(percentile > mode_cutoff))
		{
			// first plane: slice 2 * dm
			compute_angular_endpoints_for_quantization_levels(samplecount,
					decimated_quantized_weights + 2 * dm * MAX_WEIGHTS_PER_BLOCK,
					decimated_weights + 2 * dm * MAX_WEIGHTS_PER_BLOCK,
					quant_mode, low_values1[dm], high_values1[dm]);

			// second plane: slice 2 * dm + 1
			compute_angular_endpoints_for_quantization_levels(samplecount,
					decimated_quantized_weights + (2 * dm + 1) * MAX_WEIGHTS_PER_BLOCK,
					decimated_weights + (2 * dm + 1) * MAX_WEIGHTS_PER_BLOCK,
					quant_mode, low_values2[dm], high_values2[dm]);
		}
	}

	for (int mode = 0; mode < MAX_WEIGHT_MODES; mode++)
	{
		if (bsd->block_modes[mode].is_dual_plane == 1 && !(bsd->block_modes[mode].percentile > mode_cutoff))
		{
			const int quant_mode = bsd->block_modes[mode].quantization_mode;
			const int decim_mode = bsd->block_modes[mode].decimation_mode;

			low_value1[mode] = low_values1[decim_mode][quant_mode];
			high_value1[mode] = high_values1[decim_mode][quant_mode];
			low_value2[mode] = low_values2[decim_mode][quant_mode];
			high_value2[mode] = high_values2[decim_mode][quant_mode];
		}
	}
}