axmol/external/astc/astc_weight_align.cpp

// ----------------------------------------------------------------------------
//  This confidential and proprietary software may be used only as authorised
//  by a licensing agreement from Arm Limited.
//      (C) COPYRIGHT 2011-2020 Arm Limited, ALL RIGHTS RESERVED
//  The entire notice above must be reproduced on all authorised copies and
//  copies may only be made to the extent permitted by a licensing agreement
//  from Arm Limited.
// ----------------------------------------------------------------------------

/**
 * @brief Functions for angular-sum algorithm for weight alignment.
 *
 * This algorithm works as follows:
 * - we compute a complex number P as (cos s*i, sin s*i) for each weight,
 *   where i is the input value and s is a scaling factor based on the spacing
 *   between the weights.
 * - we then add together complex numbers for all the weights.
 * - we then compute the length and angle of the resulting sum.
 *
 * This should produce the following results:
 * - perfect alignment results in a vector whose length is equal to the sum of
 *   lengths of all inputs
 * - even distribution results in a vector of length 0.
 * - all samples identical results in perfect alignment for every scaling.
 *
 * For each scaling factor within a given set, we compute an alignment factor
 * from 0 to 1. This should then result in some scalings standing out as having
 * particularly good alignment factors; we can use this to produce a set of
 * candidate scale/shift values for various quantization levels; we should then
 * actually try them and see what happens.
 *
 * Assuming N quantization steps, the scaling factor becomes s=2*PI*(N-1); we
 * should probably have about 1 scaling factor for every 1/4 quantization step
 * (perhaps 1/8 for low levels of quantization).
 */

#include "astc_codec_internals.h"

#include <stdio.h>

static const float angular_steppings[] = {
	 1.0,  1.125, 1.25, 1.375, 1.5, 1.625, 1.75, 1.875,

	 2.0,  2.25,  2.5,  2.75,
	 3.0,  3.25,  3.5,  3.75,
	 4.0,  4.25,  4.5,  4.75,
	 5.0,  5.25,  5.5,  5.75,
	 6.0,  6.25,  6.5,  6.75,
	 7.0,  7.25,  7.5,  7.75,

	 8.0,  8.5,
	 9.0,  9.5,
	10.0, 10.5,
	11.0, 11.5,
	12.0, 12.5,
	13.0, 13.5,
	14.0, 14.5,
	15.0, 15.5,
	16.0, 16.5,
	17.0, 17.5,
	18.0, 18.5,
	19.0, 19.5,
	20.0, 20.5,
	21.0, 21.5,
	22.0, 22.5,
	23.0, 23.5,
	24.0, 24.5,
	25.0, 25.5,
	26.0, 26.5,
	27.0, 27.5,
	28.0, 28.5,
	29.0, 29.5,
	30.0, 30.5,
	31.0, 31.5,
	32.0, 32.5,
	33.0, 33.5,
	34.0, 34.5,
	35.0, 35.5
};

#define ANGULAR_STEPS ((int)(sizeof(angular_steppings)/sizeof(angular_steppings[0])))

static float stepsizes[ANGULAR_STEPS];
static float stepsizes_sqr[ANGULAR_STEPS];

static int max_angular_steps_needed_for_quant_level[13];

// we store sine/cosine values for 64 possible weight values; this causes
// slight quality loss compared to using sin() and cos() directly.

#define SINCOS_STEPS 64

static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];

void prepare_angular_tables(void)
{
	int i, j;
	int max_angular_steps_needed_for_quant_steps[40];
	for (i = 0; i < ANGULAR_STEPS; i++)
	{
		stepsizes[i] = 1.0f / angular_steppings[i];
		stepsizes_sqr[i] = stepsizes[i] * stepsizes[i];

		for (j = 0; j < SINCOS_STEPS; j++)
		{
			sin_table[j][i] = static_cast < float >(sin((2.0f * M_PI / (SINCOS_STEPS - 1.0f)) * angular_steppings[i] * j));
			cos_table[j][i] = static_cast < float >(cos((2.0f * M_PI / (SINCOS_STEPS - 1.0f)) * angular_steppings[i] * j));
		}

		int p = static_cast < int >(floor(angular_steppings[i])) + 1;
		max_angular_steps_needed_for_quant_steps[p] = MIN(i + 1, ANGULAR_STEPS - 1);
	}

	// yes, the next-to-last entry is supposed to have the value 33. This because under
	// ASTC, the 32-weight mode leaves a double-sized hole in the middle of the
	// weight space, so we are better off matching 33 weights than 32.
	static const int steps_of_level[] = { 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 33, 36 };

	for (i = 0; i < 13; i++)
		max_angular_steps_needed_for_quant_level[i] = max_angular_steps_needed_for_quant_steps[steps_of_level[i]];
}

union if32
{
	float f;
	int32_t s;
	uint32_t u;
};

// function to compute angular sums; then, from the
// angular sums, compute alignment factor and offset.

/* static inline */
void compute_angular_offsets(int samplecount, const float *samples, const float *sample_weights, int max_angular_steps, float *offsets)
{
	int i, j;

	float anglesum_x[ANGULAR_STEPS];
	float anglesum_y[ANGULAR_STEPS];

	for (i = 0; i < max_angular_steps; i++)
	{
		anglesum_x[i] = 0;
		anglesum_y[i] = 0;
	}

	// compute the angle-sums.
	for (i = 0; i < samplecount; i++)
	{
		float sample = samples[i];
		float sample_weight = sample_weights[i];
		if32 p;
		p.f = (sample * (SINCOS_STEPS - 1.0f)) + 12582912.0f;
		unsigned int isample = p.u & 0x3F;

		const float *sinptr = sin_table[isample];
		const float *cosptr = cos_table[isample];

		for (j = 0; j < max_angular_steps; j++)
		{
			float cp = cosptr[j];
			float sp = sinptr[j];

			anglesum_x[j] += cp * sample_weight;
			anglesum_y[j] += sp * sample_weight;
		}
	}

	// post-process the angle-sums
	for (i = 0; i < max_angular_steps; i++)
	{
		float angle = atan2(anglesum_y[i], anglesum_x[i]);	// positive angle -> positive offset
		offsets[i] = angle * (stepsizes[i] * (1.0f / (2.0f * (float)M_PI)));
	}
}

// for a given step-size and a given offset, compute the
// lowest and highest weight that results from quantizing using the stepsize & offset.
// also, compute the resulting error.

void compute_lowest_and_highest_weight(int samplecount, const float *samples, const float *sample_weights,
									  int max_angular_steps, const float *offsets,
									  int8_t * lowest_weight, int8_t * highest_weight,
									  float *error, float *cut_low_weight_error, float *cut_high_weight_error)
{
	float error_from_forcing_weight_down[60];
	float error_from_forcing_weight_either_way[60];
	for (int i = 0; i < 60; i++)
	{
		error_from_forcing_weight_down[i] = 0;
		error_from_forcing_weight_either_way[i] = 0;
	}

	// weight + 12
	static const unsigned int idxtab[128] = {
		12, 13, 14, 15, 16, 17, 18, 19,
		20, 21, 22, 23, 24, 25, 26, 27,
		28, 29, 30, 31, 32, 33, 34, 35,
		36, 37, 38, 39, 40, 41, 42, 43,
		44, 45, 46, 47, 48, 49, 50, 51,
		52, 53, 54, 55, 55, 55, 55, 55,
		55, 55, 55, 55, 55, 55, 55, 55,
		55, 55, 55, 55, 55, 55, 55, 55,
		 0,  0,  0,  0,  0,  0,  0,  0,
		 0,  0,  0,  0,  0,  0,  0,  0,
		 0,  0,  0,  0,  0,  0,  0,  0,
		 0,  0,  0,  0,  0,  0,  0,  0,
		 0,  0,  0,  0,  0,  0,  0,  0,
		 0,  0,  0,  0,  0,  0,  0,  0,
		 0,  0,  0,  0,  0,  1,  2,  3,
		 4,  5,  6,  7,  8,  9, 10, 11
	};

	for (int sp = 0; sp < max_angular_steps; sp++)
	{
		unsigned int minidx_bias12 = 55;
		unsigned int maxidx_bias12 = 0;

		float errval = 0.0f;

		float rcp_stepsize = angular_steppings[sp];
		float offset = offsets[sp];

		float scaled_offset = rcp_stepsize * offset;

		for (int i = 0; i < samplecount - 1; i += 2)
		{
			float wt1 = sample_weights[i];
			float wt2 = sample_weights[i + 1];
			if32 p1, p2;
			float sval1 = (samples[i] * rcp_stepsize) - scaled_offset;
			float sval2 = (samples[i + 1] * rcp_stepsize) - scaled_offset;
			p1.f = sval1 + 12582912.0f;	// FP representation abuse to avoid floor() and float->int conversion
			p2.f = sval2 + 12582912.0f;	// FP representation abuse to avoid floor() and float->int conversion
			float isval1 = p1.f - 12582912.0f;
			float isval2 = p2.f - 12582912.0f;
			float dif1 = sval1 - isval1;
			float dif2 = sval2 - isval2;

			errval += (dif1 * wt1) * dif1;
			errval += (dif2 * wt2) * dif2;

			// table lookups that really perform a minmax function.
			unsigned int idx1_bias12 = idxtab[p1.u & 0x7F];
			unsigned int idx2_bias12 = idxtab[p2.u & 0x7F];

			if (idx1_bias12 < minidx_bias12)
				minidx_bias12 = idx1_bias12;
			if (idx1_bias12 > maxidx_bias12)
				maxidx_bias12 = idx1_bias12;
			if (idx2_bias12 < minidx_bias12)
				minidx_bias12 = idx2_bias12;
			if (idx2_bias12 > maxidx_bias12)
				maxidx_bias12 = idx2_bias12;

			error_from_forcing_weight_either_way[idx1_bias12] += wt1;
			error_from_forcing_weight_down[idx1_bias12] += (dif1 * wt1);

			error_from_forcing_weight_either_way[idx2_bias12] += wt2;
			error_from_forcing_weight_down[idx2_bias12] += (dif2 * wt2);
		}

		if (samplecount & 1)
		{
			int i = samplecount - 1;
			float wt = sample_weights[i];
			if32 p;
			float sval = (samples[i] * rcp_stepsize) - scaled_offset;
			p.f = sval + 12582912.0f;	// FP representation abuse to avoid floor() and float->int conversion
			float isval = p.f - 12582912.0f;
			float dif = sval - isval;

			errval += (dif * wt) * dif;

			unsigned int idx_bias12 = idxtab[p.u & 0x7F];

			if (idx_bias12 < minidx_bias12)
				minidx_bias12 = idx_bias12;
			if (idx_bias12 > maxidx_bias12)
				maxidx_bias12 = idx_bias12;

			error_from_forcing_weight_either_way[idx_bias12] += wt;
			error_from_forcing_weight_down[idx_bias12] += dif * wt;
		}

		lowest_weight[sp] = (int)minidx_bias12 - 12;
		highest_weight[sp] = (int)maxidx_bias12 - 12;
		error[sp] = errval;

		// the cut_(lowest/highest)_weight_error indicate the error that results from
		// forcing samples that should have had the (lowest/highest) weight value
		// one step (up/down).
		cut_low_weight_error[sp] = error_from_forcing_weight_either_way[minidx_bias12] - 2.0f * error_from_forcing_weight_down[minidx_bias12];
		cut_high_weight_error[sp] = error_from_forcing_weight_either_way[maxidx_bias12] + 2.0f * error_from_forcing_weight_down[maxidx_bias12];

		// clear out the error-from-forcing values we actually used in this pass
		// so that these are clean for the next pass.
		for (unsigned int ui = minidx_bias12 & ~0x3; ui <= maxidx_bias12; ui += 4)
		{
			error_from_forcing_weight_either_way[ui] = 0;
			error_from_forcing_weight_down[ui] = 0;
			error_from_forcing_weight_either_way[ui + 1] = 0;
			error_from_forcing_weight_down[ui + 1] = 0;
			error_from_forcing_weight_either_way[ui + 2] = 0;
			error_from_forcing_weight_down[ui + 2] = 0;
			error_from_forcing_weight_either_way[ui + 3] = 0;
			error_from_forcing_weight_down[ui + 3] = 0;
		}
	}

	for (int sp = 0; sp < max_angular_steps; sp++)
	{
		float errscale = stepsizes_sqr[sp];
		error[sp] *= errscale;
		cut_low_weight_error[sp] *= errscale;
		cut_high_weight_error[sp] *= errscale;
	}
}

// main function for running the angular algorithm.
void compute_angular_endpoints_for_quantization_levels(int samplecount, const float *samples, const float *sample_weights, int max_quantization_level, float low_value[12], float high_value[12])
{
	int i;

	max_quantization_level++;	// Temporarily increase level - needs refinement

	static const int quantization_steps_for_level[13] = { 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 33, 36 };
	int max_quantization_steps = quantization_steps_for_level[max_quantization_level];

	float offsets[ANGULAR_STEPS];

	int max_angular_steps = max_angular_steps_needed_for_quant_level[max_quantization_level];

	compute_angular_offsets(samplecount, samples, sample_weights, max_angular_steps, offsets);

	// the +4 offsets are to allow for vectorization within compute_lowest_and_highest_weight().
	int8_t lowest_weight[ANGULAR_STEPS + 4];
	int8_t highest_weight[ANGULAR_STEPS + 4];
	float error[ANGULAR_STEPS + 4];

	float cut_low_weight_error[ANGULAR_STEPS + 4];
	float cut_high_weight_error[ANGULAR_STEPS + 4];

	compute_lowest_and_highest_weight(samplecount, samples, sample_weights, max_angular_steps, offsets, lowest_weight, highest_weight, error, cut_low_weight_error, cut_high_weight_error);

	#ifdef DEBUG_PRINT_DIAGNOSTICS
		if (print_diagnostics)
		{
			printf("%s : max-angular-steps=%d \n", __func__, max_angular_steps);
			printf("Samplecount=%d, max_quantization_level=%d\n", samplecount, max_quantization_level);
			for (i = 0; i < samplecount; i++)
				printf("Sample %d : %f (weight %f)\n", i, samples[i], sample_weights[i]);

			for (i = 0; i < max_angular_steps; i++)
			{
				printf("%d: offset=%f error=%f lowest=%d highest=%d cl=%f ch=%f\n", i, offsets[i], error[i], lowest_weight[i], highest_weight[i], cut_low_weight_error[i], cut_high_weight_error[i]);
			}
			printf("\n");
		}
	#endif

	// for each quantization level, find the best error terms.
	float best_errors[40];
	int best_scale[40];
	uint8_t cut_low_weight[40];

	for (i = 0; i < (max_quantization_steps + 4); i++)
	{
		best_errors[i] = 1e30f;
		best_scale[i] = -1;	// Indicates no solution found
		cut_low_weight[i] = 0;
	}

	for (i = 0; i < max_angular_steps; i++)
	{
		int samplecount_weight = highest_weight[i] - lowest_weight[i] + 1;
		if (samplecount_weight >= (max_quantization_steps + 4))
		{
			continue;
		}

		if (samplecount_weight < 2)
		{
			samplecount_weight = 2;
		}

		if (best_errors[samplecount_weight] > error[i])
		{
			best_errors[samplecount_weight] = error[i];
			best_scale[samplecount_weight] = i;
			cut_low_weight[samplecount_weight] = 0;
		}

		float error_cut_low = error[i] + cut_low_weight_error[i];
		float error_cut_high = error[i] + cut_high_weight_error[i];
		float error_cut_low_high = error[i] + cut_low_weight_error[i] + cut_high_weight_error[i];

		if (best_errors[samplecount_weight - 1] > error_cut_low)
		{
			best_errors[samplecount_weight - 1] = error_cut_low;
			best_scale[samplecount_weight - 1] = i;
			cut_low_weight[samplecount_weight - 1] = 1;
		}

		if (best_errors[samplecount_weight - 1] > error_cut_high)
		{
			best_errors[samplecount_weight - 1] = error_cut_high;
			best_scale[samplecount_weight - 1] = i;
			cut_low_weight[samplecount_weight - 1] = 0;
		}

		if (best_errors[samplecount_weight - 2] > error_cut_low_high)
		{
			best_errors[samplecount_weight - 2] = error_cut_low_high;
			best_scale[samplecount_weight - 2] = i;
			cut_low_weight[samplecount_weight - 2] = 1;
		}

	}

	// if we got a better error-value for a low sample count than for a high one,
	// use the low sample count error value for the higher sample count as well.
	for (i = 3; i <= max_quantization_steps; i++)
	{
		if (best_errors[i] > best_errors[i - 1])
		{
			best_errors[i] = best_errors[i - 1];
			best_scale[i] = best_scale[i - 1];
			cut_low_weight[i] = cut_low_weight[i - 1];
		}
	}

	max_quantization_level--;	// Decrease level again (see corresponding ++, above)

	static const int ql_weights[12] = { 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 33 };
	for (i = 0; i <= max_quantization_level; i++)
	{
		int q = ql_weights[i];
		int bsi = best_scale[q];

		// Did we find anything?
		if(bsi < 0)
		{
			printf("ERROR: Unable to find an encoding within the specified error limits. Please revise the error limit values and try again.\n");
			exit(1);
		}

		float stepsize = stepsizes[bsi];
		int lwi = lowest_weight[bsi] + cut_low_weight[q];
		int hwi = lwi + q - 1;
		float offset = offsets[bsi];

		low_value[i] = offset + lwi * stepsize;
		high_value[i] = offset + hwi * stepsize;
	}

}

// helper functions that will compute ideal angular-endpoints
// for a given set of weights and a given block size descriptors
void compute_angular_endpoints_1plane(float mode_cutoff, const block_size_descriptor * bsd,
									  const float *decimated_quantized_weights, const float *decimated_weights,
									  float low_value[MAX_WEIGHT_MODES], float high_value[MAX_WEIGHT_MODES])
{
	int i;
	float low_values[MAX_DECIMATION_MODES][12];
	float high_values[MAX_DECIMATION_MODES][12];

	for (i = 0; i < MAX_DECIMATION_MODES; i++)
	{
		int samplecount = bsd->decimation_mode_samples[i];
		int quant_mode = bsd->decimation_mode_maxprec_1plane[i];
		float percentile = bsd->decimation_mode_percentile[i];
		int permit_encode = bsd->permit_encode[i];
		if (permit_encode == 0 || samplecount < 1 || quant_mode < 0 || percentile > mode_cutoff)
			continue;


		compute_angular_endpoints_for_quantization_levels(samplecount,
														  decimated_quantized_weights + i * MAX_WEIGHTS_PER_BLOCK,
														  decimated_weights + i * MAX_WEIGHTS_PER_BLOCK, quant_mode, low_values[i], high_values[i]);
	}

	for (i = 0; i < MAX_WEIGHT_MODES; i++)
	{
		if (bsd->block_modes[i].is_dual_plane != 0 || bsd->block_modes[i].percentile > mode_cutoff)
			continue;
		int quant_mode = bsd->block_modes[i].quantization_mode;
		int decim_mode = bsd->block_modes[i].decimation_mode;

		low_value[i] = low_values[decim_mode][quant_mode];
		high_value[i] = high_values[decim_mode][quant_mode];
	}
}

void compute_angular_endpoints_2planes(float mode_cutoff,
									   const block_size_descriptor * bsd,
									   const float *decimated_quantized_weights,
									   const float *decimated_weights,
									   float low_value1[MAX_WEIGHT_MODES], float high_value1[MAX_WEIGHT_MODES], float low_value2[MAX_WEIGHT_MODES], float high_value2[MAX_WEIGHT_MODES])
{
	int i;
	float low_values1[MAX_DECIMATION_MODES][12];
	float high_values1[MAX_DECIMATION_MODES][12];
	float low_values2[MAX_DECIMATION_MODES][12];
	float high_values2[MAX_DECIMATION_MODES][12];

	for (i = 0; i < MAX_DECIMATION_MODES; i++)
	{
		int samplecount = bsd->decimation_mode_samples[i];
		int quant_mode = bsd->decimation_mode_maxprec_2planes[i];
		float percentile = bsd->decimation_mode_percentile[i];
		int permit_encode = bsd->permit_encode[i];
		if (permit_encode == 0 || samplecount < 1 || quant_mode < 0 || percentile > mode_cutoff)
			continue;

		compute_angular_endpoints_for_quantization_levels(samplecount,
														  decimated_quantized_weights + 2 * i * MAX_WEIGHTS_PER_BLOCK,
														  decimated_weights + 2 * i * MAX_WEIGHTS_PER_BLOCK, quant_mode, low_values1[i], high_values1[i]);

		compute_angular_endpoints_for_quantization_levels(samplecount,
														  decimated_quantized_weights + (2 * i + 1) * MAX_WEIGHTS_PER_BLOCK,
														  decimated_weights + (2 * i + 1) * MAX_WEIGHTS_PER_BLOCK, quant_mode, low_values2[i], high_values2[i]);

	}

	for (i = 0; i < MAX_WEIGHT_MODES; i++)
	{
		if (bsd->block_modes[i].is_dual_plane != 1 || bsd->block_modes[i].percentile > mode_cutoff)
			continue;
		int quant_mode = bsd->block_modes[i].quantization_mode;
		int decim_mode = bsd->block_modes[i].decimation_mode;

		low_value1[i] = low_values1[decim_mode][quant_mode];
		high_value1[i] = high_values1[decim_mode][quant_mode];
		low_value2[i] = low_values2[decim_mode][quant_mode];
		high_value2[i] = high_values2[decim_mode][quant_mode];
	}
}