axmol/external/astc/astc_compress_symbolic.cpp

// ----------------------------------------------------------------------------
//  This confidential and proprietary software may be used only as authorised
//  by a licensing agreement from Arm Limited.
//      (C) COPYRIGHT 2011-2020 Arm Limited, ALL RIGHTS RESERVED
//  The entire notice above must be reproduced on all authorised copies and
//  copies may only be made to the extent permitted by a licensing agreement
//  from Arm Limited.
// ----------------------------------------------------------------------------

/**
 * @brief Functions to compress a symbolic block.
 */

#include "astc_codec_internals.h"

#include "softfloat.h"
#include <string.h>
#include <stdio.h>

#ifdef DEBUG_CAPTURE_NAN
	#ifndef _GNU_SOURCE
		#define _GNU_SOURCE
	#endif

	#include <fenv.h>
#endif

/*
	Greedy post-pass over an already quantized weight grid.

	For every stored weight (first all plane-1 weights, then - for dual-plane
	block modes - all plane-2 weights) the weight is stepped to the next higher
	quantization level for as long as the weighted color error over the texels
	it influences strictly decreases, and afterwards to the next lower level
	under the same rule.

	The block mode, partitioning and color endpoints are read from scb;
	weight_set8 and plane2_weight_set8 are updated in place
	(plane2_weight_set8 is only touched for dual-plane block modes).

	Returns the number of single-step adjustments that were accepted.
*/
int realign_weights(astc_decode_mode decode_mode,
					int xdim, int ydim, int zdim, const imageblock * blk, const error_weight_block * ewb, symbolic_compressed_block * scb, uint8_t * weight_set8, uint8_t * plane2_weight_set8)
{
	// NOTE: COMPUTE_ERROR below picks up i, j, it, pt, texels_to_evaluate,
	// is_dual_plane, the uq_plane*_weights arrays, the color_endpoint* arrays
	// and plane2_color_component by name, so those identifiers must exist here.
	int i, j;

	// partition descriptor selected by the block.
	const int num_partitions = scb->partition_count;
	const partition_info *pt = get_partition_table(xdim, ydim, zdim, num_partitions) + scb->partition_index;

	// block-size descriptor, and from it the decimation table, plane count and
	// weight quantization tables belonging to the block mode.
	const block_size_descriptor *bsd = get_block_size_descriptor(xdim, ydim, zdim);
	const decimation_table *it = bsd->decimation_tables[bsd->block_modes[scb->block_mode].decimation_mode];
	const int is_dual_plane = bsd->block_modes[scb->block_mode].is_dual_plane;
	const quantization_and_transfer_table *qat = &(quant_and_xfer_tables[bsd->block_modes[scb->block_mode].quantization_mode]);

	// decode the color endpoints of every partition.
	ushort4 color_endpoint0[4];
	ushort4 color_endpoint1[4];
	int rgb_hdr[4];
	int alpha_hdr[4];
	int nan_endpoint[4];

	for (int part = 0; part < num_partitions; part++)
	{
		unpack_color_endpoints(decode_mode,
							   scb->color_formats[part], scb->color_quantization_level, scb->color_values[part],
							   &rgb_hdr[part], &alpha_hdr[part], &nan_endpoint[part],
							   &(color_endpoint0[part]), &(color_endpoint1[part]));
	}

	// unquantize the stored weights; the second plane is only populated
	// (and only ever read) for dual-plane block modes.
	float uq_plane1_weights[MAX_WEIGHTS_PER_BLOCK];
	float uq_plane2_weights[MAX_WEIGHTS_PER_BLOCK];
	const int num_weights = it->num_weights;

	for (i = 0; i < num_weights; i++)
	{
		uq_plane1_weights[i] = qat->unquantized_value_flt[weight_set8[i]];
		if (is_dual_plane)
			uq_plane2_weights[i] = qat->unquantized_value_flt[plane2_weight_set8[i]];
	}

	int plane2_color_component = -1;
	if (is_dual_plane)
		plane2_color_component = scb->plane2_color_component;

	// COMPUTE_ERROR(errorvar): sum, over the texels_to_evaluate texels listed
	// for weight i, of the error-weighted squared difference between the
	// color interpolated from the current unquantized weights and the
	// original texel color in blk->work_data.
	#define COMPUTE_ERROR( errorvar ) \
		errorvar = 0.0f; \
		for(j=0;j<texels_to_evaluate;j++) \
		{ \
			int texel = it->weight_texel[i][j]; \
			int partition = pt->partition_of_texel[texel]; \
			float plane1_weight = compute_value_of_texel_flt( texel, it, uq_plane1_weights ); \
			float plane2_weight = 0.0f; \
			if( is_dual_plane ) \
				plane2_weight = compute_value_of_texel_flt( texel, it, uq_plane2_weights ); \
			int int_plane1_weight = static_cast<int>(floor( plane1_weight*64.0f + 0.5f ) ); \
			int int_plane2_weight = static_cast<int>(floor( plane2_weight*64.0f + 0.5f ) ); \
			ushort4 lrp_color = lerp_color_int( \
				decode_mode, \
				color_endpoint0[partition], \
				color_endpoint1[partition], \
				int_plane1_weight, \
				int_plane2_weight, \
				plane2_color_component ); \
			float4 color = float4( lrp_color.x, lrp_color.y, lrp_color.z, lrp_color.w ); \
			float4 origcolor = float4( \
				blk->work_data[4*texel], \
				blk->work_data[4*texel+1], \
				blk->work_data[4*texel+2], \
				blk->work_data[4*texel+3] ); \
			float4 error_weight = ewb->error_weights[texel]; \
			float4 colordiff = color - origcolor; \
			errorvar += dot( colordiff*colordiff, error_weight ); \
		}

	// Both planes are refined by exactly the same procedure; only the
	// quantized / unquantized arrays being modified differ.
	uint8_t *const quantized_planes[2] = { weight_set8, plane2_weight_set8 };
	float *const unquantized_planes[2] = { uq_plane1_weights, uq_plane2_weights };
	const int num_planes = is_dual_plane ? 2 : 1;

	int accepted_steps = 0;

	for (int plane = 0; plane < num_planes; plane++)
	{
		uint8_t *quantized = quantized_planes[plane];
		float *unquantized = unquantized_planes[plane];

		for (i = 0; i < num_weights; i++)
		{
			int best_wt = quantized[i];
			int texels_to_evaluate = it->weight_num_texels[i];

			float best_error;
			COMPUTE_ERROR(best_error);

			// walk upwards while the error strictly decreases.
			for (;;)
			{
				int candidate_wt = qat->next_quantized_value[best_wt];
				if (candidate_wt == best_wt)
					break;		// already at the highest level

				unquantized[i] = qat->unquantized_value_flt[candidate_wt];
				float candidate_error;
				COMPUTE_ERROR(candidate_error);

				if (!(candidate_error < best_error))
				{
					// no improvement: restore the previous value and stop.
					unquantized[i] = qat->unquantized_value_flt[best_wt];
					break;
				}

				best_wt = candidate_wt;
				best_error = candidate_error;
				accepted_steps++;
			}

			// walk downwards while the error strictly decreases.
			for (;;)
			{
				int candidate_wt = qat->prev_quantized_value[best_wt];
				if (candidate_wt == best_wt)
					break;		// already at the lowest level

				unquantized[i] = qat->unquantized_value_flt[candidate_wt];
				float candidate_error;
				COMPUTE_ERROR(candidate_error);

				if (!(candidate_error < best_error))
				{
					// no improvement: restore the previous value and stop.
					unquantized[i] = qat->unquantized_value_flt[best_wt];
					break;
				}

				best_wt = candidate_wt;
				best_error = candidate_error;
				accepted_steps++;
			}

			quantized[i] = best_wt;
		}
	}

	return accepted_steps;
}

/*
	Compress a block symbolically with a single plane of weights, given that the
	partitioning (partition_count, partition_index) has already been decided.

	decode_mode           passed through to realign_weights().
	mode_cutoff           decimation modes / block modes whose percentile is
	                      greater than this value are not considered.
	max_refinement_iters  upper bound on the number of "recompute colors, then
	                      realign weights" rounds per candidate; a round that
	                      makes no weight adjustment ends the refinement early.
	                      If this is <= 0 the refinement loop body never runs
	                      and only plane1_weights is written for a candidate.
	xdim, ydim, zdim      block dimensions.
	blk, ewb              the texel data and per-texel error weights.
	scb                   output: an array of 4 candidate blocks, scb[0..3].
	                      A candidate slot with no usable mode only gets
	                      error_block set to 1.
	tmpbuf                scratch storage (ei1, eix1 and the weight arrays).
*/
static void compress_symbolic_block_fixed_partition_1_plane(astc_decode_mode decode_mode,
															float mode_cutoff,
															int max_refinement_iters,
															int xdim, int ydim, int zdim,
															int partition_count, int partition_index,
															const imageblock * blk, const error_weight_block * ewb, symbolic_compressed_block * scb,
															compress_fixed_partition_buffers * tmpbuf)
{
	int i, j, k;

	// per-partition-count bit budget; the bits consumed by the weights are
	// subtracted from this to obtain the value stored in qwt_bitcounts[].
	static const int free_bits_for_partition_count[5] = { 0, 115 - 4, 111 - 4 - PARTITION_BITS, 108 - 4 - PARTITION_BITS, 105 - 4 - PARTITION_BITS };

	const partition_info *pi = get_partition_table(xdim, ydim, zdim, partition_count);
	pi += partition_index;

	// first, compute ideal weights and endpoint colors, under the assumption that
	// there is no quantization or decimation going on.
	endpoints_and_weights *ei = tmpbuf->ei1;
	endpoints_and_weights *eix = tmpbuf->eix1;
	compute_endpoints_and_ideal_weights_1_plane(xdim, ydim, zdim, pi, blk, ewb, ei);

	// next, compute ideal weights and endpoint colors for every decimation.
	const block_size_descriptor *bsd = get_block_size_descriptor(xdim, ydim, zdim);
	const decimation_table *const *ixtab2 = bsd->decimation_tables;

	// scratch arrays; the decimated_* arrays hold MAX_WEIGHTS_PER_BLOCK entries
	// per decimation mode, the *_quantized_decimated_* arrays hold
	// MAX_WEIGHTS_PER_BLOCK entries per block mode.
	float *decimated_quantized_weights = tmpbuf->decimated_quantized_weights;
	float *decimated_weights = tmpbuf->decimated_weights;
	float *flt_quantized_decimated_quantized_weights = tmpbuf->flt_quantized_decimated_quantized_weights;
	uint8_t *u8_quantized_decimated_quantized_weights = tmpbuf->u8_quantized_decimated_quantized_weights;

	// for each decimation mode, compute an ideal set of weights
	// (that is, weights computed with the assumption that they are not quantized)
	for (i = 0; i < MAX_DECIMATION_MODES; i++)
	{
		if (bsd->permit_encode[i] == 0 || bsd->decimation_mode_maxprec_1plane[i] < 0 || bsd->decimation_mode_percentile[i] > mode_cutoff)
			continue;
		eix[i] = *ei;
		compute_ideal_weights_for_decimation_table(&(eix[i]), ixtab2[i], decimated_quantized_weights + i * MAX_WEIGHTS_PER_BLOCK, decimated_weights + i * MAX_WEIGHTS_PER_BLOCK);

	}

	// compute maximum colors for the endpoints and ideal weights.
	// for each endpoint-and-ideal-weight pair, compute the smallest weight value
	// that will result in a color value greater than 1.
	float4 min_ep = float4(10, 10, 10, 10);
	for (i = 0; i < partition_count; i++)
	{
		// the division below can produce infinities or NaNs when the two
		// endpoints coincide in a component; such values fail the range
		// comparisons that follow, so FP exception trapping is suspended here.
		#ifdef DEBUG_CAPTURE_NAN
			fedisableexcept(FE_DIVBYZERO | FE_INVALID);
		#endif

		float4 ep = (float4(1, 1, 1, 1) - ei->ep.endpt0[i]) / (ei->ep.endpt1[i] - ei->ep.endpt0[i]);
		if (ep.x > 0.5f && ep.x < min_ep.x)
			min_ep.x = ep.x;
		if (ep.y > 0.5f && ep.y < min_ep.y)
			min_ep.y = ep.y;
		if (ep.z > 0.5f && ep.z < min_ep.z)
			min_ep.z = ep.z;
		if (ep.w > 0.5f && ep.w < min_ep.w)
			min_ep.w = ep.w;

		#ifdef DEBUG_CAPTURE_NAN
			feenableexcept(FE_DIVBYZERO | FE_INVALID);
		#endif
	}

	float min_wt_cutoff = MIN(MIN(min_ep.x, min_ep.y), MIN(min_ep.z, min_ep.w));

	// for each mode, use the angular method to compute a shift.
	float weight_low_value[MAX_WEIGHT_MODES];
	float weight_high_value[MAX_WEIGHT_MODES];

	compute_angular_endpoints_1plane(mode_cutoff, bsd, decimated_quantized_weights, decimated_weights, weight_low_value, weight_high_value);

	// for each mode (which specifies a decimation and a quantization):
	// * compute number of bits needed for the quantized weights.
	// * generate an optimized set of quantized weights.
	// * compute quantization errors for the mode.
	// modes that are skipped get an error of 1e38f; their qwt_bitcounts[]
	// entry is left unset.
	int qwt_bitcounts[MAX_WEIGHT_MODES];
	float qwt_errors[MAX_WEIGHT_MODES];

	for (i = 0; i < MAX_WEIGHT_MODES; i++)
	{
		if (bsd->block_modes[i].permit_encode == 0 || bsd->block_modes[i].is_dual_plane != 0 || bsd->block_modes[i].percentile > mode_cutoff)
		{
			qwt_errors[i] = 1e38f;
			continue;
		}
		// a high value more than 2% above the cutoff computed above is
		// replaced by 1.0.
		if (weight_high_value[i] > 1.02f * min_wt_cutoff)
			weight_high_value[i] = 1.0f;

		int decimation_mode = bsd->block_modes[i].decimation_mode;
		if (bsd->decimation_mode_percentile[decimation_mode] > mode_cutoff)
			ASTC_CODEC_INTERNAL_ERROR();

		// compute weight bitcount for the mode
		int bits_used_by_weights = compute_ise_bitcount(ixtab2[decimation_mode]->num_weights,
														(quantization_method) bsd->block_modes[i].quantization_mode);
		int bitcount = free_bits_for_partition_count[partition_count] - bits_used_by_weights;
		// reject modes that leave no bits over, or whose weight bit count is
		// outside [24, 96] (presumably the format's limits - TODO confirm).
		if (bitcount <= 0 || bits_used_by_weights < 24 || bits_used_by_weights > 96)
		{
			qwt_errors[i] = 1e38f;
			continue;
		}
		qwt_bitcounts[i] = bitcount;

		// then, generate the optimized set of weights for the weight mode.
		compute_ideal_quantized_weights_for_decimation_table(&(eix[decimation_mode]),
															 ixtab2[decimation_mode],
															 weight_low_value[i], weight_high_value[i],
															 decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * decimation_mode,
															 flt_quantized_decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * i,
															 u8_quantized_decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * i,
															 bsd->block_modes[i].quantization_mode);

		// then, compute weight-errors for the weight mode.
		qwt_errors[i] = compute_error_of_weight_set(&(eix[decimation_mode]), ixtab2[decimation_mode], flt_quantized_decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * i);

		#ifdef DEBUG_PRINT_DIAGNOSTICS
			if (print_diagnostics)
				printf("Block mode %d -> weight error = %f\n", i, qwt_errors[i]);
		#endif
	}

	// for each weighting mode, determine the optimal combination of color endpoint encodings
	// and weight encodings; return results for the 4 best-looking modes.

	int partition_format_specifiers[4][4];
	int quantized_weight[4];
	int color_quantization_level[4];
	int color_quantization_level_mod[4];
	determine_optimal_set_of_endpoint_formats_to_use(xdim, ydim, zdim, pi, blk, ewb, &(ei->ep), -1,	// used to flag that we are in single-weight mode
													 qwt_bitcounts, qwt_errors, partition_format_specifiers, quantized_weight, color_quantization_level, color_quantization_level_mod);

	// then iterate over the 4 believed-to-be-best modes to find out which one is
	// actually best. Each candidate is written to its own element of scb.
	for (i = 0; i < 4; i++)
	{
		uint8_t *u8_weight_src;
		int weights_to_copy;

		// a negative mode index marks a candidate slot without a usable mode.
		if (quantized_weight[i] < 0)
		{
			scb->error_block = 1;
			scb++;
			continue;
		}

		int decimation_mode = bsd->block_modes[quantized_weight[i]].decimation_mode;
		int weight_quantization_mode = bsd->block_modes[quantized_weight[i]].quantization_mode;
		const decimation_table *it = ixtab2[decimation_mode];

		#ifdef DEBUG_PRINT_DIAGNOSTICS
			if (print_diagnostics)
			{
				printf("Selected mode = %d\n", quantized_weight[i]);
				printf("Selected decimation mode = %d\n", decimation_mode);
				printf("Selected weight-quantization mode = %d\n", weight_quantization_mode);
			}
		#endif

		// quantized weights generated above for the selected block mode;
		// realign_weights() below modifies them in place.
		u8_weight_src = u8_quantized_decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * quantized_weight[i];

		weights_to_copy = it->num_weights;

		// recompute the ideal color endpoints before storing them.
		float4 rgbs_colors[4];
		float4 rgbo_colors[4];

		int l;
		for (l = 0; l < max_refinement_iters; l++)
		{
			recompute_ideal_colors(xdim, ydim, zdim, weight_quantization_mode, &(eix[decimation_mode].ep), rgbs_colors, rgbo_colors, u8_weight_src, NULL, -1, pi, it, blk, ewb);

			// quantize the chosen color

			// store the colors for the block
			for (j = 0; j < partition_count; j++)
			{
				scb->color_formats[j] = pack_color_endpoints(eix[decimation_mode].ep.endpt0[j],
															 eix[decimation_mode].ep.endpt1[j],
															 rgbs_colors[j], rgbo_colors[j], partition_format_specifiers[i][j], scb->color_values[j], color_quantization_level[i]);
			}

			// if all the color endpoint modes are the same, we get a few more
			// bits to store colors; let's see if we can take advantage of this:
			// requantize all the colors and see if the endpoint modes remain the same;
			// if they do, then exploit it.
			scb->color_formats_matched = 0;

			if ((partition_count >= 2 && scb->color_formats[0] == scb->color_formats[1]
				&& color_quantization_level[i] != color_quantization_level_mod[i])
				&& (partition_count == 2 || (scb->color_formats[0] == scb->color_formats[2] && (partition_count == 3 || (scb->color_formats[0] == scb->color_formats[3])))))
			{
				// Zero-initialized: only the first partition_count rows are
				// filled in below, but all 4 rows / entries are copied into
				// scb further down. Without the initialization that copy
				// would read indeterminate values.
				int colorvals[4][12] = {};
				int color_formats_mod[4] = {};
				for (j = 0; j < partition_count; j++)
				{
					color_formats_mod[j] = pack_color_endpoints(eix[decimation_mode].ep.endpt0[j],
																eix[decimation_mode].ep.endpt1[j],
																rgbs_colors[j], rgbo_colors[j], partition_format_specifiers[i][j], colorvals[j], color_quantization_level_mod[i]);
				}
				if (color_formats_mod[0] == color_formats_mod[1]
					&& (partition_count == 2 || (color_formats_mod[0] == color_formats_mod[2] && (partition_count == 3 || (color_formats_mod[0] == color_formats_mod[3])))))
				{
					scb->color_formats_matched = 1;
					for (j = 0; j < 4; j++)
						for (k = 0; k < 12; k++)
							scb->color_values[j][k] = colorvals[j][k];
					for (j = 0; j < 4; j++)
						scb->color_formats[j] = color_formats_mod[j];
				}
			}

			// store header fields
			scb->partition_count = partition_count;
			scb->partition_index = partition_index;
			scb->color_quantization_level = scb->color_formats_matched ? color_quantization_level_mod[i] : color_quantization_level[i];
			scb->block_mode = quantized_weight[i];
			scb->error_block = 0;

			if (scb->color_quantization_level < 4)
			{
				scb->error_block = 1;	// should never happen, but cannot prove it impossible.
			}

			// perform a final pass over the weights to try to improve them.
			// there is no second plane, hence the NULL plane-2 weight set.
			int adjustments = realign_weights(decode_mode,
											  xdim, ydim, zdim,
											  blk, ewb, scb,
											  u8_weight_src,
											  NULL);

			// no weight changed, so another round would yield the same result.
			if (adjustments == 0)
				break;
		}

		for (j = 0; j < weights_to_copy; j++)
			scb->plane1_weights[j] = u8_weight_src[j];

		scb++;
	}
}

/*
	Compress a block symbolically with two planes of weights, given that the
	partitioning (partition_count, partition_index) has already been decided.

	decode_mode           passed through to realign_weights().
	mode_cutoff           decimation modes / block modes whose percentile is
	                      greater than this value are not considered.
	max_refinement_iters  upper bound on the number of "recompute colors, then
	                      realign weights" rounds per candidate; a round that
	                      makes no weight adjustment ends the refinement early.
	                      If this is <= 0 the refinement loop body never runs
	                      and only the weight arrays are written for a candidate.
	xdim, ydim, zdim      block dimensions.
	separate_component    index (0..3) of the color component that gets its own
	                      weight plane; stored as scb->plane2_color_component.
	blk, ewb              the texel data and per-texel error weights.
	scb                   output: an array of 4 candidate blocks, scb[0..3].
	                      A candidate slot with no usable mode only gets
	                      error_block set to 1.
	tmpbuf                scratch storage (ei1, ei2, eix1, eix2 and the weight
	                      arrays).
*/
static void compress_symbolic_block_fixed_partition_2_planes(astc_decode_mode decode_mode,
															 float mode_cutoff,
															 int max_refinement_iters,
															 int xdim, int ydim, int zdim,
															 int partition_count, int partition_index,
															 int separate_component, const imageblock * blk, const error_weight_block * ewb,
															 symbolic_compressed_block * scb,
															 compress_fixed_partition_buffers * tmpbuf)
{
	int i, j, k;

	// per-partition-count bit budget; the bits consumed by the weights of both
	// planes are subtracted from this to obtain the value in qwt_bitcounts[].
	static const int free_bits_for_partition_count[5] =
		{ 0, 113 - 4, 109 - 4 - PARTITION_BITS, 106 - 4 - PARTITION_BITS, 103 - 4 - PARTITION_BITS };

	const partition_info *pi = get_partition_table(xdim, ydim, zdim, partition_count);
	pi += partition_index;

	// first, compute ideal weights and endpoint colors
	endpoints_and_weights *ei1 = tmpbuf->ei1;
	endpoints_and_weights *ei2 = tmpbuf->ei2;
	endpoints_and_weights *eix1 = tmpbuf->eix1;
	endpoints_and_weights *eix2 = tmpbuf->eix2;
	compute_endpoints_and_ideal_weights_2_planes(xdim, ydim, zdim, pi, blk, ewb, separate_component, ei1, ei2);

	// next, compute ideal weights and endpoint colors for every decimation.
	const block_size_descriptor *bsd = get_block_size_descriptor(xdim, ydim, zdim);
	const decimation_table *const *ixtab2 = bsd->decimation_tables;

	// scratch arrays; the two planes are interleaved: slot 2*n holds plane 1
	// and slot 2*n+1 holds plane 2, where n is a decimation mode for the
	// decimated_* arrays and a block mode for the *_quantized_decimated_* arrays.
	float *decimated_quantized_weights = tmpbuf->decimated_quantized_weights;
	float *decimated_weights = tmpbuf->decimated_weights;
	float *flt_quantized_decimated_quantized_weights = tmpbuf->flt_quantized_decimated_quantized_weights;
	uint8_t *u8_quantized_decimated_quantized_weights = tmpbuf->u8_quantized_decimated_quantized_weights;

	// for each decimation mode, compute an ideal set of weights
	for (i = 0; i < MAX_DECIMATION_MODES; i++)
	{
		if (bsd->permit_encode[i] == 0 || bsd->decimation_mode_maxprec_2planes[i] < 0 || bsd->decimation_mode_percentile[i] > mode_cutoff)
			continue;

		eix1[i] = *ei1;
		eix2[i] = *ei2;
		compute_ideal_weights_for_decimation_table(&(eix1[i]), ixtab2[i], decimated_quantized_weights + (2 * i) * MAX_WEIGHTS_PER_BLOCK, decimated_weights + (2 * i) * MAX_WEIGHTS_PER_BLOCK);
		compute_ideal_weights_for_decimation_table(&(eix2[i]), ixtab2[i], decimated_quantized_weights + (2 * i + 1) * MAX_WEIGHTS_PER_BLOCK, decimated_weights + (2 * i + 1) * MAX_WEIGHTS_PER_BLOCK);
	}

	// compute maximum colors for the endpoints and ideal weights.
	// for each endpoint-and-ideal-weight pair, compute the smallest weight value
	// that will result in a color value greater than 1.

	float4 min_ep1 = float4(10, 10, 10, 10);
	float4 min_ep2 = float4(10, 10, 10, 10);
	for (i = 0; i < partition_count; i++)
	{
		// the divisions below can produce infinities or NaNs when the two
		// endpoints coincide in a component; such values fail the range
		// comparisons that follow, so FP exception trapping is suspended here.
		#ifdef DEBUG_CAPTURE_NAN
			fedisableexcept(FE_DIVBYZERO | FE_INVALID);
		#endif

		float4 ep1 = (float4(1, 1, 1, 1) - ei1->ep.endpt0[i]) / (ei1->ep.endpt1[i] - ei1->ep.endpt0[i]);
		if (ep1.x > 0.5f && ep1.x < min_ep1.x)
			min_ep1.x = ep1.x;
		if (ep1.y > 0.5f && ep1.y < min_ep1.y)
			min_ep1.y = ep1.y;
		if (ep1.z > 0.5f && ep1.z < min_ep1.z)
			min_ep1.z = ep1.z;
		if (ep1.w > 0.5f && ep1.w < min_ep1.w)
			min_ep1.w = ep1.w;
		float4 ep2 = (float4(1, 1, 1, 1) - ei2->ep.endpt0[i]) / (ei2->ep.endpt1[i] - ei2->ep.endpt0[i]);
		if (ep2.x > 0.5f && ep2.x < min_ep2.x)
			min_ep2.x = ep2.x;
		if (ep2.y > 0.5f && ep2.y < min_ep2.y)
			min_ep2.y = ep2.y;
		if (ep2.z > 0.5f && ep2.z < min_ep2.z)
			min_ep2.z = ep2.z;
		if (ep2.w > 0.5f && ep2.w < min_ep2.w)
			min_ep2.w = ep2.w;

		#ifdef DEBUG_CAPTURE_NAN
			feenableexcept(FE_DIVBYZERO | FE_INVALID);
		#endif
	}

	// the second plane's cutoff comes from the separate component alone; that
	// component is excluded from the first plane's cutoff by overwriting it
	// with a very large value before taking the minimum.
	float min_wt_cutoff1, min_wt_cutoff2;
	switch (separate_component)
	{
	case 0:
		min_wt_cutoff2 = min_ep2.x;
		min_ep1.x = 1e30f;
		break;
	case 1:
		min_wt_cutoff2 = min_ep2.y;
		min_ep1.y = 1e30f;
		break;
	case 2:
		min_wt_cutoff2 = min_ep2.z;
		min_ep1.z = 1e30f;
		break;
	case 3:
		min_wt_cutoff2 = min_ep2.w;
		min_ep1.w = 1e30f;
		break;
	default:
		min_wt_cutoff2 = 1e30f;
	}

	min_wt_cutoff1 = MIN(MIN(min_ep1.x, min_ep1.y), MIN(min_ep1.z, min_ep1.w));

	float weight_low_value1[MAX_WEIGHT_MODES];
	float weight_high_value1[MAX_WEIGHT_MODES];
	float weight_low_value2[MAX_WEIGHT_MODES];
	float weight_high_value2[MAX_WEIGHT_MODES];

	compute_angular_endpoints_2planes(mode_cutoff, bsd, decimated_quantized_weights, decimated_weights, weight_low_value1, weight_high_value1, weight_low_value2, weight_high_value2);

	// for each mode (which specifies a decimation and a quantization):
	// * generate an optimized set of quantized weights.
	// * compute quantization errors for each mode
	// * compute number of bits needed for the quantized weights.
	// modes that are skipped get an error of 1e38f; their qwt_bitcounts[]
	// entry is left unset.

	int qwt_bitcounts[MAX_WEIGHT_MODES];
	float qwt_errors[MAX_WEIGHT_MODES];
	for (i = 0; i < MAX_WEIGHT_MODES; i++)
	{
		if (bsd->block_modes[i].permit_encode == 0 || bsd->block_modes[i].is_dual_plane != 1 || bsd->block_modes[i].percentile > mode_cutoff)
		{
			qwt_errors[i] = 1e38f;
			continue;
		}
		int decimation_mode = bsd->block_modes[i].decimation_mode;

		// a high value more than 2% above the plane's cutoff is replaced by 1.0.
		if (weight_high_value1[i] > 1.02f * min_wt_cutoff1)
			weight_high_value1[i] = 1.0f;
		if (weight_high_value2[i] > 1.02f * min_wt_cutoff2)
			weight_high_value2[i] = 1.0f;

		// compute weight bitcount for the mode (both planes together)
		int bits_used_by_weights = compute_ise_bitcount(2 * ixtab2[decimation_mode]->num_weights,
														(quantization_method) bsd->block_modes[i].quantization_mode);
		int bitcount = free_bits_for_partition_count[partition_count] - bits_used_by_weights;
		// reject modes that leave no bits over, or whose weight bit count is
		// outside [24, 96] (presumably the format's limits - TODO confirm).
		if (bitcount <= 0 || bits_used_by_weights < 24 || bits_used_by_weights > 96)
		{
			qwt_errors[i] = 1e38f;
			continue;
		}
		qwt_bitcounts[i] = bitcount;

		// then, generate the optimized set of weights for the mode.
		compute_ideal_quantized_weights_for_decimation_table(&(eix1[decimation_mode]),
															 ixtab2[decimation_mode],
															 weight_low_value1[i],
															 weight_high_value1[i],
															 decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * (2 * decimation_mode),
															 flt_quantized_decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * (2 * i),
															 u8_quantized_decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * (2 * i), bsd->block_modes[i].quantization_mode);

		compute_ideal_quantized_weights_for_decimation_table(&(eix2[decimation_mode]),
															 ixtab2[decimation_mode],
															 weight_low_value2[i],
															 weight_high_value2[i],
															 decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * (2 * decimation_mode + 1),
															 flt_quantized_decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * (2 * i + 1),
															 u8_quantized_decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * (2 * i + 1), bsd->block_modes[i].quantization_mode);


		// then, compute quantization errors for the block mode
		// (sum of the errors of the two planes).
		qwt_errors[i] =
			compute_error_of_weight_set(&(eix1[decimation_mode]),
									   ixtab2[decimation_mode],
									   flt_quantized_decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * (2 * i))
			+ compute_error_of_weight_set(&(eix2[decimation_mode]), ixtab2[decimation_mode], flt_quantized_decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * (2 * i + 1));
	}

	// decide the optimal combination of color endpoint encodings and weight encodings.
	int partition_format_specifiers[4][4];
	int quantized_weight[4];
	int color_quantization_level[4];
	int color_quantization_level_mod[4];

	// combine the endpoints of the two planes into a single endpoint set.
	endpoints epm;
	merge_endpoints(&(ei1->ep), &(ei2->ep), separate_component, &epm);

	determine_optimal_set_of_endpoint_formats_to_use(xdim, ydim, zdim,
													 pi,
													 blk,
													 ewb,
													 &epm, separate_component, qwt_bitcounts, qwt_errors, partition_format_specifiers, quantized_weight, color_quantization_level, color_quantization_level_mod);

	// iterate over the 4 selected modes; each candidate is written to its own
	// element of scb.
	for (i = 0; i < 4; i++)
	{
		// a negative mode index marks a candidate slot without a usable mode.
		if (quantized_weight[i] < 0)
		{
			scb->error_block = 1;
			scb++;
			continue;
		}

		uint8_t *u8_weight1_src;
		uint8_t *u8_weight2_src;
		int weights_to_copy;

		int decimation_mode = bsd->block_modes[quantized_weight[i]].decimation_mode;
		int weight_quantization_mode = bsd->block_modes[quantized_weight[i]].quantization_mode;
		const decimation_table *it = ixtab2[decimation_mode];

		// quantized weights generated above for the selected block mode, one
		// set per plane; realign_weights() below modifies them in place.
		u8_weight1_src = u8_quantized_decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * (2 * quantized_weight[i]);
		u8_weight2_src = u8_quantized_decimated_quantized_weights + MAX_WEIGHTS_PER_BLOCK * (2 * quantized_weight[i] + 1);

		weights_to_copy = it->num_weights;

		// recompute the ideal color endpoints before storing them.
		merge_endpoints(&(eix1[decimation_mode].ep), &(eix2[decimation_mode].ep), separate_component, &epm);

		float4 rgbs_colors[4];
		float4 rgbo_colors[4];

		int l;
		for (l = 0; l < max_refinement_iters; l++)
		{
			recompute_ideal_colors(xdim, ydim, zdim, weight_quantization_mode, &epm, rgbs_colors, rgbo_colors, u8_weight1_src, u8_weight2_src, separate_component, pi, it, blk, ewb);

			// store the colors for the block
			for (j = 0; j < partition_count; j++)
			{
				scb->color_formats[j] = pack_color_endpoints(epm.endpt0[j],
															 epm.endpt1[j],
															 rgbs_colors[j], rgbo_colors[j], partition_format_specifiers[i][j], scb->color_values[j], color_quantization_level[i]);
			}
			scb->color_formats_matched = 0;

			// if every partition ended up with the same color format and a
			// different quantization level is on offer for that case, repack
			// the colors at that level and keep the result only if the
			// formats still all match.
			if ((partition_count >= 2 && scb->color_formats[0] == scb->color_formats[1]
				&& color_quantization_level[i] != color_quantization_level_mod[i])
				&& (partition_count == 2 || (scb->color_formats[0] == scb->color_formats[2] && (partition_count == 3 || (scb->color_formats[0] == scb->color_formats[3])))))
			{
				// Zero-initialized: only the first partition_count rows are
				// filled in below, but all 4 rows / entries are copied into
				// scb further down. Without the initialization that copy
				// would read indeterminate values.
				int colorvals[4][12] = {};
				int color_formats_mod[4] = {};
				for (j = 0; j < partition_count; j++)
				{
					color_formats_mod[j] = pack_color_endpoints(epm.endpt0[j],
																epm.endpt1[j],
																rgbs_colors[j], rgbo_colors[j], partition_format_specifiers[i][j], colorvals[j], color_quantization_level_mod[i]);
				}
				if (color_formats_mod[0] == color_formats_mod[1]
					&& (partition_count == 2 || (color_formats_mod[0] == color_formats_mod[2] && (partition_count == 3 || (color_formats_mod[0] == color_formats_mod[3])))))
				{
					scb->color_formats_matched = 1;
					for (j = 0; j < 4; j++)
						for (k = 0; k < 12; k++)
							scb->color_values[j][k] = colorvals[j][k];
					for (j = 0; j < 4; j++)
						scb->color_formats[j] = color_formats_mod[j];
				}
			}

			// store header fields
			scb->partition_count = partition_count;
			scb->partition_index = partition_index;
			scb->color_quantization_level = scb->color_formats_matched ? color_quantization_level_mod[i] : color_quantization_level[i];
			scb->block_mode = quantized_weight[i];
			scb->plane2_color_component = separate_component;
			scb->error_block = 0;

			if (scb->color_quantization_level < 4)
			{
				scb->error_block = 1;	// should never happen, but cannot prove it impossible
			}

			// perform a final pass over both weight planes to try to improve them.
			int adjustments = realign_weights(decode_mode,
											  xdim, ydim, zdim,
											  blk, ewb, scb,
											  u8_weight1_src,
											  u8_weight2_src);

			// no weight changed, so another round would yield the same result.
			if (adjustments == 0)
				break;
		}

		for (j = 0; j < weights_to_copy; j++)
		{
			scb->plane1_weights[j] = u8_weight1_src[j];
			scb->plane2_weights[j] = u8_weight2_src[j];
		}

		scb++;
	}
}

// Fill ewp->block_artifact_suppression_expanded with one factor per texel
// position of an xdim * ydim * zdim block, stored with x varying fastest,
// then y, then z. Each factor is
//     pow(dist, ewp->block_artifact_suppression)
// where dist is the length of the vector formed by the texel's offset from
// the block center (each axis divided by the block dimension on that axis)
// together with a constant fourth term of 0.36.
void expand_block_artifact_suppression(int xdim, int ydim, int zdim, error_weighting_params * ewp)
{
	const float mid_x = (xdim - 1) * 0.5f;
	const float mid_y = (ydim - 1) * 0.5f;
	const float mid_z = (zdim - 1) * 0.5f;

	// constant fourth term of the distance.
	const float wdif = 0.36f;

	float *expanded = ewp->block_artifact_suppression_expanded;
	int texel = 0;

	for (int z = 0; z < zdim; z++)
	{
		const float zdif = (z - mid_z) / zdim;

		for (int y = 0; y < ydim; y++)
		{
			const float ydif = (y - mid_y) / ydim;

			for (int x = 0; x < xdim; x++)
			{
				const float xdif = (x - mid_x) / xdim;

				const float dist = sqrt(xdif * xdif + ydif * ydif + zdif * zdif + wdif * wdif);
				expanded[texel++] = pow(dist, ewp->block_artifact_suppression);
			}
		}
	}
}

// Function to set error weights for each color component for each texel in a block.
// Returns the sum of all the error values set.
float prepare_error_weight_block(const astc_codec_image * input_image,
								 int xdim, int ydim, int zdim, const error_weighting_params * ewp, const imageblock * blk, error_weight_block * ewb, error_weight_block_orig * ewbo)
{
	int idx = 0;

	int any_mean_stdev_weight =
		ewp->rgb_base_weight != 1.0 || ewp->alpha_base_weight != 1.0 || ewp->rgb_mean_weight != 0.0 || ewp->rgb_stdev_weight != 0.0 || ewp->alpha_mean_weight != 0.0 || ewp->alpha_stdev_weight != 0.0;

	float4 color_weights = float4(ewp->rgba_weights[0],
								  ewp->rgba_weights[1],
								  ewp->rgba_weights[2],
								  ewp->rgba_weights[3]);

	ewb->contains_zeroweight_texels = 0;

	for (int z = 0; z < zdim; z++)
	{
		for (int y = 0; y < ydim; y++)
		{
			for (int x = 0; x < xdim; x++)
			{
				int xpos = x + blk->xpos;
				int ypos = y + blk->ypos;
				int zpos = z + blk->zpos;

				if (xpos >= input_image->xsize || ypos >= input_image->ysize || zpos >= input_image->zsize)
				{
					float4 weights = float4(1e-11f, 1e-11f, 1e-11f, 1e-11f);
					ewb->error_weights[idx] = weights;
					ewb->contains_zeroweight_texels = 1;
				}
				else
				{
					float4 error_weight = float4(ewp->rgb_base_weight,
												 ewp->rgb_base_weight,
												 ewp->rgb_base_weight,
												 ewp->alpha_base_weight);

					if (any_mean_stdev_weight)
					{
						float4 avg = input_averages[zpos][ypos][xpos];
						if (avg.x < 6e-5f)
							avg.x = 6e-5f;
						if (avg.y < 6e-5f)
							avg.y = 6e-5f;
						if (avg.z < 6e-5f)
							avg.z = 6e-5f;
						if (avg.w < 6e-5f)
							avg.w = 6e-5f;

						avg = avg * avg;

						float4 variance = input_variances[zpos][ypos][xpos];
						variance = variance * variance;

						float favg = (avg.x + avg.y + avg.z) * (1.0f / 3.0f);
						float fvar = (variance.x + variance.y + variance.z) * (1.0f / 3.0f);

						float mixing = ewp->rgb_mean_and_stdev_mixing;
						avg.xyz = float3(favg, favg, favg) * mixing + avg.xyz * (1.0f - mixing);
						variance.xyz = float3(fvar, fvar, fvar) * mixing + variance.xyz * (1.0f - mixing);

						float4 stdev = float4(sqrt(MAX(variance.x, 0.0f)),
											  sqrt(MAX(variance.y, 0.0f)),
											  sqrt(MAX(variance.z, 0.0f)),
											  sqrt(MAX(variance.w, 0.0f)));

						avg.xyz = avg.xyz * ewp->rgb_mean_weight;
						avg.w = avg.w * ewp->alpha_mean_weight;
						stdev.xyz = stdev.xyz * ewp->rgb_stdev_weight;
						stdev.w = stdev.w * ewp->alpha_stdev_weight;
						error_weight = error_weight + avg + stdev;

						error_weight = float4(1.0f, 1.0f, 1.0f, 1.0f) / error_weight;
					}

					if (ewp->ra_normal_angular_scale)
					{
						// Convert from 0 to 1 to -1 to +1 range.
						float xN = (blk->orig_data[4 * idx] - 0.5f) * 2.0f;
						float yN = (blk->orig_data[4 * idx + 3] - 0.5f) * 2.0f;

						float denom = 1.0f - xN * xN - yN * yN;
						if (denom < 0.1f)
							denom = 0.1f;
						denom = 1.0f / denom;
						error_weight.x *= 1.0f + xN * xN * denom;
						error_weight.w *= 1.0f + yN * yN * denom;
					}

					if (ewp->enable_rgb_scale_with_alpha)
					{
						float alpha_scale;
						if (ewp->alpha_radius != 0)
							alpha_scale = input_alpha_averages[zpos][ypos][xpos];
						else
							alpha_scale = blk->orig_data[4 * idx + 3];
						if (alpha_scale < 0.0001f)
							alpha_scale = 0.0001f;
						alpha_scale *= alpha_scale;
						error_weight.xyz = error_weight.xyz * alpha_scale;
					}
					error_weight = error_weight * color_weights;
					error_weight = error_weight * ewp->block_artifact_suppression_expanded[idx];

					// if we perform a conversion from linear to sRGB, then we multiply
					// the weight with the derivative of the linear->sRGB transform function.
					if (perform_srgb_transform)
					{
						float r = blk->orig_data[4 * idx];
						float g = blk->orig_data[4 * idx + 1];
						float b = blk->orig_data[4 * idx + 2];
						if (r < 0.0031308f)
							r = 12.92f;
						else
							r = 0.4396f * pow(r, -0.58333f);
						if (g < 0.0031308f)
							g = 12.92f;
						else
							g = 0.4396f * pow(g, -0.58333f);
						if (b < 0.0031308f)
							b = 12.92f;
						else
							b = 0.4396f * pow(b, -0.58333f);
						error_weight.x *= r;
						error_weight.y *= g;
						error_weight.z *= b;
					}

					// when we loaded the block to begin with, we applied a transfer function
					// and computed the derivative of the transfer function. However, the
					// error-weight computation so far is based on the original color values,
					// not the transfer-function values. As such, we must multiply the
					// error weights by the derivative of the inverse of the transfer function,
					// which is equivalent to dividing by the derivative of the transfer
					// function.

					ewbo->error_weights[idx] = error_weight;

					error_weight.x /= (blk->deriv_data[4 * idx] * blk->deriv_data[4 * idx] * 1e-10f);
					error_weight.y /= (blk->deriv_data[4 * idx + 1] * blk->deriv_data[4 * idx + 1] * 1e-10f);
					error_weight.z /= (blk->deriv_data[4 * idx + 2] * blk->deriv_data[4 * idx + 2] * 1e-10f);
					error_weight.w /= (blk->deriv_data[4 * idx + 3] * blk->deriv_data[4 * idx + 3] * 1e-10f);

					ewb->error_weights[idx] = error_weight;
					if (dot(error_weight, float4(1, 1, 1, 1)) < 1e-10f)
						ewb->contains_zeroweight_texels = 1;
				}
				idx++;
			}
		}
	}

	float4 error_weight_sum = float4(0, 0, 0, 0);
	int texels_per_block = xdim * ydim * zdim;
	for (int i = 0; i < texels_per_block; i++)
	{
		error_weight_sum = error_weight_sum + ewb->error_weights[i];

		ewb->texel_weight_r[i] = ewb->error_weights[i].x;
		ewb->texel_weight_g[i] = ewb->error_weights[i].y;
		ewb->texel_weight_b[i] = ewb->error_weights[i].z;
		ewb->texel_weight_a[i] = ewb->error_weights[i].w;

		ewb->texel_weight_rg[i] = (ewb->error_weights[i].x + ewb->error_weights[i].y) * 0.5f;
		ewb->texel_weight_rb[i] = (ewb->error_weights[i].x + ewb->error_weights[i].z) * 0.5f;
		ewb->texel_weight_gb[i] = (ewb->error_weights[i].y + ewb->error_weights[i].z) * 0.5f;
		ewb->texel_weight_ra[i] = (ewb->error_weights[i].x + ewb->error_weights[i].w) * 0.5f;

		ewb->texel_weight_gba[i] = (ewb->error_weights[i].y + ewb->error_weights[i].z + ewb->error_weights[i].w) * 0.333333f;
		ewb->texel_weight_rba[i] = (ewb->error_weights[i].x + ewb->error_weights[i].z + ewb->error_weights[i].w) * 0.333333f;
		ewb->texel_weight_rga[i] = (ewb->error_weights[i].x + ewb->error_weights[i].y + ewb->error_weights[i].w) * 0.333333f;
		ewb->texel_weight_rgb[i] = (ewb->error_weights[i].x + ewb->error_weights[i].y + ewb->error_weights[i].z) * 0.333333f;
		ewb->texel_weight[i] = (ewb->error_weights[i].x + ewb->error_weights[i].y + ewb->error_weights[i].z + ewb->error_weights[i].w) * 0.25f;
	}

	return dot(error_weight_sum, float4(1, 1, 1, 1));
}

/*
	functions to analyze block statistical properties:
		* simple properties: * mean * variance
		* covariance-matrix correlation coefficients
 */

// Accumulate weighted first and second moments of the four color components
// of a block (values from blk->work_data, per-texel weight from
// ewb->texel_weight) and store the resulting symmetric 4x4 matrix in
// *cov_matrix. Entry (a, b) is
//     sum(w * a * b) - sum(w * a) * sum(w * b) / max(sum(w), 1e-7)
// A negative texel weight is reported through ASTC_CODEC_INTERNAL_ERROR().
static void compute_covariance_matrix(int xdim, int ydim, int zdim, const imageblock * blk, const error_weight_block * ewb, mat4 * cov_matrix)
{
	int texel_count = xdim * ydim * zdim;

	// weighted sums of each component
	float r_sum = 0.0f, g_sum = 0.0f, b_sum = 0.0f, a_sum = 0.0f;

	// weighted sums of each squared component
	float rr_sum = 0.0f, gg_sum = 0.0f, bb_sum = 0.0f, aa_sum = 0.0f;

	// weighted sums of each pairwise product
	float rg_sum = 0.0f, rb_sum = 0.0f, ra_sum = 0.0f;
	float gb_sum = 0.0f, ga_sum = 0.0f;
	float ba_sum = 0.0f;

	float weight_sum = 0.0f;

	for (int t = 0; t < texel_count; t++)
	{
		float weight = ewb->texel_weight[t];
		if (weight < 0.0f)
			ASTC_CODEC_INTERNAL_ERROR();
		weight_sum += weight;

		float r = blk->work_data[4 * t];
		float g = blk->work_data[4 * t + 1];
		float b = blk->work_data[4 * t + 2];
		float a = blk->work_data[4 * t + 3];

		// first moments
		r_sum += r * weight;
		g_sum += g * weight;
		b_sum += b * weight;
		a_sum += a * weight;

		// second moments, diagonal
		rr_sum += r * (r * weight);
		gg_sum += g * (g * weight);
		bb_sum += b * (b * weight);
		aa_sum += a * (a * weight);

		// second moments, off-diagonal
		rg_sum += g * (r * weight);
		rb_sum += b * (r * weight);
		ra_sum += a * (r * weight);
		gb_sum += b * (g * weight);
		ga_sum += a * (g * weight);
		ba_sum += a * (b * weight);
	}

	// reciprocal of the total weight, guarded against a zero total
	float rpt = 1.0f / MAX(weight_sum, 1e-7f);

	// the matrix is symmetric, so every entry is computed once.
	float cov_rr = rr_sum - r_sum * r_sum * rpt;
	float cov_rg = rg_sum - r_sum * g_sum * rpt;
	float cov_rb = rb_sum - r_sum * b_sum * rpt;
	float cov_ra = ra_sum - r_sum * a_sum * rpt;
	float cov_gg = gg_sum - g_sum * g_sum * rpt;
	float cov_gb = gb_sum - g_sum * b_sum * rpt;
	float cov_ga = ga_sum - g_sum * a_sum * rpt;
	float cov_bb = bb_sum - b_sum * b_sum * rpt;
	float cov_ba = ba_sum - b_sum * a_sum * rpt;
	float cov_aa = aa_sum - a_sum * a_sum * rpt;

	cov_matrix->v[0] = float4(cov_rr, cov_rg, cov_rb, cov_ra);
	cov_matrix->v[1] = float4(cov_rg, cov_gg, cov_gb, cov_ga);
	cov_matrix->v[2] = float4(cov_rb, cov_gb, cov_bb, cov_ba);
	cov_matrix->v[3] = float4(cov_ra, cov_ga, cov_ba, cov_aa);
}

void prepare_block_statistics(int xdim, int ydim, int zdim, const imageblock * blk, const error_weight_block * ewb, int *is_normal_map, float *lowest_correl)
{
	int i;

	mat4 cov_matrix;

	compute_covariance_matrix(xdim, ydim, zdim, blk, ewb, &cov_matrix);

	// use the covariance matrix to compute
	// correllation coefficients
	float rr_var = cov_matrix.v[0].x;
	float gg_var = cov_matrix.v[1].y;
	float bb_var = cov_matrix.v[2].z;
	float aa_var = cov_matrix.v[3].w;

	float rg_correlation = cov_matrix.v[0].y / sqrt(MAX(rr_var * gg_var, 1e-30f));
	float rb_correlation = cov_matrix.v[0].z / sqrt(MAX(rr_var * bb_var, 1e-30f));
	float ra_correlation = cov_matrix.v[0].w / sqrt(MAX(rr_var * aa_var, 1e-30f));
	float gb_correlation = cov_matrix.v[1].z / sqrt(MAX(gg_var * bb_var, 1e-30f));
	float ga_correlation = cov_matrix.v[1].w / sqrt(MAX(gg_var * aa_var, 1e-30f));
	float ba_correlation = cov_matrix.v[2].w / sqrt(MAX(bb_var * aa_var, 1e-30f));

	if (astc_isnan(rg_correlation))
		rg_correlation = 1.0f;
	if (astc_isnan(rb_correlation))
		rb_correlation = 1.0f;
	if (astc_isnan(ra_correlation))
		ra_correlation = 1.0f;
	if (astc_isnan(gb_correlation))
		gb_correlation = 1.0f;
	if (astc_isnan(ga_correlation))
		ga_correlation = 1.0f;
	if (astc_isnan(ba_correlation))
		ba_correlation = 1.0f;

	float lowest_correlation = MIN(fabs(rg_correlation), fabs(rb_correlation));
	lowest_correlation = MIN(lowest_correlation, fabs(ra_correlation));
	lowest_correlation = MIN(lowest_correlation, fabs(gb_correlation));
	lowest_correlation = MIN(lowest_correlation, fabs(ga_correlation));
	lowest_correlation = MIN(lowest_correlation, fabs(ba_correlation));
	*lowest_correl = lowest_correlation;

	// compute a "normal-map" factor
	// this factor should be exactly 0.0 for a normal map, while it may be all over the
	// place for anything that is NOT a normal map. We can probably assume that a factor
	// of less than 0.2f represents a normal map.

	float nf_sum = 0.0f;

	int texels_per_block = xdim * ydim * zdim;

	for (i = 0; i < texels_per_block; i++)
	{
		float3 val = float3(blk->orig_data[4 * i],
							blk->orig_data[4 * i + 1],
							blk->orig_data[4 * i + 2]);
		val = (val - float3(0.5f, 0.5f, 0.5f)) * 2.0f;
		float length_squared = dot(val, val);
		float nf = fabs(length_squared - 1.0f);
		nf_sum += nf;
	}
	float nf_avg = nf_sum / texels_per_block;
	*is_normal_map = nf_avg < 0.2;
}

void compress_constant_color_block(int xdim, int ydim, int zdim, const imageblock * blk, const error_weight_block * ewb, symbolic_compressed_block * scb)
{
	int texel_count = xdim * ydim * zdim;
	int i;

	float4 color_sum = float4(0, 0, 0, 0);
	float4 color_weight_sum = float4(0, 0, 0, 0);

	const float *clp = blk->work_data;
	for (i = 0; i < texel_count; i++)
	{
		float4 weights = ewb->error_weights[i];
		float4 color_data = float4(clp[4 * i], clp[4 * i + 1], clp[4 * i + 2], clp[4 * i + 3]);
		color_sum = color_sum + (color_data * weights);
		color_weight_sum = color_weight_sum + weights;
	}

	float4 avg_color = color_sum / color_weight_sum;

	int use_fp16 = blk->rgb_lns[0];

	#ifdef DEBUG_PRINT_DIAGNOSTICS
		if (print_diagnostics)
		{
			printf("Averaged color: %f %f %f %f\n", avg_color.x, avg_color.y, avg_color.z, avg_color.w);
		}
	#endif

	// convert the color
	if (blk->rgb_lns[0])
	{
		int avg_red = static_cast < int >(floor(avg_color.x + 0.5f));
		int avg_green = static_cast < int >(floor(avg_color.y + 0.5f));
		int avg_blue = static_cast < int >(floor(avg_color.z + 0.5f));

		if (avg_red < 0)
			avg_red = 0;
		else if (avg_red > 65535)
			avg_red = 65535;

		if (avg_green < 0)
			avg_green = 0;
		else if (avg_green > 65535)
			avg_green = 65535;

		if (avg_blue < 0)
			avg_blue = 0;
		else if (avg_blue > 65535)
			avg_blue = 65535;

		avg_color.x = sf16_to_float(lns_to_sf16(avg_red));
		avg_color.y = sf16_to_float(lns_to_sf16(avg_green));
		avg_color.z = sf16_to_float(lns_to_sf16(avg_blue));
	}
	else
	{
		avg_color.x *= (1.0f / 65535.0f);
		avg_color.y *= (1.0f / 65535.0f);
		avg_color.z *= (1.0f / 65535.0f);
	}

	if (blk->alpha_lns[0])
	{
		int avg_alpha = static_cast < int >(floor(avg_color.w + 0.5f));

		if (avg_alpha < 0)
			avg_alpha = 0;
		else if (avg_alpha > 65535)
			avg_alpha = 65535;

		avg_color.w = sf16_to_float(lns_to_sf16(avg_alpha));
	}
	else
	{
		avg_color.w *= (1.0f / 65535.0f);
	}

#ifdef DEBUG_PRINT_DIAGNOSTICS
	if (print_diagnostics)
	{
		printf("Averaged color: %f %f %f %f   (%d)\n", avg_color.x, avg_color.y, avg_color.z, avg_color.w, use_fp16);

	}
#endif

	if (use_fp16)
	{
		scb->error_block = 0;
		scb->block_mode = -1;
		scb->partition_count = 0;
		scb->constant_color[0] = float_to_sf16(avg_color.x, SF_NEARESTEVEN);
		scb->constant_color[1] = float_to_sf16(avg_color.y, SF_NEARESTEVEN);
		scb->constant_color[2] = float_to_sf16(avg_color.z, SF_NEARESTEVEN);
		scb->constant_color[3] = float_to_sf16(avg_color.w, SF_NEARESTEVEN);
	}
	else
	{
		scb->error_block = 0;
		scb->block_mode = -2;
		scb->partition_count = 0;
		float red = avg_color.x;
		float green = avg_color.y;
		float blue = avg_color.z;
		float alpha = avg_color.w;
		if (red < 0)
			red = 0;
		else if (red > 1)
			red = 1;
		if (green < 0)
			green = 0;
		else if (green > 1)
			green = 1;
		if (blue < 0)
			blue = 0;
		else if (blue > 1)
			blue = 1;
		if (alpha < 0)
			alpha = 0;
		else if (alpha > 1)
			alpha = 1;
		scb->constant_color[0] = static_cast < int >(floor(red * 65535.0f + 0.5f));
		scb->constant_color[1] = static_cast < int >(floor(green * 65535.0f + 0.5f));
		scb->constant_color[2] = static_cast < int >(floor(blue * 65535.0f + 0.5f));
		scb->constant_color[3] = static_cast < int >(floor(alpha * 65535.0f + 0.5f));
	}
}

// Count of how often each block mode was selected. compress_symbolic_block()
// increments entry (block_mode & 0x7ff) whenever it finishes its mode search
// with a non-negative block mode.
int block_mode_histogram[2048];

/**
 * @brief Find a symbolic encoding for one image block.
 *
 * A block whose four channels each have min == max is stored directly as a
 * constant-color block and the function returns 0.
 *
 * Otherwise candidate encodings are produced in this order:
 *   1. one partition, one weight plane (run twice, see below);
 *   2. one partition, two weight planes, once per color component;
 *   3. for 2, 3 and 4 partitions: one weight plane on two candidate
 *      partitionings, then (not for 4 partitions) two weight planes.
 * Each producer fills tempblocks; the first four entries are decompressed
 * and compared against blk, and the candidate with the lowest error seen so
 * far is copied to *scb. The search stops early as soon as
 * error_of_best_block / error_weight_sum drops below
 * ewp->texel_avg_error_limit.
 *
 * The chosen block is finally converted to a physical block and back into
 * *scb.
 *
 * @param input_image  Image the block was taken from (used for the error weights).
 * @param decode_mode  Passed through to the candidate producers and the decompressor.
 * @param xdim, ydim, zdim  Block dimensions in texels.
 * @param ewp          Error weighting and search-limit parameters.
 * @param blk          The block to encode.
 * @param scb          Out: the selected symbolic block.
 * @param tmpbuf       Scratch buffers (error weights, candidate blocks, work areas).
 * @return 0 for a constant-color block, otherwise the error of the best
 *         candidate divided by the number of texels in the block.
 *
 * NOTE(review): *scb is only assigned when a candidate beats
 * error_of_best_block (initially 1e20). If every candidate is flagged as an
 * error block, scb->block_mode is read at END_OF_TESTS without having been
 * written here - confirm that callers/producers rule this out.
 */
float compress_symbolic_block(const astc_codec_image * input_image,
							  astc_decode_mode decode_mode, int xdim, int ydim, int zdim, const error_weighting_params * ewp, const imageblock * blk, symbolic_compressed_block * scb,
							  compress_symbolic_block_buffers * tmpbuf)
{
	int i, j;
	int xpos = blk->xpos;
	int ypos = blk->ypos;
	int zpos = blk->zpos;

	#ifdef DEBUG_PRINT_DIAGNOSTICS
		if (print_diagnostics)
		{
			printf("Diagnostics of block of dimension %d x %d x %d\n\n", xdim, ydim, zdim);

			printf("XPos: %d  YPos: %d  ZPos: %d\n", xpos, ypos, zpos);

			printf("Red-min: %f   Red-max: %f\n", blk->red_min, blk->red_max);
			printf("Green-min: %f   Green-max: %f\n", blk->green_min, blk->green_max);
			printf("Blue-min: %f   Blue-max: %f\n", blk->blue_min, blk->blue_max);
			printf("Alpha-min: %f   Alpha-max: %f\n", blk->alpha_min, blk->alpha_max);
			printf("Grayscale: %d\n", blk->grayscale);

			for (int z = 0; z < zdim; z++)
				for (int y = 0; y < ydim; y++)
					for (int x = 0; x < xdim; x++)
					{
						int idx = ((z * ydim + y) * xdim + x) * 4;
						printf("Texel (%d %d %d) : orig=< %g, %g, %g, %g >, work=< %g, %g, %g, %g >\n",
							x, y, z,
							blk->orig_data[idx],
							blk->orig_data[idx + 1], blk->orig_data[idx + 2], blk->orig_data[idx + 3], blk->work_data[idx], blk->work_data[idx + 1], blk->work_data[idx + 2], blk->work_data[idx + 3]);
					}
			printf("\n");
		}
	#endif

	// every channel has a single value across the block: no search is needed.
	if (blk->red_min == blk->red_max && blk->green_min == blk->green_max && blk->blue_min == blk->blue_max && blk->alpha_min == blk->alpha_max)
	{
		// detected a constant-color block. Encode as FP16 if using HDR
		scb->error_block = 0;

		if (rgb_force_use_of_hdr)
		{
			// block_mode -1 with FP16 color values taken from the first texel.
			scb->block_mode = -1;
			scb->partition_count = 0;
			scb->constant_color[0] = float_to_sf16(blk->orig_data[0], SF_NEARESTEVEN);
			scb->constant_color[1] = float_to_sf16(blk->orig_data[1], SF_NEARESTEVEN);
			scb->constant_color[2] = float_to_sf16(blk->orig_data[2], SF_NEARESTEVEN);
			scb->constant_color[3] = float_to_sf16(blk->orig_data[3], SF_NEARESTEVEN);
		}
		else
		{
			// Encode as UNORM16 if NOT using HDR.
			// block_mode -2; each channel of the first texel is clamped to
			// [0, 1], scaled to 0..65535 and rounded to nearest.
			scb->block_mode = -2;
			scb->partition_count = 0;
			float red = blk->orig_data[0];
			float green = blk->orig_data[1];
			float blue = blk->orig_data[2];
			float alpha = blk->orig_data[3];
			if (red < 0)
				red = 0;
			else if (red > 1)
				red = 1;
			if (green < 0)
				green = 0;
			else if (green > 1)
				green = 1;
			if (blue < 0)
				blue = 0;
			else if (blue > 1)
				blue = 1;
			if (alpha < 0)
				alpha = 0;
			else if (alpha > 1)
				alpha = 1;
			scb->constant_color[0] = (int)floor(red * 65535.0f + 0.5f);
			scb->constant_color[1] = (int)floor(green * 65535.0f + 0.5f);
			scb->constant_color[2] = (int)floor(blue * 65535.0f + 0.5f);
			scb->constant_color[3] = (int)floor(alpha * 65535.0f + 0.5f);
		}

		#ifdef DEBUG_PRINT_DIAGNOSTICS
			if (print_diagnostics)
			{
				printf("Block is single-color <%4.4X %4.4X %4.4X %4.4X>\n", scb->constant_color[0], scb->constant_color[1], scb->constant_color[2], scb->constant_color[3]);
			}
		#endif

		if (print_tile_errors)
			printf("0\n");

		// round-trip through the physical representation, as is done for
		// searched blocks at the end of this function.
		physical_compressed_block psb = symbolic_to_physical(xdim, ydim, zdim, scb);
		physical_to_symbolic(xdim, ydim, zdim, psb, scb);

		return 0.0f;
	}

	error_weight_block *ewb = tmpbuf->ewb;
	error_weight_block_orig *ewbo = tmpbuf->ewbo;

	// error_weight_sum normalizes the early-out tests below.
	float error_weight_sum = prepare_error_weight_block(input_image,
														xdim, ydim, zdim,
														ewp, blk, ewb, ewbo);

	#ifdef DEBUG_PRINT_DIAGNOSTICS
		if (print_diagnostics)
		{
			printf("\n");
			for (int z = 0; z < zdim; z++)
				for (int y = 0; y < ydim; y++)
					for (int x = 0; x < xdim; x++)
					{
						int idx = (z * ydim + y) * xdim + x;
						printf("ErrorWeight (%d %d %d) : < %g, %g, %g, %g >\n", x, y, z, ewb->error_weights[idx].x, ewb->error_weights[idx].y, ewb->error_weights[idx].z, ewb->error_weights[idx].w);
					}
			printf("\n");
		}
	#endif

	// candidate blocks written by the compress_symbolic_block_fixed_partition_* calls
	symbolic_compressed_block *tempblocks = tmpbuf->tempblocks;

	float error_of_best_block = 1e20f;
	// int modesel=0;

	// scratch block that each candidate is decompressed into
	imageblock *temp = tmpbuf->temp;

	// Best error per tested mode. Index layout as written below:
	//   [0]      1 partition, 1 plane
	//   [1..4]   1 partition, 2 planes, second plane on component 0..3
	//   [4 * (partition_count - 2) + 5 + i]      N partitions, 1 plane, candidate i
	//   [4 * (partition_count - 2) + 5 + 2 + i]  N partitions, 2 planes, candidate i
	float best_errorvals_in_modes[17];
	for (i = 0; i < 17; i++)
		best_errorvals_in_modes[i] = 1e30f;

	int uses_alpha = imageblock_uses_alpha(blk);

	// compression of average-color blocks disabled for the time being;
	// they produce extremely severe block artifacts.
#if 0
	// first, compress an averaged-color block
	compress_constant_color_block(xdim, ydim, zdim, blk, ewb, scb);

	decompress_symbolic_block(decode_mode, xdim, ydim, zdim, xpos, ypos, zpos, scb, temp);

	float avgblock_errorval = compute_imageblock_difference(xdim, ydim, zdim,
															blk, temp, ewb) * 4.0f;	// bias somewhat against the average-color block.

	#ifdef DEBUG_PRINT_DIAGNOSTICS
		if (print_diagnostics)
		{
			printf("\n-----------------------------------\n");
			printf("Average-color block test completed\n");
			printf("Resulting error value: %g\n", avgblock_errorval);
		}
	#endif


	if (avgblock_errorval < error_of_best_block)
	{
		#ifdef DEBUG_PRINT_DIAGNOSTICS
			if (print_diagnostics)
				printf("Accepted as better than previous-best-error, which was %g\n", error_of_best_block);
		#endif

		error_of_best_block = avgblock_errorval;
		// *scb = tempblocks[j];
		modesel = 0;
	}

	#ifdef DEBUG_PRINT_DIAGNOSTICS
		if (print_diagnostics)
		{
			printf("-----------------------------------\n");
		}
	#endif
#endif


	float mode_cutoff = ewp->block_mode_cutoff;

	// next, test mode #0. This mode uses 1 plane of weights and 1 partition.
	// we test it twice, first with a modecutoff of 0, then with the specified mode-cutoff.
	// This causes an early-out that speeds up encoding of "easy" content.

	// The first pass has its error multiplied by 2.5, the second by 1.
	float modecutoffs[2];
	float errorval_mult[2] = { 2.5, 1 };
	modecutoffs[0] = 0;
	modecutoffs[1] = mode_cutoff;

	#if 0
		if ((error_of_best_block / error_weight_sum) < ewp->texel_avg_error_limit)
			goto END_OF_TESTS;
	#endif

	float best_errorval_in_mode;
	for (i = 0; i < 2; i++)
	{
		compress_symbolic_block_fixed_partition_1_plane(decode_mode, modecutoffs[i], ewp->max_refinement_iters, xdim, ydim, zdim, 1,	// partition count
														0,	// partition index
														blk, ewb, tempblocks, tmpbuf->plane1);

		best_errorval_in_mode = 1e30f;
		// evaluate the four candidates, skipping any flagged as an error block
		for (j = 0; j < 4; j++)
		{
			if (tempblocks[j].error_block)
				continue;
			decompress_symbolic_block(decode_mode, xdim, ydim, zdim, xpos, ypos, zpos, tempblocks + j, temp);
			float errorval = compute_imageblock_difference(xdim, ydim, zdim,
														   blk, temp, ewb) * errorval_mult[i];

			#ifdef DEBUG_PRINT_DIAGNOSTICS
				if (print_diagnostics)
				{
					printf("\n-----------------------------------\n");
					printf("Single-weight partition test 0 (1 partition) completed\n");
					printf("Resulting error value: %g\n", errorval);
				}
			#endif

			if (errorval < best_errorval_in_mode)
				best_errorval_in_mode = errorval;

			if (errorval < error_of_best_block)
			{
				#ifdef DEBUG_PRINT_DIAGNOSTICS
					if (print_diagnostics)
						printf("Accepted as better than previous-best-error, which was %g\n", error_of_best_block);
				#endif

				error_of_best_block = errorval;
				*scb = tempblocks[j];
			}

			#ifdef DEBUG_PRINT_DIAGNOSTICS
				if (print_diagnostics)
				{
					printf("-----------------------------------\n");
				}
			#endif
		}

		// the second pass overwrites the value stored by the first.
		best_errorvals_in_modes[0] = best_errorval_in_mode;
		if ((error_of_best_block / error_weight_sum) < ewp->texel_avg_error_limit)
			goto END_OF_TESTS;
	}

	int is_normal_map;
	float lowest_correl;
	prepare_block_statistics(xdim, ydim, zdim, blk, ewb, &is_normal_map, &lowest_correl);

	// for blocks classified as normal maps the correlation is raised to at
	// least 0.99 before it is compared against the dual-plane cutoff below.
	if (is_normal_map && lowest_correl < 0.99f)
		lowest_correl = 0.99f;

	// next, test the four possible 1-partition, 2-planes modes
	for (i = 0; i < 4; i++)
	{

		// dual-plane modes are skipped entirely when the lowest correlation
		// exceeds the configured cutoff.
		if (lowest_correl > ewp->lowest_correlation_cutoff)
			continue;

		// grayscale blocks only test component 3 as the separate plane.
		if (blk->grayscale && i != 3)
			continue;

		// component 3 is not tested as a separate plane when the block does not use alpha.
		if (!uses_alpha && i == 3)
			continue;

		compress_symbolic_block_fixed_partition_2_planes(decode_mode, mode_cutoff, ewp->max_refinement_iters, xdim, ydim, zdim, 1,	// partition count
														 0,	// partition index
														 i,	// the color component to test a separate plane of weights for.
														 blk, ewb, tempblocks, tmpbuf->planes2);

		best_errorval_in_mode = 1e30f;
		for (j = 0; j < 4; j++)
		{
			if (tempblocks[j].error_block)
				continue;
			decompress_symbolic_block(decode_mode, xdim, ydim, zdim, xpos, ypos, zpos, tempblocks + j, temp);
			float errorval = compute_imageblock_difference(xdim, ydim, zdim,
														   blk, temp, ewb);

			#ifdef DEBUG_PRINT_DIAGNOSTICS
				if (print_diagnostics)
				{
					printf("\n-----------------------------------\n");
					printf("Dual-weight partition test %d (1 partition) completed\n", i);
					printf("Resulting error value: %g\n", errorval);
				}
			#endif

			if (errorval < best_errorval_in_mode)
				best_errorval_in_mode = errorval;

			if (errorval < error_of_best_block)
			{
				#ifdef DEBUG_PRINT_DIAGNOSTICS
					if (print_diagnostics)
						printf("Accepted as better than previous-best-error, which was %g\n", error_of_best_block);
				#endif

				error_of_best_block = errorval;
				*scb = tempblocks[j];
			}

			#ifdef DEBUG_PRINT_DIAGNOSTICS
				if (print_diagnostics)
				{
					printf("-----------------------------------\n");
				}
			#endif

			// stored inside the candidate loop, so it is only updated for
			// candidates that are not error blocks.
			best_errorvals_in_modes[i + 1] = best_errorval_in_mode;
		}

		if ((error_of_best_block / error_weight_sum) < ewp->texel_avg_error_limit)
			goto END_OF_TESTS;
	}

	// find best blocks for 2, 3 and 4 partitions
	int partition_count;
	for (partition_count = 2; partition_count <= 4; partition_count++)
	{
		int partition_indices_1plane[2];
		int partition_indices_2planes[2];

		// NOTE(review): only &partition_indices_2planes[0] is passed, but both
		// [0] and [1] are read further down - presumably the callee fills two
		// consecutive entries; verify against find_best_partitionings().
		find_best_partitionings(ewp->partition_search_limit,
								xdim, ydim, zdim, partition_count, blk, ewb, 1,
								&(partition_indices_1plane[0]), &(partition_indices_1plane[1]), &(partition_indices_2planes[0]));

		// single weight plane, on each of the two candidate partitionings
		for (i = 0; i < 2; i++)
		{
			compress_symbolic_block_fixed_partition_1_plane(decode_mode, mode_cutoff, ewp->max_refinement_iters, xdim, ydim, zdim, partition_count, partition_indices_1plane[i], blk, ewb, tempblocks, tmpbuf->plane1);

			best_errorval_in_mode = 1e30f;
			for (j = 0; j < 4; j++)
			{
				if (tempblocks[j].error_block)
					continue;
				decompress_symbolic_block(decode_mode, xdim, ydim, zdim, xpos, ypos, zpos, tempblocks + j, temp);
				float errorval = compute_imageblock_difference(xdim, ydim, zdim,
															   blk, temp, ewb);

				#ifdef DEBUG_PRINT_DIAGNOSTICS
					if (print_diagnostics)
					{
						printf("\n-----------------------------------\n");
						printf("Single-weight partition test %d (%d partitions) completed\n", i, partition_count);
						printf("Resulting error value: %g\n", errorval);
					}
				#endif

				if (errorval < best_errorval_in_mode)
					best_errorval_in_mode = errorval;

				if (errorval < error_of_best_block)
				{
					#ifdef DEBUG_PRINT_DIAGNOSTICS
						if (print_diagnostics)
							printf("Accepted as better than previous-best-error, which was %g\n", error_of_best_block);
					#endif

					error_of_best_block = errorval;
					*scb = tempblocks[j];
				}
			}

			best_errorvals_in_modes[4 * (partition_count - 2) + 5 + i] = best_errorval_in_mode;

			#ifdef DEBUG_PRINT_DIAGNOSTICS
				if (print_diagnostics)
				{
					printf("-----------------------------------\n");
				}
			#endif

			if ((error_of_best_block / error_weight_sum) < ewp->texel_avg_error_limit)
				goto END_OF_TESTS;
		}


		// For non-normal-map blocks, stop the whole search if the better of the
		// two 2-partition results ([5], [6]) is still worse than the
		// 1-partition result ([0]) scaled by partition_1_to_2_limit.
		if (partition_count == 2 && !is_normal_map && MIN(best_errorvals_in_modes[5], best_errorvals_in_modes[6]) > (best_errorvals_in_modes[0] * ewp->partition_1_to_2_limit))
			goto END_OF_TESTS;

		// don't bother to check 4 partitions for dual plane of weights, ever.
		if (partition_count == 4)
			break;

		// two weight planes. Each entry of partition_indices_2planes is split
		// into its low bits (masked with PARTITION_COUNT - 1), passed as the
		// partition index, and the bits above PARTITION_BITS, passed in the
		// argument position used for the separate-plane component in the
		// 1-partition call above.
		for (i = 0; i < 2; i++)
		{
			if (lowest_correl > ewp->lowest_correlation_cutoff)
				continue;
			compress_symbolic_block_fixed_partition_2_planes(decode_mode,
															 mode_cutoff,
															 ewp->max_refinement_iters,
															 xdim, ydim, zdim,
															 partition_count,
															 partition_indices_2planes[i] & (PARTITION_COUNT - 1), partition_indices_2planes[i] >> PARTITION_BITS,
															 blk, ewb, tempblocks, tmpbuf->planes2);

			best_errorval_in_mode = 1e30f;
			for (j = 0; j < 4; j++)
			{
				if (tempblocks[j].error_block)
					continue;
				decompress_symbolic_block(decode_mode, xdim, ydim, zdim, xpos, ypos, zpos, tempblocks + j, temp);

				float errorval = compute_imageblock_difference(xdim, ydim, zdim,
															   blk, temp, ewb);

				#ifdef DEBUG_PRINT_DIAGNOSTICS
					if (print_diagnostics)
					{
						printf("\n-----------------------------------\n");
						printf("Dual-weight partition test %d (%d partitions) completed\n", i, partition_count);
						printf("Resulting error value: %g\n", errorval);
					}
				#endif

				if (errorval < best_errorval_in_mode)
					best_errorval_in_mode = errorval;

				if (errorval < error_of_best_block)
				{
					#ifdef DEBUG_PRINT_DIAGNOSTICS
						if (print_diagnostics)
							printf("Accepted as better than previous-best-error, which was %g\n", error_of_best_block);
					#endif

					error_of_best_block = errorval;
					*scb = tempblocks[j];
				}
			}

			best_errorvals_in_modes[4 * (partition_count - 2) + 5 + 2 + i] = best_errorval_in_mode;

			#ifdef DEBUG_PRINT_DIAGNOSTICS
				if (print_diagnostics)
				{
					printf("-----------------------------------\n");
				}
			#endif

			if ((error_of_best_block / error_weight_sum) < ewp->texel_avg_error_limit)
				goto END_OF_TESTS;
		}
	}

END_OF_TESTS:

	// record which block mode was chosen (negative modes are not counted).
	if (scb->block_mode >= 0)
		block_mode_histogram[scb->block_mode & 0x7ff]++;

	// compress/decompress to a physical block
	physical_compressed_block psb = symbolic_to_physical(xdim, ydim, zdim, scb);
	physical_to_symbolic(xdim, ydim, zdim, psb, scb);


	if (print_tile_errors)
		printf("%g\n", error_of_best_block);

	// mean squared error per color component.
	return error_of_best_block / ((float)xdim * ydim * zdim);
}