axmol/external/astc/astc_image_load_store.cpp

// ----------------------------------------------------------------------------
//  This confidential and proprietary software may be used only as authorised
//  by a licensing agreement from Arm Limited.
//      (C) COPYRIGHT 2011-2019 Arm Limited, ALL RIGHTS RESERVED
//  The entire notice above must be reproduced on all authorised copies and
//  copies may only be made to the extent permitted by a licensing agreement
//  from Arm Limited.
// ----------------------------------------------------------------------------

/**
 * @brief Functions for loading/storing ASTC compressed images.
 */


#include "astc_codec_internals.h"

#include <stdio.h>
#include "softfloat.h"

/**
 * @brief Free an image together with its texel storage.
 *
 * For whichever of the 8-bit / 16-bit tables is present, three arrays are
 * released: the texel array reached through [0][0], the row-pointer array
 * reached through [0], and the slice-pointer array itself.
 *
 * @param img  Image to free; a NULL pointer is ignored.
 */
void destroy_image(astc_codec_image * img)
{
	if (img != NULL)
	{
		uint8_t ***slices8 = img->imagedata8;
		if (slices8 != NULL)
		{
			delete[] slices8[0][0];
			delete[] slices8[0];
			delete[] slices8;
		}

		uint16_t ***slices16 = img->imagedata16;
		if (slices16 != NULL)
		{
			delete[] slices16[0][0];
			delete[] slices16[0];
			delete[] slices16;
		}

		delete img;
	}
}

/**
 * @brief Allocate an image with room for a padding border.
 *
 * The stored extent is the requested size plus @c padding texels on each
 * side in X and Y, and also in Z unless zsize is 1. Texels are four
 * components wide and live in one contiguous array; a row-pointer table and
 * a slice-pointer table are built on top of it so that texel data can be
 * addressed as imagedata[z][y][4 * x + component].
 *
 * @param bitness  8 or 16; any other value raises ASTC_CODEC_INTERNAL_ERROR().
 * @param xsize    Image width without padding.
 * @param ysize    Image height without padding.
 * @param zsize    Image depth without padding.
 * @param padding  Border width in texels.
 * @return Newly allocated image; its texel contents are not initialised here.
 */
astc_codec_image *allocate_image(int bitness, int xsize, int ysize, int zsize, int padding)
{
	astc_codec_image *img = new astc_codec_image;
	img->xsize = xsize;
	img->ysize = ysize;
	img->zsize = zsize;
	img->padding = padding;

	const int exsize = xsize + 2 * padding;
	const int eysize = ysize + 2 * padding;
	const int ezsize = (zsize == 1) ? 1 : zsize + 2 * padding;

	// Rows of all slices are laid out back to back in the texel array.
	const int row_count = ezsize * eysize;
	const int row_stride = 4 * exsize;

	if (bitness == 16)
	{
		uint16_t ***slices = new uint16_t **[ezsize];
		uint16_t **rows = new uint16_t *[row_count];
		uint16_t *texels = new uint16_t[4 * ezsize * eysize * exsize];

		for (int r = 0; r < row_count; r++)
			rows[r] = texels + r * row_stride;
		for (int s = 0; s < ezsize; s++)
			slices[s] = rows + s * eysize;

		img->imagedata16 = slices;
		img->imagedata8 = NULL;
	}
	else if (bitness == 8)
	{
		uint8_t ***slices = new uint8_t **[ezsize];
		uint8_t **rows = new uint8_t *[row_count];
		uint8_t *texels = new uint8_t[4 * ezsize * eysize * exsize];

		for (int r = 0; r < row_count; r++)
			rows[r] = texels + r * row_stride;
		for (int s = 0; s < ezsize; s++)
			slices[s] = rows + s * eysize;

		img->imagedata8 = slices;
		img->imagedata16 = NULL;
	}
	else
	{
		ASTC_CODEC_INTERNAL_ERROR();
	}

	return img;
}

/**
 * @brief Fill every texel of an image, padding included, with a default color.
 *
 * RGB is set to zero. Alpha is set to 0xFF for 8-bit images and to 0x3C00
 * (the FP16 bit pattern of 1.0) for 16-bit images. If the image has neither
 * an 8-bit nor a 16-bit table, ASTC_CODEC_INTERNAL_ERROR() is raised.
 *
 * @param img  Image to initialise.
 */
void initialize_image(astc_codec_image * img)
{
	const int pad = img->padding;
	const int exsize = img->xsize + 2 * pad;
	const int eysize = img->ysize + 2 * pad;
	const int ezsize = (img->zsize == 1) ? 1 : img->zsize + 2 * pad;

	if (img->imagedata8)
	{
		for (int z = 0; z < ezsize; z++)
		{
			for (int y = 0; y < eysize; y++)
			{
				uint8_t *texel = img->imagedata8[z][y];
				for (int x = 0; x < exsize; x++, texel += 4)
				{
					texel[0] = 0;
					texel[1] = 0;
					texel[2] = 0;
					texel[3] = 0xFF;
				}
			}
		}
		return;
	}

	if (img->imagedata16)
	{
		for (int z = 0; z < ezsize; z++)
		{
			for (int y = 0; y < eysize; y++)
			{
				uint16_t *texel = img->imagedata16[z][y];
				for (int x = 0; x < exsize; x++, texel += 4)
				{
					texel[0] = 0;
					texel[1] = 0;
					texel[2] = 0;
					texel[3] = 0x3C00;
				}
			}
		}
		return;
	}

	ASTC_CODEC_INTERNAL_ERROR();
}

// Fill the padding border of an image with clamp-to-edge data.
//
// Every stored texel (interior and border alike) is overwritten with the
// interior texel whose coordinates are nearest to it. Interior texels are
// therefore copied onto themselves; this is wasteful but simple, and was
// judged not to matter for total running time.
// Does nothing when the image has no padding.
void fill_image_padding_area(astc_codec_image * img)
{
	const int pad = img->padding;
	if (pad == 0)
		return;

	const bool flat = (img->zsize == 1);	// single-slice images carry no Z padding

	const int exsize = img->xsize + 2 * pad;
	const int eysize = img->ysize + 2 * pad;
	const int ezsize = flat ? 1 : (img->zsize + 2 * pad);

	// Inclusive bounds of the interior region in stored coordinates.
	const int xmin = pad;
	const int ymin = pad;
	const int zmin = flat ? 0 : pad;
	const int xmax = img->xsize + pad - 1;
	const int ymax = img->ysize + pad - 1;
	const int zmax = flat ? 0 : img->zsize + pad - 1;

	// Possible optimizations: test whether a texel is outside the edge, or
	// loop only over the texels known to be outside the edge.
	if (img->imagedata8)
	{
		for (int z = 0; z < ezsize; z++)
		{
			const int zc = MIN(MAX(z, zmin), zmax);
			for (int y = 0; y < eysize; y++)
			{
				const int yc = MIN(MAX(y, ymin), ymax);
				uint8_t *dst_row = img->imagedata8[z][y];
				const uint8_t *src_row = img->imagedata8[zc][yc];
				for (int x = 0; x < exsize; x++)
				{
					const int xc = MIN(MAX(x, xmin), xmax);
					for (int c = 0; c < 4; c++)
						dst_row[4 * x + c] = src_row[4 * xc + c];
				}
			}
		}
	}
	else if (img->imagedata16)
	{
		for (int z = 0; z < ezsize; z++)
		{
			const int zc = MIN(MAX(z, zmin), zmax);
			for (int y = 0; y < eysize; y++)
			{
				const int yc = MIN(MAX(y, ymin), ymax);
				uint16_t *dst_row = img->imagedata16[z][y];
				const uint16_t *src_row = img->imagedata16[zc][yc];
				for (int x = 0; x < exsize; x++)
				{
					const int xc = MIN(MAX(x, xmin), xmax);
					for (int c = 0; c < 4; c++)
						dst_row[4 * x + c] = src_row[4 * xc + c];
				}
			}
		}
	}
}

/**
 * @brief Scan an image and report how many color channels it effectively uses.
 *
 * Texels are read at indices [0, xsize) x [0, ysize) x [0, zsize) of the
 * image tables. The count starts at 1, gains 2 if any scanned texel has
 * R != G or R != B, and gains 1 if any scanned texel's alpha differs from
 * 1.0 (0xFF for 8-bit data, the FP16 pattern 0x3C00 for 16-bit data).
 *
 * When the image has no 8-bit table the 16-bit table is read unconditionally.
 *
 * @param img  Image to scan.
 * @return 1, 2, 3 or 4.
 */
int determine_image_channels(const astc_codec_image * img)
{
	const int xsize = img->xsize;
	const int ysize = img->ysize;
	const int zsize = img->zsize;

	int lum_mask = 0;		// stays 0 only while R == G == B everywhere
	int alpha_mask;			// AND-accumulator; stays all-ones only while alpha is 1.0
	int alpha_mask_ref;		// the all-ones pattern for the active bit depth

	if (img->imagedata8)
	{
		alpha_mask_ref = 0xFF;
		alpha_mask = 0xFF;
		for (int z = 0; z < zsize; z++)
		{
			for (int y = 0; y < ysize; y++)
			{
				const uint8_t *row = img->imagedata8[z][y];
				for (int x = 0; x < xsize; x++)
				{
					const int r = row[4 * x];
					const int g = row[4 * x + 1];
					const int b = row[4 * x + 2];
					const int a = row[4 * x + 3];
					lum_mask |= (r ^ g) | (r ^ b);
					alpha_mask &= a;
				}
			}
		}
	}
	else
	{
		alpha_mask_ref = 0xFFFF;
		alpha_mask = 0xFFFF;
		for (int z = 0; z < zsize; z++)
		{
			for (int y = 0; y < ysize; y++)
			{
				const uint16_t *row = img->imagedata16[z][y];
				for (int x = 0; x < xsize; x++)
				{
					const int r = row[4 * x];
					const int g = row[4 * x + 1];
					const int b = row[4 * x + 2];
					const int a = row[4 * x + 3];
					lum_mask |= (r ^ g) | (r ^ b);
					// a ^ 0xC3FF is 0xFFFF if and only if a is 0x3C00, i.e. 1.0
					alpha_mask &= (a ^ 0xC3FF);
				}
			}
		}
	}

	int image_channels = 1;
	if (lum_mask != 0)
		image_channels += 2;
	if (alpha_mask != alpha_mask_ref)
		image_channels += 1;

	return image_channels;
}

// Conversion functions between the LNS representation and the FP16 representation.

// Convert a float to its LNS code, returned as a float.
// * NaN and values at or below 2^-26 map to 0.
// * Magnitudes of 65536 or more map to 65535.
// * Otherwise the result is exponent * 2048 + a piecewise-linear remap of
//   the mantissa + 1.
float float_to_lns(float p)
{
	const float underflow_limit = 1.0f / 67108864.0f;	// 2^-26

	if (astc_isnan(p) || p <= underflow_limit)
	{
		// underflow or NaN value
		return 0;
	}

	if (fabs(p) >= 65536.0f)
	{
		// overflow
		return 65535;
	}

	int exponent;
	float fraction = frexp(p, &exponent);
	float mantissa;

	if (exponent >= -13)
	{
		mantissa = (fraction - 0.5f) * 4096.0f;
		exponent += 14;
	}
	else
	{
		// input number is smaller than 2^-14: scale it by 2^25 and use exponent 0.
		mantissa = p * 33554432.0f;
		exponent = 0;
	}

	// Piecewise-linear remap of the mantissa; breakpoints at 384 and 1408.
	if (mantissa < 384.0f)
		mantissa *= 4.0f / 3.0f;
	else if (mantissa <= 1408.0f)
		mantissa += 128.0f;
	else
		mantissa = (mantissa + 512.0f) * (4.0f / 5.0f);

	mantissa += exponent * 2048.0f;
	return mantissa + 1.0f;
}

// Convert a 16-bit LNS code to an FP16 bit pattern.
// The low 11 bits of the input are a mantissa code and the high 5 bits an
// exponent. The mantissa code is expanded piecewise-linearly (breakpoints at
// 512 and 1536), its top bits are merged under the exponent, and the result
// is capped at 0x7BFF.
uint16_t lns_to_sf16(uint16_t p)
{
	const uint16_t exponent = p >> 11;
	const uint16_t mantissa = p & 0x7FF;

	uint16_t expanded;
	if (mantissa >= 1536)
		expanded = 5 * mantissa - 2048;
	else if (mantissa >= 512)
		expanded = 4 * mantissa - 512;
	else
		expanded = 3 * mantissa;

	const uint16_t packed = (exponent << 10) | (expanded >> 3);
	return (packed >= 0x7BFF) ? 0x7BFF : packed;
}

// Convert a 16-bit UNORM value to an FP16 bit pattern.
// An earlier note claimed LDR interpolation can never yield a denormal
// result; that is false, because a very small UNORM16 can arrive through the
// constant-block. Inputs below 4 are therefore handled separately.
uint16_t unorm16_to_sf16(uint16_t p)
{
	if (p == 0xFFFF)
		return 0x3C00;			// value of 1.0 .
	if (p < 4)
		return p << 8;

	// number of leading zero bits within the 16-bit value
	const int leading = clz32(p) - 16;

	// Shift the top set bit out of the 16-bit word (the narrowing cast drops
	// it), then keep the next 10 bits as the mantissa field.
	uint16_t mantissa = (uint16_t)(p << (leading + 1));
	mantissa >>= 6;

	return (uint16_t)(mantissa | ((14 - leading) << 10));
}

void imageblock_initialize_deriv_from_work_and_orig(imageblock * pb, int pixelcount)
{
	int i;

	const float *fptr = pb->orig_data;
	const float *wptr = pb->work_data;
	float *dptr = pb->deriv_data;

	for (i = 0; i < pixelcount; i++)
	{
		// compute derivatives for RGB first
		if (pb->rgb_lns[i])
		{
			float r = MAX(fptr[0], 6e-5f);
			float g = MAX(fptr[1], 6e-5f);
			float b = MAX(fptr[2], 6e-5f);

			float rderiv = (float_to_lns(r * 1.05f) - float_to_lns(r)) / (r * 0.05f);
			float gderiv = (float_to_lns(g * 1.05f) - float_to_lns(g)) / (g * 0.05f);
			float bderiv = (float_to_lns(b * 1.05f) - float_to_lns(b)) / (b * 0.05f);

			// the derivative may not actually take values smaller than 1/32 or larger than 2^25;
			// if it does, we clamp it.
			if (rderiv < (1.0f / 32.0f))
				rderiv = (1.0f / 32.0f);
			else if (rderiv > 33554432.0f)
				rderiv = 33554432.0f;

			if (gderiv < (1.0f / 32.0f))
				gderiv = (1.0f / 32.0f);
			else if (gderiv > 33554432.0f)
				gderiv = 33554432.0f;

			if (bderiv < (1.0f / 32.0f))
				bderiv = (1.0f / 32.0f);
			else if (bderiv > 33554432.0f)
				bderiv = 33554432.0f;

			dptr[0] = rderiv;
			dptr[1] = gderiv;
			dptr[2] = bderiv;
		}
		else
		{
			dptr[0] = 65535.0f;
			dptr[1] = 65535.0f;
			dptr[2] = 65535.0f;
		}

		// then compute derivatives for Alpha
		if (pb->alpha_lns[i])
		{
			float a = MAX(fptr[3], 6e-5f);
			float aderiv = (float_to_lns(a * 1.05f) - float_to_lns(a)) / (a * 0.05f);
			// the derivative may not actually take values smaller than 1/32 or larger than 2^25;
			// if it does, we clamp it.
			if (aderiv < (1.0f / 32.0f))
				aderiv = (1.0f / 32.0f);
			else if (aderiv > 33554432.0f)
				aderiv = 33554432.0f;

			dptr[3] = aderiv;
		}
		else
		{
			dptr[3] = 65535.0f;
		}

		fptr += 4;
		wptr += 4;
		dptr += 4;
	}
}

// Helper: derive pb->work_data from pb->orig_data for the first pixelcount
// texels. LNS-flagged components go through float_to_lns(); the others are
// scaled by 65535. The derivative data is recomputed afterwards.
void imageblock_initialize_work_from_orig(imageblock * pb, int pixelcount)
{
	for (int t = 0; t < pixelcount; t++)
	{
		const float *orig = pb->orig_data + 4 * t;
		float *work = pb->work_data + 4 * t;

		if (pb->rgb_lns[t])
		{
			for (int c = 0; c < 3; c++)
				work[c] = float_to_lns(orig[c]);
		}
		else
		{
			for (int c = 0; c < 3; c++)
				work[c] = orig[c] * 65535.0f;
		}

		work[3] = pb->alpha_lns[t] ? float_to_lns(orig[3]) : orig[3] * 65535.0f;
	}

	imageblock_initialize_deriv_from_work_and_orig(pb, pixelcount);
}

// Helper: derive pb->orig_data from pb->work_data for the first pixelcount
// texels. Each work value is truncated to uint16_t, converted to an FP16
// pattern (lns_to_sf16 for LNS-flagged components, unorm16_to_sf16
// otherwise) and then widened to float. The derivative data is recomputed
// afterwards.
void imageblock_initialize_orig_from_work(imageblock * pb, int pixelcount)
{
	for (int t = 0; t < pixelcount; t++)
	{
		float *orig = pb->orig_data + 4 * t;
		const float *work = pb->work_data + 4 * t;

		if (pb->rgb_lns[t])
		{
			for (int c = 0; c < 3; c++)
				orig[c] = sf16_to_float(lns_to_sf16((uint16_t) work[c]));
		}
		else
		{
			for (int c = 0; c < 3; c++)
				orig[c] = sf16_to_float(unorm16_to_sf16((uint16_t) work[c]));
		}

		if (pb->alpha_lns[t])
			orig[3] = sf16_to_float(lns_to_sf16((uint16_t) work[3]));
		else
			orig[3] = sf16_to_float(unorm16_to_sf16((uint16_t) work[3]));
	}

	imageblock_initialize_deriv_from_work_and_orig(pb, pixelcount);
}

// fetch an imageblock from the input file.
void fetch_imageblock(const astc_codec_image * img, imageblock * pb,	// picture-block to initialize with image data
					  // block dimensions
					  int xdim, int ydim, int zdim,
					  // position in texture.
					  int xpos, int ypos, int zpos, swizzlepattern swz)
{
	float *fptr = pb->orig_data;
	int xsize = img->xsize + 2 * img->padding;
	int ysize = img->ysize + 2 * img->padding;
	int zsize = (img->zsize == 1) ? 1 : img->zsize + 2 * img->padding;

	int x, y, z, i;

	pb->xpos = xpos;
	pb->ypos = ypos;
	pb->zpos = zpos;

	xpos += img->padding;
	ypos += img->padding;
	if (img->zsize > 1)
		zpos += img->padding;

	float data[6];
	data[4] = 0;
	data[5] = 1;

	if (img->imagedata8)
	{
		for (z = 0; z < zdim; z++)
		{
			for (y = 0; y < ydim; y++)
			{
				for (x = 0; x < xdim; x++)
				{
					int xi = xpos + x;
					int yi = ypos + y;
					int zi = zpos + z;
					// clamp XY coordinates to the picture.
					if (xi < 0)
						xi = 0;
					if (yi < 0)
						yi = 0;
					if (zi < 0)
						zi = 0;
					if (xi >= xsize)
						xi = xsize - 1;
					if (yi >= ysize)
						yi = ysize - 1;
					if (zi >= zsize)
						zi = zsize - 1;

					int r = img->imagedata8[zi][yi][4 * xi];
					int g = img->imagedata8[zi][yi][4 * xi + 1];
					int b = img->imagedata8[zi][yi][4 * xi + 2];
					int a = img->imagedata8[zi][yi][4 * xi + 3];

					data[0] = r / 255.0f;
					data[1] = g / 255.0f;
					data[2] = b / 255.0f;
					data[3] = a / 255.0f;

					fptr[0] = data[swz.r];
					fptr[1] = data[swz.g];
					fptr[2] = data[swz.b];
					fptr[3] = data[swz.a];

					fptr += 4;
				}
			}
		}
	}
	else if (img->imagedata16)
	{
		for (z = 0; z < zdim; z++)
		{
			for (y = 0; y < ydim; y++)
			{
				for (x = 0; x < xdim; x++)
				{
					int xi = xpos + x;
					int yi = ypos + y;
					int zi = zpos + z;
					// clamp XY coordinates to the picture.
					if (xi < 0)
						xi = 0;
					if (yi < 0)
						yi = 0;
					if (zi < 0)
						zi = 0;
					if (xi >= xsize)
						xi = xsize - 1;
					if (yi >= ysize)
						yi = ysize - 1;
					if (zi >= ysize)
						zi = zsize - 1;

					int r = img->imagedata16[zi][yi][4 * xi];
					int g = img->imagedata16[zi][yi][4 * xi + 1];
					int b = img->imagedata16[zi][yi][4 * xi + 2];
					int a = img->imagedata16[zi][yi][4 * xi + 3];

					float rf = sf16_to_float(r);
					float gf = sf16_to_float(g);
					float bf = sf16_to_float(b);
					float af = sf16_to_float(a);

					// equalize the color components somewhat, and get rid of negative values.
					rf = MAX(rf, 1e-8f);
					gf = MAX(gf, 1e-8f);
					bf = MAX(bf, 1e-8f);
					af = MAX(af, 1e-8f);

					data[0] = rf;
					data[1] = gf;
					data[2] = bf;
					data[3] = af;

					fptr[0] = data[swz.r];
					fptr[1] = data[swz.g];
					fptr[2] = data[swz.b];
					fptr[3] = data[swz.a];
					fptr += 4;
				}
			}
		}
	}

	// perform sRGB-to-linear transform on input data, if requested.
	int pixelcount = xdim * ydim * zdim;

	if (perform_srgb_transform)
	{
		fptr = pb->orig_data;
		for (i = 0; i < pixelcount; i++)
		{
			float r = fptr[0];
			float g = fptr[1];
			float b = fptr[2];

			if (r <= 0.04045f)
				r = r * (1.0f / 12.92f);
			else if (r <= 1)
				r = pow((r + 0.055f) * (1.0f / 1.055f), 2.4f);

			if (g <= 0.04045f)
				g = g * (1.0f / 12.92f);
			else if (g <= 1)
				g = pow((g + 0.055f) * (1.0f / 1.055f), 2.4f);

			if (b <= 0.04045f)
				b = b * (1.0f / 12.92f);
			else if (b <= 1)
				b = pow((b + 0.055f) * (1.0f / 1.055f), 2.4f);

			fptr[0] = r;
			fptr[1] = g;
			fptr[2] = b;

			fptr += 4;
		}
	}

	// collect color max-value, in order to determine whether to use LDR or HDR
	// interpolation.
	float max_red, max_green, max_blue, max_alpha;
	max_red = 0.0f;
	max_green = 0.0f;
	max_blue = 0.0f;
	max_alpha = 0.0f;

	fptr = pb->orig_data;
	for (i = 0; i < pixelcount; i++)
	{
		float r = fptr[0];
		float g = fptr[1];
		float b = fptr[2];
		float a = fptr[3];

		if (r > max_red)
			max_red = r;
		if (g > max_green)
			max_green = g;
		if (b > max_blue)
			max_blue = b;
		if (a > max_alpha)
			max_alpha = a;

		fptr += 4;
	}

	float max_rgb = MAX(max_red, MAX(max_green, max_blue));

	// use LNS if:
	// * RGB-maximum is less than 0.15
	// * RGB-maximum is greater than 1
	// * Alpha-maximum is greater than 1
	int rgb_lns = (max_rgb < 0.15f || max_rgb > 1.0f || max_alpha > 1.0f) ? 1 : 0;
	int alpha_lns = rgb_lns ? (max_alpha > 1.0f || max_alpha < 0.15f) : 0;

	// not yet though; for the time being, just obey the command line.
	rgb_lns = rgb_force_use_of_hdr;
	alpha_lns = alpha_force_use_of_hdr;

	// impose the choice on every pixel when encoding.
	for (i = 0; i < pixelcount; i++)
	{
		pb->rgb_lns[i] = rgb_lns;
		pb->alpha_lns[i] = alpha_lns;
		pb->nan_texel[i] = 0;
	}

	imageblock_initialize_work_from_orig(pb, pixelcount);
	update_imageblock_flags(pb, xdim, ydim, zdim);
}

// Encode one linear color component as sRGB. Values above 1 (and NaN) are
// returned unchanged.
static float linear_to_srgb_component(float v)
{
	if (v <= 0.0031308f)
		return v * 12.92f;
	if (v <= 1)
		return 1.055f * pow(v, (1.0f / 2.4f)) - 0.055f;
	return v;
}

// Write a decoded imageblock into an image.
//
// The block's orig_data is assumed to be valid. Texels of the block that
// fall outside [0, xsize) x [0, ysize) x [0, zsize) of the image are skipped.
// Texels flagged in pb->nan_texel are written as purple (FF,00,FF,FF) in
// 8-bit images and as 0xFFFF in every component of 16-bit images.
// Otherwise RGB is optionally sRGB-encoded (perform_srgb_transform), the
// swizzle swz is applied, and the result is stored as rounded 8-bit values
// (after clamping the four color components to at most 1) or as FP16.
// Swizzle selector 4 yields 0, 5 yields 1, and 6 yields a Z component
// reconstructed from the first color component and alpha, treated as a
// unit-vector's X and Y.
//
// Parameters:
//   img               destination image
//   pb                block to read from
//   xdim, ydim, zdim  block dimensions
//   xpos, ypos, zpos  position to write the block to
//   swz               component swizzle applied while writing
void write_imageblock(astc_codec_image * img, const imageblock * pb,	// picture-block to initialize with image data. We assume that orig_data is valid.
					  // block dimensions
					  int xdim, int ydim, int zdim,
					  // position to write the block to
					  int xpos, int ypos, int zpos, swizzlepattern swz)
{
	// The 8-bit table takes precedence; with neither table there is nothing to do.
	const bool store8 = (img->imagedata8 != NULL);
	if (!store8 && !img->imagedata16)
		return;

	const int xsize = img->xsize;
	const int ysize = img->ysize;
	const int zsize = img->zsize;

	float data[7];
	data[4] = 0.0f;
	data[5] = 1.0f;

	int texel = 0;		// running index of the current texel within the block
	for (int z = 0; z < zdim; z++)
	{
		for (int y = 0; y < ydim; y++)
		{
			for (int x = 0; x < xdim; x++, texel++)
			{
				const int xi = xpos + x;
				const int yi = ypos + y;
				const int zi = zpos + z;

				if (xi < 0 || yi < 0 || zi < 0 || xi >= xsize || yi >= ysize || zi >= zsize)
					continue;

				if (pb->nan_texel[texel])
				{
					if (store8)
					{
						// NaN-pixel, but we can't display it. Display purple instead.
						uint8_t *dst = img->imagedata8[zi][yi] + 4 * xi;
						dst[0] = 0xFF;
						dst[1] = 0x00;
						dst[2] = 0xFF;
						dst[3] = 0xFF;
					}
					else
					{
						uint16_t *dst = img->imagedata16[zi][yi] + 4 * xi;
						dst[0] = 0xFFFF;
						dst[1] = 0xFFFF;
						dst[2] = 0xFFFF;
						dst[3] = 0xFFFF;
					}
					continue;
				}

				const float *src = pb->orig_data + 4 * texel;

				if (perform_srgb_transform)
				{
					data[0] = linear_to_srgb_component(src[0]);
					data[1] = linear_to_srgb_component(src[1]);
					data[2] = linear_to_srgb_component(src[2]);
				}
				else
				{
					data[0] = src[0];
					data[1] = src[1];
					data[2] = src[2];
				}
				data[3] = src[3];

				// Reconstruct Z from X (component 0) and Y (alpha), both
				// remapped from [0,1] to [-1,1]; the result is mapped back.
				const float xn = (data[0] * 2.0f) - 1.0f;
				const float yn = (data[3] * 2.0f) - 1.0f;
				float zn = 1.0f - xn * xn - yn * yn;
				if (zn < 0.0f)
					zn = 0.0f;
				data[6] = (sqrt(zn) * 0.5f) + 0.5f;

				if (store8)
				{
					// clamp the color components to at most 1
					for (int c = 0; c < 4; c++)
					{
						if (data[c] > 1.0f)
							data[c] = 1.0f;
					}

					// apply swizzle and pack the data
					const int ri = static_cast < int >(floor(data[swz.r] * 255.0f + 0.5f));
					const int gi = static_cast < int >(floor(data[swz.g] * 255.0f + 0.5f));
					const int bi = static_cast < int >(floor(data[swz.b] * 255.0f + 0.5f));
					const int ai = static_cast < int >(floor(data[swz.a] * 255.0f + 0.5f));

					uint8_t *dst = img->imagedata8[zi][yi] + 4 * xi;
					dst[0] = ri;
					dst[1] = gi;
					dst[2] = bi;
					dst[3] = ai;
				}
				else
				{
					// apply swizzle and convert to FP16
					const int r = float_to_sf16(data[swz.r], SF_NEARESTEVEN);
					const int g = float_to_sf16(data[swz.g], SF_NEARESTEVEN);
					const int b = float_to_sf16(data[swz.b], SF_NEARESTEVEN);
					const int a = float_to_sf16(data[swz.a], SF_NEARESTEVEN);

					uint16_t *dst = img->imagedata16[zi][yi] + 4 * xi;
					dst[0] = r;
					dst[1] = g;
					dst[2] = b;
					dst[3] = a;
				}
			}
		}
	}
}

/*
   Recompute the summary fields of an imageblock: per-channel minimum and
   maximum, plus the grayscale flag (set when every texel has R == G == B).
   The values are taken from work_data, not orig_data.
*/
void update_imageblock_flags(imageblock * pb, int xdim, int ydim, int zdim)
{
	// Index 0..3 = red, green, blue, alpha.
	float lowest[4] = { 1e38f, 1e38f, 1e38f, 1e38f };
	float highest[4] = { -1e38f, -1e38f, -1e38f, -1e38f };

	const int texels_per_block = xdim * ydim * zdim;
	int grayscale = 1;

	for (int t = 0; t < texels_per_block; t++)
	{
		const float *px = pb->work_data + 4 * t;

		for (int c = 0; c < 4; c++)
		{
			if (px[c] < lowest[c])
				lowest[c] = px[c];
			if (px[c] > highest[c])
				highest[c] = px[c];
		}

		if (grayscale == 1 && (px[0] != px[1] || px[0] != px[2]))
			grayscale = 0;
	}

	pb->red_min = lowest[0];
	pb->red_max = highest[0];
	pb->green_min = lowest[1];
	pb->green_max = highest[1];
	pb->blue_min = lowest[2];
	pb->blue_max = highest[2];
	pb->alpha_min = lowest[3];
	pb->alpha_max = highest[3];
	pb->grayscale = grayscale;
}

// Helper functions for various error-metric calculations

// Clamp p to [0, 65504]; NaN becomes 0.
double clampx(double p)
{
	if (astc_isnan(p) || p < 0.0f)
		return 0.0f;
	if (p > 65504.0f)
		return 65504.0f;
	return p;
}

// Base-2 logarithm that is replaced by a straight line below 2^-14, so that
// it stays finite for inputs down to and including zero.
double xlog2(double p)
{
	// 0.00006103515625 is 2^-14. The negated >= keeps NaN on the linear path.
	if (!(p >= 0.00006103515625))
		return -15.44269504088896340735 + p * 23637.11554992477646609062;

	return log(p) * 1.44269504088896340735;	// log(x)/log(2)
}

// mPSNR tone-mapping operator.
// Scales v by 2^fstop, applies a 1/2.2 power curve, scales to 255 and clamps
// the result to [0, 255]; NaN becomes 0.
double mpsnr_operator(double v, int fstop)
{
	// 2^fstop, formed as 2^(fstop + 32) in fixed point and then divided by
	// 2^32 so that negative fstops can be represented.
	const int64_t fixed_scale = 1LL << (fstop + 32);
	const double exposure = (double)fixed_scale * (1.0 / 4294967296.0);

	double mapped = v * exposure;
	mapped = pow(mapped, (1.0 / 2.2));
	mapped *= 255.0f;

	if (astc_isnan(mapped) || mapped < 0.0f)
		return 0.0f;
	if (mapped > 255.0f)
		return 255.0f;
	return mapped;
}

// Sum, over every fstop in [low_fstop, high_fstop], of the squared
// difference between the tone-mapped values of v1 and v2.
double mpsnr_sumdiff(double v1, double v2, int low_fstop, int high_fstop)
{
	double total = 0.0;
	for (int fstop = low_fstop; fstop <= high_fstop; fstop++)
	{
		const double mapped1 = mpsnr_operator(v1, fstop);
		const double mapped2 = mpsnr_operator(v2, fstop);
		const double delta = mapped1 - mapped2;
		total += delta * delta;
	}
	return total;
}

// Compute PSNR and other error metrics between input and output image.
//
// The two images are compared over the intersection of their (unpadded)
// extents; a warning is printed when the sizes differ. 8-bit texels are
// scaled to [0, 1]; 16-bit texels are converted from FP16 and clamped with
// clampx(). Results are only printed when psnrmode is 1.
//
// Parameters:
//   compute_hdr_error_metrics  non-zero to also accumulate and print the
//                              peak-normalized PSNR, mPSNR and LogRMSE
//   input_components           index (0..4) into the channel-mask table that
//                              selects which channels contribute to the PSNR
//                              NOTE(review): the index is not range-checked
//   img1, img2                 images to compare; img1 supplies the alpha
//                              weighting and the RGB peak
//   low_fstop, high_fstop      inclusive fstop range for mPSNR
//   psnrmode                   1 to print the metrics
void compute_error_metrics(int compute_hdr_error_metrics, int input_components, const astc_codec_image * img1, const astc_codec_image * img2, int low_fstop, int high_fstop, int psnrmode)
{
	int x, y, z;
	// Bit 0..3 of a mask enable the R, G, B, A error sums respectively.
	static int channelmasks[5] = { 0x00, 0x07, 0x0C, 0x07, 0x0F };
	int channelmask;

	channelmask = channelmasks[input_components];

	double4 errorsum = double4(0, 0, 0, 0);
	double4 alpha_scaled_errorsum = double4(0, 0, 0, 0);
	double4 log_errorsum = double4(0, 0, 0, 0);
	double4 mpsnr_errorsum = double4(0, 0, 0, 0);

	int xsize = MIN(img1->xsize, img2->xsize);
	int ysize = MIN(img1->ysize, img2->ysize);
	int zsize = MIN(img1->zsize, img2->zsize);

	if (img1->xsize != img2->xsize || img1->ysize != img2->ysize || img1->zsize != img2->zsize)
	{
		printf("Warning: comparing images of different size:\n"
			   "Image 1: %dx%dx%d\n" "Image 2: %dx%dx%d\n" "Only intersection region will be compared.\n", img1->xsize, img1->ysize, img1->zsize, img2->xsize, img2->ysize, img2->zsize);
	}

	if (compute_hdr_error_metrics)
	{
		printf("Computing error metrics ... ");
		fflush(stdout);
	}

	int img1pad = img1->padding;
	int img2pad = img2->padding;

	double rgb_peak = 0.0f;

	for (z = 0; z < zsize; z++)
	{
		for (y = 0; y < ysize; y++)
		{
			// Stored coordinates: skip each image's own padding border
			// (single-slice images have no Z padding).
			int ze1 = (img1->zsize == 1) ? z : z + img1pad;
			int ze2 = (img2->zsize == 1) ? z : z + img2pad;

			int ye1 = y + img1pad;
			int ye2 = y + img2pad;

			for (x = 0; x < xsize; x++)
			{
				double4 input_color1;
				double4 input_color2;

				// Component index of the texel's first channel within its row.
				int xe1 = 4 * x + 4 * img1pad;
				int xe2 = 4 * x + 4 * img2pad;

				if (img1->imagedata8)
				{
					input_color1 =
						double4(img1->imagedata8[ze1][ye1][xe1] * (1.0f / 255.0f),
								img1->imagedata8[ze1][ye1][xe1 + 1] * (1.0f / 255.0f),
								img1->imagedata8[ze1][ye1][xe1 + 2] * (1.0f / 255.0f),
								img1->imagedata8[ze1][ye1][xe1 + 3] * (1.0f / 255.0f));
				}
				else
				{
					input_color1 =
						double4(clampx(sf16_to_float(img1->imagedata16[ze1][ye1][xe1])),
								clampx(sf16_to_float(img1->imagedata16[ze1][ye1][xe1 + 1])),
								clampx(sf16_to_float(img1->imagedata16[ze1][ye1][xe1 + 2])),
								clampx(sf16_to_float(img1->imagedata16[ze1][ye1][xe1 + 3])));
				}

				if (img2->imagedata8)
				{
					input_color2 =
						double4(img2->imagedata8[ze2][ye2][xe2] * (1.0f / 255.0f),
								img2->imagedata8[ze2][ye2][xe2 + 1] * (1.0f / 255.0f),
								img2->imagedata8[ze2][ye2][xe2 + 2] * (1.0f / 255.0f),
								img2->imagedata8[ze2][ye2][xe2 + 3] * (1.0f / 255.0f));
				}
				else
				{
					input_color2 =
						double4(clampx(sf16_to_float(img2->imagedata16[ze2][ye2][xe2])),
								clampx(sf16_to_float(img2->imagedata16[ze2][ye2][xe2 + 1])),
								clampx(sf16_to_float(img2->imagedata16[ze2][ye2][xe2 + 2])),
								clampx(sf16_to_float(img2->imagedata16[ze2][ye2][xe2 + 3])));
				}

				rgb_peak = MAX(MAX(input_color1.x, input_color1.y), MAX(input_color1.z, rgb_peak));

				double4 diffcolor = input_color1 - input_color2;
				errorsum = errorsum + diffcolor * diffcolor;

				// RGB error weighted by the first image's alpha; alpha error unweighted.
				double4 alpha_scaled_diffcolor = double4(diffcolor.xyz * input_color1.w, diffcolor.w);
				alpha_scaled_errorsum = alpha_scaled_errorsum + alpha_scaled_diffcolor * alpha_scaled_diffcolor;

				if (compute_hdr_error_metrics)
				{
					double4 log_input_color1 = double4(xlog2(input_color1.x),
													   xlog2(input_color1.y),
													   xlog2(input_color1.z),
													   xlog2(input_color1.w));

					double4 log_input_color2 = double4(xlog2(input_color2.x),
													   xlog2(input_color2.y),
													   xlog2(input_color2.z),
													   xlog2(input_color2.w));

					double4 log_diffcolor = log_input_color1 - log_input_color2;

					log_errorsum = log_errorsum + log_diffcolor * log_diffcolor;

					double4 mpsnr_error = double4(mpsnr_sumdiff(input_color1.x, input_color2.x, low_fstop, high_fstop),
												  mpsnr_sumdiff(input_color1.y, input_color2.y, low_fstop, high_fstop),
												  mpsnr_sumdiff(input_color1.z, input_color2.z, low_fstop, high_fstop),
												  mpsnr_sumdiff(input_color1.w, input_color2.w, low_fstop, high_fstop));
					mpsnr_errorsum = mpsnr_errorsum + mpsnr_error;
				}
			}
		}
	}

	if (compute_hdr_error_metrics)
	{
		printf("done\n");
	}

	// Multiply in double: the product of three int extents can exceed INT_MAX
	// for large or volumetric images.
	double pixels = (double)xsize * (double)ysize * (double)zsize;

	double num = 0.0;
	double alpha_num = 0.0;
	double log_num = 0.0;
	double mpsnr_num = 0.0;
	double samples = 0.0;

	if (channelmask & 1)
	{
		num += errorsum.x;
		alpha_num += alpha_scaled_errorsum.x;
		log_num += log_errorsum.x;
		mpsnr_num += mpsnr_errorsum.x;
		samples += pixels;
	}

	if (channelmask & 2)
	{
		num += errorsum.y;
		alpha_num += alpha_scaled_errorsum.y;
		log_num += log_errorsum.y;
		mpsnr_num += mpsnr_errorsum.y;
		samples += pixels;
	}

	if (channelmask & 4)
	{
		num += errorsum.z;
		alpha_num += alpha_scaled_errorsum.z;
		log_num += log_errorsum.z;
		mpsnr_num += mpsnr_errorsum.z;
		samples += pixels;
	}

	if (channelmask & 8)
	{
		num += errorsum.w;
		alpha_num += alpha_scaled_errorsum.w;
		/* log_num += log_errorsum.w; mpsnr_num += mpsnr_errorsum.w; */
		samples += pixels;
	}

	double denom = samples;
	double mpsnr_denom = pixels * 3.0 * (high_fstop - low_fstop + 1) * 255.0f * 255.0f;

	// A zero error sum is reported as 999 dB rather than infinity.
	double psnr;
	if (num == 0)
		psnr = 999.0;
	else
		psnr = 10.0 * log10((double)denom / (double)num);

	double rgb_psnr = psnr;

	if(psnrmode == 1)
	{
		if (channelmask & 8)
		{
			printf("PSNR (LDR-RGBA): %.6lf dB\n", psnr);

			double alpha_psnr;
			if (alpha_num == 0)
				alpha_psnr = 999.0;
			else
				alpha_psnr = 10.0 * log10((double)denom / (double)alpha_num);
			printf("Alpha-Weighted PSNR: %.6lf dB\n", alpha_psnr);

			double rgb_num = errorsum.x + errorsum.y + errorsum.z;
			if (rgb_num == 0)
				rgb_psnr = 999.0;
			else
				rgb_psnr = 10.0 * log10((double)pixels * 3 / (double)rgb_num);
			printf("PSNR (LDR-RGB): %.6lf dB\n", rgb_psnr);
		}
		else
			printf("PSNR (LDR-RGB): %.6lf dB\n", psnr);

		if (compute_hdr_error_metrics)
		{
			printf("Color peak value: %f\n", rgb_peak);
			printf("PSNR (RGB normalized to peak): %f dB\n", rgb_psnr + 20.0 * log10(rgb_peak));

			double mpsnr;
			if (mpsnr_num == 0)
				mpsnr = 999.0;
			else
				mpsnr = 10.0 * log10((double)mpsnr_denom / (double)mpsnr_num);
			printf("mPSNR (RGB) [fstops: %+d to %+d] : %.6lf dB\n", low_fstop, high_fstop, mpsnr);

			double logrmse = sqrt((double)log_num / (double)pixels);
			printf("LogRMSE (RGB): %.6lf\n", logrmse);
		}
	}
}

/*
	Main image loader function.

	We have specialized loaders for DDS, KTX and HTGA; for other formats, we use stb_image.
	This image loader will choose one based on the filename extension
	(all-lowercase or all-uppercase only). An extension is only recognized
	when at least one character precedes it.

	input_filename  path of the file to load (must not be NULL)
	padding         passed through to the format-specific loader
	load_result     passed through to the format-specific loader; set to -1
	                here when an .exr filename is longer than 250 characters
	returns         the loaded image, or NULL for an over-long .exr filename
*/
astc_codec_image *astc_codec_load_image(const char *input_filename, int padding, int *load_result)
{
	#define LOAD_HTGA 0
	#define LOAD_KTX 1
	#define LOAD_DDS 2
	#define LOAD_STB_IMAGE 3

	// check the ending of the input filename
	int load_fileformat = LOAD_STB_IMAGE;
	size_t filename_len = strlen(input_filename);

	// Compare lengths before forming the suffix pointers: computing
	// `name + len - 5` for a shorter name would point before the string.
	const char *ext5 = (filename_len > 5) ? input_filename + filename_len - 5 : NULL;
	const char *ext4 = (filename_len > 4) ? input_filename + filename_len - 4 : NULL;

	if (ext5 != NULL && (strcmp(ext5, ".htga") == 0 || strcmp(ext5, ".HTGA") == 0))
		load_fileformat = LOAD_HTGA;
	if (ext4 != NULL && (strcmp(ext4, ".ktx") == 0 || strcmp(ext4, ".KTX") == 0))
		load_fileformat = LOAD_KTX;
	if (ext4 != NULL && (strcmp(ext4, ".dds") == 0 || strcmp(ext4, ".DDS") == 0))
		load_fileformat = LOAD_DDS;

	// OpenEXR support: call exr_to_htga to convert from EXR to HTGA.
	char htga_load_filename[300];
	int load_exr = 0;
	if (ext4 != NULL && (strcmp(ext4, ".exr") == 0 || strcmp(ext4, ".EXR") == 0))
	{
		// don't support filenames longer than 250 characters; this keeps
		// both formatted strings below well inside their buffers.
		if (filename_len > 250)
		{
			*load_result = -1;
			return NULL;
		}

		char exr_to_htga_command[550];
		snprintf(htga_load_filename, sizeof(htga_load_filename), "%s.htga", input_filename);
		snprintf(exr_to_htga_command, sizeof(exr_to_htga_command), "exr_to_htga -q %s %s", input_filename, htga_load_filename);

		// NOTE(review): the external converter call is disabled, so the
		// command string is built but never run and retval is always 0.
		// The "<name>.htga" file is still loaded and unlinked below.
		//int retval = system(exr_to_htga_command);
		int retval = 0;
		if (retval != 0)
		{
			printf("Failed to run exr_to_htga to convert input .exr file.\n");
			exit(1);
		}
		input_filename = htga_load_filename;
		load_fileformat = LOAD_HTGA;
		load_exr = 1;
	}

	astc_codec_image *input_image = NULL;

	switch (load_fileformat)
	{
	case LOAD_KTX:
		input_image = load_ktx_uncompressed_image(input_filename, padding, load_result);
		break;
	case LOAD_DDS:
		input_image = load_dds_uncompressed_image(input_filename, padding, load_result);
		break;
	case LOAD_HTGA:
		input_image = load_tga_image(input_filename, padding, load_result);
		break;
	case LOAD_STB_IMAGE:
		input_image = load_image_with_stb(input_filename, padding, load_result);
		break;
	default:
		ASTC_CODEC_INTERNAL_ERROR();
	}

	// Remove the temporary HTGA file used for EXR input.
	if (load_exr)
		astc_codec_unlink(htga_load_filename);

	return input_image;
}

/*
	Report the bit depth that an output filename's extension forces.

	Returns 16 for .htga / .exr, 8 for .tga, and -1 when the name is NULL or
	the extension does not force a depth (file formats such as DDS and KTX
	can accommodate both 8-bit and 16-bit data). Extensions are matched in
	all-lowercase or all-uppercase form only, and only when at least one
	character precedes the extension.
*/
int get_output_filename_enforced_bitness(const char *output_filename)
{
	if (output_filename == NULL)
		return -1;

	size_t filename_len = strlen(output_filename);

	// Lengths are checked before forming the suffix pointers, so that no
	// pointer before the start of the string is ever computed.
	if (filename_len > 5)
	{
		const char *ext5 = output_filename + filename_len - 5;
		if (strcmp(ext5, ".htga") == 0 || strcmp(ext5, ".HTGA") == 0)
			return 16;
	}

	if (filename_len > 4)
	{
		const char *ext4 = output_filename + filename_len - 4;
		if (strcmp(ext4, ".tga") == 0 || strcmp(ext4, ".TGA") == 0)
			return 8;
		if (strcmp(ext4, ".exr") == 0 || strcmp(ext4, ".EXR") == 0)
			return 16;
	}

	// file formats that don't match any of the templates above are capable of accommodating
	// both 8-bit and 16-bit data (DDS, KTX)
	return -1;
}

/*
	Store an image, choosing the file format from the filename extension
	(all-lowercase or all-uppercase only; at least one character must precede
	the extension). Unrecognized extensions are stored as TGA, or as HTGA
	when bitness is 16.

	output_image     image to store
	output_filename  destination path (must not be NULL)
	bitness          bit depth passed to the format-specific writer
	format_string    receives the name of the chosen format
	returns          the writer's result; -1 for an EXR filename longer than
	                 250 characters; -99 if the EXR conversion step fails
*/
int astc_codec_store_image(const astc_codec_image * output_image, const char *output_filename, int bitness, const char **format_string)
{
	#define STORE_TGA 0
	#define STORE_HTGA 1
	#define STORE_KTX 2
	#define STORE_DDS 3
	#define STORE_EXR 4

	size_t filename_len = strlen(output_filename);

	// Compare lengths before forming the suffix pointers: computing
	// `name + len - 5` for a shorter name would point before the string.
	const char *ext5 = (filename_len > 5) ? output_filename + filename_len - 5 : NULL;
	const char *ext4 = (filename_len > 4) ? output_filename + filename_len - 4 : NULL;

	int store_fileformat = STORE_TGA;
	if (ext5 != NULL && (strcmp(ext5, ".htga") == 0 || strcmp(ext5, ".HTGA") == 0))
	{
		store_fileformat = STORE_HTGA;
	}

	if (ext4 != NULL && (strcmp(ext4, ".ktx") == 0 || strcmp(ext4, ".KTX") == 0))
	{
		store_fileformat = STORE_KTX;
	}

	if (ext4 != NULL && (strcmp(ext4, ".dds") == 0 || strcmp(ext4, ".DDS") == 0))
	{
		store_fileformat = STORE_DDS;
	}

	if (ext4 != NULL && (strcmp(ext4, ".exr") == 0 || strcmp(ext4, ".EXR") == 0))
	{
		store_fileformat = STORE_EXR;
	}

	if (store_fileformat == STORE_TGA && bitness == 16)
		store_fileformat = STORE_HTGA;

	// guard against OpenEXR files with too-long names
	if (store_fileformat == STORE_EXR && filename_len > 250)
	{
		*format_string = "EXR";
		return -1;
	}

	char htga_output_filename[300];
	char htga_output_command[550];
	// Must start at 0: the system() call that used to set it is disabled, and
	// reading it uninitialized made the EXR result unpredictable.
	int system_retval = 0;

	int store_result = -1;
	switch (store_fileformat)
	{
	case STORE_TGA:
	case STORE_HTGA:
		*format_string = bitness == 16 ? "HTGA" : "TGA";
		store_result = store_tga_image(output_image, output_filename, bitness);
		break;
	case STORE_KTX:
		*format_string = "KTX";
		store_result = store_ktx_uncompressed_image(output_image, output_filename, bitness);
		break;
	case STORE_DDS:
		*format_string = "DDS";
		store_result = store_dds_uncompressed_image(output_image, output_filename, bitness);
		break;
	case STORE_EXR:
		*format_string = "EXR";
		snprintf(htga_output_filename, sizeof(htga_output_filename), "%s.htga", output_filename);
		store_result = store_tga_image(output_image, htga_output_filename, 16);
		snprintf(htga_output_command, sizeof(htga_output_command), "exr_to_htga -e %s %s", htga_output_filename, output_filename);
		// NOTE(review): the external converter call is disabled, so the
		// temporary HTGA file is written and removed without an .exr file
		// being produced here.
		//system_retval = system(htga_output_command);
		astc_codec_unlink(htga_output_filename);
		if (system_retval != 0)
			store_result = -99;
		break;
	default:
		ASTC_CODEC_INTERNAL_ERROR();
	};

	return store_result;
}