// ---------------------------------------------------------------------------- // This confidential and proprietary software may be used only as authorised // by a licensing agreement from Arm Limited. // (C) COPYRIGHT 2011-2019 Arm Limited, ALL RIGHTS RESERVED // The entire notice above must be reproduced on all authorised copies and // copies may only be made to the extent permitted by a licensing agreement // from Arm Limited. // ---------------------------------------------------------------------------- /** * @brief Functions to calculate variance per-pixel-channel in a NxN footprint. * * We want N to be parametric. The routine below uses summed area tables in * order to execute in O(1) time per pixel, independent of big N is. */ #include "astc_codec_internals.h" #include "softfloat.h" #include float4 *** input_averages; float *** input_alpha_averages; float4 *** input_variances; // routine to compute averages and variances for a pixel region. // The routine computes both in a single pass, using a summed-area table // to decouple the running time from the averaging/variance kernel size. static void compute_pixel_region_variance(const astc_codec_image * img, float rgb_power_to_use, float alpha_power_to_use, swizzlepattern swz, int use_z_axis, int source_xoffset,int source_yoffset, int source_zoffset, // position of upper-left pixel in data set int xsize, int ysize, int zsize, // the size of the region to actually compute averages and variances for. int avg_var_kernel_radius, int alpha_kernel_radius, int dest_xoffset, int dest_yoffset, int dest_zoffset) { int x, y, z; int kernel_radius = MAX(avg_var_kernel_radius, alpha_kernel_radius); int kerneldim = 2 * kernel_radius + 1; // allocate memory int xpadsize = xsize + kerneldim; int ypadsize = ysize + kerneldim; int zpadsize = zsize + (use_z_axis ? kerneldim : 1); double4 ***varbuf1 = new double4 **[zpadsize]; double4 ***varbuf2 = new double4 **[zpadsize]; varbuf1[0] = new double4 *[ypadsize * zpadsize]; varbuf2[0] = new double4 *[ypadsize * zpadsize]; varbuf1[0][0] = new double4[xpadsize * ypadsize * zpadsize]; varbuf2[0][0] = new double4[xpadsize * ypadsize * zpadsize]; for (z = 1; z < zpadsize; z++) { varbuf1[z] = varbuf1[0] + ypadsize * z; varbuf2[z] = varbuf2[0] + ypadsize * z; varbuf1[z][0] = varbuf1[0][0] + xpadsize * ypadsize * z; varbuf2[z][0] = varbuf2[0][0] + xpadsize * ypadsize * z; } for (z = 0; z < zpadsize; z++) { for (y = 1; y < ypadsize; y++) { varbuf1[z][y] = varbuf1[z][0] + xpadsize * y; varbuf2[z][y] = varbuf2[z][0] + xpadsize * y; } } int powers_are_1 = (rgb_power_to_use == 1.0f) && (alpha_power_to_use == 1.0f); // load x and x^2 values into the allocated buffers if (img->imagedata8) { uint8_t data[6]; data[4] = 0; data[5] = 255; for (z = 0; z < zpadsize - 1; z++) { int z_src = z + source_zoffset - (use_z_axis ? kernel_radius : 0); for (y = 0; y < ypadsize - 1; y++) { int y_src = y + source_yoffset - kernel_radius; for (x = 0; x < xpadsize - 1; x++) { int x_src = x + source_xoffset - kernel_radius; data[0] = img->imagedata8[z_src][y_src][4 * x_src + 0]; data[1] = img->imagedata8[z_src][y_src][4 * x_src + 1]; data[2] = img->imagedata8[z_src][y_src][4 * x_src + 2]; data[3] = img->imagedata8[z_src][y_src][4 * x_src + 3]; uint8_t r = data[swz.r]; uint8_t g = data[swz.g]; uint8_t b = data[swz.b]; uint8_t a = data[swz.a]; double4 d = double4(r * (1.0 / 255.0), g * (1.0 / 255.0), b * (1.0 / 255.0), a * (1.0 / 255.0)); if (perform_srgb_transform) { d.x = (d.x <= 0.04045) ? d.x * (1.0 / 12.92) : (d.x <= 1) ? pow((d.x + 0.055) * (1.0 / 1.055), 2.4) : d.x; d.y = (d.y <= 0.04045) ? d.y * (1.0 / 12.92) : (d.y <= 1) ? pow((d.y + 0.055) * (1.0 / 1.055), 2.4) : d.y; d.z = (d.z <= 0.04045) ? d.z * (1.0 / 12.92) : (d.z <= 1) ? pow((d.z + 0.055) * (1.0 / 1.055), 2.4) : d.z; } if (!powers_are_1) { d.x = pow(MAX(d.x, 1e-6), (double)rgb_power_to_use); d.y = pow(MAX(d.y, 1e-6), (double)rgb_power_to_use); d.z = pow(MAX(d.z, 1e-6), (double)rgb_power_to_use); d.w = pow(MAX(d.w, 1e-6), (double)alpha_power_to_use); } varbuf1[z][y][x] = d; varbuf2[z][y][x] = d * d; } } } } else { uint16_t data[6]; data[4] = 0; data[5] = 0x3C00; // 1.0 encoded as FP16. for (z = 0; z < zpadsize - 1; z++) { int z_src = z + source_zoffset - (use_z_axis ? kernel_radius : 0); for (y = 0; y < ypadsize - 1; y++) { int y_src = y + source_yoffset - kernel_radius; for (x = 0; x < xpadsize - 1; x++) { int x_src = x + source_xoffset - kernel_radius; data[0] = img->imagedata16[z_src][y_src][4 * x_src]; data[1] = img->imagedata16[z_src][y_src][4 * x_src + 1]; data[2] = img->imagedata16[z_src][y_src][4 * x_src + 2]; data[3] = img->imagedata16[z_src][y_src][4 * x_src + 3]; uint16_t r = data[swz.r]; uint16_t g = data[swz.g]; uint16_t b = data[swz.b]; uint16_t a = data[swz.a]; double4 d = double4(sf16_to_float(r), sf16_to_float(g), sf16_to_float(b), sf16_to_float(a)); if (perform_srgb_transform) { d.x = (d.x <= 0.04045) ? d.x * (1.0 / 12.92) : (d.x <= 1) ? pow((d.x + 0.055) * (1.0 / 1.055), 2.4) : d.x; d.y = (d.y <= 0.04045) ? d.y * (1.0 / 12.92) : (d.y <= 1) ? pow((d.y + 0.055) * (1.0 / 1.055), 2.4) : d.y; d.z = (d.z <= 0.04045) ? d.z * (1.0 / 12.92) : (d.z <= 1) ? pow((d.z + 0.055) * (1.0 / 1.055), 2.4) : d.z; } if (!powers_are_1) { d.x = pow(MAX(d.x, 1e-6), (double)rgb_power_to_use); d.y = pow(MAX(d.y, 1e-6), (double)rgb_power_to_use); d.z = pow(MAX(d.z, 1e-6), (double)rgb_power_to_use); d.w = pow(MAX(d.w, 1e-6), (double)alpha_power_to_use); } varbuf1[z][y][x] = d; varbuf2[z][y][x] = d * d; } } } } // pad out buffers with 0s for (z = 0; z < zpadsize; z++) { for (y = 0; y < ypadsize; y++) { varbuf1[z][y][xpadsize - 1] = double4(0.0, 0.0, 0.0, 0.0); varbuf2[z][y][xpadsize - 1] = double4(0.0, 0.0, 0.0, 0.0); } for (x = 0; x < xpadsize; x++) { varbuf1[z][ypadsize - 1][x] = double4(0.0, 0.0, 0.0, 0.0); varbuf2[z][ypadsize - 1][x] = double4(0.0, 0.0, 0.0, 0.0); } } if (use_z_axis) { for (y = 0; y < ypadsize; y++) { for (x = 0; x < xpadsize; x++) { varbuf1[zpadsize - 1][y][x] = double4(0.0, 0.0, 0.0, 0.0); varbuf2[zpadsize - 1][y][x] = double4(0.0, 0.0, 0.0, 0.0); } } } // generate summed-area tables for x and x2; this is done in-place for (z = 0; z < zpadsize; z++) { for (y = 0; y < ypadsize; y++) { double4 summa1 = double4(0.0, 0.0, 0.0, 0.0); double4 summa2 = double4(0.0, 0.0, 0.0, 0.0); for (x = 0; x < xpadsize; x++) { double4 val1 = varbuf1[z][y][x]; double4 val2 = varbuf2[z][y][x]; varbuf1[z][y][x] = summa1; varbuf2[z][y][x] = summa2; summa1 = summa1 + val1; summa2 = summa2 + val2; } } } for (z = 0; z < zpadsize; z++) { for (x = 0; x < xpadsize; x++) { double4 summa1 = double4(0.0, 0.0, 0.0, 0.0); double4 summa2 = double4(0.0, 0.0, 0.0, 0.0); for (y = 0; y < ypadsize; y++) { double4 val1 = varbuf1[z][y][x]; double4 val2 = varbuf2[z][y][x]; varbuf1[z][y][x] = summa1; varbuf2[z][y][x] = summa2; summa1 = summa1 + val1; summa2 = summa2 + val2; } } } if (use_z_axis) { for (y = 0; y < ypadsize; y++) { for (x = 0; x < xpadsize; x++) { double4 summa1 = double4(0.0, 0.0, 0.0, 0.0); double4 summa2 = double4(0.0, 0.0, 0.0, 0.0); for (z = 0; z < zpadsize; z++) { double4 val1 = varbuf1[z][y][x]; double4 val2 = varbuf2[z][y][x]; varbuf1[z][y][x] = summa1; varbuf2[z][y][x] = summa2; summa1 = summa1 + val1; summa2 = summa2 + val2; } } } } int avg_var_kerneldim = 2 * avg_var_kernel_radius + 1; int alpha_kerneldim = 2 * alpha_kernel_radius + 1; // compute a few constants used in the variance-calculation. double avg_var_samples; double alpha_rsamples; double mul1; if (use_z_axis) { avg_var_samples = avg_var_kerneldim * avg_var_kerneldim * avg_var_kerneldim; alpha_rsamples = 1.0 / (alpha_kerneldim * alpha_kerneldim * alpha_kerneldim); } else { avg_var_samples = avg_var_kerneldim * avg_var_kerneldim; alpha_rsamples = 1.0 / (alpha_kerneldim * alpha_kerneldim); } double avg_var_rsamples = 1.0 / avg_var_samples; if (avg_var_samples == 1) mul1 = 1.0; else mul1 = 1.0 / (avg_var_samples * (avg_var_samples - 1)); double mul2 = avg_var_samples * mul1; // use the summed-area tables to compute variance for each sample-neighborhood if (use_z_axis) { for (z = 0; z < zsize; z++) { int z_src = z + kernel_radius; int z_dst = z + dest_zoffset; for (y = 0; y < ysize; y++) { int y_src = y + kernel_radius; int y_dst = y + dest_yoffset; for (x = 0; x < xsize; x++) { int x_src = x + kernel_radius; int x_dst = x + dest_xoffset; // summed-area table lookups for alpha average double vasum = (varbuf1[z_src + 1][y_src - alpha_kernel_radius][x_src - alpha_kernel_radius].w - varbuf1[z_src + 1][y_src - alpha_kernel_radius][x_src + alpha_kernel_radius + 1].w - varbuf1[z_src + 1][y_src + alpha_kernel_radius + 1][x_src - alpha_kernel_radius].w + varbuf1[z_src + 1][y_src + alpha_kernel_radius + 1][x_src + alpha_kernel_radius + 1].w) - (varbuf1[z_src][y_src - alpha_kernel_radius][x_src - alpha_kernel_radius].w - varbuf1[z_src][y_src - alpha_kernel_radius][x_src + alpha_kernel_radius + 1].w - varbuf1[z_src][y_src + alpha_kernel_radius + 1][x_src - alpha_kernel_radius].w + varbuf1[z_src][y_src + alpha_kernel_radius + 1][x_src + alpha_kernel_radius + 1].w); input_alpha_averages[z_dst][y_dst][x_dst] = static_cast < float >(vasum * alpha_rsamples); // summed-area table lookups for RGBA average double4 v0sum = (varbuf1[z_src + 1][y_src - avg_var_kernel_radius][x_src - avg_var_kernel_radius] - varbuf1[z_src + 1][y_src - avg_var_kernel_radius][x_src + avg_var_kernel_radius + 1] - varbuf1[z_src + 1][y_src + avg_var_kernel_radius + 1][x_src - avg_var_kernel_radius] + varbuf1[z_src + 1][y_src + avg_var_kernel_radius + 1][x_src + avg_var_kernel_radius + 1]) - (varbuf1[z_src][y_src - avg_var_kernel_radius][x_src - avg_var_kernel_radius] - varbuf1[z_src][y_src - avg_var_kernel_radius][x_src + avg_var_kernel_radius + 1] - varbuf1[z_src][y_src + avg_var_kernel_radius + 1][x_src - avg_var_kernel_radius] + varbuf1[z_src][y_src + avg_var_kernel_radius + 1][x_src + avg_var_kernel_radius + 1]); double4 avg = v0sum * avg_var_rsamples; float4 favg = float4(static_cast < float >(avg.x), static_cast < float >(avg.y), static_cast < float >(avg.z), static_cast < float >(avg.w)); input_averages[z_dst][y_dst][x_dst] = favg; // summed-area table lookups for variance double4 v1sum = (varbuf1[z_src + 1][y_src - avg_var_kernel_radius][x_src - avg_var_kernel_radius] - varbuf1[z_src + 1][y_src - avg_var_kernel_radius][x_src + avg_var_kernel_radius + 1] - varbuf1[z_src + 1][y_src + avg_var_kernel_radius + 1][x_src - avg_var_kernel_radius] + varbuf1[z_src + 1][y_src + avg_var_kernel_radius + 1][x_src + avg_var_kernel_radius + 1]) - (varbuf1[z_src][y_src - avg_var_kernel_radius][x_src - avg_var_kernel_radius] - varbuf1[z_src][y_src - avg_var_kernel_radius][x_src + avg_var_kernel_radius + 1] - varbuf1[z_src][y_src + avg_var_kernel_radius + 1][x_src - avg_var_kernel_radius] + varbuf1[z_src][y_src + avg_var_kernel_radius + 1][x_src + avg_var_kernel_radius + 1]); double4 v2sum = (varbuf2[z_src + 1][y_src - avg_var_kernel_radius][x_src - avg_var_kernel_radius] - varbuf2[z_src + 1][y_src - avg_var_kernel_radius][x_src + avg_var_kernel_radius + 1] - varbuf2[z_src + 1][y_src + avg_var_kernel_radius + 1][x_src - avg_var_kernel_radius] + varbuf2[z_src + 1][y_src + avg_var_kernel_radius + 1][x_src + avg_var_kernel_radius + 1]) - (varbuf2[z_src][y_src - avg_var_kernel_radius][x_src - avg_var_kernel_radius] - varbuf2[z_src][y_src - avg_var_kernel_radius][x_src + avg_var_kernel_radius + 1] - varbuf2[z_src][y_src + avg_var_kernel_radius + 1][x_src - avg_var_kernel_radius] + varbuf2[z_src][y_src + avg_var_kernel_radius + 1][x_src + avg_var_kernel_radius + 1]); // the actual variance double4 variance = mul2 * v2sum - mul1 * (v1sum * v1sum); float4 fvar = float4(static_cast < float >(variance.x), static_cast < float >(variance.y), static_cast < float >(variance.z), static_cast < float >(variance.w)); input_variances[z_dst][y_dst][x_dst] = fvar; } } } } else { for (z = 0; z < zsize; z++) { int z_src = z; int z_dst = z + dest_zoffset; for (y = 0; y < ysize; y++) { int y_src = y + kernel_radius; int y_dst = y + dest_yoffset; for (x = 0; x < xsize; x++) { int x_src = x + kernel_radius; int x_dst = x + dest_xoffset; // summed-area table lookups for alpha average double vasum = varbuf1[z_src][y_src - alpha_kernel_radius][x_src - alpha_kernel_radius].w - varbuf1[z_src][y_src - alpha_kernel_radius][x_src + alpha_kernel_radius + 1].w - varbuf1[z_src][y_src + alpha_kernel_radius + 1][x_src - alpha_kernel_radius].w + varbuf1[z_src][y_src + alpha_kernel_radius + 1][x_src + alpha_kernel_radius + 1].w; input_alpha_averages[z_dst][y_dst][x_dst] = static_cast < float >(vasum * alpha_rsamples); // summed-area table lookups for RGBA average double4 v0sum = varbuf1[z_src][y_src - avg_var_kernel_radius][x_src - avg_var_kernel_radius] - varbuf1[z_src][y_src - avg_var_kernel_radius][x_src + avg_var_kernel_radius + 1] - varbuf1[z_src][y_src + avg_var_kernel_radius + 1][x_src - avg_var_kernel_radius] + varbuf1[z_src][y_src + avg_var_kernel_radius + 1][x_src + avg_var_kernel_radius + 1]; double4 avg = v0sum * avg_var_rsamples; float4 favg = float4(static_cast < float >(avg.x), static_cast < float >(avg.y), static_cast < float >(avg.z), static_cast < float >(avg.w)); input_averages[z_dst][y_dst][x_dst] = favg; // summed-area table lookups for variance double4 v1sum = varbuf1[z_src][y_src - avg_var_kernel_radius][x_src - avg_var_kernel_radius] - varbuf1[z_src][y_src - avg_var_kernel_radius][x_src + avg_var_kernel_radius + 1] - varbuf1[z_src][y_src + avg_var_kernel_radius + 1][x_src - avg_var_kernel_radius] + varbuf1[z_src][y_src + avg_var_kernel_radius + 1][x_src + avg_var_kernel_radius + 1]; double4 v2sum = varbuf2[z_src][y_src - avg_var_kernel_radius][x_src - avg_var_kernel_radius] - varbuf2[z_src][y_src - avg_var_kernel_radius][x_src + avg_var_kernel_radius + 1] - varbuf2[z_src][y_src + avg_var_kernel_radius + 1][x_src - avg_var_kernel_radius] + varbuf2[z_src][y_src + avg_var_kernel_radius + 1][x_src + avg_var_kernel_radius + 1]; // the actual variance double4 variance = mul2 * v2sum - mul1 * (v1sum * v1sum); float4 fvar = float4(static_cast < float >(variance.x), static_cast < float >(variance.y), static_cast < float >(variance.z), static_cast < float >(variance.w)); input_variances[z_dst][y_dst][x_dst] = fvar; } } } } delete[] varbuf2[0][0]; delete[] varbuf1[0][0]; delete[] varbuf2[0]; delete[] varbuf1[0]; delete[] varbuf2; delete[] varbuf1; } static void allocate_input_average_and_variance_buffers(int xsize, int ysize, int zsize) { int y, z; if (input_averages) { delete[] input_averages[0][0]; delete[] input_averages[0]; delete[] input_averages; } if (input_variances) { delete[] input_variances[0][0]; delete[] input_variances[0]; delete[] input_variances; } if (input_alpha_averages) { delete[] input_alpha_averages[0][0]; delete[] input_alpha_averages[0]; delete[] input_alpha_averages; } input_averages = new float4 **[zsize]; input_variances = new float4 **[zsize]; input_alpha_averages = new float **[zsize]; input_averages[0] = new float4 *[ysize * zsize]; input_variances[0] = new float4 *[ysize * zsize]; input_alpha_averages[0] = new float *[ysize * zsize]; input_averages[0][0] = new float4[xsize * ysize * zsize]; input_variances[0][0] = new float4[xsize * ysize * zsize]; input_alpha_averages[0][0] = new float[xsize * ysize * zsize]; for (z = 1; z < zsize; z++) { input_averages[z] = input_averages[0] + z * ysize; input_variances[z] = input_variances[0] + z * ysize; input_alpha_averages[z] = input_alpha_averages[0] + z * ysize; input_averages[z][0] = input_averages[0][0] + z * ysize * xsize; input_variances[z][0] = input_variances[0][0] + z * ysize * xsize; input_alpha_averages[z][0] = input_alpha_averages[0][0] + z * ysize * xsize; } for (z = 0; z < zsize; z++) { for (y = 1; y < ysize; y++) { input_averages[z][y] = input_averages[z][0] + y * xsize; input_variances[z][y] = input_variances[z][0] + y * xsize; input_alpha_averages[z][y] = input_alpha_averages[z][0] + y * xsize; } } } // compute averages and variances for the current input image. void compute_averages_and_variances(const astc_codec_image * img, float rgb_power_to_use, float alpha_power_to_use, int avg_var_kernel_radius, int alpha_kernel_radius, swizzlepattern swz) { int xsize = img->xsize; int ysize = img->ysize; int zsize = img->zsize; allocate_input_average_and_variance_buffers(xsize, ysize, zsize); int x, y, z; for (z = 0; z < zsize; z += 32) { int zblocksize = MIN(32, zsize - z); for (y = 0; y < ysize; y += 32) { int yblocksize = MIN(32, ysize - y); for (x = 0; x < xsize; x += 32) { int xblocksize = MIN(32, xsize - x); compute_pixel_region_variance(img, rgb_power_to_use, alpha_power_to_use, swz, (zsize > 1), x + img->padding, y + img->padding, z + (zsize > 1 ? img->padding : 0), xblocksize, yblocksize, zblocksize, avg_var_kernel_radius, alpha_kernel_radius, x, y, z); } } } }