// SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- // Copyright 2011-2022 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy // of the License at: // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations // under the License. // ---------------------------------------------------------------------------- #if !defined(ASTCENC_DECOMPRESS_ONLY) /** * @brief Functions to calculate variance per component in a NxN footprint. * * We need N to be parametric, so the routine below uses summed area tables in order to execute in * O(1) time independent of how big N is. * * The addition uses a Brent-Kung-based parallel prefix adder. This uses the prefix tree to first * perform a binary reduction, and then distributes the results. This method means that there is no * serial dependency between a given element and the next one, and also significantly improves * numerical stability allowing us to use floats rather than doubles. */ #include "astcenc_internal.h" #include /** * @brief Generate a prefix-sum array using the Brent-Kung algorithm. * * This will take an input array of the form: * v0, v1, v2, ... * ... and modify in-place to turn it into a prefix-sum array of the form: * v0, v0+v1, v0+v1+v2, ... * * @param d The array to prefix-sum. * @param items The number of items in the array. * @param stride The item spacing in the array; i.e. dense arrays should use 1. */ static void brent_kung_prefix_sum( vfloat4* d, size_t items, int stride ) { if (items < 2) return; size_t lc_stride = 2; size_t log2_stride = 1; // The reduction-tree loop do { size_t step = lc_stride >> 1; size_t start = lc_stride - 1; size_t iters = items >> log2_stride; vfloat4 *da = d + (start * stride); ptrdiff_t ofs = -(ptrdiff_t)(step * stride); size_t ofs_stride = stride << log2_stride; while (iters) { *da = *da + da[ofs]; da += ofs_stride; iters--; } log2_stride += 1; lc_stride <<= 1; } while (lc_stride <= items); // The expansion-tree loop do { log2_stride -= 1; lc_stride >>= 1; size_t step = lc_stride >> 1; size_t start = step + lc_stride - 1; size_t iters = (items - step) >> log2_stride; vfloat4 *da = d + (start * stride); ptrdiff_t ofs = -(ptrdiff_t)(step * stride); size_t ofs_stride = stride << log2_stride; while (iters) { *da = *da + da[ofs]; da += ofs_stride; iters--; } } while (lc_stride > 2); } /** * @brief Compute averages for a pixel region. * * The routine computes both in a single pass, using a summed-area table to decouple the running * time from the averaging/variance kernel size. * * @param[out] ctx The compressor context storing the output data. * @param arg The input parameter structure. */ static void compute_pixel_region_variance( astcenc_context& ctx, const pixel_region_args& arg ) { // Unpack the memory structure into local variables const astcenc_image* img = arg.img; astcenc_swizzle swz = arg.swz; bool have_z = arg.have_z; int size_x = arg.size_x; int size_y = arg.size_y; int size_z = arg.size_z; int offset_x = arg.offset_x; int offset_y = arg.offset_y; int offset_z = arg.offset_z; int alpha_kernel_radius = arg.alpha_kernel_radius; float* input_alpha_averages = ctx.input_alpha_averages; vfloat4* work_memory = arg.work_memory; // Compute memory sizes and dimensions that we need int kernel_radius = alpha_kernel_radius; int kerneldim = 2 * kernel_radius + 1; int kernel_radius_xy = kernel_radius; int kernel_radius_z = have_z ? kernel_radius : 0; int padsize_x = size_x + kerneldim; int padsize_y = size_y + kerneldim; int padsize_z = size_z + (have_z ? kerneldim : 0); int sizeprod = padsize_x * padsize_y * padsize_z; int zd_start = have_z ? 1 : 0; vfloat4 *varbuf1 = work_memory; vfloat4 *varbuf2 = work_memory + sizeprod; // Scaling factors to apply to Y and Z for accesses into the work buffers int yst = padsize_x; int zst = padsize_x * padsize_y; // Scaling factors to apply to Y and Z for accesses into result buffers int ydt = img->dim_x; int zdt = img->dim_x * img->dim_y; // Macros to act as accessor functions for the work-memory #define VARBUF1(z, y, x) varbuf1[z * zst + y * yst + x] #define VARBUF2(z, y, x) varbuf2[z * zst + y * yst + x] // Load N and N^2 values into the work buffers if (img->data_type == ASTCENC_TYPE_U8) { // Swizzle data structure 4 = ZERO, 5 = ONE uint8_t data[6]; data[ASTCENC_SWZ_0] = 0; data[ASTCENC_SWZ_1] = 255; for (int z = zd_start; z < padsize_z; z++) { int z_src = (z - zd_start) + offset_z - kernel_radius_z; z_src = astc::clamp(z_src, 0, (int)(img->dim_z - 1)); uint8_t* data8 = static_cast(img->data[z_src]); for (int y = 1; y < padsize_y; y++) { int y_src = (y - 1) + offset_y - kernel_radius_xy; y_src = astc::clamp(y_src, 0, (int)(img->dim_y - 1)); for (int x = 1; x < padsize_x; x++) { int x_src = (x - 1) + offset_x - kernel_radius_xy; x_src = astc::clamp(x_src, 0, (int)(img->dim_x - 1)); data[0] = data8[(4 * img->dim_x * y_src) + (4 * x_src )]; data[1] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 1)]; data[2] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 2)]; data[3] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 3)]; uint8_t r = data[swz.r]; uint8_t g = data[swz.g]; uint8_t b = data[swz.b]; uint8_t a = data[swz.a]; vfloat4 d = vfloat4 (r * (1.0f / 255.0f), g * (1.0f / 255.0f), b * (1.0f / 255.0f), a * (1.0f / 255.0f)); VARBUF1(z, y, x) = d; VARBUF2(z, y, x) = d * d; } } } } else if (img->data_type == ASTCENC_TYPE_F16) { // Swizzle data structure 4 = ZERO, 5 = ONE (in FP16) uint16_t data[6]; data[ASTCENC_SWZ_0] = 0; data[ASTCENC_SWZ_1] = 0x3C00; for (int z = zd_start; z < padsize_z; z++) { int z_src = (z - zd_start) + offset_z - kernel_radius_z; z_src = astc::clamp(z_src, 0, (int)(img->dim_z - 1)); uint16_t* data16 = static_cast(img->data[z_src]); for (int y = 1; y < padsize_y; y++) { int y_src = (y - 1) + offset_y - kernel_radius_xy; y_src = astc::clamp(y_src, 0, (int)(img->dim_y - 1)); for (int x = 1; x < padsize_x; x++) { int x_src = (x - 1) + offset_x - kernel_radius_xy; x_src = astc::clamp(x_src, 0, (int)(img->dim_x - 1)); data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src )]; data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)]; data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)]; data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)]; vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]); vfloat4 d = float16_to_float(di); VARBUF1(z, y, x) = d; VARBUF2(z, y, x) = d * d; } } } } else // if (img->data_type == ASTCENC_TYPE_F32) { assert(img->data_type == ASTCENC_TYPE_F32); // Swizzle data structure 4 = ZERO, 5 = ONE (in FP16) float data[6]; data[ASTCENC_SWZ_0] = 0.0f; data[ASTCENC_SWZ_1] = 1.0f; for (int z = zd_start; z < padsize_z; z++) { int z_src = (z - zd_start) + offset_z - kernel_radius_z; z_src = astc::clamp(z_src, 0, (int)(img->dim_z - 1)); float* data32 = static_cast(img->data[z_src]); for (int y = 1; y < padsize_y; y++) { int y_src = (y - 1) + offset_y - kernel_radius_xy; y_src = astc::clamp(y_src, 0, (int)(img->dim_y - 1)); for (int x = 1; x < padsize_x; x++) { int x_src = (x - 1) + offset_x - kernel_radius_xy; x_src = astc::clamp(x_src, 0, (int)(img->dim_x - 1)); data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src )]; data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)]; data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)]; data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)]; float r = data[swz.r]; float g = data[swz.g]; float b = data[swz.b]; float a = data[swz.a]; vfloat4 d(r, g, b, a); VARBUF1(z, y, x) = d; VARBUF2(z, y, x) = d * d; } } } } // Pad with an extra layer of 0s; this forms the edge of the SAT tables vfloat4 vbz = vfloat4::zero(); for (int z = 0; z < padsize_z; z++) { for (int y = 0; y < padsize_y; y++) { VARBUF1(z, y, 0) = vbz; VARBUF2(z, y, 0) = vbz; } for (int x = 0; x < padsize_x; x++) { VARBUF1(z, 0, x) = vbz; VARBUF2(z, 0, x) = vbz; } } if (have_z) { for (int y = 0; y < padsize_y; y++) { for (int x = 0; x < padsize_x; x++) { VARBUF1(0, y, x) = vbz; VARBUF2(0, y, x) = vbz; } } } // Generate summed-area tables for N and N^2; this is done in-place, using // a Brent-Kung parallel-prefix based algorithm to minimize precision loss for (int z = zd_start; z < padsize_z; z++) { for (int y = 1; y < padsize_y; y++) { brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1); brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1); } } for (int z = zd_start; z < padsize_z; z++) { for (int x = 1; x < padsize_x; x++) { brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst); brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst); } } if (have_z) { for (int y = 1; y < padsize_y; y++) { for (int x = 1; x < padsize_x; x++) { brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst); brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst); } } } int alpha_kdim = 2 * alpha_kernel_radius + 1; // Compute a few constants used in the variance-calculation. float alpha_rsamples; if (have_z) { alpha_rsamples = 1.0f / (float)(alpha_kdim * alpha_kdim * alpha_kdim); } else { alpha_rsamples = 1.0f / (float)(alpha_kdim * alpha_kdim); } // Use the summed-area tables to compute variance for each neighborhood if (have_z) { for (int z = 0; z < size_z; z++) { int z_src = z + kernel_radius_z; int z_dst = z + offset_z; int z_low = z_src - alpha_kernel_radius; int z_high = z_src + alpha_kernel_radius + 1; for (int y = 0; y < size_y; y++) { int y_src = y + kernel_radius_xy; int y_dst = y + offset_y; int y_low = y_src - alpha_kernel_radius; int y_high = y_src + alpha_kernel_radius + 1; for (int x = 0; x < size_x; x++) { int x_src = x + kernel_radius_xy; int x_dst = x + offset_x; int x_low = x_src - alpha_kernel_radius; int x_high = x_src + alpha_kernel_radius + 1; // Summed-area table lookups for alpha average float vasum = ( VARBUF1(z_high, y_low, x_low).lane<3>() - VARBUF1(z_high, y_low, x_high).lane<3>() - VARBUF1(z_high, y_high, x_low).lane<3>() + VARBUF1(z_high, y_high, x_high).lane<3>()) - ( VARBUF1(z_low, y_low, x_low).lane<3>() - VARBUF1(z_low, y_low, x_high).lane<3>() - VARBUF1(z_low, y_high, x_low).lane<3>() + VARBUF1(z_low, y_high, x_high).lane<3>()); int out_index = z_dst * zdt + y_dst * ydt + x_dst; input_alpha_averages[out_index] = (vasum * alpha_rsamples); } } } } else { for (int y = 0; y < size_y; y++) { int y_src = y + kernel_radius_xy; int y_dst = y + offset_y; int y_low = y_src - alpha_kernel_radius; int y_high = y_src + alpha_kernel_radius + 1; for (int x = 0; x < size_x; x++) { int x_src = x + kernel_radius_xy; int x_dst = x + offset_x; int x_low = x_src - alpha_kernel_radius; int x_high = x_src + alpha_kernel_radius + 1; // Summed-area table lookups for alpha average float vasum = VARBUF1(0, y_low, x_low).lane<3>() - VARBUF1(0, y_low, x_high).lane<3>() - VARBUF1(0, y_high, x_low).lane<3>() + VARBUF1(0, y_high, x_high).lane<3>(); int out_index = y_dst * ydt + x_dst; input_alpha_averages[out_index] = (vasum * alpha_rsamples); } } } } void compute_averages( astcenc_context& ctx, const avg_args &ag ) { pixel_region_args arg = ag.arg; arg.work_memory = new vfloat4[ag.work_memory_size]; int size_x = ag.img_size_x; int size_y = ag.img_size_y; int size_z = ag.img_size_z; int step_xy = ag.blk_size_xy; int step_z = ag.blk_size_z; int y_tasks = (size_y + step_xy - 1) / step_xy; // All threads run this processing loop until there is no work remaining while (true) { unsigned int count; unsigned int base = ctx.manage_avg.get_task_assignment(16, count); if (!count) { break; } for (unsigned int i = base; i < base + count; i++) { int z = (i / (y_tasks)) * step_z; int y = (i - (z * y_tasks)) * step_xy; arg.size_z = astc::min(step_z, size_z - z); arg.offset_z = z; arg.size_y = astc::min(step_xy, size_y - y); arg.offset_y = y; for (int x = 0; x < size_x; x += step_xy) { arg.size_x = astc::min(step_xy, size_x - x); arg.offset_x = x; compute_pixel_region_variance(ctx, arg); } } ctx.manage_avg.complete_task_assignment(count); } delete[] arg.work_memory; } /* See header for documentation. */ unsigned int init_compute_averages( const astcenc_image& img, unsigned int alpha_kernel_radius, const astcenc_swizzle& swz, avg_args& ag ) { unsigned int size_x = img.dim_x; unsigned int size_y = img.dim_y; unsigned int size_z = img.dim_z; // Compute maximum block size and from that the working memory buffer size unsigned int kernel_radius = alpha_kernel_radius; unsigned int kerneldim = 2 * kernel_radius + 1; bool have_z = (size_z > 1); unsigned int max_blk_size_xy = have_z ? 16 : 32; unsigned int max_blk_size_z = astc::min(size_z, have_z ? 16u : 1u); unsigned int max_padsize_xy = max_blk_size_xy + kerneldim; unsigned int max_padsize_z = max_blk_size_z + (have_z ? kerneldim : 0); // Perform block-wise averages calculations across the image // Initialize fields which are not populated until later ag.arg.size_x = 0; ag.arg.size_y = 0; ag.arg.size_z = 0; ag.arg.offset_x = 0; ag.arg.offset_y = 0; ag.arg.offset_z = 0; ag.arg.work_memory = nullptr; ag.arg.img = &img; ag.arg.swz = swz; ag.arg.have_z = have_z; ag.arg.alpha_kernel_radius = alpha_kernel_radius; ag.img_size_x = size_x; ag.img_size_y = size_y; ag.img_size_z = size_z; ag.blk_size_xy = max_blk_size_xy; ag.blk_size_z = max_blk_size_z; ag.work_memory_size = 2 * max_padsize_xy * max_padsize_xy * max_padsize_z; // The parallel task count unsigned int z_tasks = (size_z + max_blk_size_z - 1) / max_blk_size_z; unsigned int y_tasks = (size_y + max_blk_size_xy - 1) / max_blk_size_xy; return z_tasks * y_tasks; } #endif