// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

#if !defined(ASTCENC_DECOMPRESS_ONLY)

/**
 * @brief Functions to calculate variance per component in an NxN footprint.
 *
 * We need N to be parametric, so the routine below uses summed area tables in order to execute in
 * O(1) time independent of how big N is.
 *
 * The addition uses a Brent-Kung-based parallel prefix adder. This uses the prefix tree to first
 * perform a binary reduction, and then distributes the results. This method means that there is no
 * serial dependency between a given element and the next one, and also significantly improves
 * numerical stability allowing us to use floats rather than doubles.
 */
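
// Background on the summed-area table (SAT) trick used below: if S(y, x) holds the sum of all
// inputs with coordinates no greater than (y, x), then the sum over any axis-aligned box
// (y0, y1] x (x0, x1] is S(y1, x1) - S(y0, x1) - S(y1, x0) + S(y0, x0), so the cost of one
// footprint query is constant regardless of the footprint size N.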

#include "astcenc_internal.h"

#include <cassert>

/**
 * @brief Generate a prefix-sum array using the Brent-Kung algorithm.
 *
 * This will take an input array of the form:
 *     v0, v1, v2, ...
 * ... and modify in-place to turn it into a prefix-sum array of the form:
 *     v0, v0+v1, v0+v1+v2, ...
 *
 * @param d        The array to prefix-sum.
 * @param items    The number of items in the array.
 * @param stride   The item spacing in the array; i.e. dense arrays should use 1.
 */
static void brent_kung_prefix_sum(
    vfloat4* d,
    size_t items,
    int stride
) {
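    // Worked example for a dense array (stride 1) with items = 4:
    //     input:            v0, v1,     v2,        v3
    //     after reduction:  v0, v0+v1,  v2,        v0+v1+v2+v3
    //     after expansion:  v0, v0+v1,  v0+v1+v2,  v0+v1+v2+v3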
    if (items < 2)
        return;

    size_t lc_stride = 2;
    size_t log2_stride = 1;

    // The reduction-tree loop
    do {
        size_t step = lc_stride >> 1;
        size_t start = lc_stride - 1;
        size_t iters = items >> log2_stride;

        vfloat4 *da = d + (start * stride);
        ptrdiff_t ofs = -(ptrdiff_t)(step * stride);
        size_t ofs_stride = stride << log2_stride;

        while (iters)
        {
            *da = *da + da[ofs];
            da += ofs_stride;
            iters--;
        }

        log2_stride += 1;
        lc_stride <<= 1;
    } while (lc_stride <= items);
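
    // After the reduction pass each element i holds the sum of the trailing run of inputs ending
    // at i whose length is the largest power of two dividing (i + 1); the expansion pass below
    // fills in the remaining prefix sums from these partial block totals.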

    // The expansion-tree loop
    do {
        log2_stride -= 1;
        lc_stride >>= 1;

        size_t step = lc_stride >> 1;
        size_t start = step + lc_stride - 1;
        size_t iters = (items - step) >> log2_stride;

        vfloat4 *da = d + (start * stride);
        ptrdiff_t ofs = -(ptrdiff_t)(step * stride);
        size_t ofs_stride = stride << log2_stride;

        while (iters)
        {
            *da = *da + da[ofs];
            da += ofs_stride;
            iters--;
        }
    } while (lc_stride > 2);
}

/**
 * @brief Compute averages for a pixel region.
 *
 * The routine computes the averages in a single pass, using a summed-area table to decouple the
 * running time from the averaging kernel size.
 *
 * @param[out] ctx   The compressor context storing the output data.
 * @param      arg   The input parameter structure.
 */
static void compute_pixel_region_variance(
    astcenc_context& ctx,
    const pixel_region_args& arg
) {
    // Unpack the memory structure into local variables
    const astcenc_image* img = arg.img;
    astcenc_swizzle swz = arg.swz;
    bool have_z = arg.have_z;

    int size_x = arg.size_x;
    int size_y = arg.size_y;
    int size_z = arg.size_z;

    int offset_x = arg.offset_x;
    int offset_y = arg.offset_y;
    int offset_z = arg.offset_z;

    int alpha_kernel_radius = arg.alpha_kernel_radius;

    float* input_alpha_averages = ctx.input_alpha_averages;
    vfloat4* work_memory = arg.work_memory;

    // Compute memory sizes and dimensions that we need
    int kernel_radius = alpha_kernel_radius;
    int kerneldim = 2 * kernel_radius + 1;
    int kernel_radius_xy = kernel_radius;
    int kernel_radius_z = have_z ? kernel_radius : 0;

    int padsize_x = size_x + kerneldim;
    int padsize_y = size_y + kerneldim;
    int padsize_z = size_z + (have_z ? kerneldim : 0);
    int sizeprod = padsize_x * padsize_y * padsize_z;

    int zd_start = have_z ? 1 : 0;

    vfloat4 *varbuf1 = work_memory;
    vfloat4 *varbuf2 = work_memory + sizeprod;

    // Scaling factors to apply to Y and Z for accesses into the work buffers
    int yst = padsize_x;
    int zst = padsize_x * padsize_y;

    // Scaling factors to apply to Y and Z for accesses into result buffers
    int ydt = img->dim_x;
    int zdt = img->dim_x * img->dim_y;

    // Macros to act as accessor functions for the work-memory
    #define VARBUF1(z, y, x) varbuf1[z * zst + y * yst + x]
    #define VARBUF2(z, y, x) varbuf2[z * zst + y * yst + x]

    // Load N and N^2 values into the work buffers
    if (img->data_type == ASTCENC_TYPE_U8)
    {
        // Swizzle data structure 4 = ZERO, 5 = ONE
        uint8_t data[6];
        data[ASTCENC_SWZ_0] = 0;
        data[ASTCENC_SWZ_1] = 255;

        for (int z = zd_start; z < padsize_z; z++)
        {
            int z_src = (z - zd_start) + offset_z - kernel_radius_z;
            z_src = astc::clamp(z_src, 0, (int)(img->dim_z - 1));
            uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]);

            for (int y = 1; y < padsize_y; y++)
            {
                int y_src = (y - 1) + offset_y - kernel_radius_xy;
                y_src = astc::clamp(y_src, 0, (int)(img->dim_y - 1));

                for (int x = 1; x < padsize_x; x++)
                {
                    int x_src = (x - 1) + offset_x - kernel_radius_xy;
                    x_src = astc::clamp(x_src, 0, (int)(img->dim_x - 1));

                    data[0] = data8[(4 * img->dim_x * y_src) + (4 * x_src    )];
                    data[1] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
                    data[2] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
                    data[3] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

                    uint8_t r = data[swz.r];
                    uint8_t g = data[swz.g];
                    uint8_t b = data[swz.b];
                    uint8_t a = data[swz.a];

                    vfloat4 d = vfloat4 (r * (1.0f / 255.0f),
                                         g * (1.0f / 255.0f),
                                         b * (1.0f / 255.0f),
                                         a * (1.0f / 255.0f));

                    VARBUF1(z, y, x) = d;
                    VARBUF2(z, y, x) = d * d;
                }
            }
        }
    }
    else if (img->data_type == ASTCENC_TYPE_F16)
    {
        // Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
        uint16_t data[6];
        data[ASTCENC_SWZ_0] = 0;
        data[ASTCENC_SWZ_1] = 0x3C00;
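        // Note: 0x3C00 is the IEEE 754 binary16 encoding of 1.0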

        for (int z = zd_start; z < padsize_z; z++)
        {
            int z_src = (z - zd_start) + offset_z - kernel_radius_z;
            z_src = astc::clamp(z_src, 0, (int)(img->dim_z - 1));
            uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]);

            for (int y = 1; y < padsize_y; y++)
            {
                int y_src = (y - 1) + offset_y - kernel_radius_xy;
                y_src = astc::clamp(y_src, 0, (int)(img->dim_y - 1));

                for (int x = 1; x < padsize_x; x++)
                {
                    int x_src = (x - 1) + offset_x - kernel_radius_xy;
                    x_src = astc::clamp(x_src, 0, (int)(img->dim_x - 1));

                    data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src    )];
                    data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
                    data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
                    data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

                    vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
                    vfloat4 d = float16_to_float(di);

                    VARBUF1(z, y, x) = d;
                    VARBUF2(z, y, x) = d * d;
                }
            }
        }
    }
    else // if (img->data_type == ASTCENC_TYPE_F32)
    {
        assert(img->data_type == ASTCENC_TYPE_F32);

        // Swizzle data structure 4 = ZERO, 5 = ONE (in FP32)
        float data[6];
        data[ASTCENC_SWZ_0] = 0.0f;
        data[ASTCENC_SWZ_1] = 1.0f;

        for (int z = zd_start; z < padsize_z; z++)
        {
            int z_src = (z - zd_start) + offset_z - kernel_radius_z;
            z_src = astc::clamp(z_src, 0, (int)(img->dim_z - 1));
            float* data32 = static_cast<float*>(img->data[z_src]);

            for (int y = 1; y < padsize_y; y++)
            {
                int y_src = (y - 1) + offset_y - kernel_radius_xy;
                y_src = astc::clamp(y_src, 0, (int)(img->dim_y - 1));

                for (int x = 1; x < padsize_x; x++)
                {
                    int x_src = (x - 1) + offset_x - kernel_radius_xy;
                    x_src = astc::clamp(x_src, 0, (int)(img->dim_x - 1));

                    data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src    )];
                    data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
                    data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
                    data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

                    float r = data[swz.r];
                    float g = data[swz.g];
                    float b = data[swz.b];
                    float a = data[swz.a];

                    vfloat4 d(r, g, b, a);

                    VARBUF1(z, y, x) = d;
                    VARBUF2(z, y, x) = d * d;
                }
            }
        }
    }

    // Pad with an extra layer of 0s; this forms the edge of the SAT tables
    vfloat4 vbz = vfloat4::zero();
    for (int z = 0; z < padsize_z; z++)
    {
        for (int y = 0; y < padsize_y; y++)
        {
            VARBUF1(z, y, 0) = vbz;
            VARBUF2(z, y, 0) = vbz;
        }

        for (int x = 0; x < padsize_x; x++)
        {
            VARBUF1(z, 0, x) = vbz;
            VARBUF2(z, 0, x) = vbz;
        }
    }

    if (have_z)
    {
        for (int y = 0; y < padsize_y; y++)
        {
            for (int x = 0; x < padsize_x; x++)
            {
                VARBUF1(0, y, x) = vbz;
                VARBUF2(0, y, x) = vbz;
            }
        }
    }

    // Generate summed-area tables for N and N^2; this is done in-place, using
    // a Brent-Kung parallel-prefix based algorithm to minimize precision loss
    for (int z = zd_start; z < padsize_z; z++)
    {
        for (int y = 1; y < padsize_y; y++)
        {
            brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1);
            brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1);
        }
    }

    for (int z = zd_start; z < padsize_z; z++)
    {
        for (int x = 1; x < padsize_x; x++)
        {
            brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst);
            brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst);
        }
    }

    if (have_z)
    {
        for (int y = 1; y < padsize_y; y++)
        {
            for (int x = 1; x < padsize_x; x++)
            {
                brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst);
                brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst);
            }
        }
    }
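
    // At this point the work buffers hold full summed-area tables (prefix sums along X, then Y,
    // then Z for 3D images), so the sum over any kernel-sized box can be read back with a
    // constant number of corner lookups.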

    int alpha_kdim = 2 * alpha_kernel_radius + 1;

    // Compute a few constants used in the average calculation
    float alpha_rsamples;

    if (have_z)
    {
        alpha_rsamples = 1.0f / (float)(alpha_kdim * alpha_kdim * alpha_kdim);
    }
    else
    {
        alpha_rsamples = 1.0f / (float)(alpha_kdim * alpha_kdim);
    }
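
    // alpha_rsamples is the reciprocal of the kernel sample count; e.g. for alpha_kernel_radius
    // of 2 the kernel spans 5 texels per axis, giving 1/25 for 2D images and 1/125 for 3D images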

    // Use the summed-area tables to compute the alpha averages for each neighborhood
    if (have_z)
    {
        for (int z = 0; z < size_z; z++)
        {
            int z_src = z + kernel_radius_z;
            int z_dst = z + offset_z;
            int z_low = z_src - alpha_kernel_radius;
            int z_high = z_src + alpha_kernel_radius + 1;

            for (int y = 0; y < size_y; y++)
            {
                int y_src = y + kernel_radius_xy;
                int y_dst = y + offset_y;
                int y_low = y_src - alpha_kernel_radius;
                int y_high = y_src + alpha_kernel_radius + 1;

                for (int x = 0; x < size_x; x++)
                {
                    int x_src = x + kernel_radius_xy;
                    int x_dst = x + offset_x;
                    int x_low = x_src - alpha_kernel_radius;
                    int x_high = x_src + alpha_kernel_radius + 1;

                    // Summed-area table lookups for alpha average
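                    // Eight-corner inclusion-exclusion over the 3D table: corners with an even
                    // number of "low" coordinates are added, corners with an odd number are
                    // subtracted, giving the alpha sum over the kernel box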
                    float vasum = ( VARBUF1(z_high, y_low, x_low).lane<3>()
                                  - VARBUF1(z_high, y_low, x_high).lane<3>()
                                  - VARBUF1(z_high, y_high, x_low).lane<3>()
                                  + VARBUF1(z_high, y_high, x_high).lane<3>()) -
                                  ( VARBUF1(z_low, y_low, x_low).lane<3>()
                                  - VARBUF1(z_low, y_low, x_high).lane<3>()
                                  - VARBUF1(z_low, y_high, x_low).lane<3>()
                                  + VARBUF1(z_low, y_high, x_high).lane<3>());

                    int out_index = z_dst * zdt + y_dst * ydt + x_dst;
                    input_alpha_averages[out_index] = (vasum * alpha_rsamples);
                }
            }
        }
    }
    else
    {
        for (int y = 0; y < size_y; y++)
        {
            int y_src = y + kernel_radius_xy;
            int y_dst = y + offset_y;
            int y_low = y_src - alpha_kernel_radius;
            int y_high = y_src + alpha_kernel_radius + 1;

            for (int x = 0; x < size_x; x++)
            {
                int x_src = x + kernel_radius_xy;
                int x_dst = x + offset_x;
                int x_low = x_src - alpha_kernel_radius;
                int x_high = x_src + alpha_kernel_radius + 1;

                // Summed-area table lookups for alpha average
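                // Four-corner inclusion-exclusion over the 2D table, using the same sign
                // convention as the 3D case above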
                float vasum = VARBUF1(0, y_low, x_low).lane<3>()
                            - VARBUF1(0, y_low, x_high).lane<3>()
                            - VARBUF1(0, y_high, x_low).lane<3>()
                            + VARBUF1(0, y_high, x_high).lane<3>();

                int out_index = y_dst * ydt + x_dst;
                input_alpha_averages[out_index] = (vasum * alpha_rsamples);
            }
        }
    }
}

void compute_averages(
    astcenc_context& ctx,
    const avg_args &ag
) {
    pixel_region_args arg = ag.arg;
    arg.work_memory = new vfloat4[ag.work_memory_size];

    int size_x = ag.img_size_x;
    int size_y = ag.img_size_y;
    int size_z = ag.img_size_z;

    int step_xy = ag.blk_size_xy;
    int step_z = ag.blk_size_z;

    int y_tasks = (size_y + step_xy - 1) / step_xy;
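
    // Each parallel task covers one (Z slab, Y row band) pair; the X axis is always processed in
    // full by the inner loop below, so the task count is z_tasks * y_tasks as computed in
    // init_compute_averages()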

    // All threads run this processing loop until there is no work remaining
    while (true)
    {
        unsigned int count;
        unsigned int base = ctx.manage_avg.get_task_assignment(16, count);
        if (!count)
        {
            break;
        }

        for (unsigned int i = base; i < base + count; i++)
        {
            int z_task = i / y_tasks;
            int z = z_task * step_z;
            int y = (i - (z_task * y_tasks)) * step_xy;

            arg.size_z = astc::min(step_z, size_z - z);
            arg.offset_z = z;

            arg.size_y = astc::min(step_xy, size_y - y);
            arg.offset_y = y;

            for (int x = 0; x < size_x; x += step_xy)
            {
                arg.size_x = astc::min(step_xy, size_x - x);
                arg.offset_x = x;
                compute_pixel_region_variance(ctx, arg);
            }
        }

        ctx.manage_avg.complete_task_assignment(count);
    }

    delete[] arg.work_memory;
}

/* See header for documentation. */
unsigned int init_compute_averages(
    const astcenc_image& img,
    unsigned int alpha_kernel_radius,
    const astcenc_swizzle& swz,
    avg_args& ag
) {
    unsigned int size_x = img.dim_x;
    unsigned int size_y = img.dim_y;
    unsigned int size_z = img.dim_z;

    // Compute maximum block size and from that the working memory buffer size
    unsigned int kernel_radius = alpha_kernel_radius;
    unsigned int kerneldim = 2 * kernel_radius + 1;

    bool have_z = (size_z > 1);
    unsigned int max_blk_size_xy = have_z ? 16 : 32;
    unsigned int max_blk_size_z = astc::min(size_z, have_z ? 16u : 1u);

    unsigned int max_padsize_xy = max_blk_size_xy + kerneldim;
    unsigned int max_padsize_z = max_blk_size_z + (have_z ? kerneldim : 0);

    // Perform block-wise averages calculations across the image
    // Initialize fields which are not populated until later
    ag.arg.size_x = 0;
    ag.arg.size_y = 0;
    ag.arg.size_z = 0;
    ag.arg.offset_x = 0;
    ag.arg.offset_y = 0;
    ag.arg.offset_z = 0;
    ag.arg.work_memory = nullptr;

    ag.arg.img = &img;
    ag.arg.swz = swz;
    ag.arg.have_z = have_z;
    ag.arg.alpha_kernel_radius = alpha_kernel_radius;

    ag.img_size_x = size_x;
    ag.img_size_y = size_y;
    ag.img_size_z = size_z;
    ag.blk_size_xy = max_blk_size_xy;
    ag.blk_size_z = max_blk_size_z;
    ag.work_memory_size = 2 * max_padsize_xy * max_padsize_xy * max_padsize_z;
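    // The factor of two covers the two per-block work buffers (values and squared values), each
    // holding max_padsize_xy * max_padsize_xy * max_padsize_z vfloat4 entries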

    // The parallel task count
    unsigned int z_tasks = (size_z + max_blk_size_z - 1) / max_blk_size_z;
    unsigned int y_tasks = (size_y + max_blk_size_xy - 1) / max_blk_size_xy;
    return z_tasks * y_tasks;
}

#endif