mirror of https://github.com/axmolengine/axmol.git
570 lines
25 KiB
C
570 lines
25 KiB
C
/*
|
|
* jdsample-neon.c - upsampling (Arm Neon)
|
|
*
|
|
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
|
|
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
|
|
*
|
|
* This software is provided 'as-is', without any express or implied
|
|
* warranty. In no event will the authors be held liable for any damages
|
|
* arising from the use of this software.
|
|
*
|
|
* Permission is granted to anyone to use this software for any purpose,
|
|
* including commercial applications, and to alter it and redistribute it
|
|
* freely, subject to the following restrictions:
|
|
*
|
|
* 1. The origin of this software must not be misrepresented; you must not
|
|
* claim that you wrote the original software. If you use this software
|
|
* in a product, an acknowledgment in the product documentation would be
|
|
* appreciated but is not required.
|
|
* 2. Altered source versions must be plainly marked as such, and must not be
|
|
* misrepresented as being the original software.
|
|
* 3. This notice may not be removed or altered from any source distribution.
|
|
*/
|
|
|
|
#define JPEG_INTERNALS
|
|
#include "../../jinclude.h"
|
|
#include "../../jpeglib.h"
|
|
#include "../../jsimd.h"
|
|
#include "../../jdct.h"
|
|
#include "../../jsimddct.h"
|
|
#include "../jsimd.h"
|
|
|
|
#include <arm_neon.h>
|
|
|
|
|
|
/* The diagram below shows a row of samples produced by h2v1 downsampling.
 *
 *                s0        s1        s2
 *            +---------+---------+---------+
 *            |         |         |         |
 *            | p0   p1 | p2   p3 | p4   p5 |
 *            |         |         |         |
 *            +---------+---------+---------+
 *
 * Samples s0-s2 were created by averaging the original pixel component values
 * centered at positions p0-p5 above. To approximate those original pixel
 * component values, we proportionally blend the adjacent samples in each row.
 *
 * An upsampled pixel component value is computed by blending the sample
 * containing the pixel center with the nearest neighboring sample, in the
 * ratio 3:1. For example:
 *     p1(upsampled) = 3/4 * s0 + 1/4 * s1
 *     p2(upsampled) = 3/4 * s1 + 1/4 * s0
 * When computing the first and last pixel component values in the row, there
 * is no adjacent sample to blend, so:
 *     p0(upsampled) = s0
 *     p5(upsampled) = s2
 */
|
|
|
|
void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
|
|
JDIMENSION downsampled_width,
|
|
JSAMPARRAY input_data,
|
|
JSAMPARRAY *output_data_ptr)
|
|
{
|
|
JSAMPARRAY output_data = *output_data_ptr;
|
|
JSAMPROW inptr, outptr;
|
|
int inrow;
|
|
unsigned colctr;
|
|
/* Set up constants. */
|
|
const uint16x8_t one_u16 = vdupq_n_u16(1);
|
|
const uint8x8_t three_u8 = vdup_n_u8(3);
|
|
|
|
for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
|
|
inptr = input_data[inrow];
|
|
outptr = output_data[inrow];
|
|
/* First pixel component value in this row of the original image */
|
|
*outptr = (JSAMPLE)GETJSAMPLE(*inptr);
|
|
|
|
/* 3/4 * containing sample + 1/4 * nearest neighboring sample
|
|
* For p1: containing sample = s0, nearest neighboring sample = s1
|
|
* For p2: containing sample = s1, nearest neighboring sample = s0
|
|
*/
|
|
uint8x16_t s0 = vld1q_u8(inptr);
|
|
uint8x16_t s1 = vld1q_u8(inptr + 1);
|
|
/* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
|
|
* denote low half and high half respectively.
|
|
*/
|
|
uint16x8_t s1_add_3s0_l =
|
|
vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
|
|
uint16x8_t s1_add_3s0_h =
|
|
vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
|
|
uint16x8_t s0_add_3s1_l =
|
|
vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
|
|
uint16x8_t s0_add_3s1_h =
|
|
vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
|
|
/* Add ordered dithering bias to odd pixel values. */
|
|
s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
|
|
s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
|
|
|
|
/* The offset is initially 1, because the first pixel component has already
|
|
* been stored. However, in subsequent iterations of the SIMD loop, this
|
|
* offset is (2 * colctr - 1) to stay within the bounds of the sample
|
|
* buffers without having to resort to a slow scalar tail case for the last
|
|
* (downsampled_width % 16) samples. See "Creation of 2-D sample arrays"
|
|
* in jmemmgr.c for more details.
|
|
*/
|
|
unsigned outptr_offset = 1;
|
|
uint8x16x2_t output_pixels;
|
|
|
|
/* We use software pipelining to maximise performance. The code indented
|
|
* an extra two spaces begins the next iteration of the loop.
|
|
*/
|
|
for (colctr = 16; colctr < downsampled_width; colctr += 16) {
|
|
|
|
s0 = vld1q_u8(inptr + colctr - 1);
|
|
s1 = vld1q_u8(inptr + colctr);
|
|
|
|
/* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
|
|
output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
|
|
vrshrn_n_u16(s1_add_3s0_h, 2));
|
|
output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
|
|
vshrn_n_u16(s0_add_3s1_h, 2));
|
|
|
|
/* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
|
|
* denote low half and high half respectively.
|
|
*/
|
|
s1_add_3s0_l =
|
|
vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
|
|
s1_add_3s0_h =
|
|
vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
|
|
s0_add_3s1_l =
|
|
vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
|
|
s0_add_3s1_h =
|
|
vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
|
|
/* Add ordered dithering bias to odd pixel values. */
|
|
s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
|
|
s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
|
|
|
|
/* Store pixel component values to memory. */
|
|
vst2q_u8(outptr + outptr_offset, output_pixels);
|
|
outptr_offset = 2 * colctr - 1;
|
|
}
|
|
|
|
/* Complete the last iteration of the loop. */
|
|
|
|
/* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
|
|
output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
|
|
vrshrn_n_u16(s1_add_3s0_h, 2));
|
|
output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
|
|
vshrn_n_u16(s0_add_3s1_h, 2));
|
|
/* Store pixel component values to memory. */
|
|
vst2q_u8(outptr + outptr_offset, output_pixels);
|
|
|
|
/* Last pixel component value in this row of the original image */
|
|
outptr[2 * downsampled_width - 1] =
|
|
GETJSAMPLE(inptr[downsampled_width - 1]);
|
|
}
|
|
}
|
|
|
|
|
|
/* The diagram below shows an array of samples produced by h2v2 downsampling.
 *
 *                s0        s1        s2
 *            +---------+---------+---------+
 *            | p0   p1 | p2   p3 | p4   p5 |
 *       sA   |         |         |         |
 *            | p6   p7 | p8   p9 | p10  p11|
 *            +---------+---------+---------+
 *            | p12  p13| p14  p15| p16  p17|
 *       sB   |         |         |         |
 *            | p18  p19| p20  p21| p22  p23|
 *            +---------+---------+---------+
 *            | p24  p25| p26  p27| p28  p29|
 *       sC   |         |         |         |
 *            | p30  p31| p32  p33| p34  p35|
 *            +---------+---------+---------+
 *
 * Samples s0A-s2C were created by averaging the original pixel component
 * values centered at positions p0-p35 above. To approximate one of those
 * original pixel component values, we proportionally blend the sample
 * containing the pixel center with the nearest neighboring samples in each
 * row, column, and diagonal.
 *
 * An upsampled pixel component value is computed by first blending the sample
 * containing the pixel center with the nearest neighboring samples in the
 * same column, in the ratio 3:1, and then blending each column sum with the
 * nearest neighboring column sum, in the ratio 3:1. For example:
 *     p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +
 *                      1/4 * (3/4 * s0B + 1/4 * s0A)
 *                    = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A
 * When computing the first and last pixel component values in the row, there
 * is no horizontally adjacent sample to blend, so:
 *     p12(upsampled) = 3/4 * s0B + 1/4 * s0A
 *     p23(upsampled) = 3/4 * s2B + 1/4 * s2C
 * When computing the first and last pixel component values in the column,
 * there is no vertically adjacent sample to blend, so:
 *     p2(upsampled) = 3/4 * s1A + 1/4 * s0A
 *     p33(upsampled) = 3/4 * s1C + 1/4 * s2C
 * When computing the corner pixel component values, there is no adjacent
 * sample to blend, so:
 *     p0(upsampled) = s0A
 *     p35(upsampled) = s2C
 */
|
|
|
|
void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
|
|
JDIMENSION downsampled_width,
|
|
JSAMPARRAY input_data,
|
|
JSAMPARRAY *output_data_ptr)
|
|
{
|
|
JSAMPARRAY output_data = *output_data_ptr;
|
|
JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
|
|
int inrow, outrow;
|
|
unsigned colctr;
|
|
/* Set up constants. */
|
|
const uint16x8_t seven_u16 = vdupq_n_u16(7);
|
|
const uint8x8_t three_u8 = vdup_n_u8(3);
|
|
const uint16x8_t three_u16 = vdupq_n_u16(3);
|
|
|
|
inrow = outrow = 0;
|
|
while (outrow < max_v_samp_factor) {
|
|
inptr0 = input_data[inrow - 1];
|
|
inptr1 = input_data[inrow];
|
|
inptr2 = input_data[inrow + 1];
|
|
/* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
|
|
* respectively.
|
|
*/
|
|
outptr0 = output_data[outrow++];
|
|
outptr1 = output_data[outrow++];
|
|
|
|
/* First pixel component value in this row of the original image */
|
|
int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
|
|
*outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
|
|
int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
|
|
*outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);
|
|
|
|
/* Step 1: Blend samples vertically in columns s0 and s1.
|
|
* Leave the divide by 4 until the end, when it can be done for both
|
|
* dimensions at once, right-shifting by 4.
|
|
*/
|
|
|
|
/* Load and compute s0colsum0 and s0colsum1. */
|
|
uint8x16_t s0A = vld1q_u8(inptr0);
|
|
uint8x16_t s0B = vld1q_u8(inptr1);
|
|
uint8x16_t s0C = vld1q_u8(inptr2);
|
|
/* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
|
|
* denote low half and high half respectively.
|
|
*/
|
|
uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)),
|
|
vget_low_u8(s0B), three_u8);
|
|
uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)),
|
|
vget_high_u8(s0B), three_u8);
|
|
uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)),
|
|
vget_low_u8(s0B), three_u8);
|
|
uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)),
|
|
vget_high_u8(s0B), three_u8);
|
|
/* Load and compute s1colsum0 and s1colsum1. */
|
|
uint8x16_t s1A = vld1q_u8(inptr0 + 1);
|
|
uint8x16_t s1B = vld1q_u8(inptr1 + 1);
|
|
uint8x16_t s1C = vld1q_u8(inptr2 + 1);
|
|
uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)),
|
|
vget_low_u8(s1B), three_u8);
|
|
uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)),
|
|
vget_high_u8(s1B), three_u8);
|
|
uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)),
|
|
vget_low_u8(s1B), three_u8);
|
|
uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)),
|
|
vget_high_u8(s1B), three_u8);
|
|
|
|
/* Step 2: Blend the already-blended columns. */
|
|
|
|
uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
|
|
uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
|
|
uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
|
|
uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
|
|
uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
|
|
uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
|
|
uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
|
|
uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
|
|
/* Add ordered dithering bias to odd pixel values. */
|
|
output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
|
|
output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
|
|
output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
|
|
output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
|
|
/* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
|
|
uint8x16x2_t output_pixels0 = { {
|
|
vcombine_u8(vshrn_n_u16(output0_p1_l, 4), vshrn_n_u16(output0_p1_h, 4)),
|
|
vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), vrshrn_n_u16(output0_p2_h, 4))
|
|
} };
|
|
uint8x16x2_t output_pixels1 = { {
|
|
vcombine_u8(vshrn_n_u16(output1_p1_l, 4), vshrn_n_u16(output1_p1_h, 4)),
|
|
vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), vrshrn_n_u16(output1_p2_h, 4))
|
|
} };
|
|
|
|
/* Store pixel component values to memory.
|
|
* The minimum size of the output buffer for each row is 64 bytes => no
|
|
* need to worry about buffer overflow here. See "Creation of 2-D sample
|
|
* arrays" in jmemmgr.c for more details.
|
|
*/
|
|
vst2q_u8(outptr0 + 1, output_pixels0);
|
|
vst2q_u8(outptr1 + 1, output_pixels1);
|
|
|
|
/* The first pixel of the image shifted our loads and stores by one byte.
|
|
* We have to re-align on a 32-byte boundary at some point before the end
|
|
* of the row (we do it now on the 32/33 pixel boundary) to stay within the
|
|
* bounds of the sample buffers without having to resort to a slow scalar
|
|
* tail case for the last (downsampled_width % 16) samples. See "Creation
|
|
* of 2-D sample arrays" in jmemmgr.c for more details.
|
|
*/
|
|
for (colctr = 16; colctr < downsampled_width; colctr += 16) {
|
|
/* Step 1: Blend samples vertically in columns s0 and s1. */
|
|
|
|
/* Load and compute s0colsum0 and s0colsum1. */
|
|
s0A = vld1q_u8(inptr0 + colctr - 1);
|
|
s0B = vld1q_u8(inptr1 + colctr - 1);
|
|
s0C = vld1q_u8(inptr2 + colctr - 1);
|
|
s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), vget_low_u8(s0B),
|
|
three_u8);
|
|
s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), vget_high_u8(s0B),
|
|
three_u8);
|
|
s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), vget_low_u8(s0B),
|
|
three_u8);
|
|
s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), vget_high_u8(s0B),
|
|
three_u8);
|
|
/* Load and compute s1colsum0 and s1colsum1. */
|
|
s1A = vld1q_u8(inptr0 + colctr);
|
|
s1B = vld1q_u8(inptr1 + colctr);
|
|
s1C = vld1q_u8(inptr2 + colctr);
|
|
s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), vget_low_u8(s1B),
|
|
three_u8);
|
|
s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), vget_high_u8(s1B),
|
|
three_u8);
|
|
s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), vget_low_u8(s1B),
|
|
three_u8);
|
|
s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), vget_high_u8(s1B),
|
|
three_u8);
|
|
|
|
/* Step 2: Blend the already-blended columns. */
|
|
|
|
output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
|
|
output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
|
|
output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
|
|
output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
|
|
output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
|
|
output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
|
|
output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
|
|
output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
|
|
/* Add ordered dithering bias to odd pixel values. */
|
|
output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
|
|
output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
|
|
output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
|
|
output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
|
|
/* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
|
|
output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
|
|
vshrn_n_u16(output0_p1_h, 4));
|
|
output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
|
|
vrshrn_n_u16(output0_p2_h, 4));
|
|
output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
|
|
vshrn_n_u16(output1_p1_h, 4));
|
|
output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
|
|
vrshrn_n_u16(output1_p2_h, 4));
|
|
/* Store pixel component values to memory. */
|
|
vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
|
|
vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
|
|
}
|
|
|
|
/* Last pixel component value in this row of the original image */
|
|
int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
|
|
GETJSAMPLE(inptr0[downsampled_width - 1]);
|
|
outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
|
|
int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
|
|
GETJSAMPLE(inptr2[downsampled_width - 1]);
|
|
outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
|
|
inrow++;
|
|
}
|
|
}
|
|
|
|
|
|
/* The diagram below shows a column of samples produced by h1v2 downsampling
 * (or by losslessly rotating or transposing an h2v1-downsampled image.)
 *
 *            +---------+
 *            |   p0    |
 *     sA     |         |
 *            |   p1    |
 *            +---------+
 *            |   p2    |
 *     sB     |         |
 *            |   p3    |
 *            +---------+
 *            |   p4    |
 *     sC     |         |
 *            |   p5    |
 *            +---------+
 *
 * Samples sA-sC were created by averaging the original pixel component values
 * centered at positions p0-p5 above. To approximate those original pixel
 * component values, we proportionally blend the adjacent samples in each
 * column.
 *
 * An upsampled pixel component value is computed by blending the sample
 * containing the pixel center with the nearest neighboring sample, in the
 * ratio 3:1. For example:
 *     p1(upsampled) = 3/4 * sA + 1/4 * sB
 *     p2(upsampled) = 3/4 * sB + 1/4 * sA
 * When computing the first and last pixel component values in the column,
 * there is no adjacent sample to blend, so:
 *     p0(upsampled) = sA
 *     p5(upsampled) = sC
 */
|
|
|
|
void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
|
|
JDIMENSION downsampled_width,
|
|
JSAMPARRAY input_data,
|
|
JSAMPARRAY *output_data_ptr)
|
|
{
|
|
JSAMPARRAY output_data = *output_data_ptr;
|
|
JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
|
|
int inrow, outrow;
|
|
unsigned colctr;
|
|
/* Set up constants. */
|
|
const uint16x8_t one_u16 = vdupq_n_u16(1);
|
|
const uint8x8_t three_u8 = vdup_n_u8(3);
|
|
|
|
inrow = outrow = 0;
|
|
while (outrow < max_v_samp_factor) {
|
|
inptr0 = input_data[inrow - 1];
|
|
inptr1 = input_data[inrow];
|
|
inptr2 = input_data[inrow + 1];
|
|
/* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
|
|
* respectively.
|
|
*/
|
|
outptr0 = output_data[outrow++];
|
|
outptr1 = output_data[outrow++];
|
|
inrow++;
|
|
|
|
/* The size of the input and output buffers is always a multiple of 32
|
|
* bytes => no need to worry about buffer overflow when reading/writing
|
|
* memory. See "Creation of 2-D sample arrays" in jmemmgr.c for more
|
|
* details.
|
|
*/
|
|
for (colctr = 0; colctr < downsampled_width; colctr += 16) {
|
|
/* Load samples. */
|
|
uint8x16_t sA = vld1q_u8(inptr0 + colctr);
|
|
uint8x16_t sB = vld1q_u8(inptr1 + colctr);
|
|
uint8x16_t sC = vld1q_u8(inptr2 + colctr);
|
|
/* Blend samples vertically. */
|
|
uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
|
|
vget_low_u8(sB), three_u8);
|
|
uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
|
|
vget_high_u8(sB), three_u8);
|
|
uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
|
|
vget_low_u8(sB), three_u8);
|
|
uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
|
|
vget_high_u8(sB), three_u8);
|
|
/* Add ordered dithering bias to pixel values in even output rows. */
|
|
colsum0_l = vaddq_u16(colsum0_l, one_u16);
|
|
colsum0_h = vaddq_u16(colsum0_h, one_u16);
|
|
/* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
|
|
uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
|
|
vshrn_n_u16(colsum0_h, 2));
|
|
uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
|
|
vrshrn_n_u16(colsum1_h, 2));
|
|
/* Store pixel component values to memory. */
|
|
vst1q_u8(outptr0 + colctr, output_pixels0);
|
|
vst1q_u8(outptr1 + colctr, output_pixels1);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/* The diagram below shows a row of samples produced by h2v1 downsampling.
 *
 *                s0        s1
 *            +---------+---------+
 *            |         |         |
 *            | p0   p1 | p2   p3 |
 *            |         |         |
 *            +---------+---------+
 *
 * Samples s0 and s1 were created by averaging the original pixel component
 * values centered at positions p0-p3 above. To approximate those original
 * pixel component values, we duplicate the samples horizontally:
 *     p0(upsampled) = p1(upsampled) = s0
 *     p2(upsampled) = p3(upsampled) = s1
 */
|
|
|
|
void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
|
|
JSAMPARRAY input_data,
|
|
JSAMPARRAY *output_data_ptr)
|
|
{
|
|
JSAMPARRAY output_data = *output_data_ptr;
|
|
JSAMPROW inptr, outptr;
|
|
int inrow;
|
|
unsigned colctr;
|
|
|
|
for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
|
|
inptr = input_data[inrow];
|
|
outptr = output_data[inrow];
|
|
for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
|
|
uint8x16_t samples = vld1q_u8(inptr + colctr);
|
|
/* Duplicate the samples. The store operation below interleaves them so
|
|
* that adjacent pixel component values take on the same sample value,
|
|
* per above.
|
|
*/
|
|
uint8x16x2_t output_pixels = { { samples, samples } };
|
|
/* Store pixel component values to memory.
|
|
* Due to the way sample buffers are allocated, we don't need to worry
|
|
* about tail cases when output_width is not a multiple of 32. See
|
|
* "Creation of 2-D sample arrays" in jmemmgr.c for details.
|
|
*/
|
|
vst2q_u8(outptr + 2 * colctr, output_pixels);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/* The diagram below shows an array of samples produced by h2v2 downsampling.
 *
 *                s0        s1
 *            +---------+---------+
 *            | p0   p1 | p2   p3 |
 *       sA   |         |         |
 *            | p4   p5 | p6   p7 |
 *            +---------+---------+
 *            | p8   p9 | p10  p11|
 *       sB   |         |         |
 *            | p12  p13| p14  p15|
 *            +---------+---------+
 *
 * Samples s0A-s1B were created by averaging the original pixel component
 * values centered at positions p0-p15 above. To approximate those original
 * pixel component values, we duplicate the samples both horizontally and
 * vertically:
 *     p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
 *     p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
 *     p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
 *     p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
 */
|
|
|
|
void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
|
|
JSAMPARRAY input_data,
|
|
JSAMPARRAY *output_data_ptr)
|
|
{
|
|
JSAMPARRAY output_data = *output_data_ptr;
|
|
JSAMPROW inptr, outptr0, outptr1;
|
|
int inrow, outrow;
|
|
unsigned colctr;
|
|
|
|
for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
|
|
inptr = input_data[inrow];
|
|
outptr0 = output_data[outrow++];
|
|
outptr1 = output_data[outrow++];
|
|
|
|
for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
|
|
uint8x16_t samples = vld1q_u8(inptr + colctr);
|
|
/* Duplicate the samples. The store operation below interleaves them so
|
|
* that adjacent pixel component values take on the same sample value,
|
|
* per above.
|
|
*/
|
|
uint8x16x2_t output_pixels = { { samples, samples } };
|
|
/* Store pixel component values for both output rows to memory.
|
|
* Due to the way sample buffers are allocated, we don't need to worry
|
|
* about tail cases when output_width is not a multiple of 32. See
|
|
* "Creation of 2-D sample arrays" in jmemmgr.c for details.
|
|
*/
|
|
vst2q_u8(outptr0 + 2 * colctr, output_pixels);
|
|
vst2q_u8(outptr1 + 2 * colctr, output_pixels);
|
|
}
|
|
}
|
|
}
|