axmol/external/jpeg/simd/arm/jquanti-neon.c

/*
 * jquanti-neon.c - sample data conversion and quantization (Arm Neon)
 *
 * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"

#include <arm_neon.h>


/* After downsampling, the resulting sample values are in the range [0, 255],
 * but the Discrete Cosine Transform (DCT) operates on values centered around
 * 0.
 *
 * To prepare sample values for the DCT, load samples into a DCT workspace,
 * subtracting CENTERJSAMPLE (128).  The samples, now in the range [-128, 127],
 * are also widened from 8- to 16-bit.
 *
 * The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
 */

void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
                         DCTELEM *workspace)
{
  uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
  uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
  uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
  uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
  uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
  uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
  uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
  uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);

  int16x8_t row0 =
    vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
  int16x8_t row1 =
    vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
  int16x8_t row2 =
    vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
  int16x8_t row3 =
    vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
  int16x8_t row4 =
    vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
  int16x8_t row5 =
    vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
  int16x8_t row6 =
    vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
  int16x8_t row7 =
    vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));

  vst1q_s16(workspace + 0 * DCTSIZE, row0);
  vst1q_s16(workspace + 1 * DCTSIZE, row1);
  vst1q_s16(workspace + 2 * DCTSIZE, row2);
  vst1q_s16(workspace + 3 * DCTSIZE, row3);
  vst1q_s16(workspace + 4 * DCTSIZE, row4);
  vst1q_s16(workspace + 5 * DCTSIZE, row5);
  vst1q_s16(workspace + 6 * DCTSIZE, row6);
  vst1q_s16(workspace + 7 * DCTSIZE, row7);
}


/* After the DCT, the resulting array of coefficient values needs to be divided
 * by an array of quantization values.
 *
 * To avoid a slow division operation, the DCT coefficients are multiplied by
 * the (scaled) reciprocals of the quantization values and then right-shifted.
 *
 * The equivalent scalar C function quantize() can be found in jcdctmgr.c.
 */

void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
                         DCTELEM *workspace)
{
  JCOEFPTR out_ptr = coef_block;
  UDCTELEM *recip_ptr = (UDCTELEM *)divisors;
  UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;
  DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
  int i;

  for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
    /* Load DCT coefficients. */
    int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
    int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
    int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
    int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
    /* Load reciprocals of quantization values. */
    uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
    uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
    uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
    uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
    uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
    uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
    uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
    uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
    int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
    int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
    int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
    int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);

    /* Extract sign from coefficients. */
    int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
    int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
    int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
    int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
    /* Get absolute value of DCT coefficients. */
    uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
    uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
    uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
    uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
    /* Add correction. */
    abs_row0 = vaddq_u16(abs_row0, corr0);
    abs_row1 = vaddq_u16(abs_row1, corr1);
    abs_row2 = vaddq_u16(abs_row2, corr2);
    abs_row3 = vaddq_u16(abs_row3, corr3);

    /* Multiply DCT coefficients by quantization reciprocals. */
    int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
                                                       vget_low_u16(recip0)));
    int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
                                                       vget_high_u16(recip0)));
    int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
                                                       vget_low_u16(recip1)));
    int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
                                                       vget_high_u16(recip1)));
    int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
                                                       vget_low_u16(recip2)));
    int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
                                                       vget_high_u16(recip2)));
    int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
                                                       vget_low_u16(recip3)));
    int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
                                                       vget_high_u16(recip3)));
    /* Narrow back to 16-bit. */
    row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
    row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
    row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
    row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));

    /* Since VSHR only supports an immediate as its second argument, negate the
     * shift value and shift left.
     */
    row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
                                           vnegq_s16(shift0)));
    row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
                                           vnegq_s16(shift1)));
    row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
                                           vnegq_s16(shift2)));
    row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
                                           vnegq_s16(shift3)));

    /* Restore sign to original product. */
    row0 = veorq_s16(row0, sign_row0);
    row0 = vsubq_s16(row0, sign_row0);
    row1 = veorq_s16(row1, sign_row1);
    row1 = vsubq_s16(row1, sign_row1);
    row2 = veorq_s16(row2, sign_row2);
    row2 = vsubq_s16(row2, sign_row2);
    row3 = veorq_s16(row3, sign_row3);
    row3 = vsubq_s16(row3, sign_row3);

    /* Store quantized coefficients to memory. */
    vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
    vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
    vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
    vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);
  }
}
Update libjpeg-turbo to v2.1.0 [ci build] 2021-04-25 21:41:28 +08:00			`/*`
			`* jquanti-neon.c - sample data conversion and quantization (Arm Neon)`
			`*`
			`* Copyright (C) 2020, Arm Limited. All Rights Reserved.`
			`*`
			`* This software is provided 'as-is', without any express or implied`
			`* warranty. In no event will the authors be held liable for any damages`
			`* arising from the use of this software.`
			`*`
			`* Permission is granted to anyone to use this software for any purpose,`
			`* including commercial applications, and to alter it and redistribute it`
			`* freely, subject to the following restrictions:`
			`*`
			`* 1. The origin of this software must not be misrepresented; you must not`
			`* claim that you wrote the original software. If you use this software`
			`* in a product, an acknowledgment in the product documentation would be`
			`* appreciated but is not required.`
			`* 2. Altered source versions must be plainly marked as such, and must not be`
			`* misrepresented as being the original software.`
			`* 3. This notice may not be removed or altered from any source distribution.`
			`*/`

			`#define JPEG_INTERNALS`
			`#include "../../jinclude.h"`
			`#include "../../jpeglib.h"`
			`#include "../../jsimd.h"`
			`#include "../../jdct.h"`
			`#include "../../jsimddct.h"`
			`#include "../jsimd.h"`

			`#include <arm_neon.h>`


			`/* After downsampling, the resulting sample values are in the range [0, 255],`
			`* but the Discrete Cosine Transform (DCT) operates on values centered around`
			`* 0.`
			`*`
			`* To prepare sample values for the DCT, load samples into a DCT workspace,`
			`* subtracting CENTERJSAMPLE (128). The samples, now in the range [-128, 127],`
			`* are also widened from 8- to 16-bit.`
			`*`
			`* The equivalent scalar C function convsamp() can be found in jcdctmgr.c.`
			`*/`

			`void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,`
			`DCTELEM *workspace)`
			`{`
			`uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);`
			`uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);`
			`uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);`
			`uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);`
			`uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);`
			`uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);`
			`uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);`
			`uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);`

			`int16x8_t row0 =`
			`vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));`
			`int16x8_t row1 =`
			`vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));`
			`int16x8_t row2 =`
			`vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));`
			`int16x8_t row3 =`
			`vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));`
			`int16x8_t row4 =`
			`vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));`
			`int16x8_t row5 =`
			`vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));`
			`int16x8_t row6 =`
			`vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));`
			`int16x8_t row7 =`
			`vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));`

			`vst1q_s16(workspace + 0 * DCTSIZE, row0);`
			`vst1q_s16(workspace + 1 * DCTSIZE, row1);`
			`vst1q_s16(workspace + 2 * DCTSIZE, row2);`
			`vst1q_s16(workspace + 3 * DCTSIZE, row3);`
			`vst1q_s16(workspace + 4 * DCTSIZE, row4);`
			`vst1q_s16(workspace + 5 * DCTSIZE, row5);`
			`vst1q_s16(workspace + 6 * DCTSIZE, row6);`
			`vst1q_s16(workspace + 7 * DCTSIZE, row7);`
			`}`


			`/* After the DCT, the resulting array of coefficient values needs to be divided`
			`* by an array of quantization values.`
			`*`
			`* To avoid a slow division operation, the DCT coefficients are multiplied by`
			`* the (scaled) reciprocals of the quantization values and then right-shifted.`
			`*`
			`* The equivalent scalar C function quantize() can be found in jcdctmgr.c.`
			`*/`

			`void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,`
			`DCTELEM *workspace)`
			`{`
			`JCOEFPTR out_ptr = coef_block;`
			`UDCTELEM recip_ptr = (UDCTELEM )divisors;`
			`UDCTELEM corr_ptr = (UDCTELEM )divisors + DCTSIZE2;`
			`DCTELEM shift_ptr = divisors + 3 DCTSIZE2;`
			`int i;`

			`for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {`
			`/* Load DCT coefficients. */`
			`int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);`
			`int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);`
			`int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);`
			`int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);`
			`/* Load reciprocals of quantization values. */`
			`uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);`
			`uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);`
			`uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);`
			`uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);`
			`uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);`
			`uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);`
			`uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);`
			`uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);`
			`int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);`
			`int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);`
			`int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);`
			`int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);`

			`/* Extract sign from coefficients. */`
			`int16x8_t sign_row0 = vshrq_n_s16(row0, 15);`
			`int16x8_t sign_row1 = vshrq_n_s16(row1, 15);`
			`int16x8_t sign_row2 = vshrq_n_s16(row2, 15);`
			`int16x8_t sign_row3 = vshrq_n_s16(row3, 15);`
			`/* Get absolute value of DCT coefficients. */`
			`uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));`
			`uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));`
			`uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));`
			`uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));`
			`/* Add correction. */`
			`abs_row0 = vaddq_u16(abs_row0, corr0);`
			`abs_row1 = vaddq_u16(abs_row1, corr1);`
			`abs_row2 = vaddq_u16(abs_row2, corr2);`
			`abs_row3 = vaddq_u16(abs_row3, corr3);`

			`/* Multiply DCT coefficients by quantization reciprocals. */`
			`int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),`
			`vget_low_u16(recip0)));`
			`int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),`
			`vget_high_u16(recip0)));`
			`int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),`
			`vget_low_u16(recip1)));`
			`int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),`
			`vget_high_u16(recip1)));`
			`int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),`
			`vget_low_u16(recip2)));`
			`int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),`
			`vget_high_u16(recip2)));`
			`int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),`
			`vget_low_u16(recip3)));`
			`int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),`
			`vget_high_u16(recip3)));`
			`/* Narrow back to 16-bit. */`
			`row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));`
			`row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));`
			`row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));`
			`row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));`

			`/* Since VSHR only supports an immediate as its second argument, negate the`
			`* shift value and shift left.`
			`*/`
			`row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),`
			`vnegq_s16(shift0)));`
			`row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),`
			`vnegq_s16(shift1)));`
			`row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),`
			`vnegq_s16(shift2)));`
			`row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),`
			`vnegq_s16(shift3)));`

			`/* Restore sign to original product. */`
			`row0 = veorq_s16(row0, sign_row0);`
			`row0 = vsubq_s16(row0, sign_row0);`
			`row1 = veorq_s16(row1, sign_row1);`
			`row1 = vsubq_s16(row1, sign_row1);`
			`row2 = veorq_s16(row2, sign_row2);`
			`row2 = vsubq_s16(row2, sign_row2);`
			`row3 = veorq_s16(row3, sign_row3);`
			`row3 = vsubq_s16(row3, sign_row3);`

			`/* Store quantized coefficients to memory. */`
			`vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);`
			`vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);`
			`vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);`
			`vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);`
			`}`
			`}`