axmol/external/jpeg/simd/arm/jcsample-neon.c

/*
 * jcsample-neon.c - downsampling (Arm Neon)
 *
 * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"

#include <arm_neon.h>


/* Byte-index tables used to pad the final 16-pixel group of a row.
 * Each row below is one 16-byte entry; entry P keeps indices 0..(15 - P)
 * unchanged and replaces every later index with (15 - P), so a table lookup
 * with entry P replicates the last kept byte into the trailing P positions.
 */
ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = {
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,   /* Pad 0 */
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14,   /* Pad 1 */
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13,   /* Pad 2 */
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 12,   /* Pad 3 */
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11, 11,   /* Pad 4 */
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10,   /* Pad 5 */
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 9, 9, 9, 9,         /* Pad 6 */
  0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8,         /* Pad 7 */
  0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7,         /* Pad 8 */
  0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,         /* Pad 9 */
  0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,         /* Pad 10 */
  0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,         /* Pad 11 */
  0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,         /* Pad 12 */
  0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,         /* Pad 13 */
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,         /* Pad 14 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0          /* Pad 15 */
};


/* Downsample the pixel values of one component: 2:1 horizontally, 1:1
 * vertically, without smoothing.
 *
 * Every output sample is the biased average of two horizontally adjacent
 * input samples.  Each of the v_samp_factor rows is processed as
 * width_in_blocks groups of 16 input bytes -> 8 output bytes.  The final
 * group of a row is first padded by replicating its last valid pixel, using
 * the index table selected from jsimd_h2_downsample_consts.
 *
 * max_v_samp_factor is not used by this routine.
 */

void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
                                JDIMENSION v_samp_factor,
                                JDIMENSION width_in_blocks,
                                JSAMPARRAY input_data, JSAMPARRAY output_data)
{
  /* Table entries are 16 bytes each; the entry number is the count of
   * columns by which the padded row width exceeds image_width.
   */
  const int lut_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
  const uint8x16_t pad_lut =
    vld1q_u8(&jsimd_h2_downsample_consts[lut_offset]);
  /* Alternating per-lane bias: { 0, 1, 0, 1, 0, 1, 0, 1 } */
  const uint16x8_t rounding = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));
  /* Number of groups that need no padding (all but the last one). */
  const JDIMENSION full_blocks = width_in_blocks - 1;
  unsigned row;

  for (row = 0; row < v_samp_factor; row++) {
    JSAMPROW src = input_data[row];
    JSAMPROW dst = output_data[row];
    unsigned blk;

    /* Full groups: pairwise add (widening to 16-bit) on top of the bias,
     * halve while narrowing back to 8-bit, then store 8 samples.
     */
    for (blk = 0; blk < full_blocks; blk++) {
      vst1_u8(dst, vshrn_n_u16(vpadalq_u8(rounding, vld1q_u8(src)), 1));
      src += 2 * DCTSIZE;
      dst += DCTSIZE;
    }

    /* Final group: src/dst now point at the last group of the row. */
    uint8x16_t tail = vld1q_u8(src);
#if defined(__aarch64__) || defined(_M_ARM64)
    /* Overwrite the unused trailing bytes with the last valid pixel. */
    tail = vqtbl1q_u8(tail, pad_lut);
#else
    {
      /* No 128-bit table lookup here: look up each 64-bit half of the index
       * vector against the two-register table holding all 16 pixels.
       */
      uint8x8x2_t halves;

      halves.val[0] = vget_low_u8(tail);
      halves.val[1] = vget_high_u8(tail);
      tail = vcombine_u8(vtbl2_u8(halves, vget_low_u8(pad_lut)),
                         vtbl2_u8(halves, vget_high_u8(pad_lut)));
    }
#endif
    /* Same add / halve / narrow / store sequence as for the full groups. */
    vst1_u8(dst, vshrn_n_u16(vpadalq_u8(rounding, tail), 1));
  }
}


/* Downsample pixel values of a single component.
 * This version handles the standard case of 2:1 horizontal and 2:1 vertical,
 * without smoothing.
 *
 * Each output sample is the biased average of a 2x2 group of input samples,
 * so output row r is built from input rows 2 * r and 2 * r + 1.
 *
 * image_width      - used only to work out how many trailing columns of the
 *                    last 16-pixel group must be padded.
 * max_v_samp_factor - not used by this routine.
 * v_samp_factor    - number of output rows to produce; 2 * v_samp_factor
 *                    input rows are read.
 * width_in_blocks  - number of DCTSIZE-sample output groups per row.
 * input_data       - array of input row pointers.
 * output_data      - array of output row pointers.
 *
 * NOTE(review): the padding table has 16 entries, so this presumably relies
 * on (width_in_blocks * 2 * DCTSIZE - image_width) being in 0..15 and on
 * width_in_blocks being at least 1 -- confirm against the caller.
 */

void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
                                JDIMENSION v_samp_factor,
                                JDIMENSION width_in_blocks,
                                JSAMPARRAY input_data, JSAMPARRAY output_data)
{
  JSAMPROW inptr0, inptr1, outptr;
  /* Load expansion mask to pad remaining elements of last DCT block. */
  const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
  const uint8x16_t expand_mask =
    vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
  /* Load bias pattern (alternating every pixel.) */
  /* { 1, 2, 1, 2, 1, 2, 1, 2 } */
  const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));
  unsigned i, inrow, outrow;

  /* Two input rows are consumed per output row, so the input row index
   * advances by 2 for every output row.  (Indexing the input by outrow alone
   * would reuse input row 1 and skip later rows when v_samp_factor > 1.)
   */
  for (inrow = 0, outrow = 0; outrow < v_samp_factor; outrow++, inrow += 2) {
    outptr = output_data[outrow];
    inptr0 = input_data[inrow];
    inptr1 = input_data[inrow + 1];

    /* Downsample all but the last DCT block of pixels. */
    for (i = 0; i < width_in_blocks - 1; i++) {
      uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE);
      uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE);
      /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
      uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
      /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate.
       */
      samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
      /* Divide total by 4 and narrow to 8-bit. */
      uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
      /* Store samples to memory. */
      vst1_u8(outptr + i * DCTSIZE, samples_u8);
    }

    /* Load pixels in last DCT block into a table. */
    uint8x16_t pixels_r0 =
      vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
    uint8x16_t pixels_r1 =
      vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
#if defined(__aarch64__) || defined(_M_ARM64)
    /* Pad the empty elements with the value of the last pixel. */
    pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
    pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
#else
    /* No 128-bit table lookup on this target: look up each 64-bit half of
     * the mask against a two-register table holding all 16 pixels.
     */
    uint8x8x2_t table_r0 =
      { { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) } };
    uint8x8x2_t table_r1 =
      { { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) } };
    pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),
                            vtbl2_u8(table_r0, vget_high_u8(expand_mask)));
    pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),
                            vtbl2_u8(table_r1, vget_high_u8(expand_mask)));
#endif
    /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
    uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
    /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate. */
    samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
    /* Divide total by 4, narrow to 8-bit, and store. */
    uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
    vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
  }
}
Update libjpeg-turbo to v2.1.0 [ci build] 2021-04-25 21:41:28 +08:00			`/*`
			`* jcsample-neon.c - downsampling (Arm Neon)`
			`*`
			`* Copyright (C) 2020, Arm Limited. All Rights Reserved.`
			`*`
			`* This software is provided 'as-is', without any express or implied`
			`* warranty. In no event will the authors be held liable for any damages`
			`* arising from the use of this software.`
			`*`
			`* Permission is granted to anyone to use this software for any purpose,`
			`* including commercial applications, and to alter it and redistribute it`
			`* freely, subject to the following restrictions:`
			`*`
			`* 1. The origin of this software must not be misrepresented; you must not`
			`* claim that you wrote the original software. If you use this software`
			`* in a product, an acknowledgment in the product documentation would be`
			`* appreciated but is not required.`
			`* 2. Altered source versions must be plainly marked as such, and must not be`
			`* misrepresented as being the original software.`
			`* 3. This notice may not be removed or altered from any source distribution.`
			`*/`

			`#define JPEG_INTERNALS`
			`#include "../../jinclude.h"`
			`#include "../../jpeglib.h"`
			`#include "../../jsimd.h"`
			`#include "../../jdct.h"`
			`#include "../../jsimddct.h"`
			`#include "../jsimd.h"`
			`#include "align.h"`

			`#include <arm_neon.h>`


			`ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = {`
			`0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 0 */`
			`0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,`
			`0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 1 */`
			`0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E,`
			`0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 2 */`
			`0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D,`
			`0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 3 */`
			`0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C,`
			`0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 4 */`
			`0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B,`
			`0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 5 */`
			`0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A,`
			`0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 6 */`
			`0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,`
			`0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 7 */`
			`0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,`
			`0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 8 */`
			`0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,`
			`0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, /* Pad 9 */`
			`0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,`
			`0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, /* Pad 10 */`
			`0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,`
			`0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, /* Pad 11 */`
			`0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,`
			`0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, /* Pad 12 */`
			`0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,`
			`0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* Pad 13 */`
			`0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,`
			`0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* Pad 14 */`
			`0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,`
			`0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Pad 15 */`
			`0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00`
			`};`


			`/* Downsample pixel values of a single component.`
			`* This version handles the common case of 2:1 horizontal and 1:1 vertical,`
			`* without smoothing.`
			`*/`

			`void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,`
			`JDIMENSION v_samp_factor,`
			`JDIMENSION width_in_blocks,`
			`JSAMPARRAY input_data, JSAMPARRAY output_data)`
			`{`
			`JSAMPROW inptr, outptr;`
			`/* Load expansion mask to pad remaining elements of last DCT block. */`
			`const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);`
			`const uint8x16_t expand_mask =`
			`vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);`
			`/* Load bias pattern (alternating every pixel.) */`
			`/* { 0, 1, 0, 1, 0, 1, 0, 1 } */`
			`const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));`
			`unsigned i, outrow;`

			`for (outrow = 0; outrow < v_samp_factor; outrow++) {`
			`outptr = output_data[outrow];`
			`inptr = input_data[outrow];`

			`/* Downsample all but the last DCT block of pixels. */`
			`for (i = 0; i < width_in_blocks - 1; i++) {`
			`uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE);`
			`/* Add adjacent pixel values, widen to 16-bit, and add bias. */`
			`uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);`
			`/* Divide total by 2 and narrow to 8-bit. */`
			`uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);`
			`/* Store samples to memory. */`
			`vst1_u8(outptr + i * DCTSIZE, samples_u8);`
			`}`

			`/* Load pixels in last DCT block into a table. */`
			`uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);`
			`#if defined(__aarch64__) \|\| defined(_M_ARM64)`
			`/* Pad the empty elements with the value of the last pixel. */`
			`pixels = vqtbl1q_u8(pixels, expand_mask);`
			`#else`
			`uint8x8x2_t table = { { vget_low_u8(pixels), vget_high_u8(pixels) } };`
			`pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)),`
			`vtbl2_u8(table, vget_high_u8(expand_mask)));`
			`#endif`
			`/* Add adjacent pixel values, widen to 16-bit, and add bias. */`
			`uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);`
			`/* Divide total by 2, narrow to 8-bit, and store. */`
			`uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);`
			`vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);`
			`}`
			`}`


			`/* Downsample pixel values of a single component.`
			`* This version handles the standard case of 2:1 horizontal and 2:1 vertical,`
			`* without smoothing.`
			`*/`

			`void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,`
			`JDIMENSION v_samp_factor,`
			`JDIMENSION width_in_blocks,`
			`JSAMPARRAY input_data, JSAMPARRAY output_data)`
			`{`
			`JSAMPROW inptr0, inptr1, outptr;`
			`/* Load expansion mask to pad remaining elements of last DCT block. */`
			`const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);`
			`const uint8x16_t expand_mask =`
			`vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);`
			`/* Load bias pattern (alternating every pixel.) */`
			`/* { 1, 2, 1, 2, 1, 2, 1, 2 } */`
			`const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));`
			`unsigned i, outrow;`

			`for (outrow = 0; outrow < v_samp_factor; outrow++) {`
			`outptr = output_data[outrow];`
			`inptr0 = input_data[outrow];`
			`inptr1 = input_data[outrow + 1];`

			`/* Downsample all but the last DCT block of pixels. */`
			`for (i = 0; i < width_in_blocks - 1; i++) {`
			`uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE);`
			`uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE);`
			`/* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */`
			`uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);`
			`/* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate.`
			`*/`
			`samples_u16 = vpadalq_u8(samples_u16, pixels_r1);`
			`/* Divide total by 4 and narrow to 8-bit. */`
			`uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);`
			`/* Store samples to memory and increment pointers. */`
			`vst1_u8(outptr + i * DCTSIZE, samples_u8);`
			`}`

			`/* Load pixels in last DCT block into a table. */`
			`uint8x16_t pixels_r0 =`
			`vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);`
			`uint8x16_t pixels_r1 =`
			`vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);`
			`#if defined(__aarch64__) \|\| defined(_M_ARM64)`
			`/* Pad the empty elements with the value of the last pixel. */`
			`pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);`
			`pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);`
			`#else`
			`uint8x8x2_t table_r0 =`
			`{ { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) } };`
			`uint8x8x2_t table_r1 =`
			`{ { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) } };`
			`pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),`
			`vtbl2_u8(table_r0, vget_high_u8(expand_mask)));`
			`pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),`
			`vtbl2_u8(table_r1, vget_high_u8(expand_mask)));`
			`#endif`
			`/* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */`
			`uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);`
			`/* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate. */`
			`samples_u16 = vpadalq_u8(samples_u16, pixels_r1);`
			`/* Divide total by 4, narrow to 8-bit, and store. */`
			`uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);`
			`vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);`
			`}`
			`}`