axmol/external/jpeg/simd/arm/jdcolext-neon.c

/*
 * jdcolext-neon.c - colorspace conversion (Arm Neon)
 *
 * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
 * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* This file is included by jdcolor-neon.c. */


/* YCbCr -> RGB conversion is defined by the following equations:
 *    R = Y                        + 1.40200 * (Cr - 128)
 *    G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
 *    B = Y + 1.77200 * (Cb - 128)
 *
 * Scaled integer constants are used to avoid floating-point arithmetic:
 *    0.3441467 = 11277 * 2^-15
 *    0.7141418 = 23401 * 2^-15
 *    1.4020386 = 22971 * 2^-14
 *    1.7720337 = 29033 * 2^-14
 * These constants are defined in jdcolor-neon.c.
 *
 * To ensure correct results, rounding is used when descaling.
 */

/* Notes on safe memory access for YCbCr -> RGB conversion routines:
 *
 * Input memory buffers can be safely overread up to the next multiple of
 * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
 * jmemmgr.c.
 *
 * The output buffer cannot safely be written beyond output_width, since
 * output_buf points to a possibly unpadded row in the decompressed image
 * buffer allocated by the calling program.
 */

/*
 * Convert num_rows rows of planar Y, Cb, and Cr samples into interleaved
 * RGB-family pixels using Arm Neon intrinsics.
 *
 * Parameters:
 *   output_width  Number of pixels converted in each row.
 *   input_buf     input_buf[0], input_buf[1], and input_buf[2] are read as the
 *                 Y, Cb, and Cr row arrays, respectively.
 *   input_row     Index of the first row read from each component array; it
 *                 is advanced by one for every row converted.
 *   output_buf    Array of output row pointers; one entry is consumed per row.
 *   num_rows      Number of rows to convert.  Nothing is done if this is <= 0.
 *
 * Each row is processed as a sequence of 16-pixel chunks, then at most one
 * 8-pixel chunk, then a tail of 1-7 pixels that is stored one pixel at a time
 * so that nothing is written past output_width.
 *
 * The output pixel layout is selected at compile time by RGB_PIXELSIZE:
 *   4          four bytes per pixel, with the alpha byte forced to 0xFF
 *   3          three bytes per pixel
 *   otherwise  one 16-bit value per pixel, packed 5:6:5
 *              (NOTE(review): the pointer arithmetic in that branch presumes
 *              RGB_PIXELSIZE == 2 -- confirm against the including file.)
 * The position of each channel within a pixel comes from the RGB_RED,
 * RGB_GREEN, RGB_BLUE, and RGB_ALPHA macros, which are defined outside this
 * file.
 */
void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf,
                                JDIMENSION input_row, JSAMPARRAY output_buf,
                                int num_rows)
{
  JSAMPROW outptr;
  /* Pointers to Y, Cb, and Cr data */
  JSAMPROW inptr0, inptr1, inptr2;

  /* Fixed-point multipliers.  As used below: lane 0 multiplies Cb for G,
   * lane 1 multiplies Cr for G (and is subtracted), lane 2 multiplies Cr for
   * R, and lane 3 multiplies Cb for B.
   * NOTE(review): lane 0 is accumulated with a plain multiply, so it
   * presumably holds the negated 0.34414 coefficient -- confirm against the
   * table in jdcolor-neon.c.
   */
  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
  const int16x8_t neg_128 = vdupq_n_s16(-128);

  while (--num_rows >= 0) {
    inptr0 = input_buf[0][input_row];
    inptr1 = input_buf[1][input_row];
    inptr2 = input_buf[2][input_row];
    input_row++;
    outptr = *output_buf++;
    /* Columns of the current row that still have to be converted. */
    int cols_remaining = output_width;
    /* Main loop: convert 16 pixels per iteration. */
    for (; cols_remaining >= 16; cols_remaining -= 16) {
      uint8x16_t y  = vld1q_u8(inptr0);
      uint8x16_t cb = vld1q_u8(inptr1);
      uint8x16_t cr = vld1q_u8(inptr2);
      /* Subtract 128 from Cb and Cr.  The widening add is performed on
       * unsigned 16-bit lanes and wraps modulo 2^16, so reinterpreting the
       * sum as signed yields the value (sample - 128).
       */
      int16x8_t cr_128_l =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
                                       vget_low_u8(cr)));
      int16x8_t cr_128_h =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
                                       vget_high_u8(cr)));
      int16x8_t cb_128_l =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
                                       vget_low_u8(cb)));
      int16x8_t cb_128_h =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
                                       vget_high_u8(cb)));
      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
      /* The products are kept in 32-bit lanes, four pixels per vector. */
      int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0);
      int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l),
                                            consts, 0);
      int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0);
      int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h),
                                            consts, 0);
      g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l),
                                  consts, 1);
      g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l),
                                  consts, 1);
      g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h),
                                  consts, 1);
      g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h),
                                  consts, 1);
      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
      int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15),
                                         vrshrn_n_s32(g_sub_y_lh, 15));
      int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15),
                                         vrshrn_n_s32(g_sub_y_hh, 15));
      /* Compute R-Y: 1.40200 * (Cr - 128) */
      /* The rounding doubling multiply-high descales by 2^-15, whereas the
       * R and B constants are scaled by 2^-14 (see the comment at the top of
       * this file), so the operand is doubled first.
       */
      int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1),
                                               consts, 2);
      int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1),
                                               consts, 2);
      /* Compute B-Y: 1.77200 * (Cb - 128) */
      int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1),
                                               consts, 3);
      int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1),
                                               consts, 3);
      /* Add Y. */
      /* Same unsigned widening-add idiom as above; the results are signed
       * 16-bit values that may lie outside [0, 255] until they are clamped.
       */
      int16x8_t r_l =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l),
                                       vget_low_u8(y)));
      int16x8_t r_h =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h),
                                       vget_high_u8(y)));
      int16x8_t b_l =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l),
                                       vget_low_u8(y)));
      int16x8_t b_h =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h),
                                       vget_high_u8(y)));
      int16x8_t g_l =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l),
                                       vget_low_u8(y)));
      int16x8_t g_h =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h),
                                       vget_high_u8(y)));

#if RGB_PIXELSIZE == 4
      uint8x16x4_t rgba;
      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
      rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
      rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
      rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
      /* Set alpha channel to opaque (0xFF). */
      rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
      /* Store RGBA pixel data to memory. */
      vst4q_u8(outptr, rgba);
#elif RGB_PIXELSIZE == 3
      uint8x16x3_t rgb;
      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
      rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
      rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
      rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
      /* Store RGB pixel data to memory. */
      vst3q_u8(outptr, rgb);
#else
      /* Pack R, G, and B values in ratio 5:6:5. */
      /* The saturating shift clamps each component to 8 bits and leaves it
       * in the high byte of its 16-bit lane.  The shift-right-and-insert
       * steps then place G below the top 5 bits (R) and B below the top 11
       * bits (R and G).
       */
      uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8);
      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5);
      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11);
      uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8);
      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5);
      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11);
      /* Store RGB565 pixel data to memory. */
      vst1q_u16((uint16_t *)outptr, rgb565_l);
      vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h);
#endif

      /* Increment pointers. */
      inptr0 += 16;
      inptr1 += 16;
      inptr2 += 16;
      outptr += (RGB_PIXELSIZE * 16);
    }

    /* At most 15 columns remain; convert one full group of 8 if possible. */
    if (cols_remaining >= 8) {
      uint8x8_t y  = vld1_u8(inptr0);
      uint8x8_t cb = vld1_u8(inptr1);
      uint8x8_t cr = vld1_u8(inptr2);
      /* Subtract 128 from Cb and Cr. */
      int16x8_t cr_128 =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
      int16x8_t cb_128 =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
      int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
      int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
      g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
      g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
      int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                       vrshrn_n_s32(g_sub_y_h, 15));
      /* Compute R-Y: 1.40200 * (Cr - 128) */
      int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
                                             consts, 2);
      /* Compute B-Y: 1.77200 * (Cb - 128) */
      int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
                                             consts, 3);
      /* Add Y. */
      int16x8_t r =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
      int16x8_t b =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
      int16x8_t g =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));

#if RGB_PIXELSIZE == 4
      uint8x8x4_t rgba;
      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
      rgba.val[RGB_RED] = vqmovun_s16(r);
      rgba.val[RGB_GREEN] = vqmovun_s16(g);
      rgba.val[RGB_BLUE] = vqmovun_s16(b);
      /* Set alpha channel to opaque (0xFF). */
      rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
      /* Store RGBA pixel data to memory. */
      vst4_u8(outptr, rgba);
#elif RGB_PIXELSIZE == 3
      uint8x8x3_t rgb;
      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
      rgb.val[RGB_RED] = vqmovun_s16(r);
      rgb.val[RGB_GREEN] = vqmovun_s16(g);
      rgb.val[RGB_BLUE] = vqmovun_s16(b);
      /* Store RGB pixel data to memory. */
      vst3_u8(outptr, rgb);
#else
      /* Pack R, G, and B values in ratio 5:6:5. */
      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
      /* Store RGB565 pixel data to memory. */
      vst1q_u16((uint16_t *)outptr, rgb565);
#endif

      /* Increment pointers. */
      inptr0 += 8;
      inptr1 += 8;
      inptr2 += 8;
      outptr += (RGB_PIXELSIZE * 8);
      cols_remaining -= 8;
    }

    /* Handle the tail elements. */
    if (cols_remaining > 0) {
      /* Between 1 and 7 columns remain.  A full 8 samples are still loaded
       * from each input row; this relies on the input over-read guarantee
       * described in the memory access notes above this function.  All 8
       * lanes are computed, but only the first cols_remaining are stored.
       */
      uint8x8_t y  = vld1_u8(inptr0);
      uint8x8_t cb = vld1_u8(inptr1);
      uint8x8_t cr = vld1_u8(inptr2);
      /* Subtract 128 from Cb and Cr. */
      int16x8_t cr_128 =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
      int16x8_t cb_128 =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
      int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
      int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
      g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
      g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
      int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                       vrshrn_n_s32(g_sub_y_h, 15));
      /* Compute R-Y: 1.40200 * (Cr - 128) */
      int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
                                             consts, 2);
      /* Compute B-Y: 1.77200 * (Cb - 128) */
      int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
                                             consts, 3);
      /* Add Y. */
      int16x8_t r =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
      int16x8_t b =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
      int16x8_t g =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));

#if RGB_PIXELSIZE == 4
      uint8x8x4_t rgba;
      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
      rgba.val[RGB_RED] = vqmovun_s16(r);
      rgba.val[RGB_GREEN] = vqmovun_s16(g);
      rgba.val[RGB_BLUE] = vqmovun_s16(b);
      /* Set alpha channel to opaque (0xFF). */
      rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
      /* Store RGBA pixel data to memory. */
      /* One pixel is stored per case.  Every case intentionally falls
       * through, so entering at case N stores pixels N-1 down to 0.
       */
      switch (cols_remaining) {
      case 7:
        vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);
        /* FALLTHROUGH */
      case 6:
        vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);
        /* FALLTHROUGH */
      case 5:
        vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);
        /* FALLTHROUGH */
      case 4:
        vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);
        /* FALLTHROUGH */
      case 3:
        vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);
        /* FALLTHROUGH */
      case 2:
        vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);
        /* FALLTHROUGH */
      case 1:
        vst4_lane_u8(outptr, rgba, 0);
        /* FALLTHROUGH */
      default:
        break;
      }
#elif RGB_PIXELSIZE == 3
      uint8x8x3_t rgb;
      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
      rgb.val[RGB_RED] = vqmovun_s16(r);
      rgb.val[RGB_GREEN] = vqmovun_s16(g);
      rgb.val[RGB_BLUE] = vqmovun_s16(b);
      /* Store RGB pixel data to memory. */
      /* One pixel is stored per case.  Every case intentionally falls
       * through, so entering at case N stores pixels N-1 down to 0.
       */
      switch (cols_remaining) {
      case 7:
        vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);
        /* FALLTHROUGH */
      case 6:
        vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);
        /* FALLTHROUGH */
      case 5:
        vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);
        /* FALLTHROUGH */
      case 4:
        vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);
        /* FALLTHROUGH */
      case 3:
        vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);
        /* FALLTHROUGH */
      case 2:
        vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);
        /* FALLTHROUGH */
      case 1:
        vst3_lane_u8(outptr, rgb, 0);
        /* FALLTHROUGH */
      default:
        break;
      }
#else
      /* Pack R, G, and B values in ratio 5:6:5. */
      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
      /* Store RGB565 pixel data to memory. */
      /* One 16-bit pixel is stored per case.  Every case intentionally falls
       * through, so entering at case N stores pixels N-1 down to 0.
       */
      switch (cols_remaining) {
      case 7:
        vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
        /* FALLTHROUGH */
      case 6:
        vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
        /* FALLTHROUGH */
      case 5:
        vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
        /* FALLTHROUGH */
      case 4:
        vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
        /* FALLTHROUGH */
      case 3:
        vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
        /* FALLTHROUGH */
      case 2:
        vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
        /* FALLTHROUGH */
      case 1:
        vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
        /* FALLTHROUGH */
      default:
        break;
      }
#endif
    }
  }
}
Update libjpeg-turbo to v2.1.0 [ci build] 2021-04-25 21:41:28 +08:00			`/*`
			`* jdcolext-neon.c - colorspace conversion (Arm Neon)`
			`*`
			`* Copyright (C) 2020, Arm Limited. All Rights Reserved.`
			`* Copyright (C) 2020, D. R. Commander. All Rights Reserved.`
			`*`
			`* This software is provided 'as-is', without any express or implied`
			`* warranty. In no event will the authors be held liable for any damages`
			`* arising from the use of this software.`
			`*`
			`* Permission is granted to anyone to use this software for any purpose,`
			`* including commercial applications, and to alter it and redistribute it`
			`* freely, subject to the following restrictions:`
			`*`
			`* 1. The origin of this software must not be misrepresented; you must not`
			`* claim that you wrote the original software. If you use this software`
			`* in a product, an acknowledgment in the product documentation would be`
			`* appreciated but is not required.`
			`* 2. Altered source versions must be plainly marked as such, and must not be`
			`* misrepresented as being the original software.`
			`* 3. This notice may not be removed or altered from any source distribution.`
			`*/`

			`/* This file is included by jdcolor-neon.c. */`


			`/* YCbCr -> RGB conversion is defined by the following equations:`
			`* R = Y + 1.40200 * (Cr - 128)`
			`* G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)`
			`* B = Y + 1.77200 * (Cb - 128)`
			`*`
			`* Scaled integer constants are used to avoid floating-point arithmetic:`
			`* 0.3441467 = 11277 * 2^-15`
			`* 0.7141418 = 23401 * 2^-15`
			`* 1.4020386 = 22971 * 2^-14`
			`* 1.7720337 = 29033 * 2^-14`
			`* These constants are defined in jdcolor-neon.c.`
			`*`
			`* To ensure correct results, rounding is used when descaling.`
			`*/`

			`/* Notes on safe memory access for YCbCr -> RGB conversion routines:`
			`*`
			`* Input memory buffers can be safely overread up to the next multiple of`
			`* ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in`
			`* jmemmgr.c.`
			`*`
			`* The output buffer cannot safely be written beyond output_width, since`
			`* output_buf points to a possibly unpadded row in the decompressed image`
			`* buffer allocated by the calling program.`
			`*/`

			`void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf,`
			`JDIMENSION input_row, JSAMPARRAY output_buf,`
			`int num_rows)`
			`{`
			`JSAMPROW outptr;`
			`/* Pointers to Y, Cb, and Cr data */`
			`JSAMPROW inptr0, inptr1, inptr2;`

			`const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);`
			`const int16x8_t neg_128 = vdupq_n_s16(-128);`

			`while (--num_rows >= 0) {`
			`inptr0 = input_buf[0][input_row];`
			`inptr1 = input_buf[1][input_row];`
			`inptr2 = input_buf[2][input_row];`
			`input_row++;`
			`outptr = *output_buf++;`
			`int cols_remaining = output_width;`
			`for (; cols_remaining >= 16; cols_remaining -= 16) {`
			`uint8x16_t y = vld1q_u8(inptr0);`
			`uint8x16_t cb = vld1q_u8(inptr1);`
			`uint8x16_t cr = vld1q_u8(inptr2);`
			`/* Subtract 128 from Cb and Cr. */`
			`int16x8_t cr_128_l =`
			`vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),`
			`vget_low_u8(cr)));`
			`int16x8_t cr_128_h =`
			`vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),`
			`vget_high_u8(cr)));`
			`int16x8_t cb_128_l =`
			`vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),`
			`vget_low_u8(cb)));`
			`int16x8_t cb_128_h =`
			`vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),`
			`vget_high_u8(cb)));`
			`/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */`
			`int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0);`
			`int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l),`
			`consts, 0);`
			`int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0);`
			`int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h),`
			`consts, 0);`
			`g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l),`
			`consts, 1);`
			`g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l),`
			`consts, 1);`
			`g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h),`
			`consts, 1);`
			`g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h),`
			`consts, 1);`
			`/* Descale G components: shift right 15, round, and narrow to 16-bit. */`
			`int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15),`
			`vrshrn_n_s32(g_sub_y_lh, 15));`
			`int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15),`
			`vrshrn_n_s32(g_sub_y_hh, 15));`
			`/* Compute R-Y: 1.40200 * (Cr - 128) */`
			`int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1),`
			`consts, 2);`
			`int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1),`
			`consts, 2);`
			`/* Compute B-Y: 1.77200 * (Cb - 128) */`
			`int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1),`
			`consts, 3);`
			`int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1),`
			`consts, 3);`
			`/* Add Y. */`
			`int16x8_t r_l =`
			`vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l),`
			`vget_low_u8(y)));`
			`int16x8_t r_h =`
			`vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h),`
			`vget_high_u8(y)));`
			`int16x8_t b_l =`
			`vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l),`
			`vget_low_u8(y)));`
			`int16x8_t b_h =`
			`vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h),`
			`vget_high_u8(y)));`
			`int16x8_t g_l =`
			`vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l),`
			`vget_low_u8(y)));`
			`int16x8_t g_h =`
			`vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h),`
			`vget_high_u8(y)));`

			`#if RGB_PIXELSIZE == 4`
			`uint8x16x4_t rgba;`
			`/* Convert each component to unsigned and narrow, clamping to [0-255]. */`
			`rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));`
			`rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));`
			`rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));`
			`/* Set alpha channel to opaque (0xFF). */`
			`rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);`
			`/* Store RGBA pixel data to memory. */`
			`vst4q_u8(outptr, rgba);`
			`#elif RGB_PIXELSIZE == 3`
			`uint8x16x3_t rgb;`
			`/* Convert each component to unsigned and narrow, clamping to [0-255]. */`
			`rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));`
			`rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));`
			`rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));`
			`/* Store RGB pixel data to memory. */`
			`vst3q_u8(outptr, rgb);`
			`#else`
			`/* Pack R, G, and B values in ratio 5:6:5. */`
			`uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8);`
			`rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5);`
			`rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11);`
			`uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8);`
			`rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5);`
			`rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11);`
			`/* Store RGB pixel data to memory. */`
			`vst1q_u16((uint16_t *)outptr, rgb565_l);`
			`vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h);`
			`#endif`

			`/* Increment pointers. */`
			`inptr0 += 16;`
			`inptr1 += 16;`
			`inptr2 += 16;`
			`outptr += (RGB_PIXELSIZE * 16);`
			`}`

			`if (cols_remaining >= 8) {`
			`uint8x8_t y = vld1_u8(inptr0);`
			`uint8x8_t cb = vld1_u8(inptr1);`
			`uint8x8_t cr = vld1_u8(inptr2);`
			`/* Subtract 128 from Cb and Cr. */`
			`int16x8_t cr_128 =`
			`vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));`
			`int16x8_t cb_128 =`
			`vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));`
			`/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */`
			`int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);`
			`int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);`
			`g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);`
			`g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);`
			`/* Descale G components: shift right 15, round, and narrow to 16-bit. */`
			`int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),`
			`vrshrn_n_s32(g_sub_y_h, 15));`
			`/* Compute R-Y: 1.40200 * (Cr - 128) */`
			`int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),`
			`consts, 2);`
			`/* Compute B-Y: 1.77200 * (Cb - 128) */`
			`int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),`
			`consts, 3);`
			`/* Add Y. */`
			`int16x8_t r =`
			`vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));`
			`int16x8_t b =`
			`vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));`
			`int16x8_t g =`
			`vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));`

			`#if RGB_PIXELSIZE == 4`
			`uint8x8x4_t rgba;`
			`/* Convert each component to unsigned and narrow, clamping to [0-255]. */`
			`rgba.val[RGB_RED] = vqmovun_s16(r);`
			`rgba.val[RGB_GREEN] = vqmovun_s16(g);`
			`rgba.val[RGB_BLUE] = vqmovun_s16(b);`
			`/* Set alpha channel to opaque (0xFF). */`
			`rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);`
			`/* Store RGBA pixel data to memory. */`
			`vst4_u8(outptr, rgba);`
			`#elif RGB_PIXELSIZE == 3`
			`uint8x8x3_t rgb;`
			`/* Convert each component to unsigned and narrow, clamping to [0-255]. */`
			`rgb.val[RGB_RED] = vqmovun_s16(r);`
			`rgb.val[RGB_GREEN] = vqmovun_s16(g);`
			`rgb.val[RGB_BLUE] = vqmovun_s16(b);`
			`/* Store RGB pixel data to memory. */`
			`vst3_u8(outptr, rgb);`
			`#else`
			`/* Pack R, G, and B values in ratio 5:6:5. */`
			`uint16x8_t rgb565 = vqshluq_n_s16(r, 8);`
			`rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);`
			`rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);`
			`/* Store RGB pixel data to memory. */`
			`vst1q_u16((uint16_t *)outptr, rgb565);`
			`#endif`

			`/* Increment pointers. */`
			`inptr0 += 8;`
			`inptr1 += 8;`
			`inptr2 += 8;`
			`outptr += (RGB_PIXELSIZE * 8);`
			`cols_remaining -= 8;`
			`}`

			`/* Handle the tail elements. */`
			`if (cols_remaining > 0) {`
			`uint8x8_t y = vld1_u8(inptr0);`
			`uint8x8_t cb = vld1_u8(inptr1);`
			`uint8x8_t cr = vld1_u8(inptr2);`
			`/* Subtract 128 from Cb and Cr. */`
			`int16x8_t cr_128 =`
			`vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));`
			`int16x8_t cb_128 =`
			`vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));`
			`/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */`
			`int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);`
			`int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);`
			`g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);`
			`g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);`
			`/* Descale G components: shift right 15, round, and narrow to 16-bit. */`
			`int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),`
			`vrshrn_n_s32(g_sub_y_h, 15));`
			`/* Compute R-Y: 1.40200 * (Cr - 128) */`
			`int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),`
			`consts, 2);`
			`/* Compute B-Y: 1.77200 * (Cb - 128) */`
			`int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),`
			`consts, 3);`
			`/* Add Y. */`
			`int16x8_t r =`
			`vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));`
			`int16x8_t b =`
			`vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));`
			`int16x8_t g =`
			`vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));`

			`#if RGB_PIXELSIZE == 4`
			`uint8x8x4_t rgba;`
			`/* Convert each component to unsigned and narrow, clamping to [0-255]. */`
			`rgba.val[RGB_RED] = vqmovun_s16(r);`
			`rgba.val[RGB_GREEN] = vqmovun_s16(g);`
			`rgba.val[RGB_BLUE] = vqmovun_s16(b);`
			`/* Set alpha channel to opaque (0xFF). */`
			`rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);`
			`/* Store RGBA pixel data to memory. */`
			`switch (cols_remaining) {`
			`case 7:`
			`vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);`
			`case 6:`
			`vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);`
			`case 5:`
			`vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);`
			`case 4:`
			`vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);`
			`case 3:`
			`vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);`
			`case 2:`
			`vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);`
			`case 1:`
			`vst4_lane_u8(outptr, rgba, 0);`
			`default:`
			`break;`
			`}`
			`#elif RGB_PIXELSIZE == 3`
			`uint8x8x3_t rgb;`
			`/* Convert each component to unsigned and narrow, clamping to [0-255]. */`
			`rgb.val[RGB_RED] = vqmovun_s16(r);`
			`rgb.val[RGB_GREEN] = vqmovun_s16(g);`
			`rgb.val[RGB_BLUE] = vqmovun_s16(b);`
			`/* Store RGB pixel data to memory. */`
			`switch (cols_remaining) {`
			`case 7:`
			`vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);`
			`case 6:`
			`vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);`
			`case 5:`
			`vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);`
			`case 4:`
			`vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);`
			`case 3:`
			`vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);`
			`case 2:`
			`vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);`
			`case 1:`
			`vst3_lane_u8(outptr, rgb, 0);`
			`default:`
			`break;`
			`}`
			`#else`
			`/* Pack R, G, and B values in ratio 5:6:5. */`
			`uint16x8_t rgb565 = vqshluq_n_s16(r, 8);`
			`rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);`
			`rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);`
			`/* Store RGB565 pixel data to memory. */`
			`switch (cols_remaining) {`
			`case 7:`
			`vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);`
			`case 6:`
			`vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);`
			`case 5:`
			`vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);`
			`case 4:`
			`vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);`
			`case 3:`
			`vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);`
			`case 2:`
			`vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);`
			`case 1:`
			`vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);`
			`default:`
			`break;`
			`}`
			`#endif`
			`}`
			`}`
			`}`