/* Source: mozilla-central/media/libjpeg/simd/arm/jdsample-neon.c */

/*

 * jdsample-neon.c - upsampling (Arm Neon)

 * Copyright (C) 2020, Arm Limited.  All Rights Reserved.

 * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.

 * This software is provided 'as-is', without any express or implied

 * warranty.  In no event will the authors be held liable for any damages

 * arising from the use of this software.

 * Permission is granted to anyone to use this software for any purpose,

 * including commercial applications, and to alter it and redistribute it

 * freely, subject to the following restrictions:

 * 1. The origin of this software must not be misrepresented; you must not

 *    claim that you wrote the original software. If you use this software

 *    in a product, an acknowledgment in the product documentation would be

 *    appreciated but is not required.

 * 2. Altered source versions must be plainly marked as such, and must not be

 *    misrepresented as being the original software.

 * 3. This notice may not be removed or altered from any source distribution.

*/

#define JPEG_INTERNALS

#include "../../jinclude.h"

#include "../../jpeglib.h"

#include "../../jsimd.h"

#include "../../jdct.h"

#include "../../jsimddct.h"

#include "../jsimd.h"

#include <arm_neon.h>

/* The diagram below shows a row of samples produced by h2v1 downsampling.

 *                s0        s1        s2

 *            +---------+---------+---------+

 *            |         |         |         |

 *            | p0   p1 | p2   p3 | p4   p5 |

 *            |         |         |         |

 *            +---------+---------+---------+

 * Samples s0-s2 were created by averaging the original pixel component values

 * centered at positions p0-p5 above.  To approximate those original pixel

 * component values, we proportionally blend the adjacent samples in each row.

 * An upsampled pixel component value is computed by blending the sample

 * containing the pixel center with the nearest neighboring sample, in the

 * ratio 3:1.  For example:

 *     p1(upsampled) = 3/4 * s0 + 1/4 * s1

 *     p2(upsampled) = 3/4 * s1 + 1/4 * s0

 * When computing the first and last pixel component values in the row, there

 * is no adjacent sample to blend, so:

 *     p0(upsampled) = s0

 *     p5(upsampled) = s2

*/

/* Horizontal 2x "fancy" upsampling: every input sample produces two output
 * values, each a 3:1 blend of that sample with its left or right neighbor.
 * The first and last output values of a row are plain copies of the first and
 * last input samples.
 *
 * max_v_samp_factor  number of rows to process (input row i -> output row i)
 * downsampled_width  number of samples in each input row; the last value
 *                    written explicitly is outptr[2 * downsampled_width - 1]
 * input_data         array of input row pointers
 * output_data_ptr    pointer to the array of output row pointers
 *
 * Samples are processed 16 at a time with no scalar tail, so loads and stores
 * can extend past downsampled_width.  This relies on the padding of the
 * sample buffers; see "Creation of 2-D sample arrays" in jmemmgr.c.
 */
void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr;
  int inrow;
  unsigned colctr;
  /* Set up constants. */
  const uint16x8_t one_u16 = vdupq_n_u16(1);
  const uint8x8_t three_u8 = vdup_n_u8(3);

  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
    inptr = input_data[inrow];
    outptr = output_data[inrow];
    /* First pixel component value in this row of the original image */
    *outptr = (JSAMPLE)GETJSAMPLE(*inptr);

    /*    3/4 * containing sample + 1/4 * nearest neighboring sample
     * For p1: containing sample = s0, nearest neighboring sample = s1
     * For p2: containing sample = s1, nearest neighboring sample = s0
     */
    uint8x16_t s0 = vld1q_u8(inptr);
    uint8x16_t s1 = vld1q_u8(inptr + 1);
    /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
     * denote low half and high half respectively.
     */
    uint16x8_t s1_add_3s0_l =
      vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
    uint16x8_t s1_add_3s0_h =
      vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
    uint16x8_t s0_add_3s1_l =
      vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
    uint16x8_t s0_add_3s1_h =
      vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
    /* Add ordered dithering bias.  These sums are later narrowed with a
     * truncating shift (vshrn), so the bias of 1 is added explicitly; the
     * other sums are narrowed with a rounding shift (vrshrn) instead.
     */
    s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
    s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);

    /* The offset is initially 1, because the first pixel component has already
     * been stored.  However, in subsequent iterations of the SIMD loop, this
     * offset is (2 * colctr - 1) to stay within the bounds of the sample
     * buffers without having to resort to a slow scalar tail case for the last
     * (downsampled_width % 16) samples.  See "Creation of 2-D sample arrays"
     * in jmemmgr.c for more details.
     */
    unsigned outptr_offset = 1;
    uint8x16x2_t output_pixels;

    /* We use software pipelining to maximise performance.  The code indented
     * an extra two spaces begins the next iteration of the loop: it loads and
     * blends the next 16 samples while the previous results are narrowed and
     * stored.  The statement order below is therefore significant.
     */
    for (colctr = 16; colctr < downsampled_width; colctr += 16) {

        s0 = vld1q_u8(inptr + colctr - 1);
        s1 = vld1q_u8(inptr + colctr);

      /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
      output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
                                         vrshrn_n_u16(s1_add_3s0_h, 2));
      output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
                                         vshrn_n_u16(s0_add_3s1_h, 2));

        /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
         * denote low half and high half respectively.
         */
        s1_add_3s0_l =
          vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
        s1_add_3s0_h =
          vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
        s0_add_3s1_l =
          vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
        s0_add_3s1_h =
          vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
        /* Add ordered dithering bias (see above). */
        s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
        s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);

      /* Store pixel component values to memory.  vst2q_u8 interleaves val[0]
       * and val[1], so the two blends alternate in the output row.
       */
      vst2q_u8(outptr + outptr_offset, output_pixels);
      outptr_offset = 2 * colctr - 1;
    }

    /* Complete the last iteration of the loop. */

    /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
    output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
                                       vrshrn_n_u16(s1_add_3s0_h, 2));
    output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
                                       vshrn_n_u16(s0_add_3s1_h, 2));
    /* Store pixel component values to memory. */
    vst2q_u8(outptr + outptr_offset, output_pixels);

    /* Last pixel component value in this row of the original image */
    outptr[2 * downsampled_width - 1] =
      GETJSAMPLE(inptr[downsampled_width - 1]);
  }
}

/* The diagram below shows an array of samples produced by h2v2 downsampling.

 *                s0        s1        s2

 *            +---------+---------+---------+

 *            | p0   p1 | p2   p3 | p4   p5 |

 *       sA   |         |         |         |

 *            | p6   p7 | p8   p9 | p10  p11|

 *            +---------+---------+---------+

 *            | p12  p13| p14  p15| p16  p17|

 *       sB   |         |         |         |

 *            | p18  p19| p20  p21| p22  p23|

 *            +---------+---------+---------+

 *            | p24  p25| p26  p27| p28  p29|

 *       sC   |         |         |         |

 *            | p30  p31| p32  p33| p34  p35|

 *            +---------+---------+---------+

 * Samples s0A-s2C were created by averaging the original pixel component

 * values centered at positions p0-p35 above.  To approximate one of those

 * original pixel component values, we proportionally blend the sample

 * containing the pixel center with the nearest neighboring samples in each

 * row, column, and diagonal.

 * An upsampled pixel component value is computed by first blending the sample

 * containing the pixel center with the nearest neighboring samples in the

 * same column, in the ratio 3:1, and then blending each column sum with the

 * nearest neighboring column sum, in the ratio 3:1.  For example:

 *     p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +

 *                      1/4 * (3/4 * s0B + 1/4 * s0A)

 *                    = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A

 * When computing the first and last pixel component values in the row, there

 * is no horizontally adjacent sample to blend, so:

 *     p12(upsampled) = 3/4 * s0B + 1/4 * s0A

 *     p23(upsampled) = 3/4 * s2B + 1/4 * s2C

 * When computing the first and last pixel component values in the column,

 * there is no vertically adjacent sample to blend, so:

 *     p2(upsampled) = 3/4 * s1A + 1/4 * s0A

 *     p33(upsampled) = 3/4 * s1C + 1/4 * s2C

 * When computing the corner pixel component values, there is no adjacent

 * sample to blend, so:

 *     p0(upsampled) = s0A

 *     p35(upsampled) = s2C

*/

/* 2x horizontal and 2x vertical "fancy" upsampling: each input row produces
 * two output rows.  Samples are first blended 3:1 with the row above (upper
 * output row) or the row below (lower output row), and the resulting column
 * sums are then blended 3:1 with the neighboring column sum.
 *
 * max_v_samp_factor  number of output rows to produce; rows are written in
 *                    pairs, so the loop consumes one input row per two
 *                    output rows
 * downsampled_width  number of samples in each input row
 * input_data         array of input row pointers; rows inrow - 1 and
 *                    inrow + 1 are read as well, including input_data[-1] on
 *                    the first pass
 *                    NOTE(review): presumably the caller supplies valid
 *                    context rows above and below -- confirm against caller.
 * output_data_ptr    pointer to the array of output row pointers
 *
 * Samples are processed 16 at a time with no scalar tail; this relies on the
 * padding of the sample buffers (see "Creation of 2-D sample arrays" in
 * jmemmgr.c).
 */
void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
  int inrow, outrow;
  unsigned colctr;
  /* Set up constants. */
  const uint16x8_t seven_u16 = vdupq_n_u16(7);
  const uint8x8_t three_u8 = vdup_n_u8(3);
  const uint16x8_t three_u16 = vdupq_n_u16(3);

  inrow = outrow = 0;
  while (outrow < max_v_samp_factor) {
    /* Row above, current row, and row below. */
    inptr0 = input_data[inrow - 1];
    inptr1 = input_data[inrow];
    inptr2 = input_data[inrow + 1];
    /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
     * respectively.
     */
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

    /* First pixel component value in this row of the original image.  There
     * is no left neighbor, so only the vertical blend applies; the column sum
     * is scaled by 4 so that the same >> 4 can be used, with a bias of 8.
     */
    int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
    *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
    int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
    *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);

    /* Step 1: Blend samples vertically in columns s0 and s1.
     * Leave the divide by 4 until the end, when it can be done for both
     * dimensions at once, right-shifting by 4.
     */

    /* Load and compute s0colsum0 and s0colsum1. */
    uint8x16_t s0A = vld1q_u8(inptr0);
    uint8x16_t s0B = vld1q_u8(inptr1);
    uint8x16_t s0C = vld1q_u8(inptr2);
    /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
     * denote low half and high half respectively.
     */
    uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)),
                                      vget_low_u8(s0B), three_u8);
    uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)),
                                      vget_high_u8(s0B), three_u8);
    uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)),
                                      vget_low_u8(s0B), three_u8);
    uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)),
                                      vget_high_u8(s0B), three_u8);
    /* Load and compute s1colsum0 and s1colsum1. */
    uint8x16_t s1A = vld1q_u8(inptr0 + 1);
    uint8x16_t s1B = vld1q_u8(inptr1 + 1);
    uint8x16_t s1C = vld1q_u8(inptr2 + 1);
    uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)),
                                      vget_low_u8(s1B), three_u8);
    uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)),
                                      vget_high_u8(s1B), three_u8);
    uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)),
                                      vget_low_u8(s1B), three_u8);
    uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)),
                                      vget_high_u8(s1B), three_u8);

    /* Step 2: Blend the already-blended columns. */
    uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
    uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
    uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
    uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
    uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
    uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
    uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
    uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
    /* Add ordered dithering bias to odd pixel values.  The p1 sums are
     * narrowed with a truncating shift below, so the bias of 7 is added
     * explicitly; the p2 sums use a rounding shift instead.
     */
    output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
    output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
    output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
    output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
    /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
    uint8x16x2_t output_pixels0 = { {
      vcombine_u8(vshrn_n_u16(output0_p1_l, 4), vshrn_n_u16(output0_p1_h, 4)),
      vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), vrshrn_n_u16(output0_p2_h, 4))
    } };
    uint8x16x2_t output_pixels1 = { {
      vcombine_u8(vshrn_n_u16(output1_p1_l, 4), vshrn_n_u16(output1_p1_h, 4)),
      vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), vrshrn_n_u16(output1_p2_h, 4))
    } };

    /* Store pixel component values to memory.
     * The minimum size of the output buffer for each row is 64 bytes => no
     * need to worry about buffer overflow here.  See "Creation of 2-D sample
     * arrays" in jmemmgr.c for more details.
     */
    vst2q_u8(outptr0 + 1, output_pixels0);
    vst2q_u8(outptr1 + 1, output_pixels1);

    /* The first pixel of the image shifted our loads and stores by one byte.
     * We have to re-align on a 32-byte boundary at some point before the end
     * of the row (we do it now on the 32/33 pixel boundary) to stay within the
     * bounds of the sample buffers without having to resort to a slow scalar
     * tail case for the last (downsampled_width % 16) samples.  See "Creation
     * of 2-D sample arrays" in jmemmgr.c for more details.
     */
    for (colctr = 16; colctr < downsampled_width; colctr += 16) {
      /* Step 1: Blend samples vertically in columns s0 and s1. */

      /* Load and compute s0colsum0 and s0colsum1. */
      s0A = vld1q_u8(inptr0 + colctr - 1);
      s0B = vld1q_u8(inptr1 + colctr - 1);
      s0C = vld1q_u8(inptr2 + colctr - 1);
      s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), vget_low_u8(s0B),
                             three_u8);
      s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), vget_high_u8(s0B),
                             three_u8);
      s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), vget_low_u8(s0B),
                             three_u8);
      s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), vget_high_u8(s0B),
                             three_u8);
      /* Load and compute s1colsum0 and s1colsum1. */
      s1A = vld1q_u8(inptr0 + colctr);
      s1B = vld1q_u8(inptr1 + colctr);
      s1C = vld1q_u8(inptr2 + colctr);
      s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), vget_low_u8(s1B),
                             three_u8);
      s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), vget_high_u8(s1B),
                             three_u8);
      s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), vget_low_u8(s1B),
                             three_u8);
      s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), vget_high_u8(s1B),
                             three_u8);

      /* Step 2: Blend the already-blended columns. */
      output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
      output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
      output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
      output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
      output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
      output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
      output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
      output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
      /* Add ordered dithering bias to odd pixel values. */
      output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
      output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
      output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
      output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
      /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
      output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
                                          vshrn_n_u16(output0_p1_h, 4));
      output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
                                          vrshrn_n_u16(output0_p2_h, 4));
      output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
                                          vshrn_n_u16(output1_p1_h, 4));
      output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
                                          vrshrn_n_u16(output1_p2_h, 4));
      /* Store pixel component values to memory. */
      vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
      vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
    }

    /* Last pixel component value in this row of the original image.  There is
     * no right neighbor, so only the vertical blend applies (bias of 7).
     */
    int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
                    GETJSAMPLE(inptr0[downsampled_width - 1]);
    outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
    int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
                    GETJSAMPLE(inptr2[downsampled_width - 1]);
    outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
    inrow++;
  }
}

/* The diagram below shows a column of samples produced by h1v2 downsampling

 * (or by losslessly rotating or transposing an h2v1-downsampled image.)

 *            +---------+

 *            |   p0    |

 *     sA     |         |

 *            |   p1    |

 *            +---------+

 *            |   p2    |

 *     sB     |         |

 *            |   p3    |

 *            +---------+

 *            |   p4    |

 *     sC     |         |

 *            |   p5    |

 *            +---------+

 * Samples sA-sC were created by averaging the original pixel component values

 * centered at positions p0-p5 above.  To approximate those original pixel

 * component values, we proportionally blend the adjacent samples in each

 * column.

 * An upsampled pixel component value is computed by blending the sample

 * containing the pixel center with the nearest neighboring sample, in the

 * ratio 3:1.  For example:

 *     p1(upsampled) = 3/4 * sA + 1/4 * sB

 *     p2(upsampled) = 3/4 * sB + 1/4 * sA

 * When computing the first and last pixel component values in the column,

 * there is no adjacent sample to blend, so:

 *     p0(upsampled) = sA

 *     p5(upsampled) = sC

*/

/* Vertical 2x "fancy" upsampling: each input row produces two output rows.
 * The upper output row blends the current row 3:1 with the row above; the
 * lower output row blends the current row 3:1 with the row below.  No
 * horizontal blending is performed.
 *
 * max_v_samp_factor  number of output rows to produce; rows are written in
 *                    pairs, one input row per pair
 * downsampled_width  number of samples in each input (and output) row
 * input_data         array of input row pointers; rows inrow - 1 and
 *                    inrow + 1 are read as well, including input_data[-1] on
 *                    the first pass
 *                    NOTE(review): presumably the caller supplies valid
 *                    context rows above and below -- confirm against caller.
 * output_data_ptr    pointer to the array of output row pointers
 */
void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
  int inrow, outrow;
  unsigned colctr;
  /* Set up constants. */
  const uint16x8_t one_u16 = vdupq_n_u16(1);
  const uint8x8_t three_u8 = vdup_n_u8(3);

  inrow = outrow = 0;
  while (outrow < max_v_samp_factor) {
    /* Row above, current row, and row below. */
    inptr0 = input_data[inrow - 1];
    inptr1 = input_data[inrow];
    inptr2 = input_data[inrow + 1];
    /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
     * respectively.
     */
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];
    inrow++;

    /* The size of the input and output buffers is always a multiple of 32
     * bytes => no need to worry about buffer overflow when reading/writing
     * memory.  See "Creation of 2-D sample arrays" in jmemmgr.c for more
     * details.
     */
    for (colctr = 0; colctr < downsampled_width; colctr += 16) {
      /* Load samples. */
      uint8x16_t sA = vld1q_u8(inptr0 + colctr);
      uint8x16_t sB = vld1q_u8(inptr1 + colctr);
      uint8x16_t sC = vld1q_u8(inptr2 + colctr);
      /* Blend samples vertically.  The widening multiply-accumulate yields
       * 16-bit sums (3 * sB + sA and 3 * sB + sC) for the low and high halves.
       */
      uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
                                      vget_low_u8(sB), three_u8);
      uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
                                      vget_high_u8(sB), three_u8);
      uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
                                      vget_low_u8(sB), three_u8);
      uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
                                      vget_high_u8(sB), three_u8);
      /* Add ordered dithering bias to pixel values in even output rows.
       * These sums are narrowed with a truncating shift, so the bias of 1 is
       * added explicitly; the odd rows use a rounding shift instead.
       */
      colsum0_l = vaddq_u16(colsum0_l, one_u16);
      colsum0_h = vaddq_u16(colsum0_h, one_u16);
      /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
      uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
                                              vshrn_n_u16(colsum0_h, 2));
      uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
                                              vrshrn_n_u16(colsum1_h, 2));
      /* Store pixel component values to memory. */
      vst1q_u8(outptr0 + colctr, output_pixels0);
      vst1q_u8(outptr1 + colctr, output_pixels1);
    }
  }
}

/* The diagram below shows a row of samples produced by h2v1 downsampling.

 *                s0        s1

 *            +---------+---------+

 *            |         |         |

 *            | p0   p1 | p2   p3 |

 *            |         |         |

 *            +---------+---------+

 * Samples s0 and s1 were created by averaging the original pixel component

 * values centered at positions p0-p3 above.  To approximate those original

 * pixel component values, we duplicate the samples horizontally:

 *     p0(upsampled) = p1(upsampled) = s0

 *     p2(upsampled) = p3(upsampled) = s1

*/

/* Horizontal 2x upsampling by replication: every input sample is written
 * twice, side by side, into the corresponding output row.
 *
 * max_v_samp_factor  number of rows to process (input row i -> output row i)
 * output_width       number of values required in each output row; input
 *                    samples are consumed 16 at a time until 2 * colctr
 *                    reaches this width
 * input_data         array of input row pointers
 * output_data_ptr    pointer to the array of output row pointers
 */
void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
                              JSAMPARRAY input_data,
                              JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr;
  int inrow;
  unsigned colctr;

  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
    inptr = input_data[inrow];
    outptr = output_data[inrow];
    /* colctr counts input samples; each block of 16 yields 32 output bytes. */
    for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
      uint8x16_t samples = vld1q_u8(inptr + colctr);
      /* Duplicate the samples.  The store operation below interleaves them so
       * that adjacent pixel component values take on the same sample value,
       * per above.
       */
      uint8x16x2_t output_pixels = { { samples, samples } };
      /* Store pixel component values to memory.
       * Due to the way sample buffers are allocated, we don't need to worry
       * about tail cases when output_width is not a multiple of 32.  See
       * "Creation of 2-D sample arrays" in jmemmgr.c for details.
       */
      vst2q_u8(outptr + 2 * colctr, output_pixels);
    }
  }
}

/* The diagram below shows an array of samples produced by h2v2 downsampling.

 *                s0        s1

 *            +---------+---------+

 *            | p0   p1 | p2   p3 |

 *       sA   |         |         |

 *            | p4   p5 | p6   p7 |

 *            +---------+---------+

 *            | p8   p9 | p10  p11|

 *       sB   |         |         |

 *            | p12  p13| p14  p15|

 *            +---------+---------+

 * Samples s0A-s1B were created by averaging the original pixel component

 * values centered at positions p0-p15 above.  To approximate those original

 * pixel component values, we duplicate the samples both horizontally and

 * vertically:

 *     p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A

 *     p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A

 *     p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B

 *     p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B

*/

/* 2x horizontal and 2x vertical upsampling by replication: every input sample
 * is written twice, side by side, into each of two consecutive output rows.
 *
 * max_v_samp_factor  number of output rows to produce; rows are written in
 *                    pairs, one input row per pair
 * output_width       number of values required in each output row; input
 *                    samples are consumed 16 at a time until 2 * colctr
 *                    reaches this width
 * input_data         array of input row pointers
 * output_data_ptr    pointer to the array of output row pointers
 */
void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
                              JSAMPARRAY input_data,
                              JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr0, outptr1;
  int inrow, outrow;
  unsigned colctr;

  /* outrow advances by two per pass (see the two post-increments below),
   * while inrow advances by one.
   */
  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
    inptr = input_data[inrow];
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

    /* colctr counts input samples; each block of 16 yields 32 output bytes. */
    for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
      uint8x16_t samples = vld1q_u8(inptr + colctr);
      /* Duplicate the samples.  The store operation below interleaves them so
       * that adjacent pixel component values take on the same sample value,
       * per above.
       */
      uint8x16x2_t output_pixels = { { samples, samples } };
      /* Store pixel component values for both output rows to memory.
       * Due to the way sample buffers are allocated, we don't need to worry
       * about tail cases when output_width is not a multiple of 32.  See
       * "Creation of 2-D sample arrays" in jmemmgr.c for details.
       */
      vst2q_u8(outptr0 + 2 * colctr, output_pixels);
      vst2q_u8(outptr1 + 2 * colctr, output_pixels);
    }
  }
}