Blame src/dsp/dec_neon.c

Packit 9c6abc
// Copyright 2012 Google Inc. All Rights Reserved.
Packit 9c6abc
//
Packit 9c6abc
// Use of this source code is governed by a BSD-style license
Packit 9c6abc
// that can be found in the COPYING file in the root of the source
Packit 9c6abc
// tree. An additional intellectual property rights grant can be found
Packit 9c6abc
// in the file PATENTS. All contributing project authors may
Packit 9c6abc
// be found in the AUTHORS file in the root of the source tree.
Packit 9c6abc
// -----------------------------------------------------------------------------
Packit 9c6abc
//
Packit 9c6abc
// ARM NEON version of dsp functions and loop filtering.
Packit 9c6abc
//
Packit 9c6abc
// Authors: Somnath Banerjee (somnath@google.com)
Packit 9c6abc
//          Johann Koenig (johannkoenig@google.com)
Packit 9c6abc
Packit 9c6abc
#include "src/dsp/dsp.h"
Packit 9c6abc
Packit 9c6abc
#if defined(WEBP_USE_NEON)
Packit 9c6abc
Packit 9c6abc
#include "src/dsp/neon.h"
Packit 9c6abc
#include "src/dec/vp8i_dec.h"
Packit 9c6abc
Packit 9c6abc
//------------------------------------------------------------------------------
Packit 9c6abc
// NxM Loading functions
Packit 9c6abc
Packit 9c6abc
#if !defined(WORK_AROUND_GCC)
Packit 9c6abc
Packit 9c6abc
// This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
Packit 9c6abc
// (register alloc, probably). The variants somewhat mitigate the problem, but
Packit 9c6abc
// not quite. HFilter16i() remains problematic.
Packit 9c6abc
// Reads a 4-column x 8-row byte patch whose top-left corner is 'src', with
// consecutive rows 'stride' bytes apart. Row r is fetched with a 4-way
// de-interleaving lane load, so column k of the patch ends up in lane r of
// the returned val[k].
static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src,
                                            int stride) {
  const uint8x8_t kZero = vdup_n_u8(0);
  uint8x8x4_t cols;
  // Start from a fully defined value: each lane load below only replaces one
  // lane of each of the four vectors.
  INIT_VECTOR4(cols, kZero, kZero, kZero, kZero);
#define LOAD_ROW_INTO_LANE(ROW) \
  cols = vld4_lane_u8(src + (ROW) * stride, cols, (ROW))
  LOAD_ROW_INTO_LANE(0);
  LOAD_ROW_INTO_LANE(1);
  LOAD_ROW_INTO_LANE(2);
  LOAD_ROW_INTO_LANE(3);
  LOAD_ROW_INTO_LANE(4);
  LOAD_ROW_INTO_LANE(5);
  LOAD_ROW_INTO_LANE(6);
  LOAD_ROW_INTO_LANE(7);
#undef LOAD_ROW_INTO_LANE
  return cols;
}
Packit 9c6abc
Packit 9c6abc
static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride,
Packit 9c6abc
                                      uint8x16_t* const p1,
Packit 9c6abc
                                      uint8x16_t* const p0,
Packit 9c6abc
                                      uint8x16_t* const q0,
Packit 9c6abc
                                      uint8x16_t* const q1) {
Packit 9c6abc
  // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
Packit 9c6abc
  // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
Packit 9c6abc
  const uint8x8x4_t row0 = Load4x8_NEON(src - 2 + 0 * stride, stride);
Packit 9c6abc
  const uint8x8x4_t row8 = Load4x8_NEON(src - 2 + 8 * stride, stride);
Packit 9c6abc
  *p1 = vcombine_u8(row0.val[0], row8.val[0]);
Packit 9c6abc
  *p0 = vcombine_u8(row0.val[1], row8.val[1]);
Packit 9c6abc
  *q0 = vcombine_u8(row0.val[2], row8.val[2]);
Packit 9c6abc
  *q1 = vcombine_u8(row0.val[3], row8.val[3]);
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
#else  // WORK_AROUND_GCC
Packit 9c6abc
Packit 9c6abc
#define LOADQ_LANE_32b(VALUE, LANE) do {                             \
Packit 9c6abc
  (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE));   \
Packit 9c6abc
  src += stride;                                                     \
Packit 9c6abc
} while (0)
Packit 9c6abc
Packit 9c6abc
static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride,
Packit 9c6abc
                                      uint8x16_t* const p1,
Packit 9c6abc
                                      uint8x16_t* const p0,
Packit 9c6abc
                                      uint8x16_t* const q0,
Packit 9c6abc
                                      uint8x16_t* const q1) {
Packit 9c6abc
  const uint32x4_t zero = vdupq_n_u32(0);
Packit 9c6abc
  uint32x4x4_t in;
Packit 9c6abc
  INIT_VECTOR4(in, zero, zero, zero, zero);
Packit 9c6abc
  src -= 2;
Packit 9c6abc
  LOADQ_LANE_32b(in.val[0], 0);
Packit 9c6abc
  LOADQ_LANE_32b(in.val[1], 0);
Packit 9c6abc
  LOADQ_LANE_32b(in.val[2], 0);
Packit 9c6abc
  LOADQ_LANE_32b(in.val[3], 0);
Packit 9c6abc
  LOADQ_LANE_32b(in.val[0], 1);
Packit 9c6abc
  LOADQ_LANE_32b(in.val[1], 1);
Packit 9c6abc
  LOADQ_LANE_32b(in.val[2], 1);
Packit 9c6abc
  LOADQ_LANE_32b(in.val[3], 1);
Packit 9c6abc
  LOADQ_LANE_32b(in.val[0], 2);
Packit 9c6abc
  LOADQ_LANE_32b(in.val[1], 2);
Packit 9c6abc
  LOADQ_LANE_32b(in.val[2], 2);
Packit 9c6abc
  LOADQ_LANE_32b(in.val[3], 2);
Packit 9c6abc
  LOADQ_LANE_32b(in.val[0], 3);
Packit 9c6abc
  LOADQ_LANE_32b(in.val[1], 3);
Packit 9c6abc
  LOADQ_LANE_32b(in.val[2], 3);
Packit 9c6abc
  LOADQ_LANE_32b(in.val[3], 3);
Packit 9c6abc
  // Transpose four 4x4 parts:
Packit 9c6abc
  {
Packit 9c6abc
    const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
Packit 9c6abc
                                        vreinterpretq_u8_u32(in.val[1]));
Packit 9c6abc
    const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
Packit 9c6abc
                                        vreinterpretq_u8_u32(in.val[3]));
Packit 9c6abc
    const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
Packit 9c6abc
                                         vreinterpretq_u16_u8(row23.val[0]));
Packit 9c6abc
    const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
Packit 9c6abc
                                         vreinterpretq_u16_u8(row23.val[1]));
Packit 9c6abc
    *p1 = vreinterpretq_u8_u16(row02.val[0]);
Packit 9c6abc
    *p0 = vreinterpretq_u8_u16(row13.val[0]);
Packit 9c6abc
    *q0 = vreinterpretq_u8_u16(row02.val[1]);
Packit 9c6abc
    *q1 = vreinterpretq_u8_u16(row13.val[1]);
Packit 9c6abc
  }
Packit 9c6abc
}
Packit 9c6abc
#undef LOADQ_LANE_32b
Packit 9c6abc
Packit 9c6abc
#endif  // !WORK_AROUND_GCC
Packit 9c6abc
Packit 9c6abc
static WEBP_INLINE void Load8x16_NEON(
Packit 9c6abc
    const uint8_t* const src, int stride,
Packit 9c6abc
    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
Packit 9c6abc
    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
Packit 9c6abc
    uint8x16_t* const q2, uint8x16_t* const q3) {
Packit 9c6abc
  Load4x16_NEON(src - 2, stride, p3, p2, p1, p0);
Packit 9c6abc
  Load4x16_NEON(src + 2, stride, q0, q1, q2, q3);
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride,
Packit 9c6abc
                                      uint8x16_t* const p1,
Packit 9c6abc
                                      uint8x16_t* const p0,
Packit 9c6abc
                                      uint8x16_t* const q0,
Packit 9c6abc
                                      uint8x16_t* const q1) {
Packit 9c6abc
  *p1 = vld1q_u8(src - 2 * stride);
Packit 9c6abc
  *p0 = vld1q_u8(src - 1 * stride);
Packit 9c6abc
  *q0 = vld1q_u8(src + 0 * stride);
Packit 9c6abc
  *q1 = vld1q_u8(src + 1 * stride);
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
static WEBP_INLINE void Load16x8_NEON(
Packit 9c6abc
    const uint8_t* const src, int stride,
Packit 9c6abc
    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
Packit 9c6abc
    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
Packit 9c6abc
    uint8x16_t* const q2, uint8x16_t* const q3) {
Packit 9c6abc
  Load16x4_NEON(src - 2  * stride, stride, p3, p2, p1, p0);
Packit 9c6abc
  Load16x4_NEON(src + 2  * stride, stride, q0, q1, q2, q3);
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
static WEBP_INLINE void Load8x8x2_NEON(
Packit 9c6abc
    const uint8_t* const u, const uint8_t* const v, int stride,
Packit 9c6abc
    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
Packit 9c6abc
    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
Packit 9c6abc
    uint8x16_t* const q2, uint8x16_t* const q3) {
Packit 9c6abc
  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
Packit 9c6abc
  // and the v-samples on the higher half.
Packit 9c6abc
  *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
Packit 9c6abc
  *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
Packit 9c6abc
  *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
Packit 9c6abc
  *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
Packit 9c6abc
  *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
Packit 9c6abc
  *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
Packit 9c6abc
  *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
Packit 9c6abc
  *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
#if !defined(WORK_AROUND_GCC)
Packit 9c6abc
Packit 9c6abc
#define LOAD_UV_8(ROW) \
Packit 9c6abc
  vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))
Packit 9c6abc
Packit 9c6abc
static WEBP_INLINE void Load8x8x2T_NEON(
Packit 9c6abc
    const uint8_t* const u, const uint8_t* const v, int stride,
Packit 9c6abc
    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
Packit 9c6abc
    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
Packit 9c6abc
    uint8x16_t* const q2, uint8x16_t* const q3) {
Packit 9c6abc
  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
Packit 9c6abc
  // and the v-samples on the higher half.
Packit 9c6abc
  const uint8x16_t row0 = LOAD_UV_8(0);
Packit 9c6abc
  const uint8x16_t row1 = LOAD_UV_8(1);
Packit 9c6abc
  const uint8x16_t row2 = LOAD_UV_8(2);
Packit 9c6abc
  const uint8x16_t row3 = LOAD_UV_8(3);
Packit 9c6abc
  const uint8x16_t row4 = LOAD_UV_8(4);
Packit 9c6abc
  const uint8x16_t row5 = LOAD_UV_8(5);
Packit 9c6abc
  const uint8x16_t row6 = LOAD_UV_8(6);
Packit 9c6abc
  const uint8x16_t row7 = LOAD_UV_8(7);
Packit 9c6abc
  // Perform two side-by-side 8x8 transposes
Packit 9c6abc
  // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
Packit 9c6abc
  // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
Packit 9c6abc
  // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
Packit 9c6abc
  // u30 u31 u32 u33 u34 u35 u36 u37 | ...
Packit 9c6abc
  // u40 u41 u42 u43 u44 u45 u46 u47 | ...
Packit 9c6abc
  // u50 u51 u52 u53 u54 u55 u56 u57 | ...
Packit 9c6abc
  // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
Packit 9c6abc
  // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
Packit 9c6abc
  const uint8x16x2_t row01 = vtrnq_u8(row0, row1);  // u00 u10 u02 u12 ...
Packit 9c6abc
                                                    // u01 u11 u03 u13 ...
Packit 9c6abc
  const uint8x16x2_t row23 = vtrnq_u8(row2, row3);  // u20 u30 u22 u32 ...
Packit 9c6abc
                                                    // u21 u31 u23 u33 ...
Packit 9c6abc
  const uint8x16x2_t row45 = vtrnq_u8(row4, row5);  // ...
Packit 9c6abc
  const uint8x16x2_t row67 = vtrnq_u8(row6, row7);  // ...
Packit 9c6abc
  const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
Packit 9c6abc
                                       vreinterpretq_u16_u8(row23.val[0]));
Packit 9c6abc
  const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
Packit 9c6abc
                                       vreinterpretq_u16_u8(row23.val[1]));
Packit 9c6abc
  const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
Packit 9c6abc
                                       vreinterpretq_u16_u8(row67.val[0]));
Packit 9c6abc
  const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
Packit 9c6abc
                                       vreinterpretq_u16_u8(row67.val[1]));
Packit 9c6abc
  const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
Packit 9c6abc
                                       vreinterpretq_u32_u16(row46.val[0]));
Packit 9c6abc
  const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
Packit 9c6abc
                                       vreinterpretq_u32_u16(row46.val[1]));
Packit 9c6abc
  const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
Packit 9c6abc
                                       vreinterpretq_u32_u16(row57.val[0]));
Packit 9c6abc
  const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
Packit 9c6abc
                                       vreinterpretq_u32_u16(row57.val[1]));
Packit 9c6abc
  *p3 = vreinterpretq_u8_u32(row04.val[0]);
Packit 9c6abc
  *p2 = vreinterpretq_u8_u32(row15.val[0]);
Packit 9c6abc
  *p1 = vreinterpretq_u8_u32(row26.val[0]);
Packit 9c6abc
  *p0 = vreinterpretq_u8_u32(row37.val[0]);
Packit 9c6abc
  *q0 = vreinterpretq_u8_u32(row04.val[1]);
Packit 9c6abc
  *q1 = vreinterpretq_u8_u32(row15.val[1]);
Packit 9c6abc
  *q2 = vreinterpretq_u8_u32(row26.val[1]);
Packit 9c6abc
  *q3 = vreinterpretq_u8_u32(row37.val[1]);
Packit 9c6abc
}
Packit 9c6abc
#undef LOAD_UV_8
Packit 9c6abc
Packit 9c6abc
#endif  // !WORK_AROUND_GCC
Packit 9c6abc
Packit 9c6abc
// Writes a 2-column x 8-row patch at 'dst' (rows 'stride' bytes apart):
// row r receives the byte pair { v.val[0][r], v.val[1][r] }.
static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t v,
                                      uint8_t* const dst, int stride) {
  uint8_t* row = dst;
  vst2_lane_u8(row, v, 0);
  row += stride;
  vst2_lane_u8(row, v, 1);
  row += stride;
  vst2_lane_u8(row, v, 2);
  row += stride;
  vst2_lane_u8(row, v, 3);
  row += stride;
  vst2_lane_u8(row, v, 4);
  row += stride;
  vst2_lane_u8(row, v, 5);
  row += stride;
  vst2_lane_u8(row, v, 6);
  row += stride;
  vst2_lane_u8(row, v, 7);
}
Packit 9c6abc
Packit 9c6abc
static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0,
Packit 9c6abc
                                       uint8_t* const dst, int stride) {
Packit 9c6abc
  uint8x8x2_t lo, hi;
Packit 9c6abc
  lo.val[0] = vget_low_u8(p0);
Packit 9c6abc
  lo.val[1] = vget_low_u8(q0);
Packit 9c6abc
  hi.val[0] = vget_high_u8(p0);
Packit 9c6abc
  hi.val[1] = vget_high_u8(q0);
Packit 9c6abc
  Store2x8_NEON(lo, dst - 1 + 0 * stride, stride);
Packit 9c6abc
  Store2x8_NEON(hi, dst - 1 + 8 * stride, stride);
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
#if !defined(WORK_AROUND_GCC)
Packit 9c6abc
// Writes a 4-column x 8-row patch at 'dst' (rows 'stride' bytes apart):
// row r receives the four bytes v.val[0][r] .. v.val[3][r].
static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v,
                                      uint8_t* const dst, int stride) {
#define STORE_LANE_TO_ROW(ROW) vst4_lane_u8(dst + (ROW) * stride, v, (ROW))
  STORE_LANE_TO_ROW(0);
  STORE_LANE_TO_ROW(1);
  STORE_LANE_TO_ROW(2);
  STORE_LANE_TO_ROW(3);
  STORE_LANE_TO_ROW(4);
  STORE_LANE_TO_ROW(5);
  STORE_LANE_TO_ROW(6);
  STORE_LANE_TO_ROW(7);
#undef STORE_LANE_TO_ROW
}
Packit 9c6abc
Packit 9c6abc
static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0,
Packit 9c6abc
                                       const uint8x16_t q0, const uint8x16_t q1,
Packit 9c6abc
                                       uint8_t* const dst, int stride) {
Packit 9c6abc
  uint8x8x4_t lo, hi;
Packit 9c6abc
  INIT_VECTOR4(lo,
Packit 9c6abc
               vget_low_u8(p1), vget_low_u8(p0),
Packit 9c6abc
               vget_low_u8(q0), vget_low_u8(q1));
Packit 9c6abc
  INIT_VECTOR4(hi,
Packit 9c6abc
               vget_high_u8(p1), vget_high_u8(p0),
Packit 9c6abc
               vget_high_u8(q0), vget_high_u8(q1));
Packit 9c6abc
  Store4x8_NEON(lo, dst - 2 + 0 * stride, stride);
Packit 9c6abc
  Store4x8_NEON(hi, dst - 2 + 8 * stride, stride);
Packit 9c6abc
}
Packit 9c6abc
#endif  // !WORK_AROUND_GCC
Packit 9c6abc
Packit 9c6abc
// Writes two 16-byte rows around a horizontal edge: 'p0' to the row just
// above 'dst' and 'q0' to the row at 'dst'.
static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0,
                                       uint8_t* const dst, int stride) {
  uint8_t* const above = dst - stride;
  vst1q_u8(above, p0);
  vst1q_u8(dst, q0);
}
Packit 9c6abc
Packit 9c6abc
// Writes four 16-byte rows around a horizontal edge: p1 and p0 to the two
// rows above 'dst', q0 to the row at 'dst' and q1 to the row below it.
static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0,
                                       const uint8x16_t q0, const uint8x16_t q1,
                                       uint8_t* const dst, int stride) {
  uint8_t* const above = dst - stride;
  uint8_t* const below = dst + stride;
  vst1q_u8(above - stride, p1);
  vst1q_u8(above, p0);
  vst1q_u8(below - stride, q0);
  vst1q_u8(below, q1);
}
Packit 9c6abc
Packit 9c6abc
static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0,
Packit 9c6abc
                                        const uint8x16_t q0,
Packit 9c6abc
                                        uint8_t* const u, uint8_t* const v,
Packit 9c6abc
                                        int stride) {
Packit 9c6abc
  // p0 and q0 contain the u+v samples packed in low/high halves.
Packit 9c6abc
  vst1_u8(u - stride, vget_low_u8(p0));
Packit 9c6abc
  vst1_u8(u,          vget_low_u8(q0));
Packit 9c6abc
  vst1_u8(v - stride, vget_high_u8(p0));
Packit 9c6abc
  vst1_u8(v,          vget_high_u8(q0));
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
// Writes four 8-byte rows around a horizontal edge in both chroma planes.
// p1..q1 hold the u-samples in their low half and the v-samples in their high
// half. p1/p0 go to the two rows above each pointer, q0 to the row at the
// pointer and q1 to the row below it.
static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1,
                                        const uint8x16_t p0,
                                        const uint8x16_t q0,
                                        const uint8x16_t q1,
                                        uint8_t* const u, uint8_t* const v,
                                        int stride) {
  uint8_t* const u_above = u - stride;
  uint8_t* const v_above = v - stride;
  uint8_t* const u_below = u + stride;
  uint8_t* const v_below = v + stride;
  // Rows above the edge.
  vst1_u8(u_above - stride, vget_low_u8(p1));
  vst1_u8(u_above,          vget_low_u8(p0));
  vst1_u8(v_above - stride, vget_high_u8(p1));
  vst1_u8(v_above,          vget_high_u8(p0));
  // Rows at and below the edge.
  vst1_u8(u_below - stride, vget_low_u8(q0));
  vst1_u8(u_below,          vget_low_u8(q1));
  vst1_u8(v_below - stride, vget_high_u8(q0));
  vst1_u8(v_below,          vget_high_u8(q1));
}
Packit 9c6abc
Packit 9c6abc
#if !defined(WORK_AROUND_GCC)
Packit 9c6abc
Packit 9c6abc
#define STORE6_LANE(DST, VAL0, VAL1, LANE) do {   \
Packit 9c6abc
  vst3_lane_u8((DST) - 3, (VAL0), (LANE));        \
Packit 9c6abc
  vst3_lane_u8((DST) + 0, (VAL1), (LANE));        \
Packit 9c6abc
  (DST) += stride;                                \
Packit 9c6abc
} while (0)
Packit 9c6abc
Packit 9c6abc
static WEBP_INLINE void Store6x8x2_NEON(
Packit 9c6abc
    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
Packit 9c6abc
    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
Packit 9c6abc
    uint8_t* u, uint8_t* v, int stride) {
Packit 9c6abc
  uint8x8x3_t u0, u1, v0, v1;
Packit 9c6abc
  INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
Packit 9c6abc
  INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
Packit 9c6abc
  INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
Packit 9c6abc
  INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
Packit 9c6abc
  STORE6_LANE(u, u0, u1, 0);
Packit 9c6abc
  STORE6_LANE(u, u0, u1, 1);
Packit 9c6abc
  STORE6_LANE(u, u0, u1, 2);
Packit 9c6abc
  STORE6_LANE(u, u0, u1, 3);
Packit 9c6abc
  STORE6_LANE(u, u0, u1, 4);
Packit 9c6abc
  STORE6_LANE(u, u0, u1, 5);
Packit 9c6abc
  STORE6_LANE(u, u0, u1, 6);
Packit 9c6abc
  STORE6_LANE(u, u0, u1, 7);
Packit 9c6abc
  STORE6_LANE(v, v0, v1, 0);
Packit 9c6abc
  STORE6_LANE(v, v0, v1, 1);
Packit 9c6abc
  STORE6_LANE(v, v0, v1, 2);
Packit 9c6abc
  STORE6_LANE(v, v0, v1, 3);
Packit 9c6abc
  STORE6_LANE(v, v0, v1, 4);
Packit 9c6abc
  STORE6_LANE(v, v0, v1, 5);
Packit 9c6abc
  STORE6_LANE(v, v0, v1, 6);
Packit 9c6abc
  STORE6_LANE(v, v0, v1, 7);
Packit 9c6abc
}
Packit 9c6abc
#undef STORE6_LANE
Packit 9c6abc
Packit 9c6abc
static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1,
Packit 9c6abc
                                        const uint8x16_t p0,
Packit 9c6abc
                                        const uint8x16_t q0,
Packit 9c6abc
                                        const uint8x16_t q1,
Packit 9c6abc
                                        uint8_t* const u, uint8_t* const v,
Packit 9c6abc
                                        int stride) {
Packit 9c6abc
  uint8x8x4_t u0, v0;
Packit 9c6abc
  INIT_VECTOR4(u0,
Packit 9c6abc
               vget_low_u8(p1), vget_low_u8(p0),
Packit 9c6abc
               vget_low_u8(q0), vget_low_u8(q1));
Packit 9c6abc
  INIT_VECTOR4(v0,
Packit 9c6abc
               vget_high_u8(p1), vget_high_u8(p0),
Packit 9c6abc
               vget_high_u8(q0), vget_high_u8(q1));
Packit 9c6abc
  vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
Packit 9c6abc
  vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
Packit 9c6abc
  vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
Packit 9c6abc
  vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
Packit 9c6abc
  vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
Packit 9c6abc
  vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
Packit 9c6abc
  vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
Packit 9c6abc
  vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
Packit 9c6abc
  vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
Packit 9c6abc
  vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
Packit 9c6abc
  vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
Packit 9c6abc
  vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
Packit 9c6abc
  vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
Packit 9c6abc
  vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
Packit 9c6abc
  vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
Packit 9c6abc
  vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
#endif  // !WORK_AROUND_GCC
Packit 9c6abc
Packit 9c6abc
// Zero extend 'v' to an int16x8_t.
Packit 9c6abc
static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) {
Packit 9c6abc
  return vreinterpretq_s16_u16(vmovl_u8(v));
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
Packit 9c6abc
// to the corresponding rows of 'dst'.
Packit 9c6abc
static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
Packit 9c6abc
                                                 const int16x8_t dst01,
Packit 9c6abc
                                                 const int16x8_t dst23) {
Packit 9c6abc
  // Unsigned saturate to 8b.
Packit 9c6abc
  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
Packit 9c6abc
  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
Packit 9c6abc
Packit 9c6abc
  // Store the results.
Packit 9c6abc
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
Packit 9c6abc
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
Packit 9c6abc
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
Packit 9c6abc
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
Packit 9c6abc
                                    const int16x8_t row23,
Packit 9c6abc
                                    uint8_t* const dst) {
Packit 9c6abc
  uint32x2_t dst01 = vdup_n_u32(0);
Packit 9c6abc
  uint32x2_t dst23 = vdup_n_u32(0);
Packit 9c6abc
Packit 9c6abc
  // Load the source pixels.
Packit 9c6abc
  dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
Packit 9c6abc
  dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
Packit 9c6abc
  dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
Packit 9c6abc
  dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);
Packit 9c6abc
Packit 9c6abc
  {
Packit 9c6abc
    // Convert to 16b.
Packit 9c6abc
    const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01));
Packit 9c6abc
    const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23));
Packit 9c6abc
Packit 9c6abc
    // Descale with rounding.
Packit 9c6abc
    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
Packit 9c6abc
    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
Packit 9c6abc
    // Add the inverse transform.
Packit 9c6abc
    SaturateAndStore4x4_NEON(dst, out01, out23);
Packit 9c6abc
  }
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
//-----------------------------------------------------------------------------
Packit 9c6abc
// Simple In-loop filtering (Paragraph 15.2)
Packit 9c6abc
Packit 9c6abc
// Returns a per-lane mask that is all-ones where
//   2 * |p0 - q0| + |p1 - q1| / 2 <= thresh
// (sums computed with unsigned 8-bit saturation) and all-zeros elsewhere.
// 'thresh' is truncated to 8 bits.
static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0,
                                   const uint8x16_t q0, const uint8x16_t q1,
                                   int thresh) {
  const uint8x16_t limit = vdupq_n_u8((uint8_t)thresh);
  const uint8x16_t inner_diff = vabdq_u8(p0, q0);   // |p0 - q0|
  const uint8x16_t outer_diff = vabdq_u8(p1, q1);   // |p1 - q1|
  const uint8x16_t inner_x2 = vqaddq_u8(inner_diff, inner_diff);
  const uint8x16_t outer_half = vshrq_n_u8(outer_diff, 1);
  const uint8x16_t activity = vqaddq_u8(inner_x2, outer_half);
  return vcgeq_u8(limit, activity);
}
Packit 9c6abc
Packit 9c6abc
// Toggles the top bit (0x80) of every byte of 'v' and returns the result
// reinterpreted as signed bytes, i.e. maps [0, 255] onto [-128, 127].
static int8x16_t FlipSign_NEON(const uint8x16_t v) {
  return vreinterpretq_s8_u8(veorq_u8(v, vdupq_n_u8(0x80)));
}
Packit 9c6abc
Packit 9c6abc
// Inverse of FlipSign_NEON(): toggles the top bit (0x80) of every byte of 'v'
// and returns the result as unsigned bytes, i.e. maps [-128, 127] back onto
// [0, 255].
static uint8x16_t FlipSignBack_NEON(const int8x16_t v) {
  // The XOR is done on the unsigned view: 0x80 (128) is not representable in
  // int8_t, so building the constant with vdupq_n_s8(0x80) would rely on an
  // implementation-defined narrowing conversion. The resulting bits are the
  // same either way.
  const uint8x16_t sign_bit = vdupq_n_u8(0x80);
  return veorq_u8(vreinterpretq_u8_s8(v), sign_bit);
}
Packit 9c6abc
Packit 9c6abc
static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0,
Packit 9c6abc
                                   const int8x16_t q0, const int8x16_t q1) {
Packit 9c6abc
  const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
Packit 9c6abc
  const int8x16_t p1_q1 = vqsubq_s8(p1, q1);      // (p1-q1)
Packit 9c6abc
  const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0);   // (p1-q1) + 1 * (q0 - p0)
Packit 9c6abc
  const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // (p1-q1) + 2 * (q0 - p0)
Packit 9c6abc
  const int8x16_t s3 = vqaddq_s8(q0_p0, s2);      // (p1-q1) + 3 * (q0 - p0)
Packit 9c6abc
  return s3;
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) {
Packit 9c6abc
  const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
Packit 9c6abc
  const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0);   // 2 * (q0 - p0)
Packit 9c6abc
  const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // 3 * (q0 - p0)
Packit 9c6abc
  return s2;
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
//------------------------------------------------------------------------------
Packit 9c6abc
Packit 9c6abc
static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s,
Packit 9c6abc
                                    const int8x16_t delta,
Packit 9c6abc
                                    int8x16_t* const op0,
Packit 9c6abc
                                    int8x16_t* const oq0) {
Packit 9c6abc
  const int8x16_t kCst3 = vdupq_n_s8(0x03);
Packit 9c6abc
  const int8x16_t kCst4 = vdupq_n_s8(0x04);
Packit 9c6abc
  const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
Packit 9c6abc
  const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
Packit 9c6abc
  const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
Packit 9c6abc
  const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
Packit 9c6abc
  *op0 = vqaddq_s8(p0s, delta3);
Packit 9c6abc
  *oq0 = vqsubq_s8(q0s, delta4);
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
#if defined(WEBP_USE_INTRINSICS)
Packit 9c6abc
Packit 9c6abc
// Same adjustment as ApplyFilter2NoFlip_NEON():
//   p0s + ((delta + 3) >> 3)   and   q0s - ((delta + 4) >> 3)
// but the two results are converted back to unsigned pixels before being
// written to *op0 / *oq0.
static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s,
                              const int8x16_t delta,
                              uint8x16_t* const op0, uint8x16_t* const oq0) {
  int8x16_t filtered_p0, filtered_q0;
  ApplyFilter2NoFlip_NEON(p0s, q0s, delta, &filtered_p0, &filtered_q0);
  *op0 = FlipSignBack_NEON(filtered_p0);
  *oq0 = FlipSignBack_NEON(filtered_q0);
}
Packit 9c6abc
Packit 9c6abc
// Runs the simple two-pixel filter on 16 lanes at once. The four input pixels
// are moved to the signed domain, the base delta (p1 - q1) + 3 * (q0 - p0) is
// computed and zeroed in lanes where 'mask' is clear, and the filtered p0 / q0
// are returned as unsigned pixels in *op0 / *oq0.
static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0,
                           const uint8x16_t q0, const uint8x16_t q1,
                           const uint8x16_t mask,
                           uint8x16_t* const op0, uint8x16_t* const oq0) {
  const int8x16_t sp1 = FlipSign_NEON(p1);
  const int8x16_t sp0 = FlipSign_NEON(p0);
  const int8x16_t sq0 = FlipSign_NEON(q0);
  const int8x16_t sq1 = FlipSign_NEON(q1);
  const int8x16_t raw_delta = GetBaseDelta_NEON(sp1, sp0, sq0, sq1);
  // Lanes that must not be filtered get a zero delta.
  const int8x16_t gated_delta =
      vandq_s8(raw_delta, vreinterpretq_s8_u8(mask));
  ApplyFilter2_NEON(sp0, sq0, gated_delta, op0, oq0);
}
Packit 9c6abc
Packit 9c6abc
static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
Packit 9c6abc
  uint8x16_t p1, p0, q0, q1, op0, oq0;
Packit 9c6abc
  Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1;;
Packit 9c6abc
  {
Packit 9c6abc
    const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
Packit 9c6abc
    DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
Packit 9c6abc
  }
Packit 9c6abc
  Store16x2_NEON(op0, oq0, p, stride);
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
Packit 9c6abc
  uint8x16_t p1, p0, q0, q1, oq0, op0;
Packit 9c6abc
  Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1;;
Packit 9c6abc
  {
Packit 9c6abc
    const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
Packit 9c6abc
    DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
Packit 9c6abc
  }
Packit 9c6abc
  Store2x16_NEON(op0, oq0, p, stride);
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
#else
Packit 9c6abc
Packit 9c6abc
// Load/Store vertical edge
Packit 9c6abc
// LOAD8x4: expands to the text of eight 'vld4.8' single-lane load
// instructions, one for each lane 0..7 of the four registers c1..c4.
// Even lanes use 'b1' as the address operand and odd lanes use 'b2'; every
// instruction gets 'stride' as its trailing operand. All arguments are
// stringified, so they must be spelled exactly as they should appear in the
// assembly.
// NOTE(review): presumably b1/b2 are "[reg]" operands that the instruction
// post-increments by the 'stride' register -- confirm at the asm call sites.
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
Packit 9c6abc
  "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
Packit 9c6abc
  "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
Packit 9c6abc
  "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
Packit 9c6abc
  "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
Packit 9c6abc
  "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
Packit 9c6abc
  "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
Packit 9c6abc
  "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
Packit 9c6abc
  "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"
Packit 9c6abc
Packit 9c6abc
// STORE8x2: expands to the text of eight 'vst2.8' single-lane store
// instructions, one for each lane 0..7 of the register pair c1/c2. Every
// instruction uses 'p' as the address operand and 'stride' as its trailing
// operand. All arguments are stringified, so they must be spelled exactly as
// they should appear in the assembly.
// NOTE(review): presumably 'p' is a "[reg]" operand post-incremented by the
// 'stride' register after each store -- confirm at the asm call sites.
#define STORE8x2(c1, c2, p, stride)                                            \
Packit 9c6abc
  "vst2.8   {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n"                    \
Packit 9c6abc
  "vst2.8   {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n"                    \
Packit 9c6abc
  "vst2.8   {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n"                    \
Packit 9c6abc
  "vst2.8   {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n"                    \
Packit 9c6abc
  "vst2.8   {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n"                    \
Packit 9c6abc
  "vst2.8   {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n"                    \
Packit 9c6abc
  "vst2.8   {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n"                    \
Packit 9c6abc
  "vst2.8   {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"
Packit 9c6abc
Packit 9c6abc
// Clobber list shared by the inline-assembly filters below: the NEON
// registers q0-q3 and q8-q15. q4-q7 are not listed (presumably because the
// assembly never touches them -- confirm before using them in new code).
#define QRegs "q0", "q1", "q2", "q3",                                          \
              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
Packit 9c6abc
Packit 9c6abc
// XORs registers a and b in place with s. DO_FILTER2 loads s with 0x80 in
// every byte, so this toggles the top bit of each byte and converts pixels
// between their unsigned and signed representations.
// The last line carries no trailing backslash so that the macro ends here
// and cannot absorb whatever line follows it.
#define FLIP_SIGN_BIT2(a, b, s)                                                \
  "veor     " #a "," #a "," #s "               \n"                             \
  "veor     " #b "," #b "," #s "               \n"
Packit 9c6abc
Packit 9c6abc
// Same as FLIP_SIGN_BIT2, applied to four registers: a, b, c and d are each
// XORed in place with s.
// The last line carries no trailing backslash so that the macro ends here.
#define FLIP_SIGN_BIT4(a, b, c, d, s)                                          \
  FLIP_SIGN_BIT2(a, b, s)                                                      \
  FLIP_SIGN_BIT2(c, d, s)
Packit 9c6abc
Packit 9c6abc
// Computes the per-byte mask of the simple loop filter into 'mask':
//   2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh
// The doubling and the sum use unsigned saturating adds. 'thresh' is
// broadcast with vdup.8, so it must be a core-register operand (callers pass
// %[thresh]). q14 and q15 are overwritten as scratch registers.
#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask)                             \
  "vabd.u8    q15," #p0 "," #q0 "         \n"  /* abs(p0 - q0) */              \
  "vabd.u8    q14," #p1 "," #q1 "         \n"  /* abs(p1 - q1) */              \
  "vqadd.u8   q15, q15, q15               \n"  /* abs(p0 - q0) * 2 */          \
  "vshr.u8    q14, q14, #1                \n"  /* abs(p1 - q1) / 2 */          \
  "vqadd.u8   q15, q15, q14     \n"  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
  "vdup.8     q14, " #thresh "            \n"                                  \
  "vcge.u8   " #mask ", q14, q15          \n"  /* mask <= thresh */
Packit 9c6abc
Packit 9c6abc
// Stores (p1 - q1) + 3 * (q0 - p0) into o. Every subtraction and addition is
// a signed saturating operation, so the inputs are expected in signed form
// (DO_FILTER2 flips the sign bit before expanding this macro).
// q15 is overwritten with (q0 - p0).
#define GET_BASE_DELTA(p1, p0, q0, q1, o)                                      \
  "vqsub.s8   q15," #q0 "," #p0 "         \n"  /* (q0 - p0) */                 \
  "vqsub.s8  " #o "," #p1 "," #q1 "       \n"  /* (p1 - q1) */                 \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 1 * (q0 - p0) */ \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 2 * (q0 - p0) */ \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 3 * (q0 - p0) */
Packit 9c6abc
Packit 9c6abc
// Applies the filter value fl to the signed pixel registers p0 and q0, in
// place:
//   p0 += (fl + 3) >> 3
//   q0 -= (fl + 4) >> 3
// The additions and the final add/sub saturate; the shifts are arithmetic.
// q15 is overwritten as a scratch register.
#define DO_SIMPLE_FILTER(p0, q0, fl)                                           \
  "vmov.i8    q15, #0x03                  \n"                                  \
  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 3 */      \
  "vshr.s8    q15, q15, #3                \n"  /* filter1 >> 3 */              \
  "vqadd.s8  " #p0 "," #p0 ", q15         \n"  /* p0 += filter1 */             \
                                                                               \
  "vmov.i8    q15, #0x04                  \n"                                  \
  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter2 = filter + 4 */      \
  "vshr.s8    q15, q15, #3                \n"  /* filter2 >> 3 */              \
  "vqsub.s8  " #q0 "," #q0 ", q15         \n"  /* q0 -= filter2 */
Packit 9c6abc
Packit 9c6abc
// Applies filter on 2 pixels (p0 and q0).
// Steps: build the filter mask in q9, flip p1/p0/q0/q1 to signed form using
// the 0x80 constant held in q10, compute the base delta in q11, mask it,
// apply it to p0/q0, and finally flip p0/q0 back to unsigned form.
// Overwrites q9, q10, q11, q14 and q15. p1 and q1 are left in signed form.
#define DO_FILTER2(p1, p0, q0, q1, thresh)                                     \
  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9)     /* filter mask in q9 */         \
  "vmov.i8    q10, #0x80                  \n"  /* sign bit */                  \
  FLIP_SIGN_BIT4(p1, p0, q0, q1, q10)          /* convert to signed value */   \
  GET_BASE_DELTA(p1, p0, q0, q1, q11)          /* get filter level  */         \
  "vand       q9, q9, q11                 \n"  /* apply filter mask */         \
  DO_SIMPLE_FILTER(p0, q0, q9)                 /* apply filter */              \
  FLIP_SIGN_BIT2(p0, q0, q10)
Packit 9c6abc
Packit 9c6abc
static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
Packit 9c6abc
  __asm__ volatile (
Packit 9c6abc
    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
Packit 9c6abc
Packit 9c6abc
    "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
Packit 9c6abc
    "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
Packit 9c6abc
    "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
Packit 9c6abc
    "vld1.u8    {q12}, [%[p]]                  \n"  // q1
Packit 9c6abc
Packit 9c6abc
    DO_FILTER2(q1, q2, q3, q12, %[thresh])
Packit 9c6abc
Packit 9c6abc
    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
Packit 9c6abc
Packit 9c6abc
    "vst1.u8    {q2}, [%[p]], %[stride]        \n"  // store op0
Packit 9c6abc
    "vst1.u8    {q3}, [%[p]]                   \n"  // store oq0
Packit 9c6abc
    : [p] "+r"(p)
Packit 9c6abc
    : [stride] "r"(stride), [thresh] "r"(thresh)
Packit 9c6abc
    : "memory", QRegs
Packit 9c6abc
  );
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
Packit 9c6abc
  __asm__ volatile (
Packit 9c6abc
    "sub        r4, %[p], #2                   \n"  // base1 = p - 2
Packit 9c6abc
    "lsl        r6, %[stride], #1              \n"  // r6 = 2 * stride
Packit 9c6abc
    "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride
Packit 9c6abc
Packit 9c6abc
    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
Packit 9c6abc
    LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
Packit 9c6abc
    "vswp       d3, d24                        \n"  // p1:q1 p0:q3
Packit 9c6abc
    "vswp       d5, d26                        \n"  // q0:q2 q1:q4
Packit 9c6abc
    "vswp       q2, q12                        \n"  // p1:q1 p0:q2 q0:q3 q1:q4
Packit 9c6abc
Packit 9c6abc
    DO_FILTER2(q1, q2, q12, q13, %[thresh])
Packit 9c6abc
Packit 9c6abc
    "sub        %[p], %[p], #1                 \n"  // p - 1
Packit 9c6abc
Packit 9c6abc
    "vswp        d5, d24                       \n"
Packit 9c6abc
    STORE8x2(d4, d5, [%[p]], %[stride])
Packit 9c6abc
    STORE8x2(d24, d25, [%[p]], %[stride])
Packit 9c6abc
Packit 9c6abc
    : [p] "+r"(p)
Packit 9c6abc
    : [stride] "r"(stride), [thresh] "r"(thresh)
Packit 9c6abc
    : "memory", "r4", "r5", "r6", QRegs
Packit 9c6abc
  );
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
#undef LOAD8x4
Packit 9c6abc
#undef STORE8x2
Packit 9c6abc
Packit 9c6abc
#endif    // WEBP_USE_INTRINSICS
Packit 9c6abc
Packit 9c6abc
// Runs SimpleVFilter16_NEON on three horizontal edges, located 4, 8 and 12
// rows below p.
static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) {
  uint32_t k;
  for (k = 3; k != 0; --k) {
    p += 4 * stride;
    SimpleVFilter16_NEON(p, stride, thresh);
  }
}
Packit 9c6abc
Packit 9c6abc
// Runs SimpleHFilter16_NEON on three vertical edges, located 4, 8 and 12
// pixels to the right of p.
static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) {
  uint32_t k;
  for (k = 3; k != 0; --k) {
    p += 4;
    SimpleHFilter16_NEON(p, stride, thresh);
  }
}
Packit 9c6abc
Packit 9c6abc
//------------------------------------------------------------------------------
Packit 9c6abc
// Complex In-loop filtering (Paragraph 15.3)
Packit 9c6abc
Packit 9c6abc
// Returns the "high edge variance" mask: a lane is all ones when
//   max(abs(p1 - p0), abs(q1 - q0)) > hev_thresh
// and zero otherwise. hev_thresh is converted to uint8_t before being
// broadcast to every lane.
static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0,
                                const uint8x16_t q0, const uint8x16_t q1,
                                int hev_thresh) {
  const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
  const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
  const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
  const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0);
  const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v);
  return mask;
}
Packit 9c6abc
Packit 9c6abc
// Returns the filter mask of the complex loop filter. A lane is all ones
// when both conditions hold:
//  - each of the six neighbouring differences abs(p3 - p2), abs(p2 - p1),
//    abs(p1 - p0), abs(q3 - q2), abs(q2 - q1), abs(q1 - q0) is <= ithresh;
//  - the lane is set in NeedsFilter_NEON(p1, p0, q0, q1, thresh).
// ithresh is converted to uint8_t before being broadcast to every lane.
static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2,
                                    const uint8x16_t p1, const uint8x16_t p0,
                                    const uint8x16_t q0, const uint8x16_t q1,
                                    const uint8x16_t q2, const uint8x16_t q3,
                                    int ithresh, int thresh) {
  const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
  const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2);  // abs(p3 - p2)
  const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1);  // abs(p2 - p1)
  const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
  const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2);  // abs(q3 - q2)
  const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1);  // abs(q2 - q1)
  const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
  // Reduce the six differences to their per-lane maximum.
  const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
  const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
  const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
  const uint8x16_t max12 = vmaxq_u8(max1, max2);
  const uint8x16_t max123 = vmaxq_u8(max12, max3);
  const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
  const uint8x16_t mask1 = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
  const uint8x16_t mask = vandq_u8(mask1, mask2);
  return mask;
}
Packit 9c6abc
Packit 9c6abc
//  4-points filter
Packit 9c6abc
Packit 9c6abc
static void ApplyFilter4_NEON(
Packit 9c6abc
    const int8x16_t p1, const int8x16_t p0,
Packit 9c6abc
    const int8x16_t q0, const int8x16_t q1,
Packit 9c6abc
    const int8x16_t delta0,
Packit 9c6abc
    uint8x16_t* const op1, uint8x16_t* const op0,
Packit 9c6abc
    uint8x16_t* const oq0, uint8x16_t* const oq1) {
Packit 9c6abc
  const int8x16_t kCst3 = vdupq_n_s8(0x03);
Packit 9c6abc
  const int8x16_t kCst4 = vdupq_n_s8(0x04);
Packit 9c6abc
  const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);
Packit 9c6abc
  const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);
Packit 9c6abc
  const int8x16_t a1 = vshrq_n_s8(delta1, 3);
Packit 9c6abc
  const int8x16_t a2 = vshrq_n_s8(delta2, 3);
Packit 9c6abc
  const int8x16_t a3 = vrshrq_n_s8(a1, 1);   // a3 = (a1 + 1) >> 1
Packit 9c6abc
  *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a2));  // clip(p0 + a2)
Packit 9c6abc
  *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));  // clip(q0 - a1)
Packit 9c6abc
  *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a3));  // clip(p1 + a3)
Packit 9c6abc
  *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a3));  // clip(q1 - a3)
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
static void DoFilter4_NEON(
Packit 9c6abc
    const uint8x16_t p1, const uint8x16_t p0,
Packit 9c6abc
    const uint8x16_t q0, const uint8x16_t q1,
Packit 9c6abc
    const uint8x16_t mask, const uint8x16_t hev_mask,
Packit 9c6abc
    uint8x16_t* const op1, uint8x16_t* const op0,
Packit 9c6abc
    uint8x16_t* const oq0, uint8x16_t* const oq1) {
Packit 9c6abc
  // This is a fused version of DoFilter2() calling ApplyFilter2 directly
Packit 9c6abc
  const int8x16_t p1s = FlipSign_NEON(p1);
Packit 9c6abc
  int8x16_t p0s = FlipSign_NEON(p0);
Packit 9c6abc
  int8x16_t q0s = FlipSign_NEON(q0);
Packit 9c6abc
  const int8x16_t q1s = FlipSign_NEON(q1);
Packit 9c6abc
  const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
Packit 9c6abc
Packit 9c6abc
  // do_filter2 part (simple loopfilter on pixels with hev)
Packit 9c6abc
  {
Packit 9c6abc
    const int8x16_t delta = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
Packit 9c6abc
    const int8x16_t simple_lf_delta =
Packit 9c6abc
        vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
Packit 9c6abc
    ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
Packit 9c6abc
  }
Packit 9c6abc
Packit 9c6abc
  // do_filter4 part (complex loopfilter on pixels without hev)
Packit 9c6abc
  {
Packit 9c6abc
    const int8x16_t delta0 = GetBaseDelta0_NEON(p0s, q0s);
Packit 9c6abc
    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
Packit 9c6abc
    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
Packit 9c6abc
    const int8x16_t complex_lf_delta =
Packit 9c6abc
        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
Packit 9c6abc
    ApplyFilter4_NEON(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
Packit 9c6abc
  }
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
//  6-points filter
Packit 9c6abc
Packit 9c6abc
static void ApplyFilter6_NEON(
Packit 9c6abc
    const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
Packit 9c6abc
    const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
Packit 9c6abc
    const int8x16_t delta,
Packit 9c6abc
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
Packit 9c6abc
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
Packit 9c6abc
  // We have to compute: X = (9*a+63) >> 7, Y = (18*a+63)>>7, Z = (27*a+63) >> 7
Packit 9c6abc
  // Turns out, there's a common sub-expression S=9 * a - 1 that can be used
Packit 9c6abc
  // with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction:
Packit 9c6abc
  //   X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7
Packit 9c6abc
  const int8x8_t delta_lo = vget_low_s8(delta);
Packit 9c6abc
  const int8x8_t delta_hi = vget_high_s8(delta);
Packit 9c6abc
  const int8x8_t kCst9 = vdup_n_s8(9);
Packit 9c6abc
  const int16x8_t kCstm1 = vdupq_n_s16(-1);
Packit 9c6abc
  const int8x8_t kCst18 = vdup_n_s8(18);
Packit 9c6abc
  const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo);  // S = 9 * a - 1
Packit 9c6abc
  const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi);
Packit 9c6abc
  const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo);   // S + 18 * a
Packit 9c6abc
  const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi);
Packit 9c6abc
  const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7);   // (9 * a + 63) >> 7
Packit 9c6abc
  const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7);
Packit 9c6abc
  const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6);   // (9 * a + 31) >> 6
Packit 9c6abc
  const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6);
Packit 9c6abc
  const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7);   // (27 * a + 63) >> 7
Packit 9c6abc
  const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7);
Packit 9c6abc
  const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
Packit 9c6abc
  const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
Packit 9c6abc
  const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);
Packit 9c6abc
Packit 9c6abc
  *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a1));  // clip(p0 + a1)
Packit 9c6abc
  *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));  // clip(q0 - q1)
Packit 9c6abc
  *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a2));  // clip(q1 - a2)
Packit 9c6abc
  *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a2));  // clip(p1 + a2)
Packit 9c6abc
  *oq2 = FlipSignBack_NEON(vqsubq_s8(q2, a3));  // clip(q2 - a3)
Packit 9c6abc
  *op2 = FlipSignBack_NEON(vqaddq_s8(p2, a3));  // clip(p2 + a3)
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
static void DoFilter6_NEON(
Packit 9c6abc
    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
Packit 9c6abc
    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
Packit 9c6abc
    const uint8x16_t mask, const uint8x16_t hev_mask,
Packit 9c6abc
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
Packit 9c6abc
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
Packit 9c6abc
  // This is a fused version of DoFilter2() calling ApplyFilter2 directly
Packit 9c6abc
  const int8x16_t p2s = FlipSign_NEON(p2);
Packit 9c6abc
  const int8x16_t p1s = FlipSign_NEON(p1);
Packit 9c6abc
  int8x16_t p0s = FlipSign_NEON(p0);
Packit 9c6abc
  int8x16_t q0s = FlipSign_NEON(q0);
Packit 9c6abc
  const int8x16_t q1s = FlipSign_NEON(q1);
Packit 9c6abc
  const int8x16_t q2s = FlipSign_NEON(q2);
Packit 9c6abc
  const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
Packit 9c6abc
  const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
Packit 9c6abc
Packit 9c6abc
  // do_filter2 part (simple loopfilter on pixels with hev)
Packit 9c6abc
  {
Packit 9c6abc
    const int8x16_t simple_lf_delta =
Packit 9c6abc
        vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
Packit 9c6abc
    ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
Packit 9c6abc
  }
Packit 9c6abc
Packit 9c6abc
  // do_filter6 part (complex loopfilter on pixels without hev)
Packit 9c6abc
  {
Packit 9c6abc
    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
Packit 9c6abc
    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
Packit 9c6abc
    const int8x16_t complex_lf_delta =
Packit 9c6abc
        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
Packit 9c6abc
    ApplyFilter6_NEON(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
Packit 9c6abc
                      op2, op1, op0, oq0, oq1, oq2);
Packit 9c6abc
  }
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
// on macroblock edges
Packit 9c6abc
Packit 9c6abc
static void VFilter16_NEON(uint8_t* p, int stride,
Packit 9c6abc
                           int thresh, int ithresh, int hev_thresh) {
Packit 9c6abc
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
Packit 9c6abc
  Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3;;
Packit 9c6abc
  {
Packit 9c6abc
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
Packit 9c6abc
                                              ithresh, thresh);
Packit 9c6abc
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
Packit 9c6abc
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
Packit 9c6abc
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
Packit 9c6abc
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
Packit 9c6abc
    Store16x2_NEON(op2, op1, p - 2 * stride, stride);
Packit 9c6abc
    Store16x2_NEON(op0, oq0, p + 0 * stride, stride);
Packit 9c6abc
    Store16x2_NEON(oq1, oq2, p + 2 * stride, stride);
Packit 9c6abc
  }
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
static void HFilter16_NEON(uint8_t* p, int stride,
Packit 9c6abc
                           int thresh, int ithresh, int hev_thresh) {
Packit 9c6abc
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
Packit 9c6abc
  Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3;;
Packit 9c6abc
  {
Packit 9c6abc
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
Packit 9c6abc
                                              ithresh, thresh);
Packit 9c6abc
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
Packit 9c6abc
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
Packit 9c6abc
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
Packit 9c6abc
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
Packit 9c6abc
    Store2x16_NEON(op2, op1, p - 2, stride);
Packit 9c6abc
    Store2x16_NEON(op0, oq0, p + 0, stride);
Packit 9c6abc
    Store2x16_NEON(oq1, oq2, p + 2, stride);
Packit 9c6abc
  }
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
// on three inner edges
Packit 9c6abc
static void VFilter16i_NEON(uint8_t* p, int stride,
Packit 9c6abc
                            int thresh, int ithresh, int hev_thresh) {
Packit 9c6abc
  uint32_t k;
Packit 9c6abc
  uint8x16_t p3, p2, p1, p0;
Packit 9c6abc
  Load16x4_NEON(p + 2  * stride, stride, &p3, &p2, &p1, &p0;;
Packit 9c6abc
  for (k = 3; k != 0; --k) {
Packit 9c6abc
    uint8x16_t q0, q1, q2, q3;
Packit 9c6abc
    p += 4 * stride;
Packit 9c6abc
    Load16x4_NEON(p + 2  * stride, stride, &q0, &q1, &q2, &q3;;
Packit 9c6abc
    {
Packit 9c6abc
      const uint8x16_t mask =
Packit 9c6abc
          NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
Packit 9c6abc
      const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
Packit 9c6abc
      // p3 and p2 are not just temporary variables here: they will be
Packit 9c6abc
      // re-used for next span. And q2/q3 will become p1/p0 accordingly.
Packit 9c6abc
      DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2;;
Packit 9c6abc
      Store16x4_NEON(p1, p0, p3, p2, p, stride);
Packit 9c6abc
      p1 = q2;
Packit 9c6abc
      p0 = q3;
Packit 9c6abc
    }
Packit 9c6abc
  }
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
#if !defined(WORK_AROUND_GCC)
Packit 9c6abc
static void HFilter16i_NEON(uint8_t* p, int stride,
Packit 9c6abc
                            int thresh, int ithresh, int hev_thresh) {
Packit 9c6abc
  uint32_t k;
Packit 9c6abc
  uint8x16_t p3, p2, p1, p0;
Packit 9c6abc
  Load4x16_NEON(p + 2, stride, &p3, &p2, &p1, &p0;;
Packit 9c6abc
  for (k = 3; k != 0; --k) {
Packit 9c6abc
    uint8x16_t q0, q1, q2, q3;
Packit 9c6abc
    p += 4;
Packit 9c6abc
    Load4x16_NEON(p + 2, stride, &q0, &q1, &q2, &q3;;
Packit 9c6abc
    {
Packit 9c6abc
      const uint8x16_t mask =
Packit 9c6abc
          NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
Packit 9c6abc
      const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
Packit 9c6abc
      DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2;;
Packit 9c6abc
      Store4x16_NEON(p1, p0, p3, p2, p, stride);
Packit 9c6abc
      p1 = q2;
Packit 9c6abc
      p0 = q3;
Packit 9c6abc
    }
Packit 9c6abc
  }
Packit 9c6abc
}
Packit 9c6abc
#endif  // !WORK_AROUND_GCC
Packit 9c6abc
Packit 9c6abc
// 8-pixels wide variant, for chroma filtering
Packit 9c6abc
static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
Packit 9c6abc
                          int thresh, int ithresh, int hev_thresh) {
Packit 9c6abc
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
Packit 9c6abc
  Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3;;
Packit 9c6abc
  {
Packit 9c6abc
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
Packit 9c6abc
                                              ithresh, thresh);
Packit 9c6abc
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
Packit 9c6abc
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
Packit 9c6abc
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
Packit 9c6abc
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
Packit 9c6abc
    Store8x2x2_NEON(op2, op1, u - 2 * stride, v - 2 * stride, stride);
Packit 9c6abc
    Store8x2x2_NEON(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
Packit 9c6abc
    Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
Packit 9c6abc
  }
Packit 9c6abc
}
Packit 9c6abc
static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
Packit 9c6abc
                           int thresh, int ithresh, int hev_thresh) {
Packit 9c6abc
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
Packit 9c6abc
  u += 4 * stride;
Packit 9c6abc
  v += 4 * stride;
Packit 9c6abc
  Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3;;
Packit 9c6abc
  {
Packit 9c6abc
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
Packit 9c6abc
                                              ithresh, thresh);
Packit 9c6abc
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
Packit 9c6abc
    uint8x16_t op1, op0, oq0, oq1;
Packit 9c6abc
    DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
Packit 9c6abc
    Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride);
Packit 9c6abc
  }
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
#if !defined(WORK_AROUND_GCC)
Packit 9c6abc
static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
Packit 9c6abc
                          int thresh, int ithresh, int hev_thresh) {
Packit 9c6abc
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
Packit 9c6abc
  Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3;;
Packit 9c6abc
  {
Packit 9c6abc
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
Packit 9c6abc
                                              ithresh, thresh);
Packit 9c6abc
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
Packit 9c6abc
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
Packit 9c6abc
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
Packit 9c6abc
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
Packit 9c6abc
    Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
Packit 9c6abc
  }
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
Packit 9c6abc
                           int thresh, int ithresh, int hev_thresh) {
Packit 9c6abc
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
Packit 9c6abc
  u += 4;
Packit 9c6abc
  v += 4;
Packit 9c6abc
  Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3;;
Packit 9c6abc
  {
Packit 9c6abc
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
Packit 9c6abc
                                              ithresh, thresh);
Packit 9c6abc
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
Packit 9c6abc
    uint8x16_t op1, op0, oq0, oq1;
Packit 9c6abc
    DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
Packit 9c6abc
    Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride);
Packit 9c6abc
  }
Packit 9c6abc
}
Packit 9c6abc
#endif  // !WORK_AROUND_GCC
Packit 9c6abc
Packit 9c6abc
//-----------------------------------------------------------------------------
Packit 9c6abc
// Inverse transforms (Paragraph 14.4)
Packit 9c6abc
Packit 9c6abc
// Technically these are unsigned but vqdmulh is only available in signed.
Packit 9c6abc
// vqdmulh returns high half (effectively >> 16) but also doubles the value,
Packit 9c6abc
// changing the >> 16 to >> 15 and requiring an additional >> 1.
Packit 9c6abc
// We use this to our advantage with kC2. The canonical value is 35468.
Packit 9c6abc
// However, the high bit is set so treating it as signed will give incorrect
Packit 9c6abc
// results. We avoid this by down shifting by 1 here to clear the highest bit.
Packit 9c6abc
// Combined with the doubling effect of vqdmulh we get >> 16.
Packit 9c6abc
// This can not be applied to kC1 because the lowest bit is set. Down shifting
Packit 9c6abc
// the constant would reduce precision.
Packit 9c6abc
Packit 9c6abc
// libwebp uses a trick to avoid some extra addition that libvpx does.
Packit 9c6abc
// Instead of:
Packit 9c6abc
// temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
Packit 9c6abc
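// libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, the resulting
// constant no longer fits in a signed 16-bit value, which is the same issue
// kC2 has with vqdmulh. kC2 is handled by down shifting the constant; for kC1
// the 16-bit value is kept and the input is added back after the multiply.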
Packit 9c6abc
Packit 9c6abc
// Inverse-transform multipliers, sized to fit a signed 16-bit operand.
// kC1 is used as is; the code that multiplies by it adds the input back.
// kC2 is the canonical 35468 divided by two (see the comment above).
static const int16_t kC1 = 20091;
static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.
Packit 9c6abc
Packit 9c6abc
#if defined(WEBP_USE_INTRINSICS)
Packit 9c6abc
static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
Packit 9c6abc
                                          const int16x8_t in1,
Packit 9c6abc
                                          int16x8x2_t* const out) {
Packit 9c6abc
  // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
Packit 9c6abc
  // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
Packit 9c6abc
  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
Packit 9c6abc
                                                  // b0 d0 b1 d1 b2 d2 ...
Packit 9c6abc
  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
// Runs one 1-D pass of the inverse transform on the 4x4 block held in
// 'rows' and transposes the result, so calling it twice performs both the
// vertical and the horizontal pass. 'rows' is updated in place.
// The additions and subtractions are saturating.
static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
  // {rows} = in0 | in4
  //          in8 | in12
  // B1 = in4 | in12
  const int16x8_t B1 =
      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
  // C0 = kC1 * in4 | kC1 * in12
  // C1 = kC2 * in4 | kC2 * in12
  // For C0, vqdmulh doubles the product, so it is shifted right by one and
  // accumulated onto B1 (the "add the input back" step) in a single vsra.
  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 + in8
  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 - in8
  // c = kC2 * in4 - kC1 * in12
  // d = kC1 * in4 + kC2 * in12
  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
  // Swap the halves so that E1 = b-c | a-d.
  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
  Transpose8x2_NEON(E0, E1, rows);
}
Packit 9c6abc
Packit 9c6abc
static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
Packit 9c6abc
  int16x8x2_t rows;
Packit 9c6abc
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
Packit 9c6abc
  TransformPass_NEON(&rows);
Packit 9c6abc
  TransformPass_NEON(&rows);
Packit 9c6abc
  Add4x4_NEON(rows.val[0], rows.val[1], dst);
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
#else
Packit 9c6abc
Packit 9c6abc
static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
Packit 9c6abc
  const int kBPS = BPS;
Packit 9c6abc
  // kC1, kC2. Padded because vld1.16 loads 8 bytes
Packit 9c6abc
  const int16_t constants[4] = { kC1, kC2, 0, 0 };
Packit 9c6abc
  /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
Packit 9c6abc
  __asm__ volatile (
Packit 9c6abc
    "vld1.16         {q1, q2}, [%[in]]           \n"
Packit 9c6abc
    "vld1.16         {d0}, [%[constants]]        \n"
Packit 9c6abc
Packit 9c6abc
    /* d2: in[0]
Packit 9c6abc
     * d3: in[8]
Packit 9c6abc
     * d4: in[4]
Packit 9c6abc
     * d5: in[12]
Packit 9c6abc
     */
Packit 9c6abc
    "vswp            d3, d4                      \n"
Packit 9c6abc
Packit 9c6abc
    /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
Packit 9c6abc
     * q9 = {in[4], in[12]} * kC2 >> 16
Packit 9c6abc
     */
Packit 9c6abc
    "vqdmulh.s16     q8, q2, d0[0]               \n"
Packit 9c6abc
    "vqdmulh.s16     q9, q2, d0[1]               \n"
Packit 9c6abc
Packit 9c6abc
    /* d22 = a = in[0] + in[8]
Packit 9c6abc
     * d23 = b = in[0] - in[8]
Packit 9c6abc
     */
Packit 9c6abc
    "vqadd.s16       d22, d2, d3                 \n"
Packit 9c6abc
    "vqsub.s16       d23, d2, d3                 \n"
Packit 9c6abc
Packit 9c6abc
    /* The multiplication should be x * kC1 >> 16
Packit 9c6abc
     * However, with vqdmulh we get x * kC1 * 2 >> 16
Packit 9c6abc
     * (multiply, double, return high half)
Packit 9c6abc
     * We avoided this in kC2 by pre-shifting the constant.
Packit 9c6abc
     * q8 = in[4]/[12] * kC1 >> 16
Packit 9c6abc
     */
Packit 9c6abc
    "vshr.s16        q8, q8, #1                  \n"
Packit 9c6abc
Packit 9c6abc
    /* Add {in[4], in[12]} back after the multiplication. This is handled by
Packit 9c6abc
     * adding 1 << 16 to kC1 in the libwebp C code.
Packit 9c6abc
     */
Packit 9c6abc
    "vqadd.s16       q8, q2, q8                  \n"
Packit 9c6abc
Packit 9c6abc
    /* d20 = c = in[4]*kC2 - in[12]*kC1
Packit 9c6abc
     * d21 = d = in[4]*kC1 + in[12]*kC2
Packit 9c6abc
     */
Packit 9c6abc
    "vqsub.s16       d20, d18, d17               \n"
Packit 9c6abc
    "vqadd.s16       d21, d19, d16               \n"
Packit 9c6abc
Packit 9c6abc
    /* d2 = tmp[0] = a + d
Packit 9c6abc
     * d3 = tmp[1] = b + c
Packit 9c6abc
     * d4 = tmp[2] = b - c
Packit 9c6abc
     * d5 = tmp[3] = a - d
Packit 9c6abc
     */
Packit 9c6abc
    "vqadd.s16       d2, d22, d21                \n"
Packit 9c6abc
    "vqadd.s16       d3, d23, d20                \n"
Packit 9c6abc
    "vqsub.s16       d4, d23, d20                \n"
Packit 9c6abc
    "vqsub.s16       d5, d22, d21                \n"
Packit 9c6abc
Packit 9c6abc
    "vzip.16         q1, q2                      \n"
Packit 9c6abc
    "vzip.16         q1, q2                      \n"
Packit 9c6abc
Packit 9c6abc
    "vswp            d3, d4                      \n"
Packit 9c6abc
Packit 9c6abc
    /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
Packit 9c6abc
     * q9 = {tmp[4], tmp[12]} * kC2 >> 16
Packit 9c6abc
     */
Packit 9c6abc
    "vqdmulh.s16     q8, q2, d0[0]               \n"
Packit 9c6abc
    "vqdmulh.s16     q9, q2, d0[1]               \n"
Packit 9c6abc
Packit 9c6abc
    /* d22 = a = tmp[0] + tmp[8]
Packit 9c6abc
     * d23 = b = tmp[0] - tmp[8]
Packit 9c6abc
     */
Packit 9c6abc
    "vqadd.s16       d22, d2, d3                 \n"
Packit 9c6abc
    "vqsub.s16       d23, d2, d3                 \n"
Packit 9c6abc
Packit 9c6abc
    /* See long winded explanations prior */
Packit 9c6abc
    "vshr.s16        q8, q8, #1                  \n"
Packit 9c6abc
    "vqadd.s16       q8, q2, q8                  \n"
Packit 9c6abc
Packit 9c6abc
    /* d20 = c = in[4]*kC2 - in[12]*kC1
Packit 9c6abc
     * d21 = d = in[4]*kC1 + in[12]*kC2
Packit 9c6abc
     */
Packit 9c6abc
    "vqsub.s16       d20, d18, d17               \n"
Packit 9c6abc
    "vqadd.s16       d21, d19, d16               \n"
Packit 9c6abc
Packit 9c6abc
    /* d2 = tmp[0] = a + d
Packit 9c6abc
     * d3 = tmp[1] = b + c
Packit 9c6abc
     * d4 = tmp[2] = b - c
Packit 9c6abc
     * d5 = tmp[3] = a - d
Packit 9c6abc
     */
Packit 9c6abc
    "vqadd.s16       d2, d22, d21                \n"
Packit 9c6abc
    "vqadd.s16       d3, d23, d20                \n"
Packit 9c6abc
    "vqsub.s16       d4, d23, d20                \n"
Packit 9c6abc
    "vqsub.s16       d5, d22, d21                \n"
Packit 9c6abc
Packit 9c6abc
    "vld1.32         d6[0], [%[dst]], %[kBPS]    \n"
Packit 9c6abc
    "vld1.32         d6[1], [%[dst]], %[kBPS]    \n"
Packit 9c6abc
    "vld1.32         d7[0], [%[dst]], %[kBPS]    \n"
Packit 9c6abc
    "vld1.32         d7[1], [%[dst]], %[kBPS]    \n"
Packit 9c6abc
Packit 9c6abc
    "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"
Packit 9c6abc
Packit 9c6abc
    /* (val) + 4 >> 3 */
Packit 9c6abc
    "vrshr.s16       d2, d2, #3                  \n"
Packit 9c6abc
    "vrshr.s16       d3, d3, #3                  \n"
Packit 9c6abc
    "vrshr.s16       d4, d4, #3                  \n"
Packit 9c6abc
    "vrshr.s16       d5, d5, #3                  \n"
Packit 9c6abc
Packit 9c6abc
    "vzip.16         q1, q2                      \n"
Packit 9c6abc
    "vzip.16         q1, q2                      \n"
Packit 9c6abc
Packit 9c6abc
    /* Must accumulate before saturating */
Packit 9c6abc
    "vmovl.u8        q8, d6                      \n"
Packit 9c6abc
    "vmovl.u8        q9, d7                      \n"
Packit 9c6abc
Packit 9c6abc
    "vqadd.s16       q1, q1, q8                  \n"
Packit 9c6abc
    "vqadd.s16       q2, q2, q9                  \n"
Packit 9c6abc
Packit 9c6abc
    "vqmovun.s16     d0, q1                      \n"
Packit 9c6abc
    "vqmovun.s16     d1, q2                      \n"
Packit 9c6abc
Packit 9c6abc
    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
Packit 9c6abc
    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
Packit 9c6abc
    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
Packit 9c6abc
    "vst1.32         d1[1], [%[dst]]             \n"
Packit 9c6abc
Packit 9c6abc
    : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
Packit 9c6abc
    : [kBPS] "r"(kBPS), [constants] "r"(constants)  /* constants */
Packit 9c6abc
    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  /* clobbered */
Packit 9c6abc
  );
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
#endif    // WEBP_USE_INTRINSICS
Packit 9c6abc
Packit 9c6abc
// Transforms the block at (in, dst) and, when 'do_two' is non-zero, also the
// next one: coefficients at in + 16, pixels at dst + 4.
static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int n;
  for (n = 0; n < num_blocks; ++n) {
    TransformOne_NEON(in + 16 * n, dst + 4 * n);
  }
}
Packit 9c6abc
Packit 9c6abc
static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
Packit 9c6abc
  const int16x8_t DC = vdupq_n_s16(in[0]);
Packit 9c6abc
  Add4x4_NEON(DC, DC, dst);
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
//------------------------------------------------------------------------------
Packit 9c6abc
Packit 9c6abc
#define STORE_WHT(dst, col, rows) do {                  \
Packit 9c6abc
  *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
Packit 9c6abc
  *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
Packit 9c6abc
  *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
Packit 9c6abc
  *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
Packit 9c6abc
} while (0)
Packit 9c6abc
Packit 9c6abc
static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
Packit 9c6abc
  int32x4x4_t tmp;
Packit 9c6abc
Packit 9c6abc
  {
Packit 9c6abc
    // Load the source.
Packit 9c6abc
    const int16x4_t in00_03 = vld1_s16(in + 0);
Packit 9c6abc
    const int16x4_t in04_07 = vld1_s16(in + 4);
Packit 9c6abc
    const int16x4_t in08_11 = vld1_s16(in + 8);
Packit 9c6abc
    const int16x4_t in12_15 = vld1_s16(in + 12);
Packit 9c6abc
    const int32x4_t a0 = vaddl_s16(in00_03, in12_15);  // in[0..3] + in[12..15]
Packit 9c6abc
    const int32x4_t a1 = vaddl_s16(in04_07, in08_11);  // in[4..7] + in[8..11]
Packit 9c6abc
    const int32x4_t a2 = vsubl_s16(in04_07, in08_11);  // in[4..7] - in[8..11]
Packit 9c6abc
    const int32x4_t a3 = vsubl_s16(in00_03, in12_15);  // in[0..3] - in[12..15]
Packit 9c6abc
    tmp.val[0] = vaddq_s32(a0, a1);
Packit 9c6abc
    tmp.val[1] = vaddq_s32(a3, a2);
Packit 9c6abc
    tmp.val[2] = vsubq_s32(a0, a1);
Packit 9c6abc
    tmp.val[3] = vsubq_s32(a3, a2);
Packit 9c6abc
    // Arrange the temporary results column-wise.
Packit 9c6abc
    tmp = Transpose4x4_NEON(tmp);
Packit 9c6abc
  }
Packit 9c6abc
Packit 9c6abc
  {
Packit 9c6abc
    const int32x4_t kCst3 = vdupq_n_s32(3);
Packit 9c6abc
    const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3);  // add rounder
Packit 9c6abc
    const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
Packit 9c6abc
    const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
Packit 9c6abc
    const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
Packit 9c6abc
    const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
Packit 9c6abc
Packit 9c6abc
    tmp.val[0] = vaddq_s32(a0, a1);
Packit 9c6abc
    tmp.val[1] = vaddq_s32(a3, a2);
Packit 9c6abc
    tmp.val[2] = vsubq_s32(a0, a1);
Packit 9c6abc
    tmp.val[3] = vsubq_s32(a3, a2);
Packit 9c6abc
Packit 9c6abc
    // right shift the results by 3.
Packit 9c6abc
    tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
Packit 9c6abc
    tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
Packit 9c6abc
    tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
Packit 9c6abc
    tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
Packit 9c6abc
Packit 9c6abc
    STORE_WHT(out, 0, tmp);
Packit 9c6abc
    STORE_WHT(out, 1, tmp);
Packit 9c6abc
    STORE_WHT(out, 2, tmp);
Packit 9c6abc
    STORE_WHT(out, 3, tmp);
Packit 9c6abc
  }
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
#undef STORE_WHT
Packit 9c6abc
Packit 9c6abc
//------------------------------------------------------------------------------
Packit 9c6abc
Packit 9c6abc
#define MUL(a, b) (((a) * (b)) >> 16)
Packit 9c6abc
static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
Packit 9c6abc
  static const int kC1_full = 20091 + (1 << 16);
Packit 9c6abc
  static const int kC2_full = 35468;
Packit 9c6abc
  const int16x4_t A = vld1_dup_s16(in);
Packit 9c6abc
  const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
Packit 9c6abc
  const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
Packit 9c6abc
  const int c1 = MUL(in[1], kC2_full);
Packit 9c6abc
  const int d1 = MUL(in[1], kC1_full);
Packit 9c6abc
  const uint64_t cd = (uint64_t)( d1 & 0xffff) <<  0 |
Packit 9c6abc
                      (uint64_t)( c1 & 0xffff) << 16 |
Packit 9c6abc
                      (uint64_t)(-c1 & 0xffff) << 32 |
Packit 9c6abc
                      (uint64_t)(-d1 & 0xffff) << 48;
Packit 9c6abc
  const int16x4_t CD = vcreate_s16(cd);
Packit 9c6abc
  const int16x4_t B = vqadd_s16(A, CD);
Packit 9c6abc
  const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
Packit 9c6abc
  const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
Packit 9c6abc
  Add4x4_NEON(m0_m1, m2_m3, dst);
Packit 9c6abc
}
Packit 9c6abc
#undef MUL
Packit 9c6abc
Packit 9c6abc
//------------------------------------------------------------------------------
Packit 9c6abc
// 4x4
Packit 9c6abc
Packit 9c6abc
static void DC4_NEON(uint8_t* dst) {    // DC
Packit 9c6abc
  const uint8x8_t A = vld1_u8(dst - BPS);  // top row
Packit 9c6abc
  const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
Packit 9c6abc
  const uint16x4_t p1 = vpadd_u16(p0, p0);
Packit 9c6abc
  const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
Packit 9c6abc
  const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
Packit 9c6abc
  const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
Packit 9c6abc
  const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
Packit 9c6abc
  const uint16x8_t s0 = vaddq_u16(L0, L1);
Packit 9c6abc
  const uint16x8_t s1 = vaddq_u16(L2, L3);
Packit 9c6abc
  const uint16x8_t s01 = vaddq_u16(s0, s1);
Packit 9c6abc
  const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
Packit 9c6abc
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);  // (sum + 4) >> 3
Packit 9c6abc
  const uint8x8_t dc = vdup_lane_u8(dc0, 0);
Packit 9c6abc
  int i;
Packit 9c6abc
  for (i = 0; i < 4; ++i) {
Packit 9c6abc
    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
Packit 9c6abc
  }
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
// TrueMotion (4x4 + 8x8)
Packit 9c6abc
static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
Packit 9c6abc
  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
Packit 9c6abc
  const uint8x8_t T = vld1_u8(dst - BPS);  // top row 'A[0..3]'
Packit 9c6abc
  const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL));  // A[c] - A[-1]
Packit 9c6abc
  int y;
Packit 9c6abc
  for (y = 0; y < size; y += 4) {
Packit 9c6abc
    // left edge
Packit 9c6abc
    const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
Packit 9c6abc
    const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
Packit 9c6abc
    const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
Packit 9c6abc
    const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
Packit 9c6abc
    const int16x8_t r0 = vaddq_s16(L0, d);  // L[r] + A[c] - A[-1]
Packit 9c6abc
    const int16x8_t r1 = vaddq_s16(L1, d);
Packit 9c6abc
    const int16x8_t r2 = vaddq_s16(L2, d);
Packit 9c6abc
    const int16x8_t r3 = vaddq_s16(L3, d);
Packit 9c6abc
    // Saturate and store the result.
Packit 9c6abc
    const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
Packit 9c6abc
    const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
Packit 9c6abc
    const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
Packit 9c6abc
    const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
Packit 9c6abc
    if (size == 4) {
Packit 9c6abc
      vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
Packit 9c6abc
      vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
Packit 9c6abc
      vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
Packit 9c6abc
      vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
Packit 9c6abc
    } else {
Packit 9c6abc
      vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
Packit 9c6abc
      vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
Packit 9c6abc
      vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
Packit 9c6abc
      vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
Packit 9c6abc
    }
Packit 9c6abc
    dst += 4 * BPS;
Packit 9c6abc
  }
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
// 4x4 TrueMotion predictor.
static void TM4_NEON(uint8_t* dst) {
  TrueMotion_NEON(dst, 4);
}
Packit 9c6abc
Packit 9c6abc
static void VE4_NEON(uint8_t* dst) {    // vertical
Packit 9c6abc
  // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
Packit 9c6abc
  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1));  // top row
Packit 9c6abc
  const uint64x1_t A1 = vshr_n_u64(A0, 8);
Packit 9c6abc
  const uint64x1_t A2 = vshr_n_u64(A0, 16);
Packit 9c6abc
  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
Packit 9c6abc
  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
Packit 9c6abc
  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
Packit 9c6abc
  const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
Packit 9c6abc
  const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
Packit 9c6abc
  int i;
Packit 9c6abc
  for (i = 0; i < 4; ++i) {
Packit 9c6abc
    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
Packit 9c6abc
  }
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
static void RD4_NEON(uint8_t* dst) {   // Down-right
Packit 9c6abc
  const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
Packit 9c6abc
  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
Packit 9c6abc
  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
Packit 9c6abc
  const uint32_t I = dst[-1 + 0 * BPS];
Packit 9c6abc
  const uint32_t J = dst[-1 + 1 * BPS];
Packit 9c6abc
  const uint32_t K = dst[-1 + 2 * BPS];
Packit 9c6abc
  const uint32_t L = dst[-1 + 3 * BPS];
Packit 9c6abc
  const uint64x1_t LKJI____ = vcreate_u64(L | (K << 8) | (J << 16) | (I << 24));
Packit 9c6abc
  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
Packit 9c6abc
  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
Packit 9c6abc
  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
Packit 9c6abc
  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
Packit 9c6abc
  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
Packit 9c6abc
  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
Packit 9c6abc
  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
Packit 9c6abc
  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
Packit 9c6abc
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
Packit 9c6abc
  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
Packit 9c6abc
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
Packit 9c6abc
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
Packit 9c6abc
  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
Packit 9c6abc
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
Packit 9c6abc
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
Packit 9c6abc
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
Packit 9c6abc
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
static void LD4_NEON(uint8_t* dst) {    // Down-left
Packit 9c6abc
  // Note using the same shift trick as VE4() is slower here.
Packit 9c6abc
  const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
Packit 9c6abc
  const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
Packit 9c6abc
  const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
Packit 9c6abc
  const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
Packit 9c6abc
  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
Packit 9c6abc
  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
Packit 9c6abc
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
Packit 9c6abc
  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
Packit 9c6abc
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
Packit 9c6abc
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
Packit 9c6abc
  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
Packit 9c6abc
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
Packit 9c6abc
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
Packit 9c6abc
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
Packit 9c6abc
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
//------------------------------------------------------------------------------
Packit 9c6abc
// Chroma
Packit 9c6abc
Packit 9c6abc
static void VE8uv_NEON(uint8_t* dst) {    // vertical
Packit 9c6abc
  const uint8x8_t top = vld1_u8(dst - BPS);
Packit 9c6abc
  int j;
Packit 9c6abc
  for (j = 0; j < 8; ++j) {
Packit 9c6abc
    vst1_u8(dst + j * BPS, top);
Packit 9c6abc
  }
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
static void HE8uv_NEON(uint8_t* dst) {    // horizontal
Packit 9c6abc
  int j;
Packit 9c6abc
  for (j = 0; j < 8; ++j) {
Packit 9c6abc
    const uint8x8_t left = vld1_dup_u8(dst - 1);
Packit 9c6abc
    vst1_u8(dst, left);
Packit 9c6abc
    dst += BPS;
Packit 9c6abc
  }
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
Packit 9c6abc
  uint16x8_t sum_top;
Packit 9c6abc
  uint16x8_t sum_left;
Packit 9c6abc
  uint8x8_t dc0;
Packit 9c6abc
Packit 9c6abc
  if (do_top) {
Packit 9c6abc
    const uint8x8_t A = vld1_u8(dst - BPS);  // top row
Packit 9c6abc
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
Packit 9c6abc
    const uint16x4_t p1 = vpadd_u16(p0, p0);
Packit 9c6abc
    const uint16x4_t p2 = vpadd_u16(p1, p1);
Packit 9c6abc
    sum_top = vcombine_u16(p2, p2);
Packit 9c6abc
  }
Packit 9c6abc
Packit 9c6abc
  if (do_left) {
Packit 9c6abc
    const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
Packit 9c6abc
    const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
Packit 9c6abc
    const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
Packit 9c6abc
    const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
Packit 9c6abc
    const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
Packit 9c6abc
    const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
Packit 9c6abc
    const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
Packit 9c6abc
    const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
Packit 9c6abc
    const uint16x8_t s0 = vaddq_u16(L0, L1);
Packit 9c6abc
    const uint16x8_t s1 = vaddq_u16(L2, L3);
Packit 9c6abc
    const uint16x8_t s2 = vaddq_u16(L4, L5);
Packit 9c6abc
    const uint16x8_t s3 = vaddq_u16(L6, L7);
Packit 9c6abc
    const uint16x8_t s01 = vaddq_u16(s0, s1);
Packit 9c6abc
    const uint16x8_t s23 = vaddq_u16(s2, s3);
Packit 9c6abc
    sum_left = vaddq_u16(s01, s23);
Packit 9c6abc
  }
Packit 9c6abc
Packit 9c6abc
  if (do_top && do_left) {
Packit 9c6abc
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
Packit 9c6abc
    dc0 = vrshrn_n_u16(sum, 4);
Packit 9c6abc
  } else if (do_top) {
Packit 9c6abc
    dc0 = vrshrn_n_u16(sum_top, 3);
Packit 9c6abc
  } else if (do_left) {
Packit 9c6abc
    dc0 = vrshrn_n_u16(sum_left, 3);
Packit 9c6abc
  } else {
Packit 9c6abc
    dc0 = vdup_n_u8(0x80);
Packit 9c6abc
  }
Packit 9c6abc
Packit 9c6abc
  {
Packit 9c6abc
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
Packit 9c6abc
    int i;
Packit 9c6abc
    for (i = 0; i < 8; ++i) {
Packit 9c6abc
      vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
Packit 9c6abc
    }
Packit 9c6abc
  }
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
// 8x8 DC predictors, one per combination of available neighbours.
static void DC8uv_NEON(uint8_t* dst) {
  DC8_NEON(dst, /*do_top=*/1, /*do_left=*/1);
}

static void DC8uvNoTop_NEON(uint8_t* dst) {
  DC8_NEON(dst, /*do_top=*/0, /*do_left=*/1);
}

static void DC8uvNoLeft_NEON(uint8_t* dst) {
  DC8_NEON(dst, /*do_top=*/1, /*do_left=*/0);
}

static void DC8uvNoTopLeft_NEON(uint8_t* dst) {
  DC8_NEON(dst, /*do_top=*/0, /*do_left=*/0);
}

// 8x8 TrueMotion predictor.
static void TM8uv_NEON(uint8_t* dst) {
  TrueMotion_NEON(dst, 8);
}
Packit 9c6abc
Packit 9c6abc
//------------------------------------------------------------------------------
Packit 9c6abc
// 16x16
Packit 9c6abc
Packit 9c6abc
static void VE16_NEON(uint8_t* dst) {     // vertical
Packit 9c6abc
  const uint8x16_t top = vld1q_u8(dst - BPS);
Packit 9c6abc
  int j;
Packit 9c6abc
  for (j = 0; j < 16; ++j) {
Packit 9c6abc
    vst1q_u8(dst + j * BPS, top);
Packit 9c6abc
  }
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
static void HE16_NEON(uint8_t* dst) {     // horizontal
Packit 9c6abc
  int j;
Packit 9c6abc
  for (j = 0; j < 16; ++j) {
Packit 9c6abc
    const uint8x16_t left = vld1q_dup_u8(dst - 1);
Packit 9c6abc
    vst1q_u8(dst, left);
Packit 9c6abc
    dst += BPS;
Packit 9c6abc
  }
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
Packit 9c6abc
  uint16x8_t sum_top;
Packit 9c6abc
  uint16x8_t sum_left;
Packit 9c6abc
  uint8x8_t dc0;
Packit 9c6abc
Packit 9c6abc
  if (do_top) {
Packit 9c6abc
    const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
Packit 9c6abc
    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
Packit 9c6abc
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
Packit 9c6abc
    const uint16x4_t p2 = vpadd_u16(p1, p1);
Packit 9c6abc
    const uint16x4_t p3 = vpadd_u16(p2, p2);
Packit 9c6abc
    sum_top = vcombine_u16(p3, p3);
Packit 9c6abc
  }
Packit 9c6abc
Packit 9c6abc
  if (do_left) {
Packit 9c6abc
    int i;
Packit 9c6abc
    sum_left = vdupq_n_u16(0);
Packit 9c6abc
    for (i = 0; i < 16; i += 8) {
Packit 9c6abc
      const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
Packit 9c6abc
      const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
Packit 9c6abc
      const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
Packit 9c6abc
      const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
Packit 9c6abc
      const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
Packit 9c6abc
      const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
Packit 9c6abc
      const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
Packit 9c6abc
      const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
Packit 9c6abc
      const uint16x8_t s0 = vaddq_u16(L0, L1);
Packit 9c6abc
      const uint16x8_t s1 = vaddq_u16(L2, L3);
Packit 9c6abc
      const uint16x8_t s2 = vaddq_u16(L4, L5);
Packit 9c6abc
      const uint16x8_t s3 = vaddq_u16(L6, L7);
Packit 9c6abc
      const uint16x8_t s01 = vaddq_u16(s0, s1);
Packit 9c6abc
      const uint16x8_t s23 = vaddq_u16(s2, s3);
Packit 9c6abc
      const uint16x8_t sum = vaddq_u16(s01, s23);
Packit 9c6abc
      sum_left = vaddq_u16(sum_left, sum);
Packit 9c6abc
    }
Packit 9c6abc
  }
Packit 9c6abc
Packit 9c6abc
  if (do_top && do_left) {
Packit 9c6abc
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
Packit 9c6abc
    dc0 = vrshrn_n_u16(sum, 5);
Packit 9c6abc
  } else if (do_top) {
Packit 9c6abc
    dc0 = vrshrn_n_u16(sum_top, 4);
Packit 9c6abc
  } else if (do_left) {
Packit 9c6abc
    dc0 = vrshrn_n_u16(sum_left, 4);
Packit 9c6abc
  } else {
Packit 9c6abc
    dc0 = vdup_n_u8(0x80);
Packit 9c6abc
  }
Packit 9c6abc
Packit 9c6abc
  {
Packit 9c6abc
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
Packit 9c6abc
    int i;
Packit 9c6abc
    for (i = 0; i < 16; ++i) {
Packit 9c6abc
      vst1q_u8(dst + i * BPS, dc);
Packit 9c6abc
    }
Packit 9c6abc
  }
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
// 16x16 DC predictors, one per combination of available neighbours.
static void DC16TopLeft_NEON(uint8_t* dst) {
  DC16_NEON(dst, /*do_top=*/1, /*do_left=*/1);
}

static void DC16NoTop_NEON(uint8_t* dst) {
  DC16_NEON(dst, /*do_top=*/0, /*do_left=*/1);
}

static void DC16NoLeft_NEON(uint8_t* dst) {
  DC16_NEON(dst, /*do_top=*/1, /*do_left=*/0);
}

static void DC16NoTopLeft_NEON(uint8_t* dst) {
  DC16_NEON(dst, /*do_top=*/0, /*do_left=*/0);
}
Packit 9c6abc
Packit 9c6abc
static void TM16_NEON(uint8_t* dst) {
Packit 9c6abc
  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
Packit 9c6abc
  const uint8x16_t T = vld1q_u8(dst - BPS);  // top row 'A[0..15]'
Packit 9c6abc
  // A[c] - A[-1]
Packit 9c6abc
  const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
Packit 9c6abc
  const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
Packit 9c6abc
  int y;
Packit 9c6abc
  for (y = 0; y < 16; y += 4) {
Packit 9c6abc
    // left edge
Packit 9c6abc
    const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
Packit 9c6abc
    const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
Packit 9c6abc
    const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
Packit 9c6abc
    const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
Packit 9c6abc
    const int16x8_t r0_lo = vaddq_s16(L0, d_lo);  // L[r] + A[c] - A[-1]
Packit 9c6abc
    const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
Packit 9c6abc
    const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
Packit 9c6abc
    const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
Packit 9c6abc
    const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
Packit 9c6abc
    const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
Packit 9c6abc
    const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
Packit 9c6abc
    const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
Packit 9c6abc
    // Saturate and store the result.
Packit 9c6abc
    const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
Packit 9c6abc
    const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
Packit 9c6abc
    const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
Packit 9c6abc
    const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
Packit 9c6abc
    vst1q_u8(dst + 0 * BPS, row0);
Packit 9c6abc
    vst1q_u8(dst + 1 * BPS, row1);
Packit 9c6abc
    vst1q_u8(dst + 2 * BPS, row2);
Packit 9c6abc
    vst1q_u8(dst + 3 * BPS, row3);
Packit 9c6abc
    dst += 4 * BPS;
Packit 9c6abc
  }
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
//------------------------------------------------------------------------------
Packit 9c6abc
// Entry point
Packit 9c6abc
Packit 9c6abc
extern void VP8DspInitNEON(void);
Packit 9c6abc
Packit 9c6abc
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
Packit 9c6abc
  VP8Transform = TransformTwo_NEON;
Packit 9c6abc
  VP8TransformAC3 = TransformAC3_NEON;
Packit 9c6abc
  VP8TransformDC = TransformDC_NEON;
Packit 9c6abc
  VP8TransformWHT = TransformWHT_NEON;
Packit 9c6abc
Packit 9c6abc
  VP8VFilter16 = VFilter16_NEON;
Packit 9c6abc
  VP8VFilter16i = VFilter16i_NEON;
Packit 9c6abc
  VP8HFilter16 = HFilter16_NEON;
Packit 9c6abc
#if !defined(WORK_AROUND_GCC)
Packit 9c6abc
  VP8HFilter16i = HFilter16i_NEON;
Packit 9c6abc
#endif
Packit 9c6abc
  VP8VFilter8 = VFilter8_NEON;
Packit 9c6abc
  VP8VFilter8i = VFilter8i_NEON;
Packit 9c6abc
#if !defined(WORK_AROUND_GCC)
Packit 9c6abc
  VP8HFilter8 = HFilter8_NEON;
Packit 9c6abc
  VP8HFilter8i = HFilter8i_NEON;
Packit 9c6abc
#endif
Packit 9c6abc
  VP8SimpleVFilter16 = SimpleVFilter16_NEON;
Packit 9c6abc
  VP8SimpleHFilter16 = SimpleHFilter16_NEON;
Packit 9c6abc
  VP8SimpleVFilter16i = SimpleVFilter16i_NEON;
Packit 9c6abc
  VP8SimpleHFilter16i = SimpleHFilter16i_NEON;
Packit 9c6abc
Packit 9c6abc
  VP8PredLuma4[0] = DC4_NEON;
Packit 9c6abc
  VP8PredLuma4[1] = TM4_NEON;
Packit 9c6abc
  VP8PredLuma4[2] = VE4_NEON;
Packit 9c6abc
  VP8PredLuma4[4] = RD4_NEON;
Packit 9c6abc
  VP8PredLuma4[6] = LD4_NEON;
Packit 9c6abc
Packit 9c6abc
  VP8PredLuma16[0] = DC16TopLeft_NEON;
Packit 9c6abc
  VP8PredLuma16[1] = TM16_NEON;
Packit 9c6abc
  VP8PredLuma16[2] = VE16_NEON;
Packit 9c6abc
  VP8PredLuma16[3] = HE16_NEON;
Packit 9c6abc
  VP8PredLuma16[4] = DC16NoTop_NEON;
Packit 9c6abc
  VP8PredLuma16[5] = DC16NoLeft_NEON;
Packit 9c6abc
  VP8PredLuma16[6] = DC16NoTopLeft_NEON;
Packit 9c6abc
Packit 9c6abc
  VP8PredChroma8[0] = DC8uv_NEON;
Packit 9c6abc
  VP8PredChroma8[1] = TM8uv_NEON;
Packit 9c6abc
  VP8PredChroma8[2] = VE8uv_NEON;
Packit 9c6abc
  VP8PredChroma8[3] = HE8uv_NEON;
Packit 9c6abc
  VP8PredChroma8[4] = DC8uvNoTop_NEON;
Packit 9c6abc
  VP8PredChroma8[5] = DC8uvNoLeft_NEON;
Packit 9c6abc
  VP8PredChroma8[6] = DC8uvNoTopLeft_NEON;
Packit 9c6abc
}
Packit 9c6abc
Packit 9c6abc
#else  // !WEBP_USE_NEON
Packit 9c6abc
Packit 9c6abc
// Used when WEBP_USE_NEON is not defined.
// NOTE(review): presumably expands to an empty VP8DspInitNEON() definition so
// callers still link -- confirm against the macro's definition.
WEBP_DSP_INIT_STUB(VP8DspInitNEON)
Packit 9c6abc
Packit 9c6abc
#endif  // WEBP_USE_NEON