Blame src/dsp/msa_macro.h

// Copyright 2016 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MSA common macros
//
// Author(s):  Prashant Patil   (prashant.patil@imgtec.com)

#ifndef WEBP_DSP_MSA_MACRO_H_
Packit 9c6abc
#define WEBP_DSP_MSA_MACRO_H_
Packit 9c6abc
Packit 9c6abc
#include <stdint.h>
Packit 9c6abc
#include <msa.h>
Packit 9c6abc
Packit 9c6abc
// CLANG_BUILD is defined when the compiler predefines __clang__. It selects
// which implementation of the immediate-operand helpers below is used.
#if defined(__clang__)
  #define CLANG_BUILD
#endif

// Immediate-operand helpers, named after the MSA mnemonics they stand for.
// With CLANG_BUILD they call the __msa_* intrinsics; otherwise they apply the
// compiler's vector operators to 'a' directly. All macro arguments are
// parenthesized so expression arguments keep their intended precedence.
#ifdef CLANG_BUILD
  #define ALPHAVAL  (-1)
  #define ADDVI_H(a, b)  __msa_addvi_h((v8i16)(a), b)
  #define ADDVI_W(a, b)  __msa_addvi_w((v4i32)(a), b)
  #define SRAI_B(a, b)  __msa_srai_b((v16i8)(a), b)
  #define SRAI_H(a, b)  __msa_srai_h((v8i16)(a), b)
  #define SRAI_W(a, b)  __msa_srai_w((v4i32)(a), b)
  #define SRLI_H(a, b)  __msa_srli_h((v8i16)(a), b)
  // The _b intrinsic works on byte lanes, so 'a' is viewed as v16i8.
  #define SLLI_B(a, b)  __msa_slli_b((v16i8)(a), b)
  #define ANDI_B(a, b)  __msa_andi_b((v16u8)(a), b)
  #define ORI_B(a, b)   __msa_ori_b((v16u8)(a), b)
#else
  #define ALPHAVAL  (0xff)
  #define ADDVI_H(a, b)  ((a) + (b))
  #define ADDVI_W(a, b)  ((a) + (b))
  #define SRAI_B(a, b)  ((a) >> (b))
  #define SRAI_H(a, b)  ((a) >> (b))
  #define SRAI_W(a, b)  ((a) >> (b))
  // Right shift (not left), matching __msa_srli_h in the CLANG_BUILD branch.
  // The lanes are shifted as unsigned halfwords so no sign bits are shifted
  // in; the result keeps the type of 'a'.
  #define SRLI_H(a, b)  ((__typeof__(a))((v8u16)(a) >> (b)))
  #define SLLI_B(a, b)  ((a) << (b))
  #define ANDI_B(a, b)  ((a) & (b))
  #define ORI_B(a, b)   ((a) | (b))
#endif

/* Description : Load one vector from memory
 * Arguments   : Inputs - RTYPE, psrc
 *               Return Type - as per RTYPE
 * Details     : 'psrc' is cast to (RTYPE*) and dereferenced. The expansion is
 *               fully parenthesized so it can be followed by postfix
 *               operators or embedded in a larger expression.
 */
#define LD_B(RTYPE, psrc) (*((RTYPE*)(psrc)))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)

/* Description : Load one vector from memory (halfword-element variants)
 * Arguments   : Inputs - RTYPE, psrc
 *               Return Type - as per RTYPE
 * Details     : 'psrc' is cast to (RTYPE*) and dereferenced. The expansion is
 *               fully parenthesized so it can be followed by postfix
 *               operators or embedded in a larger expression.
 */
#define LD_H(RTYPE, psrc) (*((RTYPE*)(psrc)))
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)

/* Description : Load one vector from memory (word-element variants)
 * Arguments   : Inputs - RTYPE, psrc
 *               Return Type - as per RTYPE
 * Details     : 'psrc' is cast to (RTYPE*) and dereferenced. The expansion is
 *               fully parenthesized so it can be followed by postfix
 *               operators or embedded in a larger expression.
 */
#define LD_W(RTYPE, psrc) (*((RTYPE*)(psrc)))
#define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)

/* Description : Store one vector to memory
 * Arguments   : Inputs - RTYPE, in, pdst
 * Details     : 'pdst' is cast to (RTYPE*) and 'in' is assigned through it.
 *               The expansion is a single parenthesized assignment
 *               expression.
 */
#define ST_B(RTYPE, in, pdst) (*((RTYPE*)(pdst)) = (in))
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)

/* Description : Store one vector to memory (halfword-element variants)
 * Arguments   : Inputs - RTYPE, in, pdst
 * Details     : 'pdst' is cast to (RTYPE*) and 'in' is assigned through it.
 *               The expansion is a single parenthesized assignment
 *               expression.
 */
#define ST_H(RTYPE, in, pdst) (*((RTYPE*)(pdst)) = (in))
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)

/* Description : Store one vector to memory (word-element variants)
 * Arguments   : Inputs - RTYPE, in, pdst
 * Details     : 'pdst' is cast to (RTYPE*) and 'in' is assigned through it.
 *               The expansion is a single parenthesized assignment
 *               expression.
 */
#define ST_W(RTYPE, in, pdst) (*((RTYPE*)(pdst)) = (in))
#define ST_UW(...) ST_W(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)

/* Description : Define a scalar load helper built on one load instruction
 * Arguments   : Inputs - TYPE (value type returned), INSTR (instruction
 *                        mnemonic, stringized into the asm template),
 *                        FUNC_NAME (name of the generated function)
 * Details     : Expands to a 'static inline' function that takes a
 *               'const void*' and returns the TYPE value produced by INSTR
 *               for that address. '__asm__' is used instead of 'asm' so the
 *               definition is also accepted in strict ISO C modes.
 */
#define MSA_LOAD_FUNC(TYPE, INSTR, FUNC_NAME)             \
  static inline TYPE FUNC_NAME(const void* const psrc) {  \
    const uint8_t* const psrc_m = (const uint8_t*)psrc;   \
    TYPE val_m;                                           \
    __asm__ volatile (                                    \
      "" #INSTR " %[val_m], %[psrc_m]  \n\t"              \
      : [val_m] "=r" (val_m)                              \
      : [psrc_m] "m" (*psrc_m));                          \
    return val_m;                                         \
  }

/* Calls the load helper FUNC_NAME on 'psrc'. */
#define MSA_LOAD(psrc, FUNC_NAME)  FUNC_NAME(psrc)

/* Description : Define a scalar store helper built on one store instruction
 * Arguments   : Inputs - TYPE (value type stored), INSTR (instruction
 *                        mnemonic, stringized into the asm template),
 *                        FUNC_NAME (name of the generated function)
 * Details     : Expands to a 'static inline' function taking (TYPE val,
 *               void* pdst) that issues INSTR with 'val' and the destination
 *               memory operand. '__asm__' is used instead of 'asm' so the
 *               definition is also accepted in strict ISO C modes.
 */
#define MSA_STORE_FUNC(TYPE, INSTR, FUNC_NAME)               \
  static inline void FUNC_NAME(TYPE val, void* const pdst) { \
    uint8_t* const pdst_m = (uint8_t*)pdst;                  \
    TYPE val_m = val;                                        \
    __asm__ volatile (                                       \
      " " #INSTR "  %[val_m],  %[pdst_m]  \n\t"              \
      : [pdst_m] "=m" (*pdst_m)                              \
      : [val_m] "r" (val_m));                                \
  }

/* Calls the store helper FUNC_NAME with (val, pdst). */
#define MSA_STORE(val, pdst, FUNC_NAME)  FUNC_NAME(val, pdst)

#if (__mips_isa_rev >= 6)
Packit 9c6abc
  MSA_LOAD_FUNC(uint16_t, lh, msa_lh);
Packit 9c6abc
  #define LH(psrc)  MSA_LOAD(psrc, msa_lh)
Packit 9c6abc
  MSA_LOAD_FUNC(uint32_t, lw, msa_lw);
Packit 9c6abc
  #define LW(psrc)  MSA_LOAD(psrc, msa_lw)
Packit 9c6abc
  #if (__mips == 64)
Packit 9c6abc
    MSA_LOAD_FUNC(uint64_t, ld, msa_ld);
Packit 9c6abc
    #define LD(psrc)  MSA_LOAD(psrc, msa_ld)
Packit 9c6abc
  #else  // !(__mips == 64)
Packit 9c6abc
    #define LD(psrc)  ((((uint64_t)MSA_LOAD(psrc + 4, msa_lw)) << 32) | \
Packit 9c6abc
                       MSA_LOAD(psrc, msa_lw))
Packit 9c6abc
  #endif  // (__mips == 64)
Packit 9c6abc
Packit 9c6abc
  MSA_STORE_FUNC(uint16_t, sh, msa_sh);
Packit 9c6abc
  #define SH(val, pdst)  MSA_STORE(val, pdst, msa_sh)
Packit 9c6abc
  MSA_STORE_FUNC(uint32_t, sw, msa_sw);
Packit 9c6abc
  #define SW(val, pdst)  MSA_STORE(val, pdst, msa_sw)
Packit 9c6abc
  MSA_STORE_FUNC(uint64_t, sd, msa_sd);
Packit 9c6abc
  #define SD(val, pdst)  MSA_STORE(val, pdst, msa_sd)
Packit 9c6abc
#else  // !(__mips_isa_rev >= 6)
Packit 9c6abc
  MSA_LOAD_FUNC(uint16_t, ulh, msa_ulh);
Packit 9c6abc
  #define LH(psrc)  MSA_LOAD(psrc, msa_ulh)
Packit 9c6abc
  MSA_LOAD_FUNC(uint32_t, ulw, msa_ulw);
Packit 9c6abc
  #define LW(psrc)  MSA_LOAD(psrc, msa_ulw)
Packit 9c6abc
  #if (__mips == 64)
Packit 9c6abc
    MSA_LOAD_FUNC(uint64_t, uld, msa_uld);
Packit 9c6abc
    #define LD(psrc)  MSA_LOAD(psrc, msa_uld)
Packit 9c6abc
  #else  // !(__mips == 64)
Packit 9c6abc
    #define LD(psrc)  ((((uint64_t)MSA_LOAD(psrc + 4, msa_ulw)) << 32) | \
Packit 9c6abc
                        MSA_LOAD(psrc, msa_ulw))
Packit 9c6abc
  #endif  // (__mips == 64)
Packit 9c6abc
Packit 9c6abc
  MSA_STORE_FUNC(uint16_t, ush, msa_ush);
Packit 9c6abc
  #define SH(val, pdst)  MSA_STORE(val, pdst, msa_ush)
Packit 9c6abc
  MSA_STORE_FUNC(uint32_t, usw, msa_usw);
Packit 9c6abc
  #define SW(val, pdst)  MSA_STORE(val, pdst, msa_usw)
Packit 9c6abc
  #define SD(val, pdst) do {                                               \
Packit 9c6abc
    uint8_t* const pdst_sd_m = (uint8_t*)(pdst);                           \
Packit 9c6abc
    const uint32_t val0_m = (uint32_t)(val & 0x00000000FFFFFFFF);          \
Packit 9c6abc
    const uint32_t val1_m = (uint32_t)((val >> 32) & 0x00000000FFFFFFFF);  \
Packit 9c6abc
    SW(val0_m, pdst_sd_m);                                                 \
Packit 9c6abc
    SW(val1_m, pdst_sd_m + 4);                                             \
Packit 9c6abc
  } while (0)
Packit 9c6abc
#endif  // (__mips_isa_rev >= 6)
Packit 9c6abc
Packit 9c6abc
/* Description : Load 4 words with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1, out2, out3
 * Details     : Load word in 'out0' from (psrc)
 *               Load word in 'out1' from (psrc + stride)
 *               Load word in 'out2' from (psrc + 2 * stride)
 *               Load word in 'out3' from (psrc + 3 * stride)
 *               'psrc' is converted to a byte pointer first, so 'stride' is
 *               counted in bytes.
 */
#define LW4(psrc, stride, out0, out1, out2, out3) do {  \
  const uint8_t* ptmp_lw4_m = (const uint8_t*)(psrc);   \
  out0 = LW(ptmp_lw4_m);                                \
  ptmp_lw4_m += (stride);                               \
  out1 = LW(ptmp_lw4_m);                                \
  ptmp_lw4_m += (stride);                               \
  out2 = LW(ptmp_lw4_m);                                \
  ptmp_lw4_m += (stride);                               \
  out3 = LW(ptmp_lw4_m);                                \
} while (0)

/* Description : Store words with stride
 * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
 * Details     : Store word from 'in0' to (pdst)
 *               Store word from 'in1' to (pdst + stride)
 *               Store word from 'in2' to (pdst + 2 * stride)
 *               Store word from 'in3' to (pdst + 3 * stride)
 *               'pdst' is converted to a byte pointer first, so 'stride' is
 *               counted in bytes.
 */
#define SW4(in0, in1, in2, in3, pdst, stride) do {  \
  uint8_t* ptmp_sw4_m = (uint8_t*)(pdst);           \
  SW(in0, ptmp_sw4_m);                              \
  ptmp_sw4_m += (stride);                           \
  SW(in1, ptmp_sw4_m);                              \
  ptmp_sw4_m += (stride);                           \
  SW(in2, ptmp_sw4_m);                              \
  ptmp_sw4_m += (stride);                           \
  SW(in3, ptmp_sw4_m);                              \
} while (0)

/* Description : Store 3 words with stride
 * Arguments   : Inputs - in0, in1, in2, pdst, stride
 * Details     : Store word from 'in0' to (pdst)
 *               Store word from 'in1' to (pdst + stride)
 *               Store word from 'in2' to (pdst + 2 * stride)
 *               'pdst' is converted to a byte pointer first, so 'stride' is
 *               counted in bytes.
 */
#define SW3(in0, in1, in2, pdst, stride) do {  \
  uint8_t* ptmp_sw3_m = (uint8_t*)(pdst);      \
  SW(in0, ptmp_sw3_m);                         \
  ptmp_sw3_m += (stride);                      \
  SW(in1, ptmp_sw3_m);                         \
  ptmp_sw3_m += (stride);                      \
  SW(in2, ptmp_sw3_m);                         \
} while (0)

/* Description : Store 2 words with stride
 * Arguments   : Inputs - in0, in1, pdst, stride
 * Details     : Store word from 'in0' to (pdst)
 *               Store word from 'in1' to (pdst + stride)
 *               'pdst' is converted to a byte pointer first, so 'stride' is
 *               counted in bytes.
 */
#define SW2(in0, in1, pdst, stride) do {  \
  uint8_t* ptmp_sw2_m = (uint8_t*)(pdst); \
  SW(in0, ptmp_sw2_m);                    \
  ptmp_sw2_m += (stride);                 \
  SW(in1, ptmp_sw2_m);                    \
} while (0)

/* Description : Store 4 double words with stride
 * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
 * Details     : Store double word from 'in0' to (pdst)
 *               Store double word from 'in1' to (pdst + stride)
 *               Store double word from 'in2' to (pdst + 2 * stride)
 *               Store double word from 'in3' to (pdst + 3 * stride)
 *               'pdst' is converted to a byte pointer first, so 'stride' is
 *               counted in bytes.
 */
#define SD4(in0, in1, in2, in3, pdst, stride) do {  \
  uint8_t* ptmp_sd4_m = (uint8_t*)(pdst);           \
  SD(in0, ptmp_sd4_m);                              \
  ptmp_sd4_m += (stride);                           \
  SD(in1, ptmp_sd4_m);                              \
  ptmp_sd4_m += (stride);                           \
  SD(in2, ptmp_sd4_m);                              \
  ptmp_sd4_m += (stride);                           \
  SD(in3, ptmp_sd4_m);                              \
} while (0)

/* Description : Load vectors with 16 byte elements with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Load 16 byte elements in 'out0' from (psrc)
 *               Load 16 byte elements in 'out1' from (psrc + stride)
 *               The offset is added to 'psrc' before the cast to RTYPE*, so
 *               'stride' is scaled by the pointee type of 'psrc'.
 */
#define LD_B2(RTYPE, psrc, stride, out0, out1) do {  \
  out0 = LD_B(RTYPE, psrc);                          \
  out1 = LD_B(RTYPE, (psrc) + (stride));             \
} while (0)
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)

/* Description : Load 3 vectors with 16 byte elements with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1, out2
 *               Return Type - as per RTYPE
 * Details     : 'out0' and 'out1' are loaded with LD_B2 from (psrc) and
 *               (psrc + stride); 'out2' is loaded from (psrc + 2 * stride)
 */
#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) do {  \
  LD_B2(RTYPE, psrc, stride, out0, out1);                  \
  out2 = LD_B(RTYPE, (psrc) + 2 * (stride));               \
} while (0)
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)

/* Description : Load 4 vectors with 16 byte elements with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1, out2, out3
 *               Return Type - as per RTYPE
 * Details     : Two LD_B2 loads: the first from (psrc), the second from
 *               (psrc + 2 * stride), each covering two rows 'stride' apart
 */
#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) do {  \
  LD_B2(RTYPE, psrc, stride, out0, out1);                        \
  LD_B2(RTYPE, (psrc) + 2 * (stride), stride, out2, out3);       \
} while (0)
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)

/* Description : Load 8 vectors with 16 byte elements with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 *               Return Type - as per RTYPE
 * Details     : Two LD_B4 loads: the first from (psrc), the second from
 *               (psrc + 4 * stride)
 */
#define LD_B8(RTYPE, psrc, stride,                                      \
              out0, out1, out2, out3, out4, out5, out6, out7) do {      \
  LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3);                   \
  LD_B4(RTYPE, (psrc) + 4 * (stride), stride, out4, out5, out6, out7);  \
} while (0)
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)

/* Description : Load vectors with 8 halfword elements with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1
 * Details     : Load 8 halfword elements in 'out0' from (psrc)
 *               Load 8 halfword elements in 'out1' from (psrc + stride)
 *               The offset is added to 'psrc' before the cast to RTYPE*, so
 *               'stride' is scaled by the pointee type of 'psrc'.
 */
#define LD_H2(RTYPE, psrc, stride, out0, out1) do {  \
  out0 = LD_H(RTYPE, psrc);                          \
  out1 = LD_H(RTYPE, (psrc) + (stride));             \
} while (0)
#define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)

/* Description : Load vectors with 4 word elements with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1
 * Details     : Load 4 word elements in 'out0' from (psrc + 0 * stride)
 *               Load 4 word elements in 'out1' from (psrc + 1 * stride)
 *               The offset is added to 'psrc' before the cast to RTYPE*, so
 *               'stride' is scaled by the pointee type of 'psrc'.
 */
#define LD_W2(RTYPE, psrc, stride, out0, out1) do {  \
  out0 = LD_W(RTYPE, psrc);                          \
  out1 = LD_W(RTYPE, (psrc) + (stride));             \
} while (0)
#define LD_UW2(...) LD_W2(v4u32, __VA_ARGS__)
#define LD_SW2(...) LD_W2(v4i32, __VA_ARGS__)

/* Description : Load 3 vectors with 4 word elements with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1, out2
 * Details     : 'out0' and 'out1' are loaded with LD_W2 from (psrc) and
 *               (psrc + stride); 'out2' is loaded from (psrc + 2 * stride)
 */
#define LD_W3(RTYPE, psrc, stride, out0, out1, out2) do {  \
  LD_W2(RTYPE, psrc, stride, out0, out1);                  \
  out2 = LD_W(RTYPE, (psrc) + 2 * (stride));               \
} while (0)
#define LD_UW3(...) LD_W3(v4u32, __VA_ARGS__)
#define LD_SW3(...) LD_W3(v4i32, __VA_ARGS__)

/* Description : Load 4 vectors with 4 word elements with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1, out2, out3
 * Details     : Two LD_W2 loads: the first from (psrc), the second from
 *               (psrc + 2 * stride), each covering two rows 'stride' apart
 */
#define LD_W4(RTYPE, psrc, stride, out0, out1, out2, out3) do {  \
  LD_W2(RTYPE, psrc, stride, out0, out1);                        \
  LD_W2(RTYPE, (psrc) + 2 * (stride), stride, out2, out3);       \
} while (0)
#define LD_UW4(...) LD_W4(v4u32, __VA_ARGS__)
#define LD_SW4(...) LD_W4(v4i32, __VA_ARGS__)

/* Description : Store vectors of 16 byte elements with stride
 * Arguments   : Inputs - in0, in1, pdst, stride
 * Details     : Store 16 byte elements from 'in0' to (pdst)
 *               Store 16 byte elements from 'in1' to (pdst + stride)
 *               The offset is added to 'pdst' before the cast to RTYPE*, so
 *               'stride' is scaled by the pointee type of 'pdst'.
 */
#define ST_B2(RTYPE, in0, in1, pdst, stride) do {  \
  ST_B(RTYPE, in0, pdst);                          \
  ST_B(RTYPE, in1, (pdst) + (stride));             \
} while (0)
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)

/* Description : Store 4 vectors of 16 byte elements with stride
 * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
 * Details     : Two ST_B2 stores: 'in0', 'in1' starting at (pdst) and
 *               'in2', 'in3' starting at (pdst + 2 * stride)
 */
#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) do {  \
  ST_B2(RTYPE, in0, in1, pdst, stride);                      \
  ST_B2(RTYPE, in2, in3, (pdst) + 2 * (stride), stride);     \
} while (0)
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)

/* Description : Store 8 vectors of 16 byte elements with stride
 * Arguments   : Inputs - in0 .. in7, pdst, stride
 * Details     : Two ST_B4 stores: 'in0'..'in3' starting at (pdst) and
 *               'in4'..'in7' starting at (pdst + 4 * stride)
 */
#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
              pdst, stride) do {                                    \
  ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);                   \
  ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), stride);  \
} while (0)
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)

/* Description : Store vectors of 4 word elements with stride
 * Arguments   : Inputs - in0, in1, pdst, stride
 * Details     : Store 4 word elements from 'in0' to (pdst + 0 * stride)
 *               Store 4 word elements from 'in1' to (pdst + 1 * stride)
 *               The offset is added to 'pdst' before the cast to RTYPE*, so
 *               'stride' is scaled by the pointee type of 'pdst'.
 */
#define ST_W2(RTYPE, in0, in1, pdst, stride) do {  \
  ST_W(RTYPE, in0, pdst);                          \
  ST_W(RTYPE, in1, (pdst) + (stride));             \
} while (0)
#define ST_UW2(...) ST_W2(v4u32, __VA_ARGS__)
#define ST_SW2(...) ST_W2(v4i32, __VA_ARGS__)

/* Description : Store 3 vectors of 4 word elements with stride
 * Arguments   : Inputs - in0, in1, in2, pdst, stride
 * Details     : 'in0' and 'in1' are stored with ST_W2 at (pdst) and
 *               (pdst + stride); 'in2' is stored at (pdst + 2 * stride)
 */
#define ST_W3(RTYPE, in0, in1, in2, pdst, stride) do {  \
  ST_W2(RTYPE, in0, in1, pdst, stride);                 \
  ST_W(RTYPE, in2, (pdst) + 2 * (stride));              \
} while (0)
#define ST_UW3(...) ST_W3(v4u32, __VA_ARGS__)
#define ST_SW3(...) ST_W3(v4i32, __VA_ARGS__)

/* Description : Store 4 vectors of 4 word elements with stride
 * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
 * Details     : Two ST_W2 stores: 'in0', 'in1' starting at (pdst) and
 *               'in2', 'in3' starting at (pdst + 2 * stride)
 */
#define ST_W4(RTYPE, in0, in1, in2, in3, pdst, stride) do {  \
  ST_W2(RTYPE, in0, in1, pdst, stride);                      \
  ST_W2(RTYPE, in2, in3, (pdst) + 2 * (stride), stride);     \
} while (0)
#define ST_UW4(...) ST_W4(v4u32, __VA_ARGS__)
#define ST_SW4(...) ST_W4(v4i32, __VA_ARGS__)

/* Description : Store vectors of 8 halfword elements with stride
 * Arguments   : Inputs - in0, in1, pdst, stride
 * Details     : Store 8 halfword elements from 'in0' to (pdst)
 *               Store 8 halfword elements from 'in1' to (pdst + stride)
 *               The offset is added to 'pdst' before the cast to RTYPE*, so
 *               'stride' is scaled by the pointee type of 'pdst'.
 */
#define ST_H2(RTYPE, in0, in1, pdst, stride) do {  \
  ST_H(RTYPE, in0, pdst);                          \
  ST_H(RTYPE, in1, (pdst) + (stride));             \
} while (0)
#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)

/* Description : Store 2x4 byte block to destination memory from input vector
 * Arguments   : Inputs - in, stidx, pdst, stride
 * Details     : Index 'stidx' halfword element from 'in' vector is copied to
 *               the GP register and stored to (pdst)
 *               Index 'stidx+1' halfword element from 'in' vector is copied to
 *               the GP register and stored to (pdst + stride)
 *               Index 'stidx+2' halfword element from 'in' vector is copied to
 *               the GP register and stored to (pdst + 2 * stride)
 *               Index 'stidx+3' halfword element from 'in' vector is copied to
 *               the GP register and stored to (pdst + 3 * stride)
 *               'pdst' is converted to a byte pointer first, so 'stride' is
 *               counted in bytes.
 */
#define ST2x4_UB(in, stidx, pdst, stride) do {                       \
  uint8_t* pblk_2x4_m = (uint8_t*)(pdst);                            \
  const uint16_t out0_m = __msa_copy_s_h((v8i16)(in), (stidx));      \
  const uint16_t out1_m = __msa_copy_s_h((v8i16)(in), (stidx) + 1);  \
  const uint16_t out2_m = __msa_copy_s_h((v8i16)(in), (stidx) + 2);  \
  const uint16_t out3_m = __msa_copy_s_h((v8i16)(in), (stidx) + 3);  \
  SH(out0_m, pblk_2x4_m);                                            \
  pblk_2x4_m += (stride);                                            \
  SH(out1_m, pblk_2x4_m);                                            \
  pblk_2x4_m += (stride);                                            \
  SH(out2_m, pblk_2x4_m);                                            \
  pblk_2x4_m += (stride);                                            \
  SH(out3_m, pblk_2x4_m);                                            \
} while (0)

/* Description : Store 4x4 byte block to destination memory from input vectors
 * Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
 * Details     : 'idx0' word element from input vector 'in0' is copied to the
 *               GP register and stored to (pdst)
 *               'idx1' word element from input vector 'in0' is copied to the
 *               GP register and stored to (pdst + stride)
 *               'idx2' word element from input vector 'in1' is copied to the
 *               GP register and stored to (pdst + 2 * stride)
 *               'idx3' word element from input vector 'in1' is copied to the
 *               GP register and stored to (pdst + 3 * stride)
 */
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) do {  \
  uint8_t* const pblk_4x4_m = (uint8_t*)(pdst);                        \
  const uint32_t out0_m = __msa_copy_s_w((v4i32)(in0), idx0);          \
  const uint32_t out1_m = __msa_copy_s_w((v4i32)(in0), idx1);          \
  const uint32_t out2_m = __msa_copy_s_w((v4i32)(in1), idx2);          \
  const uint32_t out3_m = __msa_copy_s_w((v4i32)(in1), idx3);          \
  SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);             \
} while (0)

/* Description : Store 4x8 byte block to destination memory from input vectors
 * Arguments   : Inputs - in0, in1, pdst, stride
 * Details     : Word elements 0..3 of 'in0' are stored with ST4x4_UB to the
 *               four rows starting at (pdst); word elements 0..3 of 'in1'
 *               are stored to the four rows starting at (pdst + 4 * stride)
 */
#define ST4x8_UB(in0, in1, pdst, stride) do {                       \
  uint8_t* const pblk_4x8 = (uint8_t*)(pdst);                       \
  ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);                 \
  ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * (stride), stride);  \
} while (0)

/* Description : Immediate number of elements to slide
 * Arguments   : Inputs  - in0, in1, slide_val
 *               Outputs - out
 *               Return Type - as per RTYPE
 * Details     : Byte elements from 'in1' vector are slid into 'in0' by
 *               value specified in the 'slide_val'
 */
#define SLDI_B(RTYPE, in0, in1, slide_val)                            \
        ((RTYPE)__msa_sldi_b((v16i8)(in0), (v16i8)(in1), slide_val))

#define SLDI_UB(...) SLDI_B(v16u8, __VA_ARGS__)
#define SLDI_SB(...) SLDI_B(v16i8, __VA_ARGS__)
#define SLDI_SH(...) SLDI_B(v8i16, __VA_ARGS__)

/* Description : Shuffle byte vector elements as per mask vector
 * Arguments   : Inputs  - in0, in1, mask
 *               Return Type - as per RTYPE
 * Details     : Byte elements from 'in0' & 'in1' are copied selectively to
 *               the result as per control vector 'mask'. The operands are
 *               passed to __msa_vshf_b in the order (mask, in1, in0).
 */
#define VSHF_B(RTYPE, in0, in1, mask)                                    \
        ((RTYPE)__msa_vshf_b((v16i8)(mask), (v16i8)(in1), (v16i8)(in0)))

#define VSHF_UB(...) VSHF_B(v16u8, __VA_ARGS__)
#define VSHF_SB(...) VSHF_B(v16i8, __VA_ARGS__)
#define VSHF_UH(...) VSHF_B(v8u16, __VA_ARGS__)
#define VSHF_SH(...) VSHF_B(v8i16, __VA_ARGS__)

/* Description : Shuffle two pairs of byte vectors as per mask vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : 'out0' is VSHF_B of 'in0' & 'in1' with 'mask0'
 *               'out1' is VSHF_B of 'in2' & 'in3' with 'mask1'
 */
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) do {  \
  out0 = VSHF_B(RTYPE, in0, in1, mask0);                                   \
  out1 = VSHF_B(RTYPE, in2, in3, mask1);                                   \
} while (0)
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

/* Description : Shuffle halfword vector elements as per mask vector
 * Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : halfword elements from 'in0' & 'in1' are copied selectively to
 *               'out0' as per control vector 'mask0'; likewise 'in2' & 'in3'
 *               to 'out1' as per 'mask1'
 */
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) do {    \
  out0 = (RTYPE)__msa_vshf_h((v8i16)(mask0), (v8i16)(in1), (v8i16)(in0));    \
  out1 = (RTYPE)__msa_vshf_h((v8i16)(mask1), (v8i16)(in3), (v8i16)(in2));    \
} while (0)
#define VSHF_H2_UH(...) VSHF_H2(v8u16, __VA_ARGS__)
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)

/* Description : Dot product of byte vector elements
 * Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Signed byte elements from 'mult0' are multiplied with
 *               signed byte elements from 'cnst0' producing a result
 *               twice the size of input i.e. signed halfword.
 *               The multiplication result of adjacent odd-even elements
 *               are added together and written to the 'out0' vector.
 *               'out1' is computed the same way from 'mult1' and 'cnst1'.
 */
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do {  \
  out0 = (RTYPE)__msa_dotp_s_h((v16i8)(mult0), (v16i8)(cnst0));       \
  out1 = (RTYPE)__msa_dotp_s_h((v16i8)(mult1), (v16i8)(cnst1));       \
} while (0)
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

/* Description : Dot product of halfword vector elements
 * Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Signed halfword elements from 'mult0' are multiplied with
 *               signed halfword elements from 'cnst0' producing a result
 *               twice the size of input i.e. signed word.
 *               The multiplication result of adjacent odd-even elements
 *               are added together and written to the 'out0' vector.
 *               'out1' is computed the same way from 'mult1' and 'cnst1'.
 */
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do {  \
  out0 = (RTYPE)__msa_dotp_s_w((v8i16)(mult0), (v8i16)(cnst0));       \
  out1 = (RTYPE)__msa_dotp_s_w((v8i16)(mult1), (v8i16)(cnst1));       \
} while (0)
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

/* Description : Dot product of unsigned word vector elements
 * Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Unsigned word elements from 'mult0' are multiplied with
 *               unsigned word elements from 'cnst0' producing a result
 *               twice the size of input i.e. unsigned double word.
 *               The multiplication result of adjacent odd-even elements
 *               are added together and written to the 'out0' vector.
 *               'out1' is computed the same way from 'mult1' and 'cnst1'.
 */
#define DOTP_UW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do {  \
  out0 = (RTYPE)__msa_dotp_u_d((v4u32)(mult0), (v4u32)(cnst0));       \
  out1 = (RTYPE)__msa_dotp_u_d((v4u32)(mult1), (v4u32)(cnst1));       \
} while (0)
#define DOTP_UW2_UD(...) DOTP_UW2(v2u64, __VA_ARGS__)

/* Description : Dot product & addition of halfword vector elements
 * Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
 *               Inputs/Outputs - out0, out1 (accumulators)
 *               Return Type - as per RTYPE
 * Details     : Signed halfword elements from 'mult0' are multiplied with
 *               signed halfword elements from 'cnst0' producing a result
 *               twice the size of input i.e. signed word.
 *               The multiplication result of adjacent odd-even elements
 *               are added to the 'out0' vector.
 *               'out1' is updated the same way from 'mult1' and 'cnst1'.
 */
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do {          \
  out0 = (RTYPE)__msa_dpadd_s_w((v4i32)(out0), (v8i16)(mult0), (v8i16)(cnst0));  \
  out1 = (RTYPE)__msa_dpadd_s_w((v4i32)(out1), (v8i16)(mult1), (v8i16)(cnst1));  \
} while (0)
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

/* Description : Clips all signed halfword elements of input vector
 *               between 0 & 255
 * Arguments   : Input/output  - val
 *               Return Type - signed halfword
 * Details     : 'val' is first raised to at least 0, then limited to at most
 *               255, and the result is written back to 'val'.
 */
#define CLIP_SH_0_255(val) do {                   \
  const v8i16 max_m = __msa_ldi_h(255);           \
  val = __msa_maxi_s_h((v8i16)(val), 0);          \
  val = __msa_min_s_h(max_m, (v8i16)(val));       \
} while (0)

/* Applies CLIP_SH_0_255 to each of 'in0' and 'in1' in place. */
#define CLIP_SH2_0_255(in0, in1) do {  \
  CLIP_SH_0_255(in0);                  \
  CLIP_SH_0_255(in1);                  \
} while (0)

#define CLIP_SH4_0_255(in0, in1, in2, in3) do {  \
Packit 9c6abc
  CLIP_SH2_0_255(in0, in1);                      \
Packit 9c6abc
  CLIP_SH2_0_255(in2, in3);                      \
Packit 9c6abc
} while (0)
Packit 9c6abc
Packit 9c6abc
/* Description : Clips all unsigned halfword elements of input vector
 *               between 0 & 255
 * Arguments   : Input/output  - in (clipped in place, must be an lvalue)
 *               Return Type - unsigned halfword
 * Details     : Elements are unsigned, so they can never be below 0 and only
 *               the upper bound has to be enforced: every element is limited
 *               to 255 with an unsigned minimum.
 */
#define CLIP_UH_0_255(in) do {                    \
  const v8u16 max_m = (v8u16)__msa_ldi_h(255);    \
  in = __msa_min_u_h(max_m, (v8u16)(in));         \
} while (0)

/* Same clip applied in place to two unsigned halfword vectors. */
#define CLIP_UH2_0_255(in0, in1) do {  \
  CLIP_UH_0_255(in0);                  \
  CLIP_UH_0_255(in1);                  \
} while (0)
Packit 9c6abc
Packit 9c6abc
/* Description : Clips all signed word elements of input vector
 *               between 0 & 255
 * Arguments   : Input/output  - val (clipped in place, must be an lvalue)
 *               Return Type - signed word
 * Details     : Negative elements are first raised to 0, then every element
 *               is limited to 255.
 */
#define CLIP_SW_0_255(val) do {                   \
  const v4i32 max_m = __msa_ldi_w(255);           \
  val = __msa_maxi_s_w((v4i32)(val), 0);          \
  val = __msa_min_s_w(max_m, (v4i32)(val));       \
} while (0)

/* Same clip applied in place to four word vectors. */
#define CLIP_SW4_0_255(in0, in1, in2, in3) do {   \
  CLIP_SW_0_255(in0);                             \
  CLIP_SW_0_255(in1);                             \
  CLIP_SW_0_255(in2);                             \
  CLIP_SW_0_255(in3);                             \
} while (0)
Packit 9c6abc
Packit 9c6abc
/* Description : Horizontal addition of 4 signed word elements of input vector
Packit 9c6abc
 * Arguments   : Input  - in       (signed word vector)
Packit 9c6abc
 *               Output - sum_m    (i32 sum)
Packit 9c6abc
 *               Return Type - signed word (GP)
Packit 9c6abc
 * Details     : 4 signed word elements of 'in' vector are added together and
Packit 9c6abc
 *               the resulting integer sum is returned
Packit 9c6abc
 */
Packit 9c6abc
static WEBP_INLINE int32_t func_hadd_sw_s32(v4i32 in) {
Packit 9c6abc
  const v2i64 res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in);
Packit 9c6abc
  const v2i64 res1_m = __msa_splati_d(res0_m, 1);
Packit 9c6abc
  const v2i64 out = res0_m + res1_m;
Packit 9c6abc
  int32_t sum_m = __msa_copy_s_w((v4i32)out, 0);
Packit 9c6abc
  return sum_m;
Packit 9c6abc
}
Packit 9c6abc
#define HADD_SW_S32(in) func_hadd_sw_s32(in)
Packit 9c6abc
Packit 9c6abc
/* Description : Horizontal addition of 8 signed halfword elements
Packit 9c6abc
 * Arguments   : Input  - in       (signed halfword vector)
Packit 9c6abc
 *               Output - sum_m    (s32 sum)
Packit 9c6abc
 *               Return Type - signed word
Packit 9c6abc
 * Details     : 8 signed halfword elements of input vector are added
Packit 9c6abc
 *               together and the resulting integer sum is returned
Packit 9c6abc
 */
Packit 9c6abc
static WEBP_INLINE int32_t func_hadd_sh_s32(v8i16 in) {
Packit 9c6abc
  const v4i32 res = __msa_hadd_s_w(in, in);
Packit 9c6abc
  const v2i64 res0 = __msa_hadd_s_d(res, res);
Packit 9c6abc
  const v2i64 res1 = __msa_splati_d(res0, 1);
Packit 9c6abc
  const v2i64 res2 = res0 + res1;
Packit 9c6abc
  const int32_t sum_m = __msa_copy_s_w((v4i32)res2, 0);
Packit 9c6abc
  return sum_m;
Packit 9c6abc
}
Packit 9c6abc
#define HADD_SH_S32(in) func_hadd_sh_s32(in)
Packit 9c6abc
Packit 9c6abc
/* Description : Horizontal addition of 8 unsigned halfword elements
Packit 9c6abc
 * Arguments   : Input  - in       (unsigned halfword vector)
Packit 9c6abc
 *               Output - sum_m    (u32 sum)
Packit 9c6abc
 *               Return Type - unsigned word
Packit 9c6abc
 * Details     : 8 unsigned halfword elements of input vector are added
Packit 9c6abc
 *               together and the resulting integer sum is returned
Packit 9c6abc
 */
Packit 9c6abc
static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
Packit 9c6abc
  uint32_t sum_m;
Packit 9c6abc
  const v4u32 res_m = __msa_hadd_u_w(in, in);
Packit 9c6abc
  v2u64 res0_m = __msa_hadd_u_d(res_m, res_m);
Packit 9c6abc
  v2u64 res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1);
Packit 9c6abc
  res0_m = res0_m + res1_m;
Packit 9c6abc
  sum_m = __msa_copy_s_w((v4i32)res0_m, 0);
Packit 9c6abc
  return sum_m;
Packit 9c6abc
}
Packit 9c6abc
#define HADD_UH_U32(in) func_hadd_uh_u32(in)
Packit 9c6abc
Packit 9c6abc
/* Description : Horizontal addition of signed half word vector elements
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Each signed odd half word element from 'in0' is added to
 *               even signed half word element from 'in0' (pairwise) and the
 *               word result is written in 'out0'.
 *               'out1' is computed the same way from 'in1'.
 *               Each input appears twice in the expansion, so it should be
 *               free of side effects.
 */
#define HADD_SH2(RTYPE, in0, in1, out0, out1) do {           \
  out0 = (RTYPE)__msa_hadd_s_w((v8i16)(in0), (v8i16)(in0));  \
  out1 = (RTYPE)__msa_hadd_s_w((v8i16)(in1), (v8i16)(in1));  \
} while (0)
#define HADD_SH2_SW(...) HADD_SH2(v4i32, __VA_ARGS__)

/* Four-vector variant: in0..in3 are reduced into out0..out3. */
#define HADD_SH4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) do {  \
  HADD_SH2(RTYPE, in0, in1, out0, out1);                                  \
  HADD_SH2(RTYPE, in2, in3, out2, out3);                                  \
} while (0)
#define HADD_SH4_SW(...) HADD_SH4(v4i32, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Horizontal subtraction of unsigned byte vector elements
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Each even unsigned byte element of 'in0' is subtracted from
 *               the adjacent odd unsigned byte element of 'in0' (pairwise)
 *               and the halfword result is written to 'out0'.
 *               'out1' is computed the same way from 'in1'.
 *               Each input appears twice in the expansion, so it should be
 *               free of side effects.
 */
#define HSUB_UB2(RTYPE, in0, in1, out0, out1) do {           \
  out0 = (RTYPE)__msa_hsub_u_h((v16u8)(in0), (v16u8)(in0));  \
  out1 = (RTYPE)__msa_hsub_u_h((v16u8)(in1), (v16u8)(in1));  \
} while (0)
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
#define HSUB_UB2_SW(...) HSUB_UB2(v4i32, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Set word elements of a vector to GPR values
 * Arguments   : Inputs - in0, in1 (INSERT_W2) / in0, in1, in2, in3 (INSERT_W4)
 *               In/out - out (must be an lvalue)
 *               Return Type - as per RTYPE
 * Details     : Word element 0 of 'out' is set to 'in0', element 1 to 'in1'
 *               and, for INSERT_W4, element 2 to 'in2' and element 3 to
 *               'in3'. 'out' is read back for every insert, so INSERT_W2
 *               expects it to already hold a value for elements 2 and 3.
 */
#define INSERT_W2(RTYPE, in0, in1, out) do {            \
  out = (RTYPE)__msa_insert_w((v4i32)(out), 0, (in0));  \
  out = (RTYPE)__msa_insert_w((v4i32)(out), 1, (in1));  \
} while (0)
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) do {  \
  out = (RTYPE)__msa_insert_w((v4i32)(out), 0, (in0));  \
  out = (RTYPE)__msa_insert_w((v4i32)(out), 1, (in1));  \
  out = (RTYPE)__msa_insert_w((v4i32)(out), 2, (in2));  \
  out = (RTYPE)__msa_insert_w((v4i32)(out), 3, (in3));  \
} while (0)
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Set element n of double word input vector to GPR value
 * Arguments   : Inputs - in0, in1
 *               In/out - out (must be an lvalue)
 *               Return Type - as per RTYPE
 * Details     : Set element 0 in vector 'out' to GPR value specified in 'in0'
 *               Set element 1 in vector 'out' to GPR value specified in 'in1'
 */
#define INSERT_D2(RTYPE, in0, in1, out) do {            \
  out = (RTYPE)__msa_insert_d((v2i64)(out), 0, (in0));  \
  out = (RTYPE)__msa_insert_d((v2i64)(out), 1, (in1));  \
} while (0)
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Interleave even byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even byte elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'.
 *               Even byte elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'.
 *               Inputs are parenthesized before the cast so expression
 *               arguments are converted as a whole.
 */
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvev_b((v16i8)(in1), (v16i8)(in0));    \
  out1 = (RTYPE)__msa_ilvev_b((v16i8)(in3), (v16i8)(in2));    \
} while (0)
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_UH(...) ILVEV_B2(v8u16, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Interleave odd byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Odd byte elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'.
 *               Odd byte elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'.
 *               Inputs are parenthesized before the cast so expression
 *               arguments are converted as a whole.
 */
#define ILVOD_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvod_b((v16i8)(in1), (v16i8)(in0));    \
  out1 = (RTYPE)__msa_ilvod_b((v16i8)(in3), (v16i8)(in2));    \
} while (0)
#define ILVOD_B2_UB(...) ILVOD_B2(v16u8, __VA_ARGS__)
#define ILVOD_B2_SB(...) ILVOD_B2(v16i8, __VA_ARGS__)
#define ILVOD_B2_UH(...) ILVOD_B2(v8u16, __VA_ARGS__)
#define ILVOD_B2_SH(...) ILVOD_B2(v8i16, __VA_ARGS__)
#define ILVOD_B2_SD(...) ILVOD_B2(v2i64, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Interleave even halfword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even halfword elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'.
 *               Even halfword elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'.
 *               Inputs are parenthesized before the cast so expression
 *               arguments are converted as a whole.
 */
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvev_h((v8i16)(in1), (v8i16)(in0));    \
  out1 = (RTYPE)__msa_ilvev_h((v8i16)(in3), (v8i16)(in2));    \
} while (0)
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_UH(...) ILVEV_H2(v8u16, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Interleave odd halfword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Odd halfword elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'.
 *               Odd halfword elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'.
 *               Inputs are parenthesized before the cast so expression
 *               arguments are converted as a whole.
 */
#define ILVOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvod_h((v8i16)(in1), (v8i16)(in0));    \
  out1 = (RTYPE)__msa_ilvod_h((v8i16)(in3), (v8i16)(in2));    \
} while (0)
#define ILVOD_H2_UB(...) ILVOD_H2(v16u8, __VA_ARGS__)
#define ILVOD_H2_UH(...) ILVOD_H2(v8u16, __VA_ARGS__)
#define ILVOD_H2_SH(...) ILVOD_H2(v8i16, __VA_ARGS__)
#define ILVOD_H2_SW(...) ILVOD_H2(v4i32, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Interleave even word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even word elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'.
 *               Even word elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'.
 *               Inputs are parenthesized before the cast so expression
 *               arguments are converted as a whole.
 */
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvev_w((v4i32)(in1), (v4i32)(in0));    \
  out1 = (RTYPE)__msa_ilvev_w((v4i32)(in3), (v4i32)(in2));    \
} while (0)
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Interleave even-odd word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even word elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'
 *               Odd word elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'
 *               Inputs are parenthesized before the cast so expression
 *               arguments are converted as a whole.
 */
#define ILVEVOD_W2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvev_w((v4i32)(in1), (v4i32)(in0));      \
  out1 = (RTYPE)__msa_ilvod_w((v4i32)(in3), (v4i32)(in2));      \
} while (0)
#define ILVEVOD_W2_UB(...) ILVEVOD_W2(v16u8, __VA_ARGS__)
#define ILVEVOD_W2_UH(...) ILVEVOD_W2(v8u16, __VA_ARGS__)
#define ILVEVOD_W2_SH(...) ILVEVOD_W2(v8i16, __VA_ARGS__)
#define ILVEVOD_W2_SW(...) ILVEVOD_W2(v4i32, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Interleave even-odd half-word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even half-word elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'
 *               Odd half-word elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'
 *               Inputs are parenthesized before the cast so expression
 *               arguments are converted as a whole.
 */
#define ILVEVOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvev_h((v8i16)(in1), (v8i16)(in0));      \
  out1 = (RTYPE)__msa_ilvod_h((v8i16)(in3), (v8i16)(in2));      \
} while (0)
#define ILVEVOD_H2_UB(...) ILVEVOD_H2(v16u8, __VA_ARGS__)
#define ILVEVOD_H2_UH(...) ILVEVOD_H2(v8u16, __VA_ARGS__)
#define ILVEVOD_H2_SH(...) ILVEVOD_H2(v8i16, __VA_ARGS__)
#define ILVEVOD_H2_SW(...) ILVEVOD_H2(v4i32, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Interleave even double word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even double word elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'.
 *               Even double word elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'.
 *               Inputs are parenthesized before the cast so expression
 *               arguments are converted as a whole.
 */
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvev_d((v2i64)(in1), (v2i64)(in0));    \
  out1 = (RTYPE)__msa_ilvev_d((v2i64)(in3), (v2i64)(in2));    \
} while (0)
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
#define ILVEV_D2_SD(...) ILVEV_D2(v2i64, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Interleave left half of byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'.
 *               Left half of byte elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'.
 *               Inputs are parenthesized before the cast so expression
 *               arguments are converted as a whole.
 */
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1));    \
  out1 = (RTYPE)__msa_ilvl_b((v16i8)(in2), (v16i8)(in3));    \
} while (0)
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
#define ILVL_B2_SW(...) ILVL_B2(v4i32, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Interleave right half of byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
 *               and written to out0.
 *               Right half of byte elements of 'in2' and 'in3' are interleaved
 *               and written to out1.
 *               Inputs are parenthesized before the cast so expression
 *               arguments are converted as a whole.
 */
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1));    \
  out1 = (RTYPE)__msa_ilvr_b((v16i8)(in2), (v16i8)(in3));    \
} while (0)
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)

/* Four-pair variant: (in0,in1) .. (in6,in7) produce out0 .. out3. */
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) do {                    \
  ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
} while (0)
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Interleave right half of halfword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Right half of halfword elements of 'in0' and 'in1' are
 *               interleaved and written to 'out0'.
 *               Right half of halfword elements of 'in2' and 'in3' are
 *               interleaved and written to 'out1'.
 *               Inputs are parenthesized before the cast so expression
 *               arguments are converted as a whole.
 */
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1));    \
  out1 = (RTYPE)__msa_ilvr_h((v8i16)(in2), (v8i16)(in3));    \
} while (0)
#define ILVR_H2_UB(...) ILVR_H2(v16u8, __VA_ARGS__)
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

/* Four-pair variant: (in0,in1) .. (in6,in7) produce out0 .. out3. */
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) do {                    \
  ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);               \
} while (0)
#define ILVR_H4_UB(...) ILVR_H4(v16u8, __VA_ARGS__)
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Interleave right half of double word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Right half of double word elements of 'in0' and 'in1' are
 *               interleaved and written to 'out0'.
 *               Right half of double word elements of 'in2' and 'in3' are
 *               interleaved and written to 'out1'.
 *               Inputs are parenthesized before the cast so expression
 *               arguments are converted as a whole.
 */
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1));    \
  out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3));    \
} while (0)
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

/* Four-pair variant: (in0,in1) .. (in6,in7) produce out0 .. out3. */
#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) do {                    \
  ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);               \
} while (0)
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Interleave both left and right half of input vectors
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Right half of byte elements from 'in0' and 'in1' are
 *               interleaved and written to 'out0'.
 *               Left half of byte elements from 'in0' and 'in1' are
 *               interleaved and written to 'out1'.
 *               ILVRL_H2 and ILVRL_W2 do the same on halfword and word
 *               elements respectively.
 *               Each input appears twice in the expansion, so it should be
 *               free of side effects; inputs are parenthesized before the
 *               cast so expression arguments are converted as a whole.
 */
#define ILVRL_B2(RTYPE, in0, in1, out0, out1) do {         \
  out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1));  \
  out1 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1));  \
} while (0)
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)

#define ILVRL_H2(RTYPE, in0, in1, out0, out1) do {         \
  out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1));  \
  out1 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1));  \
} while (0)
#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
#define ILVRL_H2_UW(...) ILVRL_H2(v4u32, __VA_ARGS__)

#define ILVRL_W2(RTYPE, in0, in1, out0, out1) do {         \
  out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1));  \
  out1 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1));  \
} while (0)
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
#define ILVRL_W2_UW(...) ILVRL_W2(v4u32, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Pack even byte elements of vector pairs
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even byte elements of 'in0' are copied to the left half of
 *               'out0' & even byte elements of 'in1' are copied to the right
 *               half of 'out0'.
 *               'out1' is packed the same way from 'in2' and 'in3'.
 *               Inputs are parenthesized before the cast so expression
 *               arguments are converted as a whole.
 */
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_pckev_b((v16i8)(in0), (v16i8)(in1));    \
  out1 = (RTYPE)__msa_pckev_b((v16i8)(in2), (v16i8)(in3));    \
} while (0)
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)

/* Four-pair variant: (in0,in1) .. (in6,in7) produce out0 .. out3. */
#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3) do {                    \
  PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
} while (0)
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Pack even halfword elements of vector pairs
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even halfword elements of 'in0' are copied to the left half of
 *               'out0' & even halfword elements of 'in1' are copied to the
 *               right half of 'out0'.
 *               'out1' is packed the same way from 'in2' and 'in3'.
 *               Inputs are parenthesized before the cast so expression
 *               arguments are converted as a whole.
 */
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_pckev_h((v8i16)(in0), (v8i16)(in1));    \
  out1 = (RTYPE)__msa_pckev_h((v8i16)(in2), (v8i16)(in3));    \
} while (0)
#define PCKEV_H2_UH(...) PCKEV_H2(v8u16, __VA_ARGS__)
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
#define PCKEV_H2_UW(...) PCKEV_H2(v4u32, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Pack even word elements of vector pairs
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even word elements of 'in0' are copied to the left half of
 *               'out0' & even word elements of 'in1' are copied to the
 *               right half of 'out0'.
 *               'out1' is packed the same way from 'in2' and 'in3'.
 *               Inputs are parenthesized before the cast so expression
 *               arguments are converted as a whole.
 */
#define PCKEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_pckev_w((v4i32)(in0), (v4i32)(in1));    \
  out1 = (RTYPE)__msa_pckev_w((v4i32)(in2), (v4i32)(in3));    \
} while (0)
#define PCKEV_W2_UH(...) PCKEV_W2(v8u16, __VA_ARGS__)
#define PCKEV_W2_SH(...) PCKEV_W2(v8i16, __VA_ARGS__)
#define PCKEV_W2_SW(...) PCKEV_W2(v4i32, __VA_ARGS__)
#define PCKEV_W2_UW(...) PCKEV_W2(v4u32, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Pack odd halfword elements of vector pairs
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Odd halfword elements of 'in0' are copied to the left half of
 *               'out0' & odd halfword elements of 'in1' are copied to the
 *               right half of 'out0'.
 *               'out1' is packed the same way from 'in2' and 'in3'.
 *               Inputs are parenthesized before the cast so expression
 *               arguments are converted as a whole.
 */
#define PCKOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_pckod_h((v8i16)(in0), (v8i16)(in1));    \
  out1 = (RTYPE)__msa_pckod_h((v8i16)(in2), (v8i16)(in3));    \
} while (0)
#define PCKOD_H2_UH(...) PCKOD_H2(v8u16, __VA_ARGS__)
#define PCKOD_H2_SH(...) PCKOD_H2(v8i16, __VA_ARGS__)
#define PCKOD_H2_SW(...) PCKOD_H2(v4i32, __VA_ARGS__)
#define PCKOD_H2_UW(...) PCKOD_H2(v4u32, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Arithmetic immediate shift right all elements of word vector
 * Arguments   : Inputs  - in0, in1, shift_val
 *               Outputs - in place operation
 *               Return Type - as per input vector RTYPE
 * Details     : Each element of vector 'in0' is right shifted by 'shift_val'
 *               and the result is written in-place; 'in1' is handled the
 *               same way. The work is delegated to SRAI_W, which is either
 *               an intrinsic call or a plain '>>' expression depending on
 *               the compiler, so both arguments are forwarded parenthesized
 *               to keep an expression 'shift_val' intact.
 *               NOTE(review): the intrinsic-based SRAI_W presumably needs
 *               'shift_val' to be a compile-time constant - confirm.
 */
#define SRAI_W2(RTYPE, in0, in1, shift_val) do {  \
  in0 = (RTYPE)SRAI_W((in0), (shift_val));        \
  in1 = (RTYPE)SRAI_W((in1), (shift_val));        \
} while (0)
#define SRAI_W2_SW(...) SRAI_W2(v4i32, __VA_ARGS__)
#define SRAI_W2_UW(...) SRAI_W2(v4u32, __VA_ARGS__)

/* Four-vector variant: in0..in3 are shifted in place. */
#define SRAI_W4(RTYPE, in0, in1, in2, in3, shift_val) do {  \
  SRAI_W2(RTYPE, in0, in1, shift_val);                      \
  SRAI_W2(RTYPE, in2, in3, shift_val);                      \
} while (0)
#define SRAI_W4_SW(...) SRAI_W4(v4i32, __VA_ARGS__)
#define SRAI_W4_UW(...) SRAI_W4(v4u32, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Arithmetic shift right all elements of half-word vector
 * Arguments   : Inputs  - in0, in1, shift_val
 *               Outputs - in place operation
 *               Return Type - as per input vector RTYPE
 * Details     : Each element of vector 'in0' is right shifted by 'shift_val'
 *               and the result is written in-place; 'in1' is handled the
 *               same way. The work is delegated to SRAI_H, which is either
 *               an intrinsic call or a plain '>>' expression depending on
 *               the compiler, so both arguments are forwarded parenthesized
 *               to keep an expression 'shift_val' intact.
 *               NOTE(review): the intrinsic-based SRAI_H presumably needs
 *               'shift_val' to be a compile-time constant - confirm.
 */
#define SRAI_H2(RTYPE, in0, in1, shift_val) do {  \
  in0 = (RTYPE)SRAI_H((in0), (shift_val));        \
  in1 = (RTYPE)SRAI_H((in1), (shift_val));        \
} while (0)
#define SRAI_H2_SH(...) SRAI_H2(v8i16, __VA_ARGS__)
#define SRAI_H2_UH(...) SRAI_H2(v8u16, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Arithmetic rounded shift right all elements of word vector
 * Arguments   : Inputs  - in0, in1, shift
 *               Outputs - in place operation
 *               Return Type - as per input vector RTYPE
 * Details     : Each element of vector 'in0' is right shifted by 'shift' with
 *               rounding and the result is written in-place; 'in1' is
 *               handled the same way.
 *               'shift' is passed straight to the immediate-form intrinsic
 *               __msa_srari_w, so it has to be a compile-time constant.
 */
#define SRARI_W2(RTYPE, in0, in1, shift) do {         \
  in0 = (RTYPE)__msa_srari_w((v4i32)(in0), (shift));  \
  in1 = (RTYPE)__msa_srari_w((v4i32)(in1), (shift));  \
} while (0)
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

/* Four-vector variant: in0..in3 are shifted in place. */
#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) do {  \
  SRARI_W2(RTYPE, in0, in1, shift);                      \
  SRARI_W2(RTYPE, in2, in3, shift);                      \
} while (0)
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_UW(...) SRARI_W4(v4u32, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Shift right arithmetic rounded double words
 * Arguments   : Inputs  - in0, in1, shift
 *               Outputs - in place operation
 *               Return Type - as per RTYPE
 * Details     : Each element of vectors 'in0' and 'in1' is shifted right
 *               arithmetically by the number of bits in the corresponding
 *               element in the vector 'shift'. The last discarded bit is
 *               added to shifted value for rounding and the result is
 *               written in-place. 'shift' is a vector.
 *               Arguments are parenthesized so that the (v2i64) casts apply
 *               to the whole argument expression.
 */
#define SRAR_D2(RTYPE, in0, in1, shift) do {                \
  in0 = (RTYPE)__msa_srar_d((v2i64)(in0), (v2i64)(shift));  \
  in1 = (RTYPE)__msa_srar_d((v2i64)(in1), (v2i64)(shift));  \
} while (0)
#define SRAR_D2_SW(...) SRAR_D2(v4i32, __VA_ARGS__)
#define SRAR_D2_SD(...) SRAR_D2(v2i64, __VA_ARGS__)
#define SRAR_D2_UD(...) SRAR_D2(v2u64, __VA_ARGS__)

/* Four-vector variant: applies SRAR_D2 to (in0, in1) and then (in2, in3). */
#define SRAR_D4(RTYPE, in0, in1, in2, in3, shift) do {  \
  SRAR_D2(RTYPE, in0, in1, shift);                      \
  SRAR_D2(RTYPE, in2, in3, shift);                      \
} while (0)
#define SRAR_D4_SD(...) SRAR_D4(v2i64, __VA_ARGS__)
#define SRAR_D4_UD(...) SRAR_D4(v2u64, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Addition of 2 pairs of half-word vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : 'in1' is added to each element of 'in0' and the result is
 *               written to 'out0'; 'in3' is added to each element of 'in2'
 *               and the result is written to 'out1'.
 *               NOTE(review): under CLANG_BUILD ADDVI_H expands to the
 *               __msa_addvi_h intrinsic, so 'in1'/'in3' presumably have to
 *               be compile-time constants there - confirm against callers.
 *               Arguments are parenthesized so that expression arguments
 *               keep their intended precedence inside ADDVI_H.
 */
#define ADDVI_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)ADDVI_H((in0), (in1));                        \
  out1 = (RTYPE)ADDVI_H((in2), (in3));                        \
} while (0)
#define ADDVI_H2_SH(...) ADDVI_H2(v8i16, __VA_ARGS__)
#define ADDVI_H2_UH(...) ADDVI_H2(v8u16, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Addition of 2 pairs of word vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : 'in1' is added to each element of 'in0' and the result is
 *               written to 'out0'; 'in3' is added to each element of 'in2'
 *               and the result is written to 'out1'.
 *               NOTE(review): under CLANG_BUILD ADDVI_W expands to the
 *               __msa_addvi_w intrinsic, so 'in1'/'in3' presumably have to
 *               be compile-time constants there - confirm against callers.
 *               Arguments are parenthesized so that expression arguments
 *               keep their intended precedence inside ADDVI_W.
 */
#define ADDVI_W2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)ADDVI_W((in0), (in1));                        \
  out1 = (RTYPE)ADDVI_W((in2), (in3));                        \
} while (0)
#define ADDVI_W2_SW(...) ADDVI_W2(v4i32, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Fill two word vectors from two GP registers
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : __msa_fill_w(in0) is cast to RTYPE and stored in 'out0';
 *               __msa_fill_w(in1) is cast to RTYPE and stored in 'out1'.
 */
#define FILL_W2(RTYPE, in0, in1, out0, out1)  \
  do {                                        \
    out0 = (RTYPE)__msa_fill_w(in0);          \
    out1 = (RTYPE)__msa_fill_w(in1);          \
  } while (0)
#define FILL_W2_SW(...) FILL_W2(v4i32, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Addition of 2 pairs of vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 * Details     : 'in0' + 'in1' is written to 'out0' and 'in2' + 'in3' is
 *               written to 'out1'.
 *               Inputs are parenthesized so that expression arguments
 *               (e.g. 'a << 1' or 'c ? x : y') are added as a whole.
 */
#define ADD2(in0, in1, in2, in3, out0, out1) do {  \
  out0 = (in0) + (in1);                            \
  out1 = (in2) + (in3);                            \
} while (0)

/* Four-pair variant: (in0, in1) -> out0, (in2, in3) -> out1,
 * (in4, in5) -> out2, (in6, in7) -> out3.
 */
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3) do {             \
  ADD2(in0, in1, in2, in3, out0, out1);               \
  ADD2(in4, in5, in6, in7, out2, out3);               \
} while (0)
Packit 9c6abc
Packit 9c6abc
/* Description : Subtraction of 2 pairs of vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 * Details     : 'in0' - 'in1' is written to 'out0' and 'in2' - 'in3' is
 *               written to 'out1'.
 *               Inputs are parenthesized: without that, a subtrahend such as
 *               'a - b' would expand to 'in0 - a - b' instead of
 *               'in0 - (a - b)'.
 */
#define SUB2(in0, in1, in2, in3, out0, out1) do {  \
  out0 = (in0) - (in1);                            \
  out1 = (in2) - (in3);                            \
} while (0)

/* Three-pair variant: (in0, in1) -> out0, (in2, in3) -> out1,
 * (in4, in5) -> out2.
 */
#define SUB3(in0, in1, in2, in3, in4, in5, out0, out1, out2) do {  \
  out0 = (in0) - (in1);                                            \
  out1 = (in2) - (in3);                                            \
  out2 = (in4) - (in5);                                            \
} while (0)

/* Four-pair variant, built from two SUB2 invocations (same evaluation order
 * as four consecutive subtractions).
 */
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3) do {             \
  SUB2(in0, in1, in2, in3, out0, out1);               \
  SUB2(in4, in5, in6, in7, out2, out3);               \
} while (0)
Packit 9c6abc
Packit 9c6abc
/* Description : Addition - Subtraction of input vectors
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 * Details     : 'in0' + 'in1' is written to 'out0', then 'in0' - 'in1' is
 *               written to 'out1'.
 *               Inputs are parenthesized so that expression arguments are
 *               combined as a whole. 'in0' and 'in1' are evaluated twice,
 *               and 'out0' is assigned before they are read the second time,
 *               so 'out0' must not name the same object as 'in0' or 'in1'.
 */
#define ADDSUB2(in0, in1, out0, out1) do {  \
  out0 = (in0) + (in1);                     \
  out1 = (in0) - (in1);                     \
} while (0)
Packit 9c6abc
Packit 9c6abc
/* Description : Multiplication of pairs of vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 * Details     : 'in0' * 'in1' is written to 'out0' and 'in2' * 'in3' is
 *               written to 'out1'.
 *               Inputs are parenthesized: without that, an argument such as
 *               'a + b' would expand to 'in0 * a + b' instead of
 *               'in0 * (a + b)'.
 */
#define MUL2(in0, in1, in2, in3, out0, out1) do {  \
  out0 = (in0) * (in1);                            \
  out1 = (in2) * (in3);                            \
} while (0)

/* Four-pair variant: (in0, in1) -> out0, (in2, in3) -> out1,
 * (in4, in5) -> out2, (in6, in7) -> out3.
 */
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3) do {             \
  MUL2(in0, in1, in2, in3, out0, out1);               \
  MUL2(in4, in5, in6, in7, out2, out3);               \
} while (0)
Packit 9c6abc
Packit 9c6abc
/* Description : Sign extend halfword elements from right half of the vector
 * Arguments   : Input  - in    (halfword vector)
 *               Output - out   (sign extended word vector)
 *               Return Type - signed word
 * Details     : Sign bit of halfword elements from input vector 'in' is
 *               extracted (compare 'in' < 0) and interleaved right with the
 *               same vector 'in' to generate 4 word elements keeping sign
 *               intact.
 *               'in' is parenthesized so that the (v8i16) cast applies to
 *               the whole argument expression; it is evaluated twice.
 */
#define UNPCK_R_SH_SW(in, out) do {                     \
  const v8i16 sign_m = __msa_clti_s_h((v8i16)(in), 0);  \
  out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)(in));       \
} while (0)
Packit 9c6abc
Packit 9c6abc
/* Description : Sign extend halfword elements from input vector and return
 *               the result in pair of vectors
 * Arguments   : Input   - in            (halfword vector)
 *               Outputs - out0, out1   (sign extended word vectors)
 *               Return Type - signed word
 * Details     : Sign bit of halfword elements from input vector 'in' is
 *               extracted (compare 'in' < 0) and handed together with 'in'
 *               to ILVRL_H2_SW, which interleaves right to generate the
 *               signed word elements of 'out0' and interleaves left to
 *               generate the signed word elements of 'out1'.
 *               'in' is parenthesized so that the (v8i16) cast applies to
 *               the whole argument expression.
 */
#define UNPCK_SH_SW(in, out0, out1) do {                \
  const v8i16 tmp_m = __msa_clti_s_h((v8i16)(in), 0);   \
  ILVRL_H2_SW(tmp_m, in, out0, out1);                   \
} while (0)
Packit 9c6abc
Packit 9c6abc
/* Description : Butterfly of 4 input vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 * Details     : out0 = in0 + in3, out1 = in1 + in2,
 *               out2 = in1 - in2, out3 = in0 - in3.
 *               Inputs are parenthesized so that expression arguments are
 *               combined as a whole. Each input is evaluated twice, and the
 *               outputs are assigned in order, so an output must not name
 *               an input that is still read by a later line.
 */
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) do {  \
  out0 = (in0) + (in3);                                               \
  out1 = (in1) + (in2);                                               \
  out2 = (in1) - (in2);                                               \
  out3 = (in0) - (in3);                                               \
} while (0)
Packit 9c6abc
Packit 9c6abc
/* Description : Transpose 16x4 block into 4x16 with byte elements in vectors
 * Arguments   : Inputs  - in0 .. in15 (sixteen input vectors)
 *               Outputs - out0, out1, out2, out3
 *               Return Type - unsigned byte
 * Details     : 'out1' and 'out3' first hold an intermediate result that is
 *               read back while producing 'out0'/'out2', and are only then
 *               overwritten with their final values.
 */
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3) do {                   \
  v2i64 tmp0_m, tmp1_m;                                                    \
  v2i64 tmp2_m, tmp3_m;                                                    \
  v2i64 tmp4_m, tmp5_m;                                                    \
  /* inputs 0/4/8/12 and 1/5/9/13 -> out1, out3 (intermediate) */          \
  ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m);                        \
  ILVEV_W2_SD(in0, in4, in8, in12, tmp2_m, tmp3_m);                        \
  ILVEV_D2_UB(tmp2_m, tmp3_m, tmp0_m, tmp1_m, out1, out3);                 \
  /* inputs 2/6/10/14 and 3/7/11/15 -> tmp2_m, tmp3_m */                   \
  ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m);                       \
  ILVEV_W2_SD(in2, in6, in10, in14, tmp4_m, tmp5_m);                       \
  ILVEV_D2_SD(tmp4_m, tmp5_m, tmp0_m, tmp1_m, tmp2_m, tmp3_m);             \
  /* even bytes -> out0, out2 */                                           \
  ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);                 \
  ILVEVOD_H2_UB(tmp0_m, tmp1_m, tmp0_m, tmp1_m, out0, out2);               \
  /* odd bytes -> out1, out3 (final) */                                    \
  ILVOD_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);                 \
  ILVEVOD_H2_UB(tmp0_m, tmp1_m, tmp0_m, tmp1_m, out1, out3);               \
} while (0)
Packit 9c6abc
Packit 9c6abc
/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
 * Arguments   : Inputs  - in0 .. in15 (sixteen input vectors)
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 *               Return Type - unsigned byte
 * Details     : All eight outputs are first used as scratch for the
 *               doubleword interleave of the inputs; their final values are
 *               written by the ILVEVOD_W2_UB steps at the end.
 */
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3, out4, out5,            \
                            out6, out7) do {                               \
  v4i32 tmp2_m, tmp3_m;                                                    \
  v8i16 tmp0_m, tmp1_m;                                                    \
  v8i16 tmp4_m, tmp5_m;                                                    \
  v8i16 tmp6_m, tmp7_m;                                                    \
  /* pair input k with input k + 8, stored in reverse output order */      \
  ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
  ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
  ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
  ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
  /* byte interleaves; out5/out7 are reused as scratch */                  \
  ILVOD_B2_SH(out7, out6, out5, out4, tmp4_m, tmp5_m);                     \
  ILVEV_B2_SH(out7, out6, out5, out4, tmp0_m, tmp1_m);                     \
  ILVEV_B2_UB(out3, out2, out1, out0, out5, out7);                         \
  ILVOD_B2_SH(out3, out2, out1, out0, tmp6_m, tmp7_m);                     \
  /* halfword + word interleaves produce the final outputs */              \
  ILVEV_H2_SW(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
  ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out0, out4);               \
  ILVOD_H2_SW(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
  ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out2, out6);               \
  ILVEV_H2_SW(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
  ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out1, out5);               \
  ILVOD_H2_SW(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
  ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out3, out7);               \
} while (0)
Packit 9c6abc
Packit 9c6abc
/* Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Return Type - as per RTYPE
 * Details     : Rows are first interleaved word-wise in pairs (in1 with in0,
 *               in3 with in2); the right/left doubleword interleaves of
 *               those intermediates are then cast to RTYPE and stored in
 *               out0..out3.
 */
#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3,                            \
                       out0, out1, out2, out3) do {                          \
  v4i32 s0_m, s1_m;                                                          \
  v4i32 s2_m, s3_m;                                                          \
  ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                         \
  ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                         \
  out0 = (RTYPE)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m);                      \
  out1 = (RTYPE)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m);                      \
  out2 = (RTYPE)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m);                      \
  out3 = (RTYPE)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m);                      \
} while (0)
#define TRANSPOSE4x4_SW_SW(...) TRANSPOSE4x4_W(v4i32, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
/* Description : Add block 4x4
 * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
 * Details     : Four words are loaded from 'pdst' (LW4, using 'stride'),
 *               inserted into two byte vectors and interleaved with zero.
 *               The inputs, combined pairwise by ILVR_D2_SH, are added to
 *               them; the sums go through CLIP_SH2_0_255, are packed back
 *               to bytes and stored to 'pdst' with ST4x4_UB.
 */
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) do {  \
  const v16i8 zero_m = { 0 };                                   \
  v16i8 dst0_m = { 0 };                                         \
  v16i8 dst1_m = { 0 };                                         \
  v8i16 inp0_m, inp1_m;                                         \
  v8i16 res0_m, res1_m;                                         \
  uint32_t src0_m, src1_m, src2_m, src3_m;                      \
  /* current destination pixels, one 32-bit word per row */     \
  LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m);            \
  ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m);               \
  INSERT_W2_SB(src0_m, src1_m, dst0_m);                         \
  INSERT_W2_SB(src2_m, src3_m, dst1_m);                         \
  /* interleave destination bytes with zero before the add */   \
  ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);   \
  ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);         \
  CLIP_SH2_0_255(res0_m, res1_m);                               \
  PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
  ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride);           \
} while (0)
Packit 9c6abc
Packit 9c6abc
/* Description : Pack even byte elements of two input pairs, then store the
 *               words at index 0 and 2 of each packed result (4 words in
 *               total) in destination memory as per stride
 * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
 * Details     : (in1, in0) is packed into the first temporary and
 *               (in3, in2) into the second; ST4x4_UB performs the store.
 */
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride)  \
  do {                                                    \
    v16i8 tmp0_m;                                         \
    v16i8 tmp1_m;                                         \
    PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m);      \
    ST4x4_UB(tmp0_m, tmp1_m, 0, 2, 0, 2, pdst, stride);   \
  } while (0)
Packit 9c6abc
Packit 9c6abc
/* Description : Average with rounding (in0 + in1 + 1) / 2.
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Each unsigned byte element from 'in0' vector is added with
 *               each unsigned byte element from 'in1' vector. Then the
 *               average with rounding is calculated and written to 'out0'.
 *               'in2' and 'in3' are averaged the same way into 'out1'.
 *               Arguments are parenthesized so that the (v16u8) casts apply
 *               to the whole argument expression.
 */
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_aver_u_b((v16u8)(in0), (v16u8)(in1));   \
  out1 = (RTYPE)__msa_aver_u_b((v16u8)(in2), (v16u8)(in3));   \
} while (0)
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
Packit 9c6abc
Packit 9c6abc
#endif  /* WEBP_DSP_MSA_MACRO_H_ */