#include "config.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/types.h>
#include <orc/orcprogram.h>
#include <orc/orcarm.h>
#include <orc/orcdebug.h>
#include <orc/orcneon.h>
void orc_neon_emit_loadiq (OrcCompiler *compiler, int dest, int param);
void orc_neon_emit_loadpq (OrcCompiler *compiler, int dest, int param);
const char *orc_neon_reg_name (int reg)
{
static const char *vec_regs[] = {
"d0", "d1", "d2", "d3",
"d4", "d5", "d6", "d7",
"d8", "d9", "d10", "d11",
"d12", "d13", "d14", "d15",
"d16", "d17", "d18", "d19",
"d20", "d21", "d22", "d23",
"d24", "d25", "d26", "d27",
"d28", "d29", "d30", "d31",
};
if (reg < ORC_VEC_REG_BASE || reg >= ORC_VEC_REG_BASE+32) {
return "ERROR";
}
return vec_regs[reg&0x1f];
}
const char *orc_neon_reg_name_quad (int reg)
{
static const char *vec_regs[] = {
"q0", "ERROR", "q1", "ERROR",
"q2", "ERROR", "q3", "ERROR",
"q4", "ERROR", "q5", "ERROR",
"q6", "ERROR", "q7", "ERROR",
"q8", "ERROR", "q9", "ERROR",
"q10", "ERROR", "q11", "ERROR",
"q12", "ERROR", "q13", "ERROR",
"q14", "ERROR", "q15", "ERROR",
};
if (reg < ORC_VEC_REG_BASE || reg >= ORC_VEC_REG_BASE+32) {
return "ERROR";
}
return vec_regs[reg&0x1f];
}
static void
orc_neon_emit_binary (OrcCompiler *p, const char *name, unsigned int code,
int dest, int src1, int src2)
{
ORC_ASSERT((code & 0x004ff0af) == 0);
ORC_ASM_CODE(p," %s %s, %s, %s\n", name,
orc_neon_reg_name (dest), orc_neon_reg_name (src1),
orc_neon_reg_name (src2));
code |= (dest&0xf)<<12;
code |= ((dest>>4)&0x1)<<22;
code |= (src1&0xf)<<16;
code |= ((src1>>4)&0x1)<<7;
code |= (src2&0xf)<<0;
code |= ((src2>>4)&0x1)<<5;
orc_arm_emit (p, code);
}
#define NEON_BINARY(code,a,b,c) \
((code) | \
(((a)&0xf)<<12) | \
((((a)>>4)&0x1)<<22) | \
(((b)&0xf)<<16) | \
((((b)>>4)&0x1)<<7) | \
(((c)&0xf)<<0) | \
((((c)>>4)&0x1)<<5))
static void
orc_neon_emit_binary_long (OrcCompiler *p, const char *name, unsigned int code,
int dest, int src1, int src2)
{
ORC_ASSERT((code & 0x004ff0af) == 0);
ORC_ASM_CODE(p," %s %s, %s, %s\n", name,
orc_neon_reg_name_quad (dest), orc_neon_reg_name (src1),
orc_neon_reg_name (src2));
code |= (dest&0xf)<<12;
code |= ((dest>>4)&0x1)<<22;
code |= (src1&0xf)<<16;
code |= ((src1>>4)&0x1)<<7;
code |= (src2&0xf)<<0;
code |= ((src2>>4)&0x1)<<5;
orc_arm_emit (p, code);
}
#if 0
static void
orc_neon_emit_binary_narrow (OrcCompiler *p, const char *name, unsigned int code,
int dest, int src1, int src2)
{
ORC_ASSERT((code & 0x004ff0af) == 0);
ORC_ASM_CODE(p," %s %s, %s, %s\n", name,
orc_neon_reg_name (dest), orc_neon_reg_name_quad (src1),
orc_neon_reg_name_quad (src2));
code |= (dest&0xf)<<12;
code |= ((dest>>4)&0x1)<<22;
code |= (src1&0xf)<<16;
code |= ((src1>>4)&0x1)<<7;
code |= (src2&0xf)<<0;
code |= ((src2>>4)&0x1)<<5;
orc_arm_emit (p, code);
}
#endif
static void
orc_neon_emit_binary_quad (OrcCompiler *p, const char *name, unsigned int code,
int dest, int src1, int src2)
{
ORC_ASSERT((code & 0x004ff0af) == 0);
ORC_ASM_CODE(p," %s %s, %s, %s\n", name,
orc_neon_reg_name_quad (dest), orc_neon_reg_name_quad (src1),
orc_neon_reg_name_quad (src2));
code |= (dest&0xf)<<12;
code |= ((dest>>4)&0x1)<<22;
code |= (src1&0xf)<<16;
code |= ((src1>>4)&0x1)<<7;
code |= (src2&0xf)<<0;
code |= ((src2>>4)&0x1)<<5;
code |= 0x40;
orc_arm_emit (p, code);
}
static void
orc_neon_emit_unary (OrcCompiler *p, const char *name, unsigned int code,
int dest, int src1)
{
ORC_ASSERT((code & 0x0040f02f) == 0);
ORC_ASM_CODE(p," %s %s, %s\n", name,
orc_neon_reg_name (dest), orc_neon_reg_name (src1));
code |= (dest&0xf)<<12;
code |= ((dest>>4)&0x1)<<22;
code |= (src1&0xf)<<0;
code |= ((src1>>4)&0x1)<<5;
orc_arm_emit (p, code);
}
static void
orc_neon_emit_unary_long (OrcCompiler *p, const char *name, unsigned int code,
int dest, int src1)
{
ORC_ASSERT((code & 0x0040f02f) == 0);
ORC_ASM_CODE(p," %s %s, %s\n", name,
orc_neon_reg_name_quad (dest), orc_neon_reg_name (src1));
code |= (dest&0xf)<<12;
code |= ((dest>>4)&0x1)<<22;
code |= (src1&0xf)<<0;
code |= ((src1>>4)&0x1)<<5;
orc_arm_emit (p, code);
}
static void
orc_neon_emit_unary_narrow (OrcCompiler *p, const char *name, unsigned int code,
int dest, int src1)
{
ORC_ASSERT((code & 0x0040f02f) == 0);
ORC_ASM_CODE(p," %s %s, %s\n", name,
orc_neon_reg_name (dest), orc_neon_reg_name_quad (src1));
code |= (dest&0xf)<<12;
code |= ((dest>>4)&0x1)<<22;
code |= (src1&0xf)<<0;
code |= ((src1>>4)&0x1)<<5;
orc_arm_emit (p, code);
}
static void
orc_neon_emit_unary_quad (OrcCompiler *p, const char *name, unsigned int code,
int dest, int src1)
{
ORC_ASSERT((code & 0x0040f02f) == 0);
ORC_ASM_CODE(p," %s %s, %s\n", name,
orc_neon_reg_name_quad (dest), orc_neon_reg_name_quad (src1));
code |= (dest&0xf)<<12;
code |= ((dest>>4)&0x1)<<22;
code |= (src1&0xf)<<0;
code |= ((src1>>4)&0x1)<<5;
code |= 0x40;
orc_arm_emit (p, code);
}
void
orc_neon_emit_mov (OrcCompiler *compiler, int dest, int src)
{
orc_neon_emit_binary (compiler, "vorr", 0xf2200110,
dest, src, src);
}
void
orc_neon_emit_mov_quad (OrcCompiler *compiler, int dest, int src)
{
orc_neon_emit_binary_quad (compiler, "vorr", 0xf2200110,
dest, src, src);
}
void
orc_neon_preload (OrcCompiler *compiler, OrcVariable *var, int write,
int offset)
{
orc_uint32 code;
/* Don't use multiprocessing extensions */
write = FALSE;
ORC_ASM_CODE(compiler," pld%s [%s, #%d]\n",
write ? "w" : "",
orc_arm_reg_name (var->ptr_register), offset);
code = 0xf510f000;
if (!write) code |= (1<<22);
code |= (var->ptr_register&0xf) << 16;
if (offset < 0) {
code |= ((-offset)&0xfff) << 0;
} else {
code |= (offset&0xfff) << 0;
code |= (1<<23);
}
orc_arm_emit (compiler, code);
}
#if 0
void
orc_neon_load_halfvec_aligned (OrcCompiler *compiler, OrcVariable *var, int update)
{
orc_uint32 code;
ORC_ASM_CODE(compiler," vld1.32 %s[0], [%s]%s\n",
orc_neon_reg_name (var->alloc),
orc_arm_reg_name (var->ptr_register),
update ? "!" : "");
code = 0xf4a0080d;
code |= (var->ptr_register&0xf) << 16;
code |= (var->alloc&0xf) << 12;
code |= ((var->alloc>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
}
void
orc_neon_load_vec_aligned (OrcCompiler *compiler, OrcVariable *var, int update)
{
orc_uint32 code;
ORC_ASM_CODE(compiler," vld1.64 %s, [%s]%s\n",
orc_neon_reg_name (var->alloc),
orc_arm_reg_name (var->ptr_register),
update ? "!" : "");
code = 0xf42007cd;
code |= (var->ptr_register&0xf) << 16;
code |= (var->alloc&0xf) << 12;
code |= ((var->alloc>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
}
void
orc_neon_load_vec_unaligned (OrcCompiler *compiler, OrcVariable *var,
int update)
{
orc_uint32 code;
ORC_ASM_CODE(compiler," vld1.8 %s, [%s]%s\n",
orc_neon_reg_name (var->alloc),
orc_arm_reg_name (var->ptr_register),
update ? "!" : "");
code = 0xf420070d;
code |= (var->ptr_register&0xf) << 16;
code |= ((var->alloc)&0xf) << 12;
code |= (((var->alloc)>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
#if 0
/* used with need_mask_regs */
ORC_ASM_CODE(compiler," vld1.64 %s, [%s]%s\n",
orc_neon_reg_name (var->aligned_data + 1),
orc_arm_reg_name (var->ptr_register),
update ? "!" : "");
code = 0xf42007cd;
code |= (var->ptr_register&0xf) << 16;
code |= ((var->aligned_data+1)&0xf) << 12;
code |= (((var->aligned_data+1)>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
ORC_ASM_CODE(compiler," vtbl.8 %s, {%s,%s}, %s\n",
orc_neon_reg_name (var->alloc),
orc_neon_reg_name (var->aligned_data),
orc_neon_reg_name (var->aligned_data+1),
orc_neon_reg_name (var->mask_alloc));
code = NEON_BINARY(0xf3b00900, var->alloc, var->aligned_data,
var->mask_alloc);
orc_arm_emit (compiler, code);
/* orc_neon_emit_mov (compiler, var->alloc, var->mask_alloc); */
orc_neon_emit_mov (compiler, var->aligned_data, var->aligned_data + 1);
#endif
}
void
orc_neon_load_halfvec_unaligned (OrcCompiler *compiler, OrcVariable *var,
int update)
{
orc_uint32 code;
ORC_ASM_CODE(compiler," vld1.8 %s, [%s]\n",
orc_neon_reg_name (var->alloc),
orc_arm_reg_name (var->ptr_register));
code = 0xf420070d;
code |= (var->ptr_register&0xf) << 16;
code |= ((var->alloc)&0xf) << 12;
code |= (((var->alloc)>>4)&0x1) << 22;
/* code |= (!update) << 1; */
code |= (1) << 1;
orc_arm_emit (compiler, code);
if (update) {
orc_arm_emit_add_imm (compiler, var->ptr_register,
var->ptr_register, 4);
}
#if 0
/* used with need_mask_regs */
ORC_ASM_CODE(compiler," vld1.32 %s[1], [%s]%s\n",
orc_neon_reg_name (var->aligned_data),
orc_arm_reg_name (var->ptr_register),
update ? "!" : "");
code = 0xf4a0088d;
code |= (var->ptr_register&0xf) << 16;
code |= ((var->aligned_data)&0xf) << 12;
code |= (((var->aligned_data)>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
ORC_ASM_CODE(compiler," vtbl.8 %s, {%s,%s}, %s\n",
orc_neon_reg_name (var->alloc),
orc_neon_reg_name (var->aligned_data),
orc_neon_reg_name (var->aligned_data + 1),
orc_neon_reg_name (var->mask_alloc));
code = NEON_BINARY(0xf3b00900, var->alloc, var->aligned_data, var->mask_alloc);
orc_arm_emit (compiler, code);
orc_neon_emit_unary (compiler, "vrev64.i32", 0xf3b80000,
var->aligned_data, var->aligned_data);
#endif
}
void
orc_neon_load_fourvec_aligned (OrcCompiler *compiler, OrcVariable *var, int update)
{
orc_uint32 code;
ORC_ASM_CODE(compiler," vld1.64 { %s, %s, %s, %s }, [%s,:256]%s\n",
orc_neon_reg_name (var->alloc),
orc_neon_reg_name (var->alloc + 1),
orc_neon_reg_name (var->alloc + 2),
orc_neon_reg_name (var->alloc + 3),
orc_arm_reg_name (var->ptr_register),
update ? "!" : "");
code = 0xf42002dd;
code |= (var->ptr_register&0xf) << 16;
code |= (var->alloc&0xf) << 12;
code |= ((var->alloc>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
}
void
orc_neon_load_fourvec_unaligned (OrcCompiler *compiler, OrcVariable *var,
int update)
{
orc_uint32 code;
ORC_ASM_CODE(compiler," vld1.8 { %s, %s, %s, %s }, [%s]%s\n",
orc_neon_reg_name (var->alloc),
orc_neon_reg_name (var->alloc + 1),
orc_neon_reg_name (var->alloc + 2),
orc_neon_reg_name (var->alloc + 3),
orc_arm_reg_name (var->ptr_register),
update ? "!" : "");
code = 0xf420020d;
code |= (var->ptr_register&0xf) << 16;
code |= ((var->alloc)&0xf) << 12;
code |= (((var->alloc)>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
}
void
orc_neon_load_twovec_aligned (OrcCompiler *compiler, OrcVariable *var, int update)
{
orc_uint32 code;
ORC_ASM_CODE(compiler," vld1.64 { %s, %s }, [%s,:128]%s\n",
orc_neon_reg_name (var->alloc),
orc_neon_reg_name (var->alloc + 1),
orc_arm_reg_name (var->ptr_register),
update ? "!" : "");
code = 0xf4200aed;
code |= (var->ptr_register&0xf) << 16;
code |= (var->alloc&0xf) << 12;
code |= ((var->alloc>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
}
void
orc_neon_load_twovec_unaligned (OrcCompiler *compiler, OrcVariable *var,
int update)
{
orc_uint32 code;
ORC_ASM_CODE(compiler," vld1.8 { %s, %s }, [%s]%s\n",
orc_neon_reg_name (var->alloc),
orc_neon_reg_name (var->alloc + 1),
orc_arm_reg_name (var->ptr_register),
update ? "!" : "");
code = 0xf4200a0d;
code |= (var->ptr_register&0xf) << 16;
code |= ((var->alloc)&0xf) << 12;
code |= (((var->alloc)>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
}
void
orc_neon_loadb (OrcCompiler *compiler, OrcVariable *var, int update)
{
orc_uint32 code;
int i;
if (var->is_aligned && compiler->insn_shift == 5) {
orc_neon_load_fourvec_aligned (compiler, var, update);
} else if (var->is_aligned && compiler->insn_shift == 4) {
orc_neon_load_twovec_aligned (compiler, var, update);
} else if (var->is_aligned && compiler->insn_shift == 3) {
orc_neon_load_vec_aligned (compiler, var, update);
} else if (var->is_aligned && compiler->insn_shift == 2) {
orc_neon_load_halfvec_aligned (compiler, var, update);
} else if (compiler->insn_shift == 5) {
orc_neon_load_fourvec_unaligned (compiler, var, update);
} else if (compiler->insn_shift == 4) {
orc_neon_load_twovec_unaligned (compiler, var, update);
} else if (compiler->insn_shift == 3) {
orc_neon_load_vec_unaligned (compiler, var, update);
} else if (compiler->insn_shift == 2) {
orc_neon_load_halfvec_unaligned (compiler, var, update);
} else {
if (compiler->insn_shift > 1) {
ORC_ERROR("slow load");
}
for(i=0;i<(1<<compiler->insn_shift);i++){
ORC_ASM_CODE(compiler," vld1.8 %s[%d], [%s]%s\n",
orc_neon_reg_name (var->alloc + (i>>3)), i&7,
orc_arm_reg_name (var->ptr_register),
update ? "!" : "");
code = NEON_BINARY(0xf4a0000d, var->alloc + (i>>3),
var->ptr_register, 0);
code |= (i&7) << 5;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
}
}
}
void
orc_neon_loadw (OrcCompiler *compiler, OrcVariable *var, int update)
{
if (var->is_aligned && compiler->insn_shift == 3) {
orc_neon_load_twovec_aligned (compiler, var, update);
} else if (var->is_aligned && compiler->insn_shift == 2) {
orc_neon_load_vec_aligned (compiler, var, update);
} else if (var->is_aligned && compiler->insn_shift == 1) {
orc_neon_load_halfvec_aligned (compiler, var, update);
} else if (compiler->insn_shift == 3) {
orc_neon_load_twovec_unaligned (compiler, var, update);
} else if (compiler->insn_shift == 2) {
orc_neon_load_vec_unaligned (compiler, var, update);
} else if (compiler->insn_shift == 1) {
orc_neon_load_halfvec_unaligned (compiler, var, update);
} else {
orc_uint32 code;
int i;
if (compiler->insn_shift == 2) {
orc_neon_load_vec_aligned (compiler, var, update);
return;
} else if (compiler->insn_shift == 1) {
orc_neon_load_halfvec_aligned (compiler, var, update);
return;
}
if (compiler->insn_shift > 1) {
ORC_ERROR("slow load");
}
for(i=0;i<(1<<compiler->insn_shift);i++){
ORC_ASM_CODE(compiler," vld1.16 %s[%d], [%s]%s\n",
orc_neon_reg_name (var->alloc + (i>>2)), i&3,
orc_arm_reg_name (var->ptr_register),
update ? "!" : "");
code = NEON_BINARY(0xf4a0040d, var->alloc + (i>>2),
var->ptr_register, 0);
code |= (i&3) << 6;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
}
}
}
void
orc_neon_loadl (OrcCompiler *compiler, OrcVariable *var, int update)
{
orc_uint32 code;
int i;
if (var->is_aligned && compiler->insn_shift == 2) {
orc_neon_load_twovec_aligned (compiler, var, update);
} else if (var->is_aligned && compiler->insn_shift == 1) {
orc_neon_load_vec_aligned (compiler, var, update);
} else if (compiler->insn_shift == 2) {
orc_neon_load_twovec_unaligned (compiler, var, update);
} else if (compiler->insn_shift == 1) {
orc_neon_load_vec_unaligned (compiler, var, update);
} else {
if (compiler->insn_shift > 0) {
/* ORC_ERROR("slow load"); */
}
for(i=0;i<(1<<compiler->insn_shift);i++){
ORC_ASM_CODE(compiler," vld1.32 %s[%d], [%s]%s\n",
orc_neon_reg_name (var->alloc + (i>>1)), i & 1,
orc_arm_reg_name (var->ptr_register),
update ? "!" : "");
code = NEON_BINARY(0xf4a0080d, var->alloc + (i>>1),
var->ptr_register, 0);
code |= (i&1)<<7;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
}
}
}
void
orc_neon_loadq (OrcCompiler *compiler, int dest, int src1, int update, int is_aligned)
{
orc_uint32 code;
ORC_ASM_CODE(compiler," vld1.64 %s, [%s]%s\n",
orc_neon_reg_name (dest),
orc_arm_reg_name (src1),
update ? "!" : "");
code = 0xf42007cd;
code |= (src1&0xf) << 16;
code |= (dest&0xf) << 12;
code |= ((dest>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
}
void
orc_neon_storeb (OrcCompiler *compiler, int dest, int update, int src1, int is_aligned)
{
orc_uint32 code;
int i;
if (is_aligned && compiler->insn_shift == 5) {
ORC_ASM_CODE(compiler," vst1.8 { %s, %s, %s, %s }, [%s,:256]%s\n",
orc_neon_reg_name (src1),
orc_neon_reg_name (src1+1),
orc_neon_reg_name (src1+2),
orc_neon_reg_name (src1+3),
orc_arm_reg_name (dest),
update ? "!" : "");
code = 0xf400023d;
code |= (dest&0xf) << 16;
code |= (src1&0xf) << 12;
code |= ((src1>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
} else if (compiler->insn_shift == 5) {
ORC_ASM_CODE(compiler," vst1.8 { %s, %s, %s, %s }, [%s]%s\n",
orc_neon_reg_name (src1),
orc_neon_reg_name (src1+1),
orc_neon_reg_name (src1+2),
orc_neon_reg_name (src1+3),
orc_arm_reg_name (dest),
update ? "!" : "");
code = 0xf400020d;
code |= (dest&0xf) << 16;
code |= (src1&0xf) << 12;
code |= ((src1>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
} else if (is_aligned && compiler->insn_shift == 4) {
ORC_ASM_CODE(compiler," vst1.8 { %s, %s }, [%s,:128]%s\n",
orc_neon_reg_name (src1),
orc_neon_reg_name (src1+1),
orc_arm_reg_name (dest),
update ? "!" : "");
code = 0xf4000a2d;
code |= (dest&0xf) << 16;
code |= (src1&0xf) << 12;
code |= ((src1>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
} else if (compiler->insn_shift == 4) {
ORC_ASM_CODE(compiler," vst1.8 { %s, %s }, [%s]%s\n",
orc_neon_reg_name (src1),
orc_neon_reg_name (src1+1),
orc_arm_reg_name (dest),
update ? "!" : "");
code = 0xf4000a0d;
code |= (dest&0xf) << 16;
code |= (src1&0xf) << 12;
code |= ((src1>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
} else if (is_aligned && compiler->insn_shift == 3) {
ORC_ASM_CODE(compiler," vst1.8 %s, [%s,:64]%s\n",
orc_neon_reg_name (src1),
orc_arm_reg_name (dest),
update ? "!" : "");
code = 0xf400071d;
code |= (dest&0xf) << 16;
code |= (src1&0xf) << 12;
code |= ((src1>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
} else {
for(i=0;i<(1<<compiler->insn_shift);i++){
ORC_ASM_CODE(compiler," vst1.8 %s[%d], [%s]%s\n",
orc_neon_reg_name (src1 + (i>>3)), i&7,
orc_arm_reg_name (dest),
update ? "!" : "");
code = 0xf480000d;
code |= (dest&0xf) << 16;
code |= ((src1 + (i>>3))&0xf) << 12;
code |= ((src1>>4)&0x1) << 22;
code |= (i&7)<<5;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
}
}
}
void
orc_neon_storew (OrcCompiler *compiler, int dest, int update, int src1, int is_aligned)
{
orc_uint32 code;
int i;
if (is_aligned && compiler->insn_shift == 3) {
ORC_ASM_CODE(compiler," vst1.16 { %s, %s }, [%s,:128]%s\n",
orc_neon_reg_name (src1),
orc_neon_reg_name (src1 + 1),
orc_arm_reg_name (dest),
update ? "!" : "");
code = 0xf4000a6d;
code |= (dest&0xf) << 16;
code |= (src1&0xf) << 12;
code |= ((src1>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
} else if (is_aligned && compiler->insn_shift == 2) {
ORC_ASM_CODE(compiler," vst1.16 %s, [%s,:64]%s\n",
orc_neon_reg_name (src1),
orc_arm_reg_name (dest),
update ? "!" : "");
code = 0xf400075d;
code |= (dest&0xf) << 16;
code |= (src1&0xf) << 12;
code |= ((src1>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
} else {
for(i=0;i<(1<<compiler->insn_shift);i++){
ORC_ASM_CODE(compiler," vst1.16 %s[%d], [%s]%s\n",
orc_neon_reg_name (src1 + (i>>2)), i&3,
orc_arm_reg_name (dest),
update ? "!" : "");
code = 0xf480040d;
code |= (dest&0xf) << 16;
code |= ((src1 + (i>>2))&0xf) << 12;
code |= ((src1>>4)&0x1) << 22;
code |= (i&3)<<6;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
}
}
}
void
orc_neon_storel (OrcCompiler *compiler, int dest, int update, int src1, int is_aligned)
{
orc_uint32 code;
int i;
if (is_aligned && compiler->insn_shift == 2) {
ORC_ASM_CODE(compiler," vst1.32 { %s, %s }, [%s,:128]%s\n",
orc_neon_reg_name (src1),
orc_neon_reg_name (src1 + 1),
orc_arm_reg_name (dest),
update ? "!" : "");
code = 0xf4000aad;
code |= (dest&0xf) << 16;
code |= (src1&0xf) << 12;
code |= ((src1>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
} else if (is_aligned && compiler->insn_shift == 1) {
ORC_ASM_CODE(compiler," vst1.32 %s, [%s,:64]%s\n",
orc_neon_reg_name (src1),
orc_arm_reg_name (dest),
update ? "!" : "");
code = 0xf400079d;
code |= (dest&0xf) << 16;
code |= (src1&0xf) << 12;
code |= ((src1>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
} else {
for(i=0;i<(1<<compiler->insn_shift);i++){
ORC_ASM_CODE(compiler," vst1.32 %s[%d], [%s]%s\n",
orc_neon_reg_name (src1 + (i>>1)), i&1,
orc_arm_reg_name (dest),
update ? "!" : "");
code = 0xf480080d;
code |= (dest&0xf) << 16;
code |= ((src1 + (i>>1))&0xf) << 12;
code |= ((src1>>4)&0x1) << 22;
code |= (i&1)<<7;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
}
}
}
void
orc_neon_storeq (OrcCompiler *compiler, int dest, int update, int src1, int is_aligned)
{
orc_uint32 code;
ORC_ASM_CODE(compiler," vst1.64 %s, [%s]%s\n",
orc_neon_reg_name (src1),
orc_arm_reg_name (dest),
update ? "!" : "");
code = 0xf40007cd;
code |= (dest&0xf) << 16;
code |= (src1&0xf) << 12;
code |= ((src1>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
}
#endif
static void
neon_rule_loadpX (OrcCompiler *compiler, void *user, OrcInstruction *insn)
{
OrcVariable *src = compiler->vars + insn->src_args[0];
OrcVariable *dest = compiler->vars + insn->dest_args[0];
int size = ORC_PTR_TO_INT (user);
if (src->vartype == ORC_VAR_TYPE_CONST) {
if (size == 1) {
orc_neon_emit_loadib (compiler, dest->alloc, src->value.i);
} else if (size == 2) {
orc_neon_emit_loadiw (compiler, dest->alloc, src->value.i);
} else if (size == 4) {
orc_neon_emit_loadil (compiler, dest->alloc, src->value.i);
} else if (size == 8) {
if (src->size == 8) {
ORC_COMPILER_ERROR(compiler,"64-bit constants not implemented");
}
orc_neon_emit_loadiq (compiler, dest->alloc, src->value.i);
} else {
ORC_PROGRAM_ERROR(compiler,"unimplemented");
}
} else {
if (size == 1) {
orc_neon_emit_loadpb (compiler, dest->alloc, insn->src_args[0]);
} else if (size == 2) {
orc_neon_emit_loadpw (compiler, dest->alloc, insn->src_args[0]);
} else if (size == 4) {
orc_neon_emit_loadpl (compiler, dest->alloc, insn->src_args[0]);
} else if (size == 8) {
orc_neon_emit_loadpq (compiler, dest->alloc, insn->src_args[0]);
} else {
ORC_PROGRAM_ERROR(compiler,"unimplemented");
}
}
}
static void
neon_rule_loadX (OrcCompiler *compiler, void *user, OrcInstruction *insn)
{
OrcVariable *src = compiler->vars + insn->src_args[0];
OrcVariable *dest = compiler->vars + insn->dest_args[0];
int update = FALSE;
unsigned int code = 0;
int size = src->size << compiler->insn_shift;
int type = ORC_PTR_TO_INT(user);
int ptr_register;
int is_aligned = src->is_aligned;
/* FIXME this should be fixed at a higher level */
if (src->vartype != ORC_VAR_TYPE_SRC && src->vartype != ORC_VAR_TYPE_DEST) {
ORC_COMPILER_ERROR(compiler, "loadX used with non src/dest");
return;
}
if (src->vartype == ORC_VAR_TYPE_DEST) update = FALSE;
if (type == 1) {
if (compiler->vars[insn->src_args[1]].vartype != ORC_VAR_TYPE_CONST) {
ORC_PROGRAM_ERROR(compiler,"unimplemented");
return;
}
ptr_register = compiler->gp_tmpreg;
orc_arm_emit_add_imm (compiler, ptr_register,
src->ptr_register,
compiler->vars[insn->src_args[1]].value.i * src->size);
update = FALSE;
is_aligned = FALSE;
} else {
ptr_register = src->ptr_register;
}
if (size >= 8) {
if (is_aligned) {
if (size == 32) {
ORC_ASM_CODE(compiler," vld1.64 { %s, %s, %s, %s }, [%s,:256]%s\n",
orc_neon_reg_name (dest->alloc),
orc_neon_reg_name (dest->alloc + 1),
orc_neon_reg_name (dest->alloc + 2),
orc_neon_reg_name (dest->alloc + 3),
orc_arm_reg_name (ptr_register),
update ? "!" : "");
code = 0xf42002dd;
} else if (size == 16) {
ORC_ASM_CODE(compiler," vld1.64 { %s, %s }, [%s,:128]%s\n",
orc_neon_reg_name (dest->alloc),
orc_neon_reg_name (dest->alloc + 1),
orc_arm_reg_name (ptr_register),
update ? "!" : "");
code = 0xf4200aed;
} else if (size == 8) {
ORC_ASM_CODE(compiler," vld1.64 %s, [%s]%s\n",
orc_neon_reg_name (dest->alloc),
orc_arm_reg_name (ptr_register),
update ? "!" : "");
code = 0xf42007cd;
} else {
ORC_COMPILER_ERROR(compiler,"bad aligned load size %d",
src->size << compiler->insn_shift);
}
} else {
if (size == 32) {
ORC_ASM_CODE(compiler," vld1.8 { %s, %s, %s, %s }, [%s]%s\n",
orc_neon_reg_name (dest->alloc),
orc_neon_reg_name (dest->alloc + 1),
orc_neon_reg_name (dest->alloc + 2),
orc_neon_reg_name (dest->alloc + 3),
orc_arm_reg_name (ptr_register),
update ? "!" : "");
code = 0xf420020d;
} else if (size == 16) {
ORC_ASM_CODE(compiler," vld1.8 { %s, %s }, [%s]%s\n",
orc_neon_reg_name (dest->alloc),
orc_neon_reg_name (dest->alloc + 1),
orc_arm_reg_name (ptr_register),
update ? "!" : "");
code = 0xf4200a0d;
} else if (size == 8) {
ORC_ASM_CODE(compiler," vld1.8 %s, [%s]%s\n",
orc_neon_reg_name (dest->alloc),
orc_arm_reg_name (ptr_register),
update ? "!" : "");
code = 0xf420070d;
} else {
ORC_COMPILER_ERROR(compiler,"bad unaligned load size %d",
src->size << compiler->insn_shift);
}
}
} else {
int shift;
if (size == 4) {
shift = 2;
} else if (size == 2) {
shift = 1;
} else {
shift = 0;
}
ORC_ASM_CODE(compiler," vld1.%d %s[0], [%s]%s\n",
8<<shift,
orc_neon_reg_name (dest->alloc),
orc_arm_reg_name (ptr_register),
update ? "!" : "");
code = 0xf4a0000d;
code |= shift<<10;
code |= (0&7)<<5;
}
code |= (ptr_register&0xf) << 16;
code |= (dest->alloc&0xf) << 12;
code |= ((dest->alloc>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
}
static void
neon_rule_storeX (OrcCompiler *compiler, void *user, OrcInstruction *insn)
{
OrcVariable *src = compiler->vars + insn->src_args[0];
OrcVariable *dest = compiler->vars + insn->dest_args[0];
int update = FALSE;
unsigned int code = 0;
int size = dest->size << compiler->insn_shift;
if (size >= 8) {
if (dest->is_aligned) {
if (size == 32) {
ORC_ASM_CODE(compiler," vst1.64 { %s, %s, %s, %s }, [%s,:256]%s\n",
orc_neon_reg_name (src->alloc),
orc_neon_reg_name (src->alloc + 1),
orc_neon_reg_name (src->alloc + 2),
orc_neon_reg_name (src->alloc + 3),
orc_arm_reg_name (dest->ptr_register),
update ? "!" : "");
code = 0xf40002dd;
} else if (size == 16) {
ORC_ASM_CODE(compiler," vst1.64 { %s, %s }, [%s,:128]%s\n",
orc_neon_reg_name (src->alloc),
orc_neon_reg_name (src->alloc + 1),
orc_arm_reg_name (dest->ptr_register),
update ? "!" : "");
code = 0xf4000aed;
} else if (size == 8) {
ORC_ASM_CODE(compiler," vst1.64 %s, [%s]%s\n",
orc_neon_reg_name (src->alloc),
orc_arm_reg_name (dest->ptr_register),
update ? "!" : "");
code = 0xf40007cd;
} else {
ORC_COMPILER_ERROR(compiler,"bad aligned store size %d", size);
}
} else {
if (size == 32) {
ORC_ASM_CODE(compiler," vst1.8 { %s, %s, %s, %s }, [%s]%s\n",
orc_neon_reg_name (src->alloc),
orc_neon_reg_name (src->alloc + 1),
orc_neon_reg_name (src->alloc + 2),
orc_neon_reg_name (src->alloc + 3),
orc_arm_reg_name (dest->ptr_register),
update ? "!" : "");
code = 0xf400020d;
} else if (size == 16) {
ORC_ASM_CODE(compiler," vst1.8 { %s, %s }, [%s]%s\n",
orc_neon_reg_name (src->alloc),
orc_neon_reg_name (src->alloc + 1),
orc_arm_reg_name (dest->ptr_register),
update ? "!" : "");
code = 0xf4000a0d;
} else if (size == 8) {
ORC_ASM_CODE(compiler," vst1.8 %s, [%s]%s\n",
orc_neon_reg_name (src->alloc),
orc_arm_reg_name (dest->ptr_register),
update ? "!" : "");
code = 0xf400070d;
} else {
ORC_COMPILER_ERROR(compiler,"bad aligned store size %d", size);
}
}
} else {
int shift;
if (size == 4) {
shift = 2;
} else if (size == 2) {
shift = 1;
} else {
shift = 0;
}
ORC_ASM_CODE(compiler," vst1.%d %s[0], [%s]%s\n",
8<<shift,
orc_neon_reg_name (src->alloc),
orc_arm_reg_name (dest->ptr_register),
update ? "!" : "");
code = 0xf480000d;
code |= shift<<10;
code |= (0&7)<<5;
}
code |= (dest->ptr_register&0xf) << 16;
code |= (src->alloc&0xf) << 12;
code |= ((src->alloc>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
}
#if 0
static int
orc_neon_get_const_shift (unsigned int value)
{
int shift = 0;
while((value & 0xff) != value) {
shift++;
value >>= 1;
}
return shift;
}
#endif
void
orc_neon_emit_loadib (OrcCompiler *compiler, int reg, int value)
{
orc_uint32 code;
if (value == 0) {
orc_neon_emit_binary_quad (compiler, "veor", 0xf3000110, reg, reg, reg);
return;
}
value &= 0xff;
ORC_ASM_CODE(compiler," vmov.i8 %s, #%d\n",
orc_neon_reg_name_quad (reg), value);
code = 0xf2800e10;
code |= (reg&0xf) << 12;
code |= ((reg>>4)&0x1) << 22;
code |= (value&0xf) << 0;
code |= (value&0x70) << 12;
code |= (value&0x80) << 17;
code |= 0x40;
orc_arm_emit (compiler, code);
}
void
orc_neon_emit_loadiw (OrcCompiler *compiler, int reg, int value)
{
orc_uint32 code;
if (value == 0) {
orc_neon_emit_binary_quad (compiler, "veor", 0xf3000110, reg, reg, reg);
return;
}
ORC_ASM_CODE(compiler," vmov.i16 %s, #0x%04x\n",
orc_neon_reg_name_quad (reg), value & 0xff);
code = 0xf2800810;
code |= (reg&0xf) << 12;
code |= ((reg>>4)&0x1) << 22;
code |= (value&0xf) << 0;
code |= (value&0x70) << 12;
code |= (value&0x80) << 17;
code |= 0x40;
orc_arm_emit (compiler, code);
value >>= 8;
if (value) {
ORC_ASM_CODE(compiler," vorr.i16 %s, #0x%04x\n",
orc_neon_reg_name_quad (reg), (value & 0xff) << 8);
code = 0xf2800b10;
code |= (reg&0xf) << 12;
code |= ((reg>>4)&0x1) << 22;
code |= (value&0xf) << 0;
code |= (value&0x70) << 12;
code |= (value&0x80) << 17;
code |= 0x40;
orc_arm_emit (compiler, code);
}
}
void
orc_neon_emit_loadil (OrcCompiler *compiler, int reg, int value)
{
orc_uint32 code;
if (value == 0) {
orc_neon_emit_binary_quad (compiler, "veor", 0xf3000110, reg, reg, reg);
return;
}
ORC_ASM_CODE(compiler," vmov.i32 %s, #0x%08x\n",
orc_neon_reg_name_quad (reg), value & 0xff);
code = 0xf2800010;
code |= (reg&0xf) << 12;
code |= ((reg>>4)&0x1) << 22;
code |= (value&0xf) << 0;
code |= (value&0x70) << 12;
code |= (value&0x80) << 17;
code |= 0x40;
orc_arm_emit (compiler, code);
value >>= 8;
if (value & 0xff) {
ORC_ASM_CODE(compiler," vorr.i32 %s, #0x%08x\n",
orc_neon_reg_name_quad (reg), (value & 0xff) << 8);
code = 0xf2800310;
code |= (reg&0xf) << 12;
code |= ((reg>>4)&0x1) << 22;
code |= (value&0xf) << 0;
code |= (value&0x70) << 12;
code |= (value&0x80) << 17;
code |= 0x40;
orc_arm_emit (compiler, code);
}
value >>= 8;
if (value & 0xff) {
ORC_ASM_CODE(compiler," vorr.i32 %s, #0x%08x\n",
orc_neon_reg_name_quad (reg), (value & 0xff) << 16);
code = 0xf2800510;
code |= (reg&0xf) << 12;
code |= ((reg>>4)&0x1) << 22;
code |= (value&0xf) << 0;
code |= (value&0x70) << 12;
code |= (value&0x80) << 17;
code |= 0x40;
orc_arm_emit (compiler, code);
}
value >>= 8;
if (value & 0xff) {
ORC_ASM_CODE(compiler," vorr.i32 %s, #0x%08x\n",
orc_neon_reg_name_quad (reg), (value & 0xff) << 24);
code = 0xf2800710;
code |= (reg&0xf) << 12;
code |= ((reg>>4)&0x1) << 22;
code |= (value&0xf) << 0;
code |= (value&0x70) << 12;
code |= (value&0x80) << 17;
code |= 0x40;
orc_arm_emit (compiler, code);
}
}
void
orc_neon_emit_loadiq (OrcCompiler *compiler, int reg, int value)
{
/* orc_uint32 code; */
/* int shift; */
/* int neg = FALSE; */
if (value == 0) {
orc_neon_emit_binary_quad (compiler, "veor", 0xf3000110, reg, reg, reg);
return;
}
if (value < 0) {
/* neg = TRUE; */
value = ~value;
}
#if 0
shift = orc_neon_get_const_shift (value);
if ((value & (0xff<<shift)) == value) {
value >>= shift;
if (neg) {
ORC_ASM_CODE(compiler," vmvn.i64 %s, #%d\n",
orc_neon_reg_name_quad (reg), value);
code = 0xf2800030;
} else {
ORC_ASM_CODE(compiler," vmov.i64 %s, #%d\n",
orc_neon_reg_name_quad (reg), value);
code = 0xf2800010;
}
code |= (reg&0xf) << 12;
code |= ((reg>>4)&0x1) << 22;
code |= (value&0xf) << 0;
code |= (value&0x70) << 12;
code |= (value&0x80) << 17;
code |= 0x40;
orc_arm_emit (compiler, code);
if (shift > 0) {
ORC_ASM_CODE(compiler," vshl.i64 %s, %s, #%d\n",
orc_neon_reg_name_quad (reg), orc_neon_reg_name_quad (reg), shift);
code = 0xf2a00510;
code |= (reg&0xf) << 12;
code |= ((reg>>4)&0x1) << 22;
code |= (reg&0xf) << 0;
code |= ((reg>>4)&0x1) << 5;
code |= (shift&0xf) << 16;
code |= 0x40;
orc_arm_emit (compiler, code);
}
return;
}
#endif
ORC_COMPILER_ERROR(compiler, "unimplemented load of constant %d", value);
}
void
orc_neon_emit_loadpb (OrcCompiler *compiler, int dest, int param)
{
orc_uint32 code;
orc_arm_emit_add_imm (compiler, compiler->gp_tmpreg,
compiler->exec_reg, ORC_STRUCT_OFFSET(OrcExecutor, params[param]));
ORC_ASM_CODE(compiler," vld1.8 {%s[],%s[]}, [%s]\n",
orc_neon_reg_name (dest), orc_neon_reg_name (dest+1),
orc_arm_reg_name (compiler->gp_tmpreg));
code = 0xf4a00c2f;
code |= (compiler->gp_tmpreg&0xf) << 16;
code |= (dest&0xf) << 12;
code |= ((dest>>4)&0x1) << 22;
orc_arm_emit (compiler, code);
}
void
orc_neon_emit_loadpw (OrcCompiler *compiler, int dest, int param)
{
orc_uint32 code;
orc_arm_emit_add_imm (compiler, compiler->gp_tmpreg,
compiler->exec_reg, ORC_STRUCT_OFFSET(OrcExecutor, params[param]));
ORC_ASM_CODE(compiler," vld1.16 {%s[],%s[]}, [%s]\n",
orc_neon_reg_name (dest), orc_neon_reg_name (dest+1),
orc_arm_reg_name (compiler->gp_tmpreg));
code = 0xf4a00c6f;
code |= (compiler->gp_tmpreg&0xf) << 16;
code |= (dest&0xf) << 12;
code |= ((dest>>4)&0x1) << 22;
orc_arm_emit (compiler, code);
}
void
orc_neon_emit_loadpl (OrcCompiler *compiler, int dest, int param)
{
orc_uint32 code;
orc_arm_emit_add_imm (compiler, compiler->gp_tmpreg,
compiler->exec_reg, ORC_STRUCT_OFFSET(OrcExecutor, params[param]));
ORC_ASM_CODE(compiler," vld1.32 {%s[],%s[]}, [%s]\n",
orc_neon_reg_name (dest), orc_neon_reg_name (dest+1),
orc_arm_reg_name (compiler->gp_tmpreg));
code = 0xf4a00caf;
code |= (compiler->gp_tmpreg&0xf) << 16;
code |= (dest&0xf) << 12;
code |= ((dest>>4)&0x1) << 22;
orc_arm_emit (compiler, code);
}
void
orc_neon_emit_loadpq (OrcCompiler *compiler, int dest, int param)
{
orc_uint32 code;
int update = FALSE;
orc_arm_emit_add_imm (compiler, compiler->gp_tmpreg,
compiler->exec_reg, ORC_STRUCT_OFFSET(OrcExecutor, params[param]));
ORC_ASM_CODE(compiler," vld1.32 %s[0], [%s]%s\n",
orc_neon_reg_name (dest),
orc_arm_reg_name (compiler->gp_tmpreg),
update ? "!" : "");
code = 0xf4a0000d;
code |= 2<<10;
code |= (0)<<7;
code |= (compiler->gp_tmpreg&0xf) << 16;
code |= (dest&0xf) << 12;
code |= ((dest>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
ORC_ASM_CODE(compiler," vld1.32 %s[0], [%s]%s\n",
orc_neon_reg_name (dest+1),
orc_arm_reg_name (compiler->gp_tmpreg),
update ? "!" : "");
code = 0xf4a0000d;
code |= 2<<10;
code |= (0)<<7;
code |= (compiler->gp_tmpreg&0xf) << 16;
code |= ((dest+1)&0xf) << 12;
code |= (((dest+1)>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
orc_arm_emit_add_imm (compiler, compiler->gp_tmpreg,
compiler->exec_reg, ORC_STRUCT_OFFSET(OrcExecutor,
params[param + (ORC_VAR_T1-ORC_VAR_P1)]));
ORC_ASM_CODE(compiler," vld1.32 %s[1], [%s]%s\n",
orc_neon_reg_name (dest),
orc_arm_reg_name (compiler->gp_tmpreg),
update ? "!" : "");
code = 0xf4a0000d;
code |= 2<<10;
code |= (1)<<7;
code |= (compiler->gp_tmpreg&0xf) << 16;
code |= (dest&0xf) << 12;
code |= ((dest>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
ORC_ASM_CODE(compiler," vld1.32 %s[1], [%s]%s\n",
orc_neon_reg_name (dest+1),
orc_arm_reg_name (compiler->gp_tmpreg),
update ? "!" : "");
code = 0xf4a0000d;
code |= 2<<10;
code |= (1)<<7;
code |= (compiler->gp_tmpreg&0xf) << 16;
code |= ((dest+1)&0xf) << 12;
code |= (((dest+1)>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
}
#define UNARY(opcode,insn_name,code,vec_shift) \
static void \
orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
{ \
if (p->insn_shift <= vec_shift) { \
orc_neon_emit_unary (p, insn_name, code, \
p->vars[insn->dest_args[0]].alloc, \
p->vars[insn->src_args[0]].alloc); \
} else if (p->insn_shift == vec_shift + 1) { \
orc_neon_emit_unary_quad (p, insn_name, code, \
p->vars[insn->dest_args[0]].alloc, \
p->vars[insn->src_args[0]].alloc); \
} else { \
ORC_COMPILER_ERROR(p, "shift too large"); \
} \
}
#define UNARY_LONG(opcode,insn_name,code,vec_shift) \
static void \
orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
{ \
if (p->insn_shift <= vec_shift) { \
orc_neon_emit_unary_long (p, insn_name, code, \
p->vars[insn->dest_args[0]].alloc, \
p->vars[insn->src_args[0]].alloc); \
} else { \
ORC_COMPILER_ERROR(p, "shift too large"); \
} \
}
#define UNARY_NARROW(opcode,insn_name,code,vec_shift) \
static void \
orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
{ \
if (p->insn_shift <= vec_shift) { \
orc_neon_emit_unary_narrow (p, insn_name, code, \
p->vars[insn->dest_args[0]].alloc, \
p->vars[insn->src_args[0]].alloc); \
} else { \
ORC_COMPILER_ERROR(p, "shift too large"); \
} \
}
#define BINARY(opcode,insn_name,code,vec_shift) \
static void \
orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
{ \
if (p->insn_shift <= vec_shift) { \
orc_neon_emit_binary (p, insn_name, code, \
p->vars[insn->dest_args[0]].alloc, \
p->vars[insn->src_args[0]].alloc, \
p->vars[insn->src_args[1]].alloc); \
} else if (p->insn_shift == vec_shift + 1) { \
orc_neon_emit_binary_quad (p, insn_name, code, \
p->vars[insn->dest_args[0]].alloc, \
p->vars[insn->src_args[0]].alloc, \
p->vars[insn->src_args[1]].alloc); \
} else { \
ORC_COMPILER_ERROR(p, "shift too large"); \
} \
}
#define BINARY_LONG(opcode,insn_name,code,vec_shift) \
static void \
orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
{ \
if (p->insn_shift <= vec_shift) { \
orc_neon_emit_binary_long (p, insn_name, code, \
p->vars[insn->dest_args[0]].alloc, \
p->vars[insn->src_args[0]].alloc, \
p->vars[insn->src_args[1]].alloc); \
} else { \
ORC_COMPILER_ERROR(p, "shift too large"); \
} \
}
#define BINARY_NARROW(opcode,insn_name,code,vec_shift) \
static void \
orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
{ \
if (p->insn_shift <= vec_shift) { \
orc_neon_emit_binary_narrow (p, insn_name, code, \
p->vars[insn->dest_args[0]].alloc, \
p->vars[insn->src_args[0]].alloc, \
p->vars[insn->src_args[1]].alloc); \
} else { \
ORC_COMPILER_ERROR(p, "shift too large"); \
} \
}
#define MOVE(opcode,insn_name,code,vec_shift) \
static void \
orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
{ \
if (p->vars[insn->dest_args[0]].alloc == p->vars[insn->src_args[0]].alloc) { \
return; \
} \
if (p->insn_shift <= vec_shift) { \
orc_neon_emit_binary (p, "vorr", 0xf2200110, \
p->vars[insn->dest_args[0]].alloc, \
p->vars[insn->src_args[0]].alloc, \
p->vars[insn->src_args[0]].alloc); \
} else if (p->insn_shift == vec_shift + 1) { \
orc_neon_emit_binary_quad (p, "vorr", 0xf2200110, \
p->vars[insn->dest_args[0]].alloc, \
p->vars[insn->src_args[0]].alloc, \
p->vars[insn->src_args[0]].alloc); \
} else { \
ORC_COMPILER_ERROR(p, "shift too large"); \
} \
}
typedef struct {
orc_uint32 code;
char *name;
int negate;
int bits;
int vec_shift;
} ShiftInfo;
ShiftInfo immshift_info[] = {
{ 0xf2880510, "vshl.i8", FALSE, 8, 3 }, /* shlb */
{ 0xf2880010, "vshr.s8", TRUE, 8, 3 }, /* shrsb */
{ 0xf3880010, "vshr.u8", TRUE, 8, 3 }, /* shrub */
{ 0xf2900510, "vshl.i16", FALSE, 16, 2 },
{ 0xf2900010, "vshr.s16", TRUE, 16, 2 },
{ 0xf3900010, "vshr.u16", TRUE, 16, 2 },
{ 0xf2a00510, "vshl.i32", FALSE, 32, 1 },
{ 0xf2a00010, "vshr.s32", TRUE, 32, 1 },
{ 0xf3a00010, "vshr.u32", TRUE, 32, 1 }
};
ShiftInfo regshift_info[] = {
{ 0xf3000400, "vshl.u8", FALSE, 0, 3 }, /* shlb */
{ 0xf2000400, "vshl.s8", TRUE, 0, 3 }, /* shrsb */
{ 0xf3000400, "vshl.u8", TRUE, 0, 3 }, /* shrub */
{ 0xf3100400, "vshl.u16", FALSE, 0, 2 },
{ 0xf2100400, "vshl.s16", TRUE, 0, 2 },
{ 0xf3100400, "vshl.u16", TRUE, 0, 2 },
{ 0xf3200400, "vshl.u32", FALSE, 0, 1 },
{ 0xf2200400, "vshl.s32", TRUE, 0, 1 },
{ 0xf3200400, "vshl.u32", TRUE, 0, 1 }
};
static void
orc_neon_rule_shift (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int type = ORC_PTR_TO_INT(user);
orc_uint32 code;
if (p->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_CONST) {
int shift = p->vars[insn->src_args[1]].value.i;
if (shift < 0) {
ORC_COMPILER_ERROR(p, "shift negative");
return;
}
if (shift >= immshift_info[type].bits) {
ORC_COMPILER_ERROR(p, "shift too large");
return;
}
code = immshift_info[type].code;
if (p->insn_shift <= immshift_info[type].vec_shift) {
ORC_ASM_CODE(p," %s %s, %s, #%d\n",
immshift_info[type].name,
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc),
orc_neon_reg_name (p->vars[insn->src_args[0]].alloc),
(int)p->vars[insn->src_args[1]].value.i);
} else {
ORC_ASM_CODE(p," %s %s, %s, #%d\n",
immshift_info[type].name,
orc_neon_reg_name_quad (p->vars[insn->dest_args[0]].alloc),
orc_neon_reg_name_quad (p->vars[insn->src_args[0]].alloc),
(int)p->vars[insn->src_args[1]].value.i);
code |= 0x40;
}
code |= (p->vars[insn->dest_args[0]].alloc&0xf)<<12;
code |= ((p->vars[insn->dest_args[0]].alloc>>4)&0x1)<<22;
code |= (p->vars[insn->src_args[0]].alloc&0xf)<<0;
code |= ((p->vars[insn->src_args[0]].alloc>>4)&0x1)<<5;
if (immshift_info[type].negate) {
shift = immshift_info[type].bits - shift;
}
code |= shift<<16;
orc_arm_emit (p, code);
} else if (p->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_PARAM) {
orc_neon_emit_loadpb (p, p->tmpreg, insn->src_args[1]);
if (regshift_info[type].negate) {
orc_neon_emit_unary_quad (p, "vneg.s8", 0xf3b10380,
p->tmpreg, p->tmpreg);
}
code = regshift_info[type].code;
if (p->insn_shift <= regshift_info[type].vec_shift) {
ORC_ASM_CODE(p," %s %s, %s, %s\n",
regshift_info[type].name,
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc),
orc_neon_reg_name (p->vars[insn->src_args[0]].alloc),
orc_neon_reg_name (p->tmpreg));
} else {
ORC_ASM_CODE(p," %s %s, %s, %s\n",
regshift_info[type].name,
orc_neon_reg_name_quad (p->vars[insn->dest_args[0]].alloc),
orc_neon_reg_name_quad (p->vars[insn->src_args[0]].alloc),
orc_neon_reg_name_quad (p->tmpreg));
code |= 0x40;
}
code |= (p->vars[insn->dest_args[0]].alloc&0xf)<<12;
code |= ((p->vars[insn->dest_args[0]].alloc>>4)&0x1)<<22;
code |= (p->vars[insn->src_args[0]].alloc&0xf)<<0;
code |= ((p->vars[insn->src_args[0]].alloc>>4)&0x1)<<5;
code |= (p->tmpreg&0xf)<<16;
code |= ((p->tmpreg>>4)&0x1)<<7;
orc_arm_emit (p, code);
} else {
ORC_PROGRAM_ERROR(p,"shift rule only works with constants and params");
}
}
#if 0
static void
orc_neon_rule_shrsw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
orc_uint32 code;
if (p->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_CONST) {
code = 0xf2900010;
ORC_ASM_CODE(p," vshr.s16 %s, %s, #%d\n",
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc),
orc_neon_reg_name (p->vars[insn->src_args[0]].alloc),
p->vars[insn->src_args[1]].value);
code |= (p->vars[insn->dest_args[0]].alloc&0xf)<<12;
code |= ((p->vars[insn->dest_args[0]].alloc>>4)&0x1)<<22;
code |= (p->vars[insn->src_args[0]].alloc&0xf)<<0;
code |= ((p->vars[insn->src_args[0]].alloc>>4)&0x1)<<5;
code |= ((16 - p->vars[insn->src_args[1]].value)&0xf)<<16;
orc_arm_emit (p, code);
} else if (p->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_PARAM) {
code = 0xf2100400;
ORC_ASM_CODE(p," vshl.s16 %s, %s, %s\n",
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc),
orc_neon_reg_name (p->vars[insn->src_args[0]].alloc),
orc_neon_reg_name (p->vars[insn->src_args[1]].alloc));
code |= (p->vars[insn->dest_args[0]].alloc&0xf)<<12;
code |= ((p->vars[insn->dest_args[0]].alloc>>4)&0x1)<<22;
code |= (p->vars[insn->src_args[0]].alloc&0xf)<<0;
code |= ((p->vars[insn->src_args[0]].alloc>>4)&0x1)<<5;
code |= (p->vars[insn->src_args[1]].alloc&0xf)<<16;
code |= ((p->vars[insn->src_args[1]].alloc>>4)&0x1)<<7;
orc_arm_emit (p, code);
} else {
ORC_PROGRAM_ERROR(p,"shift rule only works with constants and params");
}
}
static void
orc_neon_rule_shrsl (OrcCompiler *p, void *user, OrcInstruction *insn)
{
orc_uint32 code;
if (p->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_CONST) {
code = 0xf2900010;
ORC_ASM_CODE(p," vshr.s32 %s, %s, #%d\n",
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc),
orc_neon_reg_name (p->vars[insn->src_args[0]].alloc),
p->vars[insn->src_args[1]].value);
code |= (p->vars[insn->dest_args[0]].alloc&0xf)<<12;
code |= ((p->vars[insn->dest_args[0]].alloc>>4)&0x1)<<22;
code |= (p->vars[insn->src_args[0]].alloc&0xf)<<0;
code |= ((p->vars[insn->src_args[0]].alloc>>4)&0x1)<<5;
code |= ((16 - p->vars[insn->src_args[1]].value)&0xf)<<16;
orc_arm_emit (p, code);
} else if (p->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_PARAM) {
code = 0xf2100400;
ORC_ASM_CODE(p," vshl.s32 %s, %s, %s\n",
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc),
orc_neon_reg_name (p->vars[insn->src_args[0]].alloc),
orc_neon_reg_name (p->vars[insn->src_args[1]].alloc));
code |= (p->vars[insn->dest_args[0]].alloc&0xf)<<12;
code |= ((p->vars[insn->dest_args[0]].alloc>>4)&0x1)<<22;
code |= (p->vars[insn->src_args[0]].alloc&0xf)<<0;
code |= ((p->vars[insn->src_args[0]].alloc>>4)&0x1)<<5;
code |= (p->vars[insn->src_args[1]].alloc&0xf)<<16;
code |= ((p->vars[insn->src_args[1]].alloc>>4)&0x1)<<7;
orc_arm_emit (p, code);
} else {
ORC_PROGRAM_ERROR(p,"shift rule only works with constants and params");
}
}
#endif
static void
orc_neon_rule_andn (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int max_shift = ORC_PTR_TO_INT(user);
/* this is special because the operand order is reversed */
if (p->insn_shift <= max_shift) { \
orc_neon_emit_binary (p, "vbic", 0xf2100110,
p->vars[insn->dest_args[0]].alloc,
p->vars[insn->src_args[1]].alloc,
p->vars[insn->src_args[0]].alloc);
} else {
orc_neon_emit_binary_quad (p, "vbic", 0xf2100110,
p->vars[insn->dest_args[0]].alloc,
p->vars[insn->src_args[1]].alloc,
p->vars[insn->src_args[0]].alloc);
}
}
UNARY(absb,"vabs.s8",0xf3b10300, 3)
BINARY(addb,"vadd.i8",0xf2000800, 3)
BINARY(addssb,"vqadd.s8",0xf2000010, 3)
BINARY(addusb,"vqadd.u8",0xf3000010, 3)
BINARY(andb,"vand",0xf2000110, 3)
/* BINARY(andnb,"vbic",0xf2100110, 3) */
BINARY(avgsb,"vrhadd.s8",0xf2000100, 3)
BINARY(avgub,"vrhadd.u8",0xf3000100, 3)
BINARY(cmpeqb,"vceq.i8",0xf3000810, 3)
BINARY(cmpgtsb,"vcgt.s8",0xf2000300, 3)
MOVE(copyb,"vmov",0xf2200110, 3)
BINARY(maxsb,"vmax.s8",0xf2000600, 3)
BINARY(maxub,"vmax.u8",0xf3000600, 3)
BINARY(minsb,"vmin.s8",0xf2000610, 3)
BINARY(minub,"vmin.u8",0xf3000610, 3)
BINARY(mullb,"vmul.i8",0xf2000910, 3)
BINARY(orb,"vorr",0xf2200110, 3)
/* LSHIFT(shlb,"vshl.i8",0xf2880510, 3) */
/* RSHIFT(shrsb,"vshr.s8",0xf2880010,8, 3) */
/* RSHIFT(shrub,"vshr.u8",0xf3880010,8, 3) */
BINARY(subb,"vsub.i8",0xf3000800, 3)
BINARY(subssb,"vqsub.s8",0xf2000210, 3)
BINARY(subusb,"vqsub.u8",0xf3000210, 3)
BINARY(xorb,"veor",0xf3000110, 3)
UNARY(absw,"vabs.s16",0xf3b50300, 2)
BINARY(addw,"vadd.i16",0xf2100800, 2)
BINARY(addssw,"vqadd.s16",0xf2100010, 2)
BINARY(addusw,"vqadd.u16",0xf3100010, 2)
BINARY(andw,"vand",0xf2000110, 2)
/* BINARY(andnw,"vbic",0xf2100110, 2) */
BINARY(avgsw,"vrhadd.s16",0xf2100100, 2)
BINARY(avguw,"vrhadd.u16",0xf3100100, 2)
BINARY(cmpeqw,"vceq.i16",0xf3100810, 2)
BINARY(cmpgtsw,"vcgt.s16",0xf2100300, 2)
MOVE(copyw,"vmov",0xf2200110, 2)
BINARY(maxsw,"vmax.s16",0xf2100600, 2)
BINARY(maxuw,"vmax.u16",0xf3100600, 2)
BINARY(minsw,"vmin.s16",0xf2100610, 2)
BINARY(minuw,"vmin.u16",0xf3100610, 2)
BINARY(mullw,"vmul.i16",0xf2100910, 2)
BINARY(orw,"vorr",0xf2200110, 2)
/* LSHIFT(shlw,"vshl.i16",0xf2900510, 2) */
/* RSHIFT(shrsw,"vshr.s16",0xf2900010,16, 2) */
/* RSHIFT(shruw,"vshr.u16",0xf3900010,16, 2) */
BINARY(subw,"vsub.i16",0xf3100800, 2)
BINARY(subssw,"vqsub.s16",0xf2100210, 2)
BINARY(subusw,"vqsub.u16",0xf3100210, 2)
BINARY(xorw,"veor",0xf3000110, 2)
UNARY(absl,"vabs.s32",0xf3b90300, 1)
BINARY(addl,"vadd.i32",0xf2200800, 1)
BINARY(addssl,"vqadd.s32",0xf2200010, 1)
BINARY(addusl,"vqadd.u32",0xf3200010, 1)
BINARY(andl,"vand",0xf2000110, 1)
/* BINARY(andnl,"vbic",0xf2100110, 1) */
BINARY(avgsl,"vrhadd.s32",0xf2200100, 1)
BINARY(avgul,"vrhadd.u32",0xf3200100, 1)
BINARY(cmpeql,"vceq.i32",0xf3200810, 1)
BINARY(cmpgtsl,"vcgt.s32",0xf2200300, 1)
MOVE(copyl,"vmov",0xf2200110, 1)
BINARY(maxsl,"vmax.s32",0xf2200600, 1)
BINARY(maxul,"vmax.u32",0xf3200600, 1)
BINARY(minsl,"vmin.s32",0xf2200610, 1)
BINARY(minul,"vmin.u32",0xf3200610, 1)
BINARY(mulll,"vmul.i32",0xf2200910, 1)
BINARY(orl,"vorr",0xf2200110, 1)
/* LSHIFT(shll,"vshl.i32",0xf2a00510, 1) */
/* RSHIFT(shrsl,"vshr.s32",0xf2a00010,32, 1) */
/* RSHIFT(shrul,"vshr.u32",0xf3a00010,32, 1) */
BINARY(subl,"vsub.i32",0xf3200800, 1)
BINARY(subssl,"vqsub.s32",0xf2200210, 1)
BINARY(subusl,"vqsub.u32",0xf3200210, 1)
BINARY(xorl,"veor",0xf3000110, 1)
/* UNARY(absq,"vabs.s64",0xf3b10300, 0) */
BINARY(addq,"vadd.i64",0xf2300800, 0)
/* BINARY(addssq,"vqadd.s64",0xf2000010, 0) */
/* BINARY(addusq,"vqadd.u64",0xf3000010, 0) */
BINARY(andq,"vand",0xf2000110, 0)
/* BINARY(avgsq,"vrhadd.s64",0xf2000100, 0) */
/* BINARY(avguq,"vrhadd.u64",0xf3000100, 0) */
/* BINARY(cmpeqq,"vceq.i64",0xf3000810, 0) */
/* BINARY(cmpgtsq,"vcgt.s64",0xf2000300, 0) */
MOVE(copyq,"vmov",0xf2200110, 0)
/* BINARY(maxsq,"vmax.s64",0xf2000600, 0) */
/* BINARY(maxuq,"vmax.u64",0xf3000600, 0) */
/* BINARY(minsq,"vmin.s64",0xf2000610, 0) */
/* BINARY(minuq,"vmin.u64",0xf3000610, 0) */
/* BINARY(mullq,"vmul.i64",0xf2000910, 0) */
BINARY(orq,"vorr",0xf2200110, 0)
BINARY(subq,"vsub.i64",0xf3300800, 0)
/* BINARY(subssq,"vqsub.s64",0xf2000210, 0) */
/* BINARY(subusq,"vqsub.u64",0xf3000210, 0) */
BINARY(xorq,"veor",0xf3000110, 0)
UNARY_LONG(convsbw,"vmovl.s8",0xf2880a10, 3)
UNARY_LONG(convubw,"vmovl.u8",0xf3880a10, 3)
UNARY_LONG(convswl,"vmovl.s16",0xf2900a10, 2)
UNARY_LONG(convuwl,"vmovl.u16",0xf3900a10, 2)
UNARY_LONG(convslq,"vmovl.s32",0xf2a00a10, 1)
UNARY_LONG(convulq,"vmovl.u32",0xf3a00a10, 1)
UNARY_NARROW(convwb,"vmovn.i16",0xf3b20200, 3)
UNARY_NARROW(convssswb,"vqmovn.s16",0xf3b20280, 3)
UNARY_NARROW(convsuswb,"vqmovun.s16",0xf3b20240, 3)
UNARY_NARROW(convuuswb,"vqmovn.u16",0xf3b202c0, 3)
UNARY_NARROW(convlw,"vmovn.i32",0xf3b60200, 2)
UNARY_NARROW(convql,"vmovn.i64",0xf3ba0200, 1)
UNARY_NARROW(convssslw,"vqmovn.s32",0xf3b60280, 2)
UNARY_NARROW(convsuslw,"vqmovun.s32",0xf3b60240, 2)
UNARY_NARROW(convuuslw,"vqmovn.u32",0xf3b602c0, 2)
UNARY_NARROW(convsssql,"vqmovn.s64",0xf3ba0280, 1)
UNARY_NARROW(convsusql,"vqmovun.s64",0xf3ba0240, 1)
UNARY_NARROW(convuusql,"vqmovn.u64",0xf3ba02c0, 1)
BINARY_LONG(mulsbw,"vmull.s8",0xf2800c00, 3)
BINARY_LONG(mulubw,"vmull.u8",0xf3800c00, 3)
BINARY_LONG(mulswl,"vmull.s16",0xf2900c00, 2)
BINARY_LONG(muluwl,"vmull.u16",0xf3900c00, 2)
UNARY(swapw,"vrev16.i8",0xf3b00100, 2)
UNARY(swapl,"vrev32.i8",0xf3b00080, 1)
UNARY(swapq,"vrev64.i8",0xf3b00000, 0)
UNARY(swapwl,"vrev32.i16",0xf3b40080, 1)
UNARY(swaplq,"vrev64.i32",0xf3b80000, 0)
UNARY_NARROW(select0ql,"vmovn.i64",0xf3ba0200, 1)
UNARY_NARROW(select0lw,"vmovn.i32",0xf3b60200, 2)
UNARY_NARROW(select0wb,"vmovn.i16",0xf3b20200, 3)
BINARY(addf,"vadd.f32",0xf2000d00, 1)
BINARY(subf,"vsub.f32",0xf2200d00, 1)
BINARY(mulf,"vmul.f32",0xf3000d10, 1)
BINARY(maxf,"vmax.f32",0xf2000f00, 1)
BINARY(minf,"vmin.f32",0xf2200f00, 1)
BINARY(cmpeqf,"vceq.f32",0xf2000e00, 1)
/* BINARY_R(cmpltf,"vclt.f32",0xf3200e00, 1) */
/* BINARY_R(cmplef,"vcle.f32",0xf3000e00, 1) */
UNARY(convfl,"vcvt.s32.f32",0xf3bb0700, 1)
UNARY(convlf,"vcvt.f32.s32",0xf3bb0600, 1)
#define UNARY_VFP(opcode,insn_name,code,vec_shift) \
static void \
orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
{ \
orc_neon_emit_unary (p, insn_name, code, \
p->vars[insn->dest_args[0]].alloc, \
p->vars[insn->src_args[0]].alloc); \
if (p->insn_shift == vec_shift + 1) { \
orc_neon_emit_unary (p, insn_name, code, \
p->vars[insn->dest_args[0]].alloc + 1, \
p->vars[insn->src_args[0]].alloc + 1); \
} else { \
ORC_COMPILER_ERROR(p, "shift too large"); \
} \
}
#define BINARY_VFP(opcode,insn_name,code,vec_shift) \
static void \
orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
{ \
orc_neon_emit_binary (p, insn_name, code, \
p->vars[insn->dest_args[0]].alloc, \
p->vars[insn->src_args[0]].alloc, \
p->vars[insn->src_args[1]].alloc); \
if (p->insn_shift == vec_shift + 1) { \
orc_neon_emit_binary (p, insn_name, code, \
p->vars[insn->dest_args[0]].alloc+1, \
p->vars[insn->src_args[0]].alloc+1, \
p->vars[insn->src_args[1]].alloc+1); \
} else if (p->insn_shift > vec_shift + 1) { \
ORC_COMPILER_ERROR(p, "shift too large"); \
} \
}
BINARY_VFP(addd,"vadd.f64",0xee300b00, 0)
BINARY_VFP(subd,"vsub.f64",0xee300b40, 0)
BINARY_VFP(muld,"vmul.f64",0xee200b00, 0)
BINARY_VFP(divd,"vdiv.f64",0xee800b00, 0)
UNARY_VFP(sqrtd,"vsqrt.f64",0xeeb10b00, 0)
/* BINARY_VFP(cmpeqd,"vcmpe.f64",0xee000000, 0) */
UNARY_VFP(convdf,"vcvt.f64.f32",0xee200b00, 0)
UNARY_VFP(convfd,"vcvt.f32.f64",0xee200b00, 0)
#if 1
#define NUM_ITERS_DIVF 2
static void
orc_neon_rule_divf (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int vec_shift = 1;
if (p->insn_shift <= vec_shift) {
int i;
orc_neon_emit_unary (p, "vrecpe.f32", 0xf3bb0500,
p->tmpreg,
p->vars[insn->src_args[1]].alloc);
for(i = 0; i < NUM_ITERS_DIVF; i++) {
orc_neon_emit_binary (p, "vrecps.f32", 0xf2000f10,
p->tmpreg2, /* correction factor */
p->tmpreg, /* the last estimate */
p->vars[insn->src_args[1]].alloc); /* the original number */
orc_neon_emit_binary (p, "vmul.f32", 0xf3000d10,
p->tmpreg, /* revised estimate */
p->tmpreg, /* last estimate */
p->tmpreg2); /* correction factor */
}
orc_neon_emit_binary (p, "vmul.f32", 0xf3000d10,
p->vars[insn->dest_args[0]].alloc,
p->vars[insn->src_args[0]].alloc,
p->tmpreg);
} else if (p->insn_shift == vec_shift + 1) {
int i;
orc_neon_emit_unary_quad (p, "vrecpe.f32", 0xf3bb0500,
p->tmpreg,
p->vars[insn->src_args[1]].alloc);
for(i = 0; i < NUM_ITERS_DIVF; i++) {
orc_neon_emit_binary_quad (p, "vrecps.f32", 0xf2000f10,
p->tmpreg2, /* correction factor */
p->tmpreg, /* the last estimate */
p->vars[insn->src_args[1]].alloc); /* the original number */
orc_neon_emit_binary_quad (p, "vmul.f32", 0xf3000d10,
p->tmpreg, /* revised estimate */
p->tmpreg, /* last estimate */
p->tmpreg2); /* correction factor */
}
orc_neon_emit_binary_quad (p, "vmul.f32", 0xf3000d10,
p->vars[insn->dest_args[0]].alloc,
p->vars[insn->src_args[0]].alloc,
p->tmpreg);
} else {
ORC_COMPILER_ERROR(p, "shift too large");
}
}
#endif
#if 1
#define NUM_ITERS_SQRTF 2
static void
orc_neon_rule_sqrtf (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int vec_shift = 1;
if (p->insn_shift <= vec_shift) {
int i;
orc_neon_emit_unary (p, "vrsqrte.f32", 0xf3bb0580,
p->tmpreg,
p->vars[insn->src_args[0]].alloc);
for(i = 0; i < NUM_ITERS_SQRTF; i++) {
orc_neon_emit_binary (p, "vmul.f32", 0xf3000d10,
p->tmpreg2,
p->tmpreg,
p->vars[insn->src_args[0]].alloc);
orc_neon_emit_binary (p, "vrsqrts.f32", 0xf2200f10,
p->tmpreg2,
p->tmpreg,
p->tmpreg2);
orc_neon_emit_binary (p, "vmul.f32", 0xf3000d10,
p->tmpreg,
p->tmpreg,
p->tmpreg2);
}
orc_neon_emit_unary(p, "vrecpe.f32", 0xf3bb0500,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg);
for(i=0; i < NUM_ITERS_DIVF; i++) {
orc_neon_emit_binary (p, "vrecps.f32", 0xf2000f10,
p->tmpreg2, /* correction factor */
p->vars[insn->dest_args[0]].alloc, /* the last estimate */
p->tmpreg); /* the original number */
orc_neon_emit_binary (p, "vmul.f32", 0xf3000d10,
p->vars[insn->dest_args[0]].alloc, /* revised estimate */
p->vars[insn->dest_args[0]].alloc, /* last estimate */
p->tmpreg2); /* correction factor */
}
} else if (p->insn_shift == vec_shift + 1) {
int i;
orc_neon_emit_unary_quad (p, "vrsqrte.f32", 0xf3bb0580,
p->tmpreg,
p->vars[insn->src_args[0]].alloc);
for(i = 0; i < NUM_ITERS_SQRTF; i++) {
orc_neon_emit_binary_quad (p, "vmul.f32", 0xf3000d10,
p->tmpreg2,
p->tmpreg,
p->vars[insn->src_args[0]].alloc);
orc_neon_emit_binary_quad (p, "vrsqrts.f32", 0xf2200f10,
p->tmpreg2,
p->tmpreg,
p->tmpreg2);
orc_neon_emit_binary_quad (p, "vmul.f32", 0xf3000d10,
p->tmpreg,
p->tmpreg,
p->tmpreg2);
}
orc_neon_emit_unary_quad(p, "vrecpe.f32", 0xf3bb0500,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg);
for(i=0; i < NUM_ITERS_DIVF; i++) {
orc_neon_emit_binary_quad (p, "vrecps.f32", 0xf2000f10,
p->tmpreg2, /* correction factor */
p->vars[insn->dest_args[0]].alloc, /* the last estimate */
p->tmpreg); /* the original number */
orc_neon_emit_binary_quad (p, "vmul.f32", 0xf3000d10,
p->vars[insn->dest_args[0]].alloc, /* revised estimate */
p->vars[insn->dest_args[0]].alloc, /* last estimate */
p->tmpreg2); /* correction factor */
}
} else {
ORC_COMPILER_ERROR(p, "shift too large");
}
}
#endif
static void
orc_neon_rule_accw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
unsigned int code;
if (p->insn_shift < 2) {
ORC_ASM_CODE(p," vshl.i64 %s, %s, #%d\n",
orc_neon_reg_name (p->tmpreg),
orc_neon_reg_name (p->vars[insn->src_args[0]].alloc), 48);
code = NEON_BINARY(0xf2a00590, p->tmpreg, 0,
p->vars[insn->src_args[0]].alloc);
code |= (48) << 16;
orc_arm_emit (p, code);
orc_neon_emit_binary (p, "vadd.i16", 0xf2100800,
p->vars[insn->dest_args[0]].alloc,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg);
} else {
orc_neon_emit_binary (p, "vadd.i16", 0xf2100800,
p->vars[insn->dest_args[0]].alloc,
p->vars[insn->dest_args[0]].alloc,
p->vars[insn->src_args[0]].alloc);
}
}
static void
orc_neon_rule_accl (OrcCompiler *p, void *user, OrcInstruction *insn)
{
unsigned int code;
if (p->insn_shift < 1) {
ORC_ASM_CODE(p," vshl.i64 %s, %s, #%d\n",
orc_neon_reg_name (p->tmpreg),
orc_neon_reg_name (p->vars[insn->src_args[0]].alloc), 32);
code = NEON_BINARY(0xf2a00590, p->tmpreg, 0,
p->vars[insn->src_args[0]].alloc);
code |= (32) << 16;
orc_arm_emit (p, code);
orc_neon_emit_binary (p, "vadd.i32", 0xf2200800,
p->vars[insn->dest_args[0]].alloc,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg);
} else {
orc_neon_emit_binary (p, "vadd.i32", 0xf2200800,
p->vars[insn->dest_args[0]].alloc,
p->vars[insn->dest_args[0]].alloc,
p->vars[insn->src_args[0]].alloc);
}
}
static void
orc_neon_rule_select1wb (OrcCompiler *p, void *user, OrcInstruction *insn)
{
unsigned int code;
ORC_ASM_CODE(p," vshrn.i16 %s, %s, #%d\n",
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc),
orc_neon_reg_name_quad (p->vars[insn->src_args[0]].alloc), 8);
code = NEON_BINARY (0xf2880810,
p->vars[insn->dest_args[0]].alloc,
0, p->vars[insn->src_args[0]].alloc);
orc_arm_emit (p, code);
}
static void
orc_neon_rule_select1lw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
unsigned int code;
ORC_ASM_CODE(p," vshrn.i32 %s, %s, #%d\n",
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc),
orc_neon_reg_name_quad (p->vars[insn->src_args[0]].alloc), 16);
code = NEON_BINARY (0xf2900810,
p->vars[insn->dest_args[0]].alloc,
0, p->vars[insn->src_args[0]].alloc);
orc_arm_emit (p, code);
}
static void
orc_neon_rule_select1ql (OrcCompiler *p, void *user, OrcInstruction *insn)
{
unsigned int code;
ORC_ASM_CODE(p," vtrn.32 %s, %s\n",
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc),
orc_neon_reg_name_quad (p->vars[insn->src_args[0]].alloc));
code = NEON_BINARY (0xf2a00810,
p->vars[insn->dest_args[0]].alloc,
0, p->vars[insn->src_args[0]].alloc);
orc_arm_emit (p, code);
}
static void
orc_neon_rule_convhwb (OrcCompiler *p, void *user, OrcInstruction *insn)
{
unsigned int code;
ORC_ASM_CODE(p," vshrn.i16 %s, %s, #%d\n",
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc),
orc_neon_reg_name_quad (p->vars[insn->src_args[0]].alloc), 8);
code = NEON_BINARY (0xf2880810,
p->vars[insn->dest_args[0]].alloc,
0, p->vars[insn->src_args[0]].alloc);
orc_arm_emit (p, code);
}
static void
orc_neon_rule_convhlw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
unsigned int code;
ORC_ASM_CODE(p," vshrn.i32 %s, %s, #%d\n",
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc),
orc_neon_reg_name_quad (p->vars[insn->src_args[0]].alloc), 16);
code = NEON_BINARY (0xf2900810,
p->vars[insn->dest_args[0]].alloc,
0, p->vars[insn->src_args[0]].alloc);
orc_arm_emit (p, code);
}
static void
orc_neon_rule_mergebw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
if (p->insn_shift <= 2) {
if (p->vars[insn->dest_args[0]].alloc != p->vars[insn->src_args[0]].alloc) {
orc_neon_emit_mov (p, p->vars[insn->dest_args[0]].alloc,
p->vars[insn->src_args[0]].alloc);
}
if (p->vars[insn->src_args[1]].last_use != p->insn_index ||
p->vars[insn->src_args[1]].alloc == p->vars[insn->dest_args[0]].alloc) {
orc_neon_emit_mov (p, p->tmpreg, p->vars[insn->src_args[1]].alloc);
orc_neon_emit_unary (p, "vzip.8", 0xf3b20180,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg);
} else {
orc_neon_emit_unary (p, "vzip.8", 0xf3b20180,
p->vars[insn->dest_args[0]].alloc,
p->vars[insn->src_args[1]].alloc);
}
} else {
if (p->vars[insn->dest_args[0]].alloc != p->vars[insn->src_args[0]].alloc) {
orc_neon_emit_mov_quad (p, p->vars[insn->dest_args[0]].alloc,
p->vars[insn->src_args[0]].alloc);
}
orc_neon_emit_mov_quad (p, p->tmpreg, p->vars[insn->src_args[1]].alloc);
orc_neon_emit_unary_quad (p, "vzip.8", 0xf3b20180,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg);
}
}
static void
orc_neon_rule_mergewl (OrcCompiler *p, void *user, OrcInstruction *insn)
{
if (p->insn_shift <= 1) {
if (p->vars[insn->dest_args[0]].alloc != p->vars[insn->src_args[0]].alloc) {
orc_neon_emit_mov (p, p->vars[insn->dest_args[0]].alloc,
p->vars[insn->src_args[0]].alloc);
}
if (p->vars[insn->src_args[1]].last_use != p->insn_index ||
p->vars[insn->src_args[1]].alloc == p->vars[insn->dest_args[0]].alloc) {
orc_neon_emit_mov (p, p->tmpreg, p->vars[insn->src_args[1]].alloc);
orc_neon_emit_unary (p, "vzip.16", 0xf3b60180,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg);
} else {
orc_neon_emit_unary (p, "vzip.16", 0xf3b60180,
p->vars[insn->dest_args[0]].alloc,
p->vars[insn->src_args[1]].alloc);
}
} else {
if (p->vars[insn->dest_args[0]].alloc != p->vars[insn->src_args[0]].alloc) {
orc_neon_emit_mov_quad (p, p->vars[insn->dest_args[0]].alloc,
p->vars[insn->src_args[0]].alloc);
}
if (p->vars[insn->src_args[1]].last_use != p->insn_index ||
p->vars[insn->src_args[1]].alloc == p->vars[insn->dest_args[0]].alloc) {
orc_neon_emit_mov_quad (p, p->tmpreg, p->vars[insn->src_args[1]].alloc);
orc_neon_emit_unary_quad (p, "vzip.16", 0xf3b60180,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg);
} else {
orc_neon_emit_unary_quad (p, "vzip.16", 0xf3b60180,
p->vars[insn->dest_args[0]].alloc,
p->vars[insn->src_args[1]].alloc);
}
}
}
static void
orc_neon_rule_mergelq (OrcCompiler *p, void *user, OrcInstruction *insn)
{
if (p->insn_shift <= 0) {
if (p->vars[insn->dest_args[0]].alloc != p->vars[insn->src_args[0]].alloc) {
orc_neon_emit_mov (p, p->vars[insn->dest_args[0]].alloc,
p->vars[insn->src_args[0]].alloc);
}
if (p->vars[insn->src_args[1]].last_use != p->insn_index ||
p->vars[insn->src_args[1]].alloc == p->vars[insn->dest_args[0]].alloc) {
orc_neon_emit_mov (p, p->tmpreg, p->vars[insn->src_args[1]].alloc);
orc_neon_emit_unary (p, "vtrn.32", 0xf3ba0080,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg);
} else {
orc_neon_emit_unary (p, "vtrn.32", 0xf3ba0080,
p->vars[insn->dest_args[0]].alloc,
p->vars[insn->src_args[1]].alloc);
}
} else {
if (p->vars[insn->dest_args[0]].alloc != p->vars[insn->src_args[0]].alloc) {
orc_neon_emit_mov_quad (p, p->vars[insn->dest_args[0]].alloc,
p->vars[insn->src_args[0]].alloc);
}
if (p->vars[insn->src_args[1]].last_use != p->insn_index ||
p->vars[insn->src_args[1]].alloc == p->vars[insn->dest_args[0]].alloc) {
orc_neon_emit_mov_quad (p, p->tmpreg, p->vars[insn->src_args[1]].alloc);
orc_neon_emit_unary_quad (p, "vzip.32", 0xf3ba0180,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg);
} else {
orc_neon_emit_unary_quad (p, "vzip.32", 0xf3ba0180,
p->vars[insn->dest_args[0]].alloc,
p->vars[insn->src_args[1]].alloc);
}
}
}
static void
orc_neon_rule_splatbw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
if (p->insn_shift <= 2) {
if (p->vars[insn->dest_args[0]].alloc != p->vars[insn->src_args[0]].alloc) {
orc_neon_emit_mov (p, p->vars[insn->dest_args[0]].alloc,
p->vars[insn->src_args[0]].alloc);
}
orc_neon_emit_mov (p, p->tmpreg, p->vars[insn->dest_args[0]].alloc);
orc_neon_emit_unary (p, "vzip.8", 0xf3b20180,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg);
} else {
if (p->vars[insn->dest_args[0]].alloc != p->vars[insn->src_args[0]].alloc) {
orc_neon_emit_mov_quad (p, p->vars[insn->dest_args[0]].alloc,
p->vars[insn->src_args[0]].alloc);
}
orc_neon_emit_mov_quad (p, p->tmpreg, p->vars[insn->dest_args[0]].alloc);
orc_neon_emit_unary_quad (p, "vzip.8", 0xf3b20180,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg);
}
}
static void
orc_neon_rule_splatbl (OrcCompiler *p, void *user, OrcInstruction *insn)
{
if (p->insn_shift <= 1) {
if (p->vars[insn->dest_args[0]].alloc != p->vars[insn->src_args[0]].alloc) {
orc_neon_emit_mov (p, p->vars[insn->dest_args[0]].alloc,
p->vars[insn->src_args[0]].alloc);
}
orc_neon_emit_mov (p, p->tmpreg, p->vars[insn->dest_args[0]].alloc);
orc_neon_emit_unary (p, "vzip.8", 0xf3b20180,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg);
orc_neon_emit_mov (p, p->tmpreg, p->vars[insn->dest_args[0]].alloc);
orc_neon_emit_unary (p, "vzip.16", 0xf3b60180,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg);
} else {
if (p->vars[insn->dest_args[0]].alloc != p->vars[insn->src_args[0]].alloc) {
orc_neon_emit_mov_quad (p, p->vars[insn->dest_args[0]].alloc,
p->vars[insn->src_args[0]].alloc);
}
orc_neon_emit_mov (p, p->tmpreg, p->vars[insn->dest_args[0]].alloc);
orc_neon_emit_unary_quad (p, "vzip.8", 0xf3b20180,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg);
orc_neon_emit_mov (p, p->tmpreg, p->vars[insn->dest_args[0]].alloc);
orc_neon_emit_unary_quad (p, "vzip.16", 0xf3b60180,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg);
}
}
static void
orc_neon_rule_splatw3q (OrcCompiler *p, void *user, OrcInstruction *insn)
{
orc_uint32 code;
int offset = 0;
int label = 20;
orc_arm_add_fixup (p, label, 1);
ORC_ASM_CODE(p," vldr %s, .L%d+%d\n",
orc_neon_reg_name (p->tmpreg), label, offset);
code = 0xed9f0b00;
code |= (p->tmpreg&0xf) << 12;
code |= ((p->tmpreg>>4)&0x1) << 22;
code |= ((offset - 8) >> 2)&0xff;
orc_arm_emit (p, code);
ORC_ASM_CODE(p," vtbl.8 %s, { %s, %s }, %s\n",
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc),
orc_neon_reg_name (p->vars[insn->src_args[0]].alloc),
orc_neon_reg_name (p->vars[insn->src_args[0]].alloc + 1),
orc_neon_reg_name (p->tmpreg));
code = NEON_BINARY(0xf3b00900,
p->vars[insn->dest_args[0]].alloc,
p->vars[insn->src_args[0]].alloc,
p->tmpreg);
orc_arm_emit (p, code);
if (p->insn_shift > 0) {
ORC_ASM_CODE(p," vtbl.8 %s, { %s }, %s\n",
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc+1),
orc_neon_reg_name (p->vars[insn->src_args[0]].alloc+1),
orc_neon_reg_name (p->tmpreg));
code = NEON_BINARY(0xf3b00800,
p->vars[insn->dest_args[0]].alloc+1,
p->vars[insn->src_args[0]].alloc+1,
p->tmpreg);
orc_arm_emit (p, code);
}
}
static void
orc_neon_rule_accsadubl (OrcCompiler *p, void *user, OrcInstruction *insn)
{
orc_uint32 x;
unsigned int code;
if (p->insn_shift < 2) {
x = 0xf3800700;
ORC_ASM_CODE(p," vabdl.u8 %s, %s, %s\n",
orc_neon_reg_name_quad (p->tmpreg),
orc_neon_reg_name (p->vars[insn->src_args[0]].alloc),
orc_neon_reg_name (p->vars[insn->src_args[1]].alloc));
x |= (p->tmpreg&0xf)<<12;
x |= ((p->tmpreg>>4)&0x1)<<22;
x |= (p->vars[insn->src_args[0]].alloc&0xf)<<16;
x |= ((p->vars[insn->src_args[0]].alloc>>4)&0x1)<<7;
x |= (p->vars[insn->src_args[1]].alloc&0xf)<<0;
x |= ((p->vars[insn->src_args[1]].alloc>>4)&0x1)<<5;
orc_arm_emit (p, x);
ORC_ASM_CODE(p," vshl.i64 %s, %s, #%d\n",
orc_neon_reg_name (p->tmpreg),
orc_neon_reg_name (p->tmpreg), 64 - (16<<p->insn_shift));
code = NEON_BINARY(0xf2a00590, p->tmpreg, 0, p->tmpreg);
code |= (64 - (16<<p->insn_shift)) << 16;
orc_arm_emit (p, code);
orc_neon_emit_unary (p, "vpadal.u16", 0xf3b40680,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg);
} else {
x = 0xf3800700;
ORC_ASM_CODE(p," vabdl.u8 %s, %s, %s\n",
orc_neon_reg_name_quad (p->tmpreg),
orc_neon_reg_name (p->vars[insn->src_args[0]].alloc),
orc_neon_reg_name (p->vars[insn->src_args[1]].alloc));
x |= (p->tmpreg&0xf)<<12;
x |= ((p->tmpreg>>4)&0x1)<<22;
x |= (p->vars[insn->src_args[0]].alloc&0xf)<<16;
x |= ((p->vars[insn->src_args[0]].alloc>>4)&0x1)<<7;
x |= (p->vars[insn->src_args[1]].alloc&0xf)<<0;
x |= ((p->vars[insn->src_args[1]].alloc>>4)&0x1)<<5;
orc_arm_emit (p, x);
orc_neon_emit_unary (p, "vpadal.u16", 0xf3b40680,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg);
}
}
static void
orc_neon_rule_signw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
/* slow */
orc_neon_emit_loadiw (p, p->tmpreg, 1);
if (p->insn_shift < 3) {
orc_neon_emit_binary (p, "vmin.s16", 0xf2100610,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg,
p->vars[insn->src_args[0]].alloc);
} else {
orc_neon_emit_binary_quad (p, "vmin.s16", 0xf2100610,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg,
p->vars[insn->src_args[0]].alloc);
}
orc_neon_emit_loadiw (p, p->tmpreg, -1);
if (p->insn_shift < 3) {
orc_neon_emit_binary (p, "vmax.s16", 0xf2100600,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg,
p->vars[insn->dest_args[0]].alloc);
} else {
orc_neon_emit_binary_quad (p, "vmax.s16", 0xf2100600,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg,
p->vars[insn->dest_args[0]].alloc);
}
}
static void
orc_neon_rule_signb (OrcCompiler *p, void *user, OrcInstruction *insn)
{
/* slow */
orc_neon_emit_loadib (p, p->tmpreg, 1);
if (p->insn_shift < 4) {
orc_neon_emit_binary (p, "vmin.s8", 0xf2000610,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg,
p->vars[insn->src_args[0]].alloc);
} else {
orc_neon_emit_binary_quad (p, "vmin.s8", 0xf2000610,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg,
p->vars[insn->src_args[0]].alloc);
}
orc_neon_emit_loadib (p, p->tmpreg, -1);
if (p->insn_shift < 4) {
orc_neon_emit_binary (p, "vmax.s8", 0xf2000600,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg,
p->vars[insn->dest_args[0]].alloc);
} else {
orc_neon_emit_binary_quad (p, "vmax.s8", 0xf2000600,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg,
p->vars[insn->dest_args[0]].alloc);
}
}
static void
orc_neon_rule_signl (OrcCompiler *p, void *user, OrcInstruction *insn)
{
/* slow */
orc_neon_emit_loadil (p, p->tmpreg, 1);
if (p->insn_shift < 2) {
orc_neon_emit_binary (p, "vmin.s32", 0xf2200610,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg,
p->vars[insn->src_args[0]].alloc);
} else {
orc_neon_emit_binary_quad (p, "vmin.s32", 0xf2200610,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg,
p->vars[insn->src_args[0]].alloc);
}
orc_neon_emit_loadil (p, p->tmpreg, -1);
if (p->insn_shift < 2) {
orc_neon_emit_binary (p, "vmax.s32", 0xf2200600,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg,
p->vars[insn->dest_args[0]].alloc);
} else {
orc_neon_emit_binary_quad (p, "vmax.s32", 0xf2200600,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg,
p->vars[insn->dest_args[0]].alloc);
}
}
static void
orc_neon_rule_mulhub (OrcCompiler *p, void *user, OrcInstruction *insn)
{
unsigned int code;
orc_neon_emit_binary_long (p, "vmull.u8",0xf3800c00,
p->tmpreg,
p->vars[insn->src_args[0]].alloc,
p->vars[insn->src_args[1]].alloc);
ORC_ASM_CODE(p," vshrn.i16 %s, %s, #%d\n",
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc),
orc_neon_reg_name_quad (p->tmpreg), 8);
code = NEON_BINARY (0xf2880810,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg, 0);
orc_arm_emit (p, code);
if (p->insn_shift == 4) {
orc_neon_emit_binary_long (p, "vmull.u8",0xf3800c00,
p->tmpreg,
p->vars[insn->src_args[0]].alloc + 1,
p->vars[insn->src_args[1]].alloc + 1);
ORC_ASM_CODE(p," vshrn.i16 %s, %s, #%d\n",
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc + 1),
orc_neon_reg_name_quad (p->tmpreg), 8);
code = NEON_BINARY (0xf2880810,
p->vars[insn->dest_args[0]].alloc + 1,
p->tmpreg, 0);
orc_arm_emit (p, code);
}
}
static void
orc_neon_rule_mulhsb (OrcCompiler *p, void *user, OrcInstruction *insn)
{
unsigned int code;
orc_neon_emit_binary_long (p, "vmull.s8",0xf2800c00,
p->tmpreg,
p->vars[insn->src_args[0]].alloc,
p->vars[insn->src_args[1]].alloc);
ORC_ASM_CODE(p," vshrn.i16 %s, %s, #%d\n",
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc),
orc_neon_reg_name_quad (p->tmpreg), 8);
code = NEON_BINARY (0xf2880810,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg, 0);
orc_arm_emit (p, code);
if (p->insn_shift == 4) {
orc_neon_emit_binary_long (p, "vmull.s8",0xf2800c00,
p->tmpreg,
p->vars[insn->src_args[0]].alloc + 1,
p->vars[insn->src_args[1]].alloc + 1);
ORC_ASM_CODE(p," vshrn.i16 %s, %s, #%d\n",
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc + 1),
orc_neon_reg_name_quad (p->tmpreg), 8);
code = NEON_BINARY (0xf2880810,
p->vars[insn->dest_args[0]].alloc + 1,
p->tmpreg, 0);
orc_arm_emit (p, code);
}
}
static void
orc_neon_rule_mulhuw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
unsigned int code;
orc_neon_emit_binary_long (p, "vmull.u16",0xf3900c00,
p->tmpreg,
p->vars[insn->src_args[0]].alloc,
p->vars[insn->src_args[1]].alloc);
ORC_ASM_CODE(p," vshrn.i32 %s, %s, #%d\n",
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc),
orc_neon_reg_name_quad (p->tmpreg), 16);
code = NEON_BINARY (0xf2900810,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg, 0);
orc_arm_emit (p, code);
if (p->insn_shift == 3) {
orc_neon_emit_binary_long (p, "vmull.u16",0xf3900c00,
p->tmpreg,
p->vars[insn->src_args[0]].alloc + 1,
p->vars[insn->src_args[1]].alloc + 1);
ORC_ASM_CODE(p," vshrn.i32 %s, %s, #%d\n",
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc + 1),
orc_neon_reg_name_quad (p->tmpreg), 16);
code = NEON_BINARY (0xf2900810,
p->vars[insn->dest_args[0]].alloc + 1,
p->tmpreg, 0);
orc_arm_emit (p, code);
}
}
static void
orc_neon_rule_mulhsw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
unsigned int code;
orc_neon_emit_binary_long (p, "vmull.s16",0xf2900c00,
p->tmpreg,
p->vars[insn->src_args[0]].alloc,
p->vars[insn->src_args[1]].alloc);
ORC_ASM_CODE(p," vshrn.i32 %s, %s, #%d\n",
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc),
orc_neon_reg_name_quad (p->tmpreg), 16);
code = NEON_BINARY (0xf2900810,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg, 0);
orc_arm_emit (p, code);
if (p->insn_shift == 3) {
orc_neon_emit_binary_long (p, "vmull.s16",0xf2900c00,
p->tmpreg,
p->vars[insn->src_args[0]].alloc + 1,
p->vars[insn->src_args[1]].alloc + 1);
ORC_ASM_CODE(p," vshrn.i32 %s, %s, #%d\n",
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc + 1),
orc_neon_reg_name_quad (p->tmpreg), 16);
code = NEON_BINARY (0xf2900810,
p->vars[insn->dest_args[0]].alloc + 1,
p->tmpreg, 0);
orc_arm_emit (p, code);
}
}
static void
orc_neon_rule_mulhul (OrcCompiler *p, void *user, OrcInstruction *insn)
{
unsigned int code;
orc_neon_emit_binary_long (p, "vmull.u32",0xf3a00c00,
p->tmpreg,
p->vars[insn->src_args[0]].alloc,
p->vars[insn->src_args[1]].alloc);
ORC_ASM_CODE(p," vshrn.i64 %s, %s, #%d\n",
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc),
orc_neon_reg_name_quad (p->tmpreg), 32);
code = NEON_BINARY (0xf2a00810,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg, 0);
orc_arm_emit (p, code);
if (p->insn_shift == 2) {
orc_neon_emit_binary_long (p, "vmull.u32",0xf3a00c00,
p->tmpreg,
p->vars[insn->src_args[0]].alloc + 1,
p->vars[insn->src_args[1]].alloc + 1);
ORC_ASM_CODE(p," vshrn.i64 %s, %s, #%d\n",
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc + 1),
orc_neon_reg_name_quad (p->tmpreg), 32);
code = NEON_BINARY (0xf2a00810,
p->vars[insn->dest_args[0]].alloc + 1,
p->tmpreg, 0);
orc_arm_emit (p, code);
}
}
static void
orc_neon_rule_mulhsl (OrcCompiler *p, void *user, OrcInstruction *insn)
{
unsigned int code;
orc_neon_emit_binary_long (p, "vmull.s32",0xf2a00c00,
p->tmpreg,
p->vars[insn->src_args[0]].alloc,
p->vars[insn->src_args[1]].alloc);
ORC_ASM_CODE(p," vshrn.i64 %s, %s, #%d\n",
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc),
orc_neon_reg_name_quad (p->tmpreg), 32);
code = NEON_BINARY (0xf2a00810,
p->vars[insn->dest_args[0]].alloc,
p->tmpreg, 0);
orc_arm_emit (p, code);
if (p->insn_shift == 2) {
orc_neon_emit_binary_long (p, "vmull.s32",0xf2a00c00,
p->tmpreg,
p->vars[insn->src_args[0]].alloc + 1,
p->vars[insn->src_args[1]].alloc + 1);
ORC_ASM_CODE(p," vshrn.i64 %s, %s, #%d\n",
orc_neon_reg_name (p->vars[insn->dest_args[0]].alloc + 1),
orc_neon_reg_name_quad (p->tmpreg), 32);
code = NEON_BINARY (0xf2a00810,
p->vars[insn->dest_args[0]].alloc + 1,
p->tmpreg, 0);
orc_arm_emit (p, code);
}
}
static void
orc_neon_rule_splitql (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest0 = p->vars[insn->dest_args[0]].alloc;
int dest1 = p->vars[insn->dest_args[1]].alloc;
int src = p->vars[insn->src_args[0]].alloc;
if (p->insn_shift < 1) {
if (src != dest0) {
orc_neon_emit_mov (p, dest0, src);
}
if (src != dest1) {
orc_neon_emit_mov (p, dest1, src);
}
orc_neon_emit_unary (p, "vtrn.32", 0xf3ba0080, dest1, dest0);
} else {
if (src != dest0) {
orc_neon_emit_mov_quad (p, dest0, src);
}
if (src != dest1) {
orc_neon_emit_mov_quad (p, dest1, src);
}
orc_neon_emit_unary_quad (p, "vuzp.32", 0xf3ba0140, dest1, dest0);
}
}
static void
orc_neon_rule_splitlw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest0 = p->vars[insn->dest_args[0]].alloc;
int dest1 = p->vars[insn->dest_args[1]].alloc;
int src = p->vars[insn->src_args[0]].alloc;
if (p->insn_shift < 2) {
if (src != dest0) {
orc_neon_emit_mov (p, dest0, src);
}
if (src != dest1) {
orc_neon_emit_mov (p, dest1, src);
}
orc_neon_emit_unary (p, "vuzp.16", 0xf3b60100, dest1, dest0);
} else {
if (src != dest0) {
orc_neon_emit_mov_quad (p, dest0, src);
}
if (src != dest1) {
orc_neon_emit_mov_quad (p, dest1, src);
}
orc_neon_emit_unary_quad (p, "vuzp.16", 0xf3b60140, dest1, dest0);
}
}
static void
orc_neon_rule_splitwb (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest0 = p->vars[insn->dest_args[0]].alloc;
int dest1 = p->vars[insn->dest_args[1]].alloc;
int src = p->vars[insn->src_args[0]].alloc;
if (p->insn_shift < 2) {
if (src != dest0) {
orc_neon_emit_mov (p, dest0, src);
}
if (src != dest1) {
orc_neon_emit_mov (p, dest1, src);
}
orc_neon_emit_unary (p, "vuzp.8", 0xf3b20100, dest1, dest0);
} else {
if (src != dest0) {
orc_neon_emit_mov_quad (p, dest0, src);
}
if (src != dest1) {
orc_neon_emit_mov_quad (p, dest1, src);
}
orc_neon_emit_unary_quad (p, "vuzp.8", 0xf3b20140, dest1, dest0);
}
}
static void
orc_neon_rule_div255w (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest = p->vars[insn->dest_args[0]].alloc;
int src = p->vars[insn->src_args[0]].alloc;
int tmp = p->tmpreg;
if (p->insn_shift < 3) {
ORC_ASM_CODE(p," vrshrn.u16 %s, %s, #%d\n", orc_neon_reg_name(tmp),
orc_neon_reg_name_quad(src), 8);
orc_arm_emit (p, NEON_BINARY (0xf2880850, tmp, 0, src));
orc_neon_emit_unary_long (p, "vmovl.u8",0xf3880a10, tmp, tmp);
orc_neon_emit_binary (p, "vadd.i16", 0xf2100800, tmp, tmp, src);
ORC_ASM_CODE(p," vrshrn.u16 %s, %s, #%d\n", orc_neon_reg_name(dest),
orc_neon_reg_name_quad(tmp), 8);
orc_arm_emit (p, NEON_BINARY (0xf2880850, dest, 0, tmp));
orc_neon_emit_unary_long (p, "vmovl.u8",0xf3880a10, dest, dest);
} else {
ORC_ASM_CODE(p," vrshrn.u16 %s, %s, #%d\n", orc_neon_reg_name(tmp),
orc_neon_reg_name_quad(src), 8);
orc_arm_emit (p, NEON_BINARY (0xf2880850, tmp, 0, src));
orc_neon_emit_unary_long (p, "vmovl.u8",0xf3880a10, tmp, tmp);
orc_neon_emit_binary_quad (p, "vadd.i16", 0xf2100800, tmp, tmp, src);
ORC_ASM_CODE(p," vrshrn.u16 %s, %s, #%d\n", orc_neon_reg_name(dest),
orc_neon_reg_name_quad(tmp), 8);
orc_arm_emit (p, NEON_BINARY (0xf2880850, dest, 0, tmp));
orc_neon_emit_unary_long (p, "vmovl.u8",0xf3880a10, dest, dest);
}
}
void
orc_compiler_neon_register_rules (OrcTarget *target)
{
OrcRuleSet *rule_set;
rule_set = orc_rule_set_new (orc_opcode_set_get("sys"), target, 0);
#define REG(x) \
orc_rule_register (rule_set, #x , orc_neon_rule_ ## x, NULL)
REG(absb);
REG(addb);
REG(addssb);
REG(addusb);
REG(andb);
/* REG(andnb); */
REG(avgsb);
REG(avgub);
REG(cmpeqb);
REG(cmpgtsb);
REG(copyb);
REG(maxsb);
REG(maxub);
REG(minsb);
REG(minub);
REG(mullb);
REG(mulhsb);
REG(mulhub);
REG(orb);
/* REG(shlb); */
/* REG(shrsb); */
/* REG(shrub); */
REG(signb);
REG(subb);
REG(subssb);
REG(subusb);
REG(xorb);
REG(absw);
REG(addw);
REG(addssw);
REG(addusw);
REG(andw);
/* REG(andnw); */
REG(avgsw);
REG(avguw);
REG(cmpeqw);
REG(cmpgtsw);
REG(copyw);
REG(maxsw);
REG(maxuw);
REG(minsw);
REG(minuw);
REG(mullw);
REG(mulhsw);
REG(mulhuw);
REG(orw);
/* REG(shlw); */
/* REG(shrsw); */
/* REG(shruw); */
REG(signw);
REG(subw);
REG(subssw);
REG(subusw);
REG(xorw);
REG(absl);
REG(addl);
REG(addssl);
REG(addusl);
REG(andl);
/* REG(andnl); */
REG(avgsl);
REG(avgul);
REG(cmpeql);
REG(cmpgtsl);
REG(copyl);
REG(maxsl);
REG(maxul);
REG(minsl);
REG(minul);
REG(mulll);
REG(mulhsl);
REG(mulhul);
REG(orl);
/* REG(shll); */
/* REG(shrsl); */
/* REG(shrul); */
REG(signl);
REG(subl);
REG(subssl);
REG(subusl);
REG(xorl);
REG(addq);
REG(andq);
REG(orq);
REG(copyq);
REG(subq);
REG(xorq);
REG(convsbw);
REG(convubw);
REG(convswl);
REG(convuwl);
REG(convslq);
REG(convulq);
REG(convlw);
REG(convql);
REG(convssslw);
REG(convsuslw);
REG(convuuslw);
REG(convsssql);
REG(convsusql);
REG(convuusql);
REG(convwb);
REG(convhwb);
REG(convhlw);
REG(convssswb);
REG(convsuswb);
REG(convuuswb);
REG(mulsbw);
REG(mulubw);
REG(mulswl);
REG(muluwl);
REG(accw);
REG(accl);
REG(accsadubl);
REG(swapw);
REG(swapl);
REG(swapq);
REG(swapwl);
REG(swaplq);
REG(select0wb);
REG(select1wb);
REG(select0lw);
REG(select1lw);
REG(select0ql);
if (0) REG(select1ql);
REG(mergebw);
REG(mergewl);
REG(mergelq);
REG(splitql);
REG(splitlw);
REG(splitwb);
REG(addf);
REG(subf);
REG(mulf);
REG(divf);
REG(sqrtf);
REG(maxf);
REG(minf);
REG(cmpeqf);
/* REG(cmpltf); */
/* REG(cmplef); */
REG(convfl);
REG(convlf);
REG(addd);
REG(subd);
REG(muld);
REG(divd);
REG(sqrtd);
/* REG(cmpeqd); */
REG(convdf);
REG(convfd);
REG(splatbw);
REG(splatbl);
REG(splatw3q);
REG(div255w);
orc_rule_register (rule_set, "loadpb", neon_rule_loadpX, (void *)1);
orc_rule_register (rule_set, "loadpw", neon_rule_loadpX, (void *)2);
orc_rule_register (rule_set, "loadpl", neon_rule_loadpX, (void *)4);
orc_rule_register (rule_set, "loadpq", neon_rule_loadpX, (void *)8);
orc_rule_register (rule_set, "loadb", neon_rule_loadX, (void *)0);
orc_rule_register (rule_set, "loadw", neon_rule_loadX, (void *)0);
orc_rule_register (rule_set, "loadl", neon_rule_loadX, (void *)0);
orc_rule_register (rule_set, "loadq", neon_rule_loadX, (void *)0);
orc_rule_register (rule_set, "loadoffb", neon_rule_loadX, (void *)1);
orc_rule_register (rule_set, "loadoffw", neon_rule_loadX, (void *)1);
orc_rule_register (rule_set, "loadoffl", neon_rule_loadX, (void *)1);
orc_rule_register (rule_set, "storeb", neon_rule_storeX, (void *)0);
orc_rule_register (rule_set, "storew", neon_rule_storeX, (void *)0);
orc_rule_register (rule_set, "storel", neon_rule_storeX, (void *)0);
orc_rule_register (rule_set, "storeq", neon_rule_storeX, (void *)0);
orc_rule_register (rule_set, "shlb", orc_neon_rule_shift, (void *)0);
orc_rule_register (rule_set, "shrsb", orc_neon_rule_shift, (void *)1);
orc_rule_register (rule_set, "shrub", orc_neon_rule_shift, (void *)2);
orc_rule_register (rule_set, "shlw", orc_neon_rule_shift, (void *)3);
orc_rule_register (rule_set, "shrsw", orc_neon_rule_shift, (void *)4);
orc_rule_register (rule_set, "shruw", orc_neon_rule_shift, (void *)5);
orc_rule_register (rule_set, "shll", orc_neon_rule_shift, (void *)6);
orc_rule_register (rule_set, "shrsl", orc_neon_rule_shift, (void *)7);
orc_rule_register (rule_set, "shrul", orc_neon_rule_shift, (void *)8);
orc_rule_register (rule_set, "andnb", orc_neon_rule_andn, (void *)3);
orc_rule_register (rule_set, "andnw", orc_neon_rule_andn, (void *)2);
orc_rule_register (rule_set, "andnl", orc_neon_rule_andn, (void *)1);
orc_rule_register (rule_set, "andnq", orc_neon_rule_andn, (void *)0);
}