#include "config.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/types.h>
#include <orc/orcprogram.h>
#include <orc/orcdebug.h>
#include <orc/orcsse.h>
#undef MMX
#define SIZE 65536
/* sse rules */
static void
sse_rule_loadpX (OrcCompiler *compiler, void *user, OrcInstruction *insn)
{
OrcVariable *src = compiler->vars + insn->src_args[0];
OrcVariable *dest = compiler->vars + insn->dest_args[0];
int reg;
int size = ORC_PTR_TO_INT(user);
if (src->vartype == ORC_VAR_TYPE_PARAM) {
reg = dest->alloc;
if (size == 8 && src->size == 8) {
orc_x86_emit_mov_memoffset_sse (compiler, 4,
(int)ORC_STRUCT_OFFSET(OrcExecutor, params[insn->src_args[0]]),
compiler->exec_reg, reg, FALSE);
#ifndef MMX
orc_sse_emit_movhps_load_memoffset (compiler,
(int)ORC_STRUCT_OFFSET(OrcExecutor,
params[insn->src_args[0] + (ORC_VAR_T1 - ORC_VAR_P1)]),
compiler->exec_reg, reg);
orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(2,0,2,0), reg, reg);
#else
/* FIXME yes, I understand this is terrible */
orc_sse_emit_pinsrw_memoffset (compiler, 2,
(int)ORC_STRUCT_OFFSET(OrcExecutor,
params[insn->src_args[0] + (ORC_VAR_T1 - ORC_VAR_P1)]) + 0,
compiler->exec_reg, reg);
orc_sse_emit_pinsrw_memoffset (compiler, 3,
(int)ORC_STRUCT_OFFSET(OrcExecutor,
params[insn->src_args[0] + (ORC_VAR_T1 - ORC_VAR_P1)]) + 2,
compiler->exec_reg, reg);
#ifndef MMX
orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(1,0,1,0), reg, reg);
#endif
#endif
} else {
orc_x86_emit_mov_memoffset_sse (compiler, 4,
(int)ORC_STRUCT_OFFSET(OrcExecutor, params[insn->src_args[0]]),
compiler->exec_reg, reg, FALSE);
if (size < 8) {
if (size == 1) {
orc_sse_emit_punpcklbw (compiler, reg, reg);
}
#ifndef MMX
if (size <= 2) {
orc_sse_emit_pshuflw (compiler, 0, reg, reg);
}
orc_sse_emit_pshufd (compiler, 0, reg, reg);
#else
if (size <= 2) {
orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(0,0,0,0), reg, reg);
} else {
orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(1,0,1,0), reg, reg);
}
#endif
} else {
#ifndef MMX
orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(1,0,1,0), reg, reg);
#endif
}
}
} else if (src->vartype == ORC_VAR_TYPE_CONST) {
orc_sse_load_constant (compiler, dest->alloc, size, src->value.i);
} else {
ORC_ASSERT(0);
}
}
static void
sse_rule_loadX (OrcCompiler *compiler, void *user, OrcInstruction *insn)
{
OrcVariable *src = compiler->vars + insn->src_args[0];
OrcVariable *dest = compiler->vars + insn->dest_args[0];
int ptr_reg;
int offset = 0;
offset = compiler->offset * src->size;
if (src->ptr_register == 0) {
int i = insn->src_args[0];
orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4,
(int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]),
compiler->exec_reg, compiler->gp_tmpreg);
ptr_reg = compiler->gp_tmpreg;
} else {
ptr_reg = src->ptr_register;
}
switch (src->size << compiler->loop_shift) {
case 1:
orc_x86_emit_mov_memoffset_reg (compiler, 1, offset, ptr_reg,
compiler->gp_tmpreg);
orc_sse_emit_movd_load_register (compiler, compiler->gp_tmpreg, dest->alloc);
break;
case 2:
orc_sse_emit_pxor (compiler, dest->alloc, dest->alloc);
orc_sse_emit_pinsrw_memoffset (compiler, 0, offset, ptr_reg, dest->alloc);
break;
case 4:
orc_x86_emit_mov_memoffset_sse (compiler, 4, offset, ptr_reg,
dest->alloc, src->is_aligned);
break;
case 8:
orc_x86_emit_mov_memoffset_sse (compiler, 8, offset, ptr_reg,
dest->alloc, src->is_aligned);
break;
case 16:
orc_x86_emit_mov_memoffset_sse (compiler, 16, offset, ptr_reg,
dest->alloc, src->is_aligned);
break;
default:
orc_compiler_error (compiler, "bad load size %d",
src->size << compiler->loop_shift);
break;
}
src->update_type = 2;
}
static void
sse_rule_loadoffX (OrcCompiler *compiler, void *user, OrcInstruction *insn)
{
OrcVariable *src = compiler->vars + insn->src_args[0];
OrcVariable *dest = compiler->vars + insn->dest_args[0];
int ptr_reg;
int offset = 0;
if (compiler->vars[insn->src_args[1]].vartype != ORC_VAR_TYPE_CONST) {
orc_compiler_error (compiler, "code generation rule for %s only works with constant offset",
insn->opcode->name);
return;
}
offset = (compiler->offset + compiler->vars[insn->src_args[1]].value.i) *
src->size;
if (src->ptr_register == 0) {
int i = insn->src_args[0];
orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4,
(int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]),
compiler->exec_reg, compiler->gp_tmpreg);
ptr_reg = compiler->gp_tmpreg;
} else {
ptr_reg = src->ptr_register;
}
switch (src->size << compiler->loop_shift) {
case 1:
orc_x86_emit_mov_memoffset_reg (compiler, 1, offset, ptr_reg,
compiler->gp_tmpreg);
orc_sse_emit_movd_load_register (compiler, compiler->gp_tmpreg, dest->alloc);
break;
case 2:
orc_sse_emit_pxor (compiler, dest->alloc, dest->alloc);
orc_sse_emit_pinsrw_memoffset (compiler, 0, offset, ptr_reg, dest->alloc);
break;
case 4:
orc_x86_emit_mov_memoffset_sse (compiler, 4, offset, ptr_reg,
dest->alloc, src->is_aligned);
break;
case 8:
orc_x86_emit_mov_memoffset_sse (compiler, 8, offset, ptr_reg,
dest->alloc, src->is_aligned);
break;
case 16:
orc_x86_emit_mov_memoffset_sse (compiler, 16, offset, ptr_reg,
dest->alloc, src->is_aligned);
break;
default:
orc_compiler_error (compiler,"bad load size %d",
src->size << compiler->loop_shift);
break;
}
src->update_type = 2;
}
static void
sse_rule_loadupib (OrcCompiler *compiler, void *user, OrcInstruction *insn)
{
OrcVariable *src = compiler->vars + insn->src_args[0];
OrcVariable *dest = compiler->vars + insn->dest_args[0];
int ptr_reg;
int offset = 0;
int tmp = orc_compiler_get_temp_reg (compiler);
offset = (compiler->offset * src->size) >> 1;
if (src->ptr_register == 0) {
int i = insn->src_args[0];
orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4,
(int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]),
compiler->exec_reg, compiler->gp_tmpreg);
ptr_reg = compiler->gp_tmpreg;
} else {
ptr_reg = src->ptr_register;
}
switch (src->size << compiler->loop_shift) {
case 1:
case 2:
orc_sse_emit_pinsrw_memoffset (compiler, 0, offset, ptr_reg, dest->alloc);
orc_sse_emit_movdqa (compiler, dest->alloc, tmp);
orc_sse_emit_psrlw_imm (compiler, 8, tmp);
break;
case 4:
orc_sse_emit_pinsrw_memoffset (compiler, 0, offset, ptr_reg, dest->alloc);
orc_sse_emit_pinsrw_memoffset (compiler, 0, offset + 1, ptr_reg, tmp);
break;
case 8:
orc_x86_emit_mov_memoffset_sse (compiler, 4, offset, ptr_reg,
dest->alloc, FALSE);
orc_x86_emit_mov_memoffset_sse (compiler, 4, offset + 1, ptr_reg,
tmp, FALSE);
break;
case 16:
orc_x86_emit_mov_memoffset_sse (compiler, 8, offset, ptr_reg,
dest->alloc, FALSE);
orc_x86_emit_mov_memoffset_sse (compiler, 8, offset + 1, ptr_reg,
tmp, FALSE);
break;
case 32:
orc_x86_emit_mov_memoffset_sse (compiler, 16, offset, ptr_reg,
dest->alloc, FALSE);
orc_x86_emit_mov_memoffset_sse (compiler, 16, offset + 1, ptr_reg,
tmp, FALSE);
break;
default:
orc_compiler_error(compiler,"bad load size %d",
src->size << compiler->loop_shift);
break;
}
orc_sse_emit_pavgb (compiler, dest->alloc, tmp);
orc_sse_emit_punpcklbw (compiler, tmp, dest->alloc);
src->update_type = 1;
}
static void
sse_rule_loadupdb (OrcCompiler *compiler, void *user, OrcInstruction *insn)
{
OrcVariable *src = compiler->vars + insn->src_args[0];
OrcVariable *dest = compiler->vars + insn->dest_args[0];
int ptr_reg;
int offset = 0;
offset = (compiler->offset * src->size) >> 1;
if (src->ptr_register == 0) {
int i = insn->src_args[0];
orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4,
(int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]),
compiler->exec_reg, compiler->gp_tmpreg);
ptr_reg = compiler->gp_tmpreg;
} else {
ptr_reg = src->ptr_register;
}
switch (src->size << compiler->loop_shift) {
case 1:
case 2:
orc_x86_emit_mov_memoffset_reg (compiler, 1, offset, ptr_reg,
compiler->gp_tmpreg);
orc_sse_emit_movd_load_register (compiler, compiler->gp_tmpreg, dest->alloc);
break;
case 4:
orc_sse_emit_pinsrw_memoffset (compiler, 0, offset, ptr_reg, dest->alloc);
break;
case 8:
orc_x86_emit_mov_memoffset_sse (compiler, 4, offset, ptr_reg,
dest->alloc, src->is_aligned);
break;
case 16:
orc_x86_emit_mov_memoffset_sse (compiler, 8, offset, ptr_reg,
dest->alloc, src->is_aligned);
break;
case 32:
orc_x86_emit_mov_memoffset_sse (compiler, 16, offset, ptr_reg,
dest->alloc, src->is_aligned);
break;
default:
orc_compiler_error(compiler,"bad load size %d",
src->size << compiler->loop_shift);
break;
}
switch (src->size) {
case 1:
orc_sse_emit_punpcklbw (compiler, dest->alloc, dest->alloc);
break;
case 2:
orc_sse_emit_punpcklwd (compiler, dest->alloc, dest->alloc);
break;
case 4:
orc_sse_emit_punpckldq (compiler, dest->alloc, dest->alloc);
break;
}
src->update_type = 1;
}
static void
sse_rule_storeX (OrcCompiler *compiler, void *user, OrcInstruction *insn)
{
OrcVariable *src = compiler->vars + insn->src_args[0];
OrcVariable *dest = compiler->vars + insn->dest_args[0];
int offset;
int ptr_reg;
offset = compiler->offset * dest->size;
if (dest->ptr_register == 0) {
orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4,
dest->ptr_offset, compiler->exec_reg, compiler->gp_tmpreg);
ptr_reg = compiler->gp_tmpreg;
} else {
ptr_reg = dest->ptr_register;
}
switch (dest->size << compiler->loop_shift) {
case 1:
/* FIXME we might be using ecx twice here */
if (ptr_reg == compiler->gp_tmpreg) {
orc_compiler_error (compiler, "unimplemented corner case in %s",
insn->opcode->name);
}
orc_sse_emit_movd_store_register (compiler, src->alloc, compiler->gp_tmpreg);
orc_x86_emit_mov_reg_memoffset (compiler, 1, compiler->gp_tmpreg,
offset, ptr_reg);
break;
case 2:
if (compiler->target_flags & ORC_TARGET_SSE_SSE4_1) {
orc_sse_emit_pextrw_memoffset (compiler, 0, offset, src->alloc,
ptr_reg);
} else {
/* FIXME we might be using ecx twice here */
if (ptr_reg == compiler->gp_tmpreg) {
orc_compiler_error(compiler, "unimplemented corner case in %s",
insn->opcode->name);
}
orc_sse_emit_movd_store_register (compiler, src->alloc, compiler->gp_tmpreg);
orc_x86_emit_mov_reg_memoffset (compiler, 2, compiler->gp_tmpreg,
offset, ptr_reg);
}
break;
case 4:
orc_x86_emit_mov_sse_memoffset (compiler, 4, src->alloc, offset, ptr_reg,
dest->is_aligned, dest->is_uncached);
break;
case 8:
orc_x86_emit_mov_sse_memoffset (compiler, 8, src->alloc, offset, ptr_reg,
dest->is_aligned, dest->is_uncached);
break;
case 16:
orc_x86_emit_mov_sse_memoffset (compiler, 16, src->alloc, offset, ptr_reg,
dest->is_aligned, dest->is_uncached);
break;
default:
orc_compiler_error (compiler, "bad size");
break;
}
dest->update_type = 2;
}
#if try1
static void
sse_rule_ldresnearl (OrcCompiler *compiler, void *user, OrcInstruction *insn)
{
OrcVariable *src = compiler->vars + insn->src_args[0];
OrcVariable *dest = compiler->vars + insn->dest_args[0];
int tmp = orc_compiler_get_temp_reg (compiler);
int tmp2 = orc_compiler_get_temp_reg (compiler);
int tmpc;
orc_sse_emit_movd_store_register (compiler, X86_XMM6, compiler->gp_tmpreg);
orc_x86_emit_sar_imm_reg (compiler, 4, 16, compiler->gp_tmpreg);
orc_sse_emit_movdqu_load_memindex (compiler, 0, src->ptr_register,
compiler->gp_tmpreg, 4, dest->alloc);
#if 0
orc_sse_emit_movdqa (compiler, X86_XMM6, tmp);
orc_sse_emit_pslld_imm (compiler, 10, tmp);
orc_sse_emit_psrld_imm (compiler, 26, tmp);
orc_sse_emit_pslld_imm (compiler, 2, tmp);
orc_sse_emit_movdqa (compiler, tmp, tmp2);
orc_sse_emit_pslld_imm (compiler, 8, tmp2);
orc_sse_emit_por (compiler, tmp2, tmp);
orc_sse_emit_movdqa (compiler, tmp, tmp2);
orc_sse_emit_pslld_imm (compiler, 16, tmp2);
orc_sse_emit_por (compiler, tmp2, tmp);
#else
orc_sse_emit_movdqa (compiler, X86_XMM6, tmp);
tmpc = orc_compiler_get_constant_long (compiler, 0x02020202,
0x06060606, 0x0a0a0a0a, 0x0e0e0e0e);
orc_sse_emit_pshufb (compiler, tmpc, tmp);
orc_sse_emit_paddb (compiler, tmp, tmp);
orc_sse_emit_paddb (compiler, tmp, tmp);
#endif
orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(0,0,0,0), tmp, tmp2);
orc_sse_emit_psubd (compiler, tmp2, tmp);
tmpc = orc_compiler_get_constant (compiler, 4, 0x03020100);
orc_sse_emit_paddd (compiler, tmpc, tmp);
orc_sse_emit_pshufb (compiler, tmp, dest->alloc);
orc_sse_emit_movdqa (compiler, X86_XMM7, tmp);
orc_sse_emit_pslld_imm (compiler, compiler->loop_shift, tmp);
orc_sse_emit_paddd (compiler, tmp, X86_XMM6);
src->update_type = 0;
}
#endif
static void
sse_rule_ldresnearl (OrcCompiler *compiler, void *user, OrcInstruction *insn)
{
OrcVariable *src = compiler->vars + insn->src_args[0];
int increment_var = insn->src_args[2];
OrcVariable *dest = compiler->vars + insn->dest_args[0];
int tmp = orc_compiler_get_temp_reg (compiler);
int i;
for(i=0;i<(1<<compiler->loop_shift);i++){
if (i == 0) {
orc_x86_emit_mov_memoffset_sse (compiler, 4, 0,
src->ptr_register, dest->alloc, FALSE);
} else {
orc_x86_emit_mov_memindex_sse (compiler, 4, 0,
src->ptr_register, compiler->gp_tmpreg, 2, tmp, FALSE);
#ifdef MMX
/* orc_mmx_emit_punpckldq (compiler, tmp, dest->alloc); */
orc_sse_emit_psllq_imm (compiler, 8*4*i, tmp);
orc_sse_emit_por (compiler, tmp, dest->alloc);
#else
orc_sse_emit_pslldq_imm (compiler, 4*i, tmp);
orc_sse_emit_por (compiler, tmp, dest->alloc);
#endif
}
if (compiler->vars[increment_var].vartype == ORC_VAR_TYPE_PARAM) {
orc_x86_emit_add_memoffset_reg (compiler, 4,
(int)ORC_STRUCT_OFFSET(OrcExecutor, params[increment_var]),
compiler->exec_reg, src->ptr_offset);
} else {
orc_x86_emit_add_imm_reg (compiler, 4,
compiler->vars[increment_var].value.i,
src->ptr_offset, FALSE);
}
orc_x86_emit_mov_reg_reg (compiler, 4, src->ptr_offset, compiler->gp_tmpreg);
orc_x86_emit_sar_imm_reg (compiler, 4, 16, compiler->gp_tmpreg);
}
orc_x86_emit_add_reg_reg_shift (compiler, compiler->is_64bit ? 8 : 4,
compiler->gp_tmpreg,
src->ptr_register, 2);
orc_x86_emit_and_imm_reg (compiler, 4, 0xffff, src->ptr_offset);
src->update_type = 0;
}
#ifndef MMX
static void
sse_rule_ldreslinl (OrcCompiler *compiler, void *user, OrcInstruction *insn)
{
OrcVariable *src = compiler->vars + insn->src_args[0];
int increment_var = insn->src_args[2];
OrcVariable *dest = compiler->vars + insn->dest_args[0];
int tmp = orc_compiler_get_temp_reg (compiler);
int tmp2 = orc_compiler_get_temp_reg (compiler);
int regsize = compiler->is_64bit ? 8 : 4;
int i;
if (compiler->loop_shift == 0) {
orc_x86_emit_mov_memoffset_sse (compiler, 8, 0,
src->ptr_register, tmp, FALSE);
orc_sse_emit_pxor (compiler, tmp2, tmp2);
orc_sse_emit_punpcklbw (compiler, tmp2, tmp);
orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(3,2,3,2), tmp, tmp2);
orc_sse_emit_psubw (compiler, tmp, tmp2);
orc_sse_emit_movd_load_register (compiler, src->ptr_offset, tmp);
orc_sse_emit_pshuflw (compiler, ORC_SSE_SHUF(0,0,0,0), tmp, tmp);
orc_sse_emit_psrlw_imm (compiler, 8, tmp);
orc_sse_emit_pmullw (compiler, tmp2, tmp);
orc_sse_emit_psraw_imm (compiler, 8, tmp);
orc_sse_emit_pxor (compiler, tmp2, tmp2);
orc_sse_emit_packsswb (compiler, tmp2, tmp);
orc_x86_emit_mov_memoffset_sse (compiler, 4, 0,
src->ptr_register, dest->alloc, FALSE);
orc_sse_emit_paddb (compiler, tmp, dest->alloc);
if (compiler->vars[increment_var].vartype == ORC_VAR_TYPE_PARAM) {
orc_x86_emit_add_memoffset_reg (compiler, 4,
(int)ORC_STRUCT_OFFSET(OrcExecutor, params[increment_var]),
compiler->exec_reg, src->ptr_offset);
} else {
orc_x86_emit_add_imm_reg (compiler, regsize,
compiler->vars[increment_var].value.i,
src->ptr_offset, FALSE);
}
orc_x86_emit_mov_reg_reg (compiler, 4, src->ptr_offset, compiler->gp_tmpreg);
orc_x86_emit_sar_imm_reg (compiler, 4, 16, compiler->gp_tmpreg);
orc_x86_emit_add_reg_reg_shift (compiler, regsize, compiler->gp_tmpreg,
src->ptr_register, 2);
orc_x86_emit_and_imm_reg (compiler, 4, 0xffff, src->ptr_offset);
} else {
int tmp3 = orc_compiler_get_temp_reg (compiler);
int tmp4 = orc_compiler_get_temp_reg (compiler);
for(i=0;i<(1<<compiler->loop_shift);i+=2){
orc_x86_emit_mov_memoffset_sse (compiler, 8, 0,
src->ptr_register, tmp, FALSE);
orc_sse_emit_movd_load_register (compiler, src->ptr_offset, tmp4);
if (compiler->vars[increment_var].vartype == ORC_VAR_TYPE_PARAM) {
orc_x86_emit_add_memoffset_reg (compiler, 4,
(int)ORC_STRUCT_OFFSET(OrcExecutor, params[increment_var]),
compiler->exec_reg, src->ptr_offset);
} else {
orc_x86_emit_add_imm_reg (compiler, 4,
compiler->vars[increment_var].value.i,
src->ptr_offset, FALSE);
}
orc_x86_emit_mov_reg_reg (compiler, 4, src->ptr_offset, compiler->gp_tmpreg);
orc_x86_emit_sar_imm_reg (compiler, 4, 16, compiler->gp_tmpreg);
orc_x86_emit_mov_memindex_sse (compiler, 8, 0,
src->ptr_register, compiler->gp_tmpreg, 2, tmp2, FALSE);
orc_sse_emit_punpckldq (compiler, tmp2, tmp);
orc_sse_emit_movdqa (compiler, tmp, tmp2);
if (i == 0) {
orc_sse_emit_movdqa (compiler, tmp, dest->alloc);
} else {
orc_sse_emit_punpcklqdq (compiler, tmp, dest->alloc);
}
orc_sse_emit_pxor (compiler, tmp3, tmp3);
orc_sse_emit_punpcklbw (compiler, tmp3, tmp);
orc_sse_emit_punpckhbw (compiler, tmp3, tmp2);
orc_sse_emit_psubw (compiler, tmp, tmp2);
orc_sse_emit_pinsrw_register (compiler, 1, src->ptr_offset, tmp4);
#if 0
orc_sse_emit_punpcklwd (compiler, tmp4, tmp4);
orc_sse_emit_punpckldq (compiler, tmp4, tmp4);
#else
orc_sse_emit_pshuflw (compiler, ORC_SSE_SHUF(1,1,0,0), tmp4, tmp4);
orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(1,1,0,0), tmp4, tmp4);
#endif
orc_sse_emit_psrlw_imm (compiler, 8, tmp4);
orc_sse_emit_pmullw (compiler, tmp4, tmp2);
orc_sse_emit_psraw_imm (compiler, 8, tmp2);
orc_sse_emit_pxor (compiler, tmp, tmp);
orc_sse_emit_packsswb (compiler, tmp, tmp2);
if (i != 0) {
orc_sse_emit_pslldq_imm (compiler, 8, tmp2);
}
orc_sse_emit_paddb (compiler, tmp2, dest->alloc);
if (compiler->vars[increment_var].vartype == ORC_VAR_TYPE_PARAM) {
orc_x86_emit_add_memoffset_reg (compiler, 4,
(int)ORC_STRUCT_OFFSET(OrcExecutor, params[increment_var]),
compiler->exec_reg, src->ptr_offset);
} else {
orc_x86_emit_add_imm_reg (compiler, 4,
compiler->vars[increment_var].value.i,
src->ptr_offset, FALSE);
}
orc_x86_emit_mov_reg_reg (compiler, 4, src->ptr_offset, compiler->gp_tmpreg);
orc_x86_emit_sar_imm_reg (compiler, 4, 16, compiler->gp_tmpreg);
orc_x86_emit_add_reg_reg_shift (compiler, 8, compiler->gp_tmpreg,
src->ptr_register, 2);
orc_x86_emit_and_imm_reg (compiler, 4, 0xffff, src->ptr_offset);
}
}
src->update_type = 0;
}
#else
static void
mmx_rule_ldreslinl (OrcCompiler *compiler, void *user, OrcInstruction *insn)
{
OrcVariable *src = compiler->vars + insn->src_args[0];
int increment_var = insn->src_args[2];
OrcVariable *dest = compiler->vars + insn->dest_args[0];
int tmp = orc_compiler_get_temp_reg (compiler);
int tmp2 = orc_compiler_get_temp_reg (compiler);
int zero;
int regsize = compiler->is_64bit ? 8 : 4;
int i;
zero = orc_compiler_get_constant (compiler, 1, 0);
for(i=0;i<(1<<compiler->loop_shift);i++){
orc_x86_emit_mov_memoffset_mmx (compiler, 4, 0,
src->ptr_register, tmp, FALSE);
orc_x86_emit_mov_memoffset_mmx (compiler, 4, 4,
src->ptr_register, tmp2, FALSE);
orc_mmx_emit_punpcklbw (compiler, zero, tmp);
orc_mmx_emit_punpcklbw (compiler, zero, tmp2);
orc_mmx_emit_psubw (compiler, tmp, tmp2);
orc_sse_emit_movd_load_register (compiler, src->ptr_offset, tmp);
orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(0,0,0,0), tmp, tmp);
orc_mmx_emit_psrlw_imm (compiler, 8, tmp);
orc_mmx_emit_pmullw (compiler, tmp2, tmp);
orc_mmx_emit_psraw_imm (compiler, 8, tmp);
orc_mmx_emit_pxor (compiler, tmp2, tmp2);
orc_mmx_emit_packsswb (compiler, tmp2, tmp);
if (i == 0) {
orc_x86_emit_mov_memoffset_mmx (compiler, 4, 0,
src->ptr_register, dest->alloc, FALSE);
orc_mmx_emit_paddb (compiler, tmp, dest->alloc);
} else {
orc_x86_emit_mov_memoffset_mmx (compiler, 4, 0,
src->ptr_register, tmp2, FALSE);
orc_mmx_emit_paddb (compiler, tmp, tmp2);
orc_mmx_emit_psllq_imm (compiler, 32, tmp2);
orc_mmx_emit_por (compiler, tmp2, dest->alloc);
}
if (compiler->vars[increment_var].vartype == ORC_VAR_TYPE_PARAM) {
orc_x86_emit_add_memoffset_reg (compiler, 4,
(int)ORC_STRUCT_OFFSET(OrcExecutor, params[increment_var]),
compiler->exec_reg, src->ptr_offset);
} else {
orc_x86_emit_add_imm_reg (compiler, regsize,
compiler->vars[increment_var].value.i,
src->ptr_offset, FALSE);
}
orc_x86_emit_mov_reg_reg (compiler, 4, src->ptr_offset, compiler->gp_tmpreg);
orc_x86_emit_sar_imm_reg (compiler, 4, 16, compiler->gp_tmpreg);
orc_x86_emit_add_reg_reg_shift (compiler, regsize, compiler->gp_tmpreg,
src->ptr_register, 2);
orc_x86_emit_and_imm_reg (compiler, 4, 0xffff, src->ptr_offset);
}
src->update_type = 0;
}
#endif
static void
sse_rule_copyx (OrcCompiler *p, void *user, OrcInstruction *insn)
{
if (p->vars[insn->src_args[0]].alloc == p->vars[insn->dest_args[0]].alloc) {
return;
}
orc_sse_emit_movdqa (p,
p->vars[insn->src_args[0]].alloc,
p->vars[insn->dest_args[0]].alloc);
}
#define UNARY(opcode,insn_name,code) \
static void \
sse_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
{ \
orc_sse_emit_ ## insn_name (p, \
p->vars[insn->src_args[0]].alloc, \
p->vars[insn->dest_args[0]].alloc); \
}
#define BINARY(opcode,insn_name,code) \
static void \
sse_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
{ \
orc_sse_emit_ ## insn_name (p, \
p->vars[insn->src_args[1]].alloc, \
p->vars[insn->dest_args[0]].alloc); \
}
UNARY(absb,pabsb,0x381c)
BINARY(addb,paddb,0xfc)
BINARY(addssb,paddsb,0xec)
BINARY(addusb,paddusb,0xdc)
BINARY(andb,pand,0xdb)
BINARY(andnb,pandn,0xdf)
BINARY(avgub,pavgb,0xe0)
BINARY(cmpeqb,pcmpeqb,0x74)
BINARY(cmpgtsb,pcmpgtb,0x64)
BINARY(maxsb,pmaxsb,0x383c)
BINARY(maxub,pmaxub,0xde)
BINARY(minsb,pminsb,0x3838)
BINARY(minub,pminub,0xda)
/* BINARY(mullb,pmullb,0xd5) */
/* BINARY(mulhsb,pmulhb,0xe5) */
/* BINARY(mulhub,pmulhub,0xe4) */
BINARY(orb,por,0xeb)
/* UNARY(signb,psignb,0x3808) */
BINARY(subb,psubb,0xf8)
BINARY(subssb,psubsb,0xe8)
BINARY(subusb,psubusb,0xd8)
BINARY(xorb,pxor,0xef)
UNARY(absw,pabsw,0x381d)
BINARY(addw,paddw,0xfd)
BINARY(addssw,paddsw,0xed)
BINARY(addusw,paddusw,0xdd)
BINARY(andw,pand,0xdb)
BINARY(andnw,pandn,0xdf)
BINARY(avguw,pavgw,0xe3)
BINARY(cmpeqw,pcmpeqw,0x75)
BINARY(cmpgtsw,pcmpgtw,0x65)
BINARY(maxsw,pmaxsw,0xee)
BINARY(maxuw,pmaxuw,0x383e)
BINARY(minsw,pminsw,0xea)
BINARY(minuw,pminuw,0x383a)
BINARY(mullw,pmullw,0xd5)
BINARY(mulhsw,pmulhw,0xe5)
BINARY(mulhuw,pmulhuw,0xe4)
BINARY(orw,por,0xeb)
/* UNARY(signw,psignw,0x3809) */
BINARY(subw,psubw,0xf9)
BINARY(subssw,psubsw,0xe9)
BINARY(subusw,psubusw,0xd9)
BINARY(xorw,pxor,0xef)
UNARY(absl,pabsd,0x381e)
BINARY(addl,paddd,0xfe)
/* BINARY(addssl,paddsd,0xed) */
/* BINARY(addusl,paddusd,0xdd) */
BINARY(andl,pand,0xdb)
BINARY(andnl,pandn,0xdf)
/* BINARY(avgul,pavgd,0xe3) */
BINARY(cmpeql,pcmpeqd,0x76)
BINARY(cmpgtsl,pcmpgtd,0x66)
BINARY(maxsl,pmaxsd,0x383d)
BINARY(maxul,pmaxud,0x383f)
BINARY(minsl,pminsd,0x3839)
BINARY(minul,pminud,0x383b)
BINARY(mulll,pmulld,0x3840)
/* BINARY(mulhsl,pmulhd,0xe5) */
/* BINARY(mulhul,pmulhud,0xe4) */
BINARY(orl,por,0xeb)
/* UNARY(signl,psignd,0x380a) */
BINARY(subl,psubd,0xfa)
/* BINARY(subssl,psubsd,0xe9) */
/* BINARY(subusl,psubusd,0xd9) */
BINARY(xorl,pxor,0xef)
BINARY(andq,pand,0xdb)
BINARY(andnq,pandn,0xdf)
BINARY(orq,por,0xeb)
BINARY(xorq,pxor,0xef)
BINARY(cmpeqq,pcmpeqq,0x3829)
BINARY(cmpgtsq,pcmpgtq,0x3837)
#ifndef MMX
BINARY(addq,paddq,0xd4)
BINARY(subq,psubq,0xfb)
#endif
static void
sse_rule_accw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
orc_sse_emit_paddw (p, src, dest);
}
static void
sse_rule_accl (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
#ifndef MMX
if (p->loop_shift == 0) {
orc_sse_emit_pslldq_imm (p, 12, src);
}
#endif
orc_sse_emit_paddd (p, src, dest);
}
static void
sse_rule_accsadubl (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src1 = p->vars[insn->src_args[0]].alloc;
int src2 = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
int tmp2 = orc_compiler_get_temp_reg (p);
#ifndef MMX
if (p->loop_shift <= 2) {
orc_sse_emit_movdqa (p, src1, tmp);
orc_sse_emit_pslldq_imm (p, 16 - (1<<p->loop_shift), tmp);
orc_sse_emit_movdqa (p, src2, tmp2);
orc_sse_emit_pslldq_imm (p, 16 - (1<<p->loop_shift), tmp2);
orc_sse_emit_psadbw (p, tmp2, tmp);
} else if (p->loop_shift == 3) {
orc_sse_emit_movdqa (p, src1, tmp);
orc_sse_emit_psadbw (p, src2, tmp);
orc_sse_emit_pslldq_imm (p, 8, tmp);
} else {
orc_sse_emit_movdqa (p, src1, tmp);
orc_sse_emit_psadbw (p, src2, tmp);
}
#else
if (p->loop_shift <= 2) {
orc_sse_emit_movdqa (p, src1, tmp);
orc_sse_emit_psllq_imm (p, 8*(8 - (1<<p->loop_shift)), tmp);
orc_sse_emit_movdqa (p, src2, tmp2);
orc_sse_emit_psllq_imm (p, 8*(8 - (1<<p->loop_shift)), tmp2);
orc_sse_emit_psadbw (p, tmp2, tmp);
} else {
orc_sse_emit_movdqa (p, src1, tmp);
orc_sse_emit_psadbw (p, src2, tmp);
}
#endif
orc_sse_emit_paddd (p, tmp, dest);
}
#ifndef MMX
static void
sse_rule_signX_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int opcodes[] = { ORC_X86_psignb, ORC_X86_psignw, ORC_X86_psignd };
int type = ORC_PTR_TO_INT(user);
int tmpc;
tmpc = orc_compiler_get_temp_constant (p, 1<<type, 1);
if (src == dest) {
orc_x86_emit_cpuinsn_size (p, opcodes[type], 16, src, tmpc);
orc_sse_emit_movdqa (p, tmpc, dest);
} else {
/* FIXME this would be a good opportunity to not chain src to dest */
orc_sse_emit_movdqa (p, tmpc, dest);
orc_x86_emit_cpuinsn_size (p, opcodes[type], 16, src, dest);
}
}
#endif
static void
sse_rule_signw_slow (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp;
tmp = orc_compiler_get_constant (p, 2, 0x0001);
orc_sse_emit_pminsw (p, tmp, dest);
tmp = orc_compiler_get_constant (p, 2, 0xffff);
orc_sse_emit_pmaxsw (p, tmp, dest);
}
static void
sse_rule_absb_slow (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
orc_sse_emit_pxor (p, tmp, tmp);
orc_sse_emit_pcmpgtb (p, src, tmp);
orc_sse_emit_pxor (p, tmp, dest);
orc_sse_emit_psubb (p, tmp, dest);
}
static void
sse_rule_absw_slow (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
if (src == dest) {
orc_sse_emit_movdqa (p, src, tmp);
} else {
orc_sse_emit_movdqa (p, src, tmp);
orc_sse_emit_movdqa (p, tmp, dest);
}
orc_sse_emit_psraw_imm (p, 15, tmp);
orc_sse_emit_pxor (p, tmp, dest);
orc_sse_emit_psubw (p, tmp, dest);
}
static void
sse_rule_absl_slow (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
if (src == dest) {
orc_sse_emit_movdqa (p, src, tmp);
} else {
orc_sse_emit_movdqa (p, src, tmp);
orc_sse_emit_movdqa (p, tmp, dest);
}
orc_sse_emit_psrad_imm (p, 31, tmp);
orc_sse_emit_pxor (p, tmp, dest);
orc_sse_emit_psubd (p, tmp, dest);
}
static void
sse_rule_shift (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int type = ORC_PTR_TO_INT(user);
/* int imm_code1[] = { 0x71, 0x71, 0x71, 0x72, 0x72, 0x72, 0x73, 0x73 }; */
/* int imm_code2[] = { 6, 2, 4, 6, 2, 4, 6, 2 }; */
/* int reg_code[] = { 0xf1, 0xd1, 0xe1, 0xf2, 0xd2, 0xe2, 0xf3, 0xd3 }; */
/* const char *code[] = { "psllw", "psrlw", "psraw", "pslld", "psrld", "psrad", "psllq", "psrlq" }; */
const int opcodes[] = { ORC_X86_psllw, ORC_X86_psrlw, ORC_X86_psraw,
ORC_X86_pslld, ORC_X86_psrld, ORC_X86_psrad, ORC_X86_psllq,
ORC_X86_psrlq };
const int opcodes_imm[] = { ORC_X86_psllw_imm, ORC_X86_psrlw_imm,
ORC_X86_psraw_imm, ORC_X86_pslld_imm, ORC_X86_psrld_imm,
ORC_X86_psrad_imm, ORC_X86_psllq_imm, ORC_X86_psrlq_imm };
if (p->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_CONST) {
orc_x86_emit_cpuinsn_imm (p, opcodes_imm[type],
p->vars[insn->src_args[1]].value.i, 16,
p->vars[insn->dest_args[0]].alloc);
} else if (p->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_PARAM) {
int tmp = orc_compiler_get_temp_reg (p);
/* FIXME this is a gross hack to reload the register with a
* 64-bit version of the parameter. */
orc_x86_emit_mov_memoffset_sse (p, 4,
(int)ORC_STRUCT_OFFSET(OrcExecutor, params[insn->src_args[1]]),
p->exec_reg, tmp, FALSE);
orc_x86_emit_cpuinsn_size (p, opcodes[type], 16, tmp,
p->vars[insn->dest_args[0]].alloc);
} else {
orc_compiler_error (p, "code generation rule for %s only works with "
"constant or parameter shifts", insn->opcode->name);
p->result = ORC_COMPILE_RESULT_UNKNOWN_COMPILE;
}
}
static void
sse_rule_shlb (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp;
if (p->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_CONST) {
orc_sse_emit_psllw_imm (p, p->vars[insn->src_args[1]].value.i, dest);
tmp = orc_compiler_get_constant (p, 1,
0xff&(0xff<<p->vars[insn->src_args[1]].value.i));
orc_sse_emit_pand (p, tmp, dest);
} else {
orc_compiler_error (p, "code generation rule for %s only works with "
"constant shifts", insn->opcode->name);
p->result = ORC_COMPILE_RESULT_UNKNOWN_COMPILE;
}
}
static void
sse_rule_shrsb (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
if (p->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_CONST) {
orc_sse_emit_movdqa (p, src, tmp);
orc_sse_emit_psllw_imm (p, 8, tmp);
orc_sse_emit_psraw_imm (p, p->vars[insn->src_args[1]].value.i, tmp);
orc_sse_emit_psrlw_imm (p, 8, tmp);
orc_sse_emit_psraw_imm (p, 8 + p->vars[insn->src_args[1]].value.i, dest);
orc_sse_emit_psllw_imm (p, 8, dest);
orc_sse_emit_por (p, tmp, dest);
} else {
orc_compiler_error (p, "code generation rule for %s only works with "
"constant shifts", insn->opcode->name);
p->result = ORC_COMPILE_RESULT_UNKNOWN_COMPILE;
}
}
static void
sse_rule_shrub (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp;
if (p->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_CONST) {
orc_sse_emit_psrlw_imm (p, p->vars[insn->src_args[1]].value.i, dest);
tmp = orc_compiler_get_constant (p, 1,
(0xff>>p->vars[insn->src_args[1]].value.i));
orc_sse_emit_pand (p, tmp, dest);
} else {
orc_compiler_error (p, "code generation rule for %s only works with "
"constant shifts", insn->opcode->name);
p->result = ORC_COMPILE_RESULT_UNKNOWN_COMPILE;
}
}
static void
sse_rule_shrsq (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
if (p->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_CONST) {
#ifndef MMX
orc_sse_emit_pshufd (p, ORC_SSE_SHUF(3,3,1,1), src, tmp);
#else
orc_mmx_emit_pshufw (p, ORC_MMX_SHUF(3,2,3,2), src, tmp);
#endif
orc_sse_emit_psrad_imm (p, 31, tmp);
orc_sse_emit_psllq_imm (p, 64-p->vars[insn->src_args[1]].value.i, tmp);
orc_sse_emit_psrlq_imm (p, p->vars[insn->src_args[1]].value.i, dest);
orc_sse_emit_por (p, tmp, dest);
} else {
orc_compiler_error (p, "code generation rule for %s only works with "
"constant shifts", insn->opcode->name);
p->result = ORC_COMPILE_RESULT_UNKNOWN_COMPILE;
}
}
static void
sse_rule_convsbw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
orc_sse_emit_punpcklbw (p, src, dest);
orc_sse_emit_psraw_imm (p, 8, dest);
}
static void
sse_rule_convubw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
/* FIXME need a zero register */
if (0) {
orc_sse_emit_punpcklbw (p, src, dest);
orc_sse_emit_psrlw_imm (p, 8, dest);
} else {
orc_sse_emit_pxor(p, tmp, tmp);
orc_sse_emit_punpcklbw (p, tmp, dest);
}
}
static void
sse_rule_convssswb (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
orc_sse_emit_packsswb (p, src, dest);
}
static void
sse_rule_convsuswb (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
orc_sse_emit_packuswb (p, src, dest);
}
static void
sse_rule_convuuswb (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
orc_sse_emit_movdqa (p, src, tmp);
orc_sse_emit_movdqa (p, src, dest);
orc_sse_emit_psrlw_imm (p, 15, tmp);
orc_sse_emit_psllw_imm (p, 14, tmp);
orc_sse_emit_por (p, tmp, dest);
orc_sse_emit_psllw_imm (p, 1, tmp);
orc_sse_emit_pxor (p, tmp, dest);
orc_sse_emit_packuswb (p, dest, dest);
}
static void
sse_rule_convwb (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest = p->vars[insn->dest_args[0]].alloc;
orc_sse_emit_psllw_imm (p, 8, dest);
orc_sse_emit_psrlw_imm (p, 8, dest);
orc_sse_emit_packuswb (p, dest, dest);
}
static void
sse_rule_convhwb (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest = p->vars[insn->dest_args[0]].alloc;
orc_sse_emit_psrlw_imm (p, 8, dest);
orc_sse_emit_packuswb (p, dest, dest);
}
static void
sse_rule_convswl (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
orc_sse_emit_punpcklwd (p, src, dest);
orc_sse_emit_psrad_imm (p, 16, dest);
}
static void
sse_rule_convuwl (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
/* FIXME need a zero register */
if (0) {
orc_sse_emit_punpcklwd (p, src, dest);
orc_sse_emit_psrld_imm (p, 16, dest);
} else {
orc_sse_emit_pxor(p, tmp, tmp);
orc_sse_emit_punpcklwd (p, tmp, dest);
}
}
static void
sse_rule_convlw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest = p->vars[insn->dest_args[0]].alloc;
orc_sse_emit_pslld_imm (p, 16, dest);
orc_sse_emit_psrad_imm (p, 16, dest);
orc_sse_emit_packssdw (p, dest, dest);
}
static void
sse_rule_convhlw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest = p->vars[insn->dest_args[0]].alloc;
orc_sse_emit_psrad_imm (p, 16, dest);
orc_sse_emit_packssdw (p, dest, dest);
}
static void
sse_rule_convssslw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
orc_sse_emit_packssdw (p, src, dest);
}
static void
sse_rule_convsuslw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
orc_sse_emit_packusdw (p, src, dest);
}
static void
sse_rule_convslq (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
orc_sse_emit_movdqa (p, src, tmp);
orc_sse_emit_psrad_imm (p, 31, tmp);
orc_sse_emit_punpckldq (p, tmp, dest);
}
static void
sse_rule_convulq (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp;
tmp = orc_compiler_get_constant (p, 4, 0);
orc_sse_emit_punpckldq (p, tmp, dest);
}
static void
sse_rule_convql (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
#ifndef MMX
orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,0,2,0), src, dest);
#else
orc_sse_emit_movdqa (p, src, dest);
#endif
}
static void
sse_rule_splatw3q (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest = p->vars[insn->dest_args[0]].alloc;
#ifndef MMX
orc_sse_emit_pshuflw (p, ORC_SSE_SHUF(3,3,3,3), dest, dest);
orc_sse_emit_pshufhw (p, ORC_SSE_SHUF(3,3,3,3), dest, dest);
#else
orc_mmx_emit_pshufw (p, ORC_SSE_SHUF(3,3,3,3), dest, dest);
#endif
}
static void
sse_rule_splatbw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest = p->vars[insn->dest_args[0]].alloc;
orc_sse_emit_punpcklbw (p, dest, dest);
}
static void
sse_rule_splatbl (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest = p->vars[insn->dest_args[0]].alloc;
orc_sse_emit_punpcklbw (p, dest, dest);
orc_sse_emit_punpcklwd (p, dest, dest);
}
static void
sse_rule_div255w (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
int tmpc;
tmpc = orc_compiler_get_constant (p, 2, 0x0080);
orc_sse_emit_paddw (p, tmpc, dest);
orc_sse_emit_movdqa (p, dest, tmp);
orc_sse_emit_psrlw_imm (p, 8, tmp);
orc_sse_emit_paddw (p, tmp, dest);
orc_sse_emit_psrlw_imm (p, 8, dest);
}
#if 1
static void
sse_rule_divluw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
/* About 5.2 cycles per array member on ginger */
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int a = orc_compiler_get_temp_reg (p);
int j = orc_compiler_get_temp_reg (p);
int j2 = orc_compiler_get_temp_reg (p);
int l = orc_compiler_get_temp_reg (p);
int divisor = orc_compiler_get_temp_reg (p);
int tmp;
int i;
orc_sse_emit_movdqa (p, src, divisor);
orc_sse_emit_psllw_imm (p, 8, divisor);
orc_sse_emit_psrlw_imm (p, 1, divisor);
orc_sse_load_constant (p, a, 2, 0x00ff);
tmp = orc_compiler_get_constant (p, 2, 0x8000);
orc_sse_emit_movdqa (p, tmp, j);
orc_sse_emit_psrlw_imm (p, 8, j);
orc_sse_emit_pxor (p, tmp, dest);
for(i=0;i<7;i++){
orc_sse_emit_movdqa (p, divisor, l);
orc_sse_emit_pxor (p, tmp, l);
orc_sse_emit_pcmpgtw (p, dest, l);
orc_sse_emit_movdqa (p, l, j2);
orc_sse_emit_pandn (p, divisor, l);
orc_sse_emit_psubw (p, l, dest);
orc_sse_emit_psrlw_imm (p, 1, divisor);
orc_sse_emit_pand (p, j, j2);
orc_sse_emit_pxor (p, j2, a);
orc_sse_emit_psrlw_imm (p, 1, j);
}
orc_sse_emit_movdqa (p, divisor, l);
orc_sse_emit_pxor (p, tmp, l);
orc_sse_emit_pcmpgtw (p, dest, l);
orc_sse_emit_pand (p, j, l);
orc_sse_emit_pxor (p, l, a);
orc_sse_emit_movdqa (p, a, dest);
}
#else
static void
sse_rule_divluw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
/* About 8.4 cycles per array member on ginger */
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int b = orc_compiler_get_temp_reg (p);
int a = orc_compiler_get_temp_reg (p);
int k = orc_compiler_get_temp_reg (p);
int j = orc_compiler_get_temp_reg (p);
int tmp;
int i;
orc_sse_emit_movdqa (p, dest, b);
tmp = orc_compiler_get_constant (p, 2, 0x00ff);
orc_sse_emit_pand (p, tmp, src);
tmp = orc_compiler_get_constant (p, 2, 0x8000);
orc_sse_emit_pxor (p, tmp, b);
orc_sse_emit_pxor (p, a, a);
orc_sse_emit_movdqa (p, tmp, j);
orc_sse_emit_psrlw_imm (p, 8, j);
for(i=0;i<8;i++){
orc_sse_emit_por (p, j, a);
orc_sse_emit_movdqa (p, a, k);
orc_sse_emit_pmullw (p, src, k);
orc_sse_emit_pxor (p, tmp, k);
orc_sse_emit_pcmpgtw (p, b, k);
orc_sse_emit_pand (p, j, k);
orc_sse_emit_pxor (p, k, a);
orc_sse_emit_psrlw_imm (p, 1, j);
}
orc_sse_emit_movdqa (p, a, dest);
}
#endif
static void
sse_rule_mulsbw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
orc_sse_emit_punpcklbw (p, src, tmp);
orc_sse_emit_psraw_imm (p, 8, tmp);
orc_sse_emit_punpcklbw (p, dest, dest);
orc_sse_emit_psraw_imm (p, 8, dest);
orc_sse_emit_pmullw (p, tmp, dest);
}
static void
sse_rule_mulubw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
orc_sse_emit_punpcklbw (p, src, tmp);
orc_sse_emit_psrlw_imm (p, 8, tmp);
orc_sse_emit_punpcklbw (p, dest, dest);
orc_sse_emit_psrlw_imm (p, 8, dest);
orc_sse_emit_pmullw (p, tmp, dest);
}
static void
sse_rule_mullb (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
int tmp2 = orc_compiler_get_temp_reg (p);
orc_sse_emit_movdqa (p, dest, tmp);
orc_sse_emit_pmullw (p, src, dest);
orc_sse_emit_psllw_imm (p, 8, dest);
orc_sse_emit_psrlw_imm (p, 8, dest);
orc_sse_emit_movdqa (p, src, tmp2);
orc_sse_emit_psraw_imm (p, 8, tmp2);
orc_sse_emit_psraw_imm (p, 8, tmp);
orc_sse_emit_pmullw (p, tmp2, tmp);
orc_sse_emit_psllw_imm (p, 8, tmp);
orc_sse_emit_por (p, tmp, dest);
}
static void
sse_rule_mulhsb (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
int tmp2 = orc_compiler_get_temp_reg (p);
orc_sse_emit_movdqa (p, src, tmp);
orc_sse_emit_movdqa (p, dest, tmp2);
orc_sse_emit_psllw_imm (p, 8, tmp);
orc_sse_emit_psraw_imm (p, 8, tmp);
orc_sse_emit_psllw_imm (p, 8, dest);
orc_sse_emit_psraw_imm (p, 8, dest);
orc_sse_emit_pmullw (p, tmp, dest);
orc_sse_emit_psrlw_imm (p, 8, dest);
orc_sse_emit_movdqa (p, src, tmp);
orc_sse_emit_psraw_imm (p, 8, tmp);
orc_sse_emit_psraw_imm (p, 8, tmp2);
orc_sse_emit_pmullw (p, tmp, tmp2);
orc_sse_emit_psrlw_imm (p, 8, tmp2);
orc_sse_emit_psllw_imm (p, 8, tmp2);
orc_sse_emit_por (p, tmp2, dest);
}
static void
sse_rule_mulhub (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
int tmp2 = orc_compiler_get_temp_reg (p);
orc_sse_emit_movdqa (p, src, tmp);
orc_sse_emit_movdqa (p, dest, tmp2);
orc_sse_emit_psllw_imm (p, 8, tmp);
orc_sse_emit_psrlw_imm (p, 8, tmp);
orc_sse_emit_psllw_imm (p, 8, dest);
orc_sse_emit_psrlw_imm (p, 8, dest);
orc_sse_emit_pmullw (p, tmp, dest);
orc_sse_emit_psrlw_imm (p, 8, dest);
orc_sse_emit_movdqa (p, src, tmp);
orc_sse_emit_psrlw_imm (p, 8, tmp);
orc_sse_emit_psrlw_imm (p, 8, tmp2);
orc_sse_emit_pmullw (p, tmp, tmp2);
orc_sse_emit_psrlw_imm (p, 8, tmp2);
orc_sse_emit_psllw_imm (p, 8, tmp2);
orc_sse_emit_por (p, tmp2, dest);
}
static void
sse_rule_mulswl (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
orc_sse_emit_movdqa (p, dest, tmp);
orc_sse_emit_pmulhw (p, src, tmp);
orc_sse_emit_pmullw (p, src, dest);
orc_sse_emit_punpcklwd (p, tmp, dest);
}
static void
sse_rule_muluwl (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
orc_sse_emit_movdqa (p, dest, tmp);
orc_sse_emit_pmulhuw (p, src, tmp);
orc_sse_emit_pmullw (p, src, dest);
orc_sse_emit_punpcklwd (p, tmp, dest);
}
static void
sse_rule_mulll_slow (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int i;
int offset = ORC_STRUCT_OFFSET(OrcExecutor,arrays[ORC_VAR_T1]);
orc_x86_emit_mov_sse_memoffset (p, 16, p->vars[insn->src_args[0]].alloc,
offset, p->exec_reg, FALSE, FALSE);
orc_x86_emit_mov_sse_memoffset (p, 16, p->vars[insn->src_args[1]].alloc,
offset + 16, p->exec_reg, FALSE, FALSE);
for(i=0;i<(1<<p->insn_shift);i++) {
orc_x86_emit_mov_memoffset_reg (p, 4, offset + 4*i, p->exec_reg,
p->gp_tmpreg);
orc_x86_emit_imul_memoffset_reg (p, 4, offset + 16+4*i, p->exec_reg,
p->gp_tmpreg);
orc_x86_emit_mov_reg_memoffset (p, 4, p->gp_tmpreg, offset + 4*i,
p->exec_reg);
}
orc_x86_emit_mov_memoffset_sse (p, 16, offset, p->exec_reg,
p->vars[insn->dest_args[0]].alloc, FALSE);
}
#ifndef MMX
static void
sse_rule_mulhsl (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
int tmp2 = orc_compiler_get_temp_reg (p);
orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,3,0,1), dest, tmp);
orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,3,0,1), src, tmp2);
orc_sse_emit_pmuldq (p, src, dest);
orc_sse_emit_pmuldq (p, tmp, tmp2);
orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,0,3,1), dest, dest);
orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,0,3,1), tmp2, tmp2);
orc_sse_emit_punpckldq (p, tmp2, dest);
}
#endif
#ifndef MMX
static void
sse_rule_mulhsl_slow (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int i;
int regsize = p->is_64bit ? 8 : 4;
int offset = ORC_STRUCT_OFFSET(OrcExecutor,arrays[ORC_VAR_T1]);
orc_x86_emit_mov_sse_memoffset (p, 16, p->vars[insn->src_args[0]].alloc,
offset, p->exec_reg, FALSE, FALSE);
orc_x86_emit_mov_sse_memoffset (p, 16, p->vars[insn->src_args[1]].alloc,
offset + 16, p->exec_reg, FALSE, FALSE);
orc_x86_emit_mov_reg_memoffset (p, regsize, X86_EAX, offset + 32,
p->exec_reg);
orc_x86_emit_mov_reg_memoffset (p, regsize, X86_EDX, offset + 40,
p->exec_reg);
for(i=0;i<(1<<p->insn_shift);i++) {
orc_x86_emit_mov_memoffset_reg (p, 4, offset + 4*i, p->exec_reg, X86_EAX);
orc_x86_emit_cpuinsn_memoffset (p, ORC_X86_imul_rm, 4,
offset + 16 + 4*i, p->exec_reg);
orc_x86_emit_mov_reg_memoffset (p, 4, X86_EDX, offset + 4*i, p->exec_reg);
}
orc_x86_emit_mov_memoffset_sse (p, 16, offset, p->exec_reg,
p->vars[insn->dest_args[0]].alloc, FALSE);
orc_x86_emit_mov_memoffset_reg (p, regsize, offset + 32, p->exec_reg, X86_EAX);
orc_x86_emit_mov_memoffset_reg (p, regsize, offset + 40, p->exec_reg, X86_EDX);
}
#endif
#ifndef MMX
static void
sse_rule_mulhul (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
int tmp2 = orc_compiler_get_temp_reg (p);
orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,3,0,1), dest, tmp);
orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,3,0,1), src, tmp2);
orc_sse_emit_pmuludq (p, src, dest);
orc_sse_emit_pmuludq (p, tmp, tmp2);
orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,0,3,1), dest, dest);
orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,0,3,1), tmp2, tmp2);
orc_sse_emit_punpckldq (p, tmp2, dest);
}
#endif
static void
sse_rule_mulslq (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
orc_sse_emit_movdqa (p, src, tmp);
orc_sse_emit_punpckldq (p, dest, dest);
orc_sse_emit_punpckldq (p, tmp, tmp);
orc_sse_emit_pmuldq (p, tmp, dest);
}
#ifndef MMX
static void
sse_rule_mulslq_slow (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int i;
int regsize = p->is_64bit ? 8 : 4;
int offset = ORC_STRUCT_OFFSET(OrcExecutor,arrays[ORC_VAR_T1]);
orc_x86_emit_mov_sse_memoffset (p, 8, p->vars[insn->src_args[0]].alloc,
offset, p->exec_reg, FALSE, FALSE);
orc_x86_emit_mov_sse_memoffset (p, 8, p->vars[insn->src_args[1]].alloc,
offset + 8, p->exec_reg, FALSE, FALSE);
orc_x86_emit_mov_reg_memoffset (p, regsize, X86_EAX, offset + 32,
p->exec_reg);
orc_x86_emit_mov_reg_memoffset (p, regsize, X86_EDX, offset + 40,
p->exec_reg);
for(i=0;i<(1<<p->insn_shift);i++) {
orc_x86_emit_mov_memoffset_reg (p, 4, offset + 4*i, p->exec_reg, X86_EAX);
orc_x86_emit_cpuinsn_memoffset (p, ORC_X86_imul_rm, 4,
offset + 8 + 4*i, p->exec_reg);
orc_x86_emit_mov_reg_memoffset (p, 4, X86_EAX, offset + 16 + 8*i, p->exec_reg);
orc_x86_emit_mov_reg_memoffset (p, 4, X86_EDX, offset + 16 + 8*i + 4, p->exec_reg);
}
orc_x86_emit_mov_memoffset_sse (p, 16, offset + 16, p->exec_reg,
p->vars[insn->dest_args[0]].alloc, FALSE);
orc_x86_emit_mov_memoffset_reg (p, regsize, offset + 32, p->exec_reg, X86_EAX);
orc_x86_emit_mov_memoffset_reg (p, regsize, offset + 40, p->exec_reg, X86_EDX);
}
#endif
#ifndef MMX
static void
sse_rule_mululq (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
orc_sse_emit_movdqa (p, src, tmp);
orc_sse_emit_punpckldq (p, dest, dest);
orc_sse_emit_punpckldq (p, tmp, tmp);
orc_sse_emit_pmuludq (p, tmp, dest);
}
#endif
static void
sse_rule_select0lw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
/* int src = p->vars[insn->src_args[0]].alloc; */
int dest = p->vars[insn->dest_args[0]].alloc;
/* FIXME slow */
/* same as convlw */
orc_sse_emit_pslld_imm (p, 16, dest);
orc_sse_emit_psrad_imm (p, 16, dest);
orc_sse_emit_packssdw (p, dest, dest);
}
static void
sse_rule_select1lw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
/* int src = p->vars[insn->src_args[0]].alloc; */
int dest = p->vars[insn->dest_args[0]].alloc;
/* FIXME slow */
orc_sse_emit_psrad_imm (p, 16, dest);
orc_sse_emit_packssdw (p, dest, dest);
}
static void
sse_rule_select0ql (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
/* same as convql */
#ifndef MMX
orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,0,2,0), src, dest);
#else
orc_sse_emit_movdqa (p, src, dest);
#endif
}
static void
sse_rule_select1ql (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
orc_sse_emit_psrlq_imm (p, 32, dest);
#ifndef MMX
orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,0,2,0), src, dest);
#else
orc_sse_emit_movdqa (p, src, dest);
#endif
}
static void
sse_rule_select0wb (OrcCompiler *p, void *user, OrcInstruction *insn)
{
/* int src = p->vars[insn->src_args[0]].alloc; */
int dest = p->vars[insn->dest_args[0]].alloc;
/* FIXME slow */
/* same as convwb */
orc_sse_emit_psllw_imm (p, 8, dest);
orc_sse_emit_psraw_imm (p, 8, dest);
orc_sse_emit_packsswb (p, dest, dest);
}
static void
sse_rule_select1wb (OrcCompiler *p, void *user, OrcInstruction *insn)
{
/* int src = p->vars[insn->src_args[0]].alloc; */
int dest = p->vars[insn->dest_args[0]].alloc;
/* FIXME slow */
orc_sse_emit_psraw_imm (p, 8, dest);
orc_sse_emit_packsswb (p, dest, dest);
}
static void
sse_rule_splitql (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest1 = p->vars[insn->dest_args[0]].alloc;
int dest2 = p->vars[insn->dest_args[1]].alloc;
#ifndef MMX
orc_sse_emit_pshufd (p, ORC_SSE_SHUF(3,1,3,1), src, dest1);
orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,0,2,0), src, dest2);
#else
orc_sse_emit_movdqa (p, src, dest2);
orc_sse_emit_pshufw (p, ORC_SSE_SHUF(3,2,3,2), src, dest1);
#endif
}
static void
sse_rule_splitlw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest1 = p->vars[insn->dest_args[0]].alloc;
int dest2 = p->vars[insn->dest_args[1]].alloc;
/* FIXME slow */
orc_sse_emit_psrad_imm (p, 16, dest1);
orc_sse_emit_packssdw (p, dest1, dest1);
if (dest2 != src)
orc_sse_emit_movdqa (p, src, dest2);
orc_sse_emit_pslld_imm (p, 16, dest2);
orc_sse_emit_psrad_imm (p, 16, dest2);
orc_sse_emit_packssdw (p, dest2, dest2);
}
static void
sse_rule_splitwb (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest1 = p->vars[insn->dest_args[0]].alloc;
int dest2 = p->vars[insn->dest_args[1]].alloc;
int tmp = orc_compiler_get_constant (p, 2, 0xff);
ORC_DEBUG ("got tmp %d", tmp);
/* FIXME slow */
orc_sse_emit_psraw_imm (p, 8, dest1);
orc_sse_emit_packsswb (p, dest1, dest1);
if (dest2 != src)
orc_sse_emit_movdqa (p, src, dest2);
#if 0
orc_sse_emit_psllw_imm (p, 8, dest2);
orc_sse_emit_psraw_imm (p, 8, dest2);
orc_sse_emit_packsswb (p, dest2, dest2);
#else
orc_sse_emit_pand (p, tmp, dest2);
orc_sse_emit_packuswb (p, dest2, dest2);
#endif
}
static void
sse_rule_mergebw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
orc_sse_emit_punpcklbw (p, src, dest);
}
static void
sse_rule_mergewl (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
orc_sse_emit_punpcklwd (p, src, dest);
}
static void
sse_rule_mergelq (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
orc_sse_emit_punpckldq (p, src, dest);
}
static void
sse_rule_swapw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
orc_sse_emit_movdqa (p, src, tmp);
orc_sse_emit_psllw_imm (p, 8, tmp);
orc_sse_emit_psrlw_imm (p, 8, dest);
orc_sse_emit_por (p, tmp, dest);
}
static void
sse_rule_swapl (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
orc_sse_emit_movdqa (p, src, tmp);
orc_sse_emit_pslld_imm (p, 16, tmp);
orc_sse_emit_psrld_imm (p, 16, dest);
orc_sse_emit_por (p, tmp, dest);
orc_sse_emit_movdqa (p, dest, tmp);
orc_sse_emit_psllw_imm (p, 8, tmp);
orc_sse_emit_psrlw_imm (p, 8, dest);
orc_sse_emit_por (p, tmp, dest);
}
static void
sse_rule_swapwl (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
orc_sse_emit_movdqa (p, src, tmp);
orc_sse_emit_pslld_imm (p, 16, tmp);
orc_sse_emit_psrld_imm (p, 16, dest);
orc_sse_emit_por (p, tmp, dest);
}
static void
sse_rule_swapq (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
orc_sse_emit_movdqa (p, src, tmp);
orc_sse_emit_psllq_imm (p, 32, tmp);
orc_sse_emit_psrlq_imm (p, 32, dest);
orc_sse_emit_por (p, tmp, dest);
orc_sse_emit_movdqa (p, dest, tmp);
orc_sse_emit_pslld_imm (p, 16, tmp);
orc_sse_emit_psrld_imm (p, 16, dest);
orc_sse_emit_por (p, tmp, dest);
orc_sse_emit_movdqa (p, dest, tmp);
orc_sse_emit_psllw_imm (p, 8, tmp);
orc_sse_emit_psrlw_imm (p, 8, dest);
orc_sse_emit_por (p, tmp, dest);
}
static void
sse_rule_swaplq (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest = p->vars[insn->dest_args[0]].alloc;
#ifndef MMX
orc_sse_emit_pshufd (p, ORC_SSE_SHUF(2,3,0,1), dest, dest);
#else
orc_mmx_emit_pshufw (p, ORC_MMX_SHUF(1,0,3,2), dest, dest);
#endif
}
#ifndef MMX
static void
sse_rule_swapw_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp;
tmp = orc_compiler_try_get_constant_long (p,
0x02030001, 0x06070405, 0x0a0b0809, 0x0e0f0c0d);
if (tmp != ORC_REG_INVALID) {
orc_sse_emit_pshufb (p, tmp, dest);
} else {
sse_rule_swapw (p, user, insn);
}
}
static void
sse_rule_swapl_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp;
tmp = orc_compiler_try_get_constant_long (p,
0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
if (tmp != ORC_REG_INVALID) {
orc_sse_emit_pshufb (p, tmp, dest);
} else {
sse_rule_swapl (p, user, insn);
}
}
static void
sse_rule_swapwl_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp;
tmp = orc_compiler_try_get_constant_long (p,
0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
if (tmp != ORC_REG_INVALID) {
orc_sse_emit_pshufb (p, tmp, dest);
} else {
sse_rule_swapl (p, user, insn);
}
}
static void
sse_rule_swapq_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp;
tmp = orc_compiler_try_get_constant_long (p,
0x04050607, 0x00010203, 0x0c0d0e0f, 0x08090a0b);
if (tmp != ORC_REG_INVALID) {
orc_sse_emit_pshufb (p, tmp, dest);
} else {
sse_rule_swapq (p, user, insn);
}
}
static void
sse_rule_splitlw_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest1 = p->vars[insn->dest_args[0]].alloc;
int dest2 = p->vars[insn->dest_args[1]].alloc;
int tmp1, tmp2;
tmp1 = orc_compiler_try_get_constant_long (p,
0x07060302, 0x0f0e0b0a, 0x07060302, 0x0f0e0b0a);
tmp2 = orc_compiler_try_get_constant_long (p,
0x05040100, 0x0d0c0908, 0x05040100, 0x0d0c0908);
if (tmp1 != ORC_REG_INVALID && tmp2 != ORC_REG_INVALID) {
orc_sse_emit_pshufb (p, tmp1, dest1);
if (dest2 != src)
orc_sse_emit_movdqa (p, src, dest2);
orc_sse_emit_pshufb (p, tmp2, dest2);
} else {
sse_rule_splitlw (p, user, insn);
}
}
static void
sse_rule_splitwb_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest1 = p->vars[insn->dest_args[0]].alloc;
int dest2 = p->vars[insn->dest_args[1]].alloc;
int tmp1, tmp2;
tmp1 = orc_compiler_try_get_constant_long (p,
0x07050301, 0x0f0d0b09, 0x07050301, 0x0f0d0b09);
tmp2 = orc_compiler_try_get_constant_long (p,
0x06040200, 0x0e0c0a08, 0x06040200, 0x0e0c0a08);
if (tmp1 != ORC_REG_INVALID && tmp2 != ORC_REG_INVALID) {
orc_sse_emit_pshufb (p, tmp1, dest1);
if (dest2 != src)
orc_sse_emit_movdqa (p, src, dest2);
orc_sse_emit_pshufb (p, tmp2, dest2);
} else {
sse_rule_splitwb (p, user, insn);
}
}
static void
sse_rule_select0lw_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp;
tmp = orc_compiler_try_get_constant_long (p,
0x05040100, 0x0d0c0908, 0x05040100, 0x0d0c0908);
if (tmp != ORC_REG_INVALID) {
orc_sse_emit_pshufb (p, tmp, dest);
} else {
sse_rule_select0lw (p, user, insn);
}
}
static void
sse_rule_select1lw_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp;
tmp = orc_compiler_try_get_constant_long (p,
0x07060302, 0x0f0e0b0a, 0x07060302, 0x0f0e0b0a);
if (tmp != ORC_REG_INVALID) {
orc_sse_emit_pshufb (p, tmp, dest);
} else {
sse_rule_select1lw (p, user, insn);
}
}
static void
sse_rule_select0wb_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp;
tmp = orc_compiler_try_get_constant_long (p,
0x06040200, 0x0e0c0a08, 0x06040200, 0x0e0c0a08);
if (tmp != ORC_REG_INVALID) {
orc_sse_emit_pshufb (p, tmp, dest);
} else {
sse_rule_select0wb (p, user, insn);
}
}
static void
sse_rule_select1wb_ssse3 (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp;
tmp = orc_compiler_try_get_constant_long (p,
0x07050301, 0x0f0d0b09, 0x07050301, 0x0f0d0b09);
if (tmp != ORC_REG_INVALID) {
orc_sse_emit_pshufb (p, tmp, dest);
} else {
sse_rule_select1wb (p, user, insn);
}
}
#endif
/* slow rules */
static void
sse_rule_maxuw_slow (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
tmp = orc_compiler_get_constant (p, 2, 0x8000);
orc_sse_emit_pxor(p, tmp, src);
orc_sse_emit_pxor(p, tmp, dest);
orc_sse_emit_pmaxsw (p, src, dest);
orc_sse_emit_pxor(p, tmp, src);
orc_sse_emit_pxor(p, tmp, dest);
}
static void
sse_rule_minuw_slow (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp;
tmp = orc_compiler_get_constant (p, 2, 0x8000);
orc_sse_emit_pxor(p, tmp, src);
orc_sse_emit_pxor(p, tmp, dest);
orc_sse_emit_pminsw (p, src, dest);
orc_sse_emit_pxor(p, tmp, src);
orc_sse_emit_pxor(p, tmp, dest);
}
static void
sse_rule_avgsb_slow (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp;
tmp = orc_compiler_get_constant (p, 1, 0x80);
orc_sse_emit_pxor(p, tmp, src);
orc_sse_emit_pxor(p, tmp, dest);
orc_sse_emit_pavgb (p, src, dest);
orc_sse_emit_pxor(p, tmp, src);
orc_sse_emit_pxor(p, tmp, dest);
}
static void
sse_rule_avgsw_slow (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp;
tmp = orc_compiler_get_constant (p, 2, 0x8000);
orc_sse_emit_pxor(p, tmp, src);
orc_sse_emit_pxor(p, tmp, dest);
orc_sse_emit_pavgw (p, src, dest);
orc_sse_emit_pxor(p, tmp, src);
orc_sse_emit_pxor(p, tmp, dest);
}
static void
sse_rule_maxsb_slow (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
orc_sse_emit_movdqa (p, dest, tmp);
orc_sse_emit_pcmpgtb (p, src, tmp);
orc_sse_emit_pand (p, tmp, dest);
orc_sse_emit_pandn (p, src, tmp);
orc_sse_emit_por (p, tmp, dest);
}
static void
sse_rule_minsb_slow (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
orc_sse_emit_movdqa (p, src, tmp);
orc_sse_emit_pcmpgtb (p, dest, tmp);
orc_sse_emit_pand (p, tmp, dest);
orc_sse_emit_pandn (p, src, tmp);
orc_sse_emit_por (p, tmp, dest);
}
static void
sse_rule_maxsl_slow (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
orc_sse_emit_movdqa (p, dest, tmp);
orc_sse_emit_pcmpgtd (p, src, tmp);
orc_sse_emit_pand (p, tmp, dest);
orc_sse_emit_pandn (p, src, tmp);
orc_sse_emit_por (p, tmp, dest);
}
static void
sse_rule_minsl_slow (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
orc_sse_emit_movdqa (p, src, tmp);
orc_sse_emit_pcmpgtd (p, dest, tmp);
orc_sse_emit_pand (p, tmp, dest);
orc_sse_emit_pandn (p, src, tmp);
orc_sse_emit_por (p, tmp, dest);
}
static void
sse_rule_maxul_slow (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
int tmpc;
tmpc = orc_compiler_get_constant (p, 4, 0x80000000);
orc_sse_emit_pxor(p, tmpc, src);
orc_sse_emit_pxor(p, tmpc, dest);
orc_sse_emit_movdqa (p, dest, tmp);
orc_sse_emit_pcmpgtd (p, src, tmp);
orc_sse_emit_pand (p, tmp, dest);
orc_sse_emit_pandn (p, src, tmp);
orc_sse_emit_por (p, tmp, dest);
orc_sse_emit_pxor(p, tmpc, src);
orc_sse_emit_pxor(p, tmpc, dest);
}
static void
sse_rule_minul_slow (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
int tmpc;
tmpc = orc_compiler_get_constant (p, 4, 0x80000000);
orc_sse_emit_pxor(p, tmpc, src);
orc_sse_emit_pxor(p, tmpc, dest);
orc_sse_emit_movdqa (p, src, tmp);
orc_sse_emit_pcmpgtd (p, dest, tmp);
orc_sse_emit_pand (p, tmp, dest);
orc_sse_emit_pandn (p, src, tmp);
orc_sse_emit_por (p, tmp, dest);
orc_sse_emit_pxor(p, tmpc, src);
orc_sse_emit_pxor(p, tmpc, dest);
}
static void
sse_rule_avgsl (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
/* (a+b+1) >> 1 = (a|b) - ((a^b)>>1) */
orc_sse_emit_movdqa (p, dest, tmp);
orc_sse_emit_pxor(p, src, tmp);
orc_sse_emit_psrad_imm(p, 1, tmp);
orc_sse_emit_por(p, src, dest);
orc_sse_emit_psubd(p, tmp, dest);
}
static void
sse_rule_avgul (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
/* (a+b+1) >> 1 = (a|b) - ((a^b)>>1) */
orc_sse_emit_movdqa (p, dest, tmp);
orc_sse_emit_pxor(p, src, tmp);
orc_sse_emit_psrld_imm(p, 1, tmp);
orc_sse_emit_por(p, src, dest);
orc_sse_emit_psubd(p, tmp, dest);
}
static void
sse_rule_addssl_slow (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
#if 0
int tmp2 = orc_compiler_get_temp_reg (p);
int tmp3 = orc_compiler_get_temp_reg (p);
orc_sse_emit_movdqa (p, src, tmp);
orc_sse_emit_pand (p, dest, tmp);
orc_sse_emit_movdqa (p, src, tmp2);
orc_sse_emit_pxor (p, dest, tmp2);
orc_sse_emit_psrad_imm (p, 1, tmp2);
orc_sse_emit_paddd (p, tmp2, tmp);
orc_sse_emit_psrad (p, 30, tmp);
orc_sse_emit_pslld (p, 30, tmp);
orc_sse_emit_movdqa (p, tmp, tmp2);
orc_sse_emit_pslld_imm (p, 1, tmp2);
orc_sse_emit_movdqa (p, tmp, tmp3);
orc_sse_emit_pxor (p, tmp2, tmp3);
orc_sse_emit_psrad_imm (p, 31, tmp3);
orc_sse_emit_psrad_imm (p, 31, tmp2);
tmp = orc_compiler_get_constant (p, 4, 0x80000000);
orc_sse_emit_pxor (p, tmp, tmp2); /* clamped value */
orc_sse_emit_pand (p, tmp3, tmp2);
orc_sse_emit_paddd (p, src, dest);
orc_sse_emit_pandn (p, dest, tmp3); /* tmp is mask: ~0 is for clamping */
orc_sse_emit_movdqa (p, tmp3, dest);
orc_sse_emit_por (p, tmp2, dest);
#endif
int s = orc_compiler_get_temp_reg (p);
int t = orc_compiler_get_temp_reg (p);
/*
From Tim Terriberry: (slightly faster than above)
m=0xFFFFFFFF;
s=_a;
t=_a;
s^=_b;
_a+=_b;
t^=_a;
t^=m;
m>>=1;
s|=t;
t=_b;
s>>=31;
t>>=31;
_a&=s;
t^=m;
s=~s&t;
_a|=s;
*/
orc_sse_emit_movdqa (p, dest, s);
orc_sse_emit_movdqa (p, dest, t);
orc_sse_emit_pxor (p, src, s);
orc_sse_emit_paddd (p, src, dest);
orc_sse_emit_pxor (p, dest, t);
tmp = orc_compiler_get_constant (p, 4, 0xffffffff);
orc_sse_emit_pxor (p, tmp, t);
orc_sse_emit_por (p, t, s);
orc_sse_emit_movdqa (p, src, t);
orc_sse_emit_psrad_imm (p, 31, s);
orc_sse_emit_psrad_imm (p, 31, t);
orc_sse_emit_pand (p, s, dest);
tmp = orc_compiler_get_constant (p, 4, 0x7fffffff);
orc_sse_emit_pxor (p, tmp, t);
orc_sse_emit_pandn (p, t, s);
orc_sse_emit_por (p, s, dest);
}
static void
sse_rule_subssl_slow (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
int tmp2 = orc_compiler_get_temp_reg (p);
int tmp3 = orc_compiler_get_temp_reg (p);
tmp = orc_compiler_get_temp_constant (p, 4, 0xffffffff);
orc_sse_emit_pxor (p, src, tmp);
orc_sse_emit_movdqa (p, tmp, tmp2);
orc_sse_emit_por (p, dest, tmp);
orc_sse_emit_pxor (p, dest, tmp2);
orc_sse_emit_psrad_imm (p, 1, tmp2);
orc_sse_emit_psubd (p, tmp2, tmp);
orc_sse_emit_psrad_imm (p, 30, tmp);
orc_sse_emit_pslld_imm (p, 30, tmp);
orc_sse_emit_movdqa (p, tmp, tmp2);
orc_sse_emit_pslld_imm (p, 1, tmp2);
orc_sse_emit_movdqa (p, tmp, tmp3);
orc_sse_emit_pxor (p, tmp2, tmp3);
orc_sse_emit_psrad_imm (p, 31, tmp3); /* tmp3 is mask: ~0 is for clamping */
orc_sse_emit_psrad_imm (p, 31, tmp2);
tmp = orc_compiler_get_constant (p, 4, 0x80000000);
orc_sse_emit_pxor (p, tmp, tmp2); /* clamped value */
orc_sse_emit_pand (p, tmp3, tmp2);
orc_sse_emit_psubd (p, src, dest);
orc_sse_emit_pandn (p, dest, tmp3);
orc_sse_emit_movdqa (p, tmp3, dest);
orc_sse_emit_por (p, tmp2, dest);
}
static void
sse_rule_addusl_slow (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
int tmp2 = orc_compiler_get_temp_reg (p);
#if 0
/* an alternate version. slower. */
/* Compute the bit that gets carried from bit 0 to bit 1 */
orc_sse_emit_movdqa (p, src, tmp);
orc_sse_emit_pand (p, dest, tmp);
orc_sse_emit_pslld_imm (p, 31, tmp);
orc_sse_emit_psrld_imm (p, 31, tmp);
/* Add in (src>>1) */
orc_sse_emit_movdqa (p, src, tmp2);
orc_sse_emit_psrld_imm (p, 1, tmp2);
orc_sse_emit_paddd (p, tmp2, tmp);
/* Add in (dest>>1) */
orc_sse_emit_movdqa (p, dest, tmp2);
orc_sse_emit_psrld_imm (p, 1, tmp2);
orc_sse_emit_paddd (p, tmp2, tmp);
/* turn overflow bit into mask */
orc_sse_emit_psrad_imm (p, 31, tmp);
/* compute the sum, then or over the mask */
orc_sse_emit_paddd (p, src, dest);
orc_sse_emit_por (p, tmp, dest);
#endif
orc_sse_emit_movdqa (p, src, tmp);
orc_sse_emit_pand (p, dest, tmp);
orc_sse_emit_movdqa (p, src, tmp2);
orc_sse_emit_pxor (p, dest, tmp2);
orc_sse_emit_psrld_imm (p, 1, tmp2);
orc_sse_emit_paddd (p, tmp2, tmp);
orc_sse_emit_psrad_imm (p, 31, tmp);
orc_sse_emit_paddd (p, src, dest);
orc_sse_emit_por (p, tmp, dest);
}
static void
sse_rule_subusl_slow (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[1]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmp = orc_compiler_get_temp_reg (p);
int tmp2 = orc_compiler_get_temp_reg (p);
orc_sse_emit_movdqa (p, src, tmp2);
orc_sse_emit_psrld_imm (p, 1, tmp2);
orc_sse_emit_movdqa (p, dest, tmp);
orc_sse_emit_psrld_imm (p, 1, tmp);
orc_sse_emit_psubd (p, tmp, tmp2);
/* turn overflow bit into mask */
orc_sse_emit_psrad_imm (p, 31, tmp2);
/* compute the difference, then and over the mask */
orc_sse_emit_psubd (p, src, dest);
orc_sse_emit_pand (p, tmp2, dest);
}
#ifndef MMX
/* float ops */
#define UNARY_F(opcode,insn_name,code) \
static void \
sse_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
{ \
orc_sse_emit_ ## insn_name (p, \
p->vars[insn->src_args[0]].alloc, \
p->vars[insn->dest_args[0]].alloc); \
}
#define BINARY_F(opcode,insn_name,code) \
static void \
sse_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
{ \
orc_sse_emit_ ## insn_name (p, \
p->vars[insn->src_args[1]].alloc, \
p->vars[insn->dest_args[0]].alloc); \
}
BINARY_F(addf, addps, 0x58)
BINARY_F(subf, subps, 0x5c)
BINARY_F(mulf, mulps, 0x59)
BINARY_F(divf, divps, 0x5e)
UNARY_F(sqrtf, sqrtps, 0x51)
#define UNARY_D(opcode,insn_name,code) \
static void \
sse_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
{ \
orc_sse_emit_ ## insn_name (p, \
p->vars[insn->src_args[0]].alloc, \
p->vars[insn->dest_args[0]].alloc); \
}
#define BINARY_D(opcode,insn_name,code) \
static void \
sse_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
{ \
orc_sse_emit_ ## insn_name (p, \
p->vars[insn->src_args[1]].alloc, \
p->vars[insn->dest_args[0]].alloc); \
}
BINARY_D(addd, addpd, 0x58)
BINARY_D(subd, subpd, 0x5c)
BINARY_D(muld, mulpd, 0x59)
BINARY_D(divd, divpd, 0x5e)
UNARY_D(sqrtd, sqrtpd, 0x51)
static void
sse_rule_minf (OrcCompiler *p, void *user, OrcInstruction *insn)
{
if (p->target_flags & ORC_TARGET_FAST_NAN) {
orc_sse_emit_minps (p,
p->vars[insn->src_args[1]].alloc,
p->vars[insn->dest_args[0]].alloc);
} else {
int tmp = orc_compiler_get_temp_reg (p);
orc_sse_emit_movdqa (p,
p->vars[insn->src_args[1]].alloc,
tmp);
orc_sse_emit_minps (p,
p->vars[insn->dest_args[0]].alloc,
tmp);
orc_sse_emit_minps (p,
p->vars[insn->src_args[1]].alloc,
p->vars[insn->dest_args[0]].alloc);
orc_sse_emit_por (p,
tmp,
p->vars[insn->dest_args[0]].alloc);
}
}
static void
sse_rule_mind (OrcCompiler *p, void *user, OrcInstruction *insn)
{
if (p->target_flags & ORC_TARGET_FAST_NAN) {
orc_sse_emit_minpd (p,
p->vars[insn->src_args[1]].alloc,
p->vars[insn->dest_args[0]].alloc);
} else {
int tmp = orc_compiler_get_temp_reg (p);
orc_sse_emit_movdqa (p,
p->vars[insn->src_args[1]].alloc,
tmp);
orc_sse_emit_minpd (p,
p->vars[insn->dest_args[0]].alloc,
tmp);
orc_sse_emit_minpd (p,
p->vars[insn->src_args[1]].alloc,
p->vars[insn->dest_args[0]].alloc);
orc_sse_emit_por (p,
tmp,
p->vars[insn->dest_args[0]].alloc);
}
}
static void
sse_rule_maxf (OrcCompiler *p, void *user, OrcInstruction *insn)
{
if (p->target_flags & ORC_TARGET_FAST_NAN) {
orc_sse_emit_maxps (p,
p->vars[insn->src_args[1]].alloc,
p->vars[insn->dest_args[0]].alloc);
} else {
int tmp = orc_compiler_get_temp_reg (p);
orc_sse_emit_movdqa (p,
p->vars[insn->src_args[1]].alloc,
tmp);
orc_sse_emit_maxps (p,
p->vars[insn->dest_args[0]].alloc,
tmp);
orc_sse_emit_maxps (p,
p->vars[insn->src_args[1]].alloc,
p->vars[insn->dest_args[0]].alloc);
orc_sse_emit_por (p,
tmp,
p->vars[insn->dest_args[0]].alloc);
}
}
static void
sse_rule_maxd (OrcCompiler *p, void *user, OrcInstruction *insn)
{
if (p->target_flags & ORC_TARGET_FAST_NAN) {
orc_sse_emit_maxpd (p,
p->vars[insn->src_args[1]].alloc,
p->vars[insn->dest_args[0]].alloc);
} else {
int tmp = orc_compiler_get_temp_reg (p);
orc_sse_emit_movdqa (p,
p->vars[insn->src_args[1]].alloc,
tmp);
orc_sse_emit_maxpd (p,
p->vars[insn->dest_args[0]].alloc,
tmp);
orc_sse_emit_maxpd (p,
p->vars[insn->src_args[1]].alloc,
p->vars[insn->dest_args[0]].alloc);
orc_sse_emit_por (p,
tmp,
p->vars[insn->dest_args[0]].alloc);
}
}
static void
sse_rule_cmpeqf (OrcCompiler *p, void *user, OrcInstruction *insn)
{
orc_sse_emit_cmpeqps (p,
p->vars[insn->src_args[1]].alloc,
p->vars[insn->dest_args[0]].alloc);
}
static void
sse_rule_cmpeqd (OrcCompiler *p, void *user, OrcInstruction *insn)
{
orc_sse_emit_cmpeqpd (p,
p->vars[insn->src_args[1]].alloc,
p->vars[insn->dest_args[0]].alloc);
}
static void
sse_rule_cmpltf (OrcCompiler *p, void *user, OrcInstruction *insn)
{
orc_sse_emit_cmpltps (p,
p->vars[insn->src_args[1]].alloc,
p->vars[insn->dest_args[0]].alloc);
}
static void
sse_rule_cmpltd (OrcCompiler *p, void *user, OrcInstruction *insn)
{
orc_sse_emit_cmpltpd (p,
p->vars[insn->src_args[1]].alloc,
p->vars[insn->dest_args[0]].alloc);
}
static void
sse_rule_cmplef (OrcCompiler *p, void *user, OrcInstruction *insn)
{
orc_sse_emit_cmpleps (p,
p->vars[insn->src_args[1]].alloc,
p->vars[insn->dest_args[0]].alloc);
}
static void
sse_rule_cmpled (OrcCompiler *p, void *user, OrcInstruction *insn)
{
orc_sse_emit_cmplepd (p,
p->vars[insn->src_args[1]].alloc,
p->vars[insn->dest_args[0]].alloc);
}
static void
sse_rule_convfl (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmpc;
int tmp = orc_compiler_get_temp_reg (p);
tmpc = orc_compiler_get_temp_constant (p, 4, 0x80000000);
orc_sse_emit_movdqa (p, src, tmp);
orc_sse_emit_cvttps2dq (p, src, dest);
orc_sse_emit_psrad_imm (p, 31, tmp);
orc_sse_emit_pcmpeqd (p, dest, tmpc);
orc_sse_emit_pandn (p, tmpc, tmp);
orc_sse_emit_paddd (p, tmp, dest);
}
static void
sse_rule_convdl (OrcCompiler *p, void *user, OrcInstruction *insn)
{
int src = p->vars[insn->src_args[0]].alloc;
int dest = p->vars[insn->dest_args[0]].alloc;
int tmpc;
int tmp = orc_compiler_get_temp_reg (p);
tmpc = orc_compiler_get_temp_constant (p, 4, 0x80000000);
orc_sse_emit_pshufd (p, ORC_SSE_SHUF(3,1,3,1), src, tmp);
orc_sse_emit_cvttpd2dq (p, src, dest);
orc_sse_emit_psrad_imm (p, 31, tmp);
orc_sse_emit_pcmpeqd (p, dest, tmpc);
orc_sse_emit_pandn (p, tmpc, tmp);
orc_sse_emit_paddd (p, tmp, dest);
}
static void
sse_rule_convlf (OrcCompiler *p, void *user, OrcInstruction *insn)
{
orc_sse_emit_cvtdq2ps (p,
p->vars[insn->src_args[0]].alloc,
p->vars[insn->dest_args[0]].alloc);
}
static void
sse_rule_convld (OrcCompiler *p, void *user, OrcInstruction *insn)
{
orc_sse_emit_cvtdq2pd (p,
p->vars[insn->src_args[0]].alloc,
p->vars[insn->dest_args[0]].alloc);
}
static void
sse_rule_convfd (OrcCompiler *p, void *user, OrcInstruction *insn)
{
orc_sse_emit_cvtps2pd (p,
p->vars[insn->src_args[0]].alloc,
p->vars[insn->dest_args[0]].alloc);
}
static void
sse_rule_convdf (OrcCompiler *p, void *user, OrcInstruction *insn)
{
orc_sse_emit_cvtpd2ps (p,
p->vars[insn->src_args[0]].alloc,
p->vars[insn->dest_args[0]].alloc);
}
#endif
#define UNARY_SSE41(opcode,insn_name) \
static void \
sse_rule_ ## opcode ## _sse41 (OrcCompiler *p, void *user, OrcInstruction *insn) \
{ \
orc_sse_emit_ ## insn_name (p, \
p->vars[insn->src_args[0]].alloc, \
p->vars[insn->dest_args[0]].alloc); \
}
UNARY_SSE41(convsbw,pmovsxbw);
UNARY_SSE41(convswl,pmovsxwd);
UNARY_SSE41(convslq,pmovsxdq);
UNARY_SSE41(convubw,pmovzxbw);
UNARY_SSE41(convuwl,pmovzxwd);
UNARY_SSE41(convulq,pmovzxdq);
void
orc_compiler_sse_register_rules (OrcTarget *target)
{
OrcRuleSet *rule_set;
#define REG(x) \
orc_rule_register (rule_set, #x , sse_rule_ ## x, NULL)
/* SSE 2 */
#ifndef MMX
rule_set = orc_rule_set_new (orc_opcode_set_get("sys"), target,
ORC_TARGET_SSE_SSE2);
#else
rule_set = orc_rule_set_new (orc_opcode_set_get("sys"), target,
ORC_TARGET_MMX_MMX);
#endif
orc_rule_register (rule_set, "loadb", sse_rule_loadX, NULL);
orc_rule_register (rule_set, "loadw", sse_rule_loadX, NULL);
orc_rule_register (rule_set, "loadl", sse_rule_loadX, NULL);
orc_rule_register (rule_set, "loadq", sse_rule_loadX, NULL);
orc_rule_register (rule_set, "loadoffb", sse_rule_loadoffX, NULL);
orc_rule_register (rule_set, "loadoffw", sse_rule_loadoffX, NULL);
orc_rule_register (rule_set, "loadoffl", sse_rule_loadoffX, NULL);
orc_rule_register (rule_set, "loadupdb", sse_rule_loadupdb, NULL);
orc_rule_register (rule_set, "loadupib", sse_rule_loadupib, NULL);
orc_rule_register (rule_set, "loadpb", sse_rule_loadpX, (void *)1);
orc_rule_register (rule_set, "loadpw", sse_rule_loadpX, (void *)2);
orc_rule_register (rule_set, "loadpl", sse_rule_loadpX, (void *)4);
orc_rule_register (rule_set, "loadpq", sse_rule_loadpX, (void *)8);
orc_rule_register (rule_set, "ldresnearl", sse_rule_ldresnearl, NULL);
orc_rule_register (rule_set, "ldreslinl", sse_rule_ldreslinl, NULL);
orc_rule_register (rule_set, "storeb", sse_rule_storeX, NULL);
orc_rule_register (rule_set, "storew", sse_rule_storeX, NULL);
orc_rule_register (rule_set, "storel", sse_rule_storeX, NULL);
orc_rule_register (rule_set, "storeq", sse_rule_storeX, NULL);
REG(addb);
REG(addssb);
REG(addusb);
REG(andb);
REG(andnb);
REG(avgub);
REG(cmpeqb);
REG(cmpgtsb);
REG(maxub);
REG(minub);
REG(orb);
REG(subb);
REG(subssb);
REG(subusb);
REG(xorb);
REG(addw);
REG(addssw);
REG(addusw);
REG(andw);
REG(andnw);
REG(avguw);
REG(cmpeqw);
REG(cmpgtsw);
REG(maxsw);
REG(minsw);
REG(mullw);
REG(mulhsw);
REG(mulhuw);
REG(orw);
REG(subw);
REG(subssw);
REG(subusw);
REG(xorw);
REG(addl);
REG(andl);
REG(andnl);
REG(cmpeql);
REG(cmpgtsl);
REG(orl);
REG(subl);
REG(xorl);
REG(andq);
REG(andnq);
REG(orq);
REG(xorq);
REG(select0ql);
REG(select1ql);
REG(select0lw);
REG(select1lw);
REG(select0wb);
REG(select1wb);
REG(mergebw);
REG(mergewl);
REG(mergelq);
orc_rule_register (rule_set, "copyb", sse_rule_copyx, NULL);
orc_rule_register (rule_set, "copyw", sse_rule_copyx, NULL);
orc_rule_register (rule_set, "copyl", sse_rule_copyx, NULL);
orc_rule_register (rule_set, "copyq", sse_rule_copyx, NULL);
orc_rule_register (rule_set, "shlw", sse_rule_shift, (void *)0);
orc_rule_register (rule_set, "shruw", sse_rule_shift, (void *)1);
orc_rule_register (rule_set, "shrsw", sse_rule_shift, (void *)2);
orc_rule_register (rule_set, "shll", sse_rule_shift, (void *)3);
orc_rule_register (rule_set, "shrul", sse_rule_shift, (void *)4);
orc_rule_register (rule_set, "shrsl", sse_rule_shift, (void *)5);
orc_rule_register (rule_set, "shlq", sse_rule_shift, (void *)6);
orc_rule_register (rule_set, "shruq", sse_rule_shift, (void *)7);
orc_rule_register (rule_set, "shrsq", sse_rule_shrsq, NULL);
orc_rule_register (rule_set, "convsbw", sse_rule_convsbw, NULL);
orc_rule_register (rule_set, "convubw", sse_rule_convubw, NULL);
orc_rule_register (rule_set, "convssswb", sse_rule_convssswb, NULL);
orc_rule_register (rule_set, "convsuswb", sse_rule_convsuswb, NULL);
orc_rule_register (rule_set, "convuuswb", sse_rule_convuuswb, NULL);
orc_rule_register (rule_set, "convwb", sse_rule_convwb, NULL);
orc_rule_register (rule_set, "convswl", sse_rule_convswl, NULL);
orc_rule_register (rule_set, "convuwl", sse_rule_convuwl, NULL);
orc_rule_register (rule_set, "convssslw", sse_rule_convssslw, NULL);
orc_rule_register (rule_set, "convql", sse_rule_convql, NULL);
orc_rule_register (rule_set, "convslq", sse_rule_convslq, NULL);
orc_rule_register (rule_set, "convulq", sse_rule_convulq, NULL);
/* orc_rule_register (rule_set, "convsssql", sse_rule_convsssql, NULL); */
orc_rule_register (rule_set, "mulsbw", sse_rule_mulsbw, NULL);
orc_rule_register (rule_set, "mulubw", sse_rule_mulubw, NULL);
orc_rule_register (rule_set, "mulswl", sse_rule_mulswl, NULL);
orc_rule_register (rule_set, "muluwl", sse_rule_muluwl, NULL);
orc_rule_register (rule_set, "accw", sse_rule_accw, NULL);
orc_rule_register (rule_set, "accl", sse_rule_accl, NULL);
orc_rule_register (rule_set, "accsadubl", sse_rule_accsadubl, NULL);
#ifndef MMX
/* These require the SSE2 flag, although could be used with MMX.
That flag is not yet handled. */
orc_rule_register (rule_set, "mululq", sse_rule_mululq, NULL);
REG(addq);
REG(subq);
orc_rule_register (rule_set, "addf", sse_rule_addf, NULL);
orc_rule_register (rule_set, "subf", sse_rule_subf, NULL);
orc_rule_register (rule_set, "mulf", sse_rule_mulf, NULL);
orc_rule_register (rule_set, "divf", sse_rule_divf, NULL);
orc_rule_register (rule_set, "minf", sse_rule_minf, NULL);
orc_rule_register (rule_set, "maxf", sse_rule_maxf, NULL);
orc_rule_register (rule_set, "sqrtf", sse_rule_sqrtf, NULL);
orc_rule_register (rule_set, "cmpeqf", sse_rule_cmpeqf, NULL);
orc_rule_register (rule_set, "cmpltf", sse_rule_cmpltf, NULL);
orc_rule_register (rule_set, "cmplef", sse_rule_cmplef, NULL);
orc_rule_register (rule_set, "convfl", sse_rule_convfl, NULL);
orc_rule_register (rule_set, "convlf", sse_rule_convlf, NULL);
orc_rule_register (rule_set, "addd", sse_rule_addd, NULL);
orc_rule_register (rule_set, "subd", sse_rule_subd, NULL);
orc_rule_register (rule_set, "muld", sse_rule_muld, NULL);
orc_rule_register (rule_set, "divd", sse_rule_divd, NULL);
orc_rule_register (rule_set, "mind", sse_rule_mind, NULL);
orc_rule_register (rule_set, "maxd", sse_rule_maxd, NULL);
orc_rule_register (rule_set, "sqrtd", sse_rule_sqrtd, NULL);
orc_rule_register (rule_set, "cmpeqd", sse_rule_cmpeqd, NULL);
orc_rule_register (rule_set, "cmpltd", sse_rule_cmpltd, NULL);
orc_rule_register (rule_set, "cmpled", sse_rule_cmpled, NULL);
orc_rule_register (rule_set, "convdl", sse_rule_convdl, NULL);
orc_rule_register (rule_set, "convld", sse_rule_convld, NULL);
orc_rule_register (rule_set, "convfd", sse_rule_convfd, NULL);
orc_rule_register (rule_set, "convdf", sse_rule_convdf, NULL);
#endif
/* slow rules */
orc_rule_register (rule_set, "maxuw", sse_rule_maxuw_slow, NULL);
orc_rule_register (rule_set, "minuw", sse_rule_minuw_slow, NULL);
orc_rule_register (rule_set, "avgsb", sse_rule_avgsb_slow, NULL);
orc_rule_register (rule_set, "avgsw", sse_rule_avgsw_slow, NULL);
orc_rule_register (rule_set, "maxsb", sse_rule_maxsb_slow, NULL);
orc_rule_register (rule_set, "minsb", sse_rule_minsb_slow, NULL);
orc_rule_register (rule_set, "maxsl", sse_rule_maxsl_slow, NULL);
orc_rule_register (rule_set, "minsl", sse_rule_minsl_slow, NULL);
orc_rule_register (rule_set, "maxul", sse_rule_maxul_slow, NULL);
orc_rule_register (rule_set, "minul", sse_rule_minul_slow, NULL);
orc_rule_register (rule_set, "convlw", sse_rule_convlw, NULL);
orc_rule_register (rule_set, "signw", sse_rule_signw_slow, NULL);
orc_rule_register (rule_set, "absb", sse_rule_absb_slow, NULL);
orc_rule_register (rule_set, "absw", sse_rule_absw_slow, NULL);
orc_rule_register (rule_set, "absl", sse_rule_absl_slow, NULL);
orc_rule_register (rule_set, "swapw", sse_rule_swapw, NULL);
orc_rule_register (rule_set, "swapl", sse_rule_swapl, NULL);
orc_rule_register (rule_set, "swapwl", sse_rule_swapwl, NULL);
orc_rule_register (rule_set, "swapq", sse_rule_swapq, NULL);
orc_rule_register (rule_set, "swaplq", sse_rule_swaplq, NULL);
orc_rule_register (rule_set, "splitql", sse_rule_splitql, NULL);
orc_rule_register (rule_set, "splitlw", sse_rule_splitlw, NULL);
orc_rule_register (rule_set, "splitwb", sse_rule_splitwb, NULL);
orc_rule_register (rule_set, "avgsl", sse_rule_avgsl, NULL);
orc_rule_register (rule_set, "avgul", sse_rule_avgul, NULL);
orc_rule_register (rule_set, "shlb", sse_rule_shlb, NULL);
orc_rule_register (rule_set, "shrsb", sse_rule_shrsb, NULL);
orc_rule_register (rule_set, "shrub", sse_rule_shrub, NULL);
orc_rule_register (rule_set, "mulll", sse_rule_mulll_slow, NULL);
#ifndef MMX
orc_rule_register (rule_set, "mulhsl", sse_rule_mulhsl_slow, NULL);
orc_rule_register (rule_set, "mulhul", sse_rule_mulhul, NULL);
orc_rule_register (rule_set, "mulslq", sse_rule_mulslq_slow, NULL);
#endif
orc_rule_register (rule_set, "mullb", sse_rule_mullb, NULL);
orc_rule_register (rule_set, "mulhsb", sse_rule_mulhsb, NULL);
orc_rule_register (rule_set, "mulhub", sse_rule_mulhub, NULL);
orc_rule_register (rule_set, "addssl", sse_rule_addssl_slow, NULL);
orc_rule_register (rule_set, "subssl", sse_rule_subssl_slow, NULL);
orc_rule_register (rule_set, "addusl", sse_rule_addusl_slow, NULL);
orc_rule_register (rule_set, "subusl", sse_rule_subusl_slow, NULL);
orc_rule_register (rule_set, "convhwb", sse_rule_convhwb, NULL);
orc_rule_register (rule_set, "convhlw", sse_rule_convhlw, NULL);
orc_rule_register (rule_set, "splatw3q", sse_rule_splatw3q, NULL);
orc_rule_register (rule_set, "splatbw", sse_rule_splatbw, NULL);
orc_rule_register (rule_set, "splatbl", sse_rule_splatbl, NULL);
orc_rule_register (rule_set, "div255w", sse_rule_div255w, NULL);
orc_rule_register (rule_set, "divluw", sse_rule_divluw, NULL);
/* SSE 3 -- no rules */
/* SSSE 3 */
rule_set = orc_rule_set_new (orc_opcode_set_get("sys"), target,
ORC_TARGET_SSE_SSSE3);
#ifndef MMX
orc_rule_register (rule_set, "signb", sse_rule_signX_ssse3, (void *)0);
orc_rule_register (rule_set, "signw", sse_rule_signX_ssse3, (void *)1);
orc_rule_register (rule_set, "signl", sse_rule_signX_ssse3, (void *)2);
#endif
REG(absb);
REG(absw);
REG(absl);
#ifndef MMX
orc_rule_register (rule_set, "swapw", sse_rule_swapw_ssse3, NULL);
orc_rule_register (rule_set, "swapl", sse_rule_swapl_ssse3, NULL);
orc_rule_register (rule_set, "swapwl", sse_rule_swapwl_ssse3, NULL);
orc_rule_register (rule_set, "swapq", sse_rule_swapq_ssse3, NULL);
orc_rule_register (rule_set, "splitlw", sse_rule_splitlw_ssse3, NULL);
orc_rule_register (rule_set, "splitwb", sse_rule_splitwb_ssse3, NULL);
orc_rule_register (rule_set, "select0lw", sse_rule_select0lw_ssse3, NULL);
orc_rule_register (rule_set, "select1lw", sse_rule_select1lw_ssse3, NULL);
orc_rule_register (rule_set, "select0wb", sse_rule_select0wb_ssse3, NULL);
orc_rule_register (rule_set, "select1wb", sse_rule_select1wb_ssse3, NULL);
#endif
/* SSE 4.1 */
rule_set = orc_rule_set_new (orc_opcode_set_get("sys"), target,
ORC_TARGET_SSE_SSE4_1);
REG(maxsb);
REG(minsb);
REG(maxuw);
REG(minuw);
REG(maxsl);
REG(maxul);
REG(minsl);
REG(minul);
REG(mulll);
orc_rule_register (rule_set, "convsbw", sse_rule_convsbw_sse41, NULL);
orc_rule_register (rule_set, "convswl", sse_rule_convswl_sse41, NULL);
orc_rule_register (rule_set, "convslq", sse_rule_convslq_sse41, NULL);
orc_rule_register (rule_set, "convubw", sse_rule_convubw_sse41, NULL);
orc_rule_register (rule_set, "convuwl", sse_rule_convuwl_sse41, NULL);
orc_rule_register (rule_set, "convulq", sse_rule_convulq_sse41, NULL);
orc_rule_register (rule_set, "convsuslw", sse_rule_convsuslw, NULL);
orc_rule_register (rule_set, "mulslq", sse_rule_mulslq, NULL);
#ifndef MMX
orc_rule_register (rule_set, "mulhsl", sse_rule_mulhsl, NULL);
#endif
REG(cmpeqq);
/* SSE 4.2 -- no rules */
rule_set = orc_rule_set_new (orc_opcode_set_get("sys"), target,
ORC_TARGET_SSE_SSE4_2);
REG(cmpgtsq);
/* SSE 4a -- no rules */
}