Blob Blame History Raw

#include "config.h"

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#include <sys/types.h>

#include <orc/orcprogram.h>
#include <orc/orcx86.h>
#include <orc/orcmmx.h>
#include <orc/orcutils.h>
#include <orc/orcdebug.h>

#define MMX 1
#define SIZE 65536

#define ORC_MMX_ALIGNED_DEST_CUTOFF 64

void orc_mmx_emit_loop (OrcCompiler *compiler, int offset, int update);

void orc_compiler_mmx_init (OrcCompiler *compiler);
unsigned int orc_compiler_mmx_get_default_flags (void);
void orc_compiler_mmx_assemble (OrcCompiler *compiler);
void orc_compiler_mmx_register_rules (OrcTarget *target);
void orc_mmx_emit_invariants (OrcCompiler *compiler);


void orc_compiler_rewrite_vars (OrcCompiler *compiler);
void orc_compiler_dump (OrcCompiler *compiler);
void mmx_load_constant (OrcCompiler *compiler, int reg, int size, int value);
void mmx_load_constant_long (OrcCompiler *compiler, int reg,
    OrcConstant *constant);
static const char * mmx_get_flag_name (int shift);

static OrcTarget mmx_target = {
  "mmx",
#if defined(HAVE_I386) || defined(HAVE_AMD64)
  TRUE,
#else
  FALSE,
#endif
  ORC_VEC_REG_BASE,
  orc_compiler_mmx_get_default_flags,
  orc_compiler_mmx_init,
  orc_compiler_mmx_assemble,
  { { 0 } },
  0,
  NULL,
  mmx_load_constant,
  mmx_get_flag_name,
  NULL,
  mmx_load_constant_long
};


extern int orc_x86_mmx_flags;
extern int orc_x86_mmx_flags;

void
orc_mmx_init (void)
{
#if defined(HAVE_AMD64) || defined(HAVE_I386)
  /* initializes cache information */
  orc_mmx_get_cpu_flags ();
#endif

#if defined(HAVE_I386)
#ifndef MMX
  if (!(orc_x86_mmx_flags & ORC_TARGET_MMX_MMXEXT)) {
    mmx_target.executable = FALSE;
  }
#else
  if (!(orc_x86_mmx_flags & ORC_TARGET_MMX_MMX)) {
    mmx_target.executable = FALSE;
  }
#endif
#endif

  orc_target_register (&mmx_target);

  orc_compiler_mmx_register_rules (&mmx_target);
}

unsigned int
orc_compiler_mmx_get_default_flags (void)
{
  unsigned int flags = 0;

#if defined (HAVE_AMD64)
  flags |= ORC_TARGET_MMX_64BIT;
#endif
  if (_orc_compiler_flag_debug) {
    flags |= ORC_TARGET_MMX_FRAME_POINTER;
  }
  
#if defined(HAVE_AMD64) || defined(HAVE_I386)
#ifndef MMX
  flags |= orc_x86_mmx_flags;
#else
  flags |= orc_x86_mmx_flags;
#endif
#else
#ifndef MMX
  flags |= ORC_TARGET_MMX_MMXEXT;
  flags |= ORC_TARGET_MMX_SSE3;
  flags |= ORC_TARGET_MMX_SSSE3;
#else
  flags |= ORC_TARGET_MMX_MMX;
  flags |= ORC_TARGET_MMX_3DNOW;
#endif
#endif

  return flags;
}

static const char *
mmx_get_flag_name (int shift)
{
  static const char *flags[] = {
#ifndef MMX
    "sse2", "sse3", "ssse3", "sse41", "sse42", "sse4a", "sse5",
    "frame_pointer", "short_jumps", "64bit"
#else
    "mmx", "mmxext", "3dnow", "3dnowext", "ssse3", "sse41", "",
    "frame_pointer", "short_jumps", "64bit"
#endif
  };

  if (shift >= 0 && shift < sizeof(flags)/sizeof(flags[0])) {
    return flags[shift];
  }

  return NULL;
}

void
orc_compiler_mmx_init (OrcCompiler *compiler)
{
  int i;

  if (compiler->target_flags & ORC_TARGET_MMX_64BIT) {
    compiler->is_64bit = TRUE;
  }
  if (compiler->target_flags & ORC_TARGET_MMX_FRAME_POINTER) {
    compiler->use_frame_pointer = TRUE;
  }
  if (!(compiler->target_flags & ORC_TARGET_MMX_SHORT_JUMPS)) {
    compiler->long_jumps = TRUE;
  }
  

  if (compiler->is_64bit) {
    for(i=ORC_GP_REG_BASE;i<ORC_GP_REG_BASE+16;i++){
      compiler->valid_regs[i] = 1;
    }
    compiler->valid_regs[X86_ESP] = 0;
#ifndef MMX
    for(i=X86_MM0;i<X86_MM0+16;i++){
      compiler->valid_regs[i] = 1;
    }
#else
    for(i=X86_MM0;i<X86_MM0+8;i++){
      compiler->valid_regs[i] = 1;
    }
#endif
    compiler->save_regs[X86_EBX] = 1;
    compiler->save_regs[X86_EBP] = 1;
    compiler->save_regs[X86_R12] = 1;
    compiler->save_regs[X86_R13] = 1;
    compiler->save_regs[X86_R14] = 1;
    compiler->save_regs[X86_R15] = 1;
#ifdef HAVE_OS_WIN32
    compiler->save_regs[X86_EDI] = 1;
    compiler->save_regs[X86_ESI] = 1;
    for(i=X86_MM0+6;i<X86_MM0+16;i++){
      compiler->save_regs[i] = 1;
    }
#endif
  } else {
    for(i=ORC_GP_REG_BASE;i<ORC_GP_REG_BASE+8;i++){
      compiler->valid_regs[i] = 1;
    }
    compiler->valid_regs[X86_ESP] = 0;
    if (compiler->use_frame_pointer) {
      compiler->valid_regs[X86_EBP] = 0;
    }
    for(i=X86_MM0;i<X86_MM0+8;i++){
      compiler->valid_regs[i] = 1;
    }
    compiler->save_regs[X86_EBX] = 1;
    compiler->save_regs[X86_EDI] = 1;
    compiler->save_regs[X86_EBP] = 1;
  }
  for(i=0;i<128;i++){
    compiler->alloc_regs[i] = 0;
    compiler->used_regs[i] = 0;
  }

  if (compiler->is_64bit) {
#ifdef HAVE_OS_WIN32
    compiler->exec_reg = X86_ECX;
    compiler->gp_tmpreg = X86_EDX;
#else
    compiler->exec_reg = X86_EDI;
    compiler->gp_tmpreg = X86_ECX;
#endif
  } else {
    compiler->gp_tmpreg = X86_ECX;
    if (compiler->use_frame_pointer) {
      compiler->exec_reg = X86_EBX;
    } else {
      compiler->exec_reg = X86_EBP;
    }
  }
  compiler->valid_regs[compiler->gp_tmpreg] = 0;
  compiler->valid_regs[compiler->exec_reg] = 0;

  switch (compiler->max_var_size) {
    case 1:
      compiler->loop_shift = 4;
      break;
    case 2:
      compiler->loop_shift = 3;
      break;
    case 4:
      compiler->loop_shift = 2;
      break;
    case 8:
      compiler->loop_shift = 1;
      break;
    default:
      ORC_ERROR("unhandled max var size %d", compiler->max_var_size);
      break;
  }
#ifdef MMX
  compiler->loop_shift--;
#endif

  /* This limit is arbitrary, but some large functions run slightly
     slower when unrolled (ginger Core2 6,15,6), and only some small
     functions run faster when unrolled.  Most are the same speed. */
  if (compiler->n_insns <= 10) {
    compiler->unroll_shift = 1;
  }
  if (!compiler->long_jumps) {
    compiler->unroll_shift = 0;
  }
  if (compiler->loop_shift == 0) {
    /* FIXME something is broken with loop_shift=0, unroll_shift=1 */
    compiler->unroll_shift = 0;
  }
  compiler->alloc_loop_counter = TRUE;
  compiler->allow_gp_on_stack = TRUE;

  {
    for(i=0;i<compiler->n_insns;i++){
      OrcInstruction *insn = compiler->insns + i;
      OrcStaticOpcode *opcode = insn->opcode;

      if (strcmp (opcode->name, "ldreslinb") == 0 ||
          strcmp (opcode->name, "ldreslinl") == 0 ||
          strcmp (opcode->name, "ldresnearb") == 0 ||
          strcmp (opcode->name, "ldresnearl") == 0) {
        compiler->vars[insn->src_args[0]].need_offset_reg = TRUE;
      }
    }
  }
}

void
mmx_save_accumulators (OrcCompiler *compiler)
{
  int i;
  int src;
  int tmp;

  for(i=0;i<ORC_N_COMPILER_VARIABLES;i++){
    OrcVariable *var = compiler->vars + i;

    if (var->name == NULL) continue;
    switch (var->vartype) {
      case ORC_VAR_TYPE_ACCUMULATOR:
        src = var->alloc;
        tmp = orc_compiler_get_temp_reg (compiler);

#ifndef MMX
        orc_mmx_emit_pshufd (compiler, ORC_MMX_SHUF(3,2,3,2), src, tmp);
#else
        orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(3,2,3,2), src, tmp);
#endif

        if (var->size == 2) {
          orc_mmx_emit_paddw (compiler, tmp, src);
        } else {
          orc_mmx_emit_paddd (compiler, tmp, src);
        }

#ifndef MMX
        orc_mmx_emit_pshufd (compiler, ORC_MMX_SHUF(1,1,1,1), src, tmp);

        if (var->size == 2) {
          orc_mmx_emit_paddw (compiler, tmp, src);
        } else {
          orc_mmx_emit_paddd (compiler, tmp, src);
        }
#endif

        if (var->size == 2) {
#ifndef MMX
          orc_mmx_emit_pshuflw (compiler, ORC_MMX_SHUF(1,1,1,1), src, tmp);
#else
          orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(1,1,1,1), src, tmp);
#endif

          orc_mmx_emit_paddw (compiler, tmp, src);
        }

        if (var->size == 2) {
          orc_mmx_emit_movd_store_register (compiler, src, compiler->gp_tmpreg);
          orc_x86_emit_and_imm_reg (compiler, 4, 0xffff, compiler->gp_tmpreg);
          orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
              (int)ORC_STRUCT_OFFSET(OrcExecutor, accumulators[i-ORC_VAR_A1]),
              compiler->exec_reg);
        } else {
          orc_x86_emit_mov_mmx_memoffset (compiler, 4, src,
              (int)ORC_STRUCT_OFFSET(OrcExecutor, accumulators[i-ORC_VAR_A1]),
              compiler->exec_reg,
              var->is_aligned, var->is_uncached);
        }

        break;
      default:
        break;
    }
  }
}

void
mmx_load_constant (OrcCompiler *compiler, int reg, int size, int value)
{
  orc_mmx_load_constant (compiler, reg, size, value);
}

void
orc_mmx_load_constant (OrcCompiler *compiler, int reg, int size, orc_uint64 value)
{
  int i;

  if (size == 8) {
    int offset = ORC_STRUCT_OFFSET(OrcExecutor,arrays[ORC_VAR_T1]);

    /* FIXME how ugly and slow! */
    orc_x86_emit_mov_imm_reg (compiler, 4, value>>0,
        compiler->gp_tmpreg);
    orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
        offset + 0, compiler->exec_reg);

    orc_x86_emit_mov_imm_reg (compiler, 4, value>>32,
        compiler->gp_tmpreg);
    orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
        offset + 4, compiler->exec_reg);

    orc_x86_emit_mov_memoffset_mmx (compiler, 8, offset, compiler->exec_reg,
        reg, FALSE);
#ifndef MMX
    orc_mmx_emit_pshufd (compiler, ORC_MMX_SHUF(1,0,1,0), reg, reg);
#endif
    return;
  }

  if (size == 1) {
    value &= 0xff;
    value |= (value << 8);
    value |= (value << 16);
  }
  if (size == 2) {
    value &= 0xffff;
    value |= (value << 16);
  }

  ORC_ASM_CODE(compiler, "# loading constant %d 0x%08x\n", (int)value, (int)value);
  if (value == 0) {
    orc_mmx_emit_pxor(compiler, reg, reg);
    return;
  }
  if (value == 0xffffffff) {
    orc_mmx_emit_pcmpeqb (compiler, reg, reg);
    return;
  }
  if (compiler->target_flags & ORC_TARGET_MMX_SSSE3) {
    if (value == 0x01010101) {
      orc_mmx_emit_pcmpeqb (compiler, reg, reg);
      orc_mmx_emit_pabsb (compiler, reg, reg);
      return;
    }
  }

  for(i=1;i<32;i++){
    orc_uint32 v;
    v = (0xffffffff<<i);
    if (value == v) {
      orc_mmx_emit_pcmpeqb (compiler, reg, reg);
      orc_mmx_emit_pslld_imm (compiler, i, reg);
      return;
    }
    v = (0xffffffff>>i);
    if (value == v) {
      orc_mmx_emit_pcmpeqb (compiler, reg, reg);
      orc_mmx_emit_psrld_imm (compiler, i, reg);
      return;
    }
  }
  for(i=1;i<16;i++){
    orc_uint32 v;
    v = (0xffff & (0xffff<<i)) | (0xffff0000 & (0xffff0000<<i));
    if (value == v) {
      orc_mmx_emit_pcmpeqb (compiler, reg, reg);
      orc_mmx_emit_psllw_imm (compiler, i, reg);
      return;
    }
    v = (0xffff & (0xffff>>i)) | (0xffff0000 & (0xffff0000>>i));
    if (value == v) {
      orc_mmx_emit_pcmpeqb (compiler, reg, reg);
      orc_mmx_emit_psrlw_imm (compiler, i, reg);
      return;
    }
  }

  orc_x86_emit_mov_imm_reg (compiler, 4, value, compiler->gp_tmpreg);
  orc_mmx_emit_movd_load_register (compiler, compiler->gp_tmpreg, reg);
#ifndef MMX
  orc_mmx_emit_pshufd (compiler, ORC_MMX_SHUF(0,0,0,0), reg, reg);
#else
  orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(1,0,1,0), reg, reg);
#endif
}

void
mmx_load_constant_long (OrcCompiler *compiler, int reg,
    OrcConstant *constant)
{
  int i;
  int offset = ORC_STRUCT_OFFSET(OrcExecutor,arrays[ORC_VAR_T1]);

  /* FIXME this is slower than it could be */

  ORC_ASM_CODE(compiler, "# loading constant %08x %08x %08x %08x\n",
      constant->full_value[0], constant->full_value[1],
      constant->full_value[2], constant->full_value[3]);

  for(i=0;i<4;i++){
    orc_x86_emit_mov_imm_reg (compiler, 4, constant->full_value[i],
        compiler->gp_tmpreg);
    orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
        offset + 4*i, compiler->exec_reg);
  }
  orc_x86_emit_mov_memoffset_mmx (compiler, 16, offset, compiler->exec_reg,
      reg, FALSE);

}

void
mmx_load_constants_outer (OrcCompiler *compiler)
{
  int i;
  for(i=0;i<ORC_N_COMPILER_VARIABLES;i++){
    if (compiler->vars[i].name == NULL) continue;
    switch (compiler->vars[i].vartype) {
      case ORC_VAR_TYPE_CONST:
        break;
      case ORC_VAR_TYPE_PARAM:
        break;
      case ORC_VAR_TYPE_SRC:
      case ORC_VAR_TYPE_DEST:
        break;
      case ORC_VAR_TYPE_ACCUMULATOR:
        orc_mmx_emit_pxor (compiler,
            compiler->vars[i].alloc, compiler->vars[i].alloc);
        break;
      case ORC_VAR_TYPE_TEMP:
        break;
      default:
        orc_compiler_error(compiler,"bad vartype");
        break;
    }
  }

  orc_mmx_emit_invariants (compiler);

  /* FIXME move to a better place */
  for(i=0;i<compiler->n_constants;i++){
    compiler->constants[i].alloc_reg =
      orc_compiler_get_constant_reg (compiler);
  }

  for(i=0;i<compiler->n_constants;i++){
    if (compiler->constants[i].alloc_reg) {
      if (compiler->constants[i].is_long) {
        mmx_load_constant_long (compiler, compiler->constants[i].alloc_reg,
            compiler->constants + i);
      } else {
        mmx_load_constant (compiler, compiler->constants[i].alloc_reg,
            4, compiler->constants[i].value);
      }
    }
  }

  {
    for(i=0;i<compiler->n_insns;i++){
      OrcInstruction *insn = compiler->insns + i;
      OrcStaticOpcode *opcode = insn->opcode;

      if (strcmp (opcode->name, "ldreslinb") == 0 ||
          strcmp (opcode->name, "ldreslinl") == 0 ||
          strcmp (opcode->name, "ldresnearb") == 0 ||
          strcmp (opcode->name, "ldresnearl") == 0) {
        if (compiler->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_PARAM) {
          orc_x86_emit_mov_memoffset_reg (compiler, 4,
              (int)ORC_STRUCT_OFFSET(OrcExecutor, params[insn->src_args[1]]),
              compiler->exec_reg,
              compiler->vars[insn->src_args[0]].ptr_offset);
        } else {
          orc_x86_emit_mov_imm_reg (compiler, 4,
              compiler->vars[insn->src_args[1]].value.i,
              compiler->vars[insn->src_args[0]].ptr_offset);
        }
      }
    }
  }
}

void
mmx_load_constants_inner (OrcCompiler *compiler)
{
  int i;
  for(i=0;i<ORC_N_COMPILER_VARIABLES;i++){
    if (compiler->vars[i].name == NULL) continue;
    switch (compiler->vars[i].vartype) {
      case ORC_VAR_TYPE_CONST:
        break;
      case ORC_VAR_TYPE_PARAM:
        break;
      case ORC_VAR_TYPE_SRC:
      case ORC_VAR_TYPE_DEST:
        if (compiler->vars[i].ptr_register) {
          orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4,
              (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]), compiler->exec_reg,
              compiler->vars[i].ptr_register);
        }
        break;
      case ORC_VAR_TYPE_ACCUMULATOR:
        break;
      case ORC_VAR_TYPE_TEMP:
        break;
      default:
        orc_compiler_error(compiler,"bad vartype");
        break;
    }
  }
}

void
mmx_add_strides (OrcCompiler *compiler)
{
  int i;

  for(i=0;i<ORC_N_COMPILER_VARIABLES;i++){
    if (compiler->vars[i].name == NULL) continue;
    switch (compiler->vars[i].vartype) {
      case ORC_VAR_TYPE_CONST:
        break;
      case ORC_VAR_TYPE_PARAM:
        break;
      case ORC_VAR_TYPE_SRC:
      case ORC_VAR_TYPE_DEST:
        orc_x86_emit_mov_memoffset_reg (compiler, 4,
            (int)ORC_STRUCT_OFFSET(OrcExecutor, params[i]), compiler->exec_reg,
            compiler->gp_tmpreg);
        orc_x86_emit_add_reg_memoffset (compiler, compiler->is_64bit ? 8 : 4,
            compiler->gp_tmpreg,
            (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]), compiler->exec_reg);

        if (compiler->vars[i].ptr_register == 0) {
          orc_compiler_error (compiler, "unimplemented: stride on pointer stored in memory");
        }
        break;
      case ORC_VAR_TYPE_ACCUMULATOR:
        break;
      case ORC_VAR_TYPE_TEMP:
        break;
      default:
        orc_compiler_error(compiler,"bad vartype");
        break;
    }
  }
}

static int
get_align_var (OrcCompiler *compiler)
{
  int i;
  for(i=ORC_VAR_D1;i<=ORC_VAR_S8;i++){
    if (compiler->vars[i].size == 0) continue;
    if ((compiler->vars[i].size << compiler->loop_shift) >= 16) {
      return i;
    }
  }
  for(i=ORC_VAR_D1;i<=ORC_VAR_S8;i++){
    if (compiler->vars[i].size == 0) continue;
    if ((compiler->vars[i].size << compiler->loop_shift) >= 8) {
      return i;
    }
  }
  for(i=ORC_VAR_D1;i<=ORC_VAR_S8;i++){
    if (compiler->vars[i].size == 0) continue;
    return i;
  }

  orc_compiler_error(compiler, "could not find alignment variable");

  return -1;
}

static int
get_shift (int size)
{
  switch (size) {
    case 1:
      return 0;
    case 2:
      return 1;
    case 4:
      return 2;
    case 8:
      return 3;
    default:
      ORC_ERROR("bad size %d", size);
  }
  return -1;
}


static void
orc_emit_split_3_regions (OrcCompiler *compiler)
{
  int align_var;
  int align_shift;
  int var_size_shift;

  align_var = get_align_var (compiler);
  if (align_var < 0)
    return;
  var_size_shift = get_shift (compiler->vars[align_var].size);
  align_shift = var_size_shift + compiler->loop_shift;

  /* determine how many iterations until align array is aligned (n1) */
  orc_x86_emit_mov_imm_reg (compiler, 4, 16, X86_EAX);
  orc_x86_emit_sub_memoffset_reg (compiler, 4,
      (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[align_var]),
      compiler->exec_reg, X86_EAX);
  orc_x86_emit_and_imm_reg (compiler, 4, (1<<align_shift) - 1, X86_EAX);
  orc_x86_emit_sar_imm_reg (compiler, 4, var_size_shift, X86_EAX);

  /* check if n1 is greater than n. */
  orc_x86_emit_cmp_reg_memoffset (compiler, 4, X86_EAX,
      (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg);

  orc_x86_emit_jle (compiler, 6);

  /* If so, we have a standard 3-region split. */
  orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
      (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1), compiler->exec_reg);
    
  /* Calculate n2 */
  orc_x86_emit_mov_memoffset_reg (compiler, 4,
      (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg,
      compiler->gp_tmpreg);
  orc_x86_emit_sub_reg_reg (compiler, 4, X86_EAX, compiler->gp_tmpreg);

  orc_x86_emit_mov_reg_reg (compiler, 4, compiler->gp_tmpreg, X86_EAX);

  orc_x86_emit_sar_imm_reg (compiler, 4,
      compiler->loop_shift + compiler->unroll_shift,
      compiler->gp_tmpreg);
  orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
      (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg);

  /* Calculate n3 */
  orc_x86_emit_and_imm_reg (compiler, 4,
      (1<<(compiler->loop_shift + compiler->unroll_shift))-1, X86_EAX);
  orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
      (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3), compiler->exec_reg);

  orc_x86_emit_jmp (compiler, 7);

  /* else, iterations are all unaligned: n1=n, n2=0, n3=0 */
  orc_x86_emit_label (compiler, 6);

  orc_x86_emit_mov_memoffset_reg (compiler, 4,
      (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg, X86_EAX);
  orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
      (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1), compiler->exec_reg);
  orc_x86_emit_mov_imm_reg (compiler, 4, 0, X86_EAX);
  orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
      (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg);
  orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
      (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3), compiler->exec_reg);

  orc_x86_emit_label (compiler, 7);
}

static void
orc_emit_split_2_regions (OrcCompiler *compiler)
{
  int align_var;
  int align_shift ORC_GNUC_UNUSED;
  int var_size_shift;

  align_var = get_align_var (compiler);
  if (align_var < 0)
    return;
  var_size_shift = get_shift (compiler->vars[align_var].size);
  align_shift = var_size_shift + compiler->loop_shift;

  /* Calculate n2 */
  orc_x86_emit_mov_memoffset_reg (compiler, 4,
      (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg,
      compiler->gp_tmpreg);
  orc_x86_emit_mov_reg_reg (compiler, 4, compiler->gp_tmpreg, X86_EAX);
  orc_x86_emit_sar_imm_reg (compiler, 4,
      compiler->loop_shift + compiler->unroll_shift,
      compiler->gp_tmpreg);
  orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
      (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg);

  /* Calculate n3 */
  orc_x86_emit_and_imm_reg (compiler, 4,
      (1<<(compiler->loop_shift + compiler->unroll_shift))-1, X86_EAX);
  orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
      (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3), compiler->exec_reg);
}

#ifndef MMX
static int
orc_program_has_float (OrcCompiler *compiler)
{
  int j;
  for(j=0;j<compiler->n_insns;j++){
    OrcInstruction *insn = compiler->insns + j;
    OrcStaticOpcode *opcode = insn->opcode;
    if (opcode->flags & ORC_STATIC_OPCODE_FLOAT) return TRUE;
  }
  return FALSE;
}
#endif

#define LABEL_REGION1_SKIP 1
#define LABEL_INNER_LOOP_START 2
#define LABEL_REGION2_SKIP 3
#define LABEL_OUTER_LOOP 4
#define LABEL_OUTER_LOOP_SKIP 5
#define LABEL_STEP_DOWN(x) (8+(x))
#define LABEL_STEP_UP(x) (13+(x))


void
orc_compiler_mmx_assemble (OrcCompiler *compiler)
{
#ifndef MMX
  int set_mxcsr = FALSE;
#endif
  int align_var;
  int is_aligned;

  if (0 && orc_x86_assemble_copy_check (compiler)) {
    /* The rep movs implementation isn't faster most of the time */
    orc_x86_assemble_copy (compiler);
    return;
  }

  align_var = get_align_var (compiler);
  if (align_var < 0) {
    orc_x86_assemble_copy (compiler);
    return;
  }
  is_aligned = compiler->vars[align_var].is_aligned;

  {
    orc_mmx_emit_loop (compiler, 0, 0);

    compiler->codeptr = compiler->code;
    free (compiler->asm_code);
    compiler->asm_code = NULL;
    compiler->asm_code_len = 0;
    memset (compiler->labels, 0, sizeof (compiler->labels));
    memset (compiler->labels_int, 0, sizeof (compiler->labels_int));
    compiler->n_fixups = 0;
    compiler->n_output_insns = 0;
  }

  if (compiler->error) return;

  orc_x86_emit_prologue (compiler);

#ifndef MMX
  if (orc_program_has_float (compiler)) {
    set_mxcsr = TRUE;
    orc_mmx_set_mxcsr (compiler);
  }
#endif

  mmx_load_constants_outer (compiler);

  if (compiler->program->is_2d) {
    if (compiler->program->constant_m > 0) {
      orc_x86_emit_mov_imm_reg (compiler, 4, compiler->program->constant_m,
          X86_EAX);
      orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
          (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A2]),
          compiler->exec_reg);
    } else {
      orc_x86_emit_mov_memoffset_reg (compiler, 4,
          (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A1]),
          compiler->exec_reg, X86_EAX);
      orc_x86_emit_test_reg_reg (compiler, 4, X86_EAX, X86_EAX);
      orc_x86_emit_jle (compiler, LABEL_OUTER_LOOP_SKIP);
      orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
          (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A2]),
          compiler->exec_reg);
    }

    orc_x86_emit_label (compiler, LABEL_OUTER_LOOP);
  }

  if (compiler->program->constant_n > 0 &&
      compiler->program->constant_n <= ORC_MMX_ALIGNED_DEST_CUTOFF) {
    /* don't need to load n */
  } else if (compiler->loop_shift > 0) {
    if (compiler->has_iterator_opcode || is_aligned) {
      orc_emit_split_2_regions (compiler);
    } else {
      /* split n into three regions, with center region being aligned */
      orc_emit_split_3_regions (compiler);
    }
  } else {
    /* loop shift is 0, no need to split */
    orc_x86_emit_mov_memoffset_reg (compiler, 4,
        (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg,
        compiler->gp_tmpreg);
    orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
        (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg);
  }

  mmx_load_constants_inner (compiler);

  if (compiler->program->constant_n > 0 &&
      compiler->program->constant_n <= ORC_MMX_ALIGNED_DEST_CUTOFF) {
    int n_left = compiler->program->constant_n;
    int save_loop_shift;
    int loop_shift;

    compiler->offset = 0;

    save_loop_shift = compiler->loop_shift;
    while (n_left >= (1<<compiler->loop_shift)) {
      ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", compiler->loop_shift);
      orc_mmx_emit_loop (compiler, compiler->offset, 0);

      n_left -= 1<<compiler->loop_shift;
      compiler->offset += 1<<compiler->loop_shift;
    }
    for(loop_shift = compiler->loop_shift-1; loop_shift>=0; loop_shift--) {
      if (n_left >= (1<<loop_shift)) {
        compiler->loop_shift = loop_shift;
        ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", loop_shift);
        orc_mmx_emit_loop (compiler, compiler->offset, 0);
        n_left -= 1<<loop_shift;
        compiler->offset += 1<<loop_shift;
      }
    }
    compiler->loop_shift = save_loop_shift;

  } else {
    int ui, ui_max;
    int emit_region1 = TRUE;
    int emit_region3 = TRUE;

    if (compiler->has_iterator_opcode || is_aligned) {
      emit_region1 = FALSE;
    }
    if (compiler->loop_shift == 0) {
      emit_region1 = FALSE;
      emit_region3 = FALSE;
    }

    if (emit_region1) {
      int save_loop_shift;
      int l;

      save_loop_shift = compiler->loop_shift;
      compiler->vars[align_var].is_aligned = FALSE;

      for (l=0;l<save_loop_shift;l++){
        compiler->loop_shift = l;
        ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", compiler->loop_shift);

        orc_x86_emit_test_imm_memoffset (compiler, 4, 1<<compiler->loop_shift,
            (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1), compiler->exec_reg);
        orc_x86_emit_je (compiler, LABEL_STEP_UP(compiler->loop_shift));
        orc_mmx_emit_loop (compiler, 0, 1<<compiler->loop_shift);
        orc_x86_emit_label (compiler, LABEL_STEP_UP(compiler->loop_shift));
      }

      compiler->loop_shift = save_loop_shift;
      compiler->vars[align_var].is_aligned = TRUE;
    }

    orc_x86_emit_label (compiler, LABEL_REGION1_SKIP);

    orc_x86_emit_cmp_imm_memoffset (compiler, 4, 0,
        (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg);
    orc_x86_emit_je (compiler, LABEL_REGION2_SKIP);

    if (compiler->loop_counter != ORC_REG_INVALID) {
      orc_x86_emit_mov_memoffset_reg (compiler, 4,
          (int)ORC_STRUCT_OFFSET(OrcExecutor, counter2), compiler->exec_reg,
          compiler->loop_counter);
    }

    ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", compiler->loop_shift);
    orc_x86_emit_align (compiler, 4);
    orc_x86_emit_label (compiler, LABEL_INNER_LOOP_START);
    ui_max = 1<<compiler->unroll_shift;
    for(ui=0;ui<ui_max;ui++) {
      compiler->offset = ui<<compiler->loop_shift;
      orc_mmx_emit_loop (compiler, compiler->offset,
          (ui==ui_max-1) << (compiler->loop_shift + compiler->unroll_shift));
    }
    compiler->offset = 0;
    if (compiler->loop_counter != ORC_REG_INVALID) {
      orc_x86_emit_add_imm_reg (compiler, 4, -1, compiler->loop_counter, TRUE);
    } else {
      orc_x86_emit_dec_memoffset (compiler, 4,
          (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2),
          compiler->exec_reg);
    }
    orc_x86_emit_jne (compiler, LABEL_INNER_LOOP_START);
    orc_x86_emit_label (compiler, LABEL_REGION2_SKIP);

    if (emit_region3) {
      int save_loop_shift;
      int l;

      save_loop_shift = compiler->loop_shift + compiler->unroll_shift;
      compiler->vars[align_var].is_aligned = FALSE;

      for(l=save_loop_shift - 1; l >= 0; l--) {
        compiler->loop_shift = l;
        ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", compiler->loop_shift);

        orc_x86_emit_test_imm_memoffset (compiler, 4, 1<<compiler->loop_shift,
            (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3), compiler->exec_reg);
        orc_x86_emit_je (compiler, LABEL_STEP_DOWN(compiler->loop_shift));
        orc_mmx_emit_loop (compiler, 0, 1<<compiler->loop_shift);
        orc_x86_emit_label (compiler, LABEL_STEP_DOWN(compiler->loop_shift));
      }

      compiler->loop_shift = save_loop_shift;
    }
  }

  if (compiler->program->is_2d && compiler->program->constant_m != 1) {
    mmx_add_strides (compiler);

    orc_x86_emit_add_imm_memoffset (compiler, 4, -1,
        (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A2]),
        compiler->exec_reg);
    orc_x86_emit_jne (compiler, LABEL_OUTER_LOOP);
    orc_x86_emit_label (compiler, LABEL_OUTER_LOOP_SKIP);
  }

  mmx_save_accumulators (compiler);

#ifndef MMX
  if (set_mxcsr) {
    orc_mmx_restore_mxcsr (compiler);
  }
#else
  orc_x86_emit_emms (compiler);
#endif
  orc_x86_emit_epilogue (compiler);

  orc_x86_calculate_offsets (compiler);
  orc_x86_output_insns (compiler);

  orc_x86_do_fixups (compiler);
}

void
orc_mmx_emit_loop (OrcCompiler *compiler, int offset, int update)
{
  int j;
  int k;
  OrcInstruction *insn;
  OrcStaticOpcode *opcode;
  OrcRule *rule;

  for(j=0;j<compiler->n_insns;j++){
    insn = compiler->insns + j;
    opcode = insn->opcode;

    compiler->insn_index = j;

    if (insn->flags & ORC_INSN_FLAG_INVARIANT) continue;

    ORC_ASM_CODE(compiler,"# %d: %s\n", j, insn->opcode->name);

    compiler->min_temp_reg = ORC_VEC_REG_BASE;

    compiler->insn_shift = compiler->loop_shift;
    if (insn->flags & ORC_INSTRUCTION_FLAG_X2) {
      compiler->insn_shift += 1;
    }
    if (insn->flags & ORC_INSTRUCTION_FLAG_X4) {
      compiler->insn_shift += 2;
    }

    rule = insn->rule;
    if (rule && rule->emit) {
      if (!(insn->opcode->flags & (ORC_STATIC_OPCODE_ACCUMULATOR|ORC_STATIC_OPCODE_LOAD|ORC_STATIC_OPCODE_STORE)) &&
          compiler->vars[insn->dest_args[0]].alloc !=
          compiler->vars[insn->src_args[0]].alloc) {
#ifdef MMX
        orc_mmx_emit_movq (compiler,
            compiler->vars[insn->src_args[0]].alloc,
            compiler->vars[insn->dest_args[0]].alloc);
#else
        orc_mmx_emit_movdqu (compiler,
            compiler->vars[insn->src_args[0]].alloc,
            compiler->vars[insn->dest_args[0]].alloc);
#endif
      }
      rule->emit (compiler, rule->emit_user, insn);
    } else {
      orc_compiler_error (compiler, "no code generation rule for %s",
          opcode->name);
    }
  }

  if (update) {
    for(k=0;k<ORC_N_COMPILER_VARIABLES;k++){
      OrcVariable *var = compiler->vars + k;

      if (var->name == NULL) continue;
      if (var->vartype == ORC_VAR_TYPE_SRC ||
          var->vartype == ORC_VAR_TYPE_DEST) {
        int offset;
        if (var->update_type == 0) {
          offset = 0;
        } else if (var->update_type == 1) {
          offset = (var->size * update) >> 1;
        } else {
          offset = var->size * update;
        }

        if (offset != 0) {
          if (compiler->vars[k].ptr_register) {
            orc_x86_emit_add_imm_reg (compiler, compiler->is_64bit ? 8 : 4,
                offset,
                compiler->vars[k].ptr_register, FALSE);
          } else {
            orc_x86_emit_add_imm_memoffset (compiler, compiler->is_64bit ? 8 : 4,
                offset,
                (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[k]),
                compiler->exec_reg);
          }
        }
      }
    }
  }
}

void
orc_mmx_emit_invariants (OrcCompiler *compiler)
{
  int j;
  OrcInstruction *insn;
  OrcStaticOpcode *opcode;
  OrcRule *rule;

  for(j=0;j<compiler->n_insns;j++){
    insn = compiler->insns + j;
    opcode = insn->opcode;

    if (!(insn->flags & ORC_INSN_FLAG_INVARIANT)) continue;

    ORC_ASM_CODE(compiler,"# %d: %s\n", j, insn->opcode->name);

    compiler->insn_shift = compiler->loop_shift;
    if (insn->flags & ORC_INSTRUCTION_FLAG_X2) {
      compiler->insn_shift += 1;
    }
    if (insn->flags & ORC_INSTRUCTION_FLAG_X4) {
      compiler->insn_shift += 2;
    }

    rule = insn->rule;
    if (rule && rule->emit) {
      rule->emit (compiler, rule->emit_user, insn);
    } else {
      orc_compiler_error (compiler, "no code generation rule for %s",
          opcode->name);
    }
  }
}