Blob Blame History Raw

#include <stdio.h>
#ifndef _MSC_VER
#include <sys/time.h>
#endif

#define ORC_ENABLE_UNSTABLE_API

#include <orc/orc.h>
#include <orc/orcmmx.h>
#include <orc/orcsse.h>

static OrcProgram *p = NULL;

static void
mmx_rule_mulhslw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
  int tmp1 = X86_MM4;
  int tmp2 = X86_MM5;
  int src1 = ORC_SRC_ARG (p, insn, 0);
  int src2 = ORC_SRC_ARG (p, insn, 1);
  int dest = ORC_DEST_ARG (p, insn, 0);

  if (dest != src1) {
    orc_mmx_emit_movq (p, src1, dest);
  }

  orc_mmx_emit_pxor (p, tmp1, tmp1);      /* .. |    0  |    0  | */
  orc_mmx_emit_punpcklwd (p, tmp1, src2); /* .. |    0  |   p0  | */
  orc_mmx_emit_pcmpgtw (p, dest, tmp1);   /* .. |    0  | s(vl) | */
  orc_mmx_emit_pand (p, src2, tmp1);      /* .. |    0  |  (p0) |  (vl >> 15) & p */
  orc_mmx_emit_movq (p, src2, tmp2);
  orc_mmx_emit_pmulhw (p, src1, src2);    /* .. |    0  | vl*p0 | */
  orc_mmx_emit_paddw (p, tmp1, src2);     /* .. |    0  | vl*p0 | + sign correct */
  orc_mmx_emit_psrld_imm (p, 16, dest);       /* .. |    0  |   vh  | */
  orc_mmx_emit_pmaddwd (p, tmp2, dest);   /* .. |    p0 * vh    | */
  orc_mmx_emit_paddd (p, src2, dest);     /* .. |    p0 * v0    | */
}

static void
sse_rule_mulhslw (OrcCompiler *p, void *user, OrcInstruction *insn)
{
  int tmp1 = X86_XMM4;
  int tmp2 = X86_XMM5;
  int src1 = ORC_SRC_ARG (p, insn, 0);
  int src2 = ORC_SRC_ARG (p, insn, 1);
  int dest = ORC_DEST_ARG (p, insn, 0);

  if (dest != src1) {
    orc_sse_emit_movdqa (p, src1, dest);
  }

  orc_sse_emit_pxor (p, tmp1, tmp1);      /* .. |    0  |    0  | */
  orc_sse_emit_punpcklwd (p, tmp1, src2); /* .. |    0  |   p0  | */
  orc_sse_emit_pcmpgtw (p, dest, tmp1);   /* .. |    0  | s(vl) | */
  orc_sse_emit_pand (p, src2, tmp1);      /* .. |    0  |  (p0) |  (vl >> 15) & p */
  orc_sse_emit_movdqa (p, src2, tmp2);
  orc_sse_emit_pmulhw (p, src1, src2);    /* .. |    0  | vl*p0 | */
  orc_sse_emit_paddw (p, tmp1, src2);     /* .. |    0  | vl*p0 | + sign correct */
  orc_sse_emit_psrld_imm (p, 16, dest);       /* .. |    0  |   vh  | */
  orc_sse_emit_pmaddwd (p, tmp2, dest);   /* .. |    p0 * vh    | */
  orc_sse_emit_paddd (p, src2, dest);     /* .. |    p0 * v0    | */
}

static void
mmx_register_rules (void)
{
  OrcRuleSet *rule_set;

  rule_set = orc_rule_set_new (orc_opcode_set_get("pulse"),
      orc_target_get_by_name ("mmx"), ORC_TARGET_MMX_MMX);

  orc_rule_register (rule_set, "mulhslw", mmx_rule_mulhslw, NULL);
}

static void
sse_register_rules (void)
{
  OrcRuleSet *rule_set;

  rule_set = orc_rule_set_new (orc_opcode_set_get("pulse"),
      orc_target_get_by_name ("sse"), ORC_TARGET_SSE_SSE2);

  orc_rule_register (rule_set, "mulhslw", sse_rule_mulhslw, NULL);
}

/* calculate the high 32 bits of a 32x16 signed multiply */
static void
emulate_mulhslw (OrcOpcodeExecutor *ex, int offset, int n)
{
  int i;
  orc_union32 * ptr0;
  const orc_union32 * ptr4;
  const orc_int16 * ptr5;
  orc_union32 var32;
  orc_int16 var33;
  orc_union32 var34;

  ptr0 = (orc_union32 *)ex->dest_ptrs[0];
  ptr4 = (orc_union32 *)ex->src_ptrs[0];
  ptr5 = (orc_int16 *)ex->src_ptrs[1];

  for (i = 0; i < n; i++) {
    /* 0: loadb */
    var32 = ptr4[i];
    /* 1: loadb */
    var33 = ptr5[i];
    /* 2: mulsbw */
    var34.i = (var32.i * var33)>>16;
    /* 3: storew */
    ptr0[i] = var34;
  }
}

static OrcStaticOpcode opcodes[] = {
  { "mulhslw", 0, { 4 }, { 4, 2 }, emulate_mulhslw },

  { "" }
};

static void
register_instr (void)
{
  orc_opcode_register_static (opcodes, "pulse");
  mmx_register_rules ();
  sse_register_rules ();
}

static void
do_volume_c (orc_int16 *dest, const orc_int32 *vols, const orc_int16 *samp, int len)
{
  int i;

  for (i = 0; i < len; i++) {
    orc_int32 t, hi, lo;

    hi = vols[i] >> 16;
    lo = vols[i] & 0xffff;

    t = (orc_int32)(samp[i]);
    t = ((t * lo) >> 16) + (t * hi);
    dest[i] = (orc_int16) ORC_CLAMP (t, -0x8000, 0x7FFF);
  }
}


static void
do_volume_backup (OrcExecutor *ex)
{
  orc_int16 *dest;
  orc_int32 *vols;
  const orc_int16 *samp;
  int len;

  dest = ex->arrays[ORC_VAR_D1];
  vols = ex->arrays[ORC_VAR_S1];
  samp = ex->arrays[ORC_VAR_S2];
  len = ex->n;

  do_volume_c (dest, vols, samp, len);
}

static void
make_volume_orc()
{
  OrcCompileResult res;

  /* int16 destination samples that get scaled by int32 volumes */
  p = orc_program_new ();
  orc_program_set_backup_function (p, do_volume_backup);
  orc_program_add_destination (p, 2, "d1");
  orc_program_add_source (p, 4, "s1");
  orc_program_add_source (p, 2, "s2");

  /* a temporary for the upscaled input samples */
  orc_program_add_temporary (p, 4, "t1");

  /* multiply with the volume, keeping only the high 32bits */
  orc_program_append (p, "mulhslw", ORC_VAR_T1, ORC_VAR_S1, ORC_VAR_S2);
  /* pack an saturate do 16 bits again */
  orc_program_append_ds (p, "convssslw", ORC_VAR_D1, ORC_VAR_T1);

  /* Compile the program */
  res = orc_program_compile (p);
  fprintf (stderr, "result: %d\n", res);

  if (res == ORC_COMPILE_RESULT_OK)
    fprintf (stderr, "%s\n", orc_program_get_asm_code (p));
}

static void
do_volume_orc (orc_int16 *dest, orc_int32 *volumes, orc_int16 *samp, int length)
{
  OrcExecutor _ex;
  OrcExecutor *ex = &_ex;

  /* Set the values on the executor structure */
  orc_executor_set_program (ex, p);
  orc_executor_set_n (ex, length);
  orc_executor_set_array (ex, ORC_VAR_D1, dest);
  orc_executor_set_array (ex, ORC_VAR_S1, volumes);
  orc_executor_set_array (ex, ORC_VAR_S2, samp);

  /* Run the program.  This calls the code that was generated above,
   * or, if the compilation failed, will emulate the program. */
  orc_executor_run (ex);
}

static orc_uint64
get_timestamp ()
{
#ifndef _MSC_VER
  struct timeval now;

  gettimeofday (&now, NULL);

  return now.tv_sec * 1000000LL + now.tv_usec;
#else
  return 0;
#endif
}

#define TIMES 100000
#define N 1024

orc_int16 dest[N];
orc_int16 samp[N];
orc_int32 vols[N];

int
main (int argc, char *argv[])
{
  int i;
  orc_uint64 start, stop;

  /* orc_init() must be called before any other Orc function */
  orc_init ();

  orc_debug_set_level (ORC_DEBUG_LOG);
  register_instr ();

  make_volume_orc();
  orc_debug_set_level (ORC_DEBUG_NONE);

  /* Create some data in the source arrays */
  for(i=0;i<N;i++){
    dest[i] = 0;
    samp[i] = i + 1;
    vols[i] = 0x10000 + i;
  }

  start = get_timestamp ();
  for (i = 0; i < TIMES; i++)
    do_volume_c (dest, vols, samp, N);
  stop = get_timestamp ();
  printf ("elapsed C: %d ms\n", (int) (stop - start));


  start = get_timestamp ();
  for (i = 0; i < TIMES; i++)
    do_volume_orc (dest, vols, samp, N);
  stop = get_timestamp ();
  printf ("elapsed ORC: %d ms\n", (int) (stop - start));

  /* Print the results */
  for(i=0;i<20;i++){
    printf("%d: %d -> %d\n", i, samp[i], dest[i]);
  }

  return 0;
}