Blob Blame History Raw
/*
    Image modifier implementation: (un)distortion functions
    Copyright (C) 2007 by Andrew Zabolotny
    Copyright (C) 2014 by Roman Lebedev

    Most of the math in this file has been borrowed from PanoTools.
    Thanks to the PanoTools team for their pioneering work.
*/

#include "config.h"

#ifdef VECTORIZATION_SSE

#include "lensfun.h"
#include "lensfunprv.h"
#include <xmmintrin.h>

#if defined (_MSC_VER)
typedef size_t uintptr_t;
#else
typedef __SIZE_TYPE__ uintptr_t;
#endif

void lfModifier::ModifyCoord_UnDist_PTLens_SSE (void *data, float *iocoord, int count)
{
  /*
   * If buffer is not aligned, fall back to plain code
   */
  if((uintptr_t)(iocoord) & 0xf)
  {
    return ModifyCoord_UnDist_PTLens(data, iocoord, count);
  }

  float *param = (float *)data;

  __m128 a = _mm_set_ps1 (param [0]);
  __m128 b = _mm_set_ps1 (param [1]);
  __m128 c = _mm_set_ps1 (param [2]);
  __m128 very_small = _mm_set_ps1 (1e-15);
  __m128 one = _mm_set_ps1 (1.0f);
  __m128 d = _mm_sub_ps (one, _mm_add_ps (a, _mm_add_ps (b, c)));

  // SSE Loop processes 4 pixels/loop
  int loop_count = count / 4;
  for (int i = 0; i < loop_count ; i++)
  {
    // Load 4 sets of coordinates
    __m128 c0 = _mm_load_ps (&iocoord[8*i]);
    __m128 c1 = _mm_load_ps (&iocoord[8*i+4]);
    __m128 x = _mm_shuffle_ps (c0, c1, _MM_SHUFFLE (2, 0, 2, 0));
    __m128 y = _mm_shuffle_ps (c0, c1, _MM_SHUFFLE (3, 1, 3, 1));
    __m128 rd = _mm_add_ps (_mm_mul_ps (x, x), _mm_mul_ps (y, y));

    // We don't check for zero, but set it to a very small value instead
    rd = _mm_max_ps (rd, very_small);
    rd = _mm_rcp_ps (_mm_rsqrt_ps (rd));
    __m128 ru = rd;
    for (int step = 0; step < 4; step++)
    {
      // fru = ru * (a * ru^2 * ru + b * ru^2 + c * ru + d) - rd
      __m128 ru_sq =  _mm_mul_ps (ru, ru);
      __m128 fru = _mm_mul_ps (_mm_mul_ps (a, ru), ru_sq);
      __m128 t = _mm_add_ps (_mm_mul_ps (b, ru_sq), _mm_add_ps (d, _mm_mul_ps (c, ru)));
      fru = _mm_sub_ps (_mm_mul_ps (_mm_add_ps (t, fru), ru), rd);

      // This is most likely faster than loading form L1 cache
      __m128 two = _mm_add_ps (one, one);
      __m128 three = _mm_add_ps (one, two);
      __m128 four = _mm_add_ps (two, two);

      // corr =  4 * a * ru * ru^2 + 3 * b * ru^2 + 2 * c * ru + d
      __m128 corr = _mm_mul_ps (c, ru);
      corr = _mm_add_ps (d, _mm_add_ps (corr, corr));
      t = _mm_mul_ps (ru_sq, _mm_mul_ps (three, b));
      corr = _mm_add_ps (corr, _mm_mul_ps (_mm_mul_ps (ru, ru_sq), _mm_mul_ps (four, a)));
      corr = _mm_rcp_ps (_mm_add_ps (corr, t));

      // ru -= fru * corr
      ru = _mm_sub_ps (ru, _mm_mul_ps (fru, corr));
    }
    // We don't check for zero, but set it to a very small value instead
    ru = _mm_max_ps (ru, very_small);

    // ru /= rd
    ru = _mm_mul_ps (ru, _mm_rcp_ps(rd));
    c0 = _mm_mul_ps (c0, ru);
    c1 = _mm_mul_ps (c1, ru);
    _mm_store_ps (&iocoord [8 * i], c0);
    _mm_store_ps (&iocoord [8 * i + 4], c1);
  }

  loop_count *= 4;
  int remain = count - loop_count;
  if (remain) 
    ModifyCoord_UnDist_PTLens (data, &iocoord [loop_count * 2], remain);
}

void lfModifier::ModifyCoord_Dist_PTLens_SSE (void *data, float *iocoord, int count)
{
  /*
   * If buffer is not aligned, fall back to plain code
   */
  if((uintptr_t)(iocoord) & 0xf)
  {
    return ModifyCoord_Dist_PTLens(data, iocoord, count);
  }

  float *param = (float *)data;

  // Rd = Ru * (a * Ru^3 + b * Ru^2 + c * Ru + d)
  __m128 a = _mm_set_ps1 (param [0]);
  __m128 b = _mm_set_ps1 (param [1]);
  __m128 c = _mm_set_ps1 (param [2]);
  __m128 d = _mm_sub_ps (_mm_set_ps1 (1.0f), _mm_add_ps (a, _mm_add_ps (b, c)));

  // SSE Loop processes 4 pixels/loop
  int loop_count = count / 4;
  for (int i = 0; i < loop_count ; i++)
  {
    __m128 c0 = _mm_load_ps (&iocoord [8 * i]);
    __m128 c1 = _mm_load_ps (&iocoord [8 * i + 4]);
    __m128 x = _mm_shuffle_ps (c0, c1, _MM_SHUFFLE (2, 0, 2, 0));
    __m128 y = _mm_shuffle_ps (c0, c1, _MM_SHUFFLE (3, 1, 3, 1));
    __m128 ru2 = _mm_add_ps (_mm_mul_ps (x, x), _mm_mul_ps (y, y));
    __m128 ru = _mm_rcp_ps (_mm_rsqrt_ps (ru2));

    // Calculate poly3 = a * ru2 * ru + b * ru2 + c * ru + d;
    __m128 t = _mm_mul_ps (ru2, b);
    __m128 poly3 = _mm_mul_ps (_mm_mul_ps (a, ru2), ru);
    t = _mm_add_ps (t, _mm_mul_ps (ru, c));
    poly3 = _mm_add_ps (t, _mm_add_ps (poly3, d));
    _mm_store_ps (&iocoord [8 * i], _mm_mul_ps (poly3, c0));
    _mm_store_ps (&iocoord [8 * i + 4], _mm_mul_ps (poly3, c1));
  }

  loop_count *= 4;
  int remain = count - loop_count;
  if (remain)
    ModifyCoord_Dist_PTLens (data, &iocoord [loop_count * 2], remain);
}

void lfModifier::ModifyCoord_Dist_Poly3_SSE (void *data, float *iocoord, int count)
{
  /*
   * If buffer is not aligned, fall back to plain code
   */
  if((uintptr_t)(iocoord) & 0xf)
  {
    return ModifyCoord_Dist_Poly3(data, iocoord, count);
  }

  float *param = (float *)data;

  // Rd = Ru * (d + k1 * Ru^2),  d = 1 - k1
  __m128 k1 = _mm_set_ps1 (param [0]);
  __m128 d = _mm_sub_ps (_mm_set_ps1 (1.0f), k1);

  // SSE Loop processes 4 pixels/loop
  int loop_count = count / 4;
  for (int i = 0; i < loop_count ; i++)
  {
    __m128 c0 = _mm_load_ps (&iocoord [8 * i]);
    __m128 c1 = _mm_load_ps (&iocoord [8 * i + 4]);
    __m128 x = _mm_shuffle_ps (c0, c1, _MM_SHUFFLE (2, 0, 2, 0));
    __m128 y = _mm_shuffle_ps (c0, c1, _MM_SHUFFLE (3, 1, 3, 1));

    // Calculate poly3 = k1 * ru * ru + d;
    __m128 poly3 = _mm_add_ps (_mm_mul_ps (_mm_add_ps (_mm_mul_ps (x, x), _mm_mul_ps (y, y)), k1), d);
    _mm_store_ps (&iocoord [8 * i], _mm_mul_ps (poly3, c0));
    _mm_store_ps (&iocoord [8 * i + 4], _mm_mul_ps (poly3, c1));
  }

  loop_count *= 4;
  int remain = count - loop_count;
  if (remain)
    ModifyCoord_Dist_Poly3 (data, &iocoord [loop_count * 2], remain);
}

#endif