|
Packit |
c2c737 |
///////////////////////////////////////////////////////////////////////////
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
// Copyright (c) 2012, Autodesk, Inc.
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
// All rights reserved.
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
// Implementation of IIF-specific file format and speed optimizations
|
|
Packit |
c2c737 |
// provided by Innobec Technologies inc on behalf of Autodesk.
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
// Redistribution and use in source and binary forms, with or without
|
|
Packit |
c2c737 |
// modification, are permitted provided that the following conditions are
|
|
Packit |
c2c737 |
// met:
|
|
Packit |
c2c737 |
// * Redistributions of source code must retain the above copyright
|
|
Packit |
c2c737 |
// notice, this list of conditions and the following disclaimer.
|
|
Packit |
c2c737 |
// * Redistributions in binary form must reproduce the above
|
|
Packit |
c2c737 |
// copyright notice, this list of conditions and the following disclaimer
|
|
Packit |
c2c737 |
// in the documentation and/or other materials provided with the
|
|
Packit |
c2c737 |
// distribution.
|
|
Packit |
c2c737 |
// * Neither the name of Industrial Light & Magic nor the names of
|
|
Packit |
c2c737 |
// its contributors may be used to endorse or promote products derived
|
|
Packit |
c2c737 |
// from this software without specific prior written permission.
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
Packit |
c2c737 |
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
Packit |
c2c737 |
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
Packit |
c2c737 |
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
Packit |
c2c737 |
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
Packit |
c2c737 |
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
Packit |
c2c737 |
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
Packit |
c2c737 |
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
Packit |
c2c737 |
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
Packit |
c2c737 |
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
Packit |
c2c737 |
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
///////////////////////////////////////////////////////////////////////////
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
#pragma once
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
#ifndef INCLUDED_IMF_OPTIMIZED_PIXEL_READING_H
|
|
Packit |
c2c737 |
#define INCLUDED_IMF_OPTIMIZED_PIXEL_READING_H
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
#include "ImfSimd.h"
#include "ImfSystemSpecific.h"
#include <stdint.h>
#include <iostream>
#include "ImfChannelList.h"
#include "ImfFrameBuffer.h"
#include "ImfStringVectorAttribute.h"
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_ENTER
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
//
// Small value type describing whether, and how, the optimized pixel
// reading path can be used.  A default-constructed object reports
// "not optimizable".
//
// NOTE(review): the code that fills in and consumes these fields lives
// outside this header; the field descriptions below are inferred from the
// member names -- confirm against the callers.
//
class OptimizationMode
{
public:

    // presumably: true when the optimized (SSE) path may be taken
    bool _optimizable;

    // presumably: vertical sampling rate shared by the optimized channels
    int  _ySampling;

    //
    // Initialize every member so that a default-constructed object never
    // carries an indeterminate _ySampling (reading an uninitialized int is
    // undefined behavior).  1 is used as a neutral, non-zero default.
    //
    OptimizationMode () : _optimizable (false), _ySampling (1) {}
};
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
#if IMF_HAVE_SSE2
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
//------------------------------------------------------------------------
|
|
Packit |
c2c737 |
// Test for SSE pointer alignment
|
|
Packit |
c2c737 |
//------------------------------------------------------------------------
|
|
Packit |
c2c737 |
//
// Return true when pPointer lies on a 16-byte boundary, i.e. when the low
// four bits of its address are all zero.
//
EXR_FORCEINLINE
bool
isPointerSSEAligned (const void* EXR_RESTRICT pPointer)
{
    //
    // uintptr_t is wide enough to hold any object pointer.  "unsigned long"
    // is only 32 bits on LLP64 targets (e.g. 64-bit Windows), where casting
    // a pointer to it is a narrowing conversion that some compilers reject.
    //
    const uintptr_t address = reinterpret_cast<uintptr_t> (pPointer);

    return (address & 15) == 0;
}
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
//------------------------------------------------------------------------
|
|
Packit |
c2c737 |
// Load SSE from address into register
|
|
Packit |
c2c737 |
//------------------------------------------------------------------------
|
|
Packit |
c2c737 |
template<bool IS_ALIGNED>
|
|
Packit |
c2c737 |
EXR_FORCEINLINE
|
|
Packit |
c2c737 |
__m128i loadSSE (__m128i*& loadAddress)
|
|
Packit |
c2c737 |
{
|
|
Packit |
c2c737 |
// throw exception :: this is not accepted
|
|
Packit |
c2c737 |
return _mm_loadu_si128 (loadAddress);
|
|
Packit |
c2c737 |
}
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
template<>
|
|
Packit |
c2c737 |
EXR_FORCEINLINE
|
|
Packit |
c2c737 |
__m128i loadSSE<false> (__m128i*& loadAddress)
|
|
Packit |
c2c737 |
{
|
|
Packit |
c2c737 |
return _mm_loadu_si128 (loadAddress);
|
|
Packit |
c2c737 |
}
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
template<>
|
|
Packit |
c2c737 |
EXR_FORCEINLINE
|
|
Packit |
c2c737 |
__m128i loadSSE<true> (__m128i*& loadAddress)
|
|
Packit |
c2c737 |
{
|
|
Packit |
c2c737 |
return _mm_load_si128 (loadAddress);
|
|
Packit |
c2c737 |
}
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
//------------------------------------------------------------------------
|
|
Packit |
c2c737 |
// Store SSE from register into address
|
|
Packit |
c2c737 |
//------------------------------------------------------------------------
|
|
Packit |
c2c737 |
template<bool IS_ALIGNED>
|
|
Packit |
c2c737 |
EXR_FORCEINLINE
|
|
Packit |
c2c737 |
void storeSSE (__m128i*& storeAddress, __m128i& dataToStore)
|
|
Packit |
c2c737 |
{
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
}
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
template<>
|
|
Packit |
c2c737 |
EXR_FORCEINLINE
|
|
Packit |
c2c737 |
void
|
|
Packit |
c2c737 |
storeSSE<false> (__m128i*& storeAddress, __m128i& dataToStore)
|
|
Packit |
c2c737 |
{
|
|
Packit |
c2c737 |
_mm_storeu_si128 (storeAddress, dataToStore);
|
|
Packit |
c2c737 |
}
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
template<>
|
|
Packit |
c2c737 |
EXR_FORCEINLINE
|
|
Packit |
c2c737 |
void
|
|
Packit |
c2c737 |
storeSSE<true> (__m128i*& storeAddress, __m128i& dataToStore)
|
|
Packit |
c2c737 |
{
|
|
Packit |
c2c737 |
_mm_stream_si128 (storeAddress, dataToStore);
|
|
Packit |
c2c737 |
}
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
//------------------------------------------------------------------------
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
// Write to RGBA
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
//------------------------------------------------------------------------
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
// Using SSE intrinsics
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
template<bool READ_PTR_ALIGNED, bool WRITE_PTR_ALIGNED>
|
|
Packit |
c2c737 |
EXR_FORCEINLINE
|
|
Packit |
c2c737 |
void writeToRGBASSETemplate
|
|
Packit |
c2c737 |
(__m128i*& readPtrSSERed,
|
|
Packit |
c2c737 |
__m128i*& readPtrSSEGreen,
|
|
Packit |
c2c737 |
__m128i*& readPtrSSEBlue,
|
|
Packit |
c2c737 |
__m128i*& readPtrSSEAlpha,
|
|
Packit |
c2c737 |
__m128i*& writePtrSSE,
|
|
Packit |
c2c737 |
const size_t& lPixelsToCopySSE)
|
|
Packit |
c2c737 |
{
|
|
Packit |
c2c737 |
for (size_t i = 0; i < lPixelsToCopySSE; ++i)
|
|
Packit |
c2c737 |
{
|
|
Packit |
c2c737 |
__m128i redRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSERed);
|
|
Packit |
c2c737 |
__m128i greenRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEGreen);
|
|
Packit |
c2c737 |
__m128i blueRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEBlue);
|
|
Packit |
c2c737 |
__m128i alphaRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEAlpha);
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
__m128i redGreenRegister = _mm_unpacklo_epi16 (redRegister,
|
|
Packit |
c2c737 |
greenRegister);
|
|
Packit |
c2c737 |
__m128i blueAlphaRegister = _mm_unpacklo_epi16 (blueRegister,
|
|
Packit |
c2c737 |
alphaRegister);
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
__m128i pixel12Register = _mm_unpacklo_epi32 (redGreenRegister,
|
|
Packit |
c2c737 |
blueAlphaRegister);
|
|
Packit |
c2c737 |
__m128i pixel34Register = _mm_unpackhi_epi32 (redGreenRegister,
|
|
Packit |
c2c737 |
blueAlphaRegister);
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel12Register);
|
|
Packit |
c2c737 |
++writePtrSSE;
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel34Register);
|
|
Packit |
c2c737 |
++writePtrSSE;
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
redGreenRegister = _mm_unpackhi_epi16 (redRegister, greenRegister);
|
|
Packit |
c2c737 |
blueAlphaRegister = _mm_unpackhi_epi16 (blueRegister, alphaRegister);
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
pixel12Register = _mm_unpacklo_epi32 (redGreenRegister,
|
|
Packit |
c2c737 |
blueAlphaRegister);
|
|
Packit |
c2c737 |
pixel34Register = _mm_unpackhi_epi32 (redGreenRegister,
|
|
Packit |
c2c737 |
blueAlphaRegister);
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel12Register);
|
|
Packit |
c2c737 |
++writePtrSSE;
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel34Register);
|
|
Packit |
c2c737 |
++writePtrSSE;
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
++readPtrSSEAlpha;
|
|
Packit |
c2c737 |
++readPtrSSEBlue;
|
|
Packit |
c2c737 |
++readPtrSSEGreen;
|
|
Packit |
c2c737 |
++readPtrSSERed;
|
|
Packit |
c2c737 |
}
|
|
Packit |
c2c737 |
}
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
// Not using SSE intrinsics. This is still faster than the alternative
|
|
Packit |
c2c737 |
// because we have multiple read pointers and therefore we are able to
|
|
Packit |
c2c737 |
// take advantage of data locality for write operations.
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
//
// Scalar RGBA interleave: for each of lPixelsToCopy pixels, copy one sample
// from the red, green, blue and alpha planes (in that order) to the output.
// All pointers are passed by reference and are advanced past the samples
// that were read or written.
//
EXR_FORCEINLINE
void
writeToRGBANormal (unsigned short*& readPtrRed,
                   unsigned short*& readPtrGreen,
                   unsigned short*& readPtrBlue,
                   unsigned short*& readPtrAlpha,
                   unsigned short*& writePtr,
                   const size_t& lPixelsToCopy)
{
    for (size_t pixel = 0; pixel < lPixelsToCopy; ++pixel)
    {
        writePtr[0] = *readPtrRed;
        ++readPtrRed;

        writePtr[1] = *readPtrGreen;
        ++readPtrGreen;

        writePtr[2] = *readPtrBlue;
        ++readPtrBlue;

        writePtr[3] = *readPtrAlpha;
        ++readPtrAlpha;

        writePtr += 4;
    }
}
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
// Determine which (template) version to use by checking whether pointers
|
|
Packit |
c2c737 |
// are aligned
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
//
// Interleave planar R, G, B and A samples into packed RGBA.
//
// First runs pixelsToCopySSE iterations of the SSE kernel (each iteration
// consumes one 128-bit register per channel), choosing the aligned or
// unaligned load / store variants from the actual pointer alignment, then
// copies pixelsToCopyNormal further pixels with the scalar loop.
//
// All pointers are passed by reference and are left pointing just past the
// data that was read or written.
//
EXR_FORCEINLINE
void
optimizedWriteToRGBA (unsigned short*& readPtrRed,
                      unsigned short*& readPtrGreen,
                      unsigned short*& readPtrBlue,
                      unsigned short*& readPtrAlpha,
                      unsigned short*& writePtr,
                      const size_t& pixelsToCopySSE,
                      const size_t& pixelsToCopyNormal)
{
    // Aligned loads are only usable when every source plane is aligned
    const bool readPtrAreAligned = isPointerSSEAligned (readPtrRed)   &&
                                   isPointerSSEAligned (readPtrGreen) &&
                                   isPointerSSEAligned (readPtrBlue)  &&
                                   isPointerSSEAligned (readPtrAlpha);

    const bool writePtrIsAligned = isPointerSSEAligned (writePtr);

    //
    // Work on correctly typed local copies rather than casting the caller's
    // pointer variables to "__m128i*&": accessing an "unsigned short*"
    // object through an lvalue of type "__m128i*" breaks the aliasing rules
    // and is undefined behavior.
    //
    __m128i* redSSE   = reinterpret_cast<__m128i*> (readPtrRed);
    __m128i* greenSSE = reinterpret_cast<__m128i*> (readPtrGreen);
    __m128i* blueSSE  = reinterpret_cast<__m128i*> (readPtrBlue);
    __m128i* alphaSSE = reinterpret_cast<__m128i*> (readPtrAlpha);
    __m128i* outSSE   = reinterpret_cast<__m128i*> (writePtr);

    if (readPtrAreAligned)
    {
        if (writePtrIsAligned)
        {
            writeToRGBASSETemplate<true, true> (redSSE, greenSSE, blueSSE,
                                                alphaSSE, outSSE,
                                                pixelsToCopySSE);
        }
        else
        {
            writeToRGBASSETemplate<true, false> (redSSE, greenSSE, blueSSE,
                                                 alphaSSE, outSSE,
                                                 pixelsToCopySSE);
        }
    }
    else
    {
        if (writePtrIsAligned)
        {
            writeToRGBASSETemplate<false, true> (redSSE, greenSSE, blueSSE,
                                                 alphaSSE, outSSE,
                                                 pixelsToCopySSE);
        }
        else
        {
            writeToRGBASSETemplate<false, false> (redSSE, greenSSE, blueSSE,
                                                  alphaSSE, outSSE,
                                                  pixelsToCopySSE);
        }
    }

    // Publish the advanced positions back to the caller's pointers
    readPtrRed   = reinterpret_cast<unsigned short*> (redSSE);
    readPtrGreen = reinterpret_cast<unsigned short*> (greenSSE);
    readPtrBlue  = reinterpret_cast<unsigned short*> (blueSSE);
    readPtrAlpha = reinterpret_cast<unsigned short*> (alphaSSE);
    writePtr     = reinterpret_cast<unsigned short*> (outSSE);

    // Remaining pixels that do not fill a whole SSE iteration
    writeToRGBANormal (readPtrRed, readPtrGreen, readPtrBlue, readPtrAlpha,
                       writePtr, pixelsToCopyNormal);
}
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
//------------------------------------------------------------------------
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
// Write to RGBA Fill A
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
//------------------------------------------------------------------------
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
// Using SSE intrinsics
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
template<bool READ_PTR_ALIGNED, bool WRITE_PTR_ALIGNED>
|
|
Packit |
c2c737 |
EXR_FORCEINLINE
|
|
Packit |
c2c737 |
void
|
|
Packit |
c2c737 |
writeToRGBAFillASSETemplate (__m128i*& readPtrSSERed,
|
|
Packit |
c2c737 |
__m128i*& readPtrSSEGreen,
|
|
Packit |
c2c737 |
__m128i*& readPtrSSEBlue,
|
|
Packit |
c2c737 |
const unsigned short& alphaFillValue,
|
|
Packit |
c2c737 |
__m128i*& writePtrSSE,
|
|
Packit |
c2c737 |
const size_t& pixelsToCopySSE)
|
|
Packit |
c2c737 |
{
|
|
Packit |
c2c737 |
const __m128i dummyAlphaRegister = _mm_set_epi16 (alphaFillValue,
|
|
Packit |
c2c737 |
alphaFillValue,
|
|
Packit |
c2c737 |
alphaFillValue,
|
|
Packit |
c2c737 |
alphaFillValue,
|
|
Packit |
c2c737 |
alphaFillValue,
|
|
Packit |
c2c737 |
alphaFillValue,
|
|
Packit |
c2c737 |
alphaFillValue,
|
|
Packit |
c2c737 |
alphaFillValue);
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
for (size_t pixelCounter = 0; pixelCounter < pixelsToCopySSE; ++pixelCounter)
|
|
Packit |
c2c737 |
{
|
|
Packit |
c2c737 |
__m128i redRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSERed);
|
|
Packit |
c2c737 |
__m128i greenRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEGreen);
|
|
Packit |
c2c737 |
__m128i blueRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEBlue);
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
__m128i redGreenRegister = _mm_unpacklo_epi16 (redRegister,
|
|
Packit |
c2c737 |
greenRegister);
|
|
Packit |
c2c737 |
__m128i blueAlphaRegister = _mm_unpacklo_epi16 (blueRegister,
|
|
Packit |
c2c737 |
dummyAlphaRegister);
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
__m128i pixel12Register = _mm_unpacklo_epi32 (redGreenRegister,
|
|
Packit |
c2c737 |
blueAlphaRegister);
|
|
Packit |
c2c737 |
__m128i pixel34Register = _mm_unpackhi_epi32 (redGreenRegister,
|
|
Packit |
c2c737 |
blueAlphaRegister);
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel12Register);
|
|
Packit |
c2c737 |
++writePtrSSE;
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel34Register);
|
|
Packit |
c2c737 |
++writePtrSSE;
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
redGreenRegister = _mm_unpackhi_epi16 (redRegister,
|
|
Packit |
c2c737 |
greenRegister);
|
|
Packit |
c2c737 |
blueAlphaRegister = _mm_unpackhi_epi16 (blueRegister,
|
|
Packit |
c2c737 |
dummyAlphaRegister);
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
pixel12Register = _mm_unpacklo_epi32 (redGreenRegister,
|
|
Packit |
c2c737 |
blueAlphaRegister);
|
|
Packit |
c2c737 |
pixel34Register = _mm_unpackhi_epi32 (redGreenRegister,
|
|
Packit |
c2c737 |
blueAlphaRegister);
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel12Register);
|
|
Packit |
c2c737 |
++writePtrSSE;
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel34Register);
|
|
Packit |
c2c737 |
++writePtrSSE;
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
++readPtrSSEBlue;
|
|
Packit |
c2c737 |
++readPtrSSEGreen;
|
|
Packit |
c2c737 |
++readPtrSSERed;
|
|
Packit |
c2c737 |
}
|
|
Packit |
c2c737 |
}
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
// Not using SSE intrinsics. This is still faster than the alternative
|
|
Packit |
c2c737 |
// because we have multiple read pointers and therefore we are able to
|
|
Packit |
c2c737 |
// take advantage of data locality for write operations.
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
//
// Scalar RGBA interleave with constant alpha: for each of pixelsToCopy
// pixels, copy one sample from the red, green and blue planes (in that
// order) to the output, followed by alphaFillValue.  The four pointers are
// passed by reference and are advanced past the samples that were read or
// written.
//
EXR_FORCEINLINE
void
writeToRGBAFillANormal (unsigned short*& readPtrRed,
                        unsigned short*& readPtrGreen,
                        unsigned short*& readPtrBlue,
                        const unsigned short& alphaFillValue,
                        unsigned short*& writePtr,
                        const size_t& pixelsToCopy)
{
    for (size_t pixel = 0; pixel < pixelsToCopy; ++pixel)
    {
        writePtr[0] = *readPtrRed;
        ++readPtrRed;

        writePtr[1] = *readPtrGreen;
        ++readPtrGreen;

        writePtr[2] = *readPtrBlue;
        ++readPtrBlue;

        writePtr[3] = alphaFillValue;

        writePtr += 4;
    }
}
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
// Determine which (template) version to use by checking whether pointers
|
|
Packit |
c2c737 |
// are aligned.
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
//
// Interleave planar R, G and B samples into packed RGBA, writing
// alphaFillValue as the alpha sample of every pixel.
//
// First runs pixelsToCopySSE iterations of the SSE kernel (each iteration
// consumes one 128-bit register per channel), choosing the aligned or
// unaligned load / store variants from the actual pointer alignment, then
// copies pixelsToCopyNormal further pixels with the scalar loop.
//
// All pointers are passed by reference and are left pointing just past the
// data that was read or written.
//
EXR_FORCEINLINE
void
optimizedWriteToRGBAFillA (unsigned short*& readPtrRed,
                           unsigned short*& readPtrGreen,
                           unsigned short*& readPtrBlue,
                           const unsigned short& alphaFillValue,
                           unsigned short*& writePtr,
                           const size_t& pixelsToCopySSE,
                           const size_t& pixelsToCopyNormal)
{
    // Aligned loads are only usable when every source plane is aligned
    const bool readPtrAreAligned = isPointerSSEAligned (readPtrRed)   &&
                                   isPointerSSEAligned (readPtrGreen) &&
                                   isPointerSSEAligned (readPtrBlue);

    const bool writePtrIsAligned = isPointerSSEAligned (writePtr);

    //
    // Work on correctly typed local copies rather than casting the caller's
    // pointer variables to "__m128i*&": accessing an "unsigned short*"
    // object through an lvalue of type "__m128i*" breaks the aliasing rules
    // and is undefined behavior.
    //
    __m128i* redSSE   = reinterpret_cast<__m128i*> (readPtrRed);
    __m128i* greenSSE = reinterpret_cast<__m128i*> (readPtrGreen);
    __m128i* blueSSE  = reinterpret_cast<__m128i*> (readPtrBlue);
    __m128i* outSSE   = reinterpret_cast<__m128i*> (writePtr);

    if (readPtrAreAligned)
    {
        if (writePtrIsAligned)
        {
            writeToRGBAFillASSETemplate<true, true> (redSSE, greenSSE,
                                                     blueSSE, alphaFillValue,
                                                     outSSE, pixelsToCopySSE);
        }
        else
        {
            writeToRGBAFillASSETemplate<true, false> (redSSE, greenSSE,
                                                      blueSSE, alphaFillValue,
                                                      outSSE, pixelsToCopySSE);
        }
    }
    else
    {
        if (writePtrIsAligned)
        {
            writeToRGBAFillASSETemplate<false, true> (redSSE, greenSSE,
                                                      blueSSE, alphaFillValue,
                                                      outSSE, pixelsToCopySSE);
        }
        else
        {
            writeToRGBAFillASSETemplate<false, false> (redSSE, greenSSE,
                                                       blueSSE, alphaFillValue,
                                                       outSSE, pixelsToCopySSE);
        }
    }

    // Publish the advanced positions back to the caller's pointers
    readPtrRed   = reinterpret_cast<unsigned short*> (redSSE);
    readPtrGreen = reinterpret_cast<unsigned short*> (greenSSE);
    readPtrBlue  = reinterpret_cast<unsigned short*> (blueSSE);
    writePtr     = reinterpret_cast<unsigned short*> (outSSE);

    // Remaining pixels that do not fill a whole SSE iteration
    writeToRGBAFillANormal (readPtrRed, readPtrGreen, readPtrBlue,
                            alphaFillValue, writePtr, pixelsToCopyNormal);
}
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
//------------------------------------------------------------------------
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
// Write to RGB
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
//------------------------------------------------------------------------
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
// Using SSE intrinsics
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
template<bool READ_PTR_ALIGNED, bool WRITE_PTR_ALIGNED>
|
|
Packit |
c2c737 |
EXR_FORCEINLINE
|
|
Packit |
c2c737 |
void
|
|
Packit |
c2c737 |
writeToRGBSSETemplate (__m128i*& readPtrSSERed,
|
|
Packit |
c2c737 |
__m128i*& readPtrSSEGreen,
|
|
Packit |
c2c737 |
__m128i*& readPtrSSEBlue,
|
|
Packit |
c2c737 |
__m128i*& writePtrSSE,
|
|
Packit |
c2c737 |
const size_t& pixelsToCopySSE)
|
|
Packit |
c2c737 |
{
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
for (size_t pixelCounter = 0; pixelCounter < pixelsToCopySSE; ++pixelCounter)
|
|
Packit |
c2c737 |
{
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
// Need to shuffle and unpack pointers to obtain my first register
|
|
Packit |
c2c737 |
// We must save 8 pixels at a time, so we must have the following three registers at the end:
|
|
Packit |
c2c737 |
// 1) R1 G1 B1 R2 G2 B2 R3 G3
|
|
Packit |
c2c737 |
// 2) B3 R4 G4 B4 R5 G5 B5 R6
|
|
Packit |
c2c737 |
// 3) G6 B6 R7 G7 B7 R8 G8 B8
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
__m128i redRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSERed);
|
|
Packit |
c2c737 |
__m128i greenRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEGreen);
|
|
Packit |
c2c737 |
__m128i blueRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEBlue);
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
// First register: R1 G1 B1 R2 G2 B2 R3 G3
|
|
Packit |
c2c737 |
// Construct 2 registers and then unpack them to obtain our final result:
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
__m128i redGreenRegister = _mm_unpacklo_epi16 (redRegister,
|
|
Packit |
c2c737 |
greenRegister);
|
|
Packit |
c2c737 |
__m128i redBlueRegister = _mm_unpacklo_epi16 (redRegister,
|
|
Packit |
c2c737 |
blueRegister);
|
|
Packit |
c2c737 |
__m128i greenBlueRegister = _mm_unpacklo_epi16 (greenRegister,
|
|
Packit |
c2c737 |
blueRegister);
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
// Left Part (R1 G1 B1 R2)
|
|
Packit |
c2c737 |
__m128i quarterRight = _mm_shufflelo_epi16 (redBlueRegister,
|
|
Packit |
c2c737 |
_MM_SHUFFLE(3,0,2,1));
|
|
Packit |
c2c737 |
__m128i halfLeft = _mm_unpacklo_epi32 (redGreenRegister,
|
|
Packit |
c2c737 |
quarterRight);
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
// Right Part (G2 B2 R3 G3)
|
|
Packit |
c2c737 |
__m128i quarterLeft = _mm_shuffle_epi32 (greenBlueRegister,
|
|
Packit |
c2c737 |
_MM_SHUFFLE(3,2,0,1));
|
|
Packit |
c2c737 |
quarterRight = _mm_shuffle_epi32 (redGreenRegister,
|
|
Packit |
c2c737 |
_MM_SHUFFLE(3,0,1,2));
|
|
Packit |
c2c737 |
__m128i halfRight = _mm_unpacklo_epi32 (quarterLeft, quarterRight);
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
__m128i fullRegister = _mm_unpacklo_epi64 (halfLeft, halfRight);
|
|
Packit |
c2c737 |
storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, fullRegister);
|
|
Packit |
c2c737 |
++writePtrSSE;
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
// Second register: B3 R4 G4 B4 R5 G5 B5 R6
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
// Left Part (B3, R4, G4, B4)
|
|
Packit |
c2c737 |
quarterLeft = _mm_shufflehi_epi16 (redBlueRegister,
|
|
Packit |
c2c737 |
_MM_SHUFFLE(0, 3, 2, 1));
|
|
Packit |
c2c737 |
quarterRight = _mm_shufflehi_epi16 (greenBlueRegister,
|
|
Packit |
c2c737 |
_MM_SHUFFLE(1, 0, 3, 2));
|
|
Packit |
c2c737 |
halfLeft = _mm_unpackhi_epi32 (quarterLeft, quarterRight);
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
// Update the registers
|
|
Packit |
c2c737 |
redGreenRegister = _mm_unpackhi_epi16 (redRegister, greenRegister);
|
|
Packit |
c2c737 |
redBlueRegister = _mm_unpackhi_epi16 (redRegister, blueRegister);
|
|
Packit |
c2c737 |
greenBlueRegister = _mm_unpackhi_epi16 (greenRegister, blueRegister);
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
// Right Part (R5 G5 B5 R6)
|
|
Packit |
c2c737 |
quarterRight = _mm_shufflelo_epi16 (redBlueRegister,
|
|
Packit |
c2c737 |
_MM_SHUFFLE(3,0,2,1));
|
|
Packit |
c2c737 |
halfRight = _mm_unpacklo_epi32 (redGreenRegister, quarterRight);
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
fullRegister = _mm_unpacklo_epi64 (halfLeft, halfRight);
|
|
Packit |
c2c737 |
storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, fullRegister);
|
|
Packit |
c2c737 |
++writePtrSSE;
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
// Third register: G6 B6 R7 G7 B7 R8 G8 B8
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
// Left part (G6 B6 R7 G7)
|
|
Packit |
c2c737 |
quarterLeft = _mm_shuffle_epi32 (greenBlueRegister,
|
|
Packit |
c2c737 |
_MM_SHUFFLE(3,2,0,1));
|
|
Packit |
c2c737 |
quarterRight = _mm_shuffle_epi32 (redGreenRegister,
|
|
Packit |
c2c737 |
_MM_SHUFFLE(3,0,1,2));
|
|
Packit |
c2c737 |
halfLeft = _mm_unpacklo_epi32 (quarterLeft, quarterRight);
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
// Right part (B7 R8 G8 B8)
|
|
Packit |
c2c737 |
quarterLeft = _mm_shufflehi_epi16 (redBlueRegister,
|
|
Packit |
c2c737 |
_MM_SHUFFLE(0, 3, 2, 1));
|
|
Packit |
c2c737 |
quarterRight = _mm_shufflehi_epi16 (greenBlueRegister,
|
|
Packit |
c2c737 |
_MM_SHUFFLE(1, 0, 3, 2));
|
|
Packit |
c2c737 |
halfRight = _mm_unpackhi_epi32 (quarterLeft, quarterRight);
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
fullRegister = _mm_unpacklo_epi64 (halfLeft, halfRight);
|
|
Packit |
c2c737 |
storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, fullRegister);
|
|
Packit |
c2c737 |
++writePtrSSE;
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
// Increment read pointers
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
++readPtrSSEBlue;
|
|
Packit |
c2c737 |
++readPtrSSEGreen;
|
|
Packit |
c2c737 |
++readPtrSSERed;
|
|
Packit |
c2c737 |
}
|
|
Packit |
c2c737 |
}
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
// Not using SSE intrinsics. This is still faster than the alternative
|
|
Packit |
c2c737 |
// because we have multiple read pointers and therefore we are able to
|
|
Packit |
c2c737 |
// take advantage of data locality for write operations.
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
//
// Scalar RGB interleave: for each of pixelsToCopy pixels, copy one sample
// from the red, green and blue planes (in that order) to the output.  The
// four pointers are passed by reference and are advanced past the samples
// that were read or written.
//
EXR_FORCEINLINE
void
writeToRGBNormal (unsigned short*& readPtrRed,
                  unsigned short*& readPtrGreen,
                  unsigned short*& readPtrBlue,
                  unsigned short*& writePtr,
                  const size_t& pixelsToCopy)
{
    for (size_t pixel = 0; pixel < pixelsToCopy; ++pixel)
    {
        writePtr[0] = *readPtrRed;
        ++readPtrRed;

        writePtr[1] = *readPtrGreen;
        ++readPtrGreen;

        writePtr[2] = *readPtrBlue;
        ++readPtrBlue;

        writePtr += 3;
    }
}
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
// Determine which (template) version to use by checking whether pointers
|
|
Packit |
c2c737 |
// are aligned
|
|
Packit |
c2c737 |
//
|
|
Packit |
c2c737 |
//
// Interleave planar R, G and B samples into packed RGB.
//
// First runs pixelsToCopySSE iterations of the SSE kernel (each iteration
// consumes one 128-bit register per channel), choosing the aligned or
// unaligned load / store variants from the actual pointer alignment, then
// copies pixelsToCopyNormal further pixels with the scalar loop.
//
// All pointers are passed by reference and are left pointing just past the
// data that was read or written.
//
EXR_FORCEINLINE
void
optimizedWriteToRGB (unsigned short*& readPtrRed,
                     unsigned short*& readPtrGreen,
                     unsigned short*& readPtrBlue,
                     unsigned short*& writePtr,
                     const size_t& pixelsToCopySSE,
                     const size_t& pixelsToCopyNormal)
{
    // Aligned loads are only usable when every source plane is aligned
    const bool readPtrAreAligned = isPointerSSEAligned (readPtrRed)   &&
                                   isPointerSSEAligned (readPtrGreen) &&
                                   isPointerSSEAligned (readPtrBlue);

    const bool writePtrIsAligned = isPointerSSEAligned (writePtr);

    //
    // Work on correctly typed local copies rather than casting the caller's
    // pointer variables to "__m128i*&": accessing an "unsigned short*"
    // object through an lvalue of type "__m128i*" breaks the aliasing rules
    // and is undefined behavior.
    //
    __m128i* redSSE   = reinterpret_cast<__m128i*> (readPtrRed);
    __m128i* greenSSE = reinterpret_cast<__m128i*> (readPtrGreen);
    __m128i* blueSSE  = reinterpret_cast<__m128i*> (readPtrBlue);
    __m128i* outSSE   = reinterpret_cast<__m128i*> (writePtr);

    if (readPtrAreAligned)
    {
        if (writePtrIsAligned)
        {
            writeToRGBSSETemplate<true, true> (redSSE, greenSSE, blueSSE,
                                               outSSE, pixelsToCopySSE);
        }
        else
        {
            writeToRGBSSETemplate<true, false> (redSSE, greenSSE, blueSSE,
                                                outSSE, pixelsToCopySSE);
        }
    }
    else
    {
        if (writePtrIsAligned)
        {
            writeToRGBSSETemplate<false, true> (redSSE, greenSSE, blueSSE,
                                                outSSE, pixelsToCopySSE);
        }
        else
        {
            writeToRGBSSETemplate<false, false> (redSSE, greenSSE, blueSSE,
                                                 outSSE, pixelsToCopySSE);
        }
    }

    // Publish the advanced positions back to the caller's pointers
    readPtrRed   = reinterpret_cast<unsigned short*> (redSSE);
    readPtrGreen = reinterpret_cast<unsigned short*> (greenSSE);
    readPtrBlue  = reinterpret_cast<unsigned short*> (blueSSE);
    writePtr     = reinterpret_cast<unsigned short*> (outSSE);

    // Remaining pixels that do not fill a whole SSE iteration
    writeToRGBNormal (readPtrRed, readPtrGreen, readPtrBlue,
                      writePtr, pixelsToCopyNormal);
}
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
#else // ! defined IMF_HAVE_SSE2
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
#endif // defined IMF_HAVE_SSE2
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_EXIT
|
|
Packit |
c2c737 |
|
|
Packit |
c2c737 |
#endif
|