|
Packit |
1fb8d4 |
/* prim_templates.h
|
|
Packit |
1fb8d4 |
* vi:ts=4 sw=4
|
|
Packit |
1fb8d4 |
*
|
|
Packit |
1fb8d4 |
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
|
Packit |
1fb8d4 |
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
Packit |
1fb8d4 |
* not use this file except in compliance with the License. You may obtain
|
|
Packit |
1fb8d4 |
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
|
Packit |
1fb8d4 |
* Unless required by applicable law or agreed to in writing, software
|
|
Packit |
1fb8d4 |
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
Packit |
1fb8d4 |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
|
Packit |
1fb8d4 |
* or implied. See the License for the specific language governing
|
|
Packit |
1fb8d4 |
* permissions and limitations under the License. Algorithms used by
|
|
Packit |
1fb8d4 |
* this code may be covered by patents by HP, Microsoft, or other parties.
|
|
Packit |
1fb8d4 |
*/
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
#ifdef __GNUC__
|
|
Packit |
1fb8d4 |
# pragma once
|
|
Packit |
1fb8d4 |
#endif
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
#ifndef FREERDP_LIB_PRIM_TEMPLATES_H
|
|
Packit |
1fb8d4 |
#define FREERDP_LIB_PRIM_TEMPLATES_H
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
/* These are prototypes for SSE (potentially NEON) routines that do a
|
|
Packit |
1fb8d4 |
* simple SSE operation over an array of data. Since so much of this
|
|
Packit |
1fb8d4 |
* code is shared except for the operation itself, these prototypes are
|
|
Packit |
1fb8d4 |
* used rather than duplicating code. The naming convention depends on
|
|
Packit |
1fb8d4 |
* the parameters: S=Source param; C=Constant; D=Destination.
|
|
Packit |
1fb8d4 |
* All the macros have parameters for a fallback procedure if the data
|
|
Packit |
1fb8d4 |
* is too small and an operation "the slow way" for use at 16-byte edges.
|
|
Packit |
1fb8d4 |
*/
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
/* SSE3 note: If someone needs to support an SSE2 version of these without
|
|
Packit |
1fb8d4 |
* SSE3 support, an alternative version could be added that merely checks
|
|
Packit |
1fb8d4 |
* that 16-byte alignment on both destination and source(s) can be
|
|
Packit |
1fb8d4 |
* achieved, rather than use LDDQU for unaligned reads.
|
|
Packit |
1fb8d4 |
*/
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
|
|
Packit |
1fb8d4 |
* It can't easily do that if the value is stored in a variable.
|
|
Packit |
1fb8d4 |
* So don't save it as an intermediate value.
|
|
Packit |
1fb8d4 |
*/
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
/* ----------------------------------------------------------------------------
 * SCD = Source, Constant, Destination
 *
 * Expands to the definition of
 *
 *   static pstatus_t _name_(const _type_ *pSrc, UINT32 val,
 *                           _type_ *pDst, UINT32 len)
 *
 * which stores _op_(<16 bytes of source>, val) into pDst for len elements.
 *
 * Macro parameters:
 *   _name_      name of the generated function.
 *   _type_      element type.  sizeof(_type_) must be 1, 2, 4 or 8; any other
 *               size is handed to _fallback_ because the stride arithmetic
 *               below has no valid shift count for it.
 *   _fallback_  routine called as _fallback_(pSrc, val, pDst, len).  Its
 *               result is returned when the buffer holds fewer than 16
 *               elements or pDst is not aligned to the element size.
 *   _op_        invoked as _op_(__m128i, val); must yield an __m128i.
 *   _slowWay_   statement that processes exactly one element through the
 *               locals sptr and dptr.  It is expected to advance both by one
 *               element; the alignment loop only terminates if dptr moves.
 *
 * The generated function returns PRIMITIVES_SUCCESS, or whatever _fallback_
 * returned when the work was delegated.
 */
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
static pstatus_t _name_(const _type_ *pSrc, UINT32 val, _type_ *pDst, UINT32 len) \
{ \
	INT32 shifts = 0; \
	UINT32 offBeatMask; \
	const _type_ *sptr = pSrc; \
	_type_ *dptr = pDst; \
	size_t count; \
	if (len < 16) /* pointless if too small */ \
	{ \
		return _fallback_(pSrc, val, pDst, len); \
	} \
	/* shifts is log2(sizeof(_type_)) + 1. */ \
	if (sizeof(_type_) == 1) shifts = 1; \
	else if (sizeof(_type_) == 2) shifts = 2; \
	else if (sizeof(_type_) == 4) shifts = 3; \
	else if (sizeof(_type_) == 8) shifts = 4; \
	else \
	{ \
		/* Unsupported element size: shifting by (shifts - 1) == -1 */ \
		/* would be undefined, so let the generic routine handle it. */ \
		return _fallback_(pSrc, val, pDst, len); \
	} \
	/* Low address bits that must be clear for element alignment. */ \
	offBeatMask = (1U << (shifts - 1)) - 1; \
	if ((ULONG_PTR) pDst & offBeatMask) \
	{ \
		/* Incrementing the pointer skips over 16-byte boundary. */ \
		return _fallback_(pSrc, val, pDst, len); \
	} \
	/* Get to the 16-byte boundary now. */ \
	while ((ULONG_PTR) dptr & 0x0f) \
	{ \
		_slowWay_; \
		if (--len == 0) return PRIMITIVES_SUCCESS; \
	} \
	/* Use 8 128-bit SSE registers (128 bytes per iteration). */ \
	count = len >> (8-shifts); \
	len -= count << (8-shifts); \
	if ((ULONG_PTR) sptr & 0x0f) \
	{ \
		/* Source is not 16-byte aligned: unaligned loads. */ \
		while (count--) \
		{ \
			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
			xmm0 = _mm_lddqu_si128((const __m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm1 = _mm_lddqu_si128((const __m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm2 = _mm_lddqu_si128((const __m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm3 = _mm_lddqu_si128((const __m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm4 = _mm_lddqu_si128((const __m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm5 = _mm_lddqu_si128((const __m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm6 = _mm_lddqu_si128((const __m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm7 = _mm_lddqu_si128((const __m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm0 = _op_(xmm0, val); \
			xmm1 = _op_(xmm1, val); \
			xmm2 = _op_(xmm2, val); \
			xmm3 = _op_(xmm3, val); \
			xmm4 = _op_(xmm4, val); \
			xmm5 = _op_(xmm5, val); \
			xmm6 = _op_(xmm6, val); \
			xmm7 = _op_(xmm7, val); \
			_mm_store_si128((__m128i *) dptr, xmm0); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm1); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm2); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm3); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm4); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm5); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm6); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm7); \
			dptr += (16/sizeof(_type_)); \
		} \
	} \
	else \
	{ \
		/* Source and destination are both 16-byte aligned. */ \
		while (count--) \
		{ \
			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
			xmm0 = _mm_load_si128((const __m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm1 = _mm_load_si128((const __m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm2 = _mm_load_si128((const __m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm3 = _mm_load_si128((const __m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm4 = _mm_load_si128((const __m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm5 = _mm_load_si128((const __m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm6 = _mm_load_si128((const __m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm7 = _mm_load_si128((const __m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm0 = _op_(xmm0, val); \
			xmm1 = _op_(xmm1, val); \
			xmm2 = _op_(xmm2, val); \
			xmm3 = _op_(xmm3, val); \
			xmm4 = _op_(xmm4, val); \
			xmm5 = _op_(xmm5, val); \
			xmm6 = _op_(xmm6, val); \
			xmm7 = _op_(xmm7, val); \
			_mm_store_si128((__m128i *) dptr, xmm0); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm1); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm2); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm3); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm4); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm5); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm6); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm7); \
			dptr += (16/sizeof(_type_)); \
		} \
	} \
	/* Use a single 128-bit SSE register (16 bytes per iteration). */ \
	count = len >> (5-shifts); \
	len -= count << (5-shifts); \
	while (count--) \
	{ \
		__m128i xmm0 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \
		xmm0 = _op_(xmm0, val); \
		_mm_store_si128((__m128i *) dptr, xmm0); \
		dptr += (16/sizeof(_type_)); \
	} \
	/* Finish off the remainder one element at a time. */ \
	while (len--) { _slowWay_; } \
	return PRIMITIVES_SUCCESS; \
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
/* ----------------------------------------------------------------------------
 * SCD = Source, Constant, Destination
 * PRE = preload xmm0 with the constant.
 *
 * Expands to the definition of
 *
 *   pstatus_t _name_(const _type_ *pSrc, _type_ val,
 *                    _type_ *pDst, INT32 len)
 *
 * which stores _op_(<16 bytes of source>, <val broadcast to every element>)
 * into pDst for len elements.
 *
 * Macro parameters:
 *   _name_      name of the generated function.
 *   _type_      element type.  sizeof(_type_) must be 1, 2, 4 or 8; any other
 *               size is handed to _fallback_.
 *   _fallback_  routine called as _fallback_(pSrc, val, pDst, len).  Its
 *               result is returned when len is below 16 (which includes
 *               negative values) or pDst is not aligned to the element size.
 *   _op_        invoked as _op_(__m128i data, __m128i constant); must yield
 *               an __m128i.
 *   _slowWay_   statement that processes exactly one element through the
 *               locals sptr and dptr.  It is expected to advance both by one
 *               element; the alignment loop only terminates if dptr moves.
 *
 * The generated function returns PRIMITIVES_SUCCESS, or whatever _fallback_
 * returned when the work was delegated.
 */
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32 len) \
{ \
	int shifts = 0; \
	UINT32 offBeatMask; \
	const _type_ *sptr = pSrc; \
	_type_ *dptr = pDst; \
	size_t count; \
	__m128i xmm0; \
	if (len < 16) /* pointless if too small */ \
	{ \
		return _fallback_(pSrc, val, pDst, len); \
	} \
	/* shifts is log2(sizeof(_type_)) + 1. */ \
	if (sizeof(_type_) == 1) shifts = 1; \
	else if (sizeof(_type_) == 2) shifts = 2; \
	else if (sizeof(_type_) == 4) shifts = 3; \
	else if (sizeof(_type_) == 8) shifts = 4; \
	else \
	{ \
		/* Unsupported element size: shifting by (shifts - 1) == -1 */ \
		/* would be undefined, so let the generic routine handle it. */ \
		return _fallback_(pSrc, val, pDst, len); \
	} \
	/* Low address bits that must be clear for element alignment. */ \
	offBeatMask = (1U << (shifts - 1)) - 1; \
	if ((ULONG_PTR) pDst & offBeatMask) \
	{ \
		/* Incrementing the pointer skips over 16-byte boundary. */ \
		return _fallback_(pSrc, val, pDst, len); \
	} \
	/* Get to the 16-byte boundary now. */ \
	while ((ULONG_PTR) dptr & 0x0f) \
	{ \
		_slowWay_; \
		if (--len == 0) return PRIMITIVES_SUCCESS; \
	} \
	/* Use 4 128-bit SSE registers (64 bytes per iteration). */ \
	count = len >> (7-shifts); \
	len -= count << (7-shifts); \
	/* Broadcast the constant at the element width so that every */ \
	/* element lane of xmm0 holds val, not just every 32-bit lane. */ \
	if (sizeof(_type_) == 1) \
	{ \
		xmm0 = _mm_set1_epi8((char) val); \
	} \
	else if (sizeof(_type_) == 2) \
	{ \
		xmm0 = _mm_set1_epi16((short) val); \
	} \
	else if (sizeof(_type_) == 8) \
	{ \
		/* Build each 64-bit lane from its two 32-bit halves. */ \
		const unsigned long long bcastWide = (unsigned long long) val; \
		const int bcastLo = (int) (bcastWide & 0xFFFFFFFFu); \
		const int bcastHi = (int) (bcastWide >> 32); \
		xmm0 = _mm_set_epi32(bcastHi, bcastLo, bcastHi, bcastLo); \
	} \
	else \
	{ \
		xmm0 = _mm_set1_epi32((int) val); \
	} \
	if ((ULONG_PTR) sptr & 0x0f) \
	{ \
		/* Source is not 16-byte aligned: unaligned loads. */ \
		while (count--) \
		{ \
			__m128i xmm1, xmm2, xmm3, xmm4; \
			xmm1 = _mm_lddqu_si128((const __m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm2 = _mm_lddqu_si128((const __m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm3 = _mm_lddqu_si128((const __m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm4 = _mm_lddqu_si128((const __m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm1 = _op_(xmm1, xmm0); \
			xmm2 = _op_(xmm2, xmm0); \
			xmm3 = _op_(xmm3, xmm0); \
			xmm4 = _op_(xmm4, xmm0); \
			_mm_store_si128((__m128i *) dptr, xmm1); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm2); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm3); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm4); \
			dptr += (16/sizeof(_type_)); \
		} \
	} \
	else \
	{ \
		/* Source and destination are both 16-byte aligned. */ \
		while (count--) \
		{ \
			__m128i xmm1, xmm2, xmm3, xmm4; \
			xmm1 = _mm_load_si128((const __m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm2 = _mm_load_si128((const __m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm3 = _mm_load_si128((const __m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm4 = _mm_load_si128((const __m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm1 = _op_(xmm1, xmm0); \
			xmm2 = _op_(xmm2, xmm0); \
			xmm3 = _op_(xmm3, xmm0); \
			xmm4 = _op_(xmm4, xmm0); \
			_mm_store_si128((__m128i *) dptr, xmm1); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm2); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm3); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm4); \
			dptr += (16/sizeof(_type_)); \
		} \
	} \
	/* Use a single 128-bit SSE register (16 bytes per iteration). */ \
	count = len >> (5-shifts); \
	len -= count << (5-shifts); \
	while (count--) \
	{ \
		__m128i xmm1 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \
		xmm1 = _op_(xmm1, xmm0); \
		_mm_store_si128((__m128i *) dptr, xmm1); \
		dptr += (16/sizeof(_type_)); \
	} \
	/* Finish off the remainder one element at a time. */ \
	while (len--) { _slowWay_; } \
	return PRIMITIVES_SUCCESS; \
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
/* ----------------------------------------------------------------------------
 * SSD = Source1, Source2, Destination
 *
 * Expands to the definition of
 *
 *   pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2,
 *                    _type_ *pDst, UINT32 len)
 *
 * which stores _op_(<16 bytes of source 1>, <16 bytes of source 2>) into pDst
 * for len elements.
 *
 * Macro parameters:
 *   _name_      name of the generated function.
 *   _type_      element type.  sizeof(_type_) must be 1, 2, 4 or 8; any other
 *               size is handed to _fallback_.
 *   _fallback_  routine called as _fallback_(pSrc1, pSrc2, pDst, len).  Its
 *               result is returned when the buffer holds fewer than 16
 *               elements or pDst is not aligned to the element size.
 *   _op_        invoked as _op_(__m128i, __m128i); must yield an __m128i.
 *   _slowWay_   expression of type pstatus_t that processes exactly one
 *               element through the locals sptr1, sptr2 and dptr.  It is
 *               expected to advance all three by one element; the alignment
 *               loop only terminates if dptr moves.
 *
 * The generated function returns PRIMITIVES_SUCCESS, the first failing status
 * produced by _slowWay_, or whatever _fallback_ returned when the work was
 * delegated.
 */
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2, _type_ *pDst, UINT32 len) \
{ \
	int shifts = 0; \
	UINT32 offBeatMask; \
	const _type_ *sptr1 = pSrc1; \
	const _type_ *sptr2 = pSrc2; \
	_type_ *dptr = pDst; \
	size_t count; \
	if (len < 16) /* pointless if too small */ \
	{ \
		return _fallback_(pSrc1, pSrc2, pDst, len); \
	} \
	/* shifts is log2(sizeof(_type_)) + 1. */ \
	if (sizeof(_type_) == 1) shifts = 1; \
	else if (sizeof(_type_) == 2) shifts = 2; \
	else if (sizeof(_type_) == 4) shifts = 3; \
	else if (sizeof(_type_) == 8) shifts = 4; \
	else \
	{ \
		/* Unsupported element size: shifting by (shifts - 1) == -1 */ \
		/* would be undefined, so let the generic routine handle it. */ \
		return _fallback_(pSrc1, pSrc2, pDst, len); \
	} \
	/* Low address bits that must be clear for element alignment. */ \
	offBeatMask = (1U << (shifts - 1)) - 1; \
	if ((ULONG_PTR) pDst & offBeatMask) \
	{ \
		/* Incrementing the pointer skips over 16-byte boundary. */ \
		return _fallback_(pSrc1, pSrc2, pDst, len); \
	} \
	/* Get to the 16-byte boundary now. */ \
	while ((ULONG_PTR) dptr & 0x0f) \
	{ \
		pstatus_t status; \
		status = _slowWay_; \
		if (status != PRIMITIVES_SUCCESS) return status; \
		if (--len == 0) return PRIMITIVES_SUCCESS; \
	} \
	/* Use 4 128-bit SSE registers per source (64 bytes per iteration). */ \
	count = len >> (7-shifts); \
	len -= count << (7-shifts); \
	if (((ULONG_PTR) sptr1 & 0x0f) || ((ULONG_PTR) sptr2 & 0x0f)) \
	{ \
		/* Unaligned loads */ \
		while (count--) \
		{ \
			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
			xmm0 = _mm_lddqu_si128((const __m128i *) sptr1); \
			sptr1 += (16/sizeof(_type_)); \
			xmm1 = _mm_lddqu_si128((const __m128i *) sptr1); \
			sptr1 += (16/sizeof(_type_)); \
			xmm2 = _mm_lddqu_si128((const __m128i *) sptr1); \
			sptr1 += (16/sizeof(_type_)); \
			xmm3 = _mm_lddqu_si128((const __m128i *) sptr1); \
			sptr1 += (16/sizeof(_type_)); \
			xmm4 = _mm_lddqu_si128((const __m128i *) sptr2); \
			sptr2 += (16/sizeof(_type_)); \
			xmm5 = _mm_lddqu_si128((const __m128i *) sptr2); \
			sptr2 += (16/sizeof(_type_)); \
			xmm6 = _mm_lddqu_si128((const __m128i *) sptr2); \
			sptr2 += (16/sizeof(_type_)); \
			xmm7 = _mm_lddqu_si128((const __m128i *) sptr2); \
			sptr2 += (16/sizeof(_type_)); \
			xmm0 = _op_(xmm0, xmm4); \
			xmm1 = _op_(xmm1, xmm5); \
			xmm2 = _op_(xmm2, xmm6); \
			xmm3 = _op_(xmm3, xmm7); \
			_mm_store_si128((__m128i *) dptr, xmm0); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm1); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm2); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm3); \
			dptr += (16/sizeof(_type_)); \
		} \
	} \
	else \
	{ \
		/* Aligned loads */ \
		while (count--) \
		{ \
			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
			xmm0 = _mm_load_si128((const __m128i *) sptr1); \
			sptr1 += (16/sizeof(_type_)); \
			xmm1 = _mm_load_si128((const __m128i *) sptr1); \
			sptr1 += (16/sizeof(_type_)); \
			xmm2 = _mm_load_si128((const __m128i *) sptr1); \
			sptr1 += (16/sizeof(_type_)); \
			xmm3 = _mm_load_si128((const __m128i *) sptr1); \
			sptr1 += (16/sizeof(_type_)); \
			xmm4 = _mm_load_si128((const __m128i *) sptr2); \
			sptr2 += (16/sizeof(_type_)); \
			xmm5 = _mm_load_si128((const __m128i *) sptr2); \
			sptr2 += (16/sizeof(_type_)); \
			xmm6 = _mm_load_si128((const __m128i *) sptr2); \
			sptr2 += (16/sizeof(_type_)); \
			xmm7 = _mm_load_si128((const __m128i *) sptr2); \
			sptr2 += (16/sizeof(_type_)); \
			xmm0 = _op_(xmm0, xmm4); \
			xmm1 = _op_(xmm1, xmm5); \
			xmm2 = _op_(xmm2, xmm6); \
			xmm3 = _op_(xmm3, xmm7); \
			_mm_store_si128((__m128i *) dptr, xmm0); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm1); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm2); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm3); \
			dptr += (16/sizeof(_type_)); \
		} \
	} \
	/* Use a single 128-bit SSE register per source (16 bytes each). */ \
	count = len >> (5-shifts); \
	len -= count << (5-shifts); \
	while (count--) \
	{ \
		__m128i xmm0, xmm1; \
		xmm0 = LOAD_SI128(sptr1); sptr1 += (16/sizeof(_type_)); \
		xmm1 = LOAD_SI128(sptr2); sptr2 += (16/sizeof(_type_)); \
		xmm0 = _op_(xmm0, xmm1); \
		_mm_store_si128((__m128i *) dptr, xmm0); \
		dptr += (16/sizeof(_type_)); \
	} \
	/* Finish off the remainder one element at a time.  A failure here */ \
	/* is reported exactly like a failure in the alignment loop above. */ \
	while (len--) \
	{ \
		pstatus_t status; \
		status = _slowWay_; \
		if (status != PRIMITIVES_SUCCESS) return status; \
	} \
	return PRIMITIVES_SUCCESS; \
}
|
|
Packit |
1fb8d4 |
|
|
Packit |
1fb8d4 |
#endif /* FREERDP_LIB_PRIM_TEMPLATES_H */
|