/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* (C) 2001 by Argonne National Laboratory.
* See COPYRIGHT in top-level directory.
*/
#ifndef VECCPY_H_INCLUDED
#define VECCPY_H_INCLUDED
/* Alignment-test fragments.  Each macro expands to a "&& (...)" clause that
 * callers append to an existing boolean condition: it is satisfied only when
 * both pointers are 8-byte (ALIGN8) or 4-byte (ALIGN4) aligned, tested by
 * OR-ing the pointer bits and masking the low bits.  On platforms where
 * configure determined that int64/int32 accesses tolerate any alignment
 * (HAVE_ANY_INT64_T_ALIGNEMENT / HAVE_ANY_INT32_T_ALIGNEMENT -- spelling as
 * produced by configure), the macro expands to nothing, leaving the caller's
 * condition unchanged.  Note the leading "&&" is intentional: these are not
 * stand-alone expressions. */
#ifdef HAVE_ANY_INT64_T_ALIGNEMENT
#define MPIR_ALIGN8_TEST(p1,p2)
#else
#define MPIR_ALIGN8_TEST(p1,p2) && (((DLOOP_VOID_PTR_CAST_TO_OFFSET p1 | DLOOP_VOID_PTR_CAST_TO_OFFSET p2) & 0x7) == 0)
#endif
#ifdef HAVE_ANY_INT32_T_ALIGNEMENT
#define MPIR_ALIGN4_TEST(p1,p2)
#else
#define MPIR_ALIGN4_TEST(p1,p2) && (((DLOOP_VOID_PTR_CAST_TO_OFFSET p1 | DLOOP_VOID_PTR_CAST_TO_OFFSET p2) & 0x3) == 0)
#endif
/* MPII_COPY_FROM_VEC -- gather `count` blocks of `nelms` elements of `type`
 * from a strided source into a contiguous destination.
 *
 *   src    - (char *) lvalue; advanced past the region consumed
 *   dest   - (char *) lvalue; advanced past the bytes written
 *   stride - distance in BYTES between consecutive source blocks
 *   type   - element C type (e.g. int, double)
 *   nelms  - elements per block; 0 means "skip": only src is advanced
 *   count  - number of blocks
 *
 * Dispatches to the ALIGNED variant (with stride converted from bytes to
 * element units) when the byte stride is a multiple of sizeof(type), and to
 * the UNALIGNED variant otherwise.  The modulo is done in DLOOP_Offset
 * arithmetic (matching MPII_COPY_TO_VEC) so a signed 64-bit stride is not
 * converted to size_t.  Wrapped in do/while(0) so an invocation followed by
 * a semicolon is a single statement, safe in unbraced if/else. */
#define MPII_COPY_FROM_VEC(src,dest,stride,type,nelms,count) \
do { \
    if (!nelms) { \
        /* zero-element blocks: advance src past count strides, nothing copied */ \
        src = (char*) DLOOP_OFFSET_CAST_TO_VOID_PTR \
            ((DLOOP_VOID_PTR_CAST_TO_OFFSET (src)) + \
             ((DLOOP_Offset) count * (DLOOP_Offset) stride)); \
    } \
    else if (stride % (DLOOP_Offset) sizeof(type)) { \
        MPII_COPY_FROM_VEC_UNALIGNED(src,dest,stride,type,nelms,count); \
    } \
    else { \
        MPII_COPY_FROM_VEC_ALIGNED(src,dest,stride/(DLOOP_Offset)sizeof(type),type,nelms,count); \
    } \
} while (0)
/* MPII_COPY_TO_VEC -- scatter `count` blocks of `nelms` elements of `type`
 * from a contiguous source into a strided destination.
 *
 *   src    - (char *) lvalue; advanced past the bytes consumed
 *   dest   - (char *) lvalue; advanced past the region written
 *   stride - distance in BYTES between consecutive destination blocks
 *   type   - element C type (e.g. int, double)
 *   nelms  - elements per block; 0 means "skip": only dest is advanced
 *   count  - number of blocks
 *
 * Dispatches to the ALIGNED variant (with stride converted from bytes to
 * element units) when the byte stride is a multiple of sizeof(type), and to
 * the UNALIGNED variant otherwise.  Wrapped in do/while(0) so an invocation
 * followed by a semicolon is a single statement, safe in unbraced if/else. */
#define MPII_COPY_TO_VEC(src,dest,stride,type,nelms,count) \
do { \
    if (!nelms) { \
        /* zero-element blocks: advance dest past count strides, nothing copied */ \
        dest = (char*) DLOOP_OFFSET_CAST_TO_VOID_PTR \
            ((DLOOP_VOID_PTR_CAST_TO_OFFSET (dest)) + \
             ((DLOOP_Offset) count * (DLOOP_Offset) stride)); \
    } \
    else if (stride % (DLOOP_Offset) sizeof(type)) { \
        MPII_COPY_TO_VEC_UNALIGNED(src,dest,stride,type,nelms,count); \
    } \
    else { \
        MPII_COPY_TO_VEC_ALIGNED(src,dest,stride/(DLOOP_Offset)sizeof(type),type,nelms,count); \
    } \
} while (0)
/* MPII_COPY_FROM_VEC_ALIGNED -- gather helper for MPII_COPY_FROM_VEC.
 * Copies `count` blocks of `nelms` elements of `type` from a strided source
 * into a contiguous destination using element-typed loads; the copy loop is
 * hand-unrolled for the common cases nelms == 1..8 and falls back to an
 * 8-way unrolled inner loop plus remainder for larger nelms.
 *
 *   src    - (char *) lvalue; advanced past the region consumed
 *   dest   - (char *) lvalue; advanced past the elements written
 *   stride - distance between source blocks in ELEMENTS (the dispatcher
 *            passes byte_stride / sizeof(type))
 *   nelms  - elements per block (must be nonzero)
 *   count  - number of blocks
 *
 * The asserts bound stride/count values to INT_MAX in debug builds; the loop
 * counters are unsigned long and are assigned without an (int) cast so the
 * values are not silently truncated when asserts are compiled out. */
#define MPII_COPY_FROM_VEC_ALIGNED(src,dest,stride,type,nelms,count) \
{ \
    type * l_src = (type *) src, * l_dest = (type *) dest; \
    type * tmp_src = l_src; \
    register int k; \
    register unsigned long _i, j; \
    unsigned long total_count = count * nelms; \
    const DLOOP_Offset l_stride = stride; \
 \
    DLOOP_Assert(stride <= INT_MAX); \
    DLOOP_Assert(total_count <= INT_MAX); \
    DLOOP_Assert(nelms <= INT_MAX); \
    if (nelms == 1) { \
        for (_i = total_count; _i; _i--) { \
            *l_dest++ = *l_src; \
            l_src += l_stride; \
        } \
    } \
    else if (nelms == 2) { \
        for (_i = total_count; _i; _i -= 2) { \
            *l_dest++ = l_src[0]; \
            *l_dest++ = l_src[1]; \
            l_src += l_stride; \
        } \
    } \
    else if (nelms == 3) { \
        for (_i = total_count; _i; _i -= 3) { \
            *l_dest++ = l_src[0]; \
            *l_dest++ = l_src[1]; \
            *l_dest++ = l_src[2]; \
            l_src += l_stride; \
        } \
    } \
    else if (nelms == 4) { \
        for (_i = total_count; _i; _i -= 4) { \
            *l_dest++ = l_src[0]; \
            *l_dest++ = l_src[1]; \
            *l_dest++ = l_src[2]; \
            *l_dest++ = l_src[3]; \
            l_src += l_stride; \
        } \
    } \
    else if (nelms == 5) { \
        for (_i = total_count; _i; _i -= 5) { \
            *l_dest++ = l_src[0]; \
            *l_dest++ = l_src[1]; \
            *l_dest++ = l_src[2]; \
            *l_dest++ = l_src[3]; \
            *l_dest++ = l_src[4]; \
            l_src += l_stride; \
        } \
    } \
    else if (nelms == 6) { \
        for (_i = total_count; _i; _i -= 6) { \
            *l_dest++ = l_src[0]; \
            *l_dest++ = l_src[1]; \
            *l_dest++ = l_src[2]; \
            *l_dest++ = l_src[3]; \
            *l_dest++ = l_src[4]; \
            *l_dest++ = l_src[5]; \
            l_src += l_stride; \
        } \
    } \
    else if (nelms == 7) { \
        for (_i = total_count; _i; _i -= 7) { \
            *l_dest++ = l_src[0]; \
            *l_dest++ = l_src[1]; \
            *l_dest++ = l_src[2]; \
            *l_dest++ = l_src[3]; \
            *l_dest++ = l_src[4]; \
            *l_dest++ = l_src[5]; \
            *l_dest++ = l_src[6]; \
            l_src += l_stride; \
        } \
    } \
    else if (nelms == 8) { \
        for (_i = total_count; _i; _i -= 8) { \
            *l_dest++ = l_src[0]; \
            *l_dest++ = l_src[1]; \
            *l_dest++ = l_src[2]; \
            *l_dest++ = l_src[3]; \
            *l_dest++ = l_src[4]; \
            *l_dest++ = l_src[5]; \
            *l_dest++ = l_src[6]; \
            *l_dest++ = l_src[7]; \
            l_src += l_stride; \
        } \
    } \
    else { \
        /* general case: 8-way unrolled inner copy plus remainder loop */ \
        _i = total_count; \
        while (_i) { \
            tmp_src = l_src; \
            j = nelms; \
            while (j >= 8) { \
                *l_dest++ = tmp_src[0]; \
                *l_dest++ = tmp_src[1]; \
                *l_dest++ = tmp_src[2]; \
                *l_dest++ = tmp_src[3]; \
                *l_dest++ = tmp_src[4]; \
                *l_dest++ = tmp_src[5]; \
                *l_dest++ = tmp_src[6]; \
                *l_dest++ = tmp_src[7]; \
                j -= 8; \
                tmp_src += 8; \
            } \
            for (k = 0; k < j; k++) { \
                *l_dest++ = *tmp_src++; \
            } \
            l_src += l_stride; \
            _i -= nelms; \
        } \
    } \
    /* report final positions back to the caller's pointers */ \
    src = (char *) l_src; \
    dest = (char *) l_dest; \
}
/* MPII_COPY_FROM_VEC_UNALIGNED -- gather helper for MPII_COPY_FROM_VEC used
 * when the byte stride is not a multiple of sizeof(type).  Identical in
 * structure to the ALIGNED variant (hand-unrolled for nelms == 1..8, 8-way
 * unrolled general case) but the source pointer is advanced through char *
 * arithmetic because the stride is in BYTES.
 *
 *   src    - (char *) lvalue; advanced past the region consumed
 *   dest   - (char *) lvalue; advanced past the elements written
 *   stride - distance between source blocks in BYTES
 *   nelms  - elements per block (must be nonzero)
 *   count  - number of blocks
 *
 * NOTE(review): the element loads are still type-typed even though the
 * blocks may start at any byte offset; presumably configure only selects
 * this path on targets that tolerate such access -- confirm.
 * The asserts bound the values to INT_MAX in debug builds; the loop
 * counters are assigned without (int) casts so values are not silently
 * truncated when asserts are compiled out. */
#define MPII_COPY_FROM_VEC_UNALIGNED(src,dest,stride,type,nelms,count) \
{ \
    type * l_src = (type *) src, * l_dest = (type *) dest; \
    type * tmp_src = l_src; \
    register int k; \
    register unsigned long _i, j; \
    unsigned long total_count = count * nelms; \
    const DLOOP_Offset l_stride = stride; \
 \
    DLOOP_Assert(stride <= INT_MAX); \
    DLOOP_Assert(total_count <= INT_MAX); \
    DLOOP_Assert(nelms <= INT_MAX); \
    if (nelms == 1) { \
        for (_i = total_count; _i; _i--) { \
            *l_dest++ = *l_src; \
            l_src = (type *) ((char *) l_src + l_stride); \
        } \
    } \
    else if (nelms == 2) { \
        for (_i = total_count; _i; _i -= 2) { \
            *l_dest++ = l_src[0]; \
            *l_dest++ = l_src[1]; \
            l_src = (type *) ((char *) l_src + l_stride); \
        } \
    } \
    else if (nelms == 3) { \
        for (_i = total_count; _i; _i -= 3) { \
            *l_dest++ = l_src[0]; \
            *l_dest++ = l_src[1]; \
            *l_dest++ = l_src[2]; \
            l_src = (type *) ((char *) l_src + l_stride); \
        } \
    } \
    else if (nelms == 4) { \
        for (_i = total_count; _i; _i -= 4) { \
            *l_dest++ = l_src[0]; \
            *l_dest++ = l_src[1]; \
            *l_dest++ = l_src[2]; \
            *l_dest++ = l_src[3]; \
            l_src = (type *) ((char *) l_src + l_stride); \
        } \
    } \
    else if (nelms == 5) { \
        for (_i = total_count; _i; _i -= 5) { \
            *l_dest++ = l_src[0]; \
            *l_dest++ = l_src[1]; \
            *l_dest++ = l_src[2]; \
            *l_dest++ = l_src[3]; \
            *l_dest++ = l_src[4]; \
            l_src = (type *) ((char *) l_src + l_stride); \
        } \
    } \
    else if (nelms == 6) { \
        for (_i = total_count; _i; _i -= 6) { \
            *l_dest++ = l_src[0]; \
            *l_dest++ = l_src[1]; \
            *l_dest++ = l_src[2]; \
            *l_dest++ = l_src[3]; \
            *l_dest++ = l_src[4]; \
            *l_dest++ = l_src[5]; \
            l_src = (type *) ((char *) l_src + l_stride); \
        } \
    } \
    else if (nelms == 7) { \
        for (_i = total_count; _i; _i -= 7) { \
            *l_dest++ = l_src[0]; \
            *l_dest++ = l_src[1]; \
            *l_dest++ = l_src[2]; \
            *l_dest++ = l_src[3]; \
            *l_dest++ = l_src[4]; \
            *l_dest++ = l_src[5]; \
            *l_dest++ = l_src[6]; \
            l_src = (type *) ((char *) l_src + l_stride); \
        } \
    } \
    else if (nelms == 8) { \
        for (_i = total_count; _i; _i -= 8) { \
            *l_dest++ = l_src[0]; \
            *l_dest++ = l_src[1]; \
            *l_dest++ = l_src[2]; \
            *l_dest++ = l_src[3]; \
            *l_dest++ = l_src[4]; \
            *l_dest++ = l_src[5]; \
            *l_dest++ = l_src[6]; \
            *l_dest++ = l_src[7]; \
            l_src = (type *) ((char *) l_src + l_stride); \
        } \
    } \
    else { \
        /* general case: 8-way unrolled inner copy plus remainder loop */ \
        _i = total_count; \
        while (_i) { \
            tmp_src = l_src; \
            j = nelms; \
            while (j >= 8) { \
                *l_dest++ = tmp_src[0]; \
                *l_dest++ = tmp_src[1]; \
                *l_dest++ = tmp_src[2]; \
                *l_dest++ = tmp_src[3]; \
                *l_dest++ = tmp_src[4]; \
                *l_dest++ = tmp_src[5]; \
                *l_dest++ = tmp_src[6]; \
                *l_dest++ = tmp_src[7]; \
                j -= 8; \
                tmp_src += 8; \
            } \
            for (k = 0; k < j; k++) { \
                *l_dest++ = *tmp_src++; \
            } \
            l_src = (type *) ((char *) l_src + l_stride); \
            _i -= nelms; \
        } \
    } \
    /* report final positions back to the caller's pointers */ \
    src = (char *) l_src; \
    dest = (char *) l_dest; \
}
/* MPII_COPY_TO_VEC_ALIGNED -- scatter helper for MPII_COPY_TO_VEC.
 * Copies `count` blocks of `nelms` elements of `type` from a contiguous
 * source into a strided destination using element-typed stores; the copy
 * loop is hand-unrolled for nelms == 1..8 with an 8-way unrolled general
 * case for larger nelms.
 *
 *   src    - (char *) lvalue; advanced past the elements consumed
 *   dest   - (char *) lvalue; advanced past the region written
 *   stride - distance between destination blocks in ELEMENTS (the
 *            dispatcher passes byte_stride / sizeof(type))
 *   nelms  - elements per block (must be nonzero)
 *   count  - number of blocks
 *
 * The asserts bound stride/count values to INT_MAX in debug builds; the loop
 * counters are unsigned long and are assigned without an (int) cast so the
 * values are not silently truncated when asserts are compiled out. */
#define MPII_COPY_TO_VEC_ALIGNED(src,dest,stride,type,nelms,count) \
{ \
    type * l_src = (type *) src, * l_dest = (type *) dest; \
    type * tmp_dest = l_dest; \
    register int k; \
    register unsigned long _i, j; \
    unsigned long total_count = count * nelms; \
    const DLOOP_Offset l_stride = stride; \
 \
    DLOOP_Assert(stride <= INT_MAX); \
    DLOOP_Assert(total_count <= INT_MAX); \
    DLOOP_Assert(nelms <= INT_MAX); \
    if (nelms == 1) { \
        for (_i = total_count; _i; _i--) { \
            *l_dest = *l_src++; \
            l_dest += l_stride; \
        } \
    } \
    else if (nelms == 2) { \
        for (_i = total_count; _i; _i -= 2) { \
            l_dest[0] = *l_src++; \
            l_dest[1] = *l_src++; \
            l_dest += l_stride; \
        } \
    } \
    else if (nelms == 3) { \
        for (_i = total_count; _i; _i -= 3) { \
            l_dest[0] = *l_src++; \
            l_dest[1] = *l_src++; \
            l_dest[2] = *l_src++; \
            l_dest += l_stride; \
        } \
    } \
    else if (nelms == 4) { \
        for (_i = total_count; _i; _i -= 4) { \
            l_dest[0] = *l_src++; \
            l_dest[1] = *l_src++; \
            l_dest[2] = *l_src++; \
            l_dest[3] = *l_src++; \
            l_dest += l_stride; \
        } \
    } \
    else if (nelms == 5) { \
        for (_i = total_count; _i; _i -= 5) { \
            l_dest[0] = *l_src++; \
            l_dest[1] = *l_src++; \
            l_dest[2] = *l_src++; \
            l_dest[3] = *l_src++; \
            l_dest[4] = *l_src++; \
            l_dest += l_stride; \
        } \
    } \
    else if (nelms == 6) { \
        for (_i = total_count; _i; _i -= 6) { \
            l_dest[0] = *l_src++; \
            l_dest[1] = *l_src++; \
            l_dest[2] = *l_src++; \
            l_dest[3] = *l_src++; \
            l_dest[4] = *l_src++; \
            l_dest[5] = *l_src++; \
            l_dest += l_stride; \
        } \
    } \
    else if (nelms == 7) { \
        for (_i = total_count; _i; _i -= 7) { \
            l_dest[0] = *l_src++; \
            l_dest[1] = *l_src++; \
            l_dest[2] = *l_src++; \
            l_dest[3] = *l_src++; \
            l_dest[4] = *l_src++; \
            l_dest[5] = *l_src++; \
            l_dest[6] = *l_src++; \
            l_dest += l_stride; \
        } \
    } \
    else if (nelms == 8) { \
        for (_i = total_count; _i; _i -= 8) { \
            l_dest[0] = *l_src++; \
            l_dest[1] = *l_src++; \
            l_dest[2] = *l_src++; \
            l_dest[3] = *l_src++; \
            l_dest[4] = *l_src++; \
            l_dest[5] = *l_src++; \
            l_dest[6] = *l_src++; \
            l_dest[7] = *l_src++; \
            l_dest += l_stride; \
        } \
    } \
    else { \
        /* general case: 8-way unrolled inner copy plus remainder loop */ \
        _i = total_count; \
        while (_i) { \
            tmp_dest = l_dest; \
            j = nelms; \
            while (j >= 8) { \
                tmp_dest[0] = *l_src++; \
                tmp_dest[1] = *l_src++; \
                tmp_dest[2] = *l_src++; \
                tmp_dest[3] = *l_src++; \
                tmp_dest[4] = *l_src++; \
                tmp_dest[5] = *l_src++; \
                tmp_dest[6] = *l_src++; \
                tmp_dest[7] = *l_src++; \
                j -= 8; \
                tmp_dest += 8; \
            } \
            for (k = 0; k < j; k++) { \
                *tmp_dest++ = *l_src++; \
            } \
            l_dest += l_stride; \
            _i -= nelms; \
        } \
    } \
    /* report final positions back to the caller's pointers */ \
    src = (char *) l_src; \
    dest = (char *) l_dest; \
}
/* MPII_COPY_TO_VEC_UNALIGNED -- scatter helper for MPII_COPY_TO_VEC used
 * when the byte stride is not a multiple of sizeof(type).  Identical in
 * structure to the ALIGNED variant (hand-unrolled for nelms == 1..8, 8-way
 * unrolled general case) but the destination pointer is advanced through
 * char * arithmetic because the stride is in BYTES.
 *
 *   src    - (char *) lvalue; advanced past the elements consumed
 *   dest   - (char *) lvalue; advanced past the region written
 *   stride - distance between destination blocks in BYTES
 *   nelms  - elements per block (must be nonzero)
 *   count  - number of blocks
 *
 * NOTE(review): the element stores are still type-typed even though the
 * blocks may start at any byte offset; presumably configure only selects
 * this path on targets that tolerate such access -- confirm.
 * The asserts bound the values to INT_MAX in debug builds; the loop
 * counters are assigned without (int) casts so values are not silently
 * truncated when asserts are compiled out. */
#define MPII_COPY_TO_VEC_UNALIGNED(src,dest,stride,type,nelms,count) \
{ \
    type * l_src = (type *) src, * l_dest = (type *) dest; \
    type * tmp_dest = l_dest; \
    register int k; \
    register unsigned long _i, j; \
    unsigned long total_count = count * nelms; \
    const DLOOP_Offset l_stride = stride; \
 \
    DLOOP_Assert(stride <= INT_MAX); \
    DLOOP_Assert(total_count <= INT_MAX); \
    DLOOP_Assert(nelms <= INT_MAX); \
    if (nelms == 1) { \
        for (_i = total_count; _i; _i--) { \
            *l_dest = *l_src++; \
            l_dest = (type *) ((char *) l_dest + l_stride); \
        } \
    } \
    else if (nelms == 2) { \
        for (_i = total_count; _i; _i -= 2) { \
            l_dest[0] = *l_src++; \
            l_dest[1] = *l_src++; \
            l_dest = (type *) ((char *) l_dest + l_stride); \
        } \
    } \
    else if (nelms == 3) { \
        for (_i = total_count; _i; _i -= 3) { \
            l_dest[0] = *l_src++; \
            l_dest[1] = *l_src++; \
            l_dest[2] = *l_src++; \
            l_dest = (type *) ((char *) l_dest + l_stride); \
        } \
    } \
    else if (nelms == 4) { \
        for (_i = total_count; _i; _i -= 4) { \
            l_dest[0] = *l_src++; \
            l_dest[1] = *l_src++; \
            l_dest[2] = *l_src++; \
            l_dest[3] = *l_src++; \
            l_dest = (type *) ((char *) l_dest + l_stride); \
        } \
    } \
    else if (nelms == 5) { \
        for (_i = total_count; _i; _i -= 5) { \
            l_dest[0] = *l_src++; \
            l_dest[1] = *l_src++; \
            l_dest[2] = *l_src++; \
            l_dest[3] = *l_src++; \
            l_dest[4] = *l_src++; \
            l_dest = (type *) ((char *) l_dest + l_stride); \
        } \
    } \
    else if (nelms == 6) { \
        for (_i = total_count; _i; _i -= 6) { \
            l_dest[0] = *l_src++; \
            l_dest[1] = *l_src++; \
            l_dest[2] = *l_src++; \
            l_dest[3] = *l_src++; \
            l_dest[4] = *l_src++; \
            l_dest[5] = *l_src++; \
            l_dest = (type *) ((char *) l_dest + l_stride); \
        } \
    } \
    else if (nelms == 7) { \
        for (_i = total_count; _i; _i -= 7) { \
            l_dest[0] = *l_src++; \
            l_dest[1] = *l_src++; \
            l_dest[2] = *l_src++; \
            l_dest[3] = *l_src++; \
            l_dest[4] = *l_src++; \
            l_dest[5] = *l_src++; \
            l_dest[6] = *l_src++; \
            l_dest = (type *) ((char *) l_dest + l_stride); \
        } \
    } \
    else if (nelms == 8) { \
        for (_i = total_count; _i; _i -= 8) { \
            l_dest[0] = *l_src++; \
            l_dest[1] = *l_src++; \
            l_dest[2] = *l_src++; \
            l_dest[3] = *l_src++; \
            l_dest[4] = *l_src++; \
            l_dest[5] = *l_src++; \
            l_dest[6] = *l_src++; \
            l_dest[7] = *l_src++; \
            l_dest = (type *) ((char *) l_dest + l_stride); \
        } \
    } \
    else { \
        /* general case: 8-way unrolled inner copy plus remainder loop */ \
        _i = total_count; \
        while (_i) { \
            tmp_dest = l_dest; \
            j = nelms; \
            while (j >= 8) { \
                tmp_dest[0] = *l_src++; \
                tmp_dest[1] = *l_src++; \
                tmp_dest[2] = *l_src++; \
                tmp_dest[3] = *l_src++; \
                tmp_dest[4] = *l_src++; \
                tmp_dest[5] = *l_src++; \
                tmp_dest[6] = *l_src++; \
                tmp_dest[7] = *l_src++; \
                j -= 8; \
                tmp_dest += 8; \
            } \
            for (k = 0; k < j; k++) { \
                *tmp_dest++ = *l_src++; \
            } \
            l_dest = (type *) ((char *) l_dest + l_stride); \
            _i -= nelms; \
        } \
    } \
    /* report final positions back to the caller's pointers */ \
    src = (char *) l_src; \
    dest = (char *) l_dest; \
}
#endif /* VECCPY_H_INCLUDED */
/*
* Local variables:
* c-indent-tabs-mode: nil
* End:
*/