Blob Blame History Raw
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */

/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#ifndef VECCPY_H_INCLUDED
#define VECCPY_H_INCLUDED

#ifdef HAVE_ANY_INT64_T_ALIGNEMENT
#define MPIR_ALIGN8_TEST(p1,p2)
#else
#define MPIR_ALIGN8_TEST(p1,p2) && (((DLOOP_VOID_PTR_CAST_TO_OFFSET p1 | DLOOP_VOID_PTR_CAST_TO_OFFSET p2) & 0x7) == 0)
#endif

#ifdef HAVE_ANY_INT32_T_ALIGNEMENT
#define MPIR_ALIGN4_TEST(p1,p2)
#else
#define MPIR_ALIGN4_TEST(p1,p2) && (((DLOOP_VOID_PTR_CAST_TO_OFFSET p1 | DLOOP_VOID_PTR_CAST_TO_OFFSET p2) & 0x3) == 0)
#endif

#define MPII_COPY_FROM_VEC(src,dest,stride,type,nelms,count)            \
    {                                                                   \
        if (!nelms) {                                                   \
            src = (char*) DLOOP_OFFSET_CAST_TO_VOID_PTR                 \
            ((DLOOP_VOID_PTR_CAST_TO_OFFSET (src)) +                    \
             ((DLOOP_Offset) count * (DLOOP_Offset) stride));           \
        }                                                               \
        else if (stride % sizeof(type)) {                               \
            MPII_COPY_FROM_VEC_UNALIGNED(src,dest,stride,type,nelms,count); \
        }                                                               \
        else {                                                          \
            MPII_COPY_FROM_VEC_ALIGNED(src,dest,stride/(DLOOP_Offset)sizeof(type),type,nelms,count); \
        }                                                               \
    }

#define MPII_COPY_TO_VEC(src,dest,stride,type,nelms,count)              \
    {                                                                   \
        if (!nelms) {                                                   \
            dest = (char*) DLOOP_OFFSET_CAST_TO_VOID_PTR                \
            ((DLOOP_VOID_PTR_CAST_TO_OFFSET (dest)) +                   \
             ((DLOOP_Offset) count * (DLOOP_Offset) stride));           \
        }                                                               \
        else if (stride % (DLOOP_Offset) sizeof(type)) {                \
            MPII_COPY_TO_VEC_UNALIGNED(src,dest,stride,type,nelms,count); \
        }                                                               \
        else {                                                          \
            MPII_COPY_TO_VEC_ALIGNED(src,dest,stride/(DLOOP_Offset)sizeof(type),type,nelms,count); \
        }                                                               \
    }

#define MPII_COPY_FROM_VEC_ALIGNED(src,dest,stride,type,nelms,count)    \
    {                                                                   \
        type * l_src = (type *) src, * l_dest = (type *) dest;          \
        type * tmp_src = l_src;                                         \
        register int k;                                                 \
        register unsigned long _i, j;                                   \
        unsigned long total_count = count * nelms;                      \
        const DLOOP_Offset l_stride = stride;                           \
                                                                        \
        DLOOP_Assert(stride <= INT_MAX);                                \
        DLOOP_Assert(total_count <= INT_MAX);                           \
        DLOOP_Assert(nelms <= INT_MAX);                                 \
        if (nelms == 1) {                                               \
            for (_i = (int)total_count; _i; _i--) {                     \
                *l_dest++ = *l_src;                                     \
                l_src += l_stride;                                      \
            }                                                           \
        }                                                               \
        else if (nelms == 2) {                                          \
            for (_i = (int)total_count; _i; _i -= 2) {                  \
                *l_dest++ = l_src[0];                                   \
                *l_dest++ = l_src[1];                                   \
                l_src += l_stride;                                      \
            }                                                           \
        }                                                               \
        else if (nelms == 3) {                                          \
            for (_i = (int)total_count; _i; _i -= 3) {                  \
                *l_dest++ = l_src[0];                                   \
                *l_dest++ = l_src[1];                                   \
                *l_dest++ = l_src[2];                                   \
                l_src += l_stride;                                      \
            }                                                           \
        }                                                               \
        else if (nelms == 4) {                                          \
            for (_i = (int)total_count; _i; _i -= 4) {                  \
                *l_dest++ = l_src[0];                                   \
                *l_dest++ = l_src[1];                                   \
                *l_dest++ = l_src[2];                                   \
                *l_dest++ = l_src[3];                                   \
                l_src += l_stride;                                      \
            }                                                           \
        }                                                               \
        else if (nelms == 5) {                                          \
            for (_i = (int)total_count; _i; _i -= 5) {                  \
                *l_dest++ = l_src[0];                                   \
                *l_dest++ = l_src[1];                                   \
                *l_dest++ = l_src[2];                                   \
                *l_dest++ = l_src[3];                                   \
                *l_dest++ = l_src[4];                                   \
                l_src += l_stride;                                      \
            }                                                           \
        }                                                               \
        else if (nelms == 6) {                                          \
            for (_i = (int)total_count; _i; _i -= 6) {                  \
                *l_dest++ = l_src[0];                                   \
                *l_dest++ = l_src[1];                                   \
                *l_dest++ = l_src[2];                                   \
                *l_dest++ = l_src[3];                                   \
                *l_dest++ = l_src[4];                                   \
                *l_dest++ = l_src[5];                                   \
                l_src += l_stride;                                      \
            }                                                           \
        }                                                               \
        else if (nelms == 7) {                                          \
            for (_i = (int)total_count; _i; _i -= 7) {                  \
                *l_dest++ = l_src[0];                                   \
                *l_dest++ = l_src[1];                                   \
                *l_dest++ = l_src[2];                                   \
                *l_dest++ = l_src[3];                                   \
                *l_dest++ = l_src[4];                                   \
                *l_dest++ = l_src[5];                                   \
                *l_dest++ = l_src[6];                                   \
                l_src += l_stride;                                      \
            }                                                           \
        }                                                               \
        else if (nelms == 8) {                                          \
            for (_i = (int)total_count; _i; _i -= 8) {                  \
                *l_dest++ = l_src[0];                                   \
                *l_dest++ = l_src[1];                                   \
                *l_dest++ = l_src[2];                                   \
                *l_dest++ = l_src[3];                                   \
                *l_dest++ = l_src[4];                                   \
                *l_dest++ = l_src[5];                                   \
                *l_dest++ = l_src[6];                                   \
                *l_dest++ = l_src[7];                                   \
                l_src += l_stride;                                      \
            }                                                           \
        }                                                               \
        else {                                                          \
            _i = (int)total_count;                                      \
            while (_i) {                                                \
                tmp_src = l_src;                                        \
                j = (int)nelms;                                         \
                while (j >= 8) {                                        \
                    *l_dest++ = tmp_src[0];                             \
                    *l_dest++ = tmp_src[1];                             \
                    *l_dest++ = tmp_src[2];                             \
                    *l_dest++ = tmp_src[3];                             \
                    *l_dest++ = tmp_src[4];                             \
                    *l_dest++ = tmp_src[5];                             \
                    *l_dest++ = tmp_src[6];                             \
                    *l_dest++ = tmp_src[7];                             \
                    j -= 8;                                             \
                    tmp_src += 8;                                       \
                }                                                       \
                for (k = 0; k < j; k++) {                               \
                    *l_dest++ = *tmp_src++;                             \
                }                                                       \
                l_src += l_stride;                                      \
                _i -= nelms;                                            \
            }                                                           \
        }                                                               \
        src = (char *) l_src;                                           \
        dest = (char *) l_dest;                                         \
    }

#define MPII_COPY_FROM_VEC_UNALIGNED(src,dest,stride,type,nelms,count)  \
    {                                                                   \
        type * l_src = (type *) src, * l_dest = (type *) dest;          \
        type * tmp_src = l_src;                                         \
        register int k;                                                 \
        register unsigned long _i, j, total_count = count * nelms;      \
        const DLOOP_Offset l_stride = stride;                           \
                                                                        \
        DLOOP_Assert(stride <= INT_MAX);                                \
        DLOOP_Assert(total_count <= INT_MAX);                           \
        DLOOP_Assert(nelms <= INT_MAX);                                 \
        if (nelms == 1) {                                               \
            for (_i = (int)total_count; _i; _i--) {                     \
                *l_dest++ = *l_src;                                     \
                l_src = (type *) ((char *) l_src + l_stride);           \
            }                                                           \
        }                                                               \
        else if (nelms == 2) {                                          \
            for (_i = (int)total_count; _i; _i -= 2) {                  \
                *l_dest++ = l_src[0];                                   \
                *l_dest++ = l_src[1];                                   \
                l_src = (type *) ((char *) l_src + l_stride);           \
            }                                                           \
        }                                                               \
        else if (nelms == 3) {                                          \
            for (_i = (int)total_count; _i; _i -= 3) {                  \
                *l_dest++ = l_src[0];                                   \
                *l_dest++ = l_src[1];                                   \
                *l_dest++ = l_src[2];                                   \
                l_src = (type *) ((char *) l_src + l_stride);           \
            }                                                           \
        }                                                               \
        else if (nelms == 4) {                                          \
            for (_i = (int)total_count; _i; _i -= 4) {                  \
                *l_dest++ = l_src[0];                                   \
                *l_dest++ = l_src[1];                                   \
                *l_dest++ = l_src[2];                                   \
                *l_dest++ = l_src[3];                                   \
                l_src = (type *) ((char *) l_src + l_stride);           \
            }                                                           \
        }                                                               \
        else if (nelms == 5) {                                          \
            for (_i = (int)total_count; _i; _i -= 5) {                  \
                *l_dest++ = l_src[0];                                   \
                *l_dest++ = l_src[1];                                   \
                *l_dest++ = l_src[2];                                   \
                *l_dest++ = l_src[3];                                   \
                *l_dest++ = l_src[4];                                   \
                l_src = (type *) ((char *) l_src + l_stride);           \
            }                                                           \
        }                                                               \
        else if (nelms == 6) {                                          \
            for (_i = (int)total_count; _i; _i -= 6) {                  \
                *l_dest++ = l_src[0];                                   \
                *l_dest++ = l_src[1];                                   \
                *l_dest++ = l_src[2];                                   \
                *l_dest++ = l_src[3];                                   \
                *l_dest++ = l_src[4];                                   \
                *l_dest++ = l_src[5];                                   \
                l_src = (type *) ((char *) l_src + l_stride);           \
            }                                                           \
        }                                                               \
        else if (nelms == 7) {                                          \
            for (_i = (int)total_count; _i; _i -= 7) {                  \
                *l_dest++ = l_src[0];                                   \
                *l_dest++ = l_src[1];                                   \
                *l_dest++ = l_src[2];                                   \
                *l_dest++ = l_src[3];                                   \
                *l_dest++ = l_src[4];                                   \
                *l_dest++ = l_src[5];                                   \
                *l_dest++ = l_src[6];                                   \
                l_src = (type *) ((char *) l_src + l_stride);           \
            }                                                           \
        }                                                               \
        else if (nelms == 8) {                                          \
            for (_i = (int)total_count; _i; _i -= 8) {                  \
                *l_dest++ = l_src[0];                                   \
                *l_dest++ = l_src[1];                                   \
                *l_dest++ = l_src[2];                                   \
                *l_dest++ = l_src[3];                                   \
                *l_dest++ = l_src[4];                                   \
                *l_dest++ = l_src[5];                                   \
                *l_dest++ = l_src[6];                                   \
                *l_dest++ = l_src[7];                                   \
                l_src = (type *) ((char *) l_src + l_stride);           \
            }                                                           \
        }                                                               \
        else {                                                          \
            _i = (int)total_count;                                      \
            while (_i) {                                                \
                tmp_src = l_src;                                        \
                j = (int)nelms;                                         \
                while (j >= 8) {                                        \
                    *l_dest++ = tmp_src[0];                             \
                    *l_dest++ = tmp_src[1];                             \
                    *l_dest++ = tmp_src[2];                             \
                    *l_dest++ = tmp_src[3];                             \
                    *l_dest++ = tmp_src[4];                             \
                    *l_dest++ = tmp_src[5];                             \
                    *l_dest++ = tmp_src[6];                             \
                    *l_dest++ = tmp_src[7];                             \
                    j -= 8;                                             \
                    tmp_src += 8;                                       \
                }                                                       \
                for (k = 0; k < j; k++) {                               \
                    *l_dest++ = *tmp_src++;                             \
                }                                                       \
                l_src = (type *) ((char *) l_src + l_stride);           \
                _i -= nelms;                                            \
            }                                                           \
        }                                                               \
        src = (char *) l_src;                                           \
        dest = (char *) l_dest;                                         \
    }

#define MPII_COPY_TO_VEC_ALIGNED(src,dest,stride,type,nelms,count)      \
    {                                                                   \
        type * l_src = (type *) src, * l_dest = (type *) dest;          \
        type * tmp_dest = l_dest;                                       \
        register int k;                                                 \
        register unsigned long _i, j;                                   \
        unsigned long total_count = count * nelms;                      \
        const DLOOP_Offset l_stride = stride;                           \
                                                                        \
        DLOOP_Assert(stride <= INT_MAX);                                \
        DLOOP_Assert(total_count <= INT_MAX);                           \
        DLOOP_Assert(nelms <= INT_MAX);                                 \
        if (nelms == 1) {                                               \
            for (_i = (int)total_count; _i; _i--) {                     \
                *l_dest = *l_src++;                                     \
                l_dest += l_stride;                                     \
            }                                                           \
        }                                                               \
        else if (nelms == 2) {                                          \
            for (_i = (int)total_count; _i; _i -= 2) {                  \
                l_dest[0] = *l_src++;                                   \
                l_dest[1] = *l_src++;                                   \
                l_dest += l_stride;                                     \
            }                                                           \
        }                                                               \
        else if (nelms == 3) {                                          \
            for (_i = (int)total_count; _i; _i -= 3) {                  \
                l_dest[0] = *l_src++;                                   \
                l_dest[1] = *l_src++;                                   \
                l_dest[2] = *l_src++;                                   \
                l_dest += l_stride;                                     \
            }                                                           \
        }                                                               \
        else if (nelms == 4) {                                          \
            for (_i = (int)total_count; _i; _i -= 4) {                  \
                l_dest[0] = *l_src++;                                   \
                l_dest[1] = *l_src++;                                   \
                l_dest[2] = *l_src++;                                   \
                l_dest[3] = *l_src++;                                   \
                l_dest += l_stride;                                     \
            }                                                           \
        }                                                               \
        else if (nelms == 5) {                                          \
            for (_i = (int)total_count; _i; _i -= 5) {                  \
                l_dest[0] = *l_src++;                                   \
                l_dest[1] = *l_src++;                                   \
                l_dest[2] = *l_src++;                                   \
                l_dest[3] = *l_src++;                                   \
                l_dest[4] = *l_src++;                                   \
                l_dest += l_stride;                                     \
            }                                                           \
        }                                                               \
        else if (nelms == 6) {                                          \
            for (_i = (int)total_count; _i; _i -= 6) {                  \
                l_dest[0] = *l_src++;                                   \
                l_dest[1] = *l_src++;                                   \
                l_dest[2] = *l_src++;                                   \
                l_dest[3] = *l_src++;                                   \
                l_dest[4] = *l_src++;                                   \
                l_dest[5] = *l_src++;                                   \
                l_dest += l_stride;                                     \
            }                                                           \
        }                                                               \
        else if (nelms == 7) {                                          \
            for (_i = (int)total_count; _i; _i -= 7) {                  \
                l_dest[0] = *l_src++;                                   \
                l_dest[1] = *l_src++;                                   \
                l_dest[2] = *l_src++;                                   \
                l_dest[3] = *l_src++;                                   \
                l_dest[4] = *l_src++;                                   \
                l_dest[5] = *l_src++;                                   \
                l_dest[6] = *l_src++;                                   \
                l_dest += l_stride;                                     \
            }                                                           \
        }                                                               \
        else if (nelms == 8) {                                          \
            for (_i = (int)total_count; _i; _i -= 8) {                  \
                l_dest[0] = *l_src++;                                   \
                l_dest[1] = *l_src++;                                   \
                l_dest[2] = *l_src++;                                   \
                l_dest[3] = *l_src++;                                   \
                l_dest[4] = *l_src++;                                   \
                l_dest[5] = *l_src++;                                   \
                l_dest[6] = *l_src++;                                   \
                l_dest[7] = *l_src++;                                   \
                l_dest += l_stride;                                     \
            }                                                           \
        }                                                               \
        else {                                                          \
            _i = (int)total_count;                                      \
            while (_i) {                                                \
                tmp_dest = l_dest;                                      \
                j = (int)nelms;                                         \
                while (j >= 8) {                                        \
                    tmp_dest[0] = *l_src++;                             \
                    tmp_dest[1] = *l_src++;                             \
                    tmp_dest[2] = *l_src++;                             \
                    tmp_dest[3] = *l_src++;                             \
                    tmp_dest[4] = *l_src++;                             \
                    tmp_dest[5] = *l_src++;                             \
                    tmp_dest[6] = *l_src++;                             \
                    tmp_dest[7] = *l_src++;                             \
                    j -= 8;                                             \
                    tmp_dest += 8;                                      \
                }                                                       \
                for (k = 0; k < j; k++) {                               \
                    *tmp_dest++ = *l_src++;                             \
                }                                                       \
                l_dest += l_stride;                                     \
                _i -= nelms;                                            \
            }                                                           \
        }                                                               \
        src = (char *) l_src;                                           \
        dest = (char *) l_dest;                                         \
    }

#define MPII_COPY_TO_VEC_UNALIGNED(src,dest,stride,type,nelms,count)    \
    {                                                                   \
        type * l_src = (type *) src, * l_dest = (type *) dest;          \
        type * tmp_dest = l_dest;                                       \
        register int k;                                                 \
        register unsigned long _i, j;                                   \
        unsigned long total_count = count * nelms;                      \
        const DLOOP_Offset l_stride = stride;                           \
                                                                        \
        DLOOP_Assert(stride <= INT_MAX);                                \
        DLOOP_Assert(total_count <= INT_MAX);                           \
        DLOOP_Assert(nelms <= INT_MAX);                                 \
        if (nelms == 1) {                                               \
            for (_i = (int)total_count; _i; _i--) {                     \
                *l_dest = *l_src++;                                     \
                l_dest = (type *) ((char *) l_dest + l_stride);         \
            }                                                           \
        }                                                               \
        else if (nelms == 2) {                                          \
            for (_i = (int)total_count; _i; _i -= 2) {                  \
                l_dest[0] = *l_src++;                                   \
                l_dest[1] = *l_src++;                                   \
                l_dest = (type *) ((char *) l_dest + l_stride);         \
            }                                                           \
        }                                                               \
        else if (nelms == 3) {                                          \
            for (_i = (int)total_count; _i; _i -= 3) {                  \
                l_dest[0] = *l_src++;                                   \
                l_dest[1] = *l_src++;                                   \
                l_dest[2] = *l_src++;                                   \
                l_dest = (type *) ((char *) l_dest + l_stride);         \
            }                                                           \
        }                                                               \
        else if (nelms == 4) {                                          \
            for (_i = (int)total_count; _i; _i -= 4) {                  \
                l_dest[0] = *l_src++;                                   \
                l_dest[1] = *l_src++;                                   \
                l_dest[2] = *l_src++;                                   \
                l_dest[3] = *l_src++;                                   \
                l_dest = (type *) ((char *) l_dest + l_stride);         \
            }                                                           \
        }                                                               \
        else if (nelms == 5) {                                          \
            for (_i = (int)total_count; _i; _i -= 5) {                  \
                l_dest[0] = *l_src++;                                   \
                l_dest[1] = *l_src++;                                   \
                l_dest[2] = *l_src++;                                   \
                l_dest[3] = *l_src++;                                   \
                l_dest[4] = *l_src++;                                   \
                l_dest = (type *) ((char *) l_dest + l_stride);         \
            }                                                           \
        }                                                               \
        else if (nelms == 6) {                                          \
            for (_i = (int)total_count; _i; _i -= 6) {                  \
                l_dest[0] = *l_src++;                                   \
                l_dest[1] = *l_src++;                                   \
                l_dest[2] = *l_src++;                                   \
                l_dest[3] = *l_src++;                                   \
                l_dest[4] = *l_src++;                                   \
                l_dest[5] = *l_src++;                                   \
                l_dest = (type *) ((char *) l_dest + l_stride);         \
            }                                                           \
        }                                                               \
        else if (nelms == 7) {                                          \
            for (_i = (int)total_count; _i; _i -= 7) {                  \
                l_dest[0] = *l_src++;                                   \
                l_dest[1] = *l_src++;                                   \
                l_dest[2] = *l_src++;                                   \
                l_dest[3] = *l_src++;                                   \
                l_dest[4] = *l_src++;                                   \
                l_dest[5] = *l_src++;                                   \
                l_dest[6] = *l_src++;                                   \
                l_dest = (type *) ((char *) l_dest + l_stride);         \
            }                                                           \
        }                                                               \
        else if (nelms == 8) {                                          \
            for (_i = (int)total_count; _i; _i -= 8) {                  \
                l_dest[0] = *l_src++;                                   \
                l_dest[1] = *l_src++;                                   \
                l_dest[2] = *l_src++;                                   \
                l_dest[3] = *l_src++;                                   \
                l_dest[4] = *l_src++;                                   \
                l_dest[5] = *l_src++;                                   \
                l_dest[6] = *l_src++;                                   \
                l_dest[7] = *l_src++;                                   \
                l_dest = (type *) ((char *) l_dest + l_stride);         \
            }                                                           \
        }                                                               \
        else {                                                          \
            _i = (int)total_count;                                      \
            while (_i) {                                                \
                tmp_dest = l_dest;                                      \
                j = (int)nelms;                                         \
                while (j >= 8) {                                        \
                    tmp_dest[0] = *l_src++;                             \
                    tmp_dest[1] = *l_src++;                             \
                    tmp_dest[2] = *l_src++;                             \
                    tmp_dest[3] = *l_src++;                             \
                    tmp_dest[4] = *l_src++;                             \
                    tmp_dest[5] = *l_src++;                             \
                    tmp_dest[6] = *l_src++;                             \
                    tmp_dest[7] = *l_src++;                             \
                    j -= 8;                                             \
                    tmp_dest += 8;                                      \
                }                                                       \
                for (k = 0; k < j; k++) {                               \
                    *tmp_dest++ = *l_src++;                             \
                }                                                       \
                l_dest = (type *) ((char *) l_dest + l_stride);         \
                _i -= nelms;                                            \
            }                                                           \
        }                                                               \
        src = (char *) l_src;                                           \
        dest = (char *) l_dest;                                         \
    }

#endif /* VECCPY_H_INCLUDED */

/*
 * Local variables:
 * c-indent-tabs-mode: nil
 * End:
 */