/**
* Copyright (C) Mellanox Technologies Ltd. 2001-2018. ALL RIGHTS RESERVED.
* Copyright (C) Advanced Micro Devices, Inc. 2019. ALL RIGHTS RESERVED.
*
* See file LICENSE for terms.
*/
#if defined(__x86_64__)
#include <ucs/arch/cpu.h>
#include <ucs/debug/log.h>
#include <ucs/sys/math.h>
#include <ucs/sys/sys.h>
#include <ucs/sys/string.h>
#define X86_CPUID_GENUINEINTEL "GenuntelineI" /* GenuineIntel in magic notation */
#define X86_CPUID_AUTHENTICAMD "AuthcAMDenti" /* AuthenticAMD in magic notation */
#define X86_CPUID_GET_MODEL 0x00000001u
#define X86_CPUID_GET_BASE_VALUE 0x00000000u
#define X86_CPUID_GET_EXTD_VALUE 0x00000007u
#define X86_CPUID_GET_MAX_VALUE 0x80000000u
#define X86_CPUID_INVARIANT_TSC 0x80000007u
#define X86_CPUID_GET_CACHE_INFO 0x00000002u
#define X86_CPUID_GET_LEAF4_INFO 0x00000004u
#define X86_CPU_CACHE_RESERVED 0x80000000
#define X86_CPU_CACHE_TAG_L1_ONLY 0x40
#define X86_CPU_CACHE_TAG_LEAF4 0xff
#if defined (__SSE4_1__)
#define _mm_load(a) _mm_stream_load_si128((__m128i *) (a))
#define _mm_store(a,v) _mm_storeu_si128((__m128i *) (a), (v))
#endif
typedef enum ucs_x86_cpu_cache_type {
X86_CPU_CACHE_TYPE_DATA = 1,
X86_CPU_CACHE_TYPE_INSTRUCTION = 2,
X86_CPU_CACHE_TYPE_UNIFIED = 3
} ucs_x86_cpu_cache_type_t;
/* CPU version */
typedef union ucs_x86_cpu_version {
struct {
unsigned stepping : 4;
unsigned model : 4;
unsigned family : 4;
unsigned type : 2;
unsigned unused : 2;
unsigned ext_model : 4;
unsigned ext_family : 8;
};
uint32_t reg;
} UCS_S_PACKED ucs_x86_cpu_version_t;
/* cache datatypes */
typedef struct ucs_x86_cpu_cache_info {
unsigned level;
ucs_x86_cpu_cache_type_t type;
} UCS_S_PACKED ucs_x86_cpu_cache_info_t;
typedef union ucs_x86_cache_line_reg_info {
uint32_t reg;
struct {
unsigned size : 12;
unsigned partitions : 10;
unsigned associativity : 10;
};
struct {
unsigned type : 5;
unsigned level : 3;
};
} UCS_S_PACKED ucs_x86_cache_line_reg_info_t;
typedef union ucs_x86_cpu_registers {
struct {
union {
uint32_t eax;
uint8_t max_iter; /* leaf 2 - max iterations */
};
union {
struct {
uint32_t ebx;
uint32_t ecx;
uint32_t edx;
};
char id[sizeof(uint32_t) * 3]; /* leaf 0 - CPU ID */
};
};
union {
uint32_t value;
uint8_t tag[sizeof(uint32_t)];
} reg[4]; /* leaf 2 tags */
} UCS_S_PACKED ucs_x86_cpu_registers;
typedef struct ucs_x86_cpu_cache_size_codes {
ucs_cpu_cache_type_t type;
size_t size;
} ucs_x86_cpu_cache_size_codes_t;
ucs_ternary_value_t ucs_arch_x86_enable_rdtsc = UCS_TRY;
static const ucs_x86_cpu_cache_info_t x86_cpu_cache[] = {
[UCS_CPU_CACHE_L1d] = {.level = 1, .type = X86_CPU_CACHE_TYPE_DATA},
[UCS_CPU_CACHE_L1i] = {.level = 1, .type = X86_CPU_CACHE_TYPE_INSTRUCTION},
[UCS_CPU_CACHE_L2] = {.level = 2, .type = X86_CPU_CACHE_TYPE_UNIFIED},
[UCS_CPU_CACHE_L3] = {.level = 3, .type = X86_CPU_CACHE_TYPE_UNIFIED}
};
static const ucs_x86_cpu_cache_size_codes_t ucs_x86_cpu_cache_size_codes[] = {
[0x06] = {.type = UCS_CPU_CACHE_L1i, .size = 8192 },
[0x08] = {.type = UCS_CPU_CACHE_L1i, .size = 16384 },
[0x09] = {.type = UCS_CPU_CACHE_L1i, .size = 32768 },
[0x0a] = {.type = UCS_CPU_CACHE_L1d, .size = 8192 },
[0x0c] = {.type = UCS_CPU_CACHE_L1d, .size = 16384 },
[0x0d] = {.type = UCS_CPU_CACHE_L1d, .size = 16384 },
[0x0e] = {.type = UCS_CPU_CACHE_L1d, .size = 24576 },
[0x21] = {.type = UCS_CPU_CACHE_L2, .size = 262144 },
[0x22] = {.type = UCS_CPU_CACHE_L3, .size = 524288 },
[0x23] = {.type = UCS_CPU_CACHE_L3, .size = 1048576 },
[0x25] = {.type = UCS_CPU_CACHE_L3, .size = 2097152 },
[0x29] = {.type = UCS_CPU_CACHE_L3, .size = 4194304 },
[0x2c] = {.type = UCS_CPU_CACHE_L1d, .size = 32768 },
[0x30] = {.type = UCS_CPU_CACHE_L1i, .size = 32768 },
[0x39] = {.type = UCS_CPU_CACHE_L2, .size = 131072 },
[0x3a] = {.type = UCS_CPU_CACHE_L2, .size = 196608 },
[0x3b] = {.type = UCS_CPU_CACHE_L2, .size = 131072 },
[0x3c] = {.type = UCS_CPU_CACHE_L2, .size = 262144 },
[0x3d] = {.type = UCS_CPU_CACHE_L2, .size = 393216 },
[0x3e] = {.type = UCS_CPU_CACHE_L2, .size = 524288 },
[0x3f] = {.type = UCS_CPU_CACHE_L2, .size = 262144 },
[0x41] = {.type = UCS_CPU_CACHE_L2, .size = 131072 },
[0x42] = {.type = UCS_CPU_CACHE_L2, .size = 262144 },
[0x43] = {.type = UCS_CPU_CACHE_L2, .size = 524288 },
[0x44] = {.type = UCS_CPU_CACHE_L2, .size = 1048576 },
[0x45] = {.type = UCS_CPU_CACHE_L2, .size = 2097152 },
[0x46] = {.type = UCS_CPU_CACHE_L3, .size = 4194304 },
[0x47] = {.type = UCS_CPU_CACHE_L3, .size = 8388608 },
[0x48] = {.type = UCS_CPU_CACHE_L2, .size = 3145728 },
[0x49] = {.type = UCS_CPU_CACHE_L2, .size = 4194304 },
[0x4a] = {.type = UCS_CPU_CACHE_L3, .size = 6291456 },
[0x4b] = {.type = UCS_CPU_CACHE_L3, .size = 8388608 },
[0x4c] = {.type = UCS_CPU_CACHE_L3, .size = 12582912 },
[0x4d] = {.type = UCS_CPU_CACHE_L3, .size = 16777216 },
[0x4e] = {.type = UCS_CPU_CACHE_L2, .size = 6291456 },
[0x60] = {.type = UCS_CPU_CACHE_L1d, .size = 16384 },
[0x66] = {.type = UCS_CPU_CACHE_L1d, .size = 8192 },
[0x67] = {.type = UCS_CPU_CACHE_L1d, .size = 16384 },
[0x68] = {.type = UCS_CPU_CACHE_L1d, .size = 32768 },
[0x78] = {.type = UCS_CPU_CACHE_L2, .size = 1048576 },
[0x79] = {.type = UCS_CPU_CACHE_L2, .size = 131072 },
[0x7a] = {.type = UCS_CPU_CACHE_L2, .size = 262144 },
[0x7b] = {.type = UCS_CPU_CACHE_L2, .size = 524288 },
[0x7c] = {.type = UCS_CPU_CACHE_L2, .size = 1048576 },
[0x7d] = {.type = UCS_CPU_CACHE_L2, .size = 2097152 },
[0x7f] = {.type = UCS_CPU_CACHE_L2, .size = 524288 },
[0x80] = {.type = UCS_CPU_CACHE_L2, .size = 524288 },
[0x82] = {.type = UCS_CPU_CACHE_L2, .size = 262144 },
[0x83] = {.type = UCS_CPU_CACHE_L2, .size = 524288 },
[0x84] = {.type = UCS_CPU_CACHE_L2, .size = 1048576 },
[0x85] = {.type = UCS_CPU_CACHE_L2, .size = 2097152 },
[0x86] = {.type = UCS_CPU_CACHE_L2, .size = 524288 },
[0x87] = {.type = UCS_CPU_CACHE_L2, .size = 1048576 },
[0xd0] = {.type = UCS_CPU_CACHE_L3, .size = 524288 },
[0xd1] = {.type = UCS_CPU_CACHE_L3, .size = 1048576 },
[0xd2] = {.type = UCS_CPU_CACHE_L3, .size = 2097152 },
[0xd6] = {.type = UCS_CPU_CACHE_L3, .size = 1048576 },
[0xd7] = {.type = UCS_CPU_CACHE_L3, .size = 2097152 },
[0xd8] = {.type = UCS_CPU_CACHE_L3, .size = 4194304 },
[0xdc] = {.type = UCS_CPU_CACHE_L3, .size = 2097152 },
[0xdd] = {.type = UCS_CPU_CACHE_L3, .size = 4194304 },
[0xde] = {.type = UCS_CPU_CACHE_L3, .size = 8388608 },
[0xe2] = {.type = UCS_CPU_CACHE_L3, .size = 2097152 },
[0xe3] = {.type = UCS_CPU_CACHE_L3, .size = 4194304 },
[0xe4] = {.type = UCS_CPU_CACHE_L3, .size = 8388608 },
[0xea] = {.type = UCS_CPU_CACHE_L3, .size = 12582912 },
[0xeb] = {.type = UCS_CPU_CACHE_L3, .size = 18874368 },
[0xec] = {.type = UCS_CPU_CACHE_L3, .size = 25165824 }
};
static UCS_F_NOOPTIMIZE inline void ucs_x86_cpuid(uint32_t level,
uint32_t *a, uint32_t *b,
uint32_t *c, uint32_t *d)
{
asm volatile ("cpuid\n\t"
: "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
: "0"(level));
}
static UCS_F_NOOPTIMIZE inline void ucs_x86_cpuid_ecx(uint32_t level, uint32_t ecx,
uint32_t *a, uint32_t *b,
uint32_t *c, uint32_t *d)
{
asm volatile("cpuid"
: "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
: "0"(level), "2"(ecx));
}
/* This allows the CPU detection to work with assemblers not supporting
* the xgetbv mnemonic. These include clang and some BSD versions.
*/
#define ucs_x86_xgetbv(_index, _eax, _edx) \
asm volatile (".byte 0x0f, 0x01, 0xd0" : "=a"(_eax), "=d"(_edx) : "c" (_index))
static int ucs_x86_invariant_tsc()
{
uint32_t _eax, _ebx, _ecx, _edx;
ucs_x86_cpuid(X86_CPUID_GET_MAX_VALUE, &_eax, &_ebx, &_ecx, &_edx);
if (_eax <= X86_CPUID_INVARIANT_TSC) {
goto warn;
}
ucs_x86_cpuid(X86_CPUID_INVARIANT_TSC, &_eax, &_ebx, &_ecx, &_edx);
if (!(_edx & UCS_BIT(8))) {
goto warn;
}
return 1;
warn:
ucs_debug("CPU does not support invariant TSC, using fallback timer");
return 0;
}
double ucs_x86_tsc_freq_from_cpu_model()
{
char buf[256];
char model[256];
char *rate;
char newline[2];
double ghz, max_ghz;
FILE* f;
int rc;
int warn;
f = fopen("/proc/cpuinfo","r");
if (!f) {
return -1;
}
warn = 0;
max_ghz = 0.0;
while (fgets(buf, sizeof(buf), f)) {
rc = sscanf(buf, "model name : %s", model);
if (rc != 1) {
continue;
}
rate = strrchr(buf, '@');
if (rate == NULL) {
continue;
}
rc = sscanf(rate, "@ %lfGHz%[\n]", &ghz, newline);
if (rc != 2) {
continue;
}
max_ghz = ucs_max(max_ghz, ghz);
if (max_ghz != ghz) {
warn = 1;
break;
}
}
fclose(f);
if (warn) {
ucs_debug("Conflicting CPU frequencies detected, using fallback timer");
return -1;
}
return max_ghz * 1e9;
}
double ucs_x86_init_tsc_freq()
{
double result;
if (!ucs_x86_invariant_tsc()) {
goto err_disable_rdtsc;
}
ucs_arch_x86_enable_rdtsc = UCS_YES;
result = ucs_x86_tsc_freq_from_cpu_model();
if (result <= 0.0) {
result = ucs_get_cpuinfo_clock_freq("cpu MHz", 1e6);
}
if (result > 0.0) {
return result;
}
err_disable_rdtsc:
ucs_arch_x86_enable_rdtsc = UCS_NO;
return -1;
}
double ucs_arch_get_clocks_per_sec()
{
double freq;
/* Init rdtsc state ucs_arch_x86_enable_rdtsc */
freq = ucs_x86_init_tsc_freq();
if (ucs_arch_x86_enable_rdtsc == UCS_YES) {
/* using rdtsc */
return freq;
}
return ucs_arch_generic_get_clocks_per_sec();
}
ucs_cpu_model_t ucs_arch_get_cpu_model()
{
ucs_x86_cpu_version_t version;
uint32_t _ebx, _ecx, _edx;
uint32_t model, family;
/* Get CPU model/family */
ucs_x86_cpuid(X86_CPUID_GET_MODEL, ucs_unaligned_ptr(&version.reg), &_ebx, &_ecx, &_edx);
model = version.model;
family = version.family;
/* Adjust family/model */
if (family == 0xf) {
family += version.ext_family;
}
if ((family == 0x6) || (family == 0xf) || (family == 0x17)) {
model = (version.ext_model << 4) | model;
}
/* Check known CPUs */
if (family == 0x06) {
switch (model) {
case 0x3a:
case 0x3e:
return UCS_CPU_MODEL_INTEL_IVYBRIDGE;
case 0x2a:
case 0x2d:
return UCS_CPU_MODEL_INTEL_SANDYBRIDGE;
case 0x1a:
case 0x1e:
case 0x1f:
case 0x2e:
return UCS_CPU_MODEL_INTEL_NEHALEM;
case 0x25:
case 0x2c:
case 0x2f:
return UCS_CPU_MODEL_INTEL_WESTMERE;
case 0x3c:
case 0x3f:
case 0x45:
case 0x46:
return UCS_CPU_MODEL_INTEL_HASWELL;
case 0x3d:
case 0x47:
case 0x4f:
case 0x56:
return UCS_CPU_MODEL_INTEL_BROADWELL;
case 0x5e:
case 0x4e:
case 0x55:
return UCS_CPU_MODEL_INTEL_SKYLAKE;
}
}
if (family == 0x17) {
switch (model) {
case 0x29:
return UCS_CPU_MODEL_AMD_NAPLES;
case 0x31:
return UCS_CPU_MODEL_AMD_ROME;
}
}
return UCS_CPU_MODEL_UNKNOWN;
}
int ucs_arch_get_cpu_flag()
{
static int cpu_flag = UCS_CPU_FLAG_UNKNOWN;
if (UCS_CPU_FLAG_UNKNOWN == cpu_flag) {
uint32_t result = 0;
uint32_t base_value;
uint32_t _eax, _ebx, _ecx, _edx;
ucs_x86_cpuid(X86_CPUID_GET_BASE_VALUE, &_eax, &_ebx, &_ecx, &_edx);
base_value = _eax;
if (base_value >= 1) {
ucs_x86_cpuid(X86_CPUID_GET_MODEL, &_eax, &_ebx, &_ecx, &_edx);
if (_edx & (1 << 15)) {
result |= UCS_CPU_FLAG_CMOV;
}
if (_edx & (1 << 23)) {
result |= UCS_CPU_FLAG_MMX;
}
if (_edx & (1 << 25)) {
result |= UCS_CPU_FLAG_MMX2;
}
if (_edx & (1 << 25)) {
result |= UCS_CPU_FLAG_SSE;
}
if (_edx & (1 << 26)) {
result |= UCS_CPU_FLAG_SSE2;
}
if (_ecx & 1) {
result |= UCS_CPU_FLAG_SSE3;
}
if (_ecx & (1 << 9)) {
result |= UCS_CPU_FLAG_SSSE3;
}
if (_ecx & (1 << 19)) {
result |= UCS_CPU_FLAG_SSE41;
}
if (_ecx & (1 << 20)) {
result |= UCS_CPU_FLAG_SSE42;
}
if ((_ecx & 0x18000000) == 0x18000000) {
ucs_x86_xgetbv(0, _eax, _edx);
if ((_eax & 0x6) == 0x6) {
result |= UCS_CPU_FLAG_AVX;
}
}
}
if (base_value >= 7) {
ucs_x86_cpuid(X86_CPUID_GET_EXTD_VALUE, &_eax, &_ebx, &_ecx, &_edx);
if ((result & UCS_CPU_FLAG_AVX) && (_ebx & (1 << 5))) {
result |= UCS_CPU_FLAG_AVX2;
}
}
cpu_flag = result;
}
return cpu_flag;
}
ucs_cpu_vendor_t ucs_arch_get_cpu_vendor()
{
ucs_x86_cpu_registers reg;
ucs_x86_cpuid(X86_CPUID_GET_BASE_VALUE,
ucs_unaligned_ptr(®.eax), ucs_unaligned_ptr(®.ebx),
ucs_unaligned_ptr(®.ecx), ucs_unaligned_ptr(®.edx));
if (!memcmp(reg.id, X86_CPUID_GENUINEINTEL, sizeof(X86_CPUID_GENUINEINTEL) - 1)) {
return UCS_CPU_VENDOR_INTEL;
} else if (!memcmp(reg.id, X86_CPUID_AUTHENTICAMD, sizeof(X86_CPUID_AUTHENTICAMD) - 1)) {
return UCS_CPU_VENDOR_AMD;
}
return UCS_CPU_VENDOR_UNKNOWN;
}
#if ENABLE_BUILTIN_MEMCPY
static size_t ucs_cpu_memcpy_thresh(size_t user_val, size_t auto_val)
{
if (user_val != UCS_MEMUNITS_AUTO) {
return user_val;
}
if (((ucs_arch_get_cpu_vendor() == UCS_CPU_VENDOR_INTEL) &&
(ucs_arch_get_cpu_model() >= UCS_CPU_MODEL_INTEL_HASWELL)) ||
(ucs_arch_get_cpu_vendor() == UCS_CPU_VENDOR_AMD)) {
return auto_val;
} else {
return UCS_MEMUNITS_INF;
}
}
#endif
void ucs_cpu_init()
{
#if ENABLE_BUILTIN_MEMCPY
ucs_global_opts.arch.builtin_memcpy_min =
ucs_cpu_memcpy_thresh(ucs_global_opts.arch.builtin_memcpy_min,
ucs_cpu_builtin_memcpy[ucs_arch_get_cpu_vendor()].min);
ucs_global_opts.arch.builtin_memcpy_max =
ucs_cpu_memcpy_thresh(ucs_global_opts.arch.builtin_memcpy_max,
ucs_cpu_builtin_memcpy[ucs_arch_get_cpu_vendor()].max);
#endif
}
ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes)
{
ucs_x86_cache_line_reg_info_t cache_info;
ucs_x86_cache_line_reg_info_t line_info;
ucs_x86_cpu_registers reg;
uint32_t sets;
uint32_t i, t, r, l4;
uint32_t max_iter;
size_t c;
int level1_only; /* level 1 cache only supported */
int tag;
int cache_count;
ucs_cpu_cache_type_t type;
/* Get CPU ID and vendor - it will reset cache iteration sequence */
if (ucs_arch_get_cpu_vendor() != UCS_CPU_VENDOR_INTEL) {
return UCS_ERR_UNSUPPORTED;
}
ucs_x86_cpuid(X86_CPUID_GET_BASE_VALUE,
ucs_unaligned_ptr(®.eax), ucs_unaligned_ptr(®.ebx),
ucs_unaligned_ptr(®.ecx), ucs_unaligned_ptr(®.edx));
if (reg.eax < X86_CPUID_GET_CACHE_INFO) {
return UCS_ERR_UNSUPPORTED;
}
level1_only = 0;
cache_count = 0;
for (i = 0, max_iter = 1; i < max_iter; i++) {
ucs_x86_cpuid(X86_CPUID_GET_CACHE_INFO,
ucs_unaligned_ptr(®.eax), ucs_unaligned_ptr(®.ebx),
ucs_unaligned_ptr(®.ecx), ucs_unaligned_ptr(®.edx));
if (i == 0) { /* on first iteration get max iteration number */
max_iter = reg.max_iter;
reg.max_iter = 0; /* mask iteration register from processing */
}
for (r = 0; r < ucs_array_size(reg.reg); r++) {
if (ucs_test_all_flags(reg.reg[r].value, X86_CPU_CACHE_RESERVED)) {
continue;
}
for (t = 0; (t < ucs_array_size(reg.reg[r].tag)) && (reg.reg[r].tag[t] != 0); t++) {
tag = reg.reg[r].tag[t];
switch(tag) {
case X86_CPU_CACHE_TAG_L1_ONLY:
level1_only = 1;
break;
case X86_CPU_CACHE_TAG_LEAF4:
for (l4 = 0; cache_count < UCS_CPU_CACHE_LAST; l4++) {
ucs_x86_cpuid_ecx(X86_CPUID_GET_LEAF4_INFO, l4,
ucs_unaligned_ptr(&cache_info.reg),
ucs_unaligned_ptr(&line_info.reg),
&sets, ucs_unaligned_ptr(®.edx));
if (cache_info.type == 0) {
/* we are done - nothing found, go to next register */
break;
}
for (c = 0; c < UCS_CPU_CACHE_LAST; c++) {
if ((cache_info.level == x86_cpu_cache[c].level) &&
(cache_info.type == x86_cpu_cache[c].type)) {
/* found it */
/* cache entry is not updated yet */
/* and cache level is 1 or all levels are supported */
if (!((cache_sizes[c] == 0) &&
((x86_cpu_cache[c].level == 1) || !level1_only))) {
break;
}
cache_sizes[c] = (line_info.associativity + 1) *
(line_info.partitions + 1) *
(line_info.size + 1) *
(sets + 1);
cache_count++;
}
}
}
return cache_count == UCS_CPU_CACHE_LAST ? UCS_OK : UCS_ERR_UNSUPPORTED;
default:
if ((tag >= ucs_array_size(ucs_x86_cpu_cache_size_codes)) ||
(ucs_x86_cpu_cache_size_codes[tag].size != 0)) {
break; /* tag is out of table or in empty entry */
}
type = ucs_x86_cpu_cache_size_codes[tag].type;
if (cache_sizes[type] != 0) { /* cache is filled already */
break;
}
cache_sizes[type] = ucs_x86_cpu_cache_size_codes[tag].size;
cache_count++;
break;
}
}
}
}
return cache_count == UCS_CPU_CACHE_LAST ? UCS_OK : UCS_ERR_UNSUPPORTED;
}
void ucs_x86_memcpy_sse_movntdqa(void *dst, const void *src, size_t len)
{
#if defined (__SSE4_1__)
/* Copy unaligned portion of src */
if ((uintptr_t)src & 15) {
uintptr_t aligned = (uintptr_t)src & ~15;
uintptr_t misalign = (uintptr_t)src & 15;
uintptr_t copy = ucs_min(len, 16 - misalign);
__m128i tmp = _mm_load(aligned);
memcpy(dst, UCS_PTR_BYTE_OFFSET(&tmp, misalign), copy);
src = UCS_PTR_BYTE_OFFSET(src, copy);
dst = UCS_PTR_BYTE_OFFSET(dst, copy);
len -= copy;
}
/* Copy 64 bytes at a time */
while (len >= 64) {
__m128i *S = (__m128i *)src;
__m128i *D = (__m128i *)dst;
__m128i tmp[4];
tmp[0] = _mm_load(S + 0);
tmp[1] = _mm_load(S + 1);
tmp[2] = _mm_load(S + 2);
tmp[3] = _mm_load(S + 3);
_mm_store(D + 0, tmp[0]);
_mm_store(D + 1, tmp[1]);
_mm_store(D + 2, tmp[2]);
_mm_store(D + 3, tmp[3]);
src = UCS_PTR_BYTE_OFFSET(src, 64);
dst = UCS_PTR_BYTE_OFFSET(dst, 64);
len -= 64;
}
/* Copy 16 bytes at a time */
while (len >= 16) {
_mm_store(dst, _mm_load(src));
src = UCS_PTR_BYTE_OFFSET(src, 16);
dst = UCS_PTR_BYTE_OFFSET(dst, 16);
len -= 16;
}
/* Copy any remaining bytes */
if (len) {
__m128i tmp = _mm_load(src);
memcpy(dst, &tmp, len);
}
#else
memcpy(dst, src, len);
#endif
}
#endif