/*
* ORC - Library of Optimized Inner Loops
* Copyright (c) 2003,2004,2010 David A. Schleef <ds@schleef.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <orc/orcdebug.h>
#include <orc/orcsse.h>
#include <orc/orcmmx.h>
#include <orc/orcprogram.h>
#include <orc/orcutils.h>
#include "orcinternal.h"
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <setjmp.h>
#include <signal.h>
#include <time.h>
orc_uint32 orc_x86_vendor;
int orc_x86_sse_flags;
int orc_x86_mmx_flags;
int orc_x86_microarchitecture;
#if defined(_MSC_VER)
static void
get_cpuid (orc_uint32 op, orc_uint32 *a, orc_uint32 *b, orc_uint32 *c, orc_uint32 *d)
{
int tmp[4];
__cpuid(tmp, op);
*a = tmp[0];
*b = tmp[1];
*c = tmp[2];
*d = tmp[3];
}
static void
get_cpuid_ecx (orc_uint32 op, orc_uint32 init_ecx, orc_uint32 *a, orc_uint32 *b, orc_uint32 *c, orc_uint32 *d)
{
#if _MSC_VER >= 1500
int tmp[4];
__cpuidex(tmp, op, init_ecx);
*a = tmp[0];
*b = tmp[1];
*c = tmp[2];
*d = tmp[3];
#else
*a = 0;
*b = 0;
*c = 0;
*d = 0;
#endif
}
#elif defined(__GNUC__) || defined (__SUNPRO_C)
static void
get_cpuid_ecx (orc_uint32 op, orc_uint32 init_ecx, orc_uint32 *a, orc_uint32 *b,
orc_uint32 *c, orc_uint32 *d)
{
*a = op;
*c = init_ecx;
#if defined(HAVE_I386)
__asm__ (
" pushl %%ebx\n"
" cpuid\n"
" mov %%ebx, %%esi\n"
" popl %%ebx\n"
: "+a" (*a), "=S" (*b), "+c" (*c), "=d" (*d));
#elif defined(HAVE_AMD64)
__asm__ (
" cpuid\n"
: "+a" (*a), "=b" (*b), "+c" (*c), "=d" (*d));
#endif
}
static void
get_cpuid (orc_uint32 op, orc_uint32 *a, orc_uint32 *b,
orc_uint32 *c, orc_uint32 *d)
{
get_cpuid_ecx (op, 0, a, b, c, d);
}
#else
/* FIXME generate a get_cpuid() function at runtime. */
#error Need get_cpuid() function.
#endif
struct desc_struct {
int desc;
int level;
int size;
};
struct desc_struct cache_descriptors[] = {
{ 0x0a, 1, 8*1024 },
{ 0x0c, 1, 16*1024 },
{ 0x0d, 1, 16*1024 },
{ 0x0e, 1, 24*1024 },
{ 0x21, 2, 256*1024 },
{ 0x22, 3, 512*1024 },
{ 0x23, 3, 1024*1024 },
{ 0x25, 3, 2*1024*1024 },
{ 0x29, 3, 4*1024*1024 },
{ 0x2c, 1, 32*1024 },
{ 0x41, 2, 128*1024 },
{ 0x42, 2, 256*1024 },
{ 0x43, 2, 512*1024 },
{ 0x44, 2, 1*1024*1024 },
{ 0x45, 2, 2*1024*1024 },
{ 0x46, 3, 4*1024*1024 },
{ 0x47, 3, 8*1024*1024 },
{ 0x48, 2, 3*1024*1024 },
{ 0x49, 2, 4*1024*1024 }, /* special case */
{ 0x4a, 3, 6*1024*1024 },
{ 0x4b, 3, 8*1024*1024 },
{ 0x4c, 3, 12*1024*1024 },
{ 0x4d, 3, 16*1024*1024 },
{ 0x4e, 2, 6*1024*1024 },
{ 0x60, 1, 16*1024 },
{ 0x66, 1, 8*1024 },
{ 0x67, 1, 16*1024 },
{ 0x68, 1, 32*1024 },
{ 0x78, 2, 1*1024*1024 },
{ 0x79, 2, 128*1024 },
{ 0x7a, 2, 256*1024 },
{ 0x7b, 2, 512*1024 },
{ 0x7c, 2, 1*1024*1024 },
{ 0x7d, 2, 2*1024*1024 },
{ 0x7f, 2, 512*1024 },
{ 0x80, 2, 512*1024 },
{ 0x82, 2, 256*1024 },
{ 0x83, 2, 512*1024 },
{ 0x84, 2, 1*1024*1024 },
{ 0x85, 2, 2*1024*1024 },
{ 0x86, 2, 512*1024 },
{ 0x87, 2, 1*1024*1024 },
{ 0xe4, 3, 8*1024*1024 }
};
static void
handle_cache_descriptor (unsigned int desc)
{
int i;
if (desc == 0) return;
/* special case */
if (desc == 0x49 && _orc_cpu_family == 0xf && _orc_cpu_model == 0x6) {
ORC_DEBUG("level %d size %d", 3, 4*1024*1024);
_orc_data_cache_size_level3 = 4*1024*1024;
return;
}
for(i=0;i<sizeof(cache_descriptors)/sizeof(cache_descriptors[0]);i++){
if (desc == cache_descriptors[i].desc) {
ORC_DEBUG("level %d size %d", cache_descriptors[i].level,
cache_descriptors[i].size);
switch (cache_descriptors[i].level) {
case 1:
_orc_data_cache_size_level1 = cache_descriptors[i].size;
break;
case 2:
_orc_data_cache_size_level2 = cache_descriptors[i].size;
break;
case 3:
_orc_data_cache_size_level3 = cache_descriptors[i].size;
break;
}
}
}
}
static void orc_sse_detect_cpuid_intel (orc_uint32 level);
static void orc_sse_detect_cpuid_amd (orc_uint32 level);
static void orc_sse_detect_cpuid_generic (orc_uint32 level);
static void
orc_x86_detect_cpuid (void)
{
static int inited = 0;
orc_uint32 ebx, edx;
orc_uint32 level;
if (inited) return;
inited = 1;
get_cpuid (0x00000000, &level, &ebx, &orc_x86_vendor, &edx);
ORC_DEBUG("cpuid %d %08x %08x %08x", level, ebx, edx, orc_x86_vendor);
#define ORC_X86_GenuineIntel (('n'<<0)|('t'<<8)|('e'<<16)|('l'<<24))
#define ORC_X86_AuthenticAMD (('c'<<0)|('A'<<8)|('M'<<16)|('D'<<24))
#define ORC_X86_CentaurHauls (('a'<<0)|('u'<<8)|('l'<<16)|('s'<<24))
#define ORC_X86_CyrixInstead (('t'<<0)|('e'<<8)|('a'<<16)|('d'<<24))
#define ORC_X86_GenuineTMx86 (('M'<<0)|('x'<<8)|('8'<<16)|('6'<<24))
#define ORC_X86_Geode_by_NSC ((' '<<0)|('N'<<8)|('S'<<16)|('6'<<24))
#define ORC_X86_NexGenDriven (('i'<<0)|('v'<<8)|('e'<<16)|('n'<<24))
#define ORC_X86_RiseRiseRise (('R'<<0)|('i'<<8)|('s'<<16)|('e'<<24))
#define ORC_X86_SiS_SiS_SiS_ (('S'<<0)|('i'<<8)|('S'<<16)|(' '<<24))
#define ORC_X86_UMC_UMC_UMC_ (('U'<<0)|('M'<<8)|('C'<<16)|(' '<<24))
#define ORC_X86_VIA_VIA_VIA_ (('V'<<0)|('I'<<8)|('A'<<16)|(' '<<24))
switch (orc_x86_vendor) {
case ORC_X86_GenuineIntel:
orc_sse_detect_cpuid_intel (level);
break;
case ORC_X86_AuthenticAMD:
orc_sse_detect_cpuid_amd (level);
break;
default:
ORC_INFO("unhandled vendor %08x %08x %08x", ebx, edx, orc_x86_vendor);
orc_sse_detect_cpuid_generic (level);
break;
}
if (orc_compiler_flag_check ("-sse2")) {
orc_x86_sse_flags &= ~ORC_TARGET_SSE_SSE2;
}
if (orc_compiler_flag_check ("-sse3")) {
orc_x86_sse_flags &= ~ORC_TARGET_SSE_SSE3;
}
if (orc_compiler_flag_check ("-ssse3")) {
orc_x86_sse_flags &= ~ORC_TARGET_SSE_SSSE3;
}
if (orc_compiler_flag_check ("-sse41")) {
orc_x86_sse_flags &= ~ORC_TARGET_SSE_SSE4_1;
}
if (orc_compiler_flag_check ("-sse42")) {
orc_x86_sse_flags &= ~ORC_TARGET_SSE_SSE4_2;
}
if (orc_compiler_flag_check ("-sse4a")) {
orc_x86_sse_flags &= ~ORC_TARGET_SSE_SSE4A;
}
if (orc_compiler_flag_check ("-sse5")) {
orc_x86_sse_flags &= ~ORC_TARGET_SSE_SSE5;
}
}
char orc_x86_processor_string[49];
static void
orc_x86_cpuid_get_branding_string (void)
{
get_cpuid (0x80000002,
(orc_uint32 *)(orc_x86_processor_string+0),
(orc_uint32 *)(orc_x86_processor_string+4),
(orc_uint32 *)(orc_x86_processor_string+8),
(orc_uint32 *)(orc_x86_processor_string+12));
get_cpuid (0x80000003,
(orc_uint32 *)(orc_x86_processor_string+16),
(orc_uint32 *)(orc_x86_processor_string+20),
(orc_uint32 *)(orc_x86_processor_string+24),
(orc_uint32 *)(orc_x86_processor_string+28));
get_cpuid (0x80000004,
(orc_uint32 *)(orc_x86_processor_string+32),
(orc_uint32 *)(orc_x86_processor_string+36),
(orc_uint32 *)(orc_x86_processor_string+40),
(orc_uint32 *)(orc_x86_processor_string+44));
ORC_INFO ("processor string '%s'", orc_x86_processor_string);
_orc_cpu_name = orc_x86_processor_string;
}
static void
orc_x86_cpuid_handle_standard_flags (void)
{
orc_uint32 eax, ebx, ecx, edx;
get_cpuid (0x00000001, &eax, &ebx, &ecx, &edx);
if (edx & (1<<23)) {
orc_x86_mmx_flags |= ORC_TARGET_MMX_MMX;
}
if (edx & (1<<26)) {
orc_x86_sse_flags |= ORC_TARGET_SSE_SSE2;
orc_x86_mmx_flags |= ORC_TARGET_MMX_MMXEXT;
}
if (ecx & (1<<0)) {
orc_x86_sse_flags |= ORC_TARGET_SSE_SSE3;
}
if (ecx & (1<<9)) {
orc_x86_sse_flags |= ORC_TARGET_SSE_SSSE3;
orc_x86_mmx_flags |= ORC_TARGET_MMX_SSSE3;
}
if (ecx & (1<<19)) {
orc_x86_sse_flags |= ORC_TARGET_SSE_SSE4_1;
orc_x86_mmx_flags |= ORC_TARGET_MMX_SSE4_1;
}
if (ecx & (1<<20)) {
orc_x86_sse_flags |= ORC_TARGET_SSE_SSE4_2;
}
}
static void
orc_x86_cpuid_handle_family_model_stepping (void)
{
orc_uint32 eax, ebx, ecx, edx;
int family_id;
int model_id;
int ext_family_id;
int ext_model_id;
get_cpuid (0x00000001, &eax, &ebx, &ecx, &edx);
family_id = (eax>>8)&0xf;
model_id = (eax>>4)&0xf;
ext_family_id = (eax>>20)&0xff;
ext_model_id = (eax>>16)&0xf;
_orc_cpu_family = family_id + ext_family_id;
_orc_cpu_model = (ext_model_id << 4) | model_id;
_orc_cpu_stepping = eax&0xf;
ORC_INFO ("family_id %d model_id %d stepping %d",
_orc_cpu_family, _orc_cpu_model, _orc_cpu_stepping);
}
static void
orc_sse_detect_cpuid_generic (orc_uint32 level)
{
if (level >= 1) {
orc_x86_cpuid_handle_standard_flags ();
orc_x86_cpuid_handle_family_model_stepping ();
}
}
static void
orc_sse_detect_cpuid_intel (orc_uint32 level)
{
orc_uint32 eax, ebx, ecx, edx;
if (level >= 1) {
orc_x86_cpuid_handle_standard_flags ();
orc_x86_cpuid_handle_family_model_stepping ();
orc_x86_microarchitecture = ORC_X86_UNKNOWN;
if (_orc_cpu_family == 6) {
switch (_orc_cpu_model) {
case 6: /* Mendocino */
case 11: /* Tualatin-256 */
orc_x86_microarchitecture = ORC_X86_P6;
break;
case 15:
case 22:
orc_x86_microarchitecture = ORC_X86_CORE;
break;
case 23:
case 29:
orc_x86_microarchitecture = ORC_X86_PENRYN;
break;
case 26:
orc_x86_microarchitecture = ORC_X86_NEHALEM;
break;
case 28:
orc_x86_microarchitecture = ORC_X86_BONNELL;
break;
/* orc_x86_microarchitecture = ORC_X86_WESTMERE; */
/* orc_x86_microarchitecture = ORC_X86_SANDY_BRIDGE; */
}
} else if (_orc_cpu_family == 15) {
orc_x86_microarchitecture = ORC_X86_NETBURST;
}
}
if (level >= 2) {
get_cpuid (0x00000002, &eax, &ebx, &ecx, &edx);
if ((eax&0x80000000) == 0) {
handle_cache_descriptor ((eax>>8)&0xff);
handle_cache_descriptor ((eax>>16)&0xff);
handle_cache_descriptor ((eax>>24)&0xff);
}
if ((ebx&0x80000000) == 0) {
handle_cache_descriptor (ebx&0xff);
handle_cache_descriptor ((ebx>>8)&0xff);
handle_cache_descriptor ((ebx>>16)&0xff);
handle_cache_descriptor ((ebx>>24)&0xff);
}
if ((ecx&0x80000000) == 0) {
handle_cache_descriptor (ecx&0xff);
handle_cache_descriptor ((ecx>>8)&0xff);
handle_cache_descriptor ((ecx>>16)&0xff);
handle_cache_descriptor ((ecx>>24)&0xff);
}
if ((edx&0x80000000) == 0) {
handle_cache_descriptor (edx&0xff);
handle_cache_descriptor ((edx>>8)&0xff);
handle_cache_descriptor ((edx>>16)&0xff);
handle_cache_descriptor ((edx>>24)&0xff);
}
}
if (level >= 4) {
int i;
for(i=0;i<10;i++){
int type;
int level;
int l;
int p;
int w;
int s;
get_cpuid_ecx (0x00000004, i, &eax, &ebx, &ecx, &edx);
type = eax&0xf;
if (type == 0) break;
level = (eax>>5)&0x7;
l = ((ebx>>0)&0xfff)+1;
p = ((ebx>>12)&0x3ff)+1;
w = ((ebx>>22)&0x3ff)+1;
s = ecx + 1;
ORC_INFO ("type %d level %d line size %d partitions %d ways %d sets %d",
type, level, l, p, w, s);
if (type == 1 || type == 3) {
switch (level) {
case 1:
_orc_data_cache_size_level1 = l*p*w*s;
break;
case 2:
_orc_data_cache_size_level2 = l*p*w*s;
break;
case 3:
_orc_data_cache_size_level3 = l*p*w*s;
break;
}
}
}
}
get_cpuid (0x80000000, &level, &ebx, &ecx, &edx);
if (level >= 4) {
orc_x86_cpuid_get_branding_string ();
}
}
static void
orc_sse_detect_cpuid_amd (orc_uint32 level)
{
orc_uint32 eax, ebx, ecx, edx;
if (level >= 1) {
orc_x86_cpuid_handle_standard_flags ();
orc_x86_cpuid_handle_family_model_stepping ();
orc_x86_microarchitecture = ORC_X86_UNKNOWN;
switch (_orc_cpu_family) {
case 5:
/* Don't know if 8 is correct */
if (_orc_cpu_model < 8) {
orc_x86_microarchitecture = ORC_X86_K5;
} else {
orc_x86_microarchitecture = ORC_X86_K6;
}
break;
case 6:
orc_x86_microarchitecture = ORC_X86_K7;
break;
case 0xf:
orc_x86_microarchitecture = ORC_X86_K8;
break;
case 0x10:
orc_x86_microarchitecture = ORC_X86_K10;
break;
default:
break;
}
}
get_cpuid (0x80000000, &level, &ebx, &ecx, &edx);
if (level >= 1) {
get_cpuid (0x80000001, &eax, &ebx, &ecx, &edx);
/* AMD flags */
if (ecx & (1<<6)) {
orc_x86_sse_flags |= ORC_TARGET_SSE_SSE4A;
}
if (ecx & (1<<11)) {
orc_x86_sse_flags |= ORC_TARGET_SSE_SSE5;
}
if (edx & (1<<22)) {
orc_x86_mmx_flags |= ORC_TARGET_MMX_MMXEXT;
}
if (edx & (1<<31)) {
orc_x86_mmx_flags |= ORC_TARGET_MMX_3DNOW;
}
if (edx & (1<<30)) {
orc_x86_mmx_flags |= ORC_TARGET_MMX_3DNOWEXT;
}
}
if (level >= 4) {
orc_x86_cpuid_get_branding_string ();
}
if (level >= 6) {
get_cpuid (0x80000005, &eax, &ebx, &ecx, &edx);
_orc_data_cache_size_level1 = ((ecx>>24)&0xff) * 1024;
ORC_INFO ("L1 D-cache: %d kbytes, %d-way, %d lines/tag, %d line size",
(ecx>>24)&0xff, (ecx>>16)&0xff, (ecx>>8)&0xff, ecx&0xff);
ORC_INFO ("L1 I-cache: %d kbytes, %d-way, %d lines/tag, %d line size",
(edx>>24)&0xff, (edx>>16)&0xff, (edx>>8)&0xff, edx&0xff);
get_cpuid (0x80000006, &eax, &ebx, &ecx, &edx);
_orc_data_cache_size_level2 = ((ecx>>16)&0xffff) * 1024;
ORC_INFO ("L2 cache: %d kbytes, %d assoc, %d lines/tag, %d line size",
(ecx>>16)&0xffff, (ecx>>12)&0xf, (ecx>>8)&0xf, ecx&0xff);
}
}
unsigned int
orc_sse_get_cpu_flags(void)
{
orc_x86_detect_cpuid ();
return orc_x86_sse_flags;
}
unsigned int
orc_mmx_get_cpu_flags(void)
{
orc_x86_detect_cpuid ();
return orc_x86_mmx_flags;
}