diff --git a/manual/tunables.texi b/manual/tunables.texi index 3dc6f9a..3e1e519 100644 --- a/manual/tunables.texi +++ b/manual/tunables.texi @@ -364,7 +364,11 @@ set shared cache size in bytes for use in memory and string routines. @deftp Tunable glibc.tune.x86_non_temporal_threshold The @code{glibc.tune.x86_non_temporal_threshold} tunable allows the user -to set threshold in bytes for non temporal store. +to set threshold in bytes for non temporal store. Non temporal stores +give a hint to the hardware to move data directly to memory without +displacing other data from the cache. This tunable is used by some +platforms to determine when to use non temporal stores in operations +like memmove and memcpy. This tunable is specific to i386 and x86-64. @end deftp diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c index b9444dd..42b468d 100644 --- a/sysdeps/x86/cacheinfo.c +++ b/sysdeps/x86/cacheinfo.c @@ -778,14 +778,20 @@ intel_bug_no_cache_info: __x86_shared_cache_size = shared; } - /* The large memcpy micro benchmark in glibc shows that 6 times of - shared cache size is the approximate value above which non-temporal - store becomes faster on a 8-core processor. This is the 3/4 of the - total shared cache size. */ + /* The default setting for the non_temporal threshold is 3/4 of one + thread's share of the chip's cache. For most Intel and AMD processors + with an initial release date between 2017 and 2020, a thread's typical + share of the cache is from 500 KBytes to 2 MBytes. Using the 3/4 + threshold leaves 125 KBytes to 500 KBytes of the thread's data + in cache after a maximum temporal copy, which will maintain + in cache a reasonable portion of the thread's stack and other + active data. If the threshold is set higher than one thread's + share of the cache, it has a substantial risk of negatively + impacting the performance of other threads running on the chip. */ __x86_shared_non_temporal_threshold = (cpu_features->non_temporal_threshold != 0 ? cpu_features->non_temporal_threshold - : __x86_shared_cache_size * threads * 3 / 4); + : __x86_shared_cache_size * 3 / 4); } #endif