diff --git a/COMMIT b/COMMIT index d92e801..b5b8411 100644 --- a/COMMIT +++ b/COMMIT @@ -1 +1 @@ -853ab1113c4eabf7218dfab673e433588fe7a8c4 \ No newline at end of file +7a33bedc4bb3dff4e57c00293a2d70890db4d983 \ No newline at end of file diff --git a/CONTRIBUTORS b/CONTRIBUTORS new file mode 100644 index 0000000..7571183 --- /dev/null +++ b/CONTRIBUTORS @@ -0,0 +1,15 @@ +The following developers have all contributed bug fixes to the open source +version of the PSM library. Intel gratefully thanks them for their +contributions: + +Michal Schmidt (michich on github.com) +Lisanna Dettwyler (LisannaDettwyler on github.com) +Ana Guerrero Lopez (ana on github.com) +Brian Smith (bsmith94 on github.com) +Michael J OConnor (michael-j-oconnor on github.com) +Nicolas Morey-Chaismartin (nmorey on github.com) +Bernhard M. Wiedemann (bmwidemann on github.com) +Dmitry (dmitrygx on github.com) +Florian Weimer (fweimer on github.com) +Jonas Hahnfeld (hahnjo on github.com) +Tom Stellard (tstellar on github.com) diff --git a/COPYING b/COPYING index ea3d558..d0d6f87 100644 --- a/COPYING +++ b/COPYING @@ -313,64 +313,3 @@ PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. - - - Copyright (C) - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - - -Also add information on how to contact you by electronic and paper mail. - -If the program is interactive, make it output a short notice like this -when it starts in an interactive mode: - - Gnomovision version 69, Copyright (C) year name of author - Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, the commands you use may -be called something other than `show w' and `show c'; they could even be -mouse-clicks or menu items--whatever suits your program. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the program, if -necessary. 
Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the program - `Gnomovision' (which makes passes at compilers) written by James Hacker. - - , 1 April 1989 - Ty Coon, President of Vice - -This General Public License does not permit incorporating your program into -proprietary programs. If your program is a subroutine library, you may -consider it more useful to permit linking proprietary applications with the -library. If this is what you want to do, use the GNU Library General -Public License instead of this License. - diff --git a/Makefile b/Makefile index 8f51f46..5a31d64 100644 --- a/Makefile +++ b/Makefile @@ -164,7 +164,7 @@ nthreads := $(shell echo $$(( `nproc` * 2 )) ) # The DISTRO variable is used subsequently for variable # behaviors of the 3 distros. -DISTRO := $(shell . /etc/os-release; echo $$ID) +DISTRO := $(shell . /etc/os-release; if [[ "$$ID" == "sle_hpc" ]]; then ID="sles"; fi; echo $$ID) # By default the following two variables have the following values: LIBPSM2_COMPAT_CONF_DIR := /etc @@ -277,7 +277,7 @@ OSVERSION := $(shell grep VERSION_ID /etc/os-release | cut -f 2 -d\" | cut -f 1 OSSUBVERSION := $(shell grep VERSION_ID /etc/os-release | cut -f 2 -d\" | cut -f 2 -d.) override RPM_NAME_BASEEXT := $(shell \ - if [ "$(OS)" = "SLES" ]; then \ + if [ "$(OS)" = "SLES" -o "$(OS)" = "SLE_HPC" ]; then \ if [ $(OSVERSION) -gt 11 ]; then \ if [ $(OSVERSION) -eq 12 ]; then \ if [ $(OSSUBVERSION) -gt 2 ]; then \ @@ -483,7 +483,7 @@ dist: distclean PRUNE_LIST=""; \ for pd in ".git" "cscope*" "$(shell realpath --relative-to=${top_srcdir} ${OUTDIR})" \ "*.orig" "*~" "#*" ".gitignore" "doc" "libcm" "psm.supp" "test" "psm_hal_MOCK" \ - "tools" "artifacts" "*.rej.patch"; do \ + "psm_test" "tools" "artifacts" "*.rej.patch"; do \ PRUNE_LIST="$$PRUNE_LIST -name $$pd -prune -o"; \ done; \ for hid in psm_hal_* ; do \ diff --git a/README b/README index a6efb40..7990555 100644 --- a/README +++ b/README @@ -67,6 +67,7 @@ Contains the following sections: - INSTALLING * INSTALLING USING MAKEFILE * INSTALLING USING EITHER YUM OR DNF +- TESTING - RELATED SOFTWARE TO PSM2 - SUPPORTING DOCUMENTATION @@ -181,6 +182,10 @@ BUILDING USING RPMBUILD Other supporting RPM package names will be as listed above. +2. To build rpm files from the srpm file with Intel C (icc), specify the + correct CCARCH in the rpmbuild environment: + $ env CCARCH=icc rpmbuild --rebuild SRPMS/libpsm2-10.3.7-1.src.rpm + INSTALLING ========== diff --git a/buildflags.mak b/buildflags.mak index 6790fb7..7c3cda0 100644 --- a/buildflags.mak +++ b/buildflags.mak @@ -60,19 +60,11 @@ endif export os ?= $(shell uname -s | tr '[A-Z]' '[a-z]') export arch := $(shell uname -m | sed -e 's,\(i[456]86\|athlon$$\),i386,') -ifeq (${CCARCH},gcc) - export CC := gcc +ifeq (${CCARCH},$(filter ${CCARCH},gcc gcc4 icc clang)) + export CC := ${CCARCH} else - ifeq (${CCARCH},gcc4) - export CC := gcc4 - else - ifeq (${CCARCH},icc) - export CC := icc - else - anerr := $(error Unknown C compiler arch: ${CCARCH}) - endif # ICC - endif # gcc4 -endif # gcc + anerr := $(error Unknown C compiler arch: ${CCARCH}) +endif ifeq (${FCARCH},gfortran) export FC := gfortran @@ -108,48 +100,48 @@ BASECFLAGS +=-Wall $(WERROR) # test if compiler supports 32B(AVX2)/64B(AVX512F) move instruction. 
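
A note on the probe implemented in the hunk that follows: the Makefile compiles an empty program with the candidate -mavx2/-mavx512f flag and greps the preprocessor's -E -dM macro dump. What that grep detects are the standard GCC/Clang predefined macros (__AVX__, __AVX2__, __AVX512F__). A minimal C sketch of the same check done in-source, for illustration only:

    /* probe_avx.c -- build with e.g. "cc -mavx2 probe_avx.c".
     * The Makefile's shell probe greps the "cc -E -dM" output for
     * these same predefined macros instead of compiling a file. */
    #include <stdio.h>

    int main(void)
    {
    #if defined(__AVX512F__)
            puts("AVX-512F code generation available");
    #elif defined(__AVX2__)
            puts("AVX2 code generation available");
    #elif defined(__AVX__)
            puts("AVX code generation available");
    #else
            puts("no AVX support at this -m level");
    #endif
            return 0;
    }
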
# ifeq (${CC},icc) - ifeq ($(PSM_DISABLE_AVX2),) - MAVX2=-xATOM_SSE4.2 -DPSM_AVX512 - else - MAVX2=-march=core-avx-i - endif + ifeq ($(PSM_DISABLE_AVX2),) + MAVX2=-xATOM_SSE4.2 -DPSM_AVX512 + else + MAVX2=-march=core-avx-i + endif else - ifeq ($(PSM_DISABLE_AVX2),) - MAVX2=-mavx2 - else - MAVX2=-mavx - endif + ifeq ($(PSM_DISABLE_AVX2),) + MAVX2=-mavx2 + else + MAVX2=-mavx + endif endif ifneq (icc,${CC}) - ifeq ($(PSM_DISABLE_AVX2),) - RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX2 ; echo $$?) - else - RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX ; echo $$?) - $(warning ***NOTE TO USER**** Disabling AVX2 will harm performance) - endif - - ifeq (0,${RET}) - BASECFLAGS += ${MAVX2} - else - $(error Compiler does not support ${MAVX2} ) - endif + ifeq ($(PSM_DISABLE_AVX2),) + RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX2 ; echo $$?) + else + RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX ; echo $$?) + $(warning ***NOTE TO USER**** Disabling AVX2 will harm performance) + endif + + ifeq (0,${RET}) + BASECFLAGS += ${MAVX2} + else + $(error Compiler does not support ${MAVX2} ) + endif else - BASECFLAGS += ${MAVX2} + BASECFLAGS += ${MAVX2} endif # This support is dynamic at runtime, so is OK to enable as long as compiler can generate # the code. ifneq (,${PSM_AVX512}) - ifneq (icc,${CC}) - RET := $(shell echo "int main() {}" | ${CC} -mavx512f -E -dM -xc - 2>&1 | grep -q AVX512 ; echo $$?) - ifeq (0,${RET}) - BASECFLAGS += -mavx512f - else - $(error Compiler does not support AVX512 ) - endif - BASECFLAGS += -DPSM_AVX512 - endif + ifneq (icc,${CC}) + RET := $(shell echo "int main() {}" | ${CC} -mavx512f -E -dM -xc - 2>&1 | grep -q AVX512 ; echo $$?) 
+ ifeq (0,${RET}) + BASECFLAGS += -mavx512f + else + $(error Compiler does not support AVX512 ) + endif + BASECFLAGS += -DPSM_AVX512 + endif endif # @@ -158,38 +150,42 @@ endif BASECFLAGS += -D_DEFAULT_SOURCE -D_SVID_SOURCE -D_BSD_SOURCE ifneq (,${HFI_BRAKE_DEBUG}) - BASECFLAGS += -DHFI_BRAKE_DEBUG + BASECFLAGS += -DHFI_BRAKE_DEBUG +endif +ifneq (,${PSM_FI}) + BASECFLAGS += -DPSM_FI endif ifneq (,${PSM_DEBUG}) - BASECFLAGS += -O -g3 -DPSM_DEBUG -D_HFI_DEBUGGING -funit-at-a-time -Wp,-D_FORTIFY_SOURCE=2 + BASECFLAGS += -O -g3 -DPSM_DEBUG -D_HFI_DEBUGGING -funit-at-a-time -Wp,-D_FORTIFY_SOURCE=2 else - BASECFLAGS += -O3 -g3 + BASECFLAGS += -O3 -g3 endif ifneq (,${PSM_COVERAGE}) # This check must come after PSM_DEBUG to override optimization setting - BASECFLAGS += -O -fprofile-arcs -ftest-coverage - LDFLAGS += -fprofile-arcs + BASECFLAGS += -O -fprofile-arcs -ftest-coverage + LDFLAGS += -fprofile-arcs endif ifneq (,${PSM_LOG}) - BASECFLAGS += -DPSM_LOG + BASECFLAGS += -DPSM_LOG ifneq (,${PSM_LOG_FAST_IO}) - BASECFLAGS += -DPSM_LOG_FAST_IO - PSM2_ADDITIONAL_GLOBALS += psmi_log_fini;psmi_log_message; + BASECFLAGS += -DPSM_LOG_FAST_IO + PSM2_ADDITIONAL_GLOBALS += psmi_log_fini;psmi_log_message; endif endif ifneq (,${PSM_PERF}) - BASECFLAGS += -DRDPMC_PERF_FRAMEWORK + BASECFLAGS += -DRDPMC_PERF_FRAMEWORK endif ifneq (,${PSM_HEAP_DEBUG}) - BASECFLAGS += -DPSM_HEAP_DEBUG - PSM2_ADDITIONAL_GLOBALS += _psmi_heapdebug_val_heapallocs; + BASECFLAGS += -DPSM_HEAP_DEBUG + PSM2_ADDITIONAL_GLOBALS += _psmi_heapdebug_val_heapallocs; endif ifneq (,${PSM_PROFILE}) - BASECFLAGS += -DPSM_PROFILE + BASECFLAGS += -DPSM_PROFILE endif +BASECFLAGS += -DNVIDIA_GPU_DIRECT ifneq (,${PSM_CUDA}) - BASECFLAGS += -DNVIDIA_GPU_DIRECT -DPSM_CUDA - CUDA_HOME ?= /usr/local/cuda - INCLUDES += -I$(CUDA_HOME)/include + BASECFLAGS += -DPSM_CUDA + CUDA_HOME ?= /usr/local/cuda + INCLUDES += -I$(CUDA_HOME)/include endif BASECFLAGS += -fpic -fPIC -D_GNU_SOURCE @@ -199,15 +195,16 @@ ASFLAGS += -g3 -fpic BASECFLAGS += ${OPA_CFLAGS} ifeq (${CCARCH},icc) - BASECFLAGS += -fpic -fPIC -D_GNU_SOURCE -DPACK_STRUCT_STL=packed, - LDFLAGS += -static-intel + BASECFLAGS += -fpic -fPIC -D_GNU_SOURCE -DPACK_STRUCT_STL=packed, + LDFLAGS += -static-intel else - ifeq (${CCARCH},gcc) - BASECFLAGS += -funwind-tables -Wno-strict-aliasing -Wformat-security + LDFLAGS += -Wl,--build-id + ifeq (${CCARCH},$(filter ${CCARCH},gcc clang)) + BASECFLAGS += -funwind-tables -Wno-strict-aliasing -Wformat-security else - ifneq (${CCARCH},gcc4) - $(error Unknown compiler arch "${CCARCH}") - endif # gcc4 + ifneq (${CCARCH},gcc4) + $(error Unknown compiler arch "${CCARCH}") + endif # gcc4 endif # gcc endif # icc diff --git a/compat/buildflags.mak b/compat/buildflags.mak index b448e4e..db34848 100644 --- a/compat/buildflags.mak +++ b/compat/buildflags.mak @@ -57,19 +57,11 @@ export os ?= $(shell uname -s | tr '[A-Z]' '[a-z]') export arch := $(shell uname -m | sed -e 's,\(i[456]86\|athlon$$\),i386,') export CCARCH ?= gcc -ifeq (${CCARCH},gcc) - export CC := gcc +ifeq (${CCARCH},$(filter ${CCARCH},gcc gcc4 icc clang)) + export CC := ${CCARCH} else - ifeq (${CCARCH},gcc4) - export CC := gcc4 - else - ifeq (${CCARCH},icc) - export CC := icc - else - anerr := $(error Unknown C compiler arch: ${CCARCH}) - endif # ICC - endif # gcc4 -endif # gcc + anerr := $(error Unknown C compiler arch: ${CCARCH}) +endif BASECFLAGS += $(BASE_FLAGS) LDFLAGS += $(BASE_FLAGS) @@ -90,7 +82,7 @@ ifeq (${CCARCH},icc) BASECFLAGS += -O3 -g3 LDFLAGS += -static-intel else - ifeq (${CCARCH},gcc) + ifeq 
(${CCARCH},$(filter ${CCARCH},gcc clang)) BASECFLAGS += -Wno-strict-aliasing else ifneq (${CCARCH},gcc4) diff --git a/compat/psm-compat.c b/compat/psm-compat.c index 7d12165..258851c 100644 --- a/compat/psm-compat.c +++ b/compat/psm-compat.c @@ -242,11 +242,11 @@ psm_mq_setopt(psm2_mq_t mq, int key, const void *value) } psm2_error_t -psm_mq_init(psm2_ep_t ep, uint64_t tag_order_mask, +psm_mq_init(psm2_ep_t ep, uint64_t ignored, const struct psm2_optkey *opts, int numopts, psm2_mq_t *mqo) { - return psm2_mq_init(ep, tag_order_mask, opts, numopts, mqo); + return psm2_mq_init(ep, ignored, opts, numopts, mqo); } psm2_error_t diff --git a/include/opa_service.h b/include/opa_service.h index 3ec4824..1e7c9ab 100644 --- a/include/opa_service.h +++ b/include/opa_service.h @@ -84,8 +84,11 @@ int hfi_get_ctrs_port_names(int unitno, char **namep); /* sysfs helper routines (only those currently used are exported; * try to avoid using others) */ -/* Initializes the following sysfs helper routines. */ -void sysfs_init(const char *dflt_hfi_class_path); +/* Initializes the following sysfs helper routines. + sysfs_init() returns 0 on success, non-zero on an error: */ +int sysfs_init(const char *dflt_hfi_class_path); +/* Complementary */ +void sysfs_fini(void); /* read a string value into buff, no more than size bytes. returns the number of bytes read */ diff --git a/include/opa_user.h b/include/opa_user.h index 637dacb..624b8b2 100644 --- a/include/opa_user.h +++ b/include/opa_user.h @@ -133,14 +133,8 @@ #ifdef PSM_CUDA extern int is_driver_gpudirect_enabled; -static __inline__ int _psmi_is_driver_gpudirect_enabled() __attribute__((always_inline)); - -static __inline__ int -_psmi_is_driver_gpudirect_enabled() -{ - return is_driver_gpudirect_enabled; -} -#define PSMI_IS_DRIVER_GPUDIRECT_ENABLED _psmi_is_driver_gpudirect_enabled() +#define PSMI_IS_DRIVER_GPUDIRECT_ENABLED likely(is_driver_gpudirect_enabled) +#define PSMI_IS_DRIVER_GPUDIRECT_DISABLED unlikely(!is_driver_gpudirect_enabled) #endif /* hfi kdeth header format */ diff --git a/include/rbtree.c b/include/rbtree.c index 9d6930d..b79f135 100644 --- a/include/rbtree.c +++ b/include/rbtree.c @@ -85,13 +85,22 @@ #include /* for memset declaration */ -#if !defined ( RBTREE_GET_LEFTMOST ) || \ +// RBTREE_CMP should be a comparator, i.e. RBTREE_CMP(a, b) should evaluate to +// -1, 0, or 1 depending on if a < b, a == b, or a > b, respectively. +#ifdef RBTREE_CMP + +#if defined(RBTREE_GET_LEFTMOST) || defined(RBTREE_GET_RIGHTMOST) +#error Cannot define both RBTREE_CMP and RBTREE_GET_(LEFT|RIGHT)MOST +#endif + +#elif !defined ( RBTREE_GET_LEFTMOST ) || \ ! defined ( RBTREE_GET_RIGHTMOST ) || \ ! defined ( RBTREE_MAP_COUNT ) || \ ! defined ( RBTREE_ASSERT ) #error "You must define RBTREE_GET_LEFTMOST and RBTREE_GET_RIGHTMOST and \ RBTREE_MAP_COUNT and RBTREE_ASSERT before including rbtree.c" -#endif + +#endif /* RBTREE_CMP */ #define IN /* nothing */ @@ -117,13 +126,24 @@ static void ips_cl_qmap_remove_item( static cl_map_item_t* ips_cl_qmap_successor( IN cl_qmap_t* const p_map, IN const cl_map_item_t* p_item); + + +#ifndef RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR static cl_map_item_t* ips_cl_qmap_predecessor( IN cl_qmap_t* const p_map, IN const cl_map_item_t* p_item); +#endif + +#if defined(RBTREE_GET_LEFTMOST) static cl_map_item_t* ips_cl_qmap_search( IN cl_qmap_t* const p_map, IN unsigned long start, IN unsigned long end); +#else +static cl_map_item_t* ips_cl_qmap_searchv( + cl_qmap_t* const p_map, + const RBTREE_MI_PL *key); +#endif /* * Get the root. 
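
Before the insert/search hunks below, a note on the new comparator mode: RBTREE_CMP replaces the leftmost/rightmost accessors with a single three-way comparison, and rbtree.c then switches its insert and search paths on it. A hypothetical includer would look roughly like this (the payload and key names are illustrative, not taken from the real tree users, and other required knobs such as RBTREE_MAP_COUNT are omitted):

    #include <assert.h>

    /* Illustrative payload keyed by one integer. */
    struct my_payload {
            unsigned long key;
    };

    #define RBTREE_MI_PL struct my_payload
    #define RBTREE_ASSERT(x) assert(x)
    /* Three-way comparator per the contract above: -1, 0, or 1. */
    #define RBTREE_CMP(a, b) (((a)->key < (b)->key) ? -1 : \
                              (((a)->key > (b)->key) ? 1 : 0))
    /* In CMP mode ips_cl_qmap_search() is not emitted, which can
     * leave ips_cl_qmap_predecessor() unused; suppress it so
     * -Werror -Wunused-function builds stay clean. */
    #define RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR

    #include "rbtree.c"
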
@@ -380,7 +400,11 @@ ips_cl_qmap_insert_item( p_insert_at = p_comp_item; /* Traverse the tree until the correct insertion point is found. */ +#ifdef RBTREE_GET_LEFTMOST if( RBTREE_GET_LEFTMOST(&p_item->payload) < RBTREE_GET_LEFTMOST(&p_insert_at->payload) ) +#else + if(RBTREE_CMP(&p_item->payload, &p_insert_at->payload) < 0) +#endif { p_comp_item = p_insert_at->p_left; compare_res = 1; @@ -604,6 +628,11 @@ ips_cl_qmap_successor( } } +// When includer defines RBTREE_CMP, ips_cl_qmap_search() is not emitted. +// When this happens, ips_cl_qmap_predecessor() may not be called. +// Combined with -Werror -Wunused-function, libpsm2 fails to build. +// So provide macro to control emitting this function +#ifndef RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR static cl_map_item_t * ips_cl_qmap_predecessor( IN cl_qmap_t* const p_map, @@ -627,7 +656,9 @@ ips_cl_qmap_predecessor( return p_tmp; } } +#endif /* RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR */ +#if defined(RBTREE_GET_LEFTMOST) /* * return the first node with buffer overlapping or zero. */ @@ -690,3 +721,23 @@ ips_cl_qmap_search(cl_qmap_t * const p_map, return p_item; } +#else /* defined(...LEFTMOST) || defined(...RIGHTMOST) */ +static cl_map_item_t * +ips_cl_qmap_searchv(cl_qmap_t * const p_map, const RBTREE_MI_PL *key) +{ + RBTREE_ASSERT( p_map ); + cl_map_item_t *p_item = __cl_map_root(p_map); + + while (p_item != p_map->nil_item) { + if (RBTREE_CMP(key, &p_item->payload) > 0) { + p_item = p_item->p_right; + } else if (RBTREE_CMP(key, &p_item->payload) < 0) { + p_item = p_item->p_left; + } else { + break; + } + } + + return p_item; +} +#endif /* defined(...LEFTMOST) || defined(...RIGHTMOST) */ diff --git a/libpsm2.spec.in b/libpsm2.spec.in index b033dff..e9ef766 100644 --- a/libpsm2.spec.in +++ b/libpsm2.spec.in @@ -71,8 +71,8 @@ Obsoletes: hfi1-psm < 1.0.0 %if "@RPM_NAME_BASEEXT@" %package -n @RPM_NAME@@RPM_NAME_BASEEXT@ -%endif Summary: Intel PSM2 Libraries +%endif Provides: @RPM_NAME@ = %{version}-%{release} Provides: @RPM_NAME@%{_isa} = %{version}-%{release} %if 0%{?suse_version} diff --git a/makesrpm.sh b/makesrpm.sh index 31caa01..18f74e2 100755 --- a/makesrpm.sh +++ b/makesrpm.sh @@ -80,7 +80,7 @@ function usage() echo " -d , -dir " echo " Optionally sets output folder for rpmbuild to use" echo " -h , -hal_gen " - echo " Optional, default is includes all HAL generations" + echo " Optional, default is to build gen1" echo " Sets hal generations for rpmbuild to use" echo " Examples:" echo " $0 b" @@ -142,7 +142,7 @@ while [ "$1" != "" ]; do done if [ "$HAL_GENS" = "" ]; then - HAL_GENS="*" + HAL_GENS="gen1" fi # Generic cleanup, build, and tmp folder creation diff --git a/opa/opa_dwordcpy-i386.S b/opa/opa_dwordcpy-i386.S index f3d898d..863941b 100644 --- a/opa/opa_dwordcpy-i386.S +++ b/opa/opa_dwordcpy-i386.S @@ -53,6 +53,10 @@ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ +#ifdef __CET__ +#include +#endif + .globl hfi_dwordcpy .file "opa_dword32cpy.S" .text @@ -61,6 +65,9 @@ hfi_dwordcpy: // standard C calling convention, args on stack // does not return any value .type hfi_dwordcpy, @function +#ifdef _CET_ENDBR + _CET_ENDBR +#endif // save caller-saved regs mov %edi,%eax mov %esi,%edx diff --git a/opa/opa_dwordcpy-x86_64-fast.S b/opa/opa_dwordcpy-x86_64-fast.S index fe07ebf..12fe9a3 100644 --- a/opa/opa_dwordcpy-x86_64-fast.S +++ b/opa/opa_dwordcpy-x86_64-fast.S @@ -53,6 +53,10 @@ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ +#ifdef __CET__ +#include +#endif + .globl hfi_dwordcpy .file "opa_dwordcpy-x86_64-fast.S" .text @@ -61,6 +65,9 @@ // does not return any value hfi_dwordcpy: .type hfi_dwordcpy, @function +#ifdef _CET_ENDBR + _CET_ENDBR +#endif movl %edx,%ecx shrl $1,%ecx andl $1,%edx diff --git a/opa/opa_sysfs.c b/opa/opa_sysfs.c index 91446ec..854f604 100644 --- a/opa/opa_sysfs.c +++ b/opa/opa_sysfs.c @@ -72,15 +72,20 @@ static size_t sysfs_path_len; static char *hfifs_path; static long sysfs_page_size; -void sysfs_init(const char *dflt_hfi_class_path) +int sysfs_init(const char *dflt_hfi_class_path) { + int rv = 0; + if (NULL != (sysfs_path = getenv("HFI_SYSFS_PATH"))) { char *syspath = strdup(sysfs_path); if (!syspath) + { _HFI_DBG("Failed to strdup(\"%s\") for syspath.\n", sysfs_path); + rv = -1; + } else sysfs_path = syspath; } @@ -89,7 +94,10 @@ void sysfs_init(const char *dflt_hfi_class_path) char *syspath = malloc(len); if (!syspath) + { _HFI_DBG("Failed to alloc %u bytes for syspath.\n",len); + rv = -1; + } else { snprintf(syspath, len, "%s_0", dflt_hfi_class_path); @@ -104,6 +112,7 @@ void sysfs_init(const char *dflt_hfi_class_path) { _HFI_DBG("Did not find sysfs directory %s, using anyway\n", sysfs_path); + rv = -1; } else { @@ -125,6 +134,13 @@ void sysfs_init(const char *dflt_hfi_class_path) if (!sysfs_page_size) sysfs_page_size = sysconf(_SC_PAGESIZE); + + return rv; +} + +void sysfs_fini(void) +{ + free(sysfs_path); } const char *hfi_sysfs_path(void) diff --git a/psm.c b/psm.c index cb12dc5..7f929ce 100644 --- a/psm.c +++ b/psm.c @@ -65,11 +65,14 @@ static int psmi_verno = PSMI_VERNO_MAKE(PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR); static int psmi_verno_client_val; int psmi_epid_ver; +// Special psmi_refcount values #define PSMI_NOT_INITIALIZED 0 -#define PSMI_INITIALIZED 1 -#define PSMI_FINALIZED -1 /* Prevent the user from calling psm2_init - * once psm_finalize has been called. 
*/ -static int psmi_isinit = PSMI_NOT_INITIALIZED; +#define PSMI_FINALIZED -1 + +// PSM2 doesn't support transitioning out of the PSMI_FINALIZED state +// once psmi_refcount is set to PSMI_FINALIZED, any further attempts to change +// psmi_refcount should be treated as an error +static int psmi_refcount = PSMI_NOT_INITIALIZED; /* Global lock used for endpoint creation and destroy * (in functions psm2_ep_open and psm2_ep_close) and also @@ -84,16 +87,56 @@ uint64_t *shared_affinity_ptr; char *sem_affinity_shm_rw_name; char *affinity_shm_name; +uint32_t psmi_cpu_model; + #ifdef PSM_CUDA int is_cuda_enabled; int is_gdr_copy_enabled; int device_support_gpudirect; +int gpu_p2p_supported = 0; +int my_gpu_device = 0; int cuda_lib_version; int is_driver_gpudirect_enabled; int is_cuda_primary_context_retain = 0; uint32_t cuda_thresh_rndv; uint32_t gdr_copy_threshold_send; uint32_t gdr_copy_threshold_recv; + +void *psmi_cuda_lib; +CUresult (*psmi_cuInit)(unsigned int Flags ); +CUresult (*psmi_cuCtxDetach)(CUcontext c); +CUresult (*psmi_cuCtxGetCurrent)(CUcontext *c); +CUresult (*psmi_cuCtxSetCurrent)(CUcontext c); +CUresult (*psmi_cuPointerGetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); +CUresult (*psmi_cuPointerSetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); +CUresult (*psmi_cuDeviceCanAccessPeer)(int *canAccessPeer, CUdevice dev, CUdevice peerDev); +CUresult (*psmi_cuDeviceGet)(CUdevice* device, int ordinal); +CUresult (*psmi_cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev); +CUresult (*psmi_cuDriverGetVersion)(int* driverVersion); +CUresult (*psmi_cuDeviceGetCount)(int* count); +CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags); +CUresult (*psmi_cuStreamDestroy)(CUstream phStream); +CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags); +CUresult (*psmi_cuEventDestroy)(CUevent hEvent); +CUresult (*psmi_cuEventQuery)(CUevent hEvent); +CUresult (*psmi_cuEventRecord)(CUevent hEvent, CUstream hStream); +CUresult (*psmi_cuEventSynchronize)(CUevent hEvent); +CUresult (*psmi_cuMemHostAlloc)(void** pp, size_t bytesize, unsigned int Flags); +CUresult (*psmi_cuMemFreeHost)(void* p); +CUresult (*psmi_cuMemcpy)(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); +CUresult (*psmi_cuMemcpyDtoD)(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); +CUresult (*psmi_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); +CUresult (*psmi_cuMemcpyHtoD)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount); +CUresult (*psmi_cuMemcpyDtoHAsync)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); +CUresult (*psmi_cuMemcpyHtoDAsync)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount, CUstream hStream); +CUresult (*psmi_cuIpcGetMemHandle)(CUipcMemHandle* pHandle, CUdeviceptr dptr); +CUresult (*psmi_cuIpcOpenMemHandle)(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags); +CUresult (*psmi_cuIpcCloseMemHandle)(CUdeviceptr dptr); +CUresult (*psmi_cuMemGetAddressRange)(CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr); +CUresult (*psmi_cuDevicePrimaryCtxGetState)(CUdevice dev, unsigned int* flags, int* active); +CUresult (*psmi_cuDevicePrimaryCtxRetain)(CUcontext* pctx, CUdevice dev); +CUresult (*psmi_cuCtxGetDevice)(CUdevice* device); +CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device); #endif /* @@ -102,9 +145,8 @@ uint32_t gdr_copy_threshold_recv; * It is supposed to be filled with logical OR * on conditional compilation basis * along 
with future features/capabilities. - * At the very beginning we start with Multi EPs. */ -uint64_t psm2_capabilities_bitset = PSM2_MULTI_EP_CAP; +uint64_t psm2_capabilities_bitset = PSM2_MULTI_EP_CAP | PSM2_LIB_REFCOUNT_CAP; int psmi_verno_client() { @@ -128,7 +170,7 @@ int psmi_verno_isinteroperable(uint16_t verno) int MOCKABLE(psmi_isinitialized)() { - return (psmi_isinit == PSMI_INITIALIZED); + return (psmi_refcount > 0); } MOCK_DEF_EPILOGUE(psmi_isinitialized); @@ -169,6 +211,7 @@ int psmi_cuda_lib_load() PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxSetCurrent); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuPointerGetAttribute); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuPointerSetAttribute); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceCanAccessPeer); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGetAttribute); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGet); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGetCount); @@ -229,23 +272,27 @@ int psmi_cuda_initialize() return err; } - CUdevice device; + CUdevice current_device; CUcontext primary_ctx; - PSMI_CUDA_CALL(cuCtxGetDevice, &device); + PSMI_CUDA_CALL(cuCtxGetDevice, ¤t_device); int is_ctx_active; unsigned ctx_flags; - PSMI_CUDA_CALL(cuDevicePrimaryCtxGetState, device, &ctx_flags, &is_ctx_active); + PSMI_CUDA_CALL(cuDevicePrimaryCtxGetState, current_device, &ctx_flags, + &is_ctx_active); if (!is_ctx_active) { /* There is an issue where certain CUDA API calls create * contexts but does not make it active which cause the * driver API call to fail with error 709 */ - PSMI_CUDA_CALL(cuDevicePrimaryCtxRetain, &primary_ctx, device); + PSMI_CUDA_CALL(cuDevicePrimaryCtxRetain, &primary_ctx, + current_device); is_cuda_primary_context_retain = 1; } /* Check if all devices support Unified Virtual Addressing. */ PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices); + device_support_gpudirect = 1; + for (dev = 0; dev < num_devices; dev++) { CUdevice device; PSMI_CUDA_CALL(cuDeviceGet, &device, dev); @@ -265,11 +312,24 @@ int psmi_cuda_initialize() &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device); - if (major >= 3) - device_support_gpudirect = 1; - else { + if (major < 3) { device_support_gpudirect = 0; - _HFI_INFO("Device %d does not support GPUDirect RDMA (Non-fatal error) \n", dev); + _HFI_INFO("CUDA device %d does not support GPUDirect RDMA (Non-fatal error)\n", dev); + } + + if (device != current_device) { + int canAccessPeer = 0; + PSMI_CUDA_CALL(cuDeviceCanAccessPeer, &canAccessPeer, + current_device, device); + + if (canAccessPeer != 1) + _HFI_DBG("CUDA device %d does not support P2P from current device (Non-fatal error)\n", dev); + else + gpu_p2p_supported |= (1 << device); + } else { + /* Always support p2p on the same GPU */ + my_gpu_device = device; + gpu_p2p_supported |= (1 << device); } } @@ -336,10 +396,12 @@ psm2_error_t __psm2_init(int *major, int *minor) GENERIC_PERF_SET_SLOT_NAME(PSM_TX_SPEEDPATH_CTR, "TX"); GENERIC_PERF_SET_SLOT_NAME(PSM_RX_SPEEDPATH_CTR, "RX"); - if (psmi_isinit == PSMI_INITIALIZED) + if (psmi_refcount > 0) { + psmi_refcount++; goto update; + } - if (psmi_isinit == PSMI_FINALIZED) { + if (psmi_refcount == PSMI_FINALIZED) { err = PSM2_IS_FINALIZED; goto fail; } @@ -363,10 +425,12 @@ psm2_error_t __psm2_init(int *major, int *minor) "!!! WARNING !!! You are running an internal-only PSM *PROFILE* build.\n"); #endif +#ifdef PSM_FI /* Make sure we complain if fault injection is enabled */ if (getenv("PSM2_FI") && !getenv("PSM2_NO_WARN")) fprintf(stderr, "!!! WARNING !!! 
You are running with fault injection enabled!\n"); +#endif /* #ifdef PSM_FI */ /* Make sure, as an internal check, that this version knows how to detect * compatibility with other library versions it may communicate with */ @@ -413,7 +477,7 @@ psm2_error_t __psm2_init(int *major, int *minor) ((id.eax & CPUID_EXMODEL_MASK) >> 12); } - psmi_isinit = PSMI_INITIALIZED; + psmi_refcount++; /* hfi_debug lives in libhfi.so */ psmi_getenv("PSM2_TRACEMASK", "Mask flags for tracing", @@ -468,7 +532,9 @@ psm2_error_t __psm2_init(int *major, int *minor) psmi_multi_ep_init(); +#ifdef PSM_FI psmi_faultinj_init(); +#endif /* #ifdef PSM_FI */ psmi_epid_init(); @@ -496,7 +562,6 @@ psm2_error_t __psm2_init(int *major, int *minor) #endif update: - if (getenv("PSM2_IDENTIFY")) { Dl_info info_psm; char ofed_delta[100] = ""; @@ -533,6 +598,8 @@ update: *major = (int)psmi_verno_major; *minor = (int)psmi_verno_minor; fail: + _HFI_DBG("psmi_refcount=%d,err=%u\n", psmi_refcount, err); + PSM2_LOG_MSG("leaving"); return err; } @@ -604,18 +671,21 @@ psm2_error_t __psm2_info_query(psm2_info_query_t q, void *out, 2, /* PSM2_INFO_QUERY_MTU */ 2, /* PSM2_INFO_QUERY_LINK_SPEED */ 1, /* PSM2_INFO_QUERY_NETWORK_TYPE */ + 0, /* PSM2_INFO_QUERY_FEATURE_MASK */ }; psm2_error_t rv = PSM2_INTERNAL_ERR; if ((q < 0) || - (q >= PSM2_INFO_QUERY_LAST) || - (nargs != expected_arg_cnt[q])) - return rv; + (q >= PSM2_INFO_QUERY_LAST)) + return PSM2_IQ_INVALID_QUERY; + + if (nargs != expected_arg_cnt[q]) + return PSM2_PARAM_ERR; switch (q) { case PSM2_INFO_QUERY_NUM_UNITS: - *((uint32_t*)out) = psmi_hal_get_num_units_(1); + *((uint32_t*)out) = psmi_hal_get_num_units_(); rv = PSM2_OK; break; case PSM2_INFO_QUERY_NUM_PORTS: @@ -719,7 +789,16 @@ psm2_error_t __psm2_info_query(psm2_info_query_t q, void *out, rv = PSM2_OK; } } - + break; + case PSM2_INFO_QUERY_FEATURE_MASK: + { +#ifdef PSM_CUDA + *((uint32_t*)out) = PSM2_INFO_QUERY_FEATURE_CUDA; +#else + *((uint32_t*)out) = 0; +#endif /* #ifdef PSM_CUDA */ + } + rv = PSM2_OK; break; default: break; @@ -743,7 +822,14 @@ psm2_error_t __psm2_finalize(void) PSM2_LOG_MSG("entering"); + _HFI_DBG("psmi_refcount=%d\n", psmi_refcount); PSMI_ERR_UNLESS_INITIALIZED(NULL); + psmi_assert(psmi_refcount > 0); + psmi_refcount--; + + if (psmi_refcount > 0) { + return PSM2_OK; + } /* When PSM_PERF is enabled, the following line causes the instruction cycles gathered in the current run to be dumped @@ -751,15 +837,15 @@ psm2_error_t __psm2_finalize(void) GENERIC_PERF_DUMP(stderr); ep = psmi_opened_endpoint; while (ep != NULL) { - psmi_opened_endpoint = ep->user_ep_next; + psm2_ep_t saved_ep = ep->user_ep_next; psm2_ep_close(ep, PSM2_EP_CLOSE_GRACEFUL, 2 * PSMI_MIN_EP_CLOSE_TIMEOUT); - ep = psmi_opened_endpoint; + psmi_opened_endpoint = ep = saved_ep; } - psmi_epid_fini(); - +#ifdef PSM_FI psmi_faultinj_fini(); +#endif /* #ifdef PSM_FI */ /* De-allocate memory for any allocated space to store hostnames */ psmi_epid_itor_init(&itor, PSMI_EP_HOSTNAME); @@ -767,6 +853,8 @@ psm2_error_t __psm2_finalize(void) psmi_free(hostname); psmi_epid_itor_fini(&itor); + psmi_epid_fini(); + /* unmap shared mem object for affinity */ if (psmi_affinity_shared_file_opened) { /* @@ -807,15 +895,25 @@ psm2_error_t __psm2_finalize(void) psmi_hal_finalize(); #ifdef PSM_CUDA if (is_cuda_primary_context_retain) { + /* + * This code will be called during deinitialization, and if + * CUDA is deinitialized before PSM, then + * CUDA_ERROR_DEINITIALIZED will happen here + */ CUdevice device; - PSMI_CUDA_CALL(cuCtxGetDevice, &device); - 
PSMI_CUDA_CALL(cuDevicePrimaryCtxRelease, device); + if (psmi_cuCtxGetDevice(&device) == CUDA_SUCCESS) + PSMI_CUDA_CALL(cuDevicePrimaryCtxRelease, device); } #endif - psmi_isinit = PSMI_FINALIZED; + psmi_refcount = PSMI_FINALIZED; PSM2_LOG_MSG("leaving"); psmi_log_fini(); + + psmi_stats_deregister_all(); + + psmi_heapdebug_finalize(); + return PSM2_OK; } PSMI_API_DECL(psm2_finalize) diff --git a/psm2.h b/psm2.h index fa0ec20..84f59bb 100644 --- a/psm2.h +++ b/psm2.h @@ -277,9 +277,9 @@ typedef struct psm2_mq *psm2_mq_t; /*! @defgroup init PSM2 Initialization and Maintenance * @{ */ -#define PSM2_VERNO 0x0201 /*!< Header-defined Version number */ +#define PSM2_VERNO 0x0202 /*!< Header-defined Version number */ #define PSM2_VERNO_MAJOR 0x02 /*!< Header-defined Major Version Number */ -#define PSM2_VERNO_MINOR 0x01 /*!< Header-defined Minor Version Number */ +#define PSM2_VERNO_MINOR 0x02 /*!< Header-defined Minor Version Number */ #define PSM2_VERNO_COMPAT_MAJOR 0x01 /*!hfi_sys_class_path); +#ifndef PSM2_MOCK_TESTING + if (!sysfs_init(psm_hi->hfi_sys_class_path)) +#endif + SLIST_INSERT_HEAD(&head_hi, psm_hi, next_hi); } -static struct _psmi_hal_instance *psmi_hal_get_pi_inst(int *pnumunits, - int *pnumports, - int *pdflt_pkey); - -#if PSMI_HAL_INST_CNT > 1 +static struct _psmi_hal_instance *psmi_hal_get_pi_inst(void); -int psmi_hal_pre_init_func(enum psmi_hal_pre_init_func_krnls k, ...) +int psmi_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...) { va_list ap; va_start(ap, k); - int rv = 0,numunits,numports,dflt_pkey; - struct _psmi_hal_instance *p = psmi_hal_get_pi_inst(&numunits, - &numports, - &dflt_pkey); + int rv = 0; + struct _psmi_hal_instance *p = psmi_hal_get_pi_inst(); + if (!p) rv = -1; else { switch(k) { - case psmi_hal_pre_init_func_get_num_units: - rv = numunits; + case psmi_hal_pre_init_cache_func_get_num_units: + rv = p->params.num_units; break; - case psmi_hal_pre_init_func_get_num_ports: - rv = numports; + case psmi_hal_pre_init_cache_func_get_num_ports: + rv = p->params.num_ports; break; - case psmi_hal_pre_init_func_get_unit_active: - rv = p->hfp_get_unit_active( va_arg(ap,int) ); + case psmi_hal_pre_init_cache_func_get_unit_active: + { + int unit = va_arg(ap,int); + + if ((unit >= 0) && (unit < p->params.num_units)) + { + if (!p->params.unit_active_valid[unit]) { + p->params.unit_active_valid[unit] = 1; + p->params.unit_active[unit] = p->hfp_get_unit_active(unit); + } + rv = p->params.unit_active[unit]; + } + else + rv = -1; + } break; - case psmi_hal_pre_init_func_get_port_active: - rv = p->hfp_get_port_active( va_arg(ap,int), - va_arg(ap,int) ); + case psmi_hal_pre_init_cache_func_get_port_active: + { + int unit = va_arg(ap,int); + + if ((unit >= 0) && (unit < p->params.num_units)) + { + int port = va_arg(ap,int); + if ((port >= 1) && (port <= p->params.num_ports)) + { + if (!p->params.port_active_valid[unit*port]) { + p->params.port_active_valid[unit*port] = 1; + p->params.port_active[unit*port] = p->hfp_get_port_active(unit,port); + } + rv = p->params.port_active[unit*port]; + } + else + rv = -1; + } + else + rv = -1; + } + break; + case psmi_hal_pre_init_cache_func_get_num_contexts: + { + int unit = va_arg(ap,int); + if ((unit >= 0) && (unit < p->params.num_units)) + { + if (!p->params.num_contexts_valid[unit]) { + p->params.num_contexts_valid[unit] = 1; + p->params.num_contexts[unit] = p->hfp_get_num_contexts(unit); + } + rv = p->params.num_contexts[unit]; + } + else + rv = -1; + } break; - case psmi_hal_pre_init_func_get_num_contexts: - rv = 
p->hfp_get_num_contexts( va_arg(ap,int) ); + case psmi_hal_pre_init_cache_func_get_num_free_contexts: + { + int unit = va_arg(ap,int); + + if ((unit >= 0) && (unit < p->params.num_units)) + { + if (!p->params.num_free_contexts_valid[unit]) { + p->params.num_free_contexts_valid[unit] = 1; + p->params.num_free_contexts[unit] = p->hfp_get_num_free_contexts(unit); + } + rv = p->params.num_free_contexts[unit]; + } + else + rv = -1; + } break; - case psmi_hal_pre_init_func_get_num_free_contexts: - rv = p->hfp_get_num_free_contexts( va_arg(ap,int) ); + case psmi_hal_pre_init_cache_func_get_default_pkey: + rv = p->params.default_pkey; break; default: rv = -1; @@ -219,13 +275,12 @@ int psmi_hal_pre_init_func(enum psmi_hal_pre_init_func_krnls k, ...) return rv; } -#endif +static struct _psmi_hal_instance *psmi_hal_get_pi_inst(void) +{ + if (psmi_hal_current_hal_instance) + return psmi_hal_current_hal_instance; -static struct _psmi_hal_instance *psmi_hal_get_pi_inst(int *pnumunits, - int *pnumports, - int *pdflt_pkey) -{ if (SLIST_EMPTY(&head_hi)) return NULL; @@ -243,7 +298,6 @@ static struct _psmi_hal_instance *psmi_hal_get_pi_inst(int *pnumunits, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, (union psmi_envvar_val)PSM_HAL_INSTANCE_ANY_GEN, &env_hi_pref); - int wait = 0; /* The hfp_get_num_units() call below, will not wait for the HFI driver to come up and create device nodes in /dev/.) */ struct _psmi_hal_instance *p; @@ -252,15 +306,38 @@ static struct _psmi_hal_instance *psmi_hal_get_pi_inst(int *pnumunits, if ((env_hi_pref.e_int == PSM_HAL_INSTANCE_ANY_GEN) || (p->type == env_hi_pref.e_int)) { - int nunits = p->hfp_get_num_units(wait); + const int valid_flags = PSM_HAL_PARAMS_VALID_DEFAULT_PKEY | + PSM_HAL_PARAMS_VALID_NUM_UNITS | + PSM_HAL_PARAMS_VALID_NUM_PORTS; + + if ((p->params.sw_status & valid_flags) == valid_flags) + return p; + + int nunits = p->hfp_get_num_units(); int nports = p->hfp_get_num_ports(); int dflt_pkey = p->hfp_get_default_pkey(); - if (nunits > 0 && nports > 0 && dflt_pkey > 0) + if (nunits > 0 && nports > 0 && dflt_pkey > 0 +#ifndef PSM2_MOCK_TESTING + && (0 == sysfs_init(p->hfi_sys_class_path)) +#endif + ) { - sysfs_init(p->hfi_sys_class_path); - *pnumunits = nunits; - *pnumports = nports; - *pdflt_pkey = dflt_pkey; + p->params.num_units = nunits; + p->params.num_ports = nports; + p->params.default_pkey = dflt_pkey; + p->params.sw_status |= valid_flags; + p->params.unit_active = (int8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, sizeof(int8_t)); + p->params.unit_active_valid = (int8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, sizeof(int8_t)); + p->params.port_active = (int8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits*nports, sizeof(int8_t)); + p->params.port_active_valid = (int8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits*nports, sizeof(int8_t)); + p->params.num_contexts = (uint16_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, + sizeof(uint16_t)); + p->params.num_contexts_valid = (uint16_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, + sizeof(uint16_t)); + p->params.num_free_contexts = (uint16_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, + sizeof(uint16_t)); + p->params.num_free_contexts_valid = (uint16_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, + sizeof(uint16_t)); return p; } } @@ -271,23 +348,15 @@ static struct _psmi_hal_instance *psmi_hal_get_pi_inst(int *pnumunits, /* psmi_hal_initialize */ int psmi_hal_initialize(void) { - int nunits = 0; - int nports = 0; - int dflt_pkey = 0; - struct _psmi_hal_instance *p = 
psmi_hal_get_pi_inst(&nunits, &nports, &dflt_pkey); + struct _psmi_hal_instance *p = psmi_hal_get_pi_inst(); if (!p) return -PSM_HAL_ERROR_INIT_FAILED; - memset(&p->params,0,sizeof(p->params)); - int rv = p->hfp_initialize(p); if (!rv) { - p->params.num_units = nunits; - p->params.num_ports = nports; - p->params.default_pkey = dflt_pkey; psmi_hal_current_hal_instance = p; if (psmi_hal_has_cap(PSM_HAL_CAP_HDRSUPP)) { @@ -310,6 +379,34 @@ int psmi_hal_initialize(void) return -PSM_HAL_ERROR_INIT_FAILED; } +int psmi_hal_finalize(void) +{ + struct _psmi_hal_instance *p = psmi_hal_current_hal_instance; + + int rv = psmi_hal_finalize_(); + + psmi_free(p->params.unit_active); + psmi_free(p->params.unit_active_valid); + psmi_free(p->params.port_active); + psmi_free(p->params.port_active_valid); + psmi_free(p->params.num_contexts); + psmi_free(p->params.num_contexts_valid); + psmi_free(p->params.num_free_contexts); + psmi_free(p->params.num_free_contexts_valid); + p->params.unit_active = NULL; + p->params.unit_active_valid = NULL; + p->params.port_active = NULL; + p->params.port_active_valid = NULL; + p->params.num_contexts = NULL; + p->params.num_contexts_valid = NULL; + p->params.num_free_contexts = NULL; + p->params.num_free_contexts_valid = NULL; + psmi_hal_current_hal_instance = NULL; + sysfs_fini(); + return rv; +} + + #ifdef PSM2_MOCK_TESTING #include "psm_hal_gen1/opa_user_gen1.h" diff --git a/psm2_hal.h b/psm2_hal.h index e2367f5..1bec596 100644 --- a/psm2_hal.h +++ b/psm2_hal.h @@ -166,6 +166,10 @@ typedef enum PSM_HAL_PSMI_RUNTIME_INTR_ENABLED = (1UL << 2), /* Header suppression is enabled: */ PSM_HAL_HDRSUPP_ENABLED = (1UL << 3), + PSM_HAL_PARAMS_VALID_NUM_UNITS = (1UL << 4), + PSM_HAL_PARAMS_VALID_NUM_PORTS = (1UL << 5), + PSM_HAL_PARAMS_VALID_DEFAULT_PKEY = (1UL << 6), + } psmi_hal_sw_status; /* The _psmi_hal_params structure stores values that remain constant for the entire life of @@ -173,11 +177,16 @@ typedef enum The values are settled after the context is opened. */ typedef struct _psmi_hal_params { - uint16_t num_units; - uint16_t num_ports; uint32_t cap_mask; uint32_t sw_status; + /* start cached members */ + uint16_t num_units; + uint16_t num_ports; uint16_t default_pkey; + int8_t *unit_active,*unit_active_valid; + int8_t *port_active,*port_active_valid; + uint16_t *num_contexts,*num_contexts_valid; + uint16_t *num_free_contexts,*num_free_contexts_valid; } psmi_hal_params_t; /* HAL assumes that the rx hdr q and the egr buff q are circular lists @@ -403,12 +412,12 @@ struct _psmi_hal_instance /* Initialize the HAL INSTANCE. */ int (*hfp_initialize)(psmi_hal_instance_t *); /* Finalize the HAL INSTANCE. */ - int (*hfp_finalize)(void); + int (*hfp_finalize_)(void); /* Returns the number of hfi units installed on ths host: NOTE: hfp_get_num_units is a function that must be callable before the hal instance is initialized. */ - int (*hfp_get_num_units)(int wait); + int (*hfp_get_num_units)(void); /* Returns the number of ports on each hfi unit installed. on ths host. 
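
The caching added to psmi_hal_pre_init_cache_func() above is a per-slot memo: each value array is paired with a "valid" array so the expensive hfp_* driver query runs at most once per slot. A reduced standalone sketch of the pattern (names are illustrative, not the real HAL symbols; the arrays are assumed allocated to num_units entries, as psmi_hal_get_pi_inst() does with psmi_calloc()):

    #include <stdint.h>

    /* Stand-in for an hfp_* probe such as hfp_get_unit_active(). */
    extern int expensive_probe(int unit);

    static int8_t *cache_val;    /* cached answers               */
    static int8_t *cache_valid;  /* nonzero once slot is filled  */

    static int cached_probe(int unit, int num_units)
    {
            if (unit < 0 || unit >= num_units)
                    return -1;   /* out of range, as in the HAL code */
            if (!cache_valid[unit]) {
                    cache_valid[unit] = 1;
                    cache_val[unit] = (int8_t)expensive_probe(unit);
            }
            return cache_val[unit];
    }

One caveat worth flagging in the port cache above: the index unit*port collides (unit 0 maps every port to slot 0, and 2*3 == 3*2); a conventional flattening of a units-by-ports table would be unit * num_ports + (port - 1).
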
@@ -458,7 +467,7 @@ struct _psmi_hal_instance int (*hfp_get_port_lmc)(int unit, int port); int (*hfp_get_port_rate)(int unit, int port); int (*hfp_get_port_sl2sc)(int unit, int port,int sl); - int (*hfp_get_port_sc2vl)(int unit, int port,int sc); + int (*hfp_get_sc2vl_map)(struct ips_proto *proto); int (*hfp_set_pkey)(psmi_hal_hw_context, uint16_t); int (*hfp_poll_type)(uint16_t poll_type, psmi_hal_hw_context); int (*hfp_get_port_lid)(int unit, int port); @@ -693,48 +702,42 @@ void psmi_hal_register_instance(psmi_hal_instance_t *); another failure has occured during initialization. */ int psmi_hal_initialize(void); -/* note that: +int psmi_hal_finalize(void); -int psmi_hal_get_num_units(void); +#include "psm2_hal_inlines_d.h" -Is intentionally left out as it is called during initialization, -and the results are cached in the hw params. -*/ +enum psmi_hal_pre_init_cache_func_krnls +{ + psmi_hal_pre_init_cache_func_get_num_units, + psmi_hal_pre_init_cache_func_get_num_ports, + psmi_hal_pre_init_cache_func_get_unit_active, + psmi_hal_pre_init_cache_func_get_port_active, + psmi_hal_pre_init_cache_func_get_num_contexts, + psmi_hal_pre_init_cache_func_get_num_free_contexts, + psmi_hal_pre_init_cache_func_get_default_pkey, +}; -#include "psm2_hal_inlines_d.h" +int psmi_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...); + +#define PSMI_HAL_DISPATCH_PI(KERNEL,...) ( psmi_hal_pre_init_cache_func(psmi_hal_pre_init_cache_func_ ## KERNEL , ##__VA_ARGS__ ) ) #if PSMI_HAL_INST_CNT == 1 #define PSMI_HAL_DISPATCH(KERNEL,...) ( PSMI_HAL_CAT_INL_SYM(KERNEL) ( __VA_ARGS__ ) ) -#define PSMI_HAL_DISPATCH_PI(KERNEL,...) PSMI_HAL_DISPATCH(KERNEL , ##__VA_ARGS__ ) - #else -enum psmi_hal_pre_init_func_krnls -{ - psmi_hal_pre_init_func_get_num_units, - psmi_hal_pre_init_func_get_num_ports, - psmi_hal_pre_init_func_get_unit_active, - psmi_hal_pre_init_func_get_port_active, - psmi_hal_pre_init_func_get_num_contexts, - psmi_hal_pre_init_func_get_num_free_contexts, -}; - -int psmi_hal_pre_init_func(enum psmi_hal_pre_init_func_krnls k, ...); - #define PSMI_HAL_DISPATCH(KERNEL,...) ( psmi_hal_current_hal_instance->hfp_ ## KERNEL ( __VA_ARGS__ )) -#define PSMI_HAL_DISPATCH_PI(KERNEL,...) ( psmi_hal_pre_init_func(psmi_hal_pre_init_func_ ## KERNEL , ##__VA_ARGS__ ) ) - #endif -#define psmi_hal_get_num_units_(...) PSMI_HAL_DISPATCH_PI(get_num_units,__VA_ARGS__) -#define psmi_hal_get_num_ports_(...) PSMI_HAL_DISPATCH_PI(get_num_ports,##__VA_ARGS__) -#define psmi_hal_get_unit_active(...) PSMI_HAL_DISPATCH_PI(get_unit_active,__VA_ARGS__) -#define psmi_hal_get_port_active(...) PSMI_HAL_DISPATCH_PI(get_port_active,__VA_ARGS__) -#define psmi_hal_get_num_contexts(...) PSMI_HAL_DISPATCH_PI(get_num_contexts,__VA_ARGS__) -#define psmi_hal_get_num_free_contexts(...) PSMI_HAL_DISPATCH_PI(get_num_free_contexts,__VA_ARGS__) +#define psmi_hal_get_num_units_(...) PSMI_HAL_DISPATCH_PI(get_num_units,##__VA_ARGS__) +#define psmi_hal_get_num_ports_(...) PSMI_HAL_DISPATCH_PI(get_num_ports,##__VA_ARGS__) +#define psmi_hal_get_unit_active(...) PSMI_HAL_DISPATCH_PI(get_unit_active,__VA_ARGS__) +#define psmi_hal_get_port_active(...) PSMI_HAL_DISPATCH_PI(get_port_active,__VA_ARGS__) +#define psmi_hal_get_num_contexts(...) PSMI_HAL_DISPATCH_PI(get_num_contexts,__VA_ARGS__) +#define psmi_hal_get_num_free_contexts(...) PSMI_HAL_DISPATCH_PI(get_num_free_contexts,__VA_ARGS__) +#define psmi_hal_get_default_pkey(...) PSMI_HAL_DISPATCH_PI(get_default_pkey,##__VA_ARGS__) #define psmi_hal_context_open(...) 
PSMI_HAL_DISPATCH(context_open,__VA_ARGS__) #define psmi_hal_close_context(...) PSMI_HAL_DISPATCH(close_context,__VA_ARGS__) #define psmi_hal_get_port_index2pkey(...) PSMI_HAL_DISPATCH(get_port_index2pkey,__VA_ARGS__) @@ -743,7 +746,7 @@ int psmi_hal_pre_init_func(enum psmi_hal_pre_init_func_krnls k, ...); #define psmi_hal_get_port_lmc(...) PSMI_HAL_DISPATCH(get_port_lmc,__VA_ARGS__) #define psmi_hal_get_port_rate(...) PSMI_HAL_DISPATCH(get_port_rate,__VA_ARGS__) #define psmi_hal_get_port_sl2sc(...) PSMI_HAL_DISPATCH(get_port_sl2sc,__VA_ARGS__) -#define psmi_hal_get_port_sc2vl(...) PSMI_HAL_DISPATCH(get_port_sc2vl,__VA_ARGS__) +#define psmi_hal_get_sc2vl_map(...) PSMI_HAL_DISPATCH(get_sc2vl_map, __VA_ARGS__) #define psmi_hal_set_pkey(...) PSMI_HAL_DISPATCH(set_pkey,__VA_ARGS__) #define psmi_hal_poll_type(...) PSMI_HAL_DISPATCH(poll_type,__VA_ARGS__) #define psmi_hal_get_port_lid(...) PSMI_HAL_DISPATCH(get_port_lid,__VA_ARGS__) @@ -782,7 +785,7 @@ int psmi_hal_pre_init_func(enum psmi_hal_pre_init_func_krnls k, ...); #define psmi_hal_tidflow_get_genmismatch(...) PSMI_HAL_DISPATCH(tidflow_get_genmismatch,__VA_ARGS__) #define psmi_hal_forward_packet_to_subcontext(...) PSMI_HAL_DISPATCH(forward_packet_to_subcontext,__VA_ARGS__) #define psmi_hal_subcontext_ureg_get(...) PSMI_HAL_DISPATCH(subcontext_ureg_get,__VA_ARGS__) -#define psmi_hal_finalize(...) PSMI_HAL_DISPATCH(finalize,__VA_ARGS__) +#define psmi_hal_finalize_(...) PSMI_HAL_DISPATCH(finalize_,__VA_ARGS__) #define psmi_hal_get_hfi_event_bits(...) PSMI_HAL_DISPATCH(get_hfi_event_bits,__VA_ARGS__) #define psmi_hal_ack_hfi_event(...) PSMI_HAL_DISPATCH(ack_hfi_event,__VA_ARGS__) #define psmi_hal_hfi_reset_context(...) PSMI_HAL_DISPATCH(hfi_reset_context,__VA_ARGS__) @@ -863,7 +866,6 @@ int psmi_hal_pre_init_func(enum psmi_hal_pre_init_func_krnls k, ...); #define psmi_hal_get_hfi_name() psmi_hal_current_hal_instance->hfi_name #define psmi_hal_get_num_units() psmi_hal_current_hal_instance->params.num_units #define psmi_hal_get_num_ports() psmi_hal_current_hal_instance->params.num_ports -#define psmi_hal_get_default_pkey() psmi_hal_current_hal_instance->params.default_pkey #define psmi_hal_get_cap_mask() psmi_hal_current_hal_instance->params.cap_mask #define psmi_hal_set_cap_mask(NEW_MASK) (psmi_hal_current_hal_instance->params.cap_mask = (NEW_MASK)) #define psmi_hal_add_cap(CAP) (psmi_hal_current_hal_instance->params.cap_mask |= (CAP)) diff --git a/psm2_hal_inline_t.h b/psm2_hal_inline_t.h index 8e061a2..f48f6c6 100644 --- a/psm2_hal_inline_t.h +++ b/psm2_hal_inline_t.h @@ -56,10 +56,10 @@ static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(initialize) (psmi_hal_instance_t *); -static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(finalize) +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(finalize_) (void); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_num_units) - (int wait); + (void); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_num_ports) (void); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_unit_active) @@ -95,8 +95,8 @@ static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_rate) (int unit, int port); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_sl2sc) (int unit, int port, int sl); -static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_sc2vl) - (int unit, int port, int sc); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_sc2vl_map) + (struct ips_proto *proto); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(set_pkey) (psmi_hal_hw_context, uint16_t); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(poll_type) 
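
To summarize the dispatch scheme these psm2_hal.h hunks settle on: pre-initialization queries always route through the caching shim via PSMI_HAL_DISPATCH_PI, while post-initialization calls either resolve to inlined symbols (single-HAL builds) or indirect through the selected instance's function-pointer table. A reduced model with illustrative names, not the real macros:

    /* Reduced model of the two dispatch paths. */
    struct hal_instance {
            int (*hfp_get_port_lid)(int unit, int port);
    };
    extern struct hal_instance *cur_hal;

    #define HAL_INST_CNT 2   /* set per build configuration */

    #if HAL_INST_CNT == 1
    /* Only one HAL compiled in: calls bind statically and can inline. */
    #define HAL_DISPATCH(K, ...) hal_gen1_##K(__VA_ARGS__)
    #else
    /* Several HALs: indirect through the chosen instance's table. */
    #define HAL_DISPATCH(K, ...) (cur_hal->hfp_##K(__VA_ARGS__))
    #endif

    #define hal_get_port_lid(...) HAL_DISPATCH(get_port_lid, __VA_ARGS__)
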
diff --git a/psm2_mq.h b/psm2_mq.h index c193afc..7b63608 100644 --- a/psm2_mq.h +++ b/psm2_mq.h @@ -256,15 +256,7 @@ extern "C" { * Matched Queue communication operations. * * @param[in] ep Endpoint over which to initialize Matched Queue - * @param[in] tag_order_mask Order mask hint to let MQ know what bits of the - * send tag are required to maintain MQ message - * order. In MPI parlance, this mask sets the bits - * that store the context (or communicator ID). The - * user can choose to pass PSM2_MQ_ORDERMASK_NONE or - * PSM2_MQ_ORDERMASK_ALL to tell MQ to respectively - * provide no ordering guarantees or to provide - * ordering over all messages by ignoring the - * contexts of the send tags. + * @param[in] ignored * @param[in] opts Set of options for Matched Queue * @param[in] numopts Number of options passed * @param[out] mq User-supplied storage to return the Matched Queue handle @@ -311,7 +303,7 @@ extern "C" { @endcode */ psm2_error_t -psm2_mq_init(psm2_ep_t ep, uint64_t tag_order_mask, +psm2_mq_init(psm2_ep_t ep, uint64_t ignored, const struct psm2_optkey *opts, int numopts, psm2_mq_t *mq); #define PSM2_MQ_ORDERMASK_NONE 0ULL diff --git a/psm_am.c b/psm_am.c index c421142..f1f3a45 100644 --- a/psm_am.c +++ b/psm_am.c @@ -130,6 +130,13 @@ psm2_error_t psmi_am_init_internal(psm2_ep_t ep) } +void psmi_am_fini_internal(psm2_ep_t ep) +{ + if(ep->am_htable != NULL) { + psmi_free(ep->am_htable); + } +} + psm2_error_t __psm2_am_register_handlers(psm2_ep_t ep, const psm2_am_handler_fn_t *handlers, diff --git a/psm_am_internal.h b/psm_am_internal.h index bc2c128..af151dc 100644 --- a/psm_am_internal.h +++ b/psm_am_internal.h @@ -103,5 +103,6 @@ PSMI_ALWAYS_INLINE(struct psm2_ep_am_handle_entry * /* PSM internal initialization */ psm2_error_t psmi_am_init_internal(psm2_ep_t ep); +void psmi_am_fini_internal(psm2_ep_t ep); #endif diff --git a/psm_config.h b/psm_config.h index 3c42106..85fc1bc 100644 --- a/psm_config.h +++ b/psm_config.h @@ -153,7 +153,7 @@ #define MQ_HFI_THRESH_TINY 8 #define MQ_HFI_THRESH_EGR_SDMA_XEON 34000 /* Eager Xeon blocking */ #define MQ_HFI_THRESH_EGR_SDMA_PHI2 200000 /* Eager Phi2 blocking */ -#define MQ_HFI_THRESH_EGR_SDMA_SQ_XEON 16000 /* Eager Xeon non-blocking */ +#define MQ_HFI_THRESH_EGR_SDMA_SQ_XEON 16384 /* Eager Xeon non-blocking */ #define MQ_HFI_THRESH_EGR_SDMA_SQ_PHI2 65536 /* Eager Phi2 non-blocking */ #define MQ_HFI_THRESH_RNDV_PHI2 200000 diff --git a/psm_context.c b/psm_context.c index 48f4671..6b9a8ae 100644 --- a/psm_context.c +++ b/psm_context.c @@ -223,8 +223,12 @@ psmi_create_and_open_affinity_shm(psm2_uuid_t const job_key) } ret = ftruncate(shm_fd, AFFINITY_SHMEMSIZE); - if ( ret < 0 ) + if ( ret < 0 ) { + _HFI_VDBG("Cannot truncate affinity shared mem fd:%s, errno=%d\n", + affinity_shm_name, errno); + if (shm_fd >= 0) close(shm_fd); return ret; + } shared_affinity_ptr = (uint64_t *) mmap(NULL, AFFINITY_SHMEMSIZE, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0); @@ -529,10 +533,20 @@ psmi_context_open(const psm2_ep_t ep, long unit_param, long port, context->ep = (psm2_ep_t) ep; -#ifdef PSM_CUDA /* Check backward compatibility bits here and save the info */ if (psmi_hal_has_cap(PSM_HAL_CAP_GPUDIRECT_OT)) + { +#ifdef PSM_CUDA is_driver_gpudirect_enabled = 1; +#else + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "FATAL ERROR: " + "CUDA version of hfi1 driver is loaded with non-CUDA version of " + "psm2 library.\n"); +#endif + } +#ifdef PSM_CUDA + else + fprintf(stderr,"WARNING: running CUDA version of libpsm2 with non CUDA version of hfi1 
driver.\n"); #endif _HFI_VDBG("hfi_userinit() passed.\n"); diff --git a/psm_context.h b/psm_context.h index d152a7f..c9387d1 100644 --- a/psm_context.h +++ b/psm_context.h @@ -82,7 +82,6 @@ struct psmi_context { psm2_ep_t ep; /* psm ep handle */ psm2_epid_t epid; /* psm integral ep id */ - uint32_t rcvthread_flags; psm2_error_t status_lasterr; time_t networkLostTime; } psmi_context_t; diff --git a/psm_diags.c b/psm_diags.c index 2a43c22..8b4ba8a 100644 --- a/psm_diags.c +++ b/psm_diags.c @@ -322,22 +322,23 @@ int memcpy_check_size(memcpy_fn_t fn, int *p, int *f, size_t n) if (USE_MALLOC) { src = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size); dst = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size); - if (src == NULL || dst == NULL) - if (src) - psmi_free(src); - if (dst) - psmi_free(dst); - return -1; + if (src == NULL || dst == NULL) { + if (src) psmi_free(src); + if (dst) psmi_free(dst); + return -1; + } } else { - void *src_p, *dst_p; + void *src_p = NULL, *dst_p = NULL; if (posix_memalign(&src_p, 64, size) != 0 || - posix_memalign(&dst_p, 64, size) != 0) + posix_memalign(&dst_p, 64, size) != 0) { + if (src_p) free(src_p); + if (dst_p) free(dst_p); return -1; - else { - src = (uint8_t *) src_p; - dst = (uint8_t *) dst_p; } + src = (uint8_t *) src_p; + dst = (uint8_t *) dst_p; } + int src_align, dst_align; for (src_align = 0; src_align < num_aligns; src_align++) { for (dst_align = 0; dst_align < num_aligns; dst_align++) { @@ -356,7 +357,12 @@ int memcpy_check_size(memcpy_fn_t fn, int *p, int *f, size_t n) } } } - psmi_free(src); - psmi_free(dst); + if (USE_MALLOC) { + psmi_free(src); + psmi_free(dst); + } else { + free(src); + free(dst); + } return 0; } diff --git a/psm_ep.c b/psm_ep.c index d78431d..8c4fe5e 100644 --- a/psm_ep.c +++ b/psm_ep.c @@ -57,6 +57,7 @@ #include #include /* cpu_set */ #include /* isalpha */ +#include #include "psm_user.h" #include "psm2_hal.h" @@ -71,6 +72,8 @@ */ psm2_ep_t psmi_opened_endpoint = NULL; int psmi_opened_endpoint_count = 0; +static uint16_t *hfi_lids; +static uint32_t nlids; static psm2_error_t psmi_ep_open_device(const psm2_ep_t ep, const struct psm2_ep_open_opts *opts, @@ -297,8 +300,6 @@ static psm2_error_t psmi_ep_devlids(uint16_t **lids, uint32_t *num_lids_o, uint64_t my_gid_hi, uint64_t my_gid_lo) { - static uint16_t *hfi_lids; - static uint32_t nlids; uint32_t num_units; int i; psm2_error_t err = PSM2_OK; @@ -863,10 +864,6 @@ __psm2_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, goto fail; } - /* Set environment variable if PSM is not allowed to set affinity */ - if (opts.affinity == PSM2_EP_OPEN_AFFINITY_SKIP) - setenv("HFI_NO_CPUAFFINITY", "1", 1); - /* Allocate end point structure storage */ ptl_sizes = (psmi_device_is_enabled(devid_enabled, PTL_DEVID_SELF) ? 
@@ -922,6 +919,10 @@ __psm2_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, &envvar_val); ep->yield_spin_cnt = envvar_val.e_uint; + /* Set skip_affinity flag if PSM is not allowed to set affinity */ + if (opts.affinity == PSM2_EP_OPEN_AFFINITY_SKIP) + ep->skip_affinity = true; + ptl_sizes = 0; amsh_ptl = ips_ptl = self_ptl = NULL; if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { @@ -1179,6 +1180,8 @@ psm2_error_t __psm2_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in) PSMI_LOCK(psmi_creation_lock); + psmi_am_fini_internal(ep); + if (psmi_opened_endpoint == NULL) { err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED, "PSM Endpoint is closed or does not exist"); @@ -1322,6 +1325,7 @@ psm2_error_t __psm2_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in) if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) psmi_context_close(&ep->context); + psmi_epid_remove_all(ep); psmi_free(ep->epaddr); psmi_free(ep->context_mylabel); @@ -1332,9 +1336,17 @@ psm2_error_t __psm2_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in) } while ((err == PSM2_OK || err == PSM2_TIMEOUT) && tmp != ep); - if (mmq) - err = psmi_mq_free(mmq); + if (mmq) { + psmi_destroy_lock(&(mmq->progress_lock)); + err = psmi_mq_free(mmq); + } + if (hfi_lids) + { + psmi_free(hfi_lids); + hfi_lids = NULL; + nlids = 0; + } PSMI_UNLOCK(psmi_creation_lock); @@ -1363,7 +1375,6 @@ psmi_ep_open_device(const psm2_ep_t ep, * option affinity skip. */ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { - uint32_t rcvthread_flags; union psmi_envvar_val env_rcvthread; static int norcvthread; /* only for first rail */ @@ -1394,11 +1405,10 @@ psmi_ep_open_device(const psm2_ep_t ep, (union psmi_envvar_val)(norcvthread++ ? 0 : PSMI_RCVTHREAD_FLAGS), &env_rcvthread); - rcvthread_flags = env_rcvthread.e_uint; - /* If enabled, use the pollurg capability to implement a receive + /* If enabled, use the polling capability to implement a receive * interrupt thread that can handle urg packets */ - if (rcvthread_flags) { + if (env_rcvthread.e_uint) { psmi_hal_add_sw_status(PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD); #ifdef PSMI_PLOCK_IS_NOLOCK psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, @@ -1406,7 +1416,6 @@ psmi_ep_open_device(const psm2_ep_t ep, "with RCVTHREAD on"); #endif } - context->rcvthread_flags = rcvthread_flags; *epid = context->epid; } else if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { diff --git a/psm_ep.h b/psm_ep.h index 055573d..b526fa0 100644 --- a/psm_ep.h +++ b/psm_ep.h @@ -83,6 +83,8 @@ #define PSMI_SL_MAX 31 #define PSMI_SC_ADMIN 15 #define PSMI_VL_ADMIN 15 +#define PSMI_SC_NBITS 5 /* Number of bits in SC */ +#define PSMI_N_SCS (1 << PSMI_SC_NBITS) /* The number of SC's */ #define PSMI_EPID_PACK_V1(lid, context, subcontext, hfiunit, epid_version, rank) \ (((((uint64_t)lid)&0xffff)<<16) | \ @@ -175,6 +177,7 @@ struct psm2_ep { /* All ptl data is allocated inline below */ uint8_t ptl_base_data[0] __attribute__ ((aligned(64))); + bool skip_affinity; }; struct mqq { diff --git a/psm_error.h b/psm_error.h index f335382..cb1b4ba 100644 --- a/psm_error.h +++ b/psm_error.h @@ -65,7 +65,7 @@ #define PSMI_EP_NORETURN ((psm2_ep_t) -2) #define PSMI_EP_LOGEVENT ((psm2_ep_t) -3) -psm2_ep_errhandler_t psmi_errhandler_global; +extern psm2_ep_errhandler_t psmi_errhandler_global; psm2_error_t MOCKABLE(psmi_handle_error)(psm2_ep_t ep, psm2_error_t error, const char *buf, ...) 
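
The psm_error.h hunk above fixes a classic C pitfall: a bare "psm2_ep_errhandler_t psmi_errhandler_global;" at file scope in a header is a tentative definition, so every translation unit that includes the header defines the symbol, and linkers reject that once common symbols are disabled (the default since GCC 10's -fno-common). The canonical split, sketched:

    /* psm_error.h -- declaration only; all includers share one symbol */
    extern psm2_ep_errhandler_t psmi_errhandler_global;

    /* exactly one .c file -- the single definition */
    psm2_ep_errhandler_t psmi_errhandler_global;
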
diff --git a/psm_hal_gen1/opa_proto_gen1.c b/psm_hal_gen1/opa_proto_gen1.c index 1f2b13e..eb8bce9 100644 --- a/psm_hal_gen1/opa_proto_gen1.c +++ b/psm_hal_gen1/opa_proto_gen1.c @@ -72,7 +72,218 @@ #include -#define ALIGN(x, a) (((x)+(a)-1)&~((a)-1)) +size_t arrsz[MAPSIZE_MAX] = { 0 }; + +static int map_hfi_mem(int fd, struct _hfi_ctrl *ctrl, size_t subctxt_cnt) +{ +#define CREDITS_NUM 64 + struct hfi1_ctxt_info *cinfo = &ctrl->ctxt_info; + struct hfi1_base_info *binfo = &ctrl->base_info; + size_t sz; + __u64 off; + void *maddr; + + + /* 1. Map the PIO credits address */ + off = binfo->sc_credits_addr &~ HFI_MMAP_PGMASK; + + sz = HFI_MMAP_PGSIZE; + maddr = HFI_MMAP_ERRCHECK(fd, binfo, sc_credits_addr, sz, PROT_READ); + hfi_touch_mmap(maddr, sz); + arrsz[SC_CREDITS] = sz; + + binfo->sc_credits_addr |= off; + + + /* 2. Map the PIO buffer SOP address + * We deliberately skip the cast of cinfo->credits to size_t: an out-of-range + * credits value overflows the signed multiplication, and the negative result + * sign-extends to a very large unsigned size, which makes the HFI_MMAP_ERRCHECK() + * macro fail and give an adequate error report. TODO: Consider sanitizing the credits value explicitly + */ + sz = cinfo->credits * CREDITS_NUM; + HFI_MMAP_ERRCHECK(fd, binfo, pio_bufbase_sop, sz, PROT_WRITE); + arrsz[PIO_BUFBASE_SOP] = sz; + + + /* 3. Map the PIO buffer address */ + sz = cinfo->credits * CREDITS_NUM; + HFI_MMAP_ERRCHECK(fd, binfo, pio_bufbase, sz, PROT_WRITE); + arrsz[PIO_BUFBASE] = sz; + + + /* 4. Map the receive header queue + * (u16 * u16 -> max value 0xfffe0001) + */ + sz = (size_t)cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize; + maddr = HFI_MMAP_ERRCHECK(fd, binfo, rcvhdr_bufbase, sz, PROT_READ); + hfi_touch_mmap(maddr, sz); + arrsz[RCVHDR_BUFBASE] = sz; + + + /* 5. Map the receive eager buffer + * (u16 * u32. Assuming size_t's precision is 64 bits - no overflow) + */ + sz = (size_t)cinfo->egrtids * cinfo->rcvegr_size; + maddr = HFI_MMAP_ERRCHECK(fd, binfo, rcvegr_bufbase, sz, PROT_READ); + hfi_touch_mmap(maddr, sz); + arrsz[RCVEGR_BUFBASE] = sz; + + + /* 6. Map the sdma completion queue */ + if (cinfo->runtime_flags & HFI1_CAP_SDMA) { + sz = cinfo->sdma_ring_size * sizeof(struct hfi1_sdma_comp_entry); + HFI_MMAP_ERRCHECK(fd, binfo, sdma_comp_bufbase, sz, PROT_READ); + } else { + sz = 0; + binfo->sdma_comp_bufbase = (__u64)0; + } + arrsz[SDMA_COMP_BUFBASE] = sz; + + + /* 7. Map RXE per-context CSRs */ + sz = HFI_MMAP_PGSIZE; + HFI_MMAP_ERRCHECK(fd, binfo, user_regbase, sz, PROT_WRITE|PROT_READ); + arrsz[USER_REGBASE] = sz; + /* Set up addresses for optimized register writeback routines. + * This is for the real onchip registers, shared context or not + */ + uint64_t *regbasep = (uint64_t *)binfo->user_regbase; + ctrl->__hfi_rcvhdrtail = (volatile __le64 *)(regbasep + ur_rcvhdrtail); + ctrl->__hfi_rcvhdrhead = (volatile __le64 *)(regbasep + ur_rcvhdrhead); + ctrl->__hfi_rcvegrtail = (volatile __le64 *)(regbasep + ur_rcvegrindextail); + ctrl->__hfi_rcvegrhead = (volatile __le64 *)(regbasep + ur_rcvegrindexhead); + ctrl->__hfi_rcvofftail = (volatile __le64 *)(regbasep + ur_rcvegroffsettail); + + if (cinfo->runtime_flags & HFI1_CAP_HDRSUPP) { + ctrl->__hfi_rcvtidflow = (volatile __le64 *)(regbasep + ur_rcvtidflowtable); + ctrl->__hfi_tfvalid = 1; + } else { + ctrl->__hfi_rcvtidflow = ctrl->regs; + ctrl->__hfi_tfvalid = 0; + } + + + /* 8.
Map the rcvhdrq tail register address */ + if (cinfo->runtime_flags & HFI1_CAP_DMA_RTAIL) { + sz = HFI_MMAP_PGSIZE; + HFI_MMAP_ERRCHECK(fd, binfo, rcvhdrtail_base, sz, PROT_READ); + } else { + /* We don't use receive header queue tail register to detect new packets, + * but here we save the address for false-eager-full recovery + */ + sz = 0; + /* This points inside the previously established mapping (user_regbase). Don't munmap()! */ + binfo->rcvhdrtail_base = (uint64_t) (uintptr_t) ctrl->__hfi_rcvhdrtail; + } + ctrl->__hfi_rcvtail = (__le64 *)binfo->rcvhdrtail_base; + arrsz[RCVHDRTAIL_BASE] = sz; + + + /* 9. Map the event page */ + off = binfo->events_bufbase &~ HFI_MMAP_PGMASK; + + sz = HFI_MMAP_PGSIZE; + HFI_MMAP_ERRCHECK(fd, binfo, events_bufbase, sz, PROT_READ); + arrsz[EVENTS_BUFBASE] = sz; + /* keep the offset in the address */ + binfo->events_bufbase |= off; + + + /* 10. Map the status page */ + sz = HFI_MMAP_PGSIZE; + HFI_MMAP_ERRCHECK(fd, binfo, status_bufbase, sz, PROT_READ); + arrsz[STATUS_BUFBASE] = sz; + + + if (!subctxt_cnt) + return 0; + + /* 11. If subcontext is used, map the buffers */ + const char *errstr = "Incorrect input values for the subcontext"; + size_t factor; + + /* 11a) subctxt_uregbase */ + sz = HFI_MMAP_PGSIZE; + maddr = HFI_MMAP_ERRCHECK(fd, binfo, subctxt_uregbase, sz, PROT_READ|PROT_WRITE); + hfi_touch_mmap(maddr, sz); + arrsz[SUBCTXT_UREGBASE] = sz; + + + /* 11b) subctxt_rcvhdrbuf + * u16 * u16. Prevent promotion to int through an explicit cast to size_t + */ + factor = (size_t)cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize; + factor = ALIGN(factor, HFI_MMAP_PGSIZE); + sz = factor * subctxt_cnt; + maddr = HFI_MMAP_ERRCHECK(fd, binfo, subctxt_rcvhdrbuf, sz, PROT_READ|PROT_WRITE); + hfi_touch_mmap(maddr, sz); + arrsz[SUBCTXT_RCVHDRBUF] = sz; + + + /* 11c) subctxt_rcvegrbuf + * u16 * u32.
Assuming size_t's precision to be 64 bits (no overflow) + */ + factor = (size_t)cinfo->egrtids * cinfo->rcvegr_size; + factor = ALIGN(factor, HFI_MMAP_PGSIZE); + sz = factor * subctxt_cnt; + if (sz / subctxt_cnt != factor) { + _HFI_INFO("%s (rcvegrbuf)\n", errstr); + goto err_int_overflow_subctxt_rcvegrbuf; + } + maddr = HFI_MMAP_ERRCHECK(fd, binfo, subctxt_rcvegrbuf, sz, PROT_READ|PROT_WRITE); + hfi_touch_mmap(maddr, sz); + arrsz[SUBCTXT_RCVEGRBUF] = sz; + + return 0; + +err_int_overflow_subctxt_rcvegrbuf: +err_mmap_subctxt_rcvegrbuf: + /* if we got here, subctxt_cnt must be != 0 */ + HFI_MUNMAP_ERRCHECK(binfo, subctxt_rcvhdrbuf, arrsz[SUBCTXT_RCVHDRBUF]); + +err_mmap_subctxt_rcvhdrbuf: + /* if we got here, subctxt_cnt must be != 0 */ + HFI_MUNMAP_ERRCHECK(binfo, subctxt_uregbase, arrsz[SUBCTXT_UREGBASE]); + +err_mmap_subctxt_uregbase: + HFI_MUNMAP_ERRCHECK(binfo, status_bufbase, arrsz[STATUS_BUFBASE]); + +err_mmap_status_bufbase: + HFI_MUNMAP_ERRCHECK(binfo, events_bufbase, arrsz[EVENTS_BUFBASE]); + +err_mmap_events_bufbase: + if(cinfo->runtime_flags & HFI1_CAP_DMA_RTAIL) { + HFI_MUNMAP_ERRCHECK(binfo, rcvhdrtail_base, arrsz[RCVHDRTAIL_BASE]); + } + +err_mmap_rcvhdrtail_base: + HFI_MUNMAP_ERRCHECK(binfo, user_regbase, arrsz[USER_REGBASE]); + +err_mmap_user_regbase: + /* the condition could be: if(cinfo->runtime_flags & HFI1_CAP_SDMA) too */ + if(binfo->sdma_comp_bufbase != 0) { + HFI_MUNMAP_ERRCHECK(binfo, sdma_comp_bufbase, arrsz[SDMA_COMP_BUFBASE]); + } + +err_mmap_sdma_comp_bufbase: + HFI_MUNMAP_ERRCHECK(binfo, rcvegr_bufbase, arrsz[RCVEGR_BUFBASE]); + +err_mmap_rcvegr_bufbase: + HFI_MUNMAP_ERRCHECK(binfo, rcvhdr_bufbase, arrsz[RCVHDR_BUFBASE]); + +err_mmap_rcvhdr_bufbase: + HFI_MUNMAP_ERRCHECK(binfo, pio_bufbase, arrsz[PIO_BUFBASE]); + +err_mmap_pio_bufbase: + HFI_MUNMAP_ERRCHECK(binfo, pio_bufbase_sop, arrsz[PIO_BUFBASE_SOP]); + +err_mmap_pio_bufbase_sop: + HFI_MUNMAP_ERRCHECK(binfo, sc_credits_addr, arrsz[SC_CREDITS]); + +err_mmap_sc_credits_addr: + return -1; +} /* It is allowed to have multiple devices (and of different types) simultaneously opened and initialized, although this (still! Oct 07) @@ -82,15 +293,13 @@ struct _hfi_ctrl *. The struct _hfi_ctrl * used for everything else is returned as part of hfi1_base_info. */ -struct _hfi_ctrl *hfi_userinit(int fd, struct hfi1_user_info_dep *uinfo) +struct _hfi_ctrl *hfi_userinit_internal(int fd, bool skip_affinity, + struct hfi1_user_info_dep *uinfo) { struct _hfi_ctrl *spctrl = NULL; struct hfi1_ctxt_info *cinfo; struct hfi1_base_info *binfo; - void *tmp; - uint64_t *tmp64; struct hfi1_cmd c; - uintptr_t pg_mask; int __hfi_pg_sz; #ifdef PSM2_SUPPORT_IW_CMD_API /* for major version 6 of driver, we will use uinfo_new. See below for details.
*/ @@ -99,12 +308,11 @@ struct _hfi_ctrl *hfi_userinit(int fd, struct hfi1_user_info_dep *uinfo) /* First get the page size */ __hfi_pg_sz = sysconf(_SC_PAGESIZE); - pg_mask = ~(intptr_t) (__hfi_pg_sz - 1); if (!(spctrl = calloc(1, sizeof(struct _hfi_ctrl)))) { _HFI_INFO("can't allocate memory for hfi_ctrl: %s\n", strerror(errno)); - goto err; + goto err_calloc_hfi_ctrl; } cinfo = &spctrl->ctxt_info; binfo = &spctrl->base_info; @@ -157,7 +365,7 @@ struct _hfi_ctrl *hfi_userinit(int fd, struct hfi1_user_info_dep *uinfo) _HFI_INFO("assign_context command failed: %s\n", strerror(errno)); } - goto err; + goto err_hfi_cmd_assign_ctxt; } #ifdef PSM2_SUPPORT_IW_CMD_API @@ -180,37 +388,37 @@ struct _hfi_ctrl *hfi_userinit(int fd, struct hfi1_user_info_dep *uinfo) if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) { _HFI_INFO("CTXT_INFO command failed: %s\n", strerror(errno)); - goto err; + goto err_hfi_cmd_ctxt_info; } /* sanity checking... */ if (cinfo->rcvtids%8) { _HFI_INFO("rcvtids not 8 multiple: %d\n", cinfo->rcvtids); - goto err; + goto err_sanity_check; } if (cinfo->egrtids%8) { _HFI_INFO("egrtids not 8 multiple: %d\n", cinfo->egrtids); - goto err; + goto err_sanity_check; } if (cinfo->rcvtids < cinfo->egrtids) { _HFI_INFO("rcvtids(%d) < egrtids(%d)\n", cinfo->rcvtids, cinfo->egrtids); - goto err; + goto err_sanity_check; } if (cinfo->rcvhdrq_cnt%32) { _HFI_INFO("rcvhdrq_cnt not 32 multiple: %d\n", cinfo->rcvhdrq_cnt); - goto err; + goto err_sanity_check; } if (cinfo->rcvhdrq_entsize%64) { _HFI_INFO("rcvhdrq_entsize not 64 multiple: %d\n", cinfo->rcvhdrq_entsize); - goto err; + goto err_sanity_check; } if (cinfo->rcvegr_size%__hfi_pg_sz) { _HFI_INFO("rcvegr_size not page multiple: %d\n", cinfo->rcvegr_size); - goto err; + goto err_sanity_check; } _HFI_VDBG("ctxtinfo: runtime_flags %llx, rcvegr_size %d\n", @@ -227,8 +435,10 @@ struct _hfi_ctrl *hfi_userinit(int fd, struct hfi1_user_info_dep *uinfo) cinfo->egrtids, cinfo->sdma_ring_size); /* if affinity has not been setup, set it */ - if ((!getenv("HFI_NO_CPUAFFINITY") && cinfo->rec_cpu != (__u16) -1) || - getenv("HFI_FORCE_CPUAFFINITY")) { + if (getenv("HFI_FORCE_CPUAFFINITY") || + (cinfo->rec_cpu != (__u16) -1 && + !(getenv("HFI_NO_CPUAFFINITY") || skip_affinity))) + { cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(cinfo->rec_cpu, &cpuset); @@ -240,7 +450,6 @@ struct _hfi_ctrl *hfi_userinit(int fd, struct hfi1_user_info_dep *uinfo) } } - /* 4. 
Get user base info from driver */ c.type = PSMI_HFI_CMD_USER_INFO; c.len = sizeof(*binfo); @@ -248,7 +457,7 @@ struct _hfi_ctrl *hfi_userinit(int fd, struct hfi1_user_info_dep *uinfo) if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) { _HFI_INFO("BASE_INFO command failed: %s\n", strerror(errno)); - goto err; + goto err_hfi_cmd_user_info; } hfi_set_user_version(binfo->sw_version); @@ -276,272 +485,15 @@ struct _hfi_ctrl *hfi_userinit(int fd, struct hfi1_user_info_dep *uinfo) ("User major version 0x%x not same as driver major 0x%x\n", hfi_get_user_major_version(), binfo->sw_version >> HFI1_SWMAJOR_SHIFT); if ((binfo->sw_version >> HFI1_SWMAJOR_SHIFT) < hfi_get_user_major_version()) - goto err; /* else assume driver knows how to be compatible */ + goto err_version_mismatch; /* else assume driver knows how to be compatible */ } else if ((binfo->sw_version & 0xffff) != HFI1_USER_SWMINOR) { _HFI_PRDBG ("User minor version 0x%x not same as driver minor 0x%x\n", HFI1_USER_SWMINOR, binfo->sw_version & 0xffff); } - /* Map the PIO credits address */ - if ((tmp = hfi_mmap64(0, __hfi_pg_sz, - PROT_READ, MAP_SHARED | MAP_LOCKED, fd, - (__off64_t) binfo->sc_credits_addr & - pg_mask)) == MAP_FAILED) { - _HFI_INFO("mmap of sc_credits_addr (%llx) failed: %s\n", - (unsigned long long)binfo->sc_credits_addr, - strerror(errno)); - goto err; - } else { - hfi_touch_mmap(tmp, __hfi_pg_sz); - binfo->sc_credits_addr = (uint64_t) (uintptr_t) tmp | - (binfo->sc_credits_addr & ~pg_mask); - _HFI_VDBG("sc_credits_addr %llx\n", - binfo->sc_credits_addr); - } - - /* Map the PIO buffer SOP address */ - if ((tmp = hfi_mmap64(0, cinfo->credits * 64, - PROT_WRITE, MAP_SHARED | MAP_LOCKED, fd, - (__off64_t) binfo->pio_bufbase_sop & pg_mask)) - == MAP_FAILED) { - _HFI_INFO("mmap of pio buffer sop at %llx failed: %s\n", - (unsigned long long)binfo->pio_bufbase_sop, - strerror(errno)); - goto err; - } else { - /* Do not try to read the PIO buffers; they are mapped write */ - /* only. We'll fault them in as we write to them. */ - binfo->pio_bufbase_sop = (uintptr_t) tmp; - _HFI_VDBG("pio_bufbase_sop %llx\n", - binfo->pio_bufbase_sop); - } - - /* Map the PIO buffer address */ - if ((tmp = hfi_mmap64(0, cinfo->credits * 64, - PROT_WRITE, MAP_SHARED | MAP_LOCKED, fd, - (__off64_t) binfo->pio_bufbase & pg_mask)) == - MAP_FAILED) { - _HFI_INFO("mmap of pio buffer at %llx failed: %s\n", - (unsigned long long)binfo->pio_bufbase, - strerror(errno)); - goto err; - } else { - /* Do not try to read the PIO buffers; they are mapped write */ - /* only. We'll fault them in as we write to them. 
*/ - binfo->pio_bufbase = (uintptr_t) tmp; - _HFI_VDBG("sendpio_bufbase %llx\n", binfo->pio_bufbase); - } - - /* Map the receive header queue */ - if ((tmp = - hfi_mmap64(0, cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize, - PROT_READ, MAP_SHARED | MAP_LOCKED, fd, - (__off64_t) binfo->rcvhdr_bufbase & pg_mask)) == - MAP_FAILED) { - _HFI_INFO("mmap of rcvhdrq at %llx failed: %s\n", - (unsigned long long)binfo->rcvhdr_bufbase, - strerror(errno)); - goto err; - } else { - /* for use in protocol code */ - hfi_touch_mmap(tmp, - cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize); - binfo->rcvhdr_bufbase = (uintptr_t) tmp; /* set to mapped address */ - _HFI_VDBG("rcvhdr_bufbase %llx\n", binfo->rcvhdr_bufbase); - } - - /* Map the receive eager buffer */ - if ((tmp = - hfi_mmap64(0, cinfo->egrtids * cinfo->rcvegr_size, - PROT_READ, MAP_SHARED | MAP_LOCKED, fd, - (__off64_t) binfo->rcvegr_bufbase & pg_mask)) == - MAP_FAILED) { - _HFI_INFO("mmap of rcvegrq bufs from %llx failed: %s\n", - (unsigned long long)binfo->rcvegr_bufbase, - strerror(errno)); - goto err; - } else { - hfi_touch_mmap(tmp, cinfo->egrtids * cinfo->rcvegr_size); - binfo->rcvegr_bufbase = (uint64_t) (uintptr_t) tmp; - _HFI_VDBG("rcvegr_bufbase %llx\n", binfo->rcvegr_bufbase); - } - - /* Map the sdma completion queue */ - if (!(cinfo->runtime_flags & HFI1_CAP_SDMA)) { - binfo->sdma_comp_bufbase = 0; - } else - if ((tmp = - hfi_mmap64(0, cinfo->sdma_ring_size * - sizeof(struct hfi1_sdma_comp_entry), - PROT_READ, MAP_SHARED | MAP_LOCKED, fd, - (__off64_t) binfo->sdma_comp_bufbase & pg_mask)) == - MAP_FAILED) { - _HFI_INFO - ("mmap of sdma completion queue from %llx failed: %s\n", - (unsigned long long)binfo->sdma_comp_bufbase, - strerror(errno)); - goto err; - } else { - binfo->sdma_comp_bufbase = (uint64_t) (uintptr_t) tmp; - } - _HFI_VDBG("sdma_comp_bufbase %llx\n", binfo->sdma_comp_bufbase); - - /* Map RXE per-context CSRs */ - if ((tmp = hfi_mmap64(0, __hfi_pg_sz, - PROT_WRITE | PROT_READ, MAP_SHARED | MAP_LOCKED, - fd, - (__off64_t) binfo->user_regbase & pg_mask)) == - MAP_FAILED) { - _HFI_INFO("mmap of user registers at %llx failed: %s\n", - (unsigned long long)binfo->user_regbase, - strerror(errno)); - goto err; - } else { - /* we don't try to fault these in, no need */ - binfo->user_regbase = (uint64_t) (uintptr_t) tmp; - _HFI_VDBG("user_regbase %llx\n", binfo->user_regbase); - } - - /* - * Set up addresses for optimized register writeback routines. - * This is for the real onchip registers, shared context or not - */ - tmp64 = (uint64_t *) tmp; - spctrl->__hfi_rcvhdrtail = (volatile __le64 *)&tmp64[ur_rcvhdrtail]; - spctrl->__hfi_rcvhdrhead = (volatile __le64 *)&tmp64[ur_rcvhdrhead]; - spctrl->__hfi_rcvegrtail = - (volatile __le64 *)&tmp64[ur_rcvegrindextail]; - spctrl->__hfi_rcvegrhead = - (volatile __le64 *)&tmp64[ur_rcvegrindexhead]; - spctrl->__hfi_rcvofftail = - (volatile __le64 *)&tmp64[ur_rcvegroffsettail]; - - if (!(cinfo->runtime_flags & HFI1_CAP_HDRSUPP)) { - spctrl->__hfi_rcvtidflow = spctrl->regs; - spctrl->__hfi_tfvalid = 0; - } else { - spctrl->__hfi_rcvtidflow = - (volatile __le64 *)&tmp64[ur_rcvtidflowtable]; - spctrl->__hfi_tfvalid = 1; - } - - /* Map the rcvhdrq tail register address */ - if (!(cinfo->runtime_flags & HFI1_CAP_DMA_RTAIL)) { - /* - * We don't use receive header queue tail register to detect - * new packets, but here we save the address for - * false-eager-full recovery. 
- */ - binfo->rcvhdrtail_base = - (uint64_t) (uintptr_t) spctrl->__hfi_rcvhdrtail; - spctrl->__hfi_rcvtail = (__le64 *) binfo->rcvhdrtail_base; - } else - if ((tmp = hfi_mmap64(0, __hfi_pg_sz, - PROT_READ, MAP_SHARED | MAP_LOCKED, fd, - (__off64_t) binfo->rcvhdrtail_base & - pg_mask)) == MAP_FAILED) { - _HFI_INFO("mmap of rcvhdrq tail addr %llx failed: %s\n", - (unsigned long long)binfo->rcvhdrtail_base, - strerror(errno)); - goto err; - } else { - hfi_touch_mmap(tmp, __hfi_pg_sz); - binfo->rcvhdrtail_base = (uint64_t) (uintptr_t) tmp; - spctrl->__hfi_rcvtail = (__le64 *) binfo->rcvhdrtail_base; - } - _HFI_VDBG("rcvhdr_tail_addr %llx\n", binfo->rcvhdrtail_base); - - /* Map the event page */ - if ((tmp = hfi_mmap64(0, __hfi_pg_sz, - PROT_READ, MAP_SHARED | MAP_LOCKED, fd, - (__off64_t) binfo->events_bufbase & pg_mask)) == - MAP_FAILED) { - _HFI_INFO("mmap of status page at %llx failed: %s\n", - (unsigned long long)binfo->events_bufbase, - strerror(errno)); - goto err; - } else { - binfo->events_bufbase = (uint64_t) (uintptr_t) tmp | - (binfo->events_bufbase & ~pg_mask); - _HFI_VDBG("events_bufbase %llx\n", binfo->events_bufbase); - } - - /* Map the status page */ - if ((tmp = hfi_mmap64(0, __hfi_pg_sz, - PROT_READ, MAP_SHARED | MAP_LOCKED, fd, - (__off64_t) binfo->status_bufbase & pg_mask)) == - MAP_FAILED) { - _HFI_INFO("mmap of status page (%llx) failed: %s\n", - (unsigned long long)binfo->status_bufbase, - strerror(errno)); - goto err; - } else { - binfo->status_bufbase = (uintptr_t) tmp; - _HFI_VDBG("status_bufbase %llx\n", binfo->status_bufbase); - } - - /* If subcontext is used, map the buffers */ - if (uinfo->subctxt_cnt) { - unsigned num_subcontexts = uinfo->subctxt_cnt; - size_t size; - - size = __hfi_pg_sz; - if ((tmp = hfi_mmap64(0, size, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_LOCKED, fd, - (__off64_t) binfo->subctxt_uregbase & - pg_mask)) == MAP_FAILED) { - _HFI_INFO - ("mmap of subcontext uregbase array (%llx) failed: %s\n", - (unsigned long long)binfo->subctxt_uregbase, - strerror(errno)); - goto err; - } else { - hfi_touch_mmap(tmp, size); - binfo->subctxt_uregbase = (uint64_t) (uintptr_t) tmp; - _HFI_VDBG("subctxt_uregbase %llx\n", - binfo->subctxt_uregbase); - } - - size = ALIGN(cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize, - __hfi_pg_sz) * num_subcontexts; - if ((tmp = hfi_mmap64(0, size, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_LOCKED, fd, - (__off64_t) binfo->subctxt_rcvhdrbuf & - pg_mask)) == MAP_FAILED) { - _HFI_INFO - ("mmap of subcontext rcvhdr_base array (%llx) failed: %s\n", - (unsigned long long)binfo->subctxt_rcvhdrbuf, - strerror(errno)); - goto err; - } else { - hfi_touch_mmap(tmp, size); - binfo->subctxt_rcvhdrbuf = (uint64_t) (uintptr_t) tmp; - _HFI_VDBG("subctxt_rcvhdrbuf %llx\n", - binfo->subctxt_rcvhdrbuf); - } - - size = ALIGN(cinfo->egrtids * cinfo->rcvegr_size, - __hfi_pg_sz) * num_subcontexts; - if ((tmp = hfi_mmap64(0, size, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_LOCKED, fd, - (__off64_t) binfo->subctxt_rcvegrbuf & - pg_mask)) == MAP_FAILED) { - _HFI_INFO - ("mmap of subcontext rcvegrbuf array (%llx) failed: %s\n", - (unsigned long long)binfo->subctxt_rcvegrbuf, - strerror(errno)); - goto err; - } else { - hfi_touch_mmap(tmp, size); - binfo->subctxt_rcvegrbuf = (uint64_t) (uintptr_t) tmp; - _HFI_VDBG("subctxt_rcvegrbuf %llx\n", - binfo->subctxt_rcvegrbuf); - } - } + if (map_hfi_mem(fd, spctrl, uinfo->subctxt_cnt) == -1) + goto err_map_hfi_mem; /* Save some info. 
*/ spctrl->fd = fd; @@ -560,8 +512,32 @@ struct _hfi_ctrl *hfi_userinit(int fd, struct hfi1_user_info_dep *uinfo) return spctrl; -err: - if (spctrl) - free(spctrl); +err_map_hfi_mem: +err_version_mismatch: +err_hfi_cmd_user_info: + /* TODO: restore the original CPU affinity? */ + +err_sanity_check: +err_hfi_cmd_ctxt_info: + /* TODO: ioctl de-assign context here? */ + // without de-assigning the context, all subsequent hfi_userinit_internal() + // calls are going to fail + _HFI_ERROR("An unrecoverable error occurred while communicating with the driver\n"); + abort(); /* TODO: or do we want to include psm_user.h to use psmi_handle_error()? */ + // no recovery here + + /* if we failed to allocate memory or to assign the context, we might still recover from this. + * Returning NULL will cause the function to be reinvoked n times. Do we really want this + * behavior? + */ +err_hfi_cmd_assign_ctxt: + free(spctrl); + +err_calloc_hfi_ctrl: return NULL; } + +struct _hfi_ctrl *hfi_userinit(int fd, struct hfi1_user_info_dep *uinfo) +{ + return hfi_userinit_internal(fd, false, uinfo); +} diff --git a/psm_hal_gen1/opa_service_gen1.c b/psm_hal_gen1/opa_service_gen1.c index e4719e3..641e262 100644 --- a/psm_hal_gen1/opa_service_gen1.c +++ b/psm_hal_gen1/opa_service_gen1.c @@ -89,84 +89,6 @@ static sw_version_t sw_version = } }; -/* - * This function is necessary in a udev-based world. There can be an - * arbitrarily long (but typically less than one second) delay between - * a driver getting loaded and any dynamic special files turning up. - * - * The timeout is in milliseconds. A value of zero means "callee - * decides timeout". Negative is infinite. - * - * Returns 0 on success, -1 on error or timeout. Check errno to see - * whether there was a timeout (ETIMEDOUT) or an error (any other - * non-zero value). 
- */ -int hfi_wait_for_device(const char *path, long timeout) -{ - int saved_errno; - struct stat st; - long elapsed; - int ret; - - if (timeout == 0) - timeout = 15000; - - elapsed = 0; - - while (1) { - static const long default_ms = 250; - struct timespec req = { 0 }; - long ms; - - ret = stat(path, &st); - saved_errno = errno; - - if (ret == 0 || (ret == -1 && errno != ENOENT)) - break; - - if ((timeout > 0) && ((timeout - elapsed) <= 0)) { - saved_errno = ETIMEDOUT; - break; - } - - if (elapsed == 0) { - if (timeout < 0) - _HFI_DBG - ("Device file %s not present on first check; " - "waiting indefinitely...\n", path); - else - _HFI_DBG - ("Device file %s not present on first check; " - "waiting up to %.1f seconds...\n", path, - timeout / 1e3); - } - - if (timeout < 0 || timeout - elapsed >= default_ms) - ms = default_ms; - else - ms = timeout; - - elapsed += ms; - req.tv_nsec = ms * 1000000; - - ret = nanosleep(&req, NULL); - saved_errno = errno; - - if (ret == -1) - break; - } - - if (ret == 0) - _HFI_DBG("Found %s after %.1f seconds\n", path, elapsed / 1e3); - else - _HFI_INFO - ("The %s device failed to appear after %.1f seconds: %s\n", - path, elapsed / 1e3, strerror(saved_errno)); - - errno = saved_errno; - return ret; -} - /* fwd declaration */ ustatic int _hfi_cmd_write(int fd, struct hfi1_cmd *cmd, size_t count); @@ -223,13 +145,6 @@ int hfi_context_open_ex(int unit, int port, uint64_t open_timeout, snprintf(dev_name, dev_name_len, "%s_%u", HFI_DEVICE_PATH_GEN1, 0); - if (hfi_wait_for_device(dev_name, (long)open_timeout) == -1) { - _HFI_DBG("Could not find an HFI Unit on device " - "%s (%lds elapsed)", dev_name, - (long)open_timeout / 1000); - return -1; - } - if ((fd = open(dev_name, O_RDWR)) == -1) { _HFI_DBG("(host:Can't open %s for reading and writing", dev_name); @@ -397,7 +312,7 @@ void *hfi_mmap64(void *addr, size_t length, int prot, int flags, int fd, /* that a working chip has been found for each possible unit #. */ /* number of units >=0 (0 means none found). */ /* formerly used sysfs file "num_units" */ -int hfi_get_num_units(int wait) +int hfi_get_num_units(void) { int ret; @@ -407,12 +322,7 @@ int hfi_get_num_units(int wait) int r; snprintf(pathname, sizeof(pathname), HFI_DEVICE_PATH_GEN1 "_%d", ret); - if (wait && (ret == 0)) - /* We only wait for the first device to come up. Not - on subsequent devices in order to save time. */ - r = hfi_wait_for_device(pathname, 0); - else - r = stat(pathname, &st); + r = stat(pathname, &st); if (!r) continue; else @@ -443,14 +353,14 @@ int hfi_get_unit_active(int unit) /* get the number of contexts from the unit id. */ /* Returns 0 if no unit or no match. */ -int hfi_get_num_contexts(int unit_id, int wait) +int hfi_get_num_contexts(int unit_id) { int n = 0; int units; int64_t val; uint32_t p = HFI_MIN_PORT; - units = hfi_get_num_units(wait); + units = hfi_get_num_units(); if_pf(units <= 0) return 0; diff --git a/psm_hal_gen1/opa_service_gen1.h b/psm_hal_gen1/opa_service_gen1.h index 9bce8ca..6e18e57 100644 --- a/psm_hal_gen1/opa_service_gen1.h +++ b/psm_hal_gen1/opa_service_gen1.h @@ -173,21 +173,16 @@ int hfi_get_port_index2pkey(int unit, int port, int index); /* Get the number of units supported by the driver. Does not guarantee that a working chip has been found for each possible unit #. - When the parameter 'wait' is non-zero, the code will wait briefly as - the driver may be coming up. If 'wait' is zero, the function does not wait. Returns -1 with errno set, or number of units >=0 (0 means none found). 
*/ -int hfi_get_num_units(int wait); +int hfi_get_num_units(void); /* Given a unit number, returns 1 if any port on the unit is active. returns 0 if no port on the unit is active. returns -1 when an error occurred. */ int hfi_get_unit_active(int unit); -/* get the number of contexts from the unit id. - When the parameter 'wait' is non-zero, the code will wait briefly as - the driver may be coming up. If 'wait' is zero, the function does not wait. - Returns 0 if no unit or no match. */ -int hfi_get_num_contexts(int unit, int wait); +/* get the number of contexts from the unit id. */ +int hfi_get_num_contexts(int unit); /* Open hfi device file, return -1 on error. */ int hfi_context_open(int unit, int port, uint64_t open_timeout); @@ -242,9 +237,6 @@ int hfi_get_ctrs_port_names(int unitno, char **namep); /* sysfs helper routines (only those currently used are exported; * try to avoid using others) */ -/* Initializes the following sysfs helper routines. */ -void sysfs_init(const char *dflt_hfi_class_path); - const char *hfi_sysfs_path(void); /* read a string value */ @@ -285,10 +277,6 @@ int hfi_hfifs_unit_rd(uint32_t unit, const char *, void *, int); int hfi_hfifs_open(const char *relname, int flags); -/* wait for device special file to show up. timeout is in - * milliseconds, 0 is "callee knows best", < 0 is infinite. */ -int hfi_wait_for_device(const char *path, long timeout); - int hfi_cmd_wait_for_packet(int fd); #endif /* OPA_SERVICE_GEN1_H */ diff --git a/psm_hal_gen1/opa_user_gen1.h b/psm_hal_gen1/opa_user_gen1.h index 9731b2b..adb120d 100644 --- a/psm_hal_gen1/opa_user_gen1.h +++ b/psm_hal_gen1/opa_user_gen1.h @@ -77,6 +77,7 @@ #include #include #include +#include #include "opa_intf.h" #include "opa_common_gen1.h" #include "opa_byteorder.h" @@ -149,6 +150,82 @@ struct hfi_pbc { __u16 fill1; }; +typedef enum mapsize +{ SC_CREDITS, + PIO_BUFBASE_SOP, + PIO_BUFBASE, + RCVHDR_BUFBASE, + RCVEGR_BUFBASE, + SDMA_COMP_BUFBASE, + USER_REGBASE, + RCVHDRTAIL_BASE, + EVENTS_BUFBASE, + STATUS_BUFBASE, + SUBCTXT_UREGBASE, + SUBCTXT_RCVHDRBUF, + SUBCTXT_RCVEGRBUF, + MAPSIZE_MAX +} mapsize_t; + +/* TODO: consider casting in the ALIGN() macro */ +#define ALIGN(x, a) (((x)+(a)-1)&~((a)-1)) +#define ALIGNDOWN_PTR(x, a) ((void*)(((uintptr_t)(x))&~((uintptr_t)((a)-1)))) + +/* using the same flags for all the mappings */ +#define HFI_MMAP_FLAGS (MAP_SHARED|MAP_LOCKED) +#define HFI_MMAP_PGSIZE sysconf(_SC_PAGESIZE) +/* cast to uintptr_t as opposed to intptr_t which evaluates to a signed type + * on which one should not perform bitwise operations (undefined behavior) + */ +#define HFI_MMAP_PGMASK (~(uintptr_t)(HFI_MMAP_PGSIZE-1)) + +/* this is only an auxiliary macro for HFI_MMAP_ERRCHECK() + * @off expected to be unsigned in order to AND with the page mask and avoid undefined behavior + */ +#define U64_TO_OFF64_PGMASK(off) ((__off64_t)((off) & HFI_MMAP_PGMASK)) + +#define HFI_MMAP_ALIGNOFF(fd, off, size, prot) hfi_mmap64(0,(size),(prot),HFI_MMAP_FLAGS,(fd),U64_TO_OFF64_PGMASK((off))) +/* complementary */ +#define HFI_MUNMAP(addr, size) munmap((addr), (size)) + +/* make sure uintmax_t can hold the result of unsigned int multiplication */ +#if UINT_MAX > (UINTMAX_MAX / UINT_MAX) +#error We cannot safely multiply unsigned integers on this platform +#endif + +/* @member assumed to be of type u64 and validated to be so */ +#define HFI_MMAP_ERRCHECK(fd, binfo, member, size, prot) ({ \ + typeof((binfo)->member) *__tptr = (__u64 *)NULL; \ + (void)__tptr; \ + void *__maddr = HFI_MMAP_ALIGNOFF((fd), (binfo)->member,
(size), (prot)); \ + do { \ + if (unlikely(__maddr == MAP_FAILED)) { \ + uintmax_t outval = (uintmax_t)((binfo)->member); \ + _HFI_INFO("mmap of " #member " (0x%jx) size %zu failed: %s\n", \ + outval, size, strerror(errno)); \ + goto err_mmap_##member; \ + } \ + (binfo)->member = (__u64)__maddr; \ + _HFI_VDBG(#member " mmap %jx successful\n", (uintmax_t)((binfo)->member)); \ + } while(0); \ + __maddr; \ +}) + +/* assigns 0 to the member after unmapping */ +#define HFI_MUNMAP_ERRCHECK(binfo, member, size) \ + do { typeof((binfo)->member) *__tptr = (__u64 *)NULL; \ + (void)__tptr; \ + void *__addr = ALIGNDOWN_PTR((binfo)->member, HFI_MMAP_PGSIZE); \ + if (unlikely( __addr == NULL || (munmap(__addr, (size)) == -1))) { \ + _HFI_INFO("unmap of " #member " (%p) failed: %s\n", \ + __addr, strerror(errno)); \ + } \ + else { \ + _HFI_VDBG("unmap of " #member " (%p) succeeded\n", __addr); \ + (binfo)->member = 0; \ + } \ + } while(0) + #define HFI_PCB_SIZE_IN_BYTES 8 /* Usable bytes in header (hdrsize - lrh - bth) */ @@ -199,7 +276,7 @@ struct _hfi_ctrl { struct hfi1_base_info base_info; /* some local storages in some condition: */ - /* as storage of __hfi_rcvtidflow in hfi_userinit(). */ + /* as storage of __hfi_rcvtidflow in hfi_userinit_internal(). */ __le64 regs[HFI_TF_NFLOWS]; /* location to which OPA writes the rcvhdrtail register whenever @@ -236,9 +313,13 @@ struct _hfi_ctrl { struct _hfi_ctrl *. The struct _hfi_ctrl * used for everything else is returned by this routine. */ - struct _hfi_ctrl *hfi_userinit(int32_t, struct hfi1_user_info_dep *); +/* The internal function extends the API, while the original remains for backwards + compatibility with external code +*/ +struct _hfi_ctrl *hfi_userinit_internal(int32_t, bool, struct hfi1_user_info_dep *); + /* don't inline these; it's all init code, and not inlining makes the */ /* overall code shorter and easier to debug */ void hfi_touch_mmap(void *, size_t) __attribute__ ((noinline)); @@ -477,10 +558,9 @@ static __inline__ int32_t hfi_update_tid(struct _hfi_ctrl *ctrl, uint64_t tidlist, uint32_t *tidcnt, uint16_t flags) { struct hfi1_cmd cmd; -#ifdef PSM_CUDA - struct hfi1_tid_info_v2 tidinfo; -#else struct hfi1_tid_info tidinfo; +#ifdef PSM_CUDA + struct hfi1_tid_info_v2 tidinfov2; #endif int err; @@ -491,23 +571,30 @@ static __inline__ int32_t hfi_update_tid(struct _hfi_ctrl *ctrl, tidinfo.tidcnt = 0; /* clear to zero */ cmd.type = PSMI_HFI_CMD_TID_UPDATE; -#ifdef PSM_CUDA - cmd.type = PSMI_HFI_CMD_TID_UPDATE_V2; - - if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) - tidinfo.flags = flags; - else - tidinfo.flags = 0; -#endif - cmd.len = sizeof(tidinfo); cmd.addr = (__u64) &tidinfo; +#ifdef PSM_CUDA + if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) { + /* Copy values to v2 struct */ + tidinfov2.vaddr = tidinfo.vaddr; + tidinfov2.length = tidinfo.length; + tidinfov2.tidlist = tidinfo.tidlist; + tidinfov2.tidcnt = tidinfo.tidcnt; + tidinfov2.flags = flags; + + cmd.type = PSMI_HFI_CMD_TID_UPDATE_V2; + cmd.len = sizeof(tidinfov2); + cmd.addr = (__u64) &tidinfov2; + } +#endif err = hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)); if (err != -1) { + struct hfi1_tid_info *rettidinfo = + (struct hfi1_tid_info *)cmd.addr; + *length = rettidinfo->length; + *tidcnt = rettidinfo->tidcnt; } return err; diff --git a/psm_hal_gen1/psm_gdrcpy.c b/psm_hal_gen1/psm_gdrcpy.c index 06cb9c2..1896f9e 100644 --- a/psm_hal_gen1/psm_gdrcpy.c +++ b/psm_hal_gen1/psm_gdrcpy.c @@ -63,9 +63,6 @@ static int gdr_fd; -int is_gdr_copy_enabled; - - int
get_gdr_fd(){ return gdr_fd; } @@ -175,6 +172,9 @@ gdr_convert_gpu_to_host_addr(int gdr_fd, unsigned long buf, ((buf + size - 1) & GPU_PAGE_MASK) - pageaddr); + _HFI_VDBG("(gpudirect) buf=%p size=%zu pageaddr=%p pagelen=%u flags=0x%x proto=%p\n", + (void *)buf, size, (void *)pageaddr, pagelen, flags, proto); + query_params.query_params_in.gpu_buf_addr = pageaddr; query_params.query_params_in.gpu_buf_size = pagelen; retry: @@ -186,7 +186,7 @@ gdr_convert_gpu_to_host_addr(int gdr_fd, unsigned long buf, if (!handle_out_of_bar_space(proto)) { /* Fatal error */ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "Unable to PIN GPU pages(Out of BAR1 space)\n"); + "Unable to PIN GPU pages (Out of BAR1 space) (errno: %d)\n", errno); return NULL; } else { goto retry; } diff --git a/psm_hal_gen1/psm_hal_gen1.c b/psm_hal_gen1/psm_hal_gen1.c index 732943f..be5e351 100644 --- a/psm_hal_gen1/psm_hal_gen1.c +++ b/psm_hal_gen1/psm_hal_gen1.c @@ -82,7 +82,7 @@ static hfp_gen1_t psm_gen1_hi = { .hfp_close_context = hfp_gen1_close_context, .hfp_context_open = hfp_gen1_context_open, .hfp_dma_slot_available = hfp_gen1_dma_slot_available, - .hfp_finalize = hfp_gen1_finalize, + .hfp_finalize_ = hfp_gen1_finalize_, .hfp_forward_packet_to_subcontext = hfp_gen1_forward_packet_to_subcontext, .hfp_free_tid = hfp_gen1_free_tid, .hfp_get_bthqp = hfp_gen1_get_bthqp, @@ -102,18 +102,15 @@ static hfp_gen1_t psm_gen1_hi = { .hfp_get_jkey = hfp_gen1_get_jkey, .hfp_get_lid = hfp_gen1_get_lid, .hfp_get_node_id = hfp_gen1_get_node_id, - .hfp_get_num_contexts = hfp_gen1_get_num_contexts, - .hfp_get_num_free_contexts = hfp_gen1_get_num_free_contexts, .hfp_get_pio_size = hfp_gen1_get_pio_size, .hfp_get_pio_stall_cnt = hfp_gen1_get_pio_stall_cnt, - .hfp_get_port_active = hfp_gen1_get_port_active, .hfp_get_port_gid = hfp_gen1_get_port_gid, .hfp_get_port_index2pkey = hfp_gen1_get_port_index2pkey, .hfp_get_port_lid = hfp_gen1_get_port_lid, .hfp_get_port_lmc = hfp_gen1_get_port_lmc, .hfp_get_port_num = hfp_gen1_get_port_num, .hfp_get_port_rate = hfp_gen1_get_port_rate, - .hfp_get_port_sc2vl = hfp_gen1_get_port_sc2vl, + .hfp_get_sc2vl_map = hfp_gen1_get_sc2vl_map, .hfp_get_port_sl2sc = hfp_gen1_get_port_sl2sc, .hfp_get_receive_event = hfp_gen1_get_receive_event, .hfp_get_rhf_expected_sequence_number = hfp_gen1_get_rhf_expected_sequence_number, @@ -127,7 +124,6 @@ static hfp_gen1_t psm_gen1_hi = { .hfp_get_subctxt_cnt = hfp_gen1_get_subctxt_cnt, .hfp_get_tid_exp_cnt = hfp_gen1_get_tid_exp_cnt, .hfp_get_tidcache_invalidation = hfp_gen1_get_tidcache_invalidation, - .hfp_get_unit_active = hfp_gen1_get_unit_active, .hfp_get_unit_id = hfp_gen1_get_unit_id, .hfp_get_user_major_bldtime_version = hfp_gen1_get_user_major_bldtime_version, .hfp_get_user_minor_bldtime_version = hfp_gen1_get_user_minor_bldtime_version, @@ -171,8 +167,12 @@ static hfp_gen1_t psm_gen1_hi = { .hfp_writev = hfp_gen1_writev, #endif .hfp_get_default_pkey = hfp_gen1_get_default_pkey, + .hfp_get_num_contexts = hfp_gen1_get_num_contexts, + .hfp_get_num_free_contexts = hfp_gen1_get_num_free_contexts, .hfp_get_num_units = hfp_gen1_get_num_units, .hfp_get_num_ports = hfp_gen1_get_num_ports, + .hfp_get_port_active = hfp_gen1_get_port_active, + .hfp_get_unit_active = hfp_gen1_get_unit_active, .hfp_initialize = hfp_gen1_initialize, }, /* start of private hfp_gen1_private data */ diff --git a/psm_hal_gen1/psm_hal_gen1.h b/psm_hal_gen1/psm_hal_gen1.h index abe04a5..c4610f2 100644 --- a/psm_hal_gen1/psm_hal_gen1.h +++ b/psm_hal_gen1/psm_hal_gen1.h @@ -89,6 +89,7 @@ typedef struct
_hfp_gen1_pc_private struct ips_subcontext_ureg *subcontext_ureg[HFI1_MAX_SHARED_CTXTS]; struct ips_spio spio_ctrl; struct hfi1_user_info_dep user_info; + uint16_t sc2vl[PSMI_N_SCS]; } hfp_gen1_pc_private; /* At the end of each scb struct, we have space reserved to accommodate diff --git a/psm_hal_gen1/psm_hal_gen1_spio.c b/psm_hal_gen1/psm_hal_gen1_spio.c index 8767dd9..eb9d5aa 100644 --- a/psm_hal_gen1/psm_hal_gen1_spio.c +++ b/psm_hal_gen1/psm_hal_gen1_spio.c @@ -640,7 +640,7 @@ ips_spio_process_events(const struct ptl *ptl_gen) if (event_mask & PSM_HAL_HFI_EVENT_SL2VL_CHANGE) { _HFI_INFO("SL2VL mapping changed for port.\n"); - ips_ibta_init_sl2sc2vl_table(&((struct ptl_ips *)(ctrl->ptl))->proto); + ips_ibta_init_sl2sc_table(&((struct ptl_ips *)(ctrl->ptl))->proto); } return PSM2_OK; @@ -686,6 +686,7 @@ ips_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, if (do_lock) pthread_spin_lock(&ctrl->spio_lock); +#ifdef PSM_FI if_pf(PSMI_FAULTINJ_ENABLED()) { PSMI_FAULTINJ_STATIC_DECL(fi_lost, "piosend", 1, IPS_FAULTINJ_PIOLOST); @@ -699,6 +700,7 @@ ips_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, goto fi_busy; /* else fall through normal processing path, i.e. no faults */ } +#endif /* #ifdef PSM_FI */ psmi_assert((length & 0x3) == 0); paylen = length + (cksum_valid ? PSM_CRC_SIZE_IN_BYTES : 0); @@ -709,7 +711,9 @@ ips_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, if_pf(spio_ctrl->spio_available_blocks < nblks) { /* Check unit status */ +#ifdef PSM_FI fi_busy: +#endif /* #ifdef PSM_FI */ if ((err = psmi_context_check_status(ctrl->context)) == PSM2_OK) { diff --git a/psm_hal_gen1/psm_hal_inline_i.h b/psm_hal_gen1/psm_hal_inline_i.h index d573653..6346bae 100644 --- a/psm_hal_gen1/psm_hal_inline_i.h +++ b/psm_hal_gen1/psm_hal_inline_i.h @@ -53,6 +53,8 @@ #include "psm_hal_gen1.h" +extern size_t arrsz[MAPSIZE_MAX]; + static inline struct _hfp_gen1 *get_psm_gen1_hi(void) { return (struct _hfp_gen1*) psmi_hal_current_hal_instance; @@ -64,16 +66,16 @@ static PSMI_HAL_INLINE int hfp_gen1_initialize(psmi_hal_instance_t *phi) return 0; } -/* hfp_gen1_finalize */ -static PSMI_HAL_INLINE int hfp_gen1_finalize(void) +/* hfp_gen1_finalize_ */ +static PSMI_HAL_INLINE int hfp_gen1_finalize_(void) { return 0; } /* hfp_gen1_get_num_units */ -static PSMI_HAL_INLINE int hfp_gen1_get_num_units(int wait) +static PSMI_HAL_INLINE int hfp_gen1_get_num_units(void) { - return hfi_get_num_units(wait); + return hfi_get_num_units(); } /* hfp_gen1_get_num_ports */ @@ -120,63 +122,100 @@ static PSMI_HAL_INLINE int hfp_gen1_get_num_free_contexts(int unit) return -PSM_HAL_ERROR_GENERAL_ERROR; } -/* hfp_gen1_close_context */ -static PSMI_HAL_INLINE int hfp_gen1_close_context(psmi_hal_hw_context *ctxtp) +static void free_egr_buffs(hfp_gen1_pc_private *psm_hw_ctxt) { - if (!ctxtp || !*ctxtp) - return PSM_HAL_ERROR_OK; +#define FREE_EGR_BUFFS_TABLE(cl_qs_arr, index) ips_recvq_egrbuf_table_free(((cl_qs_arr)[index]).egr_buffs) + size_t i, index, subctxt_cnt; + psm_hal_gen1_cl_q_t *cl_qs; - int i; - hfp_gen1_pc_private *psm_hw_ctxt = *ctxtp; - - ips_recvq_egrbuf_table_free(psm_hw_ctxt->cl_qs[PSM_HAL_CL_Q_RX_EGR_Q].egr_buffs); - - for (i=0;i < psm_hw_ctxt->user_info.subctxt_cnt;i++) - ips_recvq_egrbuf_table_free( - psm_hw_ctxt->cl_qs[ - PSM_HAL_GET_SC_CL_Q_RX_EGR_Q(i) - ].egr_buffs); - struct hfi1_base_info *binfo; - struct hfi1_ctxt_info *cinfo; - int __hfi_pg_sz = sysconf(_SC_PAGESIZE); + cl_qs = psm_hw_ctxt->cl_qs; + index = PSM_HAL_CL_Q_RX_EGR_Q; + 
FREE_EGR_BUFFS_TABLE(cl_qs, index); + + subctxt_cnt = psm_hw_ctxt->user_info.subctxt_cnt; + for (i = 0; i < subctxt_cnt; i++) { + index = PSM_HAL_GET_SC_CL_Q_RX_EGR_Q(i); + FREE_EGR_BUFFS_TABLE(cl_qs, index); + } +#undef FREE_EGR_BUFFS_TABLE +} + +static void unmap_hfi_mem(hfp_gen1_pc_private *psm_hw_ctxt) +{ + size_t subctxt_cnt = psm_hw_ctxt->user_info.subctxt_cnt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - binfo = &ctrl->base_info; - cinfo = &ctrl->ctxt_info; - - munmap((void*)PSMI_ALIGNDOWN(binfo->sc_credits_addr, __hfi_pg_sz), - __hfi_pg_sz); - munmap((void*)PSMI_ALIGNDOWN(binfo->pio_bufbase_sop, __hfi_pg_sz), - cinfo->credits * 64); - munmap((void*)PSMI_ALIGNDOWN(binfo->pio_bufbase, __hfi_pg_sz), - cinfo->credits * 64); - munmap((void*)PSMI_ALIGNDOWN(binfo->rcvhdr_bufbase, __hfi_pg_sz), - cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize); - munmap((void*)PSMI_ALIGNDOWN(binfo->rcvegr_bufbase, __hfi_pg_sz), - cinfo->egrtids * cinfo->rcvegr_size); - munmap((void*)PSMI_ALIGNDOWN(binfo->sdma_comp_bufbase, __hfi_pg_sz), - cinfo->sdma_ring_size * sizeof(struct hfi1_sdma_comp_entry)); - /* only unmap the RTAIL if it was enabled in the first place */ + struct hfi1_base_info *binfo = &ctrl->base_info; + struct hfi1_ctxt_info *cinfo = &ctrl->ctxt_info; + + /* 1. Unmap the PIO credits address */ + HFI_MUNMAP_ERRCHECK(binfo, sc_credits_addr, arrsz[SC_CREDITS]); + + /* 2. Unmap the PIO buffer SOP address */ + HFI_MUNMAP_ERRCHECK(binfo, pio_bufbase_sop, arrsz[PIO_BUFBASE_SOP]); + + /* 3. Unmap the PIO buffer address */ + HFI_MUNMAP_ERRCHECK(binfo, pio_bufbase, arrsz[PIO_BUFBASE]); + + /* 4. Unmap the receive header queue */ + HFI_MUNMAP_ERRCHECK(binfo, rcvhdr_bufbase, arrsz[RCVHDR_BUFBASE]); + + /* 5. Unmap the receive eager buffer */ + HFI_MUNMAP_ERRCHECK(binfo, rcvegr_bufbase, arrsz[RCVEGR_BUFBASE]); + + /* 6. Unmap the sdma completion queue */ + HFI_MUNMAP_ERRCHECK(binfo, sdma_comp_bufbase, arrsz[SDMA_COMP_BUFBASE]); + + /* 7. Unmap RXE per-context CSRs */ + HFI_MUNMAP_ERRCHECK(binfo, user_regbase, arrsz[USER_REGBASE]); + ctrl->__hfi_rcvhdrtail = NULL; + ctrl->__hfi_rcvhdrhead = NULL; + ctrl->__hfi_rcvegrtail = NULL; + ctrl->__hfi_rcvegrhead = NULL; + ctrl->__hfi_rcvofftail = NULL; + if (cinfo->runtime_flags & HFI1_CAP_HDRSUPP) { + ctrl->__hfi_rcvtidflow = NULL; + } + + /* 8. Unmap the rcvhdrq tail register address */ if (cinfo->runtime_flags & HFI1_CAP_DMA_RTAIL) { - munmap((void*)PSMI_ALIGNDOWN(binfo->rcvhdrtail_base, __hfi_pg_sz), - __hfi_pg_sz); + /* only unmap the RTAIL if it was enabled in the first place */ + HFI_MUNMAP_ERRCHECK(binfo, rcvhdrtail_base, arrsz[RCVHDRTAIL_BASE]); + } else { + binfo->rcvhdrtail_base = 0; } - munmap((void*)PSMI_ALIGNDOWN(binfo->user_regbase, __hfi_pg_sz), - __hfi_pg_sz); - munmap((void*)PSMI_ALIGNDOWN(binfo->events_bufbase, __hfi_pg_sz), - __hfi_pg_sz); - munmap((void*)PSMI_ALIGNDOWN(binfo->status_bufbase, __hfi_pg_sz), - __hfi_pg_sz); - - /* only unmap subcontext-related stuff it subcontexts are enabled */ - if (psm_hw_ctxt->user_info.subctxt_cnt > 0) { - munmap((void*)PSMI_ALIGNDOWN(binfo->subctxt_uregbase, __hfi_pg_sz), - __hfi_pg_sz); - munmap((void*)PSMI_ALIGNDOWN(binfo->subctxt_rcvhdrbuf, __hfi_pg_sz), - __hfi_pg_sz); - munmap((void*)PSMI_ALIGNDOWN(binfo->subctxt_rcvegrbuf, __hfi_pg_sz), - __hfi_pg_sz); + + /* 9. Unmap the event page */ + HFI_MUNMAP_ERRCHECK(binfo, events_bufbase, arrsz[EVENTS_BUFBASE]); + + /* 10. Unmap the status page */ + HFI_MUNMAP_ERRCHECK(binfo, status_bufbase, arrsz[STATUS_BUFBASE]); + + /* 11. 
If subcontext is used, unmap the buffers */ + if (subctxt_cnt > 0) { + /* only unmap subcontext-related stuff if subcontexts are enabled */ + HFI_MUNMAP_ERRCHECK(binfo, subctxt_uregbase, arrsz[SUBCTXT_UREGBASE]); + HFI_MUNMAP_ERRCHECK(binfo, subctxt_rcvhdrbuf, arrsz[SUBCTXT_RCVHDRBUF]); + HFI_MUNMAP_ERRCHECK(binfo, subctxt_rcvegrbuf, arrsz[SUBCTXT_RCVEGRBUF]); + } +} +/* hfp_gen1_close_context */ +static PSMI_HAL_INLINE int hfp_gen1_close_context(psmi_hal_hw_context *ctxtp) +{ + hfp_gen1_pc_private *psm_hw_ctxt; + + if (!ctxtp || !*ctxtp) + return PSM_HAL_ERROR_OK; + + psm_hw_ctxt = (hfp_gen1_pc_private *)(*ctxtp); + + /* Free the eager buffers */ + free_egr_buffs(psm_hw_ctxt); + + /* Unmap the HFI memory */ + unmap_hfi_mem(psm_hw_ctxt); + + /* Clean up the rest */ close(psm_hw_ctxt->ctrl->fd); free(psm_hw_ctxt->ctrl); psmi_free(psm_hw_ctxt); @@ -226,7 +265,7 @@ psmi_init_userinfo_params(psm2_ep_t ep, int unit_id, if (!shcontexts_enabled) return err; - avail_contexts = hfi_get_num_contexts(unit_id, 0); + avail_contexts = hfi_get_num_contexts(unit_id); if (avail_contexts == 0) { err = psmi_handle_error(NULL, PSM2_EP_NO_DEVICE, @@ -465,9 +504,16 @@ uint64_t get_cap_mask(uint64_t gen1_mask) { HFI1_CAP_STATIC_RATE_CTRL, PSM_HAL_CAP_STATIC_RATE_CTRL }, { HFI1_CAP_SDMA_HEAD_CHECK, PSM_HAL_CAP_SDMA_HEAD_CHECK }, { HFI1_CAP_EARLY_CREDIT_RETURN, PSM_HAL_CAP_EARLY_CREDIT_RETURN }, -#ifdef PSM_CUDA +#ifdef HFI1_CAP_GPUDIRECT_OT { HFI1_CAP_GPUDIRECT_OT, PSM_HAL_CAP_GPUDIRECT_OT }, -#endif +#else /* #ifdef HFI1_CAP_GPUDIRECT_OT */ +#ifndef PSM_CUDA + /* lifted from hfi1_user.h */ + { (1UL << 63), PSM_HAL_CAP_GPUDIRECT_OT }, +#else /* #ifndef PSM_CUDA */ +#error "Inconsistent build. HFI1_CAP_GPUDIRECT_OT must be defined for CUDA builds." +#endif /* #ifndef PSM_CUDA */ +#endif /* #ifdef HFI1_CAP_GPUDIRECT_OT */ }; uint64_t rv = 0; int i; @@ -490,7 +536,7 @@ static PSMI_HAL_INLINE int hfp_gen1_context_open(int unit, unsigned retryCnt) { int fd = -1; - psm2_error_t err = PSM_HAL_ERROR_OK; + psm2_error_t err = PSM2_OK; hfp_gen1_pc_private *pc_private = psmi_malloc(ep, UNDEFINED, sizeof(hfp_gen1_pc_private)); if_pf (!pc_private) { @@ -498,7 +544,7 @@ static PSMI_HAL_INLINE int hfp_gen1_context_open(int unit, goto bail; } - memset(pc_private,0,sizeof(hfp_gen1_pc_private)); + memset(pc_private, 0, sizeof(hfp_gen1_pc_private)); char dev_name[PATH_MAX]; fd = hfi_context_open_ex(unit, port, open_timeout, @@ -518,13 +564,14 @@ static PSMI_HAL_INLINE int hfp_gen1_context_open(int unit, goto bail; } - /* attempt to assign the context via hfi_userinit() */ + /* attempt to assign the context via hfi_userinit_internal() */ int retry = 0; do { if (retry > 0) - _HFI_INFO("hfi_userinit: failed, trying again (%d/%d)\n", + _HFI_INFO("hfi_userinit_internal: failed, trying again (%d/%d)\n", retry, retryCnt); - pc_private->ctrl = hfi_userinit(fd, &pc_private->user_info); + pc_private->ctrl = hfi_userinit_internal(fd, ep->skip_affinity, + &pc_private->user_info); } while (pc_private->ctrl == NULL && ++retry <= retryCnt); if (!pc_private->ctrl) @@ -781,9 +828,24 @@ static PSMI_HAL_INLINE int hfp_gen1_get_port_sl2sc(int unit, int port, int sl) return hfi_get_port_sl2sc(unit, port, sl); } -static PSMI_HAL_INLINE int hfp_gen1_get_port_sc2vl(int unit, int port, int sc) +static PSMI_HAL_INLINE int hfp_gen1_get_sc2vl_map(struct ips_proto *proto) { - return hfi_get_port_sc2vl(unit, port, sc); + hfp_gen1_pc_private *psm_hw_ctxt = proto->ep->context.psm_hw_ctxt; + uint8_t i; + + /* Get SC2VL table for unit, port */ + for (i = 0; i <
PSMI_N_SCS; i++) { + int ret = hfi_get_port_sc2vl( + psmi_hal_get_unit_id(proto->ep->context.psm_hw_ctxt), + psmi_hal_get_port_num(proto->ep->context.psm_hw_ctxt), + i); + if (ret < 0) + /* Unable to get SC2VL. Set it to default */ + ret = PSMI_VL_DEFAULT; + + psm_hw_ctxt->sc2vl[i] = (uint16_t) ret; + } + return PSM_HAL_ERROR_OK; } static PSMI_HAL_INLINE int hfp_gen1_set_pkey(psmi_hal_hw_context ctxt, uint16_t pkey) @@ -1146,6 +1208,46 @@ static PSMI_HAL_INLINE int hfp_gen1_get_receive_event(psmi_hal_cl_idx head_idx, PSM_HAL_ERROR_OK) return rv; + /* If the hdrq_head is before cachedlastscan, that means that we have + * already prescanned this for BECNs and FECNs, so we should not check + * again + */ + if_pt((rcv_ev->proto->flags & IPS_PROTO_FLAG_CCA) && + (head_idx >= rcv_ev->recvq->state->hdrq_cachedlastscan)) { + /* IBTA CCA handling: + * If FECN bit set handle IBTA CCA protocol. For the + * flow that suffered congestion we flag it to generate + * a control packet with the BECN bit set - This is + * currently an unsolicited ACK. + * + * For all MQ packets the FECN processing/BECN + * generation is done in the is_expected_or_nak + * function as each eager packet is inspected there. + * + * For TIDFLOW/Expected data transfers the FECN + * bit/BECN generation is done in protoexp_data. Since + * header suppression can result in even FECN packets + * being suppressed, the expected protocol generates + * additional BECN packets if a "large" number of + * generations are swapped without progress being made + * for receive. "Large" is set empirically to 4. + * + * FECN packets are ignored for all control messages + * (except ACKs and NAKs) since they indicate + * congestion on the control path which is not rate + * controlled. The CCA specification allows FECN on + * ACKs to be disregarded as well.
+ */ + + rcv_ev->is_congested = + _is_cca_fecn_set(rcv_ev-> + p_hdr) & IPS_RECV_EVENT_FECN; + rcv_ev->is_congested |= + (_is_cca_becn_set(rcv_ev->p_hdr) << + (IPS_RECV_EVENT_BECN - 1)); + } else + rcv_ev->is_congested = 0; + return PSM_HAL_ERROR_OK; } @@ -1265,9 +1367,10 @@ ips_proto_pbc_update(struct ips_proto *proto, struct ips_flow *flow, uint32_t isCtrlMsg, struct psm_hal_pbc *pbc, uint32_t hdrlen, uint32_t paylen)) { + hfp_gen1_pc_private *psm_hw_ctxt = proto->ep->context.psm_hw_ctxt; int dw = (sizeof(struct psm_hal_pbc) + hdrlen + paylen) >> BYTE2DWORD_SHIFT; int sc = proto->sl2sc[flow->path->pr_sl]; - int vl = proto->sc2vl[sc]; + int vl = psm_hw_ctxt->sc2vl[sc]; uint16_t static_rate = 0; if_pf(!isCtrlMsg && flow->path->pr_active_ipd) diff --git a/psm_lock.h b/psm_lock.h index c82960c..4a17272 100644 --- a/psm_lock.h +++ b/psm_lock.h @@ -69,11 +69,13 @@ typedef pthread_spinlock_t psmi_spinlock_t; #define psmi_spin_init(lock) pthread_spin_init(lock, \ PTHREAD_PROCESS_PRIVATE) +#define psmi_spin_destroy(lock) pthread_spin_destroy(lock) #define psmi_spin_lock(lock) pthread_spin_lock(lock) #define psmi_spin_trylock(lock) pthread_spin_trylock(lock) #define psmi_spin_unlock(lock) pthread_spin_unlock(lock) #else typedef ips_atomic_t psmi_spinlock_t; +#define PSMI_SPIN_INVALID 2 #define PSMI_SPIN_LOCKED 1 #define PSMI_SPIN_UNLOCKED 0 #endif @@ -103,10 +105,26 @@ PSMI_ALWAYS_INLINE(int psmi_spin_init(psmi_spinlock_t *lock)) PSMI_ALWAYS_INLINE(int psmi_spin_trylock(psmi_spinlock_t *lock)) { if (ips_atomic_cmpxchg(lock, PSMI_SPIN_UNLOCKED, PSMI_SPIN_LOCKED) - == PSMI_SPIN_UNLOCKED) { return 0; - else - return EBUSY; + } + + return EBUSY; +} + +PSMI_ALWAYS_INLINE(int psmi_spin_destroy(psmi_spinlock_t *lock)) +{ + if (lock == NULL) { + return EINVAL; + } + + /* We could just do psmi_spin_trylock() here and dispense with the invalid state */ + if (ips_atomic_cmpxchg(lock, PSMI_SPIN_UNLOCKED, PSMI_SPIN_INVALID) + == PSMI_SPIN_UNLOCKED) { + return 0; + } + + return EBUSY; } PSMI_ALWAYS_INLINE(int psmi_spin_lock(psmi_spinlock_t *lock)) @@ -139,6 +157,35 @@ PSMI_ALWAYS_INLINE(void psmi_init_lock(psmi_lock_t *lock)) #endif } +PSMI_ALWAYS_INLINE(void psmi_destroy_lock(psmi_lock_t *lock)) +{ + int err; +#ifdef PSMI_LOCK_IS_SPINLOCK + /* This will map to either pthread_spin_destroy() or our custom psmi_spin_destroy(). + * Both return values can be interpreted by strerror(). + */ + if ((err = psmi_spin_destroy(&(lock->lock))) != 0) { + _HFI_VDBG("Destroying spinlock failed: %s\n", strerror(err)); + } + /* The same path for both the regular mutex and the debugging mutex */ +#elif defined(PSMI_LOCK_IS_MUTEXLOCK) || defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG) + if ((err = pthread_mutex_destroy(&(lock->lock))) != 0) { + /* strerror_r() may be a better choice here but it is tricky + * to reliably detect the XSI vs GNU version, and if hardcoded, + * may be inadvertently changed when tampering with headers/makefiles + * in the long run. + * + * This would result in incorrect operation: a segfault from + * dereferencing the return value or failure to retrieve the + * error string. + * + * C11's strerror_s may be an option here too.
+ */ + _HFI_VDBG("Destroying mutex failed: %s\n", strerror(err)); + } +#endif +} + PSMI_ALWAYS_INLINE(int psmi_sem_post(sem_t *sem, const char *name)) { if (sem_post(sem) == -1) { diff --git a/psm_mq.c b/psm_mq.c index f41c134..a25a581 100644 --- a/psm_mq.c +++ b/psm_mq.c @@ -766,14 +766,14 @@ psm2_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len) PSM2_LOG_MSG("entering"); psmi_assert(MQE_TYPE_IS_RECV(req->type)); + psmi_mtucpy_fn_t psmi_mtucpy_fn = psmi_mq_mtucpy; #ifdef PSM_CUDA - psmi_mtucpy_fn_t psmi_mtucpy_fn; - if (req->is_buf_gpu_mem) - psmi_mtucpy_fn = psmi_mq_mtucpy; - else + if (!req->is_buf_gpu_mem) psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem; #endif + _HFI_VDBG("(req=%p) buf=%p len=%u req.state=%u\n", req, buf, len, req->state); + switch (req->state) { case MQ_STATE_COMPLETE: if (req->req_data.buf != NULL) { /* 0-byte messages don't alloc a sysbuf */ @@ -786,10 +786,8 @@ psm2_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len) mq->ep->epaddr->proto); psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem; } - psmi_mtucpy_fn(ubuf, (const void *)req->req_data.buf, copysz); -#else - psmi_mq_mtucpy(ubuf, (const void *)req->req_data.buf, copysz); #endif + psmi_mtucpy_fn(ubuf, (const void *)req->req_data.buf, copysz); psmi_mq_sysbuf_free(mq, req->req_data.buf); } req->req_data.buf = buf; @@ -814,12 +812,7 @@ psm2_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len) #endif if (req->recv_msgoff) { -#ifdef PSM_CUDA - psmi_mtucpy_fn -#else - psmi_mq_mtucpy -#endif - (buf, (const void *)req->req_data.buf, + psmi_mtucpy_fn(buf, (const void *)req->req_data.buf, req->recv_msgoff); } psmi_mq_sysbuf_free(mq, req->req_data.buf); @@ -836,12 +829,7 @@ psm2_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len) */ req->recv_msgoff = min(req->recv_msgoff, copysz); if (req->recv_msgoff) { -#ifdef PSM_CUDA - psmi_mtucpy_fn -#else - psmi_mq_mtucpy -#endif - (buf, (const void *)req->req_data.buf, + psmi_mtucpy_fn(buf, (const void *)req->req_data.buf, req->recv_msgoff); } if (req->send_msgoff) { @@ -895,17 +883,10 @@ __psm2_mq_fp_msg(psm2_ep_t ep, psm2_mq_t mq, psm2_epaddr_t addr, psm2_mq_tag_t * #ifdef PSM_CUDA int gpu_mem = 0; void *gpu_user_buffer = NULL; - /* CUDA documentation dictates the use of SYNC_MEMOPS attribute - * when the buffer pointer received into PSM has been allocated - * by the application. This guarantees the all memory operations - * to this region of memory (used by multiple layers of the stack) - * always synchronize - */ - if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)buf)) { - int trueflag = 1; - PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag, - CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, - (CUdeviceptr)buf); + + if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(buf)) { + psmi_cuda_set_attr_sync_memops(buf); + gpu_mem = 1; gpu_user_buffer = buf; } @@ -980,21 +961,13 @@ __psm2_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_req_t req; #ifdef PSM_CUDA - int gpu_mem; - /* CUDA documentation dictates the use of SYNC_MEMOPS attribute - * when the buffer pointer received into PSM has been allocated - * by the application. 
This guarantees the all memory operations - * to this region of memory (used by multiple layers of the stack) - * always synchronize - */ - if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)buf)) { - int trueflag = 1; - PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag, - CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, - (CUdeviceptr)buf); + int gpu_mem = 0; + + if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(buf)) { + psmi_cuda_set_attr_sync_memops(buf); + gpu_mem = 1; - } else - gpu_mem = 0; + } #endif PSM2_LOG_MSG("entering"); @@ -1111,20 +1084,12 @@ __psm2_mq_imrecv(psm2_mq_t mq, uint32_t flags, void *buf, uint32_t len, req->req_data.context = context; #ifdef PSM_CUDA - /* CUDA documentation dictates the use of SYNC_MEMOPS attribute - * when the buffer pointer received into PSM has been allocated - * by the application. This guarantees the all memory operations - * to this region of memory (used by multiple layers of the stack) - * always synchronize - */ - if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)buf)) { - int trueflag = 1; - PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag, - CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, - (CUdeviceptr)buf); - req->is_buf_gpu_mem = 1; - } else - req->is_buf_gpu_mem = 0; + if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(buf)) { + psmi_cuda_set_attr_sync_memops(buf); + req->is_buf_gpu_mem = 1; + } else { + req->is_buf_gpu_mem = 0; + } #endif PSMI_LOCK(mq->progress_lock); @@ -1451,7 +1416,7 @@ psmi_mq_print_stats_finalize(psm2_mq_t mq) * the user can set options after obtaining an endpoint */ psm2_error_t -__psm2_mq_init(psm2_ep_t ep, uint64_t tag_order_mask, +__psm2_mq_init(psm2_ep_t ep, uint64_t ignored, const struct psm2_optkey *opts, int numopts, psm2_mq_t *mqo) { psm2_error_t err = PSM2_OK; diff --git a/psm_mq_internal.h b/psm_mq_internal.h index 1a26898..a1afaf8 100644 --- a/psm_mq_internal.h +++ b/psm_mq_internal.h @@ -306,11 +306,6 @@ mq_copy_tiny(uint32_t *dest, uint32_t *src, uint8_t len)) { #ifdef PSM_CUDA if (PSMI_IS_CUDA_ENABLED && (PSMI_IS_CUDA_MEM(dest) || PSMI_IS_CUDA_MEM(src))) { - if (!PSMI_IS_CUDA_ENABLED) { - psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "Please enable PSM CUDA support when using GPU buffer \n"); - return; - } PSMI_CUDA_CALL(cuMemcpy, (CUdeviceptr)dest, (CUdeviceptr)src, len); return; } @@ -347,8 +342,8 @@ mq_copy_tiny(uint32_t *dest, uint32_t *src, uint8_t len)) } } -#ifdef PSM_CUDA typedef void (*psmi_mtucpy_fn_t)(void *dest, const void *src, uint32_t len); +#ifdef PSM_CUDA PSMI_ALWAYS_INLINE( void @@ -409,7 +404,7 @@ mq_status_copy(psm2_mq_req_t req, psm2_mq_status_t *status)) status->msg_tag = *((uint64_t *) req->req_data.tag.tag); status->msg_length = req->req_data.send_msglen; status->nbytes = req->req_data.recv_msglen; - status->error_code = req->req_data.error_code; + status->error_code = (psm2_error_t)req->req_data.error_code; status->context = req->req_data.context; } @@ -421,7 +416,7 @@ mq_status2_copy(psm2_mq_req_t req, psm2_mq_status2_t *status)) status->msg_tag = req->req_data.tag; status->msg_length = req->req_data.send_msglen; status->nbytes = req->req_data.recv_msglen; - status->error_code = req->req_data.error_code; + status->error_code = (psm2_error_t)req->req_data.error_code; status->context = req->req_data.context; } diff --git a/psm_mq_recv.c b/psm_mq_recv.c index 0f46075..642fbc1 100644 --- a/psm_mq_recv.c +++ b/psm_mq_recv.c @@ -342,9 +342,11 @@ psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, { psm2_mq_req_t req; uint32_t msglen; + psmi_mtucpy_fn_t psmi_mtucpy_fn; if 
(msgorder && (req = mq_req_match(mq, src, tag, 1))) {
		/* we have a match */
+		void *user_buffer = req->req_data.buf;
 		psmi_assert(MQE_TYPE_IS_RECV(req->type));
 		req->req_data.peer = src;
 		req->req_data.tag = *tag;
@@ -356,29 +358,17 @@ psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
 			  tag->tag[0], tag->tag[1], tag->tag[2], msglen,
 			  paylen);
-		void* user_buffer = NULL;
-
 		switch (opcode) {
 		case MQ_MSG_TINY:
 			/* mq_copy_tiny() can handle zero byte */
-
 #ifdef PSM_CUDA
 			if (PSMI_USE_GDR_COPY(req, msglen)) {
-				void* mmaped_host = gdr_convert_gpu_to_host_addr(GDR_FD,
+				user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD,
 						(unsigned long)req->req_data.buf,
 						msglen, 1, src->proto);
-				mq_copy_tiny((uint32_t *) mmaped_host,
-					(uint32_t *) payload, msglen);
 			}
-			else {
-				mq_copy_tiny((uint32_t *) req->req_data.buf,
-					(uint32_t *) payload, msglen);
-			}
-#else
-
-			mq_copy_tiny((uint32_t *) req->req_data.buf,
-				(uint32_t *) payload, msglen);
 #endif
+			mq_copy_tiny((uint32_t *) user_buffer, (uint32_t *) payload, msglen);
 			req->state = MQ_STATE_COMPLETE;
 			ips_barrier();
@@ -386,9 +376,8 @@ psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
 			break;
 		case MQ_MSG_SHORT:	/* message fits in 1 payload */
-			user_buffer = req->req_data.buf;
+			psmi_mtucpy_fn = psmi_mq_mtucpy;
 #ifdef PSM_CUDA
-			psmi_mtucpy_fn_t psmi_mtucpy_fn = psmi_mq_mtucpy;
 			if (PSMI_USE_GDR_COPY(req, msglen)) {
 				user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD,
 						(unsigned long)req->req_data.buf,
@@ -397,18 +386,10 @@ psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
 			}
 #endif
 			if (msglen <= paylen) {
-#ifdef PSM_CUDA
 				psmi_mtucpy_fn(user_buffer, payload, msglen);
-#else
-				psmi_mq_mtucpy(user_buffer, payload, msglen);
-#endif
 			} else {
 				psmi_assert((msglen & ~0x3) == paylen);
-#ifdef PSM_CUDA
 				psmi_mtucpy_fn(user_buffer, payload, paylen);
-#else
-				psmi_mq_mtucpy(user_buffer, payload, paylen);
-#endif
 				/*
 				 * there are nonDW bytes attached in header,
 				 * copy after the DW payload.
diff --git a/psm_stats.c b/psm_stats.c
index c9b5777..c9f37e6 100644
--- a/psm_stats.c
+++ b/psm_stats.c
@@ -586,6 +586,10 @@ void stats_register_hfi_counters(psm2_ep_t ep)
 	psmi_stats_register_type("OPA device counters",
 				 PSMI_STATSTYPE_DEVCOUNTERS, entries, nc + npc,
 				 ep);
+	// psmi_stats_register_type makes its own copy of entries,
+	// so we should free the entries buffer.
+	// The snames will be freed when we deregister the hfi.
+ psmi_free(entries); return; bail: @@ -605,7 +609,7 @@ void stats_register_hfi_stats(psm2_ep_t ep) struct psmi_stats_entry *entries = NULL; ns = hfi_get_stats_names(&snames); - if (ns == -1 || snames == NULL) + if (ns <= 0 || snames == NULL) goto bail; entries = psmi_calloc(ep, STATS, ns, sizeof(struct psmi_stats_entry)); if (entries == NULL) diff --git a/psm_user.h b/psm_user.h index 5a35085..09477c5 100644 --- a/psm_user.h +++ b/psm_user.h @@ -56,6 +56,10 @@ #ifndef _PSMI_USER_H #define _PSMI_USER_H +#ifdef __cplusplus +extern "C" { +#endif + #include "psm_config.h" #include #include @@ -64,6 +68,7 @@ #include #include #include +#include #include "psm2.h" #include "psm2_mq.h" @@ -301,44 +306,47 @@ void psmi_profile_reblock(int did_no_progress) __attribute__ ((weak)); extern int is_cuda_enabled; extern int is_gdr_copy_enabled; extern int device_support_gpudirect; +extern int gpu_p2p_supported; +extern int my_gpu_device; extern int cuda_lib_version; extern CUcontext ctxt; -void *psmi_cuda_lib; -CUresult (*psmi_cuInit)(unsigned int Flags ); -CUresult (*psmi_cuCtxDetach)(CUcontext c); -CUresult (*psmi_cuCtxSetCurrent)(CUcontext c); -CUresult (*psmi_cuCtxGetCurrent)(CUcontext *c); -CUresult (*psmi_cuCtxSetCurrent)(CUcontext c); -CUresult (*psmi_cuPointerGetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); -CUresult (*psmi_cuPointerSetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); -CUresult (*psmi_cuDeviceGet)(CUdevice* device, int ordinal); -CUresult (*psmi_cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev); -CUresult (*psmi_cuDriverGetVersion)(int* driverVersion); -CUresult (*psmi_cuDeviceGetCount)(int* count); -CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags); -CUresult (*psmi_cuStreamDestroy)(CUstream phStream); -CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags); -CUresult (*psmi_cuEventDestroy)(CUevent hEvent); -CUresult (*psmi_cuEventQuery)(CUevent hEvent); -CUresult (*psmi_cuEventRecord)(CUevent hEvent, CUstream hStream); -CUresult (*psmi_cuEventSynchronize)(CUevent hEvent); -CUresult (*psmi_cuMemHostAlloc)(void** pp, size_t bytesize, unsigned int Flags); -CUresult (*psmi_cuMemFreeHost)(void* p); -CUresult (*psmi_cuMemcpy)(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); -CUresult (*psmi_cuMemcpyDtoD)(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); -CUresult (*psmi_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); -CUresult (*psmi_cuMemcpyHtoD)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount); -CUresult (*psmi_cuMemcpyDtoHAsync)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); -CUresult (*psmi_cuMemcpyHtoDAsync)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount, CUstream hStream); -CUresult (*psmi_cuIpcGetMemHandle)(CUipcMemHandle* pHandle, CUdeviceptr dptr); -CUresult (*psmi_cuIpcOpenMemHandle)(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags); -CUresult (*psmi_cuIpcCloseMemHandle)(CUdeviceptr dptr); -CUresult (*psmi_cuMemGetAddressRange)(CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr); -CUresult (*psmi_cuDevicePrimaryCtxGetState)(CUdevice dev, unsigned int* flags, int* active); -CUresult (*psmi_cuDevicePrimaryCtxRetain)(CUcontext* pctx, CUdevice dev); -CUresult (*psmi_cuCtxGetDevice)(CUdevice* device); -CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device); +extern void *psmi_cuda_lib; + +extern CUresult (*psmi_cuInit)(unsigned int Flags ); +extern CUresult 
(*psmi_cuCtxDetach)(CUcontext c); +extern CUresult (*psmi_cuCtxGetCurrent)(CUcontext *c); +extern CUresult (*psmi_cuCtxSetCurrent)(CUcontext c); +extern CUresult (*psmi_cuPointerGetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); +extern CUresult (*psmi_cuPointerSetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); +extern CUresult (*psmi_cuDeviceCanAccessPeer)(int *canAccessPeer, CUdevice dev, CUdevice peerDev); +extern CUresult (*psmi_cuDeviceGet)(CUdevice* device, int ordinal); +extern CUresult (*psmi_cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev); +extern CUresult (*psmi_cuDriverGetVersion)(int* driverVersion); +extern CUresult (*psmi_cuDeviceGetCount)(int* count); +extern CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags); +extern CUresult (*psmi_cuStreamDestroy)(CUstream phStream); +extern CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags); +extern CUresult (*psmi_cuEventDestroy)(CUevent hEvent); +extern CUresult (*psmi_cuEventQuery)(CUevent hEvent); +extern CUresult (*psmi_cuEventRecord)(CUevent hEvent, CUstream hStream); +extern CUresult (*psmi_cuEventSynchronize)(CUevent hEvent); +extern CUresult (*psmi_cuMemHostAlloc)(void** pp, size_t bytesize, unsigned int Flags); +extern CUresult (*psmi_cuMemFreeHost)(void* p); +extern CUresult (*psmi_cuMemcpy)(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); +extern CUresult (*psmi_cuMemcpyDtoD)(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); +extern CUresult (*psmi_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); +extern CUresult (*psmi_cuMemcpyHtoD)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount); +extern CUresult (*psmi_cuMemcpyDtoHAsync)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); +extern CUresult (*psmi_cuMemcpyHtoDAsync)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount, CUstream hStream); +extern CUresult (*psmi_cuIpcGetMemHandle)(CUipcMemHandle* pHandle, CUdeviceptr dptr); +extern CUresult (*psmi_cuIpcOpenMemHandle)(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags); +extern CUresult (*psmi_cuIpcCloseMemHandle)(CUdeviceptr dptr); +extern CUresult (*psmi_cuMemGetAddressRange)(CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr); +extern CUresult (*psmi_cuDevicePrimaryCtxGetState)(CUdevice dev, unsigned int* flags, int* active); +extern CUresult (*psmi_cuDevicePrimaryCtxRetain)(CUcontext* pctx, CUdevice dev); +extern CUresult (*psmi_cuCtxGetDevice)(CUdevice* device); +extern CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device); #define PSMI_CUDA_CALL(func, args...) do { \ CUresult cudaerr; \ @@ -358,6 +366,39 @@ CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device); } \ } while (0) +/** + * Similar to PSMI_CUDA_CALL() except does not error out + * if func(args) returns CUDA_SUCCESS or except_err + * + * Invoker must provide 'CUresult cudaerr' in invoked scope + * so invoker can inspect whether cudaerr == CUDA_SUCCESS or + * cudaerr == except_err after expanded code is executed. + * + * As except_err is an allowed value, message is printed at + * DBG level. + */ +#define PSMI_CUDA_CALL_EXCEPT(except_err, func, args...) 
do { \
+	cudaerr = psmi_##func(args); \
+	if (cudaerr != CUDA_SUCCESS && cudaerr != except_err) { \
+		if (ctxt == NULL) \
+			_HFI_ERROR( \
+				"Check if CUDA is initialized " \
+				"before psm2_ep_open call \n"); \
+		_HFI_ERROR( \
+			"CUDA failure: %s() (at %s:%d) " \
+			"returned %d\n", \
+			#func, __FILE__, __LINE__, cudaerr); \
+		psmi_handle_error( \
+			PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \
+			"Error returned from CUDA function.\n");\
+	} else if (cudaerr == except_err) { \
+		_HFI_DBG( \
+			"CUDA non-zero return value: %s() (at %s:%d) " \
+			"returned %d\n", \
+			#func, __FILE__, __LINE__, cudaerr); \
+	} \
+} while (0)
+
 #define PSMI_CUDA_CHECK_EVENT(event, cudaerr) do { \
 	cudaerr = psmi_cuEventQuery(event); \
 	if ((cudaerr != CUDA_SUCCESS) && \
@@ -383,7 +424,7 @@ CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device);
 PSMI_ALWAYS_INLINE(
 int
-_psmi_is_cuda_mem(void *ptr))
+_psmi_is_cuda_mem(const void *ptr))
 {
 	CUresult cres;
 	CUmemorytype mt;
@@ -401,14 +442,8 @@ _psmi_is_cuda_mem(void *ptr))
 		return 0;
 }
-PSMI_ALWAYS_INLINE(
-int
-_psmi_is_cuda_enabled())
-{
-	return is_cuda_enabled;
-}
-
-#define PSMI_IS_CUDA_ENABLED _psmi_is_cuda_enabled()
+#define PSMI_IS_CUDA_ENABLED likely(is_cuda_enabled)
+#define PSMI_IS_CUDA_DISABLED unlikely(!is_cuda_enabled)
 PSMI_ALWAYS_INLINE(
 int
@@ -480,8 +515,28 @@ enum psm2_chb_match_type {
 };
 typedef enum psm2_chb_match_type psm2_chb_match_type_t;
+/*
+ * CUDA documentation dictates the use of SYNC_MEMOPS attribute
+ * when the buffer pointer received into PSM has been allocated
+ * by the application. This guarantees that all memory operations
+ * to this region of memory (used by multiple layers of the stack)
+ * always synchronize.
+ */
+static inline
+void psmi_cuda_set_attr_sync_memops(const void *ubuf)
+{
+	int true_flag = 1;
+
+	PSMI_CUDA_CALL(cuPointerSetAttribute, &true_flag,
+		       CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr) ubuf);
+}
+
 #endif /* PSM_CUDA */
 #define COMPILE_TIME_ASSERT(NAME,COND) extern char NAME[1/ COND]
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
 #endif /* _PSMI_USER_H */
diff --git a/psm_utils.c b/psm_utils.c
index 521467f..7f7995d 100644
--- a/psm_utils.c
+++ b/psm_utils.c
@@ -196,6 +196,30 @@ void *psmi_epid_remove(psm2_ep_t ep, psm2_epid_t epid)
 	return psmi_epid_lookup_inner(ep, epid, 1);
 }
+void psmi_epid_remove_all(psm2_ep_t ep)
+{
+	size_t i;
+	struct psmi_epid_tabentry *e;
+
+	pthread_mutex_lock(&psmi_epid_table.tablock);
+
+	for (i = 0; i < psmi_epid_table.tabsize; i++) {
+		e = &psmi_epid_table.table[i];
+
+		if (e->entry == NULL || e->entry == EPADDR_DELETED)
+			continue;
+
+		if (e->ep == ep) {
+			/* unspecified fields implicitly zeroed */
+			*e = (struct psmi_epid_tabentry) {
+				.entry = EPADDR_DELETED
+			};
+		}
+	}
+
+	pthread_mutex_unlock(&psmi_epid_table.tablock);
+}
+
 psm2_error_t psmi_epid_add(psm2_ep_t ep, psm2_epid_t epid, void *entry)
 {
 	uint64_t key;
@@ -262,17 +286,29 @@ fail:
 	return err;
 }
+static psmi_lock_t psmi_gethostname_lock;
+
+static void __attribute__ ((constructor)) __psmi_gethostname_lock_constructor(void)
+{
+	psmi_init_lock(&psmi_gethostname_lock);
+}
+
 char *psmi_gethostname(void)
 {
-	/* XXX this will need a lock in a multi-threaded environment */
 	static char hostname[80] = { '\0' };
 	char *c;
 	if (hostname[0] == '\0') {
-		gethostname(hostname, sizeof(hostname));
-		hostname[sizeof(hostname) - 1] = '\0';	/* no guarantee of nul termination */
-		if ((c = strchr(hostname, '.')))
-			*c = '\0';
+		PSMI_LOCK(psmi_gethostname_lock);
+		/* CRITICAL SECTION START */
+		if (hostname[0] == '\0') {
+			gethostname(hostname, sizeof(hostname));
+			hostname[sizeof(hostname) - 1] = '\0';	/* no guarantee of nul termination */
+			if ((c = strchr(hostname, '.')))
+				*c = '\0';
+		}
+		PSMI_UNLOCK(psmi_gethostname_lock);
+		/* CRITICAL SECTION END */
 	}
 	return hostname;
@@ -817,6 +853,8 @@ void psmi_multi_ep_init()
 	psmi_multi_ep_enabled = env_fi.e_uint;
 }
+#ifdef PSM_FI
+
 int psmi_faultinj_enabled = 0;
 int psmi_faultinj_verbose = 0;
 char *psmi_faultinj_outfile = NULL;
@@ -984,6 +1022,8 @@ int psmi_faultinj_is_fault(struct psmi_faultinj_spec *fi)
 	return 0;
 }
+#endif /* #ifdef PSM_FI */
+
 /* For memory allocation, we kind of break the PSM error handling rules.
 * If the caller gets NULL, it has to assume that the error has been handled
 * and should always return PSM2_NO_MEMORY */
@@ -1237,6 +1277,35 @@ void _psmi_heapdebug_val_heapallocs(const char *curloc)
 	}
 }
+/* psmi_heapdebug_finalize() validates the heap and then emits all of the
+   allocations to stdout, to help debug heap memory leaks. */
+void psmi_heapdebug_finalize(void)
+{
+	/* First validate the existing heap allocations: */
+
+	psmi_heapdebug_val_heapallocs();
+
+	printf("orphaned heap allocations: %d\n", n_allocations);
+
+	if (n_allocations > 0)
+	{
+		/* Now, emit all of the allocations to stdout. */
+
+		HD_Header_Type *p = HD_root_of_list;
+
+		while (p)
+		{
+			printf("orphaned heap allocation: %p allocated at: %s, size: %lu\n",
+				p, p->allocLoc, p->sizeOfAlloc);
+
+			p = p->nextHD_header;
+		}
+		fflush(0);
+		/* Abort if any allocations still exist: */
+		abort();
+	}
+}
+
 /* hd_est_hdr_trlr() establishes the new allocation to the singly linked list, and adds
 * the header and trailer to the allocation. Lastly, it validates the existing singly-linked
 * list for integrity. */
@@ -1246,15 +1315,9 @@ static void hd_est_hdr_trlr(HD_Header_Type *hd_alloc,
				uint64_t actualSize,
				const char *curloc)
 {
-#if 0
-	/* if we use this block of code, psm hangs running mpistress. See JIRA STL-5244. */
+	/* First, write HD_NO_MANS_LAND to the entire allocation: */
 	memset(systemAlloc,HD_NO_MANS_LAND,systemSize);
-#else
-	/* write HD_NO_MANS_LAND to the area between the system allocation and the start of the hd header. */
-	signed char *pchr = systemAlloc;
-	for (;pchr < (signed char*) hd_alloc;pchr++)
-		*pchr = (signed char) HD_NO_MANS_LAND;
-#endif
+
 	/* Write the HD header info: */
 	memcpy(hd_alloc->magic1,HD_HDR_MGC_1,sizeof(HD_HDR_MGC_1));
 	hd_alloc->allocLoc = curloc;
diff --git a/psm_utils.h b/psm_utils.h
index fc38153..0c58307 100644
--- a/psm_utils.h
+++ b/psm_utils.h
@@ -87,6 +87,7 @@ psm2_error_t psmi_epid_init();
 psm2_error_t psmi_epid_fini();
 void *psmi_epid_lookup(psm2_ep_t ep, psm2_epid_t epid);
 void *psmi_epid_remove(psm2_ep_t ep, psm2_epid_t epid);
+void psmi_epid_remove_all(psm2_ep_t ep);
 psm2_error_t psmi_epid_add(psm2_ep_t ep, psm2_epid_t epid, void *entry);
 #define PSMI_EP_HOSTNAME ((psm2_ep_t) -1)	/* Special endpoint handle we use
						 * to register hostnames */
@@ -182,9 +183,17 @@ void _psmi_heapdebug_val_heapallocs(const char *curloc);
 #define psmi_heapdebug_val_heapallocs() _psmi_heapdebug_val_heapallocs(PSMI_CURLOC)
+/* Finalize the heapdebug functionality after tear down of the psm
+   session when you are certain that all heap allocations have been
+   freed. psmi_heapdebug_finalize() will emit all of the extant
+   heap allocations and abort if there are any. This is to aid
+   in debug of heap leaks.
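+   A minimal usage sketch (the call site here is an assumption for
+   illustration, not something this patch adds): a debug build could do
+       psm2_finalize();
+       psmi_heapdebug_finalize(); /* prints any leaked blocks, aborts if found */
+   once every endpoint is closed and the library has been torn down.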
*/ +void psmi_heapdebug_finalize(void); + #else #define psmi_heapdebug_val_heapallocs() /* nothing */ +#define psmi_heapdebug_finalize() /* nothing */ #endif @@ -320,7 +329,7 @@ uint32_t psmi_crc(unsigned char *buf, int len); /* * Global model so we can tune defaults better for specific cpu's */ -uint32_t psmi_cpu_model; +extern uint32_t psmi_cpu_model; /* * Diagnostics, all in psm_diags.c @@ -333,6 +342,7 @@ int psmi_diags(void); extern int psmi_multi_ep_enabled; void psmi_multi_ep_init(); +#ifdef PSM_FI /* * Fault injection */ @@ -354,6 +364,7 @@ struct psmi_faultinj_spec *psmi_faultinj_getspec(char *spec_name, (var) = psmi_faultinj_getspec((spec_name), (num), (denom)); int psmi_faultinj_is_fault(struct psmi_faultinj_spec *spec); +#endif /* #ifdef PSM_FI */ /* * PSM core component set/get options */ diff --git a/ptl_am/am_cuda_memhandle_cache.c b/ptl_am/am_cuda_memhandle_cache.c index 8406a37..730562d 100644 --- a/ptl_am/am_cuda_memhandle_cache.c +++ b/ptl_am/am_cuda_memhandle_cache.c @@ -55,25 +55,139 @@ #include "psm_user.h" #include "am_cuda_memhandle_cache.h" -#define RBTREE_GET_LEFTMOST(PAYLOAD_PTR) ((PAYLOAD_PTR)->start) -#define RBTREE_GET_RIGHTMOST(PAYLOAD_PTR) ((PAYLOAD_PTR)->start+((PAYLOAD_PTR)->length)) + +/* + * rbtree cruft + */ +struct _cl_map_item; + +typedef struct +{ + unsigned long start; /* start virtual address */ + CUipcMemHandle cuda_ipc_handle; /* cuda ipc mem handle */ + CUdeviceptr cuda_ipc_dev_ptr;/* Cuda device pointer */ + uint16_t length; /* length*/ + psm2_epid_t epid; + struct _cl_map_item* i_prev; /* idle queue previous */ + struct _cl_map_item* i_next; /* idle queue next */ +}__attribute__ ((aligned (128))) rbtree_cuda_memhandle_cache_mapitem_pl_t; + +typedef struct { + uint32_t nelems; /* number of elements in the cache */ +} rbtree_cuda_memhandle_cache_map_pl_t; + +static psm2_error_t am_cuda_memhandle_mpool_init(uint32_t memcache_size); + +/* + * Custom comparator + */ +typedef rbtree_cuda_memhandle_cache_mapitem_pl_t cuda_cache_item; + +static int cuda_cache_key_cmp(const cuda_cache_item *a, const cuda_cache_item *b) +{ + // When multi-ep is disabled, cache can assume + // 1 epid == 1 remote process == 1 CUDA address space + // But when multi-ep is enabled, one process can have many epids, so in this case + // cannot use epid as part of cache key. 
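+	// For illustration (addresses assumed): with cached ranges
+	// [0x1000,0x1100) and [0x2000,0x2010) and matching epids, a key
+	// {start=0x10f0, length=0x20} overlaps the first range, so this
+	// comparator reports it as equal (returns 0) and reports the key
+	// as less than the second range (returns -1). Overlap, not exact
+	// identity, is what counts as a match here; the exact fields are
+	// checked afterwards by am_cuda_memhandle_cache_validate().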
+	if (!psmi_multi_ep_enabled) {
+		if (a->epid < b->epid)
+			return -1;
+		if (a->epid > b->epid)
+			return 1;
+	}
+
+	unsigned long a_end, b_end;
+	// normalize into inclusive upper bounds to handle
+	// 0-length entries
+	a_end = (a->start + a->length);
+	b_end = (b->start + b->length);
+	if (a->length > 0)
+		a_end--;
+
+	if (b->length > 0)
+		b_end--;
+
+	if (a_end < b->start)
+		return -1;
+	if (b_end < a->start)
+		return 1;
+
+	return 0;
+}
+
+
+/*
+ * Necessary rbtree cruft
+ */
+#define RBTREE_MI_PL rbtree_cuda_memhandle_cache_mapitem_pl_t
+#define RBTREE_MAP_PL rbtree_cuda_memhandle_cache_map_pl_t
+#define RBTREE_CMP(a,b) cuda_cache_key_cmp((a), (b))
 #define RBTREE_ASSERT psmi_assert
 #define RBTREE_MAP_COUNT(PAYLOAD_PTR) ((PAYLOAD_PTR)->nelems)
+#define RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR
+#include "rbtree.h"
 #include "rbtree.c"
-#ifdef PSM_DEBUG
-static int cache_hit_counter;
-static int cache_miss_counter;
-#endif
+/*
+ * Convenience rbtree cruft
+ */
+#define NELEMS cuda_memhandle_cachemap.payload.nelems
+
+#define IHEAD cuda_memhandle_cachemap.root
+#define LAST IHEAD->payload.i_prev
+#define FIRST IHEAD->payload.i_next
+#define INEXT(x) x->payload.i_next
+#define IPREV(x) x->payload.i_prev
+
+/*
+ * Actual module data
+ */
+static cl_qmap_t cuda_memhandle_cachemap; /* Global cache */
+static uint8_t cuda_memhandle_cache_enabled;
+static mpool_t cuda_memhandle_mpool;
+static uint32_t cuda_memhandle_cache_size;
+
+static uint64_t cache_hit_counter;
+static uint64_t cache_miss_counter;
+static uint64_t cache_evict_counter;
+static uint64_t cache_collide_counter;
+static uint64_t cache_clear_counter;
+
+static void print_cuda_memhandle_cache_stats(void)
+{
+	_HFI_DBG("enabled=%u,size=%u,hit=%lu,miss=%lu,evict=%lu,collide=%lu,clear=%lu\n",
+		cuda_memhandle_cache_enabled, cuda_memhandle_cache_size,
+		cache_hit_counter, cache_miss_counter,
+		cache_evict_counter, cache_collide_counter, cache_clear_counter);
+}
+
+/*
+ * This is the callback function invoked when the mempool is resized or
+ * destroyed. Upon cache fini the mpool is destroyed, which in turn calls
+ * this callback, which closes all remaining memhandles.
+ */
+static void
+psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj)
+{
+	cl_map_item_t* memcache_item = (cl_map_item_t*)obj;
+	if (!is_alloc) {
+		if(memcache_item->payload.start)
+			PSMI_CUDA_CALL(cuIpcCloseMemHandle,
+				       memcache_item->payload.cuda_ipc_dev_ptr);
+	}
+}
 /*
 * Creating mempool for cuda memhandle cache nodes.
 */
-psm2_error_t
+static psm2_error_t
 am_cuda_memhandle_mpool_init(uint32_t memcache_size)
 {
 	psm2_error_t err;
+	if (memcache_size < 1)
+		return PSM2_PARAM_ERR;
+
 	cuda_memhandle_cache_size = memcache_size;
 	/* Creating a memory pool of size PSM2_CUDA_MEMCACHE_SIZE
 	 * which includes the Root and NIL items
 	 */
@@ -95,38 +209,58 @@ am_cuda_memhandle_mpool_init(uint32_t memcache_size)
 /*
 * Initialize rbtree.
*/ -psm2_error_t am_cuda_memhandle_cache_map_init() +psm2_error_t am_cuda_memhandle_cache_init(uint32_t memcache_size) { + psm2_error_t err = am_cuda_memhandle_mpool_init(memcache_size); + if (err != PSM2_OK) + return err; + cl_map_item_t *root, *nil_item; root = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); if (root == NULL) return PSM2_NO_MEMORY; nil_item = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); - if (nil_item == NULL) + if (nil_item == NULL) { + psmi_free(root); return PSM2_NO_MEMORY; + } + nil_item->payload.start = 0; nil_item->payload.epid = 0; nil_item->payload.length = 0; cuda_memhandle_cache_enabled = 1; ips_cl_qmap_init(&cuda_memhandle_cachemap,root,nil_item); NELEMS = 0; + + cache_hit_counter = 0; + cache_miss_counter = 0; + cache_evict_counter = 0; + cache_collide_counter = 0; + cache_clear_counter = 0; + return PSM2_OK; } void am_cuda_memhandle_cache_map_fini() { -#ifdef PSM_DEBUG - _HFI_DBG("cache hit counter: %d\n", cache_hit_counter); - _HFI_DBG("cache miss counter: %d\n", cache_miss_counter); -#endif + print_cuda_memhandle_cache_stats(); - if (cuda_memhandle_cachemap.nil_item) + if (cuda_memhandle_cachemap.nil_item) { psmi_free(cuda_memhandle_cachemap.nil_item); - if (cuda_memhandle_cachemap.root) + cuda_memhandle_cachemap.nil_item = NULL; + } + + if (cuda_memhandle_cachemap.root) { psmi_free(cuda_memhandle_cachemap.root); - if (cuda_memhandle_cache_enabled) + cuda_memhandle_cachemap.root = NULL; + } + + if (cuda_memhandle_cache_enabled) { psmi_mpool_destroy(cuda_memhandle_mpool); - return; + cuda_memhandle_cache_enabled = 0; + } + + cuda_memhandle_cache_size = 0; } /* @@ -143,6 +277,7 @@ am_cuda_idleq_insert(cl_map_item_t* memcache_item) INEXT(FIRST) = memcache_item; IPREV(memcache_item) = FIRST; FIRST = memcache_item; + INEXT(FIRST) = NULL; return; } @@ -155,11 +290,13 @@ am_cuda_idleq_remove_last(cl_map_item_t* memcache_item) if (!INEXT(memcache_item)) { LAST = NULL; FIRST = NULL; - return; + } else { + LAST = INEXT(memcache_item); + IPREV(LAST) = NULL; } - LAST = INEXT(memcache_item); - IPREV(LAST) = NULL; - return; + // Null-out now-removed memcache_item's next and prev pointers out of + // an abundance of caution + INEXT(memcache_item) = IPREV(memcache_item) = NULL; } static void @@ -167,15 +304,16 @@ am_cuda_idleq_remove(cl_map_item_t* memcache_item) { if (LAST == memcache_item) { am_cuda_idleq_remove_last(memcache_item); - return; - } - if (INEXT(memcache_item) == NULL) { - INEXT(IPREV(memcache_item)) = NULL; - return; + } else if (FIRST == memcache_item) { + FIRST = IPREV(memcache_item); + INEXT(FIRST) = NULL; + } else { + INEXT(IPREV(memcache_item)) = INEXT(memcache_item); + IPREV(INEXT(memcache_item)) = IPREV(memcache_item); } - INEXT(IPREV(memcache_item)) = INEXT(memcache_item); - IPREV(INEXT(memcache_item)) = IPREV(memcache_item); - return; + // Null-out now-removed memcache_item's next and prev pointers out of + // an abundance of caution + INEXT(memcache_item) = IPREV(memcache_item) = NULL; } static void @@ -207,10 +345,14 @@ am_cuda_memhandle_cache_validate(cl_map_item_t* memcache_item, && epid == memcache_item->payload.epid) { return PSM2_OK; } + _HFI_DBG("cache collision: new entry start=%lu,length=%u\n", sbuf, length); + + cache_collide_counter++; ips_cl_qmap_remove_item(&cuda_memhandle_cachemap, memcache_item); PSMI_CUDA_CALL(cuIpcCloseMemHandle, memcache_item->payload.cuda_ipc_dev_ptr); am_cuda_idleq_remove(memcache_item); + memset(memcache_item, 0, sizeof(*memcache_item)); 
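+	/* Zeroing the recycled item also clears payload.start, so the
+	 * mpool cleanup callback above will not attempt to close this
+	 * already-closed handle again when the pool is destroyed. */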
 	psmi_mpool_put(memcache_item);
 	return PSM2_OK_NO_PROGRESS;
 }
@@ -219,14 +361,18 @@
 * Current eviction policy: Least Recently Used.
 */
 static void
-am_cuda_memhandle_cache_evict()
+am_cuda_memhandle_cache_evict(void)
 {
+	cache_evict_counter++;
 	cl_map_item_t *p_item = LAST;
+	_HFI_VDBG("Removing (epid=%lu,start=%lu,length=%u,dev_ptr=0x%llX,it=%p) from cuda_memhandle_cachemap.\n",
+			p_item->payload.epid, p_item->payload.start, p_item->payload.length,
+			p_item->payload.cuda_ipc_dev_ptr, p_item);
 	ips_cl_qmap_remove_item(&cuda_memhandle_cachemap, p_item);
 	PSMI_CUDA_CALL(cuIpcCloseMemHandle, p_item->payload.cuda_ipc_dev_ptr);
 	am_cuda_idleq_remove_last(p_item);
+	memset(p_item, 0, sizeof(*p_item));
 	psmi_mpool_put(p_item);
-	return;
 }
 static psm2_error_t
@@ -236,6 +382,7 @@ am_cuda_memhandle_cache_register(uintptr_t sbuf, CUipcMemHandle* handle,
 {
 	if (NELEMS == cuda_memhandle_cache_size)
 		am_cuda_memhandle_cache_evict();
+
 	cl_map_item_t* memcache_item = psmi_mpool_get(cuda_memhandle_mpool);
 	/* memcache_item cannot be NULL as we evict
 	 * before the call to mpool_get. Check has
@@ -253,6 +400,15 @@
 	return PSM2_OK;
 }
+static void am_cuda_memhandle_cache_clear(void)
+{
+	_HFI_DBG("Closing all handles, clearing cuda_memhandle_cachemap and idleq. NELEMS=%u\n", NELEMS);
+	while (NELEMS) {
+		am_cuda_memhandle_cache_evict();
+	}
+	_HFI_DBG("Closed all handles, cleared cuda_memhandle_cachemap and idleq. NELEMS=%u\n", NELEMS);
+}
+
 /*
 * The key used to search the cache is the senders buf address pointer.
 * Upon a succesful hit in the cache, additional validation is required
@@ -262,36 +418,67 @@ CUdeviceptr
 am_cuda_memhandle_acquire(uintptr_t sbuf, CUipcMemHandle* handle,
				uint32_t length, psm2_epid_t epid)
 {
+	_HFI_VDBG("sbuf=%lu,handle=%p,length=%u,epid=%lu\n",
+			sbuf, handle, length, epid);
+
 	CUdeviceptr cuda_ipc_dev_ptr;
-	if(cuda_memhandle_cache_enabled) {
-		cl_qmap_t *p_map = &cuda_memhandle_cachemap;
-		cl_map_item_t *p_item;
-		unsigned long start = (unsigned long)sbuf;
-		unsigned long end = start + length;
-		p_item = ips_cl_qmap_search(p_map, start, end);
-		if (p_item->payload.start) {
-			if (am_cuda_memhandle_cache_validate(p_item, sbuf,
-						handle, length, epid) == PSM2_OK) {
-#ifdef PSM_DEBUG
-				cache_hit_counter++;
-#endif
-				am_cuda_idleq_reorder(p_item);
-				return p_item->payload.cuda_ipc_dev_ptr;
-			}
-		}
-#ifdef PSM_DEBUG
-		cache_miss_counter++;
-#endif
-		PSMI_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr,
-				*handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
-		am_cuda_memhandle_cache_register(sbuf, handle,
-				length, epid, cuda_ipc_dev_ptr);
-		return cuda_ipc_dev_ptr;
-	} else {
+	if(!cuda_memhandle_cache_enabled) {
 		PSMI_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr,
				*handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
 		return cuda_ipc_dev_ptr;
 	}
+
+	cuda_cache_item key = {
+		.start = (unsigned long) sbuf,
+		.length = length,
+		.epid = epid
+	};
+
+	/*
+	 * preconditions:
+	 *  1) newrange [start,end) may or may not be in cachemap already
+	 *  2) there are no overlapping address ranges in cachemap
+	 * postconditions:
+	 *  1) newrange is in cachemap
+	 *  2) there are no overlapping address ranges in cachemap
+	 *
+	 * The key used to search the cache is the sender's buf address pointer.
+	 * Upon a successful hit in the cache, additional validation is required
+	 * as multiple senders could potentially send the same buf address value.
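+	 * The search/validate loop below restores postcondition (2): every
+	 * stale cached range that overlaps the new one is closed and removed
+	 * before the new range is registered.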
+ */ + cl_map_item_t *p_item = ips_cl_qmap_searchv(&cuda_memhandle_cachemap, &key); + while (p_item->payload.start) { + // Since a precondition is that there are no overlapping ranges in cachemap, + // an exact match implies no need to check further + if (am_cuda_memhandle_cache_validate(p_item, sbuf, handle, length, epid) == PSM2_OK) { + cache_hit_counter++; + am_cuda_idleq_reorder(p_item); + return p_item->payload.cuda_ipc_dev_ptr; + } + + // newrange is not in the cache and overlaps at least one existing range. + // am_cuda_memhandle_cache_validate() closed and removed existing range. + // Continue searching for more overlapping ranges + p_item = ips_cl_qmap_searchv(&cuda_memhandle_cachemap, &key); + } + cache_miss_counter++; + + CUresult cudaerr; + PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_ALREADY_MAPPED, cuIpcOpenMemHandle, + &cuda_ipc_dev_ptr, *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); + + if (cudaerr == CUDA_ERROR_ALREADY_MAPPED) { + // remote memory already mapped. Close all handles, clear cache, + // and try again + am_cuda_memhandle_cache_clear(); + cache_clear_counter++; + PSMI_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr, *handle, + CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); + } + + am_cuda_memhandle_cache_register(sbuf, handle, + length, epid, cuda_ipc_dev_ptr); + return cuda_ipc_dev_ptr; } void @@ -302,20 +489,4 @@ am_cuda_memhandle_release(CUdeviceptr cuda_ipc_dev_ptr) return; } -/* - * This is the callback function when mempool are resized or destroyed. - * Upon calling cache fini mpool is detroyed which in turn calls this callback - * which helps in closing all memhandles. - */ -void -psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj) -{ - cl_map_item_t* memcache_item = (cl_map_item_t*)obj; - if (!is_alloc) { - if(memcache_item->payload.start) - PSMI_CUDA_CALL(cuIpcCloseMemHandle, - memcache_item->payload.cuda_ipc_dev_ptr); - } -} - #endif diff --git a/ptl_am/am_cuda_memhandle_cache.h b/ptl_am/am_cuda_memhandle_cache.h index 494de32..2b1dbc0 100644 --- a/ptl_am/am_cuda_memhandle_cache.h +++ b/ptl_am/am_cuda_memhandle_cache.h @@ -56,58 +56,16 @@ #ifndef _AM_CUDA_MEMHANDLE_CACHE_H #define _AM_CUDA_MEMHANDLE_CACHE_H -#include -#include +#include "psm_user.h" #include -#include - -struct _cl_map_item; - -typedef struct -{ - unsigned long start; /* start virtual address */ - CUipcMemHandle cuda_ipc_handle; /* cuda ipc mem handle */ - CUdeviceptr cuda_ipc_dev_ptr;/* Cuda device pointer */ - uint16_t length; /* length*/ - psm2_epid_t epid; - struct _cl_map_item* i_prev; /* idle queue previous */ - struct _cl_map_item* i_next; /* idle queue next */ -}__attribute__ ((aligned (128))) rbtree_cuda_memhandle_cache_mapitem_pl_t; - -typedef struct { - uint32_t nelems; /* number of elements in the cache */ -} rbtree_cuda_memhandle_cache_map_pl_t; - -#define RBTREE_MI_PL rbtree_cuda_memhandle_cache_mapitem_pl_t -#define RBTREE_MAP_PL rbtree_cuda_memhandle_cache_map_pl_t - -#include "rbtree.h" - -cl_qmap_t cuda_memhandle_cachemap; /* Global cache */ -uint8_t cuda_memhandle_cache_enabled; -mpool_t cuda_memhandle_mpool; -uint32_t cuda_memhandle_cache_size; -#define CUDA_MEMHANDLE_CACHE_SIZE 64 - -/* - * Macro definition for easy programming. - */ - -#define NELEMS cuda_memhandle_cachemap.payload.nelems - -/* - * Macro for idle queue management. 
- */ -#define IHEAD cuda_memhandle_cachemap.root -#define LAST IHEAD->payload.i_prev -#define FIRST IHEAD->payload.i_next -#define INEXT(x) x->payload.i_next -#define IPREV(x) x->payload.i_prev +#ifdef __cplusplus +extern "C" { +#endif -psm2_error_t am_cuda_memhandle_mpool_init(uint32_t memcache_size); +#define CUDA_MEMHANDLE_CACHE_SIZE 64 -psm2_error_t am_cuda_memhandle_cache_map_init(); +psm2_error_t am_cuda_memhandle_cache_init(uint32_t memcache_size); CUdeviceptr am_cuda_memhandle_acquire(uintptr_t sbuf, CUipcMemHandle* handle, @@ -115,10 +73,12 @@ am_cuda_memhandle_acquire(uintptr_t sbuf, CUipcMemHandle* handle, void am_cuda_memhandle_release(CUdeviceptr cuda_ipc_dev_ptr); -void psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj); - void am_cuda_memhandle_cache_map_fini(); +#ifdef __cplusplus +} /* extern "C" */ #endif -#endif +#endif /* _AM_CUDA_MEMHANDLE_CACHE_H */ + +#endif /* PSM_CUDA */ diff --git a/ptl_am/am_reqrep_shmem.c b/ptl_am/am_reqrep_shmem.c index 95973c9..9be72f9 100644 --- a/ptl_am/am_reqrep_shmem.c +++ b/ptl_am/am_reqrep_shmem.c @@ -144,23 +144,38 @@ static inline uintptr_t am_ctl_sizeof_block() #undef _PA +static uint32_t create_extra_ep_data() +{ + uint32_t ret = getpid(); + +#ifdef PSM_CUDA + /* PID is at maximum 22 bits */ + ret |= my_gpu_device << 22; +#endif + + return ret; +} + +static void read_extra_ep_data(uint32_t data, uint32_t *pid, uint32_t *gpu) +{ + uint32_t pid_mask = (1 << 22) - 1; + + *pid = data & pid_mask; + *gpu = (data & ~pid_mask) >> 22; +} + static void am_update_directory(struct am_ctl_nodeinfo *); static void amsh_atexit() { - static pthread_mutex_t mutex_once = PTHREAD_MUTEX_INITIALIZER; - static int atexit_once; + static ips_atomic_t atexit_once = { 0 }; psm2_ep_t ep; struct ptl_am *ptl; - pthread_mutex_lock(&mutex_once); - if (atexit_once) { - pthread_mutex_unlock(&mutex_once); + /* bail out if previous value is non-zero */ + if (ips_atomic_cmpxchg(&atexit_once, 0, 1) != 0) return; - } else - atexit_once = 1; - pthread_mutex_unlock(&mutex_once); ep = psmi_opened_endpoint; while (ep) { @@ -240,7 +255,7 @@ psm2_error_t psmi_shm_create(ptl_t *ptl_gen) size_t segsz; psm2_error_t err = PSM2_OK; int shmfd = -1; - char *amsh_keyname; + char *amsh_keyname = NULL; int iterator; /* Get which kassist mode to use. 
*/ ptl->psmi_kassist_mode = psmi_get_kassist_mode(); @@ -269,6 +284,8 @@ psm2_error_t psmi_shm_create(ptl_t *ptl_gen) shmfd = shm_open(amsh_keyname, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); if (shmfd < 0) { + psmi_free(amsh_keyname); + amsh_keyname = NULL; if (errno == EACCES && iterator < INT_MAX) continue; else { @@ -301,6 +318,7 @@ psm2_error_t psmi_shm_create(ptl_t *ptl_gen) } } if (err) { + if (amsh_keyname) psmi_free(amsh_keyname); err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, "Error creating shared memory object " @@ -328,6 +346,7 @@ psm2_error_t psmi_shm_create(ptl_t *ptl_gen) err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, "Error mmapping shared memory: %s", strerror(errno)); + psmi_free(amsh_keyname); goto fail; } @@ -454,6 +473,7 @@ psm2_error_t psmi_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm "shared memory object " "with fstat: %s", strerror(errno)); + close(dest_shmfd); goto fail; } if (getuid() == st.st_uid) { @@ -480,6 +500,7 @@ psm2_error_t psmi_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, "Error mmapping remote shared memory: %s", strerror(errno)); + close(dest_shmfd); goto fail; } close(dest_shmfd); @@ -560,7 +581,8 @@ psm2_error_t psmi_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm if (shmidx == (uint16_t)-1) err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, - "Could not connect to local endpoint"); fail: + "Could not connect to local endpoint"); +fail: return err; } @@ -593,9 +615,10 @@ static psm2_error_t amsh_init_segment(ptl_t *ptl_gen) /* We core dump right after here if we don't check the mmap */ - struct sigaction act; - act.sa_sigaction = amsh_mmap_fault; - act.sa_flags = SA_SIGINFO; + struct sigaction act = { + .sa_sigaction = amsh_mmap_fault, + .sa_flags = SA_SIGINFO + }; sigaction(SIGSEGV, &act, &action_stash.SIGSEGV_old_act); sigaction(SIGBUS, &act, &action_stash.SIGBUS_old_act); @@ -1014,7 +1037,7 @@ amsh_ep_connreq_poll(ptl_t *ptl_gen, struct ptl_connection_req *req) req->args[0].u32w1 = ptl->connect_phase; req->args[1].u64w0 = (uint64_t) ptl->epid; psmi_assert(shmidx != (uint16_t)-1); - req->args[2].u32w0 = getpid(); + req->args[2].u32w0 = create_extra_ep_data(); req->args[2].u32w1 = PSM2_OK; req->args[3].u64w0 = (uint64_t) (uintptr_t) &req->errors[i]; @@ -1154,7 +1177,7 @@ amsh_ep_connreq_poll(ptl_t *ptl_gen, struct ptl_connection_req *req) req->args[0].u16w1 = shmidx; req->args[0].u32w1 = ptl->connect_phase; req->args[1].u64w0 = (uint64_t) ptl->epid; - req->args[2].u32w0 = getpid(); + req->args[2].u32w0 = create_extra_ep_data(); req->args[2].u32w1 = PSM2_OK; req->args[3].u64w0 = (uint64_t) (uintptr_t) &req->errors[i]; @@ -1958,171 +1981,94 @@ amsh_mq_rndv(ptl_t *ptl, psm2_mq_t mq, psm2_mq_req_t req, req->send_msgoff = 0; #ifdef PSM_CUDA - /* If the send buffer is on gpu, we create a cuda IPC - * handle and send it as payload in the RTS - */ - if (req->is_buf_gpu_mem) { - CUdeviceptr buf_base_ptr; - PSMI_CUDA_CALL(cuMemGetAddressRange, &buf_base_ptr, NULL, (CUdeviceptr)buf); - - /* Offset in GPU buffer from which we copy data, we have to - * send it separetly because this offset is lost - * when cuIpcGetMemHandle is called */ - req->cuda_ipc_offset = buf - (void*)buf_base_ptr; - args[2].u32w0 = (uint32_t)req->cuda_ipc_offset; - - PSMI_CUDA_CALL(cuIpcGetMemHandle, - &req->cuda_ipc_handle, - (CUdeviceptr) buf); - if (req->flags_internal & PSMI_REQ_FLAG_FASTPATH) { - psmi_am_reqq_add(AMREQUEST_SHORT, ptl, - epaddr, mq_handler_hidx, - 
args, 5, (void*)&req->cuda_ipc_handle, - sizeof(CUipcMemHandle), NULL, 0); - } else { - psmi_amsh_short_request(ptl, epaddr, mq_handler_hidx, - args, 5, (void*)&req->cuda_ipc_handle, - sizeof(CUipcMemHandle), 0); - } - req->cuda_ipc_handle_attached = 1; - } else -#endif + /* If the send buffer is on gpu, we create a cuda IPC + * handle and send it as payload in the RTS */ + if (req->is_buf_gpu_mem) { + CUdeviceptr buf_base_ptr; + PSMI_CUDA_CALL(cuMemGetAddressRange, &buf_base_ptr, NULL, (CUdeviceptr)buf); + + /* Offset in GPU buffer from which we copy data, we have to + * send it separetly because this offset is lost + * when cuIpcGetMemHandle is called */ + req->cuda_ipc_offset = buf - (void*)buf_base_ptr; + args[2].u32w0 = (uint32_t)req->cuda_ipc_offset; + + PSMI_CUDA_CALL(cuIpcGetMemHandle, + &req->cuda_ipc_handle, + (CUdeviceptr) buf); if (req->flags_internal & PSMI_REQ_FLAG_FASTPATH) { psmi_am_reqq_add(AMREQUEST_SHORT, ptl, - epaddr, mq_handler_hidx, - args, 5, NULL, 0, NULL, 0); + epaddr, mq_handler_hidx, + args, 5, (void*)&req->cuda_ipc_handle, + sizeof(CUipcMemHandle), NULL, 0); } else { psmi_amsh_short_request(ptl, epaddr, mq_handler_hidx, - args, 5, NULL, 0, 0); + args, 5, (void*)&req->cuda_ipc_handle, + sizeof(CUipcMemHandle), 0); } + req->cuda_ipc_handle_attached = 1; + } else +#endif + if (req->flags_internal & PSMI_REQ_FLAG_FASTPATH) { + psmi_am_reqq_add(AMREQUEST_SHORT, ptl, epaddr, mq_handler_hidx, + args, 5, NULL, 0, NULL, 0); + } else { + psmi_amsh_short_request(ptl, epaddr, mq_handler_hidx, + args, 5, NULL, 0, 0); + } + + mq->stats.tx_num++; + mq->stats.tx_shm_num++; + mq->stats.tx_rndv_num++; + mq->stats.tx_rndv_bytes += len; return err; } -/* - * All shared am mq sends, req can be NULL - */ PSMI_ALWAYS_INLINE( psm2_error_t -amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, - uint32_t flags_user, uint32_t flags_internal, psm2_mq_tag_t *tag, - const void *ubuf, uint32_t len)) +amsh_mq_send_inner_eager(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, + psm2_amarg_t *args, uint32_t flags_user, uint32_t flags_internal, + psm2_mq_tag_t *tag, const void *ubuf, uint32_t len)) { - psm2_amarg_t args[3]; - psm2_error_t err = PSM2_OK; - int is_blocking = (req == NULL); + uint32_t bytes_left = len; + uint32_t bytes_this = 0; -#ifdef PSM_CUDA - int gpu_mem; - /* All sends from a gpu buffer use the rendezvous protocol */ - if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)ubuf)) { - if (!PSMI_IS_CUDA_ENABLED) - psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - " Please enable PSM CUDA support when using GPU buffer \n"); - gpu_mem = 1; - goto do_rendezvous; - } else - gpu_mem = 0; -#endif + psm2_handler_t handler = mq_handler_hidx; + + args[1].u32w1 = tag->tag[0]; + args[1].u32w0 = tag->tag[1]; + args[2].u32w1 = tag->tag[2]; + args[2].u32w0 = 0; if (!flags_user && len <= AMLONG_MTU) { if (len <= 32) args[0].u32w0 = MQ_MSG_TINY; else args[0].u32w0 = MQ_MSG_SHORT; - args[1].u32w1 = tag->tag[0]; - args[1].u32w0 = tag->tag[1]; - args[2].u32w1 = tag->tag[2]; - - if (flags_internal & PSMI_REQ_FLAG_FASTPATH) { - psmi_am_reqq_add(AMREQUEST_SHORT, epaddr->ptlctl->ptl, - epaddr, mq_handler_hidx, - args, 3, (void *)ubuf, len, NULL, 0); - } else { - psmi_amsh_short_request(epaddr->ptlctl->ptl, epaddr, - mq_handler_hidx, args, 3, ubuf, len, 0); - } - } else if (flags_user & PSM2_MQ_FLAG_SENDSYNC) - goto do_rendezvous; - else if (len <= mq->shm_thresh_rv) { - uint32_t bytes_left = len; - uint32_t bytes_this = min(bytes_left, AMLONG_MTU); - uint8_t *buf = 
(uint8_t *) ubuf; + } else { args[0].u32w0 = MQ_MSG_EAGER; args[0].u32w1 = len; - args[1].u32w1 = tag->tag[0]; - args[1].u32w0 = tag->tag[1]; - args[2].u32w1 = tag->tag[2]; + } + + do { + args[2].u32w0 += bytes_this; + bytes_this = min(bytes_left, AMLONG_MTU); + + /* Assume that shared-memory active messages are delivered in order */ if (flags_internal & PSMI_REQ_FLAG_FASTPATH) { psmi_am_reqq_add(AMREQUEST_SHORT, epaddr->ptlctl->ptl, - epaddr, mq_handler_hidx, - args, 3, buf, bytes_this, NULL, 0); + epaddr, handler, args, 3, (void *)ubuf, + bytes_this, NULL, 0); } else { psmi_amsh_short_request(epaddr->ptlctl->ptl, epaddr, - mq_handler_hidx, args, 3, buf, - bytes_this, 0); + handler, args, 3, ubuf, bytes_this, 0); } - bytes_left -= bytes_this; - buf += bytes_this; - args[2].u32w0 = 0; - while (bytes_left) { - args[2].u32w0 += bytes_this; - bytes_this = min(bytes_left, AMLONG_MTU); - /* Here we kind of bend the rules, and assume that shared-memory - * active messages are delivered in order */ - if (flags_internal & PSMI_REQ_FLAG_FASTPATH) { - psmi_am_reqq_add(AMREQUEST_SHORT, epaddr->ptlctl->ptl, - epaddr, mq_handler_data_hidx, - args, 3, buf, bytes_this, NULL, 0); - } else { - psmi_amsh_short_request(epaddr->ptlctl->ptl, epaddr, - mq_handler_data_hidx, args, - 3, buf, bytes_this, 0); - } - buf += bytes_this; - bytes_left -= bytes_this; - } - } else { -do_rendezvous: - if (is_blocking) { - req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND); - if_pf(req == NULL) - return PSM2_NO_MEMORY; - req->req_data.send_msglen = len; - req->req_data.tag = *tag; - - /* Since SEND command is blocking, this request is - * entirely internal and we will not be exposed to user. - * Setting as internal so it will not be added to - * mq->completed_q */ - req->flags_internal |= (flags_internal | PSMI_REQ_FLAG_IS_INTERNAL); - } -#ifdef PSM_CUDA - /* CUDA documentation dictates the use of SYNC_MEMOPS attribute - * when the buffer pointer received into PSM has been allocated - * by the application. This guarantees the all memory operations - * to this region of memory (used by multiple layers of the stack) - * always synchronize - */ - if (gpu_mem) { - int trueflag = 1; - PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag, - CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, - (CUdeviceptr)ubuf); - req->is_buf_gpu_mem = 1; - } else - req->is_buf_gpu_mem = 0; -#endif - - err = - amsh_mq_rndv(epaddr->ptlctl->ptl, mq, req, epaddr, tag, - ubuf, len); - if (err == PSM2_OK && is_blocking) { /* wait... 
*/ - err = psmi_mq_wait_internal(&req); - } - return err; /* skip eager accounting below */ - } + ubuf += bytes_this; + bytes_left -= bytes_this; + handler = mq_handler_data_hidx; + } while(bytes_left); /* All eager async sends are always "all done" */ if (req != NULL) { @@ -2135,6 +2081,98 @@ do_rendezvous: mq->stats.tx_eager_num++; mq->stats.tx_eager_bytes += len; + return PSM2_OK; +} + +/* + * All shared am mq sends, req can be NULL + */ +PSMI_ALWAYS_INLINE( +psm2_error_t +amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, + uint32_t flags_user, uint32_t flags_internal, psm2_mq_tag_t *tag, + const void *ubuf, uint32_t len)) +{ + psm2_amarg_t args[3]; + psm2_error_t err = PSM2_OK; + int is_blocking = (req == NULL); + +#ifdef PSM_CUDA + int gpu_mem = 0; + int ep_supports_p2p = (1 << ((am_epaddr_t *) epaddr)->gpuid) & gpu_p2p_supported; + + if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(ubuf)) { + gpu_mem = 1; + + /* All sends from a gpu buffer use the rendezvous protocol if p2p is supported */ + if (ep_supports_p2p) { + goto do_rendezvous; + } + + /* + * Use eager messages if P2P is unsupported between endpoints. + * Potentially use rendezvous with blocking requests only. + */ + if (!is_blocking) + goto do_eager; + } +#endif + if (flags_user & PSM2_MQ_FLAG_SENDSYNC) + goto do_rendezvous; + + if (len <= mq->shm_thresh_rv) +#ifdef PSM_CUDA +do_eager: +#endif + return amsh_mq_send_inner_eager(mq, req, epaddr, args, flags_user, + flags_internal, tag, ubuf, len); +do_rendezvous: + if (is_blocking) { + req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND); + if_pf(req == NULL) + return PSM2_NO_MEMORY; + req->req_data.send_msglen = len; + req->req_data.tag = *tag; + + /* Since SEND command is blocking, this request is + * entirely internal and we will not be exposed to user. + * Setting as internal so it will not be added to + * mq->completed_q */ + req->flags_internal |= (flags_internal | PSMI_REQ_FLAG_IS_INTERNAL); + } +#ifdef PSM_CUDA + void *host_buf = NULL; + + req->is_buf_gpu_mem = gpu_mem; + if (req->is_buf_gpu_mem) { + psmi_cuda_set_attr_sync_memops(ubuf); + + /* Use host buffer for blocking requests if GPU P2P is + * unsupported between endpoints. + * This will be only used with blocking requests. */ + if (!ep_supports_p2p) { + host_buf = psmi_malloc(epaddr->ptlctl->ep, UNDEFINED, len); + PSMI_CUDA_CALL(cuMemcpyDtoH, host_buf, (CUdeviceptr)ubuf, len); + + /* Reset is_buf_gpu_mem since host buffer is being used + * instead of one from GPU. */ + ubuf = host_buf; + req->is_buf_gpu_mem = 0; + } + } +#endif + + err = amsh_mq_rndv(epaddr->ptlctl->ptl, mq, req, epaddr, tag, ubuf, len); + + if (err == PSM2_OK && is_blocking) { /* wait... */ + err = psmi_mq_wait_internal(&req); + } + +#ifdef PSM_CUDA + if (err == PSM2_OK && host_buf) + psmi_free(host_buf); +#endif + return err; } @@ -2205,11 +2243,9 @@ const char *psmi_kassist_getmode(int mode) static int psmi_get_kassist_mode() { - int mode = PSMI_KASSIST_MODE_DEFAULT; - /* Cuda PSM only supports KASSIST_CMA_GET */ -#ifdef PSM_CUDA - mode = PSMI_KASSIST_CMA_GET; -#else + /* Cuda PSM2 supports only KASSIST_CMA_GET */ + int mode = PSMI_KASSIST_CMA_GET; +#ifndef PSM_CUDA union psmi_envvar_val env_kassist; if (!psmi_getenv("PSM2_KASSIST_MODE", @@ -2225,11 +2261,6 @@ int psmi_get_kassist_mode() mode = PSMI_KASSIST_CMA_GET; else mode = PSMI_KASSIST_OFF; - } else { - /* cma-get is the fastest, so it's the default. - Availability of CMA is checked in psmi_shm_create(); - if CMA is not available it falls back to 'none' there. 
*/ - mode = PSMI_KASSIST_CMA_GET; } #endif return mode; @@ -2253,7 +2284,8 @@ amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, int16_t return_shmidx = args[0].u16w1; psm2_error_t err = (psm2_error_t) args[2].u32w1; psm2_error_t *perr = (psm2_error_t *) (uintptr_t) args[3].u64w0; - int pid = args[2].u32w0; + unsigned int pid; + unsigned int gpuid; int force_remap = 0; psm2_epaddr_t epaddr; @@ -2266,6 +2298,7 @@ amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, /* We do this because it's an assumption below */ psmi_assert_always(buf == NULL && len == 0); + read_extra_ep_data(args[2].u32w0, &pid, &gpuid); _HFI_VDBG("Conn op=%d, phase=%d, epid=%llx, err=%d\n", op, phase, (unsigned long long)epid, err); @@ -2279,6 +2312,7 @@ amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, /* If old pid is unknown consider new pid the correct one */ if (((am_epaddr_t *) epaddr)->pid == AMSH_PID_UNKNOWN) { ((am_epaddr_t *) epaddr)->pid = pid; + ((am_epaddr_t *) epaddr)->gpuid = gpuid; } else { psmi_epid_remove(ptl->ep, epid); epaddr = NULL; @@ -2312,6 +2346,7 @@ amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, args_segoff); ((am_epaddr_t *) epaddr)->pid = pid; + ((am_epaddr_t *) epaddr)->gpuid = gpuid; } /* Rewrite args */ @@ -2320,7 +2355,7 @@ amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, /* and return our shmidx for the connecting process */ args[0].u16w1 = shmidx; args[1].u64w0 = (psm2_epid_t) ptl->epid; - args[2].u32w0 = getpid(); + args[2].u32w0 = create_extra_ep_data(); args[2].u32w1 = PSM2_OK; ((am_epaddr_t *) epaddr)->cstate_incoming = AMSH_CSTATE_INCOMING_ESTABLISHED; @@ -2539,10 +2574,7 @@ amsh_init(psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val) CUDA_MEMHANDLE_CACHE_SIZE, &env_memcache_size); - if ((err = am_cuda_memhandle_mpool_init(env_memcache_size.e_uint) - != PSM2_OK)) - goto fail; - if ((err = am_cuda_memhandle_cache_map_init() != PSM2_OK)) + if ((err = am_cuda_memhandle_cache_init(env_memcache_size.e_uint) != PSM2_OK)) goto fail; } #endif @@ -2640,6 +2672,10 @@ static psm2_error_t amsh_fini(ptl_t *ptl_gen, int force, uint64_t timeout_ns) * deallocated to reference memory that disappeared */ ptl->repH.head = &ptl->amsh_empty_shortpkt; ptl->reqH.head = &ptl->amsh_empty_shortpkt; + + if (ptl->am_ep) + psmi_free(ptl->am_ep); + #ifdef PSM_CUDA if (PSMI_IS_CUDA_ENABLED) am_cuda_memhandle_cache_map_fini(); diff --git a/ptl_am/psm_am_internal.h b/ptl_am/psm_am_internal.h index 8e07a57..c4c08a5 100644 --- a/ptl_am/psm_am_internal.h +++ b/ptl_am/psm_am_internal.h @@ -72,9 +72,15 @@ struct am_epaddr { uint16_t shmidx; uint16_t return_shmidx; - uint32_t cstate_outgoing:4; - uint32_t cstate_incoming:4; + uint32_t cstate_outgoing:3; + uint32_t cstate_incoming:3; uint32_t pid:22; + /* + * Device number of GPU used by given EP, only used when CUDA is + * enabled. There is no gain from #ifdefing it out, since it does not + * use any extra space. 
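+	 * With cstate_outgoing:3, cstate_incoming:3, pid:22 and gpuid:4,
+	 * the bitfields total exactly 32 bits (3 + 3 + 22 + 4 = 32), so
+	 * the structure still occupies a single 32-bit storage unit.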
+	 */
+	uint32_t gpuid:4;
 } am_epaddr_t;
 /* Up to NSHORT_ARGS are supported via am_pkt_short_t; the remaining
diff --git a/ptl_am/ptl_fwd.h b/ptl_am/ptl_fwd.h
index e1bd064..1d0fec4 100644
--- a/ptl_am/ptl_fwd.h
+++ b/ptl_am/ptl_fwd.h
@@ -57,7 +57,7 @@
 #define _PTL_FWD_AMSH_H
 /* Symbol in am ptl */
-struct ptl_ctl_init psmi_ptl_amsh;
+extern struct ptl_ctl_init psmi_ptl_amsh;
 extern int psmi_shm_mq_rv_thresh;
diff --git a/ptl_ips/ips_config.h b/ptl_ips/ips_config.h
index c323194..329a69c 100644
--- a/ptl_ips/ips_config.h
+++ b/ptl_ips/ips_config.h
@@ -78,6 +78,8 @@
 */
 #define IPS_PROTOEXP_MIN_MTU 2048
+#ifdef PSM_FI
+
 /* Fault injection, becomes parameters to psmi_faultinj_getspec so
 * a comma-delimited list of
 *   "spec_name", num, denom
@@ -95,6 +97,7 @@
 #define IPS_FAULTINJ_PIOBUSY	10	/* 1 every 10 pio sends get busy */
 #define IPS_FAULTINJ_RECVLOST	200	/* 1 every 200 pkts dropped at recv */
+#endif /* #ifdef PSM_FI */
 /* TID */
diff --git a/ptl_ips/ips_epstate.h b/ptl_ips/ips_epstate.h
index 7308040..d7b7cb3 100644
--- a/ptl_ips/ips_epstate.h
+++ b/ptl_ips/ips_epstate.h
@@ -91,7 +91,7 @@ struct ips_epstate_entry *
 ips_epstate_lookup(const struct ips_epstate *eps, ips_epstate_idx idx))
 {
 	idx = (idx + eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX-1);
-	if (idx < eps->eps_tabsize)
+	if (idx < (ips_epstate_idx)eps->eps_tabsize)
 		return &eps->eps_tab[idx];
 	else
 		return NULL;
diff --git a/ptl_ips/ips_path_rec.c b/ptl_ips/ips_path_rec.c
index 5b37347..d08b6f9 100644
--- a/ptl_ips/ips_path_rec.c
+++ b/ptl_ips/ips_path_rec.c
@@ -565,13 +565,13 @@ static psm2_error_t ips_none_path_rec_init(struct ips_proto *proto)
 	return err;
 }
-/* (Re)load the SL2VL table */
-psm2_error_t ips_ibta_init_sl2sc2vl_table(struct ips_proto *proto)
+/* (Re)load the SL2SC table */
+psm2_error_t ips_ibta_init_sl2sc_table(struct ips_proto *proto)
 {
 	int ret, i;
 	/* Get SL2SC table for unit, port */
-	for (i = 0; i < 32; i++) {
+	for (i = 0; i < PSMI_N_SCS; i++) {
 		if ((ret =
		     psmi_hal_get_port_sl2sc(psmi_hal_get_unit_id(proto->ep->context.psm_hw_ctxt),
					     psmi_hal_get_port_num(proto->ep->context.psm_hw_ctxt),
@@ -582,19 +582,7 @@ psm2_error_t ips_ibta_init_sl2sc2vl_table(struct ips_proto *proto)
 		proto->sl2sc[i] = (uint16_t) ret;
 	}
-	/* Get SC2VL table for unit, port */
-	for (i = 0; i < 32; i++) {
-		if ((ret =
-		     psmi_hal_get_port_sc2vl(psmi_hal_get_unit_id(proto->ep->context.psm_hw_ctxt),
-					     psmi_hal_get_port_num(proto->ep->context.psm_hw_ctxt),
-					     (uint8_t) i)) < 0) {
-			/* Unable to get SC2VL. Set it to default */
-			ret = PSMI_VL_DEFAULT;
-		}
-
-		proto->sc2vl[i] = (uint16_t) ret;
-	}
-
+	psmi_hal_get_sc2vl_map(proto);
 	return PSM2_OK;
 }
@@ -633,7 +621,7 @@ psm2_error_t ips_ibta_link_updown_event(struct ips_proto *proto)
 		proto->epinfo.ep_link_rate = ips_rate_to_enum(ret);
 	/* Load the SL2SC2VL table */
-	ips_ibta_init_sl2sc2vl_table(proto);
+	ips_ibta_init_sl2sc_table(proto);
 	/* Regenerate new IPD table for the updated link rate. */
 	ips_gen_ipd_table(proto);
@@ -691,7 +679,9 @@ MOCKABLE(ips_ibta_init)(struct ips_proto *proto)
 	char ccabuf[256];
 	uint8_t *p;
-	proto->flags |= IPS_PROTO_FLAG_CCA;
+	/* Start out by turning on both styles of congestion control.
+	 * Later, we will clear whichever style is not selected. */
+	proto->flags |= IPS_PROTO_FLAG_CCA | IPS_PROTO_FLAG_CC_REPL_BECN;
 	/*
 	 * If user set any environment variable, use self CCA.
*/ @@ -758,15 +748,18 @@ MOCKABLE(ips_ibta_init)(struct ips_proto *proto) for (i = 0; i < proto->ccti_limit; i++) _HFI_CCADBG("cct[%d] = 0x%04x\n", i, (int) proto->cct[i]); - + /* Note, here, we are leaving CC style(s): + (IPS_PROTO_FLAG_CCA | IPS_PROTO_FLAG_CCA_PRESCAN) */ + proto->flags &= ~IPS_PROTO_FLAG_CC_REPL_BECN; goto finishcca; /* * Disable CCA. */ disablecca: - proto->flags &= ~IPS_PROTO_FLAG_CCA; - proto->flags &= ~IPS_PROTO_FLAG_CCA_PRESCAN; + /* Note, here, we are leaving CC style: + IPS_PROTO_FLAG_CC_REPL_BECN */ + proto->flags &= ~(IPS_PROTO_FLAG_CCA | IPS_PROTO_FLAG_CCA_PRESCAN); } finishcca: diff --git a/ptl_ips/ips_proto.c b/ptl_ips/ips_proto.c index 35dcce7..dfd03e6 100644 --- a/ptl_ips/ips_proto.c +++ b/ptl_ips/ips_proto.c @@ -660,11 +660,11 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, gpudirect_recv_threshold = env_gpudirect_recv_thresh.e_uint; if (env_gpudirect_rdma.e_uint && device_support_gpudirect) { - if (!PSMI_IS_CUDA_ENABLED || + if (PSMI_IS_CUDA_DISABLED || /* All pio, No SDMA*/ (proto->flags & IPS_PROTO_FLAG_SPIO) || !(protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) || - !PSMI_IS_DRIVER_GPUDIRECT_ENABLED) + PSMI_IS_DRIVER_GPUDIRECT_DISABLED) err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Requires hfi1 driver with GPU-Direct feature enabled.\n"); @@ -685,7 +685,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, &env_gpudirect_rdma_send); if (env_gpudirect_rdma_send.e_uint && device_support_gpudirect) { - if (!PSMI_IS_CUDA_ENABLED || + if (PSMI_IS_CUDA_DISABLED || /* All pio, No SDMA*/ (proto->flags & IPS_PROTO_FLAG_SPIO)) err = psmi_handle_error(PSMI_EP_NORETURN, @@ -705,7 +705,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, &env_gpudirect_rdma_recv); if (env_gpudirect_rdma_recv.e_uint && device_support_gpudirect) { - if (!PSMI_IS_CUDA_ENABLED || + if (PSMI_IS_CUDA_DISABLED || !(protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED)) err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, @@ -786,6 +786,9 @@ ips_proto_fini(struct ips_proto *proto, int force, uint64_t timeout_in) int i; union psmi_envvar_val grace_intval; + /* Poll one more time to attempt to synchronize with the peer ep's. */ + ips_ptl_poll(proto->ptl, 0); + psmi_getenv("PSM2_CLOSE_GRACE_PERIOD", "Additional grace period in seconds for closing end-point.", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, @@ -900,12 +903,12 @@ ips_proto_fini(struct ips_proto *proto, int force, uint64_t timeout_in) uint64_t t_grace_interval_start = get_cycles(); int num_disconnect_requests = proto->num_disconnect_requests; PSMI_BLOCKUNTIL( - proto->ep, err, - proto->num_connected_incoming == 0 || + proto->ep, err, + proto->num_connected_incoming == 0 || (!psmi_cycles_left(t_start, timeout_in) && - (!psmi_cycles_left(t_grace_interval_start, - t_grace_interval) || - !psmi_cycles_left(t_grace_start, t_grace_time)))); + (!psmi_cycles_left(t_grace_interval_start, + t_grace_interval) || + !psmi_cycles_left(t_grace_start, t_grace_time)))); if (num_disconnect_requests == proto->num_disconnect_requests) { /* nothing happened in this grace interval so break out early */ break; @@ -1649,6 +1652,8 @@ fail: return err; } +#ifdef PSM_FI + /* * Fault injection in dma sends. 
Since DMA through writev() is all-or-nothing, * we don't inject faults on a packet-per-packet basis since the code gets @@ -1671,6 +1676,8 @@ PSMI_ALWAYS_INLINE(int dma_do_fault()) return 0; } +#endif /* #ifdef PSM_FI */ + /* * Driver defines the following sdma completion error code, returned * as negative value: @@ -1812,10 +1819,11 @@ ips_dma_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, uint16_t iovcnt; struct iovec iovec[2]; +#ifdef PSM_FI /* See comments above for fault injection */ if_pf(dma_do_fault()) return PSM2_OK; - +#endif /* #ifdef PSM_FI */ /* * Check if there is a sdma queue slot. */ @@ -1873,14 +1881,14 @@ ips_dma_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, sdmahdr->ctrl = 2 | (PSM_HAL_EGR << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); - } else { + } else #endif + { sdmahdr->ctrl = 1 | (PSM_HAL_EGR << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); -#ifdef PSM_CUDA } -#endif + /* * Write into driver to do SDMA work. */ @@ -1991,8 +1999,10 @@ scb_dma_send(struct ips_proto *proto, struct ips_flow *flow, int16_t credits; ssize_t ret; +#ifdef PSM_FI /* See comments above for fault injection */ if_pf(dma_do_fault()) goto fail; +#endif /* #ifdef PSM_FI */ /* Check how many SCBs to send based on flow credits */ credits = flow->credits; @@ -2144,14 +2154,14 @@ scb_dma_send(struct ips_proto *proto, struct ips_flow *flow, sdmahdr->ctrl = 2 | (PSM_HAL_EXP << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); - } else { + } else #endif + { + sdmahdr->ctrl = 1 | (PSM_HAL_EXP << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); -#ifdef PSM_CUDA } -#endif _HFI_VDBG("tid-info=%p,%d\n", iovec[vec_idx - 1].iov_base, (int)iovec[vec_idx - 1].iov_len); @@ -2162,14 +2172,13 @@ scb_dma_send(struct ips_proto *proto, struct ips_flow *flow, sdmahdr->ctrl = 2 | (PSM_HAL_EGR << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); - } else { + } else #endif + { sdmahdr->ctrl = 1 | (PSM_HAL_EGR << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); -#ifdef PSM_CUDA } -#endif } /* Can bound the number to send by 'num' */ diff --git a/ptl_ips/ips_proto.h b/ptl_ips/ips_proto.h index c6030f4..dc8e7d4 100644 --- a/ptl_ips/ips_proto.h +++ b/ptl_ips/ips_proto.h @@ -374,8 +374,6 @@ struct ips_proto { /* SL2SC and SC2VL table for protocol */ uint16_t sl2sc[32]; - uint16_t sc2vl[32]; - /* CCA per port */ uint16_t *cct; /* cct table */ uint16_t ccti_size; /* ccti table size */ @@ -690,8 +688,8 @@ typedef int (*ips_packet_service_fn_t)(struct ips_recvhdrq_event *rcv_ev); extern ips_packet_service_fn_t ips_packet_service_routine[OPCODE_FUTURE_FROM-OPCODE_RESERVED]; -/* IBTA feature related functions (path record, sl2sc2vl etc.) */ -psm2_error_t ips_ibta_init_sl2sc2vl_table(struct ips_proto *proto); +/* IBTA feature related functions (path record, sl2sc etc.) 
diff --git a/ptl_ips/ips_proto.h b/ptl_ips/ips_proto.h
index c6030f4..dc8e7d4 100644
--- a/ptl_ips/ips_proto.h
+++ b/ptl_ips/ips_proto.h
@@ -374,8 +374,6 @@ struct ips_proto {
     /* SL2SC and SC2VL table for protocol */
     uint16_t sl2sc[32];
-    uint16_t sc2vl[32];
-
     /* CCA per port */
     uint16_t *cct;      /* cct table */
     uint16_t ccti_size; /* ccti table size */
@@ -690,8 +688,8 @@ typedef int (*ips_packet_service_fn_t)(struct ips_recvhdrq_event *rcv_ev);
 extern ips_packet_service_fn_t
     ips_packet_service_routine[OPCODE_FUTURE_FROM-OPCODE_RESERVED];
 
-/* IBTA feature related functions (path record, sl2sc2vl etc.) */
-psm2_error_t ips_ibta_init_sl2sc2vl_table(struct ips_proto *proto);
+/* IBTA feature related functions (path record, sl2sc etc.) */
+psm2_error_t ips_ibta_init_sl2sc_table(struct ips_proto *proto);
 
 psm2_error_t ips_ibta_link_updown_event(struct ips_proto *proto);
 
 psm2_error_t
@@ -706,15 +704,15 @@ psmi_get_sdma_req_info(struct ips_scb *scb, size_t *extra))
 {
     *extra = 0;
 #ifdef PSM_CUDA
-    if (!PSMI_IS_DRIVER_GPUDIRECT_ENABLED)
-        return (void *)(((char *)&scb->pbc) -
+    if (PSMI_IS_DRIVER_GPUDIRECT_DISABLED)
+        return (struct psm_hal_sdma_req_info *)(((char *)&scb->pbc) -
                 (sizeof(struct psm_hal_sdma_req_info) -
                  PSM_HAL_CUDA_SDMA_REQ_INFO_EXTRA));
 
     *extra = PSM_HAL_CUDA_SDMA_REQ_INFO_EXTRA;
 #endif
-    return (void *)(((char *)&scb->pbc) - (sizeof(struct psm_hal_sdma_req_info)));
+    return (struct psm_hal_sdma_req_info *)(((char *)&scb->pbc) - (sizeof(struct psm_hal_sdma_req_info)));
 }
 
 #ifdef PSM_CUDA
@@ -730,4 +728,19 @@ uint32_t ips_cuda_next_window(uint32_t max_window, uint32_t offset,
 }
 #endif
 
+/* Determine if FECN bit is set IBTA 1.2.1 CCA Annex A*/
+
+static __inline__ uint8_t
+_is_cca_fecn_set(const struct ips_message_header *p_hdr)
+{
+    return (__be32_to_cpu(p_hdr->bth[1]) >> HFI_BTH_FECN_SHIFT) & 0x1;
+}
+
+/* Determine if BECN bit is set IBTA 1.2.1 CCA Annex A*/
+static __inline__ uint8_t
+_is_cca_becn_set(const struct ips_message_header *p_hdr)
+{
+    return (__be32_to_cpu(p_hdr->bth[1]) >> HFI_BTH_BECN_SHIFT) & 0x1;
+}
+
 #endif /* _IPS_PROTO_H */
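The _is_cca_fecn_set()/_is_cca_becn_set() helpers (moved into this header so code outside ips_recvhdrq.c can reuse them) each byte-swap the second BTH dword from big-endian wire order and test a single bit. A standalone sketch of the same test; the shift value below is a made-up stand-in, not the real HFI_BTH_FECN_SHIFT:

    #include <stdint.h>
    #include <arpa/inet.h>          /* ntohl() stands in for __be32_to_cpu() */

    #define EXAMPLE_FECN_SHIFT 31   /* illustrative bit position only */

    static inline uint8_t example_fecn_set(uint32_t bth1_wire)
    {
        /* convert from wire (big-endian) order, then isolate one bit */
        return (uint8_t)((ntohl(bth1_wire) >> EXAMPLE_FECN_SHIFT) & 0x1);
    }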
diff --git a/ptl_ips/ips_proto_expected.c b/ptl_ips/ips_proto_expected.c
index 1f507ed..7e7e997 100644
--- a/ptl_ips/ips_proto_expected.c
+++ b/ptl_ips/ips_proto_expected.c
@@ -1919,20 +1919,15 @@ ips_tid_recv_alloc_frag(struct ips_protoexp *protoexp,
             ((bufptr + size - 1) & page_mask) - (bufptr & page_mask));
         tidoff = pageoff = (uint32_t) (bufptr & page_offset_mask);
-    } else {
+    } else
+#endif
+    {
         pageaddr = bufptr & protoexp->tid_page_mask;
         pagelen = (uint32_t) (PSMI_PAGESIZE +
             ((bufptr + size - 1) & protoexp->tid_page_mask) -
             (bufptr & protoexp->tid_page_mask));
         tidoff = pageoff = (uint32_t) (bufptr & protoexp->tid_page_offset_mask);
     }
-#else
-    pageaddr = bufptr & protoexp->tid_page_mask;
-    pagelen = (uint32_t) (PSMI_PAGESIZE +
-        ((bufptr + size - 1) & protoexp->tid_page_mask) -
-        (bufptr & protoexp->tid_page_mask));
-    tidoff = pageoff = (uint32_t) (bufptr & protoexp->tid_page_offset_mask);
-#endif
 
     reglen = pagelen;
     if (protoexp->tidc.tid_array) {
@@ -2298,8 +2293,9 @@ ipsaddr_next:
                     getreq->tidgr_offset + nbytes_this;
                 nbytes_this -= pageoff & (PSMI_GPU_PAGESIZE - 1);
             }
-        } else {
+        } else
 #endif
+        {
             if ((getreq->tidgr_offset + nbytes_this) <
                     getreq->tidgr_length &&
                     nbytes_this > PSMI_PAGESIZE) {
@@ -2309,9 +2305,7 @@ ipsaddr_next:
                     getreq->tidgr_offset + nbytes_this;
                 nbytes_this -= pageoff & (PSMI_PAGESIZE - 1);
             }
-#ifdef PSM_CUDA
         }
-#endif
 
         psmi_assert(nbytes_this >= 4);
         psmi_assert(nbytes_this <= PSM_TID_WINSIZE);
diff --git a/ptl_ips/ips_proto_help.h b/ptl_ips/ips_proto_help.h
index 42567f5..ba32b84 100644
--- a/ptl_ips/ips_proto_help.h
+++ b/ptl_ips/ips_proto_help.h
@@ -522,6 +522,7 @@ ips_proto_process_packet(const struct ips_recvhdrq_event *rcv_ev))
 {
     uint32_t index;
 
+#ifdef PSM_FI
     /* NOTE: Fault injection will currently not work with hardware
      * suppression. See note below for reason why as we currently
      * do not update the hardware tidflow table if FI is dropping
@@ -545,7 +546,7 @@ ips_proto_process_packet(const struct ips_recvhdrq_event *rcv_ev))
         if (psmi_faultinj_is_fault(fi_recv))
             return IPS_RECVHDRQ_CONTINUE;
     }
-
+#endif /* #ifdef PSM_FI */
     /* see file ips_proto_header.h for details */
     index = _get_proto_hfi_opcode(rcv_ev->p_hdr) - OPCODE_RESERVED;
     if (index >= (OPCODE_FUTURE_FROM - OPCODE_RESERVED))
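The ips_tid_recv_alloc_frag() hunk above deletes a duplicated copy of the host-page computation by letting the CUDA `else` share its block with the non-CUDA build: under PSM_CUDA the block is the else branch, otherwise it is the only code compiled. A minimal sketch of that brace-sharing pattern, with a hypothetical USE_GPU macro standing in for PSM_CUDA:

    #include <stdio.h>

    static void pick_path(int gpu_capable)
    {
    #ifdef USE_GPU
        if (gpu_capable) {
            puts("gpu path");
        } else
    #endif
        {   /* written once, compiled in both build variants */
            (void)gpu_capable;
            puts("host path");
        }
    }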
diff --git a/ptl_ips/ips_proto_mq.c b/ptl_ips/ips_proto_mq.c
index 32471fd..8a047c6 100644
--- a/ptl_ips/ips_proto_mq.c
+++ b/ptl_ips/ips_proto_mq.c
@@ -565,22 +565,6 @@ int psmi_cuda_is_buffer_gpu_mem(void *ubuf)
     return (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(ubuf));
 }
 
-/*
- * CUDA documentation dictates the use of SYNC_MEMOPS attribute
- * when the buffer pointer received into PSM has been allocated
- * by the application. This guarantees that all memory operations
- * to this region of memory (used by multiple layers of the stack)
- * always synchronize.
- */
-static inline
-void psmi_cuda_set_attr_sync_memops(void *ubuf)
-{
-    int trueflag = 1;
-
-    PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag,
-           CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr) ubuf);
-}
-
 static inline
 int psmi_cuda_is_needed_rendezvous(struct ips_proto *proto, uint32_t len)
 {
@@ -691,6 +675,8 @@ ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user,
     if_pf(req == NULL)
         return PSM2_NO_MEMORY;
 
+    _HFI_VDBG("(req=%p) ubuf=%p len=%u\n", req, ubuf, len);
+
     req->flags_user = flags_user;
     req->flags_internal = flags_internal;
     ipsaddr = ((ips_epaddr_t *) mepaddr)->msgctl->ipsaddr_next;
@@ -704,8 +690,9 @@ ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user,
 
 #ifdef PSM_CUDA
     req->is_buf_gpu_mem = psmi_cuda_is_buffer_gpu_mem((void*)ubuf);
+    req->cuda_hostbuf_used = 0;
     if (req->is_buf_gpu_mem) {
-        psmi_cuda_set_attr_sync_memops((void*)ubuf);
+        psmi_cuda_set_attr_sync_memops(ubuf);
         if (psmi_cuda_is_needed_rendezvous(proto, len))
             goto do_rendezvous;
     }
@@ -882,6 +869,8 @@ ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
     ips_scb_t *scb;
     int gpu_mem = 0;
 
+    _HFI_VDBG("ubuf=%p len=%u\n", ubuf, len);
+
     ipsaddr = ((ips_epaddr_t *) mepaddr)->msgctl->ipsaddr_next;
     ipsaddr->msgctl->ipsaddr_next = ipsaddr->next;
     proto = ((psm2_epaddr_t) ipsaddr)->proto;
@@ -891,7 +880,7 @@ ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
 #ifdef PSM_CUDA
     gpu_mem = psmi_cuda_is_buffer_gpu_mem((void*)ubuf);
     if (gpu_mem) {
-        psmi_cuda_set_attr_sync_memops((void*)ubuf);
+        psmi_cuda_set_attr_sync_memops(ubuf);
         if (psmi_cuda_is_needed_rendezvous(proto, len))
             goto do_rendezvous;
     }
@@ -1031,6 +1020,7 @@ ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
         return err;
 
 #ifdef PSM_CUDA
+    req->cuda_hostbuf_used = 0;
     if (gpu_mem) {
         req->is_buf_gpu_mem = 1;
     } else
@@ -1069,12 +1059,6 @@ do_rendezvous:
     req->flags_internal |= PSMI_REQ_FLAG_IS_INTERNAL;
 
 #ifdef PSM_CUDA
-    /* CUDA documentation dictates the use of SYNC_MEMOPS attribute
-     * when the buffer pointer received into PSM has been allocated
-     * by the application. This guarantees the all memory operations
-     * to this region of memory (used by multiple layers of the stack)
-     * always synchronize
-     */
     if (gpu_mem) {
         req->is_buf_gpu_mem = 1;
     } else
diff --git a/ptl_ips/ips_proto_params.h b/ptl_ips/ips_proto_params.h
index ae2b894..47bf125 100644
--- a/ptl_ips/ips_proto_params.h
+++ b/ptl_ips/ips_proto_params.h
@@ -214,8 +214,10 @@
 #define IPS_PROTO_FLAG_PPOLICY_STATIC   0x1c00
 
 /* IBTA CCA Protocol support */
-#define IPS_PROTO_FLAG_CCA          0x2000
+#define IPS_PROTO_FLAG_CCA          0x2000  /* Enables full-fledged CCA */
 #define IPS_PROTO_FLAG_CCA_PRESCAN  0x4000  /* Enable RAPID CCA prescanning */
+#define IPS_PROTO_FLAG_CC_REPL_BECN 0x8000  /* A simple congestion control scheme */
+                                            /* that simply replies a BECN on rx FECN. */
 
 #ifdef PSM_CUDA
 /* Use RNDV (TID) for all message sizes */
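The repeated open-coded cuPointerSetAttribute() calls and their identical comment blocks are gone; callers now go through the shared psmi_cuda_set_attr_sync_memops() helper. For reference, a standalone sketch of what that helper boils down to, calling the CUDA driver API directly (the library wraps this in PSMI_CUDA_CALL for error handling, omitted here):

    #include <cuda.h>

    static CUresult set_sync_memops(void *ubuf)
    {
        int trueflag = 1;

        /* Make all memory operations on this user allocation synchronize,
         * since several layers of the stack touch the same region. */
        return cuPointerSetAttribute(&trueflag,
                                     CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
                                     (CUdeviceptr) ubuf);
    }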
diff --git a/ptl_ips/ips_recvhdrq.c b/ptl_ips/ips_recvhdrq.c
index 16908ba..6c5fd07 100644
--- a/ptl_ips/ips_recvhdrq.c
+++ b/ptl_ips/ips_recvhdrq.c
@@ -149,21 +149,6 @@ _get_proto_subcontext(const struct ips_message_header *p_hdr)
         HFI_BTH_SUBCTXT_SHIFT) & HFI_BTH_SUBCTXT_MASK);
 }
 
-/* Determine if FECN bit is set IBTA 1.2.1 CCA Annex A*/
-
-static __inline__ uint8_t
-_is_cca_fecn_set(const struct ips_message_header *p_hdr)
-{
-    return (__be32_to_cpu(p_hdr->bth[1]) >> HFI_BTH_FECN_SHIFT) & 0x1;
-}
-
-/* Detrmine if BECN bit is set IBTA 1.2.1 CCA Annex A*/
-static __inline__ uint8_t
-_is_cca_becn_set(const struct ips_message_header *p_hdr)
-{
-    return (__be32_to_cpu(p_hdr->bth[1]) >> HFI_BTH_BECN_SHIFT) & 0x1;
-}
-
 static __inline__ void _dump_invalid_pkt(struct ips_recvhdrq_event *rcv_ev)
 {
     char *payload = ips_recvhdrq_event_payload(rcv_ev);
@@ -426,46 +411,6 @@ psm2_error_t ips_recvhdrq_progress(struct ips_recvhdrq *recvq)
             ("new packet: rcv_hdr %p, rhf %" PRIx64 "\n",
              rcv_ev.p_hdr, rcv_ev.psm_hal_rhf.raw_rhf);
 
-        /* If the hdrq_head is before cachedlastscan, that means that we have
-         * already prescanned this for BECNs and FECNs, so we should not check
-         * again
-         */
-        if_pt((recvq->proto->flags & IPS_PROTO_FLAG_CCA) &&
-              (state->hdrq_head >= state->hdrq_cachedlastscan)) {
-            /* IBTA CCA handling:
-             * If FECN bit set handle IBTA CCA protocol. For the
-             * flow that suffered congestion we flag it to generate
-             * a control packet with the BECN bit set - This is
-             * currently an unsolicited ACK.
-             *
-             * For all MQ packets the FECN processing/BECN
-             * generation is done in the is_expected_or_nak
-             * function as each eager packet is inspected there.
-             *
-             * For TIDFLOW/Expected data transfers the FECN
-             * bit/BECN generation is done in protoexp_data. Since
-             * header suppression can result in even FECN packets
-             * being suppressed the expected protocol generated
-             * additional BECN packets if a "large" number of
-             * generations are swapped without progress being made
-             * for receive. "Large" is set empirically to 4.
-             *
-             * FECN packets are ignored for all control messages
-             * (except ACKs and NAKs) since they indicate
-             * congestion on the control path which is not rate
-             * controlled. The CCA specification allows FECN on
-             * ACKs to be disregarded as well.
-             */
-            rcv_ev.is_congested =
-                _is_cca_fecn_set(rcv_ev.
-                         p_hdr) & IPS_RECV_EVENT_FECN;
-            rcv_ev.is_congested |=
-                (_is_cca_becn_set(rcv_ev.p_hdr) <<
-                 (IPS_RECV_EVENT_BECN - 1));
-        } else
-            rcv_ev.is_congested = 0;
-
 #ifdef PSM_DEBUG
         if_pf(_check_headers(&rcv_ev, psm_hal_hdr_q))
             goto skip_packet;
diff --git a/ptl_ips/ips_recvhdrq.h b/ptl_ips/ips_recvhdrq.h
index daef846..7352ff6 100644
--- a/ptl_ips/ips_recvhdrq.h
+++ b/ptl_ips/ips_recvhdrq.h
@@ -169,12 +169,13 @@ void *
 ips_recvhdrq_event_payload(const struct ips_recvhdrq_event *rcv_ev))
 {
     if (psmi_hal_rhf_get_use_egr_buff(rcv_ev->psm_hal_rhf))
-        return psmi_hal_get_egr_buff(
+        return (char*)(psmi_hal_get_egr_buff(
             psmi_hal_rhf_get_egr_buff_index(rcv_ev->psm_hal_rhf),
-            rcv_ev->psm_hal_hdr_q + 1 /* The circular list q (cl_q) for the
-                                         egr buff for any rx hdrq event is
-                                         always one more than the hdrq cl q */,
-            rcv_ev->recvq->context->psm_hw_ctxt)+
+            (psmi_hal_cl_q)(rcv_ev->psm_hal_hdr_q + 1) /* The circular list q
+                                         (cl_q) for the egr buff for any rx
+                                         hdrq event is always one more than
+                                         the hdrq cl q */,
+            rcv_ev->recvq->context->psm_hw_ctxt))+
             (psmi_hal_rhf_get_egr_buff_offset(rcv_ev->psm_hal_rhf)*64);
     else
         return NULL;
diff --git a/ptl_ips/ips_scb.c b/ptl_ips/ips_scb.c
index 52b9a93..83517ac 100644
--- a/ptl_ips/ips_scb.c
+++ b/ptl_ips/ips_scb.c
@@ -201,6 +201,12 @@ psm2_error_t ips_scbctrl_fini(struct ips_scbctrl *scbc)
     if (scbc->sbuf_buf_alloc) {
         psmi_free(scbc->sbuf_buf_alloc);
     }
+    if (scbc->timers != NULL) {
+        psmi_free(scbc->timers);
+    }
+    if (scbc->scb_imm_buf) {
+        psmi_free(scbc->scb_imm_buf);
+    }
     return PSM2_OK;
 }
diff --git a/ptl_ips/ptl.c b/ptl_ips/ptl.c
index 39b5631..8b5afc1 100644
--- a/ptl_ips/ptl.c
+++ b/ptl_ips/ptl.c
@@ -574,12 +574,12 @@ PSMI_INLINE(int ips_try_lock_shared_context(struct ptl_shared *recvshc))
 {
     return pthread_spin_trylock(recvshc->context_lock);
 }
-
+/* Unused
 PSMI_INLINE(void ips_lock_shared_context(struct ptl_shared *recvshc))
 {
     pthread_spin_lock(recvshc->context_lock);
 }
-
+*/
 PSMI_INLINE(void ips_unlock_shared_context(struct ptl_shared *recvshc))
 {
     pthread_spin_unlock(recvshc->context_lock);
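One detail in the ips_recvhdrq_event_payload() hunk above is worth noting: the eager-buffer offset is a byte offset (in 64-byte units), and the new (char*) cast makes the final addition well-defined byte arithmetic instead of relying on the GNU void*-arithmetic extension. A minimal sketch of the idea, with hypothetical names:

    #include <stddef.h>
    #include <stdint.h>

    static void *payload_at(void *egr_buff, uint32_t offset_64B_units)
    {
        /* char* arithmetic advances in bytes; each offset unit is 64 bytes */
        return (char *)egr_buff + (size_t)offset_64B_units * 64;
    }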
diff --git a/ptl_ips/ptl_fwd.h b/ptl_ips/ptl_fwd.h
index 3702fba..b774260 100644
--- a/ptl_ips/ptl_fwd.h
+++ b/ptl_ips/ptl_fwd.h
@@ -61,7 +61,7 @@ typedef struct ips_epaddr ips_epaddr_t;
 typedef struct ips_msgctl ips_msgctl_t;
 
 /* Symbol in ips ptl */
-struct ptl_ctl_init psmi_ptl_ips;
+extern struct ptl_ctl_init psmi_ptl_ips;
 
-struct ptl_ctl_rcvthread psmi_ptl_ips_rcvthread;
+extern struct ptl_ctl_rcvthread psmi_ptl_ips_rcvthread;
 
 #endif /* _PTL_FWD_IPS_H */
diff --git a/ptl_self/ptl.c b/ptl_self/ptl.c
index 49d898d..35f57a3 100644
--- a/ptl_self/ptl.c
+++ b/ptl_self/ptl.c
@@ -143,17 +143,8 @@ self_mq_isend(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags_user,
         return PSM2_NO_MEMORY;
 
 #ifdef PSM_CUDA
-    /* CUDA documentation dictates the use of SYNC_MEMOPS attribute
-     * when the buffer pointer received into PSM has been allocated
-     * by the application. This guarantees the all memory operations
-     * to this region of memory (used by multiple layers of the stack)
-     * always synchronize
-     */
-    if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)ubuf)) {
-        int trueflag = 1;
-        PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag,
-               CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
-               (CUdeviceptr)ubuf);
+    if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(ubuf)) {
+        psmi_cuda_set_attr_sync_memops(ubuf);
         send_req->is_buf_gpu_mem = 1;
     } else
         send_req->is_buf_gpu_mem = 0;
diff --git a/ptl_self/ptl_fwd.h b/ptl_self/ptl_fwd.h
index 77ee7f9..7ee6b73 100644
--- a/ptl_self/ptl_fwd.h
+++ b/ptl_self/ptl_fwd.h
@@ -57,6 +57,6 @@
 #define _PTL_FWD_SELF_H
 
 /* Symbol in am ptl */
-struct ptl_ctl_init psmi_ptl_self;
+extern struct ptl_ctl_init psmi_ptl_self;
 
 #endif
diff --git a/rpm_release_extension b/rpm_release_extension
index 0d6dd55..725a5ba 100644
--- a/rpm_release_extension
+++ b/rpm_release_extension
@@ -1 +1 @@
-91_1
+185
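Finally, the ptl_fwd.h changes above deserve a note: without `extern`, every translation unit that includes the header emits a tentative definition of the object, and the link fails with "multiple definition" errors once the toolchain defaults to -fno-common (as GCC 10 does). The fix leaves exactly one definition in a single .c file, sketched here with a hypothetical symbol:

    /* shared.h: a declaration only, safe to include from many .c files */
    extern int example_symbol;

    /* exactly one .c file provides the definition */
    int example_symbol;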