diff --git a/COMMIT b/COMMIT index d92e801..b5b8411 100644 --- a/COMMIT +++ b/COMMIT @@ -1 +1 @@ -853ab1113c4eabf7218dfab673e433588fe7a8c4 \ No newline at end of file +7a33bedc4bb3dff4e57c00293a2d70890db4d983 \ No newline at end of file diff --git a/CONTRIBUTORS b/CONTRIBUTORS new file mode 100644 index 0000000..7571183 --- /dev/null +++ b/CONTRIBUTORS @@ -0,0 +1,15 @@ +The following developers have all contributed bug fixes to the open source +version of the PSM library. Intel gratefully thanks them for their +contributions: + +Michal Schmidt (michich on github.com) +Lisanna Dettwyler (LisannaDettwyler on github.com) +Ana Guerrero Lopez (ana on github.com) +Brian Smith (bsmith94 on github.com) +Michael J OConnor (michael-j-oconnor on github.com) +Nicolas Morey-Chaismartin (nmorey on github.com) +Bernhard M. Wiedemann (bmwidemann on github.com) +Dmitry (dmitrygx on github.com) +Florian Weimer (fweimer on github.com) +Jonas Hahnfeld (hahnjo on github.com) +Tom Stellard (tstellar on github.com) diff --git a/COPYING b/COPYING index ea3d558..d0d6f87 100644 --- a/COPYING +++ b/COPYING @@ -313,64 +313,3 @@ PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. - - - Copyright (C) - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - - -Also add information on how to contact you by electronic and paper mail. - -If the program is interactive, make it output a short notice like this -when it starts in an interactive mode: - - Gnomovision version 69, Copyright (C) year name of author - Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, the commands you use may -be called something other than `show w' and `show c'; they could even be -mouse-clicks or menu items--whatever suits your program. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the program, if -necessary. 
Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the program - `Gnomovision' (which makes passes at compilers) written by James Hacker. - - , 1 April 1989 - Ty Coon, President of Vice - -This General Public License does not permit incorporating your program into -proprietary programs. If your program is a subroutine library, you may -consider it more useful to permit linking proprietary applications with the -library. If this is what you want to do, use the GNU Library General -Public License instead of this License. - diff --git a/Makefile b/Makefile index 8f51f46..5a31d64 100644 --- a/Makefile +++ b/Makefile @@ -164,7 +164,7 @@ nthreads := $(shell echo $$(( `nproc` * 2 )) ) # The DISTRO variable is used subsequently for variable # behaviors of the 3 distros. -DISTRO := $(shell . /etc/os-release; echo $$ID) +DISTRO := $(shell . /etc/os-release; if [[ "$$ID" == "sle_hpc" ]]; then ID="sles"; fi; echo $$ID) # By default the following two variables have the following values: LIBPSM2_COMPAT_CONF_DIR := /etc @@ -277,7 +277,7 @@ OSVERSION := $(shell grep VERSION_ID /etc/os-release | cut -f 2 -d\" | cut -f 1 OSSUBVERSION := $(shell grep VERSION_ID /etc/os-release | cut -f 2 -d\" | cut -f 2 -d.) override RPM_NAME_BASEEXT := $(shell \ - if [ "$(OS)" = "SLES" ]; then \ + if [ "$(OS)" = "SLES" -o "$(OS)" = "SLE_HPC" ]; then \ if [ $(OSVERSION) -gt 11 ]; then \ if [ $(OSVERSION) -eq 12 ]; then \ if [ $(OSSUBVERSION) -gt 2 ]; then \ @@ -483,7 +483,7 @@ dist: distclean PRUNE_LIST=""; \ for pd in ".git" "cscope*" "$(shell realpath --relative-to=${top_srcdir} ${OUTDIR})" \ "*.orig" "*~" "#*" ".gitignore" "doc" "libcm" "psm.supp" "test" "psm_hal_MOCK" \ - "tools" "artifacts" "*.rej.patch"; do \ + "psm_test" "tools" "artifacts" "*.rej.patch"; do \ PRUNE_LIST="$$PRUNE_LIST -name $$pd -prune -o"; \ done; \ for hid in psm_hal_* ; do \ diff --git a/README b/README index a6efb40..7990555 100644 --- a/README +++ b/README @@ -67,6 +67,7 @@ Contains the following sections: - INSTALLING * INSTALLING USING MAKEFILE * INSTALLING USING EITHER YUM OR DNF +- TESTING - RELATED SOFTWARE TO PSM2 - SUPPORTING DOCUMENTATION @@ -181,6 +182,10 @@ BUILDING USING RPMBUILD Other supporting RPM package names will be as listed above. +2. To build rpm files from the srpm file with Intel C (icc), specify the + correct CCARCH in the rpmbuild environment: + $ env CCARCH=icc rpmbuild --rebuild SRPMS/libpsm2-10.3.7-1.src.rpm + INSTALLING ========== diff --git a/buildflags.mak b/buildflags.mak index 6790fb7..7c3cda0 100644 --- a/buildflags.mak +++ b/buildflags.mak @@ -60,19 +60,11 @@ endif export os ?= $(shell uname -s | tr '[A-Z]' '[a-z]') export arch := $(shell uname -m | sed -e 's,\(i[456]86\|athlon$$\),i386,') -ifeq (${CCARCH},gcc) - export CC := gcc +ifeq (${CCARCH},$(filter ${CCARCH},gcc gcc4 icc clang)) + export CC := ${CCARCH} else - ifeq (${CCARCH},gcc4) - export CC := gcc4 - else - ifeq (${CCARCH},icc) - export CC := icc - else - anerr := $(error Unknown C compiler arch: ${CCARCH}) - endif # ICC - endif # gcc4 -endif # gcc + anerr := $(error Unknown C compiler arch: ${CCARCH}) +endif ifeq (${FCARCH},gfortran) export FC := gfortran @@ -108,48 +100,48 @@ BASECFLAGS +=-Wall $(WERROR) # test if compiler supports 32B(AVX2)/64B(AVX512F) move instruction. 
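
A note on the probe implemented in the hunk that follows: the Makefile compiles an empty program with the candidate -mavx2/-mavx512f flag and greps the preprocessor's -E -dM macro dump. What that grep detects are the standard GCC/Clang predefined macros (__AVX__, __AVX2__, __AVX512F__). A minimal C sketch of the same check done in-source, for illustration only:

    /* probe_avx.c -- build with e.g. "cc -mavx2 probe_avx.c".
     * The Makefile's shell probe greps the "cc -E -dM" output for
     * these same predefined macros instead of compiling a file. */
    #include <stdio.h>

    int main(void)
    {
    #if defined(__AVX512F__)
            puts("AVX-512F code generation available");
    #elif defined(__AVX2__)
            puts("AVX2 code generation available");
    #elif defined(__AVX__)
            puts("AVX code generation available");
    #else
            puts("no AVX support at this -m level");
    #endif
            return 0;
    }
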
# ifeq (${CC},icc) - ifeq ($(PSM_DISABLE_AVX2),) - MAVX2=-xATOM_SSE4.2 -DPSM_AVX512 - else - MAVX2=-march=core-avx-i - endif + ifeq ($(PSM_DISABLE_AVX2),) + MAVX2=-xATOM_SSE4.2 -DPSM_AVX512 + else + MAVX2=-march=core-avx-i + endif else - ifeq ($(PSM_DISABLE_AVX2),) - MAVX2=-mavx2 - else - MAVX2=-mavx - endif + ifeq ($(PSM_DISABLE_AVX2),) + MAVX2=-mavx2 + else + MAVX2=-mavx + endif endif ifneq (icc,${CC}) - ifeq ($(PSM_DISABLE_AVX2),) - RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX2 ; echo $$?) - else - RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX ; echo $$?) - $(warning ***NOTE TO USER**** Disabling AVX2 will harm performance) - endif - - ifeq (0,${RET}) - BASECFLAGS += ${MAVX2} - else - $(error Compiler does not support ${MAVX2} ) - endif + ifeq ($(PSM_DISABLE_AVX2),) + RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX2 ; echo $$?) + else + RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX ; echo $$?) + $(warning ***NOTE TO USER**** Disabling AVX2 will harm performance) + endif + + ifeq (0,${RET}) + BASECFLAGS += ${MAVX2} + else + $(error Compiler does not support ${MAVX2} ) + endif else - BASECFLAGS += ${MAVX2} + BASECFLAGS += ${MAVX2} endif # This support is dynamic at runtime, so is OK to enable as long as compiler can generate # the code. ifneq (,${PSM_AVX512}) - ifneq (icc,${CC}) - RET := $(shell echo "int main() {}" | ${CC} -mavx512f -E -dM -xc - 2>&1 | grep -q AVX512 ; echo $$?) - ifeq (0,${RET}) - BASECFLAGS += -mavx512f - else - $(error Compiler does not support AVX512 ) - endif - BASECFLAGS += -DPSM_AVX512 - endif + ifneq (icc,${CC}) + RET := $(shell echo "int main() {}" | ${CC} -mavx512f -E -dM -xc - 2>&1 | grep -q AVX512 ; echo $$?) 
+ ifeq (0,${RET}) + BASECFLAGS += -mavx512f + else + $(error Compiler does not support AVX512 ) + endif + BASECFLAGS += -DPSM_AVX512 + endif endif # @@ -158,38 +150,42 @@ endif BASECFLAGS += -D_DEFAULT_SOURCE -D_SVID_SOURCE -D_BSD_SOURCE ifneq (,${HFI_BRAKE_DEBUG}) - BASECFLAGS += -DHFI_BRAKE_DEBUG + BASECFLAGS += -DHFI_BRAKE_DEBUG +endif +ifneq (,${PSM_FI}) + BASECFLAGS += -DPSM_FI endif ifneq (,${PSM_DEBUG}) - BASECFLAGS += -O -g3 -DPSM_DEBUG -D_HFI_DEBUGGING -funit-at-a-time -Wp,-D_FORTIFY_SOURCE=2 + BASECFLAGS += -O -g3 -DPSM_DEBUG -D_HFI_DEBUGGING -funit-at-a-time -Wp,-D_FORTIFY_SOURCE=2 else - BASECFLAGS += -O3 -g3 + BASECFLAGS += -O3 -g3 endif ifneq (,${PSM_COVERAGE}) # This check must come after PSM_DEBUG to override optimization setting - BASECFLAGS += -O -fprofile-arcs -ftest-coverage - LDFLAGS += -fprofile-arcs + BASECFLAGS += -O -fprofile-arcs -ftest-coverage + LDFLAGS += -fprofile-arcs endif ifneq (,${PSM_LOG}) - BASECFLAGS += -DPSM_LOG + BASECFLAGS += -DPSM_LOG ifneq (,${PSM_LOG_FAST_IO}) - BASECFLAGS += -DPSM_LOG_FAST_IO - PSM2_ADDITIONAL_GLOBALS += psmi_log_fini;psmi_log_message; + BASECFLAGS += -DPSM_LOG_FAST_IO + PSM2_ADDITIONAL_GLOBALS += psmi_log_fini;psmi_log_message; endif endif ifneq (,${PSM_PERF}) - BASECFLAGS += -DRDPMC_PERF_FRAMEWORK + BASECFLAGS += -DRDPMC_PERF_FRAMEWORK endif ifneq (,${PSM_HEAP_DEBUG}) - BASECFLAGS += -DPSM_HEAP_DEBUG - PSM2_ADDITIONAL_GLOBALS += _psmi_heapdebug_val_heapallocs; + BASECFLAGS += -DPSM_HEAP_DEBUG + PSM2_ADDITIONAL_GLOBALS += _psmi_heapdebug_val_heapallocs; endif ifneq (,${PSM_PROFILE}) - BASECFLAGS += -DPSM_PROFILE + BASECFLAGS += -DPSM_PROFILE endif +BASECFLAGS += -DNVIDIA_GPU_DIRECT ifneq (,${PSM_CUDA}) - BASECFLAGS += -DNVIDIA_GPU_DIRECT -DPSM_CUDA - CUDA_HOME ?= /usr/local/cuda - INCLUDES += -I$(CUDA_HOME)/include + BASECFLAGS += -DPSM_CUDA + CUDA_HOME ?= /usr/local/cuda + INCLUDES += -I$(CUDA_HOME)/include endif BASECFLAGS += -fpic -fPIC -D_GNU_SOURCE @@ -199,15 +195,16 @@ ASFLAGS += -g3 -fpic BASECFLAGS += ${OPA_CFLAGS} ifeq (${CCARCH},icc) - BASECFLAGS += -fpic -fPIC -D_GNU_SOURCE -DPACK_STRUCT_STL=packed, - LDFLAGS += -static-intel + BASECFLAGS += -fpic -fPIC -D_GNU_SOURCE -DPACK_STRUCT_STL=packed, + LDFLAGS += -static-intel else - ifeq (${CCARCH},gcc) - BASECFLAGS += -funwind-tables -Wno-strict-aliasing -Wformat-security + LDFLAGS += -Wl,--build-id + ifeq (${CCARCH},$(filter ${CCARCH},gcc clang)) + BASECFLAGS += -funwind-tables -Wno-strict-aliasing -Wformat-security else - ifneq (${CCARCH},gcc4) - $(error Unknown compiler arch "${CCARCH}") - endif # gcc4 + ifneq (${CCARCH},gcc4) + $(error Unknown compiler arch "${CCARCH}") + endif # gcc4 endif # gcc endif # icc diff --git a/compat/buildflags.mak b/compat/buildflags.mak index b448e4e..db34848 100644 --- a/compat/buildflags.mak +++ b/compat/buildflags.mak @@ -57,19 +57,11 @@ export os ?= $(shell uname -s | tr '[A-Z]' '[a-z]') export arch := $(shell uname -m | sed -e 's,\(i[456]86\|athlon$$\),i386,') export CCARCH ?= gcc -ifeq (${CCARCH},gcc) - export CC := gcc +ifeq (${CCARCH},$(filter ${CCARCH},gcc gcc4 icc clang)) + export CC := ${CCARCH} else - ifeq (${CCARCH},gcc4) - export CC := gcc4 - else - ifeq (${CCARCH},icc) - export CC := icc - else - anerr := $(error Unknown C compiler arch: ${CCARCH}) - endif # ICC - endif # gcc4 -endif # gcc + anerr := $(error Unknown C compiler arch: ${CCARCH}) +endif BASECFLAGS += $(BASE_FLAGS) LDFLAGS += $(BASE_FLAGS) @@ -90,7 +82,7 @@ ifeq (${CCARCH},icc) BASECFLAGS += -O3 -g3 LDFLAGS += -static-intel else - ifeq (${CCARCH},gcc) + ifeq 
(${CCARCH},$(filter ${CCARCH},gcc clang)) BASECFLAGS += -Wno-strict-aliasing else ifneq (${CCARCH},gcc4) diff --git a/compat/psm-compat.c b/compat/psm-compat.c index 7d12165..258851c 100644 --- a/compat/psm-compat.c +++ b/compat/psm-compat.c @@ -242,11 +242,11 @@ psm_mq_setopt(psm2_mq_t mq, int key, const void *value) } psm2_error_t -psm_mq_init(psm2_ep_t ep, uint64_t tag_order_mask, +psm_mq_init(psm2_ep_t ep, uint64_t ignored, const struct psm2_optkey *opts, int numopts, psm2_mq_t *mqo) { - return psm2_mq_init(ep, tag_order_mask, opts, numopts, mqo); + return psm2_mq_init(ep, ignored, opts, numopts, mqo); } psm2_error_t diff --git a/include/opa_service.h b/include/opa_service.h index 3ec4824..1e7c9ab 100644 --- a/include/opa_service.h +++ b/include/opa_service.h @@ -84,8 +84,11 @@ int hfi_get_ctrs_port_names(int unitno, char **namep); /* sysfs helper routines (only those currently used are exported; * try to avoid using others) */ -/* Initializes the following sysfs helper routines. */ -void sysfs_init(const char *dflt_hfi_class_path); +/* Initializes the following sysfs helper routines. + sysfs_init() returns 0 on success, non-zero on an error: */ +int sysfs_init(const char *dflt_hfi_class_path); +/* Complementary */ +void sysfs_fini(void); /* read a string value into buff, no more than size bytes. returns the number of bytes read */ diff --git a/include/opa_user.h b/include/opa_user.h index 637dacb..624b8b2 100644 --- a/include/opa_user.h +++ b/include/opa_user.h @@ -133,14 +133,8 @@ #ifdef PSM_CUDA extern int is_driver_gpudirect_enabled; -static __inline__ int _psmi_is_driver_gpudirect_enabled() __attribute__((always_inline)); - -static __inline__ int -_psmi_is_driver_gpudirect_enabled() -{ - return is_driver_gpudirect_enabled; -} -#define PSMI_IS_DRIVER_GPUDIRECT_ENABLED _psmi_is_driver_gpudirect_enabled() +#define PSMI_IS_DRIVER_GPUDIRECT_ENABLED likely(is_driver_gpudirect_enabled) +#define PSMI_IS_DRIVER_GPUDIRECT_DISABLED unlikely(!is_driver_gpudirect_enabled) #endif /* hfi kdeth header format */ diff --git a/include/rbtree.c b/include/rbtree.c index 9d6930d..b79f135 100644 --- a/include/rbtree.c +++ b/include/rbtree.c @@ -85,13 +85,22 @@ #include /* for memset declaration */ -#if !defined ( RBTREE_GET_LEFTMOST ) || \ +// RBTREE_CMP should be a comparator, i.e. RBTREE_CMP(a, b) should evaluate to +// -1, 0, or 1 depending on if a < b, a == b, or a > b, respectively. +#ifdef RBTREE_CMP + +#if defined(RBTREE_GET_LEFTMOST) || defined(RBTREE_GET_RIGHTMOST) +#error Cannot define both RBTREE_CMP and RBTREE_GET_(LEFT|RIGHT)MOST +#endif + +#elif !defined ( RBTREE_GET_LEFTMOST ) || \ ! defined ( RBTREE_GET_RIGHTMOST ) || \ ! defined ( RBTREE_MAP_COUNT ) || \ ! defined ( RBTREE_ASSERT ) #error "You must define RBTREE_GET_LEFTMOST and RBTREE_GET_RIGHTMOST and \ RBTREE_MAP_COUNT and RBTREE_ASSERT before including rbtree.c" -#endif + +#endif /* RBTREE_CMP */ #define IN /* nothing */ @@ -117,13 +126,24 @@ static void ips_cl_qmap_remove_item( static cl_map_item_t* ips_cl_qmap_successor( IN cl_qmap_t* const p_map, IN const cl_map_item_t* p_item); + + +#ifndef RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR static cl_map_item_t* ips_cl_qmap_predecessor( IN cl_qmap_t* const p_map, IN const cl_map_item_t* p_item); +#endif + +#if defined(RBTREE_GET_LEFTMOST) static cl_map_item_t* ips_cl_qmap_search( IN cl_qmap_t* const p_map, IN unsigned long start, IN unsigned long end); +#else +static cl_map_item_t* ips_cl_qmap_searchv( + cl_qmap_t* const p_map, + const RBTREE_MI_PL *key); +#endif /* * Get the root. 
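
Before the insert/search hunks below, a note on the new comparator mode: RBTREE_CMP replaces the leftmost/rightmost accessors with a single three-way comparison, and rbtree.c then switches its insert and search paths on it. A hypothetical includer would look roughly like this (the payload and key names are illustrative, not taken from the real tree users, and other required knobs such as RBTREE_MAP_COUNT are omitted):

    #include <assert.h>

    /* Illustrative payload keyed by one integer. */
    struct my_payload {
            unsigned long key;
    };

    #define RBTREE_MI_PL struct my_payload
    #define RBTREE_ASSERT(x) assert(x)
    /* Three-way comparator per the contract above: -1, 0, or 1. */
    #define RBTREE_CMP(a, b) (((a)->key < (b)->key) ? -1 : \
                              (((a)->key > (b)->key) ? 1 : 0))
    /* In CMP mode ips_cl_qmap_search() is not emitted, which can
     * leave ips_cl_qmap_predecessor() unused; suppress it so
     * -Werror -Wunused-function builds stay clean. */
    #define RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR

    #include "rbtree.c"
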
@@ -380,7 +400,11 @@ ips_cl_qmap_insert_item( p_insert_at = p_comp_item; /* Traverse the tree until the correct insertion point is found. */ +#ifdef RBTREE_GET_LEFTMOST if( RBTREE_GET_LEFTMOST(&p_item->payload) < RBTREE_GET_LEFTMOST(&p_insert_at->payload) ) +#else + if(RBTREE_CMP(&p_item->payload, &p_insert_at->payload) < 0) +#endif { p_comp_item = p_insert_at->p_left; compare_res = 1; @@ -604,6 +628,11 @@ ips_cl_qmap_successor( } } +// When includer defines RBTREE_CMP, ips_cl_qmap_search() is not emitted. +// When this happens, ips_cl_qmap_predecessor() may not be called. +// Combined with -Werror -Wunused-function, libpsm2 fails to build. +// So provide macro to control emitting this function +#ifndef RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR static cl_map_item_t * ips_cl_qmap_predecessor( IN cl_qmap_t* const p_map, @@ -627,7 +656,9 @@ ips_cl_qmap_predecessor( return p_tmp; } } +#endif /* RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR */ +#if defined(RBTREE_GET_LEFTMOST) /* * return the first node with buffer overlapping or zero. */ @@ -690,3 +721,23 @@ ips_cl_qmap_search(cl_qmap_t * const p_map, return p_item; } +#else /* defined(...LEFTMOST) || defined(...RIGHTMOST) */ +static cl_map_item_t * +ips_cl_qmap_searchv(cl_qmap_t * const p_map, const RBTREE_MI_PL *key) +{ + RBTREE_ASSERT( p_map ); + cl_map_item_t *p_item = __cl_map_root(p_map); + + while (p_item != p_map->nil_item) { + if (RBTREE_CMP(key, &p_item->payload) > 0) { + p_item = p_item->p_right; + } else if (RBTREE_CMP(key, &p_item->payload) < 0) { + p_item = p_item->p_left; + } else { + break; + } + } + + return p_item; +} +#endif /* defined(...LEFTMOST) || defined(...RIGHTMOST) */ diff --git a/libpsm2.spec.in b/libpsm2.spec.in index b033dff..e9ef766 100644 --- a/libpsm2.spec.in +++ b/libpsm2.spec.in @@ -71,8 +71,8 @@ Obsoletes: hfi1-psm < 1.0.0 %if "@RPM_NAME_BASEEXT@" %package -n @RPM_NAME@@RPM_NAME_BASEEXT@ -%endif Summary: Intel PSM2 Libraries +%endif Provides: @RPM_NAME@ = %{version}-%{release} Provides: @RPM_NAME@%{_isa} = %{version}-%{release} %if 0%{?suse_version} diff --git a/makesrpm.sh b/makesrpm.sh index 31caa01..18f74e2 100755 --- a/makesrpm.sh +++ b/makesrpm.sh @@ -80,7 +80,7 @@ function usage() echo " -d , -dir " echo " Optionally sets output folder for rpmbuild to use" echo " -h , -hal_gen " - echo " Optional, default is includes all HAL generations" + echo " Optional, default is to build gen1" echo " Sets hal generations for rpmbuild to use" echo " Examples:" echo " $0 b" @@ -142,7 +142,7 @@ while [ "$1" != "" ]; do done if [ "$HAL_GENS" = "" ]; then - HAL_GENS="*" + HAL_GENS="gen1" fi # Generic cleanup, build, and tmp folder creation diff --git a/opa/opa_dwordcpy-i386.S b/opa/opa_dwordcpy-i386.S index f3d898d..863941b 100644 --- a/opa/opa_dwordcpy-i386.S +++ b/opa/opa_dwordcpy-i386.S @@ -53,6 +53,10 @@ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ +#ifdef __CET__ +#include +#endif + .globl hfi_dwordcpy .file "opa_dword32cpy.S" .text @@ -61,6 +65,9 @@ hfi_dwordcpy: // standard C calling convention, args on stack // does not return any value .type hfi_dwordcpy, @function +#ifdef _CET_ENDBR + _CET_ENDBR +#endif // save caller-saved regs mov %edi,%eax mov %esi,%edx diff --git a/opa/opa_dwordcpy-x86_64-fast.S b/opa/opa_dwordcpy-x86_64-fast.S index fe07ebf..12fe9a3 100644 --- a/opa/opa_dwordcpy-x86_64-fast.S +++ b/opa/opa_dwordcpy-x86_64-fast.S @@ -53,6 +53,10 @@ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ +#ifdef __CET__ +#include +#endif + .globl hfi_dwordcpy .file "opa_dwordcpy-x86_64-fast.S" .text @@ -61,6 +65,9 @@ // does not return any value hfi_dwordcpy: .type hfi_dwordcpy, @function +#ifdef _CET_ENDBR + _CET_ENDBR +#endif movl %edx,%ecx shrl $1,%ecx andl $1,%edx diff --git a/opa/opa_sysfs.c b/opa/opa_sysfs.c index 91446ec..854f604 100644 --- a/opa/opa_sysfs.c +++ b/opa/opa_sysfs.c @@ -72,15 +72,20 @@ static size_t sysfs_path_len; static char *hfifs_path; static long sysfs_page_size; -void sysfs_init(const char *dflt_hfi_class_path) +int sysfs_init(const char *dflt_hfi_class_path) { + int rv = 0; + if (NULL != (sysfs_path = getenv("HFI_SYSFS_PATH"))) { char *syspath = strdup(sysfs_path); if (!syspath) + { _HFI_DBG("Failed to strdup(\"%s\") for syspath.\n", sysfs_path); + rv = -1; + } else sysfs_path = syspath; } @@ -89,7 +94,10 @@ void sysfs_init(const char *dflt_hfi_class_path) char *syspath = malloc(len); if (!syspath) + { _HFI_DBG("Failed to alloc %u bytes for syspath.\n",len); + rv = -1; + } else { snprintf(syspath, len, "%s_0", dflt_hfi_class_path); @@ -104,6 +112,7 @@ void sysfs_init(const char *dflt_hfi_class_path) { _HFI_DBG("Did not find sysfs directory %s, using anyway\n", sysfs_path); + rv = -1; } else { @@ -125,6 +134,13 @@ void sysfs_init(const char *dflt_hfi_class_path) if (!sysfs_page_size) sysfs_page_size = sysconf(_SC_PAGESIZE); + + return rv; +} + +void sysfs_fini(void) +{ + free(sysfs_path); } const char *hfi_sysfs_path(void) diff --git a/psm.c b/psm.c index cb12dc5..7f929ce 100644 --- a/psm.c +++ b/psm.c @@ -65,11 +65,14 @@ static int psmi_verno = PSMI_VERNO_MAKE(PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR); static int psmi_verno_client_val; int psmi_epid_ver; +// Special psmi_refcount values #define PSMI_NOT_INITIALIZED 0 -#define PSMI_INITIALIZED 1 -#define PSMI_FINALIZED -1 /* Prevent the user from calling psm2_init - * once psm_finalize has been called. 
*/ -static int psmi_isinit = PSMI_NOT_INITIALIZED; +#define PSMI_FINALIZED -1 + +// PSM2 doesn't support transitioning out of the PSMI_FINALIZED state +// once psmi_refcount is set to PSMI_FINALIZED, any further attempts to change +// psmi_refcount should be treated as an error +static int psmi_refcount = PSMI_NOT_INITIALIZED; /* Global lock used for endpoint creation and destroy * (in functions psm2_ep_open and psm2_ep_close) and also @@ -84,16 +87,56 @@ uint64_t *shared_affinity_ptr; char *sem_affinity_shm_rw_name; char *affinity_shm_name; +uint32_t psmi_cpu_model; + #ifdef PSM_CUDA int is_cuda_enabled; int is_gdr_copy_enabled; int device_support_gpudirect; +int gpu_p2p_supported = 0; +int my_gpu_device = 0; int cuda_lib_version; int is_driver_gpudirect_enabled; int is_cuda_primary_context_retain = 0; uint32_t cuda_thresh_rndv; uint32_t gdr_copy_threshold_send; uint32_t gdr_copy_threshold_recv; + +void *psmi_cuda_lib; +CUresult (*psmi_cuInit)(unsigned int Flags ); +CUresult (*psmi_cuCtxDetach)(CUcontext c); +CUresult (*psmi_cuCtxGetCurrent)(CUcontext *c); +CUresult (*psmi_cuCtxSetCurrent)(CUcontext c); +CUresult (*psmi_cuPointerGetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); +CUresult (*psmi_cuPointerSetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); +CUresult (*psmi_cuDeviceCanAccessPeer)(int *canAccessPeer, CUdevice dev, CUdevice peerDev); +CUresult (*psmi_cuDeviceGet)(CUdevice* device, int ordinal); +CUresult (*psmi_cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev); +CUresult (*psmi_cuDriverGetVersion)(int* driverVersion); +CUresult (*psmi_cuDeviceGetCount)(int* count); +CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags); +CUresult (*psmi_cuStreamDestroy)(CUstream phStream); +CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags); +CUresult (*psmi_cuEventDestroy)(CUevent hEvent); +CUresult (*psmi_cuEventQuery)(CUevent hEvent); +CUresult (*psmi_cuEventRecord)(CUevent hEvent, CUstream hStream); +CUresult (*psmi_cuEventSynchronize)(CUevent hEvent); +CUresult (*psmi_cuMemHostAlloc)(void** pp, size_t bytesize, unsigned int Flags); +CUresult (*psmi_cuMemFreeHost)(void* p); +CUresult (*psmi_cuMemcpy)(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); +CUresult (*psmi_cuMemcpyDtoD)(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); +CUresult (*psmi_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); +CUresult (*psmi_cuMemcpyHtoD)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount); +CUresult (*psmi_cuMemcpyDtoHAsync)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); +CUresult (*psmi_cuMemcpyHtoDAsync)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount, CUstream hStream); +CUresult (*psmi_cuIpcGetMemHandle)(CUipcMemHandle* pHandle, CUdeviceptr dptr); +CUresult (*psmi_cuIpcOpenMemHandle)(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags); +CUresult (*psmi_cuIpcCloseMemHandle)(CUdeviceptr dptr); +CUresult (*psmi_cuMemGetAddressRange)(CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr); +CUresult (*psmi_cuDevicePrimaryCtxGetState)(CUdevice dev, unsigned int* flags, int* active); +CUresult (*psmi_cuDevicePrimaryCtxRetain)(CUcontext* pctx, CUdevice dev); +CUresult (*psmi_cuCtxGetDevice)(CUdevice* device); +CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device); #endif /* @@ -102,9 +145,8 @@ uint32_t gdr_copy_threshold_recv; * It is supposed to be filled with logical OR * on conditional compilation basis * along 
with future features/capabilities. - * At the very beginning we start with Multi EPs. */ -uint64_t psm2_capabilities_bitset = PSM2_MULTI_EP_CAP; +uint64_t psm2_capabilities_bitset = PSM2_MULTI_EP_CAP | PSM2_LIB_REFCOUNT_CAP; int psmi_verno_client() { @@ -128,7 +170,7 @@ int psmi_verno_isinteroperable(uint16_t verno) int MOCKABLE(psmi_isinitialized)() { - return (psmi_isinit == PSMI_INITIALIZED); + return (psmi_refcount > 0); } MOCK_DEF_EPILOGUE(psmi_isinitialized); @@ -169,6 +211,7 @@ int psmi_cuda_lib_load() PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxSetCurrent); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuPointerGetAttribute); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuPointerSetAttribute); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceCanAccessPeer); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGetAttribute); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGet); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGetCount); @@ -229,23 +272,27 @@ int psmi_cuda_initialize() return err; } - CUdevice device; + CUdevice current_device; CUcontext primary_ctx; - PSMI_CUDA_CALL(cuCtxGetDevice, &device); + PSMI_CUDA_CALL(cuCtxGetDevice, ¤t_device); int is_ctx_active; unsigned ctx_flags; - PSMI_CUDA_CALL(cuDevicePrimaryCtxGetState, device, &ctx_flags, &is_ctx_active); + PSMI_CUDA_CALL(cuDevicePrimaryCtxGetState, current_device, &ctx_flags, + &is_ctx_active); if (!is_ctx_active) { /* There is an issue where certain CUDA API calls create * contexts but does not make it active which cause the * driver API call to fail with error 709 */ - PSMI_CUDA_CALL(cuDevicePrimaryCtxRetain, &primary_ctx, device); + PSMI_CUDA_CALL(cuDevicePrimaryCtxRetain, &primary_ctx, + current_device); is_cuda_primary_context_retain = 1; } /* Check if all devices support Unified Virtual Addressing. */ PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices); + device_support_gpudirect = 1; + for (dev = 0; dev < num_devices; dev++) { CUdevice device; PSMI_CUDA_CALL(cuDeviceGet, &device, dev); @@ -265,11 +312,24 @@ int psmi_cuda_initialize() &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device); - if (major >= 3) - device_support_gpudirect = 1; - else { + if (major < 3) { device_support_gpudirect = 0; - _HFI_INFO("Device %d does not support GPUDirect RDMA (Non-fatal error) \n", dev); + _HFI_INFO("CUDA device %d does not support GPUDirect RDMA (Non-fatal error)\n", dev); + } + + if (device != current_device) { + int canAccessPeer = 0; + PSMI_CUDA_CALL(cuDeviceCanAccessPeer, &canAccessPeer, + current_device, device); + + if (canAccessPeer != 1) + _HFI_DBG("CUDA device %d does not support P2P from current device (Non-fatal error)\n", dev); + else + gpu_p2p_supported |= (1 << device); + } else { + /* Always support p2p on the same GPU */ + my_gpu_device = device; + gpu_p2p_supported |= (1 << device); } } @@ -336,10 +396,12 @@ psm2_error_t __psm2_init(int *major, int *minor) GENERIC_PERF_SET_SLOT_NAME(PSM_TX_SPEEDPATH_CTR, "TX"); GENERIC_PERF_SET_SLOT_NAME(PSM_RX_SPEEDPATH_CTR, "RX"); - if (psmi_isinit == PSMI_INITIALIZED) + if (psmi_refcount > 0) { + psmi_refcount++; goto update; + } - if (psmi_isinit == PSMI_FINALIZED) { + if (psmi_refcount == PSMI_FINALIZED) { err = PSM2_IS_FINALIZED; goto fail; } @@ -363,10 +425,12 @@ psm2_error_t __psm2_init(int *major, int *minor) "!!! WARNING !!! You are running an internal-only PSM *PROFILE* build.\n"); #endif +#ifdef PSM_FI /* Make sure we complain if fault injection is enabled */ if (getenv("PSM2_FI") && !getenv("PSM2_NO_WARN")) fprintf(stderr, "!!! WARNING !!! 
You are running with fault injection enabled!\n"); +#endif /* #ifdef PSM_FI */ /* Make sure, as an internal check, that this version knows how to detect * compatibility with other library versions it may communicate with */ @@ -413,7 +477,7 @@ psm2_error_t __psm2_init(int *major, int *minor) ((id.eax & CPUID_EXMODEL_MASK) >> 12); } - psmi_isinit = PSMI_INITIALIZED; + psmi_refcount++; /* hfi_debug lives in libhfi.so */ psmi_getenv("PSM2_TRACEMASK", "Mask flags for tracing", @@ -468,7 +532,9 @@ psm2_error_t __psm2_init(int *major, int *minor) psmi_multi_ep_init(); +#ifdef PSM_FI psmi_faultinj_init(); +#endif /* #ifdef PSM_FI */ psmi_epid_init(); @@ -496,7 +562,6 @@ psm2_error_t __psm2_init(int *major, int *minor) #endif update: - if (getenv("PSM2_IDENTIFY")) { Dl_info info_psm; char ofed_delta[100] = ""; @@ -533,6 +598,8 @@ update: *major = (int)psmi_verno_major; *minor = (int)psmi_verno_minor; fail: + _HFI_DBG("psmi_refcount=%d,err=%u\n", psmi_refcount, err); + PSM2_LOG_MSG("leaving"); return err; } @@ -604,18 +671,21 @@ psm2_error_t __psm2_info_query(psm2_info_query_t q, void *out, 2, /* PSM2_INFO_QUERY_MTU */ 2, /* PSM2_INFO_QUERY_LINK_SPEED */ 1, /* PSM2_INFO_QUERY_NETWORK_TYPE */ + 0, /* PSM2_INFO_QUERY_FEATURE_MASK */ }; psm2_error_t rv = PSM2_INTERNAL_ERR; if ((q < 0) || - (q >= PSM2_INFO_QUERY_LAST) || - (nargs != expected_arg_cnt[q])) - return rv; + (q >= PSM2_INFO_QUERY_LAST)) + return PSM2_IQ_INVALID_QUERY; + + if (nargs != expected_arg_cnt[q]) + return PSM2_PARAM_ERR; switch (q) { case PSM2_INFO_QUERY_NUM_UNITS: - *((uint32_t*)out) = psmi_hal_get_num_units_(1); + *((uint32_t*)out) = psmi_hal_get_num_units_(); rv = PSM2_OK; break; case PSM2_INFO_QUERY_NUM_PORTS: @@ -719,7 +789,16 @@ psm2_error_t __psm2_info_query(psm2_info_query_t q, void *out, rv = PSM2_OK; } } - + break; + case PSM2_INFO_QUERY_FEATURE_MASK: + { +#ifdef PSM_CUDA + *((uint32_t*)out) = PSM2_INFO_QUERY_FEATURE_CUDA; +#else + *((uint32_t*)out) = 0; +#endif /* #ifdef PSM_CUDA */ + } + rv = PSM2_OK; break; default: break; @@ -743,7 +822,14 @@ psm2_error_t __psm2_finalize(void) PSM2_LOG_MSG("entering"); + _HFI_DBG("psmi_refcount=%d\n", psmi_refcount); PSMI_ERR_UNLESS_INITIALIZED(NULL); + psmi_assert(psmi_refcount > 0); + psmi_refcount--; + + if (psmi_refcount > 0) { + return PSM2_OK; + } /* When PSM_PERF is enabled, the following line causes the instruction cycles gathered in the current run to be dumped @@ -751,15 +837,15 @@ psm2_error_t __psm2_finalize(void) GENERIC_PERF_DUMP(stderr); ep = psmi_opened_endpoint; while (ep != NULL) { - psmi_opened_endpoint = ep->user_ep_next; + psm2_ep_t saved_ep = ep->user_ep_next; psm2_ep_close(ep, PSM2_EP_CLOSE_GRACEFUL, 2 * PSMI_MIN_EP_CLOSE_TIMEOUT); - ep = psmi_opened_endpoint; + psmi_opened_endpoint = ep = saved_ep; } - psmi_epid_fini(); - +#ifdef PSM_FI psmi_faultinj_fini(); +#endif /* #ifdef PSM_FI */ /* De-allocate memory for any allocated space to store hostnames */ psmi_epid_itor_init(&itor, PSMI_EP_HOSTNAME); @@ -767,6 +853,8 @@ psm2_error_t __psm2_finalize(void) psmi_free(hostname); psmi_epid_itor_fini(&itor); + psmi_epid_fini(); + /* unmap shared mem object for affinity */ if (psmi_affinity_shared_file_opened) { /* @@ -807,15 +895,25 @@ psm2_error_t __psm2_finalize(void) psmi_hal_finalize(); #ifdef PSM_CUDA if (is_cuda_primary_context_retain) { + /* + * This code will be called during deinitialization, and if + * CUDA is deinitialized before PSM, then + * CUDA_ERROR_DEINITIALIZED will happen here + */ CUdevice device; - PSMI_CUDA_CALL(cuCtxGetDevice, &device); - 
PSMI_CUDA_CALL(cuDevicePrimaryCtxRelease, device); + if (psmi_cuCtxGetDevice(&device) == CUDA_SUCCESS) + PSMI_CUDA_CALL(cuDevicePrimaryCtxRelease, device); } #endif - psmi_isinit = PSMI_FINALIZED; + psmi_refcount = PSMI_FINALIZED; PSM2_LOG_MSG("leaving"); psmi_log_fini(); + + psmi_stats_deregister_all(); + + psmi_heapdebug_finalize(); + return PSM2_OK; } PSMI_API_DECL(psm2_finalize) diff --git a/psm2.h b/psm2.h index fa0ec20..84f59bb 100644 --- a/psm2.h +++ b/psm2.h @@ -277,9 +277,9 @@ typedef struct psm2_mq *psm2_mq_t; /*! @defgroup init PSM2 Initialization and Maintenance * @{ */ -#define PSM2_VERNO 0x0201 /*!< Header-defined Version number */ +#define PSM2_VERNO 0x0202 /*!< Header-defined Version number */ #define PSM2_VERNO_MAJOR 0x02 /*!< Header-defined Major Version Number */ -#define PSM2_VERNO_MINOR 0x01 /*!< Header-defined Minor Version Number */ +#define PSM2_VERNO_MINOR 0x02 /*!< Header-defined Minor Version Number */ #define PSM2_VERNO_COMPAT_MAJOR 0x01 /*!hfi_sys_class_path); +#ifndef PSM2_MOCK_TESTING + if (!sysfs_init(psm_hi->hfi_sys_class_path)) +#endif + SLIST_INSERT_HEAD(&head_hi, psm_hi, next_hi); } -static struct _psmi_hal_instance *psmi_hal_get_pi_inst(int *pnumunits, - int *pnumports, - int *pdflt_pkey); - -#if PSMI_HAL_INST_CNT > 1 +static struct _psmi_hal_instance *psmi_hal_get_pi_inst(void); -int psmi_hal_pre_init_func(enum psmi_hal_pre_init_func_krnls k, ...) +int psmi_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...) { va_list ap; va_start(ap, k); - int rv = 0,numunits,numports,dflt_pkey; - struct _psmi_hal_instance *p = psmi_hal_get_pi_inst(&numunits, - &numports, - &dflt_pkey); + int rv = 0; + struct _psmi_hal_instance *p = psmi_hal_get_pi_inst(); + if (!p) rv = -1; else { switch(k) { - case psmi_hal_pre_init_func_get_num_units: - rv = numunits; + case psmi_hal_pre_init_cache_func_get_num_units: + rv = p->params.num_units; break; - case psmi_hal_pre_init_func_get_num_ports: - rv = numports; + case psmi_hal_pre_init_cache_func_get_num_ports: + rv = p->params.num_ports; break; - case psmi_hal_pre_init_func_get_unit_active: - rv = p->hfp_get_unit_active( va_arg(ap,int) ); + case psmi_hal_pre_init_cache_func_get_unit_active: + { + int unit = va_arg(ap,int); + + if ((unit >= 0) && (unit < p->params.num_units)) + { + if (!p->params.unit_active_valid[unit]) { + p->params.unit_active_valid[unit] = 1; + p->params.unit_active[unit] = p->hfp_get_unit_active(unit); + } + rv = p->params.unit_active[unit]; + } + else + rv = -1; + } break; - case psmi_hal_pre_init_func_get_port_active: - rv = p->hfp_get_port_active( va_arg(ap,int), - va_arg(ap,int) ); + case psmi_hal_pre_init_cache_func_get_port_active: + { + int unit = va_arg(ap,int); + + if ((unit >= 0) && (unit < p->params.num_units)) + { + int port = va_arg(ap,int); + if ((port >= 1) && (port <= p->params.num_ports)) + { + if (!p->params.port_active_valid[unit*port]) { + p->params.port_active_valid[unit*port] = 1; + p->params.port_active[unit*port] = p->hfp_get_port_active(unit,port); + } + rv = p->params.port_active[unit*port]; + } + else + rv = -1; + } + else + rv = -1; + } + break; + case psmi_hal_pre_init_cache_func_get_num_contexts: + { + int unit = va_arg(ap,int); + if ((unit >= 0) && (unit < p->params.num_units)) + { + if (!p->params.num_contexts_valid[unit]) { + p->params.num_contexts_valid[unit] = 1; + p->params.num_contexts[unit] = p->hfp_get_num_contexts(unit); + } + rv = p->params.num_contexts[unit]; + } + else + rv = -1; + } break; - case psmi_hal_pre_init_func_get_num_contexts: - rv = 
p->hfp_get_num_contexts( va_arg(ap,int) ); + case psmi_hal_pre_init_cache_func_get_num_free_contexts: + { + int unit = va_arg(ap,int); + + if ((unit >= 0) && (unit < p->params.num_units)) + { + if (!p->params.num_free_contexts_valid[unit]) { + p->params.num_free_contexts_valid[unit] = 1; + p->params.num_free_contexts[unit] = p->hfp_get_num_free_contexts(unit); + } + rv = p->params.num_free_contexts[unit]; + } + else + rv = -1; + } break; - case psmi_hal_pre_init_func_get_num_free_contexts: - rv = p->hfp_get_num_free_contexts( va_arg(ap,int) ); + case psmi_hal_pre_init_cache_func_get_default_pkey: + rv = p->params.default_pkey; break; default: rv = -1; @@ -219,13 +275,12 @@ int psmi_hal_pre_init_func(enum psmi_hal_pre_init_func_krnls k, ...) return rv; } -#endif +static struct _psmi_hal_instance *psmi_hal_get_pi_inst(void) +{ + if (psmi_hal_current_hal_instance) + return psmi_hal_current_hal_instance; -static struct _psmi_hal_instance *psmi_hal_get_pi_inst(int *pnumunits, - int *pnumports, - int *pdflt_pkey) -{ if (SLIST_EMPTY(&head_hi)) return NULL; @@ -243,7 +298,6 @@ static struct _psmi_hal_instance *psmi_hal_get_pi_inst(int *pnumunits, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, (union psmi_envvar_val)PSM_HAL_INSTANCE_ANY_GEN, &env_hi_pref); - int wait = 0; /* The hfp_get_num_units() call below, will not wait for the HFI driver to come up and create device nodes in /dev/.) */ struct _psmi_hal_instance *p; @@ -252,15 +306,38 @@ static struct _psmi_hal_instance *psmi_hal_get_pi_inst(int *pnumunits, if ((env_hi_pref.e_int == PSM_HAL_INSTANCE_ANY_GEN) || (p->type == env_hi_pref.e_int)) { - int nunits = p->hfp_get_num_units(wait); + const int valid_flags = PSM_HAL_PARAMS_VALID_DEFAULT_PKEY | + PSM_HAL_PARAMS_VALID_NUM_UNITS | + PSM_HAL_PARAMS_VALID_NUM_PORTS; + + if ((p->params.sw_status & valid_flags) == valid_flags) + return p; + + int nunits = p->hfp_get_num_units(); int nports = p->hfp_get_num_ports(); int dflt_pkey = p->hfp_get_default_pkey(); - if (nunits > 0 && nports > 0 && dflt_pkey > 0) + if (nunits > 0 && nports > 0 && dflt_pkey > 0 +#ifndef PSM2_MOCK_TESTING + && (0 == sysfs_init(p->hfi_sys_class_path)) +#endif + ) { - sysfs_init(p->hfi_sys_class_path); - *pnumunits = nunits; - *pnumports = nports; - *pdflt_pkey = dflt_pkey; + p->params.num_units = nunits; + p->params.num_ports = nports; + p->params.default_pkey = dflt_pkey; + p->params.sw_status |= valid_flags; + p->params.unit_active = (int8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, sizeof(int8_t)); + p->params.unit_active_valid = (int8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, sizeof(int8_t)); + p->params.port_active = (int8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits*nports, sizeof(int8_t)); + p->params.port_active_valid = (int8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits*nports, sizeof(int8_t)); + p->params.num_contexts = (uint16_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, + sizeof(uint16_t)); + p->params.num_contexts_valid = (uint16_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, + sizeof(uint16_t)); + p->params.num_free_contexts = (uint16_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, + sizeof(uint16_t)); + p->params.num_free_contexts_valid = (uint16_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, + sizeof(uint16_t)); return p; } } @@ -271,23 +348,15 @@ static struct _psmi_hal_instance *psmi_hal_get_pi_inst(int *pnumunits, /* psmi_hal_initialize */ int psmi_hal_initialize(void) { - int nunits = 0; - int nports = 0; - int dflt_pkey = 0; - struct _psmi_hal_instance *p = 
psmi_hal_get_pi_inst(&nunits, &nports, &dflt_pkey); + struct _psmi_hal_instance *p = psmi_hal_get_pi_inst(); if (!p) return -PSM_HAL_ERROR_INIT_FAILED; - memset(&p->params,0,sizeof(p->params)); - int rv = p->hfp_initialize(p); if (!rv) { - p->params.num_units = nunits; - p->params.num_ports = nports; - p->params.default_pkey = dflt_pkey; psmi_hal_current_hal_instance = p; if (psmi_hal_has_cap(PSM_HAL_CAP_HDRSUPP)) { @@ -310,6 +379,34 @@ int psmi_hal_initialize(void) return -PSM_HAL_ERROR_INIT_FAILED; } +int psmi_hal_finalize(void) +{ + struct _psmi_hal_instance *p = psmi_hal_current_hal_instance; + + int rv = psmi_hal_finalize_(); + + psmi_free(p->params.unit_active); + psmi_free(p->params.unit_active_valid); + psmi_free(p->params.port_active); + psmi_free(p->params.port_active_valid); + psmi_free(p->params.num_contexts); + psmi_free(p->params.num_contexts_valid); + psmi_free(p->params.num_free_contexts); + psmi_free(p->params.num_free_contexts_valid); + p->params.unit_active = NULL; + p->params.unit_active_valid = NULL; + p->params.port_active = NULL; + p->params.port_active_valid = NULL; + p->params.num_contexts = NULL; + p->params.num_contexts_valid = NULL; + p->params.num_free_contexts = NULL; + p->params.num_free_contexts_valid = NULL; + psmi_hal_current_hal_instance = NULL; + sysfs_fini(); + return rv; +} + + #ifdef PSM2_MOCK_TESTING #include "psm_hal_gen1/opa_user_gen1.h" diff --git a/psm2_hal.h b/psm2_hal.h index e2367f5..1bec596 100644 --- a/psm2_hal.h +++ b/psm2_hal.h @@ -166,6 +166,10 @@ typedef enum PSM_HAL_PSMI_RUNTIME_INTR_ENABLED = (1UL << 2), /* Header suppression is enabled: */ PSM_HAL_HDRSUPP_ENABLED = (1UL << 3), + PSM_HAL_PARAMS_VALID_NUM_UNITS = (1UL << 4), + PSM_HAL_PARAMS_VALID_NUM_PORTS = (1UL << 5), + PSM_HAL_PARAMS_VALID_DEFAULT_PKEY = (1UL << 6), + } psmi_hal_sw_status; /* The _psmi_hal_params structure stores values that remain constant for the entire life of @@ -173,11 +177,16 @@ typedef enum The values are settled after the context is opened. */ typedef struct _psmi_hal_params { - uint16_t num_units; - uint16_t num_ports; uint32_t cap_mask; uint32_t sw_status; + /* start cached members */ + uint16_t num_units; + uint16_t num_ports; uint16_t default_pkey; + int8_t *unit_active,*unit_active_valid; + int8_t *port_active,*port_active_valid; + uint16_t *num_contexts,*num_contexts_valid; + uint16_t *num_free_contexts,*num_free_contexts_valid; } psmi_hal_params_t; /* HAL assumes that the rx hdr q and the egr buff q are circular lists @@ -403,12 +412,12 @@ struct _psmi_hal_instance /* Initialize the HAL INSTANCE. */ int (*hfp_initialize)(psmi_hal_instance_t *); /* Finalize the HAL INSTANCE. */ - int (*hfp_finalize)(void); + int (*hfp_finalize_)(void); /* Returns the number of hfi units installed on ths host: NOTE: hfp_get_num_units is a function that must be callable before the hal instance is initialized. */ - int (*hfp_get_num_units)(int wait); + int (*hfp_get_num_units)(void); /* Returns the number of ports on each hfi unit installed. on ths host. 
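
The caching added to psmi_hal_pre_init_cache_func() above is a per-slot memo: each value array is paired with a "valid" array so the expensive hfp_* driver query runs at most once per slot. A reduced standalone sketch of the pattern (names are illustrative, not the real HAL symbols; the arrays are assumed allocated to num_units entries, as psmi_hal_get_pi_inst() does with psmi_calloc()):

    #include <stdint.h>

    /* Stand-in for an hfp_* probe such as hfp_get_unit_active(). */
    extern int expensive_probe(int unit);

    static int8_t *cache_val;    /* cached answers               */
    static int8_t *cache_valid;  /* nonzero once slot is filled  */

    static int cached_probe(int unit, int num_units)
    {
            if (unit < 0 || unit >= num_units)
                    return -1;   /* out of range, as in the HAL code */
            if (!cache_valid[unit]) {
                    cache_valid[unit] = 1;
                    cache_val[unit] = (int8_t)expensive_probe(unit);
            }
            return cache_val[unit];
    }

One caveat worth flagging in the port cache above: the index unit*port collides (unit 0 maps every port to slot 0, and 2*3 == 3*2); a conventional flattening of a units-by-ports table would be unit * num_ports + (port - 1).
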
@@ -458,7 +467,7 @@ struct _psmi_hal_instance int (*hfp_get_port_lmc)(int unit, int port); int (*hfp_get_port_rate)(int unit, int port); int (*hfp_get_port_sl2sc)(int unit, int port,int sl); - int (*hfp_get_port_sc2vl)(int unit, int port,int sc); + int (*hfp_get_sc2vl_map)(struct ips_proto *proto); int (*hfp_set_pkey)(psmi_hal_hw_context, uint16_t); int (*hfp_poll_type)(uint16_t poll_type, psmi_hal_hw_context); int (*hfp_get_port_lid)(int unit, int port); @@ -693,48 +702,42 @@ void psmi_hal_register_instance(psmi_hal_instance_t *); another failure has occured during initialization. */ int psmi_hal_initialize(void); -/* note that: +int psmi_hal_finalize(void); -int psmi_hal_get_num_units(void); +#include "psm2_hal_inlines_d.h" -Is intentionally left out as it is called during initialization, -and the results are cached in the hw params. -*/ +enum psmi_hal_pre_init_cache_func_krnls +{ + psmi_hal_pre_init_cache_func_get_num_units, + psmi_hal_pre_init_cache_func_get_num_ports, + psmi_hal_pre_init_cache_func_get_unit_active, + psmi_hal_pre_init_cache_func_get_port_active, + psmi_hal_pre_init_cache_func_get_num_contexts, + psmi_hal_pre_init_cache_func_get_num_free_contexts, + psmi_hal_pre_init_cache_func_get_default_pkey, +}; -#include "psm2_hal_inlines_d.h" +int psmi_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...); + +#define PSMI_HAL_DISPATCH_PI(KERNEL,...) ( psmi_hal_pre_init_cache_func(psmi_hal_pre_init_cache_func_ ## KERNEL , ##__VA_ARGS__ ) ) #if PSMI_HAL_INST_CNT == 1 #define PSMI_HAL_DISPATCH(KERNEL,...) ( PSMI_HAL_CAT_INL_SYM(KERNEL) ( __VA_ARGS__ ) ) -#define PSMI_HAL_DISPATCH_PI(KERNEL,...) PSMI_HAL_DISPATCH(KERNEL , ##__VA_ARGS__ ) - #else -enum psmi_hal_pre_init_func_krnls -{ - psmi_hal_pre_init_func_get_num_units, - psmi_hal_pre_init_func_get_num_ports, - psmi_hal_pre_init_func_get_unit_active, - psmi_hal_pre_init_func_get_port_active, - psmi_hal_pre_init_func_get_num_contexts, - psmi_hal_pre_init_func_get_num_free_contexts, -}; - -int psmi_hal_pre_init_func(enum psmi_hal_pre_init_func_krnls k, ...); - #define PSMI_HAL_DISPATCH(KERNEL,...) ( psmi_hal_current_hal_instance->hfp_ ## KERNEL ( __VA_ARGS__ )) -#define PSMI_HAL_DISPATCH_PI(KERNEL,...) ( psmi_hal_pre_init_func(psmi_hal_pre_init_func_ ## KERNEL , ##__VA_ARGS__ ) ) - #endif -#define psmi_hal_get_num_units_(...) PSMI_HAL_DISPATCH_PI(get_num_units,__VA_ARGS__) -#define psmi_hal_get_num_ports_(...) PSMI_HAL_DISPATCH_PI(get_num_ports,##__VA_ARGS__) -#define psmi_hal_get_unit_active(...) PSMI_HAL_DISPATCH_PI(get_unit_active,__VA_ARGS__) -#define psmi_hal_get_port_active(...) PSMI_HAL_DISPATCH_PI(get_port_active,__VA_ARGS__) -#define psmi_hal_get_num_contexts(...) PSMI_HAL_DISPATCH_PI(get_num_contexts,__VA_ARGS__) -#define psmi_hal_get_num_free_contexts(...) PSMI_HAL_DISPATCH_PI(get_num_free_contexts,__VA_ARGS__) +#define psmi_hal_get_num_units_(...) PSMI_HAL_DISPATCH_PI(get_num_units,##__VA_ARGS__) +#define psmi_hal_get_num_ports_(...) PSMI_HAL_DISPATCH_PI(get_num_ports,##__VA_ARGS__) +#define psmi_hal_get_unit_active(...) PSMI_HAL_DISPATCH_PI(get_unit_active,__VA_ARGS__) +#define psmi_hal_get_port_active(...) PSMI_HAL_DISPATCH_PI(get_port_active,__VA_ARGS__) +#define psmi_hal_get_num_contexts(...) PSMI_HAL_DISPATCH_PI(get_num_contexts,__VA_ARGS__) +#define psmi_hal_get_num_free_contexts(...) PSMI_HAL_DISPATCH_PI(get_num_free_contexts,__VA_ARGS__) +#define psmi_hal_get_default_pkey(...) PSMI_HAL_DISPATCH_PI(get_default_pkey,##__VA_ARGS__) #define psmi_hal_context_open(...) 
PSMI_HAL_DISPATCH(context_open,__VA_ARGS__) #define psmi_hal_close_context(...) PSMI_HAL_DISPATCH(close_context,__VA_ARGS__) #define psmi_hal_get_port_index2pkey(...) PSMI_HAL_DISPATCH(get_port_index2pkey,__VA_ARGS__) @@ -743,7 +746,7 @@ int psmi_hal_pre_init_func(enum psmi_hal_pre_init_func_krnls k, ...); #define psmi_hal_get_port_lmc(...) PSMI_HAL_DISPATCH(get_port_lmc,__VA_ARGS__) #define psmi_hal_get_port_rate(...) PSMI_HAL_DISPATCH(get_port_rate,__VA_ARGS__) #define psmi_hal_get_port_sl2sc(...) PSMI_HAL_DISPATCH(get_port_sl2sc,__VA_ARGS__) -#define psmi_hal_get_port_sc2vl(...) PSMI_HAL_DISPATCH(get_port_sc2vl,__VA_ARGS__) +#define psmi_hal_get_sc2vl_map(...) PSMI_HAL_DISPATCH(get_sc2vl_map, __VA_ARGS__) #define psmi_hal_set_pkey(...) PSMI_HAL_DISPATCH(set_pkey,__VA_ARGS__) #define psmi_hal_poll_type(...) PSMI_HAL_DISPATCH(poll_type,__VA_ARGS__) #define psmi_hal_get_port_lid(...) PSMI_HAL_DISPATCH(get_port_lid,__VA_ARGS__) @@ -782,7 +785,7 @@ int psmi_hal_pre_init_func(enum psmi_hal_pre_init_func_krnls k, ...); #define psmi_hal_tidflow_get_genmismatch(...) PSMI_HAL_DISPATCH(tidflow_get_genmismatch,__VA_ARGS__) #define psmi_hal_forward_packet_to_subcontext(...) PSMI_HAL_DISPATCH(forward_packet_to_subcontext,__VA_ARGS__) #define psmi_hal_subcontext_ureg_get(...) PSMI_HAL_DISPATCH(subcontext_ureg_get,__VA_ARGS__) -#define psmi_hal_finalize(...) PSMI_HAL_DISPATCH(finalize,__VA_ARGS__) +#define psmi_hal_finalize_(...) PSMI_HAL_DISPATCH(finalize_,__VA_ARGS__) #define psmi_hal_get_hfi_event_bits(...) PSMI_HAL_DISPATCH(get_hfi_event_bits,__VA_ARGS__) #define psmi_hal_ack_hfi_event(...) PSMI_HAL_DISPATCH(ack_hfi_event,__VA_ARGS__) #define psmi_hal_hfi_reset_context(...) PSMI_HAL_DISPATCH(hfi_reset_context,__VA_ARGS__) @@ -863,7 +866,6 @@ int psmi_hal_pre_init_func(enum psmi_hal_pre_init_func_krnls k, ...); #define psmi_hal_get_hfi_name() psmi_hal_current_hal_instance->hfi_name #define psmi_hal_get_num_units() psmi_hal_current_hal_instance->params.num_units #define psmi_hal_get_num_ports() psmi_hal_current_hal_instance->params.num_ports -#define psmi_hal_get_default_pkey() psmi_hal_current_hal_instance->params.default_pkey #define psmi_hal_get_cap_mask() psmi_hal_current_hal_instance->params.cap_mask #define psmi_hal_set_cap_mask(NEW_MASK) (psmi_hal_current_hal_instance->params.cap_mask = (NEW_MASK)) #define psmi_hal_add_cap(CAP) (psmi_hal_current_hal_instance->params.cap_mask |= (CAP)) diff --git a/psm2_hal_inline_t.h b/psm2_hal_inline_t.h index 8e061a2..f48f6c6 100644 --- a/psm2_hal_inline_t.h +++ b/psm2_hal_inline_t.h @@ -56,10 +56,10 @@ static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(initialize) (psmi_hal_instance_t *); -static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(finalize) +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(finalize_) (void); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_num_units) - (int wait); + (void); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_num_ports) (void); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_unit_active) @@ -95,8 +95,8 @@ static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_rate) (int unit, int port); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_sl2sc) (int unit, int port, int sl); -static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_sc2vl) - (int unit, int port, int sc); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_sc2vl_map) + (struct ips_proto *proto); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(set_pkey) (psmi_hal_hw_context, uint16_t); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(poll_type) 
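
To summarize the dispatch scheme these psm2_hal.h hunks settle on: pre-initialization queries always route through the caching shim via PSMI_HAL_DISPATCH_PI, while post-initialization calls either resolve to inlined symbols (single-HAL builds) or indirect through the selected instance's function-pointer table. A reduced model with illustrative names, not the real macros:

    /* Reduced model of the two dispatch paths. */
    struct hal_instance {
            int (*hfp_get_port_lid)(int unit, int port);
    };
    extern struct hal_instance *cur_hal;

    #define HAL_INST_CNT 2   /* set per build configuration */

    #if HAL_INST_CNT == 1
    /* Only one HAL compiled in: calls bind statically and can inline. */
    #define HAL_DISPATCH(K, ...) hal_gen1_##K(__VA_ARGS__)
    #else
    /* Several HALs: indirect through the chosen instance's table. */
    #define HAL_DISPATCH(K, ...) (cur_hal->hfp_##K(__VA_ARGS__))
    #endif

    #define hal_get_port_lid(...) HAL_DISPATCH(get_port_lid, __VA_ARGS__)
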
diff --git a/psm2_mq.h b/psm2_mq.h index c193afc..7b63608 100644 --- a/psm2_mq.h +++ b/psm2_mq.h @@ -256,15 +256,7 @@ extern "C" { * Matched Queue communication operations. * * @param[in] ep Endpoint over which to initialize Matched Queue - * @param[in] tag_order_mask Order mask hint to let MQ know what bits of the - * send tag are required to maintain MQ message - * order. In MPI parlance, this mask sets the bits - * that store the context (or communicator ID). The - * user can choose to pass PSM2_MQ_ORDERMASK_NONE or - * PSM2_MQ_ORDERMASK_ALL to tell MQ to respectively - * provide no ordering guarantees or to provide - * ordering over all messages by ignoring the - * contexts of the send tags. + * @param[in] ignored * @param[in] opts Set of options for Matched Queue * @param[in] numopts Number of options passed * @param[out] mq User-supplied storage to return the Matched Queue handle @@ -311,7 +303,7 @@ extern "C" { @endcode */ psm2_error_t -psm2_mq_init(psm2_ep_t ep, uint64_t tag_order_mask, +psm2_mq_init(psm2_ep_t ep, uint64_t ignored, const struct psm2_optkey *opts, int numopts, psm2_mq_t *mq); #define PSM2_MQ_ORDERMASK_NONE 0ULL diff --git a/psm_am.c b/psm_am.c index c421142..f1f3a45 100644 --- a/psm_am.c +++ b/psm_am.c @@ -130,6 +130,13 @@ psm2_error_t psmi_am_init_internal(psm2_ep_t ep) } +void psmi_am_fini_internal(psm2_ep_t ep) +{ + if(ep->am_htable != NULL) { + psmi_free(ep->am_htable); + } +} + psm2_error_t __psm2_am_register_handlers(psm2_ep_t ep, const psm2_am_handler_fn_t *handlers, diff --git a/psm_am_internal.h b/psm_am_internal.h index bc2c128..af151dc 100644 --- a/psm_am_internal.h +++ b/psm_am_internal.h @@ -103,5 +103,6 @@ PSMI_ALWAYS_INLINE(struct psm2_ep_am_handle_entry * /* PSM internal initialization */ psm2_error_t psmi_am_init_internal(psm2_ep_t ep); +void psmi_am_fini_internal(psm2_ep_t ep); #endif diff --git a/psm_config.h b/psm_config.h index 3c42106..85fc1bc 100644 --- a/psm_config.h +++ b/psm_config.h @@ -153,7 +153,7 @@ #define MQ_HFI_THRESH_TINY 8 #define MQ_HFI_THRESH_EGR_SDMA_XEON 34000 /* Eager Xeon blocking */ #define MQ_HFI_THRESH_EGR_SDMA_PHI2 200000 /* Eager Phi2 blocking */ -#define MQ_HFI_THRESH_EGR_SDMA_SQ_XEON 16000 /* Eager Xeon non-blocking */ +#define MQ_HFI_THRESH_EGR_SDMA_SQ_XEON 16384 /* Eager Xeon non-blocking */ #define MQ_HFI_THRESH_EGR_SDMA_SQ_PHI2 65536 /* Eager Phi2 non-blocking */ #define MQ_HFI_THRESH_RNDV_PHI2 200000 diff --git a/psm_context.c b/psm_context.c index 48f4671..6b9a8ae 100644 --- a/psm_context.c +++ b/psm_context.c @@ -223,8 +223,12 @@ psmi_create_and_open_affinity_shm(psm2_uuid_t const job_key) } ret = ftruncate(shm_fd, AFFINITY_SHMEMSIZE); - if ( ret < 0 ) + if ( ret < 0 ) { + _HFI_VDBG("Cannot truncate affinity shared mem fd:%s, errno=%d\n", + affinity_shm_name, errno); + if (shm_fd >= 0) close(shm_fd); return ret; + } shared_affinity_ptr = (uint64_t *) mmap(NULL, AFFINITY_SHMEMSIZE, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0); @@ -529,10 +533,20 @@ psmi_context_open(const psm2_ep_t ep, long unit_param, long port, context->ep = (psm2_ep_t) ep; -#ifdef PSM_CUDA /* Check backward compatibility bits here and save the info */ if (psmi_hal_has_cap(PSM_HAL_CAP_GPUDIRECT_OT)) + { +#ifdef PSM_CUDA is_driver_gpudirect_enabled = 1; +#else + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "FATAL ERROR: " + "CUDA version of hfi1 driver is loaded with non-CUDA version of " + "psm2 library.\n"); +#endif + } +#ifdef PSM_CUDA + else + fprintf(stderr,"WARNING: running CUDA version of libpsm2 with non CUDA version of hfi1 
driver.\n"); #endif _HFI_VDBG("hfi_userinit() passed.\n"); diff --git a/psm_context.h b/psm_context.h index d152a7f..c9387d1 100644 --- a/psm_context.h +++ b/psm_context.h @@ -82,7 +82,6 @@ struct psmi_context { psm2_ep_t ep; /* psm ep handle */ psm2_epid_t epid; /* psm integral ep id */ - uint32_t rcvthread_flags; psm2_error_t status_lasterr; time_t networkLostTime; } psmi_context_t; diff --git a/psm_diags.c b/psm_diags.c index 2a43c22..8b4ba8a 100644 --- a/psm_diags.c +++ b/psm_diags.c @@ -322,22 +322,23 @@ int memcpy_check_size(memcpy_fn_t fn, int *p, int *f, size_t n) if (USE_MALLOC) { src = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size); dst = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size); - if (src == NULL || dst == NULL) - if (src) - psmi_free(src); - if (dst) - psmi_free(dst); - return -1; + if (src == NULL || dst == NULL) { + if (src) psmi_free(src); + if (dst) psmi_free(dst); + return -1; + } } else { - void *src_p, *dst_p; + void *src_p = NULL, *dst_p = NULL; if (posix_memalign(&src_p, 64, size) != 0 || - posix_memalign(&dst_p, 64, size) != 0) + posix_memalign(&dst_p, 64, size) != 0) { + if (src_p) free(src_p); + if (dst_p) free(dst_p); return -1; - else { - src = (uint8_t *) src_p; - dst = (uint8_t *) dst_p; } + src = (uint8_t *) src_p; + dst = (uint8_t *) dst_p; } + int src_align, dst_align; for (src_align = 0; src_align < num_aligns; src_align++) { for (dst_align = 0; dst_align < num_aligns; dst_align++) { @@ -356,7 +357,12 @@ int memcpy_check_size(memcpy_fn_t fn, int *p, int *f, size_t n) } } } - psmi_free(src); - psmi_free(dst); + if (USE_MALLOC) { + psmi_free(src); + psmi_free(dst); + } else { + free(src); + free(dst); + } return 0; } diff --git a/psm_ep.c b/psm_ep.c index d78431d..8c4fe5e 100644 --- a/psm_ep.c +++ b/psm_ep.c @@ -57,6 +57,7 @@ #include #include /* cpu_set */ #include /* isalpha */ +#include #include "psm_user.h" #include "psm2_hal.h" @@ -71,6 +72,8 @@ */ psm2_ep_t psmi_opened_endpoint = NULL; int psmi_opened_endpoint_count = 0; +static uint16_t *hfi_lids; +static uint32_t nlids; static psm2_error_t psmi_ep_open_device(const psm2_ep_t ep, const struct psm2_ep_open_opts *opts, @@ -297,8 +300,6 @@ static psm2_error_t psmi_ep_devlids(uint16_t **lids, uint32_t *num_lids_o, uint64_t my_gid_hi, uint64_t my_gid_lo) { - static uint16_t *hfi_lids; - static uint32_t nlids; uint32_t num_units; int i; psm2_error_t err = PSM2_OK; @@ -863,10 +864,6 @@ __psm2_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, goto fail; } - /* Set environment variable if PSM is not allowed to set affinity */ - if (opts.affinity == PSM2_EP_OPEN_AFFINITY_SKIP) - setenv("HFI_NO_CPUAFFINITY", "1", 1); - /* Allocate end point structure storage */ ptl_sizes = (psmi_device_is_enabled(devid_enabled, PTL_DEVID_SELF) ? 
@@ -922,6 +919,10 @@ __psm2_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, &envvar_val); ep->yield_spin_cnt = envvar_val.e_uint; + /* Set skip_affinity flag if PSM is not allowed to set affinity */ + if (opts.affinity == PSM2_EP_OPEN_AFFINITY_SKIP) + ep->skip_affinity = true; + ptl_sizes = 0; amsh_ptl = ips_ptl = self_ptl = NULL; if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { @@ -1179,6 +1180,8 @@ psm2_error_t __psm2_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in) PSMI_LOCK(psmi_creation_lock); + psmi_am_fini_internal(ep); + if (psmi_opened_endpoint == NULL) { err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED, "PSM Endpoint is closed or does not exist"); @@ -1322,6 +1325,7 @@ psm2_error_t __psm2_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in) if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) psmi_context_close(&ep->context); + psmi_epid_remove_all(ep); psmi_free(ep->epaddr); psmi_free(ep->context_mylabel); @@ -1332,9 +1336,17 @@ psm2_error_t __psm2_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in) } while ((err == PSM2_OK || err == PSM2_TIMEOUT) && tmp != ep); - if (mmq) - err = psmi_mq_free(mmq); + if (mmq) { + psmi_destroy_lock(&(mmq->progress_lock)); + err = psmi_mq_free(mmq); + } + if (hfi_lids) + { + psmi_free(hfi_lids); + hfi_lids = NULL; + nlids = 0; + } PSMI_UNLOCK(psmi_creation_lock); @@ -1363,7 +1375,6 @@ psmi_ep_open_device(const psm2_ep_t ep, * option affinity skip. */ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { - uint32_t rcvthread_flags; union psmi_envvar_val env_rcvthread; static int norcvthread; /* only for first rail */ @@ -1394,11 +1405,10 @@ psmi_ep_open_device(const psm2_ep_t ep, (union psmi_envvar_val)(norcvthread++ ? 0 : PSMI_RCVTHREAD_FLAGS), &env_rcvthread); - rcvthread_flags = env_rcvthread.e_uint; - /* If enabled, use the pollurg capability to implement a receive + /* If enabled, use the polling capability to implement a receive * interrupt thread that can handle urg packets */ - if (rcvthread_flags) { + if (env_rcvthread.e_uint) { psmi_hal_add_sw_status(PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD); #ifdef PSMI_PLOCK_IS_NOLOCK psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, @@ -1406,7 +1416,6 @@ psmi_ep_open_device(const psm2_ep_t ep, "with RCVTHREAD on"); #endif } - context->rcvthread_flags = rcvthread_flags; *epid = context->epid; } else if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { diff --git a/psm_ep.h b/psm_ep.h index 055573d..b526fa0 100644 --- a/psm_ep.h +++ b/psm_ep.h @@ -83,6 +83,8 @@ #define PSMI_SL_MAX 31 #define PSMI_SC_ADMIN 15 #define PSMI_VL_ADMIN 15 +#define PSMI_SC_NBITS 5 /* Number of bits in SC */ +#define PSMI_N_SCS (1 << PSMI_SC_NBITS) /* The number of SC's */ #define PSMI_EPID_PACK_V1(lid, context, subcontext, hfiunit, epid_version, rank) \ (((((uint64_t)lid)&0xffff)<<16) | \ @@ -175,6 +177,7 @@ struct psm2_ep { /* All ptl data is allocated inline below */ uint8_t ptl_base_data[0] __attribute__ ((aligned(64))); + bool skip_affinity; }; struct mqq { diff --git a/psm_error.h b/psm_error.h index f335382..cb1b4ba 100644 --- a/psm_error.h +++ b/psm_error.h @@ -65,7 +65,7 @@ #define PSMI_EP_NORETURN ((psm2_ep_t) -2) #define PSMI_EP_LOGEVENT ((psm2_ep_t) -3) -psm2_ep_errhandler_t psmi_errhandler_global; +extern psm2_ep_errhandler_t psmi_errhandler_global; psm2_error_t MOCKABLE(psmi_handle_error)(psm2_ep_t ep, psm2_error_t error, const char *buf, ...) 
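
The psm_error.h hunk above fixes a classic C pitfall: a bare "psm2_ep_errhandler_t psmi_errhandler_global;" at file scope in a header is a tentative definition, so every translation unit that includes the header defines the symbol, and linkers reject that once common symbols are disabled (the default since GCC 10's -fno-common). The canonical split, sketched:

    /* psm_error.h -- declaration only; all includers share one symbol */
    extern psm2_ep_errhandler_t psmi_errhandler_global;

    /* exactly one .c file -- the single definition */
    psm2_ep_errhandler_t psmi_errhandler_global;
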
diff --git a/psm_hal_gen1/opa_proto_gen1.c b/psm_hal_gen1/opa_proto_gen1.c index 1f2b13e..eb8bce9 100644 --- a/psm_hal_gen1/opa_proto_gen1.c +++ b/psm_hal_gen1/opa_proto_gen1.c @@ -72,7 +72,218 @@ #include -#define ALIGN(x, a) (((x)+(a)-1)&~((a)-1)) +size_t arrsz[MAPSIZE_MAX] = { 0 }; + +static int map_hfi_mem(int fd, struct _hfi_ctrl *ctrl, size_t subctxt_cnt) +{ +#define CREDITS_NUM 64 + struct hfi1_ctxt_info *cinfo = &ctrl->ctxt_info; + struct hfi1_base_info *binfo = &ctrl->base_info; + size_t sz; + __u64 off; + void *maddr; + + + /* 1. Map the PIO credits address */ + off = binfo->sc_credits_addr &~ HFI_MMAP_PGMASK; + + sz = HFI_MMAP_PGSIZE; + maddr = HFI_MMAP_ERRCHECK(fd, binfo, sc_credits_addr, sz, PROT_READ); + hfi_touch_mmap(maddr, sz); + arrsz[SC_CREDITS] = sz; + + binfo->sc_credits_addr |= off; + + + /* 2. Map the PIO buffer SOP address + * We deliberately skip the cast of cinfo->credits to size_t: an out-of-range + * credits value overflows the signed multiplication, and the negative result + * sign-extends to a very large unsigned size, which makes the HFI_MMAP_ERRCHECK() + * macro fail and give an adequate error report. TODO: Consider sanitizing the credits value explicitly + */ + sz = cinfo->credits * CREDITS_NUM; + HFI_MMAP_ERRCHECK(fd, binfo, pio_bufbase_sop, sz, PROT_WRITE); + arrsz[PIO_BUFBASE_SOP] = sz; + + + /* 3. Map the PIO buffer address */ + sz = cinfo->credits * CREDITS_NUM; + HFI_MMAP_ERRCHECK(fd, binfo, pio_bufbase, sz, PROT_WRITE); + arrsz[PIO_BUFBASE] = sz; + + + /* 4. Map the receive header queue + * (u16 * u16 -> max value 0xfffe0001) + */ + sz = (size_t)cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize; + maddr = HFI_MMAP_ERRCHECK(fd, binfo, rcvhdr_bufbase, sz, PROT_READ); + hfi_touch_mmap(maddr, sz); + arrsz[RCVHDR_BUFBASE] = sz; + + + /* 5. Map the receive eager buffer + * (u16 * u32. Assuming size_t's precision is 64 bits - no overflow) + */ + sz = (size_t)cinfo->egrtids * cinfo->rcvegr_size; + maddr = HFI_MMAP_ERRCHECK(fd, binfo, rcvegr_bufbase, sz, PROT_READ); + hfi_touch_mmap(maddr, sz); + arrsz[RCVEGR_BUFBASE] = sz; + + + /* 6. Map the sdma completion queue */ + if (cinfo->runtime_flags & HFI1_CAP_SDMA) { + sz = cinfo->sdma_ring_size * sizeof(struct hfi1_sdma_comp_entry); + HFI_MMAP_ERRCHECK(fd, binfo, sdma_comp_bufbase, sz, PROT_READ); + } else { + sz = 0; + binfo->sdma_comp_bufbase = (__u64)0; + } + arrsz[SDMA_COMP_BUFBASE] = sz; + + + /* 7. Map RXE per-context CSRs */ + sz = HFI_MMAP_PGSIZE; + HFI_MMAP_ERRCHECK(fd, binfo, user_regbase, sz, PROT_WRITE|PROT_READ); + arrsz[USER_REGBASE] = sz; + /* Set up addresses for optimized register writeback routines. + * This is for the real onchip registers, shared context or not + */ + uint64_t *regbasep = (uint64_t *)binfo->user_regbase; + ctrl->__hfi_rcvhdrtail = (volatile __le64 *)(regbasep + ur_rcvhdrtail); + ctrl->__hfi_rcvhdrhead = (volatile __le64 *)(regbasep + ur_rcvhdrhead); + ctrl->__hfi_rcvegrtail = (volatile __le64 *)(regbasep + ur_rcvegrindextail); + ctrl->__hfi_rcvegrhead = (volatile __le64 *)(regbasep + ur_rcvegrindexhead); + ctrl->__hfi_rcvofftail = (volatile __le64 *)(regbasep + ur_rcvegroffsettail); + + if (cinfo->runtime_flags & HFI1_CAP_HDRSUPP) { + ctrl->__hfi_rcvtidflow = (volatile __le64 *)(regbasep + ur_rcvtidflowtable); + ctrl->__hfi_tfvalid = 1; + } else { + ctrl->__hfi_rcvtidflow = ctrl->regs; + ctrl->__hfi_tfvalid = 0; + } + + + /* 8.
Map the rcvhdrq tail register address */ + if (cinfo->runtime_flags & HFI1_CAP_DMA_RTAIL) { + sz = HFI_MMAP_PGSIZE; + HFI_MMAP_ERRCHECK(fd, binfo, rcvhdrtail_base, sz, PROT_READ); + } else { + /* We don't use receive header queue tail register to detect new packets, + * but here we save the address for false-eager-full recovery + */ + sz = 0; + /* This points inside the previously established mapping (user_regbase). Don't munmap()! */ + binfo->rcvhdrtail_base = (uint64_t) (uintptr_t) ctrl->__hfi_rcvhdrtail; + } + ctrl->__hfi_rcvtail = (__le64 *)binfo->rcvhdrtail_base; + arrsz[RCVHDRTAIL_BASE] = sz; + + + /* 9. Map the event page */ + off = binfo->events_bufbase &~ HFI_MMAP_PGMASK; + + sz = HFI_MMAP_PGSIZE; + HFI_MMAP_ERRCHECK(fd, binfo, events_bufbase, sz, PROT_READ); + arrsz[EVENTS_BUFBASE] = sz; + /* keep the offset in the address */ + binfo->events_bufbase |= off; + + + /* 10. Map the status page */ + sz = HFI_MMAP_PGSIZE; + HFI_MMAP_ERRCHECK(fd, binfo, status_bufbase, sz, PROT_READ); + arrsz[STATUS_BUFBASE] = sz; + + + if (!subctxt_cnt) + return 0; + + /* 11. If subcontext is used, map the buffers */ + const char *errstr = "Incorrect input values for the subcontext"; + size_t factor; + + /* 11a) subctxt_uregbase */ + sz = HFI_MMAP_PGSIZE; + maddr = HFI_MMAP_ERRCHECK(fd, binfo, subctxt_uregbase, sz, PROT_READ|PROT_WRITE); + hfi_touch_mmap(maddr, sz); + arrsz[SUBCTXT_UREGBASE] = sz; + + + /* 11b) subctxt_rcvhdrbuf + * u16 * u16. Prevent promotion to int through an explicit cast to size_t + */ + factor = (size_t)cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize; + factor = ALIGN(factor, HFI_MMAP_PGSIZE); + sz = factor * subctxt_cnt; + maddr = HFI_MMAP_ERRCHECK(fd, binfo, subctxt_rcvhdrbuf, sz, PROT_READ|PROT_WRITE); + hfi_touch_mmap(maddr, sz); + arrsz[SUBCTXT_RCVHDRBUF] = sz; + + + /* 11c) subctxt_rcvegrbuf + * u16 * u32.
Assuming size_t's precision to be 64 bits (no overflow) + */ + factor = (size_t)cinfo->egrtids * cinfo->rcvegr_size; + factor = ALIGN(factor, HFI_MMAP_PGSIZE); + sz = factor * subctxt_cnt; + if (sz / subctxt_cnt != factor) { + _HFI_INFO("%s (rcvegrbuf)\n", errstr); + goto err_int_overflow_subctxt_rcvegrbuf; + } + maddr = HFI_MMAP_ERRCHECK(fd, binfo, subctxt_rcvegrbuf, sz, PROT_READ|PROT_WRITE); + hfi_touch_mmap(maddr, sz); + arrsz[SUBCTXT_RCVEGRBUF] = sz; + + return 0; + +err_int_overflow_subctxt_rcvegrbuf: +err_mmap_subctxt_rcvegrbuf: + /* if we got here, subctxt_cnt must be != 0 */ + HFI_MUNMAP_ERRCHECK(binfo, subctxt_rcvhdrbuf, arrsz[SUBCTXT_RCVHDRBUF]); + +err_mmap_subctxt_rcvhdrbuf: + /* if we got here, subctxt_cnt must be != 0 */ + HFI_MUNMAP_ERRCHECK(binfo, subctxt_uregbase, arrsz[SUBCTXT_UREGBASE]); + +err_mmap_subctxt_uregbase: + HFI_MUNMAP_ERRCHECK(binfo, status_bufbase, arrsz[STATUS_BUFBASE]); + +err_mmap_status_bufbase: + HFI_MUNMAP_ERRCHECK(binfo, events_bufbase, arrsz[EVENTS_BUFBASE]); + +err_mmap_events_bufbase: + if(cinfo->runtime_flags & HFI1_CAP_DMA_RTAIL) { + HFI_MUNMAP_ERRCHECK(binfo, rcvhdrtail_base, arrsz[RCVHDRTAIL_BASE]); + } + +err_mmap_rcvhdrtail_base: + HFI_MUNMAP_ERRCHECK(binfo, user_regbase, arrsz[USER_REGBASE]); + +err_mmap_user_regbase: + /* the condition could be: if(cinfo->runtime_flags & HFI1_CAP_SDMA) too */ + if(binfo->sdma_comp_bufbase != 0) { + HFI_MUNMAP_ERRCHECK(binfo, sdma_comp_bufbase, arrsz[SDMA_COMP_BUFBASE]); + } + +err_mmap_sdma_comp_bufbase: + HFI_MUNMAP_ERRCHECK(binfo, rcvegr_bufbase, arrsz[RCVEGR_BUFBASE]); + +err_mmap_rcvegr_bufbase: + HFI_MUNMAP_ERRCHECK(binfo, rcvhdr_bufbase, arrsz[RCVHDR_BUFBASE]); + +err_mmap_rcvhdr_bufbase: + HFI_MUNMAP_ERRCHECK(binfo, pio_bufbase, arrsz[PIO_BUFBASE]); + +err_mmap_pio_bufbase: + HFI_MUNMAP_ERRCHECK(binfo, pio_bufbase_sop, arrsz[PIO_BUFBASE_SOP]); + +err_mmap_pio_bufbase_sop: + HFI_MUNMAP_ERRCHECK(binfo, sc_credits_addr, arrsz[SC_CREDITS]); + +err_mmap_sc_credits_addr: + return -1; +} /* It is allowed to have multiple devices (and of different types) simultaneously opened and initialized, although this (still! Oct 07) @@ -82,15 +293,13 @@ struct _hfi_ctrl *. The struct _hfi_ctrl * used for everything else is returned as part of hfi1_base_info. */ -struct _hfi_ctrl *hfi_userinit(int fd, struct hfi1_user_info_dep *uinfo) +struct _hfi_ctrl *hfi_userinit_internal(int fd, bool skip_affinity, + struct hfi1_user_info_dep *uinfo) { struct _hfi_ctrl *spctrl = NULL; struct hfi1_ctxt_info *cinfo; struct hfi1_base_info *binfo; - void *tmp; - uint64_t *tmp64; struct hfi1_cmd c; - uintptr_t pg_mask; int __hfi_pg_sz; #ifdef PSM2_SUPPORT_IW_CMD_API /* for major version 6 of driver, we will use uinfo_new. See below for details.
*/ @@ -99,12 +308,11 @@ struct _hfi_ctrl *hfi_userinit(int fd, struct hfi1_user_info_dep *uinfo) /* First get the page size */ __hfi_pg_sz = sysconf(_SC_PAGESIZE); - pg_mask = ~(intptr_t) (__hfi_pg_sz - 1); if (!(spctrl = calloc(1, sizeof(struct _hfi_ctrl)))) { _HFI_INFO("can't allocate memory for hfi_ctrl: %s\n", strerror(errno)); - goto err; + goto err_calloc_hfi_ctrl; } cinfo = &spctrl->ctxt_info; binfo = &spctrl->base_info; @@ -157,7 +365,7 @@ struct _hfi_ctrl *hfi_userinit(int fd, struct hfi1_user_info_dep *uinfo) _HFI_INFO("assign_context command failed: %s\n", strerror(errno)); } - goto err; + goto err_hfi_cmd_assign_ctxt; } #ifdef PSM2_SUPPORT_IW_CMD_API @@ -180,37 +388,37 @@ struct _hfi_ctrl *hfi_userinit(int fd, struct hfi1_user_info_dep *uinfo) if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) { _HFI_INFO("CTXT_INFO command failed: %s\n", strerror(errno)); - goto err; + goto err_hfi_cmd_ctxt_info; } /* sanity checking... */ if (cinfo->rcvtids%8) { _HFI_INFO("rcvtids not 8 multiple: %d\n", cinfo->rcvtids); - goto err; + goto err_sanity_check; } if (cinfo->egrtids%8) { _HFI_INFO("egrtids not 8 multiple: %d\n", cinfo->egrtids); - goto err; + goto err_sanity_check; } if (cinfo->rcvtids < cinfo->egrtids) { _HFI_INFO("rcvtids(%d) < egrtids(%d)\n", cinfo->rcvtids, cinfo->egrtids); - goto err; + goto err_sanity_check; } if (cinfo->rcvhdrq_cnt%32) { _HFI_INFO("rcvhdrq_cnt not 32 multiple: %d\n", cinfo->rcvhdrq_cnt); - goto err; + goto err_sanity_check; } if (cinfo->rcvhdrq_entsize%64) { _HFI_INFO("rcvhdrq_entsize not 64 multiple: %d\n", cinfo->rcvhdrq_entsize); - goto err; + goto err_sanity_check; } if (cinfo->rcvegr_size%__hfi_pg_sz) { _HFI_INFO("rcvegr_size not page multiple: %d\n", cinfo->rcvegr_size); - goto err; + goto err_sanity_check; } _HFI_VDBG("ctxtinfo: runtime_flags %llx, rcvegr_size %d\n", @@ -227,8 +435,10 @@ struct _hfi_ctrl *hfi_userinit(int fd, struct hfi1_user_info_dep *uinfo) cinfo->egrtids, cinfo->sdma_ring_size); /* if affinity has not been setup, set it */ - if ((!getenv("HFI_NO_CPUAFFINITY") && cinfo->rec_cpu != (__u16) -1) || - getenv("HFI_FORCE_CPUAFFINITY")) { + if (getenv("HFI_FORCE_CPUAFFINITY") || + (cinfo->rec_cpu != (__u16) -1 && + !(getenv("HFI_NO_CPUAFFINITY") || skip_affinity))) + { cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(cinfo->rec_cpu, &cpuset); @@ -240,7 +450,6 @@ struct _hfi_ctrl *hfi_userinit(int fd, struct hfi1_user_info_dep *uinfo) } } - /* 4. 
Get user base info from driver */ c.type = PSMI_HFI_CMD_USER_INFO; c.len = sizeof(*binfo); @@ -248,7 +457,7 @@ struct _hfi_ctrl *hfi_userinit(int fd, struct hfi1_user_info_dep *uinfo) if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) { _HFI_INFO("BASE_INFO command failed: %s\n", strerror(errno)); - goto err; + goto err_hfi_cmd_user_info; } hfi_set_user_version(binfo->sw_version); @@ -276,272 +485,15 @@ struct _hfi_ctrl *hfi_userinit(int fd, struct hfi1_user_info_dep *uinfo) ("User major version 0x%x not same as driver major 0x%x\n", hfi_get_user_major_version(), binfo->sw_version >> HFI1_SWMAJOR_SHIFT); if ((binfo->sw_version >> HFI1_SWMAJOR_SHIFT) < hfi_get_user_major_version()) - goto err; /* else assume driver knows how to be compatible */ + goto err_version_mismatch; /* else assume driver knows how to be compatible */ } else if ((binfo->sw_version & 0xffff) != HFI1_USER_SWMINOR) { _HFI_PRDBG ("User minor version 0x%x not same as driver minor 0x%x\n", HFI1_USER_SWMINOR, binfo->sw_version & 0xffff); } - /* Map the PIO credits address */ - if ((tmp = hfi_mmap64(0, __hfi_pg_sz, - PROT_READ, MAP_SHARED | MAP_LOCKED, fd, - (__off64_t) binfo->sc_credits_addr & - pg_mask)) == MAP_FAILED) { - _HFI_INFO("mmap of sc_credits_addr (%llx) failed: %s\n", - (unsigned long long)binfo->sc_credits_addr, - strerror(errno)); - goto err; - } else { - hfi_touch_mmap(tmp, __hfi_pg_sz); - binfo->sc_credits_addr = (uint64_t) (uintptr_t) tmp | - (binfo->sc_credits_addr & ~pg_mask); - _HFI_VDBG("sc_credits_addr %llx\n", - binfo->sc_credits_addr); - } - - /* Map the PIO buffer SOP address */ - if ((tmp = hfi_mmap64(0, cinfo->credits * 64, - PROT_WRITE, MAP_SHARED | MAP_LOCKED, fd, - (__off64_t) binfo->pio_bufbase_sop & pg_mask)) - == MAP_FAILED) { - _HFI_INFO("mmap of pio buffer sop at %llx failed: %s\n", - (unsigned long long)binfo->pio_bufbase_sop, - strerror(errno)); - goto err; - } else { - /* Do not try to read the PIO buffers; they are mapped write */ - /* only. We'll fault them in as we write to them. */ - binfo->pio_bufbase_sop = (uintptr_t) tmp; - _HFI_VDBG("pio_bufbase_sop %llx\n", - binfo->pio_bufbase_sop); - } - - /* Map the PIO buffer address */ - if ((tmp = hfi_mmap64(0, cinfo->credits * 64, - PROT_WRITE, MAP_SHARED | MAP_LOCKED, fd, - (__off64_t) binfo->pio_bufbase & pg_mask)) == - MAP_FAILED) { - _HFI_INFO("mmap of pio buffer at %llx failed: %s\n", - (unsigned long long)binfo->pio_bufbase, - strerror(errno)); - goto err; - } else { - /* Do not try to read the PIO buffers; they are mapped write */ - /* only. We'll fault them in as we write to them. 
*/ - binfo->pio_bufbase = (uintptr_t) tmp; - _HFI_VDBG("sendpio_bufbase %llx\n", binfo->pio_bufbase); - } - - /* Map the receive header queue */ - if ((tmp = - hfi_mmap64(0, cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize, - PROT_READ, MAP_SHARED | MAP_LOCKED, fd, - (__off64_t) binfo->rcvhdr_bufbase & pg_mask)) == - MAP_FAILED) { - _HFI_INFO("mmap of rcvhdrq at %llx failed: %s\n", - (unsigned long long)binfo->rcvhdr_bufbase, - strerror(errno)); - goto err; - } else { - /* for use in protocol code */ - hfi_touch_mmap(tmp, - cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize); - binfo->rcvhdr_bufbase = (uintptr_t) tmp; /* set to mapped address */ - _HFI_VDBG("rcvhdr_bufbase %llx\n", binfo->rcvhdr_bufbase); - } - - /* Map the receive eager buffer */ - if ((tmp = - hfi_mmap64(0, cinfo->egrtids * cinfo->rcvegr_size, - PROT_READ, MAP_SHARED | MAP_LOCKED, fd, - (__off64_t) binfo->rcvegr_bufbase & pg_mask)) == - MAP_FAILED) { - _HFI_INFO("mmap of rcvegrq bufs from %llx failed: %s\n", - (unsigned long long)binfo->rcvegr_bufbase, - strerror(errno)); - goto err; - } else { - hfi_touch_mmap(tmp, cinfo->egrtids * cinfo->rcvegr_size); - binfo->rcvegr_bufbase = (uint64_t) (uintptr_t) tmp; - _HFI_VDBG("rcvegr_bufbase %llx\n", binfo->rcvegr_bufbase); - } - - /* Map the sdma completion queue */ - if (!(cinfo->runtime_flags & HFI1_CAP_SDMA)) { - binfo->sdma_comp_bufbase = 0; - } else - if ((tmp = - hfi_mmap64(0, cinfo->sdma_ring_size * - sizeof(struct hfi1_sdma_comp_entry), - PROT_READ, MAP_SHARED | MAP_LOCKED, fd, - (__off64_t) binfo->sdma_comp_bufbase & pg_mask)) == - MAP_FAILED) { - _HFI_INFO - ("mmap of sdma completion queue from %llx failed: %s\n", - (unsigned long long)binfo->sdma_comp_bufbase, - strerror(errno)); - goto err; - } else { - binfo->sdma_comp_bufbase = (uint64_t) (uintptr_t) tmp; - } - _HFI_VDBG("sdma_comp_bufbase %llx\n", binfo->sdma_comp_bufbase); - - /* Map RXE per-context CSRs */ - if ((tmp = hfi_mmap64(0, __hfi_pg_sz, - PROT_WRITE | PROT_READ, MAP_SHARED | MAP_LOCKED, - fd, - (__off64_t) binfo->user_regbase & pg_mask)) == - MAP_FAILED) { - _HFI_INFO("mmap of user registers at %llx failed: %s\n", - (unsigned long long)binfo->user_regbase, - strerror(errno)); - goto err; - } else { - /* we don't try to fault these in, no need */ - binfo->user_regbase = (uint64_t) (uintptr_t) tmp; - _HFI_VDBG("user_regbase %llx\n", binfo->user_regbase); - } - - /* - * Set up addresses for optimized register writeback routines. - * This is for the real onchip registers, shared context or not - */ - tmp64 = (uint64_t *) tmp; - spctrl->__hfi_rcvhdrtail = (volatile __le64 *)&tmp64[ur_rcvhdrtail]; - spctrl->__hfi_rcvhdrhead = (volatile __le64 *)&tmp64[ur_rcvhdrhead]; - spctrl->__hfi_rcvegrtail = - (volatile __le64 *)&tmp64[ur_rcvegrindextail]; - spctrl->__hfi_rcvegrhead = - (volatile __le64 *)&tmp64[ur_rcvegrindexhead]; - spctrl->__hfi_rcvofftail = - (volatile __le64 *)&tmp64[ur_rcvegroffsettail]; - - if (!(cinfo->runtime_flags & HFI1_CAP_HDRSUPP)) { - spctrl->__hfi_rcvtidflow = spctrl->regs; - spctrl->__hfi_tfvalid = 0; - } else { - spctrl->__hfi_rcvtidflow = - (volatile __le64 *)&tmp64[ur_rcvtidflowtable]; - spctrl->__hfi_tfvalid = 1; - } - - /* Map the rcvhdrq tail register address */ - if (!(cinfo->runtime_flags & HFI1_CAP_DMA_RTAIL)) { - /* - * We don't use receive header queue tail register to detect - * new packets, but here we save the address for - * false-eager-full recovery. 
- */ - binfo->rcvhdrtail_base = - (uint64_t) (uintptr_t) spctrl->__hfi_rcvhdrtail; - spctrl->__hfi_rcvtail = (__le64 *) binfo->rcvhdrtail_base; - } else - if ((tmp = hfi_mmap64(0, __hfi_pg_sz, - PROT_READ, MAP_SHARED | MAP_LOCKED, fd, - (__off64_t) binfo->rcvhdrtail_base & - pg_mask)) == MAP_FAILED) { - _HFI_INFO("mmap of rcvhdrq tail addr %llx failed: %s\n", - (unsigned long long)binfo->rcvhdrtail_base, - strerror(errno)); - goto err; - } else { - hfi_touch_mmap(tmp, __hfi_pg_sz); - binfo->rcvhdrtail_base = (uint64_t) (uintptr_t) tmp; - spctrl->__hfi_rcvtail = (__le64 *) binfo->rcvhdrtail_base; - } - _HFI_VDBG("rcvhdr_tail_addr %llx\n", binfo->rcvhdrtail_base); - - /* Map the event page */ - if ((tmp = hfi_mmap64(0, __hfi_pg_sz, - PROT_READ, MAP_SHARED | MAP_LOCKED, fd, - (__off64_t) binfo->events_bufbase & pg_mask)) == - MAP_FAILED) { - _HFI_INFO("mmap of status page at %llx failed: %s\n", - (unsigned long long)binfo->events_bufbase, - strerror(errno)); - goto err; - } else { - binfo->events_bufbase = (uint64_t) (uintptr_t) tmp | - (binfo->events_bufbase & ~pg_mask); - _HFI_VDBG("events_bufbase %llx\n", binfo->events_bufbase); - } - - /* Map the status page */ - if ((tmp = hfi_mmap64(0, __hfi_pg_sz, - PROT_READ, MAP_SHARED | MAP_LOCKED, fd, - (__off64_t) binfo->status_bufbase & pg_mask)) == - MAP_FAILED) { - _HFI_INFO("mmap of status page (%llx) failed: %s\n", - (unsigned long long)binfo->status_bufbase, - strerror(errno)); - goto err; - } else { - binfo->status_bufbase = (uintptr_t) tmp; - _HFI_VDBG("status_bufbase %llx\n", binfo->status_bufbase); - } - - /* If subcontext is used, map the buffers */ - if (uinfo->subctxt_cnt) { - unsigned num_subcontexts = uinfo->subctxt_cnt; - size_t size; - - size = __hfi_pg_sz; - if ((tmp = hfi_mmap64(0, size, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_LOCKED, fd, - (__off64_t) binfo->subctxt_uregbase & - pg_mask)) == MAP_FAILED) { - _HFI_INFO - ("mmap of subcontext uregbase array (%llx) failed: %s\n", - (unsigned long long)binfo->subctxt_uregbase, - strerror(errno)); - goto err; - } else { - hfi_touch_mmap(tmp, size); - binfo->subctxt_uregbase = (uint64_t) (uintptr_t) tmp; - _HFI_VDBG("subctxt_uregbase %llx\n", - binfo->subctxt_uregbase); - } - - size = ALIGN(cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize, - __hfi_pg_sz) * num_subcontexts; - if ((tmp = hfi_mmap64(0, size, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_LOCKED, fd, - (__off64_t) binfo->subctxt_rcvhdrbuf & - pg_mask)) == MAP_FAILED) { - _HFI_INFO - ("mmap of subcontext rcvhdr_base array (%llx) failed: %s\n", - (unsigned long long)binfo->subctxt_rcvhdrbuf, - strerror(errno)); - goto err; - } else { - hfi_touch_mmap(tmp, size); - binfo->subctxt_rcvhdrbuf = (uint64_t) (uintptr_t) tmp; - _HFI_VDBG("subctxt_rcvhdrbuf %llx\n", - binfo->subctxt_rcvhdrbuf); - } - - size = ALIGN(cinfo->egrtids * cinfo->rcvegr_size, - __hfi_pg_sz) * num_subcontexts; - if ((tmp = hfi_mmap64(0, size, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_LOCKED, fd, - (__off64_t) binfo->subctxt_rcvegrbuf & - pg_mask)) == MAP_FAILED) { - _HFI_INFO - ("mmap of subcontext rcvegrbuf array (%llx) failed: %s\n", - (unsigned long long)binfo->subctxt_rcvegrbuf, - strerror(errno)); - goto err; - } else { - hfi_touch_mmap(tmp, size); - binfo->subctxt_rcvegrbuf = (uint64_t) (uintptr_t) tmp; - _HFI_VDBG("subctxt_rcvegrbuf %llx\n", - binfo->subctxt_rcvegrbuf); - } - } + if (map_hfi_mem(fd, spctrl, uinfo->subctxt_cnt) == -1) + goto err_map_hfi_mem; /* Save some info. 
*/ spctrl->fd = fd; @@ -560,8 +512,32 @@ struct _hfi_ctrl *hfi_userinit(int fd, struct hfi1_user_info_dep *uinfo) return spctrl; -err: - if (spctrl) - free(spctrl); +err_map_hfi_mem: +err_version_mismatch: +err_hfi_cmd_user_info: + /* TODO: restore the original CPU affinity? */ + +err_sanity_check: +err_hfi_cmd_ctxt_info: + /* TODO: ioctl de-assign context here? */ + // without de-assigning the context, all subsequent hfi_userinit_internal() + // calls are going to fail + _HFI_ERROR("An unrecoverable error occurred while communicating with the driver\n"); + abort(); /* TODO: or do we want to include psm_user.h to use psmi_handle_error()? */ + // no recovery here + + /* if we failed to allocate memory or to assign the context, we might still recover from this. + * Returning NULL will cause the function to be reinvoked n times. Do we really want this + * behavior? + */ +err_hfi_cmd_assign_ctxt: + free(spctrl); + +err_calloc_hfi_ctrl: return NULL; } + +struct _hfi_ctrl *hfi_userinit(int fd, struct hfi1_user_info_dep *uinfo) +{ + return hfi_userinit_internal(fd, false, uinfo); +} diff --git a/psm_hal_gen1/opa_service_gen1.c b/psm_hal_gen1/opa_service_gen1.c index e4719e3..641e262 100644 --- a/psm_hal_gen1/opa_service_gen1.c +++ b/psm_hal_gen1/opa_service_gen1.c @@ -89,84 +89,6 @@ static sw_version_t sw_version = } }; -/* - * This function is necessary in a udev-based world. There can be an - * arbitrarily long (but typically less than one second) delay between - * a driver getting loaded and any dynamic special files turning up. - * - * The timeout is in milliseconds. A value of zero means "callee - * decides timeout". Negative is infinite. - * - * Returns 0 on success, -1 on error or timeout. Check errno to see - * whether there was a timeout (ETIMEDOUT) or an error (any other - * non-zero value). 
- */ -int hfi_wait_for_device(const char *path, long timeout) -{ - int saved_errno; - struct stat st; - long elapsed; - int ret; - - if (timeout == 0) - timeout = 15000; - - elapsed = 0; - - while (1) { - static const long default_ms = 250; - struct timespec req = { 0 }; - long ms; - - ret = stat(path, &st); - saved_errno = errno; - - if (ret == 0 || (ret == -1 && errno != ENOENT)) - break; - - if ((timeout > 0) && ((timeout - elapsed) <= 0)) { - saved_errno = ETIMEDOUT; - break; - } - - if (elapsed == 0) { - if (timeout < 0) - _HFI_DBG - ("Device file %s not present on first check; " - "waiting indefinitely...\n", path); - else - _HFI_DBG - ("Device file %s not present on first check; " - "waiting up to %.1f seconds...\n", path, - timeout / 1e3); - } - - if (timeout < 0 || timeout - elapsed >= default_ms) - ms = default_ms; - else - ms = timeout; - - elapsed += ms; - req.tv_nsec = ms * 1000000; - - ret = nanosleep(&req, NULL); - saved_errno = errno; - - if (ret == -1) - break; - } - - if (ret == 0) - _HFI_DBG("Found %s after %.1f seconds\n", path, elapsed / 1e3); - else - _HFI_INFO - ("The %s device failed to appear after %.1f seconds: %s\n", - path, elapsed / 1e3, strerror(saved_errno)); - - errno = saved_errno; - return ret; -} - /* fwd declaration */ ustatic int _hfi_cmd_write(int fd, struct hfi1_cmd *cmd, size_t count); @@ -223,13 +145,6 @@ int hfi_context_open_ex(int unit, int port, uint64_t open_timeout, snprintf(dev_name, dev_name_len, "%s_%u", HFI_DEVICE_PATH_GEN1, 0); - if (hfi_wait_for_device(dev_name, (long)open_timeout) == -1) { - _HFI_DBG("Could not find an HFI Unit on device " - "%s (%lds elapsed)", dev_name, - (long)open_timeout / 1000); - return -1; - } - if ((fd = open(dev_name, O_RDWR)) == -1) { _HFI_DBG("(host:Can't open %s for reading and writing", dev_name); @@ -397,7 +312,7 @@ void *hfi_mmap64(void *addr, size_t length, int prot, int flags, int fd, /* that a working chip has been found for each possible unit #. */ /* number of units >=0 (0 means none found). */ /* formerly used sysfs file "num_units" */ -int hfi_get_num_units(int wait) +int hfi_get_num_units(void) { int ret; @@ -407,12 +322,7 @@ int hfi_get_num_units(int wait) int r; snprintf(pathname, sizeof(pathname), HFI_DEVICE_PATH_GEN1 "_%d", ret); - if (wait && (ret == 0)) - /* We only wait for the first device to come up. Not - on subsequent devices in order to save time. */ - r = hfi_wait_for_device(pathname, 0); - else - r = stat(pathname, &st); + r = stat(pathname, &st); if (!r) continue; else @@ -443,14 +353,14 @@ int hfi_get_unit_active(int unit) /* get the number of contexts from the unit id. */ /* Returns 0 if no unit or no match. */ -int hfi_get_num_contexts(int unit_id, int wait) +int hfi_get_num_contexts(int unit_id) { int n = 0; int units; int64_t val; uint32_t p = HFI_MIN_PORT; - units = hfi_get_num_units(wait); + units = hfi_get_num_units(); if_pf(units <= 0) return 0; diff --git a/psm_hal_gen1/opa_service_gen1.h b/psm_hal_gen1/opa_service_gen1.h index 9bce8ca..6e18e57 100644 --- a/psm_hal_gen1/opa_service_gen1.h +++ b/psm_hal_gen1/opa_service_gen1.h @@ -173,21 +173,16 @@ int hfi_get_port_index2pkey(int unit, int port, int index); /* Get the number of units supported by the driver. Does not guarantee that a working chip has been found for each possible unit #. - When the parameter 'wait' is non-zero, the code will wait briefly as - the driver may be coming up. If 'wait' is zero, the function does not wait. Returns -1 with errno set, or number of units >=0 (0 means none found). 
*/ -int hfi_get_num_units(int wait); +int hfi_get_num_units(void); /* Given a unit number, returns 1 if any port on the unit is active. returns 0 if no port on the unit is active. returns -1 when an error occurred. */ int hfi_get_unit_active(int unit); -/* get the number of contexts from the unit id. - When the parameter 'wait' is non-zero, the code will wait briefly as - the driver may be coming up. If 'wait' is zero, the function does not wait. - Returns 0 if no unit or no match. */ -int hfi_get_num_contexts(int unit, int wait); +/* get the number of contexts from the unit id. */ +int hfi_get_num_contexts(int unit); /* Open hfi device file, return -1 on error. */ int hfi_context_open(int unit, int port, uint64_t open_timeout); @@ -242,9 +237,6 @@ int hfi_get_ctrs_port_names(int unitno, char **namep); /* sysfs helper routines (only those currently used are exported; * try to avoid using others) */ -/* Initializes the following sysfs helper routines. */ -void sysfs_init(const char *dflt_hfi_class_path); - const char *hfi_sysfs_path(void); /* read a string value */ @@ -285,10 +277,6 @@ int hfi_hfifs_unit_rd(uint32_t unit, const char *, void *, int); int hfi_hfifs_open(const char *relname, int flags); -/* wait for device special file to show up. timeout is in - * milliseconds, 0 is "callee knows best", < 0 is infinite. */ -int hfi_wait_for_device(const char *path, long timeout); - int hfi_cmd_wait_for_packet(int fd); #endif /* OPA_SERVICE_GEN1_H */ diff --git a/psm_hal_gen1/opa_user_gen1.h b/psm_hal_gen1/opa_user_gen1.h index 9731b2b..adb120d 100644 --- a/psm_hal_gen1/opa_user_gen1.h +++ b/psm_hal_gen1/opa_user_gen1.h @@ -77,6 +77,7 @@ #include #include #include +#include #include "opa_intf.h" #include "opa_common_gen1.h" #include "opa_byteorder.h" @@ -149,6 +150,82 @@ struct hfi_pbc { __u16 fill1; }; +typedef enum mapsize +{ SC_CREDITS, + PIO_BUFBASE_SOP, + PIO_BUFBASE, + RCVHDR_BUFBASE, + RCVEGR_BUFBASE, + SDMA_COMP_BUFBASE, + USER_REGBASE, + RCVHDRTAIL_BASE, + EVENTS_BUFBASE, + STATUS_BUFBASE, + SUBCTXT_UREGBASE, + SUBCTXT_RCVHDRBUF, + SUBCTXT_RCVEGRBUF, + MAPSIZE_MAX +} mapsize_t; + +/* TODO: consider casting in the ALIGN() macro */ +#define ALIGN(x, a) (((x)+(a)-1)&~((a)-1)) +#define ALIGNDOWN_PTR(x, a) ((void*)(((uintptr_t)(x))&~((uintptr_t)((a)-1)))) + +/* using the same flags for all the mappings */ +#define HFI_MMAP_FLAGS (MAP_SHARED|MAP_LOCKED) +#define HFI_MMAP_PGSIZE sysconf(_SC_PAGESIZE) +/* cast to uintptr_t as opposed to intptr_t which evaluates to a signed type + * on which one should not perform bitwise operations (undefined behavior) + */ +#define HFI_MMAP_PGMASK (~(uintptr_t)(HFI_MMAP_PGSIZE-1)) + +/* this is only an auxiliary macro for HFI_MMAP_ERRCHECK() + * @off expected to be unsigned in order to AND with the page mask and avoid undefined behavior + */ +#define U64_TO_OFF64_PGMASK(off) ((__off64_t)((off) & HFI_MMAP_PGMASK)) + +#define HFI_MMAP_ALIGNOFF(fd, off, size, prot) hfi_mmap64(0,(size),(prot),HFI_MMAP_FLAGS,(fd),U64_TO_OFF64_PGMASK((off))) +/* complementary */ +#define HFI_MUNMAP(addr, size) munmap((addr), (size)) + +/* make sure uintmax_t can hold the result of unsigned int multiplication */ +#if UINT_MAX > (UINTMAX_MAX / UINT_MAX) +#error We cannot safely multiply unsigned integers on this platform +#endif + +/* @member assumed to be of type u64 and validated to be so */ +#define HFI_MMAP_ERRCHECK(fd, binfo, member, size, prot) ({ \ + typeof((binfo)->member) *__tptr = (__u64 *)NULL; \ + (void)__tptr; \ + void *__maddr = HFI_MMAP_ALIGNOFF((fd), (binfo)->member,
(size), (prot)); \ + do { \ + if (unlikely(__maddr == MAP_FAILED)) { \ + uintmax_t outval = (uintmax_t)((binfo)->member); \ + _HFI_INFO("mmap of " #member " (0x%jx) size %zu failed: %s\n", \ + outval, size, strerror(errno)); \ + goto err_mmap_##member; \ + } \ + (binfo)->member = (__u64)__maddr; \ + _HFI_VDBG(#member " mmap %jx successful\n", (uintmax_t)((binfo)->member)); \ + } while(0); \ + __maddr; \ +}) + +/* assigns 0 to the member after unmapping */ +#define HFI_MUNMAP_ERRCHECK(binfo, member, size) \ + do { typeof((binfo)->member) *__tptr = (__u64 *)NULL; \ + (void)__tptr; \ + void *__addr = ALIGNDOWN_PTR((binfo)->member, HFI_MMAP_PGSIZE); \ + if (unlikely( __addr == NULL || (munmap(__addr, (size)) == -1))) { \ + _HFI_INFO("unmap of " #member " (%p) failed: %s\n", \ + __addr, strerror(errno)); \ + } \ + else { \ + _HFI_VDBG("unmap of " #member " (%p) succeeded\n", __addr); \ + (binfo)->member = 0; \ + } \ + } while(0) + #define HFI_PCB_SIZE_IN_BYTES 8 /* Usable bytes in header (hdrsize - lrh - bth) */ @@ -199,7 +276,7 @@ struct _hfi_ctrl { struct hfi1_base_info base_info; /* some local storages in some condition: */ - /* as storage of __hfi_rcvtidflow in hfi_userinit(). */ + /* as storage of __hfi_rcvtidflow in hfi_userinit_internal(). */ __le64 regs[HFI_TF_NFLOWS]; /* location to which OPA writes the rcvhdrtail register whenever @@ -236,9 +313,13 @@ struct _hfi_ctrl { struct _hfi_ctrl *. The struct _hfi_ctrl * used for everything else is returned by this routine. */ - struct _hfi_ctrl *hfi_userinit(int32_t, struct hfi1_user_info_dep *); +/* The internal function extends the API, while the original remains for backwards + compatibility with external code +*/ +struct _hfi_ctrl *hfi_userinit_internal(int32_t, bool, struct hfi1_user_info_dep *); + /* don't inline these; it's all init code, and not inlining makes the */ /* overall code shorter and easier to debug */ void hfi_touch_mmap(void *, size_t) __attribute__ ((noinline)); @@ -477,10 +558,9 @@ static __inline__ int32_t hfi_update_tid(struct _hfi_ctrl *ctrl, uint64_t tidlist, uint32_t *tidcnt, uint16_t flags) { struct hfi1_cmd cmd; -#ifdef PSM_CUDA - struct hfi1_tid_info_v2 tidinfo; -#else struct hfi1_tid_info tidinfo; +#ifdef PSM_CUDA + struct hfi1_tid_info_v2 tidinfov2; #endif int err; @@ -491,23 +571,30 @@ static __inline__ int32_t hfi_update_tid(struct _hfi_ctrl *ctrl, tidinfo.tidcnt = 0; /* clear to zero */ cmd.type = PSMI_HFI_CMD_TID_UPDATE; -#ifdef PSM_CUDA - cmd.type = PSMI_HFI_CMD_TID_UPDATE_V2; - - if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) - tidinfo.flags = flags; - else - tidinfo.flags = 0; -#endif - cmd.len = sizeof(tidinfo); cmd.addr = (__u64) &tidinfo; +#ifdef PSM_CUDA + if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) { + /* Copy values to v2 struct */ + tidinfov2.vaddr = tidinfo.vaddr; + tidinfov2.length = tidinfo.length; + tidinfov2.tidlist = tidinfo.tidlist; + tidinfov2.tidcnt = tidinfo.tidcnt; + tidinfov2.flags = flags; + + cmd.type = PSMI_HFI_CMD_TID_UPDATE_V2; + cmd.len = sizeof(tidinfov2); + cmd.addr = (__u64) &tidinfov2; + } +#endif err = hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)); if (err != -1) { + struct hfi1_tid_info *rettidinfo = + (struct hfi1_tid_info *)cmd.addr; + *length = rettidinfo->length; + *tidcnt = rettidinfo->tidcnt; } return err; diff --git a/psm_hal_gen1/psm_gdrcpy.c b/psm_hal_gen1/psm_gdrcpy.c index 06cb9c2..1896f9e 100644 --- a/psm_hal_gen1/psm_gdrcpy.c +++ b/psm_hal_gen1/psm_gdrcpy.c @@ -63,9 +63,6 @@ static int gdr_fd; -int is_gdr_copy_enabled; - - int
get_gdr_fd(){ return gdr_fd; } @@ -175,6 +172,9 @@ gdr_convert_gpu_to_host_addr(int gdr_fd, unsigned long buf, ((buf + size - 1) & GPU_PAGE_MASK) - pageaddr); + _HFI_VDBG("(gpudirect) buf=%p size=%zu pageaddr=%p pagelen=%u flags=0x%x proto=%p\n", + (void *)buf, size, (void *)pageaddr, pagelen, flags, proto); + query_params.query_params_in.gpu_buf_addr = pageaddr; query_params.query_params_in.gpu_buf_size = pagelen; retry: @@ -186,7 +186,7 @@ gdr_convert_gpu_to_host_addr(int gdr_fd, unsigned long buf, if (!handle_out_of_bar_space(proto)) { /* Fatal error */ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "Unable to PIN GPU pages(Out of BAR1 space)\n"); + "Unable to PIN GPU pages (Out of BAR1 space) (errno: %d)\n", errno); return NULL; } else { goto retry; } diff --git a/psm_hal_gen1/psm_hal_gen1.c b/psm_hal_gen1/psm_hal_gen1.c index 732943f..be5e351 100644 --- a/psm_hal_gen1/psm_hal_gen1.c +++ b/psm_hal_gen1/psm_hal_gen1.c @@ -82,7 +82,7 @@ static hfp_gen1_t psm_gen1_hi = { .hfp_close_context = hfp_gen1_close_context, .hfp_context_open = hfp_gen1_context_open, .hfp_dma_slot_available = hfp_gen1_dma_slot_available, - .hfp_finalize = hfp_gen1_finalize, + .hfp_finalize_ = hfp_gen1_finalize_, .hfp_forward_packet_to_subcontext = hfp_gen1_forward_packet_to_subcontext, .hfp_free_tid = hfp_gen1_free_tid, .hfp_get_bthqp = hfp_gen1_get_bthqp, @@ -102,18 +102,15 @@ static hfp_gen1_t psm_gen1_hi = { .hfp_get_jkey = hfp_gen1_get_jkey, .hfp_get_lid = hfp_gen1_get_lid, .hfp_get_node_id = hfp_gen1_get_node_id, - .hfp_get_num_contexts = hfp_gen1_get_num_contexts, - .hfp_get_num_free_contexts = hfp_gen1_get_num_free_contexts, .hfp_get_pio_size = hfp_gen1_get_pio_size, .hfp_get_pio_stall_cnt = hfp_gen1_get_pio_stall_cnt, - .hfp_get_port_active = hfp_gen1_get_port_active, .hfp_get_port_gid = hfp_gen1_get_port_gid, .hfp_get_port_index2pkey = hfp_gen1_get_port_index2pkey, .hfp_get_port_lid = hfp_gen1_get_port_lid, .hfp_get_port_lmc = hfp_gen1_get_port_lmc, .hfp_get_port_num = hfp_gen1_get_port_num, .hfp_get_port_rate = hfp_gen1_get_port_rate, - .hfp_get_port_sc2vl = hfp_gen1_get_port_sc2vl, + .hfp_get_sc2vl_map = hfp_gen1_get_sc2vl_map, .hfp_get_port_sl2sc = hfp_gen1_get_port_sl2sc, .hfp_get_receive_event = hfp_gen1_get_receive_event, .hfp_get_rhf_expected_sequence_number = hfp_gen1_get_rhf_expected_sequence_number, @@ -127,7 +124,6 @@ static hfp_gen1_t psm_gen1_hi = { .hfp_get_subctxt_cnt = hfp_gen1_get_subctxt_cnt, .hfp_get_tid_exp_cnt = hfp_gen1_get_tid_exp_cnt, .hfp_get_tidcache_invalidation = hfp_gen1_get_tidcache_invalidation, - .hfp_get_unit_active = hfp_gen1_get_unit_active, .hfp_get_unit_id = hfp_gen1_get_unit_id, .hfp_get_user_major_bldtime_version = hfp_gen1_get_user_major_bldtime_version, .hfp_get_user_minor_bldtime_version = hfp_gen1_get_user_minor_bldtime_version, @@ -171,8 +167,12 @@ static hfp_gen1_t psm_gen1_hi = { .hfp_writev = hfp_gen1_writev, #endif .hfp_get_default_pkey = hfp_gen1_get_default_pkey, + .hfp_get_num_contexts = hfp_gen1_get_num_contexts, + .hfp_get_num_free_contexts = hfp_gen1_get_num_free_contexts, .hfp_get_num_units = hfp_gen1_get_num_units, .hfp_get_num_ports = hfp_gen1_get_num_ports, + .hfp_get_port_active = hfp_gen1_get_port_active, + .hfp_get_unit_active = hfp_gen1_get_unit_active, .hfp_initialize = hfp_gen1_initialize, }, /* start of private hfp_gen1_private data */ diff --git a/psm_hal_gen1/psm_hal_gen1.h b/psm_hal_gen1/psm_hal_gen1.h index abe04a5..c4610f2 100644 --- a/psm_hal_gen1/psm_hal_gen1.h +++ b/psm_hal_gen1/psm_hal_gen1.h @@ -89,6 +89,7 @@ typedef struct
_hfp_gen1_pc_private struct ips_subcontext_ureg *subcontext_ureg[HFI1_MAX_SHARED_CTXTS]; struct ips_spio spio_ctrl; struct hfi1_user_info_dep user_info; + uint16_t sc2vl[PSMI_N_SCS]; } hfp_gen1_pc_private; /* At the end of each scb struct, we have space reserved to accommodate diff --git a/psm_hal_gen1/psm_hal_gen1_spio.c b/psm_hal_gen1/psm_hal_gen1_spio.c index 8767dd9..eb9d5aa 100644 --- a/psm_hal_gen1/psm_hal_gen1_spio.c +++ b/psm_hal_gen1/psm_hal_gen1_spio.c @@ -640,7 +640,7 @@ ips_spio_process_events(const struct ptl *ptl_gen) if (event_mask & PSM_HAL_HFI_EVENT_SL2VL_CHANGE) { _HFI_INFO("SL2VL mapping changed for port.\n"); - ips_ibta_init_sl2sc2vl_table(&((struct ptl_ips *)(ctrl->ptl))->proto); + ips_ibta_init_sl2sc_table(&((struct ptl_ips *)(ctrl->ptl))->proto); } return PSM2_OK; @@ -686,6 +686,7 @@ ips_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, if (do_lock) pthread_spin_lock(&ctrl->spio_lock); +#ifdef PSM_FI if_pf(PSMI_FAULTINJ_ENABLED()) { PSMI_FAULTINJ_STATIC_DECL(fi_lost, "piosend", 1, IPS_FAULTINJ_PIOLOST); @@ -699,6 +700,7 @@ ips_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, goto fi_busy; /* else fall through normal processing path, i.e. no faults */ } +#endif /* #ifdef PSM_FI */ psmi_assert((length & 0x3) == 0); paylen = length + (cksum_valid ? PSM_CRC_SIZE_IN_BYTES : 0); @@ -709,7 +711,9 @@ ips_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, if_pf(spio_ctrl->spio_available_blocks < nblks) { /* Check unit status */ +#ifdef PSM_FI fi_busy: +#endif /* #ifdef PSM_FI */ if ((err = psmi_context_check_status(ctrl->context)) == PSM2_OK) { diff --git a/psm_hal_gen1/psm_hal_inline_i.h b/psm_hal_gen1/psm_hal_inline_i.h index d573653..6346bae 100644 --- a/psm_hal_gen1/psm_hal_inline_i.h +++ b/psm_hal_gen1/psm_hal_inline_i.h @@ -53,6 +53,8 @@ #include "psm_hal_gen1.h" +extern size_t arrsz[MAPSIZE_MAX]; + static inline struct _hfp_gen1 *get_psm_gen1_hi(void) { return (struct _hfp_gen1*) psmi_hal_current_hal_instance; @@ -64,16 +66,16 @@ static PSMI_HAL_INLINE int hfp_gen1_initialize(psmi_hal_instance_t *phi) return 0; } -/* hfp_gen1_finalize */ -static PSMI_HAL_INLINE int hfp_gen1_finalize(void) +/* hfp_gen1_finalize_ */ +static PSMI_HAL_INLINE int hfp_gen1_finalize_(void) { return 0; } /* hfp_gen1_get_num_units */ -static PSMI_HAL_INLINE int hfp_gen1_get_num_units(int wait) +static PSMI_HAL_INLINE int hfp_gen1_get_num_units(void) { - return hfi_get_num_units(wait); + return hfi_get_num_units(); } /* hfp_gen1_get_num_ports */ @@ -120,63 +122,100 @@ static PSMI_HAL_INLINE int hfp_gen1_get_num_free_contexts(int unit) return -PSM_HAL_ERROR_GENERAL_ERROR; } -/* hfp_gen1_close_context */ -static PSMI_HAL_INLINE int hfp_gen1_close_context(psmi_hal_hw_context *ctxtp) +static void free_egr_buffs(hfp_gen1_pc_private *psm_hw_ctxt) { - if (!ctxtp || !*ctxtp) - return PSM_HAL_ERROR_OK; +#define FREE_EGR_BUFFS_TABLE(cl_qs_arr, index) ips_recvq_egrbuf_table_free(((cl_qs_arr)[index]).egr_buffs) + size_t i, index, subctxt_cnt; + psm_hal_gen1_cl_q_t *cl_qs; - int i; - hfp_gen1_pc_private *psm_hw_ctxt = *ctxtp; - - ips_recvq_egrbuf_table_free(psm_hw_ctxt->cl_qs[PSM_HAL_CL_Q_RX_EGR_Q].egr_buffs); - - for (i=0;i < psm_hw_ctxt->user_info.subctxt_cnt;i++) - ips_recvq_egrbuf_table_free( - psm_hw_ctxt->cl_qs[ - PSM_HAL_GET_SC_CL_Q_RX_EGR_Q(i) - ].egr_buffs); - struct hfi1_base_info *binfo; - struct hfi1_ctxt_info *cinfo; - int __hfi_pg_sz = sysconf(_SC_PAGESIZE); + cl_qs = psm_hw_ctxt->cl_qs; + index = PSM_HAL_CL_Q_RX_EGR_Q; + 
FREE_EGR_BUFFS_TABLE(cl_qs, index); + + subctxt_cnt = psm_hw_ctxt->user_info.subctxt_cnt; + for (i = 0; i < subctxt_cnt; i++) { + index = PSM_HAL_GET_SC_CL_Q_RX_EGR_Q(i); + FREE_EGR_BUFFS_TABLE(cl_qs, index); + } +#undef FREE_EGR_BUFFS_TABLE +} + +static void unmap_hfi_mem(hfp_gen1_pc_private *psm_hw_ctxt) +{ + size_t subctxt_cnt = psm_hw_ctxt->user_info.subctxt_cnt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - binfo = &ctrl->base_info; - cinfo = &ctrl->ctxt_info; - - munmap((void*)PSMI_ALIGNDOWN(binfo->sc_credits_addr, __hfi_pg_sz), - __hfi_pg_sz); - munmap((void*)PSMI_ALIGNDOWN(binfo->pio_bufbase_sop, __hfi_pg_sz), - cinfo->credits * 64); - munmap((void*)PSMI_ALIGNDOWN(binfo->pio_bufbase, __hfi_pg_sz), - cinfo->credits * 64); - munmap((void*)PSMI_ALIGNDOWN(binfo->rcvhdr_bufbase, __hfi_pg_sz), - cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize); - munmap((void*)PSMI_ALIGNDOWN(binfo->rcvegr_bufbase, __hfi_pg_sz), - cinfo->egrtids * cinfo->rcvegr_size); - munmap((void*)PSMI_ALIGNDOWN(binfo->sdma_comp_bufbase, __hfi_pg_sz), - cinfo->sdma_ring_size * sizeof(struct hfi1_sdma_comp_entry)); - /* only unmap the RTAIL if it was enabled in the first place */ + struct hfi1_base_info *binfo = &ctrl->base_info; + struct hfi1_ctxt_info *cinfo = &ctrl->ctxt_info; + + /* 1. Unmap the PIO credits address */ + HFI_MUNMAP_ERRCHECK(binfo, sc_credits_addr, arrsz[SC_CREDITS]); + + /* 2. Unmap the PIO buffer SOP address */ + HFI_MUNMAP_ERRCHECK(binfo, pio_bufbase_sop, arrsz[PIO_BUFBASE_SOP]); + + /* 3. Unmap the PIO buffer address */ + HFI_MUNMAP_ERRCHECK(binfo, pio_bufbase, arrsz[PIO_BUFBASE]); + + /* 4. Unmap the receive header queue */ + HFI_MUNMAP_ERRCHECK(binfo, rcvhdr_bufbase, arrsz[RCVHDR_BUFBASE]); + + /* 5. Unmap the receive eager buffer */ + HFI_MUNMAP_ERRCHECK(binfo, rcvegr_bufbase, arrsz[RCVEGR_BUFBASE]); + + /* 6. Unmap the sdma completion queue */ + HFI_MUNMAP_ERRCHECK(binfo, sdma_comp_bufbase, arrsz[SDMA_COMP_BUFBASE]); + + /* 7. Unmap RXE per-context CSRs */ + HFI_MUNMAP_ERRCHECK(binfo, user_regbase, arrsz[USER_REGBASE]); + ctrl->__hfi_rcvhdrtail = NULL; + ctrl->__hfi_rcvhdrhead = NULL; + ctrl->__hfi_rcvegrtail = NULL; + ctrl->__hfi_rcvegrhead = NULL; + ctrl->__hfi_rcvofftail = NULL; + if (cinfo->runtime_flags & HFI1_CAP_HDRSUPP) { + ctrl->__hfi_rcvtidflow = NULL; + } + + /* 8. Unmap the rcvhdrq tail register address */ if (cinfo->runtime_flags & HFI1_CAP_DMA_RTAIL) { - munmap((void*)PSMI_ALIGNDOWN(binfo->rcvhdrtail_base, __hfi_pg_sz), - __hfi_pg_sz); + /* only unmap the RTAIL if it was enabled in the first place */ + HFI_MUNMAP_ERRCHECK(binfo, rcvhdrtail_base, arrsz[RCVHDRTAIL_BASE]); + } else { + binfo->rcvhdrtail_base = 0; } - munmap((void*)PSMI_ALIGNDOWN(binfo->user_regbase, __hfi_pg_sz), - __hfi_pg_sz); - munmap((void*)PSMI_ALIGNDOWN(binfo->events_bufbase, __hfi_pg_sz), - __hfi_pg_sz); - munmap((void*)PSMI_ALIGNDOWN(binfo->status_bufbase, __hfi_pg_sz), - __hfi_pg_sz); - - /* only unmap subcontext-related stuff it subcontexts are enabled */ - if (psm_hw_ctxt->user_info.subctxt_cnt > 0) { - munmap((void*)PSMI_ALIGNDOWN(binfo->subctxt_uregbase, __hfi_pg_sz), - __hfi_pg_sz); - munmap((void*)PSMI_ALIGNDOWN(binfo->subctxt_rcvhdrbuf, __hfi_pg_sz), - __hfi_pg_sz); - munmap((void*)PSMI_ALIGNDOWN(binfo->subctxt_rcvegrbuf, __hfi_pg_sz), - __hfi_pg_sz); + + /* 9. Unmap the event page */ + HFI_MUNMAP_ERRCHECK(binfo, events_bufbase, arrsz[EVENTS_BUFBASE]); + + /* 10. Unmap the status page */ + HFI_MUNMAP_ERRCHECK(binfo, status_bufbase, arrsz[STATUS_BUFBASE]); + + /* 11. 
If subcontext is used, unmap the buffers */ + if (subctxt_cnt > 0) { + /* only unmap subcontext-related stuff if subcontexts are enabled */ + HFI_MUNMAP_ERRCHECK(binfo, subctxt_uregbase, arrsz[SUBCTXT_UREGBASE]); + HFI_MUNMAP_ERRCHECK(binfo, subctxt_rcvhdrbuf, arrsz[SUBCTXT_RCVHDRBUF]); + HFI_MUNMAP_ERRCHECK(binfo, subctxt_rcvegrbuf, arrsz[SUBCTXT_RCVEGRBUF]); + } +} +/* hfp_gen1_close_context */ +static PSMI_HAL_INLINE int hfp_gen1_close_context(psmi_hal_hw_context *ctxtp) +{ + hfp_gen1_pc_private *psm_hw_ctxt; + + if (!ctxtp || !*ctxtp) + return PSM_HAL_ERROR_OK; + + psm_hw_ctxt = (hfp_gen1_pc_private *)(*ctxtp); + + /* Free the eager buffers */ + free_egr_buffs(psm_hw_ctxt); + + /* Unmap the HFI memory */ + unmap_hfi_mem(psm_hw_ctxt); + + /* Clean up the rest */ close(psm_hw_ctxt->ctrl->fd); free(psm_hw_ctxt->ctrl); psmi_free(psm_hw_ctxt); @@ -226,7 +265,7 @@ psmi_init_userinfo_params(psm2_ep_t ep, int unit_id, if (!shcontexts_enabled) return err; - avail_contexts = hfi_get_num_contexts(unit_id, 0); + avail_contexts = hfi_get_num_contexts(unit_id); if (avail_contexts == 0) { err = psmi_handle_error(NULL, PSM2_EP_NO_DEVICE, @@ -465,9 +504,16 @@ uint64_t get_cap_mask(uint64_t gen1_mask) { HFI1_CAP_STATIC_RATE_CTRL, PSM_HAL_CAP_STATIC_RATE_CTRL }, { HFI1_CAP_SDMA_HEAD_CHECK, PSM_HAL_CAP_SDMA_HEAD_CHECK }, { HFI1_CAP_EARLY_CREDIT_RETURN, PSM_HAL_CAP_EARLY_CREDIT_RETURN }, -#ifdef PSM_CUDA +#ifdef HFI1_CAP_GPUDIRECT_OT { HFI1_CAP_GPUDIRECT_OT, PSM_HAL_CAP_GPUDIRECT_OT }, -#endif +#else /* #ifdef HFI1_CAP_GPUDIRECT_OT */ +#ifndef PSM_CUDA + /* lifted from hfi1_user.h */ + { (1UL << 63), PSM_HAL_CAP_GPUDIRECT_OT }, +#else /* #ifndef PSM_CUDA */ +#error "Inconsistent build. HFI1_CAP_GPUDIRECT_OT must be defined for CUDA builds." +#endif /* #ifndef PSM_CUDA */ +#endif /* #ifdef HFI1_CAP_GPUDIRECT_OT */ }; uint64_t rv = 0; int i; @@ -490,7 +536,7 @@ static PSMI_HAL_INLINE int hfp_gen1_context_open(int unit, unsigned retryCnt) { int fd = -1; - psm2_error_t err = PSM_HAL_ERROR_OK; + psm2_error_t err = PSM2_OK; hfp_gen1_pc_private *pc_private = psmi_malloc(ep, UNDEFINED, sizeof(hfp_gen1_pc_private)); if_pf (!pc_private) { @@ -498,7 +544,7 @@ static PSMI_HAL_INLINE int hfp_gen1_context_open(int unit, goto bail; } - memset(pc_private,0,sizeof(hfp_gen1_pc_private)); + memset(pc_private, 0, sizeof(hfp_gen1_pc_private)); char dev_name[PATH_MAX]; fd = hfi_context_open_ex(unit, port, open_timeout, @@ -518,13 +564,14 @@ static PSMI_HAL_INLINE int hfp_gen1_context_open(int unit, goto bail; } - /* attempt to assign the context via hfi_userinit() */ + /* attempt to assign the context via hfi_userinit_internal() */ int retry = 0; do { if (retry > 0) - _HFI_INFO("hfi_userinit: failed, trying again (%d/%d)\n", + _HFI_INFO("hfi_userinit_internal: failed, trying again (%d/%d)\n", retry, retryCnt); - pc_private->ctrl = hfi_userinit(fd, &pc_private->user_info); + pc_private->ctrl = hfi_userinit_internal(fd, ep->skip_affinity, + &pc_private->user_info); } while (pc_private->ctrl == NULL && ++retry <= retryCnt); if (!pc_private->ctrl) @@ -781,9 +828,24 @@ static PSMI_HAL_INLINE int hfp_gen1_get_port_sl2sc(int unit, int port, int sl) return hfi_get_port_sl2sc(unit, port, sl); } -static PSMI_HAL_INLINE int hfp_gen1_get_port_sc2vl(int unit, int port, int sc) +static PSMI_HAL_INLINE int hfp_gen1_get_sc2vl_map(struct ips_proto *proto) { - return hfi_get_port_sc2vl(unit, port, sc); + hfp_gen1_pc_private *psm_hw_ctxt = proto->ep->context.psm_hw_ctxt; + uint8_t i; + + /* Get SC2VL table for unit, port */ + for (i = 0; i <
PSMI_N_SCS; i++) { + int ret = hfi_get_port_sc2vl( + psmi_hal_get_unit_id(proto->ep->context.psm_hw_ctxt), + psmi_hal_get_port_num(proto->ep->context.psm_hw_ctxt), + i); + if (ret < 0) + /* Unable to get SC2VL. Set it to default */ + ret = PSMI_VL_DEFAULT; + + psm_hw_ctxt->sc2vl[i] = (uint16_t) ret; + } + return PSM_HAL_ERROR_OK; } static PSMI_HAL_INLINE int hfp_gen1_set_pkey(psmi_hal_hw_context ctxt, uint16_t pkey) @@ -1146,6 +1208,46 @@ static PSMI_HAL_INLINE int hfp_gen1_get_receive_event(psmi_hal_cl_idx head_idx, PSM_HAL_ERROR_OK) return rv; + /* If the hdrq_head is before cachedlastscan, that means that we have + * already prescanned this for BECNs and FECNs, so we should not check + * again + */ + if_pt((rcv_ev->proto->flags & IPS_PROTO_FLAG_CCA) && + (head_idx >= rcv_ev->recvq->state->hdrq_cachedlastscan)) { + /* IBTA CCA handling: + * If FECN bit set handle IBTA CCA protocol. For the + * flow that suffered congestion we flag it to generate + * a control packet with the BECN bit set - This is + * currently an unsolicited ACK. + * + * For all MQ packets the FECN processing/BECN + * generation is done in the is_expected_or_nak + * function as each eager packet is inspected there. + * + * For TIDFLOW/Expected data transfers the FECN + * bit/BECN generation is done in protoexp_data. Since + * header suppression can result in even FECN packets + * being suppressed, the expected protocol generates + * additional BECN packets if a "large" number of + * generations are swapped without progress being made + * for receive. "Large" is set empirically to 4. + * + * FECN packets are ignored for all control messages + * (except ACKs and NAKs) since they indicate + * congestion on the control path which is not rate + * controlled. The CCA specification allows FECN on + * ACKs to be disregarded as well.
+ */ + + rcv_ev->is_congested = + _is_cca_fecn_set(rcv_ev-> + p_hdr) & IPS_RECV_EVENT_FECN; + rcv_ev->is_congested |= + (_is_cca_becn_set(rcv_ev->p_hdr) << + (IPS_RECV_EVENT_BECN - 1)); + } else + rcv_ev->is_congested = 0; + return PSM_HAL_ERROR_OK; } @@ -1265,9 +1367,10 @@ ips_proto_pbc_update(struct ips_proto *proto, struct ips_flow *flow, uint32_t isCtrlMsg, struct psm_hal_pbc *pbc, uint32_t hdrlen, uint32_t paylen)) { + hfp_gen1_pc_private *psm_hw_ctxt = proto->ep->context.psm_hw_ctxt; int dw = (sizeof(struct psm_hal_pbc) + hdrlen + paylen) >> BYTE2DWORD_SHIFT; int sc = proto->sl2sc[flow->path->pr_sl]; - int vl = proto->sc2vl[sc]; + int vl = psm_hw_ctxt->sc2vl[sc]; uint16_t static_rate = 0; if_pf(!isCtrlMsg && flow->path->pr_active_ipd) diff --git a/psm_lock.h b/psm_lock.h index c82960c..4a17272 100644 --- a/psm_lock.h +++ b/psm_lock.h @@ -69,11 +69,13 @@ typedef pthread_spinlock_t psmi_spinlock_t; #define psmi_spin_init(lock) pthread_spin_init(lock, \ PTHREAD_PROCESS_PRIVATE) +#define psmi_spin_destroy(lock) pthread_spin_destroy(lock) #define psmi_spin_lock(lock) pthread_spin_lock(lock) #define psmi_spin_trylock(lock) pthread_spin_trylock(lock) #define psmi_spin_unlock(lock) pthread_spin_unlock(lock) #else typedef ips_atomic_t psmi_spinlock_t; +#define PSMI_SPIN_INVALID 2 #define PSMI_SPIN_LOCKED 1 #define PSMI_SPIN_UNLOCKED 0 #endif @@ -103,10 +105,26 @@ PSMI_ALWAYS_INLINE(int psmi_spin_init(psmi_spinlock_t *lock)) PSMI_ALWAYS_INLINE(int psmi_spin_trylock(psmi_spinlock_t *lock)) { if (ips_atomic_cmpxchg(lock, PSMI_SPIN_UNLOCKED, PSMI_SPIN_LOCKED) - == PSMI_SPIN_UNLOCKED) { return 0; - else - return EBUSY; + } + + return EBUSY; +} + +PSMI_ALWAYS_INLINE(int psmi_spin_destroy(psmi_spinlock_t *lock)) +{ + if (lock == NULL) { + return EINVAL; + } + + /* We could just do psmi_spin_trylock() here and dispense with the invalid state */ + if (ips_atomic_cmpxchg(lock, PSMI_SPIN_UNLOCKED, PSMI_SPIN_INVALID) + == PSMI_SPIN_UNLOCKED) { + return 0; + } + + return EBUSY; } PSMI_ALWAYS_INLINE(int psmi_spin_lock(psmi_spinlock_t *lock)) @@ -139,6 +157,35 @@ PSMI_ALWAYS_INLINE(void psmi_init_lock(psmi_lock_t *lock)) #endif } +PSMI_ALWAYS_INLINE(void psmi_destroy_lock(psmi_lock_t *lock)) +{ + int err; +#ifdef PSMI_LOCK_IS_SPINLOCK + /* This will map to either pthread_spin_destroy() or our custom psmi_spin_destroy(). + * Both return values can be interpreted by strerror(). + */ + if ((err = psmi_spin_destroy(&(lock->lock))) != 0) { + _HFI_VDBG("Destroying spinlock failed: %s\n", strerror(err)); + } + /* The same path for both the regular mutex and the debugging mutex */ +#elif defined(PSMI_LOCK_IS_MUTEXLOCK) || defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG) + if ((err = pthread_mutex_destroy(&(lock->lock))) != 0) { + /* strerror_r() may be a better choice here but it is tricky + * to reliably detect the XSI vs GNU version, and if hardcoded, + * may be inadvertently changed when tampering with headers/makefiles + * in the long run. + * + * This would result in incorrect operation: a segfault from + * dereferencing the return value or failure to retrieve the + * error string. + * + * C11's strerror_s may be an option here too.
+ */ + _HFI_VDBG("Destroying mutex failed: %s\n", strerror(err)); + } +#endif +} + PSMI_ALWAYS_INLINE(int psmi_sem_post(sem_t *sem, const char *name)) { if (sem_post(sem) == -1) { diff --git a/psm_mq.c b/psm_mq.c index f41c134..a25a581 100644 --- a/psm_mq.c +++ b/psm_mq.c @@ -766,14 +766,14 @@ psm2_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len) PSM2_LOG_MSG("entering"); psmi_assert(MQE_TYPE_IS_RECV(req->type)); + psmi_mtucpy_fn_t psmi_mtucpy_fn = psmi_mq_mtucpy; #ifdef PSM_CUDA - psmi_mtucpy_fn_t psmi_mtucpy_fn; - if (req->is_buf_gpu_mem) - psmi_mtucpy_fn = psmi_mq_mtucpy; - else + if (!req->is_buf_gpu_mem) psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem; #endif + _HFI_VDBG("(req=%p) buf=%p len=%u req.state=%u\n", req, buf, len, req->state); + switch (req->state) { case MQ_STATE_COMPLETE: if (req->req_data.buf != NULL) { /* 0-byte messages don't alloc a sysbuf */ @@ -786,10 +786,8 @@ psm2_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len) mq->ep->epaddr->proto); psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem; } - psmi_mtucpy_fn(ubuf, (const void *)req->req_data.buf, copysz); -#else - psmi_mq_mtucpy(ubuf, (const void *)req->req_data.buf, copysz); #endif + psmi_mtucpy_fn(ubuf, (const void *)req->req_data.buf, copysz); psmi_mq_sysbuf_free(mq, req->req_data.buf); } req->req_data.buf = buf; @@ -814,12 +812,7 @@ psm2_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len) #endif if (req->recv_msgoff) { -#ifdef PSM_CUDA - psmi_mtucpy_fn -#else - psmi_mq_mtucpy -#endif - (buf, (const void *)req->req_data.buf, + psmi_mtucpy_fn(buf, (const void *)req->req_data.buf, req->recv_msgoff); } psmi_mq_sysbuf_free(mq, req->req_data.buf); @@ -836,12 +829,7 @@ psm2_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len) */ req->recv_msgoff = min(req->recv_msgoff, copysz); if (req->recv_msgoff) { -#ifdef PSM_CUDA - psmi_mtucpy_fn -#else - psmi_mq_mtucpy -#endif - (buf, (const void *)req->req_data.buf, + psmi_mtucpy_fn(buf, (const void *)req->req_data.buf, req->recv_msgoff); } if (req->send_msgoff) { @@ -895,17 +883,10 @@ __psm2_mq_fp_msg(psm2_ep_t ep, psm2_mq_t mq, psm2_epaddr_t addr, psm2_mq_tag_t * #ifdef PSM_CUDA int gpu_mem = 0; void *gpu_user_buffer = NULL; - /* CUDA documentation dictates the use of SYNC_MEMOPS attribute - * when the buffer pointer received into PSM has been allocated - * by the application. This guarantees the all memory operations - * to this region of memory (used by multiple layers of the stack) - * always synchronize - */ - if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)buf)) { - int trueflag = 1; - PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag, - CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, - (CUdeviceptr)buf); + + if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(buf)) { + psmi_cuda_set_attr_sync_memops(buf); + gpu_mem = 1; gpu_user_buffer = buf; } @@ -980,21 +961,13 @@ __psm2_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_req_t req; #ifdef PSM_CUDA - int gpu_mem; - /* CUDA documentation dictates the use of SYNC_MEMOPS attribute - * when the buffer pointer received into PSM has been allocated - * by the application. 
This guarantees the all memory operations - * to this region of memory (used by multiple layers of the stack) - * always synchronize - */ - if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)buf)) { - int trueflag = 1; - PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag, - CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, - (CUdeviceptr)buf); + int gpu_mem = 0; + + if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(buf)) { + psmi_cuda_set_attr_sync_memops(buf); + gpu_mem = 1; - } else - gpu_mem = 0; + } #endif PSM2_LOG_MSG("entering"); @@ -1111,20 +1084,12 @@ __psm2_mq_imrecv(psm2_mq_t mq, uint32_t flags, void *buf, uint32_t len, req->req_data.context = context; #ifdef PSM_CUDA - /* CUDA documentation dictates the use of SYNC_MEMOPS attribute - * when the buffer pointer received into PSM has been allocated - * by the application. This guarantees the all memory operations - * to this region of memory (used by multiple layers of the stack) - * always synchronize - */ - if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)buf)) { - int trueflag = 1; - PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag, - CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, - (CUdeviceptr)buf); - req->is_buf_gpu_mem = 1; - } else - req->is_buf_gpu_mem = 0; + if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(buf)) { + psmi_cuda_set_attr_sync_memops(buf); + req->is_buf_gpu_mem = 1; + } else { + req->is_buf_gpu_mem = 0; + } #endif PSMI_LOCK(mq->progress_lock); @@ -1451,7 +1416,7 @@ psmi_mq_print_stats_finalize(psm2_mq_t mq) * the user can set options after obtaining an endpoint */ psm2_error_t -__psm2_mq_init(psm2_ep_t ep, uint64_t tag_order_mask, +__psm2_mq_init(psm2_ep_t ep, uint64_t ignored, const struct psm2_optkey *opts, int numopts, psm2_mq_t *mqo) { psm2_error_t err = PSM2_OK; diff --git a/psm_mq_internal.h b/psm_mq_internal.h index 1a26898..a1afaf8 100644 --- a/psm_mq_internal.h +++ b/psm_mq_internal.h @@ -306,11 +306,6 @@ mq_copy_tiny(uint32_t *dest, uint32_t *src, uint8_t len)) { #ifdef PSM_CUDA if (PSMI_IS_CUDA_ENABLED && (PSMI_IS_CUDA_MEM(dest) || PSMI_IS_CUDA_MEM(src))) { - if (!PSMI_IS_CUDA_ENABLED) { - psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "Please enable PSM CUDA support when using GPU buffer \n"); - return; - } PSMI_CUDA_CALL(cuMemcpy, (CUdeviceptr)dest, (CUdeviceptr)src, len); return; } @@ -347,8 +342,8 @@ mq_copy_tiny(uint32_t *dest, uint32_t *src, uint8_t len)) } } -#ifdef PSM_CUDA typedef void (*psmi_mtucpy_fn_t)(void *dest, const void *src, uint32_t len); +#ifdef PSM_CUDA PSMI_ALWAYS_INLINE( void @@ -409,7 +404,7 @@ mq_status_copy(psm2_mq_req_t req, psm2_mq_status_t *status)) status->msg_tag = *((uint64_t *) req->req_data.tag.tag); status->msg_length = req->req_data.send_msglen; status->nbytes = req->req_data.recv_msglen; - status->error_code = req->req_data.error_code; + status->error_code = (psm2_error_t)req->req_data.error_code; status->context = req->req_data.context; } @@ -421,7 +416,7 @@ mq_status2_copy(psm2_mq_req_t req, psm2_mq_status2_t *status)) status->msg_tag = req->req_data.tag; status->msg_length = req->req_data.send_msglen; status->nbytes = req->req_data.recv_msglen; - status->error_code = req->req_data.error_code; + status->error_code = (psm2_error_t)req->req_data.error_code; status->context = req->req_data.context; } diff --git a/psm_mq_recv.c b/psm_mq_recv.c index 0f46075..642fbc1 100644 --- a/psm_mq_recv.c +++ b/psm_mq_recv.c @@ -342,9 +342,11 @@ psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, { psm2_mq_req_t req; uint32_t msglen; + psmi_mtucpy_fn_t psmi_mtucpy_fn; if 
(msgorder && (req = mq_req_match(mq, src, tag, 1))) {
		/* we have a match */
+		void *user_buffer = req->req_data.buf;
 		psmi_assert(MQE_TYPE_IS_RECV(req->type));
 		req->req_data.peer = src;
 		req->req_data.tag = *tag;
@@ -356,29 +358,17 @@ psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
 			  tag->tag[0], tag->tag[1], tag->tag[2], msglen,
 			  paylen);
-		void* user_buffer = NULL;
-
 		switch (opcode) {
 		case MQ_MSG_TINY:
 			/* mq_copy_tiny() can handle zero byte */
-
 #ifdef PSM_CUDA
 			if (PSMI_USE_GDR_COPY(req, msglen)) {
-				void* mmaped_host = gdr_convert_gpu_to_host_addr(GDR_FD,
+				user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD,
 						(unsigned long)req->req_data.buf,
 						msglen, 1, src->proto);
-				mq_copy_tiny((uint32_t *) mmaped_host,
-					(uint32_t *) payload, msglen);
 			}
-			else {
-				mq_copy_tiny((uint32_t *) req->req_data.buf,
-					(uint32_t *) payload, msglen);
-			}
-#else
-
-			mq_copy_tiny((uint32_t *) req->req_data.buf,
-				(uint32_t *) payload, msglen);
 #endif
+			mq_copy_tiny((uint32_t *) user_buffer, (uint32_t *) payload, msglen);
 			req->state = MQ_STATE_COMPLETE;
 			ips_barrier();
@@ -386,9 +376,8 @@ psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
 			break;
 		case MQ_MSG_SHORT:	/* message fits in 1 payload */
-			user_buffer = req->req_data.buf;
+			psmi_mtucpy_fn = psmi_mq_mtucpy;
 #ifdef PSM_CUDA
-			psmi_mtucpy_fn_t psmi_mtucpy_fn = psmi_mq_mtucpy;
 			if (PSMI_USE_GDR_COPY(req, msglen)) {
 				user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD,
 						(unsigned long)req->req_data.buf,
@@ -397,18 +386,10 @@ psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
 			}
 #endif
 			if (msglen <= paylen) {
-#ifdef PSM_CUDA
 				psmi_mtucpy_fn(user_buffer, payload, msglen);
-#else
-				psmi_mq_mtucpy(user_buffer, payload, msglen);
-#endif
 			} else {
 				psmi_assert((msglen & ~0x3) == paylen);
-#ifdef PSM_CUDA
 				psmi_mtucpy_fn(user_buffer, payload, paylen);
-#else
-				psmi_mq_mtucpy(user_buffer, payload, paylen);
-#endif
 				/*
 				 * there are nonDW bytes attached in header,
 				 * copy after the DW payload.
diff --git a/psm_stats.c b/psm_stats.c
index c9b5777..c9f37e6 100644
--- a/psm_stats.c
+++ b/psm_stats.c
@@ -586,6 +586,10 @@ void stats_register_hfi_counters(psm2_ep_t ep)
 	psmi_stats_register_type("OPA device counters",
 				 PSMI_STATSTYPE_DEVCOUNTERS, entries, nc + npc,
 				 ep);
+	// psmi_stats_register_type makes its own copy of entries,
+	// so we should free the entries buffer.
+	// The snames will be freed when we deregister the hfi.
+ psmi_free(entries); return; bail: @@ -605,7 +609,7 @@ void stats_register_hfi_stats(psm2_ep_t ep) struct psmi_stats_entry *entries = NULL; ns = hfi_get_stats_names(&snames); - if (ns == -1 || snames == NULL) + if (ns <= 0 || snames == NULL) goto bail; entries = psmi_calloc(ep, STATS, ns, sizeof(struct psmi_stats_entry)); if (entries == NULL) diff --git a/psm_user.h b/psm_user.h index 5a35085..09477c5 100644 --- a/psm_user.h +++ b/psm_user.h @@ -56,6 +56,10 @@ #ifndef _PSMI_USER_H #define _PSMI_USER_H +#ifdef __cplusplus +extern "C" { +#endif + #include "psm_config.h" #include #include @@ -64,6 +68,7 @@ #include #include #include +#include #include "psm2.h" #include "psm2_mq.h" @@ -301,44 +306,47 @@ void psmi_profile_reblock(int did_no_progress) __attribute__ ((weak)); extern int is_cuda_enabled; extern int is_gdr_copy_enabled; extern int device_support_gpudirect; +extern int gpu_p2p_supported; +extern int my_gpu_device; extern int cuda_lib_version; extern CUcontext ctxt; -void *psmi_cuda_lib; -CUresult (*psmi_cuInit)(unsigned int Flags ); -CUresult (*psmi_cuCtxDetach)(CUcontext c); -CUresult (*psmi_cuCtxSetCurrent)(CUcontext c); -CUresult (*psmi_cuCtxGetCurrent)(CUcontext *c); -CUresult (*psmi_cuCtxSetCurrent)(CUcontext c); -CUresult (*psmi_cuPointerGetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); -CUresult (*psmi_cuPointerSetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); -CUresult (*psmi_cuDeviceGet)(CUdevice* device, int ordinal); -CUresult (*psmi_cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev); -CUresult (*psmi_cuDriverGetVersion)(int* driverVersion); -CUresult (*psmi_cuDeviceGetCount)(int* count); -CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags); -CUresult (*psmi_cuStreamDestroy)(CUstream phStream); -CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags); -CUresult (*psmi_cuEventDestroy)(CUevent hEvent); -CUresult (*psmi_cuEventQuery)(CUevent hEvent); -CUresult (*psmi_cuEventRecord)(CUevent hEvent, CUstream hStream); -CUresult (*psmi_cuEventSynchronize)(CUevent hEvent); -CUresult (*psmi_cuMemHostAlloc)(void** pp, size_t bytesize, unsigned int Flags); -CUresult (*psmi_cuMemFreeHost)(void* p); -CUresult (*psmi_cuMemcpy)(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); -CUresult (*psmi_cuMemcpyDtoD)(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); -CUresult (*psmi_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); -CUresult (*psmi_cuMemcpyHtoD)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount); -CUresult (*psmi_cuMemcpyDtoHAsync)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); -CUresult (*psmi_cuMemcpyHtoDAsync)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount, CUstream hStream); -CUresult (*psmi_cuIpcGetMemHandle)(CUipcMemHandle* pHandle, CUdeviceptr dptr); -CUresult (*psmi_cuIpcOpenMemHandle)(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags); -CUresult (*psmi_cuIpcCloseMemHandle)(CUdeviceptr dptr); -CUresult (*psmi_cuMemGetAddressRange)(CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr); -CUresult (*psmi_cuDevicePrimaryCtxGetState)(CUdevice dev, unsigned int* flags, int* active); -CUresult (*psmi_cuDevicePrimaryCtxRetain)(CUcontext* pctx, CUdevice dev); -CUresult (*psmi_cuCtxGetDevice)(CUdevice* device); -CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device); +extern void *psmi_cuda_lib; + +extern CUresult (*psmi_cuInit)(unsigned int Flags ); +extern CUresult 
(*psmi_cuCtxDetach)(CUcontext c); +extern CUresult (*psmi_cuCtxGetCurrent)(CUcontext *c); +extern CUresult (*psmi_cuCtxSetCurrent)(CUcontext c); +extern CUresult (*psmi_cuPointerGetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); +extern CUresult (*psmi_cuPointerSetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); +extern CUresult (*psmi_cuDeviceCanAccessPeer)(int *canAccessPeer, CUdevice dev, CUdevice peerDev); +extern CUresult (*psmi_cuDeviceGet)(CUdevice* device, int ordinal); +extern CUresult (*psmi_cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev); +extern CUresult (*psmi_cuDriverGetVersion)(int* driverVersion); +extern CUresult (*psmi_cuDeviceGetCount)(int* count); +extern CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags); +extern CUresult (*psmi_cuStreamDestroy)(CUstream phStream); +extern CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags); +extern CUresult (*psmi_cuEventDestroy)(CUevent hEvent); +extern CUresult (*psmi_cuEventQuery)(CUevent hEvent); +extern CUresult (*psmi_cuEventRecord)(CUevent hEvent, CUstream hStream); +extern CUresult (*psmi_cuEventSynchronize)(CUevent hEvent); +extern CUresult (*psmi_cuMemHostAlloc)(void** pp, size_t bytesize, unsigned int Flags); +extern CUresult (*psmi_cuMemFreeHost)(void* p); +extern CUresult (*psmi_cuMemcpy)(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); +extern CUresult (*psmi_cuMemcpyDtoD)(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); +extern CUresult (*psmi_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); +extern CUresult (*psmi_cuMemcpyHtoD)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount); +extern CUresult (*psmi_cuMemcpyDtoHAsync)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); +extern CUresult (*psmi_cuMemcpyHtoDAsync)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount, CUstream hStream); +extern CUresult (*psmi_cuIpcGetMemHandle)(CUipcMemHandle* pHandle, CUdeviceptr dptr); +extern CUresult (*psmi_cuIpcOpenMemHandle)(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags); +extern CUresult (*psmi_cuIpcCloseMemHandle)(CUdeviceptr dptr); +extern CUresult (*psmi_cuMemGetAddressRange)(CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr); +extern CUresult (*psmi_cuDevicePrimaryCtxGetState)(CUdevice dev, unsigned int* flags, int* active); +extern CUresult (*psmi_cuDevicePrimaryCtxRetain)(CUcontext* pctx, CUdevice dev); +extern CUresult (*psmi_cuCtxGetDevice)(CUdevice* device); +extern CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device); #define PSMI_CUDA_CALL(func, args...) do { \ CUresult cudaerr; \ @@ -358,6 +366,39 @@ CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device); } \ } while (0) +/** + * Similar to PSMI_CUDA_CALL() except does not error out + * if func(args) returns CUDA_SUCCESS or except_err + * + * Invoker must provide 'CUresult cudaerr' in invoked scope + * so invoker can inspect whether cudaerr == CUDA_SUCCESS or + * cudaerr == except_err after expanded code is executed. + * + * As except_err is an allowed value, message is printed at + * DBG level. + */ +#define PSMI_CUDA_CALL_EXCEPT(except_err, func, args...) 
do { \
+	cudaerr = psmi_##func(args); \
+	if (cudaerr != CUDA_SUCCESS && cudaerr != except_err) { \
+		if (ctxt == NULL) \
+			_HFI_ERROR( \
+				"Check if CUDA is initialized " \
+				"before psm2_ep_open call \n"); \
+		_HFI_ERROR( \
+			"CUDA failure: %s() (at %s:%d) " \
+			"returned %d\n", \
+			#func, __FILE__, __LINE__, cudaerr); \
+		psmi_handle_error( \
+			PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \
+			"Error returned from CUDA function.\n");\
+	} else if (cudaerr == except_err) { \
+		_HFI_DBG( \
+			"CUDA non-zero return value: %s() (at %s:%d) " \
+			"returned %d\n", \
+			#func, __FILE__, __LINE__, cudaerr); \
+	} \
+} while (0)
+
 #define PSMI_CUDA_CHECK_EVENT(event, cudaerr) do { \
 	cudaerr = psmi_cuEventQuery(event); \
 	if ((cudaerr != CUDA_SUCCESS) && \
@@ -383,7 +424,7 @@ CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device);
 PSMI_ALWAYS_INLINE(
 int
-_psmi_is_cuda_mem(void *ptr))
+_psmi_is_cuda_mem(const void *ptr))
 {
 	CUresult cres;
 	CUmemorytype mt;
@@ -401,14 +442,8 @@ _psmi_is_cuda_mem(void *ptr))
 		return 0;
 }
-PSMI_ALWAYS_INLINE(
-int
-_psmi_is_cuda_enabled())
-{
-	return is_cuda_enabled;
-}
-
-#define PSMI_IS_CUDA_ENABLED _psmi_is_cuda_enabled()
+#define PSMI_IS_CUDA_ENABLED likely(is_cuda_enabled)
+#define PSMI_IS_CUDA_DISABLED unlikely(!is_cuda_enabled)
 PSMI_ALWAYS_INLINE(
 int
@@ -480,8 +515,28 @@ enum psm2_chb_match_type {
 };
 typedef enum psm2_chb_match_type psm2_chb_match_type_t;
+/*
+ * CUDA documentation dictates the use of SYNC_MEMOPS attribute
+ * when the buffer pointer received into PSM has been allocated
+ * by the application. This guarantees that all memory operations
+ * to this region of memory (used by multiple layers of the stack)
+ * always synchronize.
+ */
+static inline
+void psmi_cuda_set_attr_sync_memops(const void *ubuf)
+{
+	int true_flag = 1;
+
+	PSMI_CUDA_CALL(cuPointerSetAttribute, &true_flag,
+		       CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr) ubuf);
+}
+
 #endif /* PSM_CUDA */
 #define COMPILE_TIME_ASSERT(NAME,COND) extern char NAME[1/ COND]
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
 #endif /* _PSMI_USER_H */
diff --git a/psm_utils.c b/psm_utils.c
index 521467f..7f7995d 100644
--- a/psm_utils.c
+++ b/psm_utils.c
@@ -196,6 +196,30 @@ void *psmi_epid_remove(psm2_ep_t ep, psm2_epid_t epid)
 	return psmi_epid_lookup_inner(ep, epid, 1);
 }
+void psmi_epid_remove_all(psm2_ep_t ep)
+{
+	size_t i;
+	struct psmi_epid_tabentry *e;
+
+	pthread_mutex_lock(&psmi_epid_table.tablock);
+
+	for (i = 0; i < psmi_epid_table.tabsize; i++) {
+		e = &psmi_epid_table.table[i];
+
+		if (e->entry == NULL || e->entry == EPADDR_DELETED)
+			continue;
+
+		if (e->ep == ep) {
+			/* unspecified fields implicitly zeroed */
+			*e = (struct psmi_epid_tabentry) {
+				.entry = EPADDR_DELETED
+			};
+		}
+	}
+
+	pthread_mutex_unlock(&psmi_epid_table.tablock);
+}
+
 psm2_error_t psmi_epid_add(psm2_ep_t ep, psm2_epid_t epid, void *entry)
 {
 	uint64_t key;
@@ -262,17 +286,29 @@ fail:
 	return err;
 }
+static psmi_lock_t psmi_gethostname_lock;
+
+static void __attribute__ ((constructor)) __psmi_gethostname_lock_constructor(void)
+{
+	psmi_init_lock(&psmi_gethostname_lock);
+}
+
 char *psmi_gethostname(void)
 {
-	/* XXX this will need a lock in a multi-threaded environment */
 	static char hostname[80] = { '\0' };
 	char *c;
 	if (hostname[0] == '\0') {
-		gethostname(hostname, sizeof(hostname));
-		hostname[sizeof(hostname) - 1] = '\0';	/* no guarantee of nul termination */
-		if ((c = strchr(hostname, '.')))
-			*c = '\0';
+		PSMI_LOCK(psmi_gethostname_lock);
+		/* CRITICAL SECTION START */
+		if (hostname[0] == '\0') {
+			gethostname(hostname, sizeof(hostname));
+			hostname[sizeof(hostname) - 1] = '\0';	/* no guarantee of nul termination */
+			if ((c = strchr(hostname, '.')))
+				*c = '\0';
+		}
+		PSMI_UNLOCK(psmi_gethostname_lock);
+		/* CRITICAL SECTION END */
 	}
 	return hostname;
@@ -817,6 +853,8 @@ void psmi_multi_ep_init()
 	psmi_multi_ep_enabled = env_fi.e_uint;
 }
+#ifdef PSM_FI
+
 int psmi_faultinj_enabled = 0;
 int psmi_faultinj_verbose = 0;
 char *psmi_faultinj_outfile = NULL;
@@ -984,6 +1022,8 @@ int psmi_faultinj_is_fault(struct psmi_faultinj_spec *fi)
 	return 0;
 }
+#endif /* #ifdef PSM_FI */
+
 /* For memory allocation, we kind of break the PSM error handling rules.
 * If the caller gets NULL, it has to assume that the error has been handled
 * and should always return PSM2_NO_MEMORY */
@@ -1237,6 +1277,35 @@ void _psmi_heapdebug_val_heapallocs(const char *curloc)
 	}
 }
+/* psmi_heapdebug_finalize() validates the heap and then emits all of the
+   allocations to stdout, to help debug heap memory leaks. */
+void psmi_heapdebug_finalize(void)
+{
+	/* First validate the existing heap allocations: */
+
+	psmi_heapdebug_val_heapallocs();
+
+	printf("orphaned heap allocations: %d\n", n_allocations);
+
+	if (n_allocations > 0)
+	{
+		/* Now, emit all of the allocations to stdout. */
+
+		HD_Header_Type *p = HD_root_of_list;
+
+		while (p)
+		{
+			printf("orphaned heap allocation: %p allocated at: %s, size: %lu\n",
+				p, p->allocLoc, p->sizeOfAlloc);
+
+			p = p->nextHD_header;
+		}
+		fflush(0);
+		/* Abort if any allocations still exist: */
+		abort();
+	}
+}
+
 /* hd_est_hdr_trlr() establishes the new allocation to the singly linked list, and adds
 * the header and trailer to the allocation. Lastly, it validates the existing singly-linked
 * list for integrity. */
@@ -1246,15 +1315,9 @@ static void hd_est_hdr_trlr(HD_Header_Type *hd_alloc,
				uint64_t actualSize,
				const char *curloc)
 {
-#if 0
-	/* if we use this block of code, psm hangs running mpistress. See JIRA STL-5244. */
+	/* First, write HD_NO_MANS_LAND to the entire allocation: */
 	memset(systemAlloc,HD_NO_MANS_LAND,systemSize);
-#else
-	/* write HD_NO_MANS_LAND to the area between the system allocation and the start of the hd header. */
-	signed char *pchr = systemAlloc;
-	for (;pchr < (signed char*) hd_alloc;pchr++)
-		*pchr = (signed char) HD_NO_MANS_LAND;
-#endif
+
 	/* Write the HD header info: */
 	memcpy(hd_alloc->magic1,HD_HDR_MGC_1,sizeof(HD_HDR_MGC_1));
 	hd_alloc->allocLoc = curloc;
diff --git a/psm_utils.h b/psm_utils.h
index fc38153..0c58307 100644
--- a/psm_utils.h
+++ b/psm_utils.h
@@ -87,6 +87,7 @@ psm2_error_t psmi_epid_init();
 psm2_error_t psmi_epid_fini();
 void *psmi_epid_lookup(psm2_ep_t ep, psm2_epid_t epid);
 void *psmi_epid_remove(psm2_ep_t ep, psm2_epid_t epid);
+void psmi_epid_remove_all(psm2_ep_t ep);
 psm2_error_t psmi_epid_add(psm2_ep_t ep, psm2_epid_t epid, void *entry);
 #define PSMI_EP_HOSTNAME ((psm2_ep_t) -1)	/* Special endpoint handle we use
						 * to register hostnames */
@@ -182,9 +183,17 @@ void _psmi_heapdebug_val_heapallocs(const char *curloc);
 #define psmi_heapdebug_val_heapallocs() _psmi_heapdebug_val_heapallocs(PSMI_CURLOC)
+/* Finalize the heapdebug functionality after tear down of the psm
+   session when you are certain that all heap allocations have been
+   freed. psmi_heapdebug_finalize() will emit all of the extant
+   heap allocations and abort if there are any. This is to aid
+   in debug of heap leaks.
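+   A minimal usage sketch (the call site here is an assumption for
+   illustration, not something this patch adds): a debug build could do
+       psm2_finalize();
+       psmi_heapdebug_finalize(); /* prints any leaked blocks, aborts if found */
+   once every endpoint is closed and the library has been torn down.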
*/ +void psmi_heapdebug_finalize(void); + #else #define psmi_heapdebug_val_heapallocs() /* nothing */ +#define psmi_heapdebug_finalize() /* nothing */ #endif @@ -320,7 +329,7 @@ uint32_t psmi_crc(unsigned char *buf, int len); /* * Global model so we can tune defaults better for specific cpu's */ -uint32_t psmi_cpu_model; +extern uint32_t psmi_cpu_model; /* * Diagnostics, all in psm_diags.c @@ -333,6 +342,7 @@ int psmi_diags(void); extern int psmi_multi_ep_enabled; void psmi_multi_ep_init(); +#ifdef PSM_FI /* * Fault injection */ @@ -354,6 +364,7 @@ struct psmi_faultinj_spec *psmi_faultinj_getspec(char *spec_name, (var) = psmi_faultinj_getspec((spec_name), (num), (denom)); int psmi_faultinj_is_fault(struct psmi_faultinj_spec *spec); +#endif /* #ifdef PSM_FI */ /* * PSM core component set/get options */ diff --git a/ptl_am/am_cuda_memhandle_cache.c b/ptl_am/am_cuda_memhandle_cache.c index 8406a37..730562d 100644 --- a/ptl_am/am_cuda_memhandle_cache.c +++ b/ptl_am/am_cuda_memhandle_cache.c @@ -55,25 +55,139 @@ #include "psm_user.h" #include "am_cuda_memhandle_cache.h" -#define RBTREE_GET_LEFTMOST(PAYLOAD_PTR) ((PAYLOAD_PTR)->start) -#define RBTREE_GET_RIGHTMOST(PAYLOAD_PTR) ((PAYLOAD_PTR)->start+((PAYLOAD_PTR)->length)) + +/* + * rbtree cruft + */ +struct _cl_map_item; + +typedef struct +{ + unsigned long start; /* start virtual address */ + CUipcMemHandle cuda_ipc_handle; /* cuda ipc mem handle */ + CUdeviceptr cuda_ipc_dev_ptr;/* Cuda device pointer */ + uint16_t length; /* length*/ + psm2_epid_t epid; + struct _cl_map_item* i_prev; /* idle queue previous */ + struct _cl_map_item* i_next; /* idle queue next */ +}__attribute__ ((aligned (128))) rbtree_cuda_memhandle_cache_mapitem_pl_t; + +typedef struct { + uint32_t nelems; /* number of elements in the cache */ +} rbtree_cuda_memhandle_cache_map_pl_t; + +static psm2_error_t am_cuda_memhandle_mpool_init(uint32_t memcache_size); + +/* + * Custom comparator + */ +typedef rbtree_cuda_memhandle_cache_mapitem_pl_t cuda_cache_item; + +static int cuda_cache_key_cmp(const cuda_cache_item *a, const cuda_cache_item *b) +{ + // When multi-ep is disabled, cache can assume + // 1 epid == 1 remote process == 1 CUDA address space + // But when multi-ep is enabled, one process can have many epids, so in this case + // cannot use epid as part of cache key. 
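+	// For illustration (addresses assumed): with cached ranges
+	// [0x1000,0x1100) and [0x2000,0x2010) and matching epids, a key
+	// {start=0x10f0, length=0x20} overlaps the first range, so this
+	// comparator reports it as equal (returns 0) and reports the key
+	// as less than the second range (returns -1). Overlap, not exact
+	// identity, is what counts as a match here; the exact fields are
+	// checked afterwards by am_cuda_memhandle_cache_validate().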
+	if (!psmi_multi_ep_enabled) {
+		if (a->epid < b->epid)
+			return -1;
+		if (a->epid > b->epid)
+			return 1;
+	}
+
+	unsigned long a_end, b_end;
+	// normalize into inclusive upper bounds to handle
+	// 0-length entries
+	a_end = (a->start + a->length);
+	b_end = (b->start + b->length);
+	if (a->length > 0)
+		a_end--;
+
+	if (b->length > 0)
+		b_end--;
+
+	if (a_end < b->start)
+		return -1;
+	if (b_end < a->start)
+		return 1;
+
+	return 0;
+}
+
+
+/*
+ * Necessary rbtree cruft
+ */
+#define RBTREE_MI_PL rbtree_cuda_memhandle_cache_mapitem_pl_t
+#define RBTREE_MAP_PL rbtree_cuda_memhandle_cache_map_pl_t
+#define RBTREE_CMP(a,b) cuda_cache_key_cmp((a), (b))
 #define RBTREE_ASSERT psmi_assert
 #define RBTREE_MAP_COUNT(PAYLOAD_PTR) ((PAYLOAD_PTR)->nelems)
+#define RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR
+#include "rbtree.h"
 #include "rbtree.c"
-#ifdef PSM_DEBUG
-static int cache_hit_counter;
-static int cache_miss_counter;
-#endif
+/*
+ * Convenience rbtree cruft
+ */
+#define NELEMS cuda_memhandle_cachemap.payload.nelems
+
+#define IHEAD cuda_memhandle_cachemap.root
+#define LAST IHEAD->payload.i_prev
+#define FIRST IHEAD->payload.i_next
+#define INEXT(x) x->payload.i_next
+#define IPREV(x) x->payload.i_prev
+
+/*
+ * Actual module data
+ */
+static cl_qmap_t cuda_memhandle_cachemap; /* Global cache */
+static uint8_t cuda_memhandle_cache_enabled;
+static mpool_t cuda_memhandle_mpool;
+static uint32_t cuda_memhandle_cache_size;
+
+static uint64_t cache_hit_counter;
+static uint64_t cache_miss_counter;
+static uint64_t cache_evict_counter;
+static uint64_t cache_collide_counter;
+static uint64_t cache_clear_counter;
+
+static void print_cuda_memhandle_cache_stats(void)
+{
+	_HFI_DBG("enabled=%u,size=%u,hit=%lu,miss=%lu,evict=%lu,collide=%lu,clear=%lu\n",
+		cuda_memhandle_cache_enabled, cuda_memhandle_cache_size,
+		cache_hit_counter, cache_miss_counter,
+		cache_evict_counter, cache_collide_counter, cache_clear_counter);
+}
+
+/*
+ * This is the callback function invoked when the mempool is resized or
+ * destroyed. Upon cache fini the mpool is destroyed, which in turn calls
+ * this callback, which closes all remaining memhandles.
+ */
+static void
+psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj)
+{
+	cl_map_item_t* memcache_item = (cl_map_item_t*)obj;
+	if (!is_alloc) {
+		if(memcache_item->payload.start)
+			PSMI_CUDA_CALL(cuIpcCloseMemHandle,
+				       memcache_item->payload.cuda_ipc_dev_ptr);
+	}
+}
 /*
 * Creating mempool for cuda memhandle cache nodes.
 */
-psm2_error_t
+static psm2_error_t
 am_cuda_memhandle_mpool_init(uint32_t memcache_size)
 {
 	psm2_error_t err;
+	if (memcache_size < 1)
+		return PSM2_PARAM_ERR;
+
 	cuda_memhandle_cache_size = memcache_size;
 	/* Creating a memory pool of size PSM2_CUDA_MEMCACHE_SIZE
 	 * which includes the Root and NIL items
 	 */
@@ -95,38 +209,58 @@ am_cuda_memhandle_mpool_init(uint32_t memcache_size)
 /*
 * Initialize rbtree.
*/ -psm2_error_t am_cuda_memhandle_cache_map_init() +psm2_error_t am_cuda_memhandle_cache_init(uint32_t memcache_size) { + psm2_error_t err = am_cuda_memhandle_mpool_init(memcache_size); + if (err != PSM2_OK) + return err; + cl_map_item_t *root, *nil_item; root = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); if (root == NULL) return PSM2_NO_MEMORY; nil_item = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); - if (nil_item == NULL) + if (nil_item == NULL) { + psmi_free(root); return PSM2_NO_MEMORY; + } + nil_item->payload.start = 0; nil_item->payload.epid = 0; nil_item->payload.length = 0; cuda_memhandle_cache_enabled = 1; ips_cl_qmap_init(&cuda_memhandle_cachemap,root,nil_item); NELEMS = 0; + + cache_hit_counter = 0; + cache_miss_counter = 0; + cache_evict_counter = 0; + cache_collide_counter = 0; + cache_clear_counter = 0; + return PSM2_OK; } void am_cuda_memhandle_cache_map_fini() { -#ifdef PSM_DEBUG - _HFI_DBG("cache hit counter: %d\n", cache_hit_counter); - _HFI_DBG("cache miss counter: %d\n", cache_miss_counter); -#endif + print_cuda_memhandle_cache_stats(); - if (cuda_memhandle_cachemap.nil_item) + if (cuda_memhandle_cachemap.nil_item) { psmi_free(cuda_memhandle_cachemap.nil_item); - if (cuda_memhandle_cachemap.root) + cuda_memhandle_cachemap.nil_item = NULL; + } + + if (cuda_memhandle_cachemap.root) { psmi_free(cuda_memhandle_cachemap.root); - if (cuda_memhandle_cache_enabled) + cuda_memhandle_cachemap.root = NULL; + } + + if (cuda_memhandle_cache_enabled) { psmi_mpool_destroy(cuda_memhandle_mpool); - return; + cuda_memhandle_cache_enabled = 0; + } + + cuda_memhandle_cache_size = 0; } /* @@ -143,6 +277,7 @@ am_cuda_idleq_insert(cl_map_item_t* memcache_item) INEXT(FIRST) = memcache_item; IPREV(memcache_item) = FIRST; FIRST = memcache_item; + INEXT(FIRST) = NULL; return; } @@ -155,11 +290,13 @@ am_cuda_idleq_remove_last(cl_map_item_t* memcache_item) if (!INEXT(memcache_item)) { LAST = NULL; FIRST = NULL; - return; + } else { + LAST = INEXT(memcache_item); + IPREV(LAST) = NULL; } - LAST = INEXT(memcache_item); - IPREV(LAST) = NULL; - return; + // Null-out now-removed memcache_item's next and prev pointers out of + // an abundance of caution + INEXT(memcache_item) = IPREV(memcache_item) = NULL; } static void @@ -167,15 +304,16 @@ am_cuda_idleq_remove(cl_map_item_t* memcache_item) { if (LAST == memcache_item) { am_cuda_idleq_remove_last(memcache_item); - return; - } - if (INEXT(memcache_item) == NULL) { - INEXT(IPREV(memcache_item)) = NULL; - return; + } else if (FIRST == memcache_item) { + FIRST = IPREV(memcache_item); + INEXT(FIRST) = NULL; + } else { + INEXT(IPREV(memcache_item)) = INEXT(memcache_item); + IPREV(INEXT(memcache_item)) = IPREV(memcache_item); } - INEXT(IPREV(memcache_item)) = INEXT(memcache_item); - IPREV(INEXT(memcache_item)) = IPREV(memcache_item); - return; + // Null-out now-removed memcache_item's next and prev pointers out of + // an abundance of caution + INEXT(memcache_item) = IPREV(memcache_item) = NULL; } static void @@ -207,10 +345,14 @@ am_cuda_memhandle_cache_validate(cl_map_item_t* memcache_item, && epid == memcache_item->payload.epid) { return PSM2_OK; } + _HFI_DBG("cache collision: new entry start=%lu,length=%u\n", sbuf, length); + + cache_collide_counter++; ips_cl_qmap_remove_item(&cuda_memhandle_cachemap, memcache_item); PSMI_CUDA_CALL(cuIpcCloseMemHandle, memcache_item->payload.cuda_ipc_dev_ptr); am_cuda_idleq_remove(memcache_item); + memset(memcache_item, 0, sizeof(*memcache_item)); 
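+	/* Zeroing the recycled item also clears payload.start, so the
+	 * mpool cleanup callback above will not attempt to close this
+	 * already-closed handle again when the pool is destroyed. */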
 	psmi_mpool_put(memcache_item);
 	return PSM2_OK_NO_PROGRESS;
 }
@@ -219,14 +361,18 @@
 * Current eviction policy: Least Recently Used.
 */
 static void
-am_cuda_memhandle_cache_evict()
+am_cuda_memhandle_cache_evict(void)
 {
+	cache_evict_counter++;
 	cl_map_item_t *p_item = LAST;
+	_HFI_VDBG("Removing (epid=%lu,start=%lu,length=%u,dev_ptr=0x%llX,it=%p) from cuda_memhandle_cachemap.\n",
+			p_item->payload.epid, p_item->payload.start, p_item->payload.length,
+			p_item->payload.cuda_ipc_dev_ptr, p_item);
 	ips_cl_qmap_remove_item(&cuda_memhandle_cachemap, p_item);
 	PSMI_CUDA_CALL(cuIpcCloseMemHandle, p_item->payload.cuda_ipc_dev_ptr);
 	am_cuda_idleq_remove_last(p_item);
+	memset(p_item, 0, sizeof(*p_item));
 	psmi_mpool_put(p_item);
-	return;
 }
 static psm2_error_t
@@ -236,6 +382,7 @@ am_cuda_memhandle_cache_register(uintptr_t sbuf, CUipcMemHandle* handle,
 {
 	if (NELEMS == cuda_memhandle_cache_size)
 		am_cuda_memhandle_cache_evict();
+
 	cl_map_item_t* memcache_item = psmi_mpool_get(cuda_memhandle_mpool);
 	/* memcache_item cannot be NULL as we evict
 	 * before the call to mpool_get. Check has
@@ -253,6 +400,15 @@
 	return PSM2_OK;
 }
+static void am_cuda_memhandle_cache_clear(void)
+{
+	_HFI_DBG("Closing all handles, clearing cuda_memhandle_cachemap and idleq. NELEMS=%u\n", NELEMS);
+	while (NELEMS) {
+		am_cuda_memhandle_cache_evict();
+	}
+	_HFI_DBG("Closed all handles, cleared cuda_memhandle_cachemap and idleq. NELEMS=%u\n", NELEMS);
+}
+
 /*
 * The key used to search the cache is the senders buf address pointer.
 * Upon a succesful hit in the cache, additional validation is required
@@ -262,36 +418,67 @@ CUdeviceptr
 am_cuda_memhandle_acquire(uintptr_t sbuf, CUipcMemHandle* handle,
				uint32_t length, psm2_epid_t epid)
 {
+	_HFI_VDBG("sbuf=%lu,handle=%p,length=%u,epid=%lu\n",
+			sbuf, handle, length, epid);
+
 	CUdeviceptr cuda_ipc_dev_ptr;
-	if(cuda_memhandle_cache_enabled) {
-		cl_qmap_t *p_map = &cuda_memhandle_cachemap;
-		cl_map_item_t *p_item;
-		unsigned long start = (unsigned long)sbuf;
-		unsigned long end = start + length;
-		p_item = ips_cl_qmap_search(p_map, start, end);
-		if (p_item->payload.start) {
-			if (am_cuda_memhandle_cache_validate(p_item, sbuf,
-						handle, length, epid) == PSM2_OK) {
-#ifdef PSM_DEBUG
-				cache_hit_counter++;
-#endif
-				am_cuda_idleq_reorder(p_item);
-				return p_item->payload.cuda_ipc_dev_ptr;
-			}
-		}
-#ifdef PSM_DEBUG
-		cache_miss_counter++;
-#endif
-		PSMI_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr,
-				*handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
-		am_cuda_memhandle_cache_register(sbuf, handle,
-				length, epid, cuda_ipc_dev_ptr);
-		return cuda_ipc_dev_ptr;
-	} else {
+	if(!cuda_memhandle_cache_enabled) {
 		PSMI_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr,
				*handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
 		return cuda_ipc_dev_ptr;
 	}
+
+	cuda_cache_item key = {
+		.start = (unsigned long) sbuf,
+		.length = length,
+		.epid = epid
+	};
+
+	/*
+	 * preconditions:
+	 *  1) newrange [start,end) may or may not be in cachemap already
+	 *  2) there are no overlapping address ranges in cachemap
+	 * postconditions:
+	 *  1) newrange is in cachemap
+	 *  2) there are no overlapping address ranges in cachemap
+	 *
+	 * The key used to search the cache is the sender's buf address pointer.
+	 * Upon a successful hit in the cache, additional validation is required
+	 * as multiple senders could potentially send the same buf address value.
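+	 * The search/validate loop below restores postcondition (2): every
+	 * stale cached range that overlaps the new one is closed and removed
+	 * before the new range is registered.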
+ */ + cl_map_item_t *p_item = ips_cl_qmap_searchv(&cuda_memhandle_cachemap, &key); + while (p_item->payload.start) { + // Since a precondition is that there are no overlapping ranges in cachemap, + // an exact match implies no need to check further + if (am_cuda_memhandle_cache_validate(p_item, sbuf, handle, length, epid) == PSM2_OK) { + cache_hit_counter++; + am_cuda_idleq_reorder(p_item); + return p_item->payload.cuda_ipc_dev_ptr; + } + + // newrange is not in the cache and overlaps at least one existing range. + // am_cuda_memhandle_cache_validate() closed and removed existing range. + // Continue searching for more overlapping ranges + p_item = ips_cl_qmap_searchv(&cuda_memhandle_cachemap, &key); + } + cache_miss_counter++; + + CUresult cudaerr; + PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_ALREADY_MAPPED, cuIpcOpenMemHandle, + &cuda_ipc_dev_ptr, *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); + + if (cudaerr == CUDA_ERROR_ALREADY_MAPPED) { + // remote memory already mapped. Close all handles, clear cache, + // and try again + am_cuda_memhandle_cache_clear(); + cache_clear_counter++; + PSMI_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr, *handle, + CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); + } + + am_cuda_memhandle_cache_register(sbuf, handle, + length, epid, cuda_ipc_dev_ptr); + return cuda_ipc_dev_ptr; } void @@ -302,20 +489,4 @@ am_cuda_memhandle_release(CUdeviceptr cuda_ipc_dev_ptr) return; } -/* - * This is the callback function when mempool are resized or destroyed. - * Upon calling cache fini mpool is detroyed which in turn calls this callback - * which helps in closing all memhandles. - */ -void -psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj) -{ - cl_map_item_t* memcache_item = (cl_map_item_t*)obj; - if (!is_alloc) { - if(memcache_item->payload.start) - PSMI_CUDA_CALL(cuIpcCloseMemHandle, - memcache_item->payload.cuda_ipc_dev_ptr); - } -} - #endif diff --git a/ptl_am/am_cuda_memhandle_cache.h b/ptl_am/am_cuda_memhandle_cache.h index 494de32..2b1dbc0 100644 --- a/ptl_am/am_cuda_memhandle_cache.h +++ b/ptl_am/am_cuda_memhandle_cache.h @@ -56,58 +56,16 @@ #ifndef _AM_CUDA_MEMHANDLE_CACHE_H #define _AM_CUDA_MEMHANDLE_CACHE_H -#include -#include +#include "psm_user.h" #include -#include - -struct _cl_map_item; - -typedef struct -{ - unsigned long start; /* start virtual address */ - CUipcMemHandle cuda_ipc_handle; /* cuda ipc mem handle */ - CUdeviceptr cuda_ipc_dev_ptr;/* Cuda device pointer */ - uint16_t length; /* length*/ - psm2_epid_t epid; - struct _cl_map_item* i_prev; /* idle queue previous */ - struct _cl_map_item* i_next; /* idle queue next */ -}__attribute__ ((aligned (128))) rbtree_cuda_memhandle_cache_mapitem_pl_t; - -typedef struct { - uint32_t nelems; /* number of elements in the cache */ -} rbtree_cuda_memhandle_cache_map_pl_t; - -#define RBTREE_MI_PL rbtree_cuda_memhandle_cache_mapitem_pl_t -#define RBTREE_MAP_PL rbtree_cuda_memhandle_cache_map_pl_t - -#include "rbtree.h" - -cl_qmap_t cuda_memhandle_cachemap; /* Global cache */ -uint8_t cuda_memhandle_cache_enabled; -mpool_t cuda_memhandle_mpool; -uint32_t cuda_memhandle_cache_size; -#define CUDA_MEMHANDLE_CACHE_SIZE 64 - -/* - * Macro definition for easy programming. - */ - -#define NELEMS cuda_memhandle_cachemap.payload.nelems - -/* - * Macro for idle queue management. 
- */ -#define IHEAD cuda_memhandle_cachemap.root -#define LAST IHEAD->payload.i_prev -#define FIRST IHEAD->payload.i_next -#define INEXT(x) x->payload.i_next -#define IPREV(x) x->payload.i_prev +#ifdef __cplusplus +extern "C" { +#endif -psm2_error_t am_cuda_memhandle_mpool_init(uint32_t memcache_size); +#define CUDA_MEMHANDLE_CACHE_SIZE 64 -psm2_error_t am_cuda_memhandle_cache_map_init(); +psm2_error_t am_cuda_memhandle_cache_init(uint32_t memcache_size); CUdeviceptr am_cuda_memhandle_acquire(uintptr_t sbuf, CUipcMemHandle* handle, @@ -115,10 +73,12 @@ am_cuda_memhandle_acquire(uintptr_t sbuf, CUipcMemHandle* handle, void am_cuda_memhandle_release(CUdeviceptr cuda_ipc_dev_ptr); -void psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj); - void am_cuda_memhandle_cache_map_fini(); +#ifdef __cplusplus +} /* extern "C" */ #endif -#endif +#endif /* _AM_CUDA_MEMHANDLE_CACHE_H */ + +#endif /* PSM_CUDA */ diff --git a/ptl_am/am_reqrep_shmem.c b/ptl_am/am_reqrep_shmem.c index 95973c9..9be72f9 100644 --- a/ptl_am/am_reqrep_shmem.c +++ b/ptl_am/am_reqrep_shmem.c @@ -144,23 +144,38 @@ static inline uintptr_t am_ctl_sizeof_block() #undef _PA +static uint32_t create_extra_ep_data() +{ + uint32_t ret = getpid(); + +#ifdef PSM_CUDA + /* PID is at maximum 22 bits */ + ret |= my_gpu_device << 22; +#endif + + return ret; +} + +static void read_extra_ep_data(uint32_t data, uint32_t *pid, uint32_t *gpu) +{ + uint32_t pid_mask = (1 << 22) - 1; + + *pid = data & pid_mask; + *gpu = (data & ~pid_mask) >> 22; +} + static void am_update_directory(struct am_ctl_nodeinfo *); static void amsh_atexit() { - static pthread_mutex_t mutex_once = PTHREAD_MUTEX_INITIALIZER; - static int atexit_once; + static ips_atomic_t atexit_once = { 0 }; psm2_ep_t ep; struct ptl_am *ptl; - pthread_mutex_lock(&mutex_once); - if (atexit_once) { - pthread_mutex_unlock(&mutex_once); + /* bail out if previous value is non-zero */ + if (ips_atomic_cmpxchg(&atexit_once, 0, 1) != 0) return; - } else - atexit_once = 1; - pthread_mutex_unlock(&mutex_once); ep = psmi_opened_endpoint; while (ep) { @@ -240,7 +255,7 @@ psm2_error_t psmi_shm_create(ptl_t *ptl_gen) size_t segsz; psm2_error_t err = PSM2_OK; int shmfd = -1; - char *amsh_keyname; + char *amsh_keyname = NULL; int iterator; /* Get which kassist mode to use. 
*/ ptl->psmi_kassist_mode = psmi_get_kassist_mode(); @@ -269,6 +284,8 @@ psm2_error_t psmi_shm_create(ptl_t *ptl_gen) shmfd = shm_open(amsh_keyname, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); if (shmfd < 0) { + psmi_free(amsh_keyname); + amsh_keyname = NULL; if (errno == EACCES && iterator < INT_MAX) continue; else { @@ -301,6 +318,7 @@ psm2_error_t psmi_shm_create(ptl_t *ptl_gen) } } if (err) { + if (amsh_keyname) psmi_free(amsh_keyname); err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, "Error creating shared memory object " @@ -328,6 +346,7 @@ psm2_error_t psmi_shm_create(ptl_t *ptl_gen) err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, "Error mmapping shared memory: %s", strerror(errno)); + psmi_free(amsh_keyname); goto fail; } @@ -454,6 +473,7 @@ psm2_error_t psmi_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm "shared memory object " "with fstat: %s", strerror(errno)); + close(dest_shmfd); goto fail; } if (getuid() == st.st_uid) { @@ -480,6 +500,7 @@ psm2_error_t psmi_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, "Error mmapping remote shared memory: %s", strerror(errno)); + close(dest_shmfd); goto fail; } close(dest_shmfd); @@ -560,7 +581,8 @@ psm2_error_t psmi_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm if (shmidx == (uint16_t)-1) err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, - "Could not connect to local endpoint"); fail: + "Could not connect to local endpoint"); +fail: return err; } @@ -593,9 +615,10 @@ static psm2_error_t amsh_init_segment(ptl_t *ptl_gen) /* We core dump right after here if we don't check the mmap */ - struct sigaction act; - act.sa_sigaction = amsh_mmap_fault; - act.sa_flags = SA_SIGINFO; + struct sigaction act = { + .sa_sigaction = amsh_mmap_fault, + .sa_flags = SA_SIGINFO + }; sigaction(SIGSEGV, &act, &action_stash.SIGSEGV_old_act); sigaction(SIGBUS, &act, &action_stash.SIGBUS_old_act); @@ -1014,7 +1037,7 @@ amsh_ep_connreq_poll(ptl_t *ptl_gen, struct ptl_connection_req *req) req->args[0].u32w1 = ptl->connect_phase; req->args[1].u64w0 = (uint64_t) ptl->epid; psmi_assert(shmidx != (uint16_t)-1); - req->args[2].u32w0 = getpid(); + req->args[2].u32w0 = create_extra_ep_data(); req->args[2].u32w1 = PSM2_OK; req->args[3].u64w0 = (uint64_t) (uintptr_t) &req->errors[i]; @@ -1154,7 +1177,7 @@ amsh_ep_connreq_poll(ptl_t *ptl_gen, struct ptl_connection_req *req) req->args[0].u16w1 = shmidx; req->args[0].u32w1 = ptl->connect_phase; req->args[1].u64w0 = (uint64_t) ptl->epid; - req->args[2].u32w0 = getpid(); + req->args[2].u32w0 = create_extra_ep_data(); req->args[2].u32w1 = PSM2_OK; req->args[3].u64w0 = (uint64_t) (uintptr_t) &req->errors[i]; @@ -1958,171 +1981,94 @@ amsh_mq_rndv(ptl_t *ptl, psm2_mq_t mq, psm2_mq_req_t req, req->send_msgoff = 0; #ifdef PSM_CUDA - /* If the send buffer is on gpu, we create a cuda IPC - * handle and send it as payload in the RTS - */ - if (req->is_buf_gpu_mem) { - CUdeviceptr buf_base_ptr; - PSMI_CUDA_CALL(cuMemGetAddressRange, &buf_base_ptr, NULL, (CUdeviceptr)buf); - - /* Offset in GPU buffer from which we copy data, we have to - * send it separetly because this offset is lost - * when cuIpcGetMemHandle is called */ - req->cuda_ipc_offset = buf - (void*)buf_base_ptr; - args[2].u32w0 = (uint32_t)req->cuda_ipc_offset; - - PSMI_CUDA_CALL(cuIpcGetMemHandle, - &req->cuda_ipc_handle, - (CUdeviceptr) buf); - if (req->flags_internal & PSMI_REQ_FLAG_FASTPATH) { - psmi_am_reqq_add(AMREQUEST_SHORT, ptl, - epaddr, mq_handler_hidx, - 
args, 5, (void*)&req->cuda_ipc_handle, - sizeof(CUipcMemHandle), NULL, 0); - } else { - psmi_amsh_short_request(ptl, epaddr, mq_handler_hidx, - args, 5, (void*)&req->cuda_ipc_handle, - sizeof(CUipcMemHandle), 0); - } - req->cuda_ipc_handle_attached = 1; - } else -#endif + /* If the send buffer is on gpu, we create a cuda IPC + * handle and send it as payload in the RTS */ + if (req->is_buf_gpu_mem) { + CUdeviceptr buf_base_ptr; + PSMI_CUDA_CALL(cuMemGetAddressRange, &buf_base_ptr, NULL, (CUdeviceptr)buf); + + /* Offset in GPU buffer from which we copy data, we have to + * send it separetly because this offset is lost + * when cuIpcGetMemHandle is called */ + req->cuda_ipc_offset = buf - (void*)buf_base_ptr; + args[2].u32w0 = (uint32_t)req->cuda_ipc_offset; + + PSMI_CUDA_CALL(cuIpcGetMemHandle, + &req->cuda_ipc_handle, + (CUdeviceptr) buf); if (req->flags_internal & PSMI_REQ_FLAG_FASTPATH) { psmi_am_reqq_add(AMREQUEST_SHORT, ptl, - epaddr, mq_handler_hidx, - args, 5, NULL, 0, NULL, 0); + epaddr, mq_handler_hidx, + args, 5, (void*)&req->cuda_ipc_handle, + sizeof(CUipcMemHandle), NULL, 0); } else { psmi_amsh_short_request(ptl, epaddr, mq_handler_hidx, - args, 5, NULL, 0, 0); + args, 5, (void*)&req->cuda_ipc_handle, + sizeof(CUipcMemHandle), 0); } + req->cuda_ipc_handle_attached = 1; + } else +#endif + if (req->flags_internal & PSMI_REQ_FLAG_FASTPATH) { + psmi_am_reqq_add(AMREQUEST_SHORT, ptl, epaddr, mq_handler_hidx, + args, 5, NULL, 0, NULL, 0); + } else { + psmi_amsh_short_request(ptl, epaddr, mq_handler_hidx, + args, 5, NULL, 0, 0); + } + + mq->stats.tx_num++; + mq->stats.tx_shm_num++; + mq->stats.tx_rndv_num++; + mq->stats.tx_rndv_bytes += len; return err; } -/* - * All shared am mq sends, req can be NULL - */ PSMI_ALWAYS_INLINE( psm2_error_t -amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, - uint32_t flags_user, uint32_t flags_internal, psm2_mq_tag_t *tag, - const void *ubuf, uint32_t len)) +amsh_mq_send_inner_eager(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, + psm2_amarg_t *args, uint32_t flags_user, uint32_t flags_internal, + psm2_mq_tag_t *tag, const void *ubuf, uint32_t len)) { - psm2_amarg_t args[3]; - psm2_error_t err = PSM2_OK; - int is_blocking = (req == NULL); + uint32_t bytes_left = len; + uint32_t bytes_this = 0; -#ifdef PSM_CUDA - int gpu_mem; - /* All sends from a gpu buffer use the rendezvous protocol */ - if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)ubuf)) { - if (!PSMI_IS_CUDA_ENABLED) - psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - " Please enable PSM CUDA support when using GPU buffer \n"); - gpu_mem = 1; - goto do_rendezvous; - } else - gpu_mem = 0; -#endif + psm2_handler_t handler = mq_handler_hidx; + + args[1].u32w1 = tag->tag[0]; + args[1].u32w0 = tag->tag[1]; + args[2].u32w1 = tag->tag[2]; + args[2].u32w0 = 0; if (!flags_user && len <= AMLONG_MTU) { if (len <= 32) args[0].u32w0 = MQ_MSG_TINY; else args[0].u32w0 = MQ_MSG_SHORT; - args[1].u32w1 = tag->tag[0]; - args[1].u32w0 = tag->tag[1]; - args[2].u32w1 = tag->tag[2]; - - if (flags_internal & PSMI_REQ_FLAG_FASTPATH) { - psmi_am_reqq_add(AMREQUEST_SHORT, epaddr->ptlctl->ptl, - epaddr, mq_handler_hidx, - args, 3, (void *)ubuf, len, NULL, 0); - } else { - psmi_amsh_short_request(epaddr->ptlctl->ptl, epaddr, - mq_handler_hidx, args, 3, ubuf, len, 0); - } - } else if (flags_user & PSM2_MQ_FLAG_SENDSYNC) - goto do_rendezvous; - else if (len <= mq->shm_thresh_rv) { - uint32_t bytes_left = len; - uint32_t bytes_this = min(bytes_left, AMLONG_MTU); - uint8_t *buf = 
(uint8_t *) ubuf; + } else { args[0].u32w0 = MQ_MSG_EAGER; args[0].u32w1 = len; - args[1].u32w1 = tag->tag[0]; - args[1].u32w0 = tag->tag[1]; - args[2].u32w1 = tag->tag[2]; + } + + do { + args[2].u32w0 += bytes_this; + bytes_this = min(bytes_left, AMLONG_MTU); + + /* Assume that shared-memory active messages are delivered in order */ if (flags_internal & PSMI_REQ_FLAG_FASTPATH) { psmi_am_reqq_add(AMREQUEST_SHORT, epaddr->ptlctl->ptl, - epaddr, mq_handler_hidx, - args, 3, buf, bytes_this, NULL, 0); + epaddr, handler, args, 3, (void *)ubuf, + bytes_this, NULL, 0); } else { psmi_amsh_short_request(epaddr->ptlctl->ptl, epaddr, - mq_handler_hidx, args, 3, buf, - bytes_this, 0); + handler, args, 3, ubuf, bytes_this, 0); } - bytes_left -= bytes_this; - buf += bytes_this; - args[2].u32w0 = 0; - while (bytes_left) { - args[2].u32w0 += bytes_this; - bytes_this = min(bytes_left, AMLONG_MTU); - /* Here we kind of bend the rules, and assume that shared-memory - * active messages are delivered in order */ - if (flags_internal & PSMI_REQ_FLAG_FASTPATH) { - psmi_am_reqq_add(AMREQUEST_SHORT, epaddr->ptlctl->ptl, - epaddr, mq_handler_data_hidx, - args, 3, buf, bytes_this, NULL, 0); - } else { - psmi_amsh_short_request(epaddr->ptlctl->ptl, epaddr, - mq_handler_data_hidx, args, - 3, buf, bytes_this, 0); - } - buf += bytes_this; - bytes_left -= bytes_this; - } - } else { -do_rendezvous: - if (is_blocking) { - req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND); - if_pf(req == NULL) - return PSM2_NO_MEMORY; - req->req_data.send_msglen = len; - req->req_data.tag = *tag; - - /* Since SEND command is blocking, this request is - * entirely internal and we will not be exposed to user. - * Setting as internal so it will not be added to - * mq->completed_q */ - req->flags_internal |= (flags_internal | PSMI_REQ_FLAG_IS_INTERNAL); - } -#ifdef PSM_CUDA - /* CUDA documentation dictates the use of SYNC_MEMOPS attribute - * when the buffer pointer received into PSM has been allocated - * by the application. This guarantees the all memory operations - * to this region of memory (used by multiple layers of the stack) - * always synchronize - */ - if (gpu_mem) { - int trueflag = 1; - PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag, - CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, - (CUdeviceptr)ubuf); - req->is_buf_gpu_mem = 1; - } else - req->is_buf_gpu_mem = 0; -#endif - - err = - amsh_mq_rndv(epaddr->ptlctl->ptl, mq, req, epaddr, tag, - ubuf, len); - if (err == PSM2_OK && is_blocking) { /* wait... 
*/ - err = psmi_mq_wait_internal(&req); - } - return err; /* skip eager accounting below */ - } + ubuf += bytes_this; + bytes_left -= bytes_this; + handler = mq_handler_data_hidx; + } while(bytes_left); /* All eager async sends are always "all done" */ if (req != NULL) { @@ -2135,6 +2081,98 @@ do_rendezvous: mq->stats.tx_eager_num++; mq->stats.tx_eager_bytes += len; + return PSM2_OK; +} + +/* + * All shared am mq sends, req can be NULL + */ +PSMI_ALWAYS_INLINE( +psm2_error_t +amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, + uint32_t flags_user, uint32_t flags_internal, psm2_mq_tag_t *tag, + const void *ubuf, uint32_t len)) +{ + psm2_amarg_t args[3]; + psm2_error_t err = PSM2_OK; + int is_blocking = (req == NULL); + +#ifdef PSM_CUDA + int gpu_mem = 0; + int ep_supports_p2p = (1 << ((am_epaddr_t *) epaddr)->gpuid) & gpu_p2p_supported; + + if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(ubuf)) { + gpu_mem = 1; + + /* All sends from a gpu buffer use the rendezvous protocol if p2p is supported */ + if (ep_supports_p2p) { + goto do_rendezvous; + } + + /* + * Use eager messages if P2P is unsupported between endpoints. + * Potentially use rendezvous with blocking requests only. + */ + if (!is_blocking) + goto do_eager; + } +#endif + if (flags_user & PSM2_MQ_FLAG_SENDSYNC) + goto do_rendezvous; + + if (len <= mq->shm_thresh_rv) +#ifdef PSM_CUDA +do_eager: +#endif + return amsh_mq_send_inner_eager(mq, req, epaddr, args, flags_user, + flags_internal, tag, ubuf, len); +do_rendezvous: + if (is_blocking) { + req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND); + if_pf(req == NULL) + return PSM2_NO_MEMORY; + req->req_data.send_msglen = len; + req->req_data.tag = *tag; + + /* Since SEND command is blocking, this request is + * entirely internal and we will not be exposed to user. + * Setting as internal so it will not be added to + * mq->completed_q */ + req->flags_internal |= (flags_internal | PSMI_REQ_FLAG_IS_INTERNAL); + } +#ifdef PSM_CUDA + void *host_buf = NULL; + + req->is_buf_gpu_mem = gpu_mem; + if (req->is_buf_gpu_mem) { + psmi_cuda_set_attr_sync_memops(ubuf); + + /* Use host buffer for blocking requests if GPU P2P is + * unsupported between endpoints. + * This will be only used with blocking requests. */ + if (!ep_supports_p2p) { + host_buf = psmi_malloc(epaddr->ptlctl->ep, UNDEFINED, len); + PSMI_CUDA_CALL(cuMemcpyDtoH, host_buf, (CUdeviceptr)ubuf, len); + + /* Reset is_buf_gpu_mem since host buffer is being used + * instead of one from GPU. */ + ubuf = host_buf; + req->is_buf_gpu_mem = 0; + } + } +#endif + + err = amsh_mq_rndv(epaddr->ptlctl->ptl, mq, req, epaddr, tag, ubuf, len); + + if (err == PSM2_OK && is_blocking) { /* wait... */ + err = psmi_mq_wait_internal(&req); + } + +#ifdef PSM_CUDA + if (err == PSM2_OK && host_buf) + psmi_free(host_buf); +#endif + return err; } @@ -2205,11 +2243,9 @@ const char *psmi_kassist_getmode(int mode) static int psmi_get_kassist_mode() { - int mode = PSMI_KASSIST_MODE_DEFAULT; - /* Cuda PSM only supports KASSIST_CMA_GET */ -#ifdef PSM_CUDA - mode = PSMI_KASSIST_CMA_GET; -#else + /* Cuda PSM2 supports only KASSIST_CMA_GET */ + int mode = PSMI_KASSIST_CMA_GET; +#ifndef PSM_CUDA union psmi_envvar_val env_kassist; if (!psmi_getenv("PSM2_KASSIST_MODE", @@ -2225,11 +2261,6 @@ int psmi_get_kassist_mode() mode = PSMI_KASSIST_CMA_GET; else mode = PSMI_KASSIST_OFF; - } else { - /* cma-get is the fastest, so it's the default. - Availability of CMA is checked in psmi_shm_create(); - if CMA is not available it falls back to 'none' there. 
*/ - mode = PSMI_KASSIST_CMA_GET; } #endif return mode; @@ -2253,7 +2284,8 @@ amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, int16_t return_shmidx = args[0].u16w1; psm2_error_t err = (psm2_error_t) args[2].u32w1; psm2_error_t *perr = (psm2_error_t *) (uintptr_t) args[3].u64w0; - int pid = args[2].u32w0; + unsigned int pid; + unsigned int gpuid; int force_remap = 0; psm2_epaddr_t epaddr; @@ -2266,6 +2298,7 @@ amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, /* We do this because it's an assumption below */ psmi_assert_always(buf == NULL && len == 0); + read_extra_ep_data(args[2].u32w0, &pid, &gpuid); _HFI_VDBG("Conn op=%d, phase=%d, epid=%llx, err=%d\n", op, phase, (unsigned long long)epid, err); @@ -2279,6 +2312,7 @@ amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, /* If old pid is unknown consider new pid the correct one */ if (((am_epaddr_t *) epaddr)->pid == AMSH_PID_UNKNOWN) { ((am_epaddr_t *) epaddr)->pid = pid; + ((am_epaddr_t *) epaddr)->gpuid = gpuid; } else { psmi_epid_remove(ptl->ep, epid); epaddr = NULL; @@ -2312,6 +2346,7 @@ amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, args_segoff); ((am_epaddr_t *) epaddr)->pid = pid; + ((am_epaddr_t *) epaddr)->gpuid = gpuid; } /* Rewrite args */ @@ -2320,7 +2355,7 @@ amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, /* and return our shmidx for the connecting process */ args[0].u16w1 = shmidx; args[1].u64w0 = (psm2_epid_t) ptl->epid; - args[2].u32w0 = getpid(); + args[2].u32w0 = create_extra_ep_data(); args[2].u32w1 = PSM2_OK; ((am_epaddr_t *) epaddr)->cstate_incoming = AMSH_CSTATE_INCOMING_ESTABLISHED; @@ -2539,10 +2574,7 @@ amsh_init(psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val) CUDA_MEMHANDLE_CACHE_SIZE, &env_memcache_size); - if ((err = am_cuda_memhandle_mpool_init(env_memcache_size.e_uint) - != PSM2_OK)) - goto fail; - if ((err = am_cuda_memhandle_cache_map_init() != PSM2_OK)) + if ((err = am_cuda_memhandle_cache_init(env_memcache_size.e_uint) != PSM2_OK)) goto fail; } #endif @@ -2640,6 +2672,10 @@ static psm2_error_t amsh_fini(ptl_t *ptl_gen, int force, uint64_t timeout_ns) * deallocated to reference memory that disappeared */ ptl->repH.head = &ptl->amsh_empty_shortpkt; ptl->reqH.head = &ptl->amsh_empty_shortpkt; + + if (ptl->am_ep) + psmi_free(ptl->am_ep); + #ifdef PSM_CUDA if (PSMI_IS_CUDA_ENABLED) am_cuda_memhandle_cache_map_fini(); diff --git a/ptl_am/psm_am_internal.h b/ptl_am/psm_am_internal.h index 8e07a57..c4c08a5 100644 --- a/ptl_am/psm_am_internal.h +++ b/ptl_am/psm_am_internal.h @@ -72,9 +72,15 @@ struct am_epaddr { uint16_t shmidx; uint16_t return_shmidx; - uint32_t cstate_outgoing:4; - uint32_t cstate_incoming:4; + uint32_t cstate_outgoing:3; + uint32_t cstate_incoming:3; uint32_t pid:22; + /* + * Device number of GPU used by given EP, only used when CUDA is + * enabled. There is no gain from #ifdefing it out, since it does not + * use any extra space. 
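+	 * With cstate_outgoing:3, cstate_incoming:3, pid:22 and gpuid:4,
+	 * the bitfields total exactly 32 bits (3 + 3 + 22 + 4 = 32), so
+	 * the structure still occupies a single 32-bit storage unit.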
+	 */
+	uint32_t gpuid:4;
 } am_epaddr_t;
 /* Up to NSHORT_ARGS are supported via am_pkt_short_t; the remaining
diff --git a/ptl_am/ptl_fwd.h b/ptl_am/ptl_fwd.h
index e1bd064..1d0fec4 100644
--- a/ptl_am/ptl_fwd.h
+++ b/ptl_am/ptl_fwd.h
@@ -57,7 +57,7 @@
 #define _PTL_FWD_AMSH_H
 /* Symbol in am ptl */
-struct ptl_ctl_init psmi_ptl_amsh;
+extern struct ptl_ctl_init psmi_ptl_amsh;
 extern int psmi_shm_mq_rv_thresh;
diff --git a/ptl_ips/ips_config.h b/ptl_ips/ips_config.h
index c323194..329a69c 100644
--- a/ptl_ips/ips_config.h
+++ b/ptl_ips/ips_config.h
@@ -78,6 +78,8 @@
 */
 #define IPS_PROTOEXP_MIN_MTU 2048
+#ifdef PSM_FI
+
 /* Fault injection, becomes parameters to psmi_faultinj_getspec so
 * a comma-delimited list of
 *   "spec_name", num, denom
@@ -95,6 +97,7 @@
 #define IPS_FAULTINJ_PIOBUSY	10	/* 1 every 10 pio sends get busy */
 #define IPS_FAULTINJ_RECVLOST	200	/* 1 every 200 pkts dropped at recv */
+#endif /* #ifdef PSM_FI */
 /* TID */
diff --git a/ptl_ips/ips_epstate.h b/ptl_ips/ips_epstate.h
index 7308040..d7b7cb3 100644
--- a/ptl_ips/ips_epstate.h
+++ b/ptl_ips/ips_epstate.h
@@ -91,7 +91,7 @@ struct ips_epstate_entry *
 ips_epstate_lookup(const struct ips_epstate *eps, ips_epstate_idx idx))
 {
 	idx = (idx + eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX-1);
-	if (idx < eps->eps_tabsize)
+	if (idx < (ips_epstate_idx)eps->eps_tabsize)
 		return &eps->eps_tab[idx];
 	else
 		return NULL;
diff --git a/ptl_ips/ips_path_rec.c b/ptl_ips/ips_path_rec.c
index 5b37347..d08b6f9 100644
--- a/ptl_ips/ips_path_rec.c
+++ b/ptl_ips/ips_path_rec.c
@@ -565,13 +565,13 @@ static psm2_error_t ips_none_path_rec_init(struct ips_proto *proto)
 	return err;
 }
-/* (Re)load the SL2VL table */
-psm2_error_t ips_ibta_init_sl2sc2vl_table(struct ips_proto *proto)
+/* (Re)load the SL2SC table */
+psm2_error_t ips_ibta_init_sl2sc_table(struct ips_proto *proto)
 {
 	int ret, i;
 	/* Get SL2SC table for unit, port */
-	for (i = 0; i < 32; i++) {
+	for (i = 0; i < PSMI_N_SCS; i++) {
 		if ((ret =
		     psmi_hal_get_port_sl2sc(psmi_hal_get_unit_id(proto->ep->context.psm_hw_ctxt),
					     psmi_hal_get_port_num(proto->ep->context.psm_hw_ctxt),
@@ -582,19 +582,7 @@ psm2_error_t ips_ibta_init_sl2sc2vl_table(struct ips_proto *proto)
 		proto->sl2sc[i] = (uint16_t) ret;
 	}
-	/* Get SC2VL table for unit, port */
-	for (i = 0; i < 32; i++) {
-		if ((ret =
-		     psmi_hal_get_port_sc2vl(psmi_hal_get_unit_id(proto->ep->context.psm_hw_ctxt),
-					     psmi_hal_get_port_num(proto->ep->context.psm_hw_ctxt),
-					     (uint8_t) i)) < 0) {
-			/* Unable to get SC2VL. Set it to default */
-			ret = PSMI_VL_DEFAULT;
-		}
-
-		proto->sc2vl[i] = (uint16_t) ret;
-	}
-
+	psmi_hal_get_sc2vl_map(proto);
 	return PSM2_OK;
 }
@@ -633,7 +621,7 @@ psm2_error_t ips_ibta_link_updown_event(struct ips_proto *proto)
 		proto->epinfo.ep_link_rate = ips_rate_to_enum(ret);
 	/* Load the SL2SC2VL table */
-	ips_ibta_init_sl2sc2vl_table(proto);
+	ips_ibta_init_sl2sc_table(proto);
 	/* Regenerate new IPD table for the updated link rate. */
 	ips_gen_ipd_table(proto);
@@ -691,7 +679,9 @@ MOCKABLE(ips_ibta_init)(struct ips_proto *proto)
 	char ccabuf[256];
 	uint8_t *p;
-	proto->flags |= IPS_PROTO_FLAG_CCA;
+	/* Start out by turning on both styles of congestion control.
+	 * Later, we will clear whichever style is not selected. */
+	proto->flags |= IPS_PROTO_FLAG_CCA | IPS_PROTO_FLAG_CC_REPL_BECN;
 	/*
 	 * If user set any environment variable, use self CCA.
*/ @@ -758,15 +748,18 @@ MOCKABLE(ips_ibta_init)(struct ips_proto *proto) for (i = 0; i < proto->ccti_limit; i++) _HFI_CCADBG("cct[%d] = 0x%04x\n", i, (int) proto->cct[i]); - + /* Note, here, we are leaving CC style(s): + (IPS_PROTO_FLAG_CCA | IPS_PROTO_FLAG_CCA_PRESCAN) */ + proto->flags &= ~IPS_PROTO_FLAG_CC_REPL_BECN; goto finishcca; /* * Disable CCA. */ disablecca: - proto->flags &= ~IPS_PROTO_FLAG_CCA; - proto->flags &= ~IPS_PROTO_FLAG_CCA_PRESCAN; + /* Note, here, we are leaving CC style: + IPS_PROTO_FLAG_CC_REPL_BECN */ + proto->flags &= ~(IPS_PROTO_FLAG_CCA | IPS_PROTO_FLAG_CCA_PRESCAN); } finishcca: diff --git a/ptl_ips/ips_proto.c b/ptl_ips/ips_proto.c index 35dcce7..dfd03e6 100644 --- a/ptl_ips/ips_proto.c +++ b/ptl_ips/ips_proto.c @@ -660,11 +660,11 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, gpudirect_recv_threshold = env_gpudirect_recv_thresh.e_uint; if (env_gpudirect_rdma.e_uint && device_support_gpudirect) { - if (!PSMI_IS_CUDA_ENABLED || + if (PSMI_IS_CUDA_DISABLED || /* All pio, No SDMA*/ (proto->flags & IPS_PROTO_FLAG_SPIO) || !(protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) || - !PSMI_IS_DRIVER_GPUDIRECT_ENABLED) + PSMI_IS_DRIVER_GPUDIRECT_DISABLED) err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Requires hfi1 driver with GPU-Direct feature enabled.\n"); @@ -685,7 +685,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, &env_gpudirect_rdma_send); if (env_gpudirect_rdma_send.e_uint && device_support_gpudirect) { - if (!PSMI_IS_CUDA_ENABLED || + if (PSMI_IS_CUDA_DISABLED || /* All pio, No SDMA*/ (proto->flags & IPS_PROTO_FLAG_SPIO)) err = psmi_handle_error(PSMI_EP_NORETURN, @@ -705,7 +705,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, &env_gpudirect_rdma_recv); if (env_gpudirect_rdma_recv.e_uint && device_support_gpudirect) { - if (!PSMI_IS_CUDA_ENABLED || + if (PSMI_IS_CUDA_DISABLED || !(protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED)) err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, @@ -786,6 +786,9 @@ ips_proto_fini(struct ips_proto *proto, int force, uint64_t timeout_in) int i; union psmi_envvar_val grace_intval; + /* Poll one more time to attempt to synchronize with the peer ep's. */ + ips_ptl_poll(proto->ptl, 0); + psmi_getenv("PSM2_CLOSE_GRACE_PERIOD", "Additional grace period in seconds for closing end-point.", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, @@ -900,12 +903,12 @@ ips_proto_fini(struct ips_proto *proto, int force, uint64_t timeout_in) uint64_t t_grace_interval_start = get_cycles(); int num_disconnect_requests = proto->num_disconnect_requests; PSMI_BLOCKUNTIL( - proto->ep, err, - proto->num_connected_incoming == 0 || + proto->ep, err, + proto->num_connected_incoming == 0 || (!psmi_cycles_left(t_start, timeout_in) && - (!psmi_cycles_left(t_grace_interval_start, - t_grace_interval) || - !psmi_cycles_left(t_grace_start, t_grace_time)))); + (!psmi_cycles_left(t_grace_interval_start, + t_grace_interval) || + !psmi_cycles_left(t_grace_start, t_grace_time)))); if (num_disconnect_requests == proto->num_disconnect_requests) { /* nothing happened in this grace interval so break out early */ break; @@ -1649,6 +1652,8 @@ fail: return err; } +#ifdef PSM_FI + /* * Fault injection in dma sends. 
Since DMA through writev() is all-or-nothing, * we don't inject faults on a packet-per-packet basis since the code gets @@ -1671,6 +1676,8 @@ PSMI_ALWAYS_INLINE(int dma_do_fault()) return 0; } +#endif /* #ifdef PSM_FI */ + /* * Driver defines the following sdma completion error code, returned * as negative value: @@ -1812,10 +1819,11 @@ ips_dma_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, uint16_t iovcnt; struct iovec iovec[2]; +#ifdef PSM_FI /* See comments above for fault injection */ if_pf(dma_do_fault()) return PSM2_OK; - +#endif /* #ifdef PSM_FI */ /* * Check if there is a sdma queue slot. */ @@ -1873,14 +1881,14 @@ ips_dma_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, sdmahdr->ctrl = 2 | (PSM_HAL_EGR << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); - } else { + } else #endif + { sdmahdr->ctrl = 1 | (PSM_HAL_EGR << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); -#ifdef PSM_CUDA } -#endif + /* * Write into driver to do SDMA work. */ @@ -1991,8 +1999,10 @@ scb_dma_send(struct ips_proto *proto, struct ips_flow *flow, int16_t credits; ssize_t ret; +#ifdef PSM_FI /* See comments above for fault injection */ if_pf(dma_do_fault()) goto fail; +#endif /* #ifdef PSM_FI */ /* Check how many SCBs to send based on flow credits */ credits = flow->credits; @@ -2144,14 +2154,14 @@ scb_dma_send(struct ips_proto *proto, struct ips_flow *flow, sdmahdr->ctrl = 2 | (PSM_HAL_EXP << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); - } else { + } else #endif + { + sdmahdr->ctrl = 1 | (PSM_HAL_EXP << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); -#ifdef PSM_CUDA } -#endif _HFI_VDBG("tid-info=%p,%d\n", iovec[vec_idx - 1].iov_base, (int)iovec[vec_idx - 1].iov_len); @@ -2162,14 +2172,13 @@ scb_dma_send(struct ips_proto *proto, struct ips_flow *flow, sdmahdr->ctrl = 2 | (PSM_HAL_EGR << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); - } else { + } else #endif + { sdmahdr->ctrl = 1 | (PSM_HAL_EGR << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); -#ifdef PSM_CUDA } -#endif } /* Can bound the number to send by 'num' */ diff --git a/ptl_ips/ips_proto.h b/ptl_ips/ips_proto.h index c6030f4..dc8e7d4 100644 --- a/ptl_ips/ips_proto.h +++ b/ptl_ips/ips_proto.h @@ -374,8 +374,6 @@ struct ips_proto { /* SL2SC and SC2VL table for protocol */ uint16_t sl2sc[32]; - uint16_t sc2vl[32]; - /* CCA per port */ uint16_t *cct; /* cct table */ uint16_t ccti_size; /* ccti table size */ @@ -690,8 +688,8 @@ typedef int (*ips_packet_service_fn_t)(struct ips_recvhdrq_event *rcv_ev); extern ips_packet_service_fn_t ips_packet_service_routine[OPCODE_FUTURE_FROM-OPCODE_RESERVED]; -/* IBTA feature related functions (path record, sl2sc2vl etc.) */ -psm2_error_t ips_ibta_init_sl2sc2vl_table(struct ips_proto *proto); +/* IBTA feature related functions (path record, sl2sc etc.) 
diff --git a/ptl_ips/ips_proto.h b/ptl_ips/ips_proto.h
index c6030f4..dc8e7d4 100644
--- a/ptl_ips/ips_proto.h
+++ b/ptl_ips/ips_proto.h
@@ -374,8 +374,6 @@ struct ips_proto {
     /* SL2SC and SC2VL table for protocol */
     uint16_t sl2sc[32];
-    uint16_t sc2vl[32];
-
     /* CCA per port */
     uint16_t *cct;      /* cct table */
     uint16_t ccti_size; /* ccti table size */
@@ -690,8 +688,8 @@ typedef int (*ips_packet_service_fn_t)(struct ips_recvhdrq_event *rcv_ev);
 extern ips_packet_service_fn_t
     ips_packet_service_routine[OPCODE_FUTURE_FROM-OPCODE_RESERVED];
 
-/* IBTA feature related functions (path record, sl2sc2vl etc.) */
-psm2_error_t ips_ibta_init_sl2sc2vl_table(struct ips_proto *proto);
+/* IBTA feature related functions (path record, sl2sc etc.) */
+psm2_error_t ips_ibta_init_sl2sc_table(struct ips_proto *proto);
 
 psm2_error_t ips_ibta_link_updown_event(struct ips_proto *proto);
 
 psm2_error_t
@@ -706,15 +704,15 @@ psmi_get_sdma_req_info(struct ips_scb *scb, size_t *extra))
 {
     *extra = 0;
 #ifdef PSM_CUDA
-    if (!PSMI_IS_DRIVER_GPUDIRECT_ENABLED)
-        return (void *)(((char *)&scb->pbc) -
+    if (PSMI_IS_DRIVER_GPUDIRECT_DISABLED)
+        return (struct psm_hal_sdma_req_info *)(((char *)&scb->pbc) -
                 (sizeof(struct psm_hal_sdma_req_info) -
                  PSM_HAL_CUDA_SDMA_REQ_INFO_EXTRA));
 
     *extra = PSM_HAL_CUDA_SDMA_REQ_INFO_EXTRA;
 #endif
-    return (void *)(((char *)&scb->pbc) - (sizeof(struct psm_hal_sdma_req_info)));
+    return (struct psm_hal_sdma_req_info *)(((char *)&scb->pbc) - (sizeof(struct psm_hal_sdma_req_info)));
 }
 
 #ifdef PSM_CUDA
@@ -730,4 +728,19 @@ uint32_t ips_cuda_next_window(uint32_t max_window, uint32_t offset,
 }
 #endif
 
+/* Determine if FECN bit is set IBTA 1.2.1 CCA Annex A*/
+
+static __inline__ uint8_t
+_is_cca_fecn_set(const struct ips_message_header *p_hdr)
+{
+    return (__be32_to_cpu(p_hdr->bth[1]) >> HFI_BTH_FECN_SHIFT) & 0x1;
+}
+
+/* Determine if BECN bit is set IBTA 1.2.1 CCA Annex A*/
+static __inline__ uint8_t
+_is_cca_becn_set(const struct ips_message_header *p_hdr)
+{
+    return (__be32_to_cpu(p_hdr->bth[1]) >> HFI_BTH_BECN_SHIFT) & 0x1;
+}
+
 #endif /* _IPS_PROTO_H */
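The _is_cca_fecn_set()/_is_cca_becn_set() helpers (moved into this header so code outside ips_recvhdrq.c can reuse them) each byte-swap the second BTH dword from big-endian wire order and test a single bit. A standalone sketch of the same test; the shift value below is a made-up stand-in, not the real HFI_BTH_FECN_SHIFT:

    #include <stdint.h>
    #include <arpa/inet.h>          /* ntohl() stands in for __be32_to_cpu() */

    #define EXAMPLE_FECN_SHIFT 31   /* illustrative bit position only */

    static inline uint8_t example_fecn_set(uint32_t bth1_wire)
    {
        /* convert from wire (big-endian) order, then isolate one bit */
        return (uint8_t)((ntohl(bth1_wire) >> EXAMPLE_FECN_SHIFT) & 0x1);
    }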
diff --git a/ptl_ips/ips_proto_expected.c b/ptl_ips/ips_proto_expected.c
index 1f507ed..7e7e997 100644
--- a/ptl_ips/ips_proto_expected.c
+++ b/ptl_ips/ips_proto_expected.c
@@ -1919,20 +1919,15 @@ ips_tid_recv_alloc_frag(struct ips_protoexp *protoexp,
             ((bufptr + size - 1) & page_mask) - (bufptr & page_mask));
         tidoff = pageoff = (uint32_t) (bufptr & page_offset_mask);
-    } else {
+    } else
+#endif
+    {
         pageaddr = bufptr & protoexp->tid_page_mask;
         pagelen = (uint32_t) (PSMI_PAGESIZE +
             ((bufptr + size - 1) & protoexp->tid_page_mask) -
             (bufptr & protoexp->tid_page_mask));
         tidoff = pageoff = (uint32_t) (bufptr & protoexp->tid_page_offset_mask);
     }
-#else
-    pageaddr = bufptr & protoexp->tid_page_mask;
-    pagelen = (uint32_t) (PSMI_PAGESIZE +
-        ((bufptr + size - 1) & protoexp->tid_page_mask) -
-        (bufptr & protoexp->tid_page_mask));
-    tidoff = pageoff = (uint32_t) (bufptr & protoexp->tid_page_offset_mask);
-#endif
 
     reglen = pagelen;
     if (protoexp->tidc.tid_array) {
@@ -2298,8 +2293,9 @@ ipsaddr_next:
                     getreq->tidgr_offset + nbytes_this;
                 nbytes_this -= pageoff & (PSMI_GPU_PAGESIZE - 1);
             }
-        } else {
+        } else
 #endif
+        {
             if ((getreq->tidgr_offset + nbytes_this) <
                     getreq->tidgr_length &&
                     nbytes_this > PSMI_PAGESIZE) {
@@ -2309,9 +2305,7 @@ ipsaddr_next:
                     getreq->tidgr_offset + nbytes_this;
                 nbytes_this -= pageoff & (PSMI_PAGESIZE - 1);
             }
-#ifdef PSM_CUDA
         }
-#endif
 
         psmi_assert(nbytes_this >= 4);
         psmi_assert(nbytes_this <= PSM_TID_WINSIZE);
diff --git a/ptl_ips/ips_proto_help.h b/ptl_ips/ips_proto_help.h
index 42567f5..ba32b84 100644
--- a/ptl_ips/ips_proto_help.h
+++ b/ptl_ips/ips_proto_help.h
@@ -522,6 +522,7 @@ ips_proto_process_packet(const struct ips_recvhdrq_event *rcv_ev))
 {
     uint32_t index;
 
+#ifdef PSM_FI
     /* NOTE: Fault injection will currently not work with hardware
      * suppression. See note below for reason why as we currently
      * do not update the hardware tidflow table if FI is dropping
@@ -545,7 +546,7 @@ ips_proto_process_packet(const struct ips_recvhdrq_event *rcv_ev))
         if (psmi_faultinj_is_fault(fi_recv))
             return IPS_RECVHDRQ_CONTINUE;
     }
-
+#endif /* #ifdef PSM_FI */
     /* see file ips_proto_header.h for details */
     index = _get_proto_hfi_opcode(rcv_ev->p_hdr) - OPCODE_RESERVED;
     if (index >= (OPCODE_FUTURE_FROM - OPCODE_RESERVED))
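The ips_tid_recv_alloc_frag() hunk above deletes a duplicated copy of the host-page computation by letting the CUDA `else` share its block with the non-CUDA build: under PSM_CUDA the block is the else branch, otherwise it is the only code compiled. A minimal sketch of that brace-sharing pattern, with a hypothetical USE_GPU macro standing in for PSM_CUDA:

    #include <stdio.h>

    static void pick_path(int gpu_capable)
    {
    #ifdef USE_GPU
        if (gpu_capable) {
            puts("gpu path");
        } else
    #endif
        {   /* written once, compiled in both build variants */
            (void)gpu_capable;
            puts("host path");
        }
    }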
diff --git a/ptl_ips/ips_proto_mq.c b/ptl_ips/ips_proto_mq.c
index 32471fd..8a047c6 100644
--- a/ptl_ips/ips_proto_mq.c
+++ b/ptl_ips/ips_proto_mq.c
@@ -565,22 +565,6 @@ int psmi_cuda_is_buffer_gpu_mem(void *ubuf)
     return (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(ubuf));
 }
 
-/*
- * CUDA documentation dictates the use of SYNC_MEMOPS attribute
- * when the buffer pointer received into PSM has been allocated
- * by the application. This guarantees that all memory operations
- * to this region of memory (used by multiple layers of the stack)
- * always synchronize.
- */
-static inline
-void psmi_cuda_set_attr_sync_memops(void *ubuf)
-{
-    int trueflag = 1;
-
-    PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag,
-           CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr) ubuf);
-}
-
 static inline
 int psmi_cuda_is_needed_rendezvous(struct ips_proto *proto, uint32_t len)
 {
@@ -691,6 +675,8 @@ ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user,
     if_pf(req == NULL)
         return PSM2_NO_MEMORY;
 
+    _HFI_VDBG("(req=%p) ubuf=%p len=%u\n", req, ubuf, len);
+
     req->flags_user = flags_user;
     req->flags_internal = flags_internal;
     ipsaddr = ((ips_epaddr_t *) mepaddr)->msgctl->ipsaddr_next;
@@ -704,8 +690,9 @@ ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user,
 
 #ifdef PSM_CUDA
     req->is_buf_gpu_mem = psmi_cuda_is_buffer_gpu_mem((void*)ubuf);
+    req->cuda_hostbuf_used = 0;
     if (req->is_buf_gpu_mem) {
-        psmi_cuda_set_attr_sync_memops((void*)ubuf);
+        psmi_cuda_set_attr_sync_memops(ubuf);
         if (psmi_cuda_is_needed_rendezvous(proto, len))
             goto do_rendezvous;
     }
@@ -882,6 +869,8 @@ ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
     ips_scb_t *scb;
     int gpu_mem = 0;
 
+    _HFI_VDBG("ubuf=%p len=%u\n", ubuf, len);
+
     ipsaddr = ((ips_epaddr_t *) mepaddr)->msgctl->ipsaddr_next;
     ipsaddr->msgctl->ipsaddr_next = ipsaddr->next;
     proto = ((psm2_epaddr_t) ipsaddr)->proto;
@@ -891,7 +880,7 @@ ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
 #ifdef PSM_CUDA
     gpu_mem = psmi_cuda_is_buffer_gpu_mem((void*)ubuf);
     if (gpu_mem) {
-        psmi_cuda_set_attr_sync_memops((void*)ubuf);
+        psmi_cuda_set_attr_sync_memops(ubuf);
         if (psmi_cuda_is_needed_rendezvous(proto, len))
             goto do_rendezvous;
     }
@@ -1031,6 +1020,7 @@ ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
         return err;
 
 #ifdef PSM_CUDA
+    req->cuda_hostbuf_used = 0;
     if (gpu_mem) {
         req->is_buf_gpu_mem = 1;
     } else
@@ -1069,12 +1059,6 @@ do_rendezvous:
     req->flags_internal |= PSMI_REQ_FLAG_IS_INTERNAL;
 
 #ifdef PSM_CUDA
-    /* CUDA documentation dictates the use of SYNC_MEMOPS attribute
-     * when the buffer pointer received into PSM has been allocated
-     * by the application. This guarantees the all memory operations
-     * to this region of memory (used by multiple layers of the stack)
-     * always synchronize
-     */
     if (gpu_mem) {
         req->is_buf_gpu_mem = 1;
     } else
diff --git a/ptl_ips/ips_proto_params.h b/ptl_ips/ips_proto_params.h
index ae2b894..47bf125 100644
--- a/ptl_ips/ips_proto_params.h
+++ b/ptl_ips/ips_proto_params.h
@@ -214,8 +214,10 @@
 #define IPS_PROTO_FLAG_PPOLICY_STATIC   0x1c00
 
 /* IBTA CCA Protocol support */
-#define IPS_PROTO_FLAG_CCA          0x2000
+#define IPS_PROTO_FLAG_CCA          0x2000  /* Enables full-fledged CCA */
 #define IPS_PROTO_FLAG_CCA_PRESCAN  0x4000  /* Enable RAPID CCA prescanning */
+#define IPS_PROTO_FLAG_CC_REPL_BECN 0x8000  /* A simple congestion control scheme */
+                                            /* that simply replies a BECN on rx FECN. */
 
 #ifdef PSM_CUDA
 /* Use RNDV (TID) for all message sizes */
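The repeated open-coded cuPointerSetAttribute() calls and their identical comment blocks are gone; callers now go through the shared psmi_cuda_set_attr_sync_memops() helper. For reference, a standalone sketch of what that helper boils down to, calling the CUDA driver API directly (the library wraps this in PSMI_CUDA_CALL for error handling, omitted here):

    #include <cuda.h>

    static CUresult set_sync_memops(void *ubuf)
    {
        int trueflag = 1;

        /* Make all memory operations on this user allocation synchronize,
         * since several layers of the stack touch the same region. */
        return cuPointerSetAttribute(&trueflag,
                                     CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
                                     (CUdeviceptr) ubuf);
    }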
diff --git a/ptl_ips/ips_recvhdrq.c b/ptl_ips/ips_recvhdrq.c
index 16908ba..6c5fd07 100644
--- a/ptl_ips/ips_recvhdrq.c
+++ b/ptl_ips/ips_recvhdrq.c
@@ -149,21 +149,6 @@ _get_proto_subcontext(const struct ips_message_header *p_hdr)
         HFI_BTH_SUBCTXT_SHIFT) & HFI_BTH_SUBCTXT_MASK);
 }
 
-/* Determine if FECN bit is set IBTA 1.2.1 CCA Annex A*/
-
-static __inline__ uint8_t
-_is_cca_fecn_set(const struct ips_message_header *p_hdr)
-{
-    return (__be32_to_cpu(p_hdr->bth[1]) >> HFI_BTH_FECN_SHIFT) & 0x1;
-}
-
-/* Detrmine if BECN bit is set IBTA 1.2.1 CCA Annex A*/
-static __inline__ uint8_t
-_is_cca_becn_set(const struct ips_message_header *p_hdr)
-{
-    return (__be32_to_cpu(p_hdr->bth[1]) >> HFI_BTH_BECN_SHIFT) & 0x1;
-}
-
 static __inline__ void _dump_invalid_pkt(struct ips_recvhdrq_event *rcv_ev)
 {
     char *payload = ips_recvhdrq_event_payload(rcv_ev);
@@ -426,46 +411,6 @@ psm2_error_t ips_recvhdrq_progress(struct ips_recvhdrq *recvq)
             ("new packet: rcv_hdr %p, rhf %" PRIx64 "\n",
              rcv_ev.p_hdr, rcv_ev.psm_hal_rhf.raw_rhf);
 
-        /* If the hdrq_head is before cachedlastscan, that means that we have
-         * already prescanned this for BECNs and FECNs, so we should not check
-         * again
-         */
-        if_pt((recvq->proto->flags & IPS_PROTO_FLAG_CCA) &&
-              (state->hdrq_head >= state->hdrq_cachedlastscan)) {
-            /* IBTA CCA handling:
-             * If FECN bit set handle IBTA CCA protocol. For the
-             * flow that suffered congestion we flag it to generate
-             * a control packet with the BECN bit set - This is
-             * currently an unsolicited ACK.
-             *
-             * For all MQ packets the FECN processing/BECN
-             * generation is done in the is_expected_or_nak
-             * function as each eager packet is inspected there.
-             *
-             * For TIDFLOW/Expected data transfers the FECN
-             * bit/BECN generation is done in protoexp_data. Since
-             * header suppression can result in even FECN packets
-             * being suppressed the expected protocol generated
-             * additional BECN packets if a "large" number of
-             * generations are swapped without progress being made
-             * for receive. "Large" is set empirically to 4.
-             *
-             * FECN packets are ignored for all control messages
-             * (except ACKs and NAKs) since they indicate
-             * congestion on the control path which is not rate
-             * controlled. The CCA specification allows FECN on
-             * ACKs to be disregarded as well.
-             */
-            rcv_ev.is_congested =
-                _is_cca_fecn_set(rcv_ev.
-                         p_hdr) & IPS_RECV_EVENT_FECN;
-            rcv_ev.is_congested |=
-                (_is_cca_becn_set(rcv_ev.p_hdr) <<
-                 (IPS_RECV_EVENT_BECN - 1));
-        } else
-            rcv_ev.is_congested = 0;
-
 #ifdef PSM_DEBUG
         if_pf(_check_headers(&rcv_ev, psm_hal_hdr_q))
             goto skip_packet;
diff --git a/ptl_ips/ips_recvhdrq.h b/ptl_ips/ips_recvhdrq.h
index daef846..7352ff6 100644
--- a/ptl_ips/ips_recvhdrq.h
+++ b/ptl_ips/ips_recvhdrq.h
@@ -169,12 +169,13 @@ void *
 ips_recvhdrq_event_payload(const struct ips_recvhdrq_event *rcv_ev))
 {
     if (psmi_hal_rhf_get_use_egr_buff(rcv_ev->psm_hal_rhf))
-        return psmi_hal_get_egr_buff(
+        return (char*)(psmi_hal_get_egr_buff(
             psmi_hal_rhf_get_egr_buff_index(rcv_ev->psm_hal_rhf),
-            rcv_ev->psm_hal_hdr_q + 1 /* The circular list q (cl_q) for the
-                                         egr buff for any rx hdrq event is
-                                         always one more than the hdrq cl q */,
-            rcv_ev->recvq->context->psm_hw_ctxt)+
+            (psmi_hal_cl_q)(rcv_ev->psm_hal_hdr_q + 1) /* The circular list q
+                                         (cl_q) for the egr buff for any rx
+                                         hdrq event is always one more than
+                                         the hdrq cl q */,
+            rcv_ev->recvq->context->psm_hw_ctxt))+
             (psmi_hal_rhf_get_egr_buff_offset(rcv_ev->psm_hal_rhf)*64);
     else
         return NULL;
diff --git a/ptl_ips/ips_scb.c b/ptl_ips/ips_scb.c
index 52b9a93..83517ac 100644
--- a/ptl_ips/ips_scb.c
+++ b/ptl_ips/ips_scb.c
@@ -201,6 +201,12 @@ psm2_error_t ips_scbctrl_fini(struct ips_scbctrl *scbc)
     if (scbc->sbuf_buf_alloc) {
         psmi_free(scbc->sbuf_buf_alloc);
     }
+    if (scbc->timers != NULL) {
+        psmi_free(scbc->timers);
+    }
+    if (scbc->scb_imm_buf) {
+        psmi_free(scbc->scb_imm_buf);
+    }
     return PSM2_OK;
 }
diff --git a/ptl_ips/ptl.c b/ptl_ips/ptl.c
index 39b5631..8b5afc1 100644
--- a/ptl_ips/ptl.c
+++ b/ptl_ips/ptl.c
@@ -574,12 +574,12 @@ PSMI_INLINE(int ips_try_lock_shared_context(struct ptl_shared *recvshc))
 {
     return pthread_spin_trylock(recvshc->context_lock);
 }
-
+/* Unused
 PSMI_INLINE(void ips_lock_shared_context(struct ptl_shared *recvshc))
 {
     pthread_spin_lock(recvshc->context_lock);
 }
-
+*/
 PSMI_INLINE(void ips_unlock_shared_context(struct ptl_shared *recvshc))
 {
     pthread_spin_unlock(recvshc->context_lock);
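One detail in the ips_recvhdrq_event_payload() hunk above is worth noting: the eager-buffer offset is a byte offset (in 64-byte units), and the new (char*) cast makes the final addition well-defined byte arithmetic instead of relying on the GNU void*-arithmetic extension. A minimal sketch of the idea, with hypothetical names:

    #include <stddef.h>
    #include <stdint.h>

    static void *payload_at(void *egr_buff, uint32_t offset_64B_units)
    {
        /* char* arithmetic advances in bytes; each offset unit is 64 bytes */
        return (char *)egr_buff + (size_t)offset_64B_units * 64;
    }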
diff --git a/ptl_ips/ptl_fwd.h b/ptl_ips/ptl_fwd.h
index 3702fba..b774260 100644
--- a/ptl_ips/ptl_fwd.h
+++ b/ptl_ips/ptl_fwd.h
@@ -61,7 +61,7 @@ typedef struct ips_epaddr ips_epaddr_t;
 typedef struct ips_msgctl ips_msgctl_t;
 
 /* Symbol in ips ptl */
-struct ptl_ctl_init psmi_ptl_ips;
+extern struct ptl_ctl_init psmi_ptl_ips;
 
-struct ptl_ctl_rcvthread psmi_ptl_ips_rcvthread;
+extern struct ptl_ctl_rcvthread psmi_ptl_ips_rcvthread;
 
 #endif /* _PTL_FWD_IPS_H */
diff --git a/ptl_self/ptl.c b/ptl_self/ptl.c
index 49d898d..35f57a3 100644
--- a/ptl_self/ptl.c
+++ b/ptl_self/ptl.c
@@ -143,17 +143,8 @@ self_mq_isend(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags_user,
         return PSM2_NO_MEMORY;
 
 #ifdef PSM_CUDA
-    /* CUDA documentation dictates the use of SYNC_MEMOPS attribute
-     * when the buffer pointer received into PSM has been allocated
-     * by the application. This guarantees the all memory operations
-     * to this region of memory (used by multiple layers of the stack)
-     * always synchronize
-     */
-    if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)ubuf)) {
-        int trueflag = 1;
-        PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag,
-               CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
-               (CUdeviceptr)ubuf);
+    if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(ubuf)) {
+        psmi_cuda_set_attr_sync_memops(ubuf);
         send_req->is_buf_gpu_mem = 1;
     } else
         send_req->is_buf_gpu_mem = 0;
diff --git a/ptl_self/ptl_fwd.h b/ptl_self/ptl_fwd.h
index 77ee7f9..7ee6b73 100644
--- a/ptl_self/ptl_fwd.h
+++ b/ptl_self/ptl_fwd.h
@@ -57,6 +57,6 @@
 #define _PTL_FWD_SELF_H
 
 /* Symbol in am ptl */
-struct ptl_ctl_init psmi_ptl_self;
+extern struct ptl_ctl_init psmi_ptl_self;
 
 #endif
diff --git a/rpm_release_extension b/rpm_release_extension
index 0d6dd55..725a5ba 100644
--- a/rpm_release_extension
+++ b/rpm_release_extension
@@ -1 +1 @@
-91_1
+185
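Finally, the ptl_fwd.h changes above deserve a note: without `extern`, every translation unit that includes the header emits a tentative definition of the object, and the link fails with "multiple definition" errors once the toolchain defaults to -fno-common (as GCC 10 does). The fix leaves exactly one definition in a single .c file, sketched here with a hypothetical symbol:

    /* shared.h: a declaration only, safe to include from many .c files */
    extern int example_symbol;

    /* exactly one .c file provides the definition */
    int example_symbol;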