From 961e70a7c29342f4dac5a4c0f266e176bb305a9b Mon Sep 17 00:00:00 2001
From: Packit
Date: Sep 10 2020 08:30:29 +0000
Subject: libpsm2-11.2.91_1 base

---

diff --git a/40-psm.rules b/40-psm.rules
new file mode 100644
index 0000000..ba8d494
--- /dev/null
+++ b/40-psm.rules
@@ -0,0 +1,52 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+KERNEL=="hfi1", MODE="0666"
+KERNEL=="hfi1_[0-9]", MODE="0666"
diff --git a/COMMIT b/COMMIT
new file mode 100644
index 0000000..d92e801
--- /dev/null
+++ b/COMMIT
@@ -0,0 +1 @@
+853ab1113c4eabf7218dfab673e433588fe7a8c4
\ No newline at end of file
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..ea3d558
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,376 @@
+This software is available to you under a choice of one of two
+licenses. You may choose to be licensed under the terms of the
+BSD license or the GNU General Public License (GPL) Version
+2, both included below.
+
+Copyright(c) 2016 Intel Corporation. All rights reserved.
+
+==================================================================
+ BSD Simplified License
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1.
Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +================================================================== + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. 
+ + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. 
+ + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Library General
+Public License instead of this License.
+
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..8f51f46
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,613 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2017 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2017 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+
+HISTORY = .outdirs
+HISTORIC_TARGETS = $(patsubst %, %_clean, $(shell cat $(HISTORY) 2> /dev/null))
+
+RPM_NAME := libpsm2
+CONFIG_FILE := .config
+TEMP_INST_DIR := $(shell mktemp -d)
+
+ifeq ($(CONFIG_FILE), $(wildcard $(CONFIG_FILE)))
+include $(CONFIG_FILE)
+endif
+
+PSM_HAL_ENABLE ?= *
+
+PSM_HAL_ENABLE_D = $(wildcard $(addprefix psm_hal_,$(PSM_HAL_ENABLE)))
+
+PSM_HAL_INSTANCE_OBJFILES = $(addsuffix /*.o,$(PSM_HAL_ENABLE_D))
+
+SUBDIRS = ptl_self ptl_ips ptl_am libuuid opa ${wildcard $(PSM_HAL_ENABLE_D)}
+top_srcdir := $(shell readlink -m .)
+
+# Default locations
+OUTDIR := $(top_srcdir)/build_release
+MOCK_OUTDIR := $(top_srcdir)/build_mock
+DEBUG_OUTDIR := $(top_srcdir)/build_debug
+
+# We need a temporary test variable, as the OUTDIR macro
+# can be overridden by the shell and thus not run.
+TESTOUTDIR= $(shell readlink -m $(OUTDIR))
+ifeq ($(top_srcdir), $(TESTOUTDIR))
+$(error OUTDIR cannot be the same as your source folder ${top_srcdir})
+endif
+
+ifeq (/,$(TESTOUTDIR))
+$(error OUTDIR cannot be the / folder)
+endif
+
+# Forces any value to be a full path.
+# We don't need to override MOCK_OUTDIR or DEBUG_OUTDIR
+# as they are recursive make invocations and use OUTDIR
+ifneq ($(MAKECMDGOALS), mock)
+ifneq ($(MAKECMDGOALS), debug)
+override OUTDIR := $(shell readlink -m $(OUTDIR))
+endif
+endif
+
+PSM2_VERNO_MAJOR := $(shell sed -n 's/^\#define.*PSM2_VERNO_MAJOR.*0x0\?\([1-9a-f]\?[0-9a-f]\+\).*/\1/p' $(top_srcdir)/psm2.h)
+PSM2_VERNO_MINOR := $(shell sed -n 's/^\#define.*PSM2_VERNO_MINOR.*0x\([0-9]\?[0-9a-f]\+\).*/\1/p' $(top_srcdir)/psm2.h)
+PSM2_LIB_MAJOR := $(shell printf "%d" ${PSM2_VERNO_MAJOR})
+PSM2_LIB_MINOR := $(shell printf "%d" `sed -n 's/^\#define.*PSM2_VERNO_MINOR.*\(0x[0-9a-f]\+\).*/\1/p' $(top_srcdir)/psm2.h`)
+LINKER_SCRIPT_FILE = ${OUTDIR}/psm2_linker_script.map
+SOURCES_CHKSUM_FILES = Makefile buildflags.mak $(LINKER_SCRIPT_FILE) \
+ `find .
-regex '\(.*\.h\|.*\.c\)' -not -path "./test/*" -not -path "./tools/*" -not -path "_revision.c" | sort`
+SOURCES_CHKSUM_VALUE = $(shell cat ${SOURCES_CHKSUM_FILES} | sha1sum | cut -d' ' -f 1)
+OPA_LIB_MAJOR := 4
+OPA_LIB_MINOR := 0
+
+export PSM2_VERNO_MAJOR
+export PSM2_LIB_MAJOR
+export PSM2_VERNO_MINOR
+export PSM2_LIB_MINOR
+export OPA_LIB_MAJOR
+export OPA_LIB_MINOR
+export CCARCH ?= gcc
+export FCARCH ?= gfortran
+export AR ?= ar
+
+include $(top_srcdir)/buildflags.mak
+
+# We need to unexport these environment variables as, during mock testing and
+# normal calls, if they are exported then during each submake they will be
+# evaluated again. This is costly, and the LINKER_SCRIPT_FILE doesn't exist
+# until after its target rule runs.
+unexport SOURCES_CHKSUM_FILES
+unexport SOURCES_CHKSUM_VALUE
+unexport LINKER_SCRIPT_FILE
+INCLUDES += -I$(top_srcdir) -I$(top_srcdir)/ptl_ips -I$(OUTDIR)
+
+ifneq (x86_64,$(arch))
+ ifneq (i386,$(arch))
+ $(error Unsupported architecture $(arch))
+ endif
+endif
+
+ifndef LIBDIR
+ ifeq (${arch},x86_64)
+ INSTALL_LIB_TARG=/usr/lib64
+ else
+ INSTALL_LIB_TARG=/usr/lib
+ endif
+else
+ INSTALL_LIB_TARG=${LIBDIR}
+endif
+export DESTDIR
+export INSTALL_LIB_TARG
+
+TARGLIB := libpsm2
+COMPATMAJOR := $(shell sed -n 's/^\#define.*PSM2_VERNO_COMPAT_MAJOR.*0x0\?\([1-9a-f]\?[0-9a-f]\+\).*/\1/p' \
+ $(top_srcdir)/psm2.h)
+COMPATLIB := libpsm_infinipath
+
+MAJOR := $(PSM2_LIB_MAJOR)
+MINOR := $(PSM2_LIB_MINOR)
+
+nthreads := $(shell echo $$(( `nproc` * 2 )) )
+
+# The following line sets the DISTRO variable to:
+# 'rhel' if the host is running RHEL.
+# 'sles' if the host is running SUSE SLES.
+# 'fedora' if the host is running Fedora.
+# 'ubuntu' if the host is running Ubuntu.
+#
+# The DISTRO variable is used subsequently for distro-specific
+# behaviors.

+DISTRO := $(shell . /etc/os-release; echo $$ID)
+
+# By default the following two variables have the following values:
+LIBPSM2_COMPAT_CONF_DIR := /etc
+LIBPSM2_COMPAT_SYM_CONF_DIR := /etc
+# We can't set SPEC_FILE_RELEASE_DIST to an empty value, as a space will result.
+# It then messes up sed operations for PSM_CUDA=1.
+# So leaving the commented out line here as documentation to NOT set it.
+# SPEC_FILE_RELEASE_DIST :=
+UDEV_40_PSM_RULES := %{_udevrulesdir}/40-psm.rules
+
+ifeq (fedora,$(DISTRO))
+ # On Fedora, we change these two variables to these values:
+ LIBPSM2_COMPAT_CONF_DIR := /usr/lib
+ LIBPSM2_COMPAT_SYM_CONF_DIR := %{_prefix}/lib
+ SPEC_FILE_RELEASE_DIST := %{?dist}
+ UDEV_40_PSM_RULES :=#
+else ifeq (rhel,${DISTRO})
+ # Insert code specific to RHEL here.
+else ifeq (sles,${DISTRO})
+ # Insert code specific to SLES here.
+endif
+
+ifdef PSM_CUDA
+#Value needs to be something without spaces or dashes '-'
+SPEC_FILE_RELEASE_DIST += cuda
+endif
+
+export LIBPSM2_COMPAT_CONF_DIR
+
+# The desired version number comes from the most recent tag starting with "psm-v"
+ifeq (true, $(shell git rev-parse --is-inside-work-tree 2>/dev/null))
+ISGIT := 1 # Cache the result for later
+# Note, we don't define ISGIT if we are not in a git folder
+VERSION := $(shell git describe --tags --abbrev=0 --match='psm-v*' | sed -e 's/^psm-v//' -e 's/-/_/')
+else
+ISGIT := 0
+VERSION := version
+endif
+
+# If we have a file called 'rpm_release_extension' (as on github),
+# we take the release extension number from this file
+RELEASE_EXT := $(shell if [ -e rpm_release_extension ] ;\
+ then cat rpm_release_extension; fi)
+CURRENTSHA := $(shell if [ $(ISGIT) = 1 -a -f rpm_release_extension ] ;\
+ then git log --pretty=format:'%h' -n 1; fi)
+RPMEXTHASH := $(shell if [ $(ISGIT) = 1 -a -f rpm_release_extension ] ;\
+ then git log --pretty=format:'%h' -n 1 rpm_release_extension; fi)
+
+# This logic should kick in only on github
+ifdef RELEASE_EXT
+ifneq ($(CURRENTSHA), $(RPMEXTHASH))
+# On github, the last commit for each release should be the one to bump up
+# the release extension number in 'rpm_release_extension'. Further commits
+# are counted here and appended to the final rpm name to distinguish commits
+# present only on github
+NCOMMITS := $(shell if [ $(ISGIT) = 1 -a -f rpm_release_extension ] ;\
+ then git log --children $(RPMEXTHASH)..$(CURRENTSHA) \
+ --pretty=oneline . | wc -l; fi)
+RELEASE := $(RELEASE_EXT)_$(NCOMMITS)
+endif
+endif
+
+# The desired release number comes from the git describe following the version,
+# which is the number of commits since the version tag was planted, suffixed by the g
+ifndef RELEASE
+RELTAG := "psm-v$(VERSION)"
+RELEASE := $(shell if [ -f rpm_release_extension ]; then cat rpm_release_extension;\
+ elif [ $(ISGIT) = 1 ] ; then git rev-list $(RELTAG)..HEAD -- . | wc -l; \
+ else echo "release" ; fi)
+endif
+
+DIST_SHA := ${shell if [ $(ISGIT) = 1 ] ; then git log -n1 --pretty=format:%H .; \
+ else echo DIST_SHA ; fi}
+
+# Concatenated version and release
+ifndef VERSION_RELEASE_OVERRIDE
+VERSION_RELEASE := $(VERSION).$(RELEASE)
+else
+VERSION_RELEASE := ${VERSION_RELEASE_OVERRIDE}
+endif
+
+LDLIBS := -lrt -ldl -lnuma ${EXTRA_LIBS} -pthread
+
+PKG_CONFIG ?= pkg-config
+
+UDEVDIR := $(shell $(PKG_CONFIG) --variable=udevdir udev 2>/dev/null)
+ifndef UDEVDIR
+ UDEVDIR = /lib/udev
+endif
+
+export UDEVDIR
+
+# The DIST variable is a base name corresponding to:
+# 1. The name of the directory containing the source code distribution
+# (see dist: target below).
+# 2. The basename of the filename of the tar file created in the dist:
+# target.
+DIST := ${RPM_NAME}-${VERSION_RELEASE}
+
+# If the user has an empty RPM_NAME_BASEEXT (defined or not), then attempt to
+# see if we are running on SLES 12.3 or newer.
+# If we are, then change the base package name (but not the supporting
+# packages) to libpsm2-2. Note this requires support both in the Makefile
+# specfile target rule and changes in the libpsm2.spec.in file.
+ifeq ($(RPM_NAME_BASEEXT),)
+# Detect current version of the OS
+OS := $(shell grep -m1 NAME /etc/os-release | cut -f 2 -d\")
+OSVERSION := $(shell grep VERSION_ID /etc/os-release | cut -f 2 -d\" | cut -f 1 -d.)
+OSSUBVERSION := $(shell grep VERSION_ID /etc/os-release | cut -f 2 -d\" | cut -f 2 -d.)
+
+override RPM_NAME_BASEEXT := $(shell \
+ if [ "$(OS)" = "SLES" ]; then \
+ if [ $(OSVERSION) -gt 11 ]; then \
+ if [ $(OSVERSION) -eq 12 ]; then \
+ if [ $(OSSUBVERSION) -gt 2 ]; then \
+ echo "-2"; \
+ fi \
+ else \
+ echo "-2"; \
+ fi \
+ fi \
+ fi)
+endif
+
+HALDECLFILE=$(OUTDIR)/psm2_hal_inlines_d.h
+HALIMPLFILE=$(OUTDIR)/psm2_hal_inlines_i.h
+
+all: symlinks $(HALDECLFILE) $(HALIMPLFILE) | $(OUTDIR)
+ @if [ ! -e $(HISTORY) ] || [ -z "`grep -E '^$(OUTDIR)$$' $(HISTORY)`" ]; then \
+ echo $(OUTDIR) >> $(HISTORY); \
+ fi
+ # Our buildflags.mak exports all variables; all are propagated to submakes.
+ @for subdir in $(SUBDIRS); do \
+ mkdir -p $(OUTDIR)/$$subdir; \
+ $(MAKE) -j $(nthreads) -C $$subdir OUTDIR=$(OUTDIR)/$$subdir; \
+ if [ $$? -ne 0 ]; then exit 1; fi ;\
+ done
+ $(MAKE) -j $(nthreads) $(OUTDIR)/${TARGLIB}.so
+ $(MAKE) -j $(nthreads) $(OUTDIR)/${TARGLIB}.a
+ @mkdir -p $(OUTDIR)/compat
+ $(MAKE) -j $(nthreads) -C compat OUTDIR=$(OUTDIR)/compat
+
+$(HALDECLFILE): | $(OUTDIR)
+ @test -f $(HALDECLFILE) || ( \
+ n_hal_insts=$(words $(wildcard $(PSM_HAL_ENABLE_D)));\
+ echo "#define PSMI_HAL_INST_CNT $$n_hal_insts" > $(HALDECLFILE);\
+ if [ $$n_hal_insts -eq 1 ]; then \
+ echo "#define PSMI_HAL_INLINE inline" >> $(HALDECLFILE);\
+ hal_inst_dir=$(PSM_HAL_ENABLE_D); \
+ echo "#define PSMI_HAL_CAT_INL_SYM(KERNEL) hfp_$(subst psm_hal_,,$(PSM_HAL_ENABLE_D))" \
+ "## _ ## KERNEL" >> $(HALDECLFILE);\
+ echo "#include \"psm2_hal_inline_t.h\"" >> $(HALDECLFILE);\
+ else \
+ echo "#define PSMI_HAL_INLINE /* nothing */" >> $(HALDECLFILE);\
+ fi )
+
+$(HALIMPLFILE): | $(OUTDIR)
+ @test -f $(HALIMPLFILE) || ( \
+ n_hal_insts=$(words $(wildcard $(PSM_HAL_ENABLE_D)));\
+ if [ $$n_hal_insts -eq 1 ]; then\
+ hal_inst=$(PSM_HAL_ENABLE_D);\
+ echo "#include \"$$hal_inst/psm_hal_inline_i.h\"" >> $(HALIMPLFILE);\
+ else\
+ echo "/* no inlining since more than 1 hal instance" >> $(HALIMPLFILE);\
+ echo " is included in the libpsm2 linkage. */" >> $(HALIMPLFILE);\
+ fi )
+
+%_clean:
+ make OUTDIR=$* clean
+
+clean: cleanlinks
+ rm -rf ${OUTDIR}
+ @if [ -e $(HISTORY) ]; then \
+ grep -v -E "^$(OUTDIR)$$" $(HISTORY) > $(HISTORY)_tmp; \
+ mv $(HISTORY)_tmp $(HISTORY); \
+ if [ "`wc -c $(HISTORY) | cut -d ' ' -f 1`" -eq 0 ]; then \
+ rm -f $(HISTORY); \
+ fi; \
+ fi
+ rm -fr $(TEMP_INST_DIR)
+
+# Easily add more items to the config target if more options need
+# to be cached.
+config: $(CONFIG_FILE)
+
+$(CONFIG_FILE):
+ @echo PSM_HAL_ENABLE=$(PSM_HAL_ENABLE) > $(CONFIG_FILE)
+ @echo CCARCH=$(CCARCH) >> $(CONFIG_FILE)
+ @echo HFI_BRAKE_DEBUG=$(HFI_BRAKE_DEBUG) >> $(CONFIG_FILE)
+ @echo PSM_DEBUG=$(PSM_DEBUG) >> $(CONFIG_FILE)
+ @echo PSM_AVX512=$(PSM_AVX512) >> $(CONFIG_FILE)
+ @echo PSM_LOG=$(PSM_LOG) >> $(CONFIG_FILE)
+ @echo PSM_LOG_FAST_IO=$(PSM_LOG_FAST_IO) >> $(CONFIG_FILE)
+ @echo PSM_PERF=$(PSM_PERF) >> $(CONFIG_FILE)
+ @echo PSM_HEAP_DEBUG=$(PSM_HEAP_DEBUG) >> $(CONFIG_FILE)
+ @echo PSM_PROFILE=$(PSM_PROFILE) >> $(CONFIG_FILE)
+ @echo PSM_CUDA=$(PSM_CUDA) >> $(CONFIG_FILE)
+ @echo Wrote $(CONFIG_FILE)
+
+mock: OUTDIR := $(MOCK_OUTDIR)
+mock:
+ $(MAKE) OUTDIR=$(OUTDIR) PSM2_MOCK_TESTING=1
+
+debug: OUTDIR := $(DEBUG_OUTDIR)
+debug:
+ $(MAKE) OUTDIR=$(OUTDIR) PSM_DEBUG=1
+
+test_clean:
+ if [ -d ./test ]; then \
+ $(MAKE) -C test clean; \
+ fi
+
+specfile_clean:
+ rm -f ${OUTDIR}/${RPM_NAME}.spec
+
+distclean: specfile_clean cleanlinks $(HISTORIC_TARGETS) test_clean
+ rm -f $(CONFIG_FILE)
+ rm -rf ${OUTDIR}/${DIST}
+ rm -f ${OUTDIR}/${DIST}.tar.gz
+ rm -fr temp.* *.rej.patch
+
+$(OUTDIR):
+ mkdir -p ${OUTDIR}
+
+symlinks:
+ @test -L $(top_srcdir)/include/linux-x86_64 || \
+ ln -sf linux-i386 $(top_srcdir)/include/linux-x86_64
+
+cleanlinks:
+ rm -rf $(top_srcdir)/include/linux-x86_64
+
+install: all
+ for subdir in $(SUBDIRS) ; do \
+ mkdir -p $(OUTDIR)/$$subdir ; \
+ $(MAKE) -j $(nthreads) -C $$subdir OUTDIR=$(OUTDIR)/$$subdir install ; \
+ done
+ $(MAKE) -j $(nthreads) $(OUTDIR)/${TARGLIB}.so OUTDIR=$(OUTDIR)
+ $(MAKE) -j $(nthreads) -C compat OUTDIR=$(OUTDIR)/compat install
+ install -D $(OUTDIR)/${TARGLIB}.so.${MAJOR}.${MINOR} \
+ ${DESTDIR}${INSTALL_LIB_TARG}/${TARGLIB}.so.${MAJOR}.${MINOR}
+ (cd ${DESTDIR}${INSTALL_LIB_TARG} ; \
+ ln -sf ${TARGLIB}.so.${MAJOR}.${MINOR} ${TARGLIB}.so.${MAJOR} ; \
+ ln -sf ${TARGLIB}.so.${MAJOR} ${TARGLIB}.so)
+ install -D $(OUTDIR)/${TARGLIB}.a \
+ ${DESTDIR}${INSTALL_LIB_TARG}/${TARGLIB}.a
+ install -m 0644 -D psm2.h ${DESTDIR}/usr/include/psm2.h
+ install -m 0644 -D psm2_mq.h ${DESTDIR}/usr/include/psm2_mq.h
+ install -m 0644 -D psm2_am.h ${DESTDIR}/usr/include/psm2_am.h
+ifneq (fedora,${DISTRO})
+ install -m 0644 -D 40-psm.rules ${DESTDIR}$(UDEVDIR)/rules.d/40-psm.rules
+endif
+ # The following files and dirs were part of the noship rpm:
+ mkdir -p ${DESTDIR}/usr/include/hfi1diag
+ mkdir -p ${DESTDIR}/usr/include/hfi1diag/linux-x86_64
+ install -m 0644 -D include/linux-x86_64/bit_ops.h ${DESTDIR}/usr/include/hfi1diag/linux-x86_64/bit_ops.h
+ install -m 0644 -D include/linux-x86_64/sysdep.h ${DESTDIR}/usr/include/hfi1diag/linux-x86_64/sysdep.h
+ install -m 0644 -D include/opa_udebug.h ${DESTDIR}/usr/include/hfi1diag/opa_udebug.h
+ install -m 0644 -D include/opa_debug.h ${DESTDIR}/usr/include/hfi1diag/opa_debug.h
+ install -m 0644 -D include/opa_intf.h ${DESTDIR}/usr/include/hfi1diag/opa_intf.h
+ for h in opa_user_gen1.h opa_service_gen1.h opa_common_gen1.h ; do \
+ sed -e 's/#include "opa_user_gen1.h"/#include "opa_user.h"/' \
+ -e 's/#include "opa_common_gen1.h"/#include "opa_common.h"/' \
+ -e 's/#include "hfi1_deprecated_gen1.h"/#include "hfi1_deprecated.h"/' \
+ -e 's/#include "opa_service_gen1.h"/#include "opa_service.h"/' psm_hal_gen1/$$h \
+ > $(TEMP_INST_DIR)/$$h ; \
+ done
+ cat include/opa_user.h $(TEMP_INST_DIR)/opa_user_gen1.h > $(TEMP_INST_DIR)/opa_user.h
+ cat include/opa_service.h $(TEMP_INST_DIR)/opa_service_gen1.h > $(TEMP_INST_DIR)/opa_service.h
+ install -m 0644 -D $(TEMP_INST_DIR)/opa_user.h
${DESTDIR}/usr/include/hfi1diag/opa_user.h
+ install -m 0644 -D $(TEMP_INST_DIR)/opa_service.h ${DESTDIR}/usr/include/hfi1diag/opa_service.h
+ install -m 0644 -D $(TEMP_INST_DIR)/opa_common_gen1.h ${DESTDIR}/usr/include/hfi1diag/opa_common.h
+ install -m 0644 -D include/opa_byteorder.h ${DESTDIR}/usr/include/hfi1diag/opa_byteorder.h
+ install -m 0644 -D include/psm2_mock_testing.h ${DESTDIR}/usr/include/hfi1diag/psm2_mock_testing.h
+ install -m 0644 -D include/opa_revision.h ${DESTDIR}/usr/include/hfi1diag/opa_revision.h
+ install -m 0644 -D psmi_wrappers.h ${DESTDIR}/usr/include/hfi1diag/psmi_wrappers.h
+ install -m 0644 -D psm_hal_gen1/hfi1_deprecated_gen1.h ${DESTDIR}/usr/include/hfi1diag/hfi1_deprecated.h
+ rm -fr $(TEMP_INST_DIR)
+
+specfile: specfile_clean | $(OUTDIR)
+ sed -e 's/@VERSION@/'${VERSION_RELEASE}'/g' libpsm2.spec.in | \
+ sed -e 's/@TARGLIB@/'${TARGLIB}'/g' \
+ -e 's/@RPM_NAME@/'${RPM_NAME}'/g' \
+ -e 's/@RPM_NAME_BASEEXT@/'${RPM_NAME_BASEEXT}'/g' \
+ -e 's/@COMPATLIB@/'${COMPATLIB}'/g' \
+ -e 's/@COMPATMAJOR@/'${COMPATMAJOR}'/g' \
+ -e 's;@UDEVDIR@;'${UDEVDIR}';g' \
+ -e 's/@MAJOR@/'${MAJOR}'/g' \
+ -e 's/@MINOR@/'${MINOR}'/g' \
+ -e 's:@LIBPSM2_COMPAT_CONF_DIR@:'${LIBPSM2_COMPAT_CONF_DIR}':g' \
+ -e 's:@LIBPSM2_COMPAT_SYM_CONF_DIR@:'${LIBPSM2_COMPAT_SYM_CONF_DIR}':g' \
+ -e 's;@SPEC_FILE_RELEASE_DIST@;'${SPEC_FILE_RELEASE_DIST}';g' \
+ -e 's/@DIST_SHA@/'${DIST_SHA}'/g' > \
+ ${OUTDIR}/${RPM_NAME}.spec
+ if [ -f /etc/redhat-release ] && [ `grep -o "[0-9.]*" /etc/redhat-release | cut -d"." -f1` -lt 7 ]; then \
+ sed -i 's;@40_PSM_RULES@;'${UDEVDIR}'/rules.d/40-psm.rules;g' ${OUTDIR}/${RPM_NAME}.spec; \
+ else \
+ sed -i 's;@40_PSM_RULES@;'${UDEV_40_PSM_RULES}';g' ${OUTDIR}/${RPM_NAME}.spec; \
+ fi
+
+# We can't totally prevent two make dist calls in a row from packaging
+# the previous make dist, unless we switch to using a dedicated ./src folder.
+# That will come in the next major revision of the Makefile; for now we can
+# prevent the easy and default cases.
+#
+# Notes on PRUNE_LIST:
+# To make the dist, we always eliminate the psm_hal_MOCK dir.
+# We also eliminate the PSM HAL instances that are not enabled via the PSM_HAL_ENABLE variable.
+# To implement this, we build the prune list in two passes:
+# 1. The first pass includes all of the common items we want to exclude.
+# 2. In the second pass we include the difference of
+# (all of the PSM HAL instances) minus (the PSM HAL instances that are enabled).
+# The final prune list is supplied to find, and the dist is created.
+dist: distclean
+ mkdir -p ${OUTDIR}/${DIST}
+ PRUNE_LIST=""; \
+ for pd in ".git" "cscope*" "$(shell realpath --relative-to=${top_srcdir} ${OUTDIR})" \
+ "*.orig" "*~" "#*" ".gitignore" "doc" "libcm" "psm.supp" "test" "psm_hal_MOCK" \
+ "tools" "artifacts" "*.rej.patch"; do \
+ PRUNE_LIST="$$PRUNE_LIST -name $$pd -prune -o"; \
+ done; \
+ for hid in psm_hal_* ; do \
+ found=0; \
+ for ehid in $(PSM_HAL_ENABLE_D) ; do \
+ if [ "$$hid" = "$$ehid" ]; then \
+ found=1; \
+ break; \
+ fi; \
+ done; \
+ if [ $$found -eq 0 ]; then \
+ PRUNE_LIST="$$PRUNE_LIST -name $$hid -prune -o"; \
+ fi; \
+ done; \
+ for x in $$(/usr/bin/find . \
+ $$PRUNE_LIST \
+ -print); do \
+ dir=$$(dirname $$x); \
+ mkdir -p ${OUTDIR}/${DIST}/$$dir; \
+ [ ! -d $$x ] && cp $$x ${OUTDIR}/${DIST}/$$dir; \
+ done
+ if [ $(ISGIT) = 1 ] ; then git log -n1 --pretty=format:%H .
> ${OUTDIR}/${DIST}/COMMIT ; fi
+ echo ${RELEASE} > ${OUTDIR}/${DIST}/rpm_release_extension
+ cd ${OUTDIR}; tar czvf ${DIST}.tar.gz ${DIST}
+ @echo "${DIST}.tar.gz is located in ${OUTDIR}/${DIST}.tar.gz"
+
+ofeddist:
+ $(MAKE) -j $(nthreads) dist
+
+# rebuild the cscope database, skipping sccs files, done once for
+# top level
+cscope:
+ find * -type f ! -name '[ps].*' \( -iname '*.[cfhs]' -o \
+ -iname \\*.cc -o -name \\*.cpp -o -name \\*.f90 \) -print | cscope -bqu -i -
+
+sources-checksum:
+ @echo ${SOURCES_CHKSUM_VALUE}
+
+${TARGLIB}-objs := ptl_am/am_reqrep_shmem.o \
+ ptl_am/am_reqrep.o \
+ ptl_am/ptl.o \
+ ptl_am/cmarwu.o \
+ ptl_am/am_cuda_memhandle_cache.o \
+ psm_context.o \
+ psm_ep.o \
+ psm_ep_connect.o \
+ psm_error.o \
+ psm_utils.o \
+ psm_sysbuf.o \
+ psm_timer.o \
+ psm_am.o \
+ psm_mq.o \
+ psm_mq_utils.o \
+ psm_mq_recv.o \
+ psm_mpool.o \
+ psm_stats.o \
+ psm_memcpy.o \
+ psm_mock.o \
+ psm.o \
+ psm_perf.o \
+ libuuid/psm_uuid.o \
+ libuuid/parse.o \
+ libuuid/pack.o \
+ libuuid/unpack.o \
+ libuuid/unparse.o \
+ ptl_ips/ptl.o \
+ ptl_ips/ptl_rcvthread.o \
+ ptl_ips/ips_scb.o \
+ ptl_ips/ips_epstate.o \
+ ptl_ips/ips_recvq.o \
+ ptl_ips/ips_recvhdrq.o \
+ ptl_ips/ips_proto.o \
+ ptl_ips/ips_proto_recv.o \
+ ptl_ips/ips_proto_connect.o \
+ ptl_ips/ips_proto_expected.o \
+ ptl_ips/ips_tid.o \
+ ptl_ips/ips_tidcache.o \
+ ptl_ips/ips_tidflow.o \
+ ptl_ips/ips_crc32.o \
+ ptl_ips/ips_proto_dump.o \
+ ptl_ips/ips_proto_mq.o \
+ ptl_ips/ips_proto_am.o \
+ ptl_ips/ips_path_rec.o \
+ ptl_ips/ips_opp_path_rec.o \
+ ptl_ips/ips_writehdrq.o \
+ ptl_self/ptl.o \
+ opa/*.o \
+ psm_diags.o \
+ psm2_hal.o \
+ $(PSM_HAL_INSTANCE_OBJFILES) \
+ psmi_wrappers.o
+
+${TARGLIB}-objs := $(patsubst %.o, ${OUTDIR}/%.o, ${${TARGLIB}-objs})
+
+DEPS:= $(${TARGLIB}-objs:.o=.d)
+-include $(DEPS)
+
+${OUTDIR}/${TARGLIB}.so: ${OUTDIR}/${TARGLIB}.so.${MAJOR}
+ ln -fs ${TARGLIB}.so.${MAJOR}.${MINOR} $@
+
+${OUTDIR}/${TARGLIB}.so.${MAJOR}: ${OUTDIR}/${TARGLIB}.so.${MAJOR}.${MINOR}
+ ln -fs ${TARGLIB}.so.${MAJOR}.${MINOR} $@
+
+# when we build the shared library, generate a revision and date
+# string in it, for easier id'ing when people may have copied the
+# file around. Generate it such that the ident command can find it
+# and strings -a | grep OPA does a reasonable job as well.
+$(OUTDIR)/${TARGLIB}.so.${MAJOR}.${MINOR}: ${${TARGLIB}-objs} $(LINKER_SCRIPT_FILE)
+ echo "char psmi_hfi_IFS_version[]=\"`printenv RELEASE_TAG`\";" > ${OUTDIR}/_revision.c
+ date -u -d@$${SOURCE_DATE_EPOCH:-$$(date +%s)} +'char psmi_hfi_build_timestamp[] ="%F %T%:z";' >> ${OUTDIR}/_revision.c
+ echo "char psmi_hfi_sources_checksum[] =\"${SOURCES_CHKSUM_VALUE}\";" >> ${OUTDIR}/_revision.c
+ echo "char psmi_hfi_git_checksum[] =\"`git rev-parse HEAD`\";" >> ${OUTDIR}/_revision.c
+ $(CC) -c $(CFLAGS) $(BASECFLAGS) $(INCLUDES) ${OUTDIR}/_revision.c -o $(OUTDIR)/_revision.o
+ $(CC) $(LINKER_SCRIPT) $(LDFLAGS) -o $@ -Wl,-soname=${TARGLIB}.so.${MAJOR} -shared \
+ ${${TARGLIB}-objs} $(OUTDIR)/_revision.o $(LDLIBS)
+
+$(OUTDIR)/${TARGLIB}.a: $(OUTDIR)/${TARGLIB}.so.${MAJOR}.${MINOR}
+ $(AR) rcs $(OUTDIR)/${TARGLIB}.a ${${TARGLIB}-objs} $(OUTDIR)/_revision.o
+
+${OUTDIR}/%.o: ${top_srcdir}/%.c
+ $(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) -MMD -c $< -o $@
+
+$(LINKER_SCRIPT_FILE): psm2_linker_script_map.in
+ sed "s/_psm2_additional_globals_;/$(PSM2_ADDITIONAL_GLOBALS)/" \
+ psm2_linker_script_map.in > ${OUTDIR}/psm2_linker_script.map
+
+.PHONY: all %_clean clean config mock debug distclean symlinks cleanlinks install specfile dist ofeddist cscope sources-checksum
diff --git a/README b/README
new file mode 100644
index 0000000..a6efb40
--- /dev/null
+++ b/README
@@ -0,0 +1,299 @@
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2017 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2017 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ Copyright (c) 2003-2017 Intel Corporation. All rights reserved.
+
+================================================================================
+
+ABSTRACT
+--------
+
+Discusses how to build, install and test the PSM2 library source code.
+
+Contains the following sections:
+
+- INTRODUCTION
+- DEPENDENCIES
+- BUILDING
+ * BUILDING USING MAKEFILES
+ * BUILDING USING RPMBUILD (CREATING SOURCE AND BINARY RPM'S)
+- INSTALLING
+ * INSTALLING USING MAKEFILE
+ * INSTALLING USING EITHER YUM OR DNF
+- RELATED SOFTWARE TO PSM2
+- SUPPORTING DOCUMENTATION
+
+INTRODUCTION
+============
+
+This README file discusses how to build, install and test the PSM2 library
+source code.
+
+The PSM2 library supports a number of fabric media and stacks, and all of
+them run on version 7.X of Red Hat Enterprise Linux (abbreviated: RHEL) and
+SuSE SLES.
+
+Only the x86_64 architecture is supported.
+
+Building PSM2 is possible on RHEL 7.2+ as it ships with the hfi1 kernel driver.
+For older RHEL 7.x versions and SuSE SLES, OPA is not natively supported
+in the kernel and therefore building PSM2 is not possible unless
+you have the correct kernel-devel package or use the latest versions of IFS.
+
+There are two mechanisms for building and installing the PSM2 library:
+
+ 1. Use the provided Makefiles to build and install, or
+ 2. Generate the *.rpm files which you can then install using either
+ the yum or dnf command
+
+DEPENDENCIES
+============
+
+The following packages are required to build the PSM2 library source code:
+(all packages are for the x86_64 architecture)
+
+compat-rdma-devel
+gcc-4.8.2
+glibc-devel
+glibc-headers
+kernel-headers
+
+Additional packages for GPU Direct support include:
+NVIDIA CUDA toolkit 8.0 or greater. Older versions are not supported.
+
+In addition to depending on these packages, root privileges are required to
+install the runtime libraries and development header files into standard
+system locations.
+
+BUILDING
+========
+
+The instructions below use $BASENAME, $PRODUCT and $RELEASE to refer to
+the base name of the tarball and RPM that will be generated, and to the
+product and release identifiers of the RPM.
+
+The base name of the RPM changes depending on which version/branch
+of code you derive the tar file from.
+
+Up until v10.2 of PSM2, the base name for the RPM was hfi1-psm.
+From v10.2 onwards, the base name is libpsm2. The internal
+library remains unchanged and is still libpsm2.so.2.
+
+BUILDING USING MAKEFILES
+------------------------
+
+1. Untar the tarball:
+ $ tar zxvf $BASENAME-$PRODUCT-$RELEASE.tar.gz
+2. Change directory into the untarred location:
+ $ cd $BASENAME-$PRODUCT-$RELEASE
+3. Build:
+ 3.1. To build with GNU C (gcc), run make on the command line:
+ $ make
+ - or -
+ $ make CCARCH=gcc
+ 3.2. To build with Intel C (icc), specify the correct CCARCH:
+ $ make CCARCH=icc
+ 3.3.
To build with CUDA support, specify PSM_CUDA=1
+ on the command line along with the desired compiler:
+ $ make PSM_CUDA=1 CCARCH=gcc
+ - or -
+ $ make PSM_CUDA=1 CCARCH=icc
+
+BUILDING USING RPMBUILD
+-----------------------
+
+1. Run this command from your $PWD to generate the rpm and srpm files:
+ $ ./makesrpm.sh a
+
+ This command results in the following collection of rpm's and source
+ code rpm's under your $PWD/temp.X/ directory.
+ ("X" is the pid of the bash script that created the srpm and rpm files)
+ (Result shown here for RHEL systems.)
+
+ RPMS/x86_64/libpsm2-compat-10.3.7-1x86_64.rpm
+ RPMS/x86_64/libpsm2-devel-10.3.7-1x86_64.rpm
+ RPMS/x86_64/libpsm2-10.3.7-1x86_64.rpm
+ RPMS/x86_64/libpsm2-debuginfo-10.3.7-1x86_64.rpm
+ SRPMS/libpsm2-10.3.7-1.src.rpm
+
+ 1.1. Optionally, for GPU Direct support, run this command from your $PWD to
+ generate the rpm and srpm files:
+ $ ./makesrpm.sh a -cuda
+
+ This command results in the following collection of rpm's and source code
+ rpm's under your $PWD/temp.X/ directory. ("X" is the pid of the bash
+ script that created the srpm and rpm files):
+ RPMS/x86_64/libpsm2-10.3.7-1cuda.x86_64.rpm
+ RPMS/x86_64/libpsm2-compat-10.3.7-1cuda.x86_64.rpm
+ RPMS/x86_64/libpsm2-devel-10.3.7-1cuda.x86_64.rpm
+ SRPMS/x86_64/libpsm2-10.3.7-1cuda.src.rpm
+
+ On systems with SLES 12.3 or newer, the package name for the base libpsm2
+ RPM will be:
+ libpsm2-2-10.3.7-1.x86_64.rpm
+
+ Other supporting RPM package names will be as listed above.
+
+INSTALLING
+==========
+
+INSTALLING USING MAKEFILE
+-------------------------
+
+Install the libraries and header files on the system (as root):
+ $ make install
+
+The libraries will be installed in /usr/lib64, and the header files will
+be installed in /usr/include.
+
+This behavior can be altered by using the "DESTDIR" and "LIBDIR" variables on
+the "make install" command line. "DESTDIR" will add a leading path component
+to the overall install path and "LIBDIR" will change the path where libraries
+will be installed. For example, "make DESTDIR=/tmp/psm-install install" will
+install all files (libraries and headers) into "/tmp/psm-install/usr/...",
+"make DESTDIR=/tmp/psm-install LIBDIR=/libraries install" will install the
+libraries in "/tmp/psm-install/libraries" and the headers in
+"/tmp/psm-install/usr/include", and "make LIBDIR=/tmp/libs install" will
+install the libraries in "/tmp/libs" and the headers in "/usr/include".
+
+
+INSTALLING USING EITHER YUM OR DNF
+----------------------------------
+
+You can install the rpm's and source rpm's previously built with rpmbuild using
+either the yum or dnf command as the root user. See the appropriate man page
+for details of installing rpm's.
+
+Note: It is also possible to use the rpm command to install rpm's, but it is
+recommended that one use yum/dnf, as the rpm tool has issues with name changes
+and obsoletes tags. yum or dnf should be better able to resolve dependency
+issues.
+
+RELATED SOFTWARE TO PSM2
+========================
+
+MPI Libraries supported
+-----------------------
+A large number of open source (Open MPI, MVAPICH2) and vendor MPI
+implementations support PSM2 for optimized communication on HFIs. Vendor MPI
+implementations (HP-MPI, Intel MPI 4.0 with PMI, Platform/Scali MPI)
+require that the PSM2 runtime libraries be installed and available on
+each node. Usually a configuration file or a command line switch to mpirun
+needs to be specified to utilize the PSM2 transport, as sketched below.
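+
+For example, with Open MPI one can typically request the PSM2 transport
+explicitly on the mpirun command line. The following is only a sketch: the
+exact component names and options depend on the Open MPI version and build,
+and ./my_app is a placeholder for your MPI application:
+
+ $ mpirun -np 4 --mca pml cm --mca mtl psm2 ./my_app
+
+Other MPI implementations have their own equivalents; consult the relevant
+documentation for the corresponding option.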
+
+Open MPI support
+----------------
+If using a version of Open MPI that is not packaged within the IFS release, it
+is required to use at least v1.10.4. Older versions are not supported. Since
+v1.10.4 is not in active development, it is further recommended to use upstream
+versions v2.1.2 or newer.
+
+If NVIDIA* CUDA* support is desired, you can use Open MPI built with CUDA*
+support provided by Intel in the IFS installer 10.4 or newer. This Open MPI
+build is identified by the "-cuda-hfi" tag appended to the Open MPI base
+version name. The NVIDIA* CUDA* support changes have also been accepted into
+the v2.1.3, v3.0.1 and v3.1.0 branches of the upstream Open MPI repository.
+
+PSM2 header and runtime files need to be installed on a node where the Open MPI
+build is performed. All compute nodes additionally should have the PSM2 runtime
+libraries available on them. Open MPI provides a standard configure, make and
+make install mechanism which will detect and build the relevant PSM2 network
+modules for Open MPI once the header and runtime files are detected.
+
+MVAPICH2 support
+----------------
+MVAPICH2 supports the PSM2 transport for optimized communication on HFI hardware.
+OPA IFS supports MVAPICH2 v2.1 (or later). PSM2 header and runtime files
+need to be installed on a node where MVAPICH2 builds are performed. All
+compute nodes should also have the PSM2 runtime libraries available on them.
+
+For building and installing MVAPICH2 with OPA support, refer to the MVAPICH2
+user guides here:
+http://mvapich.cse.ohio-state.edu/userguide/
+
+(Note: Support for PSM2 is included in v2.2 and newer)
+
+OFED Support
+------------
+Intel OPA is not yet included within OFED, but the hfi1 driver is available
+publicly at kernel.org.
+
+SUPPORTING DOCUMENTATION
+------------------------
+The PSM2 Programmer's Guide is published along with documentation for "Intel® Omni-Path
+Host Fabric Interface PCIe Adapter 100 Series"
+(https://www.intel.com/content/www/us/en/support/articles/000016242/network-and-i-o/fabric-products.html)
+
+Refer to this document for a description of the APIs and environment variables
+that are available for use. For sample code on writing applications leveraging
+the PSM2 APIs, refer to Section 5.
+
+PSM Compatibility Support
+-------------------------
+
+libpsm2-compat supports applications that use the PSM API instead of
+the PSM2 API, through a compatibility library. This library is an interface
+between PSM applications and the PSM2 API.
+
+If the system has an application that is coded to use PSM and has requirements
+to use PSM2 (i.e. the host has Omni-Path hardware), the compatibility library
+must be used.
+
+Please refer to your operating system's documentation to find out how to modify
+the order in which system directories are searched for dynamic libraries. The
+libpsm2-compat version of libpsm_infinipath.so.1 must be earlier on the search
+path than that of libpsm-infinipath. Doing so allows applications coded to PSM
+to transparently use the PSM2 API and devices which require it.
+
+Please note that the installation path for the libpsm2-compat version of
+libpsm_infinipath.so.1 will differ depending on your operating system
+specifics. Common locations include:
+- /usr/lib64/psm2-compat/
+- /usr/lib/psm2-compat/
+
diff --git a/buildflags.mak b/buildflags.mak
new file mode 100644
index 0000000..6790fb7
--- /dev/null
+++ b/buildflags.mak
@@ -0,0 +1,223 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+# +# GPL LICENSE SUMMARY +# +# Copyright(c) 2016 Intel Corporation. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# Contact Information: +# Intel Corporation, www.intel.com +# +# BSD LICENSE +# +# Copyright(c) 2016 Intel Corporation. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Copyright (c) 2003-2016 Intel Corporation. All rights reserved. +# + +# set top_srcdir and include this file + +ifeq (,$(top_srcdir)) +$(error top_srcdir must be set to include makefile fragment) +endif + +export os ?= $(shell uname -s | tr '[A-Z]' '[a-z]') +export arch := $(shell uname -m | sed -e 's,\(i[456]86\|athlon$$\),i386,') + +ifeq (${CCARCH},gcc) + export CC := gcc +else + ifeq (${CCARCH},gcc4) + export CC := gcc4 + else + ifeq (${CCARCH},icc) + export CC := icc + else + anerr := $(error Unknown C compiler arch: ${CCARCH}) + endif # ICC + endif # gcc4 +endif # gcc + +ifeq (${FCARCH},gfortran) + export FC := gfortran +else + anerr := $(error Unknown Fortran compiler arch: ${FCARCH}) +endif # gfortran + +BASECFLAGS := $(BASE_FLAGS) -pthread +LDFLAGS += $(BASE_FLAGS) +ASFLAGS += $(BASE_FLAGS) + +ifeq ($(PSM2_MOCK_TESTING),1) +BASECFLAGS += -DPSM2_MOCK_TESTING=1 +unexport LINKER_SCRIPT +# We skip the linker script for mock testing version, we want all symbols +# to be reachable from outside the library +else +LINKER_SCRIPT := -Wl,--version-script $(LINKER_SCRIPT_FILE) +endif + +WERROR := -Werror +INCLUDES := -I$(top_srcdir)/include -I$(top_srcdir)/mpspawn -I$(top_srcdir)/include/$(os)-$(arch) + +# +# use IFS provided hfi1_user.h if installed. 
+# +IFS_HFI_HEADER_PATH := /usr/include/uapi +INCLUDES += -I${IFS_HFI_HEADER_PATH} + +BASECFLAGS +=-Wall $(WERROR) + +# +# test if compiler supports 32B(AVX2)/64B(AVX512F) move instruction. +# +ifeq (${CC},icc) + ifeq ($(PSM_DISABLE_AVX2),) + MAVX2=-xATOM_SSE4.2 -DPSM_AVX512 + else + MAVX2=-march=core-avx-i + endif +else + ifeq ($(PSM_DISABLE_AVX2),) + MAVX2=-mavx2 + else + MAVX2=-mavx + endif +endif + +ifneq (icc,${CC}) + ifeq ($(PSM_DISABLE_AVX2),) + RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX2 ; echo $$?) + else + RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX ; echo $$?) + $(warning ***NOTE TO USER**** Disabling AVX2 will harm performance) + endif + + ifeq (0,${RET}) + BASECFLAGS += ${MAVX2} + else + $(error Compiler does not support ${MAVX2} ) + endif +else + BASECFLAGS += ${MAVX2} +endif + +# This support is dynamic at runtime, so is OK to enable as long as compiler can generate +# the code. +ifneq (,${PSM_AVX512}) + ifneq (icc,${CC}) + RET := $(shell echo "int main() {}" | ${CC} -mavx512f -E -dM -xc - 2>&1 | grep -q AVX512 ; echo $$?) + ifeq (0,${RET}) + BASECFLAGS += -mavx512f + else + $(error Compiler does not support AVX512 ) + endif + BASECFLAGS += -DPSM_AVX512 + endif +endif + +# +# feature test macros for drand48_r +# +BASECFLAGS += -D_DEFAULT_SOURCE -D_SVID_SOURCE -D_BSD_SOURCE + +ifneq (,${HFI_BRAKE_DEBUG}) + BASECFLAGS += -DHFI_BRAKE_DEBUG +endif +ifneq (,${PSM_DEBUG}) + BASECFLAGS += -O -g3 -DPSM_DEBUG -D_HFI_DEBUGGING -funit-at-a-time -Wp,-D_FORTIFY_SOURCE=2 +else + BASECFLAGS += -O3 -g3 +endif +ifneq (,${PSM_COVERAGE}) # This check must come after PSM_DEBUG to override optimization setting + BASECFLAGS += -O -fprofile-arcs -ftest-coverage + LDFLAGS += -fprofile-arcs +endif +ifneq (,${PSM_LOG}) + BASECFLAGS += -DPSM_LOG +ifneq (,${PSM_LOG_FAST_IO}) + BASECFLAGS += -DPSM_LOG_FAST_IO + PSM2_ADDITIONAL_GLOBALS += psmi_log_fini;psmi_log_message; +endif +endif +ifneq (,${PSM_PERF}) + BASECFLAGS += -DRDPMC_PERF_FRAMEWORK +endif +ifneq (,${PSM_HEAP_DEBUG}) + BASECFLAGS += -DPSM_HEAP_DEBUG + PSM2_ADDITIONAL_GLOBALS += _psmi_heapdebug_val_heapallocs; +endif +ifneq (,${PSM_PROFILE}) + BASECFLAGS += -DPSM_PROFILE +endif +ifneq (,${PSM_CUDA}) + BASECFLAGS += -DNVIDIA_GPU_DIRECT -DPSM_CUDA + CUDA_HOME ?= /usr/local/cuda + INCLUDES += -I$(CUDA_HOME)/include +endif + +BASECFLAGS += -fpic -fPIC -D_GNU_SOURCE + +ASFLAGS += -g3 -fpic + +BASECFLAGS += ${OPA_CFLAGS} + +ifeq (${CCARCH},icc) + BASECFLAGS += -fpic -fPIC -D_GNU_SOURCE -DPACK_STRUCT_STL=packed, + LDFLAGS += -static-intel +else + ifeq (${CCARCH},gcc) + BASECFLAGS += -funwind-tables -Wno-strict-aliasing -Wformat-security + else + ifneq (${CCARCH},gcc4) + $(error Unknown compiler arch "${CCARCH}") + endif # gcc4 + endif # gcc +endif # icc + +# We run export here to ensure all the above setup is in the environment +# for sub makes. However, we exclude this during clean and distclean +# to avoid resolution of some variables that don't need to be resolved +# and avoid unnecessary missing file warnings during cleanup. +ifneq ($(MAKECMDGOALS), clean) +ifneq ($(MAKECMDGOALS), distclean) +export +endif +endif + diff --git a/compat/40-psm-compat.rules b/compat/40-psm-compat.rules new file mode 100644 index 0000000..fc7c4b1 --- /dev/null +++ b/compat/40-psm-compat.rules @@ -0,0 +1,52 @@ +# +# This file is provided under a dual BSD/GPLv2 license. When using or +# redistributing this file, you may do so under either license. 
+# +# GPL LICENSE SUMMARY +# +# Copyright(c) 2015 Intel Corporation. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# Contact Information: +# Intel Corporation, www.intel.com +# +# BSD LICENSE +# +# Copyright(c) 2015 Intel Corporation. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +KERNEL=="hfi1", SYMLINK+="ipath" +KERNEL=="hfi1_[0-9]", MODE="0666", SYMLINK+="ipath" diff --git a/compat/Makefile b/compat/Makefile new file mode 100644 index 0000000..996b7e9 --- /dev/null +++ b/compat/Makefile @@ -0,0 +1,90 @@ +# +# This file is provided under a dual BSD/GPLv2 license. When using or +# redistributing this file, you may do so under either license. +# +# GPL LICENSE SUMMARY +# +# Copyright(c) 2015 Intel Corporation. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# Contact Information: +# Intel Corporation, www.intel.com +# +# BSD LICENSE +# +# Copyright(c) 2015 Intel Corporation. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +OUTDIR = . + +COMPATLIB := libpsm_infinipath +COMPAT_LIB_TARG := $(INSTALL_LIB_TARG)/psm2-compat +compat_build_dir := $(shell readlink -m .) + +MAJOR := $(shell sed -n 's/^\#define.*PSM2_VERNO_COMPAT_MAJOR.*0x0\?\([1-9a-f]\?[0-9a-f]\+\).*/\1/p' ../psm2.h) + +top_srcdir := $(compat_build_dir)/.. +include $(compat_build_dir)/buildflags.mak +INCLUDES += -I$(top_srcdir) + +${COMPATLIB}-objs := psm-compat.o +${COMPATLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${COMPATLIB}-objs}) + +DEPS:= $(${COMPATLIB}-objs:.o=.d) +-include $(DEPS) + +all .DEFAULT: ${${COMPATLIB}-objs} $(OUTDIR)/${COMPATLIB}.so.${MAJOR} + +install: all + install -m 0644 -D 40-psm-compat.rules ${DESTDIR}$(UDEVDIR)/rules.d/40-psm-compat.rules + install -m 0644 -D libpsm2-compat.conf ${DESTDIR}${LIBPSM2_COMPAT_CONF_DIR}/modprobe.d/libpsm2-compat.conf + install -m 0755 -D libpsm2-compat.cmds ${DESTDIR}/usr/lib/libpsm2/libpsm2-compat.cmds + install -D $(OUTDIR)/${COMPATLIB}.so.${MAJOR} ${DESTDIR}${COMPAT_LIB_TARG}/${COMPATLIB}.so.${MAJOR} + +$(OUTDIR)/%.o: $(compat_build_dir)/%.c + $(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) -MMD -c $< -o $@ + +$(OUTDIR)/${COMPATLIB}.so.${MAJOR}: ${${COMPATLIB}-objs} + $(CC) $(BASECFLAGS) $(LINKER_SCRIPT) $(LDFLAGS) -Wl,-soname=${COMPATLIB}.so.${MAJOR} -shared \ + -L$(OUTDIR)/.. ${${COMPATLIB}-objs} -lpsm2 -o $@ + +clean: + @if [ -d $(OUTDIR) ]; then \ + cd $(OUTDIR); \ + rm -f *.o *.d *.gcda *.gcno ${COMPATLIB}.*; \ + cd -; \ + fi diff --git a/compat/buildflags.mak b/compat/buildflags.mak new file mode 100644 index 0000000..b448e4e --- /dev/null +++ b/compat/buildflags.mak @@ -0,0 +1,100 @@ +# +# This file is provided under a dual BSD/GPLv2 license. When using or +# redistributing this file, you may do so under either license. +# +# GPL LICENSE SUMMARY +# +# Copyright(c) 2015 Intel Corporation. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. 
+# +# Contact Information: +# Intel Corporation, www.intel.com +# +# BSD LICENSE +# +# Copyright(c) 2015 Intel Corporation. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +ifeq (,$(top_srcdir)) +$(error top_srcdir must be set to include makefile fragment) +endif + +export os ?= $(shell uname -s | tr '[A-Z]' '[a-z]') +export arch := $(shell uname -m | sed -e 's,\(i[456]86\|athlon$$\),i386,') +export CCARCH ?= gcc + +ifeq (${CCARCH},gcc) + export CC := gcc +else + ifeq (${CCARCH},gcc4) + export CC := gcc4 + else + ifeq (${CCARCH},icc) + export CC := icc + else + anerr := $(error Unknown C compiler arch: ${CCARCH}) + endif # ICC + endif # gcc4 +endif # gcc + +BASECFLAGS += $(BASE_FLAGS) +LDFLAGS += $(BASE_FLAGS) +ASFLAGS += $(BASE_FLAGS) + +LINKER_SCRIPT_FILE := psm2_compat_linker_script.map +LINKER_SCRIPT := -Wl,--version-script $(LINKER_SCRIPT_FILE) +WERROR := -Werror +INCLUDES := -I$(top_srcdir)/include -I$(top_srcdir)/include/$(os)-$(arch) -I$(top_srcdir)/mpspawn + +BASECFLAGS +=-Wall $(WERROR) + +BASECFLAGS += -fpic -fPIC + +ASFLAGS += -g3 -fpic + +ifeq (${CCARCH},icc) + BASECFLAGS += -O3 -g3 + LDFLAGS += -static-intel +else + ifeq (${CCARCH},gcc) + BASECFLAGS += -Wno-strict-aliasing + else + ifneq (${CCARCH},gcc4) + $(error Unknown compiler arch "${CCARCH}") + endif + endif +endif diff --git a/compat/libpsm2-compat.cmds b/compat/libpsm2-compat.cmds new file mode 100755 index 0000000..dcead1e --- /dev/null +++ b/compat/libpsm2-compat.cmds @@ -0,0 +1,70 @@ +#!/bin/sh +# +# This file is provided under a dual BSD/GPLv2 license. When using or +# redistributing this file, you may do so under either license. +# +# GPL LICENSE SUMMARY +# +# Copyright(c) 2015 Intel Corporation. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# This script was created to allow an hfi1 and a qib adapter
+# to co-exist on the same machine.
+# The symlink from /dev/ipath is removed to allow ib_qib to load
+# correctly and create a proper device file.
+
+case "$1" in
+start)
+	# Remove symlink if hfi1 was loaded first
+	if [ -L "/dev/ipath" ]; then
+		rm /dev/ipath
+	fi
+	;;
+stop)
+	# Restore symlink if hfi1 is loaded
+	# (/dev/hfi1 is a character device, so test existence with -e, not -f)
+	if [ -e "/dev/hfi1" ] && ! [ -L "/dev/ipath" ]; then
+		ln -s /dev/hfi1 /dev/ipath
+	fi
+	;;
+esac
diff --git a/compat/libpsm2-compat.conf b/compat/libpsm2-compat.conf
new file mode 100644
index 0000000..d71e8f2
--- /dev/null
+++ b/compat/libpsm2-compat.conf
@@ -0,0 +1,52 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +install ib_qib /usr/lib/libpsm2/libpsm2-compat.cmds start; modprobe -i ib_qib $CMDLINE_OPTS +remove ib_qib modprobe -r -i ib_qib && /usr/lib/libpsm2/libpsm2-compat.cmds stop diff --git a/compat/psm-compat.c b/compat/psm-compat.c new file mode 100644 index 0000000..7d12165 --- /dev/null +++ b/compat/psm-compat.c @@ -0,0 +1,336 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include +#include "../psm2.h" +#include "../psm2_mq.h" +#include "../psm2_am.h" + +/* Functions from TS psm.h */ +psm2_error_t +psm_init(int *major, int *minor) +{ + return psm2_init(major, minor); +} + +psm2_error_t +psm_finalize(void) +{ + return psm2_finalize(); +} + +psm2_error_t +psm_map_nid_hostname(int num, const uint64_t *nids, const char **hostnames) +{ + return psm2_map_nid_hostname(num, nids, hostnames); +} + +void +psm_epaddr_setlabel(psm2_epaddr_t epaddr, char const *epaddr_label) +{ + return psm2_epaddr_setlabel(epaddr, epaddr_label); +} + +void +psm_epaddr_setctxt(psm2_epaddr_t epaddr, void *ctxt) +{ + psm2_epaddr_setctxt(epaddr, ctxt); +} + +void * +psm_epaddr_getctxt(psm2_epaddr_t epaddr) +{ + return psm2_epaddr_getctxt(epaddr); +} + +psm2_error_t +psm_setopt(psm2_component_t component, const void *component_obj, + int optname, const void *optval, uint64_t optlen) +{ + return psm2_setopt(component, component_obj, + optname, optval, optlen); +} + +psm2_error_t +psm_getopt(psm2_component_t component, const void *component_obj, + int optname, void *optval, uint64_t *optlen) +{ + return psm2_getopt(component, component_obj, + optname, optval, optlen); +} + +psm2_error_t +psm_poll(psm2_ep_t ep) +{ + return psm2_poll(ep); +} + +void +psm_uuid_generate(psm2_uuid_t uuid_out) +{ + psm2_uuid_generate(uuid_out); +} + +/* Functions from TS psm_am.h */ +psm2_error_t +psm_am_register_handlers(psm2_ep_t ep, + const psm2_am_handler_fn_t *handlers, + int num_handlers, int *handlers_idx) +{ + return psm2_am_register_handlers(ep, handlers, num_handlers, handlers_idx); +} + +psm2_error_t +psm_am_request_short(psm2_epaddr_t epaddr, psm2_handler_t handler, + psm2_amarg_t *args, int nargs, void *src, size_t len, + int flags, psm2_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + return psm2_am_request_short(epaddr, handler, args, nargs, src, len, flags, completion_fn, completion_ctxt); +} + +psm2_error_t +psm_am_reply_short(psm2_am_token_t token, psm2_handler_t handler, + psm2_amarg_t *args, int nargs, void *src, size_t len, + int flags, psm2_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + return psm2_am_reply_short(token, handler, args, nargs, src, len, flags, completion_fn, completion_ctxt); +} + +psm2_error_t +psm_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters, + size_t sizeof_parameters_in, + size_t *sizeof_parameters_out) +{ + return psm2_am_get_parameters(ep, parameters, sizeof_parameters_in, sizeof_parameters_out); +} + + +/* Functions from TS psm_error.h */ + +psm2_error_t +psm_error_defer(psm2_error_token_t token) +{ + return psm2_error_defer(token); +} + +psm2_error_t +psm_error_register_handler(psm2_ep_t ep, const psm2_ep_errhandler_t errhandler) +{ + return psm2_error_register_handler(ep, errhandler); +} + +const char * +psm_error_get_string(psm2_error_t error) +{ + return psm2_error_get_string(error); +} + +/* Functions from TS psm_mq.h */ +psm2_error_t +psm_mq_iprobe(psm2_mq_t mq, uint64_t tag, 
uint64_t tagsel, psm2_mq_status_t *status) +{ + return psm2_mq_iprobe(mq, tag, tagsel, status); +} + +psm2_error_t +psm_mq_cancel(psm2_mq_req_t *ireq) +{ + return psm2_mq_cancel(ireq); +} + +psm2_error_t +psm_mq_wait(psm2_mq_req_t *ireq, psm2_mq_status_t *status) +{ + return psm2_mq_wait(ireq, status); +} + +psm2_error_t +psm_mq_test(psm2_mq_req_t *ireq, psm2_mq_status_t *status) +{ + return psm2_mq_test(ireq, status); +} + +psm2_error_t +psm_mq_isend(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag, + const void *buf, uint32_t len, void *context, psm2_mq_req_t *req) +{ + return psm2_mq_isend(mq, dest, flags, stag, buf, len, context, req); +} + +psm2_error_t +psm_mq_send(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag, + const void *buf, uint32_t len) +{ + return psm2_mq_send(mq, dest, flags, stag, buf, len); +} + +psm2_error_t +psm_mq_irecv(psm2_mq_t mq, uint64_t tag, uint64_t tagsel, uint32_t flags, + void *buf, uint32_t len, void *context, psm2_mq_req_t *reqo) +{ + return psm2_mq_irecv(mq, tag, tagsel, flags, buf, len, context, reqo); +} + +psm2_error_t +psm_mq_ipeek(psm2_mq_t mq, psm2_mq_req_t *oreq, psm2_mq_status_t *status) +{ + return psm2_mq_ipeek(mq, oreq, status); +} + +psm2_error_t +psm_mq_getopt(psm2_mq_t mq, int key, void *value) +{ + return psm2_mq_getopt(mq, key, value); +} + +psm2_error_t +psm_mq_setopt(psm2_mq_t mq, int key, const void *value) +{ + return psm2_mq_setopt(mq, key, value); +} + +psm2_error_t +psm_mq_init(psm2_ep_t ep, uint64_t tag_order_mask, + const struct psm2_optkey *opts, + int numopts, psm2_mq_t *mqo) +{ + return psm2_mq_init(ep, tag_order_mask, opts, numopts, mqo); +} + +psm2_error_t +psm_mq_finalize(psm2_mq_t mq) +{ + return psm2_mq_finalize(mq); +} + +void +psm_mq_get_stats(psm2_mq_t mq, psm2_mq_stats_t *stats) +{ + psm2_mq_get_stats(mq, stats); +} + +/* Functions from TS psm_mq.h */ +psm2_error_t +psm_ep_num_devunits(uint32_t *num_units_o) +{ + return psm2_ep_num_devunits(num_units_o); +} + +uint64_t +psm_epid_nid(psm2_epid_t epid) +{ + return psm2_epid_nid(epid); +} + +uint64_t +psm_epid_context(psm2_epid_t epid) +{ + return psm2_epid_context(epid); +} + +uint64_t +psm_epid_port(psm2_epid_t epid) +{ + return psm2_epid_port(epid); +} + +psm2_error_t +psm_ep_query (int *num_of_epinfo, psm2_epinfo_t *array_of_epinfo) +{ + return psm2_ep_query (num_of_epinfo, array_of_epinfo); +} + +psm2_error_t +psm_ep_epid_lookup (psm2_epid_t epid, psm2_epconn_t *epconn) +{ + return psm2_ep_epid_lookup (epid, epconn); +} + +psm2_error_t +psm_ep_epid_share_memory(psm2_ep_t ep, psm2_epid_t epid, int *result_o) +{ + return psm2_ep_epid_share_memory(ep, epid, result_o); +} + +psm2_error_t +psm_ep_open_opts_get_defaults(struct psm2_ep_open_opts *opts) +{ + return psm2_ep_open_opts_get_defaults(opts); +} + +psm2_error_t +psm_ep_open(psm2_uuid_t const unique_job_key, struct psm2_ep_open_opts const *opts_i, + psm2_ep_t *epo, psm2_epid_t *epido) +{ + return psm2_ep_open(unique_job_key, opts_i, epo, epido); +} + +psm2_error_t +psm_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in) +{ + return psm2_ep_close(ep, mode, timeout_in); +} + +psm2_error_t +psm_ep_connect(psm2_ep_t ep, int num_of_epid, + psm2_epid_t const *array_of_epid, + int const *array_of_epid_mask, + psm2_error_t *array_of_errors, + psm2_epaddr_t *array_of_epaddr, + int64_t timeout) +{ + return psm2_ep_connect(ep, num_of_epid, array_of_epid, array_of_epid_mask, + array_of_errors, array_of_epaddr, timeout); +} diff --git a/compat/psm2_compat_linker_script.map 
b/compat/psm2_compat_linker_script.map new file mode 100644 index 0000000..0933c68 --- /dev/null +++ b/compat/psm2_compat_linker_script.map @@ -0,0 +1,66 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* See http://sourceware.org/binutils/docs/ld/VERSION.html#VERSION for more info. + C++ // Comments don't work in this file. */ + +PSM_1.0 +{ + /* Expose only those symbols we choose to. This way we do not + pollute users namespace more than absolutely necessary. */ + global: + psm_*; + + /* Make all other symbols local */ + local: + *; +}; diff --git a/debian/changelog.in b/debian/changelog.in new file mode 100644 index 0000000..2b3f41b --- /dev/null +++ b/debian/changelog.in @@ -0,0 +1,11 @@ +libpsm2 (10.2.91) UNRELEASED; urgency=medium + + * Add Ubuntu support + + -- Tymoteusz Kielan Thu, 08 Dec 2016 11:49:12 +0100 + +hfi1-psm (0.7) UNRELEASED; urgency=medium + + * Initial release + + -- Brian T. Smith Mon, 14 Mar 2016 12:26:35 -0500 diff --git a/debian/changelog.tmpl b/debian/changelog.tmpl new file mode 100644 index 0000000..2b3f41b --- /dev/null +++ b/debian/changelog.tmpl @@ -0,0 +1,11 @@ +libpsm2 (10.2.91) UNRELEASED; urgency=medium + + * Add Ubuntu support + + -- Tymoteusz Kielan Thu, 08 Dec 2016 11:49:12 +0100 + +hfi1-psm (0.7) UNRELEASED; urgency=medium + + * Initial release + + -- Brian T. 
Smith Mon, 14 Mar 2016 12:26:35 -0500 diff --git a/debian/compat b/debian/compat new file mode 100644 index 0000000..ec63514 --- /dev/null +++ b/debian/compat @@ -0,0 +1 @@ +9 diff --git a/debian/control b/debian/control new file mode 100644 index 0000000..57e05e0 --- /dev/null +++ b/debian/control @@ -0,0 +1,26 @@ +Source: libpsm2 +Maintainer: Tymoteusz Kielan +Section: libs +Priority: optional +Standards-Version: 3.9.8 +Build-Depends: debhelper (>= 9), uuid-dev, libnuma-dev + +Package: libpsm2 +Architecture: linux-any +Depends: ${misc:Depends}, ${shlibs:Depends} +Description: Intel PSM2 library + PSM2 is Intel's low-level user-space communications interface for + the Intel(R) OPA family of products. PSM2 users are enabled with + mechanisms necessary to implement higher level communications + interfaces in parallel environments. + +Package: libpsm2-dev +Architecture: linux-any +Section: libdevel +Depends: ${misc:Depends}, libpsm2 (= ${binary:Version}), uuid-dev, libnuma-dev +Description: Development files for Intel PSM2 library + PSM2 is Intel's low-level user-space communications interface for + the Intel(R) OPA family of products. PSM2 users are enabled with + mechanisms necessary to implement higher level communications + interfaces in parallel environments. This package contains the + development headers for Intel PSM2 library. diff --git a/debian/copyright b/debian/copyright new file mode 100644 index 0000000..c5c9349 --- /dev/null +++ b/debian/copyright @@ -0,0 +1,38 @@ + +/usr/share/common-licenses/GPL-2 + +Contact Information: +Intel Corporation, www.intel.com + +BSD LICENSE + +Copyright(c) 2017 Intel Corporation. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Copyright(c) 2014-2017 Intel Corporation. All rights reserved. +Copyright(c) 2016 System Fabric Works, Inc. All Rights Reserved. 
diff --git a/debian/libpsm2-dev.install b/debian/libpsm2-dev.install new file mode 100644 index 0000000..81db1ed --- /dev/null +++ b/debian/libpsm2-dev.install @@ -0,0 +1,21 @@ +/usr/lib/libpsm2.so +/usr/include/psm2.h +/usr/include/psm2_mq.h +/usr/include/psm2_am.h +/usr/include/hfi1diag/hfi1_deprecated_gen1.h +/usr/include/hfi1diag/linux-x86_64/bit_ops.h +/usr/include/hfi1diag/linux-x86_64/sysdep.h +/usr/include/hfi1diag/opa_udebug.h +/usr/include/hfi1diag/opa_debug.h +/usr/include/hfi1diag/opa_intf.h +/usr/include/hfi1diag/opa_user.h +/usr/include/hfi1diag/opa_service.h +/usr/include/hfi1diag/opa_byteorder.h +/usr/include/hfi1diag/opa_common_gen1.h +/usr/include/hfi1diag/opa_revision.h +/usr/include/hfi1diag/opa_service.h +/usr/include/hfi1diag/opa_service_gen1.h +/usr/include/hfi1diag/opa_user.h +/usr/include/hfi1diag/opa_user_gen1.h +/usr/include/hfi1diag/psm2_mock_testing.h +/usr/include/hfi1diag/psmi_wrappers.h diff --git a/debian/libpsm2.install b/debian/libpsm2.install new file mode 100644 index 0000000..f4f30e3 --- /dev/null +++ b/debian/libpsm2.install @@ -0,0 +1,3 @@ +/usr/lib/libpsm2.so.2.1 +/usr/lib/libpsm2.so.2 +/lib/udev/rules.d/40-psm.rules diff --git a/debian/rules b/debian/rules new file mode 100755 index 0000000..3ccf7d3 --- /dev/null +++ b/debian/rules @@ -0,0 +1,12 @@ +#!/usr/bin/make -f + +export DEB_BUILD_MAINT_OPTIONS=hardening=+all + +# Specify the library installation directory +export LIBDIR=/usr/lib + +%: + dh $@ --parallel + +override_dh_installdocs: + dh_installdocs --link-doc=libpsm2 diff --git a/debian/source/format b/debian/source/format new file mode 100644 index 0000000..89ae9db --- /dev/null +++ b/debian/source/format @@ -0,0 +1 @@ +3.0 (native) diff --git a/debian/source/options b/debian/source/options new file mode 100644 index 0000000..b7bc1f2 --- /dev/null +++ b/debian/source/options @@ -0,0 +1 @@ +compression = "xz" diff --git a/debian/symbols b/debian/symbols new file mode 100644 index 0000000..6a7f883 --- /dev/null +++ b/debian/symbols @@ -0,0 +1,75 @@ +libpsm2.so.2 libpsm2 #MINVER# + PSM2_1.0@PSM2_1.0 10.2 + __hfi_dbgout@PSM2_1.0 10.2 + __hfi_mylabel@PSM2_1.0 10.2 + __hfi_pico_per_cycle@PSM2_1.0 10.2 + hfi_cmd_write@PSM2_1.0 10.2 + hfi_context_close@PSM2_1.0 10.2 + hfi_context_open@PSM2_1.0 10.2 + hfi_debug@PSM2_1.0 10.2 + hfi_get_mylabel@PSM2_1.0 10.2 + hfi_get_port_lid@PSM2_1.0 10.2 + hfi_get_port_vl2mtu@PSM2_1.0 10.2 + hfi_mmap64@PSM2_1.0 10.2 + hfi_poll_type@PSM2_1.0 10.2 + hfi_set_mylabel@PSM2_1.0 10.2 + hfi_userinit@PSM2_1.0 10.2 + hfi_wait_for_packet@PSM2_1.0 10.2 + psm2_am_get_parameters@PSM2_1.0 10.2 + psm2_am_get_source@PSM2_1.0 10.2 + psm2_am_register_handlers@PSM2_1.0 10.2 + psm2_am_reply_short@PSM2_1.0 10.2 + psm2_am_request_short@PSM2_1.0 10.2 + psm2_capabilities_bitset@PSM2_1.0 10.3.0 + psm2_ep_close@PSM2_1.0 10.2 + psm2_ep_connect@PSM2_1.0 10.2 + psm2_ep_disconnect2@PSM2_1.0 10.3.0 + psm2_ep_disconnect@PSM2_1.0 10.2 + psm2_ep_epid_lookup2@PSM2_1.0 10.3.0 + psm2_ep_epid_lookup@PSM2_1.0 10.2 + psm2_ep_epid_share_memory@PSM2_1.0 10.2 + psm2_ep_num_devunits@PSM2_1.0 10.2 + psm2_ep_open@PSM2_1.0 10.2 + psm2_ep_open_opts_get_defaults@PSM2_1.0 10.2 + psm2_ep_query@PSM2_1.0 10.2 + psm2_epaddr_getctxt@PSM2_1.0 10.2 + psm2_epaddr_setctxt@PSM2_1.0 10.2 + psm2_epaddr_setlabel@PSM2_1.0 10.2 + psm2_epaddr_to_epid@PSM2_1.0 10.3.0 + psm2_epid_context@PSM2_1.0 10.2 + psm2_epid_nid@PSM2_1.0 10.2 + psm2_epid_port@PSM2_1.0 10.2 + psm2_error_defer@PSM2_1.0 10.2 + psm2_error_get_string@PSM2_1.0 10.2 + psm2_error_register_handler@PSM2_1.0 10.2 + 
psm2_finalize@PSM2_1.0 10.2 + psm2_get_capability_mask@PSM2_1.0 10.3.0 + psm2_getopt@PSM2_1.0 10.2 + psm2_init@PSM2_1.0 10.2 + psm2_map_nid_hostname@PSM2_1.0 10.2 + psm2_mq_cancel@PSM2_1.0 10.2 + psm2_mq_finalize@PSM2_1.0 10.2 + psm2_mq_get_stats@PSM2_1.0 10.2 + psm2_mq_getopt@PSM2_1.0 10.2 + psm2_mq_improbe2@PSM2_1.0 10.2 + psm2_mq_improbe@PSM2_1.0 10.2 + psm2_mq_imrecv@PSM2_1.0 10.2 + psm2_mq_init@PSM2_1.0 10.2 + psm2_mq_ipeek2@PSM2_1.0 10.2 + psm2_mq_ipeek@PSM2_1.0 10.2 + psm2_mq_iprobe2@PSM2_1.0 10.2 + psm2_mq_iprobe@PSM2_1.0 10.2 + psm2_mq_irecv2@PSM2_1.0 10.2 + psm2_mq_irecv@PSM2_1.0 10.2 + psm2_mq_isend2@PSM2_1.0 10.2 + psm2_mq_isend@PSM2_1.0 10.2 + psm2_mq_send2@PSM2_1.0 10.2 + psm2_mq_send@PSM2_1.0 10.2 + psm2_mq_setopt@PSM2_1.0 10.2 + psm2_mq_test2@PSM2_1.0 10.2 + psm2_mq_test@PSM2_1.0 10.2 + psm2_mq_wait2@PSM2_1.0 10.2 + psm2_mq_wait@PSM2_1.0 10.2 + psm2_poll@PSM2_1.0 10.2 + psm2_setopt@PSM2_1.0 10.2 + psm2_uuid_generate@PSM2_1.0 10.2 diff --git a/include/linux-i386/bit_ops.h b/include/linux-i386/bit_ops.h new file mode 100644 index 0000000..d272e75 --- /dev/null +++ b/include/linux-i386/bit_ops.h @@ -0,0 +1,98 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ + +#ifndef _HFI_i386_BIT_OPS_H +#define _HFI_i386_BIT_OPS_H + +static __inline__ void ips_clear_bit(int nr, volatile unsigned long *addr) +{ + asm volatile (LOCK_PREFIX "btrl %1,%0" : "=m"(*addr) : "dIr"(nr)); +} + +static __inline__ void ips_change_bit(int nr, volatile unsigned long *addr) +{ + asm volatile (LOCK_PREFIX "btcl %1,%0" : "=m"(*addr) : "dIr"(nr)); +} + +static __inline__ int ips_test_and_set_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm volatile (LOCK_PREFIX "btsl %2,%1\n\tsbbl %0,%0" : "=r"(oldbit), + "=m"(*addr) : "dIr"(nr) : "memory"); + return oldbit; +} + +static __inline__ void ips___clear_bit(int nr, volatile unsigned long *addr) +{ + asm volatile ("btrl %1,%0" : "=m" (*addr) : "dIr"(nr)); +} + +static __inline__ void ips___change_bit(int nr, volatile unsigned long *addr) +{ + asm volatile ("btcl %1,%0" : "=m" (*addr) : "dIr"(nr)); +} + +static __inline__ int ips___test_and_set_bit(int nr, + volatile unsigned long *addr) +{ + int oldbit; + + asm volatile ("btsl %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), + "=m"(*addr) : "dIr"(nr) : "memory"); + return oldbit; +} + +#endif /* _HFI_i386_BIT_OPS_H */ diff --git a/include/linux-i386/sysdep.h b/include/linux-i386/sysdep.h new file mode 100644 index 0000000..bfd5746 --- /dev/null +++ b/include/linux-i386/sysdep.h @@ -0,0 +1,171 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _HFI_i386_SYSDEP_H +#define _HFI_i386_SYSDEP_H + +typedef struct cpuid { + unsigned eax, ebx, ecx, edx; +} cpuid_t; + +static __inline__ void +get_cpuid(const unsigned func, const unsigned subfunc, cpuid_t *id) +{ + unsigned a, b, c, d; + + asm (" \ + mov %4, %%eax \n\ + mov %5, %%ecx \n\ + cpuid \n\ + mov %%eax, %0 \n\ + mov %%ebx, %1 \n\ + mov %%ecx, %2 \n\ + mov %%edx, %3 \n\ + " : "=g" (a), "=g" (b), "=g" (c), "=g" (d) + : "g" (func), "g" (subfunc) + : "%eax", "%ebx", "%ecx", "%edx" + ); + + id->eax = a; + id->ebx = b; + id->ecx = c; + id->edx = d; +} + +static __inline__ uint64_t get_cycles(void) +{ + uint64_t v; + uint32_t a, d; + + asm volatile ("rdtsc" : "=a" (a), "=d"(d)); + v = ((uint64_t) a) | (((uint64_t) d) << 32); + + return v; +} + +#ifndef LOCK_PREFIX +#define LOCK_PREFIX "lock " +#endif + +static __inline__ void ips_barrier() +{ + asm volatile ("" : : : "memory"); +} + +static __inline__ void ips_mb() +{ + asm volatile ("mfence" : : : "memory"); +} + +/* gcc-3.4 has a bug with this function body at -O0 */ +static +#if defined(__GNUC__) && __GNUC__ == 3 && __GNUC_MINOR__ == 4 +#else +__inline__ +#endif +void ips_rmb() +{ + asm volatile ("" : : : "memory"); +} + +static __inline__ void ips_wmb() +{ + asm volatile ("sfence" : : : "memory"); +} + +static __inline__ void ips_sync_writes() +{ + asm volatile ("sfence" : : : "memory"); +} + +static __inline__ void ips_sync_reads() +{ + asm volatile ("lfence" : : : "memory"); +} + +static __inline__ uint32_t ips_cmpxchg(volatile uint32_t *ptr, + uint32_t old_val, uint32_t new_val) +{ + uint32_t prev; + struct xchg_dummy { + uint32_t a[100]; + }; + + asm volatile (LOCK_PREFIX "cmpxchgl %1,%2" : "=a"(prev) + : "q"(new_val), "m"(*(struct xchg_dummy *)ptr), "0"(old_val) + : "memory"); + + return prev; +} + +typedef struct { + volatile int32_t counter; +} ips_atomic_t; + +#define ips_atomic_set(v, i) (((v)->counter) = (i)) +#define ips_atomic_cmpxchg(p, oval, nval) \ + ips_cmpxchg((volatile uint32_t *) &((p)->counter), oval, nval) + +#if 0 +static __inline__ int32_t +ips_cmpxchg(volatile int32_t *p, int32_t old_value, int32_t new_value) +{ + asm volatile ("lock cmpxchg %2, %0" : + "+m" (*p), "+a"(old_value) : "r"(new_value) : "memory"); + return old_value; +} +#endif + +#endif /* _HFI_i386_SYSDEP_H */ diff --git a/include/opa_byteorder.h b/include/opa_byteorder.h new file mode 100644 index 0000000..bc909c1 --- /dev/null +++ b/include/opa_byteorder.h @@ -0,0 +1,265 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. 
+ + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ + +#ifndef OPA_BYTEORDER_H +#define OPA_BYTEORDER_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#ifndef __BYTE_ORDER +# error "BYTE_ORDER undefined" +#endif + +typedef __u16 __le16; +typedef __u16 __be16; +typedef __u32 __le32; +typedef __u32 __be32; +typedef __u64 __le64; +typedef __u64 __be64; + +static __inline__ __u16 __hfi_fswab16(__u16) + __attribute__ ((always_inline)); +static __inline__ __u32 __hfi_fswab32(__u32) + __attribute__ ((always_inline)); +static __inline__ __u64 __hfi_fswab64(__u64) + __attribute__ ((always_inline)); + +static __inline__ __u16 __hfi_fswab16(__u16 x) { + return ((x & (__u16) 0x00ffU) << 8) + | ((x & (__u16) 0xff00U) >> 8); +} static __inline__ __u32 __hfi_fswab32(__u32 x) { + return ((x & (__u32) 0x000000ffUL) << 24) + | ((x & (__u32) 0x0000ff00UL) << 8) + | ((x & (__u32) 0x00ff0000UL) >> 8) + | ((x & (__u32) 0xff000000UL) >> 24); +} + +static __inline__ __u64 __hfi_fswab64(__u64 x) { + return ((x & (__u64) 0x00000000000000ffULL) << 56) + | ((x & (__u64) 0x000000000000ff00ULL) << 40) + | ((x & (__u64) 0x0000000000ff0000ULL) << 24) + | ((x & (__u64) 0x00000000ff000000ULL) << 8) + | ((x & (__u64) 0x000000ff00000000ULL) >> 8) + | ((x & (__u64) 0x0000ff0000000000ULL) >> 24) + | ((x & (__u64) 0x00ff000000000000ULL) >> 40) + | ((x & (__u64) 0xff00000000000000ULL) >> 56); +} + +static __inline__ __u16 __cpu_to_le16(__le16) + __attribute__ ((always_inline)); +static __inline__ __u32 __cpu_to_le32(__le32) + __attribute__ ((always_inline)); +static __inline__ __u64 __cpu_to_le64(__le64) + __attribute__ ((always_inline)); + +static __inline__ __u16 __le16_to_cpu(__le16) + __attribute__ ((always_inline)); +static __inline__ __u32 __le32_to_cpu(__le32) + __attribute__ ((always_inline)); +static __inline__ __u64 __le64_to_cpu(__le64) + __attribute__ ((always_inline)); + +static __inline__ __u16 __cpu_to_be16(__be16) + __attribute__ ((always_inline)); +static __inline__ __u32 __cpu_to_be32(__be32) + __attribute__ ((always_inline)); +static __inline__ __u64 __cpu_to_be64(__be64) + __attribute__ ((always_inline)); + +static __inline__ __u16 __be16_to_cpu(__be16) + __attribute__ ((always_inline)); +static __inline__ __u32 __be32_to_cpu(__be32) + __attribute__ ((always_inline)); +static __inline__ __u64 __be64_to_cpu(__be64) + __attribute__ ((always_inline)); + +#if __BYTE_ORDER == __LITTLE_ENDIAN + +/* + * __cpu_to_le* routines + */ +static __inline__ __le16 __cpu_to_le16(__u16 x) { + return x; +} + +static __inline__ __le32 __cpu_to_le32(__u32 x) { + return x; +} + +static __inline__ __le64 __cpu_to_le64(__u64 x) { + return x; +} + +/* + * __le*_to_cpu routines + */ +static __inline__ __u16 __le16_to_cpu(__le16 x) { + return x; +} + +static __inline__ __u32 __le32_to_cpu(__le32 x) { + return x; +} + +static __inline__ __u64 __le64_to_cpu(__le64 x) { + return x; +} + +/* + * __cpu_to_be* routines + */ +static __inline__ __be16 __cpu_to_be16(__u16 x) { + return __hfi_fswab16(x); +} + +static __inline__ __be32 __cpu_to_be32(__u32 x) { + return __hfi_fswab32(x); +} + +static __inline__ __be64 __cpu_to_be64(__u64 x) { + return __hfi_fswab64(x); +} + +/* + * __be*_to_cpu routines + */ +static __inline__ __u16 __be16_to_cpu(__be16 x) { + return __hfi_fswab16(x); +} + +static __inline__ __u32 __be32_to_cpu(__be32 x) { + return __hfi_fswab32(x); +} + +static __inline__ __u64 __be64_to_cpu(__be64 x) { + return __hfi_fswab64(x); +} + +#elif __BYTE_ORDER == __BIG_ENDIAN + +/* + * __cpu_to_le* routines + */ +static __inline__ __le16 
__cpu_to_le16(__u16 x) { + return __hfi_fswab16(x); +} + +static __inline__ __le32 __cpu_to_le32(__u32 x) { + return __hfi_fswab32(x); +} + +static __inline__ __le64 __cpu_to_le64(__u64 x) { + return __hfi_fswab64(x); +} + +/* + * __le*_to_cpu routines + */ +static __inline__ __u16 __le16_to_cpu(__le16 x) { + return __hfi_fswab16(x); +} + +static __inline__ __u32 __le32_to_cpu(__le32 x) { + return __hfi_fswab32(x); +} + +static __inline__ __u64 __le64_to_cpu(__le64 x) { + return __hfi_fswab64(x); +} + +/* + * __cpu_to_be* routines + */ +static __inline__ __be16 __cpu_to_be16(__u16 x) { + return x; +} + +static __inline__ __be32 __cpu_to_be32(__u32 x) { + return x; +} + +static __inline__ __be64 __cpu_to_be64(__u64 x) { + return x; +} + +/* + * __be*_to_cpu routines + */ +static __inline__ __u16 __be16_to_cpu(__be16 x) { + return x; +} + +static __inline__ __u32 __be32_to_cpu(__be32 x) { + return x; +} + +static __inline__ __u64 __be64_to_cpu(__be64 x) { + return x; +} + +#else +# error "unsupported BYTE_ORDER: " #BYTE_ORDER +#endif + +#ifdef __cplusplus +} /* extern "C" */ +#endif +#endif /* OPA_BYTEORDER_H */ diff --git a/include/opa_debug.h b/include/opa_debug.h new file mode 100644 index 0000000..d5d8ff2 --- /dev/null +++ b/include/opa_debug.h @@ -0,0 +1,108 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef OPA_DEBUG_H +#define OPA_DEBUG_H + +#ifndef _HFI_DEBUGGING /* debugging enabled or not */ +#define _HFI_DEBUGGING 1 +#endif + +#if _HFI_DEBUGGING + +/* + * Mask values for debugging. The scheme allows us to compile out any + * of the debug tracing stuff, and if compiled in, to enable or disable + * dynamically. This can be set at modprobe time also: + * modprobe hfi.ko hfi_debug=7 + */ + +#define __HFI_INFO 0x1 /* generic low verbosity stuff */ +#define __HFI_DBG 0x2 /* generic debug */ +#define __HFI_TRSAMPLE 0x8 /* generate trace buffer sample entries */ +/* leave some low verbosity spots open */ +#define __HFI_VERBDBG 0x40 /* very verbose debug */ +#define __HFI_PKTDBG 0x80 /* print packet data */ +/* print process startup (init)/exit messages and important env vars */ +#define __HFI_PROCDBG 0x100 +/* print mmap/nopage stuff, not using VDBG any more */ +#define __HFI_MMDBG 0x200 +/* low-level environment variables */ +#define __HFI_ENVDBG 0x400 +#define __HFI_EPKTDBG 0x800 /* print error packet data */ +#define __HFI_CCADBG 0x1000 /* print CCA related events */ +#else /* _HFI_DEBUGGING */ + +/* + * define all of these even with debugging off, for the few places that do + * if(hfi_debug & _HFI_xyzzy), but in a way that will make the + * compiler eliminate the code + */ + +#define __HFI_INFO 0x0 /* generic low verbosity stuff */ +#define __HFI_DBG 0x0 /* generic debug */ +#define __HFI_TRSAMPLE 0x0 /* generate trace buffer sample entries */ +#define __HFI_VERBDBG 0x0 /* very verbose debug */ +#define __HFI_PKTDBG 0x0 /* print packet data */ +#define __HFI_PROCDBG 0x0 /* print process startup (init)/exit messages */ +/* print mmap/nopage stuff, not using VDBG any more */ +#define __HFI_MMDBG 0x0 +#define __HFI_CCADBG 0x0 /* print CCA related events */ + +#endif /* _HFI_DEBUGGING */ + +#define __HFI_VERBOSEDBG __HFI_VERBDBG + +#endif /* OPA_DEBUG_H */ diff --git a/include/opa_intf.h b/include/opa_intf.h new file mode 100644 index 0000000..7254187 --- /dev/null +++ b/include/opa_intf.h @@ -0,0 +1,98 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef OPA_INTF_H
+#define OPA_INTF_H
+
+#include <stdint.h>
+#include <stdio.h>
+#include <sched.h>
+
+#ifdef __inline__
+#undef __inline__
+#endif
+#define __inline__ inline __attribute__((always_inline, unused))
+
+#include "sysdep.h"
+#include "bit_ops.h"
+
+/* these aren't implemented for user mode, which is OK until we multi-thread */
+typedef struct _atomic {
+	uint32_t counter;
+} atomic_t;	/* no atomic_t type in user-land */
+#define atomic_set(a, v) ((a)->counter = (v))
+#define atomic_inc_return(a) (++(a)->counter)
+
+#if defined(__GNUC__)
+#ifndef likely
+#define likely(x) __builtin_expect(!!(x), 1L)
+#endif
+#ifndef unlikely
+#define unlikely(x) __builtin_expect(!!(x), 0L)
+#endif
+#ifndef if_pt
+#define if_pt(cond) if (likely(cond))
+#endif
+#ifndef if_pf
+#define if_pf(cond) if (unlikely(cond))
+#endif
+#define _Pragma_unlikely
+#define _Pragma_likely
+#else
+#error "Unsupported compiler"
+#endif
+
+#define yield() sched_yield()
+#endif /* OPA_INTF_H */
diff --git a/include/opa_queue.h b/include/opa_queue.h
new file mode 100644
index 0000000..f3d9595
--- /dev/null
+++ b/include/opa_queue.h
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)queue.h 8.5 (Berkeley) 8/20/94 + * $FreeBSD: src/sys/sys/queue.h,v 1.32.2.7 2002/04/17 14:21:02 des Exp $ + */ + +#ifndef OPA_QUEUE_H_ +#define OPA_QUEUE_H_ + +/* + * This file defines five types of data structures: singly-linked lists, + * singly-linked tail queues, lists, tail queues, and circular queues. + * + * A singly-linked list is headed by a single forward pointer. The elements + * are singly linked for minimum space and pointer manipulation overhead at + * the expense of O(n) removal for arbitrary elements. New elements can be + * added to the list after an existing element or at the head of the list. + * Elements being removed from the head of the list should use the explicit + * macro for this purpose for optimum efficiency. A singly-linked list may + * only be traversed in the forward direction. Singly-linked lists are ideal + * for applications with large datasets and few or no removals or for + * implementing a LIFO queue. + * + * A singly-linked tail queue is headed by a pair of pointers, one to the + * head of the list and the other to the tail of the list. The elements are + * singly linked for minimum space and pointer manipulation overhead at the + * expense of O(n) removal for arbitrary elements. New elements can be added + * to the list after an existing element, at the head of the list, or at the + * end of the list. Elements being removed from the head of the tail queue + * should use the explicit macro for this purpose for optimum efficiency. + * A singly-linked tail queue may only be traversed in the forward direction. + * Singly-linked tail queues are ideal for applications with large datasets + * and few or no removals or for implementing a FIFO queue. + * + * A list is headed by a single forward pointer (or an array of forward + * pointers for a hash table header). The elements are doubly linked + * so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before + * or after an existing element or at the head of the list. A list + * may only be traversed in the forward direction. + * + * A tail queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before or + * after an existing element, at the head of the list, or at the end of + * the list. A tail queue may be traversed in either direction. + * + * A circle queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before or after + * an existing element, at the head of the list, or at the end of the list. 
+ * A circle queue may be traversed in either direction, but has a more + * complex end of list detection. + * + * For details on the use of these macros, see the queue(3) manual page. + * + * + * SLIST LIST STAILQ TAILQ CIRCLEQ + * _HEAD + + + + + + * _HEAD_INITIALIZER + + + + + + * _ENTRY + + + + + + * _INIT + + + + + + * _EMPTY + + + + + + * _FIRST + + + + + + * _NEXT + + + + + + * _PREV - - - + + + * _LAST - - + + + + * _FOREACH + + + + + + * _FOREACH_REVERSE - - - + + + * _INSERT_HEAD + + + + + + * _INSERT_BEFORE - + - + + + * _INSERT_AFTER + + + + + + * _INSERT_TAIL - - + + + + * _REMOVE_HEAD + - + - - + * _REMOVE + + + + + + * + */ + +/* + * Singly-linked List declarations. + */ +#define SLIST_HEAD(name, type) \ +struct name { \ + struct type *slh_first; /* first element */ \ +} + +#define SLIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define SLIST_ENTRY(type) \ +struct { \ + struct type *sle_next; /* next element */ \ +} + +/* + * Singly-linked List functions. + */ +#define SLIST_EMPTY(head) ((head)->slh_first == NULL) + +#define SLIST_FIRST(head) ((head)->slh_first) + +#define SLIST_FOREACH(var, head, field) \ + for ((var) = SLIST_FIRST((head)); \ + (var); \ + (var) = SLIST_NEXT((var), field)) + +#define SLIST_INIT(head) do { \ + SLIST_FIRST((head)) = NULL; \ +} while (0) + +#define SLIST_INSERT_AFTER(slistelm, elm, field) do { \ + SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \ + SLIST_NEXT((slistelm), field) = (elm); \ +} while (0) + +#define SLIST_INSERT_HEAD(head, elm, field) do { \ + SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \ + SLIST_FIRST((head)) = (elm); \ +} while (0) + +#define SLIST_NEXT(elm, field) ((elm)->field.sle_next) + +#define SLIST_REMOVE(head, elm, type, field) do { \ + if (SLIST_FIRST((head)) == (elm)) { \ + SLIST_REMOVE_HEAD((head), field); \ + } \ + else { \ + struct type *curelm = SLIST_FIRST((head)); \ + while (SLIST_NEXT(curelm, field) != (elm)) \ + curelm = SLIST_NEXT(curelm, field); \ + SLIST_NEXT(curelm, field) = \ + SLIST_NEXT(SLIST_NEXT(curelm, field), field); \ + } \ +} while (0) + +#define SLIST_REMOVE_HEAD(head, field) do { \ + SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \ +} while (0) + +/* + * Singly-linked Tail queue declarations. + */ +#define STAILQ_HEAD(name, type) \ +struct name { \ + struct type *stqh_first;/* first element */ \ + struct type **stqh_last;/* addr of last next element */ \ +} + +#define STAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).stqh_first } + +#define STAILQ_ENTRY(type) \ +struct { \ + struct type *stqe_next; /* next element */ \ +} + +/* + * Singly-linked Tail queue functions. 
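+ *
+ * An illustrative sketch of a FIFO built from these macros (the struct and
+ * variable names below are invented for the example, not part of this API):
+ *
+ *	struct node { int value; STAILQ_ENTRY(node) entries; };
+ *	STAILQ_HEAD(nodehead, node) head = STAILQ_HEAD_INITIALIZER(head);
+ *
+ *	STAILQ_INSERT_TAIL(&head, n, entries);		enqueue n
+ *	n = STAILQ_FIRST(&head);			peek at the head
+ *	STAILQ_REMOVE_HEAD(&head, entries);		dequeue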
+ */ +#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL) + +#define STAILQ_FIRST(head) ((head)->stqh_first) + +#define STAILQ_FOREACH(var, head, field) \ + for ((var) = STAILQ_FIRST((head)); \ + (var); \ + (var) = STAILQ_NEXT((var), field)) + +#define STAILQ_INIT(head) do { \ + STAILQ_FIRST((head)) = NULL; \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_NEXT((tqelm), field) = (elm); \ +} while (0) + +#define STAILQ_INSERT_HEAD(head, elm, field) do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_FIRST((head)) = (elm); \ +} while (0) + +#define STAILQ_INSERT_TAIL(head, elm, field) do { \ + STAILQ_NEXT((elm), field) = NULL; \ + *(head)->stqh_last = (elm); \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ +} while (0) + +#define STAILQ_LAST(head, type, field) \ + (STAILQ_EMPTY(head) ? \ + NULL : \ + ((struct type *) \ + ((char *)((head)->stqh_last) - offsetof(struct type, field)))) + +#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next) + +#define STAILQ_REMOVE(head, elm, type, field) do { \ + if (STAILQ_FIRST((head)) == (elm)) { \ + STAILQ_REMOVE_HEAD(head, field); \ + } \ + else { \ + struct type *curelm = STAILQ_FIRST((head)); \ + while (STAILQ_NEXT(curelm, field) != (elm)) \ + curelm = STAILQ_NEXT(curelm, field); \ + if ((STAILQ_NEXT(curelm, field) = \ + STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\ + (head)->stqh_last = &STAILQ_NEXT((curelm), field);\ + } \ +} while (0) + +#define STAILQ_REMOVE_HEAD(head, field) do { \ + if ((STAILQ_FIRST((head)) = \ + STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +#define STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do { \ + if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL) \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +/* + * List declarations. + */ +#define LIST_HEAD(name, type) \ +struct name { \ + struct type *lh_first; /* first element */ \ +} + +#define LIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define LIST_ENTRY(type) \ +struct { \ + struct type *le_next; /* next element */ \ + struct type **le_prev; /* address of previous next element */ \ +} + +/* + * List functions. 
+ */ + +#define LIST_EMPTY(head) ((head)->lh_first == NULL) + +#define LIST_FIRST(head) ((head)->lh_first) + +#define LIST_FOREACH(var, head, field) \ + for ((var) = LIST_FIRST((head)); \ + (var); \ + (var) = LIST_NEXT((var), field)) + +#define LIST_INIT(head) do { \ + LIST_FIRST((head)) = NULL; \ +} while (0) + +#define LIST_INSERT_AFTER(listelm, elm, field) do { \ + if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\ + LIST_NEXT((listelm), field)->field.le_prev = \ + &LIST_NEXT((elm), field); \ + LIST_NEXT((listelm), field) = (elm); \ + (elm)->field.le_prev = &LIST_NEXT((listelm), field); \ +} while (0) + +#define LIST_INSERT_BEFORE(listelm, elm, field) do { \ + (elm)->field.le_prev = (listelm)->field.le_prev; \ + LIST_NEXT((elm), field) = (listelm); \ + *(listelm)->field.le_prev = (elm); \ + (listelm)->field.le_prev = &LIST_NEXT((elm), field); \ +} while (0) + +#define LIST_INSERT_HEAD(head, elm, field) do { \ + if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \ + LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\ + LIST_FIRST((head)) = (elm); \ + (elm)->field.le_prev = &LIST_FIRST((head)); \ +} while (0) + +#define LIST_NEXT(elm, field) ((elm)->field.le_next) + +#define LIST_REMOVE(elm, field) do { \ + if (LIST_NEXT((elm), field) != NULL) \ + LIST_NEXT((elm), field)->field.le_prev = \ + (elm)->field.le_prev; \ + *(elm)->field.le_prev = LIST_NEXT((elm), field); \ +} while (0) + +/* + * Tail queue declarations. + */ +#define TAILQ_HEAD(name, type) \ +struct name { \ + struct type *tqh_first; /* first element */ \ + struct type **tqh_last; /* addr of last next element */ \ +} + +#define TAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).tqh_first } + +#define TAILQ_ENTRY(type) \ +struct { \ + struct type *tqe_next; /* next element */ \ + struct type **tqe_prev; /* address of previous next element */ \ +} + +/* + * Tail queue functions. 
+ */ +#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL) + +#define TAILQ_FIRST(head) ((head)->tqh_first) + +#define TAILQ_FOREACH(var, head, field) \ + for ((var) = TAILQ_FIRST((head)); \ + (var); \ + (var) = TAILQ_NEXT((var), field)) + +#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \ + for ((var) = TAILQ_LAST((head), headname); \ + (var); \ + (var) = TAILQ_PREV((var), headname, field)) + +#define TAILQ_INIT(head) do { \ + TAILQ_FIRST((head)) = NULL; \ + (head)->tqh_last = &TAILQ_FIRST((head)); \ +} while (0) + +#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ + if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + TAILQ_NEXT((listelm), field) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \ +} while (0) + +#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ + (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ + TAILQ_NEXT((elm), field) = (listelm); \ + *(listelm)->field.tqe_prev = (elm); \ + (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \ +} while (0) + +#define TAILQ_INSERT_HEAD(head, elm, field) do { \ + if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \ + TAILQ_FIRST((head))->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + TAILQ_FIRST((head)) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \ +} while (0) + +#define TAILQ_INSERT_TAIL(head, elm, field) do { \ + TAILQ_NEXT((elm), field) = NULL; \ + (elm)->field.tqe_prev = (head)->tqh_last; \ + *(head)->tqh_last = (elm); \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ +} while (0) + +#define TAILQ_LAST(head, headname) \ + (*(((struct headname *)((head)->tqh_last))->tqh_last)) + +#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next) + +#define TAILQ_PREV(elm, headname, field) \ + (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) + +#define TAILQ_REMOVE(head, elm, field) do { \ + if ((TAILQ_NEXT((elm), field)) != NULL) \ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + (elm)->field.tqe_prev; \ + else \ + (head)->tqh_last = (elm)->field.tqe_prev; \ + *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \ +} while (0) + +/* + * Circular queue declarations. + */ +#define CIRCLEQ_HEAD(name, type) \ +struct name { \ + struct type *cqh_first; /* first element */ \ + struct type *cqh_last; /* last element */ \ +} + +#define CIRCLEQ_HEAD_INITIALIZER(head) \ + { (void *)&(head), (void *)&(head) } + +#define CIRCLEQ_ENTRY(type) \ +struct { \ + struct type *cqe_next; /* next element */ \ + struct type *cqe_prev; /* previous element */ \ +} + +/* + * Circular queue functions. 
+ */ +#define CIRCLEQ_EMPTY(head) ((head)->cqh_first == (void *)(head)) + +#define CIRCLEQ_FIRST(head) ((head)->cqh_first) + +#define CIRCLEQ_FOREACH(var, head, field) \ + for ((var) = CIRCLEQ_FIRST((head)); \ + (var) != (void *)(head) || ((var) = NULL); \ + (var) = CIRCLEQ_NEXT((var), field)) + +#define CIRCLEQ_FOREACH_REVERSE(var, head, field) \ + for ((var) = CIRCLEQ_LAST((head)); \ + (var) != (void *)(head) || ((var) = NULL); \ + (var) = CIRCLEQ_PREV((var), field)) + +#define CIRCLEQ_INIT(head) do { \ + CIRCLEQ_FIRST((head)) = (void *)(head); \ + CIRCLEQ_LAST((head)) = (void *)(head); \ +} while (0) + +#define CIRCLEQ_INSERT_AFTER(head, listelm, elm, field) do { \ + CIRCLEQ_NEXT((elm), field) = CIRCLEQ_NEXT((listelm), field); \ + CIRCLEQ_PREV((elm), field) = (listelm); \ + if (CIRCLEQ_NEXT((listelm), field) == (void *)(head)) \ + CIRCLEQ_LAST((head)) = (elm); \ + else \ + CIRCLEQ_PREV(CIRCLEQ_NEXT((listelm), field), field) = (elm);\ + CIRCLEQ_NEXT((listelm), field) = (elm); \ +} while (0) + +#define CIRCLEQ_INSERT_BEFORE(head, listelm, elm, field) do { \ + CIRCLEQ_NEXT((elm), field) = (listelm); \ + CIRCLEQ_PREV((elm), field) = CIRCLEQ_PREV((listelm), field); \ + if (CIRCLEQ_PREV((listelm), field) == (void *)(head)) \ + CIRCLEQ_FIRST((head)) = (elm); \ + else \ + CIRCLEQ_NEXT(CIRCLEQ_PREV((listelm), field), field) = (elm);\ + CIRCLEQ_PREV((listelm), field) = (elm); \ +} while (0) + +#define CIRCLEQ_INSERT_HEAD(head, elm, field) do { \ + CIRCLEQ_NEXT((elm), field) = CIRCLEQ_FIRST((head)); \ + CIRCLEQ_PREV((elm), field) = (void *)(head); \ + if (CIRCLEQ_LAST((head)) == (void *)(head)) \ + CIRCLEQ_LAST((head)) = (elm); \ + else \ + CIRCLEQ_PREV(CIRCLEQ_FIRST((head)), field) = (elm); \ + CIRCLEQ_FIRST((head)) = (elm); \ +} while (0) + +#define CIRCLEQ_INSERT_TAIL(head, elm, field) do { \ + CIRCLEQ_NEXT((elm), field) = (void *)(head); \ + CIRCLEQ_PREV((elm), field) = CIRCLEQ_LAST((head)); \ + if (CIRCLEQ_FIRST((head)) == (void *)(head)) \ + CIRCLEQ_FIRST((head)) = (elm); \ + else \ + CIRCLEQ_NEXT(CIRCLEQ_LAST((head)), field) = (elm); \ + CIRCLEQ_LAST((head)) = (elm); \ +} while (0) + +#define CIRCLEQ_LAST(head) ((head)->cqh_last) + +#define CIRCLEQ_NEXT(elm, field) ((elm)->field.cqe_next) + +#define CIRCLEQ_PREV(elm, field) ((elm)->field.cqe_prev) + +#define CIRCLEQ_REMOVE(head, elm, field) do { \ + if (CIRCLEQ_NEXT((elm), field) == (void *)(head)) \ + CIRCLEQ_LAST((head)) = CIRCLEQ_PREV((elm), field); \ + else \ + CIRCLEQ_PREV(CIRCLEQ_NEXT((elm), field), field) = \ + CIRCLEQ_PREV((elm), field); \ + if (CIRCLEQ_PREV((elm), field) == (void *)(head)) \ + CIRCLEQ_FIRST((head)) = CIRCLEQ_NEXT((elm), field); \ + else \ + CIRCLEQ_NEXT(CIRCLEQ_PREV((elm), field), field) = \ + CIRCLEQ_NEXT((elm), field); \ +} while (0) + +#endif /* !OPA_QUEUE_H_ */ diff --git a/include/opa_revision.h b/include/opa_revision.h new file mode 100644 index 0000000..4a28821 --- /dev/null +++ b/include/opa_revision.h @@ -0,0 +1,64 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef OPA_REVISION_H +#define OPA_REVISION_H + +/* Those variables are defined in the _revision.c file +which is dynamically generated during building of the library */ +extern char psmi_hfi_IFS_version[]; +extern char psmi_hfi_build_timestamp[]; +extern char psmi_hfi_sources_checksum[]; +extern char psmi_hfi_git_checksum[]; + +#endif /* OPA_REVISION_H */ diff --git a/include/opa_service.h b/include/opa_service.h new file mode 100644 index 0000000..3ec4824 --- /dev/null +++ b/include/opa_service.h @@ -0,0 +1,105 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef OPA_SERVICE_H
+#define OPA_SERVICE_H
+
+/* This file contains all the lowest level routines calling into sysfs */
+/* and qib driver. All other calls are based on these routines. */
+
+#include <stdint.h>
+
+#include "opa_intf.h"
+#include "opa_udebug.h"
+#include "opa_byteorder.h"
+
+/* upper and lower bounds for HFI port numbers */
+#define HFI_MIN_PORT 1
+#define HFI_MAX_PORT 1
+
+/* any unit id to match. */
+#define HFI_UNIT_ID_ANY ((long)-1)
+/* any port num to match. */
+#define HFI_PORT_NUM_ANY ((long)0)
+
+/* Statistics maintained by the driver */
+int hfi_get_stats(uint64_t *, int);
+int hfi_get_stats_names(char **namep);
+/* Counters maintained in the chip, globally, and per-port */
+int hfi_get_ctrs_unit(int unitno, uint64_t *, int);
+int hfi_get_ctrs_unit_names(int unitno, char **namep);
+int hfi_get_ctrs_port(int unitno, int port, uint64_t *, int);
+int hfi_get_ctrs_port_names(int unitno, char **namep);
+
+/* sysfs helper routines (only those currently used are exported;
+ * try to avoid using others) */
+
+/* Initializes the following sysfs helper routines. */
+void sysfs_init(const char *dflt_hfi_class_path);
+
+/* read a string value into buff, no more than size bytes.
+   returns the number of bytes read */
+size_t hfi_sysfs_unit_port_read(uint32_t unit, uint32_t port, const char *attr,
+	char *buff, size_t size);
+
+/* read up to one page of malloc'ed data (caller must free), returning
+   number of bytes read or -1 */
+int hfi_hfifs_read(const char *attr, char **datap);
+int hfi_hfifs_unit_read(uint32_t unit, const char *attr, char **data);
+
+int64_t hfi_sysfs_unit_read_node_s64(uint32_t unit);
+/* these read directly into supplied buffer and take a count */
+int hfi_hfifs_rd(const char *, void *, int);
+int hfi_hfifs_unit_rd(uint32_t unit, const char *, void *, int);
+
+#endif /* OPA_SERVICE_H */
diff --git a/include/opa_udebug.h b/include/opa_udebug.h
new file mode 100644
index 0000000..9fd59cb
--- /dev/null
+++ b/include/opa_udebug.h
@@ -0,0 +1,194 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef OPA_UDEBUG_H
+#define OPA_UDEBUG_H
+
+#include <stdio.h>
+#include "opa_debug.h"
+
+extern unsigned hfi_debug;
+const char *hfi_get_unit_name(int unit);
+extern char *__progname;
+
+static const char hfi_ident_tag[] = "PSM2_IDENTIFY";
+char *hfi_get_mylabel();
+
+#if _HFI_DEBUGGING
+
+extern char *__hfi_mylabel;
+void hfi_set_mylabel(char *);
+extern FILE *__hfi_dbgout;
+
+#define _HFI_UNIT_ERROR(unit, fmt, ...) \
+	do { \
+		_Pragma_unlikely \
+		printf("%s%s: " fmt, __hfi_mylabel, __progname, \
+		       ##__VA_ARGS__); \
+	} while (0)
+
+#define _HFI_ERROR(fmt, ...) \
+	do { \
+		_Pragma_unlikely \
+		printf("%s%s: " fmt, __hfi_mylabel, __progname, \
+		       ##__VA_ARGS__); \
+	} while (0)
+
+#define _HFI_INFO(fmt, ...) \
+	do { \
+		_Pragma_unlikely \
+		if (unlikely(hfi_debug&__HFI_INFO)) \
+			printf("%s%s: " fmt, __hfi_mylabel, __func__, \
+			       ##__VA_ARGS__); \
+	} while (0)
+
+#define __HFI_PKTDBG_ON unlikely(hfi_debug & __HFI_PKTDBG)
+
+#define __HFI_DBG_WHICH(which, fmt, ...) \
+	do { \
+		_Pragma_unlikely \
+		if (unlikely(hfi_debug&(which))) \
+			fprintf(__hfi_dbgout, "%s%s: " fmt, __hfi_mylabel, __func__, \
+				##__VA_ARGS__); \
+	} while (0)
+
+#define __HFI_DBG_WHICH_NOFUNC(which, fmt, ...) \
+	do { \
+		_Pragma_unlikely \
+		if (unlikely(hfi_debug&(which))) \
+			fprintf(__hfi_dbgout, "%s" fmt, __hfi_mylabel, \
+				##__VA_ARGS__); \
+	} while (0)
+
+#define _HFI_DBG(fmt, ...) __HFI_DBG_WHICH(__HFI_DBG, fmt, ##__VA_ARGS__)
+#define _HFI_VDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_VERBDBG, fmt, ##__VA_ARGS__)
+#define _HFI_PDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_PKTDBG, fmt, ##__VA_ARGS__)
+#define _HFI_EPDBG(fmt, ...) 
__HFI_DBG_WHICH(__HFI_EPKTDBG, fmt, ##__VA_ARGS__) +#define _HFI_PRDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_PROCDBG, fmt, ##__VA_ARGS__) +#define _HFI_ENVDBG(lev, fmt, ...) \ + __HFI_DBG_WHICH_NOFUNC( \ + (lev == 0) ? __HFI_INFO : \ + (lev > 1 ? __HFI_ENVDBG : (__HFI_PROCDBG|__HFI_ENVDBG)),\ + "env " fmt, ##__VA_ARGS__) +#define _HFI_MMDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_MMDBG, fmt, ##__VA_ARGS__) +#define _HFI_CCADBG(fmt, ...) __HFI_DBG_WHICH(__HFI_CCADBG, fmt, ##__VA_ARGS__) + +/* + * Use these macros (_HFI_DBG_ON and _HFI_DBG_ALWAYS) together + * for a scope of code preparing debug info for printing; e.g. + * if (_HFI_DBG_ON) { + * // put your code here + * _HFI_DBG_ALWAYS(print your results here); + * } + */ +#define _HFI_DBG_ON unlikely(hfi_debug & __HFI_DBG) +#define _HFI_DBG_ALWAYS(fmt, ...) \ + do { \ + _Pragma_unlikely \ + fprintf(__hfi_dbgout, "%s" fmt, __hfi_mylabel, \ + ##__VA_ARGS__); \ + } while (0) + +#define _HFI_VDBG_ON unlikely(hfi_debug & __HFI_VERBDBG) +#define _HFI_VDBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__) + +#define _HFI_PRDBG_ON unlikely(hfi_debug & __HFI_PROCDBG) +#define _HFI_PRDBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__) + +#define _HFI_CCADBG_ON unlikely(hfi_debug & __HFI_CCADBG) +#define _HFI_CCADBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__) + +#define _HFI_INFO_ON unlikely(hfi_debug & __HFI_INFO) +#define _HFI_INFO_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__) + +#else /* ! _HFI_DEBUGGING */ + +#define _HFI_UNIT_ERROR(unit, fmt, ...) \ + do { \ + printf("%s" fmt, "", ##__VA_ARGS__); \ + } while (0) + +#define _HFI_ERROR(fmt, ...) \ + do { \ + printf("%s" fmt, "", ##__VA_ARGS__); \ + } while (0) + +#define _HFI_INFO(fmt, ...) + +#define __HFI_PKTDBG_ON 0 + +#define _HFI_DBG(fmt, ...) +#define _HFI_PDBG(fmt, ...) +#define _HFI_EPDBG(fmt, ...) +#define _HFI_PRDBG(fmt, ...) +#define _HFI_ENVDBG(lev, fmt, ...) +#define _HFI_VDBG(fmt, ...) +#define _HFI_MMDBG(fmt, ...) +#define _HFI_CCADBG(fmt, ...) + +#define _HFI_DBG_ON 0 +#define _HFI_DBG_ALWAYS(fmt, ...) +#define _HFI_VDBG_ON 0 +#define _HFI_VDBG_ALWAYS(fmt, ...) +#define _HFI_PRDBG_ON 0 +#define _HFI_PRDBG_ALWAYS(fmt, ...) +#define _HFI_CCADBG_ON 0 +#define _HFI_CCADBG_ALWAYS(fmt, ...) +#define _HFI_INFO_ON 0 +#define _HFI_INFO_ALWAYS(fmt, ...) + +#endif /* _HFI_DEBUGGING */ + +#endif /* OPA_UDEBUG_H */ diff --git a/include/opa_user.h b/include/opa_user.h new file mode 100644 index 0000000..637dacb --- /dev/null +++ b/include/opa_user.h @@ -0,0 +1,246 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef OPA_USER_H
+#define OPA_USER_H
+
+/* This file contains all of the data structures and routines that are
+   publicly visible and usable (to low level infrastructure code; it is
+   not expected that any application, or even normal application-level library,
+   will ever need to use any of this).
+
+   Additional entry points and data structures that are used by these routines
+   may be referenced in this file, but they should not be generally available;
+   they are visible here only to allow use in inlined functions. Any variable,
+   data structure, or function that starts with a leading "_" is in this
+   category.
+*/
+
+/* Include header files we need that are unlikely to otherwise be needed by */
+/* programs. */
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "opa_intf.h"
+#include "opa_byteorder.h"
+#include "opa_udebug.h"
+#include "opa_service.h"
+
+#define HFI_TF_NFLOWS 32
+
+/* IB - LRH header consts */
+#define HFI_LRH_BTH 0x0002	/* 1.
word of IB LRH - next header: BTH */ +#define HFI_LRH_SC_SHIFT 12 +#define HFI_LRH_SC_MASK 0xf +#define HFI_LRH_SL_SHIFT 4 +#define HFI_LRH_SL_MASK 0xf +#define HFI_LRH_PKTLEN_MASK 0xfff + +/* IB - BTH header consts */ +#define HFI_BTH_OPCODE_SHIFT 24 +#define HFI_BTH_OPCODE_MASK 0xff +#define HFI_BTH_BECN_SHIFT 30 +#define HFI_BTH_FECN_SHIFT 31 +#define HFI_BTH_QP_SHIFT 16 +#define HFI_BTH_QP_MASK 0xff +#define HFI_BTH_FLOWID_SHIFT 11 +#define HFI_BTH_FLOWID_MASK 0x1f +#define HFI_BTH_SUBCTXT_SHIFT 8 +#define HFI_BTH_SUBCTXT_MASK 0x7 + +#define HFI_BTH_SEQ_SHIFT 0 +#define HFI_BTH_SEQ_MASK 0x7ff +#define HFI_BTH_GEN_SHIFT 11 +#define HFI_BTH_GEN_MASK 0xfffff +#define HFI_BTH_ACK_SHIFT 31 + +/* KDETH header consts */ +#define HFI_KHDR_OFFSET_MASK 0x7fff +#define HFI_KHDR_OM_SHIFT 15 +#define HFI_KHDR_TID_SHIFT 16 +#define HFI_KHDR_TID_MASK 0x3ff +#define HFI_KHDR_TIDCTRL_SHIFT 26 +#define HFI_KHDR_TIDCTRL_MASK 0x3 +#define HFI_KHDR_INTR_SHIFT 28 +#define HFI_KHDR_SH_SHIFT 29 +#define HFI_KHDR_KVER_SHIFT 30 +#define HFI_KHDR_KVER_MASK 0x3 + +#define HFI_KHDR_MSGSEQ_MASK 0xffff +#define HFI_KHDR_TINYLEN_MASK 0xf +#define HFI_KHDR_TINYLEN_SHIFT 16 + +#define GET_HFI_KHDR_TIDCTRL(val) \ + (((val) >> HFI_KHDR_TIDCTRL_SHIFT) & \ + HFI_KHDR_TIDCTRL_MASK) + +#ifdef PSM_CUDA +extern int is_driver_gpudirect_enabled; + +static __inline__ int _psmi_is_driver_gpudirect_enabled() __attribute__((always_inline)); + +static __inline__ int +_psmi_is_driver_gpudirect_enabled() +{ + return is_driver_gpudirect_enabled; +} +#define PSMI_IS_DRIVER_GPUDIRECT_ENABLED _psmi_is_driver_gpudirect_enabled() +#endif + +/* hfi kdeth header format */ +struct hfi_kdeth { + __u32 kdeth0; + + union { + struct { + __u16 job_key; + __u16 hcrc; + }; + __u32 kdeth1; + }; +}; + +/* misc. */ +#define HFI_CRC_SIZE_IN_BYTES 4 + +#define HFI_DEFAULT_SERVICE_ID 0x1000117500000000ULL +#define HFI_DEFAULT_P_KEY 0x8001 /* fabric default pkey for app traffic */ + +#if 0 +#define HFI_PERMISSIVE_LID 0xFFFF +#define HFI_AETH_CREDIT_SHIFT 24 +#define HFI_AETH_CREDIT_MASK 0x1F +#define HFI_AETH_CREDIT_INVAL 0x1F +#define HFI_PSN_MASK 0xFFFFFF +#define HFI_MSN_MASK 0xFFFFFF +#define HFI_QPN_MASK 0xFFFFFF +#define HFI_MULTICAST_LID_BASE 0xC000 +#define HFI_MULTICAST_QPN 0xFFFFFF +#endif + +/* Receive Header Queue: receive type (from hfi) */ +#define RCVHQ_RCV_TYPE_EXPECTED 0 +#define RCVHQ_RCV_TYPE_EAGER 1 +#define RCVHQ_RCV_TYPE_NON_KD 2 +#define RCVHQ_RCV_TYPE_ERROR 3 + +/* OPA PSM assumes that the message header is always 56 bytes. 
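+
+   As an illustrative sketch (the variable names here are assumptions, not
+   part of this header), the shift/mask constants above recover BTH fields
+   from a received big-endian header word roughly as follows:
+
+	uint32_t bth0   = __be32_to_cpu(bth_word0);
+	uint8_t  opcode = (bth0 >> HFI_BTH_OPCODE_SHIFT) & HFI_BTH_OPCODE_MASK;
+	uint32_t bth1   = __be32_to_cpu(bth_word1);
+	uint8_t  flowid = (bth1 >> HFI_BTH_FLOWID_SHIFT) & HFI_BTH_FLOWID_MASK;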
*/
+#define HFI_MESSAGE_HDR_SIZE 56
+
+/* interval timing routines */
+/* Convert a count of cycles to elapsed nanoseconds */
+/* this is only accurate for reasonably large numbers of cycles (at least tens)
+*/
+static __inline__ uint64_t cycles_to_nanosecs(uint64_t)
+	__attribute__ ((always_inline));
+/* convert elapsed nanoseconds to elapsed cycles */
+/* this is only accurate for reasonably large numbers of nsecs (at least tens)
+*/
+static __inline__ uint64_t nanosecs_to_cycles(uint64_t)
+	__attribute__ ((always_inline));
+
+/* Statistics maintained by the driver */
+const char *hfi_get_next_name(char **names);
+int hfi_get_stats_names_count(void);
+/* Counters maintained in the chip, globally, and per-port */
+int hfi_get_ctrs_unit_names_count(int unitno);
+int hfi_get_ctrs_port_names_count(int unitno);
+
+uint64_t hfi_get_single_unitctr(int unit, const char *attr, uint64_t *s);
+int hfi_get_single_portctr(int unit, int port, const char *attr, uint64_t *c);
+void hfi_release_names(char *namep);
+
+/* Syslog wrapper
+
+   level is one of LOG_EMERG, LOG_ALERT, LOG_CRIT, LOG_ERR, LOG_WARNING,
+   LOG_NOTICE, LOG_INFO, LOG_DEBUG.
+
+   prefix should be a short string to describe which part of the software stack
+   is using syslog, i.e. "PSM", "mpi", "mpirun".
+*/
+void hfi_syslog(const char *prefix, int to_console, int level,
+		const char *format, ...)
+		__attribute__((format(printf, 4, 5)));
+
+void hfi_vsyslog(const char *prefix, int to_console, int level,
+		 const char *format, va_list ap);
+
+/*
+ * Copy routine that may copy a byte multiple times but optimized for throughput
+ * This is not safe to use for PIO routines where we want a guarantee that a
+ * byte is only copied/moved across the bus once.
+ */
+void hfi_dwordcpy(volatile uint32_t *dest, const uint32_t *src,
+		  uint32_t ndwords);
+
+extern uint32_t __hfi_pico_per_cycle;	/* only for use in these functions */
+
+/* this is only accurate for reasonably large numbers of cycles (at least tens) */
+static __inline__ uint64_t cycles_to_nanosecs(uint64_t cycs)
+{
+	return (__hfi_pico_per_cycle * cycs) / 1000ULL;
+}
+
+/* this is only accurate for reasonably large numbers of nsecs (at least tens) */
+static __inline__ uint64_t nanosecs_to_cycles(uint64_t ns)
+{
+	return (ns * 1000ULL) / __hfi_pico_per_cycle;
+}
+
+#endif /* OPA_USER_H */
diff --git a/include/psm2_mock_testing.h b/include/psm2_mock_testing.h
new file mode 100644
index 0000000..d1e9bff
--- /dev/null
+++ b/include/psm2_mock_testing.h
@@ -0,0 +1,176 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef PSM2_MOCK_TESTING_H +#define PSM2_MOCK_TESTING_H + +/* PSM2_MOCK_TESTING being defined flips a couple of switches so that a + * testable version of libpsm2.so is built. It'll make properly annotated + * static functions be non-static, visible to the outside. Also, all mockable + * functions will be replaced with function pointers which will originally + * point to the actual implementation. However, those function pointers might + * be reset by the test code, thus allowing for mocking selected PSM2 functions + * for the purpose of the test. + * + * So far the following utilities have been introduced for enabling a + * conditional compilation of the testable vs. 
production version of the library:
+ * - ustatic: toggles function visibility
+ * - MOCKABLE(): decorates function name so that it is visible after being mocked
+ * - MOCK_DCL_EPILOGUE(): declares a function pointer which will be the seam
+ *   for mocking a function
+ * - MOCK_DEF_EPILOGUE(): defines a function pointer which will be the seam
+ *   for mocking a function
+ *
+ * If the declaration and definition of a static function @c foo reside in
+ * different files, this would be the common use case:
+ *
+ * @code
+ * // somefile.c:
+ * int MOCKABLE(foo)();
+ * MOCK_DCL_EPILOGUE(foo);
+ *
+ * // otherfile.c:
+ * int MOCKABLE(foo)() {
+ *	printf("I am the original foo!\n");
+ * }
+ * MOCK_DEF_EPILOGUE(foo);
+ * @endcode
+ *
+ * If the production version of the library is being built, the following code
+ * would result:
+ * @code
+ * // somefile.c:
+ * int foo();
+ *
+ * // otherfile.c:
+ * int foo() {
+ *	printf("I am the original foo!\n");
+ * }
+ * @endcode
+ *
+ * On the other hand, if a testable version of the library is being built, it
+ * would produce the following code:
+ * @code
+ * // somefile.c:
+ * int foo_original_();
+ * extern typeof(& foo_original_) foo;
+ *
+ * // otherfile.c:
+ * int foo_original_() {
+ *	printf("I am the original foo!\n");
+ * }
+ * typeof(& foo_original_) foo = foo_original_;
+ * @endcode
+ *
+ * If the function to be mocked is a static function residing in the header,
+ * the following syntax would be used:
+ * @code
+ * // somefile.c:
+ * ustatic int MOCKABLE(foo)() {
+ *	printf("I am the original foo!\n");
+ * }
+ * MOCK_DCL_EPILOGUE(foo);
+ * MOCK_DEF_EPILOGUE(foo);
+ * @endcode
+ *
+ * If the production version of the library is being built, the following code
+ * would result:
+ * @code
+ * // somefile.c:
+ * static int foo() {
+ *	printf("I am the original foo!\n");
+ * }
+ * @endcode
+ *
+ * Similarly, if a testable version of the library is being built, it would
+ * produce the following code:
+ * @code
+ * // somefile.c:
+ * int foo_original_();
+ * extern typeof(& foo_original_) foo;
+ * typeof(& foo_original_) foo = foo_original_;
+ * @endcode
+ */
+#ifndef PSM2_MOCK_TESTING
+
+/* If no testing is being done, ustatic resolves to regular "static" */
+#define ustatic static
+/* If no testing is being done, no indirection is introduced */
+#define MOCKABLE(fname) fname
+/* If no testing is being done, no declaration epilogue is needed */
+#define MOCK_DCL_EPILOGUE(fname)
+/* If no testing is being done, no definition epilogue is needed */
+#define MOCK_DEF_EPILOGUE(fname)
+
+#else /* ndef PSM2_MOCK_TESTING */
+
+/* For the testable version, all _ustatic_ functions will NOT be static */
+#define ustatic
+/* TODO override inline directives in the same fashion as static */
+/* For the testable version, the actual implementation function is renamed */
+#define MOCKABLE(x) x ## _original_
+/* For the testable version, we declare the function pointer which will be the
+ * point of indirection for calls to that function. It must be declared after
+ * the declaration of the actual function happens.
+ */
+#define MOCK_DCL_EPILOGUE(x) extern typeof(& x ## _original_) x;
+/* For the testable version, we define the function pointer which will be the
+ * point of indirection for calls to that function. It must be declared after
+ * the definition of the actual function happens.
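+ *
+ * In a test built with PSM2_MOCK_TESTING defined, a test can then swap the
+ * implementation through this seam (hypothetical test code, shown only as
+ * an illustration):
+ * @code
+ * int fake_foo() { return 42; }
+ * ...
+ * foo = fake_foo;		// all callers now reach the mock
+ * foo = foo_original_;	// restore the real implementation
+ * @endcode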
+ */ +#define MOCK_DEF_EPILOGUE(x) typeof(& x ## _original_) x = x ## _original_; + +#endif /* ndef PSM2_MOCK_TESTING */ + +#endif /* PSM2_MOCK_TESTING_H */ + diff --git a/include/rbtree.c b/include/rbtree.c new file mode 100644 index 0000000..9d6930d --- /dev/null +++ b/include/rbtree.c @@ -0,0 +1,692 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ + +/* + * Abstract: + * Implementation of quick map, a binary tree where the caller always provides + * all necessary storage. + * + * Environment: + * All + * + * $Revision$ + */ + + +/***************************************************************************** +* +* Map +* +* Map is an associative array. By providing a key, the caller can retrieve +* an object from the map. All objects in the map have an associated key, +* as specified by the caller when the object was inserted into the map. +* In addition to random access, the caller can traverse the map much like +* a linked list, either forwards from the first object or backwards from +* the last object. The objects in the map are always traversed in +* order since the nodes are stored sorted. +* +* This implementation of Map uses a red black tree verified against +* Cormen-Leiserson-Rivest text, McGraw-Hill Edition, fourteenth +* printing, 1994. 
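+*
+* Because the caller provides all storage, a typical user embeds a
+* cl_map_item_t inside its own structure and hands that item to the insert
+* and remove routines; the map itself allocates nothing. An illustrative
+* sketch (the struct below is hypothetical, not part of this file):
+*
+*	struct my_node { cl_map_item_t item; };
+*	cl_qmap_t map;
+*	cl_map_item_t root, nil;
+*	ips_cl_qmap_init(&map, &root, &nil);
+*	ips_cl_qmap_insert_item(&map, &node->item);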
+*
+*****************************************************************************/
+
+#include <string.h>	/* for memset declaration */
+
+#if !defined ( RBTREE_GET_LEFTMOST ) || \
+	! defined ( RBTREE_GET_RIGHTMOST ) || \
+	! defined ( RBTREE_MAP_COUNT ) || \
+	! defined ( RBTREE_ASSERT )
#error "You must define RBTREE_GET_LEFTMOST and RBTREE_GET_RIGHTMOST and \
+	RBTREE_MAP_COUNT and RBTREE_ASSERT before including rbtree.c"
+#endif
+
+#define IN /* nothing */
+
+/******************************************************************************
+*******************************************************************************
+**************                                                     ************
+**************          IMPLEMENTATION OF QUICK MAP                ************
+**************                                                     ************
+*******************************************************************************
+******************************************************************************/
+
+/* Forward declarations: */
+static void ips_cl_qmap_init(
+		IN	cl_qmap_t	*p_map,
+		IN	cl_map_item_t* const	root,
+		IN	cl_map_item_t* const	nil);
+static void ips_cl_qmap_insert_item(
+		IN	cl_qmap_t* const	p_map,
+		IN	cl_map_item_t* const	p_item);
+static void ips_cl_qmap_remove_item(
+		IN	cl_qmap_t* const	p_map,
+		IN	cl_map_item_t* const	p_item);
+static cl_map_item_t* ips_cl_qmap_successor(
+		IN	cl_qmap_t* const	p_map,
+		IN	const cl_map_item_t*	p_item);
+static cl_map_item_t* ips_cl_qmap_predecessor(
+		IN	cl_qmap_t* const	p_map,
+		IN	const cl_map_item_t*	p_item);
+static cl_map_item_t* ips_cl_qmap_search(
+		IN	cl_qmap_t* const	p_map,
+		IN	unsigned long		start,
+		IN	unsigned long		end);
+
+/*
+ * Get the root.
+ */
+static inline cl_map_item_t*
+__cl_map_root(
+	IN	const cl_qmap_t* const	p_map )
+{
+	RBTREE_ASSERT( p_map );
+	return( p_map->root->p_left );
+}
+
+
+/*
+ * Returns whether a given item is on the left of its parent.
+ */
+static int
+__cl_map_is_left_child(
+	IN	const cl_map_item_t* const	p_item )
+{
+	RBTREE_ASSERT( p_item );
+	RBTREE_ASSERT( p_item->p_up );
+	RBTREE_ASSERT( p_item->p_up != p_item );
+
+	return( p_item->p_up->p_left == p_item );
+}
+
+
+/*
+ * Retrieve the pointer to the parent's pointer to an item.
+ */
+static cl_map_item_t**
+__cl_map_get_parent_ptr_to_item(
+	IN	cl_map_item_t* const	p_item )
+{
+	RBTREE_ASSERT( p_item );
+	RBTREE_ASSERT( p_item->p_up );
+	RBTREE_ASSERT( p_item->p_up != p_item );
+
+	if( __cl_map_is_left_child( p_item ) )
+		return( &p_item->p_up->p_left );
+
+	RBTREE_ASSERT( p_item->p_up->p_right == p_item );
+	return( &p_item->p_up->p_right );
+}
+
+
+/*
+ * Rotate a node to the left. This rotation affects the least number of links
+ * between nodes and brings the level of C up by one while increasing the depth
+ * of A one. Note that the links to/from W, X, Y, and Z are not affected.
+ *
+ *	      R                     R
+ *	      |                     |
+ *	      A                     C
+ *	    /   \                 /   \
+ *	  W       C             A       Z
+ *	         / \           / \
+ *	        B   Z         W   B
+ *	       / \               / \
+ *	      X   Y             X   Y
+ */
+static void
+__cl_map_rot_left(
+	IN	cl_qmap_t* const	p_map,
+	IN	cl_map_item_t* const	p_item )
+{
+	cl_map_item_t	**pp_root;
+
+	RBTREE_ASSERT( p_map );
+	RBTREE_ASSERT( p_item );
+	RBTREE_ASSERT( p_item->p_right != p_map->nil_item );
+
+	pp_root = __cl_map_get_parent_ptr_to_item( p_item );
+
+	/* Point R to C instead of A. */
+	*pp_root = p_item->p_right;
+	/* Set C's parent to R. */
+	(*pp_root)->p_up = p_item->p_up;
+
+	/* Set A's right to B */
+	p_item->p_right = (*pp_root)->p_left;
+	/*
+	 * Set B's parent to A. We trap for B being NIL since the
+	 * caller may depend on NIL not changing.
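+	 * (The NIL sentinel's own links must stay untouched, since every
+	 * leaf in the map points at the same shared nil_item.)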
+ */ + if( (*pp_root)->p_left != p_map->nil_item ) + (*pp_root)->p_left->p_up = p_item; + + /* Set C's left to A. */ + (*pp_root)->p_left = p_item; + /* Set A's parent to C. */ + p_item->p_up = *pp_root; +} + + +/* + * Rotate a node to the right. This rotation affects the least number of links + * between nodes and brings the level of A up by one while increasing the depth + * of C one. Note that the links to/from W, X, Y, and Z are not affected. + * + * R R + * | | + * C A + * / \ / \ + * A Z W C + * / \ / \ + * W B B Z + * / \ / \ + * X Y X Y + */ +static void +__cl_map_rot_right( + IN cl_qmap_t* const p_map, + IN cl_map_item_t* const p_item ) +{ + cl_map_item_t **pp_root; + + RBTREE_ASSERT( p_map ); + RBTREE_ASSERT( p_item ); + RBTREE_ASSERT( p_item->p_left != p_map->nil_item ); + + /* Point R to A instead of C. */ + pp_root = __cl_map_get_parent_ptr_to_item( p_item ); + (*pp_root) = p_item->p_left; + /* Set A's parent to R. */ + (*pp_root)->p_up = p_item->p_up; + + /* Set C's left to B */ + p_item->p_left = (*pp_root)->p_right; + /* + * Set B's parent to C. We trap for B being NIL since the + * caller may depend on NIL not changing. + */ + if( (*pp_root)->p_right != p_map->nil_item ) + (*pp_root)->p_right->p_up = p_item; + + /* Set A's right to C. */ + (*pp_root)->p_right = p_item; + /* Set C's parent to A. */ + p_item->p_up = *pp_root; +} + +/* + * Balance a tree starting at a given item back to the root. + */ +static void +__cl_map_ins_bal( + IN cl_qmap_t* const p_map, + IN cl_map_item_t* p_item ) +{ + cl_map_item_t* p_grand_uncle; + + RBTREE_ASSERT( p_map ); + RBTREE_ASSERT( p_item ); + RBTREE_ASSERT( p_item != p_map->root ); + + while( p_item->p_up->color == CL_MAP_RED ) + { + if( __cl_map_is_left_child( p_item->p_up ) ) + { + p_grand_uncle = p_item->p_up->p_up->p_right; + RBTREE_ASSERT( p_grand_uncle ); + if( p_grand_uncle->color == CL_MAP_RED ) + { + p_grand_uncle->color = CL_MAP_BLACK; + p_item->p_up->color = CL_MAP_BLACK; + p_item->p_up->p_up->color = CL_MAP_RED; + p_item = p_item->p_up->p_up; + continue; + } + + if( !__cl_map_is_left_child( p_item ) ) + { + p_item = p_item->p_up; + __cl_map_rot_left( p_map, p_item ); + } + p_item->p_up->color = CL_MAP_BLACK; + p_item->p_up->p_up->color = CL_MAP_RED; + __cl_map_rot_right( p_map, p_item->p_up->p_up ); + } + else + { + p_grand_uncle = p_item->p_up->p_up->p_left; + RBTREE_ASSERT( p_grand_uncle ); + if( p_grand_uncle->color == CL_MAP_RED ) + { + p_grand_uncle->color = CL_MAP_BLACK; + p_item->p_up->color = CL_MAP_BLACK; + p_item->p_up->p_up->color = CL_MAP_RED; + p_item = p_item->p_up->p_up; + continue; + } + + if( __cl_map_is_left_child( p_item ) ) + { + p_item = p_item->p_up; + __cl_map_rot_right( p_map, p_item ); + } + p_item->p_up->color = CL_MAP_BLACK; + p_item->p_up->p_up->color = CL_MAP_RED; + __cl_map_rot_left( p_map, p_item->p_up->p_up ); + } + } +} + +static void ips_cl_qmap_init( + IN cl_qmap_t *p_map, + IN cl_map_item_t* const root, + IN cl_map_item_t* const nil_item) +{ + RBTREE_ASSERT( p_map ); + RBTREE_ASSERT( root ); + RBTREE_ASSERT( nil_item ); + + memset(p_map,0,sizeof(cl_qmap_t)); + + p_map->root = root; + + /* setup the RB tree map */ + p_map->nil_item = nil_item; + + p_map->root->p_up = p_map->root; + p_map->root->p_left = p_map->nil_item; + p_map->root->p_right = p_map->nil_item; + p_map->root->color = CL_MAP_BLACK; + + p_map->nil_item->p_up = p_map->nil_item; + p_map->nil_item->p_left = p_map->nil_item; + p_map->nil_item->p_right = p_map->nil_item; + p_map->nil_item->color = CL_MAP_BLACK; +} + +static void 
+ips_cl_qmap_insert_item( + IN cl_qmap_t* const p_map, + IN cl_map_item_t* const p_item ) +{ + cl_map_item_t *p_insert_at, *p_comp_item; + int compare_res = 0; + + RBTREE_ASSERT( p_map ); + RBTREE_ASSERT( p_item ); + RBTREE_ASSERT( p_map->root->p_up == p_map->root ); + RBTREE_ASSERT( p_map->root->color != CL_MAP_RED ); + RBTREE_ASSERT( p_map->nil_item->color != CL_MAP_RED ); + + /* Find the insertion location. */ + p_insert_at = p_map->root; + p_comp_item = __cl_map_root( p_map ); + + while( p_comp_item != p_map->nil_item ) + { + p_insert_at = p_comp_item; + + /* Traverse the tree until the correct insertion point is found. */ + if( RBTREE_GET_LEFTMOST(&p_item->payload) < RBTREE_GET_LEFTMOST(&p_insert_at->payload) ) + { + p_comp_item = p_insert_at->p_left; + compare_res = 1; + } else { + p_comp_item = p_insert_at->p_right; + compare_res = -1; + } + } + + RBTREE_ASSERT( p_insert_at != p_map->nil_item ); + RBTREE_ASSERT( p_comp_item == p_map->nil_item ); + + /* Insert the item. */ + p_item->p_left = p_map->nil_item; + p_item->p_right = p_map->nil_item; + p_item->color = CL_MAP_RED; + if( p_insert_at == p_map->root ) + { + p_insert_at->p_left = p_item; + } + else if( compare_res > 0 ) /* key < p_insert_at->key */ + { + p_insert_at->p_left = p_item; + } + else + { + p_insert_at->p_right = p_item; + } + /* Increase the count. */ + RBTREE_MAP_COUNT(&p_map->payload)++; + + p_item->p_up = p_insert_at; + + /* + * We have added depth to this section of the tree. + * Rebalance as necessary as we retrace our path through the tree + * and update colors. + */ + __cl_map_ins_bal( p_map, p_item ); + + __cl_map_root( p_map )->color = CL_MAP_BLACK; + + /* + * Note that it is not necessary to re-color the nil node black because all + * red color assignments are made via the p_up pointer, and nil is never + * set as the value of a p_up pointer. 
+	 */
+}
+
+static void
+__cl_map_del_bal(
+	IN	cl_qmap_t* const	p_map,
+	IN	cl_map_item_t*		p_item )
+{
+	cl_map_item_t	*p_uncle;
+
+	while( (p_item->color != CL_MAP_RED) && (p_item->p_up != p_map->root) )
+	{
+		if( __cl_map_is_left_child( p_item ) )
+		{
+			p_uncle = p_item->p_up->p_right;
+
+			if( p_uncle->color == CL_MAP_RED )
+			{
+				p_uncle->color = CL_MAP_BLACK;
+				p_item->p_up->color = CL_MAP_RED;
+				__cl_map_rot_left( p_map, p_item->p_up );
+				p_uncle = p_item->p_up->p_right;
+			}
+
+			if( p_uncle->p_right->color != CL_MAP_RED )
+			{
+				if( p_uncle->p_left->color != CL_MAP_RED )
+				{
+					p_uncle->color = CL_MAP_RED;
+					p_item = p_item->p_up;
+					continue;
+				}
+
+				p_uncle->p_left->color = CL_MAP_BLACK;
+				p_uncle->color = CL_MAP_RED;
+				__cl_map_rot_right( p_map, p_uncle );
+				p_uncle = p_item->p_up->p_right;
+			}
+			p_uncle->color = p_item->p_up->color;
+			p_item->p_up->color = CL_MAP_BLACK;
+			p_uncle->p_right->color = CL_MAP_BLACK;
+			__cl_map_rot_left( p_map, p_item->p_up );
+			break;
+		}
+		else
+		{
+			p_uncle = p_item->p_up->p_left;
+
+			if( p_uncle->color == CL_MAP_RED )
+			{
+				p_uncle->color = CL_MAP_BLACK;
+				p_item->p_up->color = CL_MAP_RED;
+				__cl_map_rot_right( p_map, p_item->p_up );
+				p_uncle = p_item->p_up->p_left;
+			}
+
+			if( p_uncle->p_left->color != CL_MAP_RED )
+			{
+				if( p_uncle->p_right->color != CL_MAP_RED )
+				{
+					p_uncle->color = CL_MAP_RED;
+					p_item = p_item->p_up;
+					continue;
+				}
+
+				p_uncle->p_right->color = CL_MAP_BLACK;
+				p_uncle->color = CL_MAP_RED;
+				__cl_map_rot_left( p_map, p_uncle );
+				p_uncle = p_item->p_up->p_left;
+			}
+			p_uncle->color = p_item->p_up->color;
+			p_item->p_up->color = CL_MAP_BLACK;
+			p_uncle->p_left->color = CL_MAP_BLACK;
+			__cl_map_rot_right( p_map, p_item->p_up );
+			break;
+		}
+	}
+	p_item->color = CL_MAP_BLACK;
+}
+
+static void
+ips_cl_qmap_remove_item(
+	IN	cl_qmap_t* const	p_map,
+	IN	cl_map_item_t* const	p_item )
+{
+	cl_map_item_t	*p_child, *p_del_item;
+
+	RBTREE_ASSERT( p_map );
+	RBTREE_ASSERT( p_item );
+
+	if( p_item == p_map->nil_item )
+		return;
+
+	if( (p_item->p_right == p_map->nil_item) || (p_item->p_left == p_map->nil_item ) )
+	{
+		/* The item being removed has children on at most one side. */
+		p_del_item = p_item;
+	}
+	else
+	{
+		/*
+		 * The item being removed has children on both sides.
+		 * We select the item that will replace it.  After removing
+		 * the substitute item and rebalancing, the tree will have the
+		 * correct topology.  Exchanging the substitute for the item
+		 * will finalize the removal.
+		 */
+		p_del_item = ips_cl_qmap_successor(p_map, p_item);
+		RBTREE_ASSERT( p_del_item != p_map->nil_item );
+	}
+
+	RBTREE_MAP_COUNT(&p_map->payload)--;
+
+	/* Get the pointer to the new root's child, if any. */
+	if( p_del_item->p_left != p_map->nil_item )
+		p_child = p_del_item->p_left;
+	else
+		p_child = p_del_item->p_right;
+
+	/*
+	 * This assignment may modify the parent pointer of the nil node.
+	 * This is inconsequential.
+	 */
+	p_child->p_up = p_del_item->p_up;
+	(*__cl_map_get_parent_ptr_to_item( p_del_item )) = p_child;
+
+	if( p_del_item->color != CL_MAP_RED )
+		__cl_map_del_bal( p_map, p_child );
+
+	/*
+	 * Note that the splicing done below does not need to occur before
+	 * the tree is balanced, since the actual topology changes are made by the
+	 * preceding code.  The topology is preserved by the color assignment made
+	 * below (reader should be reminded that p_del_item == p_item in some cases).
+ */ + if( p_del_item != p_item ) + { + /* + * Finalize the removal of the specified item by exchanging it with + * the substitute which we removed above. + */ + p_del_item->p_up = p_item->p_up; + p_del_item->p_left = p_item->p_left; + p_del_item->p_right = p_item->p_right; + (*__cl_map_get_parent_ptr_to_item( p_item )) = p_del_item; + p_item->p_right->p_up = p_del_item; + p_item->p_left->p_up = p_del_item; + p_del_item->color = p_item->color; + } + + RBTREE_ASSERT( p_map->nil_item->color != CL_MAP_RED ); +} + +static cl_map_item_t * +ips_cl_qmap_successor( + IN cl_qmap_t* const p_map, + IN const cl_map_item_t* p_item ) +{ + cl_map_item_t *p_tmp; + + p_tmp = p_item->p_right; + if (p_tmp != p_map->nil_item) { + while (p_tmp->p_left != p_map->nil_item) + p_tmp = p_tmp->p_left; + return p_tmp; + } else { + p_tmp = p_item->p_up; + while (p_tmp->p_right == p_item) { + p_item = p_tmp; + p_tmp = p_tmp->p_up; + } + if (p_tmp == p_map->root) + return p_map->nil_item; + return p_tmp; + } +} + +static cl_map_item_t * +ips_cl_qmap_predecessor( + IN cl_qmap_t* const p_map, + IN const cl_map_item_t* p_item ) +{ + cl_map_item_t *p_tmp; + + p_tmp = p_item->p_left; + if (p_tmp != p_map->nil_item) { + while (p_tmp->p_right != p_map->nil_item) + p_tmp = p_tmp->p_right; + return p_tmp; + } else { + p_tmp = p_item->p_up; + while (p_tmp->p_left == p_item) { + p_item = p_tmp; + p_tmp = p_tmp->p_up; + } + if (p_tmp == p_map->root) + return p_map->nil_item; + return p_tmp; + } +} + +/* + * return the first node with buffer overlapping or zero. + */ +static cl_map_item_t * +ips_cl_qmap_search(cl_qmap_t * const p_map, + unsigned long start, unsigned long end) +{ + cl_map_item_t *p_item, *p_tmp; + + RBTREE_ASSERT( p_map ); + p_item = __cl_map_root(p_map); + + while (p_item != p_map->nil_item) { + if (start > RBTREE_GET_LEFTMOST(&p_item->payload)) { + p_tmp = p_item->p_right; + if (p_tmp != p_map->nil_item) { + p_item = p_tmp; + continue; + } + + /* + * p_item is on immediate left side of 'start'. + */ + if (start >= RBTREE_GET_RIGHTMOST(&p_item->payload)) { + /* + * p_item is on immediate right + * side of 'start'. + */ + p_item = ips_cl_qmap_successor(p_map, p_item); + if (p_item != p_map->nil_item && + end <= RBTREE_GET_LEFTMOST(&p_item->payload)) + p_item = p_map->nil_item; + } + } else if (start < RBTREE_GET_LEFTMOST(&p_item->payload)) { + p_tmp = p_item->p_left; + if (p_tmp != p_map->nil_item) { + p_item = p_tmp; + continue; + } + + /* + * p_tmp is on immediate left side of 'start'. + */ + p_tmp = ips_cl_qmap_predecessor(p_map, p_item); + if (p_tmp == p_map->nil_item || + (start >= RBTREE_GET_RIGHTMOST(&p_tmp->payload))) { + /* + * p_item is on immediate right + * side of 'start'. + */ + if (end <= RBTREE_GET_LEFTMOST(&p_item->payload)) + p_item = p_map->nil_item; + } else + p_item = p_tmp; + } + + break; + } + + + return p_item; +} diff --git a/include/rbtree.h b/include/rbtree.h new file mode 100644 index 0000000..13245b0 --- /dev/null +++ b/include/rbtree.h @@ -0,0 +1,90 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. 
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef __RBTREE_H__
+
+#define __RBTREE_H__
+
+#include <stdint.h>
+
+#ifndef RBTREE_MAP_PL
+#error "You must define RBTREE_MAP_PL before including rbtree.h"
+#endif
+
+#ifndef RBTREE_MI_PL
+#error "You must define RBTREE_MI_PL before including rbtree.h"
+#endif
+
+/*
+ * Red-Black tid cache definition.
+ */
+typedef struct _cl_map_item {
+	struct _cl_map_item	*p_left;	/* left pointer */
+	struct _cl_map_item	*p_right;	/* right pointer */
+	struct _cl_map_item	*p_up;		/* up pointer */
+	uint16_t		color;		/* red-black color */
+
+	RBTREE_MI_PL		payload;
+} cl_map_item_t;
+
+typedef struct _cl_qmap {
+	cl_map_item_t	*root;		/* root node pointer */
+	cl_map_item_t	*nil_item;	/* terminator node pointer */
+
+	RBTREE_MAP_PL	payload;
+} cl_qmap_t;
+
+#define CL_MAP_RED   0
+#define CL_MAP_BLACK 1
+
+#endif
diff --git a/libpsm2.spec.in b/libpsm2.spec.in
new file mode 100644
index 0000000..b033dff
--- /dev/null
+++ b/libpsm2.spec.in
@@ -0,0 +1,178 @@
+#
+# This file is provided under a dual BSD/GPLv2 license.  When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2017 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2017 Intel Corporation.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +Summary: Intel PSM2 Libraries +Name: @RPM_NAME@ +Version: @VERSION@ +Release: 1@SPEC_FILE_RELEASE_DIST@ +License: BSD or GPLv2 +URL: https://github.com/intel/opa-psm2/ + +# The tarball can be created by: +# git clone https://github.com/intel/opa-psm2 +# cd opa-psm2 +# git checkout @DIST_SHA@ +# make dist +Source0: @RPM_NAME@-%{version}.tar.gz + +# The OPA product is supported on x86_64 only: +ExclusiveArch: x86_64 + +BuildRequires: gcc +Provides: hfi1-psm +Obsoletes: hfi1-psm < 1.0.0 + +%if "@RPM_NAME_BASEEXT@" +%package -n @RPM_NAME@@RPM_NAME_BASEEXT@ +%endif +Summary: Intel PSM2 Libraries +Provides: @RPM_NAME@ = %{version}-%{release} +Provides: @RPM_NAME@%{_isa} = %{version}-%{release} +%if 0%{?suse_version} +BuildRequires: libnuma-devel +Requires: libnuma1 +%else +%if 0%{?rhel}==0 || 0%{?rhel} > 6 +BuildRequires: systemd +BuildRequires: numactl-devel +Requires: numactl-libs +%endif +%endif + +%package -n @RPM_NAME@-devel +Summary: Development files for Intel PSM2 +Requires: %{name}%{?_isa} = %{version}-%{release} +Provides: hfi1-psm-devel +Obsoletes: hfi1-psm-devel < 1.0.0 + +%package -n @RPM_NAME@-compat +Summary: Compat library for Intel PSM2 +Requires: %{name}%{?_isa} = %{version}-%{release} +%if 0%{?fedora} +Requires: systemd-udev +%endif +Provides: hfi1-psm-compat +Obsoletes: hfi1-psm-compat < 1.0.0 + +# If an alternate basename is defined, like in SLES >=12.3 +# Then we generate a different base src.rpm, so use this +# description instead. +%if "@RPM_NAME_BASEEXT@" +%description +The source code for the PSM2 messaging API, libpsm2. +A low-level user-level communications interface for the Intel(R) OPA +family of products. PSM2 users are enabled with mechanisms +necessary to implement higher level communications +interfaces in parallel environments. +%endif + +# In distro's other than SLES >=12.3 we use a single description +# for both the .src.rpm and the base binary rpm. As the +# RPM_NAME_BASEEXT defaults to empty contents. 
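The @...@ tokens used throughout this template are substituted when the spec file is generated from libpsm2.spec.in; makesrpm.sh later in this patch drives that through the Makefile's specfile target, which the patch itself does not show. As a rough sketch of what that substitution step does (the sed-based approach and the values here are assumptions, not the project's actual recipe):

    # Illustrative only: the real work happens in the Makefile's
    # `specfile` target; values shown are placeholders.
    RPM_NAME=libpsm2 VERSION=11.2.91
    sed -e "s/@RPM_NAME@/$RPM_NAME/g" \
        -e "s/@VERSION@/$VERSION/g" \
        -e "s/@RPM_NAME_BASEEXT@//g" \
        libpsm2.spec.in > build_release/$RPM_NAME.spec
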
+%description -n @RPM_NAME@@RPM_NAME_BASEEXT@ +PSM2 Messaging API, or PSM2 API, is the low-level +user-level communications interface for the Intel(R) OPA +family of products. PSM2 users are enabled with mechanisms +necessary to implement higher level communications +interfaces in parallel environments. + +%description -n @RPM_NAME@-devel +Intel(R) PSM2, psm2*.h, headers and libpsm2.so files necessary +for developing software using libpsm2. + +%description -n @RPM_NAME@-compat +Support for MPIs linked with PSM versions < 2. This will allow +software compiled to use Intel(R) Truescale PSM, libinfinipath, to run +with Intel(R) OPA PSM2, libpsm2. + +%prep +%setup -q -n @RPM_NAME@-%{version} + +%build +make %{?_smp_mflags} + +%install +%make_install + +%post -p /sbin/ldconfig +%postun -p /sbin/ldconfig + +%files -n @RPM_NAME@@RPM_NAME_BASEEXT@ +%if 0%{?rhel} && 0%{?rhel} < 7 +%{!?_licensedir:%global license %doc} +%endif +%license COPYING +%{_libdir}/@TARGLIB@.so.@MAJOR@.@MINOR@ +%{_libdir}/@TARGLIB@.so.@MAJOR@ +@40_PSM_RULES@ + +%files -n @RPM_NAME@-devel +%{_libdir}/@TARGLIB@.so +%{_libdir}/@TARGLIB@.a +%{_includedir}/psm2.h +%{_includedir}/psm2_mq.h +%{_includedir}/psm2_am.h +%{_includedir}/hfi1diag + +%files -n @RPM_NAME@-compat +%{_libdir}/psm2-compat +%if 0%{?rhel} && 0%{?rhel} < 7 +@UDEVDIR@/rules.d/40-psm-compat.rules +%else +%{_udevrulesdir}/40-psm-compat.rules +%endif +@LIBPSM2_COMPAT_SYM_CONF_DIR@/modprobe.d/libpsm2-compat.conf +%{_prefix}/lib/libpsm2 + +%changelog +* Wed Aug 30 2017 Rusell McGuire +- Adjust RPM names to match SLES 12.3 distro names +* Tue Apr 05 2016 Paul Reger +- Upstream PSM2 source code for Fedora. diff --git a/libuuid/Makefile b/libuuid/Makefile new file mode 100644 index 0000000..2f5babe --- /dev/null +++ b/libuuid/Makefile @@ -0,0 +1,91 @@ +# +# This file is provided under a dual BSD/GPLv2 license. When using or +# redistributing this file, you may do so under either license. +# +# GPL LICENSE SUMMARY +# +# Copyright(c) 2015 Intel Corporation. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# Contact Information: +# Intel Corporation, www.intel.com +# +# BSD LICENSE +# +# Copyright(c) 2015 Intel Corporation. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Copyright (c) 2003-2014 Intel Corporation. All rights reserved.
+#
+
+OUTDIR = .
+
+this_srcdir := $(shell readlink -m .)
+top_srcdir := $(this_srcdir)/..
+BASECFLAGS += -DPSM_UUID=1 -Wno-unused-function
+INCLUDES += -I$(top_srcdir)
+
+${TARGLIB}-objs := psm_uuid.o parse.o pack.o unpack.o unparse.o
+${TARGLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${TARGLIB}-objs})
+
+DEPS := $(${TARGLIB}-objs:.o=.d)
+
+.PHONY: all clean
+IGNORE_DEP_TARGETS = clean
+
+all .DEFAULT: ${${TARGLIB}-objs}
+
+$(OUTDIR)/%.d: $(this_srcdir)/%.c
+	$(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
+
+$(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS}
+	$(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) -c $< -o $@
+
+clean:
+	@if [ -d $(OUTDIR) ]; then \
+		cd $(OUTDIR); \
+		rm -f *.o *.d *.gcda *.gcno; \
+		cd -; \
+	fi
+
+#ifeq prevents the deps from being included during clean
+#-include line is required to pull in auto-dependencies during 2nd pass
+ifeq ($(filter $(IGNORE_DEP_TARGETS), $(MAKECMDGOALS)),)
+-include ${DEPS}
+endif
+
+install:
+	@echo "Nothing to do for install."
diff --git a/libuuid/compare.c b/libuuid/compare.c
new file mode 100644
index 0000000..44f275b
--- /dev/null
+++ b/libuuid/compare.c
@@ -0,0 +1,53 @@
+/*
+ * compare.c --- compare whether or not two UUID's are the same
+ *
+ * Returns an integer less than, equal to, or greater than zero if uu1
+ * is, respectively, less than, equal to, or greater than uu2.
+ *
+ * Copyright (C) 1996, 1997 Theodore Ts'o.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
+ * WHICH ARE HEREBY DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include "psm_uuid.h"
+#include <string.h>
+
+#define UUCMP(u1,u2) if (u1 != u2) return((u1 < u2) ? -1 : 1);
+
+int uuid_compare(const uuid_t uu1, const uuid_t uu2)
+{
+	struct uuid	uuid1, uuid2;
+
+	uuid_unpack(uu1, &uuid1);
+	uuid_unpack(uu2, &uuid2);
+
+	UUCMP(uuid1.time_low, uuid2.time_low);
+	UUCMP(uuid1.time_mid, uuid2.time_mid);
+	UUCMP(uuid1.time_hi_and_version, uuid2.time_hi_and_version);
+	UUCMP(uuid1.clock_seq, uuid2.clock_seq);
+	return memcmp(uuid1.node, uuid2.node, 6);
+}
+
diff --git a/libuuid/pack.c b/libuuid/pack.c
new file mode 100644
index 0000000..801b891
--- /dev/null
+++ b/libuuid/pack.c
@@ -0,0 +1,69 @@
+/*
+ * Internal routine for packing UUID's
+ *
+ * Copyright (C) 1996, 1997 Theodore Ts'o.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
+ * WHICH ARE HEREBY DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include "psm_user.h"
+#include "psm_uuid.h"
+
+void uuid_pack(const struct uuid *uu, uuid_t ptr)
+{
+	uint32_t	tmp;
+	unsigned char	*out = ptr;
+
+	tmp = uu->time_low;
+	out[3] = (unsigned char) tmp;
+	tmp >>= 8;
+	out[2] = (unsigned char) tmp;
+	tmp >>= 8;
+	out[1] = (unsigned char) tmp;
+	tmp >>= 8;
+	out[0] = (unsigned char) tmp;
+
+	tmp = uu->time_mid;
+	out[5] = (unsigned char) tmp;
+	tmp >>= 8;
+	out[4] = (unsigned char) tmp;
+
+	tmp = uu->time_hi_and_version;
+	out[7] = (unsigned char) tmp;
+	tmp >>= 8;
+	out[6] = (unsigned char) tmp;
+
+	tmp = uu->clock_seq;
+	out[9] = (unsigned char) tmp;
+	tmp >>= 8;
+	out[8] = (unsigned char) tmp;
+
+	memcpy(out+10, uu->node, 6);
+}
+
diff --git a/libuuid/parse.c b/libuuid/parse.c
new file mode 100644
index 0000000..dd8c258
--- /dev/null
+++ b/libuuid/parse.c
@@ -0,0 +1,78 @@
+/*
+ * parse.c --- UUID parsing
+ *
+ * Copyright (C) 1996, 1997 Theodore Ts'o.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
+ * WHICH ARE HEREBY DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "psm_user.h"
+#include "psm_uuid.h"
+
+int uuid_parse(const char *in, uuid_t uu)
+{
+	struct uuid	uuid;
+	int		i;
+	const char	*cp;
+	char		buf[3];
+
+	if (strlen(in) != 36)
+		return -1;
+	for (i=0, cp = in; i <= 36; i++,cp++) {
+		if ((i == 8) || (i == 13) || (i == 18) ||
+		    (i == 23)) {
+			if (*cp == '-')
+				continue;
+			else
+				return -1;
+		}
+		if (i == 36)
+			if (*cp == 0)
+				continue;
+		if (!isxdigit(*cp))
+			return -1;
+	}
+	uuid.time_low = strtoul(in, NULL, 16);
+	uuid.time_mid = strtoul(in+9, NULL, 16);
+	uuid.time_hi_and_version = strtoul(in+14, NULL, 16);
+	uuid.clock_seq = strtoul(in+19, NULL, 16);
+	cp = in+24;
+	buf[2] = 0;
+	for (i=0; i < 6; i++) {
+		buf[0] = *cp++;
+		buf[1] = *cp++;
+		uuid.node[i] = strtoul(buf, NULL, 16);
+	}
+
+	uuid_pack(&uuid, uu);
+	return 0;
+}
diff --git a/libuuid/psm_uuid.c b/libuuid/psm_uuid.c
new file mode 100644
index 0000000..4db29a6
--- /dev/null
+++ b/libuuid/psm_uuid.c
@@ -0,0 +1,114 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "psm_user.h"
+#include "psm_uuid.h"
+
+static void psmi_make_drand_uuid(psm2_uuid_t uuid_out)
+{
+	struct drand48_data drand48_data;
+	int i;
+	long int rnum;
+	srand48_r((get_cycles() + getpid()) % LONG_MAX, &drand48_data);
+	for(i=0; i < 16; i++) {
+		lrand48_r(&drand48_data, &rnum);
+		uuid_out[i] = rnum % UCHAR_MAX;
+	}
+}
+
+/* Since libuuid can call srand, we will generate our own uuids */
+void
+__psm2_uuid_generate(psm2_uuid_t uuid_out)
+{
+	PSM2_LOG_MSG("entering");
+	/* Prefer using urandom, fallback to drand48_r */
+	struct stat urandom_stat;
+	size_t nbytes;
+	int fd;
+	if(stat("/dev/urandom", &urandom_stat) != 0) {
+		psmi_make_drand_uuid(uuid_out);
+		return;
+	}
+
+	fd = open("/dev/urandom", O_RDONLY);
+	if(fd == -1) {
+		psmi_make_drand_uuid(uuid_out);
+	} else {
+		nbytes = read(fd, (char *) uuid_out, 16);
+		if(nbytes != 16) {
+			psmi_make_drand_uuid(uuid_out);
+		}
+		close(fd);
+	}
+	PSM2_LOG_MSG("leaving");
+	return;
+}
+PSMI_API_DECL(psm2_uuid_generate)
+
+void
+psmi_uuid_unparse(const uuid_t uu, char *out)
+{
+	uuid_unparse_lower(uu, out);
+}
+
+int
+psmi_uuid_parse(const char *in, uuid_t uu)
+{
+	return uuid_parse(in, uu);
+}
+
diff --git a/libuuid/psm_uuid.h b/libuuid/psm_uuid.h
new file mode 100644
index 0000000..09df044
--- /dev/null
+++ b/libuuid/psm_uuid.h
@@ -0,0 +1,78 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _PSM_UUID_H +#define _PSM_UUID_H +struct uuid { + uint32_t time_low; + uint16_t time_mid; + uint16_t time_hi_and_version; + uint16_t clock_seq; + uint8_t node[6]; +}; + +typedef unsigned char uuid_t[16]; + +int psmi_uuid_parse(const char *in, psm2_uuid_t uu); +void psmi_uuid_unparse(const psm2_uuid_t uuid, char *out); +int psmi_uuid_compare(const psm2_uuid_t uuA, const psm2_uuid_t uuB); +int uuid_compare(const uuid_t uu1, const uuid_t uu2); +void uuid_pack(const struct uuid *uu, uuid_t ptr); +void uuid_unparse(const uuid_t uu, char *out); +void uuid_unparse_upper(const uuid_t uu, char *out); +void uuid_unparse_lower(const uuid_t uu, char *out); +void uuid_unpack(const uuid_t in, struct uuid *uu); +int uuid_parse(const char *in, uuid_t uu); +#endif diff --git a/libuuid/unpack.c b/libuuid/unpack.c new file mode 100644 index 0000000..26e4394 --- /dev/null +++ b/libuuid/unpack.c @@ -0,0 +1,63 @@ +/* + * Internal routine for unpacking UUID + * + * Copyright (C) 1996, 1997 Theodore Ts'o. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include "psm_user.h"
+#include "psm_uuid.h"
+
+void uuid_unpack(const uuid_t in, struct uuid *uu)
+{
+	const uint8_t	*ptr = in;
+	uint32_t	tmp;
+
+	tmp = *ptr++;
+	tmp = (tmp << 8) | *ptr++;
+	tmp = (tmp << 8) | *ptr++;
+	tmp = (tmp << 8) | *ptr++;
+	uu->time_low = tmp;
+
+	tmp = *ptr++;
+	tmp = (tmp << 8) | *ptr++;
+	uu->time_mid = tmp;
+
+	tmp = *ptr++;
+	tmp = (tmp << 8) | *ptr++;
+	uu->time_hi_and_version = tmp;
+
+	tmp = *ptr++;
+	tmp = (tmp << 8) | *ptr++;
+	uu->clock_seq = tmp;
+
+	memcpy(uu->node, ptr, 6);
+}
+
diff --git a/libuuid/unparse.c b/libuuid/unparse.c
new file mode 100644
index 0000000..d859379
--- /dev/null
+++ b/libuuid/unparse.c
@@ -0,0 +1,75 @@
+/*
+ * unparse.c -- convert a UUID to string
+ *
+ * Copyright (C) 1996, 1997 Theodore Ts'o.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
+ * WHICH ARE HEREBY DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include <stdio.h>
+
+#include "psm_user.h"
+#include "psm_uuid.h"
+
+static const char *fmt_lower =
+	"%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x";
+
+static const char *fmt_upper =
+	"%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X";
+
+#ifdef UUID_UNPARSE_DEFAULT_UPPER
+#define FMT_DEFAULT fmt_upper
+#else
+#define FMT_DEFAULT fmt_lower
+#endif
+
+static void uuid_unparse_x(const uuid_t uu, char *out, const char *fmt)
+{
+	struct uuid uuid;
+
+	uuid_unpack(uu, &uuid);
+	sprintf(out, fmt,
+		uuid.time_low, uuid.time_mid, uuid.time_hi_and_version,
+		uuid.clock_seq >> 8, uuid.clock_seq & 0xFF,
+		uuid.node[0], uuid.node[1], uuid.node[2],
+		uuid.node[3], uuid.node[4], uuid.node[5]);
+}
+
+void uuid_unparse_lower(const uuid_t uu, char *out)
+{
+	uuid_unparse_x(uu, out,	fmt_lower);
+}
+
+void uuid_unparse_upper(const uuid_t uu, char *out)
+{
+	uuid_unparse_x(uu, out,	fmt_upper);
+}
+
+void uuid_unparse(const uuid_t uu, char *out)
+{
+	uuid_unparse_x(uu, out, FMT_DEFAULT);
+}
diff --git a/makesdeb.sh b/makesdeb.sh
new file mode 100755
index 0000000..6f3572b
--- /dev/null
+++ b/makesdeb.sh
@@ -0,0 +1,169 @@
+#!/bin/bash
+#
+# This file is provided under a dual BSD/GPLv2 license.  When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2017 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2017 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+#   * Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   * Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in
+#     the documentation and/or other materials provided with the
+#     distribution.
+#   * Neither the name of Intel Corporation nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+# Stop on error
+set -e
+
+BUILD_OPTS="gGbBAS"
+BUILD_OPT=
+DEB_NAME=libpsm2
+
+# OUT_DIR is where the Makefile places its meta-data
+OUT_DIR=build_release
+
+# Set BUILD_DIR first, so user control can override the value
+# This is where this script places deb(s) and uses its build meta-data.
+# It can be set the same as OUT_DIR, and work just fine if desired.
+BUILD_DIR=temp.$$
+
+function literate()
+{
+	echo $(sed "s/\B/&$2/g" <<< "$1")
+}
+
+function usage()
+{
+	SCRIPT=${0##*/}
+	echo "Usage: $SCRIPT [OPTIONS]"
+	echo
+	echo "Creates a tarball of the source and a source package by default."
+	echo "Optionally generates binary deb(s)"
+	echo
+	echo "    $(literate $BUILD_OPTS ',')"
+	echo "      Optional, default is full build (source and binary)"
+	echo "      Set single extension letter for dpkg-buildpackage argument"
+	echo "    -r <name>"
+	echo "      Optional, set the output deb name"
+	echo "    -e <ext>"
+	echo "      Optional, set a base name extension"
+	echo "      This only appends an extra string onto the base DEB name"
+	echo "      Does not affect supporting DEBs"
+	echo "    -c"
+	echo "      Optional, default is unset"
+	echo "      Sets PSM_CUDA=1, creating -cuda based manifest and debs"
+	echo "    -d <dir>"
+	echo "      Optionally sets output folder for dpkg-buildpackage to use"
+	echo "    -h"
+	echo "      Shows this screen"
+	echo "  Examples:"
+	echo "    $SCRIPT b"
+	echo "    $SCRIPT s -c"
+	echo "    $SCRIPT -"
+	echo "    $SCRIPT -d ./temp"
+	echo "    $SCRIPT b -c -d output"
+	exit $1
+}
+
+while getopts "r:e:cd:h$BUILD_OPTS" OPT; do
+	case $OPT in
+	r)
+		DEB_NAME=$OPTARG
+		;;
+	e)
+		BASE_EXT=$OPTARG
+		;;
+	c)
+		export PSM_CUDA=1
+		DEB_EXT="-cuda"
+		;;
+	d)
+		BUILD_DIR=$OPTARG
+		;;
+	h)
+		usage 0
+		;;
+	\?)
+		usage 1
+		;;
+	*)
+		BUILD_OPT=-$OPT
+		;;
+	esac
+done
+
+# Remove parsed options
+shift $((OPTIND-1))
+
+# Check if we have any non-option parameters
+test ! $# -eq 0 && usage 1
+
+# Generic cleanup, build, and tmp folder creation
+make distclean OUTDIR=$OUT_DIR
+
+make RPM_NAME=$DEB_NAME RPM_NAME_BASEEXT=$BASE_EXT dist OUTDIR=$OUT_DIR
+
+# Prepare build area
+mkdir -p $BUILD_DIR/{build,binary,sources,dists}
+
+# Different paths based on DEB_EXT
+cp $OUT_DIR/$DEB_NAME-*.tar.gz $BUILD_DIR/dists/
+
+FILE_BASE=$(basename $BUILD_DIR/dists/$DEB_NAME-*.tar.gz .tar.gz)
+VERSION=${FILE_BASE##$DEB_NAME-}
+
+echo Building $DEB_NAME version $VERSION...
+
+tar xzf $BUILD_DIR/dists/$DEB_NAME-$VERSION.tar.gz -C $BUILD_DIR/build
+
+(cd $BUILD_DIR/build/$DEB_NAME-$VERSION
+
+# Annotate changelog
+mv debian/changelog.in debian/changelog
+debchange --newversion=$VERSION "Bump up version to $VERSION"
+
+# Build package
+dpkg-buildpackage $BUILD_OPT -us -uc -tc)
+
+mv $BUILD_DIR/build/$DEB_NAME*{.tar.xz,.dsc,.changes} $BUILD_DIR/sources/
+mv $BUILD_DIR/build/$DEB_NAME*{.deb,.ddeb} $BUILD_DIR/binary/
+
+echo "The deb package(s) is (are) in $BUILD_DIR/binary/$(ls $BUILD_DIR/binary)"
diff --git a/makesrpm.sh b/makesrpm.sh
new file mode 100755
index 0000000..31caa01
--- /dev/null
+++ b/makesrpm.sh
@@ -0,0 +1,158 @@
+#!/bin/bash
+#
+# This file is provided under a dual BSD/GPLv2 license.  When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+#   * Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   * Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in
+#     the documentation and/or other materials provided with the
+#     distribution.
+#   * Neither the name of Intel Corporation nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+#It makes no sense to have both CUDA and non-CUDA in the same invocation
+#as they require different versions of the hfi1_user.h at this point in time.
+#Limiting this script to only build CUDA if requested
+
+#default BUILDARG to build source RPM only
+BUILDARG=s
+RPM_NAME=libpsm2
+
+function usage()
+{
+	echo "Usage: $0 [OPTION] [OPTION] [OPTION]"
+	echo " "
+	echo "Creates tar ball of source and source rpms by default."
+	echo "Optionally generates binary rpm(s)"
+	echo " "
+	echo "    s,a,b,p,c,i,l"
+	echo "      Optional, default is s (sourcerpm)"
+	echo "      Set single extension letter for rpmbuild -b argument"
+	echo "    -r <name>, -rpmname <name>"
+	echo "      Optional, set the output rpm name"
+	echo "    -e <ext>, -baseext <ext>"
+	echo "      Optional, set a base name extension"
+	echo "      This only appends an extra string onto the base RPM name"
+	echo "      Does not affect supporting RPMs"
+	echo "    -c, -cuda"
+	echo "      Optional, default is unset"
+	echo "      Sets PSM_CUDA=1, creating -cuda based spec and rpms"
+	echo "    -d <dir>, -dir <dir>"
+	echo "      Optionally sets output folder for rpmbuild to use"
+	echo "    -h <gen>, -halgen <gen>"
+	echo "      Optional, default includes all HAL generations"
+	echo "      Sets hal generations for rpmbuild to use"
+	echo "  Examples:"
+	echo "    $0 b"
+	echo "    $0 s -cuda"
+	echo "    $0 -cuda"
+	echo "    $0 -d ./temp"
+	echo "    $0 b -cuda -dir output"
+	echo "    $0 -h gen1"
+	exit 1
+}
+
+err=0
+
+# OUTDIR is where the Makefile places its meta-data
+OUTDIR=build_release
+
+# Set TEMPDIR first, so user control can override the value
+# This is where rpmbuild places rpm(s) and uses its build meta-data.
+# It can be set the same as OUTDIR, and work just fine if desired.
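Concretely, the defaults above mean a bare run produces only a source RPM under temp.<pid>/SRPMS, while the options parsed below select binary builds, a CUDA variant, or a different build directory. A few hypothetical invocations:

    ./makesrpm.sh                 # source RPM only, left in temp.<pid>/SRPMS/
    ./makesrpm.sh b -d ./rpmout   # binary RPMs, rpmbuild tree rooted at ./rpmout
    ./makesrpm.sh b -cuda         # binary RPMs built with PSM_CUDA=1
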
+TEMPDIR=temp.$$
+
+HAL_GENS=""
+
+while [ "$1" != "" ]; do
+	case $1 in
+	-d | -dir)	shift
+			if [ -z "$1" ]; then
+				usage
+			fi
+			TEMPDIR=$1
+			;;
+	-c | -cuda)	export PSM_CUDA=1
+			RPM_EXT="-cuda"
+			;;
+	-e | -baseext)	shift
+			if [ -z "$1" ]; then
+				usage
+			fi
+			RPM_NAME_BASEEXT="$1"
+			export RPM_NAME_BASEEXT="$1"
+			;;
+	-h | -halgen)	shift
+			HAL_GENS="$1 $HAL_GENS"
+			;;
+	-r | -rpmname)	shift
+			if [ -z "$1" ]; then
+				usage
+			fi
+			RPM_NAME="$1"
+			export RPM_NAME="$1"
+			;;
+	s|a|b|p|c|i|l)	BUILDARG=$1
+			;;
+	* )		err=1
+			usage
+			;;
+	esac
+	shift
+done
+
+if [ "$HAL_GENS" = "" ]; then
+	HAL_GENS="*"
+fi
+
+# Generic cleanup, build, and tmp folder creation
+make distclean OUTDIR=$OUTDIR
+make RPM_NAME=$RPM_NAME RPM_NAME_BASEEXT=$RPM_NAME_BASEEXT "PSM_HAL_ENABLE=$HAL_GENS" dist OUTDIR=$OUTDIR
+mkdir -p ./$TEMPDIR/{BUILD,RPMS,SOURCES,SPECS,SRPMS,BUILDROOT}
+# Different paths based on RPM_EXT
+cp ${OUTDIR}/$RPM_NAME-*.tar.gz $TEMPDIR/SOURCES
+make RPM_NAME=$RPM_NAME RPM_NAME_BASEEXT=$RPM_NAME_BASEEXT specfile OUTDIR=$OUTDIR
+cp ${OUTDIR}/$RPM_NAME.spec $TEMPDIR/SPECS
+rpmbuild -b$BUILDARG --define "_topdir $PWD/$TEMPDIR" --nodeps $TEMPDIR/SPECS/$RPM_NAME.spec
+
+echo "The SRPM(s) are in $TEMPDIR/SRPMS/`ls $TEMPDIR/SRPMS`"
diff --git a/mpspawn/mpspawn_stats.h b/mpspawn/mpspawn_stats.h
new file mode 100644
index 0000000..4382587
--- /dev/null
+++ b/mpspawn/mpspawn_stats.h
@@ -0,0 +1,132 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _MPSPAWN_STATS_H
+#define _MPSPAWN_STATS_H
+
+#include <math.h>
+
+#define MPSPAWN_STATS_VERSION 1
+
+typedef enum {
+	MPSPAWN_STATS_TYPE_DOUBLE = 0x1,
+#define MPSPAWN_STATS_TYPE_DOUBLE	0x1
+	MPSPAWN_STATS_TYPE_HEADER = 0x2,
+#define MPSPAWN_STATS_TYPE_HEADER	0x2
+	MPSPAWN_STATS_REDUCTION_MAX = 0x1000,
+#define MPSPAWN_STATS_REDUCTION_MAX	0x1000
+	MPSPAWN_STATS_REDUCTION_MIN = 0x2000,
+#define MPSPAWN_STATS_REDUCTION_MIN	0x2000
+	MPSPAWN_STATS_REDUCTION_MEDIAN = 0x4000,
+#define MPSPAWN_STATS_REDUCTION_MEDIAN	0x4000
+	MPSPAWN_STATS_SKIP_IF_ZERO = 0x8000
+#define MPSPAWN_STATS_SKIP_IF_ZERO	0x8000
+} mpspawn_stats_flags;
+
+#define MPSPAWN_STATS_REDUCTION_ALL (MPSPAWN_STATS_REDUCTION_MAX | \
+	    MPSPAWN_STATS_REDUCTION_MIN | MPSPAWN_STATS_REDUCTION_MEDIAN)
+
+#define MPSPAWN_STATS_DOUBLE_TO_U64(arg) (*((uint64_t *) &(arg)))
+#define MPSPAWN_NAN_U64 ((uint64_t) ~0ULL)
+#define MPSPAWN_ISNAN_U64(x)	(((uint64_t)(x)) == MPSPAWN_NAN_U64)
+
+#define MPSPAWN_NAN	((uint64_t) ~0ULL)	/* NAN) */
+#define MPSPAWN_ISNAN(x)	(isnan(x))
+
+struct mpspawn_stats_add_args;	/* client->mpspawn stats registration */
+struct mpspawn_stats_req_args;	/* mpspawn->client fn callback stats request */
+struct mpspawn_stats_init_args;	/* mpspawn->client "downcall" to register */
+
+/* Clients implement this function to fill in mpspawn request for stats */
+typedef void (*mpspawn_stats_req_fn) (struct mpspawn_stats_req_args *);
+/* mpspawn implements this function to allow clients to register new stats */
+typedef void (*mpspawn_stats_add_fn) (struct mpspawn_stats_add_args *);
+/* mpspawn implements this function to map rank indexes into epaddr structs */
+struct psm2_epaddr;
+typedef struct psm2_epaddr *(*mpspawn_map_epaddr_fn) (int rank);
+
+typedef struct mpspawn_stats_req_args {
+	int version;
+	int num;
+	uint64_t *stats;
+	uint16_t *flags;
+	void *context;
+} mpspawn_stats_req_args_t;
+
+typedef
+struct mpspawn_stats_add_args {
+	int version;
+	int num;
+	char *header;
+	char **desc;
+	uint16_t *flags;
+	mpspawn_stats_req_fn req_fn;
+	void *context;
+} mpspawn_stats_add_args_t;
+
+typedef
+struct mpspawn_stats_init_args {
+	int version;
+	psm2_mq_t mq;		/* initialized mq endpoint */
+	int num_epaddr;		/* number of endpoints in job */
+	mpspawn_stats_add_fn add_fn;	/* function for client to add stats */
+	mpspawn_map_epaddr_fn epaddr_map_fn;
+	const char *stats_types;	/* stats type string mpirun -M */
+} mpspawn_stats_init_args_t;
+
+/* Function in psm exposed to register stats */
+void *psmi_stats_register(struct mpspawn_stats_init_args *args);
+
+#endif
diff --git a/opa/Makefile b/opa/Makefile
new file mode 100644
index 0000000..2692886
--- /dev/null
+++ b/opa/Makefile
@@ -0,0 +1,111 @@
+#
+# This file is provided under a dual BSD/GPLv2 license.  When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# Contact Information: +# Intel Corporation, www.intel.com +# +# BSD LICENSE +# +# Copyright(c) 2015 Intel Corporation. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Copyright (c) 2003-2014 Intel Corporation. All rights reserved. +# + +OUTDIR = . + +TARGLIB := libopa +MAJOR := $(OPA_LIB_MAJOR) +MINOR := $(OPA_LIB_MINOR) + +this_srcdir := $(shell readlink -m .) +top_srcdir := $(this_srcdir)/.. + +INCLUDES += -I$(top_srcdir) -I$(top_srcdir)/ptl_ips +ifeq (${arch},x86_64) + PLATFORM_OBJ=opa_dwordcpy-x86_64-fast.o +else + PLATFORM_OBJ= +endif + +${TARGLIB}-objs := opa_debug.o opa_time.o \ + opa_service.o opa_utils.o \ + opa_dwordcpy-$(arch).o opa_sysfs.o opa_syslog.o \ + $(PLATFORM_OBJ) + +${TARGLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${TARGLIB}-objs}) +DEPS := $(${TARGLIB}-objs:.o=.d) + +.PHONY: all clean +IGNORE_DEP_TARGETS = clean + +all .DEFAULT: ${${TARGLIB}-objs} + +install: all + @echo "Nothing to do for install." 
+
+$(OUTDIR)/%.d: $(this_srcdir)/%.c
+	$(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
+
+$(OUTDIR)/%.d: $(this_srcdir)/%.S
+	$(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
+
+$(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS}
+	$(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) -c $< -o $@
+
+$(OUTDIR)/%.o: $(this_srcdir)/%.S | ${DEPS}
+	$(CC) $(ASFLAGS) -c $< -o $@
+
+clean:
+	@rm -f $(OUTDIR)/_revision.c
+	@if [ -d $(OUTDIR) ]; then \
+		cd $(OUTDIR); \
+		rm -f *.o *.d *.gcda *.gcno ${TARGLIB}.*; \
+		cd -; \
+	fi
+
+
+#ifeq prevents the deps from being included during clean
+#-include line is required to pull in auto-dependencies during 2nd pass
+ifeq ($(filter $(IGNORE_DEP_TARGETS), $(MAKECMDGOALS)),)
+-include ${DEPS}
+endif
diff --git a/opa/opa_debug.c b/opa/opa_debug.c
new file mode 100644
index 0000000..990441b
--- /dev/null
+++ b/opa/opa_debug.c
@@ -0,0 +1,371 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved.
*/
+
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <ucontext.h>
+#include <execinfo.h>
+#include "opa_user.h"
+#include "../psm_log.h"
+
+unsigned hfi_debug = 1;
+char *__hfi_mylabel = NULL;
+FILE *__hfi_dbgout;
+static void init_hfi_mylabel(void) __attribute__ ((constructor));
+static void init_hfi_backtrace(void) __attribute__ ((constructor));
+static void init_hfi_dbgfile(void) __attribute__ ((constructor));
+static void fini_hfi_backtrace(void) __attribute__ ((destructor));
+static void fini_hfi_mylabel(void) __attribute__ ((destructor));
+static struct sigaction SIGSEGV_old_act;
+static struct sigaction SIGBUS_old_act;
+static struct sigaction SIGILL_old_act;
+static struct sigaction SIGABRT_old_act;
+static struct sigaction SIGINT_old_act;
+static struct sigaction SIGTERM_old_act;
+#ifdef HFI_BRAKE_DEBUG
+static void hfi_brake_debug(void) __attribute__ ((constructor));
+
+/*
+   How to use the hfi_brake_debug code:
+
+   1. Build psm with HFI_BRAKE_DEBUG set in the environment.
+   2. Create a script for your test case (e.g. mpistress?).  In the script
+      make sure to choose an HFI brake file that corresponds to a network
+      file system that is common to all hosts where you will run your code.
+      Also, in the script, make sure to propagate the "HFI_BRAKE_FILE_NAME"
+      env var to all hosts.
+   3. Bring up 3 putty sessions to one of the hosts that your script uses.
+   4. In putty session number 1, touch the HFI_BRAKE_FILE and sync.
+   5. In putty session number 1, start the script.  You should see messages
+      of the form:
+-bash-4.2$ ./mpistress.0304.sc
+<hostname>:5716 remove the file: "/nfs/user/HFI_BRAKE" to continue
+<hostname>:5717 remove the file: "/nfs/user/HFI_BRAKE" to continue
+<hostname>:3456 remove the file: "/nfs/user/HFI_BRAKE" to continue
+<hostname>:3457 remove the file: "/nfs/user/HFI_BRAKE" to continue
+
+   Note that the hostname and process id are shown for all of the processes that are started
+   by your script.
+   6. In putty session 2, bring up gdb, and debug the program that is referenced in your script.
+      For example: /usr/mpi/gcc/openmpi-1.10.2-hfi/tests/intel/mpi_stress
+   7. In putty session 2 / gdb, attach to one of the processes that is shown in putty session 1.
+   8. Note, at this point, you have only one gdb session.  I leave it as an exercise to the reader to
+      determine how to bring up multiple gdb sessions.
+   9. In putty session 3, rm the HFI_BRAKE_FILE.
+   10. You are now debugging a live session of psm.
+ */
+
+static void hfi_brake_debug(void)
+{
+	struct stat buff;
+	char hostname[80];
+	const char *hfi_brake_file_name = getenv("HFI_BRAKE_FILE_NAME");
+	gethostname(hostname, 80);
+	hostname[sizeof(hostname) - 1] = '\0';
+
+	if (!hfi_brake_file_name)
+		hfi_brake_file_name = "/tmp/HFI_BRAKE_FILE";
+	printf("%s:%d remove the file: \"%s\" to continue\n",hostname,getpid(),hfi_brake_file_name);
+	while (0 == stat(hfi_brake_file_name, &buff))
+	{
+		printf("%s:%d remove the file: \"%s\" to continue\n",hostname,getpid(),hfi_brake_file_name);
+		sleep(10);
+	}
+	printf("%s:%d continuing.\n",hostname,getpid());
+}
+#endif
+
+static void init_hfi_mylabel(void)
+{
+	char lbl[1024];
+	char hostname[80];
+	char *e;
+	/* By default, try to come up with a decent default label, it will be
+	 * overridden later.  Try getting rank, if that's not available revert to
+	 * pid.
*/ + gethostname(hostname, 80); + lbl[0] = '\0'; + hostname[sizeof(hostname) - 1] = '\0'; + if ((((e = getenv("PSC_MPI_RANK")) && *e)) || + (((e = getenv("MPI_RANKID")) && *e)) || + (((e = getenv("MPIRUN_RANK")) && *e))) { + char *ep; + unsigned long val; + val = strtoul(e, &ep, 10); + if (ep != e) /* valid conversion */ + snprintf(lbl, 1024, "%s.%lu", hostname, val); + } + if (lbl[0] == '\0') + snprintf(lbl, 1024, "%s.%u", hostname, getpid()); + __hfi_mylabel = strdup(lbl); +} + +static void fini_hfi_mylabel(void) +{ + if(__hfi_mylabel != NULL) + free(__hfi_mylabel); +} + +/* FIXME: This signal handler does not conform to the posix standards described + in 'man 7 signal' due to it calling unsafe functions. + + See 'CALLS UNSAFE FUNCTION' notes below for examples. + */ +static void hfi_sighdlr(int sig, siginfo_t *p1, void *ucv) +{ + /* we make these static to try and avoid issues caused + by stack overflow that might have gotten us here. */ + static void *backaddr[128]; /* avoid stack usage */ + static char buf[150], hname[64], fname[128]; + static int i, j, fd, id; + extern char *__progname; + PSM2_LOG_DECLARE_BT_BUFFER(); + + /* CALLS UNSAFE FUNCTION when PSM_LOG is defined. */ + PSM2_LOG_BT(100,__FUNCTION__); + /* If this is a SIGINT do not display backtrace. Just invoke exit + handlers */ + if ((sig == SIGINT) || (sig == SIGTERM)) + /* CALLS UNSAFE FUNCTION (exit) */ + exit(1); + + /* CALLS UNSAFE FUNCTION (snprintf) */ + id = snprintf(buf, sizeof(buf), + "\n%.60s:%u terminated with signal %d", __progname, + getpid(), sig); + if (ucv) { + static ucontext_t *uc; + uc = (ucontext_t *) ucv; + id += snprintf(buf + id, sizeof(buf) - id, " at PC=%lx SP=%lx", +#if defined(__x86_64__) + (unsigned long)uc->uc_mcontext.gregs[REG_RIP], + (unsigned long)uc->uc_mcontext.gregs[REG_RSP]); +#elif defined(__i386__) + (unsigned long)uc->uc_mcontext.gregs[REG_EIP], + (unsigned long)uc->uc_mcontext.gregs[REG_ESP]); +#else + 0ul, 0ul); +#warning No stack pointer or instruction pointer for this arch +#endif + } + id += snprintf(buf + id, sizeof(buf) - id, ". Backtrace:\n"); + /* CALLS UNSAFE FUNCTION (fprintf) */ + fprintf(stderr, "%.*s", id, buf); + + i = backtrace(backaddr, sizeof(backaddr) / sizeof(backaddr[0])); + if (i > 2) /* skip ourselves and backtrace */ + j = 2, i -= j; + else + j = 0; + + backtrace_symbols_fd(backaddr + j, i, 2); + (void)fsync(2); + + /* Try to write it to a file as well, in case the rest doesn't make it + out. Do it second, in case we get a second failure (more likely). + We might eventually want to print some more of the registers to the + btr file, to aid debugging, but not for now. Truncate the program + name if overly long, so we always get pid and (at least part of) + hostname. 
*/
+	/* CALLS UNSAFE FUNCTION (gethostname) */
+	(void)gethostname(hname, sizeof(hname));
+	hname[sizeof(hname) - 1] = '\0';
+	snprintf(fname, sizeof(fname), "%.80s-%u,%.32s.btr", __progname,
+		 getpid(), hname);
+	if ((fd = open(fname, O_CREAT | O_WRONLY, 0644)) >= 0) {
+		/* CALLS UNSAFE FUNCTION (fdopen) */
+		FILE *fp = fdopen(fd, "w");
+		if (fp)
+			fprintf(fp, "%.*s", id, buf);
+		backtrace_symbols_fd(backaddr + j, i, fd);
+		if (fp)
+			/* CALLS UNSAFE FUNCTION (fclose) */
+			fclose(fp);
+	}
+	switch (sig){
+	case SIGSEGV:
+		(*SIGSEGV_old_act.sa_sigaction)(sig,p1,ucv);
+		break;
+	case SIGBUS:
+		(*SIGBUS_old_act.sa_sigaction)(sig,p1,ucv);
+		break;
+	case SIGILL:
+		(*SIGILL_old_act.sa_sigaction)(sig,p1,ucv);
+		break;
+	case SIGABRT:
+		(*SIGABRT_old_act.sa_sigaction)(sig,p1,ucv);
+		break;
+	default:
+		break;
+	}
+	exit(1);		/* not _exit(), want atexit handlers to get run */
+}
+
+/* We do this as a constructor so any user program that sets signal handlers
+   for these will override our settings, but we still get backtraces if they
+   don't.
+*/
+static void init_hfi_backtrace(void)
+{
+	/* we need to track memory corruption */
+	static struct sigaction act;	/* easier than memset */
+	act.sa_sigaction = hfi_sighdlr;
+	act.sa_flags = SA_SIGINFO;
+
+	if (getenv("HFI_BACKTRACE")) {
+		/* the handlers are only installed when HFI_BACKTRACE is set;
+		   leaving it unset is the permanent, although probably
+		   undocumented, way to disable backtraces. */
+		(void)sigaction(SIGSEGV, &act, &SIGSEGV_old_act);
+		(void)sigaction(SIGBUS, &act, &SIGBUS_old_act);
+		(void)sigaction(SIGILL, &act, &SIGILL_old_act);
+		(void)sigaction(SIGABRT, &act, &SIGABRT_old_act);
+		(void)sigaction(SIGINT, &act, &SIGINT_old_act);
+		(void)sigaction(SIGTERM, &act, &SIGTERM_old_act);
+	}
+}
+
+/* if HFI_DEBUG_FILENAME is set in the environment, then all the
+   debug prints (not info and error) will go to that file.
+   %h is expanded to the hostname, and %p to the pid, if present.
+   For example, HFI_DEBUG_FILENAME=/tmp/psm.%h.%p would write to
+   /tmp/psm.node01.12345 on host "node01" in pid 12345.
*/ +static void init_hfi_dbgfile(void) +{ + char *fname = getenv("HFI_DEBUG_FILENAME"); + char *exph, *expp, tbuf[1024]; + FILE *newf; + + if (!fname) { + __hfi_dbgout = stdout; + return; + } + exph = strstr(fname, "%h"); /* hostname */ + expp = strstr(fname, "%p"); /* pid */ + if (exph || expp) { + int baselen; + char hname[256], pid[12]; + if (exph) { + *hname = hname[sizeof(hname) - 1] = 0; + gethostname(hname, sizeof(hname) - 1); + if (!*hname) + strcpy(hname, "[unknown]"); + } + if (expp) + snprintf(pid, sizeof(pid), "%d", getpid()); + if (exph && expp) { + if (exph < expp) { + baselen = exph - fname; + snprintf(tbuf, sizeof(tbuf), "%.*s%s%.*s%s%s", + baselen, fname, hname, + (int)(expp - (exph + 2)), exph + 2, + pid, expp + 2); + } else { + baselen = expp - fname; + snprintf(tbuf, sizeof(tbuf), "%.*s%s%.*s%s%s", + baselen, fname, pid, + (int)(exph - (expp + 2)), expp + 2, + hname, exph + 2); + } + } else if (exph) { + baselen = exph - fname; + snprintf(tbuf, sizeof(tbuf), "%.*s%s%s", + baselen, fname, hname, exph + 2); + } else { + baselen = expp - fname; + snprintf(tbuf, sizeof(tbuf), "%.*s%s%s", + baselen, fname, pid, expp + 2); + } + fname = tbuf; + } + newf = fopen(fname, "a"); + if (!newf) { + _HFI_ERROR + ("Unable to open \"%s\" for debug output, using stdout: %s\n", + fname, strerror(errno)); + __hfi_dbgout = stdout; + } else { + __hfi_dbgout = newf; + setlinebuf(__hfi_dbgout); + } +} + +void hfi_set_mylabel(char *label) +{ + __hfi_mylabel = label; +} + +char *hfi_get_mylabel() +{ + return __hfi_mylabel; +} + +static void fini_hfi_backtrace(void) +{ + if (getenv("HFI_BACKTRACE")) { + (void)sigaction(SIGSEGV, &SIGSEGV_old_act, NULL); + (void)sigaction(SIGBUS, &SIGBUS_old_act, NULL); + (void)sigaction(SIGILL, &SIGILL_old_act, NULL); + (void)sigaction(SIGABRT, &SIGABRT_old_act, NULL); + (void)sigaction(SIGINT, &SIGINT_old_act, NULL); + (void)sigaction(SIGTERM, &SIGTERM_old_act, NULL); + } +} diff --git a/opa/opa_dwordcpy-generic.c b/opa/opa_dwordcpy-generic.c new file mode 100644 index 0000000..dfb7755 --- /dev/null +++ b/opa/opa_dwordcpy-generic.c @@ -0,0 +1,317 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <stdint.h>
+#include <immintrin.h>		/* __m128i/__m256i/__m512i block copies below */
+#include "opa_intf.h"
+#include "psm_user.h"
+
+#if defined(__x86_64__)
+#define hfi_dwordcpy hfi_dwordcpy_safe
+#define hfi_qwordcpy hfi_qwordcpy_safe
+#endif
+
+void hfi_dwordcpy(volatile uint32_t *dest, const uint32_t *src, uint32_t ndwords)
+{
+	uint_fast32_t ndw = ndwords;
+	const uint64_t *src64[4];
+	volatile uint64_t *dst64[4];
+	src64[0] = (const uint64_t *) src;
+	dst64[0] = (volatile uint64_t *) dest;
+
+	while (ndw >= 8) {
+		*dst64[0] = *src64[0];
+		src64[1] = src64[0] + 1;
+		src64[2] = src64[0] + 2;
+		src64[3] = src64[0] + 3;
+		ndw -= 8;
+		dst64[1] = dst64[0] + 1;
+		dst64[2] = dst64[0] + 2;
+		dst64[3] = dst64[0] + 3;
+		*dst64[1] = *src64[1];
+		*dst64[2] = *src64[2];
+		*dst64[3] = *src64[3];
+		src64[0] += 4;
+		dst64[0] += 4;
+	}
+	if (ndw) {
+		src = (const uint32_t *) src64[0];
+		dest = (volatile uint32_t *) dst64[0];
+
+		switch (ndw) {
+		case 7:
+			*dest++ = *src++;
+		case 6:
+			*dest++ = *src++;
+		case 5:
+			*dest++ = *src++;
+		case 4:
+			*dest++ = *src++;
+		case 3:
+			*dest++ = *src++;
+		case 2:
+			*dest++ = *src++;
+		case 1:
+			*dest++ = *src++;
+		}
+
+	}
+}
+
+void hfi_qwordcpy(volatile uint64_t *dest, const uint64_t *src, uint32_t nqwords)
+{
+	uint_fast32_t nqw = nqwords;
+	const uint64_t *src64[4];
+	volatile uint64_t *dst64[4];
+	src64[0] = src;
+	dst64[0] = dest;
+
+	while (nqw >= 8) {
+		*dst64[0] = *src64[0];
+		src64[1] = src64[0] + 1;
+		src64[2] = src64[0] + 2;
+		src64[3] = src64[0] + 3;
+		dst64[1] = dst64[0] + 1;
+		dst64[2] = dst64[0] + 2;
+		dst64[3] = dst64[0] + 3;
+		*dst64[1] = *src64[1];
+		*dst64[2] = *src64[2];
+		*dst64[3] = *src64[3];
+		src64[0] += 4;
+		dst64[0] += 4;
+
+		*dst64[0] = *src64[0];
+		src64[1] = src64[0] + 1;
+		src64[2] = src64[0] + 2;
+		src64[3] = src64[0] + 3;
+		dst64[1] = dst64[0] + 1;
+		dst64[2] = dst64[0] + 2;
+		dst64[3] = dst64[0] + 3;
+		*dst64[1] = *src64[1];
+		*dst64[2] = *src64[2];
+		*dst64[3] = *src64[3];
+		src64[0] += 4;
+		dst64[0] += 4;
+
+		nqw -= 8;
+	}
+	if (nqw) {
+		switch (nqw) {
+		case 7:
+			*(dst64[0])++ = *(src64[0])++;
+		case 6:
+			*(dst64[0])++ = *(src64[0])++;
+		case 5:
+			*(dst64[0])++ = *(src64[0])++;
+		case 4:
+			*(dst64[0])++ = *(src64[0])++;
+		case 3:
+			*(dst64[0])++ = *(src64[0])++;
+		case 2:
+			*(dst64[0])++ = *(src64[0])++;
+		case 1:
+			*(dst64[0])++ = *(src64[0])++;
+		}
+	}
+}
+
+#ifdef PSM_AVX512
+void hfi_pio_blockcpy_512(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock)
+{
+	volatile __m512i *dp = (volatile __m512i *) dest;
+	const __m512i *sp = (const __m512i
*) src; + + psmi_assert((dp != NULL) && (sp != NULL)); + psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0); + + if ((((uintptr_t) sp) & 0x3f) == 0x0) { + /* source and destination are both 64 byte aligned */ + do { + __m512i tmp0 = _mm512_load_si512(sp); + _mm512_store_si512((__m512i *)dp, tmp0); + } while ((--nblock) && (++dp) && (++sp)); + } else { + /* only destination is 64 byte aligned - use unaligned loads */ + do { + __m512i tmp0 = _mm512_loadu_si512(sp); + _mm512_store_si512((__m512i *)dp, tmp0); + } while ((--nblock) && (++dp) && (++sp)); + } +} +#endif + +void hfi_pio_blockcpy_256(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock) +{ + volatile __m256i *dp = (volatile __m256i *) dest; + const __m256i *sp = (const __m256i *) src; + + psmi_assert((dp != NULL) && (sp != NULL)); + psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0); + + if ((((uintptr_t) sp) & 0x1f) == 0x0) { + /* source and destination are both 32 byte aligned */ + do { + __m256i tmp0 = _mm256_load_si256(sp); + __m256i tmp1 = _mm256_load_si256(sp + 1); + _mm256_store_si256((__m256i *)dp, tmp0); + _mm256_store_si256((__m256i *)(dp + 1), tmp1); + } while ((--nblock) && (dp = dp+2) && (sp = sp+2)); + } else { + /* only destination is 32 byte aligned - use unaligned loads */ + do { + __m256i tmp0 = _mm256_loadu_si256(sp); + __m256i tmp1 = _mm256_loadu_si256(sp + 1); + _mm256_store_si256((__m256i *)dp, tmp0); + _mm256_store_si256((__m256i *)(dp + 1), tmp1); + } while ((--nblock) && (dp = dp+2) && (sp = sp+2)); + } +} + +void hfi_pio_blockcpy_128(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock) +{ + volatile __m128i *dp = (volatile __m128i *) dest; + const __m128i *sp = (const __m128i *) src; + + psmi_assert((dp != NULL) && (sp != NULL)); + psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0); + + if ((((uintptr_t) sp) & 0xf) == 0x0) { + /* source and destination are both 16 byte aligned */ + do { + __m128i tmp0 = _mm_load_si128(sp); + __m128i tmp1 = _mm_load_si128(sp + 1); + __m128i tmp2 = _mm_load_si128(sp + 2); + __m128i tmp3 = _mm_load_si128(sp + 3); + _mm_store_si128((__m128i *)dp, tmp0); + _mm_store_si128((__m128i *)(dp + 1), tmp1); + _mm_store_si128((__m128i *)(dp + 2), tmp2); + _mm_store_si128((__m128i *)(dp + 3), tmp3); + } while ((--nblock) && (dp = dp+4) && (sp = sp+4)); + } else { + /* only destination is 16 byte aligned - use unaligned loads */ + do { + __m128i tmp0 = _mm_loadu_si128(sp); + __m128i tmp1 = _mm_loadu_si128(sp + 1); + __m128i tmp2 = _mm_loadu_si128(sp + 2); + __m128i tmp3 = _mm_loadu_si128(sp + 3); + _mm_store_si128((__m128i *)dp, tmp0); + _mm_store_si128((__m128i *)(dp + 1), tmp1); + _mm_store_si128((__m128i *)(dp + 2), tmp2); + _mm_store_si128((__m128i *)(dp + 3), tmp3); + } while ((--nblock) && (dp = dp+4) && (sp = sp+4)); + } +} + +void hfi_pio_blockcpy_64(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock) +{ + const uint64_t *src64[4]; + volatile uint64_t *dst64[4]; + src64[0] = src; + dst64[0] = dest; + + psmi_assert((dst64[0] != NULL) && (src64[0] != NULL)); + psmi_assert((((uintptr_t) dest) & 0x3f) == 0x0); + + do { + *dst64[0] = *src64[0]; + src64[1] = src64[0] + 1; + src64[2] = src64[0] + 2; + src64[3] = src64[0] + 3; + dst64[1] = dst64[0] + 1; + dst64[2] = dst64[0] + 2; + dst64[3] = dst64[0] + 3; + *dst64[1] = *src64[1]; + *dst64[2] = *src64[2]; + *dst64[3] = *src64[3]; + src64[0] += 4; + dst64[0] += 4; + + *dst64[0] = *src64[0]; + src64[1] = src64[0] + 1; + src64[2] = src64[0] + 2; + src64[3] = src64[0] + 3; + dst64[1] = dst64[0] + 1; + dst64[2] = dst64[0] + 2; + 
dst64[3] = dst64[0] + 3; + *dst64[1] = *src64[1]; + *dst64[2] = *src64[2]; + *dst64[3] = *src64[3]; + src64[0] += 4; + dst64[0] += 4; + } while (--nblock); +} + +void MOCKABLE(psmi_mq_mtucpy)(void *vdest, const void *vsrc, uint32_t nchars) +{ + +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED && (PSMI_IS_CUDA_MEM(vdest) || PSMI_IS_CUDA_MEM((void *) vsrc))) { + PSMI_CUDA_CALL(cuMemcpy, + (CUdeviceptr)vdest, (CUdeviceptr)vsrc, nchars); + return; + } +#endif + memcpy(vdest, vsrc, nchars); + return; + + +} +MOCK_DEF_EPILOGUE(psmi_mq_mtucpy); + +void psmi_mq_mtucpy_host_mem(void *vdest, const void *vsrc, uint32_t nchars) +{ + memcpy(vdest, vsrc, nchars); + return; +} diff --git a/opa/opa_dwordcpy-i386.S b/opa/opa_dwordcpy-i386.S new file mode 100644 index 0000000..f3d898d --- /dev/null +++ b/opa/opa_dwordcpy-i386.S @@ -0,0 +1,84 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/
+
+	.globl hfi_dwordcpy
+	.file	"opa_dword32cpy.S"
+	.text
+	.p2align 4,,15
+hfi_dwordcpy:
+	// standard C calling convention, args on stack
+	// does not return any value
+	.type hfi_dwordcpy, @function
+	// save callee-saved regs (%edi/%esi must be preserved in cdecl;
+	// %eax/%edx are scratch, so they can hold the old values)
+	mov %edi,%eax
+	mov %esi,%edx
+
+	// setup regs
+	mov 0xc(%esp,1),%ecx
+	mov 0x4(%esp,1),%edi
+	mov 0x8(%esp,1),%esi
+	// and do it
+	cld
+	rep
+	movsd
+
+	// restore
+	mov %eax,%edi
+	mov %edx,%esi
+	ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/opa/opa_dwordcpy-x86_64-fast.S b/opa/opa_dwordcpy-x86_64-fast.S
new file mode 100644
index 0000000..fe07ebf
--- /dev/null
+++ b/opa/opa_dwordcpy-x86_64-fast.S
@@ -0,0 +1,77 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved.
*/ + + .globl hfi_dwordcpy + .file "opa_dwordcpy-x86_64-fast.S" + .text + .p2align 4,,15 + // standard C calling convention, rdi is dest, rsi is source, rdx is count + // does not return any value +hfi_dwordcpy: + .type hfi_dwordcpy, @function + movl %edx,%ecx + shrl $1,%ecx + andl $1,%edx + cld + rep + movsq + movl %edx,%ecx + rep + movsd + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/opa/opa_dwordcpy-x86_64.c b/opa/opa_dwordcpy-x86_64.c new file mode 100644 index 0000000..dfb7755 --- /dev/null +++ b/opa/opa_dwordcpy-x86_64.c @@ -0,0 +1,317 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/
+
+#include <stdint.h>
+#include <immintrin.h>		/* __m128i/__m256i/__m512i block copies below */
+#include "opa_intf.h"
+#include "psm_user.h"
+
+#if defined(__x86_64__)
+#define hfi_dwordcpy hfi_dwordcpy_safe
+#define hfi_qwordcpy hfi_qwordcpy_safe
+#endif
+
+void hfi_dwordcpy(volatile uint32_t *dest, const uint32_t *src, uint32_t ndwords)
+{
+	uint_fast32_t ndw = ndwords;
+	const uint64_t *src64[4];
+	volatile uint64_t *dst64[4];
+	src64[0] = (const uint64_t *) src;
+	dst64[0] = (volatile uint64_t *) dest;
+
+	while (ndw >= 8) {
+		*dst64[0] = *src64[0];
+		src64[1] = src64[0] + 1;
+		src64[2] = src64[0] + 2;
+		src64[3] = src64[0] + 3;
+		ndw -= 8;
+		dst64[1] = dst64[0] + 1;
+		dst64[2] = dst64[0] + 2;
+		dst64[3] = dst64[0] + 3;
+		*dst64[1] = *src64[1];
+		*dst64[2] = *src64[2];
+		*dst64[3] = *src64[3];
+		src64[0] += 4;
+		dst64[0] += 4;
+	}
+	if (ndw) {
+		src = (const uint32_t *) src64[0];
+		dest = (volatile uint32_t *) dst64[0];
+
+		switch (ndw) {
+		case 7:
+			*dest++ = *src++;
+		case 6:
+			*dest++ = *src++;
+		case 5:
+			*dest++ = *src++;
+		case 4:
+			*dest++ = *src++;
+		case 3:
+			*dest++ = *src++;
+		case 2:
+			*dest++ = *src++;
+		case 1:
+			*dest++ = *src++;
+		}
+
+	}
+}
+
+void hfi_qwordcpy(volatile uint64_t *dest, const uint64_t *src, uint32_t nqwords)
+{
+	uint_fast32_t nqw = nqwords;
+	const uint64_t *src64[4];
+	volatile uint64_t *dst64[4];
+	src64[0] = src;
+	dst64[0] = dest;
+
+	while (nqw >= 8) {
+		*dst64[0] = *src64[0];
+		src64[1] = src64[0] + 1;
+		src64[2] = src64[0] + 2;
+		src64[3] = src64[0] + 3;
+		dst64[1] = dst64[0] + 1;
+		dst64[2] = dst64[0] + 2;
+		dst64[3] = dst64[0] + 3;
+		*dst64[1] = *src64[1];
+		*dst64[2] = *src64[2];
+		*dst64[3] = *src64[3];
+		src64[0] += 4;
+		dst64[0] += 4;
+
+		*dst64[0] = *src64[0];
+		src64[1] = src64[0] + 1;
+		src64[2] = src64[0] + 2;
+		src64[3] = src64[0] + 3;
+		dst64[1] = dst64[0] + 1;
+		dst64[2] = dst64[0] + 2;
+		dst64[3] = dst64[0] + 3;
+		*dst64[1] = *src64[1];
+		*dst64[2] = *src64[2];
+		*dst64[3] = *src64[3];
+		src64[0] += 4;
+		dst64[0] += 4;
+
+		nqw -= 8;
+	}
+	if (nqw) {
+		switch (nqw) {
+		case 7:
+			*(dst64[0])++ = *(src64[0])++;
+		case 6:
+			*(dst64[0])++ = *(src64[0])++;
+		case 5:
+			*(dst64[0])++ = *(src64[0])++;
+		case 4:
+			*(dst64[0])++ = *(src64[0])++;
+		case 3:
+			*(dst64[0])++ = *(src64[0])++;
+		case 2:
+			*(dst64[0])++ = *(src64[0])++;
+		case 1:
+			*(dst64[0])++ = *(src64[0])++;
+		}
+	}
+}
+
+#ifdef PSM_AVX512
+void hfi_pio_blockcpy_512(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock)
+{
+	volatile __m512i *dp = (volatile __m512i *) dest;
+	const __m512i *sp = (const __m512i *) src;
+
+	psmi_assert((dp != NULL) && (sp != NULL));
+	psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0);
+
+	if ((((uintptr_t) sp) & 0x3f) == 0x0) {
+		/* source and destination are both 64 byte aligned */
+		do {
+			__m512i tmp0 = _mm512_load_si512(sp);
+			_mm512_store_si512((__m512i *)dp, tmp0);
+		} while ((--nblock) && (++dp) && (++sp));
+	} else {
+		/* only destination is 64 byte aligned - use unaligned loads */
+		do {
+			__m512i tmp0 = _mm512_loadu_si512(sp);
+			_mm512_store_si512((__m512i *)dp, tmp0);
+		} while ((--nblock) && (++dp) && (++sp));
+	}
+}
+#endif
+
+void hfi_pio_blockcpy_256(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock)
+{
+	volatile __m256i *dp = (volatile __m256i *) dest;
+	const __m256i *sp = (const __m256i *) src;
+
+	psmi_assert((dp != NULL) && (sp != NULL));
+	psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0);
+
+	if ((((uintptr_t) sp) & 0x1f) == 0x0) {
+		/* source and destination are both 32 byte aligned */
+		do {
+			__m256i tmp0 = _mm256_load_si256(sp);
+			__m256i tmp1 = _mm256_load_si256(sp + 1);
+
_mm256_store_si256((__m256i *)dp, tmp0); + _mm256_store_si256((__m256i *)(dp + 1), tmp1); + } while ((--nblock) && (dp = dp+2) && (sp = sp+2)); + } else { + /* only destination is 32 byte aligned - use unaligned loads */ + do { + __m256i tmp0 = _mm256_loadu_si256(sp); + __m256i tmp1 = _mm256_loadu_si256(sp + 1); + _mm256_store_si256((__m256i *)dp, tmp0); + _mm256_store_si256((__m256i *)(dp + 1), tmp1); + } while ((--nblock) && (dp = dp+2) && (sp = sp+2)); + } +} + +void hfi_pio_blockcpy_128(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock) +{ + volatile __m128i *dp = (volatile __m128i *) dest; + const __m128i *sp = (const __m128i *) src; + + psmi_assert((dp != NULL) && (sp != NULL)); + psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0); + + if ((((uintptr_t) sp) & 0xf) == 0x0) { + /* source and destination are both 16 byte aligned */ + do { + __m128i tmp0 = _mm_load_si128(sp); + __m128i tmp1 = _mm_load_si128(sp + 1); + __m128i tmp2 = _mm_load_si128(sp + 2); + __m128i tmp3 = _mm_load_si128(sp + 3); + _mm_store_si128((__m128i *)dp, tmp0); + _mm_store_si128((__m128i *)(dp + 1), tmp1); + _mm_store_si128((__m128i *)(dp + 2), tmp2); + _mm_store_si128((__m128i *)(dp + 3), tmp3); + } while ((--nblock) && (dp = dp+4) && (sp = sp+4)); + } else { + /* only destination is 16 byte aligned - use unaligned loads */ + do { + __m128i tmp0 = _mm_loadu_si128(sp); + __m128i tmp1 = _mm_loadu_si128(sp + 1); + __m128i tmp2 = _mm_loadu_si128(sp + 2); + __m128i tmp3 = _mm_loadu_si128(sp + 3); + _mm_store_si128((__m128i *)dp, tmp0); + _mm_store_si128((__m128i *)(dp + 1), tmp1); + _mm_store_si128((__m128i *)(dp + 2), tmp2); + _mm_store_si128((__m128i *)(dp + 3), tmp3); + } while ((--nblock) && (dp = dp+4) && (sp = sp+4)); + } +} + +void hfi_pio_blockcpy_64(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock) +{ + const uint64_t *src64[4]; + volatile uint64_t *dst64[4]; + src64[0] = src; + dst64[0] = dest; + + psmi_assert((dst64[0] != NULL) && (src64[0] != NULL)); + psmi_assert((((uintptr_t) dest) & 0x3f) == 0x0); + + do { + *dst64[0] = *src64[0]; + src64[1] = src64[0] + 1; + src64[2] = src64[0] + 2; + src64[3] = src64[0] + 3; + dst64[1] = dst64[0] + 1; + dst64[2] = dst64[0] + 2; + dst64[3] = dst64[0] + 3; + *dst64[1] = *src64[1]; + *dst64[2] = *src64[2]; + *dst64[3] = *src64[3]; + src64[0] += 4; + dst64[0] += 4; + + *dst64[0] = *src64[0]; + src64[1] = src64[0] + 1; + src64[2] = src64[0] + 2; + src64[3] = src64[0] + 3; + dst64[1] = dst64[0] + 1; + dst64[2] = dst64[0] + 2; + dst64[3] = dst64[0] + 3; + *dst64[1] = *src64[1]; + *dst64[2] = *src64[2]; + *dst64[3] = *src64[3]; + src64[0] += 4; + dst64[0] += 4; + } while (--nblock); +} + +void MOCKABLE(psmi_mq_mtucpy)(void *vdest, const void *vsrc, uint32_t nchars) +{ + +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED && (PSMI_IS_CUDA_MEM(vdest) || PSMI_IS_CUDA_MEM((void *) vsrc))) { + PSMI_CUDA_CALL(cuMemcpy, + (CUdeviceptr)vdest, (CUdeviceptr)vsrc, nchars); + return; + } +#endif + memcpy(vdest, vsrc, nchars); + return; + + +} +MOCK_DEF_EPILOGUE(psmi_mq_mtucpy); + +void psmi_mq_mtucpy_host_mem(void *vdest, const void *vsrc, uint32_t nchars) +{ + memcpy(vdest, vsrc, nchars); + return; +} diff --git a/opa/opa_service.c b/opa/opa_service.c new file mode 100644 index 0000000..9bb355b --- /dev/null +++ b/opa/opa_service.c @@ -0,0 +1,135 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. 
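The psmi_assert calls in the block-copy routines above encode their contract: the destination (typically a mapped PIO buffer) must be 64-byte aligned, and every variant moves one 64-byte block per unit of nblock (8 qwords, 4 __m128i, 2 __m256i, or 1 __m512i at a time). A hedged caller sketch under that reading; the wrapper name is invented for illustration:

#include <stdint.h>
#include <stddef.h>

extern void hfi_pio_blockcpy_64(volatile uint64_t *dest,
				const uint64_t *src, uint32_t nblock);

/* copy nbytes (assumed to be a multiple of 64) to a 64-byte-aligned
 * destination; one nblock unit covers one 64-byte block */
static void example_pio_copy(volatile void *pio_dst, const void *src,
			     size_t nbytes)
{
	hfi_pio_blockcpy_64((volatile uint64_t *)pio_dst,
			    (const uint64_t *)src,
			    (uint32_t)(nbytes / 64));
}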
+ + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* This file contains hfi service routine interface used by the low + level hfi protocol code. 
*/
+
+#include "opa_service.h"
+#include "psmi_wrappers.h"
+
+/* These have been fixed to read the values, but they are not
+ * compatible with the hfi driver; they return new info with
+ * the qib driver.
+ */
+static int hfi_count_names(const char *namep)
+{
+	int n = 0;
+	while (*namep != '\0') {
+		if (*namep == '\n')
+			n++;
+		namep++;
+	}
+	return n;
+}
+
+int hfi_get_ctrs_unit_names(int unitno, char **namep)
+{
+	int i;
+	i = hfi_hfifs_unit_read(unitno, "counter_names", namep);
+	if (i < 0)
+		return -1;
+	else
+		return hfi_count_names(*namep);
+}
+
+int hfi_get_ctrs_unit(int unitno, uint64_t *c, int nelem)
+{
+	int i;
+	i = hfi_hfifs_unit_rd(unitno, "counters", c, nelem * sizeof(*c));
+	if (i < 0)
+		return -1;
+	else
+		return i / sizeof(*c);
+}
+
+int hfi_get_ctrs_port_names(int unitno, char **namep)
+{
+	int i;
+	i = hfi_hfifs_unit_read(unitno, "portcounter_names", namep);
+	if (i < 0)
+		return -1;
+	else
+		return hfi_count_names(*namep);
+}
+
+int hfi_get_ctrs_port(int unitno, int port, uint64_t *c, int nelem)
+{
+	int i;
+	char buf[32];
+	snprintf(buf, sizeof(buf), "port%dcounters", port);
+	i = hfi_hfifs_unit_rd(unitno, buf, c, nelem * sizeof(*c));
+	if (i < 0)
+		return -1;
+	else
+		return i / sizeof(*c);
+}
+
+int hfi_get_stats_names(char **namep)
+{
+	int i;
+	i = hfi_hfifs_read("driver_stats_names", namep);
+	if (i < 0)
+		return -1;
+	else
+		return hfi_count_names(*namep);
+}
+
+int hfi_get_stats(uint64_t *s, int nelem)
+{
+	int i;
+	i = hfi_hfifs_rd("driver_stats", s, nelem * sizeof(*s));
+	if (i < 0)
+		return -1;
+	else
+		return i / sizeof(*s);
+}
diff --git a/opa/opa_sysfs.c b/opa/opa_sysfs.c
new file mode 100644
index 0000000..91446ec
--- /dev/null
+++ b/opa/opa_sysfs.c
@@ -0,0 +1,589 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED.
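The counter helpers above share one convention: the *_names readers return the number of newline-separated names (the buffer is allocated for the caller), and the value readers return how many 64-bit counters were actually read, or -1 on error. A usage sketch under those assumptions; the function name is invented and error handling is trimmed:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include "opa_service.h"

static void example_dump_unit_counters(int unitno, int nelem)
{
	char *names = NULL;
	uint64_t *vals = calloc(nelem, sizeof(*vals));
	int nnames, nread;

	if (!vals)
		return;
	nnames = hfi_get_ctrs_unit_names(unitno, &names);
	nread = hfi_get_ctrs_unit(unitno, vals, nelem);
	if (nnames > 0 && nread > 0)
		printf("unit %d: %d names, %d counters read\n",
		       unitno, nnames, nread);
	free(names);	/* allocated by the read helper; NULL-safe on failure */
	free(vals);
}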
IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/* This file contains a simple sysfs interface used by the low level
+   hfi protocol code.  It also implements the interface to hfifs. */
+
+#include <sys/stat.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "opa_service.h"
+
+static char *sysfs_path;
+static size_t sysfs_path_len;
+static char *hfifs_path;
+static long sysfs_page_size;
+
+void sysfs_init(const char *dflt_hfi_class_path)
+{
+	if (NULL != (sysfs_path = getenv("HFI_SYSFS_PATH")))
+	{
+		char *syspath = strdup(sysfs_path);
+
+		if (!syspath)
+			_HFI_DBG("Failed to strdup(\"%s\") for syspath.\n",
+				 sysfs_path);
+		else
+			sysfs_path = syspath;
+	}
+	if (sysfs_path == NULL) {
+		unsigned len = strlen(dflt_hfi_class_path) + 4;
+		char *syspath = malloc(len);
+
+		if (!syspath)
+			_HFI_DBG("Failed to alloc %u bytes for syspath.\n",len);
+		else
+		{
+			snprintf(syspath, len, "%s_0", dflt_hfi_class_path);
+			sysfs_path = syspath;
+		}
+	}
+
+	if (sysfs_path != NULL) {
+		struct stat s;
+
+		if (stat(sysfs_path, &s) || !S_ISDIR(s.st_mode))
+		{
+			_HFI_DBG("Did not find sysfs directory %s, using anyway\n",
+				 sysfs_path);
+		}
+		else
+		{
+			/* Remove the unit number from the sysfs path: */
+			char *lastUS = strrchr(sysfs_path, '_');
+
+			if ((NULL != lastUS) && (isdigit(lastUS[1])))
+				lastUS[1] = 0;
+		}
+	}
+
+	if (sysfs_path != NULL)
+		sysfs_path_len = strlen(sysfs_path);
+
+	if (hfifs_path == NULL)
+		hfifs_path = getenv("HFI_HFIFS_PATH");
+	if (hfifs_path == NULL)
+		hfifs_path = "/hfifs";
+
+	if (!sysfs_page_size)
+		sysfs_page_size = sysconf(_SC_PAGESIZE);
+}
+
+const char *hfi_sysfs_path(void)
+{
+	return sysfs_path;
+}
+
+size_t hfi_sysfs_path_len(void)
+{
+	return sysfs_path_len;
+}
+
+const char *hfi_hfifs_path(void)
+{
+	return hfifs_path;
+}
+
+int hfi_hfifs_open(const char *attr, int flags)
+{
+	char buf[1024];
+	int saved_errno;
+	int fd;
+
+	snprintf(buf, sizeof(buf), "%s/%s", hfi_hfifs_path(), attr);
+	fd = open(buf, flags);
+	saved_errno = errno;
+
+	if (fd == -1) {
+		_HFI_DBG("Failed to open driver attribute '%s': %s\n", attr,
+			 strerror(errno));
+		_HFI_DBG("Offending file name: %s\n", buf);
+	}
+
+	errno = saved_errno;
+	return fd;
+}
+
+int hfi_sysfs_unit_open(uint32_t unit, const char *attr, int flags)
+{
+	int saved_errno;
+	char buf[1024];
+	int fd;
+
+	snprintf(buf, sizeof(buf), "%s%u/%s", hfi_sysfs_path(), unit, attr);
+	fd = open(buf, flags);
+	saved_errno = errno;
+
+	if (fd == -1) {
+		_HFI_DBG("Failed to open attribute '%s' of unit %d: %s\n", attr,
+			 unit, strerror(errno));
+		_HFI_DBG("Offending file name: %s\n", buf);
+	}
+
+	errno = saved_errno;
+	return fd;
+}
+
+static int hfi_sysfs_unit_open_for_node(uint32_t unit, int flags)
+{
+	int saved_errno;
+	char buf[1024];
+	int fd;
+
+	snprintf(buf, sizeof(buf), "%s%u/device/numa_node",
+		 hfi_sysfs_path(), unit);
+	fd = open(buf, flags);
+	saved_errno = errno;
+
+	if (fd == -1) {
+		_HFI_DBG("Failed to open attribute numa_node of unit %d: %s\n",
+			 unit,
strerror(errno)); + _HFI_DBG("Offending file name: %s\n", buf); + } + + errno = saved_errno; + return fd; +} + +int hfi_sysfs_port_open(uint32_t unit, uint32_t port, const char *attr, + int flags) +{ + int saved_errno; + char buf[1024]; + int fd; + + snprintf(buf, sizeof(buf), "%s%u/ports/%u/%s", hfi_sysfs_path(), + unit, port, attr); + fd = open(buf, flags); + saved_errno = errno; + + if (fd == -1) { + _HFI_DBG("Failed to open attribute '%s' of unit %d:%d: %s\n", + attr, unit, port, strerror(errno)); + _HFI_DBG("Offending file name: %s\n", buf); + } + + errno = saved_errno; + return fd; +} + +int hfi_hfifs_unit_open(uint32_t unit, const char *attr, int flags) +{ + int saved_errno; + char buf[1024]; + int fd; + + snprintf(buf, sizeof(buf), "%s/%u/%s", hfi_hfifs_path(), unit, attr); + fd = open(buf, flags); + saved_errno = errno; + + if (fd == -1) { + _HFI_DBG("Failed to open attribute '%s' of unit %d: %s\n", attr, + unit, strerror(errno)); + _HFI_DBG("Offending file name: %s\n", buf); + } + + errno = saved_errno; + return fd; +} + +static int read_page(int fd, char **datap) +{ + char *data = NULL; + int saved_errno; + int ret = -1; + + data = malloc(sysfs_page_size); + saved_errno = errno; + + if (!data) { + _HFI_DBG("Could not allocate memory: %s\n", strerror(errno)); + goto bail; + } + + ret = read(fd, data, sysfs_page_size); + saved_errno = errno; + + if (ret == -1) { + _HFI_DBG("Read of attribute failed: %s\n", strerror(errno)); + goto bail; + } + +bail: + if (ret == -1) { + free(data); + } else { + if (ret < sysfs_page_size) + data[ret] = 0; + else + data[sysfs_page_size-1] = 0; + *datap = data; + } + + errno = saved_errno; + return ret; +} + +/* + * On return, caller must free *datap. + */ +int hfi_sysfs_unit_read(uint32_t unit, const char *attr, char **datap) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = hfi_sysfs_unit_open(unit, attr, O_RDONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = read_page(fd, datap); + saved_errno = errno; + +bail: + if (ret == -1) + *datap = NULL; + + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +/* read a string value into buff, no more than size bytes. + returns the number of bytes read */ +size_t hfi_sysfs_unit_port_read(uint32_t unit, uint32_t port, const char *attr, + char *buff, size_t size) +{ + int fd = -1; + size_t rv = 0; + + fd = hfi_sysfs_port_open(unit, port, attr, O_RDONLY); + + if (fd == -1) + return rv; + + rv = read(fd, buff, size); + + close(fd); + + if (rv < size) + buff[rv] = 0; + else + buff[size-1] = 0; + + return rv; +} + +/* + * On return, caller must free *datap. + */ +int hfi_sysfs_port_read(uint32_t unit, uint32_t port, const char *attr, + char **datap) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = hfi_sysfs_port_open(unit, port, attr, O_RDONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = read_page(fd, datap); + saved_errno = errno; + +bail: + if (ret == -1) + *datap = NULL; + + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +/* + * On return, caller must free *datap. + */ +int hfi_hfifs_read(const char *attr, char **datap) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = hfi_hfifs_open(attr, O_RDONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = read_page(fd, datap); + saved_errno = errno; + +bail: + if (ret == -1) + *datap = NULL; + + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +/* + * On return, caller must free *datap. 
+ */ +int hfi_hfifs_unit_read(uint32_t unit, const char *attr, char **datap) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = hfi_hfifs_unit_open(unit, attr, O_RDONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = read_page(fd, datap); + saved_errno = errno; + +bail: + if (ret == -1) + *datap = NULL; + + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +/* + * The _rd routines jread directly into a supplied buffer, + * unlike the _read routines. + */ +int hfi_hfifs_rd(const char *attr, void *buf, int n) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = hfi_hfifs_open(attr, O_RDONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = read(fd, buf, n); + saved_errno = errno; + +bail: + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +int hfi_hfifs_unit_rd(uint32_t unit, const char *attr, void *buf, int n) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = hfi_hfifs_unit_open(unit, attr, O_RDONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = read(fd, buf, n); + saved_errno = errno; + +bail: + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +int hfi_sysfs_unit_read_s64(uint32_t unit, const char *attr, + int64_t *valp, int base) +{ + char *data=NULL, *end; + int saved_errno; + long long val; + int ret; + + ret = hfi_sysfs_unit_read(unit, attr, &data); + saved_errno = errno; + + if (ret == -1) { + goto bail; + } + + val = strtoll(data, &end, base); + saved_errno = errno; + + if (!*data || !(*end == '\0' || isspace(*end))) { + ret = -1; + goto bail; + } + + *valp = val; + ret = 0; + +bail: + if (data) + free(data); + errno = saved_errno; + return ret; +} + +static int hfi_sysfs_unit_read_node(uint32_t unit, char **datap) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = hfi_sysfs_unit_open_for_node(unit, O_RDONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = read_page(fd, datap); + if (ret == -1) + *datap = NULL; + + saved_errno = errno; + close(fd); +bail: + errno = saved_errno; + return ret; +} + +int64_t hfi_sysfs_unit_read_node_s64(uint32_t unit) +{ + char *data=NULL, *end; + int saved_errno; + long long val; + int64_t ret = -1; + + saved_errno = errno; + if (hfi_sysfs_unit_read_node(unit, &data) == -1) { + goto bail; + } + + val = strtoll(data, &end, 0); + saved_errno = errno; + + if (!*data || !(*end == '\0' || isspace(*end))) { + ret = -1; + goto bail; + } + + ret = (int64_t) val; +bail: + free(data); + errno = saved_errno; + return ret; +} + +int hfi_sysfs_port_read_s64(uint32_t unit, uint32_t port, const char *attr, + int64_t *valp, int base) +{ + char *data, *end; + int saved_errno; + long long val; + int ret; + + ret = hfi_sysfs_port_read(unit, port, attr, &data); + saved_errno = errno; + + if (ret == -1) { + goto bail; + } + + val = strtoll(data, &end, base); + saved_errno = errno; + + if (!*data || !(*end == '\0' || isspace(*end))) { + ret = -1; + goto bail; + } + + *valp = val; + ret = 0; + +bail: + free(data); + errno = saved_errno; + return ret; +} diff --git a/opa/opa_syslog.c b/opa/opa_syslog.c new file mode 100644 index 0000000..ccd39c5 --- /dev/null +++ b/opa/opa_syslog.c @@ -0,0 +1,113 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. 
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#define __USE_GNU
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <syslog.h>
+
+#include "opa_user.h"
+
+#define SYSLOG_MAXLEN 512
+
+extern char *__hfi_mylabel;
+
+void
+hfi_vsyslog(const char *prefix, int to_console, int level,
+	    const char *format, va_list ap)
+{
+	char logprefix[SYSLOG_MAXLEN];
+	size_t len;
+
+	if (to_console) {
+		char hostname[80];
+		va_list ap_cons;
+		va_copy(ap_cons, ap);
+		len = strlen(format);
+		gethostname(hostname, sizeof(hostname));
+		hostname[sizeof(hostname) - 1] = '\0';
+
+		if (__hfi_mylabel)
+			fprintf(stderr, "%s", __hfi_mylabel);
+		else
+			fprintf(stderr, "%s: ", hostname);
+
+		vfprintf(stderr, format, ap_cons);
+		/* append a newline unless the format already ends with one */
+		if (len == 0 || format[len - 1] != '\n')
+			fprintf(stderr, "\n");
+		fflush(stderr);
+		va_end(ap_cons);
+	}
+
+	len = snprintf(logprefix, sizeof(logprefix),
+		       "(hfi/%s)[%d]: %s", prefix ? prefix : "hfi",
+		       (int)getpid(), format);
+
+	vsyslog(level | LOG_USER, logprefix, ap);
+
+	return;
+}
+
+void
+hfi_syslog(const char *prefix, int to_console, int level,
+	   const char *format, ...)
+{
+	va_list ap;
+	va_start(ap, format);
+	hfi_vsyslog(prefix, to_console, level, format, ap);
+	va_end(ap);
+}
diff --git a/opa/opa_time.c b/opa/opa_time.c
new file mode 100644
index 0000000..67e28c7
--- /dev/null
+++ b/opa/opa_time.c
@@ -0,0 +1,299 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
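hfi_syslog above is the varargs front end for hfi_vsyslog: it tags each message as "(hfi/<prefix>)[pid]:", sends it to syslog at the given level, and optionally echoes it to stderr with the process label. A minimal call site, with an invented prefix string:

#include <errno.h>
#include <string.h>
#include <syslog.h>
#include "opa_user.h"

static void example_warn_open_failure(const char *path)
{
	hfi_syslog("example", 1 /* also echo to stderr */, LOG_ERR,
		   "failed to open %s: %s", path, strerror(errno));
}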
+ + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#define __USE_GNU +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "opa_user.h" + +#ifdef min +#undef min +#endif +#define min(a, b) ((a) < (b) ? (a) : (b)) + +#ifdef max +#undef max +#endif +#define max(a, b) ((a) > (b) ? (a) : (b)) + +/* init the cycle counter to picosecs/cycle conversion automatically */ +/* at program startup, if it's using timing functions. */ +static void init_picos_per_cycle(void) __attribute__ ((constructor)); +static int hfi_timebase_isvalid(uint32_t pico_per_cycle); +static uint32_t hfi_timebase_from_cpuinfo(uint32_t old_pico_per_cycle); + +/* in case two of our mechanisms fail */ +#define SAFEDEFAULT_PICOS_PER_CYCLE 500 + +uint32_t __hfi_pico_per_cycle = SAFEDEFAULT_PICOS_PER_CYCLE; + +/* This isn't perfect, but it's close enough for rough timing. We want this + to work on systems where the cycle counter isn't the same as the clock + frequency. + __hfi_pico_per_cycle isn't going to lead to completely accurate + conversions from timestamps to nanoseconds, but it's close enough for + our purposes, which is mainly to allow people to show events with nsecs + or usecs if desired, rather than cycles. We use it in some performance + analysis, but it has to be done with care, since cpuspeed can change, + different cpu's can have different speeds, etc. 
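Aside: init_picos_per_cycle() is wired up with GCC's constructor attribute, so the calibration runs once at program load, before main(). A self-contained demo of that mechanism (all names here are illustrative):

	#include <stdio.h>

	static void probe(void) __attribute__ ((constructor));

	static void probe(void)
	{
		puts("runs before main(), like init_picos_per_cycle()");
	}

	int main(void)
	{
		puts("main");
		return 0;
	}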
+
+   Some architectures don't have their TSC-equivalent running at anything
+   related to the processor speed (e.g. G5 Power systems use a fixed
+   33 MHz frequency).
+*/
+
+#define MIN_TEST_TIME_IN_PICOS (100000000000LL)	/* 100 milliseconds */
+
+static int timebase_debug;	/* off by default */
+
+#define timebase_warn_always(fmt, ...) \
+	hfi_syslog("timebase", 1, LOG_ERR, fmt, ##__VA_ARGS__)
+#define timebase_warn(fmt, ...)	if (timebase_debug) \
+		timebase_warn_always(fmt, ##__VA_ARGS__)
+
+static int hfi_timebase_isvalid(uint32_t pico_per_cycle)
+{
+#if defined(__x86_64__) || defined(__i386__)
+	/* If pico-per-cycle is less than 200, the clock speed would be greater
+	 * than 5 GHz.  Similarly, we minimally support a 1GHz clock.
+	 * Allow some slop, because newer kernels with HPET can be a few
+	 * units off, and we don't want to spend the startup time needlessly */
+	if (pico_per_cycle >= 198 && pico_per_cycle <= 1005)
+		return 1;
+	else
+#endif
+		return 0;
+}
+
+/*
+ * Method #1:
+ *
+ * Derive the pico-per-cycle by trying to correlate the difference between two
+ * reads of the tsc counter to gettimeofday.
+ */
+static void init_picos_per_cycle()
+{
+	struct timeval tvs, tve;
+	int64_t usec = 0;
+	uint64_t ts, te;
+	int64_t delta;
+	uint32_t picos = 0;
+	int trials = 0;
+	int retry = 0;
+	cpu_set_t cpuset, cpuset_saved;
+	int have_cpuset = 1;
+
+	/*
+	 * Make sure we try to calculate the cycle time without being migrated.
+	 */
+	CPU_ZERO(&cpuset_saved);
+	if (sched_getaffinity(0, sizeof(cpuset), &cpuset_saved))
+		have_cpuset = 0;
+	CPU_ZERO(&cpuset);
+	CPU_SET(0, &cpuset);
+	if (have_cpuset && sched_setaffinity(0, sizeof(cpuset), &cpuset))
+		have_cpuset = 0;
+
+	/*
+	 * If we set affinity correctly, give the scheduler another chance to put
+	 * us on processor 0
+	 */
+	if (have_cpuset)
+		sched_yield();
+
+retry_pico_test:
+	if (++retry == 10) {
+		__hfi_pico_per_cycle = hfi_timebase_from_cpuinfo(picos);
+		goto reset_cpu_mask;	/* Reset CPU mask before exiting */
+	}
+
+	usec = 0;
+	gettimeofday(&tvs, NULL);
+	ts = get_cycles();
+	while (usec < MIN_TEST_TIME_IN_PICOS) {	/* wait for at least 100 millisecs */
+		trials++;
+		usleep(125);
+		gettimeofday(&tve, NULL);
+		usec = 1000000LL * (tve.tv_usec - tvs.tv_usec) +
+		    1000000000000LL * (tve.tv_sec - tvs.tv_sec);
+		if (usec < 0) {
+			timebase_warn
+			    ("RTC timebase, gettimeofday is negative (!) %lld\n",
+			     (long long)usec);
+			goto retry_pico_test;
+		}
+	}
+	te = get_cycles();
+	delta = te - ts;
+	picos = (uint32_t) (usec / delta);
+
+	if (!hfi_timebase_isvalid(picos)) {
+		cpu_set_t cpuget;
+		int affinity_valid =
+		    !sched_getaffinity(0, sizeof(cpuget), &cpuget);
+		if (affinity_valid && !CPU_ISSET(0, &cpuget))
+			affinity_valid = 0;
+		timebase_warn
+		    ("Failed to get valid RTC timebase, gettimeofday delta=%lld, "
+		     "rtc delta=%lld, picos_per_cycle=%d affinity_valid=%s (trial %d/10)\n",
+		     (long long)usec, (long long)delta, picos,
+		     affinity_valid ? "YES" : "NO", retry);
+		goto retry_pico_test;
+	}
+
+	/* If we've had to retry even once, let that be known */
+	if (retry > 1)
+		timebase_warn("Clock is %d picos/cycle found in %d trials and "
+			      "%.3f seconds (retry=%d)\n", picos, trials,
+			      (double)usec / 1.0e12, retry);
+
+	__hfi_pico_per_cycle = picos;
+
+reset_cpu_mask:
+	/* Restore affinity */
+	if (have_cpuset) {
+		sched_setaffinity(0, sizeof(cpuset), &cpuset_saved);
+		/*
+		 * Give a chance to other processes that also set affinity to 0 for
+		 * doing this test.
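Aside: with __hfi_pico_per_cycle calibrated (elapsed gettimeofday picoseconds divided by elapsed TSC cycles), a cycle-counter delta converts to wall time with one multiply and divide. A sketch of that conversion (cycles_to_nanosecs is a hypothetical helper built on the constant above, not part of this patch):

	#include <stdint.h>

	extern uint32_t __hfi_pico_per_cycle;

	static inline uint64_t cycles_to_nanosecs(uint64_t cycles)
	{
		/* cycles * picos-per-cycle gives picoseconds; 1000 ps per ns */
		return (cycles * (uint64_t)__hfi_pico_per_cycle) / 1000ULL;
	}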
+ */ + sched_yield(); + } +} + +/* + * Method #2: + * + * Derive the pico-per-cycle from /proc instead of using sleep trick + * that relies on scheduler. + */ +static uint32_t hfi_timebase_from_cpuinfo(uint32_t old_pico_per_cycle) +{ + /* we only validate once */ + uint32_t new_pico_per_cycle = old_pico_per_cycle; + uint32_t max_bet_new_old_pico, min_bet_new_old_pico; + + char hostname[80]; + gethostname(hostname, 80); + hostname[sizeof(hostname) - 1] = '\0'; + + if (getenv("HFI_DEBUG_TIMEBASE")) + timebase_debug = 1; + + /* If the old one is valid, don't bother with this mechanism */ + if (hfi_timebase_isvalid(old_pico_per_cycle)) + return old_pico_per_cycle; + +#if defined(__x86_64__) || defined(__i386__) + { + FILE *fp = fopen("/proc/cpuinfo", "r"); + char input[255]; + char *p = NULL; + + if (!fp) + goto fail; + + while (!feof(fp) && fgets(input, 255, fp)) { + if (strstr(input, "cpu MHz")) { + p = strchr(input, ':'); + if (p) + { + double MHz = atof(p + 1); + if (MHz != 0.0) + new_pico_per_cycle = + (uint32_t) (1000000. / MHz); + } + break; + } + } + fclose(fp); + if (!p) + goto fail; + } +#endif + + max_bet_new_old_pico = max(new_pico_per_cycle, old_pico_per_cycle); + min_bet_new_old_pico = min(new_pico_per_cycle, old_pico_per_cycle); + /* If there's no change (within a small range), just return the old one */ + if ((max_bet_new_old_pico - min_bet_new_old_pico) < 5) + return old_pico_per_cycle; + + if (hfi_timebase_isvalid(new_pico_per_cycle)) { + timebase_warn_always + ("RTC timebase, using %d picos/cycle from /proc " + "instead of the detected %d picos/cycle\n", + new_pico_per_cycle, old_pico_per_cycle); + return new_pico_per_cycle; + } + +fail: + new_pico_per_cycle = SAFEDEFAULT_PICOS_PER_CYCLE; + timebase_warn_always + ("Problem obtaining CPU time base, detected to be %d " + "pico/cycle, adjusted to safe default %d picos/cycle", + old_pico_per_cycle, new_pico_per_cycle); + return new_pico_per_cycle; +} diff --git a/opa/opa_utils.c b/opa/opa_utils.c new file mode 100644 index 0000000..7169b0e --- /dev/null +++ b/opa/opa_utils.c @@ -0,0 +1,153 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
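Aside: the /proc/cpuinfo fallback above simply inverts the advertised clock: picoseconds per cycle = 10^6 / MHz. A worked check of that arithmetic (check_mhz_conversion is illustrative only):

	#include <assert.h>
	#include <stdint.h>

	static void check_mhz_conversion(void)
	{
		double MHz = 2500.0;	/* as parsed from a "cpu MHz : 2500.000" line */
		uint32_t picos = (uint32_t)(1000000. / MHz);
		assert(picos == 400);	/* 2.5 GHz -> 400 ps/cycle, inside 198..1005 */
	}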
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* This file contains hfi service routine interface used by the low */ +/* level hfi protocol code. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "opa_user.h" + +/* keep track whether we disabled mmap in malloc */ +int __hfi_malloc_no_mmap = 0; + +const char *hfi_get_next_name(char **names) +{ + char *p, *start; + + p = start = *names; + while (*p != '\0' && *p != '\n') { + p++; + } + if (*p == '\n') { + *p = '\0'; + p++; + *names = p; + return start; + } else + return NULL; +} + +void hfi_release_names(char *namep) +{ + /* names were initialised in the data section before. Now + * they are allocated when hfi_hfifs_read() is called. Allocation + * for names is done only once at init time. Should we eventually + * have an "stats_type_unregister" type of routine to explicitly + * deallocate memory and free resources ? + */ +#if 0 + if (namep != NULL) + free(namep); +#endif +} + +int hfi_get_stats_names_count() +{ + char *namep; + int c; + + c = hfi_get_stats_names(&namep); + free(namep); + return c; +} + +int hfi_get_ctrs_unit_names_count(int unitno) +{ + char *namep; + int c; + + c = hfi_get_ctrs_unit_names(unitno, &namep); + free(namep); + return c; +} + +int hfi_get_ctrs_port_names_count(int unitno) +{ + char *namep; + int c; + + c = hfi_get_ctrs_port_names(unitno, &namep); + free(namep); + return c; +} + +/* + * Add a constructor function to disable mmap if asked to do so by the user + */ +static void init_mallopt_disable_mmap(void) __attribute__ ((constructor)); + +static void init_mallopt_disable_mmap(void) +{ + char *env = getenv("HFI_DISABLE_MMAP_MALLOC"); + + if (env && *env) { + if (mallopt(M_MMAP_MAX, 0) && mallopt(M_TRIM_THRESHOLD, -1)) { + __hfi_malloc_no_mmap = 1; + } + } + + return; +} diff --git a/opa/opa_write_pio-i386.c b/opa/opa_write_pio-i386.c new file mode 100644 index 0000000..5d8cd7f --- /dev/null +++ b/opa/opa_write_pio-i386.c @@ -0,0 +1,304 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. 
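Aside: hfi_get_next_name() in opa_utils.c above destructively tokenizes a newline-separated buffer — it NUL-terminates the current entry, advances the caller's cursor, and returns NULL once no further '\n' is found (so a final entry lacking a trailing newline is dropped). A hypothetical iteration, assuming hfi_get_stats_names() fills in a malloc'd buffer and returns the entry count, as its use in hfi_get_stats_names_count() suggests:

	#include <stdio.h>
	#include <stdlib.h>
	#include "opa_user.h"

	static void list_stats_names(void)
	{
		char *names, *cursor;
		const char *name;

		if (hfi_get_stats_names(&names) <= 0)
			return;
		cursor = names;
		while ((name = hfi_get_next_name(&cursor)) != NULL)
			printf("stat: %s\n", name);
		free(names);
	}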
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +/* This file contains the initialization functions used by the low + level hfi protocol code. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hfi_user.h" + +/* + * These pio copy routines are here so they can be used by test code, as well + * as by MPI, and can change independently of MPI +*/ + +/* + * for processors that may not write store buffers in the order filled, + * and when the store buffer is not completely filled (partial at end, or + * interrupted and flushed) may write the partial buffer in + * "random" order. requires additional serialization +*/ +void hfi_write_pio_force_order(volatile uint32_t *piob, + const struct hfi_pio_params *pioparm, void *hdr, + void *bdata) +{ + union hfi_pbc buf = {.qword = 0 }; + uint32_t cksum_len = pioparm->cksum_is_valid ? 
+ HFI_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((HFI_MESSAGE_HDR_SIZE + cksum_len + + pioparm->length) >> 2) + 1); + if (pioparm->port > 1) + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + __PBC_IBPORT | pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + + *piob++ = buf.dword; + /* 32 bit programs require fence after first 32 bits of pbc write */ + /* Can't do as uint64_t store, or compiler could reorder */ + ips_wmb(); + *piob++ = buf.pbcflags; + + if (!pioparm->length) { + uint32_t *dhdr, dcpywords; + dcpywords = (HFI_MESSAGE_HDR_SIZE >> 2) - 1; + hfi_dwordcpy_safe(piob, hdr, dcpywords); + ips_wmb(); + dhdr = hdr; + piob += dcpywords; + dhdr += dcpywords; + *piob++ = *dhdr; + } else { + uint32_t *pay2 = bdata, j; + uint32_t len = pioparm->length; + + hfi_dwordcpy_safe(piob, hdr, HFI_MESSAGE_HDR_SIZE >> 2); + piob += HFI_MESSAGE_HDR_SIZE >> 2; + + len >>= 2; + if (len > 16) { + uint32_t pay_words = 16 * ((len - 1) / 16); + hfi_dwordcpy_safe(piob, pay2, pay_words); + piob += pay_words; + pay2 += pay_words; + len -= pay_words; + } + /* now write the final chunk a word at a time, fence before trigger */ + for (j = 0; j < (len - 1); j++) + *piob++ = *pay2++; + ips_wmb(); /* flush the buffer out now, so */ + *piob++ = *pay2; + } + + /* If checksum is enabled insert CRC at end of packet */ + if_pf(pioparm->cksum_is_valid) { + int nCRCopies = HFI_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + while (nCRC < (nCRCopies - 1)) { + *piob = pioparm->cksum; + piob++; + nCRC++; + } + + ips_wmb(); + *piob = pioparm->cksum; + } + + /* send it on it's way, now, rather than waiting for processor to + * get around to flushing it */ + ips_wmb(); +} + +/* + * for processors that always write store buffers in the order filled, + * and if store buffer not completely filled (partial at end, or + * interrupted and flushed) always write the partial buffer in + * address order. Avoids serializing and flush instructions + * where possible. + */ +void hfi_write_pio(volatile uint32_t *piob, + const struct hfi_pio_params *pioparm, void *hdr, void *bdata) +{ + union hfi_pbc buf = { 0 }; + uint32_t cksum_len = pioparm->cksum_is_valid ? + HFI_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((HFI_MESSAGE_HDR_SIZE + cksum_len + + pioparm->length) >> 2) + 1); + if (pioparm->port > 1) + buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + + *piob++ = buf.dword; + /* 32 bit programs needs compiler fence to prevent compiler reordering + the two 32 bit stores in a uint64_t, but on inorder wc systems, does + not need a memory fence. 
*/ + asm volatile ("" : : : "memory"); + *piob++ = buf.pbcflags; + + hfi_dwordcpy_safe(piob, hdr, HFI_MESSAGE_HDR_SIZE >> 2); + piob += HFI_MESSAGE_HDR_SIZE >> 2; + asm volatile ("" : : : "memory"); + + if (pioparm->length) + hfi_dwordcpy_safe(piob, (uint32_t *) bdata, + pioparm->length >> 2); + + /* If checksum is enabled insert CRC at end of packet */ + if_pf(pioparm->cksum_is_valid) { + int nCRCopies = HFI_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + piob += pioparm->length >> 2; + + while (nCRC < (nCRCopies - 1)) { + *piob = pioparm->cksum; + piob++; + nCRC++; + } + + asm volatile ("" : : : "memory"); + *piob = pioparm->cksum; + } + + /* send it on it's way, now, rather than waiting for processor to + * get around to flushing it */ + ips_wmb(); +} + +/* + * for processors that always write store buffers in the order filled, + * and if store buffer not completely filled (partial at end, or + * interrupted and flushed) always write the partial buffer in + * address order. Avoids serializing and flush instructions + * where possible. + */ +static void hfi_write_pio_special_trigger(volatile uint32_t *piob, + const struct hfi_pio_params *pioparm, + void *hdr, void *bdata, + unsigned offset) + __attribute__ ((always_inline)); + +static void hfi_write_pio_special_trigger(volatile uint32_t *piob, + const struct hfi_pio_params *pioparm, + void *hdr, void *bdata, + unsigned offset) +{ + union hfi_pbc buf = { 0 }; + volatile uint32_t *piobs = piob; + uint32_t cksum_len = pioparm->cksum_is_valid ? + HFI_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((HFI_MESSAGE_HDR_SIZE + cksum_len + + pioparm->length) >> 2) + 1); + if (pioparm->port > 1) + buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + + *piob++ = buf.dword; + /* 32 bit programs needs compiler fence to prevent compiler reordering + the two 32 bit stores in a uint64_t, but on inorder wc systems, does + not need a memory fence. */ + asm volatile ("" : : : "memory"); + *piob++ = buf.pbcflags; + + hfi_dwordcpy_safe(piob, hdr, HFI_MESSAGE_HDR_SIZE >> 2); + piob += HFI_MESSAGE_HDR_SIZE >> 2; + asm volatile ("" : : : "memory"); + + if (pioparm->length) + hfi_dwordcpy_safe(piob, (uint32_t *) bdata, + pioparm->length >> 2); + + /* If checksum is enabled insert CRC at end of packet */ + if_pf(pioparm->cksum_is_valid) { + int nCRCopies = HFI_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + piob += pioparm->length >> 2; + + while (nCRC < (nCRCopies - 1)) { + *piob = pioparm->cksum; + piob++; + nCRC++; + } + + asm volatile ("" : : : "memory"); + *piob = pioparm->cksum; + } + + /* send it on it's way, now, rather than waiting for processor to + * get around to flushing it */ + ips_wmb(); + *(piobs + offset) = HFI_SPECIAL_TRIGGER_MAGIC; + ips_wmb(); +} + +void hfi_write_pio_special_trigger2k(volatile uint32_t *piob, + const struct hfi_pio_params *pioparm, + void *hdr, void *bdata) +{ + hfi_write_pio_special_trigger(piob, pioparm, hdr, bdata, 1023); +} + +void hfi_write_pio_special_trigger4k(volatile uint32_t *piob, + const struct hfi_pio_params *pioparm, + void *hdr, void *bdata) +{ + hfi_write_pio_special_trigger(piob, pioparm, hdr, bdata, 2047); +} diff --git a/opa/opa_write_pio-x86_64.c b/opa/opa_write_pio-x86_64.c new file mode 100644 index 0000000..b895601 --- /dev/null +++ b/opa/opa_write_pio-x86_64.c @@ -0,0 +1,295 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. 
When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +/* This file contains the initialization functions used by the low + level hfi protocol code. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "opa_user.h" + +/* + * These pio copy routines are here so they can be used by test code, as well + * as by MPI, and can change independently of MPI +*/ + +/* + * for processors that may not write store buffers in the order filled, + * and when the store buffer is not completely filled (partial at end, or + * interrupted and flushed) may write the partial buffer in + * "random" order. requires additional serialization +*/ +void hfi_write_pio_force_order(volatile uint32_t *piob, + const struct hfi_pio_params *pioparm, void *hdr, + void *bdata) +{ + union hfi_pbc buf = {.qword = 0 }; + uint32_t cksum_len = pioparm->cksum_is_valid ? 
+ HFI_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((HFI_MESSAGE_HDR_SIZE + cksum_len + + pioparm->length) >> 2) + 1); + if (pioparm->port > 1) + buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + + *(volatile uint64_t *)piob = buf.qword; + ips_wmb(); /* pbc must be forced to be first write to chip buffer */ + piob += 2; + + if (!pioparm->length) { + uint32_t *dhdr, dcpywords; + dcpywords = (HFI_MESSAGE_HDR_SIZE >> 2) - 1; + hfi_dwordcpy_safe(piob, hdr, dcpywords); + ips_wmb(); + dhdr = hdr; + piob += dcpywords; + dhdr += dcpywords; + *piob++ = *dhdr; + } else { + uint32_t *pay2 = bdata, j; + uint32_t len = pioparm->length; + + hfi_dwordcpy_safe(piob, hdr, HFI_MESSAGE_HDR_SIZE >> 2); + piob += HFI_MESSAGE_HDR_SIZE >> 2; + + len >>= 2; + if (len > 16) { + uint32_t pay_words = 16 * ((len - 1) / 16); + hfi_dwordcpy_safe(piob, pay2, pay_words); + piob += pay_words; + pay2 += pay_words; + len -= pay_words; + } + /* now write the final chunk a word at a time, fence before trigger */ + for (j = 0; j < (len - 1); j++) + *piob++ = *pay2++; + ips_wmb(); /* flush the buffer out now, so */ + *piob++ = *pay2; + } + + /* If checksum is enabled insert CRC at end of packet */ + if_pf(pioparm->cksum_is_valid) { + int nCRCopies = HFI_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + while (nCRC < (nCRCopies - 1)) { + *piob = pioparm->cksum; + piob++; + nCRC++; + } + + ips_wmb(); + *piob = pioparm->cksum; + } + + /* send it on it's way, now, rather than waiting for processor to + * get around to flushing it */ + ips_wmb(); +} + +/* + * for processors that always write store buffers in the order filled, + * and if store buffer not completely filled (partial at end, or + * interrupted and flushed) always write the partial buffer in + * address order. Avoids serializing and flush instructions + * where possible. + */ +void hfi_write_pio(volatile uint32_t *piob, + const struct hfi_pio_params *pioparm, void *hdr, void *bdata) +{ + union hfi_pbc buf = { 0 }; + uint32_t cksum_len = pioparm->cksum_is_valid ? + HFI_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((HFI_MESSAGE_HDR_SIZE + cksum_len + + pioparm->length) >> 2) + 1); + if (pioparm->port > 1) + buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + + *(volatile uint64_t *)piob = buf.qword; + piob += 2; + asm volatile ("" : : : "memory"); + + hfi_dwordcpy_safe(piob, hdr, HFI_MESSAGE_HDR_SIZE >> 2); + + asm volatile ("" : : : "memory"); + piob += HFI_MESSAGE_HDR_SIZE >> 2; + + if (pioparm->length) + hfi_dwordcpy_safe(piob, (uint32_t *) bdata, + pioparm->length >> 2); + + /* If checksum is enabled insert CRC at end of packet */ + if_pf(pioparm->cksum_is_valid) { + int nCRCopies = HFI_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + piob += pioparm->length >> 2; + + while (nCRC < (nCRCopies - 1)) { + *piob = pioparm->cksum; + piob++; + nCRC++; + } + + asm volatile ("" : : : "memory"); + *piob = pioparm->cksum; + } + + /* send it on it's way, now, rather than waiting for processor to + * get around to flushing it */ + ips_wmb(); +} + +/* + * here we trigger on a "special" address, so just bang it out + * as fast as possible... 
+ */ +static void +hfi_write_pio_special_trigger(volatile uint32_t *piob, + const struct hfi_pio_params *pioparm, void *hdr, + void *bdata, unsigned offset) + __attribute__ ((always_inline)); + +static void +hfi_write_pio_special_trigger(volatile uint32_t *piob, + const struct hfi_pio_params *pioparm, + void *hdr, void *bdata, unsigned offset) +{ + union hfi_pbc buf = { 0 }; + volatile uint32_t *piobs = piob; + uint32_t cksum_len = pioparm->cksum_is_valid ? + HFI_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((HFI_MESSAGE_HDR_SIZE + cksum_len + + pioparm->length) >> 2) + 1); + if (pioparm->port > 1) + buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + + *(volatile uint64_t *)piob = buf.qword; + piob += 2; + asm volatile ("" : : : "memory"); + + hfi_dwordcpy_safe(piob, hdr, HFI_MESSAGE_HDR_SIZE >> 2); + piob += HFI_MESSAGE_HDR_SIZE >> 2; + asm volatile ("" : : : "memory"); + + if (pioparm->length) + hfi_dwordcpy_safe(piob, (uint32_t *) bdata, + pioparm->length >> 2); + + /* If checksum is enabled insert CRC at end of packet */ + if_pf(pioparm->cksum_is_valid) { + int nCRCopies = HFI_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + piob += pioparm->length >> 2; + + while (nCRC < (nCRCopies - 1)) { + *piob = pioparm->cksum; + piob++; + nCRC++; + } + + asm volatile ("" : : : "memory"); + *piob = pioparm->cksum; + } + + /* + * flush then write "special" then flush... + */ + ips_wmb(); + *(piobs + offset) = HFI_SPECIAL_TRIGGER_MAGIC; + ips_wmb(); +} + +void hfi_write_pio_special_trigger2k(volatile uint32_t *piob, + const struct hfi_pio_params *pioparm, + void *hdr, void *bdata) +{ + hfi_write_pio_special_trigger(piob, pioparm, hdr, bdata, 1023); +} + +void hfi_write_pio_special_trigger4k(volatile uint32_t *piob, + const struct hfi_pio_params *pioparm, + void *hdr, void *bdata) +{ + hfi_write_pio_special_trigger(piob, pioparm, hdr, bdata, 2047); +} diff --git a/psm.c b/psm.c new file mode 100644 index 0000000..cb12dc5 --- /dev/null +++ b/psm.c @@ -0,0 +1,1069 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
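Aside: all of the PIO writers in these two files implement one ordering contract: the 8-byte PBC must reach the chip's send buffer first, then the header dwords, then payload, then the optional CRC copies, with a final ips_wmb() flushing the write-combining buffer to launch the packet; the special-trigger variants then store HFI_SPECIAL_TRIGGER_MAGIC at dword offset 1023 (2 KB buffers) or 2047 (4 KB buffers). A condensed, self-contained sketch of that contract, with wmb() standing in for ips_wmb() and every name illustrative:

	#include <stdint.h>

	#define wmb() __sync_synchronize()	/* stand-in for ips_wmb() */

	static void pio_send(volatile uint32_t *piob, uint64_t pbc,
			     const uint32_t *hdr, unsigned hdr_dw,
			     const uint32_t *payload, unsigned pay_dw)
	{
		unsigned i;

		*(volatile uint64_t *)piob = pbc;	/* 1. PBC first...           */
		wmb();					/* ...and forced out first   */
		piob += 2;

		for (i = 0; i < hdr_dw; i++)		/* 2. header dwords          */
			*piob++ = hdr[i];
		for (i = 0; i < pay_dw; i++)		/* 3. payload dwords         */
			*piob++ = payload[i];

		wmb();					/* 4. flush WC buffer; packet launches */
	}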
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ + +#include +#include "psm_user.h" +#include "psm2_hal.h" +#include "opa_revision.h" +#include "psm_mq_internal.h" + +static int psmi_verno_major = PSM2_VERNO_MAJOR; +static int psmi_verno_minor = PSM2_VERNO_MINOR; +static int psmi_verno = PSMI_VERNO_MAKE(PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR); +static int psmi_verno_client_val; +int psmi_epid_ver; + +#define PSMI_NOT_INITIALIZED 0 +#define PSMI_INITIALIZED 1 +#define PSMI_FINALIZED -1 /* Prevent the user from calling psm2_init + * once psm_finalize has been called. */ +static int psmi_isinit = PSMI_NOT_INITIALIZED; + +/* Global lock used for endpoint creation and destroy + * (in functions psm2_ep_open and psm2_ep_close) and also + * for synchronization with recv_thread (so that recv_thread + * will not work on an endpoint which is in a middle of closing). */ +psmi_lock_t psmi_creation_lock; + +sem_t *sem_affinity_shm_rw = NULL; +int psmi_affinity_shared_file_opened = 0; +int psmi_affinity_semaphore_open = 0; +uint64_t *shared_affinity_ptr; +char *sem_affinity_shm_rw_name; +char *affinity_shm_name; + +#ifdef PSM_CUDA +int is_cuda_enabled; +int is_gdr_copy_enabled; +int device_support_gpudirect; +int cuda_lib_version; +int is_driver_gpudirect_enabled; +int is_cuda_primary_context_retain = 0; +uint32_t cuda_thresh_rndv; +uint32_t gdr_copy_threshold_send; +uint32_t gdr_copy_threshold_recv; +#endif + +/* + * Bit field that contains capability set. + * Each bit represents different capability. + * It is supposed to be filled with logical OR + * on conditional compilation basis + * along with future features/capabilities. + * At the very beginning we start with Multi EPs. + */ +uint64_t psm2_capabilities_bitset = PSM2_MULTI_EP_CAP; + +int psmi_verno_client() +{ + return psmi_verno_client_val; +} + +/* This function is used to determine whether the current library build can + * successfully communicate with another library that claims to be version + * 'verno'. + * + * PSM 2.x is always ABI compatible, but this checks to see if two different + * versions of the library can coexist. 
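Aside: psmi_verno_isinteroperable() below only compares major numbers, relying on the packed 16-bit verno layout. That layout is not spelled out in this hunk; a plausible encoding consistent with how PSMI_VERNO_MAKE and PSMI_VERNO_GET_MAJOR are used (the macro names below are illustrative stand-ins, an assumption rather than the library's definitions):

	#define MY_VERNO_MAKE(major, minor)  ((((major) & 0xff) << 8) | ((minor) & 0xff))
	#define MY_VERNO_GET_MAJOR(verno)    (((verno) >> 8) & 0xff)
	#define MY_VERNO_GET_MINOR(verno)    ((verno) & 0xff)

	/* e.g. MY_VERNO_MAKE(2, 1) == 0x0201; two builds interoperate whenever
	 * their MY_VERNO_GET_MAJOR() values match, as the function below checks. */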
+ */ +int psmi_verno_isinteroperable(uint16_t verno) +{ + if (PSMI_VERNO_GET_MAJOR(verno) != PSM2_VERNO_MAJOR) + return 0; + + return 1; +} + +int MOCKABLE(psmi_isinitialized)() +{ + return (psmi_isinit == PSMI_INITIALIZED); +} +MOCK_DEF_EPILOGUE(psmi_isinitialized); + +#ifdef PSM_CUDA +int psmi_cuda_lib_load() +{ + psm2_error_t err = PSM2_OK; + char *dlerr; + + PSM2_LOG_MSG("entering"); + _HFI_VDBG("Loading CUDA library.\n"); + + psmi_cuda_lib = dlopen("libcuda.so.1", RTLD_LAZY); + if (!psmi_cuda_lib) { + dlerr = dlerror(); + _HFI_ERROR("Unable to open libcuda.so. Error %s\n", + dlerr ? dlerr : "no dlerror()"); + goto fail; + } + + psmi_cuDriverGetVersion = dlsym(psmi_cuda_lib, "cuDriverGetVersion"); + + if (!psmi_cuDriverGetVersion) { + _HFI_ERROR + ("Unable to resolve symbols in CUDA libraries.\n"); + goto fail; + } + + PSMI_CUDA_CALL(cuDriverGetVersion, &cuda_lib_version); + if (cuda_lib_version < 7000) { + _HFI_ERROR("Please update CUDA driver, required minimum version is 7.0\n"); + goto fail; + } + + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuInit); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxGetCurrent); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxDetach); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxSetCurrent); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuPointerGetAttribute); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuPointerSetAttribute); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGetAttribute); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGet); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGetCount); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamCreate); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamDestroy); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventCreate); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventDestroy); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventQuery); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventRecord); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventSynchronize); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemHostAlloc); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemFreeHost); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpy); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyDtoD); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyDtoH); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyHtoD); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyDtoHAsync); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyHtoDAsync); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuIpcGetMemHandle); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuIpcOpenMemHandle); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuIpcCloseMemHandle); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemGetAddressRange); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDevicePrimaryCtxGetState); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDevicePrimaryCtxRetain); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDevicePrimaryCtxRelease); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxGetDevice); + + PSM2_LOG_MSG("leaving"); + return err; +fail: + if (psmi_cuda_lib) + dlclose(psmi_cuda_lib); + err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unable to load CUDA library.\n"); + return err; +} + +int psmi_cuda_initialize() +{ + psm2_error_t err = PSM2_OK; + int num_devices, dev; + + PSM2_LOG_MSG("entering"); + _HFI_VDBG("Enabling CUDA support.\n"); + + err = psmi_cuda_lib_load(); + if (err != PSM2_OK) + goto fail; + + PSMI_CUDA_CALL(cuInit, 0); + + /* Check if CUDA context is available. 
If not, we are not allowed to
+	 * launch any CUDA API calls */
+	PSMI_CUDA_CALL(cuCtxGetCurrent, &ctxt);
+	if (ctxt == NULL) {
+		_HFI_INFO("Unable to find active CUDA context\n");
+		is_cuda_enabled = 0;
+		err = PSM2_OK;
+		return err;
+	}
+
+	CUdevice device;
+	CUcontext primary_ctx;
+	PSMI_CUDA_CALL(cuCtxGetDevice, &device);
+	int is_ctx_active;
+	unsigned ctx_flags;
+	PSMI_CUDA_CALL(cuDevicePrimaryCtxGetState, device, &ctx_flags, &is_ctx_active);
+	if (!is_ctx_active) {
+		/* There is an issue where certain CUDA API calls create
+		 * contexts but do not make them active, which causes the
+		 * driver API call to fail with error 709 */
+		PSMI_CUDA_CALL(cuDevicePrimaryCtxRetain, &primary_ctx, device);
+		is_cuda_primary_context_retain = 1;
+	}
+
+	/* Check if all devices support Unified Virtual Addressing. */
+	PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices);
+
+	for (dev = 0; dev < num_devices; dev++) {
+		CUdevice device;
+		PSMI_CUDA_CALL(cuDeviceGet, &device, dev);
+		int unifiedAddressing;
+		PSMI_CUDA_CALL(cuDeviceGetAttribute,
+			       &unifiedAddressing,
+			       CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING,
+			       device);
+
+		if (unifiedAddressing != 1) {
+			_HFI_ERROR("CUDA device %d does not support Unified Virtual Addressing.\n", dev);
+			goto fail;
+		}
+
+		int major;
+		PSMI_CUDA_CALL(cuDeviceGetAttribute,
+			       &major,
+			       CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+			       device);
+		if (major >= 3)
+			device_support_gpudirect = 1;
+		else {
+			device_support_gpudirect = 0;
+			_HFI_INFO("Device %d does not support GPUDirect RDMA (non-fatal error)\n", dev);
+		}
+	}
+
+	union psmi_envvar_val env_enable_gdr_copy;
+	psmi_getenv("PSM2_GDRCOPY",
+		    "Enable (set envvar to 1) for gdr copy support in PSM (Enabled by default)",
+		    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
+		    (union psmi_envvar_val)1, &env_enable_gdr_copy);
+	is_gdr_copy_enabled = env_enable_gdr_copy.e_int;
+
+	union psmi_envvar_val env_cuda_thresh_rndv;
+	psmi_getenv("PSM2_CUDA_THRESH_RNDV",
+		    "RNDV protocol is used for message sizes greater than the threshold\n",
+		    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
+		    (union psmi_envvar_val)CUDA_THRESH_RNDV, &env_cuda_thresh_rndv);
+	cuda_thresh_rndv = env_cuda_thresh_rndv.e_int;
+
+	if (cuda_thresh_rndv < 0 || cuda_thresh_rndv > CUDA_THRESH_RNDV)
+		cuda_thresh_rndv = CUDA_THRESH_RNDV;
+
+	union psmi_envvar_val env_gdr_copy_thresh_send;
+	psmi_getenv("PSM2_GDRCOPY_THRESH_SEND",
+		    "GDR Copy is turned off on the send side"
+		    " for message sizes greater than the threshold\n",
+		    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
+		    (union psmi_envvar_val)GDR_COPY_THRESH_SEND, &env_gdr_copy_thresh_send);
+	gdr_copy_threshold_send = env_gdr_copy_thresh_send.e_int;
+
+	if (gdr_copy_threshold_send < 8 || gdr_copy_threshold_send > cuda_thresh_rndv)
+		gdr_copy_threshold_send = GDR_COPY_THRESH_SEND;
+
+	union psmi_envvar_val env_gdr_copy_thresh_recv;
+	psmi_getenv("PSM2_GDRCOPY_THRESH_RECV",
+		    "GDR Copy is turned off on the recv side"
+		    " for message sizes greater than the threshold\n",
+		    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
+		    (union psmi_envvar_val)GDR_COPY_THRESH_RECV, &env_gdr_copy_thresh_recv);
+	gdr_copy_threshold_recv = env_gdr_copy_thresh_recv.e_int;
+
+	if (gdr_copy_threshold_recv < 8)
+		gdr_copy_threshold_recv = GDR_COPY_THRESH_RECV;
+
+	PSM2_LOG_MSG("leaving");
+	return err;
+fail:
+	err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unable to initialize PSM2 CUDA support.\n");
+	return err;
+}
+#endif
+
+psm2_error_t __psm2_init(int *major, int *minor)
+{
+	psm2_error_t err = PSM2_OK;
+	union
psmi_envvar_val env_tmask; + + psmi_log_initialize(); + + PSM2_LOG_MSG("entering"); + + /* When PSM_PERF is enabled, the following code causes the + PMU to be programmed to measure instruction cycles of the + TX/RX speedpaths of PSM. */ + GENERIC_PERF_INIT(); + GENERIC_PERF_SET_SLOT_NAME(PSM_TX_SPEEDPATH_CTR, "TX"); + GENERIC_PERF_SET_SLOT_NAME(PSM_RX_SPEEDPATH_CTR, "RX"); + + if (psmi_isinit == PSMI_INITIALIZED) + goto update; + + if (psmi_isinit == PSMI_FINALIZED) { + err = PSM2_IS_FINALIZED; + goto fail; + } + + if (major == NULL || minor == NULL) { + err = PSM2_PARAM_ERR; + goto fail; + } + + psmi_init_lock(&psmi_creation_lock); + +#ifdef PSM_DEBUG + if (!getenv("PSM2_NO_WARN")) + fprintf(stderr, + "!!! WARNING !!! You are running an internal-only PSM *DEBUG* build.\n"); +#endif + +#ifdef PSM_PROFILE + if (!getenv("PSM2_NO_WARN")) + fprintf(stderr, + "!!! WARNING !!! You are running an internal-only PSM *PROFILE* build.\n"); +#endif + + /* Make sure we complain if fault injection is enabled */ + if (getenv("PSM2_FI") && !getenv("PSM2_NO_WARN")) + fprintf(stderr, + "!!! WARNING !!! You are running with fault injection enabled!\n"); + + /* Make sure, as an internal check, that this version knows how to detect + * compatibility with other library versions it may communicate with */ + if (psmi_verno_isinteroperable(psmi_verno) != 1) { + err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "psmi_verno_isinteroperable() not updated for current version!"); + goto fail; + } + + /* The only way to not support a client is if the major number doesn't + * match */ + if (*major != PSM2_VERNO_MAJOR && *major != PSM2_VERNO_COMPAT_MAJOR) { + err = psmi_handle_error(NULL, PSM2_INIT_BAD_API_VERSION, + "This library does not implement version %d.%d", + *major, *minor); + goto fail; + } + + /* Make sure we don't keep track of a client that claims a higher version + * number than we are */ + psmi_verno_client_val = + min(PSMI_VERNO_MAKE(*major, *minor), psmi_verno); + + /* Check to see if we need to set Architecture flags to something + * besides big core Xeons */ + cpuid_t id; + psmi_cpu_model = CPUID_MODEL_UNDEFINED; + + /* First check to ensure Genuine Intel */ + get_cpuid(0x0, 0, &id); + if(id.ebx == CPUID_GENUINE_INTEL_EBX + && id.ecx == CPUID_GENUINE_INTEL_ECX + && id.edx == CPUID_GENUINE_INTEL_EDX) + { + /* Use cpuid with EAX=1 to get processor info */ + get_cpuid(0x1, 0, &id); + psmi_cpu_model = CPUID_GENUINE_INTEL; + } + + if( (psmi_cpu_model == CPUID_GENUINE_INTEL) && + (id.eax & CPUID_FAMILY_MASK) == CPUID_FAMILY_XEON) + { + psmi_cpu_model = ((id.eax & CPUID_MODEL_MASK) >> 4) | + ((id.eax & CPUID_EXMODEL_MASK) >> 12); + } + + psmi_isinit = PSMI_INITIALIZED; + /* hfi_debug lives in libhfi.so */ + psmi_getenv("PSM2_TRACEMASK", + "Mask flags for tracing", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_ULONG_FLAGS, + (union psmi_envvar_val)hfi_debug, &env_tmask); + hfi_debug = (long)env_tmask.e_ulong; + + /* The "real thing" is done in hfi_proto.c as a constructor function, but + * we getenv it here to report what we're doing with the setting */ + { + extern int __hfi_malloc_no_mmap; + union psmi_envvar_val env_mmap; + char *env = getenv("HFI_DISABLE_MMAP_MALLOC"); + int broken = (env && *env && !__hfi_malloc_no_mmap); + psmi_getenv("HFI_DISABLE_MMAP_MALLOC", + broken ? 
"Skipping mmap disable for malloc()" : + "Disable mmap for malloc()", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_YESNO, + (union psmi_envvar_val)0, &env_mmap); + if (broken) + _HFI_ERROR + ("Couldn't successfully disable mmap in mallocs " + "with mallopt()\n"); + } + + { + union psmi_envvar_val env_epid_ver; + psmi_getenv("PSM2_ADDR_FMT", + "Used to force PSM2 to use a particular version of EPID", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)PSMI_EPID_VERNO_DEFAULT, &env_epid_ver); + psmi_epid_ver = env_epid_ver.e_int; + if (psmi_epid_ver > PSMI_MAX_EPID_VERNO_SUPPORTED) { + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + " The max epid version supported in this version of PSM2 is %d \n" + "Please upgrade PSM2 \n", + PSMI_MAX_EPID_VERNO_SUPPORTED); + goto fail; + } else if (psmi_epid_ver < PSMI_MIN_EPID_VERNO_SUPPORTED) { + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + " Invalid value provided through PSM2_ADDR_FMT \n"); + goto fail; + } + } + + if (getenv("PSM2_DIAGS")) { + _HFI_INFO("Running diags...\n"); + psmi_diags(); + } + + psmi_multi_ep_init(); + + psmi_faultinj_init(); + + psmi_epid_init(); + + int rc = psmi_hal_initialize(); + + if (rc) + { + err = PSM2_INTERNAL_ERR; + goto fail; + } + +#ifdef PSM_CUDA + union psmi_envvar_val env_enable_cuda; + psmi_getenv("PSM2_CUDA", + "Enable (set envvar to 1) for cuda support in PSM (Disabled by default)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)0, &env_enable_cuda); + is_cuda_enabled = env_enable_cuda.e_int; + + if (PSMI_IS_CUDA_ENABLED) { + err = psmi_cuda_initialize(); + if (err != PSM2_OK) + goto fail; + } +#endif + +update: + + if (getenv("PSM2_IDENTIFY")) { + Dl_info info_psm; + char ofed_delta[100] = ""; + strcat(strcat(ofed_delta," built for OFED DELTA "),psmi_hfi_IFS_version); + printf("%s %s PSM2 v%d.%d%s\n" + "%s %s location %s\n" + "%s %s build date %s\n" + "%s %s src checksum %s\n" + "%s %s git checksum %s\n" + "%s %s built against driver interface v%d.%d\n" + "%s %s HAL instance code: %d, HAL description: \"%s\"\n", + hfi_get_mylabel(), hfi_ident_tag, + PSM2_VERNO_MAJOR,PSM2_VERNO_MINOR, + (strcmp(psmi_hfi_IFS_version,"") != 0) ? ofed_delta +#ifdef PSM_CUDA + : "-cuda", +#else + : "", +#endif + hfi_get_mylabel(), hfi_ident_tag, dladdr(psm2_init, &info_psm) ? + info_psm.dli_fname : "libpsm2 not available", + hfi_get_mylabel(), hfi_ident_tag, psmi_hfi_build_timestamp, + hfi_get_mylabel(), hfi_ident_tag, psmi_hfi_sources_checksum, + hfi_get_mylabel(), hfi_ident_tag, + (strcmp(psmi_hfi_git_checksum,"") != 0) ? 
+ psmi_hfi_git_checksum : "", + hfi_get_mylabel(), hfi_ident_tag, + psmi_hal_get_user_major_bldtime_version(), + psmi_hal_get_user_minor_bldtime_version(), + hfi_get_mylabel(), hfi_ident_tag, psmi_hal_get_hal_instance_type(), + psmi_hal_get_hal_instance_description()); + } + + *major = (int)psmi_verno_major; + *minor = (int)psmi_verno_minor; +fail: + PSM2_LOG_MSG("leaving"); + return err; +} +PSMI_API_DECL(psm2_init) + +static +psm2_error_t psmi_get_psm2_config(psm2_mq_t mq, + psm2_epaddr_t epaddr, + uint32_t *out) +{ + psm2_error_t rv = PSM2_INTERNAL_ERR; + + *out = 0; + if (&mq->ep->ptl_ips == epaddr->ptlctl) + { + rv = PSM2_OK; + *out |= PSM2_INFO_QUERY_CONFIG_IPS; +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED) + { + *out |= PSM2_INFO_QUERY_CONFIG_CUDA; + if (PSMI_IS_GDR_COPY_ENABLED) + *out |= PSM2_INFO_QUERY_CONFIG_GDR_COPY; + } +#endif + { + union psmi_envvar_val env_sdma; + + psmi_getenv("PSM2_SDMA", + "hfi send dma flags (0 disables send dma, 2 disables send pio, " + "1 for both sdma/spio, default 1)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val)1, &env_sdma); + if (env_sdma.e_uint == 0) + *out |= PSM2_INFO_QUERY_CONFIG_PIO; + else if (env_sdma.e_uint == 1) + *out |= (PSM2_INFO_QUERY_CONFIG_PIO | PSM2_INFO_QUERY_CONFIG_DMA); + else if (env_sdma.e_uint == 2) + *out |= PSM2_INFO_QUERY_CONFIG_DMA; + } + } + else if (&mq->ep->ptl_amsh == epaddr->ptlctl) + { + *out |= PSM2_INFO_QUERY_CONFIG_AMSH; + rv = PSM2_OK; + } + else if (&mq->ep->ptl_self == epaddr->ptlctl) + { + *out |= PSM2_INFO_QUERY_CONFIG_SELF; + rv = PSM2_OK; + } + return rv; +} + +psm2_error_t __psm2_info_query(psm2_info_query_t q, void *out, + size_t nargs, psm2_info_query_arg_t args[]) +{ + static const size_t expected_arg_cnt[PSM2_INFO_QUERY_LAST] = + { + 0, /* PSM2_INFO_QUERY_NUM_UNITS */ + 0, /* PSM2_INFO_QUERY_NUM_PORTS */ + 1, /* PSM2_INFO_QUERY_UNIT_STATUS */ + 2, /* PSM2_INFO_QUERY_UNIT_PORT_STATUS */ + 1, /* PSM2_INFO_QUERY_NUM_FREE_CONTEXTS */ + 1, /* PSM2_INFO_QUERY_NUM_CONTEXTS */ + 2, /* PSM2_INFO_QUERY_CONFIG */ + 3, /* PSM2_INFO_QUERY_THRESH */ + 3, /* PSM2_INFO_QUERY_DEVICE_NAME */ + 2, /* PSM2_INFO_QUERY_MTU */ + 2, /* PSM2_INFO_QUERY_LINK_SPEED */ + 1, /* PSM2_INFO_QUERY_NETWORK_TYPE */ + }; + psm2_error_t rv = PSM2_INTERNAL_ERR; + + if ((q < 0) || + (q >= PSM2_INFO_QUERY_LAST) || + (nargs != expected_arg_cnt[q])) + return rv; + + switch (q) + { + case PSM2_INFO_QUERY_NUM_UNITS: + *((uint32_t*)out) = psmi_hal_get_num_units_(1); + rv = PSM2_OK; + break; + case PSM2_INFO_QUERY_NUM_PORTS: + *((uint32_t*)out) = psmi_hal_get_num_ports_(); + rv = PSM2_OK; + break; + case PSM2_INFO_QUERY_UNIT_STATUS: + *((uint32_t*)out) = psmi_hal_get_unit_active(args[0].unit); + rv = PSM2_OK; + break; + case PSM2_INFO_QUERY_UNIT_PORT_STATUS: + *((uint32_t*)out) = psmi_hal_get_port_active(args[0].unit, + args[1].port); + rv = PSM2_OK; + break; + case PSM2_INFO_QUERY_NUM_FREE_CONTEXTS: + *((uint32_t*)out) = psmi_hal_get_num_free_contexts(args[0].unit); + rv = PSM2_OK; + break; + case PSM2_INFO_QUERY_NUM_CONTEXTS: + *((uint32_t*)out) = psmi_hal_get_num_contexts(args[0].unit); + rv = PSM2_OK; + break; + case PSM2_INFO_QUERY_CONFIG: + { + psm2_mq_t mq = args[0].mq; + psm2_epaddr_t epaddr = args[1].epaddr; + rv = psmi_get_psm2_config(mq, epaddr, (uint32_t*)out); + } + break; + case PSM2_INFO_QUERY_THRESH: + { + psm2_mq_t mq = args[0].mq; + psm2_epaddr_t epaddr = args[1].epaddr; + enum psm2_info_query_thresh_et iqt = args[2].mstq; + + uint32_t config; + rv = psmi_get_psm2_config(mq, epaddr, &config); 
+ if (rv == PSM2_OK) + { + *((uint32_t*)out) = 0; + /* Delegate the call to the ptl member function: */ + rv = epaddr->ptlctl->msg_size_thresh_query(iqt, (uint32_t*)out, mq, epaddr); + } + } + break; + case PSM2_INFO_QUERY_DEVICE_NAME: + { + char *hfiName = (char*)out; + psm2_mq_t mq = args[0].mq; + psm2_epaddr_t epaddr = args[1].epaddr; + size_t hfiNameLength = args[2].length; + uint32_t config; + + rv = psmi_get_psm2_config(mq, epaddr, &config); + if (rv == PSM2_OK) + { + if (snprintf(hfiName, hfiNameLength, "%s_%d", + psmi_hal_get_hfi_name(), + psmi_hal_get_unit_id(mq->ep->context.psm_hw_ctxt)) + < hfiNameLength) + rv = PSM2_OK; + } + } + break; + case PSM2_INFO_QUERY_MTU: + { + psm2_mq_t mq = args[0].mq; + psm2_epaddr_t epaddr = args[1].epaddr; + uint32_t config; + + rv = psmi_get_psm2_config(mq, epaddr, &config); + if (rv == PSM2_OK) + { + *((uint32_t*)out) = mq->ep->mtu; + } + } + break; + case PSM2_INFO_QUERY_LINK_SPEED: + { + psm2_mq_t mq = args[0].mq; + psm2_epaddr_t epaddr = args[1].epaddr; + uint32_t config; + + rv = psmi_get_psm2_config(mq, epaddr, &config); + if (rv == PSM2_OK) + { + *((uint32_t*)out) = psmi_hal_get_port_rate(psmi_hal_get_unit_id(mq->ep->context.psm_hw_ctxt), + psmi_hal_get_port_num(mq->ep->context.psm_hw_ctxt)); + } + } + break; + case PSM2_INFO_QUERY_NETWORK_TYPE: + { + char *networkType = (char*)out; + size_t networkTypeLength = args[0].length; + const char *const intelopa = "Intel(R) OPA"; + if (networkTypeLength >= strlen(intelopa)+1) + { + strcpy(networkType,intelopa); + rv = PSM2_OK; + } + } + + break; + default: + break; + } + + return rv; +} +PSMI_API_DECL(psm2_info_query) + +uint64_t __psm2_get_capability_mask(uint64_t req_cap_mask) +{ + return (psm2_capabilities_bitset & req_cap_mask); +} +PSMI_API_DECL(psm2_get_capability_mask) + +psm2_error_t __psm2_finalize(void) +{ + struct psmi_eptab_iterator itor; + char *hostname; + psm2_ep_t ep; + + PSM2_LOG_MSG("entering"); + + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + /* When PSM_PERF is enabled, the following line causes the + instruction cycles gathered in the current run to be dumped + to stderr. */ + GENERIC_PERF_DUMP(stderr); + ep = psmi_opened_endpoint; + while (ep != NULL) { + psmi_opened_endpoint = ep->user_ep_next; + psm2_ep_close(ep, PSM2_EP_CLOSE_GRACEFUL, + 2 * PSMI_MIN_EP_CLOSE_TIMEOUT); + ep = psmi_opened_endpoint; + } + + psmi_epid_fini(); + + psmi_faultinj_fini(); + + /* De-allocate memory for any allocated space to store hostnames */ + psmi_epid_itor_init(&itor, PSMI_EP_HOSTNAME); + while ((hostname = psmi_epid_itor_next(&itor))) + psmi_free(hostname); + psmi_epid_itor_fini(&itor); + + /* unmap shared mem object for affinity */ + if (psmi_affinity_shared_file_opened) { + /* + * Start critical section to decrement ref count and unlink + * affinity shm file. 
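Aside: a hypothetical caller of the query interface above; per the expected_arg_cnt table, PSM2_INFO_QUERY_NUM_UNITS takes no args while PSM2_INFO_QUERY_UNIT_PORT_STATUS takes a unit and a port:

	#include <stdio.h>
	#include "psm2.h"

	static void show_units(void)
	{
		uint32_t nunits, active;
		psm2_info_query_arg_t args[2];

		if (psm2_info_query(PSM2_INFO_QUERY_NUM_UNITS, &nunits, 0, NULL) == PSM2_OK)
			printf("%u unit(s) found\n", nunits);

		args[0].unit = 0;
		args[1].port = 1;
		if (psm2_info_query(PSM2_INFO_QUERY_UNIT_PORT_STATUS, &active, 2, args) == PSM2_OK)
			printf("unit 0 port 1 is %s\n", active ? "active" : "inactive");
	}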
+ */ + psmi_sem_timedwait(sem_affinity_shm_rw, sem_affinity_shm_rw_name); + + shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] -= 1; + if (shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] <= 0) { + _HFI_VDBG("Unlink shm file for HFI affinity as there are no more users\n"); + shm_unlink(affinity_shm_name); + } else { + _HFI_VDBG("Number of affinity shared memory users left=%ld\n", + shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION]); + } + + msync(shared_affinity_ptr, AFFINITY_SHMEMSIZE, MS_SYNC); + + /* End critical section */ + psmi_sem_post(sem_affinity_shm_rw, sem_affinity_shm_rw_name); + + munmap(shared_affinity_ptr, AFFINITY_SHMEMSIZE); + psmi_free(affinity_shm_name); + affinity_shm_name = NULL; + psmi_affinity_shared_file_opened = 0; + } + + if (psmi_affinity_semaphore_open) { + _HFI_VDBG("Closing and Unlinking Semaphore: %s.\n", sem_affinity_shm_rw_name); + sem_close(sem_affinity_shm_rw); + sem_unlink(sem_affinity_shm_rw_name); + psmi_free(sem_affinity_shm_rw_name); + sem_affinity_shm_rw_name = NULL; + psmi_affinity_semaphore_open = 0; + } + + psmi_hal_finalize(); +#ifdef PSM_CUDA + if (is_cuda_primary_context_retain) { + CUdevice device; + PSMI_CUDA_CALL(cuCtxGetDevice, &device); + PSMI_CUDA_CALL(cuDevicePrimaryCtxRelease, device); + } +#endif + + psmi_isinit = PSMI_FINALIZED; + PSM2_LOG_MSG("leaving"); + psmi_log_fini(); + return PSM2_OK; +} +PSMI_API_DECL(psm2_finalize) + +/* + * Function exposed in >= 1.05 + */ +psm2_error_t +__psm2_map_nid_hostname(int num, const uint64_t *nids, const char **hostnames) +{ + int i; + psm2_error_t err = PSM2_OK; + + PSM2_LOG_MSG("entering"); + + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + if (nids == NULL || hostnames == NULL) { + err = PSM2_PARAM_ERR; + goto fail; + } + + for (i = 0; i < num; i++) { + if ((err = psmi_epid_set_hostname(nids[i], hostnames[i], 1))) + break; + } + +fail: + PSM2_LOG_MSG("leaving"); + return err; +} +PSMI_API_DECL(psm2_map_nid_hostname) + +void __psm2_epaddr_setlabel(psm2_epaddr_t epaddr, char const *epaddr_label) +{ + PSM2_LOG_MSG("entering"); + PSM2_LOG_MSG("leaving"); + return; /* ignore this function */ +} +PSMI_API_DECL(psm2_epaddr_setlabel) + +void __psm2_epaddr_setctxt(psm2_epaddr_t epaddr, void *ctxt) +{ + + /* Eventually deprecate this API to use set/get opt as this is unsafe. */ + PSM2_LOG_MSG("entering"); + psm2_setopt(PSM2_COMPONENT_CORE, (const void *)epaddr, + PSM2_CORE_OPT_EP_CTXT, (const void *)ctxt, sizeof(void *)); + PSM2_LOG_MSG("leaving"); +} +PSMI_API_DECL(psm2_epaddr_setctxt) + +void *__psm2_epaddr_getctxt(psm2_epaddr_t epaddr) +{ + psm2_error_t err; + uint64_t optlen = sizeof(void *); + void *result = NULL; + + PSM2_LOG_MSG("entering"); + /* Eventually deprecate this API to use set/get opt as this is unsafe. 
*/ + err = psm2_getopt(PSM2_COMPONENT_CORE, (const void *)epaddr, + PSM2_CORE_OPT_EP_CTXT, (void *)&result, &optlen); + + PSM2_LOG_MSG("leaving"); + + if (err == PSM2_OK) + return result; + else + return NULL; +} +PSMI_API_DECL(psm2_epaddr_getctxt) + +psm2_error_t +__psm2_setopt(psm2_component_t component, const void *component_obj, + int optname, const void *optval, uint64_t optlen) +{ + psm2_error_t rv; + PSM2_LOG_MSG("entering"); + switch (component) { + case PSM2_COMPONENT_CORE: + rv = psmi_core_setopt(component_obj, optname, optval, optlen); + PSM2_LOG_MSG("leaving"); + return rv; + break; + case PSM2_COMPONENT_MQ: + /* Use the deprecated MQ set/get opt for now which does not use optlen */ + rv = psm2_mq_setopt((psm2_mq_t) component_obj, optname, optval); + PSM2_LOG_MSG("leaving"); + return rv; + break; + case PSM2_COMPONENT_AM: + /* Hand off to active messages */ + rv = psmi_am_setopt(component_obj, optname, optval, optlen); + PSM2_LOG_MSG("leaving"); + return rv; + break; + case PSM2_COMPONENT_IB: + /* Hand off to IPS ptl to set option */ + rv = psmi_ptl_ips.setopt(component_obj, optname, optval, + optlen); + PSM2_LOG_MSG("leaving"); + return rv; + break; + } + + /* Unrecognized/unknown component */ + rv = psmi_handle_error(NULL, PSM2_PARAM_ERR, "Unknown component %u", + component); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_setopt); + +psm2_error_t +__psm2_getopt(psm2_component_t component, const void *component_obj, + int optname, void *optval, uint64_t *optlen) +{ + psm2_error_t rv; + + PSM2_LOG_MSG("entering"); + switch (component) { + case PSM2_COMPONENT_CORE: + rv = psmi_core_getopt(component_obj, optname, optval, optlen); + PSM2_LOG_MSG("leaving"); + return rv; + break; + case PSM2_COMPONENT_MQ: + /* Use the deprecated MQ set/get opt for now which does not use optlen */ + rv = psm2_mq_getopt((psm2_mq_t) component_obj, optname, optval); + PSM2_LOG_MSG("leaving"); + return rv; + break; + case PSM2_COMPONENT_AM: + /* Hand off to active messages */ + rv = psmi_am_getopt(component_obj, optname, optval, optlen); + PSM2_LOG_MSG("leaving"); + return rv; + break; + case PSM2_COMPONENT_IB: + /* Hand off to IPS ptl to set option */ + rv = psmi_ptl_ips.getopt(component_obj, optname, optval, + optlen); + PSM2_LOG_MSG("leaving"); + return rv; + break; + } + + /* Unrecognized/unknown component */ + rv = psmi_handle_error(NULL, PSM2_PARAM_ERR, "Unknown component %u", + component); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_getopt); + +psm2_error_t __psmi_poll_noop(ptl_t *ptl, int replyonly) +{ + PSM2_LOG_MSG("entering"); + PSM2_LOG_MSG("leaving"); + return PSM2_OK_NO_PROGRESS; +} +PSMI_API_DECL(psmi_poll_noop) + +psm2_error_t __psm2_poll(psm2_ep_t ep) +{ + psm2_error_t err1 = PSM2_OK, err2 = PSM2_OK; + psm2_ep_t tmp; + + PSM2_LOG_MSG("entering"); + + PSMI_ASSERT_INITIALIZED(); + + PSMI_LOCK(ep->mq->progress_lock); + + tmp = ep; + do { + err1 = ep->ptl_amsh.ep_poll(ep->ptl_amsh.ptl, 0); /* poll reqs & reps */ + if (err1 > PSM2_OK_NO_PROGRESS) { /* some error unrelated to polling */ + PSMI_UNLOCK(ep->mq->progress_lock); + PSM2_LOG_MSG("leaving"); + return err1; + } + + err2 = ep->ptl_ips.ep_poll(ep->ptl_ips.ptl, 0); /* get into ips_do_work */ + if (err2 > PSM2_OK_NO_PROGRESS) { /* some error unrelated to polling */ + PSMI_UNLOCK(ep->mq->progress_lock); + PSM2_LOG_MSG("leaving"); + return err2; + } + ep = ep->mctxt_next; + } while (ep != tmp); + + /* This is valid because.. 
+ * PSM2_OK & PSM2_OK_NO_PROGRESS => PSM2_OK + * PSM2_OK & PSM2_OK => PSM2_OK + * PSM2_OK_NO_PROGRESS & PSM2_OK => PSM2_OK + * PSM2_OK_NO_PROGRESS & PSM2_OK_NO_PROGRESS => PSM2_OK_NO_PROGRESS */ + PSMI_UNLOCK(ep->mq->progress_lock); + PSM2_LOG_MSG("leaving"); + return (err1 & err2); +} +PSMI_API_DECL(psm2_poll) + +psm2_error_t __psmi_poll_internal(psm2_ep_t ep, int poll_amsh) +{ + psm2_error_t err1 = PSM2_OK_NO_PROGRESS; + psm2_error_t err2; + psm2_ep_t tmp; + + PSM2_LOG_MSG("entering"); + PSMI_LOCK_ASSERT(ep->mq->progress_lock); + + tmp = ep; + do { + if (poll_amsh) { + err1 = ep->ptl_amsh.ep_poll(ep->ptl_amsh.ptl, 0); /* poll reqs & reps */ + if (err1 > PSM2_OK_NO_PROGRESS) { /* some error unrelated to polling */ + PSM2_LOG_MSG("leaving"); + return err1; + } + } + + err2 = ep->ptl_ips.ep_poll(ep->ptl_ips.ptl, 0); /* get into ips_do_work */ + if (err2 > PSM2_OK_NO_PROGRESS) { /* some error unrelated to polling */ + PSM2_LOG_MSG("leaving"); + return err2; + } + + ep = ep->mctxt_next; + } while (ep != tmp); + PSM2_LOG_MSG("leaving"); + return (err1 & err2); +} +PSMI_API_DECL(psmi_poll_internal) +#ifdef PSM_PROFILE +/* These functions each have weak symbols */ +void psmi_profile_block() +{ + ; /* empty for profiler */ +} + +void psmi_profile_unblock() +{ + ; /* empty for profiler */ +} + +void psmi_profile_reblock(int did_no_progress) +{ + ; /* empty for profiler */ +} +#endif diff --git a/psm2.h b/psm2.h new file mode 100644 index 0000000..fa0ec20 --- /dev/null +++ b/psm2.h @@ -0,0 +1,1748 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2017 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2017 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef PSM2_H
+#define PSM2_H
+
+#include <stdint.h>
+#include <uuid/uuid.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ * @file psm2.h
+ * @page psm2_main PSM2 API
+ *
+ * @brief PSM2 OPA Messaging Library
+ *
+ * The PSM2 OPA Messaging API, or PSM2 API, is Intel's low-level
+ * user-level communications interface for the OPA family of products.
+ * PSM2 users are enabled with mechanisms necessary to implement higher level
+ * communications interfaces in parallel environments.
+ *
+ * Since PSM2 targets clusters of multicore processors, it internally implements
+ * two levels of communication: intra-node shared memory communication and
+ * inter-node OPA communication.  Both of these levels are encapsulated
+ * below the interface and the user is free to assume that intra-node and
+ * inter-node communication is transparently handled within PSM.
+ *
+ * @section compat Compatibility
+ *
+ * PSM2 can coexist with other QLogic/Pathscale software distributions, such as
+ * OpenIB/OpenFabrics, which allows applications to simultaneously target
+ * PSM-based and non PSM-based applications on a single node without changing
+ * any system-level configuration.  However, PSM2 does not support running
+ * PSM-based and non PSM-based communication within the same user process.
+ *
+ * Except where noted, PSM2 does not assume an SPMD (single program, multiple
+ * data) parallel model and extends to MPMD (multiple program, multiple data)
+ * environments in specific areas. However, PSM2 assumes the runtime environment
+ * to be homogeneous on all nodes in bit width (32-bit or 64-bit) and endianness
+ * (little or big) and will fail at startup if any of these assumptions do not
+ * hold.  For homogeneous systems PSM2 can run either in 32-bit or 64-bit
+ * environments.  Even though both environments should expect similar
+ * performance from the API, PSM2 has chosen to favor 64-bit environments in
+ * some minor areas.
+ *
+ * @section ep_model Endpoint Communication Model
+ *
+ * PSM2 follows an endpoint communication model where an endpoint is defined as
+ * an object (or handle) instantiated to support sending and receiving messages
+ * to other endpoints.  In order to prevent PSM2 from being tied to a particular
+ * parallel model (such as SPMD), control over the parallel layout of endpoints
+ * is retained by the user.  Opening endpoints (@ref psm2_ep_open) and
+ * connecting endpoints to enable communication (@ref psm2_ep_connect) are two
+ * decoupled mechanisms.  Users that do not dynamically change the number of
+ * endpoints beyond parallel startup will probably lump both mechanisms
+ * together at startup.  Users that wish to manipulate the location and number
+ * of endpoints at runtime can do so by explicitly connecting sets or subsets
+ * of endpoints.
+ *
+ * As a side effect, this greater flexibility forces the user to cope with a
+ * two-stage initialization process.
In the first stage of opening an endpoint
+ * (@ref psm2_ep_open), a user obtains an opaque handle to the endpoint and a
+ * globally distributable endpoint identifier (@ref psm2_epid_t).  Prior to the
+ * second stage of connecting endpoints (@ref psm2_ep_connect), a user must
+ * distribute all relevant endpoint identifiers through an out-of-band
+ * mechanism.  Once the endpoint identifiers are successfully distributed to
+ * all processes that wish to communicate, the user
+ * connects all endpoint identifiers to the locally opened endpoint
+ * (@ref psm2_ep_connect).  In connecting the endpoints, the user obtains an
+ * opaque endpoint address (@ref psm2_epaddr_t), which is required for all PSM
+ * communication primitives.
+ *
+ *
+ * @section components PSM2 Components
+ *
+ * PSM2 exposes a single endpoint initialization model, but enables various
+ * levels of communication functionality and semantics through @e components.
+ * The first major component available in PSM2 is PSM2 Matched Queues
+ * (@ref psm2_mq), and the second is PSM2 Active Message (@ref psm2_am).
+ *
+ * Matched Queues (MQ) present a queue-based communication model with the
+ * distinction that queue consumers use a 3-tuple of metadata to match incoming
+ * messages against a list of preposted receive buffers.  The MQ semantics are
+ * sufficiently akin to MPI to cover the entire MPI-1.2 standard.
+ *
+ * The Active Message (AM) component presents a request/reply model where
+ * the arrival of a message triggers the execution of consumer-provided
+ * handler code. This can be used to implement many one-sided and two-sided
+ * communications paradigms.
+ *
+ * With future releases of the PSM2 interface, more components will
+ * be exposed to accommodate users that implement parallel communication
+ * models that deviate from the Matched Queue semantics.  For example, PSM
+ * plans to expose a connection management component to make it easier to
+ * handle endpoint management for clients without their own connection
+ * managers.
+ *
+ *
+ * @section progress PSM2 Communication Progress Guarantees
+ *
+ * PSM2 internally ensures progress of both intra-node and inter-node messages,
+ * but not autonomously.  This means that while performance does not depend
+ * greatly on how the user decides to schedule communication progress,
+ * explicit progress calls are required for correctness.  The @ref psm2_poll
+ * function is available to make progress over all PSM2 components in a generic
+ * manner.  For more information on making progress over many communication
+ * operations in the MQ component, see the @ref mq_progress documentation.
+ *
+ *
+ * @section completion PSM2 Completion semantics
+ *
+ * PSM2 implements the MQ component, which documents its own
+ * message completion semantics (@ref mq_completion).
+ *
+ *
+ * @section error_handling PSM2 Error handling
+ *
+ * PSM2 exposes a list of user and runtime errors enumerated in @ref psm2_error.
+ * While most errors are fatal in that the user is not expected to be able to
+ * recover from them, PSM2 still allows some level of control.  By
+ * default, PSM2 returns all errors to the user but as a convenience, allows
+ * users to either defer errors internally to PSM2 or to have PSM2 return all
+ * errors to the user (callers to PSM2 functions).  PSM2 attempts to deallocate
+ * its resources as a best effort, but exits are always non-collective with
+ * respect to endpoints opened in other processes.
The user is expected to be
+ * able to handle non-collective exits from any endpoint and in turn cleanly
+ * and independently terminate the parallel environment.
+ *
+ * Errors and error handling can be individually registered either globally or
+ * per-endpoint:
+ * @li @b Per-endpoint error handling captures errors for functions where the
+ * error scoping is determined to be over an endpoint.  This includes all
+ * communication functions that include an EP or MQ handle as the first
+ * parameter.
+ *
+ * @li @b Global error handling captures errors for functions where a
+ * particular endpoint cannot be identified or for @ref psm2_ep_open, where
+ * errors (if any) occur before the endpoint is opened.
+ *
+ * Error handling is controlled by registering error handlers (@ref
+ * psm2_error_register_handler).  The global error handler can
+ * be set at any time (even before @ref psm2_init), whereas a per-endpoint error
+ * handler can be set as soon as a new endpoint is successfully created.  If a
+ * per-endpoint handler is not registered, the per-endpoint handler inherits
+ * from the global error handler at time of open.
+ *
+ * PSM2 predefines two different mechanisms for handling errors:
+ *
+ * @li PSM-internal error handler (@ref PSM2_ERRHANDLER_PSM_HANDLER)
+ * @li No-op PSM2 error handler where errors are returned
+ *     (@ref PSM2_ERRHANDLER_NO_HANDLER)
+ *
+ * The default PSM-internal error handler effectively frees the user from
+ * explicitly handling the return values of every PSM2 function but may not
+ * return to the user in a function determined to have caused a fatal error.
+ *
+ * The No-op PSM2 error handler bypasses all error handling functionality and
+ * always returns the error to the user.  The user can then use @ref
+ * psm2_error_get_string to obtain a generic string from an error code (compared
+ * to a more detailed error message available through registering of error
+ * handlers).
+ *
+ * For even more control, users can register their own error handlers to have
+ * access to more precise error strings and selectively control when and when
+ * not to return to callers of PSM2 functions.  Custom error handlers should
+ * defer error handling to PSM2, via @ref psm2_error_defer, for errors they do
+ * not recognize.  Deferring an error from a custom error handler is
+ * equivalent to relying on the default error handler.
+ *
+ * @section env_var Environment variables
+ *
+ * Some PSM2 behaviour can be controlled via environment variables.
+ *
+ * @li @b PSM2_DEVICES. PSM2 implements three devices for communication which
+ * are, in order, @c self, @c shm and @c hfi.  For PSM2 jobs that do not
+ * require shared-memory communications, @b PSM2_DEVICES can be specified as @c
+ * self, @c hfi.  Similarly, for shared-memory only jobs, the @c hfi device
+ * can be disabled.  It is up to the user to ensure that the endpoint ids
+ * passed in @ref psm2_ep_connect do not require a device that has been
+ * explicitly disabled by the user.  In some instances, enabling only the
+ * devices that are required may improve performance.
+ *
+ * @li @b PSM2_TRACEMASK. Depending on the value of the tracemask, various parts
+ * of PSM2 will output debugging information.  With a default value of @c 0x1,
+ * informative messages will be printed (this value should be considered a
+ * minimum).  At @c 0x101, startup and finalization messages are added to the
+ * output.
At @c 0x1c3, every communication event is logged and should hence
+ * be used for extreme debugging only.
+ *
+ * @li @b PSM2_MULTI_EP. By default, only one PSM2 endpoint may be opened in
+ * a process. With the correct setting of this environment variable, a process
+ * may open more than one PSM2 endpoint. In order to enable multiple endpoints
+ * per process, the value of this environment variable should be set
+ * to "1" or "yes".
+ *
+ * @section thr_sfty Thread safety and reentrancy
+ * Unless specifically noted otherwise, PSM2 functions should not be considered
+ * thread safe or reentrant.
+ */
+
+/** @brief Local endpoint handle (opaque)
+ * @ingroup ep
+ *
+ * Handle returned to the user when a new local endpoint is created.  The
+ * handle is a local handle to be used in all communication functions and is
+ * not intended to globally identify the opened endpoint in any way.
+ *
+ * All open endpoint handles can be globally identified using the endpoint id
+ * integral type (@ref psm2_epid_t) and all communication must use an endpoint
+ * address (@ref psm2_epaddr_t) that can be obtained by connecting a local
+ * endpoint to one or more endpoint identifiers.
+ *
+ * @remark The local endpoint handle is opaque to the user. */
+typedef struct psm2_ep *psm2_ep_t;
+
+/** @brief MQ handle (opaque)
+ * @ingroup mq
+ *
+ * Handle returned to the user when a new Matched queue is created (@ref
+ * psm2_mq_init). */
+typedef struct psm2_mq *psm2_mq_t;
+
+/*! @defgroup init PSM2 Initialization and Maintenance
+ * @{
+ */
+#define PSM2_VERNO       0x0201	/*!< Header-defined Version number */
+#define PSM2_VERNO_MAJOR 0x02	/*!< Header-defined Major Version Number */
+#define PSM2_VERNO_MINOR 0x01	/*!< Header-defined Minor Version Number */
+#define PSM2_VERNO_COMPAT_MAJOR 0x01 /*!< Header-defined Compatibility Major Version Number */
+
+/** @brief Initialize the PSM2 interface
+ *
+ * @param[in,out] api_verno_major As input, a pointer to an integer holding
+ *                                @ref PSM2_VERNO_MAJOR. As output, the pointer
+ *                                is updated with the major revision number of
+ *                                the loaded library.
+ * @param[in,out] api_verno_minor As input, a pointer to an integer holding
+ *                                @ref PSM2_VERNO_MINOR. As output, the pointer
+ *                                is updated with the minor revision number of
+ *                                the loaded library.
+ *
+ * @code{.c}
+   int try_to_initialize_psm2()
+   {
+	int verno_major = PSM2_VERNO_MAJOR;
+	int verno_minor = PSM2_VERNO_MINOR;
+
+	int err = psm2_init(&verno_major, &verno_minor);
+	if (err || verno_major > PSM2_VERNO_MAJOR) {
+	  if (err)
+		fprintf(stderr, "PSM2 initialization failure: %s\n",
+			psm2_error_get_string(err));
+	  else
+		fprintf(stderr, "PSM2 loaded an unexpected/unsupported "
+			"version (%d.%d)\n", verno_major, verno_minor);
+	  return -1;
+	}
+
+	// We were able to initialize PSM2 but will defer all further error
+	// handling since most of the errors beyond this point will be fatal.
+	err = psm2_error_register_handler(NULL, // Global handler
+					  PSM2_ERRHANDLER_PSM_HANDLER);
+	if (err) {
+		fprintf(stderr, "Couldn't register global errhandler: %s\n",
+			psm2_error_get_string(err));
+		return -1;
+	}
+	return 1;
+   }
+   @endcode
+ */
+psm2_error_t psm2_init(int *api_verno_major, int *api_verno_minor);
+
+/*! @brief PSM2 capabilities definitions
+ *
+ * Each capability is defined as a separate bit, i.e. any new capability must
+ * be defined as the next consecutive bit: 0x2, 0x4 ... and so on.
+ */
+#define PSM2_MULTI_EP_CAP 0x1	/* Multiple Endpoints capability */
+
+/** @brief PSM2 capabilities provider
+ *
+ * @param[in] req_cap_mask Requested capabilities are given as bit field.
+ *
+ * @returns internal capabilities bit field ANDed with a requested bit mask */
+uint64_t psm2_get_capability_mask(uint64_t req_cap_mask);
+
+/** @brief Finalize PSM2 interface
+ *
+ * Single call to finalize PSM2 and close all unclosed endpoints
+ *
+ * @post The user guarantees not to make any further PSM2 calls, including @ref
+ * psm2_init.
+ *
+ * @returns PSM2_OK Always returns @c PSM2_OK */
+psm2_error_t psm2_finalize(void);
+
+/** @brief Error handling opaque token
+ *
+ * A token is required for users that register their own handlers and wish to
+ * defer further error handling to PSM.
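+ *
+ * A minimal sketch of a custom handler that logs the error and then defers
+ * the rest of the handling to PSM2 (the handler name and log format are
+ * illustrative, not part of the API):
+ * @code{.c}
+   psm2_error_t my_errhandler(psm2_ep_t ep, const psm2_error_t error,
+                              const char *error_string,
+                              psm2_error_token_t token)
+   {
+	fprintf(stderr, "PSM2 error on ep %p: %s\n", (void *)ep, error_string);
+	return psm2_error_defer(token); // let PSM2 finish handling the error
+   }
+   // ... registered with: psm2_error_register_handler(NULL, my_errhandler);
+   @endcode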
*/ +typedef struct psm2_error_token *psm2_error_token_t; + +/** @brief Error handling function + * + * Users can handle errors explicitly instead of relying on PSM's own error + * handler. There is one global error handler and error handlers that can be + * individually set for each opened endpoint. By default, endpoints will + * inherit the global handler registered at the time of open. + * + * @param[in] ep Handle associated to the endpoint over which the error occurred + * or @c NULL if the error is being handled by the global error + * handler. + * @param[in] error PSM2 error identifier + * @param[in] error_string A descriptive error string of maximum length @ref + * PSM2_ERRSTRING_MAXLEN. + * @param[in] token Opaque PSM2 token associated with the particular event that + * generated the error. The token can be used to extract the + * error string and can be passed to @ref psm2_error_defer to + * defer any remaining or unhandled error handling to PSM. + * + * @post If the error handler returns, the error returned is propagated to the + * caller. */ +typedef psm2_error_t(*psm2_ep_errhandler_t) (psm2_ep_t ep, + const psm2_error_t error, + const char *error_string, + psm2_error_token_t token); + +#define PSM2_ERRHANDLER_DEFAULT ((psm2_ep_errhandler_t)-1) +/**< Obsolete names, only here for backwards compatibility */ +#define PSM2_ERRHANDLER_NOP ((psm2_ep_errhandler_t)-2) +/**< Obsolete names, only here for backwards compatibility */ + +#define PSM2_ERRHANDLER_PSM_HANDLER ((psm2_ep_errhandler_t)-1) +/**< PSM2 error handler as explained in @ref error_handling */ + +#define PSM2_ERRHANDLER_NO_HANDLER ((psm2_ep_errhandler_t)-2) +/**< Bypasses the default PSM2 error handler and returns all errors to the user + * (this is the default) */ + +#define PSM2_ERRSTRING_MAXLEN 512 /**< Maximum error string length. */ + +/** @brief PSM2 error handler registration + * + * Function to register error handlers on a global basis and on a per-endpoint + * basis. PSM2_ERRHANDLER_PSM_HANDLER and PSM2_ERRHANDLER_NO_HANDLER are special + * pre-defined handlers to respectively enable use of the default PSM-internal + * handler or the no-handler that disables registered error handling and + * returns all errors to the caller (both are documented in @ref + * error_handling). + * + * @param[in] ep Handle of the endpoint over which the error handler should be + * registered. With ep set to @c NULL, the behavior of the + * global error handler can be controlled. + * @param[in] errhandler Handler to register. Can be a user-specific error + * handling function or PSM2_ERRHANDLER_PSM_HANDLER or + * PSM2_ERRHANDLER_NO_HANDLER. + * + * @remark When ep is set to @c NULL, this is the only function that can be + * called before @ref psm2_init + */ +psm2_error_t +psm2_error_register_handler(psm2_ep_t ep, const psm2_ep_errhandler_t errhandler); + +/** @brief PSM2 deferred error handler + * + * Function to handle fatal PSM2 errors if no error handler is installed or if + * the user wishes to defer further error handling to PSM. Depending on the + * type of error, PSM2 may or may not return from the function call. + * + * @param[in] err_token Error token initially passed to error handler + * + * @pre The user is calling into the function because it has decided that PSM + * should handle an error case. 
+ * + * @post The function may or may not return depending on the error + */ +psm2_error_t psm2_error_defer(psm2_error_token_t err_token); + +/** @brief Get generic error string from error + * + * Function to return the default error string associated to a PSM2 error. + * + * While a more detailed and precise error string is usually available within + * error handlers, this function is available to obtain an error string out of + * an error handler context or when a no-op error handler is registered. + * + * @param[in] error PSM2 error + */ +const char *psm2_error_get_string(psm2_error_t error); + +/** @brief Option key/pair structure + * + * Currently only used in MQ. + */ +struct psm2_optkey { + uint32_t key; /**< Option key */ + void *value; /**< Option value */ +}; + +/*! @} */ + +/*! @defgroup ep PSM2 Device Endpoint Management + * @{ + */ + +/** @brief Endpoint ID + * + * Integral type of size 8 bytes that can be used by the user to globally + * identify a successfully opened endpoint. Although the contents of the + * endpoint id integral type remains opaque to the user, unique network id and + * OPA port number can be extracted using @ref psm2_epid_nid and @ref + * psm2_epid_context. + */ +typedef uint64_t psm2_epid_t; + +/** @brief Endpoint Address (opaque) + * + * Remote endpoint addresses are created when the user binds an endpoint ID + * to a particular endpoint handle using @ref psm2_ep_connect. A given endpoint + * address is only guaranteed to be valid over a single endpoint. + */ +typedef struct psm2_epaddr *psm2_epaddr_t; + +/** @brief PSM2 Unique UID + * + * PSM2 type equivalent to the DCE-1 uuid_t, used to uniquely identify an + * endpoint within a particular job. Since PSM2 does not participate in job + * allocation and management, users are expected to generate a unique ID to + * associate endpoints to a particular parallel or collective job. + * @see psm2_uuid_generate + */ +typedef uint8_t psm2_uuid_t[16]; + +/** @brief Get Endpoint identifier's Unique Network ID */ +uint64_t psm2_epid_nid(psm2_epid_t epid); + +/** @brief Get Endpoint identifier's OPA context number */ +uint64_t psm2_epid_context(psm2_epid_t epid); + +/** @brief Get Endpoint identifier's OPA port (deprecated, use + * @ref psm2_epid_context instead) */ +uint64_t psm2_epid_port(psm2_epid_t epid); + +/** @brief List the number of available OPA units + * + * Function used to determine the number of locally available OPA units. + * For @c N units, valid unit numbers in @ref psm2_ep_open are @c 0 to @c N-1. + * + * @returns PSM2_OK unless the user has not called @ref psm2_init + */ +psm2_error_t psm2_ep_num_devunits(uint32_t *num_units); + +/** @brief Utility to generate UUIDs for @ref psm2_ep_open + * + * This function is available as a utility for generating unique job-wide ids. + * See discussion in @ref psm2_ep_open for further information. + * + * @remark This function does not require PSM2 to be initialized. 
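+ *
+ * A minimal usage sketch (assuming the generated key is then distributed to
+ * all processes through an out-of-band channel):
+ * @code{.c}
+   psm2_uuid_t job_key;
+   psm2_uuid_generate(job_key);
+   // broadcast job_key out-of-band, then pass it to psm2_ep_open()
+   @endcode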
+ */ +void psm2_uuid_generate(psm2_uuid_t uuid_out); + +/* Affinity modes for the affinity member of struct psm2_ep_open_opts */ +#define PSM2_EP_OPEN_AFFINITY_SKIP 0 /**< Disable setting affinity */ +#define PSM2_EP_OPEN_AFFINITY_SET 1 /**< Enable setting affinity unless + already set */ +#define PSM2_EP_OPEN_AFFINITY_FORCE 2 /**< Enable setting affinity regardless + of current affinity setting */ + +/* Default values for some constants */ +#define PSM2_EP_OPEN_PKEY_DEFAULT 0xffffffffffffffffULL + /**< Default protection key */ + +/** @brief Endpoint Open Options + * + * These options are available for opening a PSM2 endpoint. Each is + * individually documented and setting each option to -1 or passing NULL as the + * options parameter in @ref psm2_ep_open instructs PSM2 to use + * implementation-defined defaults. + * + * Each option is documented in @ref psm2_ep_open + */ +struct psm2_ep_open_opts { + int64_t timeout; /**< timeout in nanoseconds to open device */ + int unit; /**< OPA Unit ID to open on */ + int affinity; /**< How PSM2 should set affinity */ + int shm_mbytes; /**< Megabytes used for intra-node, deprecated */ + int sendbufs_num; /**< Preallocated send buffers */ + uint64_t network_pkey; /**< Network Protection Key (v1.01) */ + int port; /**< IB port to use (1 to N) */ + int outsl; /**< IB SL to use when sending pkts */ + uint64_t service_id; /* IB Service ID to use for endpoint */ + psm2_path_res_t path_res_type; /* Path resolution type */ + int senddesc_num; /* Preallocated send descriptors */ + int imm_size; /* Immediate data size for endpoint */ +}; + +/** @brief OPA endpoint creation + * + * Function used to create a new local communication endpoint on an OPA + * adapter. The returned endpoint handle is required in all PSM2 communication + * operations, as PSM2 can manage communication over multiple endpoints. An + * opened endpoint has no global context until the user connects the endpoint + * to other global endpoints by way of @ref psm2_ep_connect. All local endpoint + * handles are globally identified by endpoint IDs (@ref psm2_epid_t) which are + * also returned when an endpoint is opened. It is assumed that the user can + * provide an out-of-band mechanism to distribute the endpoint IDs in order to + * establish connections between endpoints (@ref psm2_ep_connect for more + * information). + * + * @param[in] unique_job_key Endpoint key, to uniquely identify the endpoint in + * a parallel job. It is up to the user to ensure + * that the key is globally unique over a period long + * enough to prevent duplicate keys over the same set + * of endpoints (see comments below). + * + * @param[in] opts Open options of type @ref psm2_ep_open_opts + * (see @ref psm2_ep_open_opts_get_defaults). + * + * @param[out] ep User-supplied storage to return a pointer to the newly + * created endpoint. The returned pointer of type @ref psm2_ep_t + * is a local handle and cannot be used to globally identify the + * endpoint. + * @param[out] epid User-supplied storage to return the endpoint ID associated + * to the newly created local endpoint returned in the @c ep + * handle. The endpoint ID is an integral type suitable for + * uniquely identifying the local endpoint. + * + * PSM2 does not internally verify the consistency of the uuid, it is up to the + * user to ensure that the uid is unique enough not to collide with other + * currently-running jobs. Users can employ three mechanisms to obtain a uuid. + * + * 1. Use the supplied @ref psm2_uuid_generate utility + * + * 2. 
Use an OS or library-specific uuid generation utility that complies with
+ *    OSF DCE 1.1, such as @c uuid_generate on Linux or @c uuid_create on
+ *    FreeBSD.
+ *    (see http://www.opengroup.org/onlinepubs/009629399/uuid_create.htm)
+ *
+ * 3. Manually pack a 16-byte string using a utility such as /dev/random or
+ *    other source with enough entropy and proper seeding to prevent two nodes
+ *    from generating the same uuid_t.
+ *
+ * The following options are relevant when opening an endpoint:
+ *   @li @c timeout establishes the number of nanoseconds to wait before
+ *                   failing to open a port (with -1, defaults to 15 secs).
+ *   @li @c unit sets the OPA unit number to use to open a port (with
+ *               -1, PSM2 determines the best unit to open the port).  If @c
+ *               HFI_UNIT is set in the environment, this setting is ignored.
+ *   @li @c affinity enables or disables PSM2 setting processor affinity.  The
+ *                   option can be controlled to either disable (@ref
+ *                   PSM2_EP_OPEN_AFFINITY_SKIP) or enable the affinity setting
+ *                   only if it is already unset (@ref
+ *                   PSM2_EP_OPEN_AFFINITY_SET) or regardless of affinity being
+ *                   set or not (@ref PSM2_EP_OPEN_AFFINITY_FORCE).
+ *                   If @c HFI_NO_CPUAFFINITY is set in the environment, this
+ *                   setting is ignored.
+ *   @li @c shm_mbytes sets a maximum number of megabytes that can be allocated
+ *                     to each local endpoint ID connected through this
+ *                     endpoint (with -1, defaults to 10 MB).
+ *   @li @c sendbufs_num sets the number of send buffers that can be
+ *                       pre-allocated for communication (with -1, defaults to
+ *                       512 buffers of MTU size).
+ *   @li @c network_pkey sets the protection key to employ for point-to-point
+ *                       PSM2 communication.  Unless a specific value is used,
+ *                       this parameter should be set to
+ *                       PSM2_EP_OPEN_PKEY_DEFAULT.
+ *
+ * @warning By default, PSM2 limits the user to calling @ref psm2_ep_open only
+ * once per process and subsequent calls will fail. In order to enable creation
+ * of multiple endpoints per process, one must properly set the environment variable
+ * @ref PSM2_MULTI_EP before calling @ref psm2_init.
+ *
+ * @code{.c}
+   // In order to open an endpoint and participate in a job, each endpoint has
+   // to be distributed a unique 16-byte UUID key from an out-of-band source.
+   // Presumably this can come from the parallel spawning utility either
+   // indirectly through an implementor's own spawning interface or as in this
+   // example, the UUID is set as a string in an environment variable
+   // propagated to all endpoints in the job.
+
+   int try_to_open_psm2_endpoint(psm2_ep_t *ep, // output endpoint handle
+                                 psm2_epid_t *epid, // output endpoint identifier
+                                 int unit,  // unit of our choice
+                                 int port)  // port of our choice
+   {
+       struct psm2_ep_open_opts epopts;
+       psm2_uuid_t job_uuid;
+       char *c;
+
+       // Let PSM2 assign its default values to the endpoint options.
+       psm2_ep_open_opts_get_defaults(&epopts);
+
+       // We want a stricter timeout and a specific unit
+       epopts.timeout = 15*1e9;  // 15 second timeout
+       epopts.unit = unit;	// We want a specific unit, -1 would let PSM
+				// choose the unit for us.
+       epopts.port = port;	// We want a specific port, <= 0 would let PSM
+				// choose the port for us.
+       // We've already set affinity, don't let PSM2 do so if it wants to.
+       if (epopts.affinity == PSM2_EP_OPEN_AFFINITY_SET)
+           epopts.affinity = PSM2_EP_OPEN_AFFINITY_SKIP;
+
+       // ENDPOINT_UUID is set to the same value in the environment of all the
+       // processes that wish to communicate over PSM2 and was generated by
+       // the process spawning utility
+       c = getenv("ENDPOINT_UUID");
+       if (c && *c)
+           implementor_string_to_16byte_packing(c, job_uuid);
+       else {
+           fprintf(stderr, "Can't find UUID for endpoint\n");
+           return -1;
+       }
+
+       // Assume we don't want to handle errors here.
+       psm2_ep_open(job_uuid, &epopts, ep, epid);
+       return 1;
+   }
+   @endcode
+ */
+psm2_error_t
+psm2_ep_open(const psm2_uuid_t unique_job_key,
+	     const struct psm2_ep_open_opts *opts, psm2_ep_t *ep,
+	     psm2_epid_t *epid);
+
+/** @brief Endpoint open default options.
+ *
+ * Function used to initialize the set of endpoint options to their default
+ * values for use in @ref psm2_ep_open.
+ *
+ * @param[out] opts Endpoint Open options.
+ *
+ * @warning For portable operation, users should always call this function
+ * prior to calling @ref psm2_ep_open.
+ *
+ * @return PSM2_OK If result could be updated
+ * @return PSM2_INIT_NOT_INIT If psm has not been initialized.
+ */
+psm2_error_t
+psm2_ep_open_opts_get_defaults(struct psm2_ep_open_opts *opts);
+
+/** @brief Endpoint shared memory query
+ *
+ * Function used to determine if a remote endpoint shares memory with a
+ * currently opened local endpoint.
+ *
+ * @param[in] ep Endpoint handle
+ * @param[in] epid Endpoint ID
+ *
+ * @param[out] result Result is non-zero if the remote endpoint shares memory with the local
+ *                    endpoint @c ep, or zero otherwise.
+ *
+ * @return PSM2_OK If result could be updated
+ * @return PSM2_EPID_UNKNOWN If the epid is not recognized
+ */
+psm2_error_t
+psm2_ep_epid_share_memory(psm2_ep_t ep, psm2_epid_t epid, int *result);
+
+/** @brief Close endpoint
+ * @param[in] ep PSM2 endpoint handle
+ * @param[in] mode One of @ref PSM2_EP_CLOSE_GRACEFUL or @ref PSM2_EP_CLOSE_FORCE
+ * @param[in] timeout How long to wait in nanoseconds if mode is
+ *                    PSM2_EP_CLOSE_GRACEFUL, 0 waits forever.  If @c mode is
+ *                    @ref PSM2_EP_CLOSE_FORCE, this parameter is ignored.
+ *
+ * The following errors are returned, others are handled by the per-endpoint
+ * error handler:
+ *
+ * @return PSM2_OK Endpoint was successfully closed without force or
+ *                 successfully closed with force within the supplied timeout.
+ * @return PSM2_EP_CLOSE_TIMEOUT Endpoint could not be successfully closed
+ *                               within timeout.
+ */
+psm2_error_t psm2_ep_close(psm2_ep_t ep, int mode, int64_t timeout);
+
+#define PSM2_EP_CLOSE_GRACEFUL	0 /**< Graceful mode in @ref psm2_ep_close */
+#define PSM2_EP_CLOSE_FORCE	1 /**< Forceful mode in @ref psm2_ep_close */
+
+/** @brief Provide mappings for network id to hostname
+ *
+ * Since PSM2 does not assume or rely on the availability of an external
+ * networkid-to-hostname mapping service, users can provide one or more of
+ * these mappings.  The @ref psm2_map_nid_hostname function allows a list of
+ * network ids to be associated to hostnames.
+ *
+ * This function is not mandatory for correct operation but may allow PSM2 to
+ * provide better diagnostics when remote endpoints are unavailable and can
+ * otherwise only be identified by their network id.
+ *
+ * @param[in] num Number of elements in @c nid and @c hostnames arrays
+ * @param[in] nids User-provided array of network ids (i.e. OPA LIDs),
+ *                 should be obtained by calling @ref psm2_epid_nid on each
+ *                 epid.
+ * @param[in] hostnames User-provided array of hostnames (array of
+ *                      NUL-terminated strings) where each hostname index
+ *                      maps to the provided nid hostname.
+ *
+ * @warning Duplicate nids may be provided in the input @c nids array, only
+ *          the first corresponding hostname will be remembered.
+ *
+ * @pre The user may or may not have already provided hostname mappings.
+ * @post The user may free any dynamically allocated memory passed to the
+ *       function.
+ *
+ */
+psm2_error_t
+psm2_map_nid_hostname(int num, const uint64_t *nids, const char **hostnames);
+
+/** @brief Connect one or more remote endpoints to a local endpoint
+ *
+ * Function to non-collectively establish a connection to a set of endpoint IDs
+ * and translate endpoint IDs into endpoint addresses.  Establishing a remote
+ * connection with a set of remote endpoint IDs does not imply a collective
+ * operation and the user is free to connect unequal sets on each process.
+ * Similarly, a given endpoint address does not imply that a pairwise
+ * communication context exists between the local endpoint and remote endpoint.
+ *
+ * @param[in] ep PSM2 endpoint handle
+ *
+ * @param[in] num_of_epid The number of endpoints to connect to, which
+ *                        also establishes the number of elements contained in
+ *                        all of the function's array-based parameters.
+ *
+ * @param[in] array_of_epid User-allocated array that contains @c num_of_epid
+ *                          valid endpoint identifiers.  Each endpoint id (or
+ *                          epid) has been obtained through an out-of-band
+ *                          mechanism and each endpoint must have been opened
+ *                          with the same uuid key.
+ *
+ * @param[in] array_of_epid_mask User-allocated array that contains
+ *                               @c num_of_epid integers.  This array of masks
+ *                               allows users to select which of the epids in @c
+ *                               array_of_epid should be connected.  If the integer
+ *                               at index i is zero, psm does not attempt to connect
+ *                               to the epid at index i in @c array_of_epid.  If
+ *                               this parameter is NULL, psm will try to connect to
+ *                               each epid.
+ *
+ * @param[out] array_of_errors User-allocated array of at least @c num_of_epid
+ *                             elements. If the function does not return
+ *                             PSM2_OK, this array can be consulted for each
+ *                             endpoint not masked off by @c array_of_epid_mask
+ *                             to know why the endpoint could not be connected.
+ *                             Endpoints that could not be connected because of
+ *                             an unrelated failure will be marked as @ref
+ *                             PSM2_EPID_UNKNOWN.  If the function returns
+ *                             PSM2_OK, the errors for all endpoints will also
+ *                             contain PSM2_OK.
+ *
+ * @param[out] array_of_epaddr User-allocated array of at least @c num_of_epid
+ *                             elements of type psm2_epaddr_t.  Each
+ *                             successfully connected endpoint is updated with
+ *                             an endpoint address handle that corresponds to
+ *                             the endpoint id at the same index in @c
+ *                             array_of_epid.  Handles are only updated if the
+ *                             endpoint could be connected and if its error in
+ *                             array_of_errors is PSM2_OK.
+ *
+ * @param[in] timeout Timeout in nanoseconds after which connection attempts
+ *                    will be abandoned.  Setting this value to 0 disables
+ *                    timeout and waits until all endpoints have been
+ *                    successfully connected or until an error is detected.
+ *
+ * @pre The user has opened a local endpoint and obtained a list of endpoint
+ *      IDs to connect to a given endpoint handle using an out-of-band
+ *      mechanism not provided by PSM.
+ *
+ * @post If the connect is successful, @c array_of_epaddr is updated with valid
+ *       endpoint addresses.
+ *
+ * @post If unsuccessful, the user can query the return status of each
+ *       individual remote endpoint in @c array_of_errors.
+ *
+ * @post The user can call into @ref psm2_ep_connect many times with the same
+ *       endpoint ID and the function is guaranteed to return the same output
+ *       parameters.
+ *
+ * @post PSM2 does not keep any reference to the arrays passed into the
+ *       function and the caller is free to deallocate them.
+ *
+ * The error value with the highest importance is returned by
+ * the function if some portion of the communication failed.  Users should
+ * always refer to individual errors in @c array_of_errors whenever the
+ * function cannot return PSM2_OK.
+ *
+ * @returns PSM2_OK The entire set of endpoint IDs were successfully connected
+ *                  and endpoint addresses are available for all endpoint IDs.
+ *
+ * @code{.c}
+   int connect_endpoints(psm2_ep_t ep, int numep,
+                         const psm2_epid_t *array_of_epid,
+                         psm2_epaddr_t **array_of_epaddr_out)
+   {
+       psm2_error_t *errors = (psm2_error_t *) calloc(numep, sizeof(psm2_error_t));
+       if (errors == NULL)
+           return -1;
+
+       psm2_epaddr_t *all_epaddrs =
+           (psm2_epaddr_t *) calloc(numep, sizeof(psm2_epaddr_t));
+
+       if (all_epaddrs == NULL) {
+           free(errors);
+           return -1;
+       }
+
+       psm2_ep_connect(ep, numep, array_of_epid,
+                       NULL, // We want to connect all epids, no mask needed
+                       errors,
+                       all_epaddrs,
+                       30LL * 1000000000LL); // 30 second timeout, 0 waits forever
+       *array_of_epaddr_out = all_epaddrs;
+       free(errors);
+       return 1;
+   }
+   @endcode
+ */
+psm2_error_t
+psm2_ep_connect(psm2_ep_t ep, int num_of_epid, const psm2_epid_t *array_of_epid,
+		const int *array_of_epid_mask, psm2_error_t *array_of_errors,
+		psm2_epaddr_t *array_of_epaddr, int64_t timeout);
+
+/* @brief Disconnect one or more remote endpoints from a local endpoint.
+*
+* Function to non-collectively disconnect a connection to a set of endpoint
+* addresses and free the endpoint addresses. After disconnecting, the
+* application cannot send messages to the remote processes and PSM2 is
+* restored back to the state before calling psm2_ep_connect. The application
+* must call psm2_ep_connect to establish the connections again.
+*
+* This function is equivalent to calling psm2_ep_disconnect2() with mode ==
+* PSM2_EP_DISCONNECT_GRACEFUL.
+*
+* @param[in] ep PSM2 endpoint handle
+*
+* @param[in] num_of_epaddr The number of endpoint addresses to disconnect from,
+*                          which also indicates the number of elements contained
+*                          in all of the function's array-based parameters.
+*
+* @param[in] array_of_epaddr User-allocated array that contains num_of_epaddr
+*                            valid endpoint addresses. Each endpoint address (or
+*                            epaddr) has been obtained through a previous
+*                            psm2_ep_connect call.
+*
+* @param[in] array_of_epaddr_mask User-allocated array that contains
+*                                 num_of_epaddr integers. This array of masks
+*                                 allows users to select which of the
+*                                 epaddresses in array_of_epaddr should be
+*                                 disconnected. If the integer at index i is
+*                                 zero, PSM2 does not attempt to disconnect
+*                                 the epaddr at index i in array_of_epaddr. If
+*                                 this parameter is NULL, PSM2 tries to
+*                                 disconnect all epaddr in array_of_epaddr.
+*
+* @param[out] array_of_errors User-allocated array of at least num_of_epaddr
+*                             elements. If the function does not return PSM2_OK,
+*                             this array can be consulted for each endpoint
+*                             address not masked off by array_of_epaddr_mask to
+*                             know why the endpoint could not be disconnected.
+*                             Any endpoint address that could not be
+*                             disconnected because of an unrelated failure is
+*                             marked as PSM2_EPID_UNKNOWN.
If the function
+*                             returns PSM2_OK, the errors for all endpoint
+*                             addresses also contain PSM2_OK.
+*
+* @param[in] timeout Timeout in nanoseconds after which disconnection attempts
+*                    are abandoned. Setting this value to 0 disables timeout and
+*                    waits until all endpoints have been successfully
+*                    disconnected or until an error is detected.
+*
+* @pre You have established the connections with previous psm2_ep_connect calls.
+*
+* @post If the disconnect is successful, the corresponding epaddr in
+*       array_of_epaddr is reset to NULL pointer.
+*
+* @post If unsuccessful, you can query the return status of each individual
+*       remote endpoint in array_of_errors.
+*
+* @post PSM2 does not keep any reference to the arrays passed into the function
+*       and the caller is free to deallocate them.
+*
+* @post The error value with the highest importance is returned by the function
+*       if some portion of the communication failed. Refer to individual errors
+*       in array_of_errors whenever the function cannot return PSM2_OK.
+*
+* @returns PSM2_OK The entire set of endpoint IDs were successfully disconnected
+*                  and endpoint addresses are freed by PSM2.
+*
+* @code{.c}
+   int disconnect_endpoints(psm2_ep_t ep, int num_epaddr,
+                            psm2_epaddr_t *array_of_epaddr)
+   {
+       psm2_error_t *errors =
+           (psm2_error_t *)calloc(num_epaddr, sizeof(psm2_error_t));
+       if (errors == NULL)
+           return -1;
+       psm2_ep_disconnect(
+           ep, num_epaddr, array_of_epaddr,
+           NULL, // We want to disconnect all epaddrs, no mask needed
+           errors,
+           30LL * 1000000000LL); // 30 second timeout, 0 waits forever
+       free(errors);
+       return 1;
+   }
+   @endcode
+*/
+psm2_error_t psm2_ep_disconnect(psm2_ep_t ep, int num_of_epaddr,
+				psm2_epaddr_t *array_of_epaddr,
+				const int *array_of_epaddr_mask,
+				psm2_error_t *array_of_errors, int64_t timeout);
+
+/* @brief Disconnect one or more remote endpoints from a local endpoint.
+*
+* Function to non-collectively disconnect a connection to a set of endpoint
+* addresses and free the endpoint addresses. After disconnecting, the
+* application cannot send messages to the remote processes and PSM2 is
+* restored back to the state before calling psm2_ep_connect. The application
+* must call psm2_ep_connect to establish the connections again.
+*
+* @param[in] ep PSM2 endpoint handle
+*
+* @param[in] num_of_epaddr The number of endpoint addresses to disconnect from,
+*                          which also indicates the number of elements contained
+*                          in all of the function's array-based parameters.
+*
+* @param[in] array_of_epaddr User-allocated array that contains num_of_epaddr
+*                            valid endpoint addresses. Each endpoint address (or
+*                            epaddr) has been obtained through a previous
+*                            psm2_ep_connect call.
+*
+* @param[in] array_of_epaddr_mask User-allocated array that contains
+*                                 num_of_epaddr integers. This array of masks
+*                                 allows users to select which of the
+*                                 epaddresses in array_of_epaddr should be
+*                                 disconnected. If the integer at index i is
+*                                 zero, PSM2 does not attempt to disconnect
+*                                 the epaddr at index i in array_of_epaddr. If
+*                                 this parameter is NULL, PSM2 tries to
+*                                 disconnect all epaddr in array_of_epaddr.
+*
+* @param[out] array_of_errors User-allocated array of at least num_of_epaddr
+*                             elements. If the function does not return PSM2_OK,
+*                             this array can be consulted for each endpoint
+*                             address not masked off by array_of_epaddr_mask to
+*                             know why the endpoint could not be disconnected.
+*                             Any endpoint address that could not be
+*                             disconnected because of an unrelated failure is
+*                             marked as PSM2_EPID_UNKNOWN.
If the function
+*                             returns PSM2_OK, the errors for all endpoint
+*                             addresses also contain PSM2_OK.
+*
+* @param[in] mode One of @ref PSM2_EP_DISCONNECT_GRACEFUL or @ref PSM2_EP_DISCONNECT_FORCE
+*
+* @param[in] timeout Timeout in nanoseconds after which disconnection attempts
+*                    are abandoned. Setting this value to 0 disables timeout and
+*                    waits until all endpoints have been successfully
+*                    disconnected or until an error is detected. Supplying a
+*                    negative value here sets the disconnection mode to "force".
+*
+* @pre You have established the connections with previous psm2_ep_connect calls.
+*
+* @post If the disconnect is successful, the corresponding epaddr in
+*       array_of_epaddr is reset to NULL pointer.
+*
+* @post If unsuccessful, you can query the return status of each individual
+*       remote endpoint in array_of_errors.
+*
+* @post PSM2 does not keep any reference to the arrays passed into the function
+*       and the caller is free to deallocate them.
+*
+* @post The error value with the highest importance is returned by the function
+*       if some portion of the communication failed. Refer to individual errors
+*       in array_of_errors whenever the function cannot return PSM2_OK.
+*
+* @returns PSM2_OK The entire set of endpoint IDs were successfully disconnected
+*                  and endpoint addresses are freed by PSM2.
+*
+* @code{.c}
+   int disconnect_endpoints(psm2_ep_t ep, int num_epaddr,
+                            psm2_epaddr_t *array_of_epaddr)
+   {
+       psm2_error_t *errors =
+           (psm2_error_t *)calloc(num_epaddr, sizeof(psm2_error_t));
+       if (errors == NULL)
+           return -1;
+       psm2_ep_disconnect2(
+           ep, num_epaddr, array_of_epaddr,
+           NULL, // We want to disconnect all epaddrs, no mask needed
+           errors,
+           PSM2_EP_DISCONNECT_GRACEFUL,
+           30LL * 1000000000LL); // 30 second timeout, 0 waits forever
+       free(errors);
+       return 1;
+   }
+   @endcode
+*/
+psm2_error_t psm2_ep_disconnect2(psm2_ep_t ep, int num_of_epaddr,
+				 psm2_epaddr_t *array_of_epaddr,
+				 const int *array_of_epaddr_mask,
+				 psm2_error_t *array_of_errors,
+				 int mode, int64_t timeout);
+
+#define PSM2_EP_DISCONNECT_GRACEFUL	PSM2_EP_CLOSE_GRACEFUL /**< Graceful mode in @ref psm2_ep_disconnect2 */
+#define PSM2_EP_DISCONNECT_FORCE	PSM2_EP_CLOSE_FORCE /**< Forceful mode in @ref psm2_ep_disconnect2 */
+
+/** @brief Ensure endpoint communication progress
+ *
+ * Function to ensure progress for all PSM2 components instantiated on an
+ * endpoint (currently, this only includes the MQ component).  The function
+ * never blocks and is typically required in two cases:
+ *
+ * @li Allowing all PSM2 components instantiated over a given endpoint to make
+ *     communication progress. Refer to @ref mq_progress for a detailed
+ *     discussion on MQ-level progress issues.
+ *
+ * @li Cases where users write their own synchronization primitives that
+ *     depend on remote communication (such as spinning on a memory location
+ *     whose new value depends on ongoing communication).
+ *
+ * The poll function doesn't block, but the user can rely on the @ref
+ * PSM2_OK_NO_PROGRESS return value to control polling behaviour in terms of
+ * frequency (poll until an event happens) or execution environment (poll for a
+ * while but yield to other threads if CPUs are oversubscribed).
+ *
+ * @returns PSM2_OK Some communication events were progressed
+ * @returns PSM2_OK_NO_PROGRESS Polling did not yield any communication progress
+ *
+ */
+psm2_error_t psm2_poll(psm2_ep_t ep);
+
+/** @brief Set a user-determined ep address label.
+ *
+ * @param[in] epaddr Endpoint address, obtained from @ref psm2_ep_connect
+ * @param[in] epaddr_label_string User-allocated string to print when
+ *                   identifying endpoint in error handling or other verbose
+ *                   printing.  The NULL-terminated string must be allocated by
+ *                   the user since PSM2 only keeps a pointer to the label.  If
+ *                   users do not explicitly set a label for each endpoint,
+ *                   endpoints will identify themselves as hostname:port.
+ */
+void psm2_epaddr_setlabel(psm2_epaddr_t epaddr,
+			  const char *epaddr_label_string);
+
+/** @brief Set a user-determined ep address context.
+ *
+ * @param[in] epaddr Endpoint address, obtained from @ref psm2_ep_connect
+ * @param[in] ctxt   Opaque user defined state to associate with an endpoint
+ *                   address. This state can be retrieved via
+ *                   @ref psm2_epaddr_getctxt.
+ */
+void
+psm2_epaddr_setctxt(psm2_epaddr_t epaddr, void *ctxt);
+
+/** @brief Get the user-determined ep address context. Users can associate an
+ * opaque context with each endpoint via @ref psm2_epaddr_setctxt.
+ *
+ * @param[in] epaddr Endpoint address, obtained from @ref psm2_ep_connect.
+ */
+void *psm2_epaddr_getctxt(psm2_epaddr_t epaddr);
+
+/* Below are all component specific options. The component object for each of
+ * the options is also specified.
+ */
+
+/* PSM2_COMPONENT_CORE options */
+/* PSM2 debug level */
+#define PSM2_CORE_OPT_DEBUG	0x101
+  /**< [@b uint32_t ] Set/Get the PSM2 debug level. This option can be set
+   * before initializing the PSM2 library.
+   *
+   * component object: (null)
+   * option value: PSM2 Debug mask to set or currently active debug level.
+   */
+
+/* PSM2 endpoint address context */
+#define PSM2_CORE_OPT_EP_CTXT	0x102
+  /**< [@b uint32_t ] Set/Get the context associated with a PSM2 endpoint
+   * address (psm2_epaddr_t).
+   *
+   * component object: PSM2 endpoint (@ref psm2_epaddr_t) address.
+   * option value: Context associated with PSM2 endpoint address.
+   */
+
+/* PSM2_COMPONENT_IB options */
+/* Default service level to use to communicate with remote endpoints */
+#define PSM2_IB_OPT_DF_SL	0x201
+  /**< [@b uint32_t ] Default OPA SL to use for all remote communication.
+   * If unset defaults to Service Level 0.
+   *
+   * component object: Opened PSM2 endpoint id (@ref psm2_ep_t).
+   * option value: Default IB SL to use for endpoint. (0 <= SL < 15)
+   */
+
+/* Set IB service level to use for communication to an endpoint */
+#define PSM2_IB_OPT_EP_SL	0x202
+  /**< [@b uint32_t ] OPA SL to use for communication to specified
+   * remote endpoint.
+   *
+   * component object: PSM2 endpoint (@ref psm2_epaddr_t) address.
+   * option value: SL used to communicate with remote endpoint. (0 <= SL < 15)
+   */
+
+/* PSM2_COMPONENT_MQ options (deprecates psm2_mq_set|getopt) */
+/* MQ options that can be set in psm2_mq_init and psm2_{set,get}_opt */
+#define PSM2_MQ_OPT_RNDV_IB_SZ	0x301
+  /**< [@b uint32_t ] Size at which to start enabling rendezvous
+   * messaging for OPA messages (if unset, defaults to values
+   * between 56000 and 72000 depending on the system configuration)
+   *
+   * component object: PSM2 Matched Queue (@ref psm2_mq_t).
+   * option value: Size at which to switch to rendezvous protocol.
+   */
+#define PSM2_MQ_RNDV_HFI_SZ	PSM2_MQ_OPT_RNDV_IB_SZ
+#define PSM2_MQ_RNDV_IPATH_SZ	PSM2_MQ_OPT_RNDV_IB_SZ
+
+#define PSM2_MQ_OPT_RNDV_SHM_SZ	0x302
+#define PSM2_MQ_RNDV_SHM_SZ	PSM2_MQ_OPT_RNDV_SHM_SZ
+  /**< [@b uint32_t ] Size at which to start enabling
+   * rendezvous messaging for shared memory (intra-node) messages (If
+   * unset, defaults to 64000 bytes).
+ *
+   * component object: PSM2 Matched Queue (@ref psm2_mq_t).
+   * option value: Size at which to switch to rendezvous protocol.
+   */
+
+#define PSM2_MQ_OPT_SYSBUF_MYBYTES	0x303
+#define PSM2_MQ_MAX_SYSBUF_MBYTES	PSM2_MQ_OPT_SYSBUF_MYBYTES
+  /**< [@b uint32_t ] Maximum number of bytes to allocate for unexpected
+   * messages.
+   *
+   * component object: PSM2 Matched Queue (@ref psm2_mq_t).
+   * option value: Deprecated; this option has no effect.
+   */
+
+/* PSM2_COMPONENT_AM options */
+#define PSM2_AM_OPT_FRAG_SZ	0x401
+#define PSM2_AM_MAX_FRAG_SZ	PSM2_AM_OPT_FRAG_SZ
+/*!< [@b uint32_t ] Maximum active message fragment size that can be sent
+ * for a given endpoint or across all endpoints. This value can only be
+ * queried.
+ *
+ * component object: PSM2 endpoint (@ref psm2_epaddr_t) address. If NULL then
+ *                   option value is the smallest fragment size across all
+ *                   active endpoints.
+ * option value: Maximum active message fragment size in bytes.
+ */
+
+#define PSM2_AM_OPT_NARGS	0x402
+#define PSM2_AM_MAX_NARGS	PSM2_AM_OPT_NARGS
+/*!< [@b uint32_t ] Maximum number of message arguments that can be sent
+ * for a given endpoint or across all endpoints. This value can only be
+ * queried.
+ *
+ * component object: PSM2 endpoint (@ref psm2_epaddr_t) address. If NULL then
+ *                   option value is the smallest number of arguments across
+ *                   all active endpoints.
+ * option value: Maximum number of active message arguments.
+ */
+
+#define PSM2_AM_OPT_HANDLERS	0x403
+#define PSM2_AM_MAX_HANDLERS	PSM2_AM_OPT_HANDLERS
+/*!< [@b uint32_t ] Maximum number of message handlers that can be registered
+ * for a given endpoint or across all endpoints. This value can only be
+ * queried.
+ *
+ * component object: PSM2 endpoint (@ref psm2_epaddr_t) address. If NULL then
+ *                   option value is the smallest number of handlers across
+ *                   all active endpoints.
+ * option value: Maximum number of active message handlers.
+ */
+
+/** @brief Set an option for a PSM2 component
+ *
+ * Function to set the value of a PSM2 component option
+ *
+ * @param[in] component Type of PSM2 component for which to set the option
+ * @param[in] component_obj Opaque component-specific object to apply the set
+ *                          operation on. These are passed uninterpreted to the
+ *                          appropriate component for interpretation.
+ * @param[in] optname Name of component option to set. These are component
+ *                    specific and passed uninterpreted to the appropriate
+ *                    component for interpretation.
+ * @param[in] optval Pointer to storage that contains the value to be updated
+ *                   for the supplied option.  It is up to the user to
+ *                   ensure that the pointer points to a memory location with a
+ *                   correct size and format.
+ * @param[in] optlen Size of the memory region pointed to by optval.
+ *
+ * @returns PSM2_OK if option could be set.
+ * @returns PSM2_PARAM_ERR if the component or optname are not valid.
+ * @returns PSM2_OPT_READONLY if the option to be set is a read-only option.
+ *
+ */
+psm2_error_t
+psm2_setopt(psm2_component_t component, const void *component_obj,
+	    int optname, const void *optval, uint64_t optlen);
+
+/** @brief Get an option for a PSM2 component
+ *
+ * Function to get the value of a PSM2 component option
+ *
+ * @param[in] component Type of PSM2 component for which to get the option
+ * @param[in] component_obj Opaque component-specific object to apply the get
+ *                          operation on. These are passed uninterpreted to the
+ *                          appropriate component for interpretation.
+ * @param[in] optname Name of component option to get.
+/** @brief Get an option for a PSM2 component
+ *
+ * Function to get the value of a PSM2 component option
+ *
+ * @param[in] component Type of PSM2 component for which to get the option
+ * @param[in] component_obj Opaque component-specific object to apply the get
+ *                          operation on. These are passed uninterpreted to the
+ *                          appropriate component for interpretation.
+ * @param[in] optname Name of component option to get. These are component
+ *                    specific and passed uninterpreted to the appropriate
+ *                    component for interpretation.
+ * @param[out] optval Pointer to storage that contains the value to be updated
+ *                    for the supplied option. It is up to the user to
+ *                    ensure that the pointer points to a valid memory region.
+ * @param[in,out] optlen This is a value result parameter initially containing
+ *                       the size of the memory region pointed to by optval and
+ *                       modified to return the actual size of optval.
+ *
+ * @returns PSM2_OK if option value could be retrieved successfully.
+ * @returns PSM2_PARAM_ERR if the component or optname are not valid.
+ * @returns PSM2_NO_MEMORY if the memory region optval is of insufficient size.
+ *                         optlen contains the required memory region size for
+ *                         optname value.
+ *
+ */
+psm2_error_t
+psm2_getopt(psm2_component_t component, const void *component_obj,
+	    int optname, void *optval, uint64_t *optlen);
+
+/** @brief Datatype for end-point information */
+typedef struct psm2_epinfo {
+	psm2_ep_t ep;		/**< The ep for this end-point */
+	psm2_epid_t epid;	/**< The epid for this end-point */
+	psm2_uuid_t uuid;	/**< The UUID for this end-point */
+	uint16_t jkey;		/**< The job key for this end-point */
+	char uuid_str[64];	/**< String representation of the UUID for this end-point */
+} psm2_epinfo_t;
+
+/** @brief Datatype for end-point connection */
+typedef struct psm2_epconn {
+	psm2_epaddr_t addr;	/**< The epaddr for this connection */
+	psm2_ep_t ep;		/**< The ep for this connection */
+	psm2_mq_t mq;		/**< The mq for this connection */
+} psm2_epconn_t;
+
+/** @brief Query PSM2 for end-point information.
+ *
+ * Function to query PSM2 for end-point information. This allows retrieval of
+ * end-point information in cases where the caller does not have access to the
+ * results of psm2_ep_open(). In the default single-rail mode PSM2 will use
+ * a single endpoint. If either multi-rail mode or multi-endpoint mode is
+ * enabled, PSM2 will use multiple endpoints.
+ *
+ * @param[in,out] num_of_epinfo On input, sizes the available number of entries
+ *                              in array_of_epinfo. On output, specifies the
+ *                              returned number of entries in array_of_epinfo.
+ * @param[out] array_of_epinfo Returns end-point information structures.
+ *
+ * @pre PSM2 is initialized and the end-point has been opened.
+ *
+ * @returns PSM2_OK indicates success.
+ * @returns PSM2_PARAM_ERR if input num_of_epinfo is less than or equal to zero.
+ * @returns PSM2_EP_WAS_CLOSED if PSM2 end-point is closed or does not exist.
+ */
+psm2_error_t psm2_ep_query(int *num_of_epinfo, psm2_epinfo_t *array_of_epinfo);
+
+/** @brief Query PSM2 for end-point connections.
+ *
+ * Function to query PSM2 for end-point connections. This allows retrieval of
+ * end-point connections in cases where the caller does not have access to the
+ * results of psm2_ep_connect(). The epid values can be found using
+ * psm2_ep_query() so that each PSM2 process can determine its own epid. These
+ * values can then be distributed across the PSM2 processes so that each PSM2
+ * process knows the epid of all other PSM2 processes.
+ *
+ * @param[in] epid The epid of a PSM2 process.
+ * @param[out] epconn The connection information for that PSM2 process.
+ *
+ * @pre PSM2 is initialized and the end-point has been connected to this epid.
+ *
+ * @returns PSM2_OK indicates success.
+ * @returns PSM2_EP_WAS_CLOSED if PSM2 end-point is closed or does not exist.
+ * @returns PSM2_EPID_UNKNOWN if the epid value is not known to PSM2.
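+ *
+ * For example, resolving a peer's connection state from its epid (a minimal
+ * sketch; remote_epid is a hypothetical value distributed out of band, e.g.
+ * by the job launcher):
+ * @code{.c}
+ * psm2_epconn_t epconn;
+ * if (psm2_ep_epid_lookup(remote_epid, &epconn) == PSM2_OK) {
+ *     psm2_epaddr_t peer = epconn.addr;  // usable for sends to that process
+ * }
+ * @endcode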
+ */
+psm2_error_t psm2_ep_epid_lookup(psm2_epid_t epid, psm2_epconn_t *epconn);
+
+/** @brief Query given PSM2 end-point for its connections.
+ *
+ * The need for this function comes with the 'multi-ep' feature.
+ * The function is similar to @ref psm2_ep_epid_lookup, but it takes an extra
+ * parameter identifying the end-point [ep], which limits the lookup to that
+ * single ep.
+ *
+ * @returns PSM2_OK indicates success.
+ * @returns PSM2_EP_WAS_CLOSED if PSM2 end-point [ep] is closed or does not exist.
+ * @returns PSM2_EPID_UNKNOWN if the [epid] value is not known to PSM2.
+ * @returns PSM2_PARAM_ERR if output [epconn] is NULL.
+ */
+psm2_error_t psm2_ep_epid_lookup2(psm2_ep_t ep, psm2_epid_t epid, psm2_epconn_t *epconn);
+
+/** @brief Get PSM2 epid for given epaddr.
+ *
+ * @param[in] epaddr The endpoint address.
+ * @param[out] epid The epid of a PSM2 process.
+ *
+ * @returns PSM2_OK indicates success.
+ * @returns PSM2_PARAM_ERR if input [epaddr] or output [epid] is NULL.
+ */
+psm2_error_t psm2_epaddr_to_epid(psm2_epaddr_t epaddr, psm2_epid_t *epid);
+
+/*! @} */
+
+/*! @addtogroup init PSM2 Information Query
+ * @{
+ */
+
+/** @brief Enumeration for info query APIs
+ *
+ * Note that the function:
+ *
+ @code{.c}
+ psm2_error_t psm2_info_query(psm2_info_query_t, void *out,
+                              size_t nargs, psm2_info_query_arg_t []);
+ @endcode
+ *
+ * takes a variable number of input arguments, determined by the initial
+ * psm2_info_query_t value.
+ *
+ * Below, there is an explanation of the number, type and order of the
+ * required input arguments, as well as a definition of the type of the output.
+ */
+typedef enum psm2_info_query_et
+{
+/*! Required input arguments: 0
+    Output parameter: uint32_t*, description: the number of units */
+	PSM2_INFO_QUERY_NUM_UNITS,
+
+/*! Required input arguments: 0
+    Output parameter: uint32_t*, description: the number of ports */
+	PSM2_INFO_QUERY_NUM_PORTS,
+
+/*! Required input arguments: 1
+    1. type: uint32_t, description: the unit for which status is
+       desired (use: psm2_info_query_arg_t.unit).
+    Output parameter: uint32_t, description: zero, when the unit
+                      is not active, non-zero when the unit is
+                      active. */
+	PSM2_INFO_QUERY_UNIT_STATUS,
+
+/*! Required input arguments: 2
+    1. type: uint32_t, description: the unit for which status is
+       desired (use: psm2_info_query_arg_t.unit).
+    2. type: uint32_t, description: the port for which status is
+       desired (use: psm2_info_query_arg_t.port).
+    Output parameter: uint32_t, description: zero, when the unit
+                      is not active, non-zero when the unit is
+                      active. */
+	PSM2_INFO_QUERY_UNIT_PORT_STATUS,
+
+/*! Required input arguments: 1
+    1. type: uint32_t, description: the unit for which the number of
+       free contexts is desired (use: psm2_info_query_arg_t.unit).
+    Output parameter: uint32_t, description: the number of free
+                      contexts. */
+	PSM2_INFO_QUERY_NUM_FREE_CONTEXTS,
+
+/*! Required input arguments: 1
+    1. type: uint32_t, description: the unit for which the number of
+       contexts is desired (use: psm2_info_query_arg_t.unit).
+    Output parameter: uint32_t, description: the number of
+                      contexts. */
+	PSM2_INFO_QUERY_NUM_CONTEXTS,
+
+/*! Required input arguments: 2
+    1. type: psm2_mq_t, description: the mq that is associated with the
+       connection for which configuration information is wanted.
+       (use: psm2_info_query_arg_t.mq).
+    2. type: psm2_epaddr_t, description: the ep address that is
+       associated with the connection for which configuration
+       information is wanted (use: psm2_info_query_arg_t.epaddr).
+    Output parameter: uint32_t, description: a bit mask containing bits
+                      defining the configuration. See psm2_info_query_config
+                      for a description of the bits. */
+	PSM2_INFO_QUERY_CONFIG,
+
+/*! Required input arguments: 3
+    1. type: psm2_mq_t, description: the mq that is associated with the
+       connection for which the msg size query information is wanted.
+       (use: psm2_info_query_arg_t.mq).
+    2. type: psm2_epaddr_t, description: the ep address that is
+       associated with the connection for which the msg size query
+       information is wanted (use: psm2_info_query_arg_t.epaddr).
+    3. type: enum psm2_info_query_thresh_et, the specific msg size query.
+       (use: psm2_info_query_arg_t.mstq).
+
+    Output parameter: uint32_t, description: the message size threshold. */
+	PSM2_INFO_QUERY_THRESH,
+
+/*! Required input arguments: 3
+    1. type: psm2_mq_t, description: the mq that is associated with the
+       connection for which the device name is wanted.
+       (use: psm2_info_query_arg_t.mq).
+    2. type: psm2_epaddr_t, description: the ep address that is
+       associated with the connection for which the device name is wanted.
+       (use: psm2_info_query_arg_t.epaddr).
+    3. type: size_t, the length of the output buffer that will receive
+       the device name (use: psm2_info_query_arg_t.length).
+    Output parameter: char *, description: the device name. */
+	PSM2_INFO_QUERY_DEVICE_NAME,
+
+/*! Required input arguments: 2
+    1. type: psm2_mq_t, description: the mq that is associated with the
+       connection for which the mtu is wanted (use: psm2_info_query_arg_t.mq).
+    2. type: psm2_epaddr_t, description: the ep address that is
+       associated with the connection for which the mtu is wanted.
+       (use: psm2_info_query_arg_t.epaddr).
+    Output parameter: uint32_t, description: the mtu. */
+	PSM2_INFO_QUERY_MTU,
+
+/*! Required input arguments: 2
+    1. type: psm2_mq_t, description: the mq that is associated with the
+       connection for which the link speed is wanted (use:
+       psm2_info_query_arg_t.mq).
+    2. type: psm2_epaddr_t, description: the ep address that is
+       associated with the connection for which the link speed is wanted.
+       (use: psm2_info_query_arg_t.epaddr).
+    Output parameter: uint32_t, description: the link speed. */
+	PSM2_INFO_QUERY_LINK_SPEED,
+
+/*! Required input arguments: 1
+    1. type: size_t, description: the length of the output buffer to receive
+       the network type (use: psm2_info_query_arg_t.length).
+    Output parameter: char*, description: the network type. */
+	PSM2_INFO_QUERY_NETWORK_TYPE,
+	PSM2_INFO_QUERY_LAST,	/* must appear last, and the info query
+				   constants are used as an index. */
+} psm2_info_query_t;
+
+/** @brief Enumeration for info query config
+ */
+enum psm2_info_query_config
+{
+	/*! The following three are 'main configs': */
+	PSM2_INFO_QUERY_CONFIG_IPS = (1 << 0),
+	PSM2_INFO_QUERY_CONFIG_AMSH = (1 << 1),
+	PSM2_INFO_QUERY_CONFIG_SELF = (1 << 2),
+
+	/*! The following three are sub-configs of
+	    the IPS main config: */
+
+	PSM2_INFO_QUERY_CONFIG_CUDA = (1 << 3),
+	PSM2_INFO_QUERY_CONFIG_PIO = (1 << 4),
+	PSM2_INFO_QUERY_CONFIG_DMA = (1 << 5),
+
+	/*! The following is a sub-config of the IPS & CUDA
+	    main configs: */
+
+	PSM2_INFO_QUERY_CONFIG_GDR_COPY = (1 << 6),
+};
+
+/** @brief Enumeration for info query thresholds
+ */
+enum psm2_info_query_thresh_et
+{
+/*! This is the start of the thresh queries for the IPS config: */
+	PSM2_INFO_QUERY_THRESH_IPS_START,
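+
+/*! For example, reading the rendezvous threshold of a connection combines the
+    PSM2_INFO_QUERY_THRESH query with one of the constants below (a minimal
+    sketch; the mq and epaddr values are assumed to come from earlier
+    psm2_mq_init() and psm2_ep_connect() calls):
+
+    @code{.c}
+    uint32_t thresh;
+    psm2_info_query_arg_t args[3];
+    args[0].mq = mq;
+    args[1].epaddr = epaddr;
+    args[2].mstq = PSM2_INFO_QUERY_THRESH_IPS_RNDV;
+    psm2_error_t rc = psm2_info_query(PSM2_INFO_QUERY_THRESH, &thresh, 3, args);
+    @endcode */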
+
+/*! Not shown here are the specific queries supported by the CUDA
+    and GDR_COPY sub-configs. Those configs will need to include
+    threshold queries in case the config includes them.
+
+    Note that for gdr_copy the threshold varies depending on whether
+    or not the memory is GPU memory. */
+
+/*! The following threshold queries are supported for the IPS config
+    only. */
+
+/*! The PSM2_INFO_QUERY_THRESH_IPS_PIO_DMA threshold query indicates at
+    what message size the send transport transitions from PIO to DMA.
+
+    Note that this threshold query may be meaningless if PIO or DMA is
+    disabled. */
+	PSM2_INFO_QUERY_THRESH_IPS_PIO_DMA = PSM2_INFO_QUERY_THRESH_IPS_START,
+/*! Messages with message sizes less than or equal to the tiny threshold
+    will be sent by tiny message. */
+	PSM2_INFO_QUERY_THRESH_IPS_TINY,
+/*! Messages with message sizes greater than tiny, but less than or equal
+    to frag size will be sent by short message. */
+	PSM2_INFO_QUERY_THRESH_IPS_PIO_FRAG_SIZE,
+	PSM2_INFO_QUERY_THRESH_IPS_DMA_FRAG_SIZE,
+/*! Messages that are greater than the frag_size, but less than RNDV will
+    be sent by eager message.
+    Messages with message sizes greater than or equal to RNDV will be
+    sent by the rendezvous protocol message. */
+	PSM2_INFO_QUERY_THRESH_IPS_RNDV,
+	PSM2_INFO_QUERY_THRESH_IPS_END = PSM2_INFO_QUERY_THRESH_IPS_RNDV,
+
+/*! Not shown here are the specific thresh queries supported by the AMSH and
+    SELF configs: */
+	PSM2_INFO_QUERY_THRESH_AMSH_START,
+	PSM2_INFO_QUERY_THRESH_AMSH_END = PSM2_INFO_QUERY_THRESH_AMSH_START,
+
+	PSM2_INFO_QUERY_THRESH_SELF_START,
+	PSM2_INFO_QUERY_THRESH_SELF_END = PSM2_INFO_QUERY_THRESH_SELF_START,
+};
+
+/** @brief Union for info query arg type
+ */
+typedef union psm2_info_query_arg
+{
+	uint32_t unit;
+	uint32_t port;
+	size_t length;
+	psm2_mq_t mq;
+	psm2_epaddr_t epaddr;
+	enum psm2_info_query_thresh_et mstq;
+} psm2_info_query_arg_t;
+
+/** @brief PSM2 info query
+ *
+ * Function that allows a client to interrogate PSM2 for various information.
+ *
+ * @param[in] psm2_info_query_t What information is requested.
+ * @param[out] void * out, where the information will be delivered on a
+ *             PSM2_OK return.
+ * @param[in] size_t nargs, the number of following arguments.
+ * @param[in] psm2_info_query_arg_t [], The arguments that are required for
+ *                                      certain queries. See documentation
+ *                                      at @ref psm2_info_query_t for what
+ *                                      arguments are required for what
+ *                                      queries, as well as what type the
+ *                                      output is expected to be.
+ *
+ * @retval PSM2_OK The out buffer has successfully been written with the
+ * result of the query.
+ */
+psm2_error_t psm2_info_query(psm2_info_query_t, void *out,
+			     size_t nargs, psm2_info_query_arg_t []);
+
+/*! @} */
+
+#ifdef __cplusplus
+}				/* extern "C" */
+#endif
+#endif
diff --git a/psm2_am.h b/psm2_am.h
new file mode 100644
index 0000000..3f9942a
--- /dev/null
+++ b/psm2_am.h
@@ -0,0 +1,476 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license. When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef PSM2_AM_H
+#define PSM2_AM_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <psm2.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ * @file psm2_am.h
+ * @brief PSM2 Active Message.
+ *
+ * @page psm2_am Active Message Interface
+ *
+ * PSM2 implements an Active Message (AM) component that lives alongside the
+ * Matched Queues (MQ) component. The active message interface essentially
+ * provides a remote procedure call mechanism. A PSM2 process can generate a
+ * request to run an active message handler on a remote PSM2 process
+ * identified by its end-point address (epaddr). End-point address values
+ * are returned by PSM2 when connecting end-points using the psm2_ep_connect()
+ * function.
+ *
+ * An AM handler may make local state updates, and may generate at most
+ * one reply to be returned to the original requestor. This reply will cause
+ * a handler to be run on that requestor. The requestor handler may make
+ * local state updates but is not allowed to reply nor request in that handler
+ * context. A request or reply can convey a small number of in-line arguments
+ * plus a short amount of data. A tight bound is placed on the number of
+ * in-line arguments to allow them to be packed into a header. A bound is
+ * placed on the size of the data payload so that the request or reply can
+ * be sent as a single packet within the MTU of the underlying communication
+ * transport. Longer payloads must be synthesized on top of the provided
+ * short request/reply mechanism by fragmentation and reassembly, or
+ * transported by some other means.
+ *
+ * Handlers are run in the process context of the targeted PSM2 process,
+ * either in its main thread of execution or in a progress thread. A handler
+ * may therefore be executed concurrently with the main thread of execution
+ * of the PSM2 process.
+ * PSM2 ensures that its own state is protected against this
+ * concurrent execution. However, a handler must make its own arrangements to
+ * protect its own state. Alternatively, the PSM2 progress thread can be
+ * disabled using the PSM2_RCVTHREAD environment variable if this is too
+ * onerous for the handler.
+ *
+ * PSM2 has an active progress model and requires that the PSM2 library is
+ * called in order to make progress. This can be achieved using the psm2_poll()
+ * function. A PSM2 implementation may provide passive progress through some
+ * other mechanism (e.g. a receive thread), but a PSM2 consumer must not assume
+ * this and must arrange to make active progress through calls into the PSM2
+ * library. Note that the PSM2 AM interface is not MT-safe, the same as the
+ * other PSM2 interfaces, and that MT-safety must be provided by the consumer
+ * if required.
+ *
+ * The order in which AM requests are issued by an initiator to a particular
+ * target defines the order in which those AM requests will be executed on
+ * that target. Therefore the AM implementation will maintain the order
+ * of handler executions on a flow, and this also applies when progress
+ * threads are used. For multiple initiators issuing requests to a particular
+ * target, the handler executions will be interleaved in some sequentially
+ * consistent ordering.
+ */
+
+/*! @defgroup am PSM2 Active Message
+ *
+ * @{
+ */
+
+/** @brief Datatype for an index representing an active message handler */
+typedef uint32_t psm2_handler_t;
+
+/** @brief Datatype for a token for an active message handler.*/
+typedef void *psm2_am_token_t;
+
+/* PSM2 AM flags
+ * These flags may be combined using bitwise-or.
+ */
+#define PSM2_AM_FLAG_NONE    0	/**< No other PSM2 AM flags are needed. */
+#define PSM2_AM_FLAG_ASYNC   1	/**< No need to copy source data. */
+#define PSM2_AM_FLAG_NOREPLY 2	/**< The handler for this AM request is
+				     guaranteed not to generate a reply. */
+
+/** @brief The psm2_amarg type represents the type of an AM argument. This is
+ *  a 64-bit type and is broken down into four 16-bit fields, two 32-bit
+ *  fields or one 64-bit field for the convenience of code using the PSM2 AM
+ *  interface.
+ */
+typedef
+struct psm2_amarg {
+	union {
+		struct {
+			uint16_t u16w3;
+			uint16_t u16w2;
+			uint16_t u16w1;
+			uint16_t u16w0;
+		};
+		struct {
+			uint32_t u32w1;
+			uint32_t u32w0;
+		};
+		uint64_t u64w0;
+		uint64_t u64;
+	};
+} psm2_amarg_t;
+
+/** @brief The AM handler function type
+ *
+ * psm2_am_handler_fn_t is the datatype for an AM handler. PSM2 AM will call-back
+ * into an AM handler using this function prototype. The parameters and result
+ * of these handler functions are described here.
+ *
+ * @param[in] token This is an opaque token value passed into a handler.
+ *                  A request handler may send at most one reply back to the
+ *                  original requestor, and must pass this value as the token
+ *                  parameter to the psm2_am_reply_short() function. A reply
+ *                  handler is also passed a token value, but must not attempt
+ *                  to reply.
+ * @param[in] args A pointer to the arguments provided to this handler.
+ * @param[in] nargs The number of arguments.
+ * @param[in] src A pointer to the data payload provided to this handler.
+ * @param[in] len The length of the data payload in bytes.
+ *
+ * @returns 0 The handler should always return a result of 0.
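+ *
+ * A request handler matching this type might look as follows (a minimal
+ * sketch; my_handler, ep and the registration call are illustrative only,
+ * and error handling is omitted):
+ * @code{.c}
+ * int my_handler(psm2_am_token_t token, psm2_amarg_t *args, int nargs,
+ *                void *src, uint32_t len)
+ * {
+ *     // consume args/src here; optionally psm2_am_reply_short(token, ...)
+ *     return 0;
+ * }
+ *
+ * psm2_am_handler_fn_t handlers[1] = { my_handler };
+ * int handler_idx[1];
+ * psm2_am_register_handlers(ep, handlers, 1, handler_idx);
+ * @endcode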
+ */
+typedef
+int (*psm2_am_handler_fn_t) (psm2_am_token_t token,
+			     psm2_amarg_t *args, int nargs,
+			     void *src, uint32_t len);
+
+/** @brief The AM handler function type with caller context
+ *
+ * psm2_am_handler_2_fn_t is the datatype for an AM handler that
+ * includes a user context. PSM2 AM will call-back into an AM handler using
+ * this function prototype. The parameters and result
+ * of these handler functions are described here.
+ *
+ * @param[in] token This is an opaque token value passed into a handler.
+ *                  A request handler may send at most one reply back to the
+ *                  original requestor, and must pass this value as the token
+ *                  parameter to the psm2_am_reply_short() function. A reply
+ *                  handler is also passed a token value, but must not attempt
+ *                  to reply.
+ * @param[in] args A pointer to the arguments provided to this handler.
+ * @param[in] nargs The number of arguments.
+ * @param[in] src A pointer to the data payload provided to this handler.
+ * @param[in] len The length of the data payload in bytes.
+ * @param[in] hctx The user context pointer provided at handler registration.
+ *
+ * @returns 0 The handler should always return a result of 0.
+ */
+typedef
+int (*psm2_am_handler_2_fn_t) (psm2_am_token_t token,
+			       psm2_amarg_t *args, int nargs,
+			       void *src, uint32_t len, void *hctx);
+
+/** @brief Type for a completion call-back handler.
+ *
+ * A completion handler can be specified to give a call-back on the initiation
+ * side that an AM request or reply has completed on the target side. The
+ * call-back has a context pointer which is provided along with the call-back
+ * function pointer when the initiator generates the request or reply. This
+ * approach will typically give higher performance than using an AM request or
+ * reply to achieve the same effect, though note that no additional information
+ * can be passed from the target side back to the initiator side with the
+ * completion handler approach.
+ *
+ * @param[in] context A context pointer.
+ * @returns void This handler has no return result.
+ */
+typedef
+void (*psm2_am_completion_fn_t) (void *context);
+
+/** @brief Register AM call-back handlers at the specified end-point.
+ *
+ * This function is used to register an array of handlers, and may be called
+ * multiple times to register additional handlers. The maximum number of
+ * handlers that can be registered is limited to the max_handlers value
+ * returned by psm2_am_get_parameters(). Handlers are associated with a PSM2
+ * end-point. The handlers are allocated index numbers in the handler table
+ * for that end-point. The allocated index for the handler function in
+ * handlers[i] is returned in handlers_idx[i] for 0 <= i < num_handlers. These
+ * handler index values are used in the psm2_am_request_short() and
+ * psm2_am_reply_short() functions.
+ *
+ * @param[in] ep End-point value
+ * @param[in] handlers Array of handler functions
+ * @param[in] num_handlers Number of handlers (sizes the handlers and
+ *                         handlers_idx arrays)
+ * @param[out] handlers_idx Used to return handler index mapping table
+ *
+ * @returns PSM2_OK Indicates success
+ * @returns PSM2_EP_NO_RESOURCES Insufficient slots in the AM handler table
+ */
+psm2_error_t psm2_am_register_handlers(psm2_ep_t ep,
+				       const psm2_am_handler_fn_t *
+				       handlers, int num_handlers,
+				       int *handlers_idx);
+
+/** @brief Register AM call-back handlers at the specified end-point.
+ *
+ * This function is used to register an array of handlers, and may be called
+ * multiple times to register additional handlers. The maximum number of
+ * handlers that can be registered is limited to the max_handlers value
+ * returned by psm2_am_get_parameters(). Handlers are associated with a PSM2
+ * end-point. The handlers are allocated index numbers in the handler table
+ * for that end-point. The allocated index for the handler function in
+ * handlers[i] is returned in handlers_idx[i] for 0 <= i < num_handlers. These
+ * handler index values are used in the psm2_am_request_short() and
+ * psm2_am_reply_short() functions.
+ *
+ * @param[in] ep End-point value
+ * @param[in] handlers Array of handler functions
+ * @param[in] num_handlers Number of handlers (sizes the handlers and
+ *                         handlers_idx arrays)
+ * @param[in] hctx Array of void* pointers to user contexts for identifying
+ *                 the target ep that registered these handlers.
+ * @param[out] handlers_idx Used to return handler index mapping table
+ *
+ * @returns PSM2_OK Indicates success
+ * @returns PSM2_EP_NO_RESOURCES Insufficient slots in the AM handler table
+ */
+psm2_error_t psm2_am_register_handlers_2(psm2_ep_t ep,
+					 const psm2_am_handler_2_fn_t *
+					 handlers, int num_handlers,
+					 void **hctx,
+					 int *handlers_idx);
+
+/** @brief Unregister all AM call-back handlers for the specified end-point.
+ *
+ * This function is used to unregister all AM handlers registered to the
+ * specified end-point.
+ *
+ * @param[in] ep End-point value
+ *
+ */
+void psm2_am_unregister_handlers(psm2_ep_t ep);
+
+/** @brief Generate an AM request.
+ *
+ * This function generates an AM request causing an AM handler function to be
+ * called in the PSM2 process associated with the specified end-point address.
+ * The number of arguments is limited to max_nargs and the payload length in
+ * bytes to max_request_short returned by the psm2_am_get_parameters() function.
+ * If arguments are not required, set the number of arguments to 0 and the
+ * argument pointer will not be dereferenced. If payload is not required, set
+ * the payload size to 0 and the payload pointer will not be dereferenced.
+ *
+ * Optionally a completion function and completion context pointer can be
+ * provided, and a local call-back will be made to that function passing in
+ * that context pointer once remote execution of the handler has completed. If
+ * the completion call-back is not required, the handler should be specified as
+ * NULL and the pointer value will not be used.
+ *
+ * The allowed flags are any combination of the following combined with
+ * bitwise-or:
+ *   PSM2_AM_FLAG_NONE    - No flags
+ *   PSM2_AM_FLAG_ASYNC   - Indicates no need to copy source data
+ *   PSM2_AM_FLAG_NOREPLY - The handler for this AM request is guaranteed not
+ *                          to generate a reply
+ *
+ * The PSM2 AM implementation will not dereference the args pointer after return
+ * from this function. If PSM2_AM_FLAG_ASYNC is not provided, the PSM2 AM
+ * implementation will not dereference the src pointer after return from this
+ * function. This may require the implementation to take a copy of the payload
+ * if the request cannot be issued immediately. However, if PSM2_AM_FLAG_ASYNC
+ * is provided then a copy will not be taken and the PSM2 AM implementation
+ * retains ownership of the payload src memory until the request is locally
+ * complete. Local completion can be determined using the completion handler
+ * call-back, or through an AM handler associated with an AM reply.
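+ *
+ * For example, sending one in-line argument and a small payload to a peer (a
+ * minimal sketch; epaddr, handler_idx, payload and payload_len are assumed to
+ * exist already, and error handling is omitted):
+ * @code{.c}
+ * psm2_amarg_t arg;
+ * arg.u64w0 = 42;
+ * psm2_am_request_short(epaddr, (psm2_handler_t)handler_idx, &arg, 1,
+ *                       payload, payload_len, PSM2_AM_FLAG_NONE,
+ *                       NULL, NULL);
+ * @endcode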
+ *
+ * The PSM2_AM_FLAG_NOREPLY flag indicates ahead of time to the AM handler that
+ * a reply will not be generated. Use of this flag is optional, but it may
+ * enable a performance optimization by indicating that reply state is not
+ * required.
+ *
+ * @param[in] epaddr End-point address to run handler on
+ * @param[in] handler Index of handler to run
+ * @param[in] args Array of arguments to be provided to the handler
+ * @param[in] nargs Number of arguments to be provided to the handler
+ * @param[in] src Pointer to the payload to be delivered to the handler
+ * @param[in] len Length of the payload in bytes
+ * @param[in] flags These are PSM2 AM flags and may be combined together with
+ *                  bitwise-or
+ * @param[in] completion_fn The completion function to be called locally when
+ *                          the remote handler is complete
+ * @param[in] completion_ctxt User-provided context pointer to be passed to the
+ *                            completion handler
+ *
+ * @returns PSM2_OK indicates success.
+ */
+psm2_error_t
+psm2_am_request_short(psm2_epaddr_t epaddr, psm2_handler_t handler,
+		      psm2_amarg_t *args, int nargs, void *src,
+		      size_t len, int flags,
+		      psm2_am_completion_fn_t completion_fn,
+		      void *completion_ctxt);
+
+/** @brief Generate an AM reply.
+ *
+ * This function may only be called from an AM handler called due to an AM
+ * request. If the AM request uses the PSM2_AM_FLAG_NOREPLY flag, the AM
+ * handler must not call this function. Otherwise, the AM request handler may
+ * call psm2_am_reply_short() at most once, and must pass in the token value
+ * that it received in its own handler call-back.
+ *
+ * This function generates an AM reply causing an AM handler function to be
+ * called in the PSM2 process associated with the specified end-point address.
+ * The number of arguments is limited to max_nargs and the payload length in
+ * bytes to max_reply_short returned by the psm2_am_get_parameters() function.
+ * If arguments are not required, set the number of arguments to 0 and the
+ * argument pointer will not be dereferenced. If payload is not required, set
+ * the payload size to 0 and the payload pointer will not be dereferenced.
+ *
+ * Optionally a completion function and completion context pointer can be
+ * provided, and a local call-back will be made to that function passing in
+ * that context pointer once remote execution of the handler has completed. If
+ * the completion call-back is not required, the handler should be specified as
+ * NULL and the pointer value will not be used.
+ *
+ * The allowed flags are any combination of the following combined with
+ * bitwise-or:
+ *   PSM2_AM_FLAG_NONE  - No flags
+ *   PSM2_AM_FLAG_ASYNC - Indicates no need to copy source data
+ *
+ * The PSM2 AM implementation will not dereference the args pointer after return
+ * from this function. If PSM2_AM_FLAG_ASYNC is not provided, the PSM2 AM
+ * implementation will not dereference the src pointer after return from this
+ * function. This may require the implementation to take a copy of the payload
+ * if the reply cannot be issued immediately. However, if PSM2_AM_FLAG_ASYNC is
+ * provided then a copy will not be taken and the PSM2 AM implementation retains
+ * ownership of the payload src memory until the reply is locally complete.
+ * Local completion can be determined using the completion handler call-back.
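+ *
+ * For example, from inside a request handler (a minimal sketch;
+ * reply_handler_idx is a hypothetical index obtained from
+ * psm2_am_register_handlers() on the requestor side):
+ * @code{.c}
+ * psm2_amarg_t result;
+ * result.u64w0 = 0;
+ * psm2_am_reply_short(token, (psm2_handler_t)reply_handler_idx, &result, 1,
+ *                     NULL, 0, PSM2_AM_FLAG_NONE, NULL, NULL);
+ * @endcode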
+ *
+ * @param[in] token Token value provided to the AM handler that is generating
+ *                  the reply.
+ * @param[in] handler Index of handler to run
+ * @param[in] args Array of arguments to be provided to the handler
+ * @param[in] nargs Number of arguments to be provided to the handler
+ * @param[in] src Pointer to the payload to be delivered to the handler
+ * @param[in] len Length of the payload in bytes
+ * @param[in] flags These are PSM2 AM flags and may be combined together with
+ *                  bitwise-or
+ * @param[in] completion_fn The completion function to be called locally when
+ *                          the remote handler is complete
+ * @param[in] completion_ctxt User-provided context pointer to be passed to the
+ *                            completion handler
+ *
+ * @returns PSM2_OK indicates success.
+ */
+psm2_error_t
+psm2_am_reply_short(psm2_am_token_t token, psm2_handler_t handler,
+		    psm2_amarg_t *args, int nargs, void *src,
+		    size_t len, int flags,
+		    psm2_am_completion_fn_t completion_fn,
+		    void *completion_ctxt);
+
+/** @brief Return the source end-point address for a token.
+ *
+ * This function is used to obtain the epaddr object representing the message
+ * initiator from a token passed by PSM2 to a message handler.
+ *
+ * @param[in] token Token value provided to the AM handler.
+ * @param[out] epaddr_out Pointer to where the epaddr should be returned.
+ *
+ * @returns PSM2_OK indicates success.
+ * @returns PSM2_PARAM_ERR token is invalid or epaddr_out is NULL.
+ */
+psm2_error_t psm2_am_get_source(psm2_am_token_t token,
+				psm2_epaddr_t *epaddr_out);
+
+/** @brief AM parameters
+ *
+ * This structure is used to return PSM2 AM implementation-specific parameter
+ * values back to the caller of the psm2_am_get_parameters() function. The
+ * API also specifies the minimum values that an implementation must provide
+ * for these parameters:
+ *   max_handlers >= 64,
+ *   max_nargs >= 2,
+ *   max_request_short >= 256 and
+ *   max_reply_short >= 256.
+ */
+struct psm2_am_parameters {
+	/** Maximum number of handlers that can be registered. */
+	uint32_t max_handlers;
+	/** Maximum number of arguments to an AM handler. */
+	uint32_t max_nargs;
+	/** Maximum number of bytes in a request payload. */
+	uint32_t max_request_short;
+	/** Maximum number of bytes in a reply payload. */
+	uint32_t max_reply_short;
+};
+
+/** @brief Get the AM parameter values
+ *
+ * This function retrieves the implementation-specific AM parameter values for
+ * the specified end-point.
+ *
+ * @param[in] ep The end-point value returned by psm2_ep_open().
+ * @param[out] parameters Pointer to the struct where the parameters will be
+ *                        returned.
+ * @param[in] sizeof_parameters_in The size in bytes of the struct provided by
+ *                                 the caller.
+ * @param[out] sizeof_parameters_out The size in bytes of the struct returned
+ *                                   by PSM2.
+ *
+ * @returns PSM2_OK indicates success.
+ */
+psm2_error_t
+psm2_am_get_parameters(psm2_ep_t ep,
+		       struct psm2_am_parameters *parameters,
+		       size_t sizeof_parameters_in,
+		       size_t *sizeof_parameters_out);
+
+/*! @} */
+
+#ifdef __cplusplus
+}				/* extern "C" */
+#endif
+#endif
diff --git a/psm2_hal.c b/psm2_hal.c
new file mode 100644
index 0000000..100ceaf
--- /dev/null
+++ b/psm2_hal.c
@@ -0,0 +1,348 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license. When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2017 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+ + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2017 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "psm_user.h" +#include "psm2_hal.h" + +#include "ptl_ips/ips_scb.h" + +static SLIST_HEAD(, _psmi_hal_instance) head_hi; + +/* define the current hal instance pointer */ +psmi_hal_instance_t *psmi_hal_current_hal_instance = NULL; + +/* psmi_hal_register_instance */ +void psmi_hal_register_instance(psmi_hal_instance_t *psm_hi) +{ +#define REJECT_IMPROPER_HI(MEMBER) if (!psm_hi->MEMBER) return + + /* If an attempt to register a hal instance contains a NULL func ptr, reject it. 
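+	   For example, REJECT_IMPROPER_HI(hfp_initialize) expands to
+	   "if (!psm_hi->hfp_initialize) return", so the registration attempt
+	   is silently dropped when a required entry point is missing.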
*/ + /* To allow fast lookups, please keep this code segment alphabetized by hfp_* + func ptr member name: */ +#if PSMI_HAL_INST_CNT > 1 + REJECT_IMPROPER_HI(hfp_ack_hfi_event); + REJECT_IMPROPER_HI(hfp_check_rhf_sequence_number); + REJECT_IMPROPER_HI(hfp_cl_q_empty); + REJECT_IMPROPER_HI(hfp_close_context); + REJECT_IMPROPER_HI(hfp_context_open); + REJECT_IMPROPER_HI(hfp_dma_slot_available); + REJECT_IMPROPER_HI(hfp_finalize); + REJECT_IMPROPER_HI(hfp_forward_packet_to_subcontext); + REJECT_IMPROPER_HI(hfp_free_tid); + REJECT_IMPROPER_HI(hfp_get_bthqp); + REJECT_IMPROPER_HI(hfp_get_cc_settings_bin); + REJECT_IMPROPER_HI(hfp_get_cc_table_bin); + REJECT_IMPROPER_HI(hfp_get_cl_q_head_index); + REJECT_IMPROPER_HI(hfp_get_cl_q_tail_index); + REJECT_IMPROPER_HI(hfp_get_context); + REJECT_IMPROPER_HI(hfp_get_egr_buff); + REJECT_IMPROPER_HI(hfp_get_fd); + REJECT_IMPROPER_HI(hfp_get_gid_hi); + REJECT_IMPROPER_HI(hfp_get_gid_lo); + REJECT_IMPROPER_HI(hfp_get_hfi_event_bits); + REJECT_IMPROPER_HI(hfp_get_hfi_type); + REJECT_IMPROPER_HI(hfp_get_hw_status); + REJECT_IMPROPER_HI(hfp_get_hw_status_freezemsg); + REJECT_IMPROPER_HI(hfp_get_jkey); + REJECT_IMPROPER_HI(hfp_get_lid); + REJECT_IMPROPER_HI(hfp_get_node_id); + REJECT_IMPROPER_HI(hfp_get_num_contexts); + REJECT_IMPROPER_HI(hfp_get_num_free_contexts); + REJECT_IMPROPER_HI(hfp_get_pio_size); + REJECT_IMPROPER_HI(hfp_get_pio_stall_cnt); + REJECT_IMPROPER_HI(hfp_get_port_active); + REJECT_IMPROPER_HI(hfp_get_port_gid); + REJECT_IMPROPER_HI(hfp_get_port_index2pkey); + REJECT_IMPROPER_HI(hfp_get_port_lid); + REJECT_IMPROPER_HI(hfp_get_port_lmc); + REJECT_IMPROPER_HI(hfp_get_port_num); + REJECT_IMPROPER_HI(hfp_get_port_rate); + REJECT_IMPROPER_HI(hfp_get_port_sc2vl); + REJECT_IMPROPER_HI(hfp_get_port_sl2sc); + REJECT_IMPROPER_HI(hfp_get_receive_event); + REJECT_IMPROPER_HI(hfp_get_rhf_expected_sequence_number); + REJECT_IMPROPER_HI(hfp_get_rx_egr_tid_cnt); + REJECT_IMPROPER_HI(hfp_get_rx_hdr_q_cnt); + REJECT_IMPROPER_HI(hfp_get_rx_hdr_q_ent_size); + REJECT_IMPROPER_HI(hfp_get_sdma_req_size); + REJECT_IMPROPER_HI(hfp_get_sdma_ring_size); + REJECT_IMPROPER_HI(hfp_get_sdma_ring_slot_status); + REJECT_IMPROPER_HI(hfp_get_subctxt); + REJECT_IMPROPER_HI(hfp_get_subctxt_cnt); + REJECT_IMPROPER_HI(hfp_get_tid_exp_cnt); + REJECT_IMPROPER_HI(hfp_get_tidcache_invalidation); + REJECT_IMPROPER_HI(hfp_get_unit_active); + REJECT_IMPROPER_HI(hfp_get_unit_id); + REJECT_IMPROPER_HI(hfp_get_user_major_bldtime_version); + REJECT_IMPROPER_HI(hfp_get_user_major_runtime_version); + REJECT_IMPROPER_HI(hfp_get_user_minor_bldtime_version); + REJECT_IMPROPER_HI(hfp_get_user_minor_runtime_version); + REJECT_IMPROPER_HI(hfp_hfi_reset_context); + REJECT_IMPROPER_HI(hfp_poll_type); + REJECT_IMPROPER_HI(hfp_retire_hdr_q_entry); + REJECT_IMPROPER_HI(hfp_set_cl_q_head_index); + REJECT_IMPROPER_HI(hfp_set_cl_q_tail_index); + REJECT_IMPROPER_HI(hfp_set_effective_mtu); + REJECT_IMPROPER_HI(hfp_set_pbc); + REJECT_IMPROPER_HI(hfp_set_pio_size); + REJECT_IMPROPER_HI(hfp_set_pkey); + REJECT_IMPROPER_HI(hfp_set_rhf_expected_sequence_number); + REJECT_IMPROPER_HI(hfp_set_tf_valid); + REJECT_IMPROPER_HI(hfp_spio_fini); + REJECT_IMPROPER_HI(hfp_spio_init); + REJECT_IMPROPER_HI(hfp_spio_process_events); + REJECT_IMPROPER_HI(hfp_spio_transfer_frame); + REJECT_IMPROPER_HI(hfp_subcontext_ureg_get); + REJECT_IMPROPER_HI(hfp_tidflow_check_update_pkt_seq); + REJECT_IMPROPER_HI(hfp_tidflow_get); + REJECT_IMPROPER_HI(hfp_tidflow_get_enabled); + REJECT_IMPROPER_HI(hfp_tidflow_get_flowvalid); + 
REJECT_IMPROPER_HI(hfp_tidflow_get_genmismatch);
+	REJECT_IMPROPER_HI(hfp_tidflow_get_genval);
+	REJECT_IMPROPER_HI(hfp_tidflow_get_hw);
+	REJECT_IMPROPER_HI(hfp_tidflow_get_keep_after_seqerr);
+	REJECT_IMPROPER_HI(hfp_tidflow_get_keep_on_generr);
+	REJECT_IMPROPER_HI(hfp_tidflow_get_keep_payload_on_generr);
+	REJECT_IMPROPER_HI(hfp_tidflow_get_seqmismatch);
+	REJECT_IMPROPER_HI(hfp_tidflow_get_seqnum);
+	REJECT_IMPROPER_HI(hfp_tidflow_reset);
+	REJECT_IMPROPER_HI(hfp_tidflow_set_entry);
+	REJECT_IMPROPER_HI(hfp_update_tid);
+	REJECT_IMPROPER_HI(hfp_writev);
+#endif
+	REJECT_IMPROPER_HI(hfp_get_default_pkey);
+	REJECT_IMPROPER_HI(hfp_get_num_ports);
+	REJECT_IMPROPER_HI(hfp_get_num_units);
+	REJECT_IMPROPER_HI(hfp_initialize);
+
+	SLIST_INSERT_HEAD(&head_hi, psm_hi, next_hi);
+	sysfs_init(psm_hi->hfi_sys_class_path);
+}
+
+static struct _psmi_hal_instance *psmi_hal_get_pi_inst(int *pnumunits,
+							int *pnumports,
+							int *pdflt_pkey);
+
+#if PSMI_HAL_INST_CNT > 1
+
+int psmi_hal_pre_init_func(enum psmi_hal_pre_init_func_krnls k, ...)
+{
+	va_list ap;
+	va_start(ap, k);
+
+	int rv = 0, numunits, numports, dflt_pkey;
+	struct _psmi_hal_instance *p = psmi_hal_get_pi_inst(&numunits,
+							    &numports,
+							    &dflt_pkey);
+	if (!p)
+		rv = -1;
+	else
+	{
+		switch(k)
+		{
+		case psmi_hal_pre_init_func_get_num_units:
+			rv = numunits;
+			break;
+		case psmi_hal_pre_init_func_get_num_ports:
+			rv = numports;
+			break;
+		case psmi_hal_pre_init_func_get_unit_active:
+			rv = p->hfp_get_unit_active( va_arg(ap,int) );
+			break;
+		case psmi_hal_pre_init_func_get_port_active:
+			rv = p->hfp_get_port_active( va_arg(ap,int),
+						     va_arg(ap,int) );
+			break;
+		case psmi_hal_pre_init_func_get_num_contexts:
+			rv = p->hfp_get_num_contexts( va_arg(ap,int) );
+			break;
+		case psmi_hal_pre_init_func_get_num_free_contexts:
+			rv = p->hfp_get_num_free_contexts( va_arg(ap,int) );
+			break;
+		default:
+			rv = -1;
+			break;
+		}
+	}
+
+	va_end(ap);
+	return rv;
+}
+
+#endif
+
+
+static struct _psmi_hal_instance *psmi_hal_get_pi_inst(int *pnumunits,
+						       int *pnumports,
+						       int *pdflt_pkey)
+{
+	if (SLIST_EMPTY(&head_hi))
+		return NULL;
+
+	/* At this point, assuming there are multiple HAL INSTANCES that are
+	   registered, and two or more of the HAL INSTANCES are capable
+	   of initialization on a host, the environment variable PSM2_HAL_PREF
+	   allows the user to identify the one HAL INSTANCE that is desired to
+	   be used. The default policy is, when PSM2_HAL_PREF is not set, the
+	   first hal instance that successfully initializes is used. */
+
+	union psmi_envvar_val env_hi_pref; /* HAL instance preference */
+	psmi_getenv("PSM2_HAL_PREF",
+		    "Indicate preference for HAL instance (default is to use "
+		    "the first HAL instance that successfully initializes)",
+		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
+		    (union psmi_envvar_val)PSM_HAL_INSTANCE_ANY_GEN, &env_hi_pref);
+
+	int wait = 0;
+	/* The hfp_get_num_units() call below will not wait for the HFI driver
+	   to come up and create device nodes in /dev/.
*/ + struct _psmi_hal_instance *p; + SLIST_FOREACH(p, &head_hi, next_hi) + { + if ((env_hi_pref.e_int == PSM_HAL_INSTANCE_ANY_GEN) || + (p->type == env_hi_pref.e_int)) + { + int nunits = p->hfp_get_num_units(wait); + int nports = p->hfp_get_num_ports(); + int dflt_pkey = p->hfp_get_default_pkey(); + if (nunits > 0 && nports > 0 && dflt_pkey > 0) + { + sysfs_init(p->hfi_sys_class_path); + *pnumunits = nunits; + *pnumports = nports; + *pdflt_pkey = dflt_pkey; + return p; + } + } + } + return NULL; +} + +/* psmi_hal_initialize */ +int psmi_hal_initialize(void) +{ + int nunits = 0; + int nports = 0; + int dflt_pkey = 0; + struct _psmi_hal_instance *p = psmi_hal_get_pi_inst(&nunits, &nports, &dflt_pkey); + + if (!p) + return -PSM_HAL_ERROR_INIT_FAILED; + + memset(&p->params,0,sizeof(p->params)); + + int rv = p->hfp_initialize(p); + + if (!rv) + { + p->params.num_units = nunits; + p->params.num_ports = nports; + p->params.default_pkey = dflt_pkey; + psmi_hal_current_hal_instance = p; + + if (psmi_hal_has_cap(PSM_HAL_CAP_HDRSUPP)) { + union psmi_envvar_val env_hdrsupp; + + psmi_getenv("PSM2_HDRSUPP", + "Receive header suppression. Default is 1 (enabled)," + " 0 to disable.\n", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val)1, &env_hdrsupp); + if (env_hdrsupp.e_uint) + psmi_hal_add_sw_status(PSM_HAL_HDRSUPP_ENABLED); + else + /* user wants to disable header suppression */ + psmi_hal_set_tf_valid(0, p); + } + + return rv; + } + return -PSM_HAL_ERROR_INIT_FAILED; +} + +#ifdef PSM2_MOCK_TESTING + +#include "psm_hal_gen1/opa_user_gen1.h" + +void ips_ptl_non_dw_mul_sdma_init(void) +{ + uint16_t major_version = hfi_get_user_major_version(); + uint16_t minor_version = hfi_get_user_minor_version(); + int allow_non_dw_mul = 0; + + if ((major_version > HFI1_USER_SWMAJOR_NON_DW_MUL_MSG_SIZE_ALLOWED) || + ((major_version == HFI1_USER_SWMAJOR_NON_DW_MUL_MSG_SIZE_ALLOWED) && + (minor_version >= HFI1_USER_SWMINOR_NON_DW_MUL_MSG_SIZE_ALLOWED))) + { + allow_non_dw_mul = 1; + } + psmi_hal_current_hal_instance->params.cap_mask = 0; + if (allow_non_dw_mul) + psmi_hal_current_hal_instance->params.cap_mask |= PSM_HAL_CAP_NON_DW_MULTIPLE_MSG_SIZE; +} + +void set_sdma_ring_size_in_MOCK_HAL_instance(int sdma_ring_size) +{ + extern int __psm_hal_mock_sdma_ring_size; + + __psm_hal_mock_sdma_ring_size = sdma_ring_size; +} + +void set_comp_entry(struct hfi1_sdma_comp_entry *pce) +{ + extern struct hfi1_sdma_comp_entry * __psm_hal_mock_hfi1_sdma_comp_entry; + + __psm_hal_mock_hfi1_sdma_comp_entry = pce; +} + +#endif diff --git a/psm2_hal.h b/psm2_hal.h new file mode 100644 index 0000000..e2367f5 --- /dev/null +++ b/psm2_hal.h @@ -0,0 +1,882 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2017 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2017 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef __PSM2_HAL_H__ + +#define __PSM2_HAL_H__ + +#include "psm_user.h" + +/* Forward declaration of PSM structs: */ +struct ips_subcontext_ureg; +struct ips_recvhdrq_event; +struct ips_writehdrq; +struct ips_flow; +struct ips_scb; +struct ips_tid_session_list_tag; +struct ips_epinfo; +struct ips_message_header; + +/* Declare types: */ +typedef enum +{ + PSM_HAL_INSTANCE_ANY_GEN = 0, + PSM_HAL_INSTANCE_GEN1 = 1, + PSM_HAL_INSTANCE_GEN2 = 2, + PSM_HAL_INSTANCE_GEN3 = 3, + +#ifdef PSM2_MOCK_TESTING + PSM_HAL_INSTANCE_MOCK = 99, +#endif +} psmi_hal_instance_type; + +typedef enum +{ + /* Operation was successful. No error occurred. */ + PSM_HAL_ERROR_OK = 0, + /* The operation can not be done unless HAL is initialized first. */ + PSM_HAL_ERROR_NOT_INITIALIZED = 1, + /* No HAL INSTANCE has been registered. Initialization is impossible. */ + PSM_HAL_ERROR_NO_HI_REGISTERED = 2, + /* Initialization failure. */ + PSM_HAL_ERROR_INIT_FAILED = 3, + /* Can't open device file. */ + PSM_HAL_ERROR_CANNOT_OPEN_DEVICE = 4, + /* Can't open context. */ + PSM_HAL_ERROR_CANNOT_OPEN_CONTEXT = 5, + /* Context is not open. */ + PSM_HAL_ERROR_CONTEXT_IS_NOT_OPEN = 6, + /* General error. */ + PSM_HAL_ERROR_GENERAL_ERROR = 7, + /* Not implemented. */ + PSM_HAL_ERROR_NOT_IMPLEMENTED = 8, + /* Internal error. */ + PSM_HAL_ERROR_INTERNAL_ERROR = 9, + + /* HAL instances should not return errors less than the value + PSM_HAL_ERROR_RESERVED_BY_HAL_API. These errors are reserved by + the HAL API layer. 
*/ + PSM_HAL_ERROR_RESERVED_BY_HAL_API = 1000, +} psmi_hal_errors; + +typedef enum +{ + PSM_HAL_HW_STATUS_INITTED = (1UL << 0), + PSM_HAL_HW_STATUS_CHIP_PRESENT = (1UL << 1), + PSM_HAL_HW_STATUS_IB_READY = (1UL << 2), + PSM_HAL_HW_STATUS_IB_CONF = (1UL << 3), + PSM_HAL_HW_STATUS_HWERROR = (1UL << 4) +} psmi_hal_hw_status; + +typedef enum +{ + PSM_HAL_HFI_EVENT_FROZEN = (1UL << 0), + PSM_HAL_HFI_EVENT_LINKDOWN = (1UL << 1), + PSM_HAL_HFI_EVENT_LID_CHANGE = (1UL << 2), + PSM_HAL_HFI_EVENT_LMC_CHANGE = (1UL << 3), + PSM_HAL_HFI_EVENT_SL2VL_CHANGE = (1UL << 4), + PSM_HAL_HFI_EVENT_TID_MMU_NOTIFY = (1UL << 5) +} psmi_hal_hfi_events; + +/* The following enum constants correspond to the bits in the + cap_mask member of the psmi_hal_params_t. */ +typedef enum +{ + PSM_HAL_CAP_SDMA = (1UL << 0), + PSM_HAL_CAP_SDMA_AHG = (1UL << 1), + PSM_HAL_CAP_EXTENDED_PSN = (1UL << 2), + PSM_HAL_CAP_HDRSUPP = (1UL << 3), + PSM_HAL_CAP_USE_SDMA_HEAD = (1UL << 4), + PSM_HAL_CAP_MULTI_PKT_EGR = (1UL << 5), + PSM_HAL_CAP_NODROP_RHQ_FULL = (1UL << 6), + PSM_HAL_CAP_NODROP_EGR_FULL = (1UL << 7), + PSM_HAL_CAP_TID_UNMAP = (1UL << 8), + PSM_HAL_CAP_PRINT_UNIMPL = (1UL << 9), + PSM_HAL_CAP_ALLOW_PERM_JKEY = (1UL << 10), + PSM_HAL_CAP_NO_INTEGRITY = (1UL << 11), + PSM_HAL_CAP_PKEY_CHECK = (1UL << 12), + PSM_HAL_CAP_STATIC_RATE_CTRL = (1UL << 13), + PSM_HAL_CAP_SDMA_HEAD_CHECK = (1UL << 14), + PSM_HAL_CAP_EARLY_CREDIT_RETURN = (1UL << 15), + PSM_HAL_CAP_GPUDIRECT_OT = (1UL << 16), + PSM_HAL_CAP_DMA_HSUPP_FOR_32B_MSGS = (1UL << 17), + PSM_HAL_CAP_RSM_FECN_SUPP = (1UL << 18), + PSM_HAL_CAP_MERGED_TID_CTRLS = (1UL << 19), + PSM_HAL_CAP_NON_DW_MULTIPLE_MSG_SIZE = (1UL << 20), +} psmi_hal_capability_bits; + +/* The following enum constants correspond to the bits in the + sw_status member of the psmi_hal_params_t. */ +typedef enum +{ + /* Request to start rx thread. */ + PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD = (1UL << 0), + /* Rx thread is started. */ + PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED = (1UL << 1), + PSM_HAL_PSMI_RUNTIME_INTR_ENABLED = (1UL << 2), + /* Header suppression is enabled: */ + PSM_HAL_HDRSUPP_ENABLED = (1UL << 3), +} psmi_hal_sw_status; + +/* The _psmi_hal_params structure stores values that remain constant for the entire life of + the process and this structure resides in the hal instance structure (below). + The values are settled after the context is opened. */ +typedef struct _psmi_hal_params +{ + uint16_t num_units; + uint16_t num_ports; + uint32_t cap_mask; + uint32_t sw_status; + uint16_t default_pkey; +} psmi_hal_params_t; + +/* HAL assumes that the rx hdr q and the egr buff q are circular lists + with two important indexes: + + head - software takes from this side of the circular list + tail - hardware deposits new content here + +The indexes advance in the list 0, 1, 2, 3, ... until they reach the value: +(number_of_entries_in_the_q-1), then the next value they take is 0. And, +so, that is why these are called circular lists. + +When the head idx == tail idx, that represents an empty circular list. + +A completely full circular list is when: + + head_idx == (tail_idx + 1) % number_of_entries_in_the_q + +Both indexes will always be in the range: 0 <= index < number_of_entries_in_the_q + +After software receives the packet in the slot corresponding to the head idx, +and processes it completely, software will signal to the hardware that the slot +is available for re-use by retiring it - see api below for details. 
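+
+For example, with number_of_entries_in_the_q == 4: head_idx == tail_idx == 3
+is an empty list, while head_idx == 0 with tail_idx == 3 is a full list,
+since (3 + 1) % 4 == 0. A sketch of the two tests this implies (illustrative
+only, not part of the HAL API):
+
+  static int cl_q_is_empty(uint32_t head, uint32_t tail)
+  { return head == tail; }
+  static int cl_q_is_full(uint32_t head, uint32_t tail, uint32_t nentries)
+  { return head == (tail + 1) % nentries; }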
+ +Note that these are simplified assumptions for the benefit of the hardware independent +layer of PSM. The actual implementation details are hidden in the hal instances. + +Note that subcontexts have a collection of head / tail indexes for their use. + +So, HAL supports the use of the following circular lists dealing with the +following entities: + +1. Rx Hdr q - corresponding to hardware (software modifies head index, hardware modifies tail index). +2. Rx egr q - corresponding to hardware (software modifies head index, hardware modifies tail index). +3. Rx Hdr q - corresponding to a subcontext (software modifies both head and tail indexes). +4. Rx egr q - corresponding to a subcontext (software modifies both head and tail indexes). + +Declare a type to indicate a circular list index: +*/ +typedef uint32_t psmi_hal_cl_idx; + +typedef enum +{ + PSM_HAL_CL_Q_RX_HDR_Q = 0, /* HW context for the rx hdr q. */ + PSM_HAL_CL_Q_RX_EGR_Q = 1, /* HW context for the rx eager q. */ + /* Start of subcontexts (This is subcontext 0) */ + PSM_HAL_CL_Q_RX_HDR_Q_SC_0 = 2, /* Subcontext 0's rx hdr q. */ + PSM_HAL_CL_Q_RX_EGR_Q_SC_0 = 3, /* Subcontext 0's rx eager q. */ + + /* Following SC 0's CL_Q's are the circular list q for subcontexts 1-7, + two per subcontext. Even values are the rx hdr q for the subcontext + Odd value are for the eager q. */ + +/* Given a subcontext number (0-7), return the CL_Q for the RX HDR_Q: */ +#define PSM_HAL_GET_SC_CL_Q_RX_HDR_Q(SC) ((SC)*2 + PSM_HAL_CL_Q_RX_HDR_Q_SC_0) +/* Given a subcontext number (0-7), return the CL_Q for the RX EGR_Q: */ +#define PSM_HAL_GET_SC_CL_Q_RX_EGR_Q(SC) ((SC)*2 + PSM_HAL_CL_Q_RX_EGR_Q_SC_0) +} psmi_hal_cl_q; + +#define PSM_HAL_MAX_SHARED_CTXTS 8 + +#define PSM_HAL_ALG_ACROSS 0 +#define PSM_HAL_ALG_WITHIN 1 +#define PSM_HAL_ALG_ACROSS_ALL 2 + +typedef enum +{ + PSM_HAL_EXP = 0, + PSM_HAL_EGR = 1, +} psmi_hal_set_sdma_req_type; + +#define PSM_HAL_SDMA_REQ_VERSION_MASK 0xF +#define PSM_HAL_SDMA_REQ_VERSION_SHIFT 0x0 +#define PSM_HAL_SDMA_REQ_OPCODE_MASK 0xF +#define PSM_HAL_SDMA_REQ_OPCODE_SHIFT 0x4 +#define PSM_HAL_SDMA_REQ_IOVCNT_MASK 0xFF +#define PSM_HAL_SDMA_REQ_IOVCNT_SHIFT 0x8 + +#ifdef PSM_CUDA +#define PSM_HAL_BUF_GPU_MEM 1 +#endif + +struct psm_hal_sdma_req_info { + /* + * bits 0-3 - version (currently used only for GPU direct) + * 1 - user space is NOT using flags field + * 2 - user space is using flags field + * bits 4-7 - opcode (enum sdma_req_opcode) + * bits 8-15 - io vector count + */ + __u16 ctrl; + /* + * Number of fragments contained in this request. + * User-space has already computed how many + * fragment-sized packet the user buffer will be + * split into. + */ + __u16 npkts; + /* + * Size of each fragment the user buffer will be + * split into. + */ + __u16 fragsize; + /* + * Index of the slot in the SDMA completion ring + * this request should be using. User-space is + * in charge of managing its own ring. + */ + __u16 comp_idx; +#ifdef PSM_CUDA + /* + * Buffer flags for this request. See HFI1_BUF_* + */ + __u16 flags; + /* The extra bytes for the PSM_CUDA version of the sdma req info + * struct is the size of the flags member. 
*/ +#define PSM_HAL_CUDA_SDMA_REQ_INFO_EXTRA sizeof(__u16) +#endif +} __attribute__((packed)); + + +typedef enum { + PSM_HAL_SDMA_RING_AVAILABLE = 0, + PSM_HAL_SDMA_RING_QUEUED = 1, + PSM_HAL_SDMA_RING_COMPLETE = 2, + PSM_HAL_SDMA_RING_ERROR = 3, +} psmi_hal_sdma_ring_slot_status; + +typedef uint64_t psmi_hal_raw_rhf_t; + +typedef struct psmi_hal_rhf_ +{ + /* The first entity in rhf is the decomposed rhf. + Each HAL instance, in hfp_get_receive_event(), will decompose the raw rhf + obtained from the hardware and deposit the data into this common + decomposed rhf, so the upper layers of psm can find the data in one + uniform place. */ + + uint64_t decomposed_rhf; + + /* The second entry is the raw rhf that comes from the h/w. + The upper layers of psm should not use the raw rhf, instead use the + decomposed rhf above. The raw rhf is intended for use by the HAL + instance only. */ + uint64_t raw_rhf; +} psmi_hal_rhf_t; + +#define PSMI_HAL_RHF_ERR_ICRC_NBITS 1 +#define PSMI_HAL_RHF_ERR_ICRC_SHFTC 63 +#define PSMI_HAL_RHF_ERR_RSRV_NBITS 1 +#define PSMI_HAL_RHF_ERR_RSRV_SHFTC 62 +#define PSMI_HAL_RHF_ERR_ECC_NBITS 1 +#define PSMI_HAL_RHF_ERR_ECC_SHFTC 61 +#define PSMI_HAL_RHF_ERR_LEN_NBITS 1 +#define PSMI_HAL_RHF_ERR_LEN_SHFTC 60 +#define PSMI_HAL_RHF_ERR_TID_NBITS 1 +#define PSMI_HAL_RHF_ERR_TID_SHFTC 59 +#define PSMI_HAL_RHF_ERR_TFGEN_NBITS 1 +#define PSMI_HAL_RHF_ERR_TFGEN_SHFTC 58 +#define PSMI_HAL_RHF_ERR_TFSEQ_NBITS 1 +#define PSMI_HAL_RHF_ERR_TFSEQ_SHFTC 57 +#define PSMI_HAL_RHF_ERR_RTE_NBITS 3 +#define PSMI_HAL_RHF_ERR_RTE_SHFTC 56 +#define PSMI_HAL_RHF_ERR_DC_NBITS 1 +#define PSMI_HAL_RHF_ERR_DC_SHFTC 55 +#define PSMI_HAL_RHF_ERR_DCUN_NBITS 1 +#define PSMI_HAL_RHF_ERR_DCUN_SHFTC 54 +#define PSMI_HAL_RHF_ERR_KHDRLEN_NBITS 1 +#define PSMI_HAL_RHF_ERR_KHDRLEN_SHFTC 53 +#define PSMI_HAL_RHF_ALL_ERR_FLAGS_NBITS (PSMI_HAL_RHF_ERR_ICRC_NBITS + PSMI_HAL_RHF_ERR_RSRV_NBITS \ + + PSMI_HAL_RHF_ERR_ECC_NBITS \ + + PSMI_HAL_RHF_ERR_LEN_NBITS + PSMI_HAL_RHF_ERR_TID_NBITS \ + + PSMI_HAL_RHF_ERR_TFGEN_NBITS + PSMI_HAL_RHF_ERR_TFSEQ_NBITS \ + + PSMI_HAL_RHF_ERR_RTE_NBITS + PSMI_HAL_RHF_ERR_DC_NBITS \ + + PSMI_HAL_RHF_ERR_DCUN_NBITS + PSMI_HAL_RHF_ERR_KHDRLEN_NBITS) +#define PSMI_HAL_RHF_ALL_ERR_FLAGS_SHFTC 53 +#define PSMI_HAL_RHF_EGR_BUFF_OFF_NBITS 12 +#define PSMI_HAL_RHF_EGR_BUFF_OFF_SHFTC 32 +#define PSMI_HAL_RHF_SEQ_NBITS 4 +#define PSMI_HAL_RHF_SEQ_SHFTC 28 +#define PSMI_HAL_RHF_EGR_BUFF_IDX_NBITS 11 +#define PSMI_HAL_RHF_EGR_BUFF_IDX_SHFTC 16 +#define PSMI_HAL_RHF_USE_EGR_BUFF_NBITS 1 +#define PSMI_HAL_RHF_USE_EGR_BUFF_SHFTC 15 +#define PSMI_HAL_RHF_RX_TYPE_NBITS 3 +#define PSMI_HAL_RHF_RX_TYPE_SHFTC 12 +#define PSMI_HAL_RHF_PKT_LEN_NBITS 12 +#define PSMI_HAL_RHF_PKT_LEN_SHFTC 0 + +typedef enum { + PSM_HAL_RHF_RX_TYPE_EXPECTED = 0, + PSM_HAL_RHF_RX_TYPE_EAGER = 1, + PSM_HAL_RHF_RX_TYPE_NON_KD = 2, + PSM_HAL_RHF_RX_TYPE_ERROR = 3 +} psmi_hal_rhf_rx_type; + +struct psm_hal_pbc { + __u32 pbc0; + __u16 PbcStaticRateControlCnt; + __u16 fill1; +}; + +typedef enum { + PSMI_HAL_POLL_TYPE_URGENT = 1 +} psmi_hal_poll_type; + +/* Forward declaration of incomplete struct type _psmi_hal_instance and + * psmi_hal_instance_t typedef: */ + +struct _psmi_hal_instance; +typedef struct _psmi_hal_instance psmi_hal_instance_t; + +struct _psmi_hal_instance +{ + SLIST_ENTRY(_psmi_hal_instance) next_hi; + psmi_hal_instance_type type; + const char *description; + const char *hfi_name; + const char *hfi_sys_class_path; + /* The params member should be read-only for HIC, and + written only by the HAL instance. 
+	*/
+	psmi_hal_params_t params;
+	/* Initialize the HAL INSTANCE. */
+	int (*hfp_initialize)(psmi_hal_instance_t *);
+	/* Finalize the HAL INSTANCE. */
+	int (*hfp_finalize)(void);
+
+	/* Returns the number of hfi units installed on this host.
+	   NOTE: hfp_get_num_units is a function that must
+	   be callable before the hal instance is initialized. */
+	int (*hfp_get_num_units)(int wait);
+
+	/* Returns the number of ports on each hfi unit installed
+	   on this host.
+	   NOTE: hfp_get_num_ports is a function that must
+	   be callable before the hal instance is initialized. */
+	int (*hfp_get_num_ports)(void);
+
+	/* Returns the default pkey.
+	   NOTE: hfp_get_default_pkey is a function that must
+	   be callable before the hal instance is initialized. */
+	int (*hfp_get_default_pkey)(void);
+
+	/* Given a unit number, returns 1 if any port on the unit is active,
+	   0 if no port on the unit is active, and
+	   -1 if an error occurred.
+	   NOTE: hfp_get_unit_active is a function that must
+	   be callable before the hal instance is initialized. */
+	int (*hfp_get_unit_active)(int unit);
+
+	int (*hfp_get_port_active)(int unit,int port);
+	/* NOTE: hfp_get_num_contexts is a function that must
+	   be callable before the hal instance is initialized. */
+	int (*hfp_get_num_contexts)(int unit);
+	/* NOTE: hfp_get_num_free_contexts is a function that must
+	   be callable before the hal instance is initialized. */
+	int (*hfp_get_num_free_contexts)(int unit);
+
+	/* Context open includes opening the device file and getting the hw params. */
+	int (*hfp_context_open)(int unit,
+				int port,
+				uint64_t open_timeout,
+				psm2_ep_t ep,
+				psm2_uuid_t const job_key,
+				psmi_context_t *psm_ctxt,
+				uint32_t cap_mask,
+				unsigned retryCnt);
+
+	/* Close the context, including the device file. */
+	int (*hfp_close_context)(psmi_hal_hw_context *);
+
+	/* Given a unit, port and index, return an error, or the corresponding
+	   pkey for the index as programmed by the SM.
+	   Returns an int, so -1 indicates an error. */
+	int (*hfp_get_port_index2pkey)(int unit, int port, int index);
+	int (*hfp_get_cc_settings_bin)(int unit, int port, char *ccabuf, size_t len_ccabuf);
+	int (*hfp_get_cc_table_bin)(int unit, int port, uint16_t **cctp);
+	int (*hfp_get_port_lmc)(int unit, int port);
+	int (*hfp_get_port_rate)(int unit, int port);
+	int (*hfp_get_port_sl2sc)(int unit, int port,int sl);
+	int (*hfp_get_port_sc2vl)(int unit, int port,int sc);
+	int (*hfp_set_pkey)(psmi_hal_hw_context, uint16_t);
+	int (*hfp_poll_type)(uint16_t poll_type, psmi_hal_hw_context);
+	int (*hfp_get_port_lid)(int unit, int port);
+	int (*hfp_get_port_gid)(int unit, int port, uint64_t *hi, uint64_t *lo);
+	int (*hfp_free_tid)(psmi_hal_hw_context, uint64_t tidlist, uint32_t tidcnt);
+	int (*hfp_get_tidcache_invalidation)(psmi_hal_hw_context, uint64_t tidlist, uint32_t *tidcnt);
+	int (*hfp_update_tid)(psmi_hal_hw_context, uint64_t vaddr, uint32_t *length,
+			      uint64_t tidlist, uint32_t *tidcnt,
+			      uint16_t flags);
+	/* Initiate a DMA.  Intrinsically specifies a DMA slot to use. */
+	int (*hfp_writev)(const struct iovec *iov, int iovcnt, struct ips_epinfo *, psmi_hal_hw_context);
+	/* Updates PSM from h/w on DMA completions: */
+	int (*hfp_get_sdma_ring_slot_status)(int slotIdx, psmi_hal_sdma_ring_slot_status *, uint32_t *errorCode, psmi_hal_hw_context);
+	/* Returns > 0 if the specified slot is available, 0 if it is not available,
+	   and a negative value if an error occurred.
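+
+	   As a sketch only (the caller-side names here are hypothetical), a
+	   sender could gate an SDMA launch on slot availability like so:
+
+	     int avail = hal->hfp_dma_slot_available(slot, ctxt);
+	     if (avail > 0)
+	             launch_sdma(slot);      // slot is free: safe to reuse
+	     else if (avail == 0)
+	             poll_completions();     // completion ring still owns the slot
+	     else
+	             handle_error();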
+	*/
+	int (*hfp_dma_slot_available)(int slotidx, psmi_hal_hw_context);
+
+	/* Start of receive packet functions. */
+
+	/* Getter for cl q head indexes: */
+	psmi_hal_cl_idx (*hfp_get_cl_q_head_index)(psmi_hal_cl_q,
+						   psmi_hal_hw_context);
+
+	/* Getter for cl q tail indexes: */
+	psmi_hal_cl_idx (*hfp_get_cl_q_tail_index)(psmi_hal_cl_q,
+						   psmi_hal_hw_context);
+
+	/* Setter for cl q head indexes: */
+	void (*hfp_set_cl_q_head_index)(psmi_hal_cl_idx,
+					psmi_hal_cl_q,
+					psmi_hal_hw_context);
+
+	/* Setter for cl q tail indexes: */
+	void (*hfp_set_cl_q_tail_index)(psmi_hal_cl_idx,
+					psmi_hal_cl_q,
+					psmi_hal_hw_context);
+
+	/* Indicate whether the cl q is empty.
+	   When this returns > 0, the cl q is empty.
+	   When this returns == 0, the cl q is NOT empty (there are packets in the
+	   circular list that are available to receive).
+	   When this returns < 0, an error occurred.
+	   The head_idx parameter should correspond to the head index of the
+	   cl q circular list. */
+	int (*hfp_cl_q_empty)(psmi_hal_cl_idx head_idx,
+			      psmi_hal_cl_q,
+			      psmi_hal_hw_context);
+
+	/* Receive the raw rhf, decompose it, and then receive the ips_message_hdr. */
+	int (*hfp_get_receive_event)(psmi_hal_cl_idx head_idx, psmi_hal_hw_context,
+				     struct ips_recvhdrq_event *);
+
+	/* Deliver an eager buffer given the index.
+	   If the index does not refer to a current egr buffer, hfp_get_egr_buff()
+	   returns NULL. */
+	void *(*hfp_get_egr_buff)(psmi_hal_cl_idx, psmi_hal_cl_q, psmi_hal_hw_context);
+
+	/* Retire the given head idx of the header q, and change *head_idx to point
+	   to the next entry; lastly, set *emptyp to indicate whether the header q is
+	   empty at the new head_idx. */
+	int (*hfp_retire_hdr_q_entry)(psmi_hal_cl_idx *head_idx, psmi_hal_cl_q, psmi_hal_hw_context,
+				      uint32_t elemsz, uint32_t elemlast,
+				      int *emptyp);
+
+	/* Returns the expected sequence number for the RHF. */
+	int (*hfp_get_rhf_expected_sequence_number)(unsigned int *, psmi_hal_cl_q, psmi_hal_hw_context);
+
+	/* Sets the expected sequence number for the RHF. */
+	int (*hfp_set_rhf_expected_sequence_number)(unsigned int, psmi_hal_cl_q, psmi_hal_hw_context);
+
+	/* Checks the sequence number from the RHF.  Returns PSM_HAL_ERROR_OK if the
+	   sequence number is good, and something else if the sequence number is bad. */
+	int (*hfp_check_rhf_sequence_number)(unsigned int);
+
+	/* Set the PBC struct that lies within the extended memory region of the SCB. */
+	int (*hfp_set_pbc)(struct ips_proto *proto, struct ips_flow *flow,
+			   uint32_t isCtrlMsg, struct psm_hal_pbc *dest, uint32_t hdrlen,
+			   uint32_t paylen);
+
+
+	/* Start of tid flow functions. */
+	int (*hfp_set_tf_valid)(uint32_t, psmi_hal_hw_context);
+
+	int (*hfp_tidflow_set_entry)(uint32_t flowid, uint32_t genval,
+				     uint32_t seqnum, psmi_hal_hw_context);
+
+	int (*hfp_tidflow_reset)(psmi_hal_hw_context, uint32_t flowid, uint32_t genval,
+				 uint32_t seqnum);
+
+	int (*hfp_tidflow_get)(uint32_t flowid, uint64_t *ptf, psmi_hal_hw_context);
+
+	/* hfp_tidflow_get_hw is identical to hfp_tidflow_get(), but is guaranteed
+	   to get its information from the h/w and not from cached values.  It may
+	   be significantly slower than hfp_tidflow_get(), so it should be used for
+	   debug only.
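+
+	   For example (illustrative only, not code from this file), a debug
+	   check could verify that the cached and hardware views of a tidflow
+	   agree:
+
+	     uint64_t cached, hw;
+	     hal->hfp_tidflow_get(flowid, &cached, ctxt);
+	     hal->hfp_tidflow_get_hw(flowid, &hw, ctxt);
+	     assert(cached == hw);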
*/ + int (*hfp_tidflow_get_hw)(uint32_t flowid, uint64_t *ptf, psmi_hal_hw_context); + + int (*hfp_tidflow_get_seqnum)(uint64_t val, uint32_t *pseqn); + + int (*hfp_tidflow_get_genval)(uint64_t val, uint32_t *pgv); + + int (*hfp_tidflow_check_update_pkt_seq)(void *vpprotoexp + /* actually a: + struct ips_protoexp *protoexp */, + psmi_seqnum_t sequence_num, + void *vptidrecvc + /* actually a: + struct ips_tid_recv_desc *tidrecvc */, + struct ips_message_header *p_hdr, + void (*ips_protoexp_do_tf_generr) + (void *vpprotoexp + /* actually a: + struct ips_protoexp *protoexp */, + void *vptidrecvc + /* actually a: + struct ips_tid_recv_desc *tidrecvc */, + struct ips_message_header *p_hdr), + void (*ips_protoexp_do_tf_seqerr) + (void *vpprotoexp + /* actually a: + struct ips_protoexp *protoexp */, + void *vptidrecvc + /* actually a: + struct ips_tid_recv_desc *tidrecvc */, + struct ips_message_header *p_hdr) + ); + + int (*hfp_tidflow_get_flowvalid)(uint64_t val, uint32_t *pfv); + + int (*hfp_tidflow_get_enabled)(uint64_t val, uint32_t *penabled); + + int (*hfp_tidflow_get_keep_after_seqerr)(uint64_t val, uint32_t *pkase); + + int (*hfp_tidflow_get_keep_on_generr)(uint64_t val, uint32_t *pkoge); + + int (*hfp_tidflow_get_keep_payload_on_generr)(uint64_t val, uint32_t *pkpoge); + + /* For hfp_tidflow_get_seqmismatch and hfp_tidflow_get_genmismatch, if + val was obtained from hfp_tidflow_get_hw(), then these will be valid + but, if val was obtained from hfp_tidflow_get(), then these will + always return 0. */ + int (*hfp_tidflow_get_seqmismatch)(uint64_t val, uint32_t *psmm); + + int (*hfp_tidflow_get_genmismatch)(uint64_t val, uint32_t *pgmm); + + /* End of tid flow functions. */ + + /* End of receive functions. */ + + int (*hfp_forward_packet_to_subcontext)(struct ips_writehdrq *writeq, + struct ips_recvhdrq_event *rcv_ev, + uint32_t subcontext, + psmi_hal_hw_context); + int (*hfp_subcontext_ureg_get)(ptl_t *ptl, + struct ips_subcontext_ureg **uregp, + psmi_hal_hw_context); + + int (*hfp_get_hfi_event_bits) (uint64_t *event_bits, psmi_hal_hw_context); + + int (*hfp_ack_hfi_event) (uint64_t ack_bits, psmi_hal_hw_context); + + int (*hfp_hfi_reset_context) (psmi_hal_hw_context); + + uint64_t (*hfp_get_hw_status) (psmi_hal_hw_context); + + int (*hfp_get_hw_status_freezemsg) (volatile char** msg, psmi_hal_hw_context); + + uint16_t (*hfp_get_user_major_bldtime_version) (void); + + uint16_t (*hfp_get_user_minor_bldtime_version) (void); + + uint16_t (*hfp_get_user_major_runtime_version) (psmi_hal_hw_context); + + uint16_t (*hfp_get_user_minor_runtime_version) (psmi_hal_hw_context); + + int (*hfp_set_pio_size)(uint32_t, psmi_hal_hw_context); + + int (*hfp_set_effective_mtu)(uint32_t, psmi_hal_hw_context); + + int (*hfp_spio_init)(const psmi_context_t *context, + struct ptl *ptl, void **ctrl); + int (*hfp_spio_fini)(void **ctrl, psmi_hal_hw_context); + + int (*hfp_spio_transfer_frame)(struct ips_proto *proto, + struct ips_flow *flow, struct psm_hal_pbc *pbc, + uint32_t *payload, uint32_t length, + uint32_t isCtrlMsg, uint32_t cksum_valid, + uint32_t cksum, psmi_hal_hw_context +#ifdef PSM_CUDA + , uint32_t is_cuda_payload +#endif + ); + int (*hfp_spio_process_events)(const struct ptl *ptl); + int (*hfp_get_node_id)(int unit, int *nodep); + + int (*hfp_get_bthqp)(psmi_hal_hw_context); + int (*hfp_get_context)(psmi_hal_hw_context); + uint64_t (*hfp_get_gid_lo)(psmi_hal_hw_context); + uint64_t (*hfp_get_gid_hi)(psmi_hal_hw_context); + int (*hfp_get_hfi_type)(psmi_hal_hw_context); + int 
(*hfp_get_jkey)(psmi_hal_hw_context);
+	int (*hfp_get_lid)(psmi_hal_hw_context);
+	int (*hfp_get_pio_size)(psmi_hal_hw_context);
+	int (*hfp_get_port_num)(psmi_hal_hw_context);
+	int (*hfp_get_rx_egr_tid_cnt)(psmi_hal_hw_context);
+	int (*hfp_get_rx_hdr_q_cnt)(psmi_hal_hw_context);
+	int (*hfp_get_rx_hdr_q_ent_size)(psmi_hal_hw_context);
+	int (*hfp_get_sdma_req_size)(psmi_hal_hw_context);
+	int (*hfp_get_sdma_ring_size)(psmi_hal_hw_context);
+	int (*hfp_get_subctxt)(psmi_hal_hw_context);
+	int (*hfp_get_subctxt_cnt)(psmi_hal_hw_context);
+	int (*hfp_get_tid_exp_cnt)(psmi_hal_hw_context);
+	int (*hfp_get_unit_id)(psmi_hal_hw_context);
+	int (*hfp_get_fd)(psmi_hal_hw_context);
+	int (*hfp_get_pio_stall_cnt)(psmi_hal_hw_context, uint64_t **);
+};
+
+/* This is the current psmi_hal_instance, or NULL if not initialized.
+   The HIC should not modify the contents of the HAL instance directly. */
+extern psmi_hal_instance_t *psmi_hal_current_hal_instance;
+
+/* Declare functions called by the HAL INSTANCES. */
+void psmi_hal_register_instance(psmi_hal_instance_t *);
+
+/* Declare functions that are called by the HIC: */
+/* All of these functions return a negative int value to
+   indicate failure, or >= 0 for success. */
+
+/* Chooses one of the psmi_hal_instances that have been
+   registered and then initializes it.
+   Returns -PSM_HAL_ERROR_NOT_REGISTERED_HI if no HAL
+   INSTANCES are registered, or -PSM_HAL_ERROR_INIT_FAILED if
+   another failure occurred during initialization. */
+int psmi_hal_initialize(void);
+
+/* Note that:
+
+int psmi_hal_get_num_units(void);
+
+is intentionally left out, as it is called during initialization
+and the results are cached in the hw params.
+*/
+
+#include "psm2_hal_inlines_d.h"
+
+#if PSMI_HAL_INST_CNT == 1
+
+#define PSMI_HAL_DISPATCH(KERNEL,...) ( PSMI_HAL_CAT_INL_SYM(KERNEL) ( __VA_ARGS__ ) )
+
+#define PSMI_HAL_DISPATCH_PI(KERNEL,...) PSMI_HAL_DISPATCH(KERNEL , ##__VA_ARGS__ )
+
+#else
+
+enum psmi_hal_pre_init_func_krnls
+{
+	psmi_hal_pre_init_func_get_num_units,
+	psmi_hal_pre_init_func_get_num_ports,
+	psmi_hal_pre_init_func_get_unit_active,
+	psmi_hal_pre_init_func_get_port_active,
+	psmi_hal_pre_init_func_get_num_contexts,
+	psmi_hal_pre_init_func_get_num_free_contexts,
+};
+
+int psmi_hal_pre_init_func(enum psmi_hal_pre_init_func_krnls k, ...);
+
+#define PSMI_HAL_DISPATCH(KERNEL,...) ( psmi_hal_current_hal_instance->hfp_ ## KERNEL ( __VA_ARGS__ ))
+
+#define PSMI_HAL_DISPATCH_PI(KERNEL,...) ( psmi_hal_pre_init_func(psmi_hal_pre_init_func_ ## KERNEL , ##__VA_ARGS__ ) )
+
+#endif
+
+#define psmi_hal_get_num_units_(...) PSMI_HAL_DISPATCH_PI(get_num_units,__VA_ARGS__)
+#define psmi_hal_get_num_ports_(...) PSMI_HAL_DISPATCH_PI(get_num_ports,##__VA_ARGS__)
+#define psmi_hal_get_unit_active(...) PSMI_HAL_DISPATCH_PI(get_unit_active,__VA_ARGS__)
+#define psmi_hal_get_port_active(...) PSMI_HAL_DISPATCH_PI(get_port_active,__VA_ARGS__)
+#define psmi_hal_get_num_contexts(...) PSMI_HAL_DISPATCH_PI(get_num_contexts,__VA_ARGS__)
+#define psmi_hal_get_num_free_contexts(...) PSMI_HAL_DISPATCH_PI(get_num_free_contexts,__VA_ARGS__)
+#define psmi_hal_context_open(...) PSMI_HAL_DISPATCH(context_open,__VA_ARGS__)
+#define psmi_hal_close_context(...) PSMI_HAL_DISPATCH(close_context,__VA_ARGS__)
+#define psmi_hal_get_port_index2pkey(...) PSMI_HAL_DISPATCH(get_port_index2pkey,__VA_ARGS__)
+#define psmi_hal_get_cc_settings_bin(...) PSMI_HAL_DISPATCH(get_cc_settings_bin,__VA_ARGS__)
+#define psmi_hal_get_cc_table_bin(...)
PSMI_HAL_DISPATCH(get_cc_table_bin,__VA_ARGS__) +#define psmi_hal_get_port_lmc(...) PSMI_HAL_DISPATCH(get_port_lmc,__VA_ARGS__) +#define psmi_hal_get_port_rate(...) PSMI_HAL_DISPATCH(get_port_rate,__VA_ARGS__) +#define psmi_hal_get_port_sl2sc(...) PSMI_HAL_DISPATCH(get_port_sl2sc,__VA_ARGS__) +#define psmi_hal_get_port_sc2vl(...) PSMI_HAL_DISPATCH(get_port_sc2vl,__VA_ARGS__) +#define psmi_hal_set_pkey(...) PSMI_HAL_DISPATCH(set_pkey,__VA_ARGS__) +#define psmi_hal_poll_type(...) PSMI_HAL_DISPATCH(poll_type,__VA_ARGS__) +#define psmi_hal_get_port_lid(...) PSMI_HAL_DISPATCH(get_port_lid,__VA_ARGS__) +#define psmi_hal_get_port_gid(...) PSMI_HAL_DISPATCH(get_port_gid,__VA_ARGS__) +#define psmi_hal_free_tid(...) PSMI_HAL_DISPATCH(free_tid,__VA_ARGS__) +#define psmi_hal_get_tidcache_invalidation(...) PSMI_HAL_DISPATCH(get_tidcache_invalidation,__VA_ARGS__) +#define psmi_hal_update_tid(...) PSMI_HAL_DISPATCH(update_tid,__VA_ARGS__) +#define psmi_hal_writev(...) PSMI_HAL_DISPATCH(writev,__VA_ARGS__) +#define psmi_hal_dma_slot_available(...) PSMI_HAL_DISPATCH(dma_slot_available,__VA_ARGS__) +#define psmi_hal_get_sdma_ring_slot_status(...) PSMI_HAL_DISPATCH(get_sdma_ring_slot_status,__VA_ARGS__) +#define psmi_hal_get_cl_q_head_index(...) PSMI_HAL_DISPATCH(get_cl_q_head_index,__VA_ARGS__) +#define psmi_hal_get_cl_q_tail_index(...) PSMI_HAL_DISPATCH(get_cl_q_tail_index,__VA_ARGS__) +#define psmi_hal_set_cl_q_head_index(...) PSMI_HAL_DISPATCH(set_cl_q_head_index,__VA_ARGS__) +#define psmi_hal_set_cl_q_tail_index(...) PSMI_HAL_DISPATCH(set_cl_q_tail_index,__VA_ARGS__) +#define psmi_hal_cl_q_empty(...) PSMI_HAL_DISPATCH(cl_q_empty,__VA_ARGS__) +#define psmi_hal_get_receive_event(...) PSMI_HAL_DISPATCH(get_receive_event,__VA_ARGS__) +#define psmi_hal_get_egr_buff(...) PSMI_HAL_DISPATCH(get_egr_buff,__VA_ARGS__) +#define psmi_hal_retire_hdr_q_entry(...) PSMI_HAL_DISPATCH(retire_hdr_q_entry,__VA_ARGS__) +#define psmi_hal_get_rhf_expected_sequence_number(...) PSMI_HAL_DISPATCH(get_rhf_expected_sequence_number,__VA_ARGS__) +#define psmi_hal_set_rhf_expected_sequence_number(...) PSMI_HAL_DISPATCH(set_rhf_expected_sequence_number,__VA_ARGS__) +#define psmi_hal_check_rhf_sequence_number(...) PSMI_HAL_DISPATCH(check_rhf_sequence_number,__VA_ARGS__) +#define psmi_hal_set_pbc(...) PSMI_HAL_DISPATCH(set_pbc,__VA_ARGS__) +#define psmi_hal_tidflow_set_entry(...) PSMI_HAL_DISPATCH(tidflow_set_entry,__VA_ARGS__) +#define psmi_hal_tidflow_reset(...) PSMI_HAL_DISPATCH(tidflow_reset,__VA_ARGS__) +#define psmi_hal_tidflow_get(...) PSMI_HAL_DISPATCH(tidflow_get,__VA_ARGS__) +#define psmi_hal_tidflow_get_hw(...) PSMI_HAL_DISPATCH(tidflow_get_hw,__VA_ARGS__) +#define psmi_hal_tidflow_get_seqnum(...) PSMI_HAL_DISPATCH(tidflow_get_seqnum,__VA_ARGS__) +#define psmi_hal_tidflow_get_genval(...) PSMI_HAL_DISPATCH(tidflow_get_genval,__VA_ARGS__) +#define psmi_hal_tidflow_check_update_pkt_seq(...) PSMI_HAL_DISPATCH(tidflow_check_update_pkt_seq,__VA_ARGS__) +#define psmi_hal_tidflow_get_flowvalid(...) PSMI_HAL_DISPATCH(tidflow_get_flowvalid,__VA_ARGS__) +#define psmi_hal_tidflow_get_enabled(...) PSMI_HAL_DISPATCH(tidflow_get_enabled,__VA_ARGS__) +#define psmi_hal_tidflow_get_keep_after_seqerr(...) PSMI_HAL_DISPATCH(tidflow_get_keep_after_seqerr,__VA_ARGS__) +#define psmi_hal_tidflow_get_keep_on_generr(...) PSMI_HAL_DISPATCH(tidflow_get_keep_on_generr,__VA_ARGS__) +#define psmi_hal_tidflow_get_keep_payload_on_generr(...) PSMI_HAL_DISPATCH(tidflow_get_keep_payload_on_generr,__VA_ARGS__) +#define psmi_hal_tidflow_get_seqmismatch(...) 
PSMI_HAL_DISPATCH(tidflow_get_seqmismatch,__VA_ARGS__) +#define psmi_hal_tidflow_get_genmismatch(...) PSMI_HAL_DISPATCH(tidflow_get_genmismatch,__VA_ARGS__) +#define psmi_hal_forward_packet_to_subcontext(...) PSMI_HAL_DISPATCH(forward_packet_to_subcontext,__VA_ARGS__) +#define psmi_hal_subcontext_ureg_get(...) PSMI_HAL_DISPATCH(subcontext_ureg_get,__VA_ARGS__) +#define psmi_hal_finalize(...) PSMI_HAL_DISPATCH(finalize,__VA_ARGS__) +#define psmi_hal_get_hfi_event_bits(...) PSMI_HAL_DISPATCH(get_hfi_event_bits,__VA_ARGS__) +#define psmi_hal_ack_hfi_event(...) PSMI_HAL_DISPATCH(ack_hfi_event,__VA_ARGS__) +#define psmi_hal_hfi_reset_context(...) PSMI_HAL_DISPATCH(hfi_reset_context,__VA_ARGS__) +#define psmi_hal_get_hw_status(...) PSMI_HAL_DISPATCH(get_hw_status,__VA_ARGS__) +#define psmi_hal_get_hw_status_freezemsg(...) PSMI_HAL_DISPATCH(get_hw_status_freezemsg,__VA_ARGS__) +#define psmi_hal_get_user_major_bldtime_version(...) PSMI_HAL_DISPATCH(get_user_major_bldtime_version,__VA_ARGS__) +#define psmi_hal_get_user_minor_bldtime_version(...) PSMI_HAL_DISPATCH(get_user_minor_bldtime_version,__VA_ARGS__) +#define psmi_hal_get_user_major_runtime_version(...) PSMI_HAL_DISPATCH(get_user_major_runtime_version,__VA_ARGS__) +#define psmi_hal_get_user_minor_runtime_version(...) PSMI_HAL_DISPATCH(get_user_minor_runtime_version,__VA_ARGS__) +#define psmi_hal_set_pio_size(...) PSMI_HAL_DISPATCH(set_pio_size,__VA_ARGS__) +#define psmi_hal_set_effective_mtu(...) PSMI_HAL_DISPATCH(set_effective_mtu,__VA_ARGS__) +#define psmi_hal_set_tf_valid(...) PSMI_HAL_DISPATCH(set_tf_valid,__VA_ARGS__) +#define psmi_hal_spio_init(...) PSMI_HAL_DISPATCH(spio_init,__VA_ARGS__) +#define psmi_hal_spio_fini(...) PSMI_HAL_DISPATCH(spio_fini,__VA_ARGS__) +#define psmi_hal_spio_transfer_frame(...) PSMI_HAL_DISPATCH(spio_transfer_frame,__VA_ARGS__) +#define psmi_hal_spio_process_events(...) PSMI_HAL_DISPATCH(spio_process_events,__VA_ARGS__) +#define psmi_hal_get_node_id(...) PSMI_HAL_DISPATCH(get_node_id,__VA_ARGS__) +#define psmi_hal_get_bthqp(...) PSMI_HAL_DISPATCH(get_bthqp,__VA_ARGS__) +#define psmi_hal_get_context(...) PSMI_HAL_DISPATCH(get_context,__VA_ARGS__) +#define psmi_hal_get_gid_lo(...) PSMI_HAL_DISPATCH(get_gid_lo,__VA_ARGS__) +#define psmi_hal_get_gid_hi(...) PSMI_HAL_DISPATCH(get_gid_hi,__VA_ARGS__) +#define psmi_hal_get_hfi_type(...) PSMI_HAL_DISPATCH(get_hfi_type,__VA_ARGS__) +#define psmi_hal_get_jkey(...) PSMI_HAL_DISPATCH(get_jkey,__VA_ARGS__) +#define psmi_hal_get_lid(...) PSMI_HAL_DISPATCH(get_lid,__VA_ARGS__) +#define psmi_hal_get_pio_size(...) PSMI_HAL_DISPATCH(get_pio_size,__VA_ARGS__) +#define psmi_hal_get_port_num(...) PSMI_HAL_DISPATCH(get_port_num,__VA_ARGS__) +#define psmi_hal_get_rx_egr_tid_cnt(...) PSMI_HAL_DISPATCH(get_rx_egr_tid_cnt,__VA_ARGS__) +#define psmi_hal_get_rx_hdr_q_cnt(...) PSMI_HAL_DISPATCH(get_rx_hdr_q_cnt,__VA_ARGS__) +#define psmi_hal_get_rx_hdr_q_ent_size(...) PSMI_HAL_DISPATCH(get_rx_hdr_q_ent_size,__VA_ARGS__) +#define psmi_hal_get_sdma_req_size(...) PSMI_HAL_DISPATCH(get_sdma_req_size,__VA_ARGS__) +#define psmi_hal_get_sdma_ring_size(...) PSMI_HAL_DISPATCH(get_sdma_ring_size,__VA_ARGS__) +#define psmi_hal_get_subctxt(...) PSMI_HAL_DISPATCH(get_subctxt,__VA_ARGS__) +#define psmi_hal_get_subctxt_cnt(...) PSMI_HAL_DISPATCH(get_subctxt_cnt,__VA_ARGS__) +#define psmi_hal_get_tid_exp_cnt(...) PSMI_HAL_DISPATCH(get_tid_exp_cnt,__VA_ARGS__) +#define psmi_hal_get_unit_id(...) PSMI_HAL_DISPATCH(get_unit_id,__VA_ARGS__) +#define psmi_hal_get_fd(...) 
PSMI_HAL_DISPATCH(get_fd,__VA_ARGS__)
+#define psmi_hal_get_pio_stall_cnt(...) PSMI_HAL_DISPATCH(get_pio_stall_cnt,__VA_ARGS__)
+
+#define PSMI_HAL_NBITS_TO_MASK(NBITS) ((uint64_t)((1 << NBITS)-1))
+#define PSMI_HAL_RHF_UNPACK(A,NAME) ((uint32_t)((A.decomposed_rhf >> \
+						 PSMI_HAL_RHF_ ## NAME ## _SHFTC \
+						 ) & PSMI_HAL_NBITS_TO_MASK( \
+						 PSMI_HAL_RHF_ ## NAME ## _NBITS)))
+/* Define constants for the decomposed rhf error masks.
+   Note how each of these is shifted by the ALL_ERR_FLAGS shift count. */
+
+#define PSMI_HAL_RHF_ERR_MASK_64(NAME) ((uint64_t)(((PSMI_HAL_NBITS_TO_MASK( \
+			PSMI_HAL_RHF_ERR_ ## NAME ## _NBITS) << \
+			PSMI_HAL_RHF_ERR_ ## NAME ## _SHFTC ))))
+#define PSMI_HAL_RHF_ERR_MASK_32(NAME) ((uint32_t)(PSMI_HAL_RHF_ERR_MASK_64(NAME) >> \
+			PSMI_HAL_RHF_ALL_ERR_FLAGS_SHFTC))
+#define PSMI_HAL_RHF_ERR_ICRC PSMI_HAL_RHF_ERR_MASK_32(ICRC)
+#define PSMI_HAL_RHF_ERR_ECC PSMI_HAL_RHF_ERR_MASK_32(ECC)
+#define PSMI_HAL_RHF_ERR_LEN PSMI_HAL_RHF_ERR_MASK_32(LEN)
+#define PSMI_HAL_RHF_ERR_TID PSMI_HAL_RHF_ERR_MASK_32(TID)
+#define PSMI_HAL_RHF_ERR_TFGEN PSMI_HAL_RHF_ERR_MASK_32(TFGEN)
+#define PSMI_HAL_RHF_ERR_TFSEQ PSMI_HAL_RHF_ERR_MASK_32(TFSEQ)
+#define PSMI_HAL_RHF_ERR_RTE PSMI_HAL_RHF_ERR_MASK_32(RTE)
+#define PSMI_HAL_RHF_ERR_DC PSMI_HAL_RHF_ERR_MASK_32(DC)
+#define PSMI_HAL_RHF_ERR_DCUN PSMI_HAL_RHF_ERR_MASK_32(DCUN)
+#define PSMI_HAL_RHF_ERR_KHDRLEN PSMI_HAL_RHF_ERR_MASK_32(KHDRLEN)
+
+#define psmi_hal_rhf_get_use_egr_buff(A) PSMI_HAL_RHF_UNPACK(A,USE_EGR_BUFF)
+#define psmi_hal_rhf_get_egr_buff_index(A) PSMI_HAL_RHF_UNPACK(A,EGR_BUFF_IDX)
+#define psmi_hal_rhf_get_egr_buff_offset(A) PSMI_HAL_RHF_UNPACK(A,EGR_BUFF_OFF)
+#define psmi_hal_rhf_get_packet_length(A) (PSMI_HAL_RHF_UNPACK(A,PKT_LEN)<<2)
+#define psmi_hal_rhf_get_all_err_flags(A) PSMI_HAL_RHF_UNPACK(A,ALL_ERR_FLAGS)
+#define psmi_hal_rhf_get_seq(A) PSMI_HAL_RHF_UNPACK(A,SEQ)
+#define psmi_hal_rhf_get_rx_type(A) PSMI_HAL_RHF_UNPACK(A,RX_TYPE)
+#define PSMI_HAL_RHF_PACK(NAME,VALUE) ((uint64_t)((((uint64_t)(VALUE)) & \
+						   PSMI_HAL_NBITS_TO_MASK( \
+						   PSMI_HAL_RHF_ ## NAME ## _NBITS \
+						   )) << ( \
+						   PSMI_HAL_RHF_ ## NAME ## _SHFTC )))
+
+#define psmi_hal_get_hal_instance_type() psmi_hal_current_hal_instance->type
+#define psmi_hal_get_hal_instance_description() psmi_hal_current_hal_instance->description
+#define psmi_hal_get_hfi_name() psmi_hal_current_hal_instance->hfi_name
+#define psmi_hal_get_num_units() psmi_hal_current_hal_instance->params.num_units
+#define psmi_hal_get_num_ports() psmi_hal_current_hal_instance->params.num_ports
+#define psmi_hal_get_default_pkey() psmi_hal_current_hal_instance->params.default_pkey
+#define psmi_hal_get_cap_mask() psmi_hal_current_hal_instance->params.cap_mask
+#define psmi_hal_set_cap_mask(NEW_MASK) (psmi_hal_current_hal_instance->params.cap_mask = (NEW_MASK))
+#define psmi_hal_add_cap(CAP) (psmi_hal_current_hal_instance->params.cap_mask |= (CAP))
+#define psmi_hal_sub_cap(CAP) (psmi_hal_current_hal_instance->params.cap_mask &= (~(CAP)))
+#define psmi_hal_has_cap(CAP) ((psmi_hal_get_cap_mask() & (CAP)) == (CAP))
+
+#define psmi_hal_get_sw_status() psmi_hal_current_hal_instance->params.sw_status
+#define psmi_hal_set_sw_status(NEW_STATUS) (psmi_hal_current_hal_instance->params.sw_status = (NEW_STATUS))
+#define psmi_hal_add_sw_status(STATUS) (psmi_hal_current_hal_instance->params.sw_status |= (STATUS))
+#define psmi_hal_sub_sw_status(STATUS) (psmi_hal_current_hal_instance->params.sw_status &= (~(STATUS)))
+#define psmi_hal_has_sw_status(STATUS) ((psmi_hal_get_sw_status() & (STATUS)) ==
(STATUS)) + + +#include "psm2_hal_inlines_i.h" + +#endif /* #ifndef __PSM2_HAL_H__ */ diff --git a/psm2_hal_inline_t.h b/psm2_hal_inline_t.h new file mode 100644 index 0000000..8e061a2 --- /dev/null +++ b/psm2_hal_inline_t.h @@ -0,0 +1,313 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2017 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2017 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* The psm2_hal_inline_t.h file serves as a template to allow all HAL + instances to easily and conveniently declare their HAL methods. 
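+
+   For example, if a HAL instance defines PSMI_HAL_CAT_INL_SYM(KERNEL) to
+   expand to something like hfp_gen1_##KERNEL (an assumed naming scheme,
+   shown for illustration only), the declarations below become
+   hfp_gen1_initialize(), hfp_gen1_finalize(), and so on; one such set is
+   produced per HAL instance that includes this template.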
*/ + +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(initialize) + (psmi_hal_instance_t *); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(finalize) + (void); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_num_units) + (int wait); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_num_ports) + (void); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_unit_active) + (int unit); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_node_id) + (int unit, int *nodep); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_active) + (int unit, int port); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_num_contexts) + (int unit); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_num_free_contexts) + (int unit); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(close_context) + (psmi_hal_hw_context *); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(context_open) + (int unit, + int port, + uint64_t open_timeout, + psm2_ep_t ep, + psm2_uuid_t const job_key, + psmi_context_t *psm_ctxt, + uint32_t cap_mask, + unsigned); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_index2pkey) + (int unit, int port, int index); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_cc_settings_bin) + (int unit, int port, char *ccabuf, size_t len_ccabuf); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_cc_table_bin) + (int unit, int port, uint16_t **ccatp); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_lmc) + (int unit, int port); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_rate) + (int unit, int port); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_sl2sc) + (int unit, int port, int sl); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_sc2vl) + (int unit, int port, int sc); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(set_pkey) + (psmi_hal_hw_context, uint16_t); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(poll_type) + (uint16_t, psmi_hal_hw_context); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_lid) + (int unit, int port); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_gid) + (int unit, int port, + uint64_t *hi, uint64_t *lo); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(free_tid) + (psmi_hal_hw_context, uint64_t tidlist, uint32_t tidcnt); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_tidcache_invalidation) + (psmi_hal_hw_context, uint64_t tidlist, uint32_t *tidcnt); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(update_tid) + (psmi_hal_hw_context, uint64_t vaddr, uint32_t *length, + uint64_t tidlist, uint32_t *tidcnt, + uint16_t flags); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(writev) + (const struct iovec *iov, int iovcnt, struct ips_epinfo *, + psmi_hal_hw_context); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_sdma_ring_slot_status) + (int slotIdx, psmi_hal_sdma_ring_slot_status *, + uint32_t *errorCode,void *); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(dma_slot_available) + (int slotidx, psmi_hal_hw_context); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_hfi_event_bits) + (uint64_t *event_bits, psmi_hal_hw_context); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(ack_hfi_event) + (uint64_t ack_bits, psmi_hal_hw_context); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(hfi_reset_context) + (psmi_hal_hw_context); +static PSMI_HAL_INLINE uint64_t PSMI_HAL_CAT_INL_SYM(get_hw_status) + (psmi_hal_hw_context); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_hw_status_freezemsg) + (volatile char** msg, psmi_hal_hw_context); +static PSMI_HAL_INLINE uint16_t 
PSMI_HAL_CAT_INL_SYM(get_user_major_bldtime_version) + (void); +static PSMI_HAL_INLINE uint16_t PSMI_HAL_CAT_INL_SYM(get_user_minor_bldtime_version) + (void); +static PSMI_HAL_INLINE uint16_t PSMI_HAL_CAT_INL_SYM(get_user_major_runtime_version) + (psmi_hal_hw_context); +static PSMI_HAL_INLINE uint16_t PSMI_HAL_CAT_INL_SYM(get_user_minor_runtime_version) + (psmi_hal_hw_context); +static PSMI_HAL_INLINE psmi_hal_cl_idx PSMI_HAL_CAT_INL_SYM(get_cl_q_head_index) + (psmi_hal_cl_q, + psmi_hal_hw_context); +static PSMI_HAL_INLINE psmi_hal_cl_idx PSMI_HAL_CAT_INL_SYM(get_cl_q_tail_index) + (psmi_hal_cl_q, + psmi_hal_hw_context); +static PSMI_HAL_INLINE void PSMI_HAL_CAT_INL_SYM(set_cl_q_head_index) + (psmi_hal_cl_idx, + psmi_hal_cl_q, + psmi_hal_hw_context); +static PSMI_HAL_INLINE void PSMI_HAL_CAT_INL_SYM(set_cl_q_tail_index) + (psmi_hal_cl_idx, + psmi_hal_cl_q, + psmi_hal_hw_context); +static inline int PSMI_HAL_CAT_INL_SYM(cl_q_empty) + (psmi_hal_cl_idx, + psmi_hal_cl_q, + psmi_hal_hw_context); +static inline int PSMI_HAL_CAT_INL_SYM(get_rhf) + (psmi_hal_cl_idx, psmi_hal_raw_rhf_t *, + psmi_hal_cl_q, psmi_hal_hw_context); +static inline int PSMI_HAL_CAT_INL_SYM(get_ips_message_hdr) + (psmi_hal_cl_idx, psmi_hal_raw_rhf_t, struct ips_message_header **, + psmi_hal_cl_q, psmi_hal_hw_context); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_receive_event) + (psmi_hal_cl_idx head_idx, psmi_hal_hw_context, + struct ips_recvhdrq_event *); +static PSMI_HAL_INLINE void *PSMI_HAL_CAT_INL_SYM(get_egr_buff) + (psmi_hal_cl_idx, psmi_hal_cl_q, psmi_hal_hw_context); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(retire_hdr_q_entry) + (psmi_hal_cl_idx *, psmi_hal_cl_q, psmi_hal_hw_context, + uint32_t elemsz, uint32_t elemlast, + int *emptyp); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_rhf_expected_sequence_number) + (unsigned int *, psmi_hal_cl_q, psmi_hal_hw_context); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(set_rhf_expected_sequence_number) + (unsigned int, psmi_hal_cl_q, psmi_hal_hw_context); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(check_rhf_sequence_number) + (unsigned int); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(set_pbc) + (struct ips_proto *proto, struct ips_flow *flow, + uint32_t isCtrlMsg, struct psm_hal_pbc *dest, uint32_t hdrlen, + uint32_t paylen); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_set_entry) + (uint32_t flowid, uint32_t genval, uint32_t seqnum, + psmi_hal_hw_context); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_reset) + (psmi_hal_hw_context, uint32_t flowid, uint32_t genval, + uint32_t seqnum); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get) + (uint32_t flowid, uint64_t *ptf, psmi_hal_hw_context); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get_hw) + (uint32_t flowid, uint64_t *ptf, psmi_hal_hw_context); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get_seqnum) + (uint64_t val, uint32_t *pseqn); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get_genval) + (uint64_t val, uint32_t *pgv); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_check_update_pkt_seq) + (void *vpprotoexp + /* actually a: + struct ips_protoexp *protoexp */, + psmi_seqnum_t sequence_num, + void *vptidrecvc + /* actually a: + struct ips_tid_recv_desc *tidrecvc */, + struct ips_message_header *p_hdr, + void (*ips_protoexp_do_tf_generr) + (void *vpprotoexp + /* actually a: + struct ips_protoexp *protoexp */, + void *vptidrecvc + /* actually a: + struct ips_tid_recv_desc *tidrecvc */, + struct 
ips_message_header *p_hdr), + void (*ips_protoexp_do_tf_seqerr) + (void *vpprotoexp + /* actually a: + struct ips_protoexp *protoexp */, + void *vptidrecvc + /* actually a: + struct ips_tid_recv_desc *tidrecvc */, + struct ips_message_header *p_hdr)); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get_flowvalid) + (uint64_t val, uint32_t *pfv); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get_enabled) + (uint64_t val, uint32_t *penabled); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get_keep_after_seqerr) + (uint64_t val, uint32_t *pkase); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get_keep_on_generr) + (uint64_t val, uint32_t *pkoge); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get_keep_payload_on_generr) + (uint64_t val, uint32_t *pkpoge); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get_seqmismatch) + (uint64_t val, uint32_t *psmm); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get_genmismatch) + (uint64_t val, uint32_t *pgmm); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(forward_packet_to_subcontext) + (struct ips_writehdrq *writeq, + struct ips_recvhdrq_event *rcv_ev, + uint32_t subcontext, + psmi_hal_hw_context ctxt); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(subcontext_ureg_get) + (ptl_t *ptl, + struct ips_subcontext_ureg **uregp, + psmi_hal_hw_context ctxt); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(set_pio_size) + (uint32_t, psmi_hal_hw_context); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(set_effective_mtu) + (uint32_t, psmi_hal_hw_context); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(set_tf_valid) + (uint32_t, psmi_hal_hw_context); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_default_pkey) + (void); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(spio_init) + (const psmi_context_t *context, + struct ptl *ptl, void **ctrl); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(spio_fini) + (void **ctrl, psmi_hal_hw_context); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(spio_transfer_frame) + (struct ips_proto *proto, + struct ips_flow *flow, struct psm_hal_pbc *pbc, + uint32_t *payload, uint32_t length, + uint32_t isCtrlMsg, uint32_t cksum_valid, + uint32_t cksum, psmi_hal_hw_context +#ifdef PSM_CUDA + , uint32_t is_cuda_payload +#endif + ); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(spio_process_events) + (const struct ptl *ptl); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_bthqp) + (psmi_hal_hw_context ctxt); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_context) + (psmi_hal_hw_context ctxt); +static PSMI_HAL_INLINE uint64_t PSMI_HAL_CAT_INL_SYM(get_gid_lo) + (psmi_hal_hw_context ctxt); +static PSMI_HAL_INLINE uint64_t PSMI_HAL_CAT_INL_SYM(get_gid_hi) + (psmi_hal_hw_context ctxt); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_hfi_type) + (psmi_hal_hw_context ctxt); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_jkey) + (psmi_hal_hw_context ctxt); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_lid) + (psmi_hal_hw_context ctxt); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_pio_size) + (psmi_hal_hw_context ctxt); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_num) + (psmi_hal_hw_context ctxt); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_rx_egr_tid_cnt) + (psmi_hal_hw_context ctxt); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_rx_hdr_q_cnt) + (psmi_hal_hw_context ctxt); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_rx_hdr_q_ent_size) + (psmi_hal_hw_context ctxt); +static 
PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_sdma_req_size) + (psmi_hal_hw_context ctxt); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_sdma_ring_size) + (psmi_hal_hw_context ctxt); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_subctxt) + (psmi_hal_hw_context ctxt); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_subctxt_cnt) + (psmi_hal_hw_context ctxt); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_tid_exp_cnt) + (psmi_hal_hw_context ctxt); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_unit_id) + (psmi_hal_hw_context ctxt); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_fd) + (psmi_hal_hw_context ctxt); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_pio_stall_cnt) + (psmi_hal_hw_context, + uint64_t **); diff --git a/psm2_linker_script_map.in b/psm2_linker_script_map.in new file mode 100644 index 0000000..efa87c5 --- /dev/null +++ b/psm2_linker_script_map.in @@ -0,0 +1,95 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* See http://sourceware.org/binutils/docs/ld/VERSION.html#VERSION for more info. + C++ // Comments don't work in this file. */ + +PSM2_1.0 +{ + /* Expose only those symbols we choose to. This way we do not + pollute users namespace more than absolutely necessary. 
+*/
+	global:
+		psm2_*;
+
+	/* Below symbols are used for hfidiags hfi1_pkt_test */
+	/* opa_udebug.h - global */
+	hfi_debug;
+	hfi_get_unit_name;
+	__progname;
+
+	/* opa_udebug.h - _HFI_DEBUGGING */
+	__hfi_mylabel;
+	hfi_set_mylabel;
+	hfi_get_mylabel;
+	__hfi_dbgout;
+
+	/* opa_service.h */
+	hfi_context_open;
+	hfi_get_port_vl2mtu;
+	hfi_get_port_lid;
+	hfi_context_close;
+	hfi_cmd_write;
+	hfi_mmap64;
+
+	/* opa_user.h */
+	hfi_userinit;
+	hfi_poll_type;
+	hfi_wait_for_packet;
+	__hfi_pico_per_cycle;
+
+	/* Additional globals */
+	_psm2_additional_globals_;
+	/* Make all other symbols local */
+	local:
+		*;
+};
+
diff --git a/psm2_mq.h b/psm2_mq.h
new file mode 100644
index 0000000..c193afc
--- /dev/null
+++ b/psm2_mq.h
@@ -0,0 +1,1621 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2017 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2017 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef PSM2_MQ_H
+#define PSM2_MQ_H
+
+#include <psm2.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ * @file psm2_mq.h
+ * @brief PSM2 Matched Queues
+ *
+ * @page psm2_mq Matched Queues interface
+ *
+ * The Matched Queues (MQ) interface implements a queue-based communication
+ * model with the distinction that queue message consumers use a 3-tuple of
+ * metadata to match incoming messages against a list of preposted receive
+ * buffers.  These semantics are consistent with those presented by MPI-1.2
+ * and all the features and side-effects of Message-Passing find their way into
+ * Matched Queues.  There is currently a single MQ context; if need be, MQs may
+ * expose a function to allocate more than one MQ context in the future.  Since
+ * an MQ is implicitly bound to a locally opened endpoint, all MQ functions use
+ * an MQ handle instead of an EP handle as a communication context.
+ *
+ * @section tagmatch MQ Tag Matching
+ *
+ * A successful MQ tag match requires an endpoint address (@ref psm2_epaddr_t)
+ * and a 3-tuple of tag objects.  Two of the tag objects are provided by the
+ * receiver when posting a receive buffer (@ref psm2_mq_irecv) and the last is
+ * provided by the sender as part of every message sent (@ref psm2_mq_send and
+ * @ref psm2_mq_isend).  Since MQ is a receiver-directed communication model,
+ * the tag matching done at the receiver involves matching the sent message's
+ * origin and send tag (@c stag) with the source endpoint address, tag (@c
+ * rtag), and tag selector (@c rtagsel) attached to every preposted receive
+ * buffer.  The incoming @c stag is compared to the posted @c rtag but only for
+ * significant bits set to @c 1 in the @c rtagsel.  The @c rtagsel can be used
+ * to mask off parts (or even all) of the bitwise comparison between sender and
+ * receiver tags.  A successful match causes the message to be received into
+ * the buffer with which the tag is matched.  If the incoming message is too
+ * large, it is truncated to the size of the posted receive buffer.  The
+ * bitwise operation corresponding to a successful match and receipt of an
+ * expected message amounts to the following expression evaluating as true:
+ *
+ * @verbatim ((stag ^ rtag) & rtagsel) == 0 @endverbatim
+ *
+ * It is up to the user to encode (pack) tags into the 64-bit unsigned
+ * integers, including employing the @c rtagsel tag selector as a method to
+ * wildcard part or all of the bits significant in the tag matching operation.
+ * For example, MPI uses a triple based on context (MPI communicator), source
+ * rank, and send tag.  The following code example shows how the triple can be
+ * packed into 64 bits:
+ *
+ * @code{.c}
+   	//
+   	// 64-bit send tag formed by packing the triple:
+   	//
+   	// ( context_id_16bits | source_rank_16bits | send_tag_32bits )
+   	//
+   	stag = ( (((context_id)&0xffffULL)<<48)| \
+            	(((source_rank)&0xffffULL)<<32)| \
+            	(((send_tag)&0xffffffffULL)) );
+   @endcode
+ *
+ * Similarly, the receiver applies the @c rtag matching bits and @c rtagsel
+ * masking bits against a list of send tags and returns the first successful
+ * match.  Zero bits in the @c rtagsel can be used to indicate wildcarded bits
+ * in the 64-bit tag, which can be useful for implementing MPI's
+ * @c MPI_ANY_SOURCE and @c MPI_ANY_TAG.
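+ *
+ * As a minimal sketch of the matching rule itself (the helper below is
+ * hypothetical and not part of the PSM2 API), a receiver-side test of one
+ * preposted buffer reduces to:
+ *
+ * @code{.c}
+   	// Illustrative only: evaluate the MQ match expression for one buffer.
+   	static int tag_matches(uint64_t stag, uint64_t rtag, uint64_t rtagsel)
+   	{
+   		return ((stag ^ rtag) & rtagsel) == 0;
+   	}
+   @endcode
+ *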
Following the example bit splicing in + * the above @c stag example: + * + * @code{.c} + // Example MPI implementation where MPI_COMM_WORLD implemented as 0x3333 + + // MPI_Irecv source_rank=MPI_ANY_SOURCE, tag=7, comm=MPI_COMM_WORLD + rtag = 0x3333000000000007; + rtagsel = 0xffff0000ffffffff; + + // MPI_Irecv source_rank=3, tag=MPI_ANY_TAG, comm=MPI_COMM_WORLD + rtag = 0x3333000300000000; + rtagsel = 0xffffffff80000000; // can't ignore sign bit in tag + + // MPI_Irecv source_rank=MPI_ANY_SOURCE, tag=MPI_ANY_TAG, comm=MPI_COMM_WORLD + rtag = 0x3333000300000000; + rtagsel = 0xffff000080000000; // can't ignore sign bit in tag + @endcode + * + * + * Applications that do not follow tag matching semantics can simply always + * pass a value of @c 0 for @c rtagsel, which will always yield a successful + * match to the first preposted buffer. If a message cannot be matched to any + * of the preposted buffers, the message is delivered as an unexpected + * message. + * + * @section mq_receive MQ Message Reception + * + * MQ messages are either received as @e expected or @e unexpected: @li The + * received message is @e expected if the incoming message tag matches the + * combination of tag and tag selector of at least one of the user-provided + * receive buffers preposted with @ref psm2_mq_irecv. + * + * @li The received message is @e unexpected if the incoming message tag @b + * doesn't match any combination of tag and tag selector from all the + * user-provided receive buffers preposted with @ref psm2_mq_irecv. + * + * Unexpected messages are messages that the MQ library buffers until the + * user provides a receive buffer that can match the unexpected message. + * With Matched Queues and MPI alike, unexpected messages can occur as a + * side-effect of the programming model, whereby the arrival of messages can be + * slightly out of step with the ordering in which the user + * provides receive buffers. Unexpected messages can also be triggered by the + * difference between the rate at which a sender produces messages and the rate + * at which a paired receiver can post buffers and hence consume the messages. + * + * In all cases, too many @e unexpected messages will negatively affect + * performance. Users can employ some of the following mechanisms to reduce + * the effect of added memory allocations and copies that result from + * unexpected messages: + * @li If and when possible, receive buffers should be posted as early as + * possible and ideally before calling into the progress engine. + * @li Use of rendezvous messaging that can be controlled with + * @ref PSM2_MQ_RNDV_HFI_SZ and @ref PSM2_MQ_RNDV_SHM_SZ options. These + * options default to values determined to make effective use of + * bandwidth and are hence not advisable for all communication message + * sizes, but rendezvous messages inherently prevent unexpected + * messages by synchronizing the sender with the receiver beforehand. + * @li The amount of memory that is allocated to handle unexpected messages + * can be bounded by adjusting the Global @ref PSM2_MQ_MAX_SYSBUF_MBYTES + * option. + * @li MQ statistics, such as the amount of received unexpected messages and + * the aggregate amount of unexpected bytes are available in the @ref + * psm2_mq_stats structure. + * + * Whenever a match occurs, whether the message is expected or unexpected, it + * is generally up to the user to ensure that the message is not truncated. 
+ * Message truncation occurs when the size of the preposted buffer is less than
+ * the size of the incoming matched message.  MQ will correctly handle
+ * message truncation by always copying the appropriate amount of bytes so as
+ * not to overwrite any data.  While it is valid to send less data than the
+ * amount of data that has been preposted, messages that are truncated will be
+ * marked @ref PSM2_MQ_TRUNCATION as part of the error code in the message
+ * status structure (@ref psm2_mq_status_t or @ref psm2_mq_status2_t).
+ *
+ * @section mq_completion MQ Completion Semantics
+ *
+ * Message completion in Matched Queues follows local completion semantics.
+ * When sending an MQ message, it is deemed complete when MQ guarantees that
+ * the source data has been sent and that the entire input source data memory
+ * location can be safely overwritten.  As with standard Message-Passing,
+ * MQ does not make any remote completion guarantees for sends.  MQ does,
+ * however, allow a sender to synchronize with a receiver to send a synchronous
+ * message, which is sent only after a matching receive buffer has been
+ * posted by the receiver (@ref PSM2_MQ_FLAG_SENDSYNC).
+ *
+ * A receive is deemed complete after it has matched its associated receive
+ * buffer with an incoming send and the data from the send has been
+ * completely delivered to the receive buffer.
+ *
+ * @section mq_progress MQ Progress Requirements
+ *
+ * Progress on MQs must be @e explicitly ensured by the user for correctness.
+ * The progress requirement holds even if certain areas of the MQ
+ * implementation require less network attention than others, or if progress
+ * may internally be guaranteed through interrupts.  The main polling function,
+ * @ref psm2_poll, is the most general form of ensuring progress on a given
+ * endpoint.  Calling @ref psm2_poll ensures that progress is made over all the
+ * MQs and other components instantiated over the endpoint passed to @ref
+ * psm2_poll.
+ *
+ * While @ref psm2_poll is the only way to directly ensure progress, other MQ
+ * functions will conditionally ensure progress depending on how they are used:
+ *
+ * @li @ref psm2_mq_wait employs polling and waits until the request is
+ * completed.  For blocking communication operations where the caller is
+ * waiting on a single send or receive to complete, psm2_mq_wait usually
+ * provides the best responsiveness in terms of latency.
+ *
+ * @li @ref psm2_mq_test can test a particular request for completion, but @b
+ * never directly or indirectly ensures progress as it only tests the
+ * completion status of a request, nothing more.  See functional documentation
+ * in @ref psm2_mq_test for a detailed discussion.
+ *
+ * @li @ref psm2_mq_ipeek ensures progress if and only if the MQ's completion
+ * queue is empty and will not ensure progress as long as the completion queue
+ * is non-empty.  Users that always aggressively process all elements of the MQ
+ * completion queue as part of their own progress engine will indirectly always
+ * ensure MQ progress.  The ipeek mechanism is the preferred way to ensure
+ * progress when many non-blocking requests are in flight, since ipeek
+ * returns requests in the order in which they complete.  Depending on how the
+ * user initiates and completes communication, this may be preferable to
+ * calling other progress functions on individual requests.
+ */
+
+/*! @defgroup mq PSM Matched Queues
+ *
+ * @{
+ */
+
+/** @brief Initialize the MQ component for MQ communication
+ *
+ * This function provides the Matched Queue handle necessary to perform all
+ * Matched Queue communication operations.
+ *
+ * @param[in] ep Endpoint over which to initialize the Matched Queue
+ * @param[in] tag_order_mask Order mask hint to let MQ know what bits of the
+ *                           send tag are required to maintain MQ message
+ *                           order.  In MPI parlance, this mask sets the bits
+ *                           that store the context (or communicator ID).  The
+ *                           user can choose to pass PSM2_MQ_ORDERMASK_NONE or
+ *                           PSM2_MQ_ORDERMASK_ALL to tell MQ to respectively
+ *                           provide no ordering guarantees or to provide
+ *                           ordering over all messages by ignoring the
+ *                           contexts of the send tags.
+ * @param[in] opts Set of options for the Matched Queue
+ * @param[in] numopts Number of options passed
+ * @param[out] mq User-supplied storage to return the Matched Queue handle
+ *                associated to the newly created Matched Queue.
+ *
+ * @remark This function can be called many times to retrieve the MQ handle
+ *         associated to an endpoint, but options are only considered the first
+ *         time the function is called.
+ *
+ * @post The user obtains a handle to an instantiated Matched Queue.
+ *
+ * The following error code is returned.  Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK A new Matched Queue has been instantiated across all the
+ *         members of the group.
+ *
+ * @code{.c}
+   	int try_open_endpoint_and_initialize_mq(
+   	       psm2_ep_t *ep,	// endpoint handle
+   	       psm2_epid_t *epid, // unique endpoint ID
+   	       psm2_uuid_t job_uuid, // unique job uuid, for ep_open
+   	       psm2_mq_t *mq, // MQ handle initialized on endpoint 'ep'
+   	       uint64_t communicator_bits) // Where we store our communicator or
+   	                                   // context bits in the 64-bit tag.
+   	{
+   	    // Simplified open, see psm2_ep_open documentation for more info
+   	    psm2_ep_open(job_uuid,
+   	                 NULL, // no options
+   	                 ep, epid);
+
+   	    // We initialize a matched queue by telling PSM the bits that are
+   	    // order-significant in the tag.  Point-to-point ordering will not be
+   	    // maintained between senders where the communicator bits are not the
+   	    // same.
+   	    psm2_mq_init(ep,
+   	                 communicator_bits,
+   	                 NULL, // no other MQ options
+   	                 0,    // 0 options passed
+   	                 mq);  // newly initialized Matched Queue
+
+   	    return 1;
+   	}
+   @endcode
+ */
+psm2_error_t
+psm2_mq_init(psm2_ep_t ep, uint64_t tag_order_mask,
+	     const struct psm2_optkey *opts, int numopts, psm2_mq_t *mq);
+
+#define PSM2_MQ_ORDERMASK_NONE	0ULL
+	/**< This macro is reserved for future tag order masking support. */
+
+#define PSM2_MQ_ORDERMASK_ALL	0xffffffffffffffffULL
+	/**< This macro is reserved for future tag order masking support. */
+
+/** @brief Finalize (close) an MQ handle
+ *
+ * The following error code is returned.  Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The given Matched Queue has been freed, and any future use
+ *         of the handle produces undefined results.
+ */
+psm2_error_t
+psm2_mq_finalize(psm2_mq_t mq);
+
+#define PSM2_MQ_TAG_ELEMENTS 4
+	/**< Represents the number of 32-bit tag elements in the psm2_mq_tag_t
+	 *   type: three tag values plus one extra element to keep alignment
+	 *   and padding at 16 bytes. */
+
+/** @struct psm2_mq_tag
+ ** @brief MQ Message tag
+ *
+ * Extended message tag type introduced in PSM 2.0.  The previous 64 bit tag
+ * values are replaced by a struct containing three 32 bit tag values for a
+ * total of 96 bits.  Matching semantics are unchanged from the previous 64-bit
+ */
+typedef
+//struct psm2_mq_tag {
+union psm2_mq_tag {
+//	union {
+		uint32_t tag[PSM2_MQ_TAG_ELEMENTS]; /* No longer specifying
+						     * alignment as it makes
+						     * code break with newer
+						     * compilers. */
+
+		/**< 4 x 32-bit array representation of @ref psm2_mq_tag;
+		 * the fourth element is padding. */
+		struct {
+			uint32_t tag0; /**< 1 of 3 uint32_t tag values */
+			uint32_t tag1; /**< 2 of 3 uint32_t tag values */
+			uint32_t tag2; /**< 3 of 3 uint32_t tag values */
+		};
+//	};
+} psm2_mq_tag_t;
+
+/** @brief MQ Non-blocking operation status
+ *
+ * Message completion status for asynchronous communication operations.
+ * For wait and test functions, MQ fills in the structure upon completion.
+ * Upon completion, receive requests fill in every field of the status
+ * structure while send requests only return a valid error_code and context
+ * pointer.
+ */
+typedef
+struct psm2_mq_status {
+	/** Sender's original message tag (receive reqs only) */
+	uint64_t msg_tag;
+	/** Sender's original message length (receive reqs only) */
+	uint32_t msg_length;
+	/** Actual number of bytes transferred (receive reqs only) */
+	uint32_t nbytes;
+	/** MQ error code for communication operation */
+	psm2_error_t error_code;
+	/** User-associated context for send or receive */
+	void *context;
+} psm2_mq_status_t;
+
+/** @brief MQ Non-blocking operation status
+ *
+ * Message completion status for asynchronous communication operations. For
+ * wait and test functions, MQ fills in the structure upon completion. Upon
+ * completion, requests fill in every field of the status structure with the
+ * exception of the nbytes field, which is only valid for receives. Version 2
+ * of the status type contains an @ref psm2_mq_tag_t type to represent the tag
+ * instead of a 64-bit integer value and is for use with PSM v2 routines.
+ */
+
+typedef
+struct psm2_mq_status2 {
+	/** Remote peer's epaddr */
+	psm2_epaddr_t msg_peer;
+	/** Sender's original message tag */
+	psm2_mq_tag_t msg_tag __attribute__ ((aligned(16)));/* Alignment added
+							     * to preserve the
+							     * layout expected
+							     * by existing
+							     * code */
+	/** Sender's original message length */
+	uint32_t msg_length;
+	/** Actual number of bytes transferred (receiver only) */
+	uint32_t nbytes;
+	/** MQ error code for communication operation */
+	psm2_error_t error_code;
+	/** User-associated context for send or receive */
+	void *context;
+} psm2_mq_status2_t;
+
+/** @brief PSM2 Communication handle (opaque) */
+typedef struct psm2_mq_req *psm2_mq_req_t;
+
+
+/** @brief MQ Request Struct
+ *
+ * Message completion request for asynchronous communication operations.
+ * Upon completion, requests are filled with the valid data for the
+ * corresponding send/recv operation that was completed. This datatype
+ * contains the status data and is converted into the
+ * mq_status structures in wait/test functions.
+ */
+struct psm2_mq_req_user {
+	/* Tag matching vars */
+	psm2_epaddr_t peer;
+	psm2_mq_tag_t tag __attribute__ ((aligned(16)));/* Alignment added
+							 * to preserve the
+							 * layout expected
+							 * by existing
+							 * code */
+	psm2_mq_tag_t tagsel;	/* used for receives */
+
+	/* Buffer attached to request. May be a system buffer for unexpected
+	 * messages or a user buffer when an expected message */
+	uint8_t *buf;
+	uint32_t buf_len;
+	uint32_t error_code;
+
+	uint32_t recv_msglen;	/* Message length we are ready to receive */
+	uint32_t send_msglen;	/* Message length from sender */
+
+	/* Used for request to send messages */
+	void *context;		/* user context associated to sends or receives */
+
+	uint64_t user_reserved[4];
+};
+
+/*! @} */
+/*! @ingroup mq
+ * @defgroup mq_options PSM Matched Queue Options
+ * @{
+ *
+ * MQ options can be modified at any point at runtime, unless otherwise noted.
+ * The following example shows how to retrieve the current message size at
+ * which messages are sent as synchronous.
+ *
+ * @code{.c}
+   uint32_t get_hfirv_size(psm2_mq_t mq)
+   {
+       uint32_t rvsize;
+       psm2_mq_getopt(mq, PSM2_MQ_RNDV_HFI_SZ, &rvsize);
+       return rvsize;
+   }
+   @endcode
+ */
+
+/** @brief Get an MQ option (Deprecated. Use psm2_getopt with PSM2_COMPONENT_MQ)
+ *
+ * Function to retrieve the value of an MQ option.
+ *
+ * @param[in] mq Matched Queue handle
+ * @param[in] option Index of option to retrieve. Possible values are:
+ *            @li @ref PSM2_MQ_RNDV_HFI_SZ
+ *            @li @ref PSM2_MQ_RNDV_SHM_SZ
+ *            @li @ref PSM2_MQ_MAX_SYSBUF_MBYTES
+ *
+ * @param[in] value Pointer to storage that can be used to store the value of
+ *            the option to be retrieved. It is up to the user to ensure that
+ *            the pointer points to a memory location large enough to
+ *            accommodate the value associated to the type. Each option
+ *            documents the size associated to its value.
+ *
+ * @returns PSM2_OK if option could be retrieved.
+ * @returns PSM2_PARAM_ERR if the option is not a valid option number
+ */
+psm2_error_t psm2_mq_getopt(psm2_mq_t mq, int option, void *value);
+
+/** @brief Set an MQ option (Deprecated. Use psm2_setopt with PSM2_COMPONENT_MQ)
+ *
+ * Function to set the value of an MQ option.
+ *
+ * @param[in] mq Matched Queue handle
+ * @param[in] option Index of option to set. Possible values are:
+ *            @li @ref PSM2_MQ_RNDV_HFI_SZ
+ *            @li @ref PSM2_MQ_RNDV_SHM_SZ
+ *            @li @ref PSM2_MQ_MAX_SYSBUF_MBYTES
+ *
+ * @param[in] value Pointer to storage that contains the value to be updated
+ *            for the supplied option number. It is up to the user to
+ *            ensure that the pointer points to a memory location with a
+ *            correct size.
+ *
+ * @returns PSM2_OK if option could be set.
+ * @returns PSM2_PARAM_ERR if the option is not a valid option number
+ * @returns PSM2_OPT_READONLY if the option to be set is a read-only option
+ *                            (currently no MQ options are read-only).
+ */
+psm2_error_t psm2_mq_setopt(psm2_mq_t mq, int option, const void *value);
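+
+/* For symmetry with the get_hfirv_size() example above, a sketch of updating
+ * the same option through the deprecated setter (the helper name is
+ * hypothetical):
+ *
+ * @code{.c}
+   void set_hfirv_size(psm2_mq_t mq, uint32_t rvsize)
+   {
+       psm2_mq_setopt(mq, PSM2_MQ_RNDV_HFI_SZ, &rvsize);
+   }
+   @endcode
+ */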
+
+/*! @} */
+/*! @ingroup mq
+ * @{
+ */
+
+#define PSM2_MQ_FLAG_SENDSYNC	0x01
+	/**< MQ Send Force synchronous send */
+
+#define PSM2_MQ_REQINVALID	((psm2_mq_req_t)(NULL))
+	/**< MQ request completion value */
+
+#define PSM2_MQ_ANY_ADDR	((psm2_epaddr_t)NULL)
+	/**< MQ receive from any source epaddr */
+
+
+/** @brief MQ fast-path operation enumeration
+ *
+ * To provide for quick enqueuing of sends/receives from within an AM handler,
+ * PSM2 provides fast-path send/recv options that enqueue those operations
+ * into the MQ. The supported operations to call in fast path are enumerated
+ * in the @ref psm2_mq_fp_op enum.
+ */
+enum psm2_mq_fp_op {
+	PSM2_MQ_ISEND_FP = 1,
+	PSM2_MQ_IRECV_FP,
+};
+
+/** @brief Post a fast-path isend/irecv into the MQ
+ *
+ * Function to only enqueue fast-path non-blocking sends or non-blocking recvs
+ * into a particular MQ. These calls only work if the process already holds
+ * the MQ progress lock; traditionally this applies only to calls from a
+ * registered AM handler.
+ *
+ * This function helps to enable one-sided communication models from
+ * middleware such as OFI to provide fast >2KB message transfers for RMA
+ * operations.
+ *
+ * When posting irecvs, the @c tag and @c tagsel parameters are used against
+ * the send tag of every incoming message on the MQ, as described in
+ * @ref tagmatch.
+ *
+ * When posting isends, the user guarantees that the source data will remain
+ * unmodified until the send is locally completed through a call such as
+ * @ref psm2_mq_wait or @ref psm2_mq_test.
+ *
+ * Progress on the operations enqueued into the MQ may not occur until
+ * the next PSM2 progress API is invoked.
+ *
+ * @param[in] ep PSM2 endpoint
+ * @param[in] mq Matched Queue Handle
+ * @param[in] addr Destination EP address (used only on isends)
+ * @param[in] tag Send/Receive tag
+ * @param[in] tagsel Receive tag selector (used only on irecvs)
+ * @param[in] flags Send/Receive Flags
+ * @param[in] buf Send/Receive buffer
+ * @param[in] len Send/Receive buffer length
+ * @param[in] context User context pointer, available in @ref psm2_mq_status_t
+ *            upon completion
+ * @param[in] fp_type Fast-path op requested
+ * @param[out] req PSM MQ Request handle created by the preposted receive, to
+ *             be used for explicitly controlling message receive completion.
+ *
+ * @post The supplied buffer is given to MQ to match against incoming
+ *       messages unless it is cancelled via @ref psm2_mq_cancel @e before any
+ *       match occurs.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ *         as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The receive buffer has successfully been posted to the MQ.
+ */
+psm2_error_t
+psm2_mq_fp_msg(psm2_ep_t ep, psm2_mq_t mq, psm2_epaddr_t addr, psm2_mq_tag_t *tag,
+		psm2_mq_tag_t *tagsel, uint32_t flags, void *buf, uint32_t len,
+		void *context, enum psm2_mq_fp_op fp_type, psm2_mq_req_t *req);
+
+/** @brief Post a receive to a Matched Queue with tag selection criteria
+ *
+ * Function to receive a non-blocking MQ message by providing a preposted
+ * buffer. For every MQ message received on a particular MQ, the @c tag and @c
+ * tagsel parameters are used against the incoming message's send tag as
+ * described in @ref tagmatch.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] rtag Receive tag
+ * @param[in] rtagsel Receive tag selector
+ * @param[in] flags Receive flags (None currently supported)
+ * @param[in] buf Receive buffer
+ * @param[in] len Receive buffer length
+ * @param[in] context User context pointer, available in @ref psm2_mq_status_t
+ *            upon completion
+ * @param[out] req PSM MQ Request handle created by the preposted receive, to
+ *             be used for explicitly controlling message receive
+ *             completion.
+ * + * @post The supplied receive buffer is given to MQ to match against incoming + * messages unless it is cancelled via @ref psm2_mq_cancel @e before any + * match occurs. + * + * @remark This function may be called simultaneously from multiple threads + * as long as different MQ arguments are used in each of the calls. + * + * The following error code is returned. Other errors are handled by the PSM + * error handler (@ref psm2_error_register_handler). + * + * @retval PSM2_OK The receive buffer has successfully been posted to the MQ. + */ +psm2_error_t +psm2_mq_irecv(psm2_mq_t mq, uint64_t rtag, uint64_t rtagsel, uint32_t flags, + void *buf, uint32_t len, void *context, psm2_mq_req_t *req); + +/** @brief Post a receive to a Matched Queue with source and tag selection + * criteria + * + * Function to receive a non-blocking MQ message by providing a preposted + * buffer. For every MQ message received on a particular MQ, the @c src, @c tag + * and @c tagsel parameters are used against the incoming message's send tag as + * described in @ref tagmatch. + * + * @param[in] mq Matched Queue Handle + * @param[in] src Source (sender's) epaddr (may be PSM2_MQ_ANY_ADDR) + * @param[in] rtag Receive tag + * @param[in] rtagsel Receive tag selector + * @param[in] flags Receive flags (None currently supported) + * @param[in] buf Receive buffer + * @param[in] len Receive buffer length + * @param[in] context User context pointer, available in @ref psm2_mq_status2_t + * upon completion + * @param[out] req PSM MQ Request handle created by the preposted receive, to + * be used for explicitly controlling message receive + * completion. + * + * @post The supplied receive buffer is given to MQ to match against incoming + * messages unless it is cancelled via @ref psm2_mq_cancel @e before any + * match occurs. + * + * @remark This function may be called simultaneously from multiple threads + * as long as different MQ arguments are used in each of the calls. + * + * The following error code is returned. Other errors are handled by the PSM + * error handler (@ref psm2_error_register_handler). + * + * @retval PSM2_OK The receive buffer has successfully been posted to the MQ. + */ +psm2_error_t +psm2_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *rtag, + psm2_mq_tag_t *rtagsel, uint32_t flags, void *buf, uint32_t len, + void *context, psm2_mq_req_t *req); + +/** @brief Post a receive to a Matched Queue with matched request + * + * Function to receive a non-blocking MQ message by providing a preposted + * buffer. The provided request should already be matched using the @ref + * psm2_mq_improbe or @ref psm2_mq_improbe2 routines. It is an error to pass a + * request that has not already been matched by one of those routines. + * + * @param[in] mq Matched Queue Handle + * @param[in] flags Receive flags (None currently supported) + * @param[in] buf Receive buffer + * @param[in] len Receive buffer length + * @param[in] context User context pointer, available in @ref psm2_mq_status_t + * upon completion + * @param[inout] reqo PSM MQ Request handle matched previously by a matched + * probe routine (@ref psm2_mq_improbe or @ref + * psm2_mq_improbe2), also to be used for explicitly + * controlling message receive completion. + * + * @post The supplied receive buffer is given to MQ to deliver the matched + * message. + * + * @remark This function may be called simultaneously from multiple threads + * as long as different MQ arguments are used in each of the calls. + * + * The following error code is returned. 
Other errors are handled by the PSM + * error handler (@ref psm2_error_register_handler). + * + * @retval PSM2_OK The receive buffer has successfully been posted to the MQ. + */ +psm2_error_t +psm2_mq_imrecv(psm2_mq_t mq, uint32_t flags, void *buf, uint32_t len, + void *context, psm2_mq_req_t *reqo); + +/** @brief Send a blocking MQ message + * + * Function to send a blocking MQ message, whereby the message is locally + * complete and the source data can be modified upon return. + * + * @param[in] mq Matched Queue Handle + * @param[in] dest Destination EP address + * @param[in] flags Message flags, currently: + * @li PSM2_MQ_FLAG_SENDSYNC tells PSM to send the message + * synchronously, meaning that the message will not be sent until + * the receiver acknowledges that it has matched the send with a + * receive buffer. + * @param[in] stag Message Send Tag + * @param[in] buf Source buffer pointer + * @param[in] len Length of message starting at @c buf. + * + * @post The source buffer is reusable and the send is locally complete. + * + * @remark This function may be called simultaneously from multiple threads + * as long as different MQ arguments are used in each of the calls. + * + * @note This send function has been implemented to best suit MPI_Send. + * + * The following error code is returned. Other errors are handled by the PSM + * error handler (@ref psm2_error_register_handler). + * + * @retval PSM2_OK The message has been successfully sent. + */ +psm2_error_t +psm2_mq_send(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag, + const void *buf, uint32_t len); + +/** @brief Send a blocking MQ message + * + * Function to send a blocking MQ message, whereby the message is locally + * complete and the source data can be modified upon return. + * + * @param[in] mq Matched Queue Handle + * @param[in] dest Destination EP address + * @param[in] flags Message flags, currently: + * @li PSM2_MQ_FLAG_SENDSYNC tells PSM to send the message + * synchronously, meaning that the message will not be sent until + * the receiver acknowledges that it has matched the send with a + * receive buffer. + * @param[in] stag Message Send Tag + * @param[in] buf Source buffer pointer + * @param[in] len Length of message starting at @c buf. + * + * @post The source buffer is reusable and the send is locally complete. + * + * @remark This function may be called simultaneously from multiple threads + * as long as different MQ arguments are used in each of the calls. + * + * @note This send function has been implemented to best suit MPI_Send. + * + * The following error code is returned. Other errors are handled by the PSM + * error handler (@ref psm2_error_register_handler). + * + * @retval PSM2_OK The message has been successfully sent. + */ +psm2_error_t +psm2_mq_send2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, + psm2_mq_tag_t *stag, const void *buf, uint32_t len); + +/** @brief Send a non-blocking MQ message + * + * Function to initiate the send of a non-blocking MQ message, whereby the + * user guarantees that the source data will remain unmodified until the send + * is locally completed through a call such as @ref psm2_mq_wait or @ref + * psm2_mq_test. + * + * @param[in] mq Matched Queue Handle + * @param[in] dest Destination EP address + * @param[in] flags Message flags, currently: + * @li PSM2_MQ_FLAG_SENDSYNC tells PSM to send the message + * synchronously, meaning that the message will not be sent until + * the receiver acknowledges that it has matched the send with a + * receive buffer. 
+ * @param[in] stag Message Send Tag
+ * @param[in] buf Source buffer pointer
+ * @param[in] len Length of message starting at @c buf.
+ * @param[in] context Optional user-provided pointer available in @ref
+ *                    psm2_mq_status_t when the send is locally completed.
+ * @param[out] req PSM MQ Request handle created by the non-blocking send, to
+ *                 be used for explicitly controlling message completion.
+ *
+ * @post The source buffer is not reusable and the send is not locally complete
+ *       until its request is completed by either @ref psm2_mq_test or @ref
+ *       psm2_mq_wait.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ *         as long as different MQ arguments are used in each of the calls.
+ *
+ * @note This send function has been implemented to suit MPI_Isend.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The message has been successfully initiated.
+ *
+ * @code{.c}
+   psm2_mq_req_t
+   non_blocking_send(const psm2_mq_t mq, psm2_epaddr_t dest_ep,
+                     const void *buf, uint32_t len,
+                     int context_id, int send_tag, const my_request_t *req)
+   {
+       psm2_mq_req_t req_mq;
+       // Set up our send tag, assume that "my_rank" is global and represents
+       // the rank of this process in the job. Cast to uint64_t before
+       // shifting so the high bits are not lost.
+       uint64_t tag = ( ((uint64_t)(context_id & 0xffff) << 48) |
+                        ((uint64_t)(my_rank & 0xffff) << 32) |
+                        ((uint64_t)(send_tag & 0xffffffff)) );
+
+       psm2_mq_isend(mq, dest_ep,
+                     0, // no flags
+                     tag,
+                     buf,
+                     len,
+                     req, // this req is available in psm2_mq_status_t when one
+                          // of the synchronization functions is called.
+                     &req_mq);
+       return req_mq;
+   }
+   @endcode
+ */
+psm2_error_t
+psm2_mq_isend(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag,
+	      const void *buf, uint32_t len, void *context, psm2_mq_req_t *req);
+
+/** @brief Send a non-blocking MQ message
+ *
+ * Function to initiate the send of a non-blocking MQ message, whereby the
+ * user guarantees that the source data will remain unmodified until the send
+ * is locally completed through a call such as @ref psm2_mq_wait or @ref
+ * psm2_mq_test.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] dest Destination EP address
+ * @param[in] flags Message flags, currently:
+ *            @li PSM2_MQ_FLAG_SENDSYNC tells PSM to send the message
+ *            synchronously, meaning that the message will not be sent until
+ *            the receiver acknowledges that it has matched the send with a
+ *            receive buffer.
+ * @param[in] stag Message Send Tag, array of three 32-bit values.
+ * @param[in] buf Source buffer pointer
+ * @param[in] len Length of message starting at @c buf.
+ * @param[in] context Optional user-provided pointer available in @ref
+ *                    psm2_mq_status2_t when the send is locally completed.
+ * @param[out] req PSM MQ Request handle created by the non-blocking send, to
+ *                 be used for explicitly controlling message completion.
+ *
+ * @post The source buffer is not reusable and the send is not locally complete
+ *       until its request is completed by either @ref psm2_mq_test or @ref
+ *       psm2_mq_wait.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ *         as long as different MQ arguments are used in each of the calls.
+ *
+ * @note This send function has been implemented to suit MPI_Isend.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The message has been successfully initiated.
+ *
+ * @code{.c}
+   psm2_mq_req_t
+   non_blocking_send(const psm2_mq_t mq, psm2_epaddr_t dest_ep,
+                     const void *buf, uint32_t len,
+                     int context_id, int send_tag, const my_request_t *req)
+   {
+       psm2_mq_req_t req_mq;
+       // Set up our send tag, assume that "my_rank" is global and represents
+       // the rank of this process in the job
+       psm2_mq_tag_t tag;
+       tag.tag[0] = send_tag;
+       tag.tag[1] = my_rank;
+       tag.tag[2] = context_id;
+
+       psm2_mq_isend2(mq, dest_ep,
+                      0, // no flags
+                      &tag,
+                      buf,
+                      len,
+                      req, // this req is available in psm2_mq_status2_t when
+                           // one of the synchronization functions is called.
+                      &req_mq);
+       return req_mq;
+   }
+   @endcode
+ */
+psm2_error_t
+psm2_mq_isend2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags,
+	       psm2_mq_tag_t *stag, const void *buf, uint32_t len, void *context,
+	       psm2_mq_req_t *req);
+
+/** @brief Try to Probe if a message is received matching tag selection
+ *         criteria
+ *
+ * Function to verify if a message matching the supplied tag and tag selectors
+ * has been received. The message is not fully matched until the user
+ * provides a buffer with the successfully matching tag selection criteria
+ * through @ref psm2_mq_irecv.
+ * Probing for messages may be useful if the size of the
+ * message to be received is unknown, in which case its size will be
+ * available in the @c msg_length member of the returned @c status.
+ *
+ * Function ensures progress if a matching request wasn't found
+ * after the first attempt.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] rtag Message receive tag
+ * @param[in] rtagsel Message receive tag selector
+ * @param[out] status Upon return, @c status is filled with information
+ *             regarding the matching send.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ *         as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error codes are returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The iprobe is successful and status is updated if non-NULL.
+ * @retval PSM2_MQ_NO_COMPLETIONS The iprobe is unsuccessful and status is
+ *         unchanged.
+ */
+psm2_error_t
+psm2_mq_iprobe(psm2_mq_t mq, uint64_t rtag, uint64_t rtagsel,
+	       psm2_mq_status_t *status);
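+
+/* Illustrative sketch of the probe-then-receive pattern for messages of
+ * unknown size (buffer management, error handling, and the full-match
+ * selector are application-specific; the helper name is hypothetical):
+ *
+ * @code{.c}
+   int recv_unknown_size(psm2_mq_t mq, uint64_t rtag, psm2_mq_req_t *req)
+   {
+       psm2_mq_status_t status;
+       if (psm2_mq_iprobe(mq, rtag, (uint64_t)-1, &status) != PSM2_OK)
+           return 0; // nothing matched yet
+       void *buf = malloc(status.msg_length);
+       psm2_mq_irecv(mq, rtag, (uint64_t)-1, 0, buf, status.msg_length,
+                     buf, req); // context points at 'buf' for a later free()
+       return 1;
+   }
+   @endcode
+ */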
+
+/** @brief Try to Probe if a message is received matching source and tag
+ *         selection criteria
+ *
+ * Function to verify if a message matching the supplied source, tag, and tag
+ * selectors has been received. The message is not fully matched until the
+ * user provides a buffer with the successfully matching tag selection criteria
+ * through @ref psm2_mq_irecv. Probing for messages may be useful if the size
+ * of the message to be received is unknown, in which case its size will be
+ * available in the @c msg_length member of the returned @c status.
+ *
+ * Function ensures progress if a matching request wasn't found
+ * after the first attempt.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] src Source (sender's) epaddr (may be PSM2_MQ_ANY_ADDR)
+ * @param[in] rtag Message receive tag
+ * @param[in] rtagsel Message receive tag selector
+ * @param[out] status Upon return, @c status is filled with information
+ *             regarding the matching send.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ *         as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error codes are returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The iprobe is successful and status is updated if non-NULL.
+ * @retval PSM2_MQ_NO_COMPLETIONS The iprobe is unsuccessful and status is
+ *         unchanged.
+ */
+psm2_error_t
+psm2_mq_iprobe2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *rtag,
+		psm2_mq_tag_t *rtagsel, psm2_mq_status2_t *status);
+
+/** @brief Try to Probe if a message is received matching tag selection
+ *         criteria
+ *
+ * Function to verify if a message matching the supplied tag and tag
+ * selectors has been received. If a match is successful, the message is
+ * removed from the matching queue and returned as a request object. The
+ * message can be received using @ref psm2_mq_imrecv. It is erroneous to use
+ * the request object returned by @ref psm2_mq_improbe for any purpose other
+ * than passing to @ref psm2_mq_imrecv. Probing for messages may be useful if
+ * the size of the message to be received is unknown, in which case its size
+ * will be available in the @c msg_length member of the returned @c status.
+ *
+ * Function ensures progress if a matching request wasn't found
+ * after the first attempt.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] rtag Message receive tag
+ * @param[in] rtagsel Message receive tag selector
+ * @param[out] req PSM MQ Request handle, to be used for receiving the matched
+ *             message.
+ * @param[out] status Upon return, @c status is filled with information
+ *             regarding the matching send.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ *         as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error codes are returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The improbe is successful and status is updated if non-NULL.
+ * @retval PSM2_MQ_NO_COMPLETIONS The improbe is unsuccessful and status is
+ *         unchanged.
+ */
+psm2_error_t
+psm2_mq_improbe(psm2_mq_t mq, uint64_t rtag, uint64_t rtagsel, psm2_mq_req_t *req,
+		psm2_mq_status_t *status);
+
+/** @brief Try to Probe if a message is received matching source and tag
+ *         selection criteria
+ *
+ * Function to verify if a message matching the supplied source, tag, and tag
+ * selectors has been received. If a match is successful, the message is
+ * removed from the matching queue and returned as a request object. The
+ * message can be received using @ref psm2_mq_imrecv. It is erroneous to use
+ * the request object returned by @ref psm2_mq_improbe2 for any purpose other
+ * than passing to @ref psm2_mq_imrecv. Probing for messages may be useful if
+ * the size of the message to be received is unknown, in which case its size
+ * will be available in the @c msg_length member of the returned @c status.
+ *
+ * Function ensures progress if a matching request wasn't found
+ * after the first attempt.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] src Source (sender's) epaddr (may be PSM2_MQ_ANY_ADDR)
+ * @param[in] rtag Message receive tag
+ * @param[in] rtagsel Message receive tag selector
+ * @param[out] reqo PSM MQ Request handle, to be used for receiving the matched
+ *             message.
+ * @param[out] status Upon return, @c status is filled with information
+ *             regarding the matching send.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ *         as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error codes are returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The improbe is successful and status is updated if non-NULL.
+ * @retval PSM2_MQ_NO_COMPLETIONS The improbe is unsuccessful and status is
+ *         unchanged.
+ */
+psm2_error_t
+psm2_mq_improbe2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *rtag,
+		 psm2_mq_tag_t *rtagsel, psm2_mq_req_t *reqo,
+		 psm2_mq_status2_t *status);
+
+/** @brief Query for non-blocking requests ready for completion.
+ *
+ * Function to query a particular MQ for non-blocking requests that are ready
+ * for completion. Requests "ready for completion" are not actually considered
+ * complete by MQ until they are returned to the MQ library through @ref
+ * psm2_mq_wait or @ref psm2_mq_test.
+ *
+ * If the user can deal with consuming request completions in the order in
+ * which they complete, this function can be used both for completions and for
+ * ensuring progress. The latter requirement is satisfied when the user
+ * peeks an empty completion queue as a side effect of always aggressively
+ * peeking and completing all of an MQ's requests ready for completion.
+ *
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in,out] req MQ non-blocking request
+ * @param[in] status Optional MQ status, can be NULL.
+ *
+ * @post The user has ensured progress if the function returns @ref
+ *       PSM2_MQ_NO_COMPLETIONS
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ *         as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error codes are returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The peek is successful and @c req is updated with a request
+ *                 ready for completion. If @c status is non-NULL, it is also
+ *                 updated.
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The peek is not successful, meaning that
+ *                 there are no further requests ready for completion. The
+ *                 contents of @c req and @c status remain unchanged.
+ * @code{.c}
+   // Example that uses psm2_mq_ipeek to make progress instead of psm2_poll
+   // We return the number of non-blocking requests that we've completed
+   int main_progress_loop(psm2_mq_t mq)
+   {
+       int num_completed = 0;
+       psm2_mq_req_t req;
+       psm2_mq_status_t status;
+       psm2_error_t err;
+       my_request_t *myreq;
+
+       do {
+           err = psm2_mq_ipeek(mq, &req,
+                               NULL); // No need for status in ipeek here
+           if (err == PSM2_MQ_NO_COMPLETIONS)
+               return num_completed;
+           else if (err != PSM2_OK)
+               return -1; // handle the error
+           num_completed++;
+
+           // We obtained 'req' at the head of the completion queue. We can
+           // now free the request with PSM and obtain our original request
+           // from the status' context
+           err = psm2_mq_test(&req,     // will be marked as invalid
+                              &status); // we need the status
+           myreq = (my_request_t *) status.context;
+
+           // handle the completion for myreq whether myreq is a posted
+           // receive or a non-blocking send.
+       }
+       while (1);
+   }
+   @endcode
+ */
+psm2_error_t
+psm2_mq_ipeek(psm2_mq_t mq, psm2_mq_req_t *req, psm2_mq_status_t *status);
+
+/** @brief Query for non-blocking requests ready for completion.
+ *
+ * Function to query a particular MQ for non-blocking requests that are ready
+ * for completion. Requests "ready for completion" are not actually considered
+ * complete by MQ until they are returned to the MQ library through @ref
+ * psm2_mq_wait or @ref psm2_mq_test.
+ *
+ * If the user can deal with consuming request completions in the order in
+ * which they complete, this function can be used both for completions and for
+ * ensuring progress.
+ * The latter requirement is satisfied when the user
+ * peeks an empty completion queue as a side effect of always aggressively
+ * peeking and completing all of an MQ's requests ready for completion.
+ *
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in,out] req MQ non-blocking request
+ * @param[in] status Optional MQ status, can be NULL.
+ *
+ * @post The user has ensured progress if the function returns @ref
+ *       PSM2_MQ_NO_COMPLETIONS
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ *         as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error codes are returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The peek is successful and @c req is updated with a request
+ *                 ready for completion. If @c status is non-NULL, it is also
+ *                 updated.
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The peek is not successful, meaning that
+ *                 there are no further requests ready for completion. The
+ *                 contents of @c req and @c status remain unchanged.
+ * @code{.c}
+   // Example that uses psm2_mq_ipeek2 to make progress instead of psm2_poll
+   // We return the number of non-blocking requests that we've completed
+   int main_progress_loop(psm2_mq_t mq)
+   {
+       int num_completed = 0;
+       psm2_mq_req_t req;
+       psm2_mq_status2_t status;
+       psm2_error_t err;
+       my_request_t *myreq;
+
+       do {
+           err = psm2_mq_ipeek2(mq, &req,
+                                NULL); // No need for status in ipeek here
+           if (err == PSM2_MQ_NO_COMPLETIONS)
+               return num_completed;
+           else if (err != PSM2_OK)
+               return -1; // handle the error
+           num_completed++;
+
+           // We obtained 'req' at the head of the completion queue. We can
+           // now free the request with PSM and obtain our original request
+           // from the status' context
+           err = psm2_mq_test2(&req,     // will be marked as invalid
+                               &status); // we need the status
+           myreq = (my_request_t *) status.context;
+
+           // handle the completion for myreq whether myreq is a posted
+           // receive or a non-blocking send.
+       }
+       while (1);
+   }
+   @endcode
+ */
+psm2_error_t
+psm2_mq_ipeek2(psm2_mq_t mq, psm2_mq_req_t *req, psm2_mq_status2_t *status);
+
+/** @brief User defined Callback function handling copy of MQ request into
+ *         user datatype
+ *
+ * Callback function used to convert an MQ request into a user's desired
+ * status structure. The user's callback function converts the MQ request into
+ * the provided status_array at the specified index.
+ *
+ * @param[in] req MQ External non-blocking Request structure
+ * @param[in] status_array Array of User defined status datatypes
+ * @param[in] entry_index Index in array where the converted request will be
+ *                        stored if successful
+ *
+ * The following error codes are returned.
+ *
+ * @retval <0 The MQ conversion failed with a user defined error.
+ *
+ * @retval 0 The MQ request was successfully processed, but was not saved
+ *           in the provided @c status_array.
+ *
+ * @retval 1 The MQ request was successfully processed and was saved in the
+ *           @c status_array at the specified index.
+ *
+ * @retval >1 The MQ request was successfully processed and was saved in the
+ *            @c status_array at the specified index. This should
+ *            be the last MQ request converted in the batch, even if there
+ *            is still space in @c status_array.
+ */
+typedef int (*psmi_mq_status_copy_user_t) (struct psm2_mq_req_user *req,
+		void *status_array, int entry_index);
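+
+/* Illustrative sketch of a status-copy callback (my_status_t and its fields
+ * are hypothetical application types):
+ *
+ * @code{.c}
+   typedef struct { void *context; uint32_t nbytes; } my_status_t;
+
+   int my_status_copy(struct psm2_mq_req_user *req,
+                      void *status_array, int entry_index)
+   {
+       my_status_t *arr = (my_status_t *)status_array;
+       arr[entry_index].context = req->context;
+       arr[entry_index].nbytes = req->recv_msglen;
+       return 1; // stored at entry_index; continue processing the batch
+   }
+   @endcode
+ */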
+
+/** @brief Check and dequeue MQ requests into a user's status array using a
+ *         callback.
+ *
+ * Function to atomically check and dequeue MQ entries from the completed
+ * queue and copy the MQ requests into a user's status datatype through a
+ * status_copy callback function.
+ *
+ * Once the MQ request has been successfully converted by the callback, the
+ * MQ request is freed and the next entry is processed, making the supplied
+ * request pointer invalid.
+ *
+ * The variable "count" passed in will only be increased if the MQ request was
+ * successfully stored into the user's passed in array. Otherwise the count
+ * variable is unchanged.
+ *
+ * NOTE: a count of 0 passed into psm2_mq_ipeek_dequeue_multi will result in
+ * no MQ elements being processed.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] status_array Array of User defined status datatypes
+ * @param[in] status_copy Callback function pointer to convert
+ *                        MQ to caller datatype
+ * @param[in,out] count [in] Size of status_array; [out] number of elements
+ *                      populated into status_array, or the user's error
+ *                      return code
+ *
+ * The following error codes are returned.
+ *
+ * @retval PSM2_OK The dequeue operation was successful and populated the
+ *                 full @c status_array up to @c count entries. The parameter
+ *                 @c count is equal to the count passed in by the user.
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The dequeue operation was not able to read
+ *                 @c count entries into the @c status_array. The number
+ *                 of entries that were successfully written to the
+ *                 @c status_array is set in the @c count for the user.
+ *
+ * @retval PSM2_INTERNAL_ERR The @c status_copy failed to successfully
+ *                 copy the status entry into the user's datatype.
+ *                 @c count is set to the return code from the
+ *                 @c status_copy.
+ */
+ psm2_error_t
+ psm2_mq_ipeek_dequeue_multi(psm2_mq_t mq, void *status_array,
+		psmi_mq_status_copy_user_t status_copy, int *count);
+
+/** @brief Check and dequeue the first request entry from the completed queue.
+ *
+ * Function to atomically check and dequeue the first entry from the completed
+ * queue. It must be paired with function psm2_mq_req_free, which returns the
+ * request to the PSM2 library.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[out] req PSM MQ Request handle, to be used for receiving the matched
+ *             message.
+ *
+ * The following error codes are returned.
+ *
+ * @retval PSM2_OK The dequeue operation was successful and @c req is updated
+ *                 with a request ready for completion.
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The dequeue operation was not successful,
+ *                 meaning that there are no further requests ready
+ *                 for completion. The contents of @c req remain
+ *                 unchanged.
+ */
+psm2_error_t
+psm2_mq_ipeek_dequeue(psm2_mq_t mq, psm2_mq_req_t *req);
+
+/** @brief Return the request to the PSM2 library.
+ *
+ * Function returns the request previously obtained via psm2_mq_ipeek_dequeue
+ * to the PSM2 library.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] req PSM MQ Request handle to be returned to the PSM2 library.
+ *                If @p req is NULL, no operation is performed.
+ *
+ * The following error codes are returned.
+ *
+ * @retval PSM2_OK Return of an object to the PSM2 library pool was successful.
+ */
+psm2_error_t
+psm2_mq_req_free(psm2_mq_t mq, psm2_mq_req_t req);
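+
+/* Illustrative sketch pairing psm2_mq_ipeek_dequeue with psm2_mq_req_free to
+ * drain the completion queue (completion handling and the helper name are
+ * application-specific):
+ *
+ * @code{.c}
+   int drain_completions(psm2_mq_t mq)
+   {
+       psm2_mq_req_t req;
+       int n = 0;
+       while (psm2_mq_ipeek_dequeue(mq, &req) == PSM2_OK) {
+           // ... consume the completed request here ...
+           psm2_mq_req_free(mq, req); // return the request to PSM2
+           n++;
+       }
+       return n;
+   }
+   @endcode
+ */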
+
+/** @brief Wait until a non-blocking request completes
+ *
+ * Function to wait on requests created from either preposted receive buffers
+ * or non-blocking sends. This is the only blocking function in the MQ
+ * interface and will poll until the request is complete as per the progress
+ * semantics explained in @ref mq_progress.
+ *
+ * @param[in,out] request MQ non-blocking request
+ * @param[out] status Updated if non-NULL when request successfully completes
+ *
+ * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend
+ *      or @ref psm2_mq_irecv and passes a pointer to enough storage to write
+ *      the output of a @ref psm2_mq_status_t or NULL if status is to be
+ *      ignored.
+ *
+ * @pre Since MQ will internally ensure progress while the user is
+ *      suspended, the user need not ensure that progress is made prior to
+ *      calling this function.
+ *
+ * @post The request is assigned the value @ref PSM2_MQ_REQINVALID and all
+ *       associated MQ request storage is released back to the MQ library.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ *         as long as the requests that are used in each of the calls are
+ *         associated with different MQs.
+ *
+ * @remarks
+ * @li This function ensures progress on the endpoint as long as the request
+ *     is incomplete.
+ * @li @c status can be NULL, in which case no status is written upon
+ *     completion.
+ * @li If @c request is @ref PSM2_MQ_REQINVALID, the function returns
+ *     immediately.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The request is complete or the value of @c request was
+ *         @ref PSM2_MQ_REQINVALID.
+ *
+ */
+psm2_error_t
+psm2_mq_wait(psm2_mq_req_t *request, psm2_mq_status_t *status);
+
+/** @brief Wait until a non-blocking request completes
+ *
+ * Function to wait on requests created from either preposted receive buffers
+ * or non-blocking sends. This is the only blocking function in the MQ
+ * interface and will poll until the request is complete as per the progress
+ * semantics explained in @ref mq_progress.
+ *
+ * @param[in,out] request MQ non-blocking request
+ * @param[out] status Updated if non-NULL when request successfully completes
+ *
+ * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend
+ *      or @ref psm2_mq_irecv and passes a pointer to enough storage to write
+ *      the output of a @ref psm2_mq_status2_t or NULL if status is to be
+ *      ignored.
+ *
+ * @pre Since MQ will internally ensure progress while the user is
+ *      suspended, the user need not ensure that progress is made prior to
+ *      calling this function.
+ *
+ * @post The request is assigned the value @ref PSM2_MQ_REQINVALID and all
+ *       associated MQ request storage is released back to the MQ library.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ *         as long as the requests that are used in each of the calls are
+ *         associated with different MQs.
+ *
+ * @remarks
+ * @li This function ensures progress on the endpoint as long as the request
+ *     is incomplete.
+ * @li @c status can be NULL, in which case no status is written upon
+ *     completion.
+ * @li If @c request is @ref PSM2_MQ_REQINVALID, the function returns
+ *     immediately.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The request is complete or the value of @c request was
+ *         @ref PSM2_MQ_REQINVALID.
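+ *
+ * As an illustration only (the request is assumed to come from a prior
+ * @ref psm2_mq_irecv2 or @ref psm2_mq_isend2 call; the helper name is
+ * hypothetical):
+ *
+ * @code{.c}
+   void wait_and_report(psm2_mq_req_t *req)
+   {
+       psm2_mq_status2_t status;
+       psm2_mq_wait2(req, &status); // blocks and ensures progress
+       // *req is now PSM2_MQ_REQINVALID; status.nbytes is valid for receives
+   }
+   @endcode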
+ *
+ */
+psm2_error_t
+psm2_mq_wait2(psm2_mq_req_t *request, psm2_mq_status2_t *status);
+
+/** @brief Test if a non-blocking request is complete
+ *
+ * Function to test requests created from either preposted receive buffers or
+ * non-blocking sends for completion. Unlike @ref psm2_mq_wait, this function
+ * tests @c request for completion and @e never ensures progress directly or
+ * indirectly. It is up to the user to employ some of the progress functions
+ * described in @ref mq_progress to ensure progress if the user chooses to
+ * exclusively test requests for completion.
+ *
+ * Testing a request for completion @e never internally ensures progress; this
+ * keeps the operation cheap enough to construct higher-level completion tests
+ * over arrays that test whether some, any, or all requests have completed.
+ * For testing arrays of requests, it is preferable for performance reasons to
+ * only ensure progress once before testing a set of requests for completion.
+ *
+ * @param[in,out] request MQ non-blocking request
+ * @param[out] status Updated if non-NULL and the request successfully
+ *                    completes
+ *
+ * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend
+ *      or @ref psm2_mq_irecv and passes a pointer to enough storage to write
+ *      the output of a @ref psm2_mq_status_t or NULL if status is to be
+ *      ignored.
+ *
+ * @pre The user has ensured progress on the Matched Queue if @ref
+ *      psm2_mq_test is exclusively used for guaranteeing request completions.
+ *
+ * @post If the request is complete, the request is assigned the value @ref
+ *       PSM2_MQ_REQINVALID and all associated MQ request storage is released
+ *       back to the MQ library. If the request is incomplete, the contents of
+ *       @c request are unchanged.
+ *
+ * @post The user will ensure progress on the Matched Queue if @ref
+ *       psm2_mq_test is exclusively used for guaranteeing request completions.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ *         as long as the requests that are used in each of the calls are
+ *         associated with different MQs.
+ *
+ * The following two errors are always returned. Other errors are handled by
+ * the PSM error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The request is complete and @c request is set to @ref
+ *         PSM2_MQ_REQINVALID, or the value of @c request was
+ *         PSM2_MQ_REQINVALID.
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The request is not complete and @c request is
+ *         unchanged.
+ *
+ * @code{.c}
+   // Function that returns the first completed request in an array
+   // of requests.
+   void *
+   user_testany(psm2_ep_t ep, psm2_mq_req_t *allreqs, int nreqs)
+   {
+       int i;
+       void *context = NULL;
+
+       // Ensure progress only once
+       psm2_poll(ep);
+
+       // Test for at least one completion and return its context
+       psm2_mq_status_t stat;
+       for (i = 0; i < nreqs; i++) {
+           if (psm2_mq_test(&allreqs[i], &stat) == PSM2_OK) {
+               context = stat.context;
+               break;
+           }
+       }
+       return context;
+   }
+   @endcode
+ */
+psm2_error_t
+psm2_mq_test(psm2_mq_req_t *request, psm2_mq_status_t *status);
+
+/** @brief Test if a non-blocking request is complete
+ *
+ * Function to test requests created from either preposted receive buffers or
+ * non-blocking sends for completion. Unlike @ref psm2_mq_wait, this function
+ * tests @c request for completion and @e never ensures progress directly or
+ * indirectly.
+ * It is up to the user to employ some of the progress functions
+ * described in @ref mq_progress to ensure progress if the user chooses to
+ * exclusively test requests for completion.
+ *
+ * Testing a request for completion @e never internally ensures progress; this
+ * keeps the operation cheap enough to construct higher-level completion tests
+ * over arrays that test whether some, any, or all requests have completed.
+ * For testing arrays of requests, it is preferable for performance reasons to
+ * only ensure progress once before testing a set of requests for completion.
+ *
+ * @param[in,out] request MQ non-blocking request
+ * @param[out] status Updated if non-NULL and the request successfully
+ *                    completes
+ *
+ * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend
+ *      or @ref psm2_mq_irecv and passes a pointer to enough storage to write
+ *      the output of a @ref psm2_mq_status2_t or NULL if status is to be
+ *      ignored.
+ *
+ * @pre The user has ensured progress on the Matched Queue if @ref
+ *      psm2_mq_test is exclusively used for guaranteeing request completions.
+ *
+ * @post If the request is complete, the request is assigned the value @ref
+ *       PSM2_MQ_REQINVALID and all associated MQ request storage is released
+ *       back to the MQ library. If the request is incomplete, the contents of
+ *       @c request are unchanged.
+ *
+ * @post The user will ensure progress on the Matched Queue if @ref
+ *       psm2_mq_test is exclusively used for guaranteeing request completions.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ *         as long as the requests that are used in each of the calls are
+ *         associated with different MQs.
+ *
+ * The following two errors are always returned. Other errors are handled by
+ * the PSM error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The request is complete and @c request is set to @ref
+ *         PSM2_MQ_REQINVALID, or the value of @c request was
+ *         PSM2_MQ_REQINVALID.
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The request is not complete and @c request is
+ *         unchanged.
+ *
+ * @code{.c}
+   // Function that returns the first completed request in an array
+   // of requests.
+   void *
+   user_testany(psm2_ep_t ep, psm2_mq_req_t *allreqs, int nreqs)
+   {
+       int i;
+       void *context = NULL;
+
+       // Ensure progress only once
+       psm2_poll(ep);
+
+       // Test for at least one completion and return its context
+       psm2_mq_status2_t stat;
+       for (i = 0; i < nreqs; i++) {
+           if (psm2_mq_test2(&allreqs[i], &stat) == PSM2_OK) {
+               context = stat.context;
+               break;
+           }
+       }
+       return context;
+   }
+   @endcode
+ */
+psm2_error_t
+psm2_mq_test2(psm2_mq_req_t *request, psm2_mq_status2_t *status);
+
+/** @brief Cancel a preposted request
+ *
+ * Function to cancel a preposted receive request returned by @ref
+ * psm2_mq_irecv. It is currently illegal to cancel a send request initiated
+ * with @ref psm2_mq_isend.
+ *
+ * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_irecv.
+ *
+ * @post Whether the cancel is successful or not, the user returns the
+ *       request to the library by way of @ref psm2_mq_test or @ref
+ *       psm2_mq_wait.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ *         as long as the requests that are used in each of the calls are
+ *         associated with different MQs.
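+ *
+ * As an illustration only, cancelling a preposted receive and releasing its
+ * request storage (the request is assumed to come from @ref psm2_mq_irecv;
+ * the helper name is hypothetical):
+ *
+ * @code{.c}
+   void cancel_receive(psm2_mq_req_t *req)
+   {
+       if (psm2_mq_cancel(req) == PSM2_OK) {
+           // Cancelled before a match; still return the request storage
+           psm2_mq_test(req, NULL);
+       }
+       // On PSM2_MQ_NO_COMPLETIONS the message already matched; complete it
+       // normally with psm2_mq_wait or psm2_mq_test.
+   }
+   @endcode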
+ *
+ * Only the two following errors can be returned directly, without being
+ * handled by the error handler (@ref psm2_error_register_handler):
+ *
+ * @retval PSM2_OK The request could be successfully cancelled such that the
+ *         preposted receive buffer could be removed from the preposted
+ *         receive queue before a match occurred. The associated @c
+ *         request remains unchanged and the user must still return
+ *         the storage to the MQ library.
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The request could not be successfully
+ *         cancelled since the preposted receive buffer has already
+ *         matched an incoming message. The @c request
+ *         remains unchanged.
+ *
+ */
+psm2_error_t psm2_mq_cancel(psm2_mq_req_t *req);
+
+/*! @brief MQ statistics structure */
+struct psm2_mq_stats {
+	/** Bytes received into a matched user buffer */
+	uint64_t rx_user_bytes;
+	/** Messages received into a matched user buffer */
+	uint64_t rx_user_num;
+	/** Bytes received into an unmatched system buffer */
+	uint64_t rx_sys_bytes;
+	/** Messages received into an unmatched system buffer */
+	uint64_t rx_sys_num;
+
+	/** Total Messages transmitted (shm and hfi) */
+	uint64_t tx_num;
+	/** Messages transmitted eagerly */
+	uint64_t tx_eager_num;
+	/** Bytes transmitted eagerly */
+	uint64_t tx_eager_bytes;
+	/** Messages transmitted using expected TID mechanism */
+	uint64_t tx_rndv_num;
+	/** Bytes transmitted using expected TID mechanism */
+	uint64_t tx_rndv_bytes;
+	/** Messages transmitted (shm only) */
+	uint64_t tx_shm_num;
+	/** Messages received through shm */
+	uint64_t rx_shm_num;
+
+	/** Number of system buffers allocated */
+	uint64_t rx_sysbuf_num;
+	/** Bytes allocated for system buffers */
+	uint64_t rx_sysbuf_bytes;
+
+	/** Internally reserved for future use */
+	uint64_t _reserved[16];
+};
+
+#define PSM2_MQ_NUM_STATS 13	/**< How many stats are currently used in @ref psm2_mq_stats */
+
+/*! @see psm2_mq_stats */
+	typedef struct psm2_mq_stats psm2_mq_stats_t;
+
+/** @brief Retrieve statistics from an instantiated MQ */
+	void
+	psm2_mq_get_stats(psm2_mq_t mq, psm2_mq_stats_t *stats);
+
+/*! @} */
+#ifdef __cplusplus
+}				/* extern "C" */
+#endif
+#endif
diff --git a/psm_am.c b/psm_am.c
new file mode 100644
index 0000000..c421142
--- /dev/null
+++ b/psm_am.c
@@ -0,0 +1,339 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm2_am.h" +#include "psm_am_internal.h" +#include "psm_mq_internal.h" + +int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid); + +/* AM capabilities parameters are initialized once in psmi_am_init_internal + and copied out in __psm2_am_get_parameters. When debugging is enabled, + various assertions reference these parameters for sanity checking. */ +struct psm2_am_parameters psmi_am_parameters = { 0 }; + +static int _ignore_handler(PSMI_AM_ARGS_DEFAULT) +{ + return 0; +} + +int psmi_abort_handler(PSMI_AM_ARGS_DEFAULT) +{ + abort(); + return 0; +} + +static void psmi_am_min_parameters(struct psm2_am_parameters *dest, + struct psm2_am_parameters *src) +{ + dest->max_handlers = min(dest->max_handlers, src->max_handlers); + dest->max_nargs = min(dest->max_nargs, src->max_nargs); + dest->max_request_short = + min(dest->max_request_short, src->max_request_short); + dest->max_reply_short = + min(dest->max_reply_short, src->max_reply_short); +} + +psm2_error_t psmi_am_init_internal(psm2_ep_t ep) +{ + int i; + struct psm2_ep_am_handle_entry *am_htable; + struct psm2_am_parameters params; + + psmi_am_parameters.max_handlers = INT_MAX; + psmi_am_parameters.max_nargs = INT_MAX; + psmi_am_parameters.max_request_short = INT_MAX; + psmi_am_parameters.max_reply_short = INT_MAX; + + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) { + ep->ptl_self.am_get_parameters(ep, ¶ms); + psmi_am_min_parameters(&psmi_am_parameters, ¶ms); + } + + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { + ep->ptl_ips.am_get_parameters(ep, ¶ms); + psmi_am_min_parameters(&psmi_am_parameters, ¶ms); + } + + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { + ep->ptl_amsh.am_get_parameters(ep, ¶ms); + psmi_am_min_parameters(&psmi_am_parameters, ¶ms); + } + + ep->am_htable = + psmi_malloc(ep, UNDEFINED, + sizeof(struct psm2_ep_am_handle_entry) * PSMI_AM_NUM_HANDLERS); + if (ep->am_htable == NULL) + return PSM2_NO_MEMORY; + + am_htable = (struct psm2_ep_am_handle_entry *) ep->am_htable; + for (i = 0; i < PSMI_AM_NUM_HANDLERS; i++) { + am_htable[i].hfn = _ignore_handler; + am_htable[i].hctx = NULL; + am_htable[i].version = PSM2_AM_HANDLER_V2; + } + + return PSM2_OK; + +} + +psm2_error_t +__psm2_am_register_handlers(psm2_ep_t ep, + const psm2_am_handler_fn_t *handlers, + int num_handlers, int *handlers_idx) +{ + int i, j; + + psmi_assert_always(ep->am_htable != NULL); + + PSM2_LOG_MSG("entering"); + /* For now just assign any free one */ + for (i = 0, j 
= 0; (i < PSMI_AM_NUM_HANDLERS) && (j < num_handlers); i++) { + if (ep->am_htable[i].hfn == _ignore_handler) { + ep->am_htable[i].hfn = handlers[j]; + ep->am_htable[i].hctx = NULL; + ep->am_htable[i].version = PSM2_AM_HANDLER_V1; + handlers_idx[j] = i; + if (++j == num_handlers) /* all registered */ + break; + } + } + + if (j < num_handlers) { + /* Not enough free handlers, restore unused handlers */ + for (i = 0; i < j; i++) { + ep->am_htable[handlers_idx[i]].hfn = _ignore_handler; + ep->am_htable[handlers_idx[i]].hctx = NULL; + ep->am_htable[handlers_idx[i]].version = PSM2_AM_HANDLER_V2; + } + PSM2_LOG_MSG("leaving"); + return psmi_handle_error(ep, PSM2_EP_NO_RESOURCES, + "Insufficient " + "available AM handlers: registered %d of %d requested handlers", + j, num_handlers); + } + else { + PSM2_LOG_MSG("leaving"); + return PSM2_OK; + } +} +PSMI_API_DECL(psm2_am_register_handlers) + +psm2_error_t +__psm2_am_register_handlers_2(psm2_ep_t ep, + const psm2_am_handler_2_fn_t *handlers, + int num_handlers, void **hctx, int *handlers_idx) +{ + int i, j; + + psmi_assert_always(ep->am_htable != NULL); + + PSM2_LOG_MSG("entering"); + /* For now just assign any free one */ + for (i = 0, j = 0; (i < PSMI_AM_NUM_HANDLERS) && (j < num_handlers); i++) { + if (ep->am_htable[i].hfn == _ignore_handler) { + ep->am_htable[i].hfn = handlers[j]; + ep->am_htable[i].hctx = hctx[j]; + ep->am_htable[i].version = PSM2_AM_HANDLER_V2; + handlers_idx[j] = i; + if (++j == num_handlers) /* all registered */ + break; + } + } + + if (j < num_handlers) { + /* Not enough free handlers, restore unused handlers */ + for (i = 0; i < j; i++) { + ep->am_htable[handlers_idx[i]].hfn = _ignore_handler; + ep->am_htable[handlers_idx[i]].hctx = NULL; + ep->am_htable[handlers_idx[i]].version = PSM2_AM_HANDLER_V2; + } + PSM2_LOG_MSG("leaving"); + return psmi_handle_error(ep, PSM2_EP_NO_RESOURCES, + "Insufficient " + "available AM handlers: registered %d of %d requested handlers", + j, num_handlers); + } + else { + PSM2_LOG_MSG("leaving"); + return PSM2_OK; + } +} +PSMI_API_DECL(psm2_am_register_handlers_2) + +void +__psm2_am_unregister_handlers(psm2_ep_t ep) +{ + int i; + + PSM2_LOG_MSG("entering"); + for (i = 0; i < PSMI_AM_NUM_HANDLERS; i++) { + if (ep->am_htable[i].hfn != _ignore_handler) { + ep->am_htable[i].hfn = _ignore_handler; + ep->am_htable[i].hctx = NULL; + ep->am_htable[i].version = PSM2_AM_HANDLER_V2; + } + } + PSM2_LOG_MSG("leaving"); +} +PSMI_API_DECL(psm2_am_unregister_handlers) + +psm2_error_t +__psm2_am_request_short(psm2_epaddr_t epaddr, psm2_handler_t handler, + psm2_amarg_t *args, int nargs, void *src, size_t len, + int flags, psm2_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + psm2_error_t err; + ptl_ctl_t *ptlc = epaddr->ptlctl; + + PSM2_LOG_MSG("entering"); + PSMI_ASSERT_INITIALIZED(); + psmi_assert(epaddr != NULL); + psmi_assert(handler >= 0 && handler < psmi_am_parameters.max_handlers); + psmi_assert(nargs >= 0 && nargs <= psmi_am_parameters.max_nargs); + psmi_assert(nargs > 0 ? args != NULL : 1); + psmi_assert(len >= 0 && len <= psmi_am_parameters.max_request_short); + psmi_assert(len > 0 ? 
src != NULL : 1); + + PSMI_LOCK(ptlc->ep->mq->progress_lock); + + err = ptlc->am_short_request(epaddr, handler, args, + nargs, src, len, flags, completion_fn, + completion_ctxt); + PSMI_UNLOCK(ptlc->ep->mq->progress_lock); + PSM2_LOG_MSG("leaving"); + + return err; +} +PSMI_API_DECL(psm2_am_request_short) + +psm2_error_t +__psm2_am_reply_short(psm2_am_token_t token, psm2_handler_t handler, + psm2_amarg_t *args, int nargs, void *src, size_t len, + int flags, psm2_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + psm2_error_t err; + struct psmi_am_token *tok; + psm2_epaddr_t epaddr; + ptl_ctl_t *ptlc; + + PSM2_LOG_MSG("entering"); + PSMI_ASSERT_INITIALIZED(); + psmi_assert_always(token != NULL); + psmi_assert(handler >= 0 && handler < psmi_am_parameters.max_handlers); + psmi_assert(nargs >= 0 && nargs <= psmi_am_parameters.max_nargs); + psmi_assert(nargs > 0 ? args != NULL : 1); + psmi_assert(len >= 0 && len <= psmi_am_parameters.max_reply_short); + psmi_assert(len > 0 ? src != NULL : 1); + + tok = (struct psmi_am_token *)token; + epaddr = tok->epaddr_incoming; + ptlc = epaddr->ptlctl; + + /* No locking here since we are already within handler context and already + * locked */ + + err = ptlc->am_short_reply(token, handler, args, + nargs, src, len, flags, completion_fn, + completion_ctxt); + PSM2_LOG_MSG("leaving"); + + return err; +} +PSMI_API_DECL(psm2_am_reply_short) + +psm2_error_t __psm2_am_get_source(psm2_am_token_t token, psm2_epaddr_t *epaddr_out) +{ + struct psmi_am_token *tok; + + PSM2_LOG_MSG("entering"); + if (token == NULL || epaddr_out == NULL) { + PSM2_LOG_MSG("leaving"); + return psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Invalid %s parameters", __FUNCTION__); + } + + tok = (struct psmi_am_token *)token; + *epaddr_out = tok->epaddr_incoming; + PSM2_LOG_MSG("leaving"); + return PSM2_OK; +} +PSMI_API_DECL(psm2_am_get_source) + +psm2_error_t +__psm2_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters, + size_t sizeof_parameters_in, + size_t *sizeof_parameters_out) +{ + size_t s; + + PSM2_LOG_MSG("entering"); + if (parameters == NULL) { + PSM2_LOG_MSG("leaving"); + return psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Invalid %s parameters", __FUNCTION__); + } + + memset(parameters, 0, sizeof_parameters_in); + s = min(sizeof(psmi_am_parameters), sizeof_parameters_in); + memcpy(parameters, &psmi_am_parameters, s); + *sizeof_parameters_out = s; + PSM2_LOG_MSG("leaving"); + return PSM2_OK; +} +PSMI_API_DECL(psm2_am_get_parameters) diff --git a/psm_am_internal.h b/psm_am_internal.h new file mode 100644 index 0000000..bc2c128 --- /dev/null +++ b/psm_am_internal.h @@ -0,0 +1,107 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _PSM2_AM_INTERNAL_H +#define _PSM2_AM_INTERNAL_H + +#define PSMI_AM_MAX_ARGS 10 +#define PSMI_AM_NUM_HANDLERS 256 /* must be power of 2 */ + +#define PSMI_AM_ARGS_DEFAULT psm2_am_token_t token, \ + psm2_amarg_t *args, int nargs, \ + void *src, uint32_t len, \ + void *hctx + +enum psm2_am_handler_version +{ + PSM2_AM_HANDLER_V1 = 0, + PSM2_AM_HANDLER_V2, +}; + +struct psm2_ep_am_handle_entry +{ + void *hfn; + void *hctx; + enum psm2_am_handler_version version; +}; + +struct psmi_am_token { + psm2_epaddr_t epaddr_incoming; + uint32_t flags; + /* Can handler reply? i.e. Not OPCODE_AM_REQUEST_NOREPLY request */ + uint32_t can_reply; + + /* PTLs may add other stuff here */ +}; + +/* AM capabilities parameters are initialized once in psmi_am_init_internal + and copied out in __psm2_am_get_parameters. When debugging is enabled, + various assertions reference these parameters for sanity checking. */ +extern struct psm2_am_parameters psmi_am_parameters; + +PSMI_ALWAYS_INLINE(struct psm2_ep_am_handle_entry * + psm_am_get_handler_function(psm2_ep_t ep, + psm2_handler_t handler_idx)) +{ + int hidx = handler_idx & (PSMI_AM_NUM_HANDLERS - 1); + struct psm2_ep_am_handle_entry *hentry = &ep->am_htable[hidx]; + psmi_assert_always(hentry != NULL); + return hentry; +} + +/* PSM internal initialization */ +psm2_error_t psmi_am_init_internal(psm2_ep_t ep); + +#endif diff --git a/psm_config.h b/psm_config.h new file mode 100644 index 0000000..3c42106 --- /dev/null +++ b/psm_config.h @@ -0,0 +1,208 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2018 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. 
+ + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2018 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef PSM_CONFIG_H +#define PSM_CONFIG_H + +/* + * The following flags can be used instead of `make` switches in order to + * change behavior achieved when using `make` without parameters. + */ + +#ifndef RDPMC_PERF_FRAMEWORK +/* #define RDPMC_PERF_FRAMEWORK */ +#endif + +#ifndef PSM2_MOCK_TESTING +/* #define PSM2_MOCK_TESTING */ +#endif + +#ifndef PSM_CUDA +/* #define PSM_CUDA */ +/* #define NVIDIA_GPU_DIRECT */ +#endif + +#ifndef HFI_BRAKE_DEBUG +/* #define HFI_BRAKE_DEBUG */ +#endif + +#ifndef PSM_DEBUG +/* #define PSM_DEBUG */ +/* #define _HFI_DEBUGGING 1 */ +/* #define _FORTIFY_SOURCE 2 */ +#endif + +#ifndef PSM_HEAP_DEBUG +/* #define PSM_HEAP_DEBUG */ +#endif + +#ifndef PSM_PROFILE +/* #define PSM_PROFILE */ +#endif + +#define PSMI_MIN_EP_CONNECT_TIMEOUT (2 * SEC_ULL) +#define PSMI_MIN_EP_CLOSE_TIMEOUT (1 * SEC_ULL) +#define PSMI_MAX_EP_CLOSE_TIMEOUT (2 * SEC_ULL) + +#define PSMI_MIN_EP_CLOSE_GRACE_INTERVAL (1 * SEC_ULL) +#define PSMI_MAX_EP_CLOSE_GRACE_INTERVAL (2 * SEC_ULL) + + +#define HFI_MAX_RAILS 4 + +#define AFFINITY_SHM_BASENAME "/psm2_hfi_affinity_shm" +#define AFFINITY_SHMEMSIZE sysconf(_SC_PAGE_SIZE) +#define AFFINITY_SHM_REF_COUNT_LOCATION 0 +#define AFFINITY_SHM_HFI_INDEX_LOCATION 1 +#define SEM_AFFINITY_SHM_RW_BASENAME "/psm2_hfi_affinity_shm_rw_mutex" + +#define PSMI_RCVTHREAD_FLAGS 0x1 +/**< + * Default setting for Receive thread + * + * 0x0 disables rcvthread by default + * 0x1 enables ips receive thread by default + */ + +/* + * Define one of these below. + * + * Spinlock gives the best performance and makes sense with the progress thread + * only because the progress thread does a "trylock" and then goes back to + * sleep in a poll. 
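+ *
+ * For illustration only (the names below are made up, not actual PSM2
+ * internals), the pattern that makes the spinlock attractive is roughly:
+ *
+ *   for (;;) {
+ *       poll(fds, nfds, timeout_ms);             // sleep until events
+ *       if (pthread_spin_trylock(&progress_lock) == 0) {
+ *           make_progress();                     // app thread was idle
+ *           pthread_spin_unlock(&progress_lock);
+ *       }                                        // else: the app thread
+ *   }                                            // is progressing; re-poll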
+ * + * Mutexlock should be used for experimentation while the more useful + * mutexlock-debug should be enabled during development to catch potential + * errors. + */ +#ifdef PSM_DEBUG +#define PSMI_LOCK_IS_MUTEXLOCK_DEBUG +#else +#define PSMI_LOCK_IS_SPINLOCK +/* #define PSMI_LOCK_IS_MUTEXLOCK */ +/* #define PSMI_LOCK_IS_MUTEXLOCK_DEBUG */ +/* #define PSMI_PLOCK_IS_NOLOCK */ +#endif + +#ifdef PSM_CUDA +/* XXX TODO: Getting the gpu page size from driver at init time */ +#define PSMI_GPU_PAGESIZE 65536 + +#define CUDA_SMALLHOSTBUF_SZ (256*1024) +#define CUDA_WINDOW_PREFETCH_DEFAULT 2 +#define GPUDIRECT_THRESH_RV 3 + +#define GDR_COPY_THRESH_SEND 32 +#define GDR_COPY_THRESH_RECV 64000 +/* All GPU transfers beyond this threshold use + * RNDV protocol. It is mostly a send side knob. + */ +#define CUDA_THRESH_RNDV 32768 +#endif + +#define MQ_HFI_THRESH_TINY 8 +#define MQ_HFI_THRESH_EGR_SDMA_XEON 34000 /* Eager Xeon blocking */ +#define MQ_HFI_THRESH_EGR_SDMA_PHI2 200000 /* Eager Phi2 blocking */ +#define MQ_HFI_THRESH_EGR_SDMA_SQ_XEON 16000 /* Eager Xeon non-blocking */ +#define MQ_HFI_THRESH_EGR_SDMA_SQ_PHI2 65536 /* Eager Phi2 non-blocking */ + +#define MQ_HFI_THRESH_RNDV_PHI2 200000 +#define MQ_HFI_THRESH_RNDV_XEON 64000 + +#define MQ_HFI_WINDOW_RNDV_PHI2 4194304 +#define MQ_HFI_WINDOW_RNDV_XEON 131072 + +#ifdef PSM_CUDA +#define MQ_HFI_WINDOW_RNDV_CUDA 2097152 +#endif + +#define MQ_SHM_THRESH_RNDV 16000 + +#define NUM_HASH_BUCKETS 64 +#define HASH_THRESHOLD 65 +#define NUM_HASH_CONFIGS 3 +#define NUM_MQ_SUBLISTS (NUM_HASH_CONFIGS + 1) + +#define REMOVE_ENTRY 1 + + +/* Keep timer stats */ +#define PSMI_TIMER_STATS 0 + + +/* Psm context */ +#define HAL_CONTEXT_OPEN_RETRY_MAX 3 + + +/* + * By default, PSMI_DEVICES_DEFAULT establishes the bind order a component is + * tested for reachability to each peer. First self, then shm and finally + * hfi. The order should really only affect endpoints that happen to be on + * the same node. PSM will correctly detect that two endpoints are on the same + * node even though they may be using different host interfaces. + */ +#define PSMI_DEVICES_DEFAULT "self,shm,hfi" + +/* Lock */ +#define PSMI_USE_PTHREAD_SPINLOCKS 0 + +/* Utils */ +#define PSMI_EPID_TABSIZE_CHUNK 128 +#define PSMI_EPID_TABLOAD_FACTOR ((float)0.7) + +#define PSMI_EP_HOSTNAME_LEN 64 /* hostname only */ +#define PSMI_EP_NAME_LEN 96 /* hostname:LID:context:subcontext */ + +#define PSMI_FAULTINJ_SPEC_NAMELEN 32 + +#endif /* PSM_CONFIG_H */ diff --git a/psm_context.c b/psm_context.c new file mode 100644 index 0000000..48f4671 --- /dev/null +++ b/psm_context.c @@ -0,0 +1,711 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ + +#include +#include +#include "psm_user.h" +#include "psm2_hal.h" + +static int psmi_get_hfi_selection_algorithm(void); + +psm2_error_t psmi_context_interrupt_set(psmi_context_t *context, int enable) +{ + int poll_type; + int ret; + + if (!enable == !psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED)) + return PSM2_OK; + + if (enable) + poll_type = PSMI_HAL_POLL_TYPE_URGENT; + else + poll_type = 0; + + ret = psmi_hal_poll_type(poll_type, context->psm_hw_ctxt); + + if (ret != 0) + return PSM2_EP_NO_RESOURCES; + else { + if (enable) + psmi_hal_add_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED); + else + psmi_hal_sub_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED); + return PSM2_OK; + } +} + +int psmi_context_interrupt_isenabled(psmi_context_t *context) +{ + return psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED); +} + +/* Returns 1 when all of the active units have their free contexts + * equal the number of contexts. This is an indication that no + * jobs are currently running. + * + * Note that this code is clearly racy (this code may happen concurrently + * by two or more processes, and this point of observation, + * occurs earlier in time to when the decision is made for deciding which + * context to assign, which will also occurs earlier in time to when the + * context is actually assigned. And, when the context is finally + * assigned, this will change the "nfreectxts" observed below.) + */ +static int psmi_all_active_units_have_max_freecontexts(int nunits) +{ + int u; + + for (u=0;u < nunits;u++) + { + if (psmi_hal_get_unit_active(u) > 0) + { + int nfreectxts=psmi_hal_get_num_free_contexts(u), + nctxts=psmi_hal_get_num_contexts(u); + if (nfreectxts > 0 && nctxts > 0) + { + if (nfreectxts != nctxts) + return 0; + } + } + } + return 1; +} + +/* returns the integer value of an environment variable, or 0 if the environment + * variable is not set. 
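+ * For example: "MPI_LOCALNRANKS=4" yields 4, an unset or empty variable
+ * yields 0, and a negative value such as "-3" is clamped to 0.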
*/ +static int psmi_get_envvar(const char *env) +{ + const char *env_val = getenv(env); + + if (env_val && *env_val) + { + int r = atoi(env_val); + return (r >= 0) ? r : 0; + } + return 0; +} + +/* returns the 8-bit hash value of an uuid. */ +static inline +uint8_t +psmi_get_uuid_hash(psm2_uuid_t const uuid) +{ + int i; + uint8_t hashed_uuid = 0; + + for (i=0; i < sizeof(psm2_uuid_t); ++i) + hashed_uuid ^= *((uint8_t const *)uuid + i); + + return hashed_uuid; +} + +int psmi_get_current_proc_location() +{ + int core_id, node_id; + + core_id = sched_getcpu(); + if (core_id < 0) + return -EINVAL; + + node_id = numa_node_of_cpu(core_id); + if (node_id < 0) + return -EINVAL; + + return node_id; +} + +static void +psmi_spread_hfi_selection(psm2_uuid_t const job_key, long *unit_start, + long *unit_end, int nunits) +{ + /* if the number of ranks on the host is 1 and ... */ + if ((psmi_get_envvar("MPI_LOCALNRANKS") == 1) && + /* + * All of the active units have free contexts equal the + * number of contexts. + */ + psmi_all_active_units_have_max_freecontexts(nunits)) { + /* we start looking at unit 0, and end at nunits-1: */ + *unit_start = 0; + *unit_end = nunits - 1; + } else { + /* else, we are going to look at: + (a hash of the job key plus the local rank id) mod nunits. */ + + *unit_start = (psmi_get_envvar("MPI_LOCALRANKID") + + psmi_get_uuid_hash(job_key)) % nunits; + if (*unit_start > 0) + *unit_end = *unit_start - 1; + else + *unit_end = nunits-1; + } +} + +static int +psmi_create_and_open_affinity_shm(psm2_uuid_t const job_key) +{ + int shm_fd, ret; + int first_to_create = 0; + size_t shm_name_len = 256; + shared_affinity_ptr = NULL; + affinity_shm_name = NULL; + affinity_shm_name = (char *) psmi_malloc(PSMI_EP_NONE, UNDEFINED, shm_name_len); + + psmi_assert_always(affinity_shm_name != NULL); + snprintf(affinity_shm_name, shm_name_len, + AFFINITY_SHM_BASENAME".%d", + psmi_get_uuid_hash(job_key)); + shm_fd = shm_open(affinity_shm_name, O_RDWR | O_CREAT | O_EXCL, + S_IRUSR | S_IWUSR); + if ((shm_fd < 0) && (errno == EEXIST)) { + shm_fd = shm_open(affinity_shm_name, O_RDWR, S_IRUSR | S_IWUSR); + if (shm_fd < 0) { + _HFI_VDBG("Cannot open affinity shared mem fd:%s, errno=%d\n", + affinity_shm_name, errno); + return shm_fd; + } + } else if (shm_fd > 0) { + first_to_create = 1; + } else { + _HFI_VDBG("Cannot create affinity shared mem fd:%s, errno=%d\n", + affinity_shm_name, errno); + } + + ret = ftruncate(shm_fd, AFFINITY_SHMEMSIZE); + if ( ret < 0 ) + return ret; + + shared_affinity_ptr = (uint64_t *) mmap(NULL, AFFINITY_SHMEMSIZE, PROT_READ | PROT_WRITE, + MAP_SHARED, shm_fd, 0); + if (shared_affinity_ptr == MAP_FAILED) { + _HFI_VDBG("Cannot mmap affinity shared memory. errno=%d\n", + errno); + close(shm_fd); + return -1; + } + close(shm_fd); + + psmi_affinity_shared_file_opened = 1; + + if (first_to_create) { + _HFI_VDBG("Creating shm to store HFI affinity per socket\n"); + + memset(shared_affinity_ptr, 0, AFFINITY_SHMEMSIZE); + + /* + * Once shm object is initialized, unlock others to be able to + * use it. + */ + psmi_sem_post(sem_affinity_shm_rw, sem_affinity_shm_rw_name); + } else { + _HFI_VDBG("Opening shm object to read/write HFI affinity per socket\n"); + } + + /* + * Start critical section to increment reference count when creating + * or opening shm object. Decrement of ref count will be done before + * closing the shm. 
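+	 *
+	 * In outline (a sketch of the intended protocol, not additional code):
+	 *
+	 *   psmi_sem_timedwait(rw)                      // enter
+	 *   shm[AFFINITY_SHM_REF_COUNT_LOCATION] += 1   // register ourselves
+	 *   psmi_sem_post(rw)                           // leave
+	 *   ... use the shared region ...
+	 *   and, symmetrically, decrement under the same semaphore before
+	 *   the mapping is torn down.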
+ */ + if (psmi_sem_timedwait(sem_affinity_shm_rw, sem_affinity_shm_rw_name)) { + _HFI_VDBG("Could not enter critical section to update shm refcount\n"); + return -1; + } + + shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] += 1; + + /* End critical section */ + psmi_sem_post(sem_affinity_shm_rw, sem_affinity_shm_rw_name); + + return 0; +} + +/* + * Spread HFI selection between units if we find more than one within a socket. + */ +static void +psmi_spread_hfi_within_socket(long *unit_start, long *unit_end, int node_id, + int *saved_hfis, int found, psm2_uuid_t const job_key) +{ + int ret, shm_location; + + /* + * Take affinity lock and open shared memory region to be able to + * accurately determine which HFI to pick for this process. If any + * issues, bail by picking first known HFI. + */ + if (!psmi_affinity_semaphore_open) + goto spread_hfi_fallback; + + ret = psmi_create_and_open_affinity_shm(job_key); + if (ret < 0) + goto spread_hfi_fallback; + + shm_location = AFFINITY_SHM_HFI_INDEX_LOCATION + node_id; + if (shm_location > AFFINITY_SHMEMSIZE) + goto spread_hfi_fallback; + + /* Start critical section to read/write shm object */ + if (psmi_sem_timedwait(sem_affinity_shm_rw, sem_affinity_shm_rw_name)) { + _HFI_VDBG("Could not enter critical section to update HFI index\n"); + goto spread_hfi_fallback; + } + + *unit_start = *unit_end = shared_affinity_ptr[shm_location]; + shared_affinity_ptr[shm_location] = + (shared_affinity_ptr[shm_location] + 1) % found; + _HFI_VDBG("Selected HFI index= %ld, Next HFI=%ld, node = %d, local rank=%d, found=%d.\n", + *unit_start, shared_affinity_ptr[shm_location], node_id, + psmi_get_envvar("MPI_LOCALRANKID"), found); + + /* End Critical Section */ + psmi_sem_post(sem_affinity_shm_rw, sem_affinity_shm_rw_name); + + return; + +spread_hfi_fallback: + *unit_start = *unit_end = saved_hfis[0]; +} + +static void +psmi_create_affinity_semaphores(psm2_uuid_t const job_key) +{ + int ret; + sem_affinity_shm_rw_name = NULL; + size_t sem_len = 256; + + /* + * If already opened, no need to do anything else. + * This could be true for Multi-EP cases where a different thread has + * already created the semaphores. We don't need separate locks here as + * we are protected by the overall "psmi_creation_lock" which each + * thread will take in psm2_ep_open() + */ + if (psmi_affinity_semaphore_open) + return; + + sem_affinity_shm_rw_name = (char *) psmi_malloc(PSMI_EP_NONE, UNDEFINED, sem_len); + psmi_assert_always(sem_affinity_shm_rw_name != NULL); + snprintf(sem_affinity_shm_rw_name, sem_len, + SEM_AFFINITY_SHM_RW_BASENAME".%d", + psmi_get_uuid_hash(job_key)); + + ret = psmi_init_semaphore(&sem_affinity_shm_rw, sem_affinity_shm_rw_name, + S_IRUSR | S_IWUSR, 0); + if (ret) { + _HFI_VDBG("Cannot initialize semaphore: %s for read-write access to shm object.\n", + sem_affinity_shm_rw_name); + sem_close(sem_affinity_shm_rw); + psmi_free(sem_affinity_shm_rw_name); + sem_affinity_shm_rw_name = NULL; + return; + } + + _HFI_VDBG("Semaphore: %s created for read-write access to shm object.\n", + sem_affinity_shm_rw_name); + + psmi_affinity_semaphore_open = 1; + + return; +} + +static +psm2_error_t +psmi_compute_start_and_end_unit(long unit_param,int nunitsactive,int nunits, + psm2_uuid_t const job_key, + long *unit_start,long *unit_end) +{ + unsigned short hfi_sel_alg = PSMI_UNIT_SEL_ALG_ACROSS; + int node_id, unit_id, found = 0; + int saved_hfis[nunits]; + + /* if the user did not set HFI_UNIT then ... 
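+	 * (i.e. unit_param came in as HFI_UNIT_ID_ANY rather than as an
+	 * explicit unit index)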
*/ + if (unit_param == HFI_UNIT_ID_ANY) + { + /* Get the actual selection algorithm from the environment: */ + hfi_sel_alg = psmi_get_hfi_selection_algorithm(); + /* If round-robin is selection algorithm and ... */ + if ((hfi_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS) && + /* there are more than 1 active units then ... */ + (nunitsactive > 1)) + { + /* + * Pick first HFI we find on same root complex + * as current task. If none found, fall back to + * load-balancing algorithm. + */ + node_id = psmi_get_current_proc_location(); + if (node_id >= 0) { + for (unit_id = 0; unit_id < nunits; unit_id++) { + if (psmi_hal_get_unit_active(unit_id) <= 0) + continue; + + int node_id_i; + + if (!psmi_hal_get_node_id(unit_id, &node_id_i)) { + if (node_id_i == node_id) { + saved_hfis[found] = unit_id; + found++; + } + } + } + + if (found > 1) { + psmi_create_affinity_semaphores(job_key); + psmi_spread_hfi_within_socket(unit_start, unit_end, + node_id, saved_hfis, + found, job_key); + } else if (found == 1) { + *unit_start = *unit_end = saved_hfis[0]; + } + } + + if (node_id < 0 || !found) { + psmi_spread_hfi_selection(job_key, unit_start, + unit_end, nunits); + } + } else if ((hfi_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS_ALL) && + (nunitsactive > 1)) { + psmi_spread_hfi_selection(job_key, unit_start, + unit_end, nunits); + } + else { + *unit_start = 0; + *unit_end = nunits - 1; + } + } else if (unit_param >= 0) { + /* the user specified HFI_UNIT, we use it. */ + *unit_start = *unit_end = unit_param; + } else { + psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "PSM2 can't open unit: %ld for reading and writing", + unit_param); + return PSM2_EP_DEVICE_FAILURE; + } + + return PSM2_OK; +} + +psm2_error_t +psmi_context_open(const psm2_ep_t ep, long unit_param, long port, + psm2_uuid_t const job_key, int64_t timeout_ns, + psmi_context_t *context) +{ + long open_timeout = 0, unit_start, unit_end, unit_id, unit_id_prev; + psm2_error_t err = PSM2_OK; + int nunits = psmi_hal_get_num_units(), nunitsactive=0; + + /* + * If shared contexts are enabled, try our best to schedule processes + * across one or many devices + */ + + /* if no units, then no joy. */ + if (nunits <= 0) + { + err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "PSM2 no hfi units are available"); + goto ret; + } + + /* Calculate the number of active units: */ + for (unit_id=0;unit_id < nunits;unit_id++) + { + if (psmi_hal_get_unit_active(unit_id) > 0) + nunitsactive++; + } + /* if no active units, then no joy. */ + if (nunitsactive == 0) + { + err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "PSM2 no hfi units are active"); + goto ret; + } + if (timeout_ns > 0) + open_timeout = (long)(timeout_ns / MSEC_ULL); + + + unit_start = 0; unit_end = nunits - 1; + err = psmi_compute_start_and_end_unit(unit_param, nunitsactive, + nunits, job_key, + &unit_start, &unit_end); + if (err != PSM2_OK) + return err; + + /* this is the start of a loop that starts at unit_start and goes to unit_end. + but note that the way the loop computes the loop control variable is by + an expression involving the mod operator. */ + int success = 0; + unit_id_prev = unit_id = unit_start; + do + { + /* close previous opened unit fd before attempting open of current unit. */ + if (psmi_hal_get_fd(context->psm_hw_ctxt) > 0) { + psmi_hal_close_context(&context->psm_hw_ctxt); + context->psm_hw_ctxt = 0; + } + + /* if the unit_id is not active, go to next one. 
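+	 * The walk wraps modulo nunits, so with nunits == 4 and
+	 * unit_start == 2 the probe order is 2, 3, 0, 1.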
*/
+	if (psmi_hal_get_unit_active(unit_id) <= 0) {
+		unit_id_prev = unit_id;
+		unit_id = (unit_id + 1) % nunits;
+		continue;
+	}
+
+	/* open this unit. */
+	int rv = psmi_hal_context_open(unit_id, port, open_timeout,
+				       ep, job_key, context,
+				       psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED),
+				       HAL_CONTEXT_OPEN_RETRY_MAX);
+
+	/* go to next unit if failed to open. */
+	if (rv || context->psm_hw_ctxt == NULL) {
+		unit_id_prev = unit_id;
+		unit_id = (unit_id + 1) % nunits;
+		continue;
+	}
+
+	success = 1;
+	break;
+
+	} while (unit_id_prev != unit_end);
+
+	if (!success)
+	{
+		err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+					"PSM2 can't open hfi unit: %ld", unit_param);
+		goto bail;
+	}
+
+	context->ep = (psm2_ep_t) ep;
+
+#ifdef PSM_CUDA
+	/* Check backward compatibility bits here and save the info */
+	if (psmi_hal_has_cap(PSM_HAL_CAP_GPUDIRECT_OT))
+		is_driver_gpudirect_enabled = 1;
+#endif
+	_HFI_VDBG("hfi_userinit() passed.\n");
+
+	/* Fetch hw parameters from the HAL (these were obtained while opening
+	 * the context above). */
+
+	int lid = psmi_hal_get_lid(context->psm_hw_ctxt);
+	ep->unit_id = psmi_hal_get_unit_id(context->psm_hw_ctxt);
+	ep->portnum = psmi_hal_get_port_num(context->psm_hw_ctxt);
+	ep->gid_lo = psmi_hal_get_gid_lo(context->psm_hw_ctxt);
+	ep->gid_hi = psmi_hal_get_gid_hi(context->psm_hw_ctxt);
+	int ctxt = psmi_hal_get_context(context->psm_hw_ctxt);
+	int subctxt = psmi_hal_get_subctxt(context->psm_hw_ctxt);
+	uint32_t hfi_type = psmi_hal_get_hfi_type(context->psm_hw_ctxt);
+	context->ep = (psm2_ep_t) ep;
+
+	/* Construct epid for this Endpoint */
+
+	switch (PSMI_EPID_VERSION) {
+	case PSMI_EPID_V1:
+		context->epid = PSMI_EPID_PACK_V1(lid, ctxt,
+						  subctxt,
+						  ep->unit_id,
+						  PSMI_EPID_VERSION, 0x3ffffff);
+		break;
+	case PSMI_EPID_V2:
+		context->epid = PSMI_EPID_PACK_V2(lid, ctxt,
+						  subctxt,
+						  PSMI_EPID_IPS_SHM, /* Not an only-shm epid */
+						  PSMI_EPID_VERSION, ep->gid_hi);
+		break;
+	default:
+		/* Epid version is greater than the maximum supported version. */
+		psmi_assert_always(PSMI_EPID_VERSION <= PSMI_EPID_V2);
+		break;
+	}
+
+	_HFI_VDBG
+	    ("construct epid: lid %d ctxt %d subctxt %d hcatype %d mtu %d\n",
+	     lid, ctxt,
+	     subctxt, hfi_type, ep->mtu);
+
+	goto ret;
+
+bail:
+	_HFI_PRDBG("open failed: unit_id: %ld, err: %d (%s)\n", unit_id, err, strerror(errno));
+	if (psmi_hal_get_fd(context->psm_hw_ctxt) > 0)
+		psmi_hal_close_context(&context->psm_hw_ctxt);
+ret:
+
+	_HFI_VDBG("psmi_context_open() return %d\n", err);
+	return err;
+}
+
+psm2_error_t psmi_context_close(psmi_context_t *context)
+{
+	if (psmi_hal_get_fd(context->psm_hw_ctxt) > 0)
+		psmi_hal_close_context(&context->psm_hw_ctxt);
+
+	return PSM2_OK;
+}
+
+/*
+ * This function works whether or not a context is initialized in a psm2_ep.
+ *
+ * Returns one of
+ *
+ * PSM2_OK: Port status is ok (or context not initialized yet but still "ok")
+ * PSM2_OK_NO_PROGRESS: Cable pulled
+ * PSM2_EP_NO_NETWORK: No network, no lid, ...
+ * PSM2_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc.
+ * The message follows the per-port status
+ * As of 7322-ready driver, need to check port-specific qword for IB
+ * as well as older unit-only.
For now, we don't have the port interface + * defined, so just check port 0 qword for spi_status + */ +psm2_error_t psmi_context_check_status(const psmi_context_t *contexti) +{ + psm2_error_t err = PSM2_OK; + psmi_context_t *context = (psmi_context_t *) contexti; + char *errmsg = NULL; + uint64_t status = psmi_hal_get_hw_status(context->psm_hw_ctxt); + + /* Fatal chip-related errors */ + if (!(status & PSM_HAL_HW_STATUS_CHIP_PRESENT) || + !(status & PSM_HAL_HW_STATUS_INITTED) || + (status & PSM_HAL_HW_STATUS_HWERROR)) { + + err = PSM2_EP_DEVICE_FAILURE; + if (err != context->status_lasterr) { /* report once */ + volatile char *errmsg_sp="no err msg"; + + psmi_hal_get_hw_status_freezemsg(&errmsg_sp, + context->psm_hw_ctxt); + + if (*errmsg_sp) + psmi_handle_error(context->ep, err, + "Hardware problem: %s", + errmsg_sp); + else { + if (status & PSM_HAL_HW_STATUS_HWERROR) + errmsg = "Hardware error"; + else + errmsg = "Hardware not found"; + + psmi_handle_error(context->ep, err, + "%s", errmsg); + } + } + } + /* Fatal network-related errors with timeout: */ + else if (!(status & PSM_HAL_HW_STATUS_IB_CONF) || + !(status & PSM_HAL_HW_STATUS_IB_READY)) { + err = PSM2_EP_NO_NETWORK; + if (err != context->status_lasterr) { /* report once */ + context->networkLostTime = time(NULL); + } + else + { + time_t now = time(NULL); + static const double seventySeconds = 70.0; + + /* The linkup time duration for a system should allow the time needed + to complete 3 LNI passes which is: + 50 seconds for a passive copper channel + 65 seconds for optical channel. + (we add 5 seconds of margin.) */ + if (difftime(now,context->networkLostTime) > seventySeconds) + { + volatile char *errmsg_sp="no err msg"; + + psmi_hal_get_hw_status_freezemsg(&errmsg_sp, + context->psm_hw_ctxt); + + psmi_handle_error(context->ep, err, "%s", + *errmsg_sp ? errmsg_sp : + "Network down"); + } + } + } + + if (err == PSM2_OK && context->status_lasterr != PSM2_OK) + context->status_lasterr = PSM2_OK; /* clear error */ + else if (err != PSM2_OK) + context->status_lasterr = err; /* record error */ + + return err; +} + +static +int psmi_get_hfi_selection_algorithm(void) +{ + union psmi_envvar_val env_hfi1_alg; + int hfi1_alg = PSMI_UNIT_SEL_ALG_ACROSS; + + /* If a specific unit is set in the environment, use that one. */ + psmi_getenv("HFI_SELECTION_ALG", + "HFI Device Selection Algorithm to use. Round Robin (Default) " + ", Packed or Round Robin All.", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)"Round Robin", &env_hfi1_alg); + + if (!strcasecmp(env_hfi1_alg.e_str, "Round Robin")) + hfi1_alg = PSMI_UNIT_SEL_ALG_ACROSS; + else if (!strcasecmp(env_hfi1_alg.e_str, "Packed")) + hfi1_alg = PSMI_UNIT_SEL_ALG_WITHIN; + else if (!strcasecmp(env_hfi1_alg.e_str, "Round Robin All")) + hfi1_alg = PSMI_UNIT_SEL_ALG_ACROSS_ALL; + else { + _HFI_ERROR + ("Unknown HFI selection algorithm %s. Defaulting to Round Robin " + "allocation of HFIs.\n", env_hfi1_alg.e_str); + hfi1_alg = PSMI_UNIT_SEL_ALG_ACROSS; + } + + return hfi1_alg; +} diff --git a/psm_context.h b/psm_context.h new file mode 100644 index 0000000..d152a7f --- /dev/null +++ b/psm_context.h @@ -0,0 +1,120 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _PSMI_IN_USER_H +#error psm_context.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSM_CONTEXT_H +#define _PSM_CONTEXT_H + +typedef +struct psmi_context { + + /* The following three member variables are used for sharing contexts among + subcontexts and they have the following common properties: + + a. They are all initialized below HAL layer when the context is opened. + b. If they are NULL that means no context is being shared among subcontexts, + non-NULL means a context is being shared among some number of subcontexts. + c. The initialization code is currently found in the gen1 hal instance. + */ + + void *spio_ctrl; + void *tid_ctrl; + void *tf_ctrl; + + /* end of shared context member variables. 
*/
+
+	psmi_hal_hw_context psm_hw_ctxt;
+
+	psm2_ep_t ep;		/* psm ep handle */
+	psm2_epid_t epid;	/* psm integral ep id */
+	uint32_t rcvthread_flags;
+	psm2_error_t status_lasterr;
+	time_t networkLostTime;
+} psmi_context_t;
+
+psm2_error_t
+psmi_context_open(const psm2_ep_t ep, long unit_id, long port,
+		  psm2_uuid_t const job_key,
+		  int64_t timeout_ns, psmi_context_t *context);
+
+psm2_error_t psmi_context_close(psmi_context_t *context);
+
+/* Check status of context */
+psm2_error_t psmi_context_check_status(const psmi_context_t *context);
+
+psm2_error_t psmi_context_interrupt_set(psmi_context_t *context, int enable);
+int psmi_context_interrupt_isenabled(psmi_context_t *context);
+
+/*
+ * round robin contexts across HFIs, then
+ * ports; this is the default.
+ * This option spreads the HFI selection within the local socket.
+ * If it is preferred to spread a job over the entire set of
+ * HFIs within the system, see ALG_ACROSS_ALL below.
+ */
+#define PSMI_UNIT_SEL_ALG_ACROSS PSM_HAL_ALG_ACROSS
+
+#define PSMI_UNIT_SEL_ALG_ACROSS_ALL PSM_HAL_ALG_ACROSS_ALL
+
+/*
+ * use all contexts on an HFI (round robin
+ * active ports within), then next HFI
+ */
+#define PSMI_UNIT_SEL_ALG_WITHIN PSM_HAL_ALG_WITHIN
+
+#endif /* PSM_CONTEXT_H */
diff --git a/psm_diags.c b/psm_diags.c
new file mode 100644
index 0000000..2a43c22
--- /dev/null
+++ b/psm_diags.c
@@ -0,0 +1,362 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm_mq_internal.h" + +typedef void (*memcpy_fn_t) (void *dst, const void *src, size_t n); +static int psmi_test_memcpy(memcpy_fn_t, const char *name); +static int psmi_test_epid_table(int numelems); + +int psmi_diags(void); + +#define diags_assert(x) do { \ + if (!(x)) { \ + _HFI_ERROR("Diags assertion failure: %s\n", \ + #x); \ + goto fail; \ + } \ + } while (0) + +#define DIAGS_RETURN_PASS(str) \ + do { _HFI_INFO("%s: PASSED %s\n", __func__, str); return 0; } \ + while (0) +#define DIAGS_RETURN_FAIL(str) \ + do { _HFI_INFO("%s: FAILED %s\n", __func__, str); return 1; } \ + while (0) + +int psmi_diags(void) +{ + int ret = 0; + ret |= psmi_test_epid_table(2048); + ret |= psmi_test_memcpy((memcpy_fn_t) psmi_memcpyo, "psmi_memcpyo"); + /* ret |= psmi_test_memcpy((memcpy_fn_t) psmi_mq_mtucpy, "psmi_mq_mtucpy"); */ + + if (ret) + DIAGS_RETURN_FAIL(""); + else + DIAGS_RETURN_PASS(""); +} + +/* + * Hash table test + */ +#define NALLOC 1024 +static int psmi_test_epid_table(int numelems) +{ + ptl_ctl_t ctl; + psm2_epaddr_t *ep_array, epaddr, ep_alloc; + psm2_epid_t *epid_array, epid_tmp; + psm2_ep_t ep = (psm2_ep_t) (uintptr_t) 0xabcdef00; + struct psmi_epid_table *tab; + int i, j; + struct drand48_data drand48_data; + + ep_alloc = + (psm2_epaddr_t) psmi_calloc(PSMI_EP_NONE, UNDEFINED, numelems, + sizeof(struct psm2_epaddr)); + ep_array = + (psm2_epaddr_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, numelems, + sizeof(struct psm2_epaddr *)); + epid_array = + (psm2_epid_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, numelems, + sizeof(psm2_epid_t)); + diags_assert(ep_alloc != NULL); + diags_assert(ep_array != NULL); + diags_assert(epid_array != NULL); + + srand48_r(12345678, &drand48_data); + + psmi_epid_init(); + tab = &psmi_epid_table; + ctl.ep = ep; + + for (i = 0; i < numelems; i++) { + epid_array[i] = i; + ep_alloc[i].ptlctl = &ctl; + ep_alloc[i].epid = epid_array[i]; + ep_array[i] = &ep_alloc[i]; + } + for (i = 0; i < numelems; i++) { + psmi_epid_add(ep, epid_array[i], ep_array[i]); + } + + /* Randomize epid_array */ + for (i = 0; i < numelems; i++) { + long int rand_result; + lrand48_r(&drand48_data, &rand_result); + j = (int)(rand_result % numelems); + epid_tmp = epid_array[i]; + epid_array[i] = epid_array[j]; + epid_array[j] = epid_tmp; + } + /* Lookup. 
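+	 * Every epid, in its shuffled order, must still resolve to the
+	 * epaddr that was inserted for it above.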
*/ + for (i = 0; i < numelems; i++) { + epaddr = psmi_epid_lookup(ep, epid_array[i]); + diags_assert(epaddr != NULL); + diags_assert(epaddr->epid == epid_array[i]); + diags_assert(epaddr->ptlctl->ep == ep); + } + + /* Randomize epid_array again */ + for (i = 0; i < numelems; i++) { + long int rand_result; + lrand48_r(&drand48_data, &rand_result); + j = (int)(rand_result % numelems); + epid_tmp = epid_array[i]; + epid_array[i] = epid_array[j]; + epid_array[j] = epid_tmp; + } + /* Delete half */ + for (i = 0; i < numelems / 2; i++) { + epaddr = psmi_epid_remove(ep, epid_array[i]); + diags_assert(epaddr != NULL); + diags_assert(epaddr->epid == epid_array[i]); + diags_assert(epaddr->ptlctl->ep == ep); + } + /* Lookup other half -- expect non-NULL, then delete */ + for (i = numelems / 2; i < numelems; i++) { + epaddr = psmi_epid_lookup(ep, epid_array[i]); + diags_assert(epaddr != NULL); + diags_assert(epaddr->epid == epid_array[i]); + diags_assert(epaddr->ptlctl->ep == ep); + epaddr = psmi_epid_remove(ep, epid_array[i]); + epaddr = psmi_epid_lookup(ep, epid_array[i]); + diags_assert(epaddr == NULL); + } + /* Lookup whole thing, expect done */ + for (i = 0; i < numelems; i++) { + epaddr = psmi_epid_lookup(ep, epid_array[i]); + diags_assert(epaddr == NULL); + } + for (i = 0; i < tab->tabsize; i++) { + diags_assert(tab->table[i].entry == NULL || + tab->table[i].entry == EPADDR_DELETED); + } + + /* Make sure we're not leaking memory somewhere... */ + diags_assert(tab->tabsize > tab->tabsize_used && + tab->tabsize * PSMI_EPID_TABLOAD_FACTOR > + tab->tabsize_used); + + /* Only free on success */ + psmi_epid_fini(); + psmi_free(epid_array); + psmi_free(ep_array); + psmi_free(ep_alloc); + DIAGS_RETURN_PASS(""); + +fail: + /* Klocwork scan report memory leak. */ + psmi_epid_fini(); + if (epid_array) + psmi_free(epid_array); + if (ep_array) + psmi_free(ep_array); + if (ep_alloc) + psmi_free(ep_alloc); + DIAGS_RETURN_FAIL(""); +} + +/* + * Memcpy correctness test + */ +static int memcpy_check_size(memcpy_fn_t fn, int *p, int *f, size_t n); +static void *memcpy_check_one(memcpy_fn_t fn, void *dst, void *src, size_t n); + +static int psmi_test_memcpy(memcpy_fn_t fn, const char *memcpy_name) +{ + const int CORNERS = 0; + const long long lo = 1; + const long long hi = 16 * 1024 * 1024; + const long long below = 32; + const long long above = 32; + long long n, m; + char buf[128]; + int ret = 0; + int memcpy_passed; + int memcpy_failed; + + memcpy_passed = 0; + memcpy_failed = 0; + + ret = memcpy_check_size(fn, &memcpy_passed, &memcpy_failed, 0); + if (ret < 0) + DIAGS_RETURN_FAIL("no heap space"); + + for (n = lo; n <= hi; n <<= 1) { + _HFI_INFO("%s %d align=0..16\n", memcpy_name, (int)n); + for (m = n - below; m <= n + above; m++) { + if (m == n) { + ret = + memcpy_check_size(fn, &memcpy_passed, + &memcpy_failed, n); + if (ret < 0) + DIAGS_RETURN_FAIL("no heap space"); + } else if (CORNERS && m >= lo && m <= hi && m > (n >> 1) + && m < max(n, ((n << 1) - below))) { + ret = + memcpy_check_size(fn, &memcpy_passed, + &memcpy_failed, + (size_t) m); + if (ret < 0) + DIAGS_RETURN_FAIL("no heap space"); + } + } + } + + int total = memcpy_passed + memcpy_failed; + if (total > 0) { + _HFI_INFO("%d memcpy tests with %d passed (%.2f%%) " + "and %d failed (%.2f%%)\n", + total, memcpy_passed, (100.0 * memcpy_passed) / total, + memcpy_failed, (100.0 * memcpy_failed) / total); + } + if (memcpy_failed) { + snprintf(buf, sizeof(buf), "%s %.2f%% of tests memcpy_failed", + memcpy_name, (100.0 * memcpy_failed) / total); + 
+		DIAGS_RETURN_FAIL(buf);
+	} else {
+		DIAGS_RETURN_PASS(memcpy_name);
+	}
+}
+
+void *memcpy_check_one(memcpy_fn_t fn, void *dst, void *src, size_t n)
+{
+	int ok = 1;
+	unsigned int seed = (unsigned int)
+	    ((uintptr_t) dst ^ (uintptr_t) src ^ (uintptr_t) n);
+	size_t i;
+	struct drand48_data drand48_data;
+
+	if (!n)
+		return dst;
+
+	memset(src, 0x55, n);
+	memset(dst, 0xaa, n);
+	srand48_r(seed, &drand48_data);
+	for (i = 0; i < n; i++) {
+		long int rand_result;
+		lrand48_r(&drand48_data, &rand_result);
+		((uint8_t *) src)[i] = (((int)(rand_result & INT_MAX)) >> 16) & 0xff;
+	}
+
+	fn(dst, src, n);
+	memset(src, 0, n);
+	srand48_r(seed, &drand48_data);
+	for (i = 0; i < n; i++) {
+		long int rand_result;
+		lrand48_r(&drand48_data, &rand_result);
+		/* re-derive the byte exactly as it was generated above */
+		int value = (((int)(rand_result & INT_MAX)) >> 16) & 0xff;
+		int v = (int)((uint8_t *) dst)[i];
+		if (v != value) {
+			_HFI_ERROR
+			    ("Error on index %llu : got %d instead of %d\n",
+			     (unsigned long long)i, v, value);
+			ok = 0;
+		}
+	}
+	return ok ? dst : NULL;
+}
+
+int memcpy_check_size(memcpy_fn_t fn, int *p, int *f, size_t n)
+{
+#define num_aligns 16
+#define USE_MALLOC 0
+#define DEBUG 0
+	uint8_t *src;
+	uint8_t *dst;
+	size_t size = n * 2 + num_aligns;
+	if (USE_MALLOC) {
+		src = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size);
+		dst = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size);
+		if (src == NULL || dst == NULL) {
+			/* free whichever of the two allocations succeeded */
+			if (src)
+				psmi_free(src);
+			if (dst)
+				psmi_free(dst);
+			return -1;
+		}
+	} else {
+		void *src_p, *dst_p;
+		if (posix_memalign(&src_p, 64, size) != 0 ||
+		    posix_memalign(&dst_p, 64, size) != 0)
+			return -1;
+		else {
+			src = (uint8_t *) src_p;
+			dst = (uint8_t *) dst_p;
+		}
+	}
+	int src_align, dst_align;
+	for (src_align = 0; src_align < num_aligns; src_align++) {
+		for (dst_align = 0; dst_align < num_aligns; dst_align++) {
+			uint8_t *d = ((uint8_t *) dst) + dst_align;
+			uint8_t *s = ((uint8_t *) src) + src_align;
+			int ok = (memcpy_check_one(fn, d, s, n) != NULL);
+			if (DEBUG || !ok) {
+				_HFI_INFO("memcpy(%p, %p, %llu) : %s\n", d, s,
+					  (unsigned long long)n,
+					  ok ? "passed" : "failed");
+			}
+			if (ok) {
+				(*p)++;
+			} else {
+				(*f)++;
+			}
+		}
+	}
+	psmi_free(src);
+	psmi_free(dst);
+	return 0;
+}
diff --git a/psm_ep.c b/psm_ep.c
new file mode 100644
index 0000000..d78431d
--- /dev/null
+++ b/psm_ep.c
@@ -0,0 +1,1569 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ + +#include +#include +#include /* cpu_set */ +#include /* isalpha */ + +#include "psm_user.h" +#include "psm2_hal.h" +#include "psm_mq_internal.h" +#include "psm_am_internal.h" + +#ifdef PSM_CUDA +#include "psm_gdrcpy.h" +#endif +/* + * Endpoint management + */ +psm2_ep_t psmi_opened_endpoint = NULL; +int psmi_opened_endpoint_count = 0; + +static psm2_error_t psmi_ep_open_device(const psm2_ep_t ep, + const struct psm2_ep_open_opts *opts, + const psm2_uuid_t unique_job_key, + struct psmi_context *context, + psm2_epid_t *epid); + +/* + * Device management + * + * PSM uses "devices" as components to manage communication to self, to peers + * reachable via shared memory and finally to peers reachable only through + * hfi. 
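+ *
+ * The default probe order is PSMI_DEVICES_DEFAULT from psm_config.h,
+ * i.e. "self,shm,hfi": loopback first, then intra-node shared memory,
+ * then the fabric.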
+ */ + +static psm2_error_t psmi_parse_devices(int devices[PTL_MAX_INIT], + const char *devstr); +static int psmi_device_is_enabled(const int devices[PTL_MAX_INIT], int devid); +int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid); + +psm2_error_t __psm2_ep_num_devunits(uint32_t *num_units_o) +{ + static int num_units = -1; + + PSM2_LOG_MSG("entering"); + + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + if (num_units == -1) { + num_units = psmi_hal_get_num_units(); + if (num_units == -1) + num_units = 0; + } + + *num_units_o = (uint32_t) num_units; + PSM2_LOG_MSG("leaving"); + return PSM2_OK; +} +PSMI_API_DECL(psm2_ep_num_devunits) + +static int cmpfunc(const void *p1, const void *p2) +{ + uint64_t a = ((uint64_t *) p1)[0]; + uint64_t b = ((uint64_t *) p2)[0]; + if (a < b) + return -1; + if (a == b) + return 0; + return 1; +} + +static psm2_error_t +psmi_ep_multirail(int *num_rails, uint32_t *unit, uint16_t *port) +{ + uint32_t num_units; + uint64_t gid_hi, gid_lo; + int i, j, ret, count = 0; + char *env; + psm2_error_t err = PSM2_OK; + uint64_t gidh[HFI_MAX_RAILS][3]; + union psmi_envvar_val env_multirail; + int multirail_within_socket_used = 0; + int node_id = -1, found = 0; + + psmi_getenv("PSM2_MULTIRAIL", + "Use all available HFIs in the system for communication.\n" + "0: Disabled (default),\n" + "1: Enable multirail across all available HFIs,\n" + "2: Enable multirail within socket.\n" + "\t For multirail within a socket, we try to find at\n" + "\t least one HFI on the same socket as current task.\n" + "\t If none found, we continue to use other HFIs within\n" + "\t the system.", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)0, + &env_multirail); + if (!env_multirail.e_int) { + *num_rails = 0; + return err; + } + + if (env_multirail.e_int == 2) + multirail_within_socket_used = 1; + +/* + * map is in format: unit:port,unit:port,... + */ + if ((env = getenv("PSM2_MULTIRAIL_MAP"))) { + if (sscanf(env, "%d:%d", &i, &j) == 2) { + char *comma = strchr(env, ','); + unit[count] = i; + port[count] = j; + count++; + while (comma) { + if (sscanf(comma, ",%d:%d", &i, &j) != 2) { + break; + } + unit[count] = i; + port[count] = j; + count++; + if (count == HFI_MAX_RAILS) + break; + comma = strchr(comma + 1, ','); + } + } + *num_rails = count; + +/* + * Check if any of the port is not usable. + */ + for (i = 0; i < count; i++) { + ret = psmi_hal_get_port_active(unit[i], port[i]); + if (ret <= 0) { + err = + psmi_handle_error(NULL, + PSM2_EP_DEVICE_FAILURE, + "Unit/port: %d:%d is not active.", + unit[i], port[i]); + return err; + } + ret = psmi_hal_get_port_lid(unit[i], port[i]); + if (ret <= 0) { + err = + psmi_handle_error(NULL, + PSM2_EP_DEVICE_FAILURE, + "Couldn't get lid for unit %d:%d", + unit[i], port[i]); + return err; + } + ret = + psmi_hal_get_port_gid(unit[i], port[i], &gid_hi, + &gid_lo); + if (ret == -1) { + err = + psmi_handle_error(NULL, + PSM2_EP_DEVICE_FAILURE, + "Couldn't get gid for unit %d:%d", + unit[i], port[i]); + return err; + } + } + + return err; + } + + if ((err = psm2_ep_num_devunits(&num_units))) { + return err; + } + if (num_units > HFI_MAX_RAILS) { + _HFI_INFO + ("Found %d units, max %d units are supported, use %d\n", + num_units, HFI_MAX_RAILS, HFI_MAX_RAILS); + num_units = HFI_MAX_RAILS; + } + + /* + * PSM2_MULTIRAIL=2 functionality- + * - Try to find at least find one HFI in the same root + * complex. If none found, continue to run and + * use remaining HFIs in the system. 
+ * - If we do find at least one HFI in same root complex, we + * go ahead and add to list. + */ + if (multirail_within_socket_used) { + node_id = psmi_get_current_proc_location(); + for (i = 0; i < num_units; i++) { + if (psmi_hal_get_unit_active(i) <= 0) + continue; + int node_id_i; + + if (!psmi_hal_get_node_id(i, &node_id_i)) { + if (node_id_i == node_id) { + found = 1; + break; + } + } + } + } +/* + * Get all the ports with a valid lid and gid, one per unit. + */ + for (i = 0; i < num_units; i++) { + int node_id_i; + + if (!psmi_hal_get_node_id(i, &node_id_i)) + { + if (multirail_within_socket_used && + found && (node_id_i != node_id)) + continue; + } + + for (j = HFI_MIN_PORT; j <= HFI_MAX_PORT; j++) { + ret = psmi_hal_get_port_lid(i, j); + if (ret <= 0) + continue; + ret = psmi_hal_get_port_gid(i, j, &gid_hi, &gid_lo); + if (ret == -1) + continue; + + gidh[count][0] = gid_hi; + gidh[count][1] = i; + gidh[count][2] = j; + count++; + break; + } + } + +/* + * Sort all the ports with gidh from small to big. + * This is for multiple fabrics, and we use fabric with the + * smallest gid to make the master connection. + */ + qsort(gidh, count, sizeof(uint64_t) * 3, cmpfunc); + + for (i = 0; i < count; i++) { + unit[i] = (uint32_t) gidh[i][1]; + port[i] = (uint16_t) (uint32_t) gidh[i][2]; + } + *num_rails = count; + return err; +} + +static psm2_error_t +psmi_ep_devlids(uint16_t **lids, uint32_t *num_lids_o, + uint64_t my_gid_hi, uint64_t my_gid_lo) +{ + static uint16_t *hfi_lids; + static uint32_t nlids; + uint32_t num_units; + int i; + psm2_error_t err = PSM2_OK; + + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + if (hfi_lids == NULL) { + if ((err = psm2_ep_num_devunits(&num_units))) + goto fail; + hfi_lids = (uint16_t *) + psmi_calloc(PSMI_EP_NONE, UNDEFINED, + num_units * psmi_hal_get_num_ports(), sizeof(uint16_t)); + if (hfi_lids == NULL) { + err = psmi_handle_error(NULL, PSM2_NO_MEMORY, + "Couldn't allocate memory for dev_lids structure"); + goto fail; + } + + for (i = 0; i < num_units; i++) { + int j; + for (j = HFI_MIN_PORT; j <= HFI_MAX_PORT; j++) { + int lid = psmi_hal_get_port_lid(i, j); + int ret; + uint64_t gid_hi = 0, gid_lo = 0; + + if (lid <= 0) + continue; + ret = psmi_hal_get_port_gid(i, j, &gid_hi, &gid_lo); + if (ret == -1) + continue; + else if (my_gid_hi != gid_hi) { + _HFI_VDBG("LID %d, unit %d, port %d, " + "mismatched GID %llx:%llx and " + "%llx:%llx\n", + lid, i, j, + (unsigned long long)gid_hi, + (unsigned long long)gid_lo, + (unsigned long long)my_gid_hi, + (unsigned long long) + my_gid_lo); + continue; + } + _HFI_VDBG("LID %d, unit %d, port %d, " + "matching GID %llx:%llx and " + "%llx:%llx\n", lid, i, j, + (unsigned long long)gid_hi, + (unsigned long long)gid_lo, + (unsigned long long)my_gid_hi, + (unsigned long long)my_gid_lo); + + hfi_lids[nlids++] = (uint16_t) lid; + } + } + if (nlids == 0) { + err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "Couldn't get lid&gid from any unit/port"); + goto fail; + } + } + *lids = hfi_lids; + *num_lids_o = nlids; + +fail: + return err; +} + +static psm2_error_t +psmi_ep_verify_pkey(psm2_ep_t ep, uint16_t pkey, uint16_t *opkey) +{ + int i, ret; + psm2_error_t err; + + for (i = 0; i < 16; i++) { + ret = psmi_hal_get_port_index2pkey(ep->unit_id, ep->portnum, i); + if (ret < 0) { + err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "Can't get a valid pkey value from pkey table\n"); + return err; + } else if ((ret & 0x7fff) == 0x7fff) { + continue; /* management pkey, not for app traffic. 
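+					   (both 0x7fff and 0xffff match
+					   this test, i.e. the limited- and
+					   full-member encodings)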
*/ + } + + if ((pkey & 0x7fff) == (uint16_t)(ret & 0x7fff)) { + break; + } + } + + /* if pkey does not match */ + if (i == 16) { + err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "Wrong pkey 0x%x, please use PSM2_PKEY to specify a valid pkey\n", + pkey); + return err; + } + + if (((uint16_t)ret & 0x8000) == 0) { + err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "Limited Member pkey 0x%x, please use PSM2_PKEY to specify a valid pkey\n", + (uint16_t)ret); + return err; + } + + /* return the final pkey */ + *opkey = (uint16_t)ret; + + return PSM2_OK; +} + +uint64_t __psm2_epid_nid(psm2_epid_t epid) +{ + uint64_t rv; + + PSM2_LOG_MSG("entering"); + rv = (uint64_t) PSMI_EPID_GET_LID(epid); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_epid_nid) + +/* Currently not exposed to users, we don't acknowledge the existence of + * subcontexts */ +uint64_t psmi_epid_subcontext(psm2_epid_t epid) +{ + return (uint64_t) PSMI_EPID_GET_SUBCONTEXT(epid); +} + +/* Currently not exposed to users, we don't acknowledge the existence of + * service levels encoding within epids. This may require + * changing to expose SLs + */ +uint64_t psmi_epid_version(psm2_epid_t epid) +{ + return (uint64_t) PSMI_EPID_GET_EPID_VERSION(epid); +} + +uint64_t __psm2_epid_context(psm2_epid_t epid) +{ + uint64_t rv; + + PSM2_LOG_MSG("entering"); + rv = (uint64_t) PSMI_EPID_GET_CONTEXT(epid); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_epid_context) + +uint64_t __psm2_epid_port(psm2_epid_t epid) +{ + uint64_t rv; + PSM2_LOG_MSG("entering"); + rv = __psm2_epid_context(epid); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_epid_port) + +psm2_error_t __psm2_ep_query(int *num_of_epinfo, psm2_epinfo_t *array_of_epinfo) +{ + psm2_error_t err = PSM2_OK; + int i; + psm2_ep_t ep; + + PSM2_LOG_MSG("entering"); + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + if (*num_of_epinfo <= 0) { + err = psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Invalid psm2_ep_query parameters"); + PSM2_LOG_MSG("leaving"); + return err; + } + + if (psmi_opened_endpoint == NULL) { + err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED, + "PSM Endpoint is closed or does not exist"); + PSM2_LOG_MSG("leaving"); + return err; + } + + ep = psmi_opened_endpoint; + for (i = 0; i < *num_of_epinfo; i++) { + if (ep == NULL) + break; + array_of_epinfo[i].ep = ep; + array_of_epinfo[i].epid = ep->epid; + array_of_epinfo[i].jkey = ep->jkey; + memcpy(array_of_epinfo[i].uuid, + (void *)ep->uuid, sizeof(psm2_uuid_t)); + psmi_uuid_unparse(ep->uuid, array_of_epinfo[i].uuid_str); + ep = ep->user_ep_next; + } + *num_of_epinfo = i; + PSM2_LOG_MSG("leaving"); + return err; +} +PSMI_API_DECL(psm2_ep_query) + +psm2_error_t __psm2_ep_epid_lookup(psm2_epid_t epid, psm2_epconn_t *epconn) +{ + psm2_error_t err = PSM2_OK; + psm2_epaddr_t epaddr; + psm2_ep_t ep; + + PSM2_LOG_MSG("entering"); + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + /* Need to have an opened endpoint before we can resolve epids */ + if (psmi_opened_endpoint == NULL) { + err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED, + "PSM Endpoint is closed or does not exist"); + PSM2_LOG_MSG("leaving"); + return err; + } + + ep = psmi_opened_endpoint; + while (ep) { + epaddr = psmi_epid_lookup(ep, epid); + if (!epaddr) { + ep = ep->user_ep_next; + continue; + } + + /* Found connection for epid. Return info about endpoint to caller. 
*/ + psmi_assert_always(epaddr->ptlctl->ep == ep); + epconn->addr = epaddr; + epconn->ep = ep; + epconn->mq = ep->mq; + PSM2_LOG_MSG("leaving"); + return err; + } + + err = psmi_handle_error(NULL, PSM2_EPID_UNKNOWN, + "Endpoint connection status unknown"); + PSM2_LOG_MSG("leaving"); + return err; +} +PSMI_API_DECL(psm2_ep_epid_lookup); + +psm2_error_t __psm2_ep_epid_lookup2(psm2_ep_t ep, psm2_epid_t epid, psm2_epconn_t *epconn) +{ + psm2_error_t err = PSM2_OK; + + PSM2_LOG_MSG("entering"); + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + /* Need to have an opened endpoint before we can resolve epids */ + if (ep == NULL) { + err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED, + "PSM Endpoint is closed or does not exist"); + PSM2_LOG_MSG("leaving"); + return err; + } + + if (epconn == NULL) { + err = psmi_handle_error(ep, PSM2_PARAM_ERR, + "Invalid output parameter"); + PSM2_LOG_MSG("leaving"); + return err; + } + + psm2_epaddr_t epaddr = psmi_epid_lookup(ep, epid); + if (epaddr) { + /* Found connection for epid. Return info about endpoint to caller. */ + psmi_assert_always(epaddr->ptlctl->ep == ep); + epconn->addr = epaddr; + epconn->ep = ep; + epconn->mq = ep->mq; + PSM2_LOG_MSG("leaving"); + return err; + } + + err = psmi_handle_error(ep, PSM2_EPID_UNKNOWN, + "Endpoint connection status unknown"); + PSM2_LOG_MSG("leaving"); + return err; +} +PSMI_API_DECL(psm2_ep_epid_lookup2); + +psm2_error_t __psm2_epaddr_to_epid(psm2_epaddr_t epaddr, psm2_epid_t *epid) +{ + psm2_error_t err = PSM2_OK; + PSM2_LOG_MSG("entering"); + if (epaddr && epid) { + *epid = epaddr->epid; + } + else { + err = psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Invalid input epaddr or output epid parameter"); + } + PSM2_LOG_MSG("leaving"); + return err; +} +PSMI_API_DECL(psm2_epaddr_to_epid); + +psm2_error_t +__psm2_ep_epid_share_memory(psm2_ep_t ep, psm2_epid_t epid, int *result_o) +{ + uint32_t num_lids = 0; + uint16_t *lids = NULL; + int i; + uint16_t epid_lid; + int result = 0; + psm2_error_t err; + + PSM2_LOG_MSG("entering"); + psmi_assert_always(ep != NULL); + PSMI_ERR_UNLESS_INITIALIZED(ep); + + if ((!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) || + (psmi_epid_version(epid) == PSMI_EPID_VERSION_SHM)) { + /* If we are in the no hfi-mode, or the other process is, + * the epid doesn't help us - so assume both we're on the same + * machine and try to connect. + */ + result = 1; + } else { + epid_lid = (uint16_t) psm2_epid_nid(epid); + err = psmi_ep_devlids(&lids, &num_lids, ep->gid_hi, ep->gid_lo); + if (err) { + PSM2_LOG_MSG("leaving"); + return err; + } + for (i = 0; i < num_lids; i++) { + if (epid_lid == lids[i]) { + /* we share memory if the lid is the same. */ + result = 1; + break; + } + } + } + *result_o = result; + PSM2_LOG_MSG("leaving"); + return PSM2_OK; +} +PSMI_API_DECL(psm2_ep_epid_share_memory) + +psm2_error_t __psm2_ep_open_opts_get_defaults(struct psm2_ep_open_opts *opts) +{ + PSM2_LOG_MSG("entering"); + + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + if (!opts) + return PSM2_PARAM_ERR; + + /* Set in order in the structure. 
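+ * A typical caller first fetches these defaults, overrides a field
+ * or two, and passes the struct to psm2_ep_open(); an illustrative
+ * sketch:
+ *
+ *   struct psm2_ep_open_opts opts;
+ *   psm2_ep_open_opts_get_defaults(&opts);
+ *   opts.timeout = 60000000000LL;  // 60 s instead of the 30 s default
+ *   err = psm2_ep_open(job_key, &opts, &ep, &epid);
+ *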
*/ + opts->timeout = 30000000000LL; /* 30 sec */ + opts->unit = HFI_UNIT_ID_ANY; + opts->affinity = PSM2_EP_OPEN_AFFINITY_SET; + opts->shm_mbytes = 0; /* deprecated in psm2.h */ + opts->sendbufs_num = 1024; + opts->network_pkey = psmi_hal_get_default_pkey(); + opts->port = HFI_PORT_NUM_ANY; + opts->outsl = PSMI_SL_DEFAULT; + opts->service_id = HFI_DEFAULT_SERVICE_ID; + opts->path_res_type = PSM2_PATH_RES_NONE; + opts->senddesc_num = 4096; + opts->imm_size = 128; + PSM2_LOG_MSG("leaving"); + return PSM2_OK; +} +PSMI_API_DECL(psm2_ep_open_opts_get_defaults) + +psm2_error_t psmi_poll_noop(ptl_t *ptl, int replyonly); + +psm2_error_t +__psm2_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, + struct psm2_ep_open_opts const *opts_i, psm2_mq_t mq, + psm2_ep_t *epo, psm2_epid_t *epido) +{ + psm2_ep_t ep = NULL; + uint32_t num_units; + size_t len; + psm2_error_t err; + psm2_epaddr_t epaddr = NULL; + char buf[128], *p, *e; + union psmi_envvar_val envvar_val; + size_t ptl_sizes; + struct psm2_ep_open_opts opts; + ptl_t *amsh_ptl, *ips_ptl, *self_ptl; + int i; + + /* First get the set of default options, we overwrite with the user's + * desired values afterwards */ + if ((err = psm2_ep_open_opts_get_defaults(&opts))) + goto fail; + + if (opts_i != NULL) { + if (opts_i->timeout != -1) + opts.timeout = opts_i->timeout; + if (opts_i->unit != -1) + opts.unit = opts_i->unit; + if (opts_i->affinity != -1) + opts.affinity = opts_i->affinity; + + if (opts_i->sendbufs_num != -1) + opts.sendbufs_num = opts_i->sendbufs_num; + + if (opts_i->network_pkey != psmi_hal_get_default_pkey()) + opts.network_pkey = opts_i->network_pkey; + + if (opts_i->port != 0) + opts.port = opts_i->port; + + if (opts_i->outsl != -1) + opts.outsl = opts_i->outsl; + + if (opts_i->service_id) + opts.service_id = (uint64_t) opts_i->service_id; + if (opts_i->path_res_type != PSM2_PATH_RES_NONE) + opts.path_res_type = opts_i->path_res_type; + + if (opts_i->senddesc_num) + opts.senddesc_num = opts_i->senddesc_num; + if (opts_i->imm_size) + opts.imm_size = opts_i->imm_size; + } + + /* Get Service ID from environment */ + if (!psmi_getenv("PSM2_IB_SERVICE_ID", + "HFI Service ID for path resolution", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_ULONG_ULONG, + (union psmi_envvar_val)HFI_DEFAULT_SERVICE_ID, + &envvar_val)) { + opts.service_id = (uint64_t) envvar_val.e_ulonglong; + } + + /* Get Path resolution type from environment Possible choices are: + * + * NONE : Default same as previous instances. Utilizes static data. + * OPP : Use OFED Plus Plus library to do path record queries. + * UMAD : Use raw libibumad interface to form and process path records. + */ + if (!psmi_getenv("PSM2_PATH_REC", + "Mechanism to query HFI path record (default is no path query)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)"none", &envvar_val)) { + if (!strcasecmp(envvar_val.e_str, "none")) + opts.path_res_type = PSM2_PATH_RES_NONE; + else if (!strcasecmp(envvar_val.e_str, "opp")) + opts.path_res_type = PSM2_PATH_RES_OPP; + else if (!strcasecmp(envvar_val.e_str, "umad")) + opts.path_res_type = PSM2_PATH_RES_UMAD; + else { + _HFI_ERROR("Unknown path resolution type %s. " + "Disabling use of path record query.\n", + envvar_val.e_str); + opts.path_res_type = PSM2_PATH_RES_NONE; + } + } + + /* If a specific unit is set in the environment, use that one. 
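+ * (e.g. HFI_UNIT=0 HFI_PORT=1 pins the endpoint to unit 0, port 1,
+ * overriding whatever opts_i carried; the -1/0 defaults below keep
+ * autodetection.)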
*/
+ if (!psmi_getenv("HFI_UNIT", "Device Unit number (-1 autodetects)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG,
+ (union psmi_envvar_val)HFI_UNIT_ID_ANY, &envvar_val)) {
+ opts.unit = envvar_val.e_long;
+ }
+
+ /* Get user specified port number to use. */
+ if (!psmi_getenv("HFI_PORT", "IB Port number (0 autodetects)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG,
+ (union psmi_envvar_val)HFI_PORT_NUM_ANY,
+ &envvar_val)) {
+ opts.port = envvar_val.e_long;
+ }
+
+ /* Get service level from environment, path-query overrides it */
+ if (!psmi_getenv
+ ("HFI_SL", "HFI outgoing ServiceLevel number (default 0)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG,
+ (union psmi_envvar_val)PSMI_SL_DEFAULT, &envvar_val)) {
+ opts.outsl = envvar_val.e_long;
+ }
+
+ /* Get network key from environment. MVAPICH and other vendor MPIs do not
+ * specify it on ep open and we may require it for vFabrics.
+ * path-query will override it.
+ */
+ if (!psmi_getenv("PSM2_PKEY",
+ "HFI PKey to use for endpoint",
+ PSMI_ENVVAR_LEVEL_USER,
+ PSMI_ENVVAR_TYPE_ULONG,
+ (union psmi_envvar_val)((unsigned int)(psmi_hal_get_default_pkey())),
+ &envvar_val)) {
+ opts.network_pkey = (uint64_t) envvar_val.e_ulong;
+ }
+
+ /* BACKWARDS COMPATIBILITY: Open MPI likes to choose its own PKEY of
+ 0x7FFF. That's no longer a valid default, so override it if the
+ client was compiled against PSM v1 */
+ if (PSMI_VERNO_GET_MAJOR(psmi_verno_client()) < 2 &&
+ opts.network_pkey == 0x7FFF) {
+ opts.network_pkey = psmi_hal_get_default_pkey();
+ }
+
+ /* Get number of default send buffers from environment */
+ if (!psmi_getenv("PSM2_NUM_SEND_BUFFERS",
+ "Number of send buffers to allocate [1024]",
+ PSMI_ENVVAR_LEVEL_USER,
+ PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)1024, &envvar_val)) {
+ opts.sendbufs_num = envvar_val.e_uint;
+ }
+
+ /* Get immediate data size - transfers less than immediate data size do
+ * not consume a send buffer and require just a send descriptor.
+ */
+ if (!psmi_getenv("PSM2_SEND_IMMEDIATE_SIZE",
+ "Immediate data send size not requiring a buffer [128]",
+ PSMI_ENVVAR_LEVEL_USER,
+ PSMI_ENVVAR_TYPE_UINT,
+ (union psmi_envvar_val)128, &envvar_val)) {
+ opts.imm_size = envvar_val.e_uint;
+ }
+
+ /* Get number of send descriptors - by default this is 4 times the number
+ * of send buffers - mainly used for short/inlined messages.
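+ * (With the defaults that is 1024 buffers and 4096 descriptors, the
+ * 4x ratio; PSM2_NUM_SEND_DESCRIPTORS below overrides the count
+ * directly rather than the ratio.)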
+ */ + if (!psmi_getenv("PSM2_NUM_SEND_DESCRIPTORS", + "Number of send descriptors to allocate [4096]", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)4096, &envvar_val)) { + opts.senddesc_num = envvar_val.e_uint; + } + + if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { + if ((err = psm2_ep_num_devunits(&num_units)) != PSM2_OK) + goto fail; + } else + num_units = 0; + + /* do some error checking */ + if (opts.timeout < -1) { + err = psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Invalid timeout value %lld", + (long long)opts.timeout); + goto fail; + } else if (num_units && (opts.unit < -1 || opts.unit >= (int)num_units)) { + err = psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Invalid Device Unit ID %d (%d units found)", + opts.unit, num_units); + goto fail; + } else if ((opts.port < HFI_MIN_PORT || opts.port > HFI_MAX_PORT) && + opts.port != HFI_PORT_NUM_ANY) { + err = psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Invalid Device port number %d", + opts.port); + goto fail; + } else if (opts.affinity < 0 + || opts.affinity > PSM2_EP_OPEN_AFFINITY_FORCE) { + err = + psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Invalid Affinity option: %d", + opts.affinity); + goto fail; + } else if (opts.outsl < PSMI_SL_MIN || opts.outsl > PSMI_SL_MAX) { + err = psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Invalid SL number: %lld", + (unsigned long long)opts.outsl); + goto fail; + } + + /* Set environment variable if PSM is not allowed to set affinity */ + if (opts.affinity == PSM2_EP_OPEN_AFFINITY_SKIP) + setenv("HFI_NO_CPUAFFINITY", "1", 1); + + /* Allocate end point structure storage */ + ptl_sizes = + (psmi_device_is_enabled(devid_enabled, PTL_DEVID_SELF) ? + psmi_ptl_self.sizeof_ptl() : 0) + + (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS) ? + psmi_ptl_ips.sizeof_ptl() : 0) + + (psmi_device_is_enabled(devid_enabled, PTL_DEVID_AMSH) ? + psmi_ptl_amsh.sizeof_ptl() : 0); + if (ptl_sizes == 0) + return PSM2_EP_NO_DEVICE; + + ep = (psm2_ep_t) psmi_memalign(PSMI_EP_NONE, UNDEFINED, 64, + sizeof(struct psm2_ep) + ptl_sizes); + epaddr = (psm2_epaddr_t) psmi_calloc(PSMI_EP_NONE, PER_PEER_ENDPOINT, + 1, sizeof(struct psm2_epaddr)); + if (ep == NULL || epaddr == NULL) { + err = psmi_handle_error(NULL, PSM2_NO_MEMORY, + "Couldn't allocate memory for %s structure", + ep == NULL ? "psm2_ep" : "psm2_epaddr"); + goto fail; + } + memset(ep, 0, sizeof(struct psm2_ep) + ptl_sizes); + + /* Copy PTL enabled status */ + for (i = 0; i < PTL_MAX_INIT; i++) + ep->devid_enabled[i] = devid_enabled[i]; + + /* Matched Queue initialization. We do this early because we have to + * make sure ep->mq exists and is valid before calling ips_do_work. 
+ */ + ep->mq = mq; + + /* Get ready for PTL initialization */ + memcpy(&ep->uuid, (void *)unique_job_key, sizeof(psm2_uuid_t)); + ep->epaddr = epaddr; + ep->memmode = mq->memmode; + ep->hfi_num_sendbufs = opts.sendbufs_num; + ep->service_id = opts.service_id; + ep->path_res_type = opts.path_res_type; + ep->hfi_num_descriptors = opts.senddesc_num; + ep->hfi_imm_size = opts.imm_size; + ep->errh = psmi_errhandler_global; /* by default use the global one */ + ep->ptl_amsh.ep_poll = psmi_poll_noop; + ep->ptl_ips.ep_poll = psmi_poll_noop; + ep->connections = 0; + + /* See how many iterations we want to spin before yielding */ + psmi_getenv("PSM2_YIELD_SPIN_COUNT", + "Spin poll iterations before yield", + PSMI_ENVVAR_LEVEL_HIDDEN, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD, + &envvar_val); + ep->yield_spin_cnt = envvar_val.e_uint; + + ptl_sizes = 0; + amsh_ptl = ips_ptl = self_ptl = NULL; + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { + amsh_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes); + ptl_sizes += psmi_ptl_amsh.sizeof_ptl(); + } + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { + ips_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes); + ptl_sizes += psmi_ptl_ips.sizeof_ptl(); + } + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) { + self_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes); + ptl_sizes += psmi_ptl_self.sizeof_ptl(); + } + + if ((err = psmi_ep_open_device(ep, &opts, unique_job_key, + &(ep->context), &ep->epid))) + goto fail; + + psmi_assert_always(ep->epid != 0); + ep->epaddr->epid = ep->epid; + + _HFI_VDBG("psmi_ep_open_device() passed\n"); + + /* Set our new label as soon as we know what it is */ + strncpy(buf, psmi_gethostname(), sizeof(buf) - 1); + buf[sizeof(buf) - 1] = '\0'; + + p = buf + strlen(buf); + + /* If our rank is set, use it. If not, use context.subcontext notation */ + if (((e = getenv("MPI_RANKID")) != NULL && *e) || + ((e = getenv("PSC_MPI_RANK")) != NULL && *e)) + len = snprintf(p, sizeof(buf) - strlen(buf), ":%d.", atoi(e)); + else + len = snprintf(p, sizeof(buf) - strlen(buf), ":%d.%d.", + (uint32_t) psm2_epid_context(ep->epid), + (uint32_t) psmi_epid_subcontext(ep->epid)); + *(p + len) = '\0'; + ep->context_mylabel = psmi_strdup(ep, buf); + if (ep->context_mylabel == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + /* hfi_set_mylabel(ep->context_mylabel); */ + + if ((err = psmi_epid_set_hostname(psm2_epid_nid(ep->epid), buf, 0))) + goto fail; + + _HFI_VDBG("start ptl device init...\n"); + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) { + if ((err = psmi_ptl_self.init(ep, self_ptl, &ep->ptl_self))) + goto fail; + } + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { + if ((err = psmi_ptl_ips.init(ep, ips_ptl, &ep->ptl_ips))) + goto fail; + } + /* If we're shm-only, this device is enabled above */ + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { + if ((err = psmi_ptl_amsh.init(ep, amsh_ptl, &ep->ptl_amsh))) + goto fail; + } else { + /* We may have pre-attached as part of getting our rank for enabling + * shared contexts. */ + } + + _HFI_VDBG("finish ptl device init...\n"); + + /* + * Keep only IPS since only IPS support multi-rail, other devices + * are only setup once. IPS device can come to this function again. 
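+ * After the loop below, a later call for an additional rail sees
+ * only PTL_DEVID_IPS entries in devid_enabled[], so the self and
+ * amsh devices are initialized only once.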
+ */ + for (i = 0; i < PTL_MAX_INIT; i++) { + if (devid_enabled[i] != PTL_DEVID_IPS) { + devid_enabled[i] = -1; + } + } + + *epido = ep->epid; + *epo = ep; + + return PSM2_OK; + +fail: + if (ep != NULL) { + psmi_hal_close_context(&ep->context.psm_hw_ctxt); + psmi_free(ep); + } + if (epaddr != NULL) + psmi_free(epaddr); + return err; +} + +psm2_error_t +__psm2_ep_open(psm2_uuid_t const unique_job_key, + struct psm2_ep_open_opts const *opts_i, psm2_ep_t *epo, + psm2_epid_t *epido) +{ + psm2_error_t err; + psm2_mq_t mq; + psm2_epid_t epid; + psm2_ep_t ep, tmp; + uint32_t units[HFI_MAX_RAILS]; + uint16_t ports[HFI_MAX_RAILS]; + int i, num_rails = 0; + char *uname = "HFI_UNIT"; + char *pname = "HFI_PORT"; + char uvalue[6], pvalue[6]; + int devid_enabled[PTL_MAX_INIT]; + union psmi_envvar_val devs; + + PSM2_LOG_MSG("entering"); + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + if (!epo || !epido) + return PSM2_PARAM_ERR; + + /* Allowing only one EP (unless explicitly enabled). */ + if (psmi_opened_endpoint_count > 0 && !psmi_multi_ep_enabled) { + PSM2_LOG_MSG("leaving"); + return PSM2_TOO_MANY_ENDPOINTS; + } + + /* Matched Queue initialization. We do this early because we have to + * make sure ep->mq exists and is valid before calling ips_do_work. + */ + err = psmi_mq_malloc(&mq); + PSMI_LOCK(psmi_creation_lock); + if (err != PSM2_OK) + goto fail; + + /* Set some of the MQ thresholds from the environment. + Do this before ptl initialization - the ptl may have other + constraints that will limit the MQ's settings. */ + err = psmi_mq_initialize_defaults(mq); + if (err != PSM2_OK) + goto fail; + + psmi_init_lock(&(mq->progress_lock)); + + /* See which ptl devices we want to use for this ep to be opened */ + psmi_getenv("PSM2_DEVICES", + "Ordered list of PSM-level devices", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)PSMI_DEVICES_DEFAULT, &devs); + + if ((err = psmi_parse_devices(devid_enabled, devs.e_str))) + goto fail; + + if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { + err = psmi_ep_multirail(&num_rails, units, ports); + if (err != PSM2_OK) + goto fail; + + /* If multi-rail is used, set the first ep unit/port */ + if (num_rails > 0) { + snprintf(uvalue, 6, "%1d", units[0]); + snprintf(pvalue, 6, "%1d", ports[0]); + setenv(uname, uvalue, 1); + setenv(pname, pvalue, 1); + } + } + +#ifdef PSM_CUDA + if (PSMI_IS_GDR_COPY_ENABLED) + hfi_gdr_open(); +#endif + + err = __psm2_ep_open_internal(unique_job_key, + devid_enabled, opts_i, mq, &ep, &epid); + if (err != PSM2_OK) + goto fail; + + if (psmi_opened_endpoint == NULL) { + psmi_opened_endpoint = ep; + } else { + tmp = psmi_opened_endpoint; + while (tmp->user_ep_next) + tmp = tmp->user_ep_next; + tmp->user_ep_next = ep; + } + psmi_opened_endpoint_count++; + ep->mctxt_prev = ep->mctxt_next = ep; + ep->mctxt_master = ep; + mq->ep = ep; + + /* Active Message initialization */ + err = psmi_am_init_internal(ep); + if (err != PSM2_OK) + goto fail; + + *epo = ep; + *epido = epid; + + if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { + for (i = 1; i < num_rails; i++) { + snprintf(uvalue, 6, "%1d", units[i]); + snprintf(pvalue, 6, "%1d", ports[i]); + setenv(uname, uvalue, 1); + setenv(pname, pvalue, 1); + + /* Create slave EP */ + err = __psm2_ep_open_internal(unique_job_key, + devid_enabled, opts_i, mq, + &tmp, &epid); + if (err) + goto fail; + + /* Point back to shared resources on the master EP */ + tmp->am_htable = ep->am_htable; + + /* Link slave EP after master EP. 
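+ * (PSM_MCTXT_APPEND in psm_ep.h inserts tmp just before ep in the
+ * circular mctxt ring, so slave rails sit behind the master and
+ * are torn down first in psm2_ep_close.)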
*/ + PSM_MCTXT_APPEND(ep, tmp); + } + } + + _HFI_VDBG("psm2_ep_open() OK....\n"); + +fail: + PSMI_UNLOCK(psmi_creation_lock); + PSM2_LOG_MSG("leaving"); + return err; +} +PSMI_API_DECL(psm2_ep_open) + +psm2_error_t __psm2_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in) +{ + psm2_error_t err = PSM2_OK; +#if _HFI_DEBUGGING + uint64_t t_start = 0; + if (_HFI_PRDBG_ON) { + t_start = get_cycles(); + } +#endif + +#ifdef PSM_CUDA + /* + * The close on the gdr fd needs to be called before the + * close on the hfi fd as the the gdr device will hold + * reference count on the hfi device which will make the close + * on the hfi fd return without actually closing the fd. + */ + if (PSMI_IS_GDR_COPY_ENABLED) + hfi_gdr_close(); +#endif + union psmi_envvar_val timeout_intval; + psm2_ep_t tmp; + psm2_mq_t mmq; + + PSM2_LOG_MSG("entering"); + PSMI_ERR_UNLESS_INITIALIZED(ep); + psmi_assert_always(ep->mctxt_master == ep); + + PSMI_LOCK(psmi_creation_lock); + + if (psmi_opened_endpoint == NULL) { + err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED, + "PSM Endpoint is closed or does not exist"); + PSM2_LOG_MSG("leaving"); + PSMI_UNLOCK(psmi_creation_lock); + return err; + } + + tmp = psmi_opened_endpoint; + while (tmp && tmp != ep) { + tmp = tmp->user_ep_next; + } + if (!tmp) { + err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED, + "PSM Endpoint is closed or does not exist"); + PSM2_LOG_MSG("leaving"); + PSMI_UNLOCK(psmi_creation_lock); + return err; + } + + psmi_getenv("PSM2_CLOSE_TIMEOUT", + "End-point close timeout over-ride.", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)0, &timeout_intval); + + if (getenv("PSM2_CLOSE_TIMEOUT")) { + timeout_in = timeout_intval.e_uint * SEC_ULL; + } else if (timeout_in > 0) { + /* The timeout parameter provides the minimum timeout. A heuristic + * is used to scale up the timeout linearly with the number of + * endpoints, and we allow one second per 100 endpoints. */ + timeout_in = max(timeout_in, (ep->connections * SEC_ULL) / 100); + } + + if (timeout_in > 0 && timeout_in < PSMI_MIN_EP_CLOSE_TIMEOUT) + timeout_in = PSMI_MIN_EP_CLOSE_TIMEOUT; + + /* Infinite and excessive close time-out are limited here to a max. + * The "rationale" is that there is no point waiting around forever for + * graceful termination. Normal (or forced) process termination should clean + * up the context state correctly even if termination is not graceful. */ + if (timeout_in <= 0 || timeout_in > PSMI_MAX_EP_CLOSE_TIMEOUT) + timeout_in = PSMI_MAX_EP_CLOSE_TIMEOUT; + _HFI_PRDBG("Closing endpoint %p with force=%s and to=%.2f seconds and " + "%d connections\n", + ep, mode == PSM2_EP_CLOSE_FORCE ? "YES" : "NO", + (double)timeout_in / 1e9, (int)ep->connections); + + /* XXX We currently cheat in the sense that we leave each PTL the allowed + * timeout. There's no good way to do this until we change the PTL + * interface to allow asynchronous finalization + */ + + + /* Check if transfer ownership of receive thread is needed before closing ep. + * In case of PSM2_MULTI_EP support receive thread is created and assigned + * to first opened endpoint. Receive thread is killed when closing this + * endpoint. + */ + if (ep->user_ep_next != NULL) { + /* Receive thread will be transfered and assigned to ep->user_ep_next + * only if currently working receive thread (which will be killed) is + * assigned to ep and there isn't any assigned to ep->user_ep_next. 
+ */
+ if ((psmi_ptl_ips_rcvthread.is_enabled(ep->ptl_ips.ptl)) &&
+ (!psmi_ptl_ips_rcvthread.is_enabled(ep->user_ep_next->ptl_ips.ptl)))
+ psmi_ptl_ips_rcvthread.transfer_ownership(ep->ptl_ips.ptl, ep->user_ep_next->ptl_ips.ptl);
+ }
+
+ /*
+ * Before freeing the master ep itself,
+ * remove it from the global linklist.
+ * We do it here to let the atexit handler in the ptl_am directory
+ * search the global linklist and free the shared memory file.
+ */
+ if (psmi_opened_endpoint == ep) {
+ /* Removing ep from global endpoint list. */
+ psmi_opened_endpoint = ep->user_ep_next;
+ } else {
+ tmp = psmi_opened_endpoint;
+ while (tmp->user_ep_next != ep) {
+ tmp = tmp->user_ep_next;
+ }
+ /* Removing ep from global endpoint list. */
+ tmp->user_ep_next = ep->user_ep_next;
+ }
+ psmi_opened_endpoint_count--;
+
+ /*
+ * This do/while loop is used to close and free memory of endpoints.
+ *
+ * If the MULTIRAIL feature is disabled this loop makes only one pass
+ * and only the endpoint passed to psm2_ep_close will be closed/removed.
+ *
+ * If the MULTIRAIL feature is enabled then this loop makes multiple
+ * passes (depending on the number of rails). The order in which
+ * endpoints will be closed is shown below:
+ *
+ * |--this is the master endpoint in case of multirail
+ * | this endpoint is passed to psm2_ep_close and
+ * V this is the only endpoint known to the user.
+ * +<-Ep0<-Ep1<-Ep2<-Ep3
+ * |__________________| Ep3->mctxt_prev points to Ep2
+ * (3) (2) (1) (4) Ep2->mctxt_prev points to Ep1
+ * ^ Ep1->mctxt_prev points to Ep0
+ * | Ep0->mctxt_prev points to Ep3 (master ep)
+ * |
+ * |---- order in which endpoints will be closed.
+ *
+ * Closing MULTIRAIL endpoints starts with the slaves (Ep2, Ep1, Ep0).
+ * If MULTIRAIL is enabled then Ep3->mctxt_prev points to Ep2; if the
+ * feature is disabled then Ep3->mctxt_prev points to Ep3 and the
+ * do/while loop makes one pass.
+ *
+ * With MULTIRAIL enabled, Ep3, the master endpoint, is closed last.
+ */
+ mmq = ep->mq;
+ tmp = ep->mctxt_prev;
+ do {
+ ep = tmp;
+ tmp = ep->mctxt_prev;
+
+ PSMI_LOCK(ep->mq->progress_lock);
+
+ PSM_MCTXT_REMOVE(ep);
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH))
+ err =
+ psmi_ptl_amsh.fini(ep->ptl_amsh.ptl, mode,
+ timeout_in);
+
+ if ((err == PSM2_OK || err == PSM2_TIMEOUT) &&
+ psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS))
+ err =
+ psmi_ptl_ips.fini(ep->ptl_ips.ptl, mode,
+ timeout_in);
+
+ /* If there are timeouts in the disconnect requests,
+ * still make sure that we get to close the
+ * endpoint and mark it closed */
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS))
+ psmi_context_close(&ep->context);
+
+ psmi_free(ep->epaddr);
+ psmi_free(ep->context_mylabel);
+
+ PSMI_UNLOCK(ep->mq->progress_lock);
+
+ ep->mq = NULL;
+ psmi_free(ep);
+
+ } while ((err == PSM2_OK || err == PSM2_TIMEOUT) && tmp != ep);
+
+ if (mmq)
+ err = psmi_mq_free(mmq);
+
+
+ PSMI_UNLOCK(psmi_creation_lock);
+
+ if (_HFI_PRDBG_ON) {
+ _HFI_PRDBG_ALWAYS("Closed endpoint in %.3f secs\n",
+ (double)cycles_to_nanosecs(get_cycles() -
+ t_start) / SEC_ULL);
+ }
+ PSM2_LOG_MSG("leaving");
+ return err;
+}
+PSMI_API_DECL(psm2_ep_close)
+
+static
+psm2_error_t
+psmi_ep_open_device(const psm2_ep_t ep,
+ const struct psm2_ep_open_opts *opts,
+ const psm2_uuid_t unique_job_key,
+ struct psmi_context *context, psm2_epid_t *epid)
+{
+ psm2_error_t err = PSM2_OK;
+
+ /* Skip affinity. No affinity if:
+ * 1. User explicitly sets no-affinity=YES in environment.
+ * 2.
User doesn't set affinity in environment and PSM is opened with
+ * option affinity skip.
+ */
+ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+ uint32_t rcvthread_flags;
+ union psmi_envvar_val env_rcvthread;
+ static int norcvthread; /* only for first rail */
+
+ ep->out_sl = opts->outsl;
+
+ if ((err =
+ psmi_context_open(ep, opts->unit, opts->port,
+ unique_job_key, opts->timeout,
+ context)) != PSM2_OK)
+ goto fail;
+
+ _HFI_DBG("[%d]use unit %d port %d\n", getpid(),
+ psmi_hal_get_unit_id(ep->context.psm_hw_ctxt), 1);
+
+ /* At this point, we have the unit id and port number, so
+ * check that the pkey is not 0x0/0x7fff/0xffff, and match it
+ * against one of the pkeys in the table.
+ */
+ if ((err =
+ psmi_ep_verify_pkey(ep, (uint16_t) opts->network_pkey,
+ &ep->network_pkey)) != PSM2_OK)
+ goto fail;
+
+ /* See if we want to activate support for receive thread */
+ psmi_getenv("PSM2_RCVTHREAD",
+ "Recv thread flags (0 disables thread)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)(norcvthread++ ? 0 :
+ PSMI_RCVTHREAD_FLAGS),
+ &env_rcvthread);
+ rcvthread_flags = env_rcvthread.e_uint;
+
+ /* If enabled, use the pollurg capability to implement a receive
+ * interrupt thread that can handle urgent packets */
+ if (rcvthread_flags) {
+ psmi_hal_add_sw_status(PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD);
+#ifdef PSMI_PLOCK_IS_NOLOCK
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "#define PSMI_PLOCK_IS_NOLOCK not functional yet "
+ "with RCVTHREAD on");
+#endif
+ }
+ context->rcvthread_flags = rcvthread_flags;
+
+ *epid = context->epid;
+ } else if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
+ int rank;
+
+ /* In shm-only mode, we need to derive a valid epid
+ * based on our rank. We try to get it from the
+ * environment if it's available, or resort to using
+ * our PID as the rank.
+ */
+ union psmi_envvar_val env_rankid;
+
+ if (psmi_getenv
+ ("MPI_LOCALRANKID", "Shared context rankid",
+ PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
+ (union psmi_envvar_val)-1, &env_rankid)) {
+ if (psmi_getenv
+ ("PSC_MPI_NODE_RANK",
+ "Shared context rankid",
+ PSMI_ENVVAR_LEVEL_HIDDEN,
+ PSMI_ENVVAR_TYPE_INT,
+ (union psmi_envvar_val)-1, &env_rankid)) {
+ rank = getpid();
+ } else
+ rank = env_rankid.e_int;
+ } else
+ rank = env_rankid.e_int;
+
+ /*
+ * We use a LID of 0 for non-HFI communication.
+ * Since a jobkey is not available from IPS, pull the
+ * first 16 bits from the UUID.
+ */
+ switch (PSMI_EPID_VERSION) {
+ case PSMI_EPID_V1:
+ *epid = PSMI_EPID_PACK_V1(((uint16_t *) unique_job_key)[0],
+ (rank >> 3), rank, 0,
+ PSMI_EPID_VERSION_SHM, rank);
+ break;
+ case PSMI_EPID_V2:
+ /* Construct epid for this Endpoint */
+ *epid = PSMI_EPID_PACK_V2_SHM(getpid(),
+ PSMI_EPID_SHM_ONLY, /* is a shm-only epid */
+ PSMI_EPID_VERSION);
+ break;
+ default:
+ /* Epid version is greater than max supported version. */
+ psmi_assert_always(PSMI_EPID_VERSION <= PSMI_EPID_V2);
+ break;
+ }
+ } else {
+ /* Self-only, meaning only 1 proc max */
+ switch (PSMI_EPID_VERSION) {
+ case PSMI_EPID_V1:
+ *epid = PSMI_EPID_PACK_V1(
+ 0, 0, 0, 0, PSMI_EPID_VERSION_SHM, 0x3ffffff);
+ break;
+ case PSMI_EPID_V2:
+ *epid = PSMI_EPID_PACK_V2_SHM(0,
+ PSMI_EPID_SHM_ONLY, /* is a shm-only epid */
+ PSMI_EPID_VERSION);
+ break;
+ default:
+ /* Epid version is greater than max supported version. */
+ psmi_assert_always(PSMI_EPID_VERSION <= PSMI_EPID_V2);
+ break;
+ }
+ }
+
+fail:
+ return err;
+}
+
+/* Get a list of PTLs we want to use.
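+ * PSM2_DEVICES takes a comma-separated subset of self, shm (amsh)
+ * and ips (hfi); e.g. "self,shm" runs a node-local job with no HFI.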
The order is important, it affects + * whether node-local processes use shm or ips */ +static +psm2_error_t +psmi_parse_devices(int devices[PTL_MAX_INIT], const char *devstring) +{ + char *devstr = NULL; + char *b_new, *e, *ee, *b; + psm2_error_t err = PSM2_OK; + int len; + int i = 0; + + psmi_assert_always(devstring != NULL); + len = strlen(devstring) + 1; + + for (i = 0; i < PTL_MAX_INIT; i++) + devices[i] = -1; + + devstr = (char *)psmi_calloc(PSMI_EP_NONE, UNDEFINED, 2, len); + if (devstr == NULL) + goto fail; + + b_new = (char *)devstr; + e = b_new + len; + strncpy(e, devstring, len); + ee = e + len; + i = 0; + while (e < ee && *e && i < PTL_MAX_INIT) { + while (*e && !isalpha(*e)) + e++; + b = e; + while (*e && isalpha(*e)) + e++; + *e = '\0'; + if (*b) { + if (!strcasecmp(b, "self")) { + devices[i++] = PTL_DEVID_SELF; + b_new = strcpy(b_new, "self,"); + b_new += 5; + } else if (!strcasecmp(b, "shm") || + !strcasecmp(b, "shmem") || + !strcasecmp(b, "amsh")) { + devices[i++] = PTL_DEVID_AMSH; + strcpy(b_new, "amsh,"); + b_new += 5; + } else if (!strcasecmp(b, "hfi") || + !strcasecmp(b, "ipath") || + !strcasecmp(b, "ips")) { + devices[i++] = PTL_DEVID_IPS; + strcpy(b_new, "ips,"); + b_new += 4; + } else { + err = psmi_handle_error(NULL, PSM2_PARAM_ERR, + "%s set in environment variable PSM_PTL_DEVICES=\"%s\" " + "is not one of the recognized PTL devices (%s)", + b, devstring, + PSMI_DEVICES_DEFAULT); + goto fail; + } + e++; + } + } + if (b_new != devstr) /* we parsed something, remove trailing comma */ + *(b_new - 1) = '\0'; + + _HFI_PRDBG("PSM Device allocation order: %s\n", devstr); +fail: + if (devstr != NULL) + psmi_free(devstr); + return err; + +} + +static +int psmi_device_is_enabled(const int devid_enabled[PTL_MAX_INIT], int devid) +{ + int i; + for (i = 0; i < PTL_MAX_INIT; i++) + if (devid_enabled[i] == devid) + return 1; + return 0; +} + +int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid) +{ + return psmi_device_is_enabled(ep->devid_enabled, devid); +} diff --git a/psm_ep.h b/psm_ep.h new file mode 100644 index 0000000..055573d --- /dev/null +++ b/psm_ep.h @@ -0,0 +1,236 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ + +#ifndef _PSMI_IN_USER_H +#error psm2_ep.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSMI_EP_H +#define _PSMI_EP_H + +/* + * EPIDs encode the following information: + * + * LID:16 bits - LID for endpoint + * CONTEXT:8 bits - Context used for bits (upto 256 contexts) + * SUBCONTEXT:3 bits - Subcontext used for endpoint + * HFIUNIT: 2 bits - HFI unit number + * HFITYPE: 3 bits - OPA1, OPA2, ... + * RANK: 26 bits - process rank + * reserved: 6 bit - for future usage + */ + +#define PSMI_HFI_TYPE_UNKNOWN 0 +#define PSMI_HFI_TYPE_OPA1 1 +#define PSMI_HFI_TYPE_OPA2 2 + +#define PSMI_SL_DEFAULT 0 +#define PSMI_SC_DEFAULT 0 +#define PSMI_VL_DEFAULT 0 +#define PSMI_SL_MIN 0 +#define PSMI_SL_MAX 31 +#define PSMI_SC_ADMIN 15 +#define PSMI_VL_ADMIN 15 + +#define PSMI_EPID_PACK_V1(lid, context, subcontext, hfiunit, epid_version, rank) \ + (((((uint64_t)lid)&0xffff)<<16) | \ + ((((uint64_t)context)&0xff)<<8) | \ + ((((uint64_t)subcontext)&0x7)<<5) | \ + ((((uint64_t)hfiunit)&0x3)<<3) | \ + ((((uint64_t)epid_version)&0x7)<<0) | \ + ((((uint64_t)rank)&0x3ffffff)<<32)) + +#define PSMI_EPID_PACK_V2(lid, context, subcontext, shmbool, epid_version, subnet_id) \ + (((((uint64_t)lid)&0xffffff)<<16) | \ + ((((uint64_t)context)&0xff)<<8) | \ + ((((uint64_t)subcontext)&0x7)<<5) | \ + ((((uint64_t)shmbool)&0x1)<<3) | \ + ((((uint64_t)epid_version)&0x7)<<0) | \ + ((((uint64_t)subnet_id)&0xffff)<<48)) + +#define PSMI_EPID_PACK_V2_SHM(process_id, shmbool, epid_version) \ + (((((uint64_t)process_id)&0xffffffff)<<32) | \ + ((((uint64_t)shmbool)&0x1)<<3) | \ + ((((uint64_t)epid_version)&0x7)<<0)) + +#define PSMI_EPID_GET_LID_V1(epid) (((epid)>>16)&0xffff) +#define PSMI_EPID_GET_LID_V2(epid) (((epid)>>16)&0xffffff) +#define PSMI_EPID_GET_CONTEXT(epid) (((epid)>>8)&0xff) +#define PSMI_EPID_GET_SUBCONTEXT(epid) (((epid)>>5)&0x7) +#define PSMI_EPID_GET_HFIUNIT(epid) (((epid)>>3)&0x3) +#define PSMI_EPID_GET_EPID_VERSION(epid) (((epid)>>0)&0x7) +#define PSMI_EPID_GET_RANK(epid) (((epid)>>32)&0x3ffffff) +#define PSMI_EPID_GET_SHMBOOL(epid) (((epid)>>3)&0x1) +#define PSMI_EPID_GET_SUBNET_ID(epid) (((epid)>>48)&0xffff) +#define PSMI_EPID_GET_PROCESS_ID(epid) (((epid)>>32)&0xffffffff) + +#define PSM_MCTXT_APPEND(head, node) \ + node->mctxt_prev = head->mctxt_prev; \ + node->mctxt_next = head; \ + head->mctxt_prev->mctxt_next = node; \ + head->mctxt_prev = node; \ + node->mctxt_master = head +#define PSM_MCTXT_REMOVE(node) \ + node->mctxt_prev->mctxt_next = 
node->mctxt_next; \ + node->mctxt_next->mctxt_prev = node->mctxt_prev; \ + node->mctxt_next = node->mctxt_prev = node; \ + node->mctxt_master = NULL + +struct psm2_ep { + psm2_epid_t epid; /**> This endpoint's Endpoint ID */ + psm2_epaddr_t epaddr; /**> This ep's ep address */ + psm2_mq_t mq; /**> only 1 MQ */ + int unit_id; + uint16_t portnum; + uint16_t out_sl; + uint16_t mtu; /* out_sl-->vl-->mtu in sysfs */ + uint16_t network_pkey; /**> OPA Pkey */ + int did_syslog; + psm2_uuid_t uuid; + uint16_t jkey; + uint64_t service_id; /* OPA service ID */ + psm2_path_res_t path_res_type; /* Path resolution for endpoint */ + psm2_ep_errhandler_t errh; + int devid_enabled[PTL_MAX_INIT]; + int memmode; /**> min, normal, large memory mode */ + + uint32_t hfi_num_sendbufs;/**> Number of allocated send buffers */ + uint32_t hfi_num_descriptors;/** Number of allocated scb descriptors*/ + uint32_t hfi_imm_size; /** Immediate data size */ + uint32_t connections; /**> Number of connections */ + + psmi_context_t context; + char *context_mylabel; + uint32_t yield_spin_cnt; + + /* EP link-lists */ + struct psm2_ep *user_ep_next; + + /* EP link-lists for multi-context. */ + struct psm2_ep *mctxt_prev; + struct psm2_ep *mctxt_next; + struct psm2_ep *mctxt_master; + + /* Active Message handler table */ + struct psm2_ep_am_handle_entry *am_htable; + + uint64_t gid_hi; + uint64_t gid_lo; + + ptl_ctl_t ptl_amsh; + ptl_ctl_t ptl_ips; + ptl_ctl_t ptl_self; + + /* All ptl data is allocated inline below */ + uint8_t ptl_base_data[0] __attribute__ ((aligned(64))); +}; + +struct mqq { + psm2_mq_req_t first; + psm2_mq_req_t last; +}; + +typedef +union psmi_seqnum { + struct { + uint32_t psn_seq:11; + uint32_t psn_gen:20; + }; + struct { + uint32_t psn_num:31; + }; + uint32_t psn_val; +} psmi_seqnum_t; + +/* + * PSM end point address. One per connection and per rail. + */ +struct psm2_epaddr { + psm2_epid_t epid; /* peer's epid */ + ptl_ctl_t *ptlctl; /* The control structure for the ptl */ + struct ips_proto *proto; /* only for ips protocol */ + void *usr_ep_ctxt; /* User context associated with endpoint */ +}; + +#ifndef PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD +# define PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD 250 +#endif + +/* + * Users of BLOCKUNTIL should check the value of err upon return + */ +#define PSMI_BLOCKUNTIL(ep, err, cond) do { \ + int spin_cnt = 0; \ + PSMI_PROFILE_BLOCK(); \ + while (!(cond)) { \ + err = psmi_poll_internal(ep, 1); \ + if (err == PSM2_OK_NO_PROGRESS) { \ + PSMI_PROFILE_REBLOCK(1); \ + if (++spin_cnt == (ep)->yield_spin_cnt) { \ + spin_cnt = 0; \ + PSMI_YIELD((ep)->mq->progress_lock); \ + } \ + } \ + else if (err == PSM2_OK) { \ + PSMI_PROFILE_REBLOCK(0); \ + spin_cnt = 0; \ + } \ + else \ + break; \ + } \ + PSMI_PROFILE_UNBLOCK(); \ +} while (0) + +#endif /* _PSMI_EP_H */ diff --git a/psm_ep_connect.c b/psm_ep_connect.c new file mode 100644 index 0000000..1eb836f --- /dev/null +++ b/psm_ep_connect.c @@ -0,0 +1,620 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm_mq_internal.h" + +int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid); + +#if _HFI_DEBUGGING +PSMI_ALWAYS_INLINE( +char *psmi_getdevice(int type)) +{ + switch (type) { + case PTL_DEVID_IPS: + return "ips"; + case PTL_DEVID_AMSH: + return "amsh"; + case PTL_DEVID_SELF: + return "self"; + default: + return "ips"; + } +} +#endif + +psm2_error_t +__psm2_ep_connect(psm2_ep_t ep, int num_of_epid, psm2_epid_t const *array_of_epid, + int const *array_of_epid_mask, /* can be NULL */ + psm2_error_t *array_of_errors, psm2_epaddr_t *array_of_epaddr, + int64_t timeout) +{ + psm2_error_t err = PSM2_OK; + ptl_ctl_t *ptlctl; + ptl_t *ptl; + int i, j, dup_idx; + int num_toconnect = 0; + int *epid_mask = NULL; + int *epid_mask_isdupof = NULL; + uint64_t t_start = get_cycles(); + uint64_t t_left; + union psmi_envvar_val timeout_intval; + + PSM2_LOG_MSG("entering"); + PSMI_ERR_UNLESS_INITIALIZED(ep); + + /* + * Normally we would lock here, but instead each implemented ptl component + * does its own locking. This is mostly because the ptl components are + * ahead of the PSM2 interface in that they can disconnect their peers. + */ + if (ep == NULL || array_of_epaddr == NULL || array_of_epid == NULL || + num_of_epid < 1) { + err = psmi_handle_error(ep, PSM2_PARAM_ERR, + "Invalid psm2_ep_connect parameters"); + goto fail_nolock; + } + + PSMI_LOCK(ep->mq->progress_lock); + + /* We need two of these masks to detect duplicates */ + err = PSM2_NO_MEMORY; + epid_mask = + (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epid); + if (epid_mask == NULL) + goto fail; + epid_mask_isdupof = + (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epid); + if (epid_mask_isdupof == NULL) + goto fail; + err = PSM2_OK; + + /* Eventually handle timeouts across all connects. 
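+ * The loop below fills epid_mask[] (entries that still need a
+ * connect) and epid_mask_isdupof[] (index of the first occurrence of
+ * a duplicated epid), so each distinct epid is connected at most
+ * once per PTL.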
*/ + for (j = 0; j < num_of_epid; j++) { + if (array_of_epid_mask != NULL && !array_of_epid_mask[j]) + epid_mask[j] = 0; + else { + epid_mask[j] = 1; + array_of_errors[j] = PSM2_EPID_UNKNOWN; + array_of_epaddr[j] = NULL; + if (psmi_epid_version(array_of_epid[j]) > + PSMI_EPID_VERSION) { + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + " Unknown version of EPID - %"PRIu64" \n" + "Please upgrade PSM2 or set PSM2_ADDR_FMT=1 in the environment to force EPID version 1 \n", + psmi_epid_version(array_of_epid[j])); + } + num_toconnect++; + } + epid_mask_isdupof[j] = -1; + } + + psmi_getenv("PSM2_CONNECT_TIMEOUT", + "End-point connection timeout over-ride. 0 for no time-out.", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)0, &timeout_intval); + + if (getenv("PSM2_CONNECT_TIMEOUT")) { + timeout = timeout_intval.e_uint * SEC_ULL; + } else if (timeout > 0) { + /* The timeout parameter provides the minimum timeout. A heuristic + * is used to scale up the timeout linearly with the number of + * endpoints, and we allow one second per 100 endpoints. */ + timeout = max(timeout, (num_toconnect * SEC_ULL) / 100); + } + + if (timeout > 0 && timeout < PSMI_MIN_EP_CONNECT_TIMEOUT) + timeout = PSMI_MIN_EP_CONNECT_TIMEOUT; + _HFI_PRDBG("Connect to %d endpoints with time-out of %.2f secs\n", + num_toconnect, (double)timeout / 1e9); + + /* Look for duplicates in input array */ + for (i = 0; i < num_of_epid; i++) { + for (j = i + 1; j < num_of_epid; j++) { + if (array_of_epid[i] == array_of_epid[j] && + epid_mask[i] && epid_mask[j]) { + epid_mask[j] = 0; /* don't connect more than once */ + epid_mask_isdupof[j] = i; + } + } + } + + for (i = 0; i < PTL_MAX_INIT; i++) { + if (ep->devid_enabled[i] == -1) + continue; + /* Set up the right connect ptrs */ + switch (ep->devid_enabled[i]) { + case PTL_DEVID_IPS: + ptlctl = &ep->ptl_ips; + ptl = ep->ptl_ips.ptl; + break; + case PTL_DEVID_AMSH: + ptlctl = &ep->ptl_amsh; + ptl = ep->ptl_amsh.ptl; + break; + case PTL_DEVID_SELF: + ptlctl = &ep->ptl_self; + ptl = ep->ptl_self.ptl; + break; + default: + ptlctl = &ep->ptl_ips; /*no-unused */ + ptl = ep->ptl_ips.ptl; /*no-unused */ + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Unknown/unhandled PTL id %d\n", + ep->devid_enabled[i]); + break; + } + t_left = psmi_cycles_left(t_start, timeout); + + if (_HFI_VDBG_ON) { + _HFI_VDBG_ALWAYS + ("Trying to connect with device %s\n", + psmi_getdevice(ep->devid_enabled[i])); + } + if ((err = ptlctl->ep_connect(ptl, num_of_epid, array_of_epid, + epid_mask, array_of_errors, + array_of_epaddr, + cycles_to_nanosecs(t_left)))) { + if (_HFI_PRDBG_ON) { + _HFI_PRDBG_ALWAYS + ("Connect failure in device %s err=%d\n", + psmi_getdevice(ep->devid_enabled[i]), err); + } + goto connect_fail; + } + + /* Now process what's been connected */ + for (j = 0; j < num_of_epid; j++) { + dup_idx = epid_mask_isdupof[j]; + if (!epid_mask[j] && dup_idx == -1) + continue; + + if (dup_idx != -1) { /* dup */ + array_of_epaddr[j] = array_of_epaddr[dup_idx]; + array_of_errors[j] = array_of_errors[dup_idx]; + epid_mask_isdupof[j] = -1; + } + + if (array_of_errors[j] == PSM2_OK) { + epid_mask[j] = 0; /* don't try on next ptl */ + ep->connections++; + } + } + } + + for (i = 0; i < num_of_epid; i++) { + ptl_ctl_t *c = NULL; + if (array_of_epid_mask != NULL && !array_of_epid_mask[i]) + continue; + /* If we see unreachable here, that means some PTLs were not enabled */ + if (array_of_errors[i] == PSM2_EPID_UNREACHABLE) { + err = PSM2_EPID_UNREACHABLE; + break; + } + + 
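+ /* Reaching this point means some PTL accepted the epid, so
+ * its epaddr must have been filled in by that PTL. */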
psmi_assert_always(array_of_epaddr[i] != NULL);
+ c = array_of_epaddr[i]->ptlctl;
+ psmi_assert_always(c != NULL);
+ _HFI_VDBG("%-20s DEVICE %s (%p)\n",
+ psmi_epaddr_get_name(array_of_epid[i]),
+ c == &ep->ptl_ips ? "hfi" :
+ (c == &ep->ptl_amsh ? "amsh" : "self"),
+ (void *)array_of_epaddr[i]->ptlctl->ptl);
+ }
+
+ if (err == PSM2_OK)
+ for (i=0; i<num_of_epid; i++)
+ if (array_of_epid_mask == NULL ||
+ array_of_epid_mask[i])
+ array_of_errors[i] = PSM2_OK;
+
+connect_fail:
+ /* If the error is a timeout (at worst) and the client is OPA MPI,
+ * just return timeout to let OPA MPI handle the hostnames that
+ * timed out */
+ if (err != PSM2_OK) {
+ char errbuf[PSM2_ERRSTRING_MAXLEN];
+ size_t len;
+ int j = 0;
+
+ if (err == PSM2_EPID_UNREACHABLE) {
+ char *deverr = "of an incorrect setting";
+ char *eperr = "";
+ char *devname = NULL;
+ if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
+ deverr =
+ "there is no shared memory PSM2 device (shm)";
+ eperr = " shared memory";
+ } else
+ if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+ deverr =
+ "there is no OPA PSM2 device (hfi)";
+ eperr = " OPA";
+ }
+
+ len = snprintf(errbuf, sizeof(errbuf) - 1,
+ "Some%s endpoints could not be connected because %s "
+ "in the currently enabled PSM2_DEVICES (",
+ eperr, deverr);
+ for (i = 0; i < PTL_MAX_INIT && len < sizeof(errbuf) - 1; i++) {
+ switch (ep->devid_enabled[i]) {
+ case PTL_DEVID_IPS:
+ devname = "hfi";
+ break;
+ case PTL_DEVID_AMSH:
+ devname = "shm";
+ break;
+ case PTL_DEVID_SELF:
+ default:
+ devname = "self";
+ break;
+ }
+ len +=
+ snprintf(errbuf + len,
+ sizeof(errbuf) - len - 1, "%s,",
+ devname);
+ }
+ if (len < sizeof(errbuf) - 1 && devname != NULL)
+ /* parsed something, remove trailing comma */
+ errbuf[len - 1] = ')';
+ } else
+ len = snprintf(errbuf, sizeof(errbuf) - 1,
+ "%s", err == PSM2_TIMEOUT ?
+ "Detected connection timeout" :
+ psm2_error_get_string(err));
+
+ /* first pass, look for all nodes with the error */
+ for (i = 0; i < num_of_epid && len < sizeof(errbuf) - 1; i++) {
+ if (array_of_epid_mask != NULL
+ && !array_of_epid_mask[i])
+ continue;
+ if (array_of_errors[i] == PSM2_OK)
+ continue;
+ if (array_of_errors[i] == PSM2_EPID_UNREACHABLE &&
+ err != PSM2_EPID_UNREACHABLE)
+ continue;
+ if (err == array_of_errors[i]) {
+ len +=
+ snprintf(errbuf + len,
+ sizeof(errbuf) - len - 1, "%c %s",
+ j == 0 ? ':' : ',',
+ psmi_epaddr_get_hostname
+ (array_of_epid[i]));
+ j++;
+ }
+ }
+ errbuf[sizeof(errbuf) - 1] = '\0';
+ err = psmi_handle_error(ep, err, "%s", errbuf);
+ }
+
+fail:
+ PSMI_UNLOCK(ep->mq->progress_lock);
+
+fail_nolock:
+ if (epid_mask != NULL)
+ psmi_free(epid_mask);
+ if (epid_mask_isdupof != NULL)
+ psmi_free(epid_mask_isdupof);
+
+ PSM2_LOG_MSG("leaving");
+ return err;
+}
+PSMI_API_DECL(psm2_ep_connect)
+
+psm2_error_t __psm2_ep_disconnect(psm2_ep_t ep, int num_of_epaddr,
+ psm2_epaddr_t *array_of_epaddr,
+ const int *array_of_epaddr_mask,
+ psm2_error_t *array_of_errors,
+ int64_t timeout)
+{
+ return psm2_ep_disconnect2(ep, num_of_epaddr, array_of_epaddr,
+ array_of_epaddr_mask, array_of_errors,
+ PSM2_EP_DISCONNECT_GRACEFUL, timeout);
+}
+PSMI_API_DECL(psm2_ep_disconnect)
+
+psm2_error_t __psm2_ep_disconnect2(psm2_ep_t ep, int num_of_epaddr,
+ psm2_epaddr_t *array_of_epaddr,
+ const int *array_of_epaddr_mask,
+ psm2_error_t *array_of_errors,
+ int mode, int64_t timeout)
+{
+ psm2_error_t err = PSM2_OK;
+ ptl_ctl_t *ptlctl;
+ ptl_t *ptl;
+ int i, j, dup_idx;
+ int num_todisconnect = 0;
+ int *epaddr_mask = NULL;
+ int *epaddr_mask_isdupof = NULL;
+ uint64_t t_start = get_cycles();
+ uint64_t t_left;
+ union psmi_envvar_val timeout_intval;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ERR_UNLESS_INITIALIZED(ep);
+
+
+ /*
+ * Normally we would lock here, but instead each implemented ptl component
+ * does its own locking. This is mostly because the ptl components are
+ * ahead of the PSM2 interface in that they can disconnect their peers.
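+ * mode is PSM2_EP_DISCONNECT_GRACEFUL or PSM2_EP_DISCONNECT_FORCE;
+ * the force flag is handed to each PTL's ep_disconnect below, and a
+ * forced disconnect does not wait for the peers to acknowledge.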
+ */ + if (ep == NULL || array_of_epaddr == NULL || + num_of_epaddr < 1) { + err = psmi_handle_error(ep, PSM2_PARAM_ERR, + "Invalid psm2_ep_disconnect parameters"); + goto fail_nolock; + } + + PSMI_LOCK(ep->mq->progress_lock); + + /* We need two of these masks to detect duplicates */ + err = PSM2_NO_MEMORY; + epaddr_mask = + (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epaddr); + if (epaddr_mask == NULL) + goto fail; + epaddr_mask_isdupof = + (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epaddr); + if (epaddr_mask_isdupof == NULL) + goto fail; + err = PSM2_OK; + + /* Eventually handle timeouts across all connects. */ + for (j = 0; j < num_of_epaddr; j++) { + if (array_of_epaddr_mask != NULL && !array_of_epaddr_mask[j]) + epaddr_mask[j] = 0; + else { + epaddr_mask[j] = 1; + array_of_errors[j] = PSM2_EPID_UNKNOWN; + num_todisconnect++; + } + epaddr_mask_isdupof[j] = -1; + } + + psmi_getenv("PSM2_DISCONNECT_TIMEOUT", + "End-point disconnection timeout over-ride. 0 for no time-out.", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)0, &timeout_intval); + + if (getenv("PSM2_DISCONNECT_TIMEOUT")) { + timeout = timeout_intval.e_uint * SEC_ULL; + } else if (timeout > 0) { + /* The timeout parameter provides the minimum timeout. A heuristic + * is used to scale up the timeout linearly with the number of + * endpoints, and we allow one second per 100 endpoints. */ + timeout = max(timeout, (num_todisconnect * SEC_ULL) / 100); + } + + if (timeout > 0 && timeout < PSMI_MIN_EP_CONNECT_TIMEOUT) + timeout = PSMI_MIN_EP_CONNECT_TIMEOUT; + _HFI_PRDBG("Disconnect %d endpoints with time-out of %.2f secs\n", + num_todisconnect, (double)timeout / 1e9); + + /* Look for duplicates in input array */ + for (i = 0; i < num_of_epaddr; i++) { + for (j = i + 1; j < num_of_epaddr; j++) { + if (array_of_epaddr[i] == array_of_epaddr[j] && + epaddr_mask[i] && epaddr_mask[j]) { + epaddr_mask[j] = 0; /* don't disconnect more than once */ + epaddr_mask_isdupof[j] = i; + } + } + } + + for (i = 0; i < PTL_MAX_INIT; i++) { + if (ep->devid_enabled[i] == -1) + continue; + /* Set up the right connect ptrs */ + switch (ep->devid_enabled[i]) { + case PTL_DEVID_IPS: + ptlctl = &ep->ptl_ips; + ptl = ep->ptl_ips.ptl; + break; + case PTL_DEVID_AMSH: + ptlctl = &ep->ptl_amsh; + ptl = ep->ptl_amsh.ptl; + break; + case PTL_DEVID_SELF: + ptlctl = &ep->ptl_self; + ptl = ep->ptl_self.ptl; + break; + default: + ptlctl = &ep->ptl_ips; /*no-unused */ + ptl = ep->ptl_ips.ptl; /*no-unused */ + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Unknown/unhandled PTL id %d\n", + ep->devid_enabled[i]); + break; + } + t_left = psmi_cycles_left(t_start, timeout); + + if (_HFI_VDBG_ON) { + _HFI_VDBG_ALWAYS + ("Trying to disconnect with device %s\n", + psmi_getdevice(ep->devid_enabled[i])); + } + if ((err = ptlctl->ep_disconnect(ptl, (mode == PSM2_EP_DISCONNECT_FORCE), + num_of_epaddr, array_of_epaddr, + epaddr_mask, array_of_errors, + cycles_to_nanosecs(t_left)))) { + if (_HFI_PRDBG_ON) { + _HFI_PRDBG_ALWAYS + ("Disconnect failure in device %s err=%d\n", + psmi_getdevice(ep->devid_enabled[i]), err); + } + goto disconnect_fail; + } + + /* Now process what's been disconnected */ + for (j = 0; j < num_of_epaddr; j++) { + dup_idx = epaddr_mask_isdupof[j]; + if (!epaddr_mask[j] && dup_idx == -1) + continue; + + if (dup_idx != -1) { /* dup */ + array_of_errors[j] = array_of_errors[dup_idx]; + epaddr_mask_isdupof[j] = -1; + } + + if (array_of_errors[j] == PSM2_OK) { + epaddr_mask[j] = 0; /* don't try on next 
ptl */ + array_of_epaddr[j] = NULL; + ep->connections--; + } + } + } + + for (i = 0; i < num_of_epaddr; i++) { + if (array_of_epaddr_mask != NULL && !array_of_epaddr_mask[i]) + continue; + /* If we see unreachable here, that means some PTLs were not enabled */ + if (array_of_errors[i] == PSM2_EPID_UNREACHABLE) { + err = PSM2_EPID_UNREACHABLE; + break; + } + } + +disconnect_fail: + /* If the error is a timeout (at worse) and the client is OPA MPI, + * just return timeout to let OPA MPI handle the hostnames that + * timed out */ + if (err != PSM2_OK) { + char errbuf[PSM2_ERRSTRING_MAXLEN]; + size_t len; + int j = 0; + + if (err == PSM2_EPID_UNREACHABLE) { + char *deverr = "of an incorrect setting"; + char *eperr = ""; + char *devname = NULL; + if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { + deverr = + "there is no shared memory PSM2 device (shm)"; + eperr = " shared memory"; + } else + if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { + deverr = + "there is no OPA PSM2 device (hfi)"; + eperr = " OPA"; + } + + len = snprintf(errbuf, sizeof(errbuf) - 1, + "Some%s endpoints could not be disconnected because %s " + "in the currently enabled PSM2_DEVICES (", + eperr, deverr); + for (i = 0; i < PTL_MAX_INIT && len < sizeof(errbuf) - 1; i++) { + switch (ep->devid_enabled[i]) { + case PTL_DEVID_IPS: + devname = "hfi"; + break; + case PTL_DEVID_AMSH: + devname = "shm"; + break; + case PTL_DEVID_SELF: + default: + devname = "self"; + break; + } + len += + snprintf(errbuf + len, + sizeof(errbuf) - len - 1, "%s,", + devname); + } + if (len < sizeof(errbuf) - 1 && devname != NULL) + /* parsed something, remove trailing comma */ + errbuf[len - 1] = ')'; + } else + len = snprintf(errbuf, sizeof(errbuf) - 1, + "%s", err == PSM2_TIMEOUT ? + "Detected disconnect timeout" : + psm2_error_get_string(err)); + + /* first pass, look for all nodes with the error */ + for (i = 0; i < num_of_epaddr && len < sizeof(errbuf) - 1; i++) { + if (array_of_epaddr_mask != NULL + && !array_of_epaddr_mask[i]) + continue; + if (array_of_errors[i] == PSM2_OK) + continue; + if (array_of_errors[i] == PSM2_EPID_UNREACHABLE && + err != PSM2_EPID_UNREACHABLE) + continue; + if (err == array_of_errors[i]) { + len += + snprintf(errbuf + len, + sizeof(errbuf) - len - 1, "%c %s", + j == 0 ? ':' : ',', + psmi_epaddr_get_hostname + (array_of_epaddr[i]->epid)); + j++; + } + } + errbuf[sizeof(errbuf) - 1] = '\0'; + err = psmi_handle_error(ep, err, "%s", errbuf); + } + +fail: + PSMI_UNLOCK(ep->mq->progress_lock); + +fail_nolock: + if (epaddr_mask != NULL) + psmi_free(epaddr_mask); + if (epaddr_mask_isdupof != NULL) + psmi_free(epaddr_mask_isdupof); + + PSM2_LOG_MSG("leaving"); + return err; +} +PSMI_API_DECL(psm2_ep_disconnect2) diff --git a/psm_error.c b/psm_error.c new file mode 100644 index 0000000..99bb94f --- /dev/null +++ b/psm_error.c @@ -0,0 +1,348 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+
+#define PSMI_NOLOG -1
+
+struct psm2_error_token {
+	psm2_ep_t ep;
+	psm2_error_t error;
+	char err_string[PSM2_ERRSTRING_MAXLEN];
+};
+
+static
+psm2_error_t
+psmi_errhandler_noop(psm2_ep_t ep, const psm2_error_t err,
+		     const char *error_string, psm2_error_token_t token)
+{
+	return err;
+}
+
+static
+psm2_error_t
+psmi_errhandler_psm(psm2_ep_t ep,
+		    const psm2_error_t err,
+		    const char *error_string, psm2_error_token_t token)
+{
+	/* we want the error to be seen through ssh, etc., so we flush and then
+	 * sleep a bit.  Not perfect, but not doing so means it almost never
+	 * gets seen. */
+	fprintf(stderr, "%s%s\n", hfi_get_mylabel(), token->err_string);
+	fflush(stdout);
+	fflush(stderr);
+
+	/* XXX Eventually, this will hook up to a connection manager, and we'll
+	 * issue an upcall into the connection manager at shutdown time */
+	sleep(3);
+
+	/* We use this "special" ep internally to handle internal errors that are
+	 * triggered from within code that is not expected to return to the user.
+	 * Errors of this sort are not expected to be handled by users and always
+	 * mean we have an internal PSM bug. 
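+	 * Internal errors call abort() below so that a core file is left
+	 * for debugging; any other fatal error exits with -1 once the
+	 * message has had a chance to reach the console (hence the flush
+	 * and sleep above).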
*/ + if (err == PSM2_INTERNAL_ERR) + abort(); + else + exit(-1); +} + +psm2_ep_errhandler_t psmi_errhandler_global = psmi_errhandler_noop; + +psm2_error_t __psm2_error_defer(psm2_error_token_t token) +{ + psm2_error_t rv; + PSM2_LOG_MSG("entering"); + rv = psmi_errhandler_psm(token->ep, token->error, token->err_string, + token); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_error_defer) + +psm2_error_t +__psm2_error_register_handler(psm2_ep_t ep, const psm2_ep_errhandler_t errhandler) +{ + psm2_ep_errhandler_t *errh; + + PSM2_LOG_MSG("entering"); + + if (ep == NULL) + errh = &psmi_errhandler_global; + else + errh = &ep->errh; + + if (errhandler == PSM2_ERRHANDLER_PSM_HANDLER) + *errh = psmi_errhandler_psm; + else if (errhandler == PSM2_ERRHANDLER_NO_HANDLER) + *errh = psmi_errhandler_noop; + else + *errh = errhandler; + + PSM2_LOG_MSG("leaving"); + + return PSM2_OK; +} +PSMI_API_DECL(psm2_error_register_handler) + +psm2_error_t +MOCKABLE (psmi_handle_error)(psm2_ep_t ep, psm2_error_t error, const char *buf, ...) +{ + va_list argptr; + int syslog_level; + int console_print = 0; + psm2_error_t newerr; + struct psm2_error_token token; + char *c, fullmsg[PSM2_ERRSTRING_MAXLEN]; + token.error = error; + snprintf(fullmsg, PSM2_ERRSTRING_MAXLEN - 1, "%s", buf); + fullmsg[PSM2_ERRSTRING_MAXLEN - 1] = '\0'; + va_start(argptr, buf); + vsnprintf(token.err_string, PSM2_ERRSTRING_MAXLEN - 1, fullmsg, argptr); + va_end(argptr); + token.err_string[PSM2_ERRSTRING_MAXLEN - 1] = '\0'; + + /* Unless the user has set PSM2_NO_VERBOSE_ERRORS, always print errors to + * console */ + c = getenv("PSM2_NO_VERBOSE_ERRORS"); + console_print = 0; + if (ep == PSMI_EP_LOGEVENT) + console_print = 1; + else if (!c || *c == '\0') { /* no desire to prevent verbose errors */ + /* Remove the console print if we're internally handling the error */ + if (ep == PSMI_EP_NORETURN) + console_print = 0; + else if (ep == NULL + && psmi_errhandler_global != psmi_errhandler_psm) + console_print = 1; + else if (ep != NULL && ep->errh != psmi_errhandler_psm) + console_print = 1; + } + + /* Before we let the user even handle the error, send to syslog */ + syslog_level = psmi_error_syslog_level(error); + if (syslog_level != PSMI_NOLOG || ep == PSMI_EP_LOGEVENT) + psmi_syslog(ep, console_print, + ep == PSMI_EP_LOGEVENT ? 
LOG_NOTICE : syslog_level,
+			    "%s (err=%d)", token.err_string, error);
+
+	if (ep == PSMI_EP_LOGEVENT)	/* we're just logging */
+		newerr = PSM2_OK;
+	else if (ep == PSMI_EP_NORETURN)
+		newerr =
+		    psmi_errhandler_psm(NULL, error, token.err_string, &token);
+	else if (ep == NULL)
+		newerr =
+		    psmi_errhandler_global(NULL, error, token.err_string,
+					   &token);
+	else
+		newerr = ep->errh(ep, error, token.err_string, &token);
+
+	return newerr;
+}
+MOCK_DEF_EPILOGUE(psmi_handle_error);
+
+/* Returns the "worst" error out of errA and errB */
+psm2_error_t psmi_error_cmp(psm2_error_t errA, psm2_error_t errB)
+{
+#define _PSMI_ERR_IS(err) if (errA == (err) || errB == (err)) return (err)
+
+	/* Bad runtime or before initialization */
+	_PSMI_ERR_IS(PSM2_NO_MEMORY);
+	_PSMI_ERR_IS(PSM2_INTERNAL_ERR);
+	_PSMI_ERR_IS(PSM2_INIT_NOT_INIT);
+	_PSMI_ERR_IS(PSM2_INIT_BAD_API_VERSION);
+
+	/* Before we get an endpoint */
+	_PSMI_ERR_IS(PSM2_EP_NO_DEVICE);
+	_PSMI_ERR_IS(PSM2_EP_UNIT_NOT_FOUND);
+	_PSMI_ERR_IS(PSM2_EP_DEVICE_FAILURE);
+	_PSMI_ERR_IS(PSM2_EP_NO_PORTS_AVAIL);
+	_PSMI_ERR_IS(PSM2_TOO_MANY_ENDPOINTS);
+
+	/* As we open/close the endpoint */
+	_PSMI_ERR_IS(PSM2_EP_NO_NETWORK);
+	_PSMI_ERR_IS(PSM2_SHMEM_SEGMENT_ERR);
+	_PSMI_ERR_IS(PSM2_EP_CLOSE_TIMEOUT);
+	_PSMI_ERR_IS(PSM2_EP_INVALID_UUID_KEY);
+	_PSMI_ERR_IS(PSM2_EP_NO_RESOURCES);
+
+	/* In connect phase */
+	_PSMI_ERR_IS(PSM2_EPID_NETWORK_ERROR);
+	_PSMI_ERR_IS(PSM2_EPID_INVALID_NODE);
+	_PSMI_ERR_IS(PSM2_EPID_INVALID_CONNECT);
+	_PSMI_ERR_IS(PSM2_EPID_INVALID_PKEY);
+	_PSMI_ERR_IS(PSM2_EPID_INVALID_VERSION);
+	_PSMI_ERR_IS(PSM2_EPID_INVALID_UUID_KEY);
+	_PSMI_ERR_IS(PSM2_EPID_INVALID_MTU);
+
+	/* Timeout if nothing else */
+	_PSMI_ERR_IS(PSM2_TIMEOUT);
+
+	/* Last resort */
+	return max(errA, errB);
+}
+
+struct psmi_error_item {
+	int syslog_level;
+	const char *error_string;
+};
+
+static
+struct psmi_error_item psmi_error_items[] = {
+	{PSMI_NOLOG, "Success"},	/*  PSM2_OK = 0, */
+	{PSMI_NOLOG, "No events were progressed in psm_poll"},	/* PSM2_OK_NO_PROGRESS = 1 */
+	{PSMI_NOLOG, "unknown 2"},
+	{PSMI_NOLOG, "Error in a function parameter"},	/* PSM2_PARAM_ERR = 3 */
+	{LOG_CRIT, "Ran out of memory"},	/* PSM2_NO_MEMORY = 4 */
+	{PSMI_NOLOG, "PSM has not been initialized by psm2_init"},	/* PSM2_INIT_NOT_INIT = 5 */
+	{LOG_INFO, "API version passed in psm2_init is incompatible"},	/* PSM2_INIT_BAD_API_VERSION = 6 */
+	{PSMI_NOLOG, "PSM Could not set affinity"},	/* PSM2_NO_AFFINITY = 7 */
+	{LOG_ALERT, "PSM Unresolved internal error"},	/* PSM2_INTERNAL_ERR = 8 */
+	{LOG_CRIT, "PSM could not set up shared memory segment"},	/* PSM2_SHMEM_SEGMENT_ERR = 9 */
+	{PSMI_NOLOG, "PSM option is a read-only option"},	/* PSM2_OPT_READONLY = 10 */
+	{PSMI_NOLOG, "Operation timed out"},	/* PSM2_TIMEOUT = 11 */
+	{LOG_INFO, "Exceeded supported amount of endpoints"},
+	/* PSM2_TOO_MANY_ENDPOINTS = 12 */
+	{PSMI_NOLOG, "PSM is in the finalized state"},	/* PSM2_IS_FINALIZED = 13 */
+	{PSMI_NOLOG, "unknown 14"},
+	{PSMI_NOLOG, "unknown 15"},
+	{PSMI_NOLOG, "unknown 16"},
+	{PSMI_NOLOG, "unknown 17"},
+	{PSMI_NOLOG, "unknown 18"},
+	{PSMI_NOLOG, "unknown 19"},
+	{PSMI_NOLOG, "Endpoint was closed"},	/* PSM2_EP_WAS_CLOSED = 20 */
+	{LOG_ALERT, "PSM Could not find an OPA Unit"},	/* PSM2_EP_NO_DEVICE = 21 */
+	{PSMI_NOLOG, "User passed a bad unit number"},	/* PSM2_EP_UNIT_NOT_FOUND = 22 */
+	{LOG_ALERT, "Failure in initializing endpoint"},	/* PSM2_EP_DEVICE_FAILURE = 23 */
+	{PSMI_NOLOG, "Error closing the endpoint"},	/* PSM2_EP_CLOSE_TIMEOUT = 24 */
+	{PSMI_NOLOG, 
"No free contexts could be obtained"}, /* PSM2_EP_NO_PORTS_AVAIL = 25 */ + {LOG_ALERT, "Could not detect network connectivity"}, /* PSM2_EP_NO_NETWORK = 26 */ + {LOG_INFO, "Invalid Unique job-wide UUID Key"}, /* PSM2_EP_INVALID_UUID_KEY = 27 */ + {LOG_INFO, "Out of endpoint resources"}, /* PSM2_EP_NO_RESOURCES = 28 */ + {PSMI_NOLOG, "unknown 29"}, + {PSMI_NOLOG, "unknown 30"}, + {PSMI_NOLOG, "unknown 31"}, + {PSMI_NOLOG, "unknown 32"}, + {PSMI_NOLOG, "unknown 33"}, + {PSMI_NOLOG, "unknown 34"}, + {PSMI_NOLOG, "unknown 35"}, + {PSMI_NOLOG, "unknown 36"}, + {PSMI_NOLOG, "unknown 37"}, + {PSMI_NOLOG, "unknown 38"}, + {PSMI_NOLOG, "unknown 39"}, + {PSMI_NOLOG, "Unknown/unresolved connection status (other errors occurred)"}, /* PSM2_EPID_UNKNOWN = 40 */ + {PSMI_NOLOG, "Endpoint could not be reached"}, /* PSM2_EPID_UNREACHABLE = 41 */ + {PSMI_NOLOG, "unknown 42"}, + {LOG_CRIT, "Invalid node (mismatch in bit width 32/64 or byte order)"}, /* PSM2_EPID_INVALID_NODE = 43 */ + {LOG_CRIT, "Invalid MTU"}, /* PSM2_EPID_INVALID_MTU = 44 */ + {PSMI_NOLOG, "UUID key mismatch"}, /* PSM2_EPID_INVALID_UUID_KEY = 45 */ + {LOG_ERR, "Incompatible PSM version"}, /* PSM2_EPID_INVALID_VERSION = 46 */ + {LOG_CRIT, "Connect received garbled connection information"}, /* PSM2_EPID_INVALID_CONNECT = 47 */ + {PSMI_NOLOG, "Endpoint was already connected"}, /* PSM2_EPID_ALREADY_CONNECTED = 48 */ + {LOG_CRIT, "Two or more endpoints have the same network id (LID)"}, /* PSM2_EPID_NETWORK_ERROR = 49 */ + {LOG_CRIT, "Endpoint provided incompatible Partition Key"}, + {LOG_CRIT, "Unable to resolve network path. Is the SM running?"}, + {PSMI_NOLOG, "unknown 52"}, + {PSMI_NOLOG, "unknown 53"}, + {PSMI_NOLOG, "unknown 54"}, + {PSMI_NOLOG, "unknown 55"}, + {PSMI_NOLOG, "unknown 56"}, + {PSMI_NOLOG, "unknown 57"}, + {PSMI_NOLOG, "unknown 58"}, + {PSMI_NOLOG, "unknown 59"}, + {PSMI_NOLOG, "MQ Non-blocking request is incomplete"}, /* PSM2_MQ_NO_COMPLETIONS = 60 */ + {PSMI_NOLOG, "MQ Message has been truncated at the receiver"}, /* PSM2_MQ_TRUNCATION = 61 */ + {PSMI_NOLOG, "unknown 62"}, + {PSMI_NOLOG, "unknown 63"}, + {PSMI_NOLOG, "unknown 64"}, + {PSMI_NOLOG, "unknown 65"}, + {PSMI_NOLOG, "unknown 66"}, + {PSMI_NOLOG, "unknown 67"}, + {PSMI_NOLOG, "unknown 68"}, + {PSMI_NOLOG, "unknown 69"}, + {PSMI_NOLOG, "Invalid AM reply"}, + {PSMI_NOLOG, "unknown 71"}, + {PSMI_NOLOG, "unknown 72"}, + {PSMI_NOLOG, "unknown 73"}, + {PSMI_NOLOG, "unknown 74"}, + {PSMI_NOLOG, "unknown 75"}, + {PSMI_NOLOG, "unknown 76"}, + {PSMI_NOLOG, "unknown 77"}, + {PSMI_NOLOG, "unknown 78"}, + {PSMI_NOLOG, "unknown 79"}, + {PSMI_NOLOG, "unknown 80"}, +}; + +const char *__psm2_error_get_string(psm2_error_t error) +{ + PSM2_LOG_MSG("entering"); + if (error >= PSM2_ERROR_LAST) { + PSM2_LOG_MSG("leaving"); + return "unknown"; + } + else { + PSM2_LOG_MSG("leaving"); + return psmi_error_items[error].error_string; + } +} +PSMI_API_DECL(psm2_error_get_string) + +int psmi_error_syslog_level(psm2_error_t error) +{ + if (error >= PSM2_ERROR_LAST) + return PSMI_NOLOG; + else + return psmi_error_items[error].syslog_level; +} diff --git a/psm_error.h b/psm_error.h new file mode 100644 index 0000000..f335382 --- /dev/null +++ b/psm_error.h @@ -0,0 +1,78 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ +#include "psm2_mock_testing.h" + +#ifndef _PSMI_IN_USER_H +#error psm_error.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSMI_ERROR_H +#define _PSMI_ERROR_H + +#define PSMI_EP_NONE (NULL) +#define PSMI_EP_NORETURN ((psm2_ep_t) -2) +#define PSMI_EP_LOGEVENT ((psm2_ep_t) -3) + +psm2_ep_errhandler_t psmi_errhandler_global; + +psm2_error_t MOCKABLE(psmi_handle_error)(psm2_ep_t ep, psm2_error_t error, + const char *buf, ...) + __attribute__((format(printf, 3, 4))); +MOCK_DCL_EPILOGUE(psmi_handle_error); + +psm2_error_t psmi_error_cmp(psm2_error_t errA, psm2_error_t errB); +int psmi_error_syslog_level(psm2_error_t error); + +#endif /* _PSMI_ERROR_H */ diff --git a/psm_gdrcpy.h b/psm_gdrcpy.h new file mode 100644 index 0000000..2773454 --- /dev/null +++ b/psm_gdrcpy.h @@ -0,0 +1,77 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2018 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2018 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2018 Intel Corporation. All rights reserved. */ +#ifndef GDR_CPY_H +#define GDR_CPY_H +#ifdef PSM_CUDA + +#include "ptl_ips/ips_proto.h" + +#define GDR_FD get_gdr_fd() + +int get_gdr_fd(); + +void hfi_gdr_open(); + +void hfi_gdr_close(); + +void * +gdr_convert_gpu_to_host_addr(int gdr_fd, unsigned long buf, + size_t size, int flags, + struct ips_proto* proto); + +uint64_t +gdr_cache_evict(); +#endif +#endif diff --git a/psm_hal_gen1/Makefile b/psm_hal_gen1/Makefile new file mode 100644 index 0000000..9a09d06 --- /dev/null +++ b/psm_hal_gen1/Makefile @@ -0,0 +1,79 @@ +# +# This file is provided under a dual BSD/GPLv2 license. When using or +# redistributing this file, you may do so under either license. +# +# GPL LICENSE SUMMARY +# +# Copyright(c) 2017 Intel Corporation. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# Contact Information: +# Intel Corporation, www.intel.com +# +# BSD LICENSE +# +# Copyright(c) 2017 Intel Corporation. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. 
+# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +OUTDIR = . + +this_srcdir = $(shell readlink -m .) +top_srcdir := $(this_srcdir)/.. + +INCLUDES += -I$(top_srcdir) -I$(top_srcdir)/ptl_ips + +${TARGLIB}-objs := psm_hal_gen1.o opa_service_gen1.o opa_utils_gen1.o \ + opa_proto_gen1.o opa_i2cflash_gen1.o psm_gdrcpy.o +${TARGLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${TARGLIB}-objs}) + +DEPS:= $(${TARGLIB}-objs:.o=.d) +-include $(DEPS) + +all: ${${TARGLIB}-objs} + +$(OUTDIR)/%.o: $(this_srcdir)/%.c + $(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) -MMD -c $< -o $@ + +clean: + @if [ -d $(OUTDIR) ]; then \ + cd $(OUTDIR); \ + rm -f *.o *.d *.gcda *.gcno; \ + cd -; \ + fi + +install: + @echo "Nothing to do for install." diff --git a/psm_hal_gen1/hfi1_deprecated_gen1.h b/psm_hal_gen1/hfi1_deprecated_gen1.h new file mode 100644 index 0000000..c325608 --- /dev/null +++ b/psm_hal_gen1/hfi1_deprecated_gen1.h @@ -0,0 +1,181 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/*
+
+   hfi1_deprecated_gen1.h
+
+   Contains certain features of the hfi1 module that have been deprecated.
+
+   These features may still need to be supported by the psm library for
+   reasons of backwards compatibility.
+ */
+
+#ifndef __HFI1_DEPRECATED_GEN1_H__
+
+#define __HFI1_DEPRECATED_GEN1_H__
+
+/* First, include the current hfi1_user.h file: */
+
+#include <rdma/hfi/hfi1_user.h>
+
+/* Determine if we need to define and declare deprecated
+   entities based on the IB_IOCTL_MAGIC macro. */
+
+#if defined( IB_IOCTL_MAGIC )
+
+/* The macro: PSM2_SUPPORT_IW_CMD_API is used to stipulate
+   adding compile-time support of either the ioctl() or write()
+   command interfaces to the driver.  Note though that the
+   final decision whether to support this depends on factors
+   only known at runtime. */
+#define PSM2_SUPPORT_IW_CMD_API 1
+/* IOCTL_CMD_API_MODULE_MAJOR defines the first version of the hfi1
+ * module that supports the ioctl() command interface.  Prior to this
+ * (IOCTL_CMD_API_MODULE_MAJOR - 1 and smaller), the module used
+ * write() for the command interface. */
+#define IOCTL_CMD_API_MODULE_MAJOR 6
+
+/*
+ * round robin contexts across HFIs, then
+ * ports; this is the default.
+ * This option spreads the HFI selection within the local socket.
+ * If it is preferred to spread the job over the entire set of
+ * HFIs within the system, see ALG_ACROSS_ALL below.
+ */
+#define HFI1_ALG_ACROSS_DEP 0
+
+/*
+ * use all contexts on an HFI (round robin
+ * active ports within), then next HFI
+ */
+#define HFI1_ALG_WITHIN_DEP 1
+
+struct hfi1_cmd_deprecated {
+	__u32 type;	/* command type */
+	__u32 len;	/* length of struct pointed to by addr */
+	__u64 addr;	/* pointer to user structure */
+};
+
+#define hfi1_cmd hfi1_cmd_deprecated
+
+#define HFI1_ALG_ACROSS HFI1_ALG_ACROSS_DEP
+#define HFI1_ALG_WITHIN HFI1_ALG_WITHIN_DEP
+
+#else
+
+#define HFI1_SWMAJOR_SHIFT 16
+
+#endif /* defined( IB_IOCTL_MAGIC )*/
+
+#define HFI1_ALG_ACROSS_ALL_DEP 2
+#define HFI1_ALG_ACROSS_ALL HFI1_ALG_ACROSS_ALL_DEP
+
+/* Note that struct hfi1_user_info_dep declaration is identical to
+   the struct hfi1_user_info declaration from MAJOR version 5 of the
+   hfi1_user.h file. */
+struct hfi1_user_info_dep {
+	/*
+	 * version of user software, to detect compatibility issues.
+	 * Should be set to HFI1_USER_SWVERSION.
+	 */
+	__u32 userversion;
+	__u16 pad;
+	/* HFI selection algorithm, if unit has not selected */
+	__u16 hfi1_alg;
+	/*
+	 * If two or more processes wish to share a context, each process
+	 * must set the subcontext_cnt and subcontext_id to the same
+	 * values.  The only restriction on the subcontext_id is that
+	 * it be unique for a given node.
+	 */
+	__u16 subctxt_cnt;
+	__u16 subctxt_id;
+	/* 128bit UUID passed in by PSM. 
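+	 * This is presumably the job key supplied through psm2_ep_open();
+	 * the driver compares it when deciding whether processes may
+	 * share a context.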
	 */
+	__u8 uuid[16];
+};
+
+/*
+ * We assume here that we have the hfi1_user.h file installed in the system path
+ * with the 'flags' field defined in struct sdma_req_info. (At least, when the
+ * user needs to run GPU workloads, this _should_ be the version of hfi1_user.h
+ * file installed by the IFS.)
+ */
+struct sdma_req_info_v6_3 {
+	/*
+	 * bits 0-3 - version (currently unused)
+	 * bits 4-7 - opcode (enum sdma_req_opcode)
+	 * bits 8-15 - io vector count
+	 */
+	__u16 ctrl;
+	/*
+	 * Number of fragments contained in this request.
+	 * User-space has already computed how many
+	 * fragment-sized packets the user buffer will be
+	 * split into.
+	 */
+	__u16 npkts;
+	/*
+	 * Size of each fragment the user buffer will be
+	 * split into.
+	 */
+	__u16 fragsize;
+	/*
+	 * Index of the slot in the SDMA completion ring
+	 * this request should be using. User-space is
+	 * in charge of managing its own ring.
+	 */
+	__u16 comp_idx;
+} __attribute__((packed));
+
+#endif /* #ifndef __HFI1_DEPRECATED_GEN1_H__ */
diff --git a/psm_hal_gen1/opa_common_gen1.h b/psm_hal_gen1/opa_common_gen1.h
new file mode 100644
index 0000000..1bc8f73
--- /dev/null
+++ b/psm_hal_gen1/opa_common_gen1.h
@@ -0,0 +1,62 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ + +#ifndef OPA_COMMON_GEN1_H +#define OPA_COMMON_GEN1_H + +#include +#include "hfi1_deprecated_gen1.h" + +#endif /* OPA_COMMON_GEN1_H */ diff --git a/psm_hal_gen1/opa_i2cflash_gen1.c b/psm_hal_gen1/opa_i2cflash_gen1.c new file mode 100644 index 0000000..427eecc --- /dev/null +++ b/psm_hal_gen1/opa_i2cflash_gen1.c @@ -0,0 +1,87 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "opa_user_gen1.h" + +uint8_t hfi_flash_csum(struct hfi_flash *ifp, int adjust) +{ + uint8_t *ip = (uint8_t *) ifp; + uint8_t csum = 0, len; + + /* + * Limit length checksummed to max length of actual data. + * Checksum of erased eeprom will still be bad, but we avoid + * reading past the end of the buffer we were passed. + */ + len = ifp->if_length; + if (len > sizeof(struct hfi_flash)) + len = sizeof(struct hfi_flash); + while (len--) + csum += *ip++; + csum -= ifp->if_csum; + csum = ~csum; + if (adjust) + ifp->if_csum = csum; + return csum; +} diff --git a/psm_hal_gen1/opa_proto_gen1.c b/psm_hal_gen1/opa_proto_gen1.c new file mode 100644 index 0000000..1f2b13e --- /dev/null +++ b/psm_hal_gen1/opa_proto_gen1.c @@ -0,0 +1,567 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. 
When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* This file contains the initialization functions used by the low
+   level hfi protocol code. */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "opa_user_gen1.h"
+#include "opa_udebug.h"
+
+#include
+
+#define ALIGN(x, a) (((x)+(a)-1)&~((a)-1))
+
+/* It is allowed to have multiple devices (and of different types)
+   simultaneously opened and initialized, although this (still! Oct 07)
+   is not implemented.  This routine is used by the low level hfi
+   protocol code (and any other code that has similar low level
+   functionality).
+   This is the only routine that takes a file descriptor, rather than a
+   struct _hfi_ctrl *.  The struct _hfi_ctrl * that is used for everything
+   else is returned as part of hfi1_base_info.
+*/
+struct _hfi_ctrl *hfi_userinit(int fd, struct hfi1_user_info_dep *uinfo)
+{
+	struct _hfi_ctrl *spctrl = NULL;
+	struct hfi1_ctxt_info *cinfo;
+	struct hfi1_base_info *binfo;
+	void *tmp;
+	uint64_t *tmp64;
+	struct hfi1_cmd c;
+	uintptr_t pg_mask;
+	int __hfi_pg_sz;
+#ifdef PSM2_SUPPORT_IW_CMD_API
+	/* for major version 6 of driver, we will use uinfo_new. See below for details. 
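+	   (uinfo_new is the current struct hfi1_user_info, which dropped the
+	   hfi1_alg selection field still carried by the _dep layout; the
+	   copy-in/copy-out below bridges the two layouts.)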
*/ + struct hfi1_user_info uinfo_new = {0}; +#endif + + /* First get the page size */ + __hfi_pg_sz = sysconf(_SC_PAGESIZE); + pg_mask = ~(intptr_t) (__hfi_pg_sz - 1); + + if (!(spctrl = calloc(1, sizeof(struct _hfi_ctrl)))) { + _HFI_INFO("can't allocate memory for hfi_ctrl: %s\n", + strerror(errno)); + goto err; + } + cinfo = &spctrl->ctxt_info; + binfo = &spctrl->base_info; + + _HFI_VDBG("uinfo: ver %x, alg %d, subc_cnt %d, subc_id %d\n", + uinfo->userversion, uinfo->hfi1_alg, + uinfo->subctxt_cnt, uinfo->subctxt_id); + + /* 1. ask driver to assign context to current process */ + memset(&c, 0, sizeof(struct hfi1_cmd)); + c.type = PSMI_HFI_CMD_ASSIGN_CTXT; + +#ifdef PSM2_SUPPORT_IW_CMD_API + /* If psm is communicating with a MAJOR version 6 driver, we need + to pass in an actual struct hfi1_user_info not a hfi1_user_info_dep. + Else if psm is communicating with a MAJOR version 5 driver, we can + just continue to pass a hfi1_user_info_dep as struct hfi1_user_info_dep + is identical to the MAJOR version 5 struct hfi1_user_info. */ + if (hfi_get_user_major_version() == IOCTL_CMD_API_MODULE_MAJOR) + { + /* If psm is communicating with a MAJOR version 6 driver, + we copy uinfo into uinfo_new and pass uinfo_new to the driver. */ + c.len = sizeof(uinfo_new); + c.addr = (__u64) (&uinfo_new); + + uinfo_new.userversion = uinfo->userversion; + uinfo_new.pad = uinfo->pad; + uinfo_new.subctxt_cnt = uinfo->subctxt_cnt; + uinfo_new.subctxt_id = uinfo->subctxt_id; + memcpy(uinfo_new.uuid,uinfo->uuid,sizeof(uinfo_new.uuid)); + } + else + { + /* If psm is working with an old driver, we continue to use + the struct hfi1_user_info_dep version of the struct: */ + c.len = sizeof(*uinfo); + c.addr = (__u64) uinfo; + } +#else + c.len = sizeof(*uinfo); + c.addr = (__u64) uinfo; +#endif + if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) { + if (errno == ENODEV) { + _HFI_INFO("PSM2 and driver version mismatch\n"); + /* Overwrite errno. One would wish that the driver + * didn't return ENODEV for a version mismatch */ + errno = EPROTONOSUPPORT; + } else { + _HFI_INFO("assign_context command failed: %s\n", + strerror(errno)); + } + goto err; + } + +#ifdef PSM2_SUPPORT_IW_CMD_API + if (hfi_get_user_major_version() == IOCTL_CMD_API_MODULE_MAJOR) + { + /* for the new driver, we copy the results of the call back to uinfo from + uinfo_new. */ + uinfo->userversion = uinfo_new.userversion; + uinfo->pad = uinfo_new.pad; + uinfo->subctxt_cnt = uinfo_new.subctxt_cnt; + uinfo->subctxt_id = uinfo_new.subctxt_id; + memcpy(uinfo->uuid,uinfo_new.uuid,sizeof(uinfo_new.uuid)); + } +#endif + + /* 2. get context info from driver */ + c.type = PSMI_HFI_CMD_CTXT_INFO; + c.len = sizeof(*cinfo); + c.addr = (__u64) cinfo; + + if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) { + _HFI_INFO("CTXT_INFO command failed: %s\n", strerror(errno)); + goto err; + } + + /* sanity checking... 
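+	   the driver-reported geometry must satisfy a few invariants: the
+	   tid counts are multiples of 8, the header queue count a multiple
+	   of 32, its entry size a multiple of 64 bytes, and the eager
+	   buffer size page-aligned; anything else suggests a PSM/driver
+	   mismatch.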
*/ + if (cinfo->rcvtids%8) { + _HFI_INFO("rcvtids not 8 multiple: %d\n", cinfo->rcvtids); + goto err; + } + if (cinfo->egrtids%8) { + _HFI_INFO("egrtids not 8 multiple: %d\n", cinfo->egrtids); + goto err; + } + if (cinfo->rcvtids < cinfo->egrtids) { + _HFI_INFO("rcvtids(%d) < egrtids(%d)\n", + cinfo->rcvtids, cinfo->egrtids); + goto err; + } + if (cinfo->rcvhdrq_cnt%32) { + _HFI_INFO("rcvhdrq_cnt not 32 multiple: %d\n", + cinfo->rcvhdrq_cnt); + goto err; + } + if (cinfo->rcvhdrq_entsize%64) { + _HFI_INFO("rcvhdrq_entsize not 64 multiple: %d\n", + cinfo->rcvhdrq_entsize); + goto err; + } + if (cinfo->rcvegr_size%__hfi_pg_sz) { + _HFI_INFO("rcvegr_size not page multiple: %d\n", + cinfo->rcvegr_size); + goto err; + } + + _HFI_VDBG("ctxtinfo: runtime_flags %llx, rcvegr_size %d\n", + cinfo->runtime_flags, cinfo->rcvegr_size); + _HFI_VDBG("ctxtinfo: active %d, unit %d, ctxt %d, subctxt %d\n", + cinfo->num_active, cinfo->unit, cinfo->ctxt, cinfo->subctxt); + _HFI_VDBG("ctxtinfo: rcvtids %d, credits %d\n", + cinfo->rcvtids, cinfo->credits); + _HFI_VDBG("ctxtinfo: numa %d, cpu %x, send_ctxt %d\n", + cinfo->numa_node, cinfo->rec_cpu, cinfo->send_ctxt); + _HFI_VDBG("ctxtinfo: rcvhdrq_cnt %d, rcvhdrq_entsize %d\n", + cinfo->rcvhdrq_cnt, cinfo->rcvhdrq_entsize); + _HFI_VDBG("ctxtinfo: egrtids %d, sdma_ring_size %d\n", + cinfo->egrtids, cinfo->sdma_ring_size); + + /* if affinity has not been setup, set it */ + if ((!getenv("HFI_NO_CPUAFFINITY") && cinfo->rec_cpu != (__u16) -1) || + getenv("HFI_FORCE_CPUAFFINITY")) { + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(cinfo->rec_cpu, &cpuset); + if (sched_setaffinity(0, sizeof(cpuset), &cpuset)) { + _HFI_INFO("Couldn't set runon processor %u " + "(unit:context %u:%u) (%u active chips): %s\n", + cinfo->rec_cpu, cinfo->unit, cinfo->ctxt, + cinfo->num_active, strerror(errno)); + } + } + + + /* 4. Get user base info from driver */ + c.type = PSMI_HFI_CMD_USER_INFO; + c.len = sizeof(*binfo); + c.addr = (__u64) binfo; + + if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) { + _HFI_INFO("BASE_INFO command failed: %s\n", strerror(errno)); + goto err; + } + + hfi_set_user_version(binfo->sw_version); + + _HFI_VDBG("baseinfo: hwver %x, swver %x, jkey %d, qp %d\n", + binfo->hw_version, binfo->sw_version, + binfo->jkey, binfo->bthqp); + _HFI_VDBG("baseinfo: credit_addr %llx, sop %llx, pio %llx\n", + binfo->sc_credits_addr, binfo->pio_bufbase_sop, + binfo->pio_bufbase); + _HFI_VDBG("baseinfo: hdrbase %llx, egrbase %llx, sdmabase %llx\n", + binfo->rcvhdr_bufbase, binfo->rcvegr_bufbase, + binfo->sdma_comp_bufbase); + _HFI_VDBG("baseinfo: ureg %llx, eventbase %llx, " + "statusbase %llx, tailaddr %llx\n", binfo->user_regbase, + binfo->events_bufbase, binfo->status_bufbase, + binfo->rcvhdrtail_base); + + /* + * Check if driver version matches PSM version, + * this is different from PSM API version. 
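+	 * The driver packs its major version in the high 16 bits of
+	 * sw_version and the minor in the low 16 bits. Only an older driver
+	 * major is fatal below; a newer major is assumed to stay compatible,
+	 * and a minor mismatch is merely logged.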
+ */ + if ((binfo->sw_version >> HFI1_SWMAJOR_SHIFT) != hfi_get_user_major_version()) { + _HFI_INFO + ("User major version 0x%x not same as driver major 0x%x\n", + hfi_get_user_major_version(), binfo->sw_version >> HFI1_SWMAJOR_SHIFT); + if ((binfo->sw_version >> HFI1_SWMAJOR_SHIFT) < hfi_get_user_major_version()) + goto err; /* else assume driver knows how to be compatible */ + } else if ((binfo->sw_version & 0xffff) != HFI1_USER_SWMINOR) { + _HFI_PRDBG + ("User minor version 0x%x not same as driver minor 0x%x\n", + HFI1_USER_SWMINOR, binfo->sw_version & 0xffff); + } + + /* Map the PIO credits address */ + if ((tmp = hfi_mmap64(0, __hfi_pg_sz, + PROT_READ, MAP_SHARED | MAP_LOCKED, fd, + (__off64_t) binfo->sc_credits_addr & + pg_mask)) == MAP_FAILED) { + _HFI_INFO("mmap of sc_credits_addr (%llx) failed: %s\n", + (unsigned long long)binfo->sc_credits_addr, + strerror(errno)); + goto err; + } else { + hfi_touch_mmap(tmp, __hfi_pg_sz); + binfo->sc_credits_addr = (uint64_t) (uintptr_t) tmp | + (binfo->sc_credits_addr & ~pg_mask); + _HFI_VDBG("sc_credits_addr %llx\n", + binfo->sc_credits_addr); + } + + /* Map the PIO buffer SOP address */ + if ((tmp = hfi_mmap64(0, cinfo->credits * 64, + PROT_WRITE, MAP_SHARED | MAP_LOCKED, fd, + (__off64_t) binfo->pio_bufbase_sop & pg_mask)) + == MAP_FAILED) { + _HFI_INFO("mmap of pio buffer sop at %llx failed: %s\n", + (unsigned long long)binfo->pio_bufbase_sop, + strerror(errno)); + goto err; + } else { + /* Do not try to read the PIO buffers; they are mapped write */ + /* only. We'll fault them in as we write to them. */ + binfo->pio_bufbase_sop = (uintptr_t) tmp; + _HFI_VDBG("pio_bufbase_sop %llx\n", + binfo->pio_bufbase_sop); + } + + /* Map the PIO buffer address */ + if ((tmp = hfi_mmap64(0, cinfo->credits * 64, + PROT_WRITE, MAP_SHARED | MAP_LOCKED, fd, + (__off64_t) binfo->pio_bufbase & pg_mask)) == + MAP_FAILED) { + _HFI_INFO("mmap of pio buffer at %llx failed: %s\n", + (unsigned long long)binfo->pio_bufbase, + strerror(errno)); + goto err; + } else { + /* Do not try to read the PIO buffers; they are mapped write */ + /* only. We'll fault them in as we write to them. 
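+		   (Both PIO ranges above are mapped with length
+		   cinfo->credits * 64; presumably one 64-byte send block
+		   per credit.)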
*/ + binfo->pio_bufbase = (uintptr_t) tmp; + _HFI_VDBG("sendpio_bufbase %llx\n", binfo->pio_bufbase); + } + + /* Map the receive header queue */ + if ((tmp = + hfi_mmap64(0, cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize, + PROT_READ, MAP_SHARED | MAP_LOCKED, fd, + (__off64_t) binfo->rcvhdr_bufbase & pg_mask)) == + MAP_FAILED) { + _HFI_INFO("mmap of rcvhdrq at %llx failed: %s\n", + (unsigned long long)binfo->rcvhdr_bufbase, + strerror(errno)); + goto err; + } else { + /* for use in protocol code */ + hfi_touch_mmap(tmp, + cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize); + binfo->rcvhdr_bufbase = (uintptr_t) tmp; /* set to mapped address */ + _HFI_VDBG("rcvhdr_bufbase %llx\n", binfo->rcvhdr_bufbase); + } + + /* Map the receive eager buffer */ + if ((tmp = + hfi_mmap64(0, cinfo->egrtids * cinfo->rcvegr_size, + PROT_READ, MAP_SHARED | MAP_LOCKED, fd, + (__off64_t) binfo->rcvegr_bufbase & pg_mask)) == + MAP_FAILED) { + _HFI_INFO("mmap of rcvegrq bufs from %llx failed: %s\n", + (unsigned long long)binfo->rcvegr_bufbase, + strerror(errno)); + goto err; + } else { + hfi_touch_mmap(tmp, cinfo->egrtids * cinfo->rcvegr_size); + binfo->rcvegr_bufbase = (uint64_t) (uintptr_t) tmp; + _HFI_VDBG("rcvegr_bufbase %llx\n", binfo->rcvegr_bufbase); + } + + /* Map the sdma completion queue */ + if (!(cinfo->runtime_flags & HFI1_CAP_SDMA)) { + binfo->sdma_comp_bufbase = 0; + } else + if ((tmp = + hfi_mmap64(0, cinfo->sdma_ring_size * + sizeof(struct hfi1_sdma_comp_entry), + PROT_READ, MAP_SHARED | MAP_LOCKED, fd, + (__off64_t) binfo->sdma_comp_bufbase & pg_mask)) == + MAP_FAILED) { + _HFI_INFO + ("mmap of sdma completion queue from %llx failed: %s\n", + (unsigned long long)binfo->sdma_comp_bufbase, + strerror(errno)); + goto err; + } else { + binfo->sdma_comp_bufbase = (uint64_t) (uintptr_t) tmp; + } + _HFI_VDBG("sdma_comp_bufbase %llx\n", binfo->sdma_comp_bufbase); + + /* Map RXE per-context CSRs */ + if ((tmp = hfi_mmap64(0, __hfi_pg_sz, + PROT_WRITE | PROT_READ, MAP_SHARED | MAP_LOCKED, + fd, + (__off64_t) binfo->user_regbase & pg_mask)) == + MAP_FAILED) { + _HFI_INFO("mmap of user registers at %llx failed: %s\n", + (unsigned long long)binfo->user_regbase, + strerror(errno)); + goto err; + } else { + /* we don't try to fault these in, no need */ + binfo->user_regbase = (uint64_t) (uintptr_t) tmp; + _HFI_VDBG("user_regbase %llx\n", binfo->user_regbase); + } + + /* + * Set up addresses for optimized register writeback routines. + * This is for the real onchip registers, shared context or not + */ + tmp64 = (uint64_t *) tmp; + spctrl->__hfi_rcvhdrtail = (volatile __le64 *)&tmp64[ur_rcvhdrtail]; + spctrl->__hfi_rcvhdrhead = (volatile __le64 *)&tmp64[ur_rcvhdrhead]; + spctrl->__hfi_rcvegrtail = + (volatile __le64 *)&tmp64[ur_rcvegrindextail]; + spctrl->__hfi_rcvegrhead = + (volatile __le64 *)&tmp64[ur_rcvegrindexhead]; + spctrl->__hfi_rcvofftail = + (volatile __le64 *)&tmp64[ur_rcvegroffsettail]; + + if (!(cinfo->runtime_flags & HFI1_CAP_HDRSUPP)) { + spctrl->__hfi_rcvtidflow = spctrl->regs; + spctrl->__hfi_tfvalid = 0; + } else { + spctrl->__hfi_rcvtidflow = + (volatile __le64 *)&tmp64[ur_rcvtidflowtable]; + spctrl->__hfi_tfvalid = 1; + } + + /* Map the rcvhdrq tail register address */ + if (!(cinfo->runtime_flags & HFI1_CAP_DMA_RTAIL)) { + /* + * We don't use receive header queue tail register to detect + * new packets, but here we save the address for + * false-eager-full recovery. 
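+		 * (Without DMA_RTAIL the tail is read out of the user
+		 *  register page mapped above rather than a DMA'd shadow
+		 *  copy, so no extra mmap is needed on this branch.)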
+ */ + binfo->rcvhdrtail_base = + (uint64_t) (uintptr_t) spctrl->__hfi_rcvhdrtail; + spctrl->__hfi_rcvtail = (__le64 *) binfo->rcvhdrtail_base; + } else + if ((tmp = hfi_mmap64(0, __hfi_pg_sz, + PROT_READ, MAP_SHARED | MAP_LOCKED, fd, + (__off64_t) binfo->rcvhdrtail_base & + pg_mask)) == MAP_FAILED) { + _HFI_INFO("mmap of rcvhdrq tail addr %llx failed: %s\n", + (unsigned long long)binfo->rcvhdrtail_base, + strerror(errno)); + goto err; + } else { + hfi_touch_mmap(tmp, __hfi_pg_sz); + binfo->rcvhdrtail_base = (uint64_t) (uintptr_t) tmp; + spctrl->__hfi_rcvtail = (__le64 *) binfo->rcvhdrtail_base; + } + _HFI_VDBG("rcvhdr_tail_addr %llx\n", binfo->rcvhdrtail_base); + + /* Map the event page */ + if ((tmp = hfi_mmap64(0, __hfi_pg_sz, + PROT_READ, MAP_SHARED | MAP_LOCKED, fd, + (__off64_t) binfo->events_bufbase & pg_mask)) == + MAP_FAILED) { + _HFI_INFO("mmap of status page at %llx failed: %s\n", + (unsigned long long)binfo->events_bufbase, + strerror(errno)); + goto err; + } else { + binfo->events_bufbase = (uint64_t) (uintptr_t) tmp | + (binfo->events_bufbase & ~pg_mask); + _HFI_VDBG("events_bufbase %llx\n", binfo->events_bufbase); + } + + /* Map the status page */ + if ((tmp = hfi_mmap64(0, __hfi_pg_sz, + PROT_READ, MAP_SHARED | MAP_LOCKED, fd, + (__off64_t) binfo->status_bufbase & pg_mask)) == + MAP_FAILED) { + _HFI_INFO("mmap of status page (%llx) failed: %s\n", + (unsigned long long)binfo->status_bufbase, + strerror(errno)); + goto err; + } else { + binfo->status_bufbase = (uintptr_t) tmp; + _HFI_VDBG("status_bufbase %llx\n", binfo->status_bufbase); + } + + /* If subcontext is used, map the buffers */ + if (uinfo->subctxt_cnt) { + unsigned num_subcontexts = uinfo->subctxt_cnt; + size_t size; + + size = __hfi_pg_sz; + if ((tmp = hfi_mmap64(0, size, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_LOCKED, fd, + (__off64_t) binfo->subctxt_uregbase & + pg_mask)) == MAP_FAILED) { + _HFI_INFO + ("mmap of subcontext uregbase array (%llx) failed: %s\n", + (unsigned long long)binfo->subctxt_uregbase, + strerror(errno)); + goto err; + } else { + hfi_touch_mmap(tmp, size); + binfo->subctxt_uregbase = (uint64_t) (uintptr_t) tmp; + _HFI_VDBG("subctxt_uregbase %llx\n", + binfo->subctxt_uregbase); + } + + size = ALIGN(cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize, + __hfi_pg_sz) * num_subcontexts; + if ((tmp = hfi_mmap64(0, size, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_LOCKED, fd, + (__off64_t) binfo->subctxt_rcvhdrbuf & + pg_mask)) == MAP_FAILED) { + _HFI_INFO + ("mmap of subcontext rcvhdr_base array (%llx) failed: %s\n", + (unsigned long long)binfo->subctxt_rcvhdrbuf, + strerror(errno)); + goto err; + } else { + hfi_touch_mmap(tmp, size); + binfo->subctxt_rcvhdrbuf = (uint64_t) (uintptr_t) tmp; + _HFI_VDBG("subctxt_rcvhdrbuf %llx\n", + binfo->subctxt_rcvhdrbuf); + } + + size = ALIGN(cinfo->egrtids * cinfo->rcvegr_size, + __hfi_pg_sz) * num_subcontexts; + if ((tmp = hfi_mmap64(0, size, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_LOCKED, fd, + (__off64_t) binfo->subctxt_rcvegrbuf & + pg_mask)) == MAP_FAILED) { + _HFI_INFO + ("mmap of subcontext rcvegrbuf array (%llx) failed: %s\n", + (unsigned long long)binfo->subctxt_rcvegrbuf, + strerror(errno)); + goto err; + } else { + hfi_touch_mmap(tmp, size); + binfo->subctxt_rcvegrbuf = (uint64_t) (uintptr_t) tmp; + _HFI_VDBG("subctxt_rcvegrbuf %llx\n", + binfo->subctxt_rcvegrbuf); + } + } + + /* Save some info. 
	 */
+	spctrl->fd = fd;
+	spctrl->__hfi_unit = cinfo->unit;
+	/*
+	 * The driver should provide the port that the context is opened on,
+	 * but the OPA driver does not have a port interface to psm because
+	 * there is only one port. So we hardcode the port to 1 here. When we
+	 * work on the version of PSM for the successor to OPA, the port
+	 * should be returned from the driver and set accordingly.
+	 */
+	/* spctrl->__hfi_port = cinfo->port; */
+	spctrl->__hfi_port = 1;
+	spctrl->__hfi_tidegrcnt = cinfo->egrtids;
+	spctrl->__hfi_tidexpcnt = cinfo->rcvtids - cinfo->egrtids;
+
+	return spctrl;
+
+err:
+	if (spctrl)
+		free(spctrl);
+	return NULL;
+}
diff --git a/psm_hal_gen1/opa_service_gen1.c b/psm_hal_gen1/opa_service_gen1.c
new file mode 100644
index 0000000..e4719e3
--- /dev/null
+++ b/psm_hal_gen1/opa_service_gen1.c
@@ -0,0 +1,859 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2018 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2018 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* This file contains the hfi service routine interface used by the low
+   level hfi protocol code. 
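+   Device discovery and open, write()/ioctl() command dispatch, and
+   user/driver version negotiation all live here.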
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "opa_service_gen1.h" +#include "psmi_wrappers.h" + +typedef union +{ + struct + { + uint16_t minor; + uint16_t major; + }; + uint32_t version; +} sw_version_t; + +static sw_version_t sw_version = +{ + { + .major = HFI1_USER_SWMAJOR, + .minor = HFI1_USER_SWMINOR + } +}; + +/* + * This function is necessary in a udev-based world. There can be an + * arbitrarily long (but typically less than one second) delay between + * a driver getting loaded and any dynamic special files turning up. + * + * The timeout is in milliseconds. A value of zero means "callee + * decides timeout". Negative is infinite. + * + * Returns 0 on success, -1 on error or timeout. Check errno to see + * whether there was a timeout (ETIMEDOUT) or an error (any other + * non-zero value). + */ +int hfi_wait_for_device(const char *path, long timeout) +{ + int saved_errno; + struct stat st; + long elapsed; + int ret; + + if (timeout == 0) + timeout = 15000; + + elapsed = 0; + + while (1) { + static const long default_ms = 250; + struct timespec req = { 0 }; + long ms; + + ret = stat(path, &st); + saved_errno = errno; + + if (ret == 0 || (ret == -1 && errno != ENOENT)) + break; + + if ((timeout > 0) && ((timeout - elapsed) <= 0)) { + saved_errno = ETIMEDOUT; + break; + } + + if (elapsed == 0) { + if (timeout < 0) + _HFI_DBG + ("Device file %s not present on first check; " + "waiting indefinitely...\n", path); + else + _HFI_DBG + ("Device file %s not present on first check; " + "waiting up to %.1f seconds...\n", path, + timeout / 1e3); + } + + if (timeout < 0 || timeout - elapsed >= default_ms) + ms = default_ms; + else + ms = timeout; + + elapsed += ms; + req.tv_nsec = ms * 1000000; + + ret = nanosleep(&req, NULL); + saved_errno = errno; + + if (ret == -1) + break; + } + + if (ret == 0) + _HFI_DBG("Found %s after %.1f seconds\n", path, elapsed / 1e3); + else + _HFI_INFO + ("The %s device failed to appear after %.1f seconds: %s\n", + path, elapsed / 1e3, strerror(saved_errno)); + + errno = saved_errno; + return ret; +} + +/* fwd declaration */ +ustatic int _hfi_cmd_write(int fd, struct hfi1_cmd *cmd, size_t count); + +#ifdef PSM2_SUPPORT_IW_CMD_API + +/* fwd declaration */ +ustatic int _hfi_cmd_ioctl(int fd, struct hfi1_cmd *cmd, size_t count); + +/* Function pointer. */ +static int (*_hfi_cmd_send)(int fd, struct hfi1_cmd *cmd, size_t count) = _hfi_cmd_ioctl; + +#else +/* Function pointer. 
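+   In builds without ioctl() support this pointer is const and always
+   targets _hfi_cmd_write.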
 */
+static int (*const _hfi_cmd_send)(int fd, struct hfi1_cmd *cmd, size_t count) = _hfi_cmd_write;
+#endif
+
+uint16_t hfi_get_user_major_version(void)
+{
+	return sw_version.major;
+}
+
+void hfi_set_user_major_version(uint16_t major_version)
+{
+	sw_version.major = major_version;
+}
+
+uint16_t hfi_get_user_minor_version(void)
+{
+	return sw_version.minor;
+}
+
+void hfi_set_user_version(uint32_t version)
+{
+	sw_version.version = version;
+}
+
+int hfi_context_open(int unit, int port, uint64_t open_timeout)
+{
+	char dev_name_ignored[256];
+
+	return hfi_context_open_ex(unit, port, open_timeout,
+				   dev_name_ignored, sizeof(dev_name_ignored));
+}
+
+int hfi_context_open_ex(int unit, int port, uint64_t open_timeout,
+			char *dev_name, size_t dev_name_len)
+{
+	int fd;
+
+	if (unit != HFI_UNIT_ID_ANY && unit >= 0)
+		snprintf(dev_name, dev_name_len, "%s_%u", HFI_DEVICE_PATH_GEN1,
+			 unit);
+	else
+		snprintf(dev_name, dev_name_len, "%s_%u", HFI_DEVICE_PATH_GEN1,
+			 0);
+
+	if (hfi_wait_for_device(dev_name, (long)open_timeout) == -1) {
+		_HFI_DBG("Could not find an HFI Unit on device "
+			 "%s (%lds elapsed)", dev_name,
+			 (long)open_timeout / 1000);
+		return -1;
+	}
+
+	if ((fd = open(dev_name, O_RDWR)) == -1) {
+		_HFI_DBG("Can't open %s for reading and writing",
+			 dev_name);
+		return -1;
+	}
+
+	if (fcntl(fd, F_SETFD, FD_CLOEXEC))
+		_HFI_INFO("Failed to set close on exec for device: %s\n",
+			  strerror(errno));
+
+#ifdef PSM2_SUPPORT_IW_CMD_API
+	{
+		/* if hfi1DriverMajor == -1, then we are potentially talking to a new driver.
+		   Let's confirm by issuing an ioctl version request: */
+		struct hfi1_cmd c;
+
+		memset(&c, 0, sizeof(struct hfi1_cmd));
+		c.type = PSMI_HFI_CMD_GET_VERS;
+		c.len  = 0;
+		c.addr = 0;
+
+		if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) {
+			/* Let's assume that the driver is the old driver */
+			hfi_set_user_major_version(IOCTL_CMD_API_MODULE_MAJOR - 1);
+			/* the old driver uses write() for its command interface: */
+			_hfi_cmd_send = _hfi_cmd_write;
+		}
+		else
+		{
+			int major = c.addr >> HFI1_SWMAJOR_SHIFT;
+			if (major != hfi_get_user_major_version()) {
+				/* If there is a skew between the major version of the driver
+				   that is executing and the major version which was used during
+				   compilation of PSM, we treat that as a fatal error. */
+				_HFI_INFO("PSM2 and driver version mismatch: (%d != %d)\n",
+					  major, hfi_get_user_major_version());
+				close(fd);
+				return -1;
+			}
+		}
+	}
+
+#endif
+	return fd;
+}
+
+/*
+ * Check whether a message whose size is not a multiple of a double word may
+ * be passed to the driver for SDMA. Starting with driver version 6.2, PSM
+ * is able to pass the driver SDMA messages whose size is not a multiple of
+ * a double word. 
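+ * The check below is a lexicographic (major, minor) comparison against the
+ * HFI1_USER_SWMAJOR/SWMINOR_NON_DW_MUL_MSG_SIZE_ALLOWED constants, which
+ * presumably encode that 6.2 threshold.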
+ */ +uint32_t hfi_check_non_dw_mul_sdma(void) +{ + uint16_t major = hfi_get_user_major_version(); + uint16_t minor = hfi_get_user_minor_version(); + + if ((major > HFI1_USER_SWMAJOR_NON_DW_MUL_MSG_SIZE_ALLOWED) || + ((major == HFI1_USER_SWMAJOR_NON_DW_MUL_MSG_SIZE_ALLOWED) && + (minor >= HFI1_USER_SWMINOR_NON_DW_MUL_MSG_SIZE_ALLOWED))) + return 1; + + return 0; +} + +void hfi_context_close(int fd) +{ + (void)close(fd); +} + +int hfi_cmd_writev(int fd, const struct iovec *iov, int iovcnt) +{ + return writev(fd, iov, iovcnt); +} + +int hfi_cmd_write(int fd, struct hfi1_cmd *cmd, size_t count) +{ + return _hfi_cmd_send(fd, cmd, count); +} + +ustatic +int _hfi_cmd_write(int fd, struct hfi1_cmd *cmd, size_t count) +{ + const static unsigned int cmdTypeToWriteNum[PSMI_HFI_CMD_LAST] = { + [PSMI_HFI_CMD_ASSIGN_CTXT] = LEGACY_HFI1_CMD_ASSIGN_CTXT, + [PSMI_HFI_CMD_CTXT_INFO] = LEGACY_HFI1_CMD_CTXT_INFO, + [PSMI_HFI_CMD_USER_INFO] = LEGACY_HFI1_CMD_USER_INFO, + [PSMI_HFI_CMD_TID_UPDATE] = LEGACY_HFI1_CMD_TID_UPDATE, + [PSMI_HFI_CMD_TID_FREE] = LEGACY_HFI1_CMD_TID_FREE, + [PSMI_HFI_CMD_CREDIT_UPD] = LEGACY_HFI1_CMD_CREDIT_UPD, + [PSMI_HFI_CMD_RECV_CTRL] = LEGACY_HFI1_CMD_RECV_CTRL, + [PSMI_HFI_CMD_POLL_TYPE] = LEGACY_HFI1_CMD_POLL_TYPE, + [PSMI_HFI_CMD_ACK_EVENT] = LEGACY_HFI1_CMD_ACK_EVENT, + [PSMI_HFI_CMD_SET_PKEY] = LEGACY_HFI1_CMD_SET_PKEY, + [PSMI_HFI_CMD_CTXT_RESET] = LEGACY_HFI1_CMD_CTXT_RESET, + [PSMI_HFI_CMD_TID_INVAL_READ] = LEGACY_HFI1_CMD_TID_INVAL_READ, + [PSMI_HFI_CMD_GET_VERS] = LEGACY_HFI1_CMD_GET_VERS, + }; + + if (cmd->type < PSMI_HFI_CMD_LAST) { + cmd->type = cmdTypeToWriteNum[cmd->type]; + + return psmi_write(fd, cmd, count); + } else { + errno = EINVAL; + return -1; + } +} + +#ifdef PSM2_SUPPORT_IW_CMD_API +ustatic +int _hfi_cmd_ioctl(int fd, struct hfi1_cmd *cmd, size_t count) +{ + uint64_t addrOrLiteral[2] = { (uint64_t)cmd->addr, (uint64_t)&cmd->addr }; + const static struct + { + unsigned int ioctlCmd; + unsigned int addrOrLiteralIdx; + } cmdTypeToIoctlNum[PSMI_HFI_CMD_LAST] = { + [PSMI_HFI_CMD_ASSIGN_CTXT] = {HFI1_IOCTL_ASSIGN_CTXT , 0}, + [PSMI_HFI_CMD_CTXT_INFO] = {HFI1_IOCTL_CTXT_INFO , 0}, + [PSMI_HFI_CMD_USER_INFO] = {HFI1_IOCTL_USER_INFO , 0}, + [PSMI_HFI_CMD_TID_UPDATE] = {HFI1_IOCTL_TID_UPDATE , 0}, + [PSMI_HFI_CMD_TID_FREE] = {HFI1_IOCTL_TID_FREE , 0}, + [PSMI_HFI_CMD_CREDIT_UPD] = {HFI1_IOCTL_CREDIT_UPD , 1}, + [PSMI_HFI_CMD_RECV_CTRL] = {HFI1_IOCTL_RECV_CTRL , 1}, + [PSMI_HFI_CMD_POLL_TYPE] = {HFI1_IOCTL_POLL_TYPE , 1}, + [PSMI_HFI_CMD_ACK_EVENT] = {HFI1_IOCTL_ACK_EVENT , 1}, + [PSMI_HFI_CMD_SET_PKEY] = {HFI1_IOCTL_SET_PKEY , 1}, + [PSMI_HFI_CMD_CTXT_RESET] = {HFI1_IOCTL_CTXT_RESET , 1}, + [PSMI_HFI_CMD_TID_INVAL_READ] = {HFI1_IOCTL_TID_INVAL_READ, 0}, + [PSMI_HFI_CMD_GET_VERS] = {HFI1_IOCTL_GET_VERS , 1}, +#ifdef PSM_CUDA + [PSMI_HFI_CMD_TID_UPDATE_V2] = {HFI1_IOCTL_TID_UPDATE_V2 , 0}, +#endif + }; + + if (cmd->type < PSMI_HFI_CMD_LAST) + return psmi_ioctl(fd, + cmdTypeToIoctlNum[cmd->type].ioctlCmd, + addrOrLiteral[cmdTypeToIoctlNum[cmd->type].addrOrLiteralIdx]); + else + { + errno = EINVAL; + return -1; + } +} +#endif /* #ifdef PSM2_SUPPORT_IW_CMD_API */ + +/* we use mmap64() because we compile in both 32 and 64 bit mode, + and we have to map physical addresses that are > 32 bits long. + While linux implements mmap64, it doesn't have a man page, + and isn't declared in any header file, so we declare it here ourselves. 
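+   (The matching extern declaration is kept in opa_service_gen1.h.)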
+ + We'd like to just use -D_LARGEFILE64_SOURCE, to make off_t 64 bits and + redirects mmap to mmap64 for us, but at least through suse10 and fc4, + it doesn't work when the address being mapped is > 32 bits. It chips + off bits 32 and above. So we stay with mmap64. */ +void *hfi_mmap64(void *addr, size_t length, int prot, int flags, int fd, + __off64_t offset) +{ + return mmap64(addr, length, prot, flags, fd, offset); +} + +/* get the number of units supported by the driver. Does not guarantee */ +/* that a working chip has been found for each possible unit #. */ +/* number of units >=0 (0 means none found). */ +/* formerly used sysfs file "num_units" */ +int hfi_get_num_units(int wait) +{ + int ret; + + for (ret = 0;; ret++) { + char pathname[PATH_MAX]; + struct stat st; + int r; + + snprintf(pathname, sizeof(pathname), HFI_DEVICE_PATH_GEN1 "_%d", ret); + if (wait && (ret == 0)) + /* We only wait for the first device to come up. Not + on subsequent devices in order to save time. */ + r = hfi_wait_for_device(pathname, 0); + else + r = stat(pathname, &st); + if (!r) + continue; + else + break; + } + + return ret; +} + +/* Given a unit number, returns 1 if any port on the unit is active. + returns 0 if no port on the unit is active. + returns -1 when an error occurred. */ +int hfi_get_unit_active(int unit) +{ + int p,rv; + + for (p = HFI_MIN_PORT; p <= HFI_MAX_PORT; p++) + if ((rv=hfi_get_port_lid(unit, p)) > 0) + break; + + if (p <= HFI_MAX_PORT) + { + return 1; + } + + return rv; +} + +/* get the number of contexts from the unit id. */ +/* Returns 0 if no unit or no match. */ +int hfi_get_num_contexts(int unit_id, int wait) +{ + int n = 0; + int units; + int64_t val; + uint32_t p = HFI_MIN_PORT; + + units = hfi_get_num_units(wait); + + if_pf(units <= 0) + return 0; + + if (unit_id == HFI_UNIT_ID_ANY) { + uint32_t u; + + for (u = 0; u < units; u++) { + for (p = HFI_MIN_PORT; p <= HFI_MAX_PORT; p++) + if (hfi_get_port_lid(u, p) > 0) + break; + + if (p <= HFI_MAX_PORT && + !hfi_sysfs_unit_read_s64(u, "nctxts", &val, 0)) + n += (uint32_t) val; + } + } else { + for (; p <= HFI_MAX_PORT; p++) + if (hfi_get_port_lid(unit_id, p) > 0) + break; + + if (p <= HFI_MAX_PORT && + !hfi_sysfs_unit_read_s64(unit_id, "nctxts", &val, 0)) + n += (uint32_t) val; + } + + return n; +} + +/* Given a unit number and port number, returns 1 if the unit and port are active. + returns 0 if the unit and port are not active. + returns -1 when an error occurred. */ +int hfi_get_port_active(int unit, int port) +{ + int ret; + char *state; + + ret = hfi_sysfs_port_read(unit, port, "phys_state", &state); + if (ret == -1) { + if (errno == ENODEV) + /* this is "normal" for port != 1, on single port chips */ + _HFI_VDBG + ("Failed to get phys_state for unit %u:%u: %s\n", + unit, port, strerror(errno)); + else + _HFI_DBG + ("Failed to get phys_state for unit %u:%u: %s\n", + unit, port, strerror(errno)); + return -1; + } else { + if (strncmp(state, "5: LinkUp", 9)) { + _HFI_DBG("Link is not Up for unit %u:%u\n", unit, port); + free(state); + return 0; + } + free(state); + return 1; + } +} + +/* Given the unit number, return an error, or the corresponding LID + For now, it's used only so the MPI code can determine it's own + LID, and which other LIDs (if any) are also assigned to this node + Returns an int, so -1 indicates an error. 0 may indicate that + the unit is valid, but no LID has been assigned. 
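+   -2 means the port is not in the active state (see hfi_get_port_active).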
+ No error print because we call this for both potential + ports without knowing if both ports exist (or are connected) */ +int hfi_get_port_lid(int unit, int port) +{ + int ret; + int64_t val; + + if (hfi_get_port_active(unit,port) != 1) + return -2; + ret = hfi_sysfs_port_read_s64(unit, port, "lid", &val, 0); + _HFI_VDBG("hfi_get_port_lid: ret %d, unit %d port %d\n", ret, unit, + port); + + if (ret == -1) { + if (errno == ENODEV) + /* this is "normal" for port != 1, on single port chips */ + _HFI_VDBG("Failed to get LID for unit %u:%u: %s\n", + unit, port, strerror(errno)); + else + _HFI_DBG("Failed to get LID for unit %u:%u: %s\n", + unit, port, strerror(errno)); + } else { + ret = val; + +/* disable this feature since we don't have a way to provide + file descriptor in multiple context case. */ +#if 0 + if (getenv("HFI_DIAG_LID_LOOP")) { + /* provides diagnostic ability to run MPI, etc. even */ + /* on loopback, by claiming a different LID for each context */ + struct hfi1_ctxt_info info; + struct hfi1_cmd cmd; + cmd.type = PSMI_HFI_CMD_CTXT_INFO; + cmd.cmd.ctxt_info = (uintptr_t) &info; + if (__hfi_lastfd == -1) + _HFI_INFO + ("Can't run CONTEXT_INFO for lid_loop, fd not set\n"); + else if (write(__hfi_lastfd, &cmd, sizeof(cmd)) == -1) + _HFI_INFO("CONTEXT_INFO command failed: %s\n", + strerror(errno)); + else if (!info.context) + _HFI_INFO("CONTEXT_INFO returned context 0!\n"); + else { + _HFI_PRDBG + ("Using lid 0x%x, base %x, context %x\n", + ret + info.context, ret, info.context); + ret += info.context; + } + } +#endif + } + + return ret; +} + +/* Given the unit number, return an error, or the corresponding GID + For now, it's used only so the MPI code can determine its fabric ID. + Returns an int, so -1 indicates an error. + No error print because we call this for both potential + ports without knowing if both ports exist (or are connected) */ +int hfi_get_port_gid(int unit, int port, uint64_t *hi, uint64_t *lo) +{ + int ret; + char *gid_str = NULL; + + ret = hfi_sysfs_port_read(unit, port, "gids/0", &gid_str); + + if (ret == -1) { + if (errno == ENODEV) + /* this is "normal" for port != 1, on single + * port chips */ + _HFI_VDBG("Failed to get GID for unit %u:%u: %s\n", + unit, port, strerror(errno)); + else + _HFI_DBG("Failed to get GID for unit %u:%u: %s\n", + unit, port, strerror(errno)); + } else { + uint32_t gid[8]; + if (sscanf(gid_str, "%4x:%4x:%4x:%4x:%4x:%4x:%4x:%4x", + &gid[0], &gid[1], &gid[2], &gid[3], + &gid[4], &gid[5], &gid[6], &gid[7]) != 8) { + _HFI_DBG("Failed to parse GID for unit %u:%u: %s\n", + unit, port, gid_str); + ret = -1; + } else { + *hi = (((uint64_t) gid[0]) << 48) | (((uint64_t) gid[1]) + << 32) | + (((uint64_t) + gid[2]) << 16) | (((uint64_t) gid[3]) << 0); + *lo = (((uint64_t) gid[4]) << 48) | (((uint64_t) gid[5]) + << 32) | + (((uint64_t) + gid[6]) << 16) | (((uint64_t) gid[7]) << 0); + } + free(gid_str); + } + + return ret; +} + +/* Given the unit number, return an error, or the corresponding LMC value + for the port */ +/* Returns an int, so -1 indicates an error. 0 */ +int hfi_get_port_lmc(int unit, int port) +{ + int ret; + int64_t val; + + ret = hfi_sysfs_port_read_s64(unit, port, "lid_mask_count", &val, 0); + + if (ret == -1) { + _HFI_INFO("Failed to get LMC for unit %u:%u: %s\n", + unit, port, strerror(errno)); + } else + ret = val; + + return ret; +} + +/* Given the unit number, return an error, or the corresponding link rate + for the port */ +/* Returns an int, so -1 indicates an error. 
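+
+   A hypothetical scan (not part of this file; assumes int u, rate):
+
+	for (u = 0; u < hfi_get_num_units(0); u++)
+		if (hfi_get_unit_active(u) == 1)
+			rate = hfi_get_port_rate(u, HFI_MIN_PORT);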
*/ +int hfi_get_port_rate(int unit, int port) +{ + int ret; + double rate; + char *data_rate = NULL, *newptr; + + ret = hfi_sysfs_port_read(unit, port, "rate", &data_rate); + if (ret == -1) + goto get_port_rate_error; + else { + rate = strtod(data_rate, &newptr); + if ((rate == 0) && (data_rate == newptr)) + goto get_port_rate_error; + } + + free(data_rate); + return ((int)(rate * 2) >> 1); + +get_port_rate_error: + _HFI_INFO("Failed to get link rate for unit %u:%u: %s\n", + unit, port, strerror(errno)); + + return ret; +} + +/* Given a unit, port and SL, return an error, or the corresponding SC for the + SL as programmed by the SM */ +/* Returns an int, so -1 indicates an error. */ +int hfi_get_port_sl2sc(int unit, int port, int sl) +{ + int ret; + int64_t val; + char sl2scpath[16]; + + snprintf(sl2scpath, sizeof(sl2scpath), "sl2sc/%d", sl); + ret = hfi_sysfs_port_read_s64(unit, port, sl2scpath, &val, 0); + + if (ret == -1) { + _HFI_DBG + ("Failed to get SL2SC mapping for SL %d unit %u:%u: %s\n", + sl, unit, port, strerror(errno)); + } else + ret = val; + + return ret; +} + +/* Given a unit, port and SC, return an error, or the corresponding VL for the + SC as programmed by the SM */ +/* Returns an int, so -1 indicates an error. */ +int hfi_get_port_sc2vl(int unit, int port, int sc) +{ + int ret; + int64_t val; + char sc2vlpath[16]; + + snprintf(sc2vlpath, sizeof(sc2vlpath), "sc2vl/%d", sc); + ret = hfi_sysfs_port_read_s64(unit, port, sc2vlpath, &val, 0); + + if (ret == -1) { + _HFI_DBG + ("Failed to get SC2VL mapping for SC %d unit %u:%u: %s\n", + sc, unit, port, strerror(errno)); + } else + ret = val; + + return ret; +} + +/* Given a unit, port and VL, return an error, or the corresponding MTU for the + VL as programmed by the SM */ +/* Returns an int, so -1 indicates an error. */ +int hfi_get_port_vl2mtu(int unit, int port, int vl) +{ + int ret; + int64_t val; + char vl2mtupath[16]; + + snprintf(vl2mtupath, sizeof(vl2mtupath), "vl2mtu/%d", vl); + ret = hfi_sysfs_port_read_s64(unit, port, vl2mtupath, &val, 0); + + if (ret == -1) { + _HFI_DBG + ("Failed to get VL2MTU mapping for VL %d unit %u:%u: %s\n", + vl, unit, port, strerror(errno)); + } else + ret = val; + + return ret; +} + +/* Given a unit, port and index, return an error, or the corresponding pkey + value for the index as programmed by the SM */ +/* Returns an int, so -1 indicates an error. */ +int hfi_get_port_index2pkey(int unit, int port, int index) +{ + int ret; + int64_t val; + char index2pkeypath[16]; + + snprintf(index2pkeypath, sizeof(index2pkeypath), "pkeys/%d", index); + ret = hfi_sysfs_port_read_s64(unit, port, index2pkeypath, &val, 0); + + if (ret == -1) { + _HFI_DBG + ("Failed to get index2pkey mapping for index %d unit %u:%u: %s\n", + index, unit, port, strerror(errno)); + } else + ret = val; + + return ret; +} + +int hfi_get_cc_settings_bin(int unit, int port, char *ccabuf, size_t len_ccabuf) +{ + int fd; + + /* + * 4 bytes for 'control map' + * 2 bytes 'port control' + * 32 (#SLs) * 6 bytes 'congestion setting' (per-SL) + */ + const size_t count = 4 + 2 + (32 * 6); + + if (count > len_ccabuf) + return -2; +/* + * Check qib driver CCA setting, and try to use it if available. + * Fall to self CCA setting if errors. + */ + if (snprintf(ccabuf, len_ccabuf, "%s%d/ports/%d/CCMgtA/cc_settings_bin", + hfi_sysfs_path(), unit, port) >= (len_ccabuf-1)) + return -1; + + fd = open(ccabuf, O_RDONLY); + if (fd < 0) { + return 0; + } + + if (read(fd, ccabuf, count) != count) { + _HFI_CCADBG("Read cc_settings_bin failed. 
using static CCA\n"); + close(fd); + return 0; + } + + close(fd); + + return 1; +} + +int hfi_get_cc_table_bin(int unit, int port, uint16_t **cctp) +{ + int i; + unsigned short ccti_limit; + uint16_t *cct; + int fd; + char pathname[256]; + *cctp = NULL; + + if (snprintf(pathname,sizeof(pathname), "%s%d/ports/%d/CCMgtA/cc_table_bin", + hfi_sysfs_path(), unit, port) >= (sizeof(pathname)-1)) + return -1; + + fd = open(pathname, O_RDONLY); + if (fd < 0) { + _HFI_CCADBG("Open cc_table_bin failed. using static CCA\n"); + return 0; + } + if (read(fd, &ccti_limit, sizeof(ccti_limit)) != sizeof(ccti_limit)) { + _HFI_CCADBG("Read ccti_limit failed. using static CCA\n"); + close(fd); + return 0; + } + + _HFI_CCADBG("ccti_limit = %d\n", ccti_limit); + + if (ccti_limit < 63) { + _HFI_CCADBG("Read ccti_limit %d not in range [63, 65535], " + "using static CCA.\n", ccti_limit); + close(fd); + return 0; + } + + i = (ccti_limit + 1) * sizeof(uint16_t); + cct = malloc(i); + if (!cct) { + close(fd); + return -1; + } + if (read(fd, cct, i) != i) { + _HFI_CCADBG("Read ccti_entry_list, using static CCA\n"); + free(cct); + close(fd); + return 0; + } + + close(fd); + + _HFI_CCADBG("cct[0] = 0x%04x\n", cct[0]); + + *cctp = cct; + return ccti_limit; +} + +/* + * This is for diag function hfi_wait_for_packet() only + */ +int hfi_cmd_wait_for_packet(int fd) +{ + int ret; + struct pollfd pfd; + + pfd.fd = fd; + pfd.events = POLLIN; + + ret = poll(&pfd, 1, 500 /* ms */); + + return ret; +} diff --git a/psm_hal_gen1/opa_service_gen1.h b/psm_hal_gen1/opa_service_gen1.h new file mode 100644 index 0000000..9bce8ca --- /dev/null +++ b/psm_hal_gen1/opa_service_gen1.h @@ -0,0 +1,294 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef OPA_SERVICE_GEN1_H +#define OPA_SERVICE_GEN1_H + +/* This file contains all the lowest level routines calling into sysfs */ +/* and qib driver. All other calls are based on these routines. */ + +#include + +#include "opa_intf.h" +#include "opa_common_gen1.h" +#include "opa_udebug.h" +#include "opa_byteorder.h" + +/* upper and lower bounds for HFI port numbers */ +#define HFI_MIN_PORT 1 +#define HFI_MAX_PORT 1 +#ifndef HFI_NUM_PORTS_GEN1 +#define HFI_NUM_PORTS_GEN1 (HFI_MAX_PORT - HFI_MIN_PORT + 1) +#endif +/* any unit id to match. */ +#define HFI_UNIT_ID_ANY ((long)-1) +/* any port num to match. */ +#define HFI_PORT_NUM_ANY ((long)0) + +/* base name of path (without unit #) for qib driver */ +#ifndef HFI_DEVICE_PATH_GEN1 +#define HFI_DEVICE_PATH_GEN1 "/dev/hfi1" +#endif + +#ifdef PSM_CUDA +#define GDR_DEVICE_PATH "/dev/hfi1_gdr" +#endif + +/* The major and minor versions of driver that support non-DW multiple SDMA */ +#define HFI1_USER_SWMAJOR_NON_DW_MUL_MSG_SIZE_ALLOWED 6 +#define HFI1_USER_SWMINOR_NON_DW_MUL_MSG_SIZE_ALLOWED 2 + +/* Commands used to communicate with driver. */ +enum PSMI_HFI_CMD { + PSMI_HFI_CMD_ASSIGN_CTXT = 0, /* allocate HFI and context */ + PSMI_HFI_CMD_CTXT_INFO, /* find out what resources we got */ + PSMI_HFI_CMD_USER_INFO, /* set up userspace */ + PSMI_HFI_CMD_TID_UPDATE, /* update expected TID entries */ + PSMI_HFI_CMD_TID_FREE, /* free expected TID entries */ + PSMI_HFI_CMD_CREDIT_UPD, /* force an update of PIO credit */ + PSMI_HFI_CMD_RECV_CTRL, /* control receipt of packets */ + PSMI_HFI_CMD_POLL_TYPE, /* set the kind of polling we want */ + PSMI_HFI_CMD_ACK_EVENT, /* ack & clear user status bits */ + PSMI_HFI_CMD_SET_PKEY, /* set context's pkey */ + PSMI_HFI_CMD_CTXT_RESET, /* reset context's HW send context */ + PSMI_HFI_CMD_TID_INVAL_READ, /* read TID cache invalidations */ + PSMI_HFI_CMD_GET_VERS, /* get the version of the user cdev */ + +#ifdef PSM_CUDA + PSMI_HFI_CMD_TID_UPDATE_V2 = 28, +#endif + PSMI_HFI_CMD_LAST, +}; + +/* Legacy commands used to communicate with driver using 'write' */ +enum LEGACY_HFI1_CMD { + LEGACY_HFI1_CMD_ASSIGN_CTXT = 1, /* allocate HFI and context */ + LEGACY_HFI1_CMD_CTXT_INFO = 2, /* find out what resources we got */ + LEGACY_HFI1_CMD_USER_INFO = 3, /* set up userspace */ + LEGACY_HFI1_CMD_TID_UPDATE = 4, /* update expected TID entries */ + LEGACY_HFI1_CMD_TID_FREE = 5, /* free expected TID entries */ + LEGACY_HFI1_CMD_CREDIT_UPD = 6, /* force an update of PIO credit */ + + LEGACY_HFI1_CMD_RECV_CTRL = 8, /* control receipt of packets */ + LEGACY_HFI1_CMD_POLL_TYPE = 9, /* set the kind of polling we want */ + LEGACY_HFI1_CMD_ACK_EVENT = 10, /* ack & clear user status bits */ + LEGACY_HFI1_CMD_SET_PKEY = 11, /* set context's pkey */ + LEGACY_HFI1_CMD_CTXT_RESET = 12, /* reset context's HW send context */ + LEGACY_HFI1_CMD_TID_INVAL_READ = 13, /* read TID cache invalidations */ + LEGACY_HFI1_CMD_GET_VERS = 14 /* get the version of the user cdev */ +}; + +/* Given a unit number and port number, 
returns 1 if the unit and port are active. + returns 0 if the unit and port are not active. returns -1 when an error occurred. */ +int hfi_get_port_active(int, int); + +/* Given the unit number and port, return an error, or the corresponding LID */ +/* Returns an int, so -1 indicates a general error. -2 indicates that the unit/port + are not active. 0 indicates that the unit is valid, but no LID has been assigned. */ +int hfi_get_port_lid(int, int); + +/* Given the unit number and port, return an error, or the corresponding GID */ +/* Returns an int, so -1 indicates an error. */ +int hfi_get_port_gid(int, int, uint64_t *hi, uint64_t *lo); + +/* Given the unit number, return an error, or the corresponding LMC value + for the port */ +/* Returns an int, so -1 indicates an error. 0 */ +int hfi_get_port_lmc(int unit, int port); + +/* Given the unit number, return an error, or the corresponding link rate + for the port */ +/* Returns an int, so -1 indicates an error. */ +int hfi_get_port_rate(int unit, int port); + +/* Given a unit, port and SL, return an error, or the corresponding SC for the + SL as programmed by the SM */ +/* Returns an int, so -1 indicates an error. */ +int hfi_get_port_sl2sc(int unit, int port, int sl); + +/* Given a unit, port and SC, return an error, or the corresponding VL for the + SC as programmed by the SM */ +/* Returns an int, so -1 indicates an error. */ +int hfi_get_port_sc2vl(int unit, int port, int sc); + +/* Given a unit, port and VL, return an error, or the corresponding MTU for the + VL as programmed by the SM */ +/* Returns an int, so -1 indicates an error. */ +int hfi_get_port_vl2mtu(int unit, int port, int vl); + +/* Given a unit, port and index, return an error, or the corresponding pkey for + the index as programmed by the SM */ +/* Returns an int, so -1 indicates an error. */ +int hfi_get_port_index2pkey(int unit, int port, int index); + +/* Get the number of units supported by the driver. Does not guarantee + that a working chip has been found for each possible unit #. + When the parameter 'wait' is non-zero, the code will wait briefly as + the driver may be coming up. If 'wait' is zero, the function does not wait. + Returns -1 with errno set, or number of units >=0 (0 means none found). */ +int hfi_get_num_units(int wait); + +/* Given a unit number, returns 1 if any port on the unit is active. + returns 0 if no port on the unit is active. + returns -1 when an error occurred. */ +int hfi_get_unit_active(int unit); + +/* get the number of contexts from the unit id. + When the parameter 'wait' is non-zero, the code will wait briefly as + the driver may be coming up. If 'wait' is zero, the function does not wait. + Returns 0 if no unit or no match. */ +int hfi_get_num_contexts(int unit, int wait); + +/* Open hfi device file, return -1 on error. */ +int hfi_context_open(int unit, int port, uint64_t open_timeout); +int hfi_context_open_ex(int unit, int port, uint64_t open_timeout, + char *dev_name,size_t dev_name_len); + +uint32_t hfi_check_non_dw_mul_sdma(void); + +void hfi_context_close(int fd); + +/* hfi_get_user_major_version() returns the major version of the driver + that should be used for this session of psm. Valid only after + hfi_context_open has been called. 
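+   The packed 32-bit word set via hfi_set_user_version() carries the
+   major number in its upper 16 bits and the minor in the lower 16
+   (an observation of the sw_version_t layout on a little-endian host;
+   see opa_service_gen1.c).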
*/ +uint16_t hfi_get_user_major_version(void); + +/* hfi_get_user_minor_version() return the minor version of the driver */ +uint16_t hfi_get_user_minor_version(void); + +void hfi_set_user_version(uint32_t version); +void hfi_set_user_major_version(uint16_t major_version); + +int hfi_cmd_write(int fd, struct hfi1_cmd *, size_t count); + +int hfi_cmd_writev(int fd, const struct iovec *iov, int iovcnt); + +/* hfi_get_cc_settings_bin() returns less than or equal to 0 on failure, + returns greater than 0 on success. */ + int hfi_get_cc_settings_bin(int unit, int port, char *ccabuf, size_t len_ccabuf); +int hfi_get_cc_table_bin(int unit, int port, uint16_t **cctp); + +/* We use mmap64() because we compile in both 32 and 64 bit mode, + and we have to map physical addresses that are > 32 bits long. + While linux implements mmap64, it doesn't have a man page, + and isn't declared in any header file, so we declare it here ourselves. */ + +/* We'd like to just use -D_LARGEFILE64_SOURCE, to make off_t 64 bits and + redirects mmap to mmap64 for us, but at least through suse10 and fc4, + it doesn't work when the address being mapped is > 32 bits. It chips + off bits 32 and above. So we stay with mmap64. */ +extern void *mmap64(void *, size_t, int, int, int, __off64_t); +void *hfi_mmap64(void *, size_t, int, int, int, __off64_t); + +/* Statistics maintained by the driver */ +int hfi_get_stats(uint64_t *, int); +int hfi_get_stats_names(char **namep); +/* Counters maintained in the chip, globally, and per-prot */ +int hfi_get_ctrs_unit(int unitno, uint64_t *, int); +int hfi_get_ctrs_unit_names(int unitno, char **namep); +int hfi_get_ctrs_port(int unitno, int port, uint64_t *, int); +int hfi_get_ctrs_port_names(int unitno, char **namep); + +/* sysfs helper routines (only those currently used are exported; + * try to avoid using others) */ + +/* Initializes the following sysfs helper routines. */ +void sysfs_init(const char *dflt_hfi_class_path); + +const char *hfi_sysfs_path(void); + +/* read a string value */ +int hfi_sysfs_port_read(uint32_t unit, uint32_t port, const char *attr, + char **datap); + +/* read a string value into buff, no more than size bytes. + returns the number of bytes read */ +size_t hfi_sysfs_unit_port_read(uint32_t unit, uint32_t port, const char *attr, + char *buff, size_t size); + +/* open attribute in unit's sysfs directory via open(2) */ +int hfi_sysfs_unit_open(uint32_t unit, const char *attr, int flags); +int hfi_sysfs_port_open(uint32_t unit, uint32_t port, const char *attr, + int flags); +/* print to attribute in {unit,port} sysfs directory */ +int hfi_sysfs_port_printf(uint32_t unit, uint32_t port, const char *attr, + const char *fmt, ...) + __attribute__((format(printf, 4, 5))); +int hfi_sysfs_unit_printf(uint32_t unit, const char *attr, const char *fmt, ...) 
+ __attribute__((format(printf, 3, 4))); + +int hfi_hfifs_unit_write(uint32_t unit, const char *attr, const void *data, + size_t len); +/* read up to one page of malloc'ed data (caller must free), returning + number of bytes read or -1 */ +int hfi_hfifs_read(const char *attr, char **datap); +int hfi_hfifs_unit_read(uint32_t unit, const char *attr, char **data); +/* read a signed 64-bit quantity, in some arbitrary base */ +int hfi_sysfs_unit_read_s64(uint32_t unit, const char *attr, + int64_t *valp, int base); +int hfi_sysfs_port_read_s64(uint32_t unit, uint32_t port, const char *attr, + int64_t *valp, int base); +int64_t hfi_sysfs_unit_read_node_s64(uint32_t unit); +/* these read directly into supplied buffer and take a count */ +int hfi_hfifs_rd(const char *, void *, int); +int hfi_hfifs_unit_rd(uint32_t unit, const char *, void *, int); + +int hfi_hfifs_open(const char *relname, int flags); + +/* wait for device special file to show up. timeout is in + * milliseconds, 0 is "callee knows best", < 0 is infinite. */ +int hfi_wait_for_device(const char *path, long timeout); + +int hfi_cmd_wait_for_packet(int fd); + +#endif /* OPA_SERVICE_GEN1_H */ diff --git a/psm_hal_gen1/opa_user_gen1.h b/psm_hal_gen1/opa_user_gen1.h new file mode 100644 index 0000000..9731b2b --- /dev/null +++ b/psm_hal_gen1/opa_user_gen1.h @@ -0,0 +1,593 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef OPA_USER_GEN1_H +#define OPA_USER_GEN1_H + +/* This file contains all of the data structures and routines that are + publicly visible and usable (to low level infrastructure code; it is + not expected that any application, or even normal application-level library, + will ever need to use any of this). + + Additional entry points and data structures that are used by these routines + may be referenced in this file, but they should not be generally available; + they are visible here only to allow use in inlined functions. Any variable, + data structure, or function that starts with a leading "_" is in this + category. +*/ + +/* Include header files we need that are unlikely to otherwise be needed by */ +/* programs. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "opa_intf.h" +#include "opa_common_gen1.h" +#include "opa_byteorder.h" +#include "opa_udebug.h" +#include "opa_service_gen1.h" +#include "opa_user.h" + +#define HFI_RHF_USE_EGRBFR_MASK 0x1 +#define HFI_RHF_USE_EGRBFR_SHIFT 15 +#define HFI_RHF_EGRBFR_INDEX_MASK 0x7FF +#define HFI_RHF_EGRBFR_INDEX_SHIFT 16 + +#define HFI_RHF_SEQ_MASK 0xF +#define HFI_RHF_SEQ_SHIFT 28 +#define HFI_RHF_EGRBFR_OFFSET_MASK 0xFFF +#define HFI_RHF_EGRBFR_OFFSET_SHIFT 0 +#define HFI_RHF_HDRQ_OFFSET_MASK 0x1FF +#define HFI_RHF_HDRQ_OFFSET_SHIFT 12 +#define HFI_RHF_TIDERR 0x08000000 + +/* TidFlow related bits */ +#define HFI_TF_SEQNUM_SHIFT 0 +#define HFI_TF_SEQNUM_MASK 0x7ff + +#define HFI_TF_GENVAL_SHIFT 11 +#define HFI_TF_GENVAL_MASK 0xfffff + +#define HFI_TF_FLOWVALID_SHIFT 32 +#define HFI_TF_FLOWVALID_MASK 0x1 +#define HFI_TF_HDRSUPP_ENABLED_SHIFT 33 +#define HFI_TF_HDRSUPP_ENABLED_MASK 0x1 + +#define HFI_TF_KEEP_AFTER_SEQERR_SHIFT 34 +#define HFI_TF_KEEP_AFTER_SEQERR_MASK 0x1 +#define HFI_TF_KEEP_ON_GENERR_SHIFT 35 +#define HFI_TF_KEEP_ON_GENERR_MASK 0x1 +#define HFI_TF_KEEP_PAYLOAD_ON_GENERR_SHIFT 36 +#define HFI_TF_KEEP_PAYLOAD_ON_GENERR_MASK 0x1 +#define HFI_TF_STATUS_SEQMISMATCH_SHIFT 37 +#define HFI_TF_STATUS_SEQMISMATCH_MASK 0x1 +#define HFI_TF_STATUS_GENMISMATCH_SHIFT 38 +#define HFI_TF_STATUS_GENMISMATCH_MASK 0x1 + +/* PBC bits */ +#define HFI_PBC_STATICRCC_SHIFT 0 +#define HFI_PBC_STATICRCC_MASK 0xffff + +#define HFI_PBC_SC4_SHIFT 4 +#define HFI_PBC_SC4_MASK 0x1 + +#define HFI_PBC_INTR_SHIFT 31 +#define HFI_PBC_DCINFO_SHIFT 30 +#define HFI_PBC_TESTEBP_SHIFT 29 +#define HFI_PBC_PACKETBYPASS_SHIFT 28 +#define HFI_PBC_INSERTHCRC_SHIFT 26 +#define HFI_PBC_INSERTHCRC_MASK 0x3 +#define HFI_PBC_CREDITRETURN_SHIFT 25 +#define HFI_PBC_INSERTBYPASSICRC_SHIFT 24 +#define HFI_PBC_TESTBADICRC_SHIFT 23 +#define HFI_PBC_FECN_SHIFT 22 +#define HFI_PBC_VL_SHIFT 12 +#define HFI_PBC_VL_MASK 0xf +#define HFI_PBC_LENGTHDWS_SHIFT 0 +#define HFI_PBC_LENGTHDWS_MASK 0xfff + +/* this portion only defines what we currently use */ +struct hfi_pbc { + __u32 pbc0; + __u16 PbcStaticRateControlCnt; + __u16 fill1; +}; + +#define HFI_PCB_SIZE_IN_BYTES 8 + +/* Usable bytes in header (hdrsize 
- lrh - bth) */ +#define HFI_MESSAGE_HDR_SIZE_HFI (HFI_MESSAGE_HDR_SIZE-20) + +/* + * SDMA includes 8B sdma hdr, 8B PBC, and message header. + * If we are using GPU workloads, we need to set a new + * "flags" member which takes another 2 bytes in the + * sdma hdr. We let the driver know of this 2 extra bytes + * at runtime when we set the length for the iovecs. + */ +#define HFI_SDMA_HDR_SIZE (8+8+56) + +static inline __u32 hfi_hdrget_seq(const __le32 *rbuf) +{ + return (__le32_to_cpu(rbuf[0]) >> HFI_RHF_SEQ_SHIFT) + & HFI_RHF_SEQ_MASK; +} + +static inline __u32 hfi_hdrget_hdrq_offset(const __le32 *rbuf) +{ + return (__le32_to_cpu(rbuf[1]) >> HFI_RHF_HDRQ_OFFSET_SHIFT) + & HFI_RHF_HDRQ_OFFSET_MASK; +} + +struct _hfi_ctrl { + int32_t fd; /* device file descriptor */ + /* tidflow valid */ + uint32_t __hfi_tfvalid; + /* unit id */ + uint32_t __hfi_unit; + /* port id */ + uint32_t __hfi_port; + + /* number of eager tid entries */ + uint32_t __hfi_tidegrcnt; + /* number of expected tid entries */ + uint32_t __hfi_tidexpcnt; + + /* effective mtu size, should be <= base_info.mtu */ + uint32_t __hfi_mtusize; + /* max PIO size, should be <= effective mtu size */ + uint32_t __hfi_piosize; + + /* two struct output from driver. */ + struct hfi1_ctxt_info ctxt_info; + struct hfi1_base_info base_info; + + /* some local storages in some condition: */ + /* as storage of __hfi_rcvtidflow in hfi_userinit(). */ + __le64 regs[HFI_TF_NFLOWS]; + + /* location to which OPA writes the rcvhdrtail register whenever + it changes, so that no chip registers are read in the performance + path. */ + volatile __le64 *__hfi_rcvtail; + + /* address where ur_rcvhdrtail is written */ + volatile __le64 *__hfi_rcvhdrtail; + /* address where ur_rcvhdrhead is written */ + volatile __le64 *__hfi_rcvhdrhead; + /* address where ur_rcvegrindextail is read */ + volatile __le64 *__hfi_rcvegrtail; + /* address where ur_rcvegrindexhead is written */ + volatile __le64 *__hfi_rcvegrhead; + /* address where ur_rcvegroffsettail is read */ + volatile __le64 *__hfi_rcvofftail; + /* address where ur_rcvtidflow is written */ + volatile __le64 *__hfi_rcvtidflow; +}; + +/* After the device is opened, hfi_userinit() is called to give the driver the + parameters the user code wants to use, and to get the implementation values, + etc. back. 0 is returned on success, a positive value is a standard errno, + and a negative value is reserved for future use. The first argument is + the filedescriptor returned by the device open. + + It is allowed to have multiple devices (and of different types) + simultaneously opened and initialized, although this won't be fully + implemented initially. This routine is used by the low level + hfi protocol code (and any other code that has similar low level + functionality). + This is the only routine that takes a file descriptor, rather than an + struct _hfi_ctrl *. The struct _hfi_ctrl * used for everything + else is returned by this routine. +*/ + +struct _hfi_ctrl *hfi_userinit(int32_t, struct hfi1_user_info_dep *); + +/* don't inline these; it's all init code, and not inlining makes the */ +/* overall code shorter and easier to debug */ +void hfi_touch_mmap(void *, size_t) __attribute__ ((noinline)); + +/* set the BTH pkey to check for this process. */ +/* This is for receive checks, not for sends. It isn't necessary + to set the default key, that's always allowed by the hardware. 
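+   (A hypothetical call site: hfi_set_pkey(ctrl, job_pkey), where
+   job_pkey is whatever pkey the job launcher assigned.)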
+ If too many pkeys are in use for the hardware to support, this + will return EAGAIN, and the caller should then fail and exit + or use the default key and check the pkey in the received packet + checking. */ +/* set send context pkey to verify, error if driver is not configured with */ +/* this pkey in its pkey table. */ +int hfi_set_pkey(struct _hfi_ctrl *, uint16_t); + +int hfi_wait_for_packet(struct _hfi_ctrl *); + +/* New user event mechanism, using spi_sendbuf_status HFI_EVENT_* bits + obsoletes hfi_disarm_bufs(), and extends it, although old mechanism + remains for binary compatibility. */ +int hfi_event_ack(struct _hfi_ctrl *ctrl, __u64 ackbits); + +/* set whether we want an interrupt on all packets, or just urgent ones */ +int hfi_poll_type(struct _hfi_ctrl *ctrl, uint16_t poll_type); + +/* reset halted send context, error if context is not halted. */ +int hfi_reset_context(struct _hfi_ctrl *ctrl); + +/* +* Safe version of hfi_[d/q]wordcpy that is guaranteed to only copy each byte once. +*/ +#if defined(__x86_64__) +void hfi_dwordcpy_safe(volatile uint32_t *dest, const uint32_t *src, + uint32_t ndwords); +void hfi_qwordcpy_safe(volatile uint64_t *dest, const uint64_t *src, + uint32_t nqwords); +#else +#define hfi_dwordcpy_safe hfi_dwordcpy +#define hfi_qwordcpy_safe hfi_qwordcpy +#endif + +static __inline__ void hfi_tidflow_set_entry(struct _hfi_ctrl *ctrl, + uint32_t flowid, uint32_t genval, + uint32_t seqnum) +{ +/* For proper behavior with RSM interception of FECN packets for CCA, + * the tidflow entry needs the KeepAfterSequenceError bit set. + * A packet that is converted from expected to eager by RSM will not + * trigger an update in the tidflow state. This will cause the tidflow + * to incorrectly report a sequence error on any non-FECN packets that + * arrive after the RSM intercepted packets. If the KeepAfterSequenceError + * bit is set, PSM can properly detect this "false SeqErr" condition, + * and recover without dropping packets. + * Note that if CCA/RSM are not important, this change will slightly + * increase the CPU load when packets are dropped. If this is significant, + * consider hiding this change behind a CCA/RSM environment variable. + */ + + ctrl->__hfi_rcvtidflow[flowid] = __cpu_to_le64( + ((genval & HFI_TF_GENVAL_MASK) << HFI_TF_GENVAL_SHIFT) | + ((seqnum & HFI_TF_SEQNUM_MASK) << HFI_TF_SEQNUM_SHIFT) | + ((uint64_t)ctrl->__hfi_tfvalid << HFI_TF_FLOWVALID_SHIFT) | + (1ULL << HFI_TF_HDRSUPP_ENABLED_SHIFT) | + /* KeepAfterSequenceError = 1 -- previously was 0 */ + (1ULL << HFI_TF_KEEP_AFTER_SEQERR_SHIFT) | + (1ULL << HFI_TF_KEEP_ON_GENERR_SHIFT) | + /* KeePayloadOnGenErr = 0 */ + (1ULL << HFI_TF_STATUS_SEQMISMATCH_SHIFT) | + (1ULL << HFI_TF_STATUS_GENMISMATCH_SHIFT)); +} + +static __inline__ void hfi_tidflow_reset(struct _hfi_ctrl *ctrl, + uint32_t flowid, uint32_t genval, + uint32_t seqnum) +{ +/* + * If a tidflow table entry is set to "Invalid", we want to drop + * header if payload is dropped, we want to get a header if the payload + * is delivered. + * + * We set a tidflow table entry "Invalid" by setting FlowValid=1 and + * GenVal=0x1FFF/0xFFFFF, this is a special generation number and no + * packet will use this value. We don't care SeqNum but we set it to + * 0x7FF. So if GenVal does not match, the payload is dropped because + * KeepPayloadOnGenErr=0; for packet header, KeepOnGenErr=0 make sure + * header is not generated. 
But if a packet happens to have the special + * generation number, the payload is delivered, HdrSuppEnabled=0 make + * sure header is generated if SeqNUm matches, if SeqNum does not match, + * KeepAfterSeqErr=1 makes sure the header is generated. + */ + ctrl->__hfi_rcvtidflow[flowid] = __cpu_to_le64( + /* genval = 0x1FFF or 0xFFFFF */ + ((genval & HFI_TF_GENVAL_MASK) << HFI_TF_GENVAL_SHIFT) | + /* seqnum = 0x7FF */ + ((seqnum & HFI_TF_SEQNUM_MASK) << HFI_TF_SEQNUM_SHIFT) | + ((uint64_t)ctrl->__hfi_tfvalid << HFI_TF_FLOWVALID_SHIFT) | + /* HdrSuppEnabled = 0 */ + (1ULL << HFI_TF_KEEP_AFTER_SEQERR_SHIFT) | + /* KeepOnGenErr = 0 */ + /* KeepPayloadOnGenErr = 0 */ + (1ULL << HFI_TF_STATUS_SEQMISMATCH_SHIFT) | + (1ULL << HFI_TF_STATUS_GENMISMATCH_SHIFT)); +} + +/* + * This should only be used for debugging. + * Normally, we shouldn't read the chip. + */ +static __inline__ uint64_t hfi_tidflow_get(struct _hfi_ctrl *ctrl, + uint32_t flowid) +{ + return __le64_to_cpu(ctrl->__hfi_rcvtidflow[flowid]); +} + +static __inline__ uint32_t hfi_tidflow_get_seqnum(uint64_t val) +{ + return (val >> HFI_TF_SEQNUM_SHIFT) & HFI_TF_SEQNUM_MASK; +} + +static __inline__ uint32_t hfi_tidflow_get_genval(uint64_t val) +{ + return (val >> HFI_TF_GENVAL_SHIFT) & HFI_TF_GENVAL_MASK; +} + +static __inline__ uint32_t hfi_tidflow_get_flowvalid(uint64_t val) +{ + return (val >> HFI_TF_FLOWVALID_SHIFT) & HFI_TF_FLOWVALID_MASK; +} + +static __inline__ uint32_t hfi_tidflow_get_enabled(uint64_t val) +{ + return (val >> HFI_TF_HDRSUPP_ENABLED_SHIFT) & + HFI_TF_HDRSUPP_ENABLED_MASK; +} + +static __inline__ uint32_t hfi_tidflow_get_keep_after_seqerr(uint64_t val) +{ + return (val >> HFI_TF_KEEP_AFTER_SEQERR_SHIFT) & + HFI_TF_KEEP_AFTER_SEQERR_MASK; +} + +static __inline__ uint32_t hfi_tidflow_get_keep_on_generr(uint64_t val) +{ + return (val >> HFI_TF_KEEP_ON_GENERR_SHIFT) & + HFI_TF_KEEP_ON_GENERR_MASK; +} + +static __inline__ uint32_t hfi_tidflow_get_keep_payload_on_generr(uint64_t val) +{ + return (val >> HFI_TF_KEEP_PAYLOAD_ON_GENERR_SHIFT) & + HFI_TF_KEEP_PAYLOAD_ON_GENERR_MASK; +} + +static __inline__ uint32_t hfi_tidflow_get_seqmismatch(uint64_t val) +{ + return (val >> HFI_TF_STATUS_SEQMISMATCH_SHIFT) & + HFI_TF_STATUS_SEQMISMATCH_MASK; +} + +static __inline__ uint32_t hfi_tidflow_get_genmismatch(uint64_t val) +{ + return (val >> HFI_TF_STATUS_GENMISMATCH_SHIFT) & + HFI_TF_STATUS_GENMISMATCH_MASK; +} + +/* + * This should only be used by a process to write the eager index into + * a subcontext's eager header entry. + */ +static __inline__ void hfi_hdrset_use_egrbfr(__le32 *rbuf, uint32_t val) +{ + rbuf[0] = + (rbuf[0] & + __cpu_to_le32(~(HFI_RHF_USE_EGRBFR_MASK << + HFI_RHF_USE_EGRBFR_SHIFT))) | + __cpu_to_le32((val & HFI_RHF_USE_EGRBFR_MASK) << + HFI_RHF_USE_EGRBFR_SHIFT); +} + +static __inline__ void hfi_hdrset_egrbfr_index(__le32 *rbuf, uint32_t val) +{ + rbuf[0] = + (rbuf[0] & + __cpu_to_le32(~(HFI_RHF_EGRBFR_INDEX_MASK << + HFI_RHF_EGRBFR_INDEX_SHIFT))) | + __cpu_to_le32((val & HFI_RHF_EGRBFR_INDEX_MASK) << + HFI_RHF_EGRBFR_INDEX_SHIFT); +} + +static __inline__ void hfi_hdrset_egrbfr_offset(__le32 *rbuf, uint32_t val) +{ + rbuf[1] = + (rbuf[1] & + __cpu_to_le32(~(HFI_RHF_EGRBFR_OFFSET_MASK << + HFI_RHF_EGRBFR_OFFSET_SHIFT))) | + __cpu_to_le32((val & HFI_RHF_EGRBFR_OFFSET_MASK) << + HFI_RHF_EGRBFR_OFFSET_SHIFT); +} + +/* + * This should only be used by a process to update the receive header + * error flags. 
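+ * For example, a subcontext master that detects a TID error while
+ * forwarding a header could tag it for the owner with the flag defined
+ * above (a sketch, not a call made in this file):
+ *
+ *	hfi_hdrset_err_flags(rbuf, HFI_RHF_TIDERR);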
+ */ +static __inline__ void hfi_hdrset_err_flags(__le32 *rbuf, uint32_t val) +{ + rbuf[1] |= __cpu_to_le32(val); +} + +/* + * This should only be used by a process to write the rhf seq number into + * a subcontext's eager header entry. + */ +static __inline__ void hfi_hdrset_seq(__le32 *rbuf, uint32_t val) +{ + rbuf[0] = + (rbuf[0] & + __cpu_to_le32(~(HFI_RHF_SEQ_MASK << + HFI_RHF_SEQ_SHIFT))) | + __cpu_to_le32((val & HFI_RHF_SEQ_MASK) << HFI_RHF_SEQ_SHIFT); +} + +/* Manage TID entries. It is possible that not all entries + requested may be allocated. A matching hfi_free_tid() must be + done for each hfi_update_tid(), because currently no caching or + reuse of expected tid entries is allowed, to work around malloc/free + and mmap/munmap issues. The driver decides which TID entries to allocate. + If hfi_free_tid is called to free entries in use by a different + send by the same process, data corruption will probably occur, + but only within that process, not for other processes. +*/ + +/* update tidcnt expected TID entries from the array pointed to by tidinfo. */ +/* Returns 0 on success, else an errno. See full description at declaration */ +static __inline__ int32_t hfi_update_tid(struct _hfi_ctrl *ctrl, + uint64_t vaddr, uint32_t *length, + uint64_t tidlist, uint32_t *tidcnt, uint16_t flags) +{ + struct hfi1_cmd cmd; +#ifdef PSM_CUDA + struct hfi1_tid_info_v2 tidinfo; +#else + struct hfi1_tid_info tidinfo; +#endif + int err; + + tidinfo.vaddr = vaddr; /* base address for this send to map */ + tidinfo.length = *length; /* length of vaddr */ + + tidinfo.tidlist = tidlist; /* driver copies tids back directly */ + tidinfo.tidcnt = 0; /* clear to zero */ + + cmd.type = PSMI_HFI_CMD_TID_UPDATE; +#ifdef PSM_CUDA + cmd.type = PSMI_HFI_CMD_TID_UPDATE_V2; + + if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) + tidinfo.flags = flags; + else + tidinfo.flags = 0; +#endif + + cmd.len = sizeof(tidinfo); + cmd.addr = (__u64) &tidinfo; + + err = hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)); + + if (err != -1) { + *length = tidinfo.length; + *tidcnt = tidinfo.tidcnt; + } + + return err; +} + +static __inline__ int32_t hfi_free_tid(struct _hfi_ctrl *ctrl, + uint64_t tidlist, uint32_t tidcnt) +{ + struct hfi1_cmd cmd; + struct hfi1_tid_info tidinfo; + int err; + + tidinfo.tidlist = tidlist; /* input to driver */ + tidinfo.tidcnt = tidcnt; + + cmd.type = PSMI_HFI_CMD_TID_FREE; + cmd.len = sizeof(tidinfo); + cmd.addr = (__u64) &tidinfo; + + err = hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)); + + return err; +} + +static __inline__ int32_t hfi_get_invalidation(struct _hfi_ctrl *ctrl, + uint64_t tidlist, uint32_t *tidcnt) +{ + struct hfi1_cmd cmd; + struct hfi1_tid_info tidinfo; + int err; + + tidinfo.tidlist = tidlist; /* driver copies tids back directly */ + tidinfo.tidcnt = 0; /* clear to zero */ + + cmd.type = PSMI_HFI_CMD_TID_INVAL_READ; + cmd.len = sizeof(tidinfo); + cmd.addr = (__u64) &tidinfo; + + err = hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)); + + if (err != -1) + *tidcnt = tidinfo.tidcnt; + + return err; +} + +/* + * Data layout in I2C flash (for GUID, etc.) 
+ * All fields are little-endian binary unless otherwise stated + */ +#define HFI_FLASH_VERSION 2 +struct hfi_flash { + /* flash layout version (HFI_FLASH_VERSION) */ + __u8 if_fversion; + /* checksum protecting if_length bytes */ + __u8 if_csum; + /* + * valid length (in use, protected by if_csum), including + * if_fversion and if_csum themselves) + */ + __u8 if_length; + /* the GUID, in network order */ + __u8 if_guid[8]; + /* number of GUIDs to use, starting from if_guid */ + __u8 if_numguid; + /* the (last 10 characters of) board serial number, in ASCII */ + char if_serial[12]; + /* board mfg date (YYYYMMDD ASCII) */ + char if_mfgdate[8]; + /* last board rework/test date (YYYYMMDD ASCII) */ + char if_testdate[8]; + /* logging of error counts, TBD */ + __u8 if_errcntp[4]; + /* powered on hours, updated at driver unload */ + __u8 if_powerhour[2]; + /* ASCII free-form comment field */ + char if_comment[32]; + /* Backwards compatible prefix for longer QLogic Serial Numbers */ + char if_sprefix[4]; + /* 82 bytes used, min flash size is 128 bytes */ + __u8 if_future[46]; +}; +#endif /* OPA_USER_GEN1_H */ diff --git a/psm_hal_gen1/opa_utils_gen1.c b/psm_hal_gen1/opa_utils_gen1.c new file mode 100644 index 0000000..86ff69e --- /dev/null +++ b/psm_hal_gen1/opa_utils_gen1.c @@ -0,0 +1,273 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +/* This file contains hfi service routine interface used by the low */ +/* level hfi protocol code. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "opa_user_gen1.h" + +/* touch the pages, with a 32 bit read */ +void hfi_touch_mmap(void *m, size_t bytes) +{ + volatile uint32_t *b = (volatile uint32_t *)m, c; + size_t i; /* m is always page aligned, so pgcnt exact */ + int __hfi_pg_sz; + + /* First get the page size */ + __hfi_pg_sz = sysconf(_SC_PAGESIZE); + + _HFI_VDBG("Touch %lu mmap'ed pages starting at %p\n", + (unsigned long)bytes / __hfi_pg_sz, m); + bytes /= sizeof(c); + for (i = 0; i < bytes; i += __hfi_pg_sz / sizeof(c)) + c = b[i]; +} + +/* ack event bits, and clear them. Usage is check *spi_sendbuf_status, + pass bits you are prepared to handle to hfi_event_ack(), perform the + appropriate actions for bits that were set, and then (if appropriate) + check the bits again. */ +int hfi_event_ack(struct _hfi_ctrl *ctrl, __u64 ackbits) +{ + struct hfi1_cmd cmd; + + cmd.type = PSMI_HFI_CMD_ACK_EVENT; + cmd.len = 0; + cmd.addr = ackbits; + + if (hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) { + if (errno != EINVAL) /* not implemented in driver. */ + _HFI_DBG("event ack failed: %s\n", strerror(errno)); + return -1; + } + return 0; +} + +/* Tell the driver to change the way packets can generate interrupts. + + HFI1_POLL_TYPE_URGENT: Generate interrupt only when packet sets + HFI_KPF_INTR + HFI1_POLL_TYPE_ANYRCV: wakeup on any rcv packet (when polled on). + + PSM: Uses TYPE_URGENT in ips protocol +*/ +int hfi_poll_type(struct _hfi_ctrl *ctrl, uint16_t poll_type) +{ + struct hfi1_cmd cmd; + + cmd.type = PSMI_HFI_CMD_POLL_TYPE; + cmd.len = 0; + cmd.addr = (uint64_t) poll_type; + + if (hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) { + if (errno != EINVAL) /* not implemented in driver */ + _HFI_INFO("poll type failed: %s\n", strerror(errno)); + return -1; + } + return 0; +} + +/* set the send context pkey to check BTH pkey in each packet. + driver should check its pkey table to see if it can find + this pkey, if not, driver should return error. */ +int hfi_set_pkey(struct _hfi_ctrl *ctrl, uint16_t pkey) +{ + struct hfi1_cmd cmd; + struct hfi1_base_info tbinfo; + + cmd.type = PSMI_HFI_CMD_SET_PKEY; + cmd.len = 0; + cmd.addr = (uint64_t) pkey; + + _HFI_VDBG("Setting context pkey to 0x%04x.\n", pkey); + if (hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) { + _HFI_INFO("Setting context pkey to 0x%04x failed: %s\n", + pkey, strerror(errno)); + return -1; + } else { + _HFI_VDBG("Successfully set context pkey to 0x%04x.\n", pkey); + } + + if (getenv("PSM2_SELINUX")) { + /* + * If SELinux is in use the kernel may have changed our JKey based on + * what we supply for the PKey so go ahead and interrogate the user info + * again and update our saved copy. In the future there may be a new + * IOCTL to get the JKey only. For now, this temporary workaround works. + */ + cmd.type = PSMI_HFI_CMD_USER_INFO; + cmd.len = sizeof(tbinfo); + cmd.addr = (uint64_t) &tbinfo; + + if (hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) { + _HFI_VDBG("BASE_INFO command failed in setpkey: %s\n", + strerror(errno)); + return -1; + } + _HFI_VDBG("PSM2_SELINUX is set, updating jkey to 0x%04x\n", tbinfo.jkey); + ctrl->base_info.jkey = tbinfo.jkey; + } + return 0; +} + +/* Tell the driver to reset the send context. if the send context + if halted, reset it, if not, return error back to caller. 
+ After context reset, the credit return should be reset to + zero by a hardware credit return DMA. + Driver will return ENOLCK if the reset is timeout, in this + case PSM needs to re-call again. */ +int hfi_reset_context(struct _hfi_ctrl *ctrl) +{ + struct hfi1_cmd cmd; + + cmd.type = PSMI_HFI_CMD_CTXT_RESET; + cmd.len = 0; + cmd.addr = 0; + +retry: + if (hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) { + if (errno == ENOLCK) + goto retry; + + if (errno != EINVAL) + _HFI_INFO("reset ctxt failed: %s\n", strerror(errno)); + return -1; + } + return 0; +} + +/* wait for a received packet for our context + This allows us to not busy wait, if nothing has happened for a + while, which allows better measurements of cpu utilization, and + in some cases, slightly better performance. Called where we would + otherwise call sched_yield(). It is not guaranteed that a packet + has arrived, so the normal checking loop(s) should be done. + + PSM: not used as is, PSM has it's own use of polling for interrupt-only + packets (sets hfi_poll_type to TYPE_URGENT) */ +int hfi_wait_for_packet(struct _hfi_ctrl *ctrl) +{ + return hfi_cmd_wait_for_packet(ctrl->fd); +} + +/* These have been fixed to read the values, but they are not + * compatible with the hfi driver, they return new info with + * the qib driver + */ +static int hfi_count_names(const char *namep) +{ + int n = 0; + while (*namep != '\0') { + if (*namep == '\n') + n++; + namep++; + } + return n; +} + +int hfi_lookup_stat(const char *attr, char *namep, uint64_t *stats, + uint64_t *s) +{ + const char *p; + int i, ret = -1, len = strlen(attr); + int nelem = hfi_count_names(namep); + + for (i = 0; i < nelem; i++) { + p = hfi_get_next_name(&namep); + if (p == NULL) + break; + if (strncasecmp(p, attr, len + 1) == 0) { + ret = i; + *s = stats[i]; + } + } + return ret; +} + +int hfi_get_single_portctr(int unit, int port, const char *attr, uint64_t *s) +{ + int nelem, n = 0, ret = -1; + char *namep = NULL; + uint64_t *stats = NULL; + + nelem = hfi_get_ctrs_port_names(unit, &namep); + if (nelem == -1 || namep == NULL) + goto bail; + stats = calloc(nelem, sizeof(uint64_t)); + if (stats == NULL) + goto bail; + n = hfi_get_ctrs_port(unit, port, stats, nelem); + if (n != nelem) + goto bail; + ret = hfi_lookup_stat(attr, namep, stats, s); +bail: + if (namep != NULL) + free(namep); + if (stats != NULL) + free(stats); + return ret; +} diff --git a/psm_hal_gen1/psm_gdrcpy.c b/psm_hal_gen1/psm_gdrcpy.c new file mode 100644 index 0000000..06cb9c2 --- /dev/null +++ b/psm_hal_gen1/psm_gdrcpy.c @@ -0,0 +1,227 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2018 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2018 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ +#ifdef PSM_CUDA +#include "psm_user.h" +#include "psm2_hal.h" +#include "psm_gdrcpy.h" +#include +#include +#include +#include "ptl_ips/ips_tid.h" +#include "ptl_ips/ips_expected_proto.h" +#include "opa_user_gen1.h" + +static int gdr_fd; + +int is_gdr_copy_enabled; + + +int get_gdr_fd(){ + return gdr_fd; +} + +#define GPU_PAGE_OFFSET_MASK (PSMI_GPU_PAGESIZE -1) +#define GPU_PAGE_MASK ~GPU_PAGE_OFFSET_MASK + +uint64_t +gdr_cache_evict() { + int ret; + struct hfi1_gdr_cache_evict_params params; + params.evict_params_in.version = HFI1_GDR_VERSION; + params.evict_params_in.pages_to_evict = 4; + + ret = ioctl(gdr_fd, HFI1_IOCTL_GDR_GPU_CACHE_EVICT, ¶ms); + if (ret) { + /* Fatal error */ + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "PIN/MMAP ioctl failed ret %d errno %d\n", + ret, errno); + return ret; + } + + return params.evict_params_out.pages_evicted; +} + + +uint64_t +ips_sdma_gpu_cache_evict(int fd) { + int ret; + struct hfi1_sdma_gpu_cache_evict_params params; + params.evict_params_in.version = HFI1_GDR_VERSION; + params.evict_params_in.pages_to_evict = 2; + + ret = ioctl(fd, HFI1_IOCTL_SDMA_CACHE_EVICT, ¶ms); + if (ret) { + /* Fatal error */ + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "SDMA Cache Evict failed ret %d errno %d\n", + ret, errno); + return ret; + } + + return params.evict_params_out.pages_evicted; +} + +/* handle_out_of_bar_space is called when the driver tries + * to self evict in the GDR cache and finds no entries. + * This could be due to the fact that all the pages pinned + * in the BAR1 region are cached in the SDMA and TID cache. + * We try to evict from both the caches for 30 seconds after + * which we bail out. 
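+ * (We try the TID cache first, when it has idle entries, and the
+ * SDMA cache second; the first eviction that frees any pages ends
+ * the wait.)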
If successful we retry the PIN/MMAP once
+ * again
+ */
+uint64_t
+handle_out_of_bar_space(struct ips_proto *proto)
+{
+ time_t lastEvictTime = 0;
+ uint64_t lengthEvicted;
+ time_t now;
+ retry:
+ now = time(NULL);
+
+ if (!lastEvictTime)
+ lastEvictTime = now;
+
+ if (proto->protoexp && proto->protoexp->tidc.tid_cachemap.payload.nidle) {
+ lengthEvicted =
+ ips_tidcache_evict(&proto->protoexp->tidc, -1);
+
+ if (lengthEvicted) {
+ lastEvictTime = 0;
+ return lengthEvicted; /* signals a retry of the writev command. */
+ }
+ }
+
+ lengthEvicted = ips_sdma_gpu_cache_evict(psmi_hal_get_fd(proto->ep->context.psm_hw_ctxt));
+ if (lengthEvicted) {
+ lastEvictTime = 0;
+ return lengthEvicted;
+ }
+ static const double thirtySeconds = 30.0;
+ if (difftime(now, lastEvictTime) >
+ thirtySeconds) {
+ return 0;
+ } else {
+ goto retry;
+ }
+}
+
+void *
+gdr_convert_gpu_to_host_addr(int gdr_fd, unsigned long buf,
+ size_t size, int flags,
+ struct ips_proto* proto)
+{
+ struct hfi1_gdr_query_params query_params;
+ void *host_addr_buf;
+ int ret;
+
+ query_params.query_params_in.version = HFI1_GDR_VERSION;
+ uintptr_t pageaddr = buf & GPU_PAGE_MASK;
+ /* As size is guaranteed to be in the range of 0-8kB
+ * there is a guarantee that buf+size-1 does not overflow
+ * 64 bits.
+ */
+ uint32_t pagelen = (uint32_t) (PSMI_GPU_PAGESIZE +
+ ((buf + size - 1) & GPU_PAGE_MASK) -
+ pageaddr);
+
+ query_params.query_params_in.gpu_buf_addr = pageaddr;
+ query_params.query_params_in.gpu_buf_size = pagelen;
+ retry:
+
+ ret = ioctl(gdr_fd, HFI1_IOCTL_GDR_GPU_PIN_MMAP, &query_params);
+
+ if (ret) {
+ if (errno == ENOMEM || errno == EINVAL) {
+ if (!handle_out_of_bar_space(proto)) {
+ /* Fatal error */
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Unable to PIN GPU pages(Out of BAR1 space)\n");
+ return NULL;
+ } else {
+ goto retry;
+ }
+ } else {
+ /* Fatal error */
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "PIN/MMAP ioctl failed ret %d errno %d\n",
+ ret, errno);
+ return NULL;
+ }
+ }
+ host_addr_buf = (void *)query_params.query_params_out.host_buf_addr;
+ return host_addr_buf + (buf & GPU_PAGE_OFFSET_MASK);
+}
+
+
+void hfi_gdr_open(){
+ gdr_fd = open(GDR_DEVICE_PATH, O_RDWR);
+ if (-1 == gdr_fd ) {
+ /* Non-Fatal error. If the device cannot be found we assume
+ * that the driver does not support GDR Copy and we fall back
+ * to sending all GPU messages using rndv protocol
+ */
+ _HFI_INFO(" Warning: The HFI1 driver installed does not support GPUDirect RDMA"
+ " fast copy. Turning off GDR fast copy in PSM \n");
+ is_gdr_copy_enabled = 0;
+ return;
+ }
+ return;
+}
+
+void hfi_gdr_close()
+{
+ close(GDR_FD);
+}
+
+#endif
diff --git a/psm_hal_gen1/psm_hal_gen1.c b/psm_hal_gen1/psm_hal_gen1.c
new file mode 100644
index 0000000..732943f
--- /dev/null
+++ b/psm_hal_gen1/psm_hal_gen1.c
@@ -0,0 +1,190 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2017 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+ + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2017 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "psm_user.h" +#include "psm2_hal.h" + +#if PSMI_HAL_INST_CNT > 1 +#define PSMI_HAL_CAT_INL_SYM(KERNEL) hfp_gen1_ ## KERNEL +#include "psm2_hal_inline_t.h" +#include "psm_hal_inline_i.h" +#endif + +/* define the singleton that implements hal for gen1 */ +static hfp_gen1_t psm_gen1_hi = { + /* start of public psmi_hal_instance_t data */ + .phi = { + .type = PSM_HAL_INSTANCE_GEN1, + .description = "PSM2 HAL instance for GEN1" +#ifdef PSM_CUDA + " (cuda)" +#endif + , + .hfi_name = "hfi1", + .hfi_sys_class_path = "/sys/class/infiniband/hfi1", + .params = {0}, + + /* The following methods are alphabetized */ +#if PSMI_HAL_INST_CNT > 1 + .hfp_ack_hfi_event = hfp_gen1_ack_hfi_event, + .hfp_check_rhf_sequence_number = hfp_gen1_check_rhf_sequence_number, + .hfp_cl_q_empty = hfp_gen1_cl_q_empty, + .hfp_close_context = hfp_gen1_close_context, + .hfp_context_open = hfp_gen1_context_open, + .hfp_dma_slot_available = hfp_gen1_dma_slot_available, + .hfp_finalize = hfp_gen1_finalize, + .hfp_forward_packet_to_subcontext = hfp_gen1_forward_packet_to_subcontext, + .hfp_free_tid = hfp_gen1_free_tid, + .hfp_get_bthqp = hfp_gen1_get_bthqp, + .hfp_get_cc_settings_bin = hfp_gen1_get_cc_settings_bin, + .hfp_get_cc_table_bin = hfp_gen1_get_cc_table_bin, + .hfp_get_cl_q_head_index = hfp_gen1_get_cl_q_head_index, + .hfp_get_cl_q_tail_index = hfp_gen1_get_cl_q_tail_index, + .hfp_get_context = hfp_gen1_get_context, + .hfp_get_egr_buff = hfp_gen1_get_egr_buff, + .hfp_get_fd = hfp_gen1_get_fd, + .hfp_get_gid_hi = hfp_gen1_get_gid_hi, + .hfp_get_gid_lo = hfp_gen1_get_gid_lo, + .hfp_get_hfi_event_bits = hfp_gen1_get_hfi_event_bits, + .hfp_get_hfi_type = hfp_gen1_get_hfi_type, + .hfp_get_hw_status = hfp_gen1_get_hw_status, + .hfp_get_hw_status_freezemsg = hfp_gen1_get_hw_status_freezemsg, + .hfp_get_jkey = hfp_gen1_get_jkey, + .hfp_get_lid = hfp_gen1_get_lid, + .hfp_get_node_id = hfp_gen1_get_node_id, + .hfp_get_num_contexts = 
hfp_gen1_get_num_contexts, + .hfp_get_num_free_contexts = hfp_gen1_get_num_free_contexts, + .hfp_get_pio_size = hfp_gen1_get_pio_size, + .hfp_get_pio_stall_cnt = hfp_gen1_get_pio_stall_cnt, + .hfp_get_port_active = hfp_gen1_get_port_active, + .hfp_get_port_gid = hfp_gen1_get_port_gid, + .hfp_get_port_index2pkey = hfp_gen1_get_port_index2pkey, + .hfp_get_port_lid = hfp_gen1_get_port_lid, + .hfp_get_port_lmc = hfp_gen1_get_port_lmc, + .hfp_get_port_num = hfp_gen1_get_port_num, + .hfp_get_port_rate = hfp_gen1_get_port_rate, + .hfp_get_port_sc2vl = hfp_gen1_get_port_sc2vl, + .hfp_get_port_sl2sc = hfp_gen1_get_port_sl2sc, + .hfp_get_receive_event = hfp_gen1_get_receive_event, + .hfp_get_rhf_expected_sequence_number = hfp_gen1_get_rhf_expected_sequence_number, + .hfp_get_rx_egr_tid_cnt = hfp_gen1_get_rx_egr_tid_cnt, + .hfp_get_rx_hdr_q_cnt = hfp_gen1_get_rx_hdr_q_cnt, + .hfp_get_rx_hdr_q_ent_size = hfp_gen1_get_rx_hdr_q_ent_size, + .hfp_get_sdma_req_size = hfp_gen1_get_sdma_req_size, + .hfp_get_sdma_ring_size = hfp_gen1_get_sdma_ring_size, + .hfp_get_sdma_ring_slot_status = hfp_gen1_get_sdma_ring_slot_status, + .hfp_get_subctxt = hfp_gen1_get_subctxt, + .hfp_get_subctxt_cnt = hfp_gen1_get_subctxt_cnt, + .hfp_get_tid_exp_cnt = hfp_gen1_get_tid_exp_cnt, + .hfp_get_tidcache_invalidation = hfp_gen1_get_tidcache_invalidation, + .hfp_get_unit_active = hfp_gen1_get_unit_active, + .hfp_get_unit_id = hfp_gen1_get_unit_id, + .hfp_get_user_major_bldtime_version = hfp_gen1_get_user_major_bldtime_version, + .hfp_get_user_major_bldtime_version = hfp_gen1_get_user_major_bldtime_version, + .hfp_get_user_major_runtime_version = hfp_gen1_get_user_major_runtime_version, + .hfp_get_user_major_runtime_version = hfp_gen1_get_user_major_runtime_version, + .hfp_get_user_minor_bldtime_version = hfp_gen1_get_user_minor_bldtime_version, + .hfp_get_user_minor_bldtime_version = hfp_gen1_get_user_minor_bldtime_version, + .hfp_get_user_minor_runtime_version = hfp_gen1_get_user_minor_runtime_version, + .hfp_get_user_minor_runtime_version = hfp_gen1_get_user_minor_runtime_version, + .hfp_hfi_reset_context = hfp_gen1_hfi_reset_context, + .hfp_poll_type = hfp_gen1_poll_type, + .hfp_retire_hdr_q_entry = hfp_gen1_retire_hdr_q_entry, + .hfp_set_cl_q_head_index = hfp_gen1_set_cl_q_head_index, + .hfp_set_cl_q_tail_index = hfp_gen1_set_cl_q_tail_index, + .hfp_set_effective_mtu = hfp_gen1_set_effective_mtu, + .hfp_set_pbc = hfp_gen1_set_pbc, + .hfp_set_pio_size = hfp_gen1_set_pio_size, + .hfp_set_pkey = hfp_gen1_set_pkey, + .hfp_set_rhf_expected_sequence_number = hfp_gen1_set_rhf_expected_sequence_number, + .hfp_set_tf_valid = hfp_gen1_set_tf_valid, + .hfp_spio_fini = hfp_gen1_spio_fini, + .hfp_spio_init = hfp_gen1_spio_init, + .hfp_spio_process_events = hfp_gen1_spio_process_events, + .hfp_spio_transfer_frame = hfp_gen1_spio_transfer_frame, + .hfp_subcontext_ureg_get = hfp_gen1_subcontext_ureg_get, + .hfp_tidflow_check_update_pkt_seq = hfp_gen1_tidflow_check_update_pkt_seq, + .hfp_tidflow_get = hfp_gen1_tidflow_get, + .hfp_tidflow_get_enabled = hfp_gen1_tidflow_get_enabled, + .hfp_tidflow_get_flowvalid = hfp_gen1_tidflow_get_flowvalid, + .hfp_tidflow_get_genmismatch = hfp_gen1_tidflow_get_genmismatch, + .hfp_tidflow_get_genval = hfp_gen1_tidflow_get_genval, + .hfp_tidflow_get_hw = hfp_gen1_tidflow_get_hw, + .hfp_tidflow_get_keep_after_seqerr = hfp_gen1_tidflow_get_keep_after_seqerr, + .hfp_tidflow_get_keep_on_generr = hfp_gen1_tidflow_get_keep_on_generr, + .hfp_tidflow_get_keep_payload_on_generr = 
hfp_gen1_tidflow_get_keep_payload_on_generr, + .hfp_tidflow_get_seqmismatch = hfp_gen1_tidflow_get_seqmismatch, + .hfp_tidflow_get_seqnum = hfp_gen1_tidflow_get_seqnum, + .hfp_tidflow_reset = hfp_gen1_tidflow_reset, + .hfp_tidflow_set_entry = hfp_gen1_tidflow_set_entry, + .hfp_update_tid = hfp_gen1_update_tid, + .hfp_writev = hfp_gen1_writev, +#endif + .hfp_get_default_pkey = hfp_gen1_get_default_pkey, + .hfp_get_num_units = hfp_gen1_get_num_units, + .hfp_get_num_ports = hfp_gen1_get_num_ports, + .hfp_initialize = hfp_gen1_initialize, + }, + /* start of private hfp_gen1_private data */ + .hfp_private = { + .sdmahdr_req_size = 0, + .dma_rtail = 0, + .hdrq_rhf_off = 0, + } +}; + +/* __psmi_hal_gen1_constructor */ +static void __attribute__ ((constructor)) __psmi_hal_gen1_constructor(void) +{ + psmi_hal_register_instance((psmi_hal_instance_t*)&psm_gen1_hi); +} diff --git a/psm_hal_gen1/psm_hal_gen1.h b/psm_hal_gen1/psm_hal_gen1.h new file mode 100644 index 0000000..abe04a5 --- /dev/null +++ b/psm_hal_gen1/psm_hal_gen1.h @@ -0,0 +1,145 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2017 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2017 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "psm_user.h" +#include "ips_proto.h" +#include "ips_proto_internal.h" +#include "psm_hal_gen1_spio.h" +#include "psm_mq_internal.h" +#include "opa_user_gen1.h" + +#define LAST_RHF_SEQNO 13 + +typedef struct +{ + volatile uint64_t *cl_q_head; + volatile uint64_t *cl_q_tail; + union + { + /* hdr_qe's are only present in *_RX_HDR_Q* CL Q types: */ + struct + { + uint32_t rx_hdrq_rhf_seq; + uint32_t *p_rx_hdrq_rhf_seq; + uint32_t *hdrq_base_addr; + } hdr_qe; /* header queue entry */ + /* egr_buffs's are only present in *_RX_EGR_Q* CL Q types: */ + void **egr_buffs; + }; +} psm_hal_gen1_cl_q_t; + +COMPILE_TIME_ASSERT(MAX_SHARED_CTXTS_MUST_MATCH, PSM_HAL_MAX_SHARED_CTXTS == HFI1_MAX_SHARED_CTXTS); + +/* Private struct on a per-context basis. */ +typedef struct _hfp_gen1_pc_private +{ + struct _hfi_ctrl *ctrl; /* driver opaque hfi_proto */ + psm_hal_gen1_cl_q_t cl_qs[PSM_HAL_GET_SC_CL_Q_RX_EGR_Q(7) + 1]; + struct ips_hwcontext_ctrl *hwcontext_ctrl; + struct ips_subcontext_ureg *subcontext_ureg[HFI1_MAX_SHARED_CTXTS]; + struct ips_spio spio_ctrl; + struct hfi1_user_info_dep user_info; +} hfp_gen1_pc_private; + +/* At the end of each scb struct, we have space reserved to accommodate + * three structures (for GEN1)- + * struct psm_hal_sdma_req_info, struct psm_hal_pbc and struct ips_message_header. + * The HIC should get the size needed for the extended memory region + * using a HAL call (psmi_hal_get_scb_extended_mem_size). For Gen1, this API + * will return the size of the below struct psm_hal_gen1_scb_extended + * aligned up to be able to fit struct psm_hal_pbc on a 64-byte boundary. + */ + +#define PSMI_SHARED_CONTEXTS_ENABLED_BY_DEFAULT 1 + +struct psm_hal_gen1_scb_extended { + union + { + struct sdma_req_info sri1; + struct sdma_req_info_v6_3 sri2; + }; + struct { + struct psm_hal_pbc pbc; + struct ips_message_header ips_lrh; + } PSMI_CACHEALIGN; +}; + +/* declare the hfp_gen1_private struct */ +typedef struct _hfp_gen1_private +{ + /* GEN1 specific data that are common to all contexts: */ + int sdmahdr_req_size; + int dma_rtail; + uint32_t hdrq_rhf_off; +} hfp_gen1_private_t; + +/* declare hfp_gen1_t struct, (combines public psmi_hal_instance_t + together with a private struct) */ +typedef struct _hfp_gen1 +{ + psmi_hal_instance_t phi; + hfp_gen1_private_t hfp_private; +} hfp_gen1_t; + +static const struct +{ + uint32_t hfi1_event_bit, psmi_hal_hfi_event_bit; +} hfi1_events_map[] = +{ + { HFI1_EVENT_FROZEN, PSM_HAL_HFI_EVENT_FROZEN }, + { HFI1_EVENT_LINKDOWN, PSM_HAL_HFI_EVENT_LINKDOWN }, + { HFI1_EVENT_LID_CHANGE, PSM_HAL_HFI_EVENT_LID_CHANGE }, + { HFI1_EVENT_LMC_CHANGE, PSM_HAL_HFI_EVENT_LMC_CHANGE }, + { HFI1_EVENT_SL2VL_CHANGE, PSM_HAL_HFI_EVENT_SL2VL_CHANGE }, + { HFI1_EVENT_TID_MMU_NOTIFY, PSM_HAL_HFI_EVENT_TID_MMU_NOTIFY}, +}; diff --git a/psm_hal_gen1/psm_hal_gen1_spio.c b/psm_hal_gen1/psm_hal_gen1_spio.c new file mode 100644 index 0000000..8767dd9 --- /dev/null +++ b/psm_hal_gen1/psm_hal_gen1_spio.c @@ -0,0 +1,928 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2017 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. 
+ + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2017 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2017 Intel Corporation. All rights reserved. 
*/ + +/* included header files */ +#include +#include +#include +#include +#include + +#include "ips_proto.h" +#include "ips_proto_internal.h" +#include "psm_hal_gen1_spio.h" +#include "ips_proto_params.h" + +/* Report PIO stalls every 20 seconds at the least */ +#define SPIO_STALL_WARNING_INTERVAL (nanosecs_to_cycles(20e9)) +#define SPIO_MAX_CONSECUTIVE_SEND_FAIL (1<<20) /* 1M */ +/* RESYNC_CONSECUTIVE_SEND_FAIL has to be a multiple of MAX_CONSECUTIVE */ +#define SPIO_RESYNC_CONSECUTIVE_SEND_FAIL (1<<4) /* 16 */ + +static void spio_report_stall(struct ips_spio *ctrl, + uint64_t t_cyc_now, uint64_t send_failures); + +static void spio_handle_stall(struct ips_spio *ctrl, uint64_t send_failures); + +static psm2_error_t spio_reset_hfi(struct ips_spio *ctrl); +static psm2_error_t spio_reset_hfi_shared(struct ips_spio *ctrl); +static psm2_error_t spio_credit_return_update(struct ips_spio *ctrl); +static psm2_error_t spio_credit_return_update_shared(struct ips_spio *ctrl); + +static PSMI_HAL_INLINE psm2_error_t +ips_spio_init(const struct psmi_context *context, struct ptl *ptl, + struct ips_spio *ctrl +#ifdef PSM_AVX512 + , int is_avx512_enabled +#endif + ) +{ + cpuid_t id; + hfp_gen1_pc_private *psm_hw_ctxt = context->psm_hw_ctxt; + struct _hfi_ctrl *con_ctrl = psm_hw_ctxt->ctrl; + + ctrl->ptl = ptl; + ctrl->context = context; + ctrl->unit_id = context->ep->unit_id; + ctrl->portnum = context->ep->portnum; + + pthread_spin_init(&ctrl->spio_lock, PTHREAD_PROCESS_PRIVATE); + ctrl->spio_credits_addr = (volatile __le64 *) con_ctrl->base_info.sc_credits_addr; + ctrl->spio_bufbase_sop = (volatile uint64_t *)con_ctrl->base_info.pio_bufbase_sop; + ctrl->spio_bufbase = (volatile uint64_t *)con_ctrl->base_info.pio_bufbase; + + ctrl->spio_consecutive_failures = 0; + ctrl->spio_num_stall = 0ULL; + ctrl->spio_num_stall_total = 0ULL; + ctrl->spio_next_stall_warning = 0ULL; + ctrl->spio_last_stall_cyc = 0ULL; + ctrl->spio_init_cyc = get_cycles(); + + ctrl->spio_total_blocks = con_ctrl->ctxt_info.credits; + ctrl->spio_block_index = 0; + + ctrl->spio_ctrl = (struct ips_spio_ctrl *)context->spio_ctrl; + if (!ctrl->spio_ctrl) { + ctrl->spio_ctrl = (volatile struct ips_spio_ctrl *) + psmi_calloc(context->ep, UNDEFINED, 1, + sizeof(struct ips_spio_ctrl)); + if (ctrl->spio_ctrl == NULL) { + return PSM2_NO_MEMORY; + } + + ctrl->spio_reset_hfi = spio_reset_hfi; + ctrl->spio_credit_return_update = + spio_credit_return_update; + } else { + ctrl->spio_reset_hfi = spio_reset_hfi_shared; + ctrl->spio_credit_return_update = + spio_credit_return_update_shared; + } + + /* + * Only the master process can initialize. + */ + if (psmi_hal_get_subctxt(context->psm_hw_ctxt) == 0) { + pthread_spin_init(&ctrl->spio_ctrl->spio_ctrl_lock, + PTHREAD_PROCESS_SHARED); + + ctrl->spio_ctrl->spio_write_in_progress = 0; + ctrl->spio_ctrl->spio_reset_count = 0; + ctrl->spio_ctrl->spio_frozen_count = 0; + + ctrl->spio_ctrl->spio_available_blocks = + ctrl->spio_total_blocks; + ctrl->spio_ctrl->spio_block_index = 0; + ctrl->spio_ctrl->spio_fill_counter = 0; + + psmi_assert(SPIO_CREDITS_Counter + (ctrl->spio_ctrl->spio_credits.value) == 0); + psmi_assert(SPIO_CREDITS_Status + (ctrl->spio_ctrl->spio_credits.value) == 0); + + ctrl->spio_ctrl->spio_credits.credit_return = + *ctrl->spio_credits_addr; + } + + /* + * Setup the PIO block copying routines. 
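+ * The copy width is chosen from CPUID feature bits (see the *_BIT
+ * definitions in psm_hal_gen1_spio.h): SSE2 enables 16B block moves,
+ * AVX2 enables 32B moves, and, when built with PSM_AVX512 and enabled
+ * at runtime, AVX512F enables 64B moves.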
+ */
+
+ get_cpuid(0x1, 0, &id);
+
+ /* 16B copying supported */
+ ctrl->spio_blockcpy_med = (id.edx & (1<<SSE2_BIT)) ?
+ hfi_pio_blockcpy_128 : hfi_pio_blockcpy_64;
+
+ get_cpuid(0x7, 0, &id);
+
+ /* 32B copying supported */
+ ctrl->spio_blockcpy_large = (id.ebx & (1<<AVX2_BIT)) ?
+ hfi_pio_blockcpy_256 : ctrl->spio_blockcpy_med;
+
+#ifdef PSM_AVX512
+ /* 64B copying supported */
+ ctrl->spio_blockcpy_large = (is_avx512_enabled && (id.ebx & (1<<AVX512F_BIT))) ?
+ hfi_pio_blockcpy_512 : ctrl->spio_blockcpy_large;
+
+#endif
+
+
+#ifdef PSM_CUDA
+ if (PSMI_IS_CUDA_ENABLED) {
+ PSMI_CUDA_CALL(cuMemHostAlloc, (void **) &ctrl->cuda_pio_buffer,
+ MAX_CUDA_MTU, CU_MEMHOSTALLOC_PORTABLE);
+ }
+#endif
+
+ _HFI_PRDBG("ips_spio_init() done\n");
+
+ return PSM2_OK;
+}
+
+static PSMI_HAL_INLINE psm2_error_t ips_spio_fini(struct ips_spio *ctrl)
+{
+#ifdef PSM_CUDA
+ if (PSMI_IS_CUDA_ENABLED)
+ PSMI_CUDA_CALL(cuMemFreeHost, (void *) ctrl->cuda_pio_buffer);
+#endif
+ spio_report_stall(ctrl, get_cycles(), 0ULL);
+ if (!ctrl->context->spio_ctrl)
+ psmi_free((void *)ctrl->spio_ctrl);
+ return PSM2_OK;
+}
+
+static PSMI_HAL_INLINE
+void
+spio_report_stall(struct ips_spio *ctrl, uint64_t t_cyc_now,
+ uint64_t send_failures)
+{
+ size_t off = 0;
+ char buf[1024];
+
+ if (ctrl->spio_num_stall == 0)
+ return;
+
+ if (send_failures > 0) {
+ char bufctr[128];
+ uint64_t tx_stat, rx_stat;
+ int ret;
+
+ off = snprintf(buf, sizeof(buf) - 1,
+ "PIO Send context %d with total blocks %d , available blocks %d, "
+ "fill counter %d, free counter %d ",
+ (int)psm2_epid_context(ctrl->context->epid),
+ ctrl->spio_total_blocks,
+ ctrl->spio_ctrl->spio_available_blocks,
+ ctrl->spio_ctrl->spio_fill_counter,
+ SPIO_CREDITS_Counter(ctrl->spio_ctrl->
+ spio_credits.value));
+ buf[off] = '\0';
+
+ /* In case hfifs isn't running */
+ ret = hfi_get_single_portctr(ctrl->unit_id, ctrl->portnum,
+ "TxPkt", &tx_stat);
+ if (ret != -1) {
+ ret = hfi_get_single_portctr(ctrl->unit_id,
+ ctrl->portnum, "RxPkt",
+ &rx_stat);
+ if (ret != -1) {
+ snprintf(bufctr, sizeof(bufctr) - 1,
+ "(TxPktCnt=%llu,RxPktCnt=%llu)",
+ (unsigned long long)tx_stat,
+ (unsigned long long)rx_stat);
+ bufctr[sizeof(bufctr) - 1] = '\0';
+ } else
+ bufctr[0] = '\0';
+ } else
+ bufctr[0] = '\0';
+
+ _HFI_DBG
+ ("PIO Send Stall after at least %.2fM failed send attempts "
+ "(elapsed=%.3fs, last=%.3fs, pio_stall_count=%lld) %s %s\n",
+ send_failures / 1e6,
+ PSMI_CYCLES_TO_SECSF(t_cyc_now - ctrl->spio_init_cyc),
+ PSMI_CYCLES_TO_SECSF(t_cyc_now -
+ ctrl->spio_last_stall_cyc),
+ (unsigned long long)ctrl->spio_num_stall,
+ bufctr[0] != '\0' ? bufctr : "", buf);
+ } else {
+ _HFI_DBG
+ ("PIO Send Stall Summary: count=%llu, last=%.3fs, elapsed=%.3fs",
+ (unsigned long long)ctrl->spio_num_stall,
+ PSMI_CYCLES_TO_SECSF(t_cyc_now - ctrl->spio_init_cyc),
+ PSMI_CYCLES_TO_SECSF(t_cyc_now -
+ ctrl->spio_last_stall_cyc));
+ }
+
+ return;
+}
+
+static PSMI_HAL_INLINE void spio_handle_stall(struct ips_spio *ctrl, uint64_t send_failures)
+{
+ uint64_t t_cyc_now = get_cycles();
+
+ /* We handle the pio-stall every time but only report something every 20
+ * seconds. We print a summary at the end while closing the device */
+ ctrl->spio_num_stall++;
+ ctrl->spio_num_stall_total++;
+
+ if (ctrl->spio_next_stall_warning <= t_cyc_now) {
+ /* If context status is ok (i.e. no cables pulled or anything) */
+ if (psmi_context_check_status(ctrl->context) == PSM2_OK)
+ spio_report_stall(ctrl, t_cyc_now, send_failures);
+ ctrl->spio_next_stall_warning =
+ get_cycles() + SPIO_STALL_WARNING_INTERVAL;
+ }
+
+ /* re-initialize our shadow from the real registers; by this time,
+ * we know the hardware has to have done the update.
+ * Also, kernel check may have changed things.
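+ * If the re-read credit return word shows the send context halted,
+ * spio_credit_return_update() will reset the context.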
+ */ + ctrl->spio_credit_return_update(ctrl); + + ctrl->spio_last_stall_cyc = t_cyc_now; + + return; +} + +/* + * A send context halt is detected in several ways: + * 1. during pio for normal credit return update; + * 2. during events process when no event; + * when a hfi is frozen, we recover hfi by calling this routine. + */ +static PSMI_HAL_INLINE void spio_reset_context(struct ips_spio *ctrl) +{ + /* if there are too many reset, teardown process */ + ctrl->spio_ctrl->spio_reset_count++; + if (ctrl->spio_ctrl->spio_reset_count > IPS_CTXT_RESET_MAX) + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Too many send context reset, teardown...\n"); + + /* + * Because there are many epaddrs and many flows using the + * same PIO queue, it is hard to search all the unacked + * queue and find the correct retry point. Instead we just + * let the upper level flow control to NAK the packets and + * do the retry from the right point. + */ + + /* Call into driver to reset send context, driver will + * block this routine until the send context is actually + * reset. + */ + ips_wmb(); + if (psmi_hal_hfi_reset_context(ctrl->context->psm_hw_ctxt)) + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Send context reset failed: %d.\n", errno); + + /* Reset spio shared control struct. */ + ctrl->spio_ctrl->spio_available_blocks = + ctrl->spio_total_blocks; + ctrl->spio_ctrl->spio_block_index = 0; + ctrl->spio_ctrl->spio_fill_counter = 0; + /* Get updated credit return again after reset. */ + ctrl->spio_ctrl->spio_credits.credit_return = + *ctrl->spio_credits_addr; + + psmi_assert(SPIO_CREDITS_Counter + (ctrl->spio_ctrl->spio_credits.value) == 0); + psmi_assert(SPIO_CREDITS_Status + (ctrl->spio_ctrl->spio_credits.value) == 0); +} + +/* + * hfi frozen is detected when checking events from driver, + * psm calls to check events in the main receive loop + * when there is no normal traffic. + */ +static PSMI_HAL_INLINE void spio_reset_hfi_internal(struct ips_spio *ctrl) +{ + struct ips_recvhdrq *recvq = &((struct ptl_ips *)(ctrl->ptl))->recvq; + struct ips_proto *proto = (struct ips_proto *)&((struct ptl_ips *)(ctrl->ptl))->proto; + + /* Reset receive queue state, this must be done first + * because after send context reset, hardware start to + * receive new packets. + */ + recvq->state->hdrq_head = 0; + recvq->state->rcv_egr_index_head = NO_EAGER_UPDATE; + recvq->state->num_hdrq_done = 0; + recvq->state->hdr_countdown = 0; + + /* set the expected sequence number to 1. */ + if (!(get_psm_gen1_hi()->hfp_private.dma_rtail)) + psmi_hal_set_rhf_expected_sequence_number(1, recvq->psm_hal_cl_hdrq, + ((struct ptl_ips *)proto->ptl)->context->psm_hw_ctxt); + + /* Reset send context */ + spio_reset_context(ctrl); + + /* Reset sdma completion queue, this should be done last + * because when send context is reset, driver will complete + * all the sdma requests with error code -2. This error + * code is ignored by PSM, but other error codes are + * caught inside the routine. + */ + while (proto->sdma_done_index != proto->sdma_fill_index) + ips_proto_dma_completion_update(proto); +} + +static PSMI_HAL_INLINE psm2_error_t spio_reset_hfi(struct ips_spio *ctrl) +{ + /* Drain receive header queue before reset hfi, we use + * the main progression loop to do this so we return from + * here. + */ + if (!ips_recvhdrq_isempty(&((struct ptl_ips *)(ctrl->ptl))->recvq)) + return PSM2_OK_NO_PROGRESS; + + /* do the real reset work: + * 1. reset receive header queue; + * 2. reset send context; + * 3. 
drain sdma completion queue;
+ */
+ spio_reset_hfi_internal(ctrl);
+
+ return PSM2_OK;
+}
+
+/*
+ * There is a shared count and a per-process count, both initialized to
+ * zero. If a process' local count is equal to the shared count, it is
+ * the first process and does the hfi reset; this process also moves
+ * both counts up by one. If a process' local count is not equal to
+ * the shared count, another process has already done the hfi reset,
+ * so it just saves the shared count to its local count and returns. All
+ * these operations are protected by spio_ctrl_lock.
+ */
+static PSMI_HAL_INLINE psm2_error_t spio_reset_hfi_shared(struct ips_spio *ctrl)
+{
+ volatile struct ips_spio_ctrl *spio_ctrl = ctrl->spio_ctrl;
+
+ /* Drain receive header queue before reset hfi, we use
+ * the main progression loop to do this so we return from
+ * here. We don't reset software receive header queue.
+ */
+ if (!ips_recvhdrq_isempty(&((struct ptl_ips *)(ctrl->ptl))->recvq))
+ return PSM2_OK_NO_PROGRESS;
+
+ pthread_spin_lock(&spio_ctrl->spio_ctrl_lock);
+
+ /*
+ * In context sharing mode, if there is a subcontext
+ * process in PIO writing, we need to wait till the PIO
+ * writing is done. So we spin wait here. If another
+ * process comes here and does the hfi reset, it should
+ * be perfectly fine.
+ */
+ while (ctrl->spio_ctrl->spio_write_in_progress) {
+ pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock);
+ usleep(1000);
+ pthread_spin_lock(&spio_ctrl->spio_ctrl_lock);
+ }
+
+ if (ctrl->spio_frozen_count == ctrl->spio_ctrl->spio_frozen_count) {
+ ctrl->spio_frozen_count++;
+ ctrl->spio_ctrl->spio_frozen_count++;
+
+ spio_reset_hfi_internal(ctrl);
+ } else
+ ctrl->spio_frozen_count = ctrl->spio_ctrl->spio_frozen_count;
+
+ pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock);
+
+ return PSM2_OK;
+}
+
+/*
+ * return value:
+ * PSM2_OK: new credits updated;
+ * PSM2_OK_NO_PROGRESS: no new credits;
+ */
+static PSMI_HAL_INLINE psm2_error_t
+spio_credit_return_update(struct ips_spio *ctrl)
+{
+ uint64_t credit_return;
+
+ credit_return = *ctrl->spio_credits_addr;
+ /* Update available blocks based on fill counter and free counter */
+ if (ctrl->spio_ctrl->spio_credits.credit_return == credit_return)
+ return PSM2_OK_NO_PROGRESS;
+
+ ctrl->spio_ctrl->spio_credits.credit_return = credit_return;
+
+ /* If Status is set, then send context is halted */
+ if (SPIO_CREDITS_Status(ctrl->spio_ctrl->spio_credits.value)) {
+ spio_reset_context(ctrl);
+ } else {
+ /*
+ * OPA1 has 1M PIO buffer, but each context can have max 64K,
+ * which is 1K 64B blocks, so the distance between fill counter
+ * and credit return counter is no more than 1024; Both fill
+ * counter and credit return counter are 11-bit values,
+ * representing range [0, 2047].
+ */
+ psmi_assert((ctrl->spio_ctrl->spio_available_blocks +
+ ((ctrl->spio_ctrl->spio_fill_counter -
+ SPIO_CREDITS_Counter(ctrl->spio_ctrl->spio_credits.
+ value)) & 0x7FF)) <=
+ ctrl->spio_total_blocks);
+ ctrl->spio_ctrl->spio_available_blocks =
+ ctrl->spio_total_blocks -
+ ((ctrl->spio_ctrl->spio_fill_counter -
+ SPIO_CREDITS_Counter(ctrl->spio_ctrl->spio_credits.
+ value)) & 0x7FF); + + /* a successful credit update, clear reset count */ + ctrl->spio_ctrl->spio_reset_count = 0; + } + + return PSM2_OK; +} + +/* + * return value: + * PSM2_OK: new credits updated; + * PSM2_OK_NO_PROGRESS: no new credits; + */ +static PSMI_HAL_INLINE psm2_error_t +spio_credit_return_update_shared(struct ips_spio *ctrl) +{ + uint64_t credit_return; + + pthread_spin_lock(&ctrl->spio_ctrl->spio_ctrl_lock); + + credit_return = *ctrl->spio_credits_addr; + /* Update available blocks based on fill counter and free counter */ + if (ctrl->spio_ctrl->spio_credits.credit_return == credit_return) { + pthread_spin_unlock(&ctrl->spio_ctrl->spio_ctrl_lock); + return PSM2_OK_NO_PROGRESS; + } + + ctrl->spio_ctrl->spio_credits.credit_return = credit_return; + + /* If Status is set, then send context is halted */ + if (SPIO_CREDITS_Status(ctrl->spio_ctrl->spio_credits.value)) { + /* + * In context sharing mode, if there is a subcontext + * process in PIO writing, we need to wait till the PIO + * writing is done. So we spin wait here. Other processes + * won't come here because for them, there is NO new + * credit return change (the first 'if' check in this + * routine). + */ + while (ctrl->spio_ctrl->spio_write_in_progress) { + pthread_spin_unlock(&ctrl->spio_ctrl->spio_ctrl_lock); + usleep(1000); + pthread_spin_lock(&ctrl->spio_ctrl->spio_ctrl_lock); + } + + spio_reset_context(ctrl); + } else { + /* + * OPA1 has 1M PIO buffer, but each context can have max 64K, + * which is 1K 64B blocks, so the distance between fill counter + * and credit return counter is no more than 1024; Both fill + * counter and credit return counter are 11 bits value, + * representing range [0, 2047]. + */ + psmi_assert((ctrl->spio_ctrl->spio_available_blocks + + ((ctrl->spio_ctrl->spio_fill_counter - + SPIO_CREDITS_Counter(ctrl->spio_ctrl->spio_credits. + value)) & 0x7FF)) <= + ctrl->spio_total_blocks); + ctrl->spio_ctrl->spio_available_blocks = + ctrl->spio_total_blocks - + ((ctrl->spio_ctrl->spio_fill_counter - + SPIO_CREDITS_Counter(ctrl->spio_ctrl->spio_credits. + value)) & 0x7FF); + + /* a successful credit update, clear reset count */ + ctrl->spio_ctrl->spio_reset_count = 0; + } + + pthread_spin_unlock(&ctrl->spio_ctrl->spio_ctrl_lock); + + return PSM2_OK; +} + +/* + * Check and process events + * return value: + * PSM2_OK: normal events processing; + * PSM2_OK_NO_PROGRESS: no event is processed; + */ +static PSMI_HAL_INLINE psm2_error_t +ips_spio_process_events(const struct ptl *ptl_gen) +{ + struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; + struct ips_spio *ctrl = ptl->proto.spioc; + uint64_t event_mask; + int rc = psmi_hal_get_hfi_event_bits(&event_mask,ctrl->context->psm_hw_ctxt); + + if (rc) + return PSM2_OK_NO_PROGRESS; + + /* + * If there is no event, try do credit return update + * to catch send context halt. + */ + if_pf(event_mask == 0) + return ctrl->spio_credit_return_update(ctrl); + + /* + * Process mmu invalidation event, this will invalidate + * all caching items removed by mmu notifier. + */ + if (event_mask & PSM_HAL_HFI_EVENT_TID_MMU_NOTIFY) { + /* + * driver will clear the event bit before return, + * PSM does not need to ack the event. 
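+ * Returning here also skips the psmi_hal_ack_hfi_event() call
+ * made below for the other event types.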
+ */ + return ips_tidcache_invalidation(&ptl->proto.protoexp->tidc); + } + + /* Check if HFI is frozen */ + if (event_mask & PSM_HAL_HFI_EVENT_FROZEN) { + /* if no progress, return and retry */ + if (ctrl->spio_reset_hfi(ctrl) != PSM2_OK) + return PSM2_OK_NO_PROGRESS; + } + + /* First ack the driver the receipt of the events */ + _HFI_VDBG("Acking event(s) 0x%" PRIx64 " to qib driver.\n", + (uint64_t) event_mask); + + psmi_hal_ack_hfi_event(event_mask, ctrl->context->psm_hw_ctxt); + + if (event_mask & PSM_HAL_HFI_EVENT_LINKDOWN) { + /* A link down event can clear the LMC and SL2VL + * change as those events are implicitly handled + * in the link up/down event handler. + */ + event_mask &= + ~(PSM_HAL_HFI_EVENT_LMC_CHANGE | + PSM_HAL_HFI_EVENT_SL2VL_CHANGE); + ips_ibta_link_updown_event(&((struct ptl_ips *)(ctrl->ptl))->proto); + _HFI_VDBG("Link down detected.\n"); + } + + if (event_mask & PSM_HAL_HFI_EVENT_LID_CHANGE) { + /* Display a warning that LID change has occurred during + * the run. This is not supported in the current + * implementation and in general is bad for the SM to + * re-assign LIDs during a run. + */ + _HFI_INFO + ("Warning! LID change detected during run. " + "Old LID: %d, New Lid: %d\n", + (int)PSMI_EPID_GET_LID(ctrl->context->epid), + (int)psmi_hal_get_port_lid(ctrl->unit_id, + ctrl->portnum)); + } + + if (event_mask & PSM_HAL_HFI_EVENT_LMC_CHANGE) + _HFI_INFO("Fabric LMC changed.\n"); + + if (event_mask & PSM_HAL_HFI_EVENT_SL2VL_CHANGE) { + _HFI_INFO("SL2VL mapping changed for port.\n"); + ips_ibta_init_sl2sc2vl_table(&((struct ptl_ips *)(ctrl->ptl))->proto); + } + + return PSM2_OK; +} + +static PSMI_HAL_INLINE void +spio_handle_resync(struct ips_spio *ctrl, uint64_t consecutive_send_failed) +{ + /* hfi_force_pio_avail_update(ctrl->context->ctrl); */ + + if (!(consecutive_send_failed & (SPIO_MAX_CONSECUTIVE_SEND_FAIL - 1))) + spio_handle_stall(ctrl, consecutive_send_failed); +} + +/* + * This function attempts to write a packet to a PIO. + * + * Recoverable errors: + * PSM2_OK: Packet triggered through PIO. + * PSM2_EP_NO_RESOURCES: No PIO bufs available or cable pulled. + * + * Unrecoverable errors: + * PSM2_EP_NO_NETWORK: No network, no lid, ... + * PSM2_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc. + */ +static inline psm2_error_t +ips_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, + struct psm_hal_pbc *pbc, uint32_t *payload, + uint32_t length, uint32_t isCtrlMsg, + uint32_t cksum_valid, uint32_t cksum +#ifdef PSM_CUDA + , uint32_t is_cuda_payload +#endif + ) +{ + struct ips_spio *ctrl = proto->spioc; + volatile struct ips_spio_ctrl *spio_ctrl = ctrl->spio_ctrl; + volatile uint64_t *pioaddr; + uint32_t paylen, nblks; + psm2_error_t err = PSM2_OK; + int do_lock = psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED); + + if (do_lock) + pthread_spin_lock(&ctrl->spio_lock); + + if_pf(PSMI_FAULTINJ_ENABLED()) { + PSMI_FAULTINJ_STATIC_DECL(fi_lost, "piosend", 1, + IPS_FAULTINJ_PIOLOST); + PSMI_FAULTINJ_STATIC_DECL(fi_busy, "piobusy", 1, + IPS_FAULTINJ_PIOBUSY); + if (psmi_faultinj_is_fault(fi_lost)) { + if (do_lock) + pthread_spin_unlock(&ctrl->spio_lock); + return PSM2_OK; + } else if (psmi_faultinj_is_fault(fi_busy)) + goto fi_busy; + /* else fall through normal processing path, i.e. no faults */ + } + + psmi_assert((length & 0x3) == 0); + paylen = length + (cksum_valid ? 
PSM_CRC_SIZE_IN_BYTES : 0); + nblks = 1 + ((paylen + 63) >> 6); + + if (spio_ctrl->spio_available_blocks < nblks) { + ctrl->spio_credit_return_update(ctrl); + + if_pf(spio_ctrl->spio_available_blocks < nblks) { + /* Check unit status */ +fi_busy: + if ((err = + psmi_context_check_status(ctrl->context)) == + PSM2_OK) { + if (0 == + (++ctrl-> + spio_consecutive_failures & + (SPIO_RESYNC_CONSECUTIVE_SEND_FAIL - 1))) + spio_handle_resync(ctrl, + ctrl-> + spio_consecutive_failures); + err = PSM2_EP_NO_RESOURCES; + } + /* If cable is pulled, we don't count it as a consecutive failure, + * we just make it as though no send pio was available */ + else if (err == PSM2_OK_NO_PROGRESS) + err = PSM2_EP_NO_RESOURCES; + /* else something bad happened in check_status */ + if (do_lock) + pthread_spin_unlock(&ctrl->spio_lock); + return err; + } + } + + /* + * if context->spio_ctrl is set, it is pointing to shared context ureg + * page, and we are using context sharing. + */ + if (ctrl->context->spio_ctrl) { + pthread_spin_lock(&spio_ctrl->spio_ctrl_lock); + if (spio_ctrl->spio_available_blocks < nblks) { + pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock); + + if (do_lock) + pthread_spin_unlock(&ctrl->spio_lock); + return PSM2_EP_NO_RESOURCES; + } + } + + _HFI_VDBG("credits: total %d, avail %d index %d, fill %d " + "free %d: %d %d %d %d %d; addr %llx\n", + ctrl->spio_total_blocks, + spio_ctrl->spio_available_blocks, + spio_ctrl->spio_block_index, + spio_ctrl->spio_fill_counter, + SPIO_CREDITS_Counter(spio_ctrl->spio_credits.value), + SPIO_CREDITS_Status(spio_ctrl->spio_credits.value), + SPIO_CREDITS_DueToPbc(spio_ctrl->spio_credits.value), + SPIO_CREDITS_DueToTheshold(spio_ctrl->spio_credits.value), + SPIO_CREDITS_DueToErr(spio_ctrl->spio_credits.value), + SPIO_CREDITS_DueToForce(spio_ctrl->spio_credits.value), + *ctrl->spio_credits_addr); + + /* + * Save the assigned locally, update the shared for other processes. + */ + ctrl->spio_block_index = spio_ctrl->spio_block_index; + spio_ctrl->spio_available_blocks -= nblks; + /* fill counter should be 11 bits value, same as credit return counter */ + spio_ctrl->spio_fill_counter = + (spio_ctrl->spio_fill_counter + nblks) & 0x7FF; + spio_ctrl->spio_block_index += nblks; + if (spio_ctrl->spio_block_index >= ctrl->spio_total_blocks) + spio_ctrl->spio_block_index -= ctrl->spio_total_blocks; + + /* + * Unlock in context sharing mode, but increase refcount to + * indicate I am in progress to write to PIO blocks. 
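+ * The matching decrement is done at the end of this routine, after
+ * the payload copy into the PIO blocks has completed.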
+ */ + if (ctrl->context->spio_ctrl) { + spio_ctrl->spio_write_in_progress++; + pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock); + } + + ctrl->spio_num_stall = 0; /* now able to send, so clear if set */ + ctrl->spio_consecutive_failures = 0; + if (do_lock) + pthread_spin_unlock(&ctrl->spio_lock); + + _HFI_VDBG("PIO write: nblks %d length %d, paylen %d\n", nblks, length, + paylen); + + /* Setup PBC for this packet */ + ips_proto_pbc_update(proto, flow, isCtrlMsg, + pbc, sizeof(struct ips_message_header), paylen); + + /* Write to PIO: SOP block */ + pioaddr = ctrl->spio_bufbase_sop + ctrl->spio_block_index * 8; + if (++ctrl->spio_block_index == ctrl->spio_total_blocks) + ctrl->spio_block_index = 0; + + ctrl->spio_blockcpy_med(pioaddr, (uint64_t *) pbc, 1); + _HFI_VDBG("pio qw write sop %p: 8\n", pioaddr); + + /* Write to PIO: other blocks of payload */ +#ifdef PSM_CUDA + if (is_cuda_payload) { + /* Since the implementation of cuMemcpy is unknown, + and the HFI specifies several conditions for how PIO + writes must occur, for safety reasons we should not assume + that cuMemcpy will follow the HFI's requirements. + The cuMemcpy should instead write into a buffer in + host memory, and then PSM can copy to the HFI as usual. */ + PSMI_CUDA_CALL(cuMemcpyDtoH, ctrl->cuda_pio_buffer, + (CUdeviceptr)payload, paylen); + payload = (uint32_t *) ctrl->cuda_pio_buffer; + } +#endif + if (length >= 64) { + + ips_spio_blockcpy_fn_t blockcpy_fn; + if (length >= 256) { + blockcpy_fn = ctrl->spio_blockcpy_large; + } + else { + blockcpy_fn = ctrl->spio_blockcpy_med; + } + + uint32_t blks2send = length >> 6; + uint32_t blks2end = + ctrl->spio_total_blocks - ctrl->spio_block_index; + + pioaddr = ctrl->spio_bufbase + ctrl->spio_block_index * 8; + if (blks2end >= blks2send) { + blockcpy_fn(pioaddr, + (uint64_t *)payload, blks2send); + _HFI_VDBG("pio blk write %p: %d\n", + pioaddr, blks2send); + ctrl->spio_block_index += blks2send; + if (ctrl->spio_block_index == ctrl->spio_total_blocks) + ctrl->spio_block_index = 0; + payload += blks2send*16; + } else { + blockcpy_fn(pioaddr, + (uint64_t *)payload, blks2end); + _HFI_VDBG("pio blk write %p: %d\n", + pioaddr, blks2end); + payload += blks2end*16; + + pioaddr = ctrl->spio_bufbase; + blockcpy_fn(pioaddr, + (uint64_t *)payload, (blks2send-blks2end)); + _HFI_VDBG("pio blk write %p: %d\n", + pioaddr, (blks2send-blks2end)); + ctrl->spio_block_index = blks2send - blks2end; + payload += (blks2send-blks2end)*16; + } + + length -= blks2send*64; + } + + /* + * The following code makes sure to write to pioaddr in + * qword granularity, this is required by hardware. + */ + paylen = length + (cksum_valid ? 
PSM_CRC_SIZE_IN_BYTES : 0); + if (paylen > 0) { + uint32_t blkbuf[32]; + uint32_t qws = length >> 3; + uint32_t dws = 0; + + pioaddr = ctrl->spio_bufbase + ctrl->spio_block_index * 8; + if (++ctrl->spio_block_index == ctrl->spio_total_blocks) + ctrl->spio_block_index = 0; + + /* Write the remaining qwords of payload */ + if (qws) { + hfi_qwordcpy_safe(pioaddr, (uint64_t *) payload, qws); + _HFI_VDBG("pio qw write %p: %d\n", pioaddr, qws); + payload += qws << 1; + length -= qws << 3; + + pioaddr += qws; + paylen -= qws << 3; + } + + /* if we have last one dword payload */ + if (length > 0) { + blkbuf[dws++] = payload[0]; + } + /* if we have checksum to attach */ + if (paylen > length) { + blkbuf[dws++] = cksum; + blkbuf[dws++] = cksum; + } + + /* Write the rest of qwords of current block */ + hfi_qwordcpy_safe(pioaddr, (uint64_t *) blkbuf, 8 - qws); + _HFI_VDBG("pio qw write %p: %d\n", pioaddr, 8 - qws); + + if (paylen > ((8 - qws) << 3)) { + /* We need another block */ + pioaddr = + ctrl->spio_bufbase + ctrl->spio_block_index * 8; + if (++ctrl->spio_block_index == ctrl->spio_total_blocks) + ctrl->spio_block_index = 0; + + /* Write the last block */ + hfi_qwordcpy_safe(pioaddr, + (uint64_t *) &blkbuf[(8 - qws) << 1], + 8); + _HFI_VDBG("pio qw write %p: %d\n", pioaddr, 8); + } + } + /* + * In context sharing, we need to track who is in progress of + * writing to PIO block, this is for halted send context reset. + * I am done with PIO blocks writing, decrease the refcount. + */ + if (ctrl->context->spio_ctrl) { + pthread_spin_lock(&spio_ctrl->spio_ctrl_lock); + spio_ctrl->spio_write_in_progress--; + pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock); + } + + return err; +} /* ips_spio_transfer_frame() */ diff --git a/psm_hal_gen1/psm_hal_gen1_spio.h b/psm_hal_gen1/psm_hal_gen1_spio.h new file mode 100644 index 0000000..14bc646 --- /dev/null +++ b/psm_hal_gen1/psm_hal_gen1_spio.h @@ -0,0 +1,192 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2017 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2017 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2017 Intel Corporation. All rights reserved. */ + +#ifndef IPS_SPIO_H +#define IPS_SPIO_H + +#include "psm_user.h" + +#define IPS_CTXT_RESET_MAX 1000 /* max send context reset */ +struct ips_spio; +struct ptl; +struct ips_proto; +struct ips_flow; + +/* 64B move instruction support */ +#define AVX512F_BIT 16 /* level 07h, ebx */ +/* 32B move instruction support */ +#define AVX2_BIT 5 /* level 07h, ebx */ +/* 16B move instruction support */ +#define SSE2_BIT 26 /* level 01h, edx */ + +typedef +void (*ips_spio_blockcpy_fn_t)(volatile uint64_t *dest, + const uint64_t *src, uint32_t nblock); +#ifdef PSM_AVX512 +void hfi_pio_blockcpy_512(volatile uint64_t *dest, + const uint64_t *src, uint32_t nblock); +#endif +void hfi_pio_blockcpy_256(volatile uint64_t *dest, + const uint64_t *src, uint32_t nblock); +void hfi_pio_blockcpy_128(volatile uint64_t *dest, + const uint64_t *src, uint32_t nblock); +void hfi_pio_blockcpy_64(volatile uint64_t *dest, + const uint64_t *src, uint32_t nblock); + + +static PSMI_HAL_INLINE psm2_error_t ips_spio_init(const psmi_context_t *context, + struct ptl *ptl, struct ips_spio *ctrl +#ifdef PSM_AVX512 + , int is_avx512_enabled +#endif +); + +static PSMI_HAL_INLINE psm2_error_t ips_spio_fini(struct ips_spio *ctrl); + +static inline psm2_error_t ips_spio_transfer_frame(struct ips_proto *proto, + struct ips_flow *flow, struct psm_hal_pbc *pbc, + uint32_t *payload, uint32_t length, + uint32_t isCtrlMsg, uint32_t cksum_valid, + uint32_t cksum +#ifdef PSM_CUDA + , uint32_t is_cuda_payload +#endif +); + +static psm2_error_t ips_spio_process_events(const struct ptl *ptl); + +#define SPIO_CREDITS_Counter(value) (((value) >> 0) & 0x7FF) +#define SPIO_CREDITS_Status(value) (((value) >> 11) & 0x1) +#define SPIO_CREDITS_DueToPbc(value) (((value) >> 12) & 0x1) +#define SPIO_CREDITS_DueToTheshold(value) (((value) >> 13) & 0x1) +#define SPIO_CREDITS_DueToErr(value) (((value) >> 14) & 0x1) +#define SPIO_CREDITS_DueToForce(value) (((value) >> 15) & 0x1) +struct ips_spio_credits { +/* don't use bit operation for performance reason, + * using above macro instead. 
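+ * The 16-bit credit return word decomposes as the following fields
+ * (least significant bits first):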
+ uint16_t Counter:11; + uint16_t Status:1; + uint16_t CreditReturnDueToPbc:1; + uint16_t CreditReturnDueToThreshold:1; + uint16_t CreditReturnDueToErr:1; + uint16_t CreditReturnDueToForce:1; +*/ + union { + struct { + uint16_t value; + uint16_t pad0; + uint32_t pad1; + }; + uint64_t credit_return; + }; +}; + +struct ips_spio_ctrl { + /* credit return lock for context sharing */ + pthread_spinlock_t spio_ctrl_lock; + + /* PIO write in progress for context sharing */ + volatile uint16_t spio_write_in_progress; + /* send context reset count */ + volatile uint16_t spio_reset_count; + /* HFI frozen count, shared copy */ + volatile uint16_t spio_frozen_count; + + volatile uint16_t spio_available_blocks; + volatile uint16_t spio_block_index; + volatile uint16_t spio_fill_counter; + volatile struct ips_spio_credits spio_credits; +} __attribute__ ((aligned(64))); + +struct ips_spio { + const psmi_context_t *context; + struct ptl *ptl; + uint16_t unit_id; + uint16_t portnum; + + pthread_spinlock_t spio_lock; /* thread lock */ + volatile __le64 *spio_credits_addr __attribute__ ((aligned(64))); + volatile uint64_t *spio_bufbase_sop; + volatile uint64_t *spio_bufbase; + volatile struct ips_spio_ctrl *spio_ctrl; + + uint16_t spio_frozen_count; /* local copy */ + uint16_t spio_total_blocks; + uint16_t spio_block_index; + + uint32_t spio_consecutive_failures; + uint64_t spio_num_stall; + uint64_t spio_num_stall_total; + uint64_t spio_next_stall_warning; + uint64_t spio_last_stall_cyc; + uint64_t spio_init_cyc; + + psm2_error_t (*spio_reset_hfi)(struct ips_spio *ctrl); + psm2_error_t (*spio_credit_return_update)(struct ips_spio *ctrl); + + /* copying routines based on block size */ + ips_spio_blockcpy_fn_t spio_blockcpy_med; + ips_spio_blockcpy_fn_t spio_blockcpy_large; + +#ifdef PSM_CUDA + /* Use an intermediate buffer when writing PIO data from the + GPU to ensure that we follow the HFI's write ordering rules. */ + unsigned char *cuda_pio_buffer; + +#define MAX_CUDA_MTU 10240 +#endif +}; + +#endif /* IPS_SPIO_H */ diff --git a/psm_hal_gen1/psm_hal_inline_i.h b/psm_hal_gen1/psm_hal_inline_i.h new file mode 100644 index 0000000..d573653 --- /dev/null +++ b/psm_hal_gen1/psm_hal_inline_i.h @@ -0,0 +1,2022 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2017 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2017 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "psm_hal_gen1.h" + +static inline struct _hfp_gen1 *get_psm_gen1_hi(void) +{ + return (struct _hfp_gen1*) psmi_hal_current_hal_instance; +} + +/* hfp_gen1_initialize */ +static PSMI_HAL_INLINE int hfp_gen1_initialize(psmi_hal_instance_t *phi) +{ + return 0; +} + +/* hfp_gen1_finalize */ +static PSMI_HAL_INLINE int hfp_gen1_finalize(void) +{ + return 0; +} + +/* hfp_gen1_get_num_units */ +static PSMI_HAL_INLINE int hfp_gen1_get_num_units(int wait) +{ + return hfi_get_num_units(wait); +} + +/* hfp_gen1_get_num_ports */ +static PSMI_HAL_INLINE int hfp_gen1_get_num_ports(void) +{ + return HFI_NUM_PORTS_GEN1; +} + +/* hfp_gen1_get_unit_active */ +static PSMI_HAL_INLINE int hfp_gen1_get_unit_active(int unit) +{ + return hfi_get_unit_active(unit); +} + +/* hfp_gen1_get_port_active */ +static PSMI_HAL_INLINE int hfp_gen1_get_port_active(int unit, int port) +{ + return hfi_get_port_active(unit, port); +} + +/* hfp_gen1_get_contexts */ +static PSMI_HAL_INLINE int hfp_gen1_get_num_contexts(int unit) +{ + int64_t nctxts=0; + + if (!hfi_sysfs_unit_read_s64(unit, "nctxts", + &nctxts, 0)) + { + return (int)nctxts; + } + return -PSM_HAL_ERROR_GENERAL_ERROR; +} + +/* hfp_gen1_get_num_free_contexts */ +static PSMI_HAL_INLINE int hfp_gen1_get_num_free_contexts(int unit) +{ + int64_t nfreectxts=0; + + if (!hfi_sysfs_unit_read_s64(unit, "nfreectxts", + &nfreectxts, 0)) + { + return (int)nfreectxts; + } + return -PSM_HAL_ERROR_GENERAL_ERROR; +} + +/* hfp_gen1_close_context */ +static PSMI_HAL_INLINE int hfp_gen1_close_context(psmi_hal_hw_context *ctxtp) +{ + if (!ctxtp || !*ctxtp) + return PSM_HAL_ERROR_OK; + + int i; + hfp_gen1_pc_private *psm_hw_ctxt = *ctxtp; + + ips_recvq_egrbuf_table_free(psm_hw_ctxt->cl_qs[PSM_HAL_CL_Q_RX_EGR_Q].egr_buffs); + + for (i=0;i < psm_hw_ctxt->user_info.subctxt_cnt;i++) + ips_recvq_egrbuf_table_free( + psm_hw_ctxt->cl_qs[ + PSM_HAL_GET_SC_CL_Q_RX_EGR_Q(i) + ].egr_buffs); + struct hfi1_base_info *binfo; + struct hfi1_ctxt_info *cinfo; + int __hfi_pg_sz = sysconf(_SC_PAGESIZE); + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + binfo = &ctrl->base_info; + cinfo = &ctrl->ctxt_info; + + munmap((void*)PSMI_ALIGNDOWN(binfo->sc_credits_addr, __hfi_pg_sz), + __hfi_pg_sz); + munmap((void*)PSMI_ALIGNDOWN(binfo->pio_bufbase_sop, __hfi_pg_sz), + cinfo->credits * 64); + munmap((void*)PSMI_ALIGNDOWN(binfo->pio_bufbase, __hfi_pg_sz), + cinfo->credits * 64); + munmap((void*)PSMI_ALIGNDOWN(binfo->rcvhdr_bufbase, __hfi_pg_sz), + cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize); + munmap((void*)PSMI_ALIGNDOWN(binfo->rcvegr_bufbase, 
__hfi_pg_sz),
+ cinfo->egrtids * cinfo->rcvegr_size);
+ munmap((void*)PSMI_ALIGNDOWN(binfo->sdma_comp_bufbase, __hfi_pg_sz),
+ cinfo->sdma_ring_size * sizeof(struct hfi1_sdma_comp_entry));
+ /* only unmap the RTAIL if it was enabled in the first place */
+ if (cinfo->runtime_flags & HFI1_CAP_DMA_RTAIL) {
+ munmap((void*)PSMI_ALIGNDOWN(binfo->rcvhdrtail_base, __hfi_pg_sz),
+ __hfi_pg_sz);
+ }
+ munmap((void*)PSMI_ALIGNDOWN(binfo->user_regbase, __hfi_pg_sz),
+ __hfi_pg_sz);
+ munmap((void*)PSMI_ALIGNDOWN(binfo->events_bufbase, __hfi_pg_sz),
+ __hfi_pg_sz);
+ munmap((void*)PSMI_ALIGNDOWN(binfo->status_bufbase, __hfi_pg_sz),
+ __hfi_pg_sz);
+
+ /* only unmap subcontext-related stuff if subcontexts are enabled */
+ if (psm_hw_ctxt->user_info.subctxt_cnt > 0) {
+ munmap((void*)PSMI_ALIGNDOWN(binfo->subctxt_uregbase, __hfi_pg_sz),
+ __hfi_pg_sz);
+ munmap((void*)PSMI_ALIGNDOWN(binfo->subctxt_rcvhdrbuf, __hfi_pg_sz),
+ __hfi_pg_sz);
+ munmap((void*)PSMI_ALIGNDOWN(binfo->subctxt_rcvegrbuf, __hfi_pg_sz),
+ __hfi_pg_sz);
+ }
+
+ close(psm_hw_ctxt->ctrl->fd);
+ free(psm_hw_ctxt->ctrl);
+ psmi_free(psm_hw_ctxt);
+
+ return PSM_HAL_ERROR_OK;
+}
+
+/* Moved from psm_context.c */
+
+ustatic PSMI_HAL_INLINE
+int MOCKABLE(psmi_sharedcontext_params)(int *nranks, int *rankid);
+MOCK_DCL_EPILOGUE(psmi_sharedcontext_params);
+ustatic PSMI_HAL_INLINE psm2_error_t psmi_init_userinfo_params(psm2_ep_t ep,
+ int unit_id,
+ psm2_uuid_t const unique_job_key,
+ struct hfi1_user_info_dep *user_info);
+
+/*
+ * Prepare user_info params for driver open, used only in psmi_context_open
+ */
+ustatic PSMI_HAL_INLINE
+psm2_error_t
+psmi_init_userinfo_params(psm2_ep_t ep, int unit_id,
+ psm2_uuid_t const unique_job_key,
+ struct hfi1_user_info_dep *user_info)
+{
+ /* static variables, shared among rails */
+ static int shcontexts_enabled = -1, rankid, nranks;
+
+ int avail_contexts = 0, max_contexts, ask_contexts;
+ int ranks_per_context = 0;
+ psm2_error_t err = PSM2_OK;
+ union psmi_envvar_val env_maxctxt, env_ranks_per_context;
+ static int subcontext_id_start;
+
+ memset(user_info, 0, sizeof(*user_info));
+ user_info->userversion = HFI1_USER_SWMINOR|(hfi_get_user_major_version()<<HFI1_SWMAJOR_SHIFT);
+ user_info->subctxt_id = 0;
+ user_info->subctxt_cnt = 0;
+ memcpy(user_info->uuid, unique_job_key, sizeof(user_info->uuid));
+
+ if (shcontexts_enabled == -1) {
+ shcontexts_enabled =
+ psmi_sharedcontext_params(&nranks, &rankid);
+ }
+ if (!shcontexts_enabled)
+ return err;
+
+ avail_contexts = hfi_get_num_contexts(unit_id, 0);
+
+ if (avail_contexts == 0) {
+ err = psmi_handle_error(NULL, PSM2_EP_NO_DEVICE,
+ "PSM2 found 0 available contexts on OPA device(s).");
+ goto fail;
+ }
+
+ /* See if the user wants finer control over context assignments */
+ if (!psmi_getenv("PSM2_MAX_CONTEXTS_PER_JOB",
+ "Maximum number of contexts for this PSM2 job",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
+ (union psmi_envvar_val)avail_contexts, &env_maxctxt)) {
+ max_contexts = max(env_maxctxt.e_int, 1); /* needs to be non-negative */
+ ask_contexts = min(max_contexts, avail_contexts); /* needs to be available */
+ } else if (!psmi_getenv("PSM2_SHAREDCONTEXTS_MAX",
+ "", /* deprecated */
+ PSMI_ENVVAR_LEVEL_HIDDEN | PSMI_ENVVAR_LEVEL_NEVER_PRINT,
+ PSMI_ENVVAR_TYPE_INT,
+ (union psmi_envvar_val)avail_contexts, &env_maxctxt)) {
+
+ _HFI_INFO
+ ("The PSM2_SHAREDCONTEXTS_MAX env variable is deprecated. Please use PSM2_MAX_CONTEXTS_PER_JOB in the future.\n");
+
+ max_contexts = max(env_maxctxt.e_int, 1); /* needs to be non-negative */
+ ask_contexts = min(max_contexts, avail_contexts); /* needs to be available */
+ } else
+ ask_contexts = max_contexts = avail_contexts;
+
+ if (!psmi_getenv("PSM2_RANKS_PER_CONTEXT",
+ "Number of ranks per context",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
+ (union psmi_envvar_val)1, &env_ranks_per_context)) {
+ ranks_per_context = max(env_ranks_per_context.e_int, 1);
+ ranks_per_context = min(ranks_per_context, HFI1_MAX_SHARED_CTXTS);
+ }
+
+ /*
+ * See if we could get a valid ppn. If not, approximate it to be the
+ * number of cores.
+ */
+ if (nranks == -1) {
+ long nproc = sysconf(_SC_NPROCESSORS_ONLN);
+ if (nproc < 1)
+ nranks = 1;
+ else
+ nranks = nproc;
+ }
+
+ /*
+ * Make sure that our guesses are good educated guesses
+ */
+ if (rankid >= nranks) {
+ _HFI_PRDBG
+ ("PSM2_SHAREDCONTEXTS disabled because lrank=%d,ppn=%d\n",
+ rankid, nranks);
+ goto fail;
+ }
+
+ if (ranks_per_context) {
+ int contexts =
+ (nranks + ranks_per_context - 1) / ranks_per_context;
+ if (contexts > ask_contexts) {
+ err = psmi_handle_error(NULL, PSM2_EP_NO_DEVICE,
+ "Incompatible settings for "
+ "PSM2_MAX_CONTEXTS_PER_JOB and PSM2_RANKS_PER_CONTEXT");
+ goto fail;
+ }
+ ask_contexts = contexts;
+ }
+
+ /* group id based on total groups and local rank id */
+ user_info->subctxt_id = subcontext_id_start + rankid % ask_contexts;
+ /* This is for multi-rail: when we set up a new rail, we cannot use
+ * the same subcontext ID as the previous rail; otherwise the driver
+ * will match the previous rail and fail.
+ */
+ subcontext_id_start += ask_contexts;
+
+ /* Need to compute with how many *other* peers we will be sharing the
+ * context */
+ if (nranks > ask_contexts) {
+ user_info->subctxt_cnt = nranks / ask_contexts;
+ /* If ppn != multiple of contexts, some contexts get an uneven
+ * number of subcontexts */
+ if (nranks % ask_contexts > rankid % ask_contexts)
+ user_info->subctxt_cnt++;
+ /* The case of 1 process "sharing" a context (giving 1 subcontext)
+ * is supported by the driver and PSM. However, there is no
+ * need to share in this case so disable context sharing. */
+ if (user_info->subctxt_cnt == 1)
+ user_info->subctxt_cnt = 0;
+ if (user_info->subctxt_cnt > HFI1_MAX_SHARED_CTXTS) {
+ err = psmi_handle_error(NULL, PSM2_INTERNAL_ERR,
+ "Calculation of subcontext count exceeded maximum supported");
+ goto fail;
+ }
+ }
+ /* else subcontext_cnt remains 0 and context sharing is disabled.
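+ *
+ * Worked example (hypothetical job, added for clarity): nranks = 10
+ * and ask_contexts = 4 give a base subctxt_cnt of 10 / 4 = 2; the
+ * remainder 10 % 4 = 2 means ranks with rankid % 4 < 2 get one extra
+ * peer, so the four context groups share 3 + 3 + 2 + 2 = 10 ranks.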
*/ + + _HFI_PRDBG("PSM2_SHAREDCONTEXTS lrank=%d,ppn=%d,avail_contexts=%d," + "max_contexts=%d,ask_contexts=%d," + "ranks_per_context=%d,id=%u,cnt=%u\n", + rankid, nranks, avail_contexts, max_contexts, + ask_contexts, ranks_per_context, + user_info->subctxt_id, user_info->subctxt_cnt); +fail: + return err; +} + +ustatic +int MOCKABLE(psmi_sharedcontext_params)(int *nranks, int *rankid) +{ + union psmi_envvar_val enable_shcontexts; + char *ppn_env = NULL, *lrank_env = NULL, *c; + + *rankid = -1; + *nranks = -1; + +#if 0 + /* DEBUG: Used to selectively test possible shared context and shm-only + * settings */ + unsetenv("PSC_MPI_NODE_RANK"); + unsetenv("PSC_MPI_PPN"); + unsetenv("MPI_LOCALRANKID"); + unsetenv("MPI_LOCALRANKS"); +#endif + + /* We do not support context sharing for multiple endpoints */ + if (psmi_multi_ep_enabled) { + return 0; + } + + /* New name in 2.0.1, keep observing old name */ + psmi_getenv("PSM2_SHAREDCONTEXTS", "Enable shared contexts", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_YESNO, + (union psmi_envvar_val) + PSMI_SHARED_CONTEXTS_ENABLED_BY_DEFAULT, + &enable_shcontexts); + if (!enable_shcontexts.e_int) + return 0; + + /* We support two types of syntaxes to let users give us a hint what + * our local rankid is. Moving towards MPI_, but still support PSC_ */ + if ((c = getenv("MPI_LOCALRANKID")) && *c != '\0') { + lrank_env = "MPI_LOCALRANKID"; + ppn_env = "MPI_LOCALNRANKS"; + } else if ((c = getenv("PSC_MPI_PPN")) && *c != '\0') { + ppn_env = "PSC_MPI_PPN"; + lrank_env = "PSC_MPI_NODE_RANK"; + } + + if (ppn_env != NULL && lrank_env != NULL) { + union psmi_envvar_val env_rankid, env_nranks; + + psmi_getenv(lrank_env, "Shared context rankid", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)-1, &env_rankid); + + psmi_getenv(ppn_env, "Shared context numranks", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)-1, &env_nranks); + + *rankid = env_rankid.e_int; + *nranks = env_nranks.e_int; + + return 1; + } else + return 0; +} +MOCK_DEF_EPILOGUE(psmi_sharedcontext_params); + +/* moved from ips_subcontext.c */ +static PSMI_HAL_INLINE psm2_error_t +divvy_shared_mem_ptrs(hfp_gen1_pc_private *pc_private, + psmi_context_t *context, + const struct hfi1_base_info *base_info) +{ + struct ips_hwcontext_ctrl **hwcontext_ctrl = &pc_private->hwcontext_ctrl; + uint32_t subcontext_cnt = pc_private->user_info.subctxt_cnt; + struct ips_subcontext_ureg **uregp = &pc_private->subcontext_ureg[0]; + + uintptr_t all_subcontext_uregbase = + (uintptr_t) base_info->subctxt_uregbase; + int i; + + psmi_assert_always(all_subcontext_uregbase != 0); + for (i = 0; i < HFI1_MAX_SHARED_CTXTS; i++) { + struct ips_subcontext_ureg *subcontext_ureg = + (struct ips_subcontext_ureg *)all_subcontext_uregbase; + *uregp++ = (i < subcontext_cnt) ? 
subcontext_ureg : NULL; + all_subcontext_uregbase += sizeof(struct ips_subcontext_ureg); + } + + *hwcontext_ctrl = + (struct ips_hwcontext_ctrl *)all_subcontext_uregbase; + all_subcontext_uregbase += sizeof(struct ips_hwcontext_ctrl); + + context->spio_ctrl = (void *)all_subcontext_uregbase; + all_subcontext_uregbase += sizeof(struct ips_spio_ctrl); + + context->tid_ctrl = (void *)all_subcontext_uregbase; + all_subcontext_uregbase += sizeof(struct ips_tid_ctrl); + + context->tf_ctrl = (void *)all_subcontext_uregbase; + all_subcontext_uregbase += sizeof(struct ips_tf_ctrl); + + psmi_assert((all_subcontext_uregbase - + (uintptr_t) base_info->subctxt_uregbase) <= PSMI_PAGESIZE); + + return PSM2_OK; +} + +static PSMI_HAL_INLINE +uint64_t get_cap_mask(uint64_t gen1_mask) +{ + static const struct + { + uint64_t gen1_bit; + uint32_t psmi_hal_bit; + } bit_map[] = + { + { HFI1_CAP_SDMA, PSM_HAL_CAP_SDMA }, + { HFI1_CAP_SDMA_AHG, PSM_HAL_CAP_SDMA_AHG }, + { HFI1_CAP_EXTENDED_PSN, PSM_HAL_CAP_EXTENDED_PSN }, + { HFI1_CAP_HDRSUPP, PSM_HAL_CAP_HDRSUPP }, + { HFI1_CAP_USE_SDMA_HEAD, PSM_HAL_CAP_USE_SDMA_HEAD }, + { HFI1_CAP_MULTI_PKT_EGR, PSM_HAL_CAP_MULTI_PKT_EGR }, + { HFI1_CAP_NODROP_RHQ_FULL, PSM_HAL_CAP_NODROP_RHQ_FULL }, + { HFI1_CAP_NODROP_EGR_FULL, PSM_HAL_CAP_NODROP_EGR_FULL }, + { HFI1_CAP_TID_UNMAP, PSM_HAL_CAP_TID_UNMAP }, + { HFI1_CAP_PRINT_UNIMPL, PSM_HAL_CAP_PRINT_UNIMPL }, + { HFI1_CAP_ALLOW_PERM_JKEY, PSM_HAL_CAP_ALLOW_PERM_JKEY }, + { HFI1_CAP_NO_INTEGRITY, PSM_HAL_CAP_NO_INTEGRITY }, + { HFI1_CAP_PKEY_CHECK, PSM_HAL_CAP_PKEY_CHECK }, + { HFI1_CAP_STATIC_RATE_CTRL, PSM_HAL_CAP_STATIC_RATE_CTRL }, + { HFI1_CAP_SDMA_HEAD_CHECK, PSM_HAL_CAP_SDMA_HEAD_CHECK }, + { HFI1_CAP_EARLY_CREDIT_RETURN, PSM_HAL_CAP_EARLY_CREDIT_RETURN }, +#ifdef PSM_CUDA + { HFI1_CAP_GPUDIRECT_OT, PSM_HAL_CAP_GPUDIRECT_OT }, +#endif + }; + uint64_t rv = 0; + int i; + for (i=0;i < sizeof(bit_map)/sizeof(bit_map[0]);i++) + { + if (bit_map[i].gen1_bit & gen1_mask) + rv |= bit_map[i].psmi_hal_bit; + } + return rv; +} + +/* hfp_gen1_context_open */ +static PSMI_HAL_INLINE int hfp_gen1_context_open(int unit, + int port, + uint64_t open_timeout, + psm2_ep_t ep, + psm2_uuid_t const job_key, + psmi_context_t *psm_ctxt, + uint32_t cap_mask, + unsigned retryCnt) +{ + int fd = -1; + psm2_error_t err = PSM_HAL_ERROR_OK; + hfp_gen1_pc_private *pc_private = psmi_malloc(ep, UNDEFINED, sizeof(hfp_gen1_pc_private)); + + if_pf (!pc_private) { + err = -PSM_HAL_ERROR_CANNOT_OPEN_CONTEXT; + goto bail; + } + + memset(pc_private,0,sizeof(hfp_gen1_pc_private)); + + char dev_name[PATH_MAX]; + fd = hfi_context_open_ex(unit, port, open_timeout, + dev_name, sizeof(dev_name)); + if (fd < 0) + { + err = -PSM_HAL_ERROR_CANNOT_OPEN_DEVICE; + goto bail; + } + + err = psmi_init_userinfo_params(ep, + unit, + job_key, + &pc_private->user_info); + if (err) { + err = -PSM_HAL_ERROR_GENERAL_ERROR; + goto bail; + } + + /* attempt to assign the context via hfi_userinit() */ + int retry = 0; + do { + if (retry > 0) + _HFI_INFO("hfi_userinit: failed, trying again (%d/%d)\n", + retry, retryCnt); + pc_private->ctrl = hfi_userinit(fd, &pc_private->user_info); + } while (pc_private->ctrl == NULL && ++retry <= retryCnt); + + if (!pc_private->ctrl) + { + err = -PSM_HAL_ERROR_CANNOT_OPEN_CONTEXT; + goto bail; + } + else + { + + if (getenv("PSM2_IDENTIFY")) { + printf("%s %s run-time driver interface v%d.%d\n", + hfi_get_mylabel(), hfi_ident_tag, + hfi_get_user_major_version(), + hfi_get_user_minor_version()); + } + + struct _hfi_ctrl *ctrl = pc_private->ctrl; + int 
i; + + if (hfi_get_port_lid(ctrl->__hfi_unit, + ctrl->__hfi_port) <= 0) { + err = psmi_handle_error(NULL, + PSM2_EP_DEVICE_FAILURE, + "Can't get HFI LID in psm2_ep_open: is SMA running?"); + goto bail; + } + uint64_t gid_lo,gid_hi; + if (hfi_get_port_gid(ctrl->__hfi_unit, + ctrl->__hfi_port, + &gid_hi, + &gid_lo) == -1) { + err = + psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "Can't get HFI GID in psm2_ep_open: is SMA running?"); + goto bail; + } + ep->unit_id = ctrl->__hfi_unit; + ep->portnum = ctrl->__hfi_port; + ep->gid_hi = gid_hi; + ep->gid_lo = gid_lo; + + /* Endpoint out_sl contains the default SL to use for this endpoint. */ + /* Get the MTU for this SL. */ + int sc; + if ((sc=hfi_get_port_sl2sc(ep->unit_id, + ctrl->__hfi_port, + ep->out_sl)) < 0) { + sc = PSMI_SC_DEFAULT; + } + int vl; + if ((vl = hfi_get_port_sc2vl(ep->unit_id, + ctrl->__hfi_port, + sc)) < 0) { + vl = PSMI_VL_DEFAULT; + } + if (sc == PSMI_SC_ADMIN || + vl == PSMI_VL_ADMIN) { + err = psmi_handle_error(NULL, PSM2_INTERNAL_ERR, + "Invalid sl: %d, please specify correct sl via HFI_SL", + ep->out_sl); + goto bail; + } + + if ((ep->mtu = hfi_get_port_vl2mtu(ep->unit_id, + ctrl->__hfi_port, + vl)) < 0) { + err = + psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "Can't get MTU for VL %d", + vl); + goto bail; + } + + get_psm_gen1_hi()->phi.params.cap_mask = cap_mask + | get_cap_mask(ctrl->ctxt_info.runtime_flags) + | PSM_HAL_CAP_MERGED_TID_CTRLS + | PSM_HAL_CAP_RSM_FECN_SUPP; + + int driver_major = hfi_get_user_major_version(); + int driver_minor = hfi_get_user_minor_version(); + + if ((driver_major > 6) || + ((driver_major == 6) && + (driver_minor >= 3))) + { + get_psm_gen1_hi()->phi.params.cap_mask |= PSM_HAL_CAP_DMA_HSUPP_FOR_32B_MSGS; + } + + get_psm_gen1_hi()->hfp_private.sdmahdr_req_size = HFI_SDMA_HDR_SIZE; + + if (hfi_check_non_dw_mul_sdma()) + get_psm_gen1_hi()->phi.params.cap_mask |= PSM_HAL_CAP_NON_DW_MULTIPLE_MSG_SIZE; + /* The dma_rtail member is: 1 when the HFI1_CAP_DMA_RTAIL bit is set. + 0 when the HFI1_CAP_DMA_RTAIL bit is NOT set. */ + get_psm_gen1_hi()->hfp_private.dma_rtail = 0 != (HFI1_CAP_DMA_RTAIL & ctrl->ctxt_info.runtime_flags); + + psm_ctxt->psm_hw_ctxt = pc_private; + if (pc_private->user_info.subctxt_cnt > 0) + divvy_shared_mem_ptrs(pc_private, + psm_ctxt, + &ctrl->base_info); + + /* Initialize all of the cl q's. */ + + get_psm_gen1_hi()->hfp_private.hdrq_rhf_off = (ctrl->ctxt_info.rcvhdrq_entsize - 8) >> BYTE2DWORD_SHIFT; + + /* The following guard exists to workaround a critical issue flagged by KW to prevent + subscripting past the end of the cl_qs[] array in the following for () loop. 
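+
+ Layout note (an inference from the i += 2 stride below, not a
+ documented ABI guarantee): the per-subcontext cl q slots appear to
+ be paired, e.g. for subctxt_cnt == 2:
+
+ PSM_HAL_CL_Q_RX_HDR_Q_SC_0 + 0 -- hdr q, subcontext 0
+ PSM_HAL_CL_Q_RX_HDR_Q_SC_0 + 1 -- egr q, subcontext 0
+ PSM_HAL_CL_Q_RX_HDR_Q_SC_0 + 2 -- hdr q, subcontext 1
+ PSM_HAL_CL_Q_RX_HDR_Q_SC_0 + 3 -- egr q, subcontext 1
+
+ which is why bounding subctxt_cnt by HFI1_MAX_SHARED_CTXTS keeps the
+ loop inside cl_qs[].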
*/ + if (pc_private->user_info.subctxt_cnt <= HFI1_MAX_SHARED_CTXTS) + { + /* Here, we are initializing only the rx hdrq rhf seq for all subcontext + cl q's: */ + for (i=PSM_HAL_CL_Q_RX_HDR_Q_SC_0; i < + PSM_HAL_GET_SC_CL_Q_RX_HDR_Q(pc_private->user_info.subctxt_cnt); i += 2) + { + psm_hal_gen1_cl_q_t *pcl_q = &(pc_private->cl_qs[i]); + + pcl_q->hdr_qe.p_rx_hdrq_rhf_seq = &pcl_q->hdr_qe.rx_hdrq_rhf_seq; + if (get_psm_gen1_hi()->hfp_private.dma_rtail) + pcl_q->hdr_qe.rx_hdrq_rhf_seq = 0; + else + pcl_q->hdr_qe.rx_hdrq_rhf_seq = 1; + } + } + /* Next, initialize the hw rx hdr q and egr buff q: */ + { + /* base address of user registers */ + volatile uint64_t *uregbase = (volatile uint64_t *)(uintptr_t) (ctrl->base_info.user_regbase); + /* hw rx hdr q: */ + psm_hal_gen1_cl_q_t *pcl_q = &(pc_private->cl_qs[PSM_HAL_CL_Q_RX_HDR_Q]); + pcl_q->cl_q_head = (volatile uint64_t *)&(uregbase[ur_rcvhdrhead]); + pcl_q->cl_q_tail = (volatile uint64_t *)&(uregbase[ur_rcvhdrtail]); + pcl_q->hdr_qe.hdrq_base_addr = (uint32_t *) (ctrl->base_info.rcvhdr_bufbase); + + /* Initialize the ptr to the rx hdrq rhf seq: */ + if (pc_private->user_info.subctxt_cnt > 0) + /* During sharing of a context, the h/w hdrq rhf_seq is placed in shared memory and is shared + by all subcontexts: */ + pcl_q->hdr_qe.p_rx_hdrq_rhf_seq = &pc_private->hwcontext_ctrl->rx_hdrq_rhf_seq; + else + pcl_q->hdr_qe.p_rx_hdrq_rhf_seq = &pcl_q->hdr_qe.rx_hdrq_rhf_seq; + + if (get_psm_gen1_hi()->hfp_private.dma_rtail) + *pcl_q->hdr_qe.p_rx_hdrq_rhf_seq = 0; + else + *pcl_q->hdr_qe.p_rx_hdrq_rhf_seq = 1; + /* hw egr buff q: */ + pcl_q = &pc_private->cl_qs[PSM_HAL_CL_Q_RX_EGR_Q]; + pcl_q->cl_q_head = (volatile uint64_t *)&(uregbase[ur_rcvegrindexhead]); + pcl_q->cl_q_tail = (volatile uint64_t *)&(uregbase[ur_rcvegrindextail]); + pcl_q->egr_buffs = ips_recvq_egrbuf_table_alloc(ep, + (void*)(ctrl->base_info.rcvegr_bufbase), + ctrl->ctxt_info.egrtids, + ctrl->ctxt_info.rcvegr_size); + } + /* Next, initialize the subcontext's rx hdr q and egr buff q: */ + for (i=0; i < pc_private->user_info.subctxt_cnt;i++) + { + /* Subcontexts mimic the HW registers but use different addresses + * to avoid cache contention. 
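+ *
+ * For illustration (numbers hypothetical, formulas taken from the
+ * code below): each subcontext's ureg block lives at
+ *   subctxt_uregbase + subcontext * sizeof(struct ips_subcontext_ureg)
+ * and its header/eager areas are carved at page-rounded strides;
+ * with a 4 KiB page, rcvhdrq_cnt = 8192 and rcvhdrq_entsize = 32,
+ * hdrsize = (8192 * 32 + 4095) & ~4095 = 256 KiB per subcontext.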
*/ + volatile uint64_t *subcontext_uregbase; + uint32_t *rcv_hdr, *rcv_egr; + unsigned hdrsize, egrsize; + unsigned pagesize = getpagesize(); + uint32_t subcontext = i; + unsigned i = pagesize - 1; + hdrsize = + (ctrl->ctxt_info.rcvhdrq_cnt * ctrl->ctxt_info.rcvhdrq_entsize + i) & ~i; + egrsize = + (ctrl->ctxt_info.egrtids * ctrl->ctxt_info.rcvegr_size + i) & ~i; + + subcontext_uregbase = (uint64_t *) + (((uintptr_t) (ctrl->base_info.subctxt_uregbase)) + + (sizeof(struct ips_subcontext_ureg) * subcontext)); + { + struct ips_subcontext_ureg *pscureg = (struct ips_subcontext_ureg *)subcontext_uregbase; + + if (subcontext == ctrl->ctxt_info.subctxt) + { + memset(pscureg, 0, sizeof(*pscureg)); + if (get_psm_gen1_hi()->hfp_private.dma_rtail) + pscureg->writeq_state.hdrq_rhf_seq = 0; + else + pscureg->writeq_state.hdrq_rhf_seq = 1; + } + } + + rcv_hdr = (uint32_t *) + (((uintptr_t) (ctrl->base_info.subctxt_rcvhdrbuf)) + + (hdrsize * subcontext)); + rcv_egr = (uint32_t *) + (((uintptr_t) ctrl->base_info.subctxt_rcvegrbuf + + (egrsize * subcontext))); + + /* rx hdr q: */ + psm_hal_gen1_cl_q_t *pcl_q = &(pc_private->cl_qs[PSM_HAL_GET_SC_CL_Q_RX_HDR_Q(subcontext)]); + pcl_q->hdr_qe.hdrq_base_addr = rcv_hdr; + pcl_q->cl_q_head = (volatile uint64_t *)&subcontext_uregbase[ur_rcvhdrhead * 8]; + pcl_q->cl_q_tail = (volatile uint64_t *)&subcontext_uregbase[ur_rcvhdrtail * 8]; + + /* egr q: */ + pcl_q = &(pc_private->cl_qs[PSM_HAL_GET_SC_CL_Q_RX_EGR_Q(subcontext)]); + pcl_q->cl_q_head = (volatile uint64_t *)&subcontext_uregbase[ur_rcvegrindexhead * 8]; + pcl_q->cl_q_tail = (volatile uint64_t *)&subcontext_uregbase[ur_rcvegrindextail * 8]; + pcl_q->egr_buffs = ips_recvq_egrbuf_table_alloc( + ep, + (void*)rcv_egr, + ctrl->ctxt_info.egrtids, + ctrl->ctxt_info.rcvegr_size); + } + return PSM_HAL_ERROR_OK; + } + return PSM_HAL_ERROR_OK; + +bail: + if (fd >0) close(fd); + if (pc_private) { + if (pc_private->ctrl) free(pc_private->ctrl); + psmi_free(pc_private); + } + + return -PSM_HAL_ERROR_GENERAL_ERROR; +} + +/* hfp_gen1_get_port_index2pkey */ +static PSMI_HAL_INLINE int hfp_gen1_get_port_index2pkey(int unit, int port, int index) +{ + return hfi_get_port_index2pkey(unit, port, index); +} + +static PSMI_HAL_INLINE int hfp_gen1_get_cc_settings_bin(int unit, int port, char *ccabuf, size_t len_ccabuf) +{ + return hfi_get_cc_settings_bin(unit, port, ccabuf, len_ccabuf); +} + +static PSMI_HAL_INLINE int hfp_gen1_get_cc_table_bin(int unit, int port, uint16_t **ccatp) +{ + return hfi_get_cc_table_bin(unit, port, ccatp); +} + +static PSMI_HAL_INLINE int hfp_gen1_get_port_lmc(int unit, int port) +{ + return hfi_get_port_lmc(unit, port); +} + +static PSMI_HAL_INLINE int hfp_gen1_get_port_rate(int unit, int port) +{ + return hfi_get_port_rate(unit, port); +} + +static PSMI_HAL_INLINE int hfp_gen1_get_port_sl2sc(int unit, int port, int sl) +{ + return hfi_get_port_sl2sc(unit, port, sl); +} + +static PSMI_HAL_INLINE int hfp_gen1_get_port_sc2vl(int unit, int port, int sc) +{ + return hfi_get_port_sc2vl(unit, port, sc); +} + +static PSMI_HAL_INLINE int hfp_gen1_set_pkey(psmi_hal_hw_context ctxt, uint16_t pkey) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + return hfi_set_pkey(psm_hw_ctxt->ctrl, pkey); +} + +static PSMI_HAL_INLINE int hfp_gen1_poll_type(uint16_t poll_type, psmi_hal_hw_context ctxt) +{ + if (poll_type == PSMI_HAL_POLL_TYPE_URGENT) + poll_type = HFI1_POLL_TYPE_URGENT; + else + poll_type = 0; + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + return hfi_poll_type(psm_hw_ctxt->ctrl, poll_type); +} + +static 
PSMI_HAL_INLINE int hfp_gen1_get_port_lid(int unit, int port) +{ + return hfi_get_port_lid(unit, port); +} + +static PSMI_HAL_INLINE int hfp_gen1_get_port_gid(int unit, int port, + uint64_t *hi, uint64_t *lo) +{ + return hfi_get_port_gid(unit, port, hi, lo); +} + +static PSMI_HAL_INLINE int hfp_gen1_free_tid(psmi_hal_hw_context ctxt, uint64_t tidlist, uint32_t tidcnt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + return hfi_free_tid(psm_hw_ctxt->ctrl, tidlist, tidcnt); +} + +static PSMI_HAL_INLINE int hfp_gen1_get_tidcache_invalidation(psmi_hal_hw_context ctxt, uint64_t tidlist, uint32_t *tidcnt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + return hfi_get_invalidation(psm_hw_ctxt->ctrl, tidlist, tidcnt); +} + +static PSMI_HAL_INLINE int hfp_gen1_update_tid(psmi_hal_hw_context ctxt, uint64_t vaddr, uint32_t *length, + uint64_t tidlist, uint32_t *tidcnt, uint16_t flags) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + + return hfi_update_tid(psm_hw_ctxt->ctrl, vaddr, length, tidlist, tidcnt, flags); +} + +static PSMI_HAL_INLINE int hfp_gen1_writev(const struct iovec *iov, int iovcnt, struct ips_epinfo *ignored, psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = (hfp_gen1_pc_private *)ctxt; + + return hfi_cmd_writev(psm_hw_ctxt->ctrl->fd, iov, iovcnt); +} + +static PSMI_HAL_INLINE int hfp_gen1_dma_slot_available(int slotidx, psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + + if (slotidx < 0 || slotidx >= ctrl->ctxt_info.sdma_ring_size) + return -1; + + struct hfi1_sdma_comp_entry *sdma_comp_queue = (struct hfi1_sdma_comp_entry *) + ctrl->base_info.sdma_comp_bufbase; + + return sdma_comp_queue[slotidx].status != QUEUED; +} + +static PSMI_HAL_INLINE int hfp_gen1_get_sdma_ring_slot_status(int slotIdx, + psmi_hal_sdma_ring_slot_status *status, + uint32_t *errorCode, + psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + + if (slotIdx < 0 || slotIdx >= ctrl->ctxt_info.sdma_ring_size) + { + *status = PSM_HAL_SDMA_RING_ERROR; + return -PSM_HAL_ERROR_GENERAL_ERROR; + } + + struct hfi1_sdma_comp_entry *sdma_comp_queue = (struct hfi1_sdma_comp_entry *) + ctrl->base_info.sdma_comp_bufbase; + + switch (sdma_comp_queue[slotIdx].status) + { + case FREE: + *status = PSM_HAL_SDMA_RING_AVAILABLE; + break; + case QUEUED: + *status = PSM_HAL_SDMA_RING_QUEUED; + break; + case COMPLETE: + *status = PSM_HAL_SDMA_RING_COMPLETE; + break; + case ERROR: + *status = PSM_HAL_SDMA_RING_ERROR; + break; + default: + *status = PSM_HAL_SDMA_RING_ERROR; + return -PSM_HAL_ERROR_GENERAL_ERROR; + } + *errorCode = sdma_comp_queue[slotIdx].errcode; + return PSM_HAL_ERROR_OK; +} + +static PSMI_HAL_INLINE int hfp_gen1_get_hfi_event_bits(uint64_t *event_bits, psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + uint64_t *pevents_mask = (uint64_t *)ctrl->base_info.events_bufbase; + uint64_t events_mask = *pevents_mask; + uint64_t hal_hfi_event_bits = 0; + int i; + + if (!events_mask) + { + *event_bits = 0; + return PSM_HAL_ERROR_OK; + } + + /* Encode hfi1_events as HAL event codes here */ + for (i = 0; i < sizeof(hfi1_events_map)/sizeof(hfi1_events_map[0]); i++) + { + if (events_mask & hfi1_events_map[i].hfi1_event_bit) + hal_hfi_event_bits |= + hfi1_events_map[i].psmi_hal_hfi_event_bit; + } + + *event_bits = hal_hfi_event_bits; + + return PSM_HAL_ERROR_OK; +} + +static PSMI_HAL_INLINE int 
hfp_gen1_ack_hfi_event(uint64_t ack_bits, psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + uint64_t hfi1_ack_bits = 0; + int i; + + /* Decode from HAL event codes to hfi1_events */ + for (i = 0; i < sizeof(hfi1_events_map)/sizeof(hfi1_events_map[0]); i++) + { + if (ack_bits & hfi1_events_map[i].psmi_hal_hfi_event_bit) + hfi1_ack_bits |= + hfi1_events_map[i].hfi1_event_bit; + } + + return hfi_event_ack(ctrl, hfi1_ack_bits); +} + +static PSMI_HAL_INLINE int hfp_gen1_hfi_reset_context(psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + + return hfi_reset_context(ctrl); +} + +static PSMI_HAL_INLINE uint64_t hfp_gen1_get_hw_status(psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + struct hfi1_status *status = + (struct hfi1_status *) ctrl->base_info.status_bufbase; + uint64_t hw_status = 0; + int i; + + static const struct + { + uint32_t hfi1_status_dev_bit, psmi_hal_status_bit; + } status_dev_map[] = + { + { HFI1_STATUS_INITTED, PSM_HAL_HW_STATUS_INITTED }, + { HFI1_STATUS_CHIP_PRESENT, PSM_HAL_HW_STATUS_CHIP_PRESENT }, + { HFI1_STATUS_HWERROR, PSM_HAL_HW_STATUS_HWERROR }, + }; + + for (i=0; i < sizeof(status_dev_map)/sizeof(status_dev_map[0]); i++) + { + if (status->dev &status_dev_map[i].hfi1_status_dev_bit) + hw_status |= status_dev_map[i].psmi_hal_status_bit; + } + + static const struct + { + uint32_t hfi1_status_port_bit, psmi_hal_status_bit; + } status_port_map[] = + { + { HFI1_STATUS_IB_READY, PSM_HAL_HW_STATUS_IB_READY }, + { HFI1_STATUS_IB_CONF, PSM_HAL_HW_STATUS_IB_CONF }, + }; + + for (i=0; i < sizeof(status_port_map)/sizeof(status_port_map[0]); i++) + { + if (status->port &status_port_map[i].hfi1_status_port_bit) + hw_status |= status_port_map[i].psmi_hal_status_bit; + } + + return hw_status; +} + +static PSMI_HAL_INLINE int hfp_gen1_get_hw_status_freezemsg(volatile char** msg, psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + struct hfi1_status *status = + (struct hfi1_status *) ctrl->base_info.status_bufbase; + + *msg = (volatile char *) status->freezemsg; + + return PSM2_OK; +} + +static PSMI_HAL_INLINE uint16_t hfp_gen1_get_user_major_bldtime_version() +{ + return HFI1_USER_SWMAJOR; +} + +static PSMI_HAL_INLINE uint16_t hfp_gen1_get_user_minor_bldtime_version() +{ + return HFI1_USER_SWMINOR; +} + +static PSMI_HAL_INLINE uint16_t hfp_gen1_get_user_major_runtime_version(psmi_hal_hw_context ctx) +{ + return hfi_get_user_major_version(); +} + +static PSMI_HAL_INLINE uint16_t hfp_gen1_get_user_minor_runtime_version(psmi_hal_hw_context ctx) +{ + return hfi_get_user_minor_version(); +} + +static inline +uint32_t +get_ht(volatile uint64_t *ht_register) +{ + uint64_t res = *ht_register; + ips_rmb(); + return (uint32_t)res; +} + +static inline +void +set_ht(volatile uint64_t *ht_register, uint64_t new_ht) +{ + *ht_register = new_ht; + return; +} + +/* hfp_gen1_get_cl_q_head_index */ +static PSMI_HAL_INLINE psmi_hal_cl_idx hfp_gen1_get_cl_q_head_index( + psmi_hal_cl_q cl_q, + psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + + return get_ht(psm_hw_ctxt->cl_qs[cl_q].cl_q_head); +} + +/* hfp_gen1_get_cl_q_tail_index */ +static PSMI_HAL_INLINE psmi_hal_cl_idx hfp_gen1_get_cl_q_tail_index( + psmi_hal_cl_q cl_q, + psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + + return 
get_ht(psm_hw_ctxt->cl_qs[cl_q].cl_q_tail); +} + +/* hfp_gen1_set_cl_q_head_index */ +static PSMI_HAL_INLINE void hfp_gen1_set_cl_q_head_index( + psmi_hal_cl_idx idx, + psmi_hal_cl_q cl_q, + psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + + set_ht(psm_hw_ctxt->cl_qs[cl_q].cl_q_head, idx); + return; +} + +/* hfp_gen1_set_cl_q_tail_index */ +static PSMI_HAL_INLINE void hfp_gen1_set_cl_q_tail_index( + psmi_hal_cl_idx idx, + psmi_hal_cl_q cl_q, + psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + + set_ht(psm_hw_ctxt->cl_qs[cl_q].cl_q_tail, idx); + return; +} + +/* hfp_gen1_cl_q_empty */ +static inline int hfp_gen1_cl_q_empty(psmi_hal_cl_idx head_idx, + psmi_hal_cl_q cl_q, + psmi_hal_hw_context ctxt) +{ + if (!get_psm_gen1_hi()->hfp_private.dma_rtail) + { + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + psm_hal_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; + int seq = hfi_hdrget_seq(pcl_q->hdr_qe.hdrq_base_addr + + (head_idx + get_psm_gen1_hi()->hfp_private.hdrq_rhf_off)); + + return (*pcl_q->hdr_qe.p_rx_hdrq_rhf_seq != seq); + } + + return (head_idx == hfp_gen1_get_cl_q_tail_index(cl_q, ctxt)); +} + +static inline int hfp_gen1_get_rhf(psmi_hal_cl_idx idx, + psmi_hal_raw_rhf_t *rhfp, + psmi_hal_cl_q cl_q, + psmi_hal_hw_context ctxt) + +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + psm_hal_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; + uint32_t *pu32 = (pcl_q->hdr_qe.hdrq_base_addr + + (idx + get_psm_gen1_hi()->hfp_private.hdrq_rhf_off)); + *rhfp = *((psmi_hal_raw_rhf_t*)pu32); + return PSM_HAL_ERROR_OK; +} + +static inline int hfp_gen1_get_ips_message_hdr(psmi_hal_cl_idx idx, + psmi_hal_raw_rhf_t rhf, + struct ips_message_header **imhp, + psmi_hal_cl_q cl_q, + psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + psm_hal_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; + uint32_t *pu32 = pcl_q->hdr_qe.hdrq_base_addr + (idx + hfi_hdrget_hdrq_offset((uint32_t *)&rhf)); + *imhp = (struct ips_message_header*)pu32; + return PSM_HAL_ERROR_OK; +} + +static PSMI_HAL_INLINE int hfp_gen1_get_receive_event(psmi_hal_cl_idx head_idx, psmi_hal_hw_context ctxt, + struct ips_recvhdrq_event *rcv_ev) +{ + int rv; + + if_pf ((rv=hfp_gen1_get_rhf(head_idx, &rcv_ev->psm_hal_rhf.raw_rhf, rcv_ev->psm_hal_hdr_q, ctxt)) != + PSM_HAL_ERROR_OK) + return rv; + + /* here, we turn off the TFSEQ err bit if set: */ + rcv_ev->psm_hal_rhf.decomposed_rhf = rcv_ev->psm_hal_rhf.raw_rhf & (~(PSMI_HAL_RHF_ERR_MASK_64(TFSEQ))); + + /* Now, get the lrh: */ + if_pf ((rv=hfp_gen1_get_ips_message_hdr(head_idx, rcv_ev->psm_hal_rhf.raw_rhf, &rcv_ev->p_hdr, + rcv_ev->psm_hal_hdr_q, ctxt)) != + PSM_HAL_ERROR_OK) + return rv; + + return PSM_HAL_ERROR_OK; +} + +static PSMI_HAL_INLINE void *hfp_gen1_get_egr_buff(psmi_hal_cl_idx idx, + psmi_hal_cl_q cl_q, + psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + psm_hal_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; + return pcl_q->egr_buffs[idx]; +} + +static PSMI_HAL_INLINE int hfp_gen1_retire_hdr_q_entry(psmi_hal_cl_idx *idx, + psmi_hal_cl_q cl_q, + psmi_hal_hw_context ctxt, + uint32_t elemsz, uint32_t elemlast, + int *emptyp) +{ + psmi_hal_cl_idx tmp = *idx + elemsz; + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + psm_hal_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; + + if (!get_psm_gen1_hi()->hfp_private.dma_rtail) + { + (*pcl_q->hdr_qe.p_rx_hdrq_rhf_seq)++; + if (*pcl_q->hdr_qe.p_rx_hdrq_rhf_seq > LAST_RHF_SEQNO) + *pcl_q->hdr_qe.p_rx_hdrq_rhf_seq = 1; + } + if_pf(tmp > elemlast) + tmp = 0; + *emptyp 
= hfp_gen1_cl_q_empty(tmp, cl_q, ctxt); + *idx = tmp; + return PSM_HAL_ERROR_OK; +} + +static PSMI_HAL_INLINE int hfp_gen1_get_rhf_expected_sequence_number(unsigned int *pseqnum, + psmi_hal_cl_q cl_q, + psmi_hal_hw_context ctxt) + +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + psm_hal_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; + + *pseqnum = *pcl_q->hdr_qe.p_rx_hdrq_rhf_seq; + return PSM_HAL_ERROR_OK; +} + +static PSMI_HAL_INLINE int hfp_gen1_set_rhf_expected_sequence_number(unsigned int seqnum, + psmi_hal_cl_q cl_q, + psmi_hal_hw_context ctxt) + +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + psm_hal_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; + + *pcl_q->hdr_qe.p_rx_hdrq_rhf_seq = seqnum; + return PSM_HAL_ERROR_OK; +} + +/* Get pbc static rate value for flow for a given message length */ +PSMI_ALWAYS_INLINE( +uint16_t +ips_proto_pbc_static_rate(struct ips_proto *proto, struct ips_flow *flow, + uint32_t msgLen)) +{ + uint32_t rate = 0; + + /* The PBC rate is based on which HFI type as different media have different + * mechanism for static rate control. + */ + + switch (proto->epinfo.ep_hfi_type) { + case PSMI_HFI_TYPE_OPA1: + { + /* + * time_to_send is: + * + * (packet_length) [bits] / (pkt_egress_rate) [bits/sec] + * ----------------------------------------------------- + * fabric_clock_period == (1 / 805 * 10^6) [1/sec] + * + * (where pkt_egress_rate is assumed to be 100 Gbit/s.) + */ + uint32_t time_to_send = (8 * msgLen * 805) / (100000); + rate = (time_to_send >> flow->path->pr_cca_divisor) * + (flow->path->pr_active_ipd); + + if (rate > 65535) + rate = 65535; + + } + break; + + default: + rate = 0; + } + + return (uint16_t) rate; +} + +/* This is a helper function to convert Per Buffer Control to little-endian */ +PSMI_ALWAYS_INLINE( +void ips_proto_pbc_to_le(struct psm_hal_pbc *pbc)) +{ + pbc->pbc0 = __cpu_to_le32(pbc->pbc0); + pbc->PbcStaticRateControlCnt = __cpu_to_le16(pbc->PbcStaticRateControlCnt); + pbc->fill1 = __cpu_to_le16(pbc->fill1); +} + +/* This is only used for SDMA cases; pbc is really a pointer to + * struct ips_pbc_header * or the equivalent un-named structure + * in ips_scb. Please note pcb will be in little-endian byte + * order on return */ +PSMI_ALWAYS_INLINE( +void +ips_proto_pbc_update(struct ips_proto *proto, struct ips_flow *flow, + uint32_t isCtrlMsg, struct psm_hal_pbc *pbc, uint32_t hdrlen, + uint32_t paylen)) +{ + int dw = (sizeof(struct psm_hal_pbc) + hdrlen + paylen) >> BYTE2DWORD_SHIFT; + int sc = proto->sl2sc[flow->path->pr_sl]; + int vl = proto->sc2vl[sc]; + uint16_t static_rate = 0; + + if_pf(!isCtrlMsg && flow->path->pr_active_ipd) + static_rate = + ips_proto_pbc_static_rate(proto, flow, hdrlen + paylen); + + pbc->pbc0 = (dw & HFI_PBC_LENGTHDWS_MASK) | + ((vl & HFI_PBC_VL_MASK) << HFI_PBC_VL_SHIFT) | + (((sc >> HFI_PBC_SC4_SHIFT) & + HFI_PBC_SC4_MASK) << HFI_PBC_DCINFO_SHIFT); + + pbc->PbcStaticRateControlCnt = static_rate & HFI_PBC_STATICRCC_MASK; + + /* Per Buffer Control must be in little-endian */ + ips_proto_pbc_to_le(pbc); + + return; +} + +static PSMI_HAL_INLINE int hfp_gen1_check_rhf_sequence_number(unsigned int seqno) +{ + return (seqno <= LAST_RHF_SEQNO) ? 
+ PSM_HAL_ERROR_OK : + PSM_HAL_ERROR_GENERAL_ERROR; +} + +static PSMI_HAL_INLINE int hfp_gen1_set_pbc(struct ips_proto *proto, struct ips_flow *flow, + uint32_t isCtrlMsg, struct psm_hal_pbc *dest, uint32_t hdrlen, + uint32_t paylen) +{ + ips_proto_pbc_update(proto, flow, isCtrlMsg, + dest, hdrlen, paylen); + + return PSM_HAL_ERROR_OK; +} + +static PSMI_HAL_INLINE int hfp_gen1_tidflow_set_entry(uint32_t flowid, uint32_t genval, uint32_t seqnum, psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + + hfi_tidflow_set_entry(ctrl, flowid, genval, seqnum); + return PSM_HAL_ERROR_OK; +} + +static PSMI_HAL_INLINE int hfp_gen1_tidflow_reset(psmi_hal_hw_context ctxt, uint32_t flowid, uint32_t genval, uint32_t seqnum) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + + hfi_tidflow_reset(ctrl, flowid, genval, seqnum); + return PSM_HAL_ERROR_OK; +} + +static PSMI_HAL_INLINE int hfp_gen1_tidflow_get(uint32_t flowid, uint64_t *ptf, psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + + *ptf = hfi_tidflow_get(ctrl, flowid); + return PSM_HAL_ERROR_OK; +} + +static PSMI_HAL_INLINE int hfp_gen1_tidflow_get_hw(uint32_t flowid, uint64_t *ptf, psmi_hal_hw_context ctxt) +{ + return hfp_gen1_tidflow_get(flowid, ptf, ctxt); +} + +static PSMI_HAL_INLINE int hfp_gen1_tidflow_get_seqnum(uint64_t val, uint32_t *pseqn) +{ + *pseqn = hfi_tidflow_get_seqnum(val); + return PSM_HAL_ERROR_OK; +} + +static PSMI_HAL_INLINE int hfp_gen1_tidflow_get_genval(uint64_t val, uint32_t *pgv) +{ + *pgv = hfi_tidflow_get_genval(val); + return PSM_HAL_ERROR_OK; +} + +static PSMI_HAL_INLINE int hfp_gen1_tidflow_check_update_pkt_seq(void *vpprotoexp + /* actually a: + struct ips_protoexp *protoexp */, + psmi_seqnum_t sequence_num, + void *vptidrecvc + /* actually a: + struct ips_tid_recv_desc *tidrecvc */, + struct ips_message_header *p_hdr, + void (*ips_protoexp_do_tf_generr) + (void *vpprotoexp + /* actually a: + struct ips_protoexp *protoexp */, + void *vptidrecvc + /* actually a: + struct ips_tid_recv_desc *tidrecvc */, + struct ips_message_header *p_hdr), + void (*ips_protoexp_do_tf_seqerr) + (void *vpprotoexp + /* actually a: + struct ips_protoexp *protoexp */, + void *vptidrecvc + /* actually a: + struct ips_tid_recv_desc *tidrecvc */, + struct ips_message_header *p_hdr) + ) +{ + struct ips_protoexp *protoexp = (struct ips_protoexp *) vpprotoexp; + struct ips_tid_recv_desc *tidrecvc = (struct ips_tid_recv_desc *) vptidrecvc; + + if_pf(psmi_hal_has_sw_status(PSM_HAL_HDRSUPP_ENABLED)) { + /* Drop packet if generation number does not match. There + * is a window that before we program the hardware tidflow + * table with new gen/seq, hardware might receive some + * packets with the old generation. + */ + if (sequence_num.psn_gen != tidrecvc->tidflow_genseq.psn_gen) + { + PSM2_LOG_MSG("leaving"); + return PSM_HAL_ERROR_GENERAL_ERROR; + } + +#ifdef PSM_DEBUG + /* Check if new packet falls into expected seq range, we need + * to deal with wrap around of the seq value from 2047 to 0 + * because seq is only 11 bits. */ + int16_t seq_off = (int16_t)(sequence_num.psn_seq - + tidrecvc->tidflow_genseq.psn_seq); + if (seq_off < 0) + seq_off += 2048; /* seq is 11 bits */ + psmi_assert(seq_off < 1024); +#endif + /* NOTE: with RSM in use, we should not automatically update + * our PSN from the HFI's PSN. The HFI doesn't know about + * RSM interceptions. + */ + /* (DON'T!) 
Update the shadow tidflow_genseq */ + /* tidrecvc->tidflow_genseq.psn_seq = sequence_num.psn_seq + 1; */ + + } + /* Always check the sequence number if we get a header, even if SH. */ + if_pt(sequence_num.psn_num == tidrecvc->tidflow_genseq.psn_num) { + /* Update the shadow tidflow_genseq */ + tidrecvc->tidflow_genseq.psn_seq = sequence_num.psn_seq + 1; + + /* update the fake tidflow table with new seq, this is for + * seqerr and err_chk_gen processing to get the latest + * valid sequence number */ + hfp_gen1_tidflow_set_entry( + tidrecvc->rdescid._desc_idx, + tidrecvc->tidflow_genseq.psn_gen, + tidrecvc->tidflow_genseq.psn_seq, + tidrecvc->context->psm_hw_ctxt); + } else { + /* Generation mismatch */ + if (sequence_num.psn_gen != tidrecvc->tidflow_genseq.psn_gen) { + ips_protoexp_do_tf_generr(protoexp, + tidrecvc, p_hdr); + PSM2_LOG_MSG("leaving"); + return PSM_HAL_ERROR_GENERAL_ERROR; + } else { + /* Possible sequence mismatch error */ + /* First, check if this is a recoverable SeqErr - + * caused by a good packet arriving in a tidflow that + * has had a FECN bit set on some earlier packet. + */ + + /* If this is the first RSM packet, our own PSN state + * is probably old. Pull from the HFI if it has + * newer data. + */ + uint64_t tf; + psmi_seqnum_t tf_sequence_num; + + hfp_gen1_tidflow_get(tidrecvc->rdescid._desc_idx, &tf, + tidrecvc->context->psm_hw_ctxt); + hfp_gen1_tidflow_get_seqnum(tf, &tf_sequence_num.psn_val); + + if (tf_sequence_num.psn_val > tidrecvc->tidflow_genseq.psn_seq) + tidrecvc->tidflow_genseq.psn_seq = tf_sequence_num.psn_seq; + + /* Now re-check the sequence numbers. */ + if (sequence_num.psn_seq > tidrecvc->tidflow_genseq.psn_seq) { + /* It really was a sequence error. Restart. */ + ips_protoexp_do_tf_seqerr(protoexp, tidrecvc, p_hdr); + PSM2_LOG_MSG("leaving"); + return PSM_HAL_ERROR_GENERAL_ERROR; + } else { + /* False SeqErr. We can accept this packet. 
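+ *
+ * Example of the recovery (values hypothetical): the shadow seq
+ * is 5 and a packet with seq 6 arrives, looking out of order;
+ * the HFI tidflow, which has already seen the RSM-intercepted
+ * packets, reports seq 7, so the shadow is advanced to 7 and the
+ * seq-6 packet is accepted as a false SeqErr instead of forcing
+ * a flow restart.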
*/ + if (sequence_num.psn_seq == tidrecvc->tidflow_genseq.psn_seq) + tidrecvc->tidflow_genseq.psn_seq++; + } + } + } + + return PSM_HAL_ERROR_OK; +} + +static PSMI_HAL_INLINE int hfp_gen1_tidflow_get_flowvalid(uint64_t val, uint32_t *pfv) +{ + *pfv = hfi_tidflow_get_flowvalid(val); + return PSM_HAL_ERROR_OK; +} + +static PSMI_HAL_INLINE int hfp_gen1_tidflow_get_enabled(uint64_t val, uint32_t *penabled) +{ + *penabled = hfi_tidflow_get_enabled(val); + return PSM_HAL_ERROR_OK; +} + +static PSMI_HAL_INLINE int hfp_gen1_tidflow_get_keep_after_seqerr(uint64_t val, uint32_t *pkase) +{ + *pkase = hfi_tidflow_get_keep_after_seqerr(val); + return PSM_HAL_ERROR_OK; +} + +static PSMI_HAL_INLINE int hfp_gen1_tidflow_get_keep_on_generr(uint64_t val, uint32_t *pkoge) +{ + *pkoge = hfi_tidflow_get_keep_on_generr(val); + return PSM_HAL_ERROR_OK; +} + +static PSMI_HAL_INLINE int hfp_gen1_tidflow_get_keep_payload_on_generr(uint64_t val, uint32_t *pkpoge) +{ + *pkpoge = hfi_tidflow_get_keep_payload_on_generr(val); + return PSM_HAL_ERROR_OK; +} + +static PSMI_HAL_INLINE int hfp_gen1_tidflow_get_seqmismatch(uint64_t val, uint32_t *psmm) +{ + *psmm = hfi_tidflow_get_seqmismatch(val); + return PSM_HAL_ERROR_OK; +} + +static PSMI_HAL_INLINE int hfp_gen1_tidflow_get_genmismatch(uint64_t val, uint32_t *pgmm) +{ + *pgmm = hfi_tidflow_get_genmismatch(val); + return PSM_HAL_ERROR_OK; +} + +static inline int hfp_gen1_write_header_to_subcontext(struct ips_message_header *pimh, + psmi_hal_cl_idx idx, + psmi_hal_raw_rhf_t rhf, + psmi_hal_cl_q cl_q, + psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + psm_hal_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; + uint32_t *pu32 = pcl_q->hdr_qe.hdrq_base_addr + (idx + hfi_hdrget_hdrq_offset((uint32_t *)&rhf)); + struct ips_message_header *piph_dest = (struct ips_message_header *)pu32; + + *piph_dest = *pimh; + return PSM_HAL_ERROR_OK; +} + +static inline +void +writehdrq_write_rhf_atomic(uint64_t *rhf_dest, uint64_t rhf_src) +{ + /* + * In 64-bit mode, we check in init that the rhf will always be 8-byte + * aligned + */ + *rhf_dest = rhf_src; + return; +} + +static inline int hfp_gen1_write_rhf_to_subcontext(psmi_hal_raw_rhf_t rhf, + psmi_hal_cl_idx idx, + uint32_t *phdrq_rhf_seq, + psmi_hal_cl_q cl_q, + psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + psm_hal_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; + + if (!get_psm_gen1_hi()->hfp_private.dma_rtail) + { + uint32_t rhf_seq = *phdrq_rhf_seq; + hfi_hdrset_seq((uint32_t *) &rhf, rhf_seq); + rhf_seq++; + if (rhf_seq > LAST_RHF_SEQNO) + rhf_seq = 1; + + *phdrq_rhf_seq = rhf_seq; + } + + /* Now write the new rhf */ + writehdrq_write_rhf_atomic((uint64_t*)(pcl_q->hdr_qe.hdrq_base_addr + + (idx + get_psm_gen1_hi()->hfp_private.hdrq_rhf_off)), + rhf); + return PSM_HAL_ERROR_OK; +} + +static PSMI_HAL_INLINE int hfp_gen1_subcontext_ureg_get(ptl_t *ptl_gen, + struct ips_subcontext_ureg **uregp, + psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + int i; + struct ptl_ips *ptl = (struct ptl_ips *) ptl_gen; + + ptl->recvshc->hwcontext_ctrl = psm_hw_ctxt->hwcontext_ctrl; + for (i=0;i < psm_hw_ctxt->user_info.subctxt_cnt; i++) + uregp[i] = psm_hw_ctxt->subcontext_ureg[i]; + return PSM_HAL_ERROR_OK; +} + + +static inline +int +ips_write_eager_packet(struct ips_writehdrq *writeq, + struct ips_recvhdrq_event *rcv_ev, + psmi_hal_cl_idx write_hdr_tail, + uint32_t subcontext, + psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = 
psm_hw_ctxt->ctrl; + psmi_hal_cl_idx write_egr_tail; + write_egr_tail = hfp_gen1_get_cl_q_tail_index( + PSM_HAL_GET_SC_CL_Q_RX_EGR_Q(subcontext), + ctxt); + uint32_t next_write_egr_tail = write_egr_tail; + /* checksum is trimmed from paylen, we need to add back */ + uint32_t rcv_paylen = ips_recvhdrq_event_paylen(rcv_ev) + + (rcv_ev->has_cksum ? PSM_CRC_SIZE_IN_BYTES : 0); + psmi_assert(rcv_paylen > 0); + uint32_t egr_elemcnt = ctrl->ctxt_info.egrtids; + uint32_t egr_elemsz = ctrl->ctxt_info.rcvegr_size; + + /* Loop as long as the write eager queue is NOT full */ + while (1) { + next_write_egr_tail++; + if (next_write_egr_tail >= egr_elemcnt) + next_write_egr_tail = 0; + psmi_hal_cl_idx egr_head; + egr_head = hfp_gen1_get_cl_q_head_index( + PSM_HAL_GET_SC_CL_Q_RX_EGR_Q(subcontext), + ctxt); + if (next_write_egr_tail == egr_head) { + break; + } + + /* Move to next eager entry if leftover is not enough */ + if ((writeq->state->egrq_offset + rcv_paylen) > + egr_elemsz) { + writeq->state->egrq_offset = 0; + write_egr_tail = next_write_egr_tail; + + /* Update the eager buffer tail pointer */ + hfp_gen1_set_cl_q_tail_index(write_egr_tail, + PSM_HAL_GET_SC_CL_Q_RX_EGR_Q(subcontext), + ctxt); + } else { + /* There is enough space in this entry! */ + /* Use pre-calculated address from look-up table */ + char *write_payload = + hfp_gen1_get_egr_buff(write_egr_tail, + PSM_HAL_GET_SC_CL_Q_RX_EGR_Q(subcontext), + ctxt)+ + writeq->state->egrq_offset; + const char *rcv_payload = + ips_recvhdrq_event_payload(rcv_ev); + + psmi_assert(write_payload != NULL); + psmi_assert(rcv_payload != NULL); + psmi_mq_mtucpy(write_payload, rcv_payload, rcv_paylen); + + /* Fix up the rhf with the subcontext's eager index/offset */ + hfi_hdrset_egrbfr_index((uint32_t*)(&rcv_ev->psm_hal_rhf.raw_rhf),write_egr_tail); + hfi_hdrset_egrbfr_offset((uint32_t *)(&rcv_ev->psm_hal_rhf.raw_rhf), (writeq->state-> + egrq_offset >> 6)); + /* Copy the header to the subcontext's header queue */ + hfp_gen1_write_header_to_subcontext(rcv_ev->p_hdr, + write_hdr_tail, + rcv_ev->psm_hal_rhf.raw_rhf, + PSM_HAL_GET_SC_CL_Q_RX_HDR_Q(subcontext), + ctxt); + + /* Update offset to next 64B boundary */ + writeq->state->egrq_offset = + (writeq->state->egrq_offset + rcv_paylen + + 63) & (~63); + return IPS_RECVHDRQ_CONTINUE; + } + } + + /* At this point, the eager queue is full -- drop the packet. 
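+ *
+ * Note (added for clarity): only the payload is lost here. The
+ * header is still forwarded below with HFI_RHF_TIDERR set and the
+ * UseEgrBfr bit cleared, so the subcontext sees a header-only
+ * entry flagged as an eager overflow rather than a silent gap in
+ * its receive queue.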
*/ + /* Copy the header to the subcontext's header queue */ + + /* Mark header with ETIDERR (eager overflow) */ + hfi_hdrset_err_flags((uint32_t*) (&rcv_ev->psm_hal_rhf.raw_rhf), HFI_RHF_TIDERR); + + /* Clear UseEgrBfr bit because payload is dropped */ + hfi_hdrset_use_egrbfr((uint32_t *)(&rcv_ev->psm_hal_rhf.raw_rhf), 0); + hfp_gen1_write_header_to_subcontext(rcv_ev->p_hdr, + write_hdr_tail, + rcv_ev->psm_hal_rhf.raw_rhf, + PSM_HAL_GET_SC_CL_Q_RX_HDR_Q(subcontext), + ctxt); + return IPS_RECVHDRQ_BREAK; +} + +static PSMI_HAL_INLINE +int +hfp_gen1_forward_packet_to_subcontext(struct ips_writehdrq *writeq, + struct ips_recvhdrq_event *rcv_ev, + uint32_t subcontext, + psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + psmi_hal_cl_idx write_hdr_head; + psmi_hal_cl_idx write_hdr_tail; + uint32_t hdrq_elemsz = ctrl->ctxt_info.rcvhdrq_entsize >> BYTE2DWORD_SHIFT; + psmi_hal_cl_idx next_write_hdr_tail; + int result = IPS_RECVHDRQ_CONTINUE; + + /* Drop packet if write header queue is disabled */ + if_pf (!writeq->state->enabled) { + return IPS_RECVHDRQ_BREAK; + } + + write_hdr_head = hfp_gen1_get_cl_q_head_index( + PSM_HAL_GET_SC_CL_Q_RX_HDR_Q(subcontext), + ctxt); + write_hdr_tail = hfp_gen1_get_cl_q_tail_index( + PSM_HAL_GET_SC_CL_Q_RX_HDR_Q(subcontext), + ctxt); + /* Drop packet if write header queue is full */ + next_write_hdr_tail = write_hdr_tail + hdrq_elemsz; + if (next_write_hdr_tail > writeq->hdrq_elemlast) { + next_write_hdr_tail = 0; + } + if (next_write_hdr_tail == write_hdr_head) { + return IPS_RECVHDRQ_BREAK; + } + if (psmi_hal_rhf_get_use_egr_buff(rcv_ev->psm_hal_rhf)) + { + result = ips_write_eager_packet(writeq, rcv_ev, + write_hdr_tail, + subcontext, + ctxt); + } else { + /* Copy the header to the subcontext's header queue */ + hfp_gen1_write_header_to_subcontext(rcv_ev->p_hdr, + write_hdr_tail, + rcv_ev->psm_hal_rhf.raw_rhf, + PSM_HAL_GET_SC_CL_Q_RX_HDR_Q(subcontext), + ctxt); + } + + /* Ensure previous writes are visible before writing rhf seq or tail */ + ips_wmb(); + + /* The following func call may modify the hdrq_rhf_seq */ + hfp_gen1_write_rhf_to_subcontext(rcv_ev->psm_hal_rhf.raw_rhf, write_hdr_tail, + &writeq->state->hdrq_rhf_seq, + PSM_HAL_GET_SC_CL_Q_RX_HDR_Q(subcontext), + ctxt); + /* The tail must be updated regardless of PSM_HAL_CAP_DMA_RTAIL + * since this tail is also used to keep track of where + * ips_writehdrq_append will write to next. For subcontexts there is + * no separate shadow copy of the tail. 
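+ *
+ * Ring arithmetic, for illustration (sizes hypothetical): with
+ * rcvhdrq_entsize = 64 bytes, hdrq_elemsz is 64 >> 2 = 16 dwords,
+ * so the tail advances in steps of 16, wraps to 0 once it passes
+ * hdrq_elemlast, and the queue counts as full when the advanced
+ * tail would land on the head -- the early-return drop above.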
*/ + hfp_gen1_set_cl_q_tail_index(next_write_hdr_tail, + PSM_HAL_GET_SC_CL_Q_RX_HDR_Q(subcontext), + ctxt); + + return result; +} + +static PSMI_HAL_INLINE int hfp_gen1_set_pio_size(uint32_t pio_size, psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + + ctrl->__hfi_piosize = pio_size; + + return 0; +} + +static PSMI_HAL_INLINE int hfp_gen1_set_effective_mtu(uint32_t eff_mtu, psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + + ctrl->__hfi_mtusize = eff_mtu; + return 0; +} + +static PSMI_HAL_INLINE int hfp_gen1_set_tf_valid(uint32_t tf_valid, psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + + ctrl->__hfi_tfvalid = tf_valid; + return 0; +} + +static PSMI_HAL_INLINE int hfp_gen1_get_default_pkey(void) +{ + return HFI_DEFAULT_P_KEY; +} + +#include "psm_hal_gen1_spio.c" + +static PSMI_HAL_INLINE int hfp_gen1_spio_init(const psmi_context_t *context, + struct ptl *ptl, void **ctrl) +{ + hfp_gen1_pc_private *psm_hw_ctxt = context->psm_hw_ctxt; + +#ifdef PSM_AVX512 + union psmi_envvar_val env_enable_avx512; + psmi_getenv("PSM2_AVX512", + "Enable (set envvar to 1) AVX512 code in PSM (Enabled by default)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)1, &env_enable_avx512); + int is_avx512_enabled = env_enable_avx512.e_int; + int rc = ips_spio_init(context,ptl, &psm_hw_ctxt->spio_ctrl, is_avx512_enabled); +#else + int rc = ips_spio_init(context,ptl, &psm_hw_ctxt->spio_ctrl); +#endif + if (rc >= 0) + { + *ctrl = &psm_hw_ctxt->spio_ctrl; + } + return rc; +} + +static PSMI_HAL_INLINE int hfp_gen1_spio_fini(void **ctrl, psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + int rc = ips_spio_fini(&psm_hw_ctxt->spio_ctrl); + + if (!rc) + *ctrl = NULL; + return rc; +} + +static PSMI_HAL_INLINE int hfp_gen1_spio_transfer_frame(struct ips_proto *proto, + struct ips_flow *flow, struct psm_hal_pbc *pbc, + uint32_t *payload, uint32_t length, + uint32_t isCtrlMsg, uint32_t cksum_valid, + uint32_t cksum, psmi_hal_hw_context ctxt +#ifdef PSM_CUDA + , uint32_t is_cuda_payload +#endif + ) +{ + return ips_spio_transfer_frame(proto, flow, pbc, + payload, length, isCtrlMsg, + cksum_valid, cksum +#ifdef PSM_CUDA + , is_cuda_payload +#endif + ); +} + +static PSMI_HAL_INLINE int hfp_gen1_spio_process_events(const struct ptl *ptl) +{ + return ips_spio_process_events(ptl); +} + +static PSMI_HAL_INLINE int hfp_gen1_get_node_id(int unit, int *nodep) +{ + int64_t node_id = hfi_sysfs_unit_read_node_s64(unit); + *nodep = (int)node_id; + if (node_id != -1) + return PSM_HAL_ERROR_OK; + else + return -PSM_HAL_ERROR_GENERAL_ERROR; +} + +static PSMI_HAL_INLINE int hfp_gen1_get_bthqp(psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + + return ctrl->base_info.bthqp; +} + +static PSMI_HAL_INLINE int hfp_gen1_get_context(psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + + return ctrl->ctxt_info.ctxt; +} + +static PSMI_HAL_INLINE uint64_t hfp_gen1_get_gid_lo(psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + uint64_t gid_lo, gid_hi; + if (hfi_get_port_gid(ctrl->__hfi_unit, + ctrl->__hfi_port, &gid_hi, + &gid_lo) == -1) { + psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "Can't get 
HFI GID in psm2_ep_open: is SMA running?"); + } + return gid_lo; +} + +static PSMI_HAL_INLINE uint64_t hfp_gen1_get_gid_hi(psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + uint64_t gid_lo, gid_hi; + if (hfi_get_port_gid(ctrl->__hfi_unit, + ctrl->__hfi_port, &gid_hi, + &gid_lo) == -1) { + psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "Can't get HFI GID in psm2_ep_open: is SMA running?"); + } + return gid_hi; +} + +static PSMI_HAL_INLINE int hfp_gen1_get_hfi_type(psmi_hal_hw_context ctxt) +{ + return PSM_HAL_INSTANCE_GEN1; +} + +static PSMI_HAL_INLINE int hfp_gen1_get_jkey(psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + + return ctrl->base_info.jkey; +} + +static PSMI_HAL_INLINE int hfp_gen1_get_lid(psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + int lid; + + if ((lid = hfi_get_port_lid(ctrl->__hfi_unit, + ctrl->__hfi_port)) <= 0) { + psmi_handle_error(NULL, + PSM2_EP_DEVICE_FAILURE, + "Can't get HFI LID in psm2_ep_open: is SMA running?"); + } + return lid; +} + +static PSMI_HAL_INLINE int hfp_gen1_get_pio_size(psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + + return (ctrl->ctxt_info.credits / 2) * 64 - + (sizeof(struct ips_message_header) + HFI_PCB_SIZE_IN_BYTES); +} + +static PSMI_HAL_INLINE int hfp_gen1_get_port_num(psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + + return ctrl->__hfi_port; +} + +static PSMI_HAL_INLINE int hfp_gen1_get_rx_egr_tid_cnt(psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + + return ctrl->ctxt_info.egrtids; +} + +static PSMI_HAL_INLINE int hfp_gen1_get_rx_hdr_q_cnt(psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + + return ctrl->ctxt_info.rcvhdrq_cnt; +} + +static PSMI_HAL_INLINE int hfp_gen1_get_rx_hdr_q_ent_size(psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + + return ctrl->ctxt_info.rcvhdrq_entsize; +} + +static PSMI_HAL_INLINE int hfp_gen1_get_sdma_req_size(psmi_hal_hw_context ctxt) +{ + return get_psm_gen1_hi()->hfp_private.sdmahdr_req_size; +} + +static PSMI_HAL_INLINE int hfp_gen1_get_sdma_ring_size(psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + + return ctrl->ctxt_info.sdma_ring_size; +} + +static PSMI_HAL_INLINE int hfp_gen1_get_subctxt(psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + + return ctrl->ctxt_info.subctxt; +} + +static PSMI_HAL_INLINE int hfp_gen1_get_subctxt_cnt(psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + + return psm_hw_ctxt->user_info.subctxt_cnt; +} + +static PSMI_HAL_INLINE int hfp_gen1_get_tid_exp_cnt(psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + + return ctrl->__hfi_tidexpcnt; +} + +static PSMI_HAL_INLINE int hfp_gen1_get_unit_id(psmi_hal_hw_context ctxt) +{ + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; + + return ctrl->__hfi_unit; +} + +static PSMI_HAL_INLINE int 
hfp_gen1_get_fd(psmi_hal_hw_context ctxt) +{ + if (!ctxt) + return -1; + + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + + return psm_hw_ctxt->ctrl->fd; +} + +static PSMI_HAL_INLINE int hfp_gen1_get_pio_stall_cnt(psmi_hal_hw_context ctxt, uint64_t **pio_stall_cnt) +{ + + if (!ctxt) + return -PSM_HAL_ERROR_GENERAL_ERROR; + + hfp_gen1_pc_private *psm_hw_ctxt = ctxt; + + *pio_stall_cnt = &psm_hw_ctxt->spio_ctrl.spio_num_stall_total; + + return PSM_HAL_ERROR_OK; +} diff --git a/psm_help.h b/psm_help.h new file mode 100644 index 0000000..12ebe5b --- /dev/null +++ b/psm_help.h @@ -0,0 +1,190 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. 
*/ + +#ifndef _PSMI_HELP_H +#define _PSMI_HELP_H +#include "psm_log.h" + +/* XXX gcc only */ +#define PSMI_INLINE(FN) \ + static inline FN + +#define PSMI_ALWAYS_INLINE(FN) \ + static __inline__ FN __attribute__((always_inline)); \ + static __inline__ FN + +#define PSMI_NEVER_INLINE(FN) \ + static FN __attribute__((noinline)); \ + static FN + +#define _PPragma(x) _Pragma(x) + +#define STRINGIFY(s) _STRINGIFY(s) +#define _STRINGIFY(s) #s +#define PSMI_CURLOC __FILE__ ":" STRINGIFY(__LINE__) +#define psmi_assert_always_loc(x, curloc) \ + do { \ + if_pf(!(x)) { \ + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ + "Assertion failure at %s: %s", curloc, \ + STRINGIFY(x)); \ + } } while (0) + +#define psmi_assert_always(x) psmi_assert_always_loc(x, PSMI_CURLOC) + +#ifdef PSM_DEBUG +# define psmi_assert(x) psmi_assert_always(x) +# define PSMI_ASSERT_INITIALIZED() psmi_assert_always(psmi_isinitialized()) +#else +# define psmi_assert(x) +# define PSMI_ASSERT_INITIALIZED() +#endif + +#define _PSMI_API_NAME(FN) __ ## FN +#define _PSMI_API_STR(FN) _STRINGIFY(__ ## FN) +#define PSMI_API_DECL(FN) \ + typeof(_PSMI_API_NAME(FN)) FN __attribute__((weak, alias(_PSMI_API_STR(FN)))); + +#define PSMI_ERR_UNLESS_INITIALIZED(ep) \ + do { \ + if (!psmi_isinitialized()) { \ + PSM2_LOG_MSG("leaving"); \ + return psmi_handle_error(ep, PSM2_INIT_NOT_INIT, \ + "PSM2 has not been initialized"); \ + } \ + } while (0) + +#define PSMI_CHECKMEM(err, mem) \ + do { \ + if ((mem) == NULL) { \ + (err) = PSM2_NO_MEMORY; \ + goto fail; \ + } \ + } while (0) + +#define PSMI_CACHEALIGN __attribute__((aligned(64))) + +/* Easy way to ignore the OK_NO_PROGRESS case */ +PSMI_ALWAYS_INLINE(psm2_error_t psmi_err_only(psm2_error_t err)) +{ + if (err > PSM2_OK_NO_PROGRESS) + return err; + else + return PSM2_OK; +} + +#ifdef min +#undef min +#endif +#define min(a, b) ((a) < (b) ? (a) : (b)) + +#ifdef max +#undef max +#endif +#define max(a, b) ((a) > (b) ? (a) : (b)) + +#define SEC_ULL 1000000000ULL +#define MSEC_ULL 1000000ULL +#define USEC_ULL 1000ULL +#define NSEC_ULL 1ULL + +#define PSMI_TRUE 1 +#define PSMI_FALSE 0 + +#define PSMI_CYCLES_TO_SECSF(cycles) \ + ((double) cycles_to_nanosecs(cycles) / 1.0e9) + +#define PSMI_PAGESIZE psmi_getpagesize() +#define PSMI_POWEROFTWO(P) (((P)&((P)-1)) == 0) +#define PSMI_ALIGNDOWN(p, P) (((uintptr_t)(p))&~((uintptr_t)((P)-1))) +#define PSMI_ALIGNUP(p, P) (PSMI_ALIGNDOWN((uintptr_t)(p)+((uintptr_t)((P)-1)), (P))) + +#define PSMI_MAKE_DRIVER_VERSION(major, minor) ((major)<<16 | ((minor) & 0xffff)) + +#ifdef PSM_DEBUG + +/* The intent of the following two macros is to emit an internal error if a size of a + 'member' is not as expected, violating an assumption in the code. There are some + problems with the implementation of this code: + + The first macro creates a static const variable with ABSOLUTELY NO references + to them. For example there are ABSOLUTELY NO uses of the second macro in the + PSM code. This is not completely pure. GCC version 5, for example, emits a + warning for defining a static const when it is not referenced. + + A better implementation of the intent of this code is to use static_assert() + so that at compile time the violations can be caught and corrected - not at + run time. 
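+
+   A minimal sketch of that better approach, assuming a C11 toolchain
+   (purely illustrative; the macros below do not do this today):
+
+       _Static_assert(sizeof(uint64_t) == 8,
+                      "uint64_t size doesn't match expected 8 bytes");
+
+   placed at file scope next to the type it guards, this fails the build,
+   rather than aborting at run time, when the assumption breaks.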
*/ + +#define PSMI_STRICT_SIZE_DECL(member, sz) static const size_t __psm2_ss_ ## member = sz +#define PSMI_STRICT_SIZE_VERIFY(member, sz) \ + do { \ + if (__psm2_ss_ ## member != (sz)) { \ + char errmsg[64]; \ + snprintf(errmsg, 32, "Internal error: %s " \ + "size doesn't match expected %d bytes", \ + STRINGIFY(member), (int) __psm2_ss_ ## member); \ + exit(-1); \ + } \ + } while (0) + +#else + +#define PSMI_STRICT_SIZE_DECL(member, sz) /* nothing */ +#define PSMI_STRICT_SIZE_VERIFY(member, sz) /* nothing */ + +#endif /* PSM_DEBUG */ + +#endif /* _PSMI_HELP_H */ diff --git a/psm_lock.h b/psm_lock.h new file mode 100644 index 0000000..c82960c --- /dev/null +++ b/psm_lock.h @@ -0,0 +1,190 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ + +#ifndef _PSMI_IN_USER_H +#error psm_lock.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSMI_LOCK_H +#define _PSMI_LOCK_H + +#ifndef PSMI_USE_PTHREAD_SPINLOCKS +#define PSMI_USE_PTHREAD_SPINLOCKS 0 +#endif + +#if PSMI_USE_PTHREAD_SPINLOCKS +typedef pthread_spinlock_t psmi_spinlock_t; + +#define psmi_spin_init(lock) pthread_spin_init(lock, \ + PTHREAD_PROCESS_PRIVATE) +#define psmi_spin_lock(lock) pthread_spin_lock(lock) +#define psmi_spin_trylock(lock) pthread_spin_trylock(lock) +#define psmi_spin_unlock(lock) pthread_spin_unlock(lock) +#else +typedef ips_atomic_t psmi_spinlock_t; +#define PSMI_SPIN_LOCKED 1 +#define PSMI_SPIN_UNLOCKED 0 +#endif + +/* psmi_lock_t structure */ +typedef struct { + +#ifdef PSMI_LOCK_IS_SPINLOCK + psmi_spinlock_t lock; +#elif defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG) + pthread_mutex_t lock; + pthread_t lock_owner; +#elif defined(PSMI_LOCK_IS_MUTEXLOCK) + pthread_mutex_t lock; +#endif +} psmi_lock_t; + + +#if PSMI_USE_PTHREAD_SPINLOCKS +#else +PSMI_ALWAYS_INLINE(int psmi_spin_init(psmi_spinlock_t *lock)) +{ + ips_atomic_set(lock, PSMI_SPIN_UNLOCKED); + return 0; +} + +PSMI_ALWAYS_INLINE(int psmi_spin_trylock(psmi_spinlock_t *lock)) +{ + if (ips_atomic_cmpxchg(lock, PSMI_SPIN_UNLOCKED, PSMI_SPIN_LOCKED) + == PSMI_SPIN_UNLOCKED) + return 0; + else + return EBUSY; +} + +PSMI_ALWAYS_INLINE(int psmi_spin_lock(psmi_spinlock_t *lock)) +{ + while (psmi_spin_trylock(lock) == EBUSY) { + } + return 0; +} + +PSMI_ALWAYS_INLINE(int psmi_spin_unlock(psmi_spinlock_t *lock)) +{ + atomic_set(lock, PSMI_SPIN_UNLOCKED); + return 0; +} +#endif /* PSMI_USE_PTHREAD_SPINLOCKS */ + +PSMI_ALWAYS_INLINE(void psmi_init_lock(psmi_lock_t *lock)) +{ +#ifdef PSMI_LOCK_IS_SPINLOCK + psmi_spin_init(&(lock->lock)); +#elif defined(PSMI_LOCK_IS_MUTEXLOCK) + pthread_mutex_init(&(lock->lock), NULL); +#elif defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG) + pthread_mutexattr_t attr; + pthread_mutexattr_init(&attr); + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK_NP); + pthread_mutex_init(&(lock->lock), &attr); + pthread_mutexattr_destroy(&attr); + lock->lock_owner = PSMI_LOCK_NO_OWNER; +#endif +} + +PSMI_ALWAYS_INLINE(int psmi_sem_post(sem_t *sem, const char *name)) +{ + if (sem_post(sem) == -1) { + _HFI_VDBG("Semaphore %s: post failed\n", name ? name : "NULL" ); + return -1; + } + + _HFI_VDBG("Semaphore %s: post succeeded\n", name ? name : "NULL"); + + return 0; +} + +PSMI_ALWAYS_INLINE(int psmi_sem_timedwait(sem_t *sem, const char *name)) +{ + /* Wait 5 seconds for shm read-write lock to open */ + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + ts.tv_sec += 5; + + if (sem_timedwait(sem, &ts) == -1) { + _HFI_VDBG("Semaphore %s: Timedwait failed\n", name ? name : "NULL" ); + return -1; + } + + _HFI_VDBG("Semaphore %s: Timedwait succeeded\n", name ? 
name : "NULL"); + + return 0; +} + +PSMI_ALWAYS_INLINE(int psmi_init_semaphore(sem_t **sem, const char *name, + mode_t mode, int value)) +{ + *sem = sem_open(name, O_CREAT | O_EXCL, mode, value); + if ((*sem == SEM_FAILED) && (errno == EEXIST)) { + *sem = sem_open(name, O_CREAT, mode, value); + if (*sem == SEM_FAILED) { + _HFI_VDBG("Cannot open semaphore %s, errno=%d\n", + name, errno); + return -1; + } + } else if (*sem == SEM_FAILED) { + _HFI_VDBG("Cannot create semaphore %s, errno=%d\n", name, errno); + return -1; + } + + return 0; +} + +#endif /* _PSMI_LOCK_H */ diff --git a/psm_log.h b/psm_log.h new file mode 100644 index 0000000..1c5158a --- /dev/null +++ b/psm_log.h @@ -0,0 +1,282 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef _PSMI_LOG_H +#define _PSMI_LOG_H + +/* + + A note about PSM_LOG and PSM_LOG_FAST_IO: + + By default, the PSM_LOG facility is safe and slow. Log messages + are written to a file under /tmp as they're generated. So, if the test case + has an abnormal termination such as a segmentation fault or an abort(), + the log messages will still be available. + + However, debugging timing sensitive problems, make the default PSM_LOG + facility inadequate as the timing overhead that it introduces dominates, and + the symptoms of the problem being tested may change. 
+
+ When performance is important, you can use BOTH: PSM_LOG and PSM_LOG_FAST_IO.
+ With PSM_LOG_FAST_IO, log messages are written to a memory buffer, and when
+ the program terminates, the log messages are written to a file under /tmp.
+
+ * How to use basic functionality of PSM LOG:
+
+   - To use default PSM_LOG, build PSM2 with macro
+     PSM_LOG=1
+
+   - To use PSM_LOG when performance is critical, build PSM2 with macros
+     PSM_LOG=1 PSM_LOG_FAST_IO=1
+
+   - Insert log messages in code with PSM2_LOG_MSG(). Log messages follow
+     the same format as printf(). For example:
+     PSM2_LOG_MSG(" %u", 1);
+
+   - To filter out log messages, set the environment variable
+     PSM2_LOG_SRCH_FORMAT_STRING to a search pattern; the wildcard
+     character (*) is supported. For example,
+     PSM2_LOG_SRCH_FORMAT_STRING=*
+
+   - A more detailed explanation of how to use PSM LOG can be found below.
+
+ * How to get log messages after an abnormal termination while using
+   PSM LOG with PSM_LOG_FAST_IO:
+
+   - Log messages are saved from a memory buffer to a file under /tmp when
+     psmi_log_fini() is called. psmi_log_fini() is exposed to the outside
+     world via the linker script file, so client test code can call
+     psmi_log_fini() on a fatal error.
+
+ --------------------------------------------------------------------------------
+
+ This file (psm_log.h) defines macros for logging messages to assist
+ investigations into the psm library.
+
+ By default, these macros are not defined when building psm. When not defined,
+ the macros become no-ops in the PSM code.
+
+ When enabled (by defining the PSM_LOG symbol), the macros present information
+ to the psmi_log_message() facility for processing. See below for more
+ information on the psmi_log_message() facility.
+
+ The macros are described in the following:
+
+ PSM2_LOG_MSG(FORMAT,...)        Spills a printf-style message to the log.
+ PSM2_LOG_DECLARE_BT_BUFFER()    Declares a local back trace buffer for use
+                                 with the PSM2_LOG_BT() macro.
+ PSM2_LOG_BT(NFRAMES,FORMAT,...) Spills the current backtrace, if it differs
+                                 from the previous backtrace spilled to the
+                                 log.
+
+ The psmi_log_message() facility is the backend for these messages when
+ PSM_LOG is enabled. The psmi_log_message() facility spills messages to
+ unique log files based on the process id and the thread id, so every unique
+ process id and thread id spills to its own log file. The psmi_log_message()
+ facility prefixes each message in the log files with a high resolution
+ timer message so that messages from multiple threads and log files can be
+ reconciled to one timeline. It is left as an exercise to the reader to
+ reconcile log messages from different hosts to one timeline.
+
+ The backtrace capability in the PSM_LOG functionality needs some explanation:
+ often a bug happens only when the code is tickled from a specific call-chain.
+ The PSM2_LOG_BT() macro supports identifying the unique call-chain when a
+ problem occurs. The model is as follows:
+
+ A unique declaration is made for a backtrace buffer to spill the backtrace
+ information to. This declaration should be made in the same basic block as
+ the use of the PSM2_LOG_BT() macro. To make the declaration, use
+ PSM2_LOG_DECLARE_BT_BUFFER().
+
+ When PSM_LOG is enabled, at the statement for the macro
+ PSM2_LOG_BT(NFRAMES,FORMAT,...), the psmi_log_message() facility generates
+ the current backtrace and compares the first NFRAMES of the current backtrace
+ against the previous backtrace stored in the declared backtrace buffer. If
+ the two backtraces differ, the psmi_log_message() code saves the current
+ backtrace into the declared buffer, and then spills the backtrace to the
+ log file.
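+
+ As a minimal sketch (a hypothetical function, not taken from the PSM code),
+ the pattern looks like this:
+
+     static void interesting_path(void)
+     {
+             PSM2_LOG_DECLARE_BT_BUFFER();
+             PSM2_LOG_BT(8, "reached interesting_path(), count %u", 1);
+     }
+
+ With this, the backtrace is spilled again only when the first 8 frames of
+ the call-chain leading here change.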
+
+ At runtime, setting environment variables can keep the log file from
+ growing too large:
+
+ PSM2_LOG_INC_FUNCTION_NAMES is a list of function name lists (abbreviated
+ FNL, see below) that INCludes the named functions in the collection of
+ functions to spill log data for.
+
+ PSM2_LOG_EXC_FUNCTION_NAMES is a list of FNL's (see below) that EXCludes
+ the named functions from the collection of functions to spill log data for.
+
+ An FNL is a 'Function Name List' that is defined by the following grammar:
+
+ # A LINE1 is either a single line number or a range of line numbers:
+ LINE1 :: lineNumber |
+          lineNumber1 '-' lineNumber2
+
+ # LINES is a list of LINE1's separated by commas:
+ LINES :: LINE1 |
+          LINE1 ',' LINES
+
+ # An FN is either a function name, or a function name with a list of lines:
+ FN :: functionName |
+       functionName ';' LINES
+
+ # A FNL is a list of FN's separated by colons:
+ FNL :: FN |
+        FN ':' FNL
+
+ # Examples:
+ foo:bar      the two functions foo and bar
+ foo;1-10     lines 1 to 10 of function foo
+ bar;1,3,5    lines 1, 3 and 5 of function bar
+
+ PSM2_LOG_SRCH_FORMAT_STRING If set, overrides the PSM2_LOG_INC_FUNCTION_NAMES
+ and PSM2_LOG_EXC_FUNCTION_NAMES settings. Causes the psmi_log_message()
+ facility to only emit the log messages that match (using fnmatch()) the
+ message in FORMAT.
+
+ */
+
+typedef enum
+{
+	PSM2_LOG_TX = 0,
+	PSM2_LOG_RX = 1,
+	PSM2_LOG_PEND = 2,
+} psmi_log_tx_rx_t;
+
+#ifdef PSM_LOG
+
+extern void psmi_log_initialize(void);
+
+/* defined in psm_utils.c */
+extern void psmi_log_message(const char *fileName,
+			     const char *functionName,
+			     int lineNumber,
+			     const char *format, ...);
+
+#ifdef PSM_LOG_FAST_IO
+extern void psmi_log_fini(void);
+#else
+#define psmi_log_fini() /* nothing */
+#endif
+
+#define PSM2_LOG_MSG(FORMAT , ...) psmi_log_message(__FILE__,__FUNCTION__,__LINE__,FORMAT, ## __VA_ARGS__)
+
+#define PSM2_LOG_BT_BUFFER_SIZE 100
+
+#define PSM2_LOG_DECLARE_BT_BUFFER() static void * psm_log_bt_buffer[PSM2_LOG_BT_BUFFER_SIZE]
+
+#define PSM2_LOG_DECLARE_BT_BUFFER_SZ(SIZE) static void * psm_log_bt_buffer[SIZE]
+
+#define PSM2_LOG_BT_MAGIC ((const char *)-1)
+
+#define PSM2_LOG_BT(NFRAMES,FORMAT , ...) psmi_log_message(__FILE__,__FUNCTION__,__LINE__,PSM2_LOG_BT_MAGIC,psm_log_bt_buffer,NFRAMES,FORMAT, ## __VA_ARGS__)
+
+#define PSM2_LOG_EPM_MAGIC ((const char *)-2)
+
+/* EPM is short for Emit Protocol Message to the log file.
+OPCODE is an int, and corresponds to one of the OPCODES declared in ptl_ips/ips_proto_header.h
+TXRX is an int, and should be one of the above two consts (PSM2_LOG_TX, or PSM2_LOG_RX).
+FROMEPID and TOEPID are uint64_t's; the fromepid should be the epid (end point id) of the sender of the message,
+and the toepid should be the epid (end point id) of the receiver of the message.
+ */
+#define PSM2_LOG_EPM(OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,...) \
+	psmi_log_message(__FILE__,__FUNCTION__,__LINE__, \
+			 PSM2_LOG_EPM_MAGIC,OPCODE,TXRX,FROMEPID,TOEPID,FORMAT, \
+			 ## __VA_ARGS__)
+
+/* Just adds a condition to the PSM2_LOG_EPM() macro. */
+#define PSM2_LOG_EPM_COND(COND,OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,...) \
+	if (COND) \
+		PSM2_LOG_EPM(OPCODE,TXRX,FROMEPID,TOEPID,FORMAT, ## __VA_ARGS__)
+
+#define PSM2_LOG_DUMP_MAGIC ((const char *)-3)
+
+#define PSM2_LOG_MSG_DUMP(ADDR,SIZE,FORMAT , ...) 
\ + psmi_log_message(__FILE__,__FUNCTION__,__LINE__,PSM2_LOG_DUMP_MAGIC,ADDR,SIZE, \ + FORMAT, ## __VA_ARGS__) + +#define PSM2_LOG_PKT_STRM_MAGIC ((const char *)-4) + +#define PSM2_LOG_MIN_MAGIC PSM2_LOG_BT_MAGIC + +#define PSM2_LOG_MAX_MAGIC PSM2_LOG_PKT_STRM_MAGIC + +#define PSM2_LOG_PKT_STRM(TXRX,IPS_MSG_HDRP,FORMAT, ...) \ + psmi_log_message(__FILE__,__FUNCTION__,__LINE__,PSM2_LOG_PKT_STRM_MAGIC,TXRX, \ + IPS_MSG_HDRP,FORMAT, ## __VA_ARGS__) + +#else + +#define psmi_log_initialize() /* nothing */ + +#define PSM2_LOG_MSG(FORMAT , ...) /* nothing */ + +#define psmi_log_fini() /* nothing */ + +#define PSM2_LOG_DECLARE_BT_BUFFER() /* nothing */ + +#define PSM2_LOG_DECLARE_BT_BUFFER_SZ(SIZE) /* nothing */ + +#define PSM2_LOG_BT(NFRAMES,FORMAT , ...) /* nothing */ + +#define PSM2_LOG_EPM(OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,...) /* nothing */ + +#define PSM2_LOG_EPM_COND(COND,OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,...) /* nothing */ + +#define PSM2_LOG_MSG_DUMP(ADDR,SIZE,FORMAT , ...) /* nothing */ + +#define PSM2_LOG_PKT_STRM(TXRX,IPS_MSG_HDRP,FORMAT, ...) /* nothing */ + +#endif /* #ifdef PSM_LOG */ + +#endif /* #ifndef _PSMI_LOG_H */ diff --git a/psm_memcpy.c b/psm_memcpy.c new file mode 100644 index 0000000..b7c7a89 --- /dev/null +++ b/psm_memcpy.c @@ -0,0 +1,68 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include +#include +#include +#include + +#include "psm_user.h" +#include "psm_mq_internal.h" + +void *psmi_memcpyo(void *dst, const void *src, size_t n) +{ + psmi_mq_mtucpy(dst, src, n); + return dst; +} diff --git a/psm_mock.c b/psm_mock.c new file mode 100644 index 0000000..bdcfd41 --- /dev/null +++ b/psm_mock.c @@ -0,0 +1,90 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2017 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "psm_user.h" +#include "psm_mq_internal.h" +#include "psm2_mock_testing.h" + +#ifdef PSM2_MOCK_TESTING +void MOCKABLE(psmi_mockable_lock_init)(psmi_lock_t *pl) +{ + _PSMI_LOCK_INIT(*pl); +} +MOCK_DEF_EPILOGUE(psmi_mockable_lock_init); +int MOCKABLE(psmi_mockable_lock_try)(psmi_lock_t *pl) +{ + int ret = _PSMI_LOCK_TRY(*pl); + return ret; +} +MOCK_DEF_EPILOGUE(psmi_mockable_lock_try); +void MOCKABLE(psmi_mockable_lock)(psmi_lock_t *pl) +{ + _PSMI_LOCK(*pl); +} +MOCK_DEF_EPILOGUE(psmi_mockable_lock); +void MOCKABLE(psmi_mockable_unlock)(psmi_lock_t *pl) +{ + _PSMI_UNLOCK(*pl); +} +MOCK_DEF_EPILOGUE(psmi_mockable_unlock); +void MOCKABLE(psmi_mockable_lock_assert)(psmi_lock_t *pl) +{ + _PSMI_LOCK_ASSERT(*pl); +} +MOCK_DEF_EPILOGUE(psmi_mockable_lock_assert); +void MOCKABLE(psmi_mockable_unlock_assert)(psmi_lock_t *pl) +{ + _PSMI_UNLOCK_ASSERT(*pl); +} +MOCK_DEF_EPILOGUE(psmi_mockable_unlock_assert); +#endif diff --git a/psm_mpool.c b/psm_mpool.c new file mode 100644 index 0000000..1f2a365 --- /dev/null +++ b/psm_mpool.c @@ -0,0 +1,573 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. 
*/
+
+#include "psm_user.h"
+
+#define PSMI_MPOOL_ALIGNMENT	64
+
+struct mpool_element {
+	union {
+		SLIST_ENTRY(mpool_element) me_next;
+		mpool_t me_mpool;
+	};
+
+	uint32_t me_gen_count;
+	uint32_t me_index;
+#ifdef PSM_DEBUG
+	uint32_t me_isused;
+#endif
+} __attribute__ ((aligned(8)));
+
+#ifdef PSM_DEBUG
+#  define me_mark_used(me)    ((me)->me_isused = 1)
+#  define me_mark_unused(me)  ((me)->me_isused = 0)
+#else
+#  define me_mark_used(me)
+#  define me_mark_unused(me)
+#endif
+
+struct mpool {
+	int mp_type;
+	int mp_flags;
+	int mp_vector_shift;
+
+	uint32_t mp_elm_vector_size;
+	uint32_t mp_elm_offset;
+	uint32_t mp_num_obj;
+	uint32_t mp_num_obj_inuse;
+	uint32_t mp_elm_size;
+	uint32_t mp_obj_size;
+	uint32_t mp_num_obj_per_chunk;
+	uint32_t mp_num_obj_max_total;
+	psmi_memtype_t mp_memtype;
+
+	SLIST_HEAD(, mpool_element) mp_head;
+	struct mpool_element **mp_elm_vector;
+	struct mpool_element **mp_elm_vector_free;
+	non_empty_callback_fn_t mp_non_empty_cb;
+	void *mp_non_empty_cb_context;
+
+#ifdef PSM_CUDA
+	alloc_dealloc_callback_fn_t mp_alloc_dealloc_cb;
+	void *mp_alloc_dealloc_cb_context;
+#endif
+};
+
+static int psmi_mpool_allocate_chunk(mpool_t);
+
+/**
+ * psmi_mpool_create()
+ *
+ * Creates a memory pool and allocates objects of size
+ * <obj_size>. If more memory is needed to accommodate mpool_get()
+ * requests, the memory pool will allocate another chunk of
+ * <num_obj_per_chunk> objects, until it reaches the maximum number of
+ * objects it can allocate.
+ *
+ * <obj_size>		size of each individual object
+ * <num_obj_per_chunk>	number of objects to allocate per chunk (power of two)
+ * <num_obj_max_total>	total number of objects that may be allocated
+ *			at any given time. Must be a power of two greater than
+ *			<num_obj_per_chunk>.
+ *
+ * <flags>		flags to be applied on the memory pool (ie. memory
+ *			alignment)
+ *
+ * <cb>			callback to be called when the memory pool has some
+ *			free objects available again (after running out of them).
+ * <context>		context pointer for the callback
+ *
+ * Return the mpool on success, NULL on failure.
+ */
+mpool_t
+psmi_mpool_create_inner(size_t obj_size, uint32_t num_obj_per_chunk,
+			uint32_t num_obj_max_total, int flags,
+			psmi_memtype_t statstype,
+			non_empty_callback_fn_t cb, void *context)
+{
+	mpool_t mp;
+	int s;
+	size_t hdr_size;
+
+	if (!PSMI_POWEROFTWO(num_obj_per_chunk) ||
+	    !PSMI_POWEROFTWO(num_obj_max_total) ||
+	    num_obj_max_total < num_obj_per_chunk) {
+		return NULL;
+	}
+
+	mp = psmi_calloc(PSMI_EP_NONE, statstype, 1, sizeof(struct mpool));
+	if (mp == NULL) {
+		fprintf(stderr,
+			"Failed to allocate memory for memory pool: %s\n",
+			strerror(errno));
+		return NULL;
+	}
+
+	for (s = 1; s < num_obj_per_chunk; s <<= 1)
+		mp->mp_vector_shift++;
+
+	mp->mp_flags = flags;
+	mp->mp_num_obj_per_chunk = num_obj_per_chunk;
+	mp->mp_num_obj_max_total = num_obj_max_total;
+	mp->mp_non_empty_cb = cb;
+	mp->mp_non_empty_cb_context = context;
+
+	mp->mp_memtype = statstype;
+
+	SLIST_INIT(&mp->mp_head);
+	mp->mp_elm_vector_size = num_obj_max_total / num_obj_per_chunk;
+	mp->mp_elm_vector =
+	    psmi_calloc(PSMI_EP_NONE, statstype, mp->mp_elm_vector_size,
+			sizeof(struct mpool_element *));
+	if (mp->mp_elm_vector == NULL) {
+		fprintf(stderr,
+			"Failed to allocate memory for memory pool vector: "
+			"%s\n", strerror(errno));
+		psmi_free(mp);
+		return NULL;
+	}
+
+	mp->mp_elm_vector_free = mp->mp_elm_vector;
+
+	if (flags & PSMI_MPOOL_ALIGN) {
+		/* User wants its block to start on a PSMI_MPOOL_ALIGNMENT
+		 * boundary.
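+		 *
+		 * As a worked example (assuming a release build, where
+		 * sizeof(struct mpool_element) is 16, and the current
+		 * PSMI_MPOOL_ALIGNMENT of 64): hdr_size rounds up to 64,
+		 * mp_elm_offset becomes 64 - 16 = 48, so each element
+		 * header ends exactly on a 64-byte boundary relative to
+		 * the chunk base, and the object handed out by
+		 * psmi_mpool_get() starts on one.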
+		 */
+		hdr_size = PSMI_ALIGNUP(sizeof(struct mpool_element),
+					PSMI_MPOOL_ALIGNMENT);
+		mp->mp_obj_size = PSMI_ALIGNUP(obj_size, PSMI_MPOOL_ALIGNMENT);
+		mp->mp_elm_size = hdr_size + mp->mp_obj_size;
+
+		mp->mp_elm_offset = hdr_size - sizeof(struct mpool_element);
+	} else {
+		hdr_size = sizeof(struct mpool_element);
+		mp->mp_obj_size = PSMI_ALIGNUP(obj_size, 8);
+		mp->mp_elm_size = hdr_size + mp->mp_obj_size;
+		mp->mp_elm_offset = 0;
+	}
+
+	return mp;
+}
+
+mpool_t
+MOCKABLE(psmi_mpool_create)(size_t obj_size, uint32_t num_obj_per_chunk,
+		  uint32_t num_obj_max_total, int flags,
+		  psmi_memtype_t statstype, non_empty_callback_fn_t cb,
+		  void *context)
+{
+	mpool_t mp;
+
+	mp = psmi_mpool_create_inner(obj_size, num_obj_per_chunk,
+				     num_obj_max_total, flags, statstype,
+				     cb, context);
+
+	if (mp == NULL)
+		return NULL;
+
+	if (psmi_mpool_allocate_chunk(mp) != PSM2_OK) {
+		psmi_mpool_destroy(mp);
+		return NULL;
+	}
+
+	return mp;
+}
+MOCK_DEF_EPILOGUE(psmi_mpool_create);
+
+#ifdef PSM_CUDA
+mpool_t
+psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk,
+			   uint32_t num_obj_max_total, int flags,
+			   psmi_memtype_t statstype,
+			   non_empty_callback_fn_t cb, void *context,
+			   alloc_dealloc_callback_fn_t ad_cb, void *ad_context)
+{
+	mpool_t mp;
+
+	mp = psmi_mpool_create_inner(obj_size, num_obj_per_chunk,
+				     num_obj_max_total, flags, statstype,
+				     cb, context);
+
+	if (mp == NULL)
+		return NULL;
+
+	mp->mp_alloc_dealloc_cb = ad_cb;
+	mp->mp_alloc_dealloc_cb_context = ad_context;
+
+	if (psmi_mpool_allocate_chunk(mp) != PSM2_OK) {
+		psmi_mpool_destroy(mp);
+		return NULL;
+	}
+
+	return mp;
+}
+#endif
+
+/**
+ * psmi_mpool_get()
+ *
+ * <mp>	    memory pool
+ *
+ * Requests an object from the memory pool.
+ *
+ * Returns NULL if the maximum number of objects has been allocated (refer to
+ * <num_obj_max_total> in psmi_mpool_create) or if running out of memory.
+ */
+void *psmi_mpool_get(mpool_t mp)
+{
+	struct mpool_element *me;
+	void *obj;
+
+	if (SLIST_EMPTY(&mp->mp_head)) {
+		if (psmi_mpool_allocate_chunk(mp) != PSM2_OK)
+			return NULL;
+	}
+
+	me = SLIST_FIRST(&mp->mp_head);
+	SLIST_REMOVE_HEAD(&mp->mp_head, me_next);
+
+	psmi_assert(!me->me_isused);
+	me_mark_used(me);
+
+	/* store a backpointer to the memory pool */
+	me->me_mpool = mp;
+	mp->mp_num_obj_inuse++;
+	psmi_assert(mp->mp_num_obj_inuse <= mp->mp_num_obj);
+
+	obj = (void *)((uintptr_t) me + sizeof(struct mpool_element));
+
+	return obj;
+}
+
+/**
+ * psmi_mpool_put()
+ *
+ * <obj>    object to return to the memory pool
+ *
+ * Returns an <obj> to the memory pool subsystem. This object will be re-used
+ * to fulfill new psmi_mpool_get() requests.
+ */
+void psmi_mpool_put(void *obj)
+{
+	struct mpool_element *me;
+	int was_empty;
+	mpool_t mp;
+
+	me = (struct mpool_element *)
+	    ((uintptr_t) obj - sizeof(struct mpool_element));
+	me->me_gen_count++;
+
+	mp = me->me_mpool;
+
+	psmi_assert(mp != NULL);
+	psmi_assert(mp->mp_num_obj_inuse >= 0);
+	psmi_assert(me->me_isused);
+	me_mark_unused(me);
+
+	was_empty = mp->mp_num_obj_inuse == mp->mp_num_obj_max_total;
+	SLIST_INSERT_HEAD(&mp->mp_head, me, me_next);
+
+	mp->mp_num_obj_inuse--;
+
+	/* tell the user that memory is available */
+	if (mp->mp_non_empty_cb && was_empty)
+		mp->mp_non_empty_cb(mp->mp_non_empty_cb_context);
+}
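+
+/*
+ * A minimal usage sketch of the pool API above (hypothetical caller;
+ * 'struct my_req' is illustrative, and DESCRIPTORS is assumed to be a
+ * valid psmi_memtype_t stats bucket in this build):
+ *
+ *	mpool_t pool = psmi_mpool_create(sizeof(struct my_req), 64, 4096,
+ *					 PSMI_MPOOL_ALIGN, DESCRIPTORS,
+ *					 NULL, NULL);
+ *	struct my_req *r = (struct my_req *) psmi_mpool_get(pool);
+ *	if (r != NULL) {
+ *		... use r, then hand it back ...
+ *		psmi_mpool_put(r);
+ *	}
+ *	psmi_mpool_destroy(pool);
+ */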
+/**
+ * psmi_mpool_get_obj_index()
+ *
+ * <obj>    object in the memory pool
+ *
+ * Returns the index of the <obj> in the memory pool.
+ */
+
+int psmi_mpool_get_obj_index(void *obj)
+{
+	struct mpool_element *me = (struct mpool_element *)
+	    ((uintptr_t) obj - sizeof(struct mpool_element));
+
+	return me->me_index;
+}
+
+/**
+ * psmi_mpool_get_obj_gen_count()
+ *
+ * <obj>    object in the memory pool
+ *
+ * Returns the generation count of the <obj>.
+ */
+uint32_t psmi_mpool_get_obj_gen_count(void *obj)
+{
+	struct mpool_element *me = (struct mpool_element *)
+	    ((uintptr_t) obj - sizeof(struct mpool_element));
+
+	return me->me_gen_count;
+}
+
+/**
+ * psmi_mpool_get_obj_index_gen_count()
+ *
+ * <obj>    object in the memory pool
+ *
+ * Returns the index of the <obj> in <index>.
+ * Returns the generation count of the <obj> in <gen_count>.
+ */
+int
+psmi_mpool_get_obj_index_gen_count(void *obj, uint32_t *index,
+				   uint32_t *gen_count)
+{
+	struct mpool_element *me = (struct mpool_element *)
+	    ((uintptr_t) obj - sizeof(struct mpool_element));
+
+	*index = me->me_index;
+	*gen_count = me->me_gen_count;
+	return 0;
+}
+
+/**
+ * psmi_mpool_find_obj_by_index()
+ *
+ * <mp>	    memory pool
+ * <index>  index of the object
+ *
+ * Returns the object located at <index> in the memory pool or NULL if the
+ * <index> is invalid.
+ */
+void *psmi_mpool_find_obj_by_index(mpool_t mp, int index)
+{
+	struct mpool_element *me;
+
+	if_pf(index < 0 || index >= mp->mp_num_obj)
+	    return NULL;
+
+	me = (struct mpool_element *)
+	    ((uintptr_t) mp->mp_elm_vector[index >> mp->mp_vector_shift] +
+	     (index & (mp->mp_num_obj_per_chunk - 1)) * mp->mp_elm_size +
+	     mp->mp_elm_offset);
+
+	/* If this mpool doesn't require generation counts, it's illegal to find a
+	 * freed object */
+#ifdef PSM_DEBUG
+	if (mp->mp_flags & PSMI_MPOOL_NOGENERATION)
+		psmi_assert(!me->me_isused);
+#endif
+
+	return (void *)((uintptr_t) me + sizeof(struct mpool_element));
+}
+
+#ifdef PSM_CUDA
+/**
+ * psmi_mpool_chunk_dealloc()
+ * <mp>	    memory pool
+ * <idx>    index
+ * Calls the dealloc function on each element in the chunk.
+ */
+void psmi_mpool_chunk_dealloc(mpool_t mp, int idx)
+{
+	int j;
+	for (j = 0; j < mp->mp_num_obj_per_chunk; j++)
+		mp->mp_alloc_dealloc_cb(0 /* is not alloc */,
+					mp->mp_alloc_dealloc_cb_context,
+					((void *) mp->mp_elm_vector[idx]) +
+					j * mp->mp_elm_size +
+					sizeof(struct mpool_element));
+}
+#endif
+/**
+ * psmi_mpool_destroy()
+ *
+ * <mp>	    memory pool
+ *
+ * Destroy a previously allocated memory pool and reclaim its associated
+ * memory. The behavior is undefined if some objects have not been returned
+ * to the memory pool with psmi_mpool_put().
+ */
+void psmi_mpool_destroy(mpool_t mp)
+{
+	int i = 0;
+	size_t nbytes = mp->mp_num_obj * mp->mp_elm_size;
+
+	for (i = 0; i < mp->mp_elm_vector_size; i++) {
+		if (mp->mp_elm_vector[i]) {
+#ifdef PSM_CUDA
+			if (mp->mp_alloc_dealloc_cb)
+				psmi_mpool_chunk_dealloc(mp, i);
+#endif
+			psmi_free(mp->mp_elm_vector[i]);
+		}
+	}
+	psmi_free(mp->mp_elm_vector);
+	nbytes += mp->mp_elm_vector_size * sizeof(struct mpool_element *);
+	psmi_free(mp);
+	nbytes += sizeof(struct mpool);
+}
+
+/**
+ * psmi_mpool_get_obj_info()
+ *
+ * <mp>	    memory pool
+ *
+ * Returns the num-obj-per-chunk in <num_obj_per_chunk>.
+ * Returns the num-obj-max-total in <num_obj_max_total>.
+ */
+void
+MOCKABLE(psmi_mpool_get_obj_info)(mpool_t mp, uint32_t *num_obj_per_chunk,
+				  uint32_t *num_obj_max_total)
+{
+	*num_obj_per_chunk = mp->mp_num_obj_per_chunk;
+	*num_obj_max_total = mp->mp_num_obj_max_total;
+	return;
+}
+MOCK_DEF_EPILOGUE(psmi_mpool_get_obj_info);
+
+static int psmi_mpool_allocate_chunk(mpool_t mp)
+{
+	struct mpool_element *elm;
+	void *chunk;
+	uint32_t i = 0, num_to_allocate;
+
+	num_to_allocate =
+	    mp->mp_num_obj + mp->mp_num_obj_per_chunk >
+	    mp->mp_num_obj_max_total ?
0 : mp->mp_num_obj_per_chunk; + + psmi_assert(mp->mp_num_obj + num_to_allocate <= + mp->mp_num_obj_max_total); + + if (num_to_allocate == 0) + return PSM2_NO_MEMORY; + +#ifdef PSM_CUDA + if (mp->mp_alloc_dealloc_cb) + chunk = psmi_calloc(PSMI_EP_NONE, mp->mp_memtype, + num_to_allocate, mp->mp_elm_size); + else + chunk = psmi_malloc(PSMI_EP_NONE, mp->mp_memtype, + num_to_allocate * mp->mp_elm_size); +#else + chunk = psmi_malloc(PSMI_EP_NONE, mp->mp_memtype, + num_to_allocate * mp->mp_elm_size); +#endif + if (chunk == NULL) { + fprintf(stderr, + "Failed to allocate memory for memory pool chunk: %s\n", + strerror(errno)); + return PSM2_NO_MEMORY; + } + + for (i = 0; i < num_to_allocate; i++) { +#ifdef PSM_CUDA + if (mp->mp_alloc_dealloc_cb) + mp->mp_alloc_dealloc_cb(1 /* is alloc */, + mp->mp_alloc_dealloc_cb_context, + chunk + i * mp->mp_elm_size + + sizeof(struct mpool_element)); +#endif + elm = (struct mpool_element *)((uintptr_t) chunk + + i * mp->mp_elm_size + + mp->mp_elm_offset); + elm->me_gen_count = 0; + elm->me_index = mp->mp_num_obj + i; +#ifdef PSM_DEBUG + elm->me_isused = 0; +#endif + SLIST_INSERT_HEAD(&mp->mp_head, elm, me_next); +#if 0 + fprintf(stderr, "chunk%ld i=%d elm=%p user=%p next=%p\n", + (long)(mp->mp_elm_vector_free - mp->mp_elm_vector), + (int)i, elm, + (void *)((uintptr_t) elm + + sizeof(struct mpool_element)), SLIST_NEXT(elm, + me_next)); +#endif + } + + psmi_assert((uintptr_t) mp->mp_elm_vector_free + < ((uintptr_t) mp->mp_elm_vector) + mp->mp_elm_vector_size + * sizeof(struct mpool_element *)); + + mp->mp_elm_vector_free[0] = chunk; + mp->mp_elm_vector_free++; + mp->mp_num_obj += num_to_allocate; + + return PSM2_OK; +} + +#if 0 +void psmi_mpool_dump(mpool_t mp) +{ + int i, j; + struct mpool_element *me; + + fprintf(stderr, "Memory pool %p has %d elements per chunk.\n", + mp, mp->mp_num_obj_per_chunk); + for (i = 0; i < mp->mp_elm_vector_size; i++) { + if (mp->mp_elm_vector[i] != NULL) { + fprintf(stderr, "===========================\n"); + fprintf(stderr, "mpool chunk #%d\n", i); + + for (j = 0, me = mp->mp_elm_vector[i]; + j < mp->mp_num_obj_per_chunk; + j++, me = (struct mpool_element *) + ((uintptr_t) me + mp->mp_elm_size)) { + fprintf(stderr, + "obj=%p index=%d gen_count=%d\n", + (void *)((uintptr_t) me + + sizeof(struct mpool_element)), + me->me_index, me->me_gen_count); + } + fprintf(stderr, "===========================\n"); + } + } +} +#endif diff --git a/psm_mpool.h b/psm_mpool.h new file mode 100644 index 0000000..8098f60 --- /dev/null +++ b/psm_mpool.h @@ -0,0 +1,107 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ + +#ifndef _PSMI_IN_USER_H +#error psm_mpool.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef PSM_MPOOL_H +#define PSM_MPOOL_H + +/* mpool flags */ +#define PSMI_MPOOL_ALIGN_CACHE 0x1 +#define PSMI_MPOOL_ALIGN_PAGE 0x2 +#define PSMI_MPOOL_NOGENERATION 0x4 + +/* Backwards compatibility */ +#define PSMI_MPOOL_ALIGN PSMI_MPOOL_ALIGN_CACHE + +typedef struct mpool *mpool_t; +typedef void (*non_empty_callback_fn_t) (void *context); +typedef void (*alloc_dealloc_callback_fn_t) (int is_alloc, void *context, + void *chunk); + +mpool_t +MOCKABLE(psmi_mpool_create)(size_t obj_size, uint32_t num_obj_per_chunk, + uint32_t num_obj_max_total, int flags, + psmi_memtype_t statstype, + non_empty_callback_fn_t cb, void *context); +MOCK_DCL_EPILOGUE(psmi_mpool_create); + +mpool_t psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk, + uint32_t num_obj_max_total, int flags, + psmi_memtype_t statstype, + non_empty_callback_fn_t cb, void *context, + alloc_dealloc_callback_fn_t ad_cb, + void *ad_context); + +void psmi_mpool_destroy(mpool_t mp); + +void +MOCKABLE(psmi_mpool_get_obj_info)(mpool_t mp, uint32_t *num_obj_per_chunk, + uint32_t *num_obj_max_total); +MOCK_DCL_EPILOGUE(psmi_mpool_get_obj_info); + +void *psmi_mpool_get(mpool_t mp); +void psmi_mpool_put(void *obj); + +int psmi_mpool_get_obj_index(void *obj); +uint32_t psmi_mpool_get_obj_gen_count(void *obj); +int psmi_mpool_get_obj_index_gen_count(void *obj, + uint32_t *index, uint32_t *gen_count); + +void *psmi_mpool_find_obj_by_index(mpool_t mp, int index); + +#endif diff --git a/psm_mq.c b/psm_mq.c new file mode 100644 index 0000000..f41c134 --- /dev/null +++ b/psm_mq.c @@ -0,0 +1,1635 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ + +#include + +#include "psm_user.h" +#include "psm2_hal.h" +#include "psm_mq_internal.h" + +#ifdef PSM_CUDA +#include "psm_gdrcpy.h" +#endif + +/* + * Functions to manipulate the expected queue in mq_ep. + */ + +/* + * Once the linked lists cross the size limit, this function will enable tag + * hashing and disable the non-hashing fastpath. We need to go back and insert + * reqs into the hash tables where the hashing searches will look for them. + */ +void +psmi_mq_fastpath_disable(psm2_mq_t mq) +{ + psm2_mq_req_t *curp, cur; + struct mqq *qp; + unsigned hashvals[NUM_HASH_CONFIGS]; + int t = PSM2_ANYTAG_ANYSRC; + + mq->nohash_fastpath = 0; + /* Everything in the unexpected_q needs to be duplicated into + each of the (three) unexpected hash tables. */ + qp = &mq->unexpected_q; + for (curp = &qp->first; (cur = *curp) != NULL; curp = &cur->next[t]) { + mq->unexpected_hash_len++; + hashvals[PSM2_TAG_SRC] = + hash_64(*(uint64_t *) cur->req_data.tag.tag) % NUM_HASH_BUCKETS; + hashvals[PSM2_TAG_ANYSRC] = + hash_32(cur->req_data.tag.tag[0]) % NUM_HASH_BUCKETS; + hashvals[PSM2_ANYTAG_SRC] = + hash_32(cur->req_data.tag.tag[1]) % NUM_HASH_BUCKETS; + for (t = PSM2_TAG_SRC; t < PSM2_ANYTAG_ANYSRC; t++) + mq_qq_append_which(mq->unexpected_htab, + t, hashvals[t], cur); + } + + /* Everything in the expected_q needs to be moved into the + (single) correct expected hash table. 
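+	   For example, a req whose tagsel has tag[0] == tag[1] == 0xFFFFFFFF
+	   (the full 64-bit tag is significant) moves to the PSM2_TAG_SRC
+	   table, hashed on the whole 64-bit tag; only tag[0] fully selected
+	   moves to PSM2_TAG_ANYSRC, hashed on tag[0]; only tag[1] fully
+	   selected moves to PSM2_ANYTAG_SRC, hashed on tag[1]; anything
+	   else stays on the ANY/ANY list.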
*/ + qp = &mq->expected_q; + for (curp = &qp->first; (cur = *curp) != NULL; /*curp = &cur->next*/) { + /* must read next ptr before remove */ + curp = &cur->next[PSM2_ANYTAG_ANYSRC]; + if ((cur->req_data.tagsel.tag[0] == 0xFFFFFFFF) && + (cur->req_data.tagsel.tag[1] == 0xFFFFFFFF)) { + /* hash tag0 and tag1 */ + t = PSM2_TAG_SRC; + hashvals[t] = hash_64(*(uint64_t *) cur->req_data.tag.tag) % NUM_HASH_BUCKETS; + mq_qq_append_which(mq->expected_htab, + t, hashvals[t], cur); + } else if (cur->req_data.tagsel.tag[0] == 0xFFFFFFFF) { + t = PSM2_TAG_ANYSRC; + hashvals[t] = hash_32(cur->req_data.tag.tag[0]) % NUM_HASH_BUCKETS; + mq_qq_append_which(mq->expected_htab, + t, hashvals[t], cur); + } else if (cur->req_data.tagsel.tag[1] == 0xFFFFFFFF) { + t = PSM2_ANYTAG_SRC; + hashvals[t] = hash_32(cur->req_data.tag.tag[1]) % NUM_HASH_BUCKETS; + mq_qq_append_which(mq->expected_htab, + t, hashvals[t], cur); + } else + continue; /* else, req must stay in ANY ANY */ + + mq->expected_list_len--; + mq->expected_hash_len++; + mq_qq_remove_which(cur, PSM2_ANYTAG_ANYSRC); + } +} + +/* easy threshold to re-enable: if |hash| == 0 && |list| < X + aggressive threshold: if |hash| + |list| < X + even easier: if |hash| + |list| == 0 + might be better approach to avoid constant bouncing between modes */ +void psmi_mq_fastpath_try_reenable(psm2_mq_t mq) +{ + if_pf(mq->nohash_fastpath == 0 && + mq->unexpected_hash_len == 0 && + mq->expected_hash_len == 0 && + mq->unexpected_list_len == 0 && + mq->expected_list_len == 0){ + mq->nohash_fastpath = 1; + } +} + +/* + * ! @brief PSM exposed version to allow PTLs to match + */ + +/*! @brief Try to match against the MQ using a tag and tagsel + * + * @param[in] mq Message Queue + * @param[in] src Source (sender) epaddr, may be PSM2_MQ_ANY_ADDR. + * @param[in] tag Input Tag + * @param[in] tagsel Input Tag Selector + * @param[in] remove Non-zero to remove the req from the queue + * + * @returns NULL if no match or an mq request if there is a match + */ +static +psm2_mq_req_t +mq_req_match_with_tagsel(psm2_mq_t mq, psm2_epaddr_t src, + psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel, int remove) +{ + psm2_mq_req_t *curp; + psm2_mq_req_t cur; + unsigned hashval; + int i, j = 0; + struct mqq *qp; + + if_pt (mq->nohash_fastpath) { + i = j = PSM2_ANYTAG_ANYSRC; + qp = &mq->unexpected_q; + } else if ((tagsel->tag[0] == 0xFFFFFFFF) && + (tagsel->tag[1] == 0xFFFFFFFF)) { + i = PSM2_TAG_SRC; + hashval = hash_64(*(uint64_t *) tag->tag) % NUM_HASH_BUCKETS; + qp = &mq->unexpected_htab[i][hashval]; + } else if (tagsel->tag[0] == 0xFFFFFFFF) { + i = PSM2_TAG_ANYSRC; + hashval = hash_32(tag->tag[0]) % NUM_HASH_BUCKETS; + qp = &mq->unexpected_htab[i][hashval]; + } else if (tagsel->tag[1] == 0xFFFFFFFF) { + i = PSM2_ANYTAG_SRC; + hashval = hash_32(tag->tag[1]) % NUM_HASH_BUCKETS; + qp = &mq->unexpected_htab[i][hashval]; + } else { + /* unhashable tag */ + i = PSM2_ANYTAG_ANYSRC; + qp = &mq->unexpected_q; + } + + for (curp = &qp->first; (cur = *curp) != NULL; curp = &cur->next[i]) { + psmi_assert(cur->req_data.peer != PSM2_MQ_ANY_ADDR); + if ((src == PSM2_MQ_ANY_ADDR || src == cur->req_data.peer) && + !((tag->tag[0] ^ cur->req_data.tag.tag[0]) & tagsel->tag[0]) && + !((tag->tag[1] ^ cur->req_data.tag.tag[1]) & tagsel->tag[1]) && + !((tag->tag[2] ^ cur->req_data.tag.tag[2]) & tagsel->tag[2])) { + /* match! 
*/ + if (remove) { + if_pt (i == PSM2_ANYTAG_ANYSRC) + mq->unexpected_list_len--; + else + mq->unexpected_hash_len--; + for (; j < NUM_MQ_SUBLISTS; j++) + mq_qq_remove_which(cur, j); + psmi_mq_fastpath_try_reenable(mq); + } + return cur; + } + } + return NULL; +} + +static void mq_add_to_expected_hashes(psm2_mq_t mq, psm2_mq_req_t req) +{ + unsigned hashval; + int i; + + req->timestamp = mq->timestamp++; + if_pt (mq->nohash_fastpath) { + mq_qq_append(&mq->expected_q, req); + req->q[PSM2_ANYTAG_ANYSRC] = &mq->expected_q; + mq->expected_list_len++; + if_pf (mq->expected_list_len >= HASH_THRESHOLD) + psmi_mq_fastpath_disable(mq); + } else if ((req->req_data.tagsel.tag[0] == 0xFFFFFFFF) && + (req->req_data.tagsel.tag[1] == 0xFFFFFFFF)) { + i = PSM2_TAG_SRC; + hashval = hash_64(*(uint64_t *) req->req_data.tag.tag) % NUM_HASH_BUCKETS; + mq_qq_append_which(mq->expected_htab, i, hashval, req); + mq->expected_hash_len++; + } else if (req->req_data.tagsel.tag[0] == 0xFFFFFFFF) { + i = PSM2_TAG_ANYSRC; + hashval = hash_32(req->req_data.tag.tag[0]) % NUM_HASH_BUCKETS; + mq_qq_append_which(mq->expected_htab, i, hashval, req); + mq->expected_hash_len++; + } else if (req->req_data.tagsel.tag[1] == 0xFFFFFFFF) { + i = PSM2_ANYTAG_SRC; + hashval = hash_32(req->req_data.tag.tag[1]) % NUM_HASH_BUCKETS; + mq_qq_append_which(mq->expected_htab, i, hashval, req); + mq->expected_hash_len++; + } else { + mq_qq_append(&mq->expected_q, req); + req->q[PSM2_ANYTAG_ANYSRC] = &mq->expected_q; + mq->expected_list_len++; + } +} + +/*! @brief Try to remove the req in the MQ + * + * @param[in] mq Message Queue + * @param[in] req MQ request + * + * @returns 1 if successfully removed, or 0 if req cannot be found. + */ +static +int mq_req_remove_single(psm2_mq_t mq, psm2_mq_req_t req) +{ + int i; + + /* item should only exist in one expected queue at a time */ + psmi_assert((!!req->q[0] + !!req->q[1] + !!req->q[2] + !!req->q[3]) == 1); + + for (i = 0; i < NUM_MQ_SUBLISTS; i++) + if (req->q[i]) /* found */ + break; + switch (i) { + case PSM2_ANYTAG_ANYSRC: + mq->expected_list_len--; + break; + case PSM2_TAG_SRC: + case PSM2_TAG_ANYSRC: + case PSM2_ANYTAG_SRC: + mq->expected_hash_len--; + break; + default: + return 0; + } + + mq_qq_remove_which(req, i); + psmi_mq_fastpath_try_reenable(mq); + return 1; +} + +PSMI_ALWAYS_INLINE( +psm2_mq_req_t +psmi_mq_iprobe_inner(psm2_mq_t mq, psm2_epaddr_t src, + psm2_mq_tag_t *tag, + psm2_mq_tag_t *tagsel, int remove_req)) +{ + psm2_mq_req_t req; + + PSMI_LOCK(mq->progress_lock); + req = mq_req_match_with_tagsel(mq, src, tag, tagsel, remove_req); + + if (req != NULL) { + PSMI_UNLOCK(mq->progress_lock); + return req; + } + + psmi_poll_internal(mq->ep, 1); + /* try again */ + req = mq_req_match_with_tagsel(mq, src, tag, tagsel, remove_req); + + PSMI_UNLOCK(mq->progress_lock); + return req; +} + +psm2_error_t +__psm2_mq_iprobe2(psm2_mq_t mq, psm2_epaddr_t src, + psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel, + psm2_mq_status2_t *status) +{ + psm2_mq_req_t req; + + PSM2_LOG_MSG("entering"); + PSMI_ASSERT_INITIALIZED(); + + req = psmi_mq_iprobe_inner(mq, src, tag, tagsel, 0); + psmi_assert_req_not_internal(req); + + if (req != NULL) { + if (status != NULL) { + mq_status2_copy(req, status); + } + PSM2_LOG_MSG("leaving"); + return PSM2_OK; + } + PSM2_LOG_MSG("leaving"); + return PSM2_MQ_NO_COMPLETIONS; +} +PSMI_API_DECL(psm2_mq_iprobe2) + +psm2_error_t +__psm2_mq_iprobe(psm2_mq_t mq, uint64_t tag, uint64_t tagsel, + psm2_mq_status_t *status) +{ + psm2_mq_tag_t rtag; + psm2_mq_tag_t rtagsel; + 
psm2_mq_req_t req;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ASSERT_INITIALIZED();
+
+ *(uint64_t *) rtag.tag = tag;
+#ifdef PSM_DEBUG
+ rtag.tag[2] = 0;
+#endif
+ *(uint64_t *) rtagsel.tag = tagsel;
+ rtagsel.tag[2] = 0;
+
+ req = psmi_mq_iprobe_inner(mq, PSM2_MQ_ANY_ADDR, &rtag, &rtagsel, 0);
+ psmi_assert_req_not_internal(req);
+
+ if (req != NULL) {
+ if (status != NULL) {
+ mq_status_copy(req, status);
+ }
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+ }
+
+ PSM2_LOG_MSG("leaving");
+
+ return PSM2_MQ_NO_COMPLETIONS;
+}
+PSMI_API_DECL(psm2_mq_iprobe)
+
+psm2_error_t
+__psm2_mq_improbe2(psm2_mq_t mq, psm2_epaddr_t src,
+ psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel,
+ psm2_mq_req_t *reqo, psm2_mq_status2_t *status)
+{
+ psm2_mq_req_t req;
+
+ PSM2_LOG_MSG("entering");
+
+ PSMI_ASSERT_INITIALIZED();
+
+ req = psmi_mq_iprobe_inner(mq, src, tag, tagsel, 1);
+ if (req != NULL) {
+ if (status != NULL) {
+ mq_status2_copy(req, status);
+ }
+ *reqo = req;
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+ }
+
+ *reqo = NULL;
+ PSM2_LOG_MSG("leaving");
+ return PSM2_MQ_NO_COMPLETIONS;
+}
+PSMI_API_DECL(psm2_mq_improbe2)
+
+psm2_error_t
+__psm2_mq_improbe(psm2_mq_t mq, uint64_t tag, uint64_t tagsel,
+ psm2_mq_req_t *reqo, psm2_mq_status_t *status)
+{
+ psm2_mq_tag_t rtag;
+ psm2_mq_tag_t rtagsel;
+ psm2_mq_req_t req;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ASSERT_INITIALIZED();
+
+ *(uint64_t *) rtag.tag = tag;
+#ifdef PSM_DEBUG
+ rtag.tag[2] = 0;
+#endif
+ *(uint64_t *) rtagsel.tag = tagsel;
+ rtagsel.tag[2] = 0;
+
+ req = psmi_mq_iprobe_inner(mq, PSM2_MQ_ANY_ADDR, &rtag, &rtagsel, 1);
+ if (req != NULL) {
+ if (status != NULL) {
+ mq_status_copy(req, status);
+ }
+ *reqo = req;
+ PSM2_LOG_MSG("leaving");
+ return PSM2_OK;
+ }
+
+ *reqo = NULL;
+ PSM2_LOG_MSG("leaving");
+ return PSM2_MQ_NO_COMPLETIONS;
+}
+PSMI_API_DECL(psm2_mq_improbe)
+
+psm2_error_t __psm2_mq_cancel(psm2_mq_req_t *ireq)
+{
+ psm2_mq_req_t req = *ireq;
+ psm2_mq_t mq;
+ psm2_error_t err = PSM2_OK;
+
+ PSM2_LOG_MSG("entering");
+ PSMI_ASSERT_INITIALIZED();
+
+ if (req == NULL) {
+ PSM2_LOG_MSG("leaving");
+ return PSM2_MQ_NO_COMPLETIONS;
+ }
+
+ /* Cancellation is only supported for receive requests that are still
+ * posted (i.e. not yet matched); such a request is completed in place
+ * with no data. Send requests cannot be cancelled and are rejected
+ * below, so callers should treat sends as always unsuccessfully
+ * cancelled.
+ */
+ mq = req->mq;
+ PSMI_LOCK(mq->progress_lock);
+
+ if (MQE_TYPE_IS_RECV(req->type)) {
+ if (req->state == MQ_STATE_POSTED) {
+ int rc;
+
+ rc = mq_req_remove_single(mq, req);
+ psmi_assert_always(rc);
+ req->state = MQ_STATE_COMPLETE;
+ mq_qq_append(&mq->completed_q, req);
+ err = PSM2_OK;
+ } else
+ err = PSM2_MQ_NO_COMPLETIONS;
+ } else {
+ err = psmi_handle_error(mq->ep, PSM2_PARAM_ERR,
+ "Cannot cancel send requests (req=%p)",
+ req);
+ }
+
+ PSMI_UNLOCK(mq->progress_lock);
+
+ PSM2_LOG_MSG("leaving");
+
+ return err;
+}
+PSMI_API_DECL(psm2_mq_cancel)
+
+/* This is the only PSM function that blocks.
+ * We handle it in a special manner since we don't know what the user's
+ * execution environment is (threads, oversubscribing processes, etc).
+ *
+ * The status argument can be an instance of either type psm2_mq_status_t or
+ * psm2_mq_status2_t. Depending on the type, a corresponding status copy
+ * routine should be passed in.
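+ *
+ * For example (a sketch of the existing v2 path, not additional API),
+ * psm2_mq_wait2() further below boils down to:
+ *
+ *   psm2_mq_status2_t st;
+ *   psmi_mq_wait_inner(&req, &st,
+ *                      (psmi_mq_status_copy_t) mq_status2_copy, 1);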
+ */ +PSMI_ALWAYS_INLINE( +psm2_error_t +psmi_mq_wait_inner(psm2_mq_req_t *ireq, void *status, + psmi_mq_status_copy_t status_copy, + int do_lock)) +{ + psm2_error_t err = PSM2_OK; + + psm2_mq_req_t req = *ireq; + if (req == PSM2_MQ_REQINVALID) { + return PSM2_OK; + } + + if (do_lock) + PSMI_LOCK(req->mq->progress_lock); + + if (req->state != MQ_STATE_COMPLETE) { + psm2_mq_t mq = req->mq; + + /* We'll be waiting on this req, mark it as so */ + req->type |= MQE_TYPE_WAITING; + + _HFI_VDBG("req=%p, buf=%p, len=%d, waiting\n", + req, req->req_data.buf, req->req_data.buf_len); + + if (req->testwait_callback) { + err = req->testwait_callback(ireq); + if (do_lock) + PSMI_UNLOCK(req->mq->progress_lock); + if (status != NULL) { + status_copy(req, status); + } + return err; + } + + PSMI_BLOCKUNTIL(mq->ep, err, req->state == MQ_STATE_COMPLETE); + + if (err > PSM2_OK_NO_PROGRESS) + goto fail_with_lock; + else + err = PSM2_OK; + } + + if(!psmi_is_req_internal(req)) + mq_qq_remove(&req->mq->completed_q, req); + + if (status != NULL) { + status_copy(req, status); + } + + _HFI_VDBG("req=%p complete, buf=%p, len=%d, err=%d\n", + req, req->req_data.buf, req->req_data.buf_len, req->req_data.error_code); + + psmi_mq_req_free(req); + *ireq = PSM2_MQ_REQINVALID; + +fail_with_lock: + if (do_lock) + PSMI_UNLOCK(req->mq->progress_lock); + return err; +} + +psm2_error_t +__psm2_mq_wait2(psm2_mq_req_t *ireq, psm2_mq_status2_t *status) +{ + psm2_error_t rv; + PSM2_LOG_MSG("entering"); + PSMI_ASSERT_INITIALIZED(); + psmi_assert_req_not_internal(*ireq); + + rv = psmi_mq_wait_inner(ireq, status, + (psmi_mq_status_copy_t) mq_status2_copy, 1); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_mq_wait2) + +psm2_error_t +__psm2_mq_wait(psm2_mq_req_t *ireq, psm2_mq_status_t *status) +{ + psm2_error_t rv; + PSM2_LOG_MSG("entering"); + PSMI_ASSERT_INITIALIZED(); + psmi_assert_req_not_internal(*ireq); + + rv = psmi_mq_wait_inner(ireq, status, + (psmi_mq_status_copy_t) mq_status_copy, 1); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_mq_wait) + +psm2_error_t psmi_mq_wait_internal(psm2_mq_req_t *ireq) +{ + return psmi_mq_wait_inner(ireq, NULL, NULL, 0); +} + +/* The status argument can be an instance of either type psm2_mq_status_t or + * psm2_mq_status2_t. Depending on the type, a corresponding status copy + * routine should be passed in. 
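+ *
+ * Unlike wait, test never blocks and never advances the network on its
+ * own, so a caller typically drives progress itself (usage sketch,
+ * assuming req came from an earlier isend/irecv):
+ *
+ *   psm2_mq_req_t peeked;
+ *   while (psm2_mq_test(&req, &status) == PSM2_MQ_NO_COMPLETIONS)
+ *       psm2_mq_ipeek(mq, &peeked, NULL);  // poke the progress engine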
+ */ +PSMI_ALWAYS_INLINE( +psm2_error_t +psmi_mq_test_inner(psm2_mq_req_t *ireq, void *status, + psmi_mq_status_copy_t status_copy)) +{ + psm2_mq_req_t req = *ireq; + psm2_error_t err = PSM2_OK; + + PSMI_ASSERT_INITIALIZED(); + + if (req == PSM2_MQ_REQINVALID) { + return PSM2_OK; + } + + if (req->state != MQ_STATE_COMPLETE) { + if (req->testwait_callback) { + PSMI_LOCK(req->mq->progress_lock); + err = req->testwait_callback(ireq); + if (status != NULL) { + status_copy(req, status); + } + PSMI_UNLOCK(req->mq->progress_lock); + return err; + } else + return PSM2_MQ_NO_COMPLETIONS; + } + + if (status != NULL) + status_copy(req, status); + + _HFI_VDBG + ("req=%p complete, tag=%08x.%08x.%08x buf=%p, len=%d, err=%d\n", + req, req->req_data.tag.tag[0], req->req_data.tag.tag[1], + req->req_data.tag.tag[2], req->req_data.buf, + req->req_data.buf_len, req->req_data.error_code); + + PSMI_LOCK(req->mq->progress_lock); + mq_qq_remove(&req->mq->completed_q, req); + psmi_mq_req_free(req); + PSMI_UNLOCK(req->mq->progress_lock); + + *ireq = PSM2_MQ_REQINVALID; + + return err; +} + +psm2_error_t +__psm2_mq_test2(psm2_mq_req_t *ireq, psm2_mq_status2_t *status) +{ + psm2_error_t rv; + PSM2_LOG_MSG("entering"); + rv = psmi_mq_test_inner(ireq, status, + (psmi_mq_status_copy_t) mq_status2_copy); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_mq_test2) + +psm2_error_t +__psm2_mq_test(psm2_mq_req_t *ireq, psm2_mq_status_t *status) +{ + psm2_error_t rv; + PSM2_LOG_MSG("entering"); + rv = psmi_mq_test_inner(ireq, status, + (psmi_mq_status_copy_t) mq_status_copy); + PSM2_LOG_MSG("leaving"); + return rv; + +} +PSMI_API_DECL(psm2_mq_test) + +psm2_error_t +__psm2_mq_isend2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, + psm2_mq_tag_t *stag, const void *buf, uint32_t len, + void *context, psm2_mq_req_t *req) +{ + psm2_error_t err; + + PSM2_LOG_MSG("entering"); + + PSMI_ASSERT_INITIALIZED(); + psmi_assert(stag != NULL); + + PSMI_LOCK(mq->progress_lock); + err = + dest->ptlctl->mq_isend(mq, dest, flags, PSMI_REQ_FLAG_NORMAL, + stag, buf, len, context, req); + PSMI_UNLOCK(mq->progress_lock); + + psmi_assert(*req != NULL); + psmi_assert_req_not_internal(*req); + + (*req)->req_data.peer = dest; + + PSM2_LOG_MSG("leaving"); + + return err; +} +PSMI_API_DECL(psm2_mq_isend2) + +psm2_error_t +__psm2_mq_isend(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag, + const void *buf, uint32_t len, void *context, psm2_mq_req_t *req) +{ + psm2_error_t err; + psm2_mq_tag_t tag; + + PSM2_LOG_MSG("entering"); + + *((uint64_t *) tag.tag) = stag; + tag.tag[2] = 0; + + PSMI_ASSERT_INITIALIZED(); + + PSMI_LOCK(mq->progress_lock); + err = dest->ptlctl->mq_isend(mq, dest, flags, PSMI_REQ_FLAG_NORMAL, + &tag, buf, len, context, req); + PSMI_UNLOCK(mq->progress_lock); + + psmi_assert(*req != NULL); + psmi_assert_req_not_internal(*req); + + (*req)->req_data.peer = dest; + + PSM2_LOG_MSG("leaving"); + return err; +} +PSMI_API_DECL(psm2_mq_isend) + +psm2_error_t +__psm2_mq_send2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, + psm2_mq_tag_t *stag, const void *buf, uint32_t len) +{ + psm2_error_t err; + + PSM2_LOG_MSG("entering"); + PSMI_ASSERT_INITIALIZED(); + psmi_assert(stag != NULL); + + PSMI_LOCK(mq->progress_lock); + err = dest->ptlctl->mq_send(mq, dest, flags, stag, buf, len); + PSMI_UNLOCK(mq->progress_lock); + PSM2_LOG_MSG("leaving"); + return err; +} +PSMI_API_DECL(psm2_mq_send2) + +psm2_error_t +__psm2_mq_send(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag, + const void *buf, uint32_t len) 
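+/*
+ * Blocking-send usage sketch (illustrative only, with a made-up tag
+ * value): when psm2_mq_send returns, 'buf' is reusable immediately and
+ * no request object is handed back, unlike the isend variants above.
+ *
+ *   psm2_mq_send(mq, dest, 0, 0x1234ULL, buf, len);
+ */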
+{ + psm2_error_t err; + psm2_mq_tag_t tag; + + PSM2_LOG_MSG("entering stag: 0x%" PRIx64, stag); + + *((uint64_t *) tag.tag) = stag; + tag.tag[2] = 0; + + PSMI_ASSERT_INITIALIZED(); + + PSMI_LOCK(mq->progress_lock); + err = dest->ptlctl->mq_send(mq, dest, flags, &tag, buf, len); + PSMI_UNLOCK(mq->progress_lock); + PSM2_LOG_MSG("leaving"); + return err; +} +PSMI_API_DECL(psm2_mq_send) + +/* + * Common subroutine to psm2_mq_irecv2 and psm2_mq_imrecv. This code assumes + * that the provided request has been matched, and begins copying message data + * that has already arrived to the user's buffer. Any remaining data is copied + * by PSM polling until the message is complete. + */ +static psm2_error_t +psm2_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len) +{ + uint32_t copysz; + + PSM2_LOG_MSG("entering"); + psmi_assert(MQE_TYPE_IS_RECV(req->type)); +#ifdef PSM_CUDA + psmi_mtucpy_fn_t psmi_mtucpy_fn; + if (req->is_buf_gpu_mem) + psmi_mtucpy_fn = psmi_mq_mtucpy; + else + psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem; +#endif + + switch (req->state) { + case MQ_STATE_COMPLETE: + if (req->req_data.buf != NULL) { /* 0-byte messages don't alloc a sysbuf */ + copysz = mq_set_msglen(req, len, req->req_data.send_msglen); + void *ubuf = buf; +#ifdef PSM_CUDA + if (PSMI_USE_GDR_COPY(req, len)) { + ubuf = gdr_convert_gpu_to_host_addr(GDR_FD, (unsigned long)buf, + len, 1, + mq->ep->epaddr->proto); + psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem; + } + psmi_mtucpy_fn(ubuf, (const void *)req->req_data.buf, copysz); +#else + psmi_mq_mtucpy(ubuf, (const void *)req->req_data.buf, copysz); +#endif + psmi_mq_sysbuf_free(mq, req->req_data.buf); + } + req->req_data.buf = buf; + req->req_data.buf_len = len; + mq_qq_append(&mq->completed_q, req); + break; + + case MQ_STATE_UNEXP: /* not done yet */ + copysz = mq_set_msglen(req, len, req->req_data.send_msglen); + /* Copy What's been received so far and make sure we don't receive + * any more than copysz. After that, swap system with user buffer + */ + req->recv_msgoff = min(req->recv_msgoff, copysz); + +#ifdef PSM_CUDA + if (PSMI_USE_GDR_COPY(req, req->req_data.send_msglen)) { + buf = gdr_convert_gpu_to_host_addr(GDR_FD, (unsigned long)req->user_gpu_buffer, + req->req_data.send_msglen, 1, + mq->ep->epaddr->proto); + psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem; + } +#endif + + if (req->recv_msgoff) { +#ifdef PSM_CUDA + psmi_mtucpy_fn +#else + psmi_mq_mtucpy +#endif + (buf, (const void *)req->req_data.buf, + req->recv_msgoff); + } + psmi_mq_sysbuf_free(mq, req->req_data.buf); + + req->state = MQ_STATE_MATCHED; + req->req_data.buf = buf; + req->req_data.buf_len = len; + break; + + case MQ_STATE_UNEXP_RV: /* rendez-vous ... */ + copysz = mq_set_msglen(req, len, req->req_data.send_msglen); + /* Copy What's been received so far and make sure we don't receive + * any more than copysz. 
After that, swap system with user buffer + */ + req->recv_msgoff = min(req->recv_msgoff, copysz); + if (req->recv_msgoff) { +#ifdef PSM_CUDA + psmi_mtucpy_fn +#else + psmi_mq_mtucpy +#endif + (buf, (const void *)req->req_data.buf, + req->recv_msgoff); + } + if (req->send_msgoff) { + psmi_mq_sysbuf_free(mq, req->req_data.buf); + } + + req->state = MQ_STATE_MATCHED; + req->req_data.buf = buf; + req->req_data.buf_len = len; + req->rts_callback(req, 0); + break; + + default: + fprintf(stderr, "Unexpected state %d in req %p\n", req->state, + req); + fprintf(stderr, "type=%d, mq=%p, tag=%08x.%08x.%08x\n", + req->type, req->mq, req->req_data.tag.tag[0], req->req_data.tag.tag[1], + req->req_data.tag.tag[2]); + abort(); + } + PSM2_LOG_MSG("leaving"); + return PSM2_OK; +} + +psm2_error_t +__psm2_mq_fp_msg(psm2_ep_t ep, psm2_mq_t mq, psm2_epaddr_t addr, psm2_mq_tag_t *tag, + psm2_mq_tag_t *tagsel, uint32_t flags, void *buf, uint32_t len, + void *context, enum psm2_mq_fp_op fp_type, psm2_mq_req_t *req) +{ + psm2_error_t err = PSM2_OK; + + PSM2_LOG_MSG("entering"); + + PSMI_ASSERT_INITIALIZED(); + + PSMI_LOCK_ASSERT(mq->progress_lock); + + if (fp_type == PSM2_MQ_ISEND_FP) { + psmi_assert(tag != NULL); + err = + addr->ptlctl->mq_isend(mq, addr, flags, PSMI_REQ_FLAG_FASTPATH, + tag, buf, len, context, req); + + psmi_assert(*req != NULL); + psmi_assert_req_not_internal(*req); + + (*req)->req_data.peer = addr; + } else if (fp_type == PSM2_MQ_IRECV_FP) { + psm2_mq_req_t recv_req; + +#ifdef PSM_CUDA + int gpu_mem = 0; + void *gpu_user_buffer = NULL; + /* CUDA documentation dictates the use of SYNC_MEMOPS attribute + * when the buffer pointer received into PSM has been allocated + * by the application. This guarantees the all memory operations + * to this region of memory (used by multiple layers of the stack) + * always synchronize + */ + if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)buf)) { + int trueflag = 1; + PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag, + CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, + (CUdeviceptr)buf); + gpu_mem = 1; + gpu_user_buffer = buf; + } +#endif + + /* First check unexpected Queue and remove req if found */ + recv_req = mq_req_match_with_tagsel(mq, addr, tag, tagsel, REMOVE_ENTRY); + + if (recv_req == NULL) { + /* prepost before arrival, add to expected q */ + recv_req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); + if_pf(recv_req == NULL) { + err = PSM2_NO_MEMORY; + goto recv_ret; + } + + recv_req->req_data.peer = addr; + recv_req->req_data.tag = *tag; + recv_req->req_data.tagsel = *tagsel; + recv_req->state = MQ_STATE_POSTED; + recv_req->req_data.buf = buf; + recv_req->req_data.buf_len = len; + recv_req->req_data.recv_msglen = len; + recv_req->recv_msgoff = 0; + recv_req->req_data.context = context; + +#ifdef PSM_CUDA + recv_req->is_buf_gpu_mem = gpu_mem; + recv_req->user_gpu_buffer = gpu_user_buffer; +#endif + + mq_add_to_expected_hashes(mq, recv_req); + _HFI_VDBG("buf=%p,len=%d,tag=%08x.%08x.%08x " + " tagsel=%08x.%08x.%08x req=%p\n", + buf, len, tag->tag[0], tag->tag[1], tag->tag[2], + tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], recv_req); + } else { + _HFI_VDBG("unexpected buf=%p,len=%d,tag=%08x.%08x.%08x" + " tagsel=%08x.%08x.%08x req=%p\n", buf, len, + tag->tag[0], tag->tag[1], tag->tag[2], + tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], recv_req); + +#ifdef PSM_CUDA + recv_req->is_buf_gpu_mem = gpu_mem; + recv_req->user_gpu_buffer = gpu_user_buffer; +#endif + + recv_req->req_data.context = context; + + psm2_mq_irecv_inner(mq, recv_req, buf, len); + } +recv_ret: + 
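+ /* On the PSM2_NO_MEMORY path above, recv_req is NULL, i.e.
+ * PSM2_MQ_REQINVALID, so the assert below still holds and the caller
+ * sees *req == NULL together with the error code. */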
psmi_assert_req_not_internal(recv_req); + *req = recv_req; + } else { + err = PSM2_PARAM_ERR; + } + + PSM2_LOG_MSG("leaving"); + + return err; +} +PSMI_API_DECL(psm2_mq_fp_msg) + +psm2_error_t +__psm2_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src, + psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel, + uint32_t flags, void *buf, uint32_t len, void *context, + psm2_mq_req_t *reqo) +{ + psm2_error_t err = PSM2_OK; + psm2_mq_req_t req; + +#ifdef PSM_CUDA + int gpu_mem; + /* CUDA documentation dictates the use of SYNC_MEMOPS attribute + * when the buffer pointer received into PSM has been allocated + * by the application. This guarantees the all memory operations + * to this region of memory (used by multiple layers of the stack) + * always synchronize + */ + if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)buf)) { + int trueflag = 1; + PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag, + CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, + (CUdeviceptr)buf); + gpu_mem = 1; + } else + gpu_mem = 0; +#endif + + PSM2_LOG_MSG("entering"); + PSMI_ASSERT_INITIALIZED(); + + PSMI_LOCK(mq->progress_lock); + + /* First check unexpected Queue and remove req if found */ + req = mq_req_match_with_tagsel(mq, src, tag, tagsel, REMOVE_ENTRY); + + if (req == NULL) { + /* prepost before arrival, add to expected q */ + req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); + if_pf(req == NULL) { + err = PSM2_NO_MEMORY; + goto ret; + } + + req->req_data.peer = src; + req->req_data.tag = *tag; + req->req_data.tagsel = *tagsel; + req->state = MQ_STATE_POSTED; + req->req_data.buf = buf; + req->req_data.buf_len = len; + req->req_data.recv_msglen = len; + req->recv_msgoff = 0; + req->req_data.context = context; + +#ifdef PSM_CUDA + req->is_buf_gpu_mem = gpu_mem; + if (gpu_mem) + req->user_gpu_buffer = buf; + else + req->user_gpu_buffer = NULL; +#endif + + mq_add_to_expected_hashes(mq, req); + _HFI_VDBG("buf=%p,len=%d,tag=%08x.%08x.%08x " + " tagsel=%08x.%08x.%08x req=%p\n", + buf, len, tag->tag[0], tag->tag[1], tag->tag[2], + tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], req); + } else { + _HFI_VDBG("unexpected buf=%p,len=%d,tag=%08x.%08x.%08x" + " tagsel=%08x.%08x.%08x req=%p\n", buf, len, + tag->tag[0], tag->tag[1], tag->tag[2], + tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], req); +#ifdef PSM_CUDA + req->is_buf_gpu_mem = gpu_mem; + if (gpu_mem) + req->user_gpu_buffer = buf; + else + req->user_gpu_buffer = NULL; +#endif + + req->req_data.context = context; + + psm2_mq_irecv_inner(mq, req, buf, len); + } + +ret: + PSMI_UNLOCK(mq->progress_lock); + psmi_assert_req_not_internal(req); + *reqo = req; + PSM2_LOG_MSG("leaving"); + + return err; +} +PSMI_API_DECL(psm2_mq_irecv2) + +psm2_error_t +__psm2_mq_irecv(psm2_mq_t mq, uint64_t tag, uint64_t tagsel, uint32_t flags, + void *buf, uint32_t len, void *context, psm2_mq_req_t *reqo) +{ + psm2_error_t rv; + psm2_mq_tag_t rtag; + psm2_mq_tag_t rtagsel; + + *reqo = NULL; + + PSM2_LOG_MSG("entering tag: 0x%" PRIx64, tag); + + *(uint64_t *) rtag.tag = tag; +#ifdef PSM_DEBUG + rtag.tag[2] = 0; +#endif + *(uint64_t *) rtagsel.tag = tagsel; + rtagsel.tag[2] = 0; + rv = __psm2_mq_irecv2(mq, PSM2_MQ_ANY_ADDR, &rtag, &rtagsel, + flags, buf, len, context, reqo); + + psmi_assert_req_not_internal(*reqo); + PSM2_LOG_MSG("leaving"); + + return rv; +} +PSMI_API_DECL(psm2_mq_irecv) + +psm2_error_t +__psm2_mq_imrecv(psm2_mq_t mq, uint32_t flags, void *buf, uint32_t len, + void *context, psm2_mq_req_t *reqo) +{ + psm2_error_t err = PSM2_OK; + psm2_mq_req_t req = *reqo; + + PSM2_LOG_MSG("entering"); + PSMI_ASSERT_INITIALIZED(); + 
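+ /*
+ * Matched-probe pairing (usage sketch, not additional API): a request
+ * returned by psm2_mq_improbe()/improbe2() is consumed here exactly
+ * once and then completed like any other receive:
+ *
+ *   psm2_mq_req_t mreq;
+ *   if (psm2_mq_improbe(mq, tag, tagsel, &mreq, &st) == PSM2_OK) {
+ *       psm2_mq_imrecv(mq, 0, buf, st.msg_length, NULL, &mreq);
+ *       psm2_mq_wait(&mreq, &st);
+ *   }
+ */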
+ if (req == PSM2_MQ_REQINVALID) { + err = psmi_handle_error(mq->ep, PSM2_PARAM_ERR, + "Invalid request (req=%p)", req); + } else { + /* Message is already matched -- begin delivering message data to the + user's buffer. */ + req->req_data.context = context; + +#ifdef PSM_CUDA + /* CUDA documentation dictates the use of SYNC_MEMOPS attribute + * when the buffer pointer received into PSM has been allocated + * by the application. This guarantees the all memory operations + * to this region of memory (used by multiple layers of the stack) + * always synchronize + */ + if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)buf)) { + int trueflag = 1; + PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag, + CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, + (CUdeviceptr)buf); + req->is_buf_gpu_mem = 1; + } else + req->is_buf_gpu_mem = 0; +#endif + + PSMI_LOCK(mq->progress_lock); + psm2_mq_irecv_inner(mq, req, buf, len); + PSMI_UNLOCK(mq->progress_lock); + } + + PSM2_LOG_MSG("leaving"); + + return err; +} +PSMI_API_DECL(psm2_mq_imrecv) + +/* The status argument can be an instance of either type psm2_mq_status_t or + * psm2_mq_status2_t. Depending on the type, a corresponding status copy + * routine should be passed in. + */ +PSMI_ALWAYS_INLINE( +psm2_error_t +psmi_mq_ipeek_inner(psm2_mq_t mq, psm2_mq_req_t *oreq, + void *status, + psmi_mq_status_copy_t status_copy)) +{ + psm2_mq_req_t req; + + PSMI_ASSERT_INITIALIZED(); + + if ((req = mq->completed_q.first) == NULL) { + PSMI_LOCK(mq->progress_lock); + psmi_poll_internal(mq->ep, 1); + if ((req = mq->completed_q.first) == NULL) { + PSMI_UNLOCK(mq->progress_lock); + return PSM2_MQ_NO_COMPLETIONS; + } + PSMI_UNLOCK(mq->progress_lock); + } + /* something in the queue */ + *oreq = req; + if (status != NULL) + status_copy(req, status); + + return PSM2_OK; +} + +psm2_error_t +__psm2_mq_ipeek2(psm2_mq_t mq, psm2_mq_req_t *oreq, psm2_mq_status2_t *status) +{ + psm2_error_t rv; + + *oreq = NULL; + + PSM2_LOG_MSG("entering"); + rv = psmi_mq_ipeek_inner(mq, oreq, status, + (psmi_mq_status_copy_t) mq_status2_copy); + + psmi_assert_req_not_internal(*oreq); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_mq_ipeek2) + +psm2_error_t +__psm2_mq_ipeek(psm2_mq_t mq, psm2_mq_req_t *oreq, psm2_mq_status_t *status) +{ + psm2_error_t rv; + + *oreq = NULL; + PSM2_LOG_MSG("entering"); + rv = psmi_mq_ipeek_inner(mq, oreq, status, + (psmi_mq_status_copy_t) mq_status_copy); + + psmi_assert_req_not_internal(*oreq); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_mq_ipeek) + +psm2_error_t __psm2_mq_ipeek_dequeue_multi(psm2_mq_t mq, void *status_array, + psmi_mq_status_copy_user_t status_copy, int *count) +{ + psm2_mq_req_t req; + int read_count = *count; + int ret = 0; + + PSMI_ASSERT_INITIALIZED(); + + *count = 0; + while (*count < read_count) { + PSMI_LOCK(mq->progress_lock); + + if (mq->completed_q.first == NULL) + psmi_poll_internal(mq->ep, 1); + + if ((req = mq->completed_q.first) == NULL) { + PSMI_UNLOCK(mq->progress_lock); + return PSM2_MQ_NO_COMPLETIONS; + } + + mq_qq_remove(&mq->completed_q, req); + PSMI_UNLOCK(mq->progress_lock); + + ret = status_copy(&req->req_data, status_array, *count); + psm2_mq_req_free(mq, req); + + if (unlikely(ret < 0)) { + *count = ret; + return PSM2_INTERNAL_ERR; + } else if (ret == 0) { + continue; + } + + *count = *count + 1; + + if (ret > 1) + break; + } + return PSM2_OK; +} +PSMI_API_DECL(psm2_mq_ipeek_dequeue_multi) + +psm2_error_t __psm2_mq_ipeek_dequeue(psm2_mq_t mq, psm2_mq_req_t *oreq) +{ + psm2_mq_req_t req; + + 
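+ /*
+ * Completion-draining sketch (illustrative): unlike psm2_mq_ipeek,
+ * this call removes the request from the completed queue, so the
+ * caller owns it and must return it via psm2_mq_req_free():
+ *
+ *   psm2_mq_req_t done;
+ *   while (psm2_mq_ipeek_dequeue(mq, &done) == PSM2_OK) {
+ *       // ... examine the completed request ...
+ *       psm2_mq_req_free(mq, done);
+ *   }
+ */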
PSMI_ASSERT_INITIALIZED(); + PSMI_LOCK(mq->progress_lock); + if (mq->completed_q.first == NULL) + psmi_poll_internal(mq->ep, 1); + if ((req = mq->completed_q.first) == NULL) { + PSMI_UNLOCK(mq->progress_lock); + return PSM2_MQ_NO_COMPLETIONS; + } + mq_qq_remove(&mq->completed_q, req); + PSMI_UNLOCK(mq->progress_lock); + *oreq = req; + return PSM2_OK; +} +PSMI_API_DECL(psm2_mq_ipeek_dequeue) + +psm2_error_t __psm2_mq_req_free(psm2_mq_t mq, psm2_mq_req_t req) +{ + PSMI_ASSERT_INITIALIZED(); + if (req == NULL) + return PSM2_OK; + PSMI_LOCK(mq->progress_lock); + psmi_mq_req_free(req); + PSMI_UNLOCK(mq->progress_lock); + + return PSM2_OK; +} +PSMI_API_DECL(psm2_mq_req_free) + +static +psm2_error_t psmi_mqopt_ctl(psm2_mq_t mq, uint32_t key, void *value, int get) +{ + psm2_error_t err = PSM2_OK; + uint32_t val32; + + switch (key) { + case PSM2_MQ_RNDV_HFI_SZ: + if (get) + *((uint32_t *) value) = mq->hfi_thresh_rv; + else { + val32 = *((uint32_t *) value); + mq->hfi_thresh_rv = val32; + } + _HFI_VDBG("RNDV_HFI_SZ = %d (%s)\n", + mq->hfi_thresh_rv, get ? "GET" : "SET"); + break; + + case PSM2_MQ_RNDV_SHM_SZ: + if (get) + *((uint32_t *) value) = mq->shm_thresh_rv; + else { + val32 = *((uint32_t *) value); + mq->shm_thresh_rv = val32; + } + _HFI_VDBG("RNDV_SHM_SZ = %d (%s)\n", + mq->shm_thresh_rv, get ? "GET" : "SET"); + break; + case PSM2_MQ_MAX_SYSBUF_MBYTES: + /* Deprecated: this option no longer does anything. */ + break; + + default: + err = + psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Unknown option key=%u", key); + break; + } + return err; +} + +psm2_error_t __psm2_mq_getopt(psm2_mq_t mq, int key, void *value) +{ + psm2_error_t rv; + PSM2_LOG_MSG("entering"); + PSMI_ERR_UNLESS_INITIALIZED(mq->ep); + rv = psmi_mqopt_ctl(mq, key, value, 1); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_mq_getopt) + +psm2_error_t __psm2_mq_setopt(psm2_mq_t mq, int key, const void *value) +{ + psm2_error_t rv; + PSM2_LOG_MSG("entering"); + PSMI_ERR_UNLESS_INITIALIZED(mq->ep); + rv = psmi_mqopt_ctl(mq, key, (void *)value, 0); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_mq_setopt) + +#define TAB_SIZE 16 +#define STATS \ + STAT(rx_user_num) \ + STAT(rx_sys_bytes) \ + STAT(rx_sys_num) \ + STAT(tx_num) \ + STAT(tx_eager_num) \ + STAT(tx_eager_bytes) \ + STAT(tx_rndv_num) \ + STAT(tx_rndv_bytes) \ + STAT(tx_shm_num) \ + STAT(rx_shm_num) \ + STAT(rx_sysbuf_num) \ + STAT(rx_sysbuf_bytes) + +static +void +psmi_mq_print_stats(psm2_mq_t mq, FILE *perf_stats_fd) +{ + psm2_mq_stats_t stats; + char msg_buffer[MSG_BUFFER_LEN]; + + psm2_mq_get_stats(mq, &stats); + +#define STAT(x) \ + snprintf(msg_buffer, MSG_BUFFER_LEN, "%*lu",TAB_SIZE, stats.x); \ + fwrite(msg_buffer, sizeof(char), strlen(msg_buffer), perf_stats_fd); + + STATS + +#undef STAT + + fwrite("\n", sizeof(char), 1, perf_stats_fd); +} + + +static +void +*psmi_mq_print_stats_thread(void *_mq) +{ + psm2_mq_t mq = (psm2_mq_t)_mq; + char perf_file_name[MSG_BUFFER_LEN]; + char msg_buffer[MSG_BUFFER_LEN]; + int delta_t = 0; + + snprintf(perf_file_name, MSG_BUFFER_LEN, "./psm2-perf-stat-ep-%" PRIu64 "-pid-%d", + (uint64_t)(mq->ep->epid), + getpid()); + FILE *perf_stats_fd = fopen(perf_file_name, "w+"); + + if (!perf_stats_fd) + { + _HFI_ERROR("Failed to create fd for performance logging\n"); + goto end; + } + +#define STAT(x) \ + snprintf(msg_buffer, MSG_BUFFER_LEN, "%*s",TAB_SIZE, #x);\ + fwrite(msg_buffer, sizeof(char), strlen(msg_buffer), perf_stats_fd); + + STAT(delta_t) + STATS + +#undef STAT + + fwrite("\n", sizeof(char), 1, 
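+ /* (STATS above is an X-macro list: each expansion site defines
+ * STAT(x) to emit either a header cell or a value cell and then
+ * expands STATS once, which keeps the header row and the per-interval
+ * data rows of the stats file aligned automatically.) */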
perf_stats_fd); + + /* Performance stats will be printed every $PSM2_MQ_PRINT_STATS seconds */ + do { + snprintf(msg_buffer, MSG_BUFFER_LEN, "%*d",TAB_SIZE, delta_t); + fwrite(msg_buffer, sizeof(char), strlen(msg_buffer), perf_stats_fd); + psmi_mq_print_stats(mq, perf_stats_fd); + fflush(perf_stats_fd); + usleep(MICRO_SEC * mq->print_stats); + delta_t += mq->print_stats; + } while (mq->mq_perf_data.perf_print_stats); + + fclose(perf_stats_fd); +end: + pthread_exit(NULL); +} + +static +void +psmi_mq_print_stats_init(psm2_mq_t mq) +{ + mq->mq_perf_data.perf_print_stats = 1; + if (pthread_create(&(mq->mq_perf_data.perf_print_thread), NULL, + psmi_mq_print_stats_thread, (void*)mq)) + { + mq->mq_perf_data.perf_print_stats = 0; + _HFI_ERROR("Failed to create logging thread\n"); + } +} + +static +void +psmi_mq_print_stats_finalize(psm2_mq_t mq) +{ + if (mq->mq_perf_data.perf_print_stats) + { + mq->mq_perf_data.perf_print_stats = 0; + pthread_join(mq->mq_perf_data.perf_print_thread, NULL); + } +} + +/* + * This is the API for the user. We actually allocate the MQ much earlier, but + * the user can set options after obtaining an endpoint + */ +psm2_error_t +__psm2_mq_init(psm2_ep_t ep, uint64_t tag_order_mask, + const struct psm2_optkey *opts, int numopts, psm2_mq_t *mqo) +{ + psm2_error_t err = PSM2_OK; + + if (ep == NULL) { + err = PSM2_PARAM_ERR; + goto fail; + } + + psm2_mq_t mq = ep->mq; + int i; + + PSM2_LOG_MSG("entering"); + + PSMI_ERR_UNLESS_INITIALIZED(ep); + + psmi_assert_always(mq != NULL); + psmi_assert_always(mq->ep != NULL); + + /* Process options */ + for (i = 0; err == PSM2_OK && i < numopts; i++) + err = psmi_mqopt_ctl(mq, opts[i].key, opts[i].value, 0); + if (err != PSM2_OK) /* error already handled */ + goto fail; + + /* Initialize the unexpected system buffer allocator */ + psmi_mq_sysbuf_init(mq); + char buf[128]; + psmi_mq_sysbuf_getinfo(mq, buf, sizeof buf); + _HFI_VDBG("%s", buf); + + *mqo = mq; + + if (mq->print_stats > 0) + psmi_mq_print_stats_init(mq); + +fail: + PSM2_LOG_MSG("leaving"); + return err; +} +PSMI_API_DECL(psm2_mq_init) + +psm2_error_t __psm2_mq_finalize(psm2_mq_t mq) +{ + psm2_error_t rv = PSM2_OK; + + PSM2_LOG_MSG("entering"); + + PSMI_ERR_UNLESS_INITIALIZED(mq->ep); + + if (mq->print_stats == -1) + { + mq->print_stats = 1; + psmi_mq_print_stats_init(mq); + } + if (mq->print_stats != 0) + psmi_mq_print_stats_finalize(mq); + + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_mq_finalize) + +void __psm2_mq_get_stats(psm2_mq_t mq, psm2_mq_stats_t *stats) +{ + PSM2_LOG_MSG("entering"); + memcpy(stats, &mq->stats, sizeof(psm2_mq_stats_t)); + PSM2_LOG_MSG("leaving"); +} +PSMI_API_DECL(psm2_mq_get_stats) + +psm2_error_t psmi_mq_malloc(psm2_mq_t *mqo) +{ + psm2_error_t err = PSM2_OK; + + psm2_mq_t mq = + (psm2_mq_t) psmi_calloc(NULL, UNDEFINED, 1, sizeof(struct psm2_mq)); + if (mq == NULL) { + err = psmi_handle_error(NULL, PSM2_NO_MEMORY, + "Couldn't allocate memory for mq endpoint"); + goto fail; + } + + mq->ep = NULL; + /*mq->unexpected_callback = NULL; */ + mq->memmode = psmi_parse_memmode(); + + memset(mq->unexpected_htab, 0, + NUM_HASH_CONFIGS * NUM_HASH_BUCKETS * sizeof(struct mqq)); + memset(mq->expected_htab, 0, + NUM_HASH_CONFIGS * NUM_HASH_BUCKETS * sizeof(struct mqq)); + memset(&mq->expected_q, 0, sizeof(struct mqq)); + memset(&mq->unexpected_q, 0, sizeof(struct mqq)); + memset(&mq->completed_q, 0, sizeof(struct mqq)); + memset(&mq->outoforder_q, 0, sizeof(struct mqq)); + STAILQ_INIT(&mq->eager_q); + + + /* The values are overwritten in 
initialize_defaults, they're just set to + * sensible defaults until then */ + if(psmi_cpu_model == CPUID_MODEL_PHI_GEN2 || psmi_cpu_model == CPUID_MODEL_PHI_GEN2M) + { + mq->hfi_thresh_rv = MQ_HFI_THRESH_RNDV_PHI2; + mq->hfi_base_window_rv = MQ_HFI_WINDOW_RNDV_PHI2; + } else { + mq->hfi_thresh_rv = MQ_HFI_THRESH_RNDV_XEON; + mq->hfi_base_window_rv = MQ_HFI_WINDOW_RNDV_XEON; + } + mq->hfi_thresh_tiny = MQ_HFI_THRESH_TINY; +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED) + mq->hfi_base_window_rv = MQ_HFI_WINDOW_RNDV_CUDA; +#endif + mq->shm_thresh_rv = MQ_SHM_THRESH_RNDV; + + memset(&mq->stats, 0, sizeof(psm2_mq_stats_t)); + err = psmi_mq_req_init(mq); + if (err) + goto fail; + + *mqo = mq; + + return PSM2_OK; +fail: + if (mq != NULL) + psmi_free(mq); + return err; +} + +psm2_error_t psmi_mq_initialize_defaults(psm2_mq_t mq) +{ + union psmi_envvar_val env_hfitiny, env_rvwin, env_hfirv, + env_shmrv, env_stats; + + psmi_getenv("PSM2_MQ_TINY_HFI_THRESH", + "hfi tiny packet switchover (max 8, default 8)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)mq->hfi_thresh_tiny, &env_hfitiny); + mq->hfi_thresh_tiny = min(env_hfitiny.e_uint, 8); + + psmi_getenv("PSM2_MQ_RNDV_HFI_THRESH", + "hfi eager-to-rendezvous switchover", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)mq->hfi_thresh_rv, &env_hfirv); + mq->hfi_thresh_rv = env_hfirv.e_uint; + + psmi_getenv("PSM2_MQ_RNDV_HFI_WINDOW", + "hfi rendezvous window size, max 4M", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)mq->hfi_base_window_rv, &env_rvwin); + mq->hfi_base_window_rv = min(4 * 1024 * 1024, env_rvwin.e_uint); + + /* Re-evaluate this since it may have changed after initializing the shm + * device */ + mq->shm_thresh_rv = psmi_shm_mq_rv_thresh; + psmi_getenv("PSM2_MQ_RNDV_SHM_THRESH", + "shm eager-to-rendezvous switchover", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)mq->shm_thresh_rv, &env_shmrv); + mq->shm_thresh_rv = env_shmrv.e_uint; + + psmi_getenv("PSM2_MQ_PRINT_STATS", + "Prints MQ performance stats every n seconds to file" + "./psm2-perf-stat-ep-[epid]-[pid] when set to -1 stats are " + "printed only once during finalization", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) 0, &env_stats); + mq->print_stats = env_stats.e_uint; + + mq->nohash_fastpath = 1; + return PSM2_OK; +} + +psm2_error_t MOCKABLE(psmi_mq_free)(psm2_mq_t mq) +{ + psmi_mq_req_fini(mq); + psmi_mq_sysbuf_fini(mq); + psmi_free(mq); + return PSM2_OK; +} +MOCK_DEF_EPILOGUE(psmi_mq_free); diff --git a/psm_mq_internal.h b/psm_mq_internal.h new file mode 100644 index 0000000..1a26898 --- /dev/null +++ b/psm_mq_internal.h @@ -0,0 +1,624 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ + +#ifndef MQ_INT_H +#define MQ_INT_H + +/* Ugh. smmintrin.h eventually includes mm_malloc.h, which calls malloc */ +#ifdef malloc +#undef malloc +#endif +#ifdef free +#undef free +#endif +#include +#include "psm_user.h" +#include "psm_sysbuf.h" + +#include "psm2_mock_testing.h" + +#if 0 +typedef psm2_error_t(*psm_mq_unexpected_callback_fn_t) + (psm2_mq_t mq, uint16_t mode, psm2_epaddr_t epaddr, + uint64_t tag, uint32_t send_msglen, const void *payload, + uint32_t paylen); +#endif + +#define MICRO_SEC 1000000 +#define MSG_BUFFER_LEN 100 + +struct psm2_mq_perf_data +{ + pthread_t perf_print_thread; + int perf_print_stats; +}; + +enum psm2_mq_tag_pattern { + PSM2_TAG_SRC = 0, + PSM2_TAG_ANYSRC, + PSM2_ANYTAG_SRC, + PSM2_ANYTAG_ANYSRC, +}; + +struct psm2_mq { + psm2_ep_t ep; /**> ep back pointer */ + mpool_t sreq_pool; + mpool_t rreq_pool; + + struct mqq unexpected_htab[NUM_HASH_CONFIGS][NUM_HASH_BUCKETS]; + struct mqq expected_htab[NUM_HASH_CONFIGS][NUM_HASH_BUCKETS]; + + /* in case the compiler can't figure out how to preserve the hashed values + between mq_req_match() and mq_add_to_unexpected_hashes() ... 
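+ (mq_req_match() computes the three bucket indices into mq->hashvals[]
+ while it searches the expected queues; if the message turns out to be
+ unexpected, mq_add_to_unexpected_hashes() reuses those cached values
+ instead of rehashing the tag.)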
*/ + unsigned hashvals[NUM_HASH_CONFIGS]; + + /*psm_mq_unexpected_callback_fn_t unexpected_callback; */ + struct mqq expected_q; /**> Preposted (expected) queue */ + struct mqq unexpected_q; /**> Unexpected queue */ + struct mqq completed_q; /**> Completed queue */ + + struct mqq outoforder_q; /**> OutofOrder queue */ + STAILQ_HEAD(, psm2_mq_req) eager_q; /**> eager request queue */ + + uint32_t hfi_thresh_tiny; + uint32_t hfi_thresh_rv; + uint32_t shm_thresh_rv; + uint32_t hfi_base_window_rv; /**> this is a base rndv window size, + will be further trimmed down per-connection based + on the peer's MTU */ + int memmode; + + uint64_t timestamp; + + psm2_mq_stats_t stats; /**> MQ stats, accumulated by each PTL */ + + int print_stats; + struct psm2_mq_perf_data mq_perf_data; + + int nohash_fastpath; + unsigned unexpected_hash_len; + unsigned unexpected_list_len; + unsigned expected_hash_len; + unsigned expected_list_len; + + psmi_mem_ctrl_t handler_index[MM_NUM_OF_POOLS]; + int mem_ctrl_is_init; + uint64_t mem_ctrl_total_bytes; + + psmi_lock_t progress_lock; +}; + +#define MQE_TYPE_IS_SEND(type) ((type) & MQE_TYPE_SEND) +#define MQE_TYPE_IS_RECV(type) ((type) & MQE_TYPE_RECV) + +#define MQE_TYPE_SEND 0x1000 +#define MQE_TYPE_RECV 0x2000 +#define MQE_TYPE_FLAGMASK 0x0fff +#define MQE_TYPE_WAITING 0x0001 +#define MQE_TYPE_WAITING_PEER 0x0004 +#define MQE_TYPE_EAGER_QUEUE 0x0008 + +#define MQ_STATE_COMPLETE 0 +#define MQ_STATE_POSTED 1 +#define MQ_STATE_MATCHED 2 +#define MQ_STATE_UNEXP 3 +#define MQ_STATE_UNEXP_RV 4 +#define MQ_STATE_FREE 5 + +/* + * These must match the ips protocol message opcode. + */ +#define MQ_MSG_TINY 0xc1 +#define MQ_MSG_SHORT 0xc2 +#define MQ_MSG_EAGER 0xc3 +#define MQ_MSG_LONGRTS 0xc4 + +/* + * Descriptor allocation limits. + * The 'LIMITS' predefines fill in a psmi_rlimits_mpool structure + */ +#define MQ_SENDREQ_LIMITS { \ + .env = "PSM2_MQ_SENDREQS_MAX", \ + .descr = "Max num of isend requests in flight", \ + .env_level = PSMI_ENVVAR_LEVEL_USER, \ + .minval = 1, \ + .maxval = ~0, \ + .mode[PSMI_MEMMODE_NORMAL] = { 1024, 1048576 }, \ + .mode[PSMI_MEMMODE_MINIMAL] = { 1024, 65536 }, \ + .mode[PSMI_MEMMODE_LARGE] = { 8192, 16777216 } \ + } + +#define MQ_RECVREQ_LIMITS { \ + .env = "PSM2_MQ_RECVREQS_MAX", \ + .descr = "Max num of irecv requests in flight", \ + .env_level = PSMI_ENVVAR_LEVEL_USER, \ + .minval = 1, \ + .maxval = ~0, \ + .mode[PSMI_MEMMODE_NORMAL] = { 1024, 1048576 }, \ + .mode[PSMI_MEMMODE_MINIMAL] = { 1024, 65536 }, \ + .mode[PSMI_MEMMODE_LARGE] = { 8192, 16777216 } \ + } + +typedef psm2_error_t(*mq_rts_callback_fn_t) (psm2_mq_req_t req, int was_posted); +typedef psm2_error_t(*mq_testwait_callback_fn_t) (psm2_mq_req_t *req); + + +/* If request is marked as internal, then it will not + be exposed to the user, will not be added to the mq->completed_q. + This flag is set if request is used by e.g. MPI_SEND */ +#define PSMI_REQ_FLAG_IS_INTERNAL (1 << 0) +/* Identifies req as part of fast path. 
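+ Fast-path requests are issued through __psm2_mq_fp_msg(), which
+ runs with the mq progress lock already held.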
*/ +#define PSMI_REQ_FLAG_FASTPATH (1 << 1) +/* Identifies req as a NORMAL operation with no special cases.*/ +#define PSMI_REQ_FLAG_NORMAL 0 + +#define psmi_is_req_internal(req) ((req)->flags_internal & PSMI_REQ_FLAG_IS_INTERNAL) + +#define psmi_assert_req_not_internal(req) psmi_assert(((req) == PSM2_MQ_REQINVALID) || \ + (!psmi_is_req_internal(req))) + +/* receive mq_req, the default */ +struct psm2_mq_req { + struct psm2_mq_req_user req_data; + + struct { + psm2_mq_req_t next[NUM_MQ_SUBLISTS]; + psm2_mq_req_t prev[NUM_MQ_SUBLISTS]; + STAILQ_ENTRY(psm2_mq_req) nextq; /* used for eager only */ + }; + struct mqq *q[NUM_MQ_SUBLISTS]; + uint64_t timestamp; + uint32_t state; + uint32_t type; + psm2_mq_t mq; + + /* Some PTLs want to get notified when there's a test/wait event */ + mq_testwait_callback_fn_t testwait_callback; + + uint16_t msg_seqnum; /* msg seq num for mctxt */ + uint32_t recv_msgoff; /* Message offset into req_data.buf */ + union { + uint32_t send_msgoff; /* Bytes received so far.. can be larger than buf_len */ + uint32_t recv_msgposted; + }; + uint32_t rts_reqidx_peer; + + uint32_t flags_user; + uint32_t flags_internal; + + /* Used to keep track of unexpected rendezvous */ + mq_rts_callback_fn_t rts_callback; + psm2_epaddr_t rts_peer; + uintptr_t rts_sbuf; + +#ifdef PSM_CUDA + uint8_t* user_gpu_buffer; + STAILQ_HEAD(sendreq_spec_, ips_cuda_hostbuf) sendreq_prefetch; + uint32_t prefetch_send_msgoff; + int cuda_hostbuf_used; + CUipcMemHandle cuda_ipc_handle; + CUevent cuda_ipc_event; + uint8_t cuda_ipc_handle_attached; + uint32_t cuda_ipc_offset; + /* + * is_sendbuf_gpu_mem - Used to always select TID path on the receiver + * when send is on a device buffer + */ + uint8_t is_sendbuf_gpu_mem; +#endif + /* + * is_buf_gpu_mem - used to indicate if the send or receive is issued + * on a device/host buffer. + */ + uint8_t is_buf_gpu_mem; + + /* PTLs get to store their own per-request data. MQ manages the allocation + * by allocating psm2_mq_req so that ptl_req_data has enough space for all + * possible PTLs. + */ + union { + void *ptl_req_ptr; /* when used by ptl as pointer */ + uint8_t ptl_req_data[0]; /* when used by ptl for "inline" data */ + }; +}; + +PSMI_ALWAYS_INLINE( +unsigned +hash_64(uint64_t a)) +{ + return _mm_crc32_u64(0, a); +} +PSMI_ALWAYS_INLINE( +unsigned +hash_32(uint32_t a)) +{ + return _mm_crc32_u32(0, a); +} + +void MOCKABLE(psmi_mq_mtucpy)(void *vdest, const void *vsrc, uint32_t nchars); +MOCK_DCL_EPILOGUE(psmi_mq_mtucpy); +void psmi_mq_mtucpy_host_mem(void *vdest, const void *vsrc, uint32_t nchars); + +#if defined(__x86_64__) +void psmi_mq_mtucpy_safe(void *vdest, const void *vsrc, uint32_t nchars); +#else +#define psmi_mq_mtucpy_safe psmi_mq_mtucpy +#endif + +/* + * Optimize for 0-8 byte case, but also handle others. 
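+ *
+ * The switch in mq_copy_tiny() below relies on deliberate fall-through:
+ * len 8 copies two 32-bit words (case 8 falls into case 4), len 5-7
+ * copy one word and drop into the trailing 1-3 byte cases with len
+ * reduced by 4, and anything above 8 is handed to psmi_mq_mtucpy().
+ * E.g. len == 6 becomes one 4-byte word copy plus two byte copies.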
+ */ +PSMI_ALWAYS_INLINE( +void +mq_copy_tiny(uint32_t *dest, uint32_t *src, uint8_t len)) +{ +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED && (PSMI_IS_CUDA_MEM(dest) || PSMI_IS_CUDA_MEM(src))) { + if (!PSMI_IS_CUDA_ENABLED) { + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Please enable PSM CUDA support when using GPU buffer \n"); + return; + } + PSMI_CUDA_CALL(cuMemcpy, (CUdeviceptr)dest, (CUdeviceptr)src, len); + return; + } +#endif + switch (len) { + case 8: + *dest++ = *src++; + case 4: + *dest++ = *src++; + case 0: + return; + case 7: + case 6: + case 5: + *dest++ = *src++; + len -= 4; + case 3: + case 2: + case 1: + break; + default: /* greater than 8 */ + psmi_mq_mtucpy(dest, src, len); + return; + } + uint8_t *dest1 = (uint8_t *) dest; + uint8_t *src1 = (uint8_t *) src; + switch (len) { + case 3: + *dest1++ = *src1++; + case 2: + *dest1++ = *src1++; + case 1: + *dest1++ = *src1++; + } +} + +#ifdef PSM_CUDA +typedef void (*psmi_mtucpy_fn_t)(void *dest, const void *src, uint32_t len); + +PSMI_ALWAYS_INLINE( +void +mq_copy_tiny_host_mem(uint32_t *dest, uint32_t *src, uint8_t len)) +{ + switch (len) { + case 8: + *dest++ = *src++; + case 4: + *dest++ = *src++; + case 0: + return; + case 7: + case 6: + case 5: + *dest++ = *src++; + len -= 4; + case 3: + case 2: + case 1: + break; + default: /* greater than 8 */ + psmi_mq_mtucpy(dest, src, len); + return; + } + uint8_t *dest1 = (uint8_t *) dest; + uint8_t *src1 = (uint8_t *) src; + switch (len) { + case 3: + *dest1++ = *src1++; + case 2: + *dest1++ = *src1++; + case 1: + *dest1++ = *src1++; + } +} +#endif + +/* Typedef describing a function to populate a psm2_mq_status(2)_t given a + * matched request. The purpose of this typedef is to avoid duplicating + * code to handle both PSM v1 and v2 status objects. Outer routines pass in + * either mq_status_copy or mq_status2_copy and the inner routine calls that + * provided routine to fill in the correct status type. + */ +typedef void (*psmi_mq_status_copy_t) (psm2_mq_req_t req, void *status); + +/* + * Given an req with buffer ubuf of length ubuf_len, + * fill in the req's status and return the amount of bytes the request + * can receive. + * + * The function sets status truncation errors. Basically what MPI_Status does. 
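+ *
+ * (The above describes mq_set_msglen() further below. Worked example:
+ * with recvlen == 64 and sendlen == 256 it records send_msglen = 256,
+ * clamps recv_msglen to 64, sets error_code = PSM2_MQ_TRUNCATION and
+ * returns 64, the number of bytes the request can actually receive.)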
+ */ +PSMI_ALWAYS_INLINE( +void +mq_status_copy(psm2_mq_req_t req, psm2_mq_status_t *status)) +{ + status->msg_tag = *((uint64_t *) req->req_data.tag.tag); + status->msg_length = req->req_data.send_msglen; + status->nbytes = req->req_data.recv_msglen; + status->error_code = req->req_data.error_code; + status->context = req->req_data.context; +} + +PSMI_ALWAYS_INLINE( +void +mq_status2_copy(psm2_mq_req_t req, psm2_mq_status2_t *status)) +{ + status->msg_peer = req->req_data.peer; + status->msg_tag = req->req_data.tag; + status->msg_length = req->req_data.send_msglen; + status->nbytes = req->req_data.recv_msglen; + status->error_code = req->req_data.error_code; + status->context = req->req_data.context; +} + +PSMI_ALWAYS_INLINE( +uint32_t +mq_set_msglen(psm2_mq_req_t req, uint32_t recvlen, uint32_t sendlen)) +{ + req->req_data.send_msglen = sendlen; + if (recvlen < sendlen) { + req->req_data.recv_msglen = recvlen; + req->req_data.error_code = PSM2_MQ_TRUNCATION; + return recvlen; + } else { + req->req_data.recv_msglen = sendlen; + req->req_data.error_code = PSM2_OK; + return sendlen; + } +} + +PSMI_ALWAYS_INLINE( +int +min_timestamp_4(psm2_mq_req_t *match)) +{ + uint64_t oldest = -1; + int which = -1, i; + for (i = 0; i < 4; i++) { + if (match[i] && (match[i]->timestamp < oldest)) { + oldest = match[i]->timestamp; + which = i; + } + } + return which; +} + +#ifndef PSM_DEBUG +/*! Append to Queue */ +PSMI_ALWAYS_INLINE(void mq_qq_append(struct mqq *q, psm2_mq_req_t req)) +{ + req->next[PSM2_ANYTAG_ANYSRC] = NULL; + req->prev[PSM2_ANYTAG_ANYSRC] = q->last; + if (q->last) + q->last->next[PSM2_ANYTAG_ANYSRC] = req; + else + q->first = req; + q->last = req; + req->q[PSM2_ANYTAG_ANYSRC] = q; +} +#else +#define mq_qq_append(qq, req) \ + do { \ + psmi_assert_req_not_internal(req); \ + (req)->next[PSM2_ANYTAG_ANYSRC] = NULL; \ + (req)->prev[PSM2_ANYTAG_ANYSRC] = (qq)->last; \ + if ((qq)->last) \ + (qq)->last->next[PSM2_ANYTAG_ANYSRC] = (req); \ + else \ + (qq)->first = (req); \ + (qq)->last = (req); \ + (req)->q[PSM2_ANYTAG_ANYSRC] = (qq); \ + if (qq == &(req)->mq->completed_q) \ + _HFI_VDBG("Moving (req)=%p to completed queue on %s, %d\n", \ + (req), __FILE__, __LINE__); \ + } while (0) +#endif +PSMI_ALWAYS_INLINE( +void mq_qq_append_which(struct mqq q[NUM_HASH_CONFIGS][NUM_HASH_BUCKETS], + int table, int bucket, psm2_mq_req_t req)) +{ + req->next[table] = NULL; + req->prev[table] = q[table][bucket].last; + if (q[table][bucket].last) + q[table][bucket].last->next[table] = req; + else + q[table][bucket].first = req; + q[table][bucket].last = req; + req->q[table] = &q[table][bucket]; +} +PSMI_ALWAYS_INLINE(void mq_qq_remove(struct mqq *q, psm2_mq_req_t req)) +{ + if (req->next[PSM2_ANYTAG_ANYSRC] != NULL) + req->next[PSM2_ANYTAG_ANYSRC]->prev[PSM2_ANYTAG_ANYSRC] = + req->prev[PSM2_ANYTAG_ANYSRC]; + else + q->last = req->prev[PSM2_ANYTAG_ANYSRC]; + if (req->prev[PSM2_ANYTAG_ANYSRC]) + req->prev[PSM2_ANYTAG_ANYSRC]->next[PSM2_ANYTAG_ANYSRC] = + req->next[PSM2_ANYTAG_ANYSRC]; + else + q->first = req->next[PSM2_ANYTAG_ANYSRC]; +} +PSMI_ALWAYS_INLINE(void mq_qq_remove_which(psm2_mq_req_t req, int table)) +{ + struct mqq *q = req->q[table]; + + req->q[table] = NULL; + if (req->next[table] != NULL) + req->next[table]->prev[table] = req->prev[table]; + else + q->last = req->prev[table]; + if (req->prev[table]) + req->prev[table]->next[table] = req->next[table]; + else + q->first = req->next[table]; +} + +psm2_error_t psmi_mq_req_init(psm2_mq_t mq); +psm2_error_t psmi_mq_req_fini(psm2_mq_t mq); +psm2_mq_req_t 
MOCKABLE(psmi_mq_req_alloc)(psm2_mq_t mq, uint32_t type); +MOCK_DCL_EPILOGUE(psmi_mq_req_alloc); +#define psmi_mq_req_free(req) psmi_mpool_put(req) + +/* + * Main receive progress engine, for shmops and hfi, in mq.c + */ +psm2_error_t psmi_mq_malloc(psm2_mq_t *mqo); +psm2_error_t psmi_mq_initialize_defaults(psm2_mq_t mq); + +psm2_error_t MOCKABLE(psmi_mq_free)(psm2_mq_t mq); +MOCK_DCL_EPILOGUE(psmi_mq_free); + +/* Three functions that handle all MQ stuff */ +#define MQ_RET_MATCH_OK 0 +#define MQ_RET_UNEXP_OK 1 +#define MQ_RET_UNEXP_NO_RESOURCES 2 +#define MQ_RET_DATA_OK 3 +#define MQ_RET_DATA_OUT_OF_ORDER 4 + +void psmi_mq_handle_rts_complete(psm2_mq_req_t req); +int psmi_mq_handle_data(psm2_mq_t mq, psm2_mq_req_t req, + uint32_t offset, const void *payload, uint32_t paylen); +int psmi_mq_handle_rts(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, + uint32_t msglen, const void *payload, uint32_t paylen, + int msgorder, mq_rts_callback_fn_t cb, + psm2_mq_req_t *req_o); +int psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, + uint32_t msglen, uint32_t offset, + const void *payload, uint32_t paylen, int msgorder, + uint32_t opcode, psm2_mq_req_t *req_o); +int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t req); + +void psmi_mq_stats_register(psm2_mq_t mq, mpspawn_stats_add_fn add_fn); + +void psmi_mq_fastpath_disable(psm2_mq_t mq); +void psmi_mq_fastpath_try_reenable(psm2_mq_t mq); + +PSMI_ALWAYS_INLINE( +psm2_mq_req_t +mq_ooo_match(struct mqq *q, void *msgctl, uint16_t msg_seqnum)) +{ + psm2_mq_req_t *curp; + psm2_mq_req_t cur; + + for (curp = &q->first; (cur = *curp) != NULL; curp = &cur->next[PSM2_ANYTAG_ANYSRC]) { + if (cur->ptl_req_ptr == msgctl && cur->msg_seqnum == msg_seqnum) { + /* match! */ + mq_qq_remove(q, cur); + return cur; + } + } + return NULL; /* no match */ +} + +PSMI_ALWAYS_INLINE( +psm2_mq_req_t +mq_eager_match(psm2_mq_t mq, void *peer, uint16_t msg_seqnum)) +{ + psm2_mq_req_t cur; + + cur = STAILQ_FIRST(&mq->eager_q); + while (cur) { + if (cur->ptl_req_ptr == peer && cur->msg_seqnum == msg_seqnum) + return cur; + cur = STAILQ_NEXT(cur, nextq); + } + return NULL; /* no match */ +} + +#if 0 +/* Not exposed in public psm, but may extend parts of PSM 2.1 to support + * this feature before 2.3 */ +psm_mq_unexpected_callback_fn_t +psmi_mq_register_unexpected_callback(psm2_mq_t mq, + psm_mq_unexpected_callback_fn_t fn); +#endif + +PSMI_ALWAYS_INLINE(void psmi_mq_stats_rts_account(psm2_mq_req_t req)) +{ + psm2_mq_t mq = req->mq; + if (MQE_TYPE_IS_SEND(req->type)) { + mq->stats.tx_num++; + mq->stats.tx_rndv_num++; + mq->stats.tx_rndv_bytes += req->req_data.send_msglen; + } else { + mq->stats.rx_user_num++; + mq->stats.rx_user_bytes += req->req_data.recv_msglen; + } + return; +} + +#endif diff --git a/psm_mq_recv.c b/psm_mq_recv.c new file mode 100644 index 0000000..0f46075 --- /dev/null +++ b/psm_mq_recv.c @@ -0,0 +1,635 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm2_hal.h" +#include "psm_mq_internal.h" +#include "ptl_ips/ips_proto_header.h" + +#ifdef PSM_CUDA +#include "psm_gdrcpy.h" +#endif + +#if 0 +/* Not exposed in public psm, but may extend parts of PSM 2.1 to support + * this feature before 2.3 */ +psm_mq_unexpected_callback_fn_t +psmi_mq_register_unexpected_callback(psm2_mq_t mq, + psm_mq_unexpected_callback_fn_t fn) +{ + psm_mq_unexpected_callback_fn_t old_fn = mq->unexpected_callback; + mq->unexpected_callback = fn; + return old_fn; +} +#endif + +void psmi_mq_handle_rts_complete(psm2_mq_req_t req) +{ + psm2_mq_t mq = req->mq; + + /* Stats on rendez-vous messages */ + psmi_mq_stats_rts_account(req); + req->state = MQ_STATE_COMPLETE; + ips_barrier(); + if(!psmi_is_req_internal(req)) + mq_qq_append(&mq->completed_q, req); + + _HFI_VDBG("RTS complete, req=%p, recv_msglen = %d\n", + req, req->req_data.recv_msglen); + return; +} + +static void +psmi_mq_req_copy(psm2_mq_req_t req, + uint32_t offset, const void *buf, uint32_t nbytes) +{ + /* recv_msglen may be changed by unexpected receive req_data.buf. */ + uint32_t msglen_this, end; + uint8_t *msgptr = (uint8_t *) req->req_data.buf + offset; + + /* out of receiving range. 
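+ * Bytes past recv_msglen are dropped here, but they are still counted
+ * in send_msgoff so the completion check in psmi_mq_handle_data()
+ * fires even for truncated receives.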
*/ + if (offset >= req->req_data.recv_msglen) { + req->send_msgoff += nbytes; + return; + } + + end = offset + nbytes; + if (end > req->req_data.recv_msglen) { + msglen_this = req->req_data.recv_msglen - offset; + end = req->req_data.recv_msglen; + } else { + msglen_this = nbytes; + } + + psmi_mq_mtucpy(msgptr, buf, msglen_this); + + if (req->recv_msgoff < end) { + req->recv_msgoff = end; + } + + req->send_msgoff += nbytes; + return; +} + +int +psmi_mq_handle_data(psm2_mq_t mq, psm2_mq_req_t req, + uint32_t offset, const void *buf, uint32_t nbytes) +{ + psmi_assert(req != NULL); + int rc; + + if (req->state == MQ_STATE_MATCHED) + rc = MQ_RET_MATCH_OK; + else { + psmi_assert(req->state == MQ_STATE_UNEXP); + rc = MQ_RET_UNEXP_OK; + } + + psmi_mq_req_copy(req, offset, buf, nbytes); + + /* + * the reason to use >= is because send_msgoff + * may be DW pad included. + */ + if (req->send_msgoff >= req->req_data.send_msglen) { + if (req->type & MQE_TYPE_EAGER_QUEUE) { + STAILQ_REMOVE(&mq->eager_q, req, psm2_mq_req, nextq); + } + + if (req->state == MQ_STATE_MATCHED) { + req->state = MQ_STATE_COMPLETE; + ips_barrier(); + mq_qq_append(&mq->completed_q, req); + } else { /* MQ_STATE_UNEXP */ + req->state = MQ_STATE_COMPLETE; + } + } + + return rc; +} + +static +void mq_add_to_unexpected_hashes(psm2_mq_t mq, psm2_mq_req_t req) +{ + int table; + mq_qq_append(&mq->unexpected_q, req); + req->q[PSM2_ANYTAG_ANYSRC] = &mq->unexpected_q; + mq->unexpected_list_len++; + if_pt (mq->nohash_fastpath) { + if_pf (mq->unexpected_list_len >= HASH_THRESHOLD) + psmi_mq_fastpath_disable(mq); + return; + } + + for (table = PSM2_TAG_SRC; table < PSM2_ANYTAG_ANYSRC; table++) + mq_qq_append_which(mq->unexpected_htab, + table, mq->hashvals[table], req); + mq->unexpected_hash_len++; +} + + +psm2_mq_req_t +mq_list_scan(struct mqq *q, psm2_epaddr_t src, psm2_mq_tag_t *tag, int which, uint64_t *time_threshold) +{ + psm2_mq_req_t *curp, cur; + + for (curp = &q->first; + ((cur = *curp) != NULL) && (cur->timestamp < *time_threshold); + curp = &cur->next[which]) { + if ((cur->req_data.peer == PSM2_MQ_ANY_ADDR || src == cur->req_data.peer) && + !((tag->tag[0] ^ cur->req_data.tag.tag[0]) & cur->req_data.tagsel.tag[0]) && + !((tag->tag[1] ^ cur->req_data.tag.tag[1]) & cur->req_data.tagsel.tag[1]) && + !((tag->tag[2] ^ cur->req_data.tag.tag[2]) & cur->req_data.tagsel.tag[2])) { + *time_threshold = cur->timestamp; + return cur; + } + } + return NULL; +} + +psm2_mq_req_t +mq_req_match(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, int remove) +{ + psm2_mq_req_t match[4]; + int table; + uint64_t best_ts = -1; + + if (mq->nohash_fastpath) { + table = PSM2_ANYTAG_ANYSRC; + match[table] = + mq_list_scan(&mq->expected_q, + src, tag, PSM2_ANYTAG_ANYSRC, &best_ts); + if (match[table] && remove) { + mq->expected_list_len--; + mq_qq_remove_which(match[table], table); + } + return match[table]; + } + + mq->hashvals[PSM2_TAG_SRC] = hash_64(*(uint64_t *) tag->tag) % NUM_HASH_BUCKETS; + mq->hashvals[PSM2_TAG_ANYSRC] = hash_32(tag->tag[0]) % NUM_HASH_BUCKETS; + mq->hashvals[PSM2_ANYTAG_SRC] = hash_32(tag->tag[1]) % NUM_HASH_BUCKETS; + + for (table = PSM2_TAG_SRC; table < PSM2_ANYTAG_ANYSRC; table++) + match[table] = + mq_list_scan(&mq->expected_htab[table][mq->hashvals[table]], + src, tag, table, &best_ts); + table = PSM2_ANYTAG_ANYSRC; + match[table] = mq_list_scan(&mq->expected_q, src, tag, table, &best_ts); + + table = min_timestamp_4(match); + if (table == -1) + return NULL; + + if (remove) { + if_pt (table == PSM2_ANYTAG_ANYSRC) + 
mq->expected_list_len--; + else + mq->expected_hash_len--; + mq_qq_remove_which(match[table], table); + psmi_mq_fastpath_try_reenable(mq); + } + return match[table]; +} +/* + * This handles the rendezvous MPI envelopes, the packet might have the whole + * message payload, or zero payload. + */ +int +psmi_mq_handle_rts(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, + uint32_t send_msglen, const void *payload, uint32_t paylen, + int msgorder, mq_rts_callback_fn_t cb, psm2_mq_req_t *req_o) +{ + psm2_mq_req_t req; + uint32_t msglen; + int rc; + + PSMI_LOCK_ASSERT(mq->progress_lock); + + if (msgorder && (req = mq_req_match(mq, src, tag, 1))) { + /* we have a match, no need to callback */ + msglen = mq_set_msglen(req, req->req_data.buf_len, send_msglen); + /* reset send_msglen because sender only sends this many */ + req->req_data.send_msglen = msglen; + req->state = MQ_STATE_MATCHED; + req->req_data.peer = src; + req->req_data.tag = *tag; + + if (paylen > msglen) paylen = msglen; + if (paylen) { + psmi_mq_mtucpy(req->req_data.buf, payload, paylen); + } + req->recv_msgoff = req->send_msgoff = paylen; + *req_o = req; /* yes match */ + PSM2_LOG_EPM(OPCODE_LONG_RTS,PSM2_LOG_RX,src->epid,mq->ep->epid, + "req->rts_reqidx_peer: %d",req->rts_reqidx_peer); + rc = MQ_RET_MATCH_OK; + } else if (msgorder > 1) { + /* There is NO request match, and this is the first time + * to try to process this packet, we leave the packet in + * hardware queue for retry in hope there is a request + * match next time, this is for performance + * consideration. + */ + rc = MQ_RET_UNEXP_NO_RESOURCES; + } else { /* No match, keep track of callback */ + req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); + psmi_assert(req != NULL); + /* We don't know recv_msglen yet but we set it here for + * mq_iprobe */ + req->req_data.send_msglen = req->req_data.recv_msglen = send_msglen; + PSM2_LOG_EPM_COND(req->req_data.send_msglen > mq->hfi_thresh_rv, + OPCODE_LONG_RTS,PSM2_LOG_RX,src->epid,mq->ep->epid, + "req->rts_reqidx_peer: %d",req->rts_reqidx_peer); + req->state = MQ_STATE_UNEXP_RV; + req->req_data.peer = src; + req->req_data.tag = *tag; + req->rts_callback = cb; + if (paylen > send_msglen) paylen = send_msglen; + if (paylen) { + req->req_data.buf = psmi_mq_sysbuf_alloc(mq, paylen); + psmi_assert(paylen == 0 || req->req_data.buf != NULL); + mq->stats.rx_sysbuf_num++; + mq->stats.rx_sysbuf_bytes += paylen; + psmi_mq_mtucpy(req->req_data.buf, payload, paylen); + } + req->recv_msgoff = req->send_msgoff = paylen; + + if (msgorder) { + mq_add_to_unexpected_hashes(mq, req); + } + /* caller will handle out of order case */ + *req_o = req; /* no match, will callback */ + rc = MQ_RET_UNEXP_OK; + } + +#ifdef PSM_DEBUG + if (req) + _HFI_VDBG("match=%s (req=%p) src=%s mqtag=%08x.%08x.%08x recvlen=%d " + "sendlen=%d errcode=%d\n", + rc == MQ_RET_MATCH_OK ? "YES" : "NO", req, + psmi_epaddr_get_name(src->epid), + req->req_data.tag.tag[0], req->req_data.tag.tag[1], req->req_data.tag.tag[2], + req->req_data.recv_msglen, req->req_data.send_msglen, req->req_data.error_code); + else + _HFI_VDBG("match=%s (req=%p) src=%s\n", + rc == MQ_RET_MATCH_OK ? "YES" : "NO", req, + psmi_epaddr_get_name(src->epid)); +#endif /* #ifdef PSM_DEBUG */ + return rc; +} + +/* + * This handles the regular (i.e. 
non-rendezvous MPI envelopes) + */ +int +psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, + uint32_t send_msglen, uint32_t offset, + const void *payload, uint32_t paylen, int msgorder, + uint32_t opcode, psm2_mq_req_t *req_o) +{ + psm2_mq_req_t req; + uint32_t msglen; + + if (msgorder && (req = mq_req_match(mq, src, tag, 1))) { + /* we have a match */ + psmi_assert(MQE_TYPE_IS_RECV(req->type)); + req->req_data.peer = src; + req->req_data.tag = *tag; + msglen = mq_set_msglen(req, req->req_data.buf_len, send_msglen); + + _HFI_VDBG("match=YES (req=%p) opcode=%x src=%s mqtag=%x.%x.%x" + " msglen=%d paylen=%d\n", req, opcode, + psmi_epaddr_get_name(src->epid), + tag->tag[0], tag->tag[1], tag->tag[2], msglen, + paylen); + + void* user_buffer = NULL; + + switch (opcode) { + case MQ_MSG_TINY: + /* mq_copy_tiny() can handle zero byte */ + +#ifdef PSM_CUDA + if (PSMI_USE_GDR_COPY(req, msglen)) { + void* mmaped_host = gdr_convert_gpu_to_host_addr(GDR_FD, + (unsigned long)req->req_data.buf, + msglen, 1, src->proto); + mq_copy_tiny((uint32_t *) mmaped_host, + (uint32_t *) payload, msglen); + } + else { + mq_copy_tiny((uint32_t *) req->req_data.buf, + (uint32_t *) payload, msglen); + } +#else + + mq_copy_tiny((uint32_t *) req->req_data.buf, + (uint32_t *) payload, msglen); +#endif + + req->state = MQ_STATE_COMPLETE; + ips_barrier(); + mq_qq_append(&mq->completed_q, req); + break; + + case MQ_MSG_SHORT: /* message fits in 1 payload */ + user_buffer = req->req_data.buf; +#ifdef PSM_CUDA + psmi_mtucpy_fn_t psmi_mtucpy_fn = psmi_mq_mtucpy; + if (PSMI_USE_GDR_COPY(req, msglen)) { + user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD, + (unsigned long)req->req_data.buf, + msglen, 1, src->proto); + psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem; + } +#endif + if (msglen <= paylen) { +#ifdef PSM_CUDA + psmi_mtucpy_fn(user_buffer, payload, msglen); +#else + psmi_mq_mtucpy(user_buffer, payload, msglen); +#endif + } else { + psmi_assert((msglen & ~0x3) == paylen); +#ifdef PSM_CUDA + psmi_mtucpy_fn(user_buffer, payload, paylen); +#else + psmi_mq_mtucpy(user_buffer, payload, paylen); +#endif + /* + * there are nonDW bytes attached in header, + * copy after the DW payload. + */ + mq_copy_tiny((uint32_t *)(user_buffer+paylen), + (uint32_t *)&offset, msglen & 0x3); + } + req->state = MQ_STATE_COMPLETE; + ips_barrier(); + mq_qq_append(&mq->completed_q, req); + break; + + case MQ_MSG_EAGER: + req->state = MQ_STATE_MATCHED; + req->type |= MQE_TYPE_EAGER_QUEUE; + req->send_msgoff = req->recv_msgoff = 0; + STAILQ_INSERT_TAIL(&mq->eager_q, req, nextq); + _HFI_VDBG("exp MSG_EAGER of length %d bytes pay=%d\n", + msglen, paylen); +#ifdef PSM_CUDA + if (PSMI_USE_GDR_COPY(req, req->req_data.send_msglen)) { + req->req_data.buf = gdr_convert_gpu_to_host_addr(GDR_FD, + (unsigned long)req->user_gpu_buffer, + req->req_data.send_msglen, 1, src->proto); + } +#endif + if (paylen > 0) + psmi_mq_handle_data(mq, req, offset, payload, + paylen); + break; + + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Internal error, unknown packet 0x%x", + opcode); + } + + mq->stats.rx_user_bytes += msglen; + mq->stats.rx_user_num++; + + *req_o = req; /* yes match */ + return MQ_RET_MATCH_OK; + } + + /* unexpected message or out of order message. */ + +#if 0 + /* + * Keep a callback here in case we want to fit some other high-level + * protocols over MQ (i.e. shmem). These protocols would bypass the + * normal message handling and go to higher-level message handlers. 
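+	 * A hypothetical registration (the handler name is illustrative
+	 * only, and the hook itself is compiled out above):
+	 *
+	 *   old = psmi_mq_register_unexpected_callback(mq, shmem_handler);
+	 *
+	 * would divert matching envelopes before any of the
+	 * unexpected-queue bookkeeping below runs.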
+	 */
+	if (msgorder && mq->unexpected_callback) {
+		mq->unexpected_callback(mq, opcode, epaddr, req_data.tag, send_msglen,
+					payload, paylen);
+		*req_o = NULL;
+		return MQ_RET_UNEXP_OK;
+	}
+#endif
+
+	if (msgorder > 1) {
+		/* There is NO request match, and this is the first time
+		 * to try to process this packet, we leave the packet in
+		 * hardware queue for retry in hope there is a request
+		 * match next time, this is for performance
+		 * consideration.
+		 */
+		return MQ_RET_UNEXP_NO_RESOURCES;
+	}
+
+	req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV);
+	psmi_assert(req != NULL);
+
+	req->req_data.peer = src;
+	req->req_data.tag = *tag;
+	req->recv_msgoff = 0;
+	req->req_data.recv_msglen = req->req_data.send_msglen = req->req_data.buf_len = msglen =
+		send_msglen;
+
+	_HFI_VDBG("match=NO (req=%p) opcode=%x src=%s mqtag=%08x.%08x.%08x"
+		  " send_msglen=%d\n", req, opcode,
+		  psmi_epaddr_get_name(src->epid),
+		  tag->tag[0], tag->tag[1], tag->tag[2], send_msglen);
+
+	switch (opcode) {
+	case MQ_MSG_TINY:
+		if (msglen > 0) {
+			req->req_data.buf = psmi_mq_sysbuf_alloc(mq, msglen);
+			psmi_assert(msglen == 0 || req->req_data.buf != NULL);
+			mq->stats.rx_sysbuf_num++;
+			mq->stats.rx_sysbuf_bytes += paylen;
+			mq_copy_tiny((uint32_t *) req->req_data.buf,
+				     (uint32_t *) payload, msglen);
+		} else
+			req->req_data.buf = NULL;
+		req->state = MQ_STATE_COMPLETE;
+		break;
+
+	case MQ_MSG_SHORT:
+		req->req_data.buf = psmi_mq_sysbuf_alloc(mq, msglen);
+		psmi_assert(msglen == 0 || req->req_data.buf != NULL);
+		mq->stats.rx_sysbuf_num++;
+		mq->stats.rx_sysbuf_bytes += paylen;
+		if (msglen <= paylen) {
+			psmi_mq_mtucpy(req->req_data.buf, payload, msglen);
+		} else {
+			psmi_assert((msglen & ~0x3) == paylen);
+			psmi_mq_mtucpy(req->req_data.buf, payload, paylen);
+			/*
+			 * there are nonDW bytes attached in header,
+			 * copy after the DW payload.
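+			 * e.g. a 10-byte message arrives with paylen = 8
+			 * (the DW-aligned part); the remaining 2 bytes
+			 * travel in the header's offset word and are
+			 * appended here.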
+			 */
+			mq_copy_tiny((uint32_t *)(req->req_data.buf+paylen),
+				     (uint32_t *)&offset, msglen & 0x3);
+		}
+		req->state = MQ_STATE_COMPLETE;
+		break;
+
+	case MQ_MSG_EAGER:
+		req->send_msgoff = 0;
+		req->req_data.buf = psmi_mq_sysbuf_alloc(mq, msglen);
+		psmi_assert(msglen == 0 || req->req_data.buf != NULL);
+		mq->stats.rx_sysbuf_num++;
+		mq->stats.rx_sysbuf_bytes += paylen;
+		req->state = MQ_STATE_UNEXP;
+		req->type |= MQE_TYPE_EAGER_QUEUE;
+		STAILQ_INSERT_TAIL(&mq->eager_q, req, nextq);
+		_HFI_VDBG("unexp MSG_EAGER of length %d bytes pay=%d\n",
+			  msglen, paylen);
+		if (paylen > 0)
+			psmi_mq_handle_data(mq, req, offset, payload, paylen);
+		break;
+
+	default:
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+				  "Internal error, unknown packet 0x%x",
+				  opcode);
+	}
+
+	mq->stats.rx_sys_bytes += msglen;
+	mq->stats.rx_sys_num++;
+
+	if (msgorder) {
+		mq_add_to_unexpected_hashes(mq, req);
+	}
+	/* caller will handle out of order case */
+	*req_o = req;	/* no match, will callback */
+	return MQ_RET_UNEXP_OK;
+}
+
+int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq)
+{
+	psm2_mq_req_t ereq;
+	uint32_t msglen;
+
+	ereq = mq_req_match(mq, ureq->req_data.peer, &ureq->req_data.tag, 1);
+	if (ereq == NULL) {
+		mq_add_to_unexpected_hashes(mq, ureq);
+		return 0;
+	}
+
+	psmi_assert(MQE_TYPE_IS_RECV(ereq->type));
+	ereq->req_data.peer = ureq->req_data.peer;
+	ereq->req_data.tag = ureq->req_data.tag;
+	msglen = mq_set_msglen(ereq, ereq->req_data.buf_len, ureq->req_data.send_msglen);
+
+	switch (ureq->state) {
+	case MQ_STATE_COMPLETE:
+		if (ureq->req_data.buf != NULL) {	/* 0-byte messages don't allocate a sysbuf */
+			psmi_mq_mtucpy(ereq->req_data.buf, (const void *)ureq->req_data.buf,
+				       msglen);
+			psmi_mq_sysbuf_free(mq, ureq->req_data.buf);
+		}
+		ereq->state = MQ_STATE_COMPLETE;
+		ips_barrier();
+		mq_qq_append(&mq->completed_q, ereq);
+		break;
+	case MQ_STATE_UNEXP:	/* not done yet */
+		ereq->state = MQ_STATE_MATCHED;
+		ereq->msg_seqnum = ureq->msg_seqnum;
+		ereq->ptl_req_ptr = ureq->ptl_req_ptr;
+		ereq->send_msgoff = ureq->send_msgoff;
+		ereq->recv_msgoff = min(ureq->recv_msgoff, msglen);
+		if (ereq->recv_msgoff) {
+			psmi_mq_mtucpy(ereq->req_data.buf,
+				       (const void *)ureq->req_data.buf,
+				       ereq->recv_msgoff);
+		}
+		psmi_mq_sysbuf_free(mq, ureq->req_data.buf);
+		ereq->type = ureq->type;
+		STAILQ_INSERT_AFTER(&mq->eager_q, ureq, ereq, nextq);
+		STAILQ_REMOVE(&mq->eager_q, ureq, psm2_mq_req, nextq);
+		break;
+	case MQ_STATE_UNEXP_RV:	/* rendez-vous ...
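+		 * the bulk payload still sits at the sender; only the RTS
+		 * bookkeeping (rts_peer, rts_sbuf, rts_callback) moves to
+		 * the expected request, and the callback below re-drives
+		 * the transfer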
*/ + ereq->state = MQ_STATE_MATCHED; + ereq->rts_peer = ureq->rts_peer; + ereq->rts_sbuf = ureq->rts_sbuf; + ereq->send_msgoff = ureq->send_msgoff; + ereq->recv_msgoff = min(ureq->recv_msgoff, msglen); + if (ereq->recv_msgoff) { + psmi_mq_mtucpy(ereq->req_data.buf, + (const void *)ureq->req_data.buf, + ereq->recv_msgoff); + } + if (ereq->send_msgoff) { + psmi_mq_sysbuf_free(mq, ureq->req_data.buf); + } + ereq->rts_callback = ureq->rts_callback; + ereq->rts_reqidx_peer = ureq->rts_reqidx_peer; + ereq->type = ureq->type; + ereq->rts_callback(ereq, 0); + break; + default: + fprintf(stderr, "Unexpected state %d in req %p\n", ureq->state, + ureq); + fprintf(stderr, "type=%d, mq=%p, tag=%08x.%08x.%08x\n", + ureq->type, ureq->mq, ureq->req_data.tag.tag[0], + ureq->req_data.tag.tag[1], ureq->req_data.tag.tag[2]); + abort(); + } + + psmi_mq_req_free(ureq); + return 0; +} diff --git a/psm_mq_utils.c b/psm_mq_utils.c new file mode 100644 index 0000000..a0409db --- /dev/null +++ b/psm_mq_utils.c @@ -0,0 +1,280 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. 
*/
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+
+/*
+ *
+ * MQ request allocator
+ *
+ */
+
+psm2_mq_req_t MOCKABLE(psmi_mq_req_alloc)(psm2_mq_t mq, uint32_t type)
+{
+	psm2_mq_req_t req;
+
+	psmi_assert(type == MQE_TYPE_RECV || type == MQE_TYPE_SEND);
+
+	if (type == MQE_TYPE_SEND)
+		req = psmi_mpool_get(mq->sreq_pool);
+	else
+		req = psmi_mpool_get(mq->rreq_pool);
+
+	if_pt(req != NULL) {
+		/* A while ago there were issues about forgetting to zero-out parts of the
+		 * structure, I'm leaving this as a debug-time option */
+#ifdef PSM_DEBUG
+		memset(req, 0, sizeof(struct psm2_mq_req));
+#endif
+		req->type = type;
+		req->state = MQ_STATE_FREE;
+		memset(req->next, 0, NUM_MQ_SUBLISTS * sizeof(psm2_mq_req_t));
+		memset(req->prev, 0, NUM_MQ_SUBLISTS * sizeof(psm2_mq_req_t));
+		memset(req->q, 0, NUM_MQ_SUBLISTS * sizeof(struct mqq *));
+		req->req_data.error_code = PSM2_OK;
+		req->mq = mq;
+		req->testwait_callback = NULL;
+		req->rts_peer = NULL;
+		req->req_data.peer = NULL;
+		req->ptl_req_ptr = NULL;
+#ifdef PSM_CUDA
+		req->is_buf_gpu_mem = 0;
+		req->user_gpu_buffer = NULL;
+#endif
+		req->flags_user = 0;
+		req->flags_internal = 0;
+		return req;
+	} else {	/* we're out of reqs */
+		int issend = (type == MQE_TYPE_SEND);
+		uint32_t reqmax, reqchunk;
+		psmi_mpool_get_obj_info(issend ? mq->sreq_pool : mq->rreq_pool,
+					&reqchunk, &reqmax);
+
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_PARAM_ERR,
+				  "Exhausted %d MQ %s request descriptors, which usually indicates "
+				  "a user program error or insufficient request descriptors (%s=%d)",
+				  reqmax, issend ? "isend" : "irecv",
+				  issend ? "PSM2_MQ_SENDREQS_MAX" :
+				  "PSM2_MQ_RECVREQS_MAX", reqmax);
+		return NULL;
+	}
+}
+MOCK_DEF_EPILOGUE(psmi_mq_req_alloc);
+
+#ifdef PSM_CUDA
+void psmi_cuda_recvreq_alloc_func(int is_alloc, void* context, void* obj) {
+	psm2_mq_req_t recvreq = (psm2_mq_req_t)obj;
+	if (PSMI_IS_CUDA_ENABLED) {
+		if (is_alloc)
+			PSMI_CUDA_CALL(cuEventCreate, &recvreq->cuda_ipc_event, CU_EVENT_DEFAULT);
+		else
+			PSMI_CUDA_CALL(cuEventDestroy, recvreq->cuda_ipc_event);
+	}
+	return;
+}
+#endif
+
+psm2_error_t psmi_mq_req_init(psm2_mq_t mq)
+{
+	psm2_mq_req_t warmup_req;
+	psm2_error_t err = PSM2_OK;
+
+	_HFI_VDBG("mq element sizes are %d bytes\n",
+		  (int)sizeof(struct psm2_mq_req));
+
+	/*
+	 * Send MQ requests
+	 */
+	{
+		struct psmi_rlimit_mpool rlim = MQ_SENDREQ_LIMITS;
+		uint32_t maxsz, chunksz;
+
+		if ((err =
+		     psmi_parse_mpool_env(mq, 0, &rlim, &maxsz, &chunksz)))
+			goto fail;
+
+		if ((mq->sreq_pool =
+		     psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz,
+				       maxsz, 0, DESCRIPTORS, NULL,
+				       NULL)) == NULL) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+	}
+
+	/*
+	 * Receive MQ requests
+	 */
+	{
+		struct psmi_rlimit_mpool rlim = MQ_RECVREQ_LIMITS;
+		uint32_t maxsz, chunksz;
+
+		if ((err =
+		     psmi_parse_mpool_env(mq, 0, &rlim, &maxsz, &chunksz)))
+			goto fail;
+		/* Have a callback function for receive req mpool which creates
+		 * and destroys events.
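+		 * (cuEventCreate when a pool element is first allocated,
+		 * cuEventDestroy at teardown; see psmi_cuda_recvreq_alloc_func
+		 * above), so the CUDA IPC event is ready whenever a receive
+		 * request is handed out.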
+		 */
+#ifdef PSM_CUDA
+		if (PSMI_IS_CUDA_ENABLED) {
+			if ((mq->rreq_pool =
+			     psmi_mpool_create_for_cuda(sizeof(struct psm2_mq_req), chunksz,
+							maxsz, 0, DESCRIPTORS, NULL,
+							NULL, psmi_cuda_recvreq_alloc_func, NULL)) == NULL) {
+				err = PSM2_NO_MEMORY;
+				goto fail;
+			}
+		}
+		else {
+			if ((mq->rreq_pool =
+			     psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz,
+					       maxsz, 0, DESCRIPTORS, NULL,
+					       NULL)) == NULL) {
+				err = PSM2_NO_MEMORY;
+				goto fail;
+			}
+		}
+#else
+		if ((mq->rreq_pool =
+		     psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz,
+				       maxsz, 0, DESCRIPTORS, NULL,
+				       NULL)) == NULL) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+#endif
+	}
+
+	/* Warm up the allocators */
+	warmup_req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV);
+	psmi_assert_always(warmup_req != NULL);
+	psmi_mq_req_free(warmup_req);
+
+	warmup_req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND);
+	psmi_assert_always(warmup_req != NULL);
+	psmi_mq_req_free(warmup_req);
+
+fail:
+	return err;
+}
+
+psm2_error_t psmi_mq_req_fini(psm2_mq_t mq)
+{
+	psmi_mpool_destroy(mq->rreq_pool);
+	psmi_mpool_destroy(mq->sreq_pool);
+	return PSM2_OK;
+}
+
+
+/*
+ * Hooks to plug into QLogic MPI stats
+ */
+
+static
+void psmi_mq_stats_callback(struct mpspawn_stats_req_args *args)
+{
+	uint64_t *entry = args->stats;
+	psm2_mq_t mq = (psm2_mq_t) args->context;
+	psm2_mq_stats_t mqstats;
+
+	psm2_mq_get_stats(mq, &mqstats);
+
+	if (args->num < 8)
+		return;
+
+	entry[0] = mqstats.tx_eager_num;
+	entry[1] = mqstats.tx_eager_bytes;
+	entry[2] = mqstats.tx_rndv_num;
+	entry[3] = mqstats.tx_rndv_bytes;
+
+	entry[4] = mqstats.rx_user_num;
+	entry[5] = mqstats.rx_user_bytes;
+	entry[6] = mqstats.rx_sys_num;
+	entry[7] = mqstats.rx_sys_bytes;
+}
+
+void psmi_mq_stats_register(psm2_mq_t mq, mpspawn_stats_add_fn add_fn)
+{
+	char *desc[8];
+	uint16_t flags[8];
+	int i;
+	struct mpspawn_stats_add_args mp_add;
+	/*
+	 * Hardcode flags until we correctly move mpspawn to its own repo.
+	 * flags[i] = MPSPAWN_REDUCTION_MAX | MPSPAWN_REDUCTION_MIN;
+	 */
+	for (i = 0; i < 8; i++)
+		flags[i] = MPSPAWN_STATS_REDUCTION_ALL;
+
+	desc[0] = "Eager count sent";
+	desc[1] = "Eager bytes sent";
+	desc[2] = "Rendezvous count sent";
+	desc[3] = "Rendezvous bytes sent";
+	desc[4] = "Expected count received";
+	desc[5] = "Expected bytes received";
+	desc[6] = "Unexpected count received";
+	desc[7] = "Unexpected bytes received";
+
+	mp_add.version = MPSPAWN_STATS_VERSION;
+	mp_add.num = 8;
+	mp_add.header = "MPI Statistics Summary (max,min @ rank)";
+	mp_add.req_fn = psmi_mq_stats_callback;
+	mp_add.desc = desc;
+	mp_add.flags = flags;
+	mp_add.context = mq;
+
+	add_fn(&mp_add);
+}
diff --git a/psm_perf.c b/psm_perf.c
new file mode 100644
index 0000000..aaf3fd0
--- /dev/null
+++ b/psm_perf.c
@@ -0,0 +1,260 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2017 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef RDPMC_PERF_FRAMEWORK
+
+#include "psm_user.h"
+/* The bracketed header names were lost in extraction; this set is
+ * reconstructed from the interfaces actually used below (perf_event
+ * attributes, syscall(2), mmap(2), getenv/strtoll, printf/perror). */
+#include <linux/perf_event.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Configuration */
+
+#define RDPMC_PERF_DEFAULT_TYPE   (PERF_TYPE_HARDWARE)
+#define RDPMC_PERF_DEFAULT_CONFIG (PERF_COUNT_HW_CPU_CYCLES)
+
+__thread struct rdpmc_ctx global_rdpmc_ctx;
+
+u64 global_rdpmc_begin[RDPMC_PERF_MAX_SLOT_NUMBER];
+u64 global_rdpmc_summ[RDPMC_PERF_MAX_SLOT_NUMBER];
+u64 global_rdpmc_number[RDPMC_PERF_MAX_SLOT_NUMBER];
+
+char global_rdpmc_slot_name[RDPMC_PERF_MAX_SLOT_NUMBER][RDPMC_PERF_MAX_SLOT_NAME];
+
+__thread unsigned int global_rdpmc_type = RDPMC_PERF_DEFAULT_TYPE;
+__thread unsigned int global_rdpmc_config = RDPMC_PERF_DEFAULT_CONFIG;
+
+struct rdpmc_ctx {
+	int fd;
+	struct perf_event_mmap_page *buf;
+};
+
+typedef unsigned long long u64;
+
+#if defined(__ICC) || defined(__INTEL_COMPILER)
+#include "immintrin.h"
+#endif
+
+/**
+ * DOC: Ring 3 counting for CPU performance counters
+ *
+ * This library allows accessing CPU performance counters from ring 3
+ * using the perf_events subsystem. This is useful to measure specific
+ * parts of programs (e.g. excluding initialization code)
+ *
+ * Requires a Linux 3.3+ kernel
+ */
+
+/**
+ * rdpmc_open_attr - initialize a raw ring 3 readable performance counter
+ * @attr: perf struct %perf_event_attr for the counter
+ * @ctx:  Pointer to struct %rdpmc_ctx that is initialized.
+ * @leader_ctx: context of group leader or NULL
+ *
+ * This allows more flexible setup with a custom &perf_event_attr.
+ * For simple uses rdpmc_open() should be used instead.
+ * Must be called for each thread using the counter.
+ * Must be closed with rdpmc_close()
+ */
+PSMI_ALWAYS_INLINE(int rdpmc_open_attr(struct perf_event_attr *attr, struct rdpmc_ctx *ctx,
+		    struct rdpmc_ctx *leader_ctx))
+{
+	ctx->fd = syscall(__NR_perf_event_open, attr, 0, -1,
+			  leader_ctx ?
leader_ctx->fd : -1, 0); + if (ctx->fd < 0) { + perror("perf_event_open"); + return -1; + } + ctx->buf = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED, ctx->fd, 0); + if (ctx->buf == MAP_FAILED) { + close(ctx->fd); + perror("mmap on perf fd"); + return -1; + } + return 0; +} + +/** + * rdpmc_open - initialize a simple ring 3 readable performance counter + * @counter: Raw event descriptor (UUEE UU unit mask EE event) + * @ctx: Pointer to struct &rdpmc_ctx that is initialized + * + * The counter will be set up to count CPU events excluding the kernel. + * Must be called for each thread using the counter. + * The caller must make sure counter is suitable for the running CPU. + * Only works in 3.3+ kernels. + * Must be closed with rdpmc_close() + */ + +PSMI_ALWAYS_INLINE(int rdpmc_open(unsigned counter, struct rdpmc_ctx *ctx)) +{ + struct perf_event_attr attr = { + .type = counter > 10 ? PERF_TYPE_RAW : PERF_TYPE_HARDWARE, + .size = PERF_ATTR_SIZE_VER0, + .config = counter, + .sample_type = PERF_SAMPLE_READ, + .exclude_kernel = 1, + }; + return rdpmc_open_attr(&attr, ctx, NULL); +} + +/** + * rdpmc_close: free a ring 3 readable performance counter + * @ctx: Pointer to &rdpmc_ctx context. + * + * Must be called by each thread for each context it initialized. + */ +PSMI_ALWAYS_INLINE(void rdpmc_close(struct rdpmc_ctx *ctx)) +{ + close(ctx->fd); + munmap(ctx->buf, sysconf(_SC_PAGESIZE)); +} + +static void psmi_rdpmc_perf_framework_init() +{ + int rdpmc_retval; + + struct rdpmc_ctx *leader = NULL; + + int env_result = 1; + char * env_type = NULL; + char * env_config = NULL; + + env_type = getenv("RDPMC_PERF_TYPE"); + + if (env_type) + { + global_rdpmc_type = (int)strtoll(env_type, NULL, 16); + } + else + { + env_result = 0; + } + + env_config = getenv("RDPMC_PERF_CONFIG"); + + if (env_config) + { + global_rdpmc_config = (int)strtoll(env_config, NULL, 16); + } + else + { + env_result = 0; + } + + if (env_result != 1) + { + global_rdpmc_type = RDPMC_PERF_DEFAULT_TYPE; + global_rdpmc_config = RDPMC_PERF_DEFAULT_CONFIG; + } + + struct perf_event_attr attr = { + .type = global_rdpmc_type, + .size = sizeof(struct perf_event_attr), + .config = global_rdpmc_config, + .sample_type = PERF_SAMPLE_READ, + }; + + rdpmc_retval = rdpmc_open_attr(&attr, &global_rdpmc_ctx, leader); + + if (rdpmc_retval < 0) + { + printf("Unable to initialize RDPMC. Error: %d\n", rdpmc_retval); + exit(-1); + } +} + +/** + * rdpmc_read: read a ring 3 readable performance counter + * @ctx: Pointer to initialized &rdpmc_ctx structure. + * + * Read the current value of a running performance counter. + */ +unsigned long long rdpmc_read(struct rdpmc_ctx *ctx) +{ + static __thread int rdpmc_perf_initialized = 0; + + if_pf(!rdpmc_perf_initialized) + { + psmi_rdpmc_perf_framework_init(); + rdpmc_perf_initialized = 1; + } + + u64 val; + unsigned seq; + u64 offset = 0; + + typeof (ctx->buf) buf = ctx->buf; + do { + seq = buf->lock; + ips_rmb(); + if (buf->index <= 0) + return buf->offset; +#if defined(__ICC) || defined(__INTEL_COMPILER) + val = _rdpmc(buf->index - 1); +#else /* GCC */ + val = __builtin_ia32_rdpmc(buf->index - 1); +#endif + offset = buf->offset; + ips_rmb(); + } while (buf->lock != seq); + return val + offset; +} + +#endif /* RDPMC_PERF_FRAMEWORK */ diff --git a/psm_perf.h b/psm_perf.h new file mode 100644 index 0000000..b6b77f0 --- /dev/null +++ b/psm_perf.h @@ -0,0 +1,138 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. 
+ + GPL LICENSE SUMMARY + + Copyright(c) 2017 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#define PSM_TX_SPEEDPATH_CTR 0 +#define PSM_RX_SPEEDPATH_CTR 1 + +#ifdef RDPMC_PERF_FRAMEWORK + +/* Configuration */ + +#define RDPMC_PERF_MAX_SLOT_NUMBER (8) +#define RDPMC_PERF_MAX_SLOT_NAME (256) + +/* RDPMC infrastructure */ + +extern __thread struct rdpmc_ctx global_rdpmc_ctx; + +typedef unsigned long long u64; + +extern u64 global_rdpmc_begin[RDPMC_PERF_MAX_SLOT_NUMBER]; +extern u64 global_rdpmc_summ[RDPMC_PERF_MAX_SLOT_NUMBER]; +extern u64 global_rdpmc_number[RDPMC_PERF_MAX_SLOT_NUMBER]; + +extern char global_rdpmc_slot_name[RDPMC_PERF_MAX_SLOT_NUMBER][RDPMC_PERF_MAX_SLOT_NAME]; + +extern __thread unsigned int global_rdpmc_type; +extern __thread unsigned int global_rdpmc_config; + +extern unsigned long long rdpmc_read(struct rdpmc_ctx *ctx); + +#define RDPMC_PERF_INIT() \ +{ \ + int i; \ + for (i = 0; i < RDPMC_PERF_MAX_SLOT_NUMBER; i++) \ + { \ + global_rdpmc_begin[i] = 0; \ + global_rdpmc_summ[i] = 0; \ + global_rdpmc_number[i] = 0; \ + global_rdpmc_slot_name[i][0] = '\0'; \ + } \ +} + +/* There is no slot_number max range check */ + +#define RDPMC_PERF_SET_SLOT_NAME(slot_number, name) \ +{ \ + strncpy(global_rdpmc_slot_name[(slot_number)], (name), RDPMC_PERF_MAX_SLOT_NAME - 1); \ + global_rdpmc_slot_name[(slot_number)][RDPMC_PERF_MAX_SLOT_NAME - 1] = '\0'; \ +} + +#define RDPMC_PERF_BEGIN(slot_number) \ +{ \ + global_rdpmc_begin[(slot_number)] = rdpmc_read(&global_rdpmc_ctx); \ +} + +#define RDPMC_PERF_END(slot_number) \ +{ \ + global_rdpmc_summ[(slot_number)] += (rdpmc_read(&global_rdpmc_ctx) - global_rdpmc_begin[(slot_number)]); \ + global_rdpmc_number[(slot_number)]++; \ +} + +#define RDPMC_PERF_DUMP(stream) \ +{ \ + int i; \ + for (i = 0; i < RDPMC_PERF_MAX_SLOT_NUMBER; i++) \ + { \ + if (global_rdpmc_slot_name[i][0]) \ + { \ + fprintf((stream), "RDPMC [%s] (%x, %04x) avg = %g (%llu times)\n", \ + global_rdpmc_slot_name[i], global_rdpmc_type, global_rdpmc_config, \ + (double)global_rdpmc_summ[i] / global_rdpmc_number[i], global_rdpmc_number[i]); \ + fflush((stream)); \ + } \ + } \ +} + +#define GENERIC_PERF_INIT() RDPMC_PERF_INIT() +#define GENERIC_PERF_SET_SLOT_NAME(slot_number, name) RDPMC_PERF_SET_SLOT_NAME(slot_number, name) +#define GENERIC_PERF_BEGIN(slot_number) RDPMC_PERF_BEGIN(slot_number) +#define GENERIC_PERF_END(slot_number) RDPMC_PERF_END(slot_number) +#define GENERIC_PERF_DUMP(stream) RDPMC_PERF_DUMP(stream) +#else /* RDPMC_PERF_FRAMEWORK */ +#define GENERIC_PERF_INIT() +#define GENERIC_PERF_SET_SLOT_NAME(slot_number, name) +#define GENERIC_PERF_BEGIN(slot_number) +#define GENERIC_PERF_END(slot_number) +#define GENERIC_PERF_DUMP(stream) +#endif /* RDPMC_PERF_FRAMEWORK */ diff --git a/psm_stats.c b/psm_stats.c new file mode 100644 index 0000000..c9b5777 --- /dev/null +++ b/psm_stats.c @@ -0,0 +1,666 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm_mq_internal.h" + +struct psmi_stats_type { + STAILQ_ENTRY(psmi_stats_type) next; + struct psmi_stats_entry *entries; + + int num_entries; + void *heading; + uint32_t statstype; + void *context; +}; + +static STAILQ_HEAD(, psmi_stats_type) psmi_stats = +STAILQ_HEAD_INITIALIZER(psmi_stats); + +psm2_error_t +psmi_stats_register_type(const char *heading, + uint32_t statstype, + const struct psmi_stats_entry *entries_i, + int num_entries, void *context) +{ + struct psmi_stats_entry *entries; + struct psmi_stats_type *type; + int i; + psm2_error_t err = PSM2_OK; + + entries = + psmi_calloc(PSMI_EP_NONE, STATS, num_entries, + sizeof(struct psmi_stats_entry)); + type = + psmi_calloc(PSMI_EP_NONE, STATS, 1, sizeof(struct psmi_stats_type)); + PSMI_CHECKMEM(err, entries); + PSMI_CHECKMEM(err, type); + + type->entries = entries; + type->num_entries = num_entries; + type->statstype = statstype; + type->context = context; + type->heading = (char *)heading; + + for (i = 0; i < num_entries; i++) { + type->entries[i].desc = entries_i[i].desc; + type->entries[i].flags = entries_i[i].flags; + type->entries[i].getfn = entries_i[i].getfn; + type->entries[i].u.val = entries_i[i].u.val; + } + + STAILQ_INSERT_TAIL(&psmi_stats, type, next); + return err; + +fail: + if (entries) + psmi_free(entries); + if (type) + psmi_free(type); + return err; +} + +psm2_error_t psmi_stats_deregister_all(void) +{ + struct psmi_stats_type *type; + + /* Currently our mpi still reads stats after finalize so this isn't safe + * yet */ + while ((type = STAILQ_FIRST(&psmi_stats)) != NULL) { + STAILQ_REMOVE_HEAD(&psmi_stats, next); + psmi_free(type->entries); + psmi_free(type); + } + + return PSM2_OK; +} + +static uint32_t typestring_to_type(const char *typestr) +{ + if (strncasecmp(typestr, "all", 4) == 0) + return PSMI_STATSTYPE_ALL; + else if (strncasecmp(typestr, "p2p", 4) == 0) + return PSMI_STATSTYPE_P2P; + else if (strncasecmp(typestr, "hfi", 6) == 0) + return 
PSMI_STATSTYPE_HFI; + else if (strncasecmp(typestr, "ips", 4) == 0) + return PSMI_STATSTYPE_IPSPROTO; + else if ((strncasecmp(typestr, "intr", 5) == 0) || + (strncasecmp(typestr, "thread", 7) == 0) || + (strncasecmp(typestr, "rcvthread", 10) == 0)) + return PSMI_STATSTYPE_RCVTHREAD; + else if ((strncasecmp(typestr, "mq", 3) == 0) || + (strncasecmp(typestr, "mpi", 4) == 0)) + return PSMI_STATSTYPE_MQ; + else if ((strncasecmp(typestr, "tid", 4) == 0) || + (strncasecmp(typestr, "tids", 5) == 0)) + return PSMI_STATSTYPE_TIDS; + else if ((strncasecmp(typestr, "counter", 8) == 0) || + (strncasecmp(typestr, "counters", 9) == 0)) + return PSMI_STATSTYPE_DEVCOUNTERS; + else if (strncasecmp(typestr, "devstats", 9) == 0) + return PSMI_STATSTYPE_DEVSTATS; + else if ((strncasecmp(typestr, "memory", 7) == 0) || + (strncasecmp(typestr, "alloc", 6) == 0) || + (strncasecmp(typestr, "malloc", 7) == 0)) + return PSMI_STATSTYPE_MEMORY; + else + return 0; +} + +static uint32_t stats_parse_enabled_mask(const char *stats_string) +{ + char *b = (char *)stats_string; + char *e = b; + char buf[128]; + + uint32_t stats_enabled_mask = 0; + + while (*e) { + b = e; + while (*e && *e != ',' && *e != '+' && *e != '.' && + *e != '|' && *e != ':') + e++; + if (e > b) { /* something new to parse */ + int len = ((e - b) > (sizeof(buf) - 1)) ? + (sizeof(buf) - 1) : (e - b); + strncpy(buf, b, len); + buf[len] = '\0'; + stats_enabled_mask |= typestring_to_type(buf); + } + if (*e) + e++; /* skip delimiter */ + } + return stats_enabled_mask; +} + +static +void psmi_stats_mpspawn_callback(struct mpspawn_stats_req_args *args) +{ + const struct psmi_stats_entry *entry; + struct psmi_stats_type *type = (struct psmi_stats_type *)args->context; + int i, num = args->num; + uint64_t *stats = args->stats; + uint64_t *c = NULL; + uint64_t *s = NULL; + + psmi_assert(num == type->num_entries); + + if (type->statstype == PSMI_STATSTYPE_DEVCOUNTERS || + type->statstype == PSMI_STATSTYPE_DEVSTATS) { + int unit_id = ((psm2_ep_t) type->context)->unit_id; + int portno = ((psm2_ep_t) type->context)->portnum; + uintptr_t off; + uint8_t *p = NULL; + int nc, npc, ns; + int nstats = hfi_get_stats_names_count(); + int nctrs = hfi_get_ctrs_unit_names_count(unit_id); + int npctrs = hfi_get_ctrs_port_names_count(unit_id); + + if (nctrs != -1 && npctrs != -1) + c = psmi_calloc(PSMI_EP_NONE, STATS, nctrs + npctrs, + sizeof(uint64_t)); + if (nstats != -1) + s = psmi_calloc(PSMI_EP_NONE, STATS, nstats, + sizeof(uint64_t)); + + /* + * If hfifs is not loaded, we set NAN everywhere. 
We don't want + * stats to break just because 1 node didn't have hfi-stats + */ + if (type->statstype == PSMI_STATSTYPE_DEVCOUNTERS && c != NULL) { + nc = hfi_get_ctrs_unit(unit_id, c, nctrs); + if (nc != -1 && nc == nctrs) + p = (uint8_t *) c; + if (nc == -1) + nc = 0; + npc = + hfi_get_ctrs_port(unit_id, portno, c + nc, npctrs); + if (!p && npc > 0 && npc == npctrs) + p = (uint8_t *) c; + } else if (s != NULL) { + ns = hfi_get_stats(s, nstats); + if (ns != -1) + p = (uint8_t *) s; + } + for (i = 0; i < num; i++) { + entry = &type->entries[i]; + if (p) { + off = (uintptr_t) entry->u.off; + stats[i] = *((uint64_t *) (p + off)); + } else + stats[i] = MPSPAWN_NAN_U64; + } + } else if (type->statstype == PSMI_STATSTYPE_MEMORY) { + for (i = 0; i < num; i++) { + entry = &type->entries[i]; + stats[i] = + *(uint64_t *) ((uintptr_t) &psmi_stats_memory + + (uintptr_t) entry->u.off); + } + } else { + for (i = 0; i < num; i++) { + entry = &type->entries[i]; + if (entry->getfn != NULL) + stats[i] = entry->getfn(type->context); + else + stats[i] = *entry->u.val; + } + } + + if (c != NULL) + psmi_free(c); + if (s != NULL) + psmi_free(s); +} + +static +void +stats_register_mpspawn_single(mpspawn_stats_add_fn add_fn, + char *heading, + int num_entries, + struct psmi_stats_entry *entries, + mpspawn_stats_req_fn req_fn, void *context) +{ + int i; + struct mpspawn_stats_add_args mp_add; + + mp_add.version = MPSPAWN_STATS_VERSION; + mp_add.num = num_entries; + mp_add.header = heading; + mp_add.req_fn = req_fn; + mp_add.context = context; + + mp_add.desc = (char **)alloca(sizeof(char *) * num_entries); + + mp_add.flags = (uint16_t *) alloca(sizeof(uint16_t *) * num_entries); + + for (i = 0; i < num_entries; i++) { + mp_add.desc[i] = (char *)entries[i].desc; + mp_add.flags[i] = entries[i].flags; + } + + /* Ignore return code, doesn't matter to *us* if register failed */ + add_fn(&mp_add); + + return; +} + +static void stats_register_hfi_counters(psm2_ep_t ep); +static void stats_register_hfi_stats(psm2_ep_t ep); +static void stats_register_mem_stats(psm2_ep_t ep); +static psm2_error_t psmi_stats_epaddr_register(struct mpspawn_stats_init_args + *args); + +/* + * Downcall from QLogic MPI into PSM, so we can register stats + */ +void *psmi_stats_register(struct mpspawn_stats_init_args *args) +{ + struct psmi_stats_type *type; + uint32_t statsmask; + + /* + * Args has a version string in it, but we can ignore it since mpspawn + * will decide if it supports *our* version + */ + + /* + * Eventually, parse the stats_types to add various "flavours" of stats + */ + if (args->stats_types == NULL) + return NULL; + + statsmask = stats_parse_enabled_mask(args->stats_types); + + /* MQ (MPI-level) statistics */ + if (statsmask & PSMI_STATSTYPE_MQ) + psmi_mq_stats_register(args->mq, args->add_fn); + + /* PSM and hfi level statistics */ + if (statsmask & PSMI_STATSTYPE_DEVCOUNTERS) + stats_register_hfi_counters(args->mq->ep); + + if (statsmask & PSMI_STATSTYPE_DEVSTATS) + stats_register_hfi_stats(args->mq->ep); + + if (statsmask & PSMI_STATSTYPE_MEMORY) + stats_register_mem_stats(args->mq->ep); + + /* + * At this point all PSM and hfi-level components have registered stats + * with the PSM stats interface. 
We register with the mpspawn stats + * interface with an upcall in add_fn + */ + STAILQ_FOREACH(type, &psmi_stats, next) { + if (type->statstype & statsmask) + stats_register_mpspawn_single(args->add_fn, + type->heading, + type->num_entries, + type->entries, + psmi_stats_mpspawn_callback, + type); + } + + /* + * Special handling for per-endpoint statistics + * Only MPI knows what the endpoint-addresses are in the running program, + * PSM has no sense of MPI worlds. In stats register, MPI tells PSM how + * many endpoints it anticipates having and PSM simply reserves that amount + * of stats entries X the amount of per-endpoint stats. + */ + if (statsmask & PSMI_STATSTYPE_P2P) + psmi_stats_epaddr_register(args); + + return NULL; +} + +struct stats_epaddr { + psm2_ep_t ep; + mpspawn_map_epaddr_fn epaddr_map_fn; + int num_ep; + int num_ep_stats; +}; + +static +void psmi_stats_epaddr_callback(struct mpspawn_stats_req_args *args) +{ + int i, num, off; + uint64_t *statsp; + struct stats_epaddr *stats_ctx = (struct stats_epaddr *)args->context; + psm2_ep_t ep = stats_ctx->ep; + psm2_epaddr_t epaddr; + + num = stats_ctx->num_ep * stats_ctx->num_ep_stats; + + /* First always NAN the entire stats request */ + for (i = 0; i < num; i++) { + if (args->flags[i] & MPSPAWN_STATS_TYPE_DOUBLE) + args->stats[i] = MPSPAWN_NAN; + else + args->stats[i] = MPSPAWN_NAN_U64; + } + + for (i = 0; i < stats_ctx->num_ep; i++) { + statsp = args->stats + i * stats_ctx->num_ep_stats; + off = 0; + epaddr = stats_ctx->epaddr_map_fn(i); + if (epaddr == NULL) + continue; + + /* Self */ + if (&ep->ptl_self == epaddr->ptlctl) { + if (ep->ptl_self.epaddr_stats_get != NULL) + off += + ep->ptl_self.epaddr_stats_get(epaddr, + statsp + off); + } else { + if (ep->ptl_self.epaddr_stats_num != NULL) + off += ep->ptl_self.epaddr_stats_num(); + } + + /* Shm */ + if (&ep->ptl_amsh == epaddr->ptlctl) { + if (ep->ptl_amsh.epaddr_stats_get != NULL) + off += + ep->ptl_amsh.epaddr_stats_get(epaddr, + statsp + off); + } else { + if (ep->ptl_amsh.epaddr_stats_num != NULL) + off += ep->ptl_amsh.epaddr_stats_num(); + } + + /* ips */ + if (&ep->ptl_ips == epaddr->ptlctl) { + if (ep->ptl_ips.epaddr_stats_get != NULL) + off += + ep->ptl_ips.epaddr_stats_get(epaddr, + statsp + off); + } else { + if (ep->ptl_ips.epaddr_stats_num != NULL) + off += ep->ptl_ips.epaddr_stats_num(); + } + } + return; +} + +static +psm2_error_t +psmi_stats_epaddr_register(struct mpspawn_stats_init_args *args) +{ + int i = 0, j; + int num_ep = args->num_epaddr; + int num_ep_stats = 0; + int nz; + char **desc, **desc_i; + uint16_t *flags, *flags_i; + char *p; + char buf[128]; + psm2_ep_t ep; + struct mpspawn_stats_add_args mp_add; + struct stats_epaddr *stats_ctx; + psm2_error_t err = PSM2_OK; + + if (args->mq == NULL) + return PSM2_OK; + ep = args->mq->ep; + + /* Figure out how many stats there are in an endpoint from all devices */ + if (ep->ptl_self.epaddr_stats_num != NULL) + num_ep_stats += ep->ptl_self.epaddr_stats_num(); + if (ep->ptl_amsh.epaddr_stats_num != NULL) + num_ep_stats += ep->ptl_amsh.epaddr_stats_num(); + if (ep->ptl_ips.epaddr_stats_num != NULL) + num_ep_stats += ep->ptl_ips.epaddr_stats_num(); + + /* Allocate desc and flags and let each device initialize their + * descriptions and flags */ + desc = + psmi_malloc(ep, STATS, + sizeof(char *) * num_ep_stats * (num_ep + 1)); + if (desc == NULL) + return PSM2_NO_MEMORY; + flags = + psmi_malloc(ep, STATS, + sizeof(uint16_t) * num_ep_stats * (num_ep + 1)); + if (flags == NULL) { + psmi_free(desc); + return 
PSM2_NO_MEMORY;
+	}
+
+	/* Get the descriptions/flags from each device */
+	i = 0;
+	i += ep->ptl_self.epaddr_stats_num != NULL ?
+	    ep->ptl_self.epaddr_stats_init(desc + i, flags + i) : 0;
+	i += ep->ptl_amsh.epaddr_stats_num != NULL ?
+	    ep->ptl_amsh.epaddr_stats_init(desc + i, flags + i) : 0;
+	i += ep->ptl_ips.epaddr_stats_num != NULL ?
+	    ep->ptl_ips.epaddr_stats_init(desc + i, flags + i) : 0;
+	psmi_assert_always(i == num_ep_stats);
+
+	/*
+	 * Clone the descriptions for each endpoint but append "rank %d" to it
+	 * beforehand.
+	 */
+	nz = (num_ep < 10 ? 1 : (num_ep < 100 ? 2 :	/* cheap log */
+				 (num_ep < 1000 ? 3 : (num_ep < 10000 ? 4 :
+						       (num_ep <
+							100000 ? 5 : 6)))));
+
+	desc_i = desc + num_ep_stats;
+	flags_i = flags + num_ep_stats;
+	memset(desc_i, 0, sizeof(char *) * num_ep * num_ep_stats);
+
+	for (i = 0; i < num_ep; i++) {
+		for (j = 0; j < num_ep_stats; j++) {
+			snprintf(buf, sizeof(buf) - 1, "<%*d> %s", nz, i,
+				 desc[j]);
+			buf[sizeof(buf) - 1] = '\0';
+			p = psmi_strdup(ep, buf);
+			if (p == NULL) {
+				err = PSM2_NO_MEMORY;
+				goto clean;
+			}
+			desc_i[i * num_ep_stats + j] = p;
+			flags_i[i * num_ep_stats + j] = flags[j];
+		}
+	}
+
+	mp_add.version = MPSPAWN_STATS_VERSION;
+	mp_add.num = num_ep_stats * num_ep;
+	mp_add.header = "Endpoint-to-Endpoint Stats (by <rank>)";
+	mp_add.req_fn = psmi_stats_epaddr_callback;
+	mp_add.desc = desc_i;
+	mp_add.flags = flags_i;
+	stats_ctx = psmi_malloc(ep, STATS, sizeof(struct stats_epaddr));
+	if (stats_ctx == NULL) {
+		err = PSM2_NO_MEMORY;
+		goto clean;
+	}
+	stats_ctx->ep = ep;
+	stats_ctx->epaddr_map_fn = args->epaddr_map_fn;
+	stats_ctx->num_ep = num_ep;
+	stats_ctx->num_ep_stats = num_ep_stats;
+	mp_add.context = stats_ctx;
+
+	args->add_fn(&mp_add);
+
+clean:
+	/* Now we can free all the descriptions */
+	for (i = 0; i < num_ep; i++) {
+		for (j = 0; j < num_ep_stats; j++)
+			if (desc_i[i * num_ep_stats + j])
+				psmi_free(desc_i[i * num_ep_stats + j]);
+	}
+
+	psmi_free(desc);
+	psmi_free(flags);
+
+	return err;
+}
+
+static
+void stats_register_hfi_counters(psm2_ep_t ep)
+{
+	int i, nc, npc;
+	char *cnames = NULL, *pcnames = NULL;
+	struct psmi_stats_entry *entries = NULL;
+
+	nc = hfi_get_ctrs_unit_names(ep->unit_id, &cnames);
+	if (nc == -1 || cnames == NULL)
+		goto bail;
+	npc = hfi_get_ctrs_port_names(ep->unit_id, &pcnames);
+	if (npc == -1 || pcnames == NULL)
+		goto bail;
+	entries =
+	    psmi_calloc(ep, STATS, nc + npc, sizeof(struct psmi_stats_entry));
+	if (entries == NULL)
+		goto bail;
+
+	for (i = 0; i < nc; i++) {
+		entries[i].desc = hfi_get_next_name(&cnames);
+		entries[i].flags = MPSPAWN_STATS_REDUCTION_ALL |
+		    MPSPAWN_STATS_SKIP_IF_ZERO;
+		entries[i].getfn = NULL;
+		entries[i].u.off = i * sizeof(uint64_t);
+	}
+	for (i = nc; i < nc + npc; i++) {
+		entries[i].desc = hfi_get_next_name(&pcnames);
+		entries[i].flags = MPSPAWN_STATS_REDUCTION_ALL |
+		    MPSPAWN_STATS_SKIP_IF_ZERO;
+		entries[i].getfn = NULL;
+		entries[i].u.off = i * sizeof(uint64_t);
+	}
+	psmi_stats_register_type("OPA device counters",
+				 PSMI_STATSTYPE_DEVCOUNTERS,
+				 entries, nc + npc, ep);
+	return;
+
+bail:
+	if (cnames != NULL)
+		hfi_release_names(cnames);
+	if (pcnames != NULL)
+		hfi_release_names(pcnames);
+	if (entries != NULL)
+		psmi_free(entries);
+}
+
+static
+void stats_register_hfi_stats(psm2_ep_t ep)
+{
+	int i, ns;
+	char *snames = NULL;
+	struct psmi_stats_entry *entries = NULL;
+
+	ns = hfi_get_stats_names(&snames);
+	if (ns == -1 || snames == NULL)
+		goto bail;
+	entries = psmi_calloc(ep, STATS, ns, sizeof(struct psmi_stats_entry));
+	if (entries == NULL)
+		goto bail;
+
+	for (i = 0; i < ns; i++) {
+		entries[i].desc = hfi_get_next_name(&snames);
+		entries[i].flags = MPSPAWN_STATS_REDUCTION_ALL |
+		    MPSPAWN_STATS_SKIP_IF_ZERO;
+		entries[i].getfn = NULL;
+		entries[i].u.off = i * sizeof(uint64_t);
+	}
+	psmi_stats_register_type("OPA device statistics",
+				 PSMI_STATSTYPE_DEVSTATS, entries, ns, ep);
+	// psmi_stats_register_type makes its own copy of entries
+	// so we should free the entries buffer.
+	// The snames will be freed when we deregister the hfi.
+	psmi_free(entries);
+	return;
+
+bail:
+	if (snames != NULL)
+		hfi_release_names(snames);
+	if (entries != NULL)
+		psmi_free(entries);
+}
+
+#undef _SDECL
+#define _SDECL(_desc, _param) { \
+	    .desc  = _desc, \
+	    .flags = MPSPAWN_STATS_REDUCTION_ALL \
+		     | MPSPAWN_STATS_SKIP_IF_ZERO, \
+	    .getfn = NULL, \
+	    .u.off = offsetof(struct psmi_stats_malloc, _param) \
+	}
+
+static
+void stats_register_mem_stats(psm2_ep_t ep)
+{
+	struct psmi_stats_entry entries[] = {
+		_SDECL("Total (current)", m_all_total),
+		_SDECL("Total (max)", m_all_max),
+		_SDECL("All Peers (current)", m_perpeer_total),
+		_SDECL("All Peers (max)", m_perpeer_max),
+		_SDECL("Network Buffers (current)", m_netbufs_total),
+		_SDECL("Network Buffers (max)", m_netbufs_max),
+		_SDECL("PSM descriptors (current)", m_descriptors_total),
+		_SDECL("PSM descriptors (max)", m_descriptors_max),
+		_SDECL("Unexp. buffers (current)", m_unexpbufs_total),
+		_SDECL("Unexp. buffers (max)", m_unexpbufs_max),
+		_SDECL("Other (current)", m_undefined_total),
+		_SDECL("Other (max)", m_undefined_max),
+	};
+
+	psmi_stats_register_type("PSM memory allocation statistics",
+				 PSMI_STATSTYPE_MEMORY,
+				 entries, PSMI_STATS_HOWMANY(entries), ep);
+}
diff --git a/psm_stats.h b/psm_stats.h
new file mode 100644
index 0000000..9e9e0a9
--- /dev/null
+++ b/psm_stats.h
@@ -0,0 +1,120 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _PSMI_IN_USER_H +#error psm_stats.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSM_STATS_H +#define _PSM_STATS_H + +#include "mpspawn_stats.h" + +#define PSMI_STATSTYPE_MQ 0x00001 +#define PSMI_STATSTYPE_RCVTHREAD 0x00100 /* num_wakups, ratio, etc. */ +#define PSMI_STATSTYPE_IPSPROTO 0x00200 /* acks,naks,err_chks */ +#define PSMI_STATSTYPE_TIDS 0x00400 +#define PSMI_STATSTYPE_MEMORY 0x01000 +#define PSMI_STATSTYPE_HFI (PSMI_STATSTYPE_RCVTHREAD| \ + PSMI_STATSTYPE_IPSPROTO | \ + PSMI_STATSTYPE_MEMORY | \ + PSMI_STATSTYPE_TIDS) +#define PSMI_STATSTYPE_P2P 0x00800 /* ep-to-ep details */ +#define PSMI_STATSTYPE_DEVCOUNTERS 0x10000 +#define PSMI_STATSTYPE_DEVSTATS 0x20000 +#define PSMI_STATSTYPE_ALL 0xfffff +#define _PSMI_STATSTYPE_DEVMASK 0xf0000 + +/* Used to determine how many stats in static array decl. */ +#define PSMI_STATS_HOWMANY(entries) \ + (sizeof(entries)/sizeof(entries[0])) + +#define PSMI_STATS_NO_HEADING NULL + +#define PSMI_STATS_DECL(_desc, _flags, _getfn, _val) \ + { .desc = _desc, \ + .flags = _flags, \ + .getfn = _getfn, \ + .u.val = _val, \ + } + +#define PSMI_STATS_DECLU64(_desc, _val) \ + PSMI_STATS_DECL(_desc, \ + MPSPAWN_STATS_REDUCTION_ALL | MPSPAWN_STATS_SKIP_IF_ZERO, \ + NULL, \ + _val) + +struct psmi_stats_entry { + const char *desc; + uint16_t flags; + uint64_t(*getfn) (void *context); /* optional fn ptr to get value */ + union { + uint64_t *val; /* where value is stored if getfn is NULL */ + uint64_t off; /* of offset if that makes more sense */ + } u; +}; + +/* + * Copy the array of entries and keep track of the context + */ +psm2_error_t +psmi_stats_register_type(const char *heading, + uint32_t statstype, + const struct psmi_stats_entry *entries, + int num_entries, void *context); + +psm2_error_t psmi_stats_deregister_all(void); + +#endif /* PSM_STATS_H */ diff --git a/psm_sysbuf.c b/psm_sysbuf.c new file mode 100644 index 0000000..48fc06e --- /dev/null +++ b/psm_sysbuf.c @@ -0,0 +1,222 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. 
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+
+/*
+ *
+ * System buffer (unexpected message) allocator
+ *
+ */
+
+#define MM_FLAG_NONE  0
+#define MM_FLAG_TRANSIENT  0x1
+
+struct psmi_mem_block_ctrl {
+	union {
+		psmi_mem_ctrl_t *mem_handler;
+		struct psmi_mem_block_ctrl *next;
+	};
+};
+
+
+/* Per MQ allocators */
+void psmi_mq_sysbuf_init(psm2_mq_t mq)
+{
+	int i;
+	uint32_t block_sizes[] = {256, 512, 1024, 2048, 4096, 8192, (uint32_t)-1};
+	uint32_t replenishing_rate[] = {128, 64, 32, 16, 8, 4, 0};
+
+	if (mq->mem_ctrl_is_init)
+		return;
+	mq->mem_ctrl_is_init = 1;
+
+	for (i=0; i < MM_NUM_OF_POOLS; i++) {
+		mq->handler_index[i].block_size = block_sizes[i];
+		mq->handler_index[i].current_available = 0;
+		mq->handler_index[i].free_list = NULL;
+		mq->handler_index[i].total_alloc = 0;
+		mq->handler_index[i].replenishing_rate = replenishing_rate[i];
+
+		if (block_sizes[i] == -1) {
+			psmi_assert_always(replenishing_rate[i] == 0);
+			mq->handler_index[i].flags = MM_FLAG_TRANSIENT;
+		}
+		else {
+			psmi_assert_always(replenishing_rate[i] > 0);
+			mq->handler_index[i].flags = MM_FLAG_NONE;
+		}
+	}
+
+	/* Hit once on each block size so we have a pool that's allocated */
+	for (i=0; i < MM_NUM_OF_POOLS; i++) {
+		void *ptr;
+		if (block_sizes[i] == -1)
+			continue;
+		ptr = psmi_mq_sysbuf_alloc(mq, block_sizes[i]);
+		psmi_mq_sysbuf_free(mq, ptr);
+	}
+}
+
+void psmi_mq_sysbuf_fini(psm2_mq_t mq)	// free all buffers that are currently unused
+{
+	struct psmi_mem_block_ctrl *block;
+	int i;
+
+	if (mq->mem_ctrl_is_init == 0)
+		return;
+
+	for (i=0; i < MM_NUM_OF_POOLS; i++) {
+		while ((block = mq->handler_index[i].free_list) != NULL) {
+			mq->handler_index[i].free_list = block->next;
+			psmi_free(block);
+		}
+	}
+	mq->mem_ctrl_is_init = 0;
+}
+
+void psmi_mq_sysbuf_getinfo(psm2_mq_t mq, char *buf, size_t len)
+{
+	snprintf(buf, len-1, "Sysbuf consumption: %"PRIu64" bytes\n",
+		 mq->mem_ctrl_total_bytes);
+	buf[len-1] = '\0';
+	return;
+}
+
+void *psmi_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t alloc_size)
+void *psmi_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t alloc_size)
+{
+    psmi_mem_ctrl_t *mm_handler = mq->handler_index;
+    struct psmi_mem_block_ctrl *new_block;
+    int replenishing;
+
+    /* There is a timing race with ips initialization, fix later.
+     * XXX */
+    if (!mq->mem_ctrl_is_init)
+        psmi_mq_sysbuf_init(mq);
+
+    mq->stats.rx_sysbuf_num++;
+    mq->stats.rx_sysbuf_bytes += alloc_size;
+
+    while (mm_handler->block_size < alloc_size)
+        mm_handler++;
+
+    replenishing = mm_handler->replenishing_rate;
+
+    if (mm_handler->current_available == 0) { // allocate more buffers
+        if (mm_handler->flags & MM_FLAG_TRANSIENT) {
+            uint32_t newsz = alloc_size + sizeof(struct psmi_mem_block_ctrl);
+            new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz);
+
+            if (new_block) {
+                new_block->mem_handler = mm_handler;
+                new_block++;
+                mm_handler->total_alloc++;
+                mq->mem_ctrl_total_bytes += newsz;
+            }
+            return new_block;
+        }
+
+        do {
+            uint32_t newsz = mm_handler->block_size + sizeof(struct psmi_mem_block_ctrl);
+
+            new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz);
+
+            if (new_block) {
+                /* only account for bytes actually allocated */
+                mq->mem_ctrl_total_bytes += newsz;
+                mm_handler->current_available++;
+                mm_handler->total_alloc++;
+
+                new_block->next = mm_handler->free_list;
+                mm_handler->free_list = new_block;
+            }
+
+        } while (--replenishing && new_block);
+    }
+
+    if (mm_handler->current_available) {
+        mm_handler->current_available--;
+
+        new_block = mm_handler->free_list;
+        mm_handler->free_list = new_block->next;
+
+        new_block->mem_handler = mm_handler;
+        new_block++;
+
+        return new_block;
+    }
+    return NULL;
+}
+
+void psmi_mq_sysbuf_free(psm2_mq_t mq, void *mem_to_free)
+{
+    struct psmi_mem_block_ctrl *block_to_free;
+    psmi_mem_ctrl_t *mm_handler;
+
+    psmi_assert_always(mq->mem_ctrl_is_init);
+
+    block_to_free = (struct psmi_mem_block_ctrl *)mem_to_free - 1;
+    mm_handler = block_to_free->mem_handler;
+
+    if (mm_handler->flags & MM_FLAG_TRANSIENT) {
+        psmi_free(block_to_free);
+    } else {
+        block_to_free->next = mm_handler->free_list;
+        mm_handler->free_list = block_to_free;
+        mm_handler->current_available++;
+    }
+
+    return;
+}
diff --git a/psm_sysbuf.h b/psm_sysbuf.h
new file mode 100644
index 0000000..07ab593
--- /dev/null
+++ b/psm_sysbuf.h
@@ -0,0 +1,81 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef SYSBUF_INT_H +#define SYSBUF_INT_H + +#include "psm_user.h" + +#define MM_NUM_OF_POOLS 7 + +typedef struct psmi_mem_ctrl { + struct psmi_mem_block_ctrl *free_list; + uint32_t total_alloc; + uint32_t current_available; + uint32_t block_size; + uint32_t flags; + uint32_t replenishing_rate; +} psmi_mem_ctrl_t; + +/* + * MQ unexpected buffer management + */ +void psmi_mq_sysbuf_init(psm2_mq_t mq); +void psmi_mq_sysbuf_fini(psm2_mq_t mq); +void* psmi_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t nbytes); +void psmi_mq_sysbuf_free(psm2_mq_t mq, void *); +void psmi_mq_sysbuf_getinfo(psm2_mq_t mq, char *buf, size_t len); + +#endif /* SYSBUF_INT_H */ diff --git a/psm_timer.c b/psm_timer.c new file mode 100644 index 0000000..9a8dddd --- /dev/null +++ b/psm_timer.c @@ -0,0 +1,198 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" + +#if PSMI_TIMER_STATS +# define PSMI_TIMER_STATS_ADD_INSERTION(ctrl) ((ctrl)->num_insertions++) +# define PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl) ((ctrl)->num_traversals++) +#else +# define PSMI_TIMER_STATS_ADD_INSERTION(ctrl) +# define PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl) +#endif + +psm2_error_t psmi_timer_init(struct psmi_timer_ctrl *ctrl) +{ + ctrl->t_cyc_next_expire = PSMI_TIMER_INFINITE; + +#if PSMI_TIMER_STATS + ctrl->num_insertions = 0; + ctrl->num_traversals = 0; +#endif + + TAILQ_INIT(&ctrl->timerq); + return PSM2_OK; +} + +psm2_error_t psmi_timer_fini(struct psmi_timer_ctrl *ctrl) +{ +#if PSMI_TIMER_STATS + if (ctrl->num_insertions > 0) { + _HFI_INFO("avg elem traversals/insertion = %3.2f %%\n", + 100.0 * (double)ctrl->num_traversals / + ctrl->num_insertions); + } +#endif + return PSM2_OK; +} + +void +psmi_timer_request_always(struct psmi_timer_ctrl *ctrl, + struct psmi_timer *t_insert, uint64_t t_cyc_expire) +{ + struct psmi_timer *t_cursor; + + psmi_assert(!(t_insert->flags & PSMI_TIMER_FLAG_PENDING)); + + t_insert->t_timeout = t_cyc_expire; + t_insert->flags |= PSMI_TIMER_FLAG_PENDING; + + /* + * We keep the list from oldest (head) to newest (tail), with the + * assumption that insert and remove occur much more often than search + * (when the timer expires). Newly added timers are more likely to expire + * later rather than sooner, which is why the head is older. 
+ */ + PSMI_TIMER_STATS_ADD_INSERTION(ctrl); + + if (TAILQ_EMPTY(&ctrl->timerq)) { /* Common case */ + TAILQ_INSERT_TAIL(&ctrl->timerq, t_insert, timer); + ctrl->t_cyc_next_expire = t_cyc_expire; + PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl); + return; + } else if (t_cyc_expire > PSMI_TIMER_PRIO_LAST) { + TAILQ_FOREACH(t_cursor, &ctrl->timerq, timer) { + if (t_cursor->t_timeout <= t_cyc_expire) { + TAILQ_INSERT_BEFORE(t_cursor, t_insert, timer); + return; + } + PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl); + } + /* Got to the end of the list -- We're the next to expire */ + ctrl->t_cyc_next_expire = t_cyc_expire; + TAILQ_INSERT_TAIL(&ctrl->timerq, t_insert, timer); + return; + } else { + TAILQ_FOREACH_REVERSE(t_cursor, &ctrl->timerq, timerq, timer) { + if (t_cursor->t_timeout >= t_cyc_expire) { + TAILQ_INSERT_AFTER(&ctrl->timerq, t_cursor, + t_insert, timer); + ctrl->t_cyc_next_expire = + min(t_cyc_expire, ctrl->t_cyc_next_expire); + return; + } + PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl); + } + TAILQ_INSERT_HEAD(&ctrl->timerq, t_insert, timer); + /* No need to check if we inserted last, given first branch case */ + /* if (TAILQ_LAST(&ctrl->timerq, timerq) == t_insert) */ + /* ctrl->t_cyc_next_expire = t_cyc_expire; */ + return; + } + + return; +} + +psm2_error_t +psmi_timer_process_expired(struct psmi_timer_ctrl *ctrl, uint64_t t_cyc_expire) +{ + psm2_error_t err = PSM2_OK_NO_PROGRESS; + struct psmi_timer *t_cursor = TAILQ_LAST(&ctrl->timerq, timerq); + + PSM2_LOG_MSG("entering"); + + while (t_cursor) { + if (t_cursor->t_timeout > t_cyc_expire) + break; + + err = PSM2_OK; + psmi_assert(t_cursor->flags & PSMI_TIMER_FLAG_PENDING); + t_cursor->flags &= ~PSMI_TIMER_FLAG_PENDING; + TAILQ_REMOVE(&ctrl->timerq, t_cursor, timer); + t_cursor->expire_callback(t_cursor, t_cyc_expire); + t_cursor = TAILQ_PREV(t_cursor, timerq, timer); + } + + if (TAILQ_EMPTY(&ctrl->timerq)) + ctrl->t_cyc_next_expire = PSMI_TIMER_INFINITE; + else + ctrl->t_cyc_next_expire = + TAILQ_LAST(&ctrl->timerq, timerq)->t_timeout; + + PSM2_LOG_MSG("leaving"); + return err; +} + +void +psmi_timer_cancel_inner(struct psmi_timer_ctrl *ctrl, + struct psmi_timer *t_remove) +{ + + psmi_assert(t_remove->flags & PSMI_TIMER_FLAG_PENDING); + + t_remove->flags &= ~PSMI_TIMER_FLAG_PENDING; + TAILQ_REMOVE(&ctrl->timerq, t_remove, timer); + + /* + * If we're removing the last entry, we need to reset the + * expiration cycle time. + */ + if (TAILQ_EMPTY(&ctrl->timerq)) + ctrl->t_cyc_next_expire = PSMI_TIMER_INFINITE; + else + ctrl->t_cyc_next_expire = + TAILQ_LAST(&ctrl->timerq, timerq)->t_timeout; + return; +} diff --git a/psm_timer.h b/psm_timer.h new file mode 100644 index 0000000..8c03d18 --- /dev/null +++ b/psm_timer.h @@ -0,0 +1,160 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _PSMI_IN_USER_H +#error psm_timer.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSMI_TIMER_H +#define _PSMI_TIMER_H + + +typedef struct psmi_timer psmi_timer; +typedef psm2_error_t(*psmi_timer_expire_callback_t) (struct psmi_timer *, + uint64_t); + +struct psmi_timer { + TAILQ_ENTRY(psmi_timer) timer; /* opaque */ + uint64_t t_timeout; /* opaque */ + uint8_t flags; /* opaque */ + + psmi_timer_expire_callback_t expire_callback; /* user -- callback fn */ + void *context; /* user -- callback param */ +}; + +struct psmi_timer_ctrl { + uint64_t t_cyc_next_expire; + TAILQ_HEAD(timerq, psmi_timer) timerq; + +#if PSMI_TIMER_STATS + uint64_t num_insertions; + uint64_t num_traversals; +#endif +}; + +/* + * Some events need to be unconditionally enqueued at the beginning of the + * timerq -- they are not timers meant to expire but merely operations that + * need to be delayed. For delayed operations, there are 5 levels of + * priority. + */ +#define PSMI_TIMER_PRIO_0 0ULL +#define PSMI_TIMER_PRIO_1 1ULL +#define PSMI_TIMER_PRIO_2 2ULL +#define PSMI_TIMER_PRIO_3 3ULL +#define PSMI_TIMER_PRIO_4 4ULL +#define PSMI_TIMER_PRIO_LAST PSMI_TIMER_PRIO_4 + +#define PSMI_TIMER_INFINITE 0xFFFFFFFFFFFFFFFFULL +#define PSMI_TIMER_FLAG_PENDING 0x01 + +/* + * Timer control initialization and finalization + */ +psm2_error_t psmi_timer_init(struct psmi_timer_ctrl *ctrl); +psm2_error_t psmi_timer_fini(struct psmi_timer_ctrl *ctrl); + +/* + * Timer entry initialization (a timer must be initialized before it can be + * added to the timer request queue). 
+ */ + +PSMI_ALWAYS_INLINE( +void +psmi_timer_entry_init(struct psmi_timer *t_init, + psmi_timer_expire_callback_t expire_fn, + void *context)) +{ + t_init->flags = 0; + t_init->expire_callback = expire_fn; + t_init->context = context; + return; +} + +/* + * Timer requests, conditional (macro) or unconditional + */ +#define psmi_timer_request(ctrl, t_insert, t_cyc) \ + if (!((t_insert)->flags & PSMI_TIMER_FLAG_PENDING)) \ + psmi_timer_request_always((ctrl), (t_insert), (t_cyc)) + +void psmi_timer_request_always(struct psmi_timer_ctrl *ctrl, + struct psmi_timer *t_insert, + uint64_t t_cyc_expire); + +/* + * Timer cancelations, conditional (macro) only (cancel_inner is internal) + */ +#define psmi_timer_cancel(ctrl, t_remove) \ + if ((t_remove)->flags & PSMI_TIMER_FLAG_PENDING) \ + psmi_timer_cancel_inner(ctrl, t_remove) +void psmi_timer_cancel_inner(struct psmi_timer_ctrl *ctrl, + struct psmi_timer *t_remove); + +/* + * Timer processing, conditional or unconditional. + */ +#define psmi_timer_process_if_expired(ctrl, t_cyc_expire) \ + (((ctrl)->t_cyc_next_expire <= (t_cyc_expire)) ? \ + psmi_timer_process_expired(ctrl, t_cyc_expire) : \ + PSM2_OK_NO_PROGRESS) + +#define psmi_timer_is_expired(ctrl, t_cyc_expire) \ + ((ctrl)->t_cyc_next_expire <= (t_cyc_expire)) + +psm2_error_t psmi_timer_process_expired(struct psmi_timer_ctrl *ctrl, + uint64_t t_cyc_expire); + +#endif /* _PSMI_TIMER_H */ diff --git a/psm_user.h b/psm_user.h new file mode 100644 index 0000000..5a35085 --- /dev/null +++ b/psm_user.h @@ -0,0 +1,487 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_USER_H
+#define _PSMI_USER_H
+
+#include "psm_config.h"
+#include <inttypes.h>
+#include <pthread.h>
+
+#include <sched.h>
+#include <numa.h>
+#include <semaphore.h>
+#include <fcntl.h>
+
+#include "psm2.h"
+#include "psm2_mq.h"
+
+#include "ptl.h"
+
+#include "opa_user.h"
+#include "opa_queue.h"
+
+#include "psm_log.h"
+#include "psm_perf.h"
+
+#define PSMI_LOCK_NO_OWNER ((pthread_t)(-1))
+
+#define _PSMI_IN_USER_H
+
+/* Opaque hw context pointer used in HAL,
+   and defined by each HAL instance. */
+typedef void *psmi_hal_hw_context;
+
+#include "psm_help.h"
+#include "psm_error.h"
+#include "psm_context.h"
+#include "psm_utils.h"
+#include "psm_timer.h"
+#include "psm_mpool.h"
+#include "psm_ep.h"
+#include "psm_lock.h"
+#include "psm_stats.h"
+#include "psm2_mock_testing.h"
+
+#undef _PSMI_IN_USER_H
+
+#define PSMI_VERNO_MAKE(major, minor) ((((major)&0xff)<<8)|((minor)&0xff))
+#define PSMI_VERNO PSMI_VERNO_MAKE(PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR)
+#define PSMI_VERNO_GET_MAJOR(verno) (((verno)>>8) & 0xff)
+#define PSMI_VERNO_GET_MINOR(verno) (((verno)>>0) & 0xff)
+
+int psmi_verno_client();
+int psmi_verno_isinteroperable(uint16_t verno);
+int MOCKABLE(psmi_isinitialized)();
+MOCK_DCL_EPILOGUE(psmi_isinitialized);
+
+psm2_error_t psmi_poll_internal(psm2_ep_t ep, int poll_amsh);
+psm2_error_t psmi_mq_wait_internal(psm2_mq_req_t *ireq);
+
+int psmi_get_current_proc_location();
+
+extern int psmi_epid_ver;
+extern uint32_t non_dw_mul_sdma;
+extern psmi_lock_t psmi_creation_lock;
+extern psm2_ep_t psmi_opened_endpoint;
+
+extern int psmi_affinity_shared_file_opened;
+extern uint64_t *shared_affinity_ptr;
+extern char *affinity_shm_name;
+
+extern sem_t *sem_affinity_shm_rw;
+extern int psmi_affinity_semaphore_open;
+extern char *sem_affinity_shm_rw_name;
+
+PSMI_ALWAYS_INLINE(
+int
+_psmi_get_epid_version()) {
+    return psmi_epid_ver;
+}
+
+#define PSMI_EPID_VERSION_SHM 0
+#define PSMI_EPID_SHM_ONLY 1
+#define PSMI_EPID_IPS_SHM 0
+#define PSMI_EPID_VERSION _psmi_get_epid_version()
+#define PSMI_MAX_EPID_VERNO_SUPPORTED 2
+#define PSMI_MIN_EPID_VERNO_SUPPORTED 1
+#define PSMI_EPID_VERNO_DEFAULT 2
+#define PSMI_EPID_V1 1
+#define PSMI_EPID_V2 2
+
+#define PSMI_EPID_GET_LID(epid) (PSMI_EPID_VERSION == PSMI_EPID_V1) ? \
+					(int)PSMI_EPID_GET_LID_V1(epid) \
+					: (int)PSMI_EPID_GET_LID_V2(epid)
+
+#define PSMI_GET_SUBNET_ID(gid_hi) (gid_hi & 0xffff)
+
+/*
+ * Following is the definition of various lock implementations.  The
+ * choice is made by defining a specific lock type in the relevant
+ * section of psm_config.h.
+ */
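+/*
+ * Illustrative example: with PSMI_LOCK_IS_MUTEXLOCK_DEBUG configured, a
+ * sequence such as
+ *
+ *     PSMI_LOCK(psmi_creation_lock);
+ *     ...critical section...
+ *     PSMI_UNLOCK(psmi_creation_lock);
+ *
+ * expands to the owner-tracking wrappers defined below, so a recursive
+ * acquire or an unlock from a non-owning thread trips an assertion at
+ * the source location passed in via PSMI_CURLOC.
+ */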
+#ifdef PSMI_LOCK_IS_SPINLOCK
+#define _PSMI_LOCK_INIT(pl) psmi_spin_init(&((pl).lock))
+#define _PSMI_LOCK_TRY(pl) psmi_spin_trylock(&((pl).lock))
+#define _PSMI_LOCK(pl) psmi_spin_lock(&((pl).lock))
+#define _PSMI_UNLOCK(pl) psmi_spin_unlock(&((pl).lock))
+#define _PSMI_LOCK_ASSERT(pl)
+#define _PSMI_UNLOCK_ASSERT(pl)
+#define PSMI_LOCK_DISABLED 0
+
+#elif defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG)
+
+PSMI_ALWAYS_INLINE(
+int
+_psmi_mutex_trylock_inner(pthread_mutex_t *mutex,
+                          const char *curloc, pthread_t *lock_owner))
+{
+    psmi_assert_always_loc(*lock_owner != pthread_self(),
+                           curloc);
+    int ret = pthread_mutex_trylock(mutex);
+    if (ret == 0)
+        *lock_owner = pthread_self();
+    return ret;
+}
+
+PSMI_ALWAYS_INLINE(
+int
+_psmi_mutex_lock_inner(pthread_mutex_t *mutex,
+                       const char *curloc, pthread_t *lock_owner))
+{
+    psmi_assert_always_loc(*lock_owner != pthread_self(),
+                           curloc);
+    int ret = pthread_mutex_lock(mutex);
+    psmi_assert_always_loc(ret != EDEADLK, curloc);
+    *lock_owner = pthread_self();
+    return ret;
+}
+
+PSMI_ALWAYS_INLINE(
+void
+_psmi_mutex_unlock_inner(pthread_mutex_t *mutex,
+                         const char *curloc, pthread_t *lock_owner))
+{
+    psmi_assert_always_loc(*lock_owner == pthread_self(),
+                           curloc);
+    *lock_owner = PSMI_LOCK_NO_OWNER;
+    psmi_assert_always_loc(pthread_mutex_unlock(mutex) !=
+                           EPERM, curloc);
+    return;
+}
+
+#define _PSMI_LOCK_INIT(pl) /* static initialization */
+#define _PSMI_LOCK_TRY(pl) \
+	    _psmi_mutex_trylock_inner(&((pl).lock), PSMI_CURLOC, \
+					&((pl).lock_owner))
+#define _PSMI_LOCK(pl) \
+	    _psmi_mutex_lock_inner(&((pl).lock), PSMI_CURLOC, \
+					&((pl).lock_owner))
+#define _PSMI_UNLOCK(pl) \
+	    _psmi_mutex_unlock_inner(&((pl).lock), PSMI_CURLOC, \
+					&((pl).lock_owner))
+#define _PSMI_LOCK_ASSERT(pl) \
+	psmi_assert_always((pl).lock_owner == pthread_self());
+#define _PSMI_UNLOCK_ASSERT(pl) \
+	psmi_assert_always((pl).lock_owner != pthread_self());
+#define PSMI_LOCK_DISABLED 0
+
+#elif defined(PSMI_LOCK_IS_MUTEXLOCK)
+#define _PSMI_LOCK_INIT(pl) /* static initialization */
+#define _PSMI_LOCK_TRY(pl) pthread_mutex_trylock(&((pl).lock))
+#define _PSMI_LOCK(pl) pthread_mutex_lock(&((pl).lock))
+#define _PSMI_UNLOCK(pl) pthread_mutex_unlock(&((pl).lock))
+#define PSMI_LOCK_DISABLED 0
+#define _PSMI_LOCK_ASSERT(pl)
+#define _PSMI_UNLOCK_ASSERT(pl)
+
+#elif defined(PSMI_PLOCK_IS_NOLOCK)
+#define _PSMI_LOCK_TRY(pl) 0 /* 0 *only* so progress thread never succeeds */
+#define _PSMI_LOCK(pl)
+#define _PSMI_UNLOCK(pl)
+#define PSMI_LOCK_DISABLED 1
+#define _PSMI_LOCK_ASSERT(pl)
+#define _PSMI_UNLOCK_ASSERT(pl)
+#else
+#error No LOCK lock type declared
+#endif
+
+#define PSMI_YIELD(pl) \
+	do { _PSMI_UNLOCK((pl)); sched_yield(); _PSMI_LOCK((pl)); } while (0)
+
+#ifdef PSM2_MOCK_TESTING
+/* If this is a mocking tests build, all the operations on the locks
+ * are routed through functions which may be mocked, if necessary. */
+void MOCKABLE(psmi_mockable_lock_init)(psmi_lock_t *pl);
+MOCK_DCL_EPILOGUE(psmi_mockable_lock_init);
+
+int MOCKABLE(psmi_mockable_lock_try)(psmi_lock_t *pl);
+MOCK_DCL_EPILOGUE(psmi_mockable_lock_try);
+
+void MOCKABLE(psmi_mockable_lock)(psmi_lock_t *pl);
+MOCK_DCL_EPILOGUE(psmi_mockable_lock);
+
+void MOCKABLE(psmi_mockable_unlock)(psmi_lock_t *pl);
+MOCK_DCL_EPILOGUE(psmi_mockable_unlock);
+
+void MOCKABLE(psmi_mockable_lock_assert)(psmi_lock_t *pl);
+MOCK_DCL_EPILOGUE(psmi_mockable_lock_assert);
+
+void MOCKABLE(psmi_mockable_unlock_assert)(psmi_lock_t *pl);
+MOCK_DCL_EPILOGUE(psmi_mockable_unlock_assert);
+
+#define PSMI_LOCK_INIT(pl) psmi_mockable_lock_init(&(pl))
+#define PSMI_LOCK_TRY(pl) psmi_mockable_lock_try(&(pl))
+#define PSMI_LOCK(pl) psmi_mockable_lock(&(pl))
+#define PSMI_UNLOCK(pl) psmi_mockable_unlock(&(pl))
+#define PSMI_LOCK_ASSERT(pl) psmi_mockable_lock_assert(&(pl))
+#define PSMI_UNLOCK_ASSERT(pl) psmi_mockable_unlock_assert(&(pl))
+#else
+#define PSMI_LOCK_INIT(pl) _PSMI_LOCK_INIT(pl)
+#define PSMI_LOCK_TRY(pl) _PSMI_LOCK_TRY(pl)
+#define PSMI_LOCK(pl) _PSMI_LOCK(pl)
+#define PSMI_UNLOCK(pl) _PSMI_UNLOCK(pl)
+#define PSMI_LOCK_ASSERT(pl) _PSMI_LOCK_ASSERT(pl)
+#define PSMI_UNLOCK_ASSERT(pl) _PSMI_UNLOCK_ASSERT(pl)
+#endif
+
+#ifdef PSM_PROFILE
+void psmi_profile_block() __attribute__ ((weak));
+void psmi_profile_unblock() __attribute__ ((weak));
+void psmi_profile_reblock(int did_no_progress) __attribute__ ((weak));
+
+#define PSMI_PROFILE_BLOCK() psmi_profile_block()
+#define PSMI_PROFILE_UNBLOCK() psmi_profile_unblock()
+#define PSMI_PROFILE_REBLOCK(noprog) psmi_profile_reblock(noprog)
+#else
+#define PSMI_PROFILE_BLOCK()
+#define PSMI_PROFILE_UNBLOCK()
+#define PSMI_PROFILE_REBLOCK(noprog)
+#endif
+
+#ifdef PSM_CUDA
+#include <cuda.h>
+#include <dlfcn.h>
+
+#if CUDA_VERSION < 7000
+#error Please update CUDA driver, required minimum version is 7.0
+#endif
+
+extern int is_cuda_enabled;
+extern int is_gdr_copy_enabled;
+extern int device_support_gpudirect;
+extern int cuda_lib_version;
+
+extern CUcontext ctxt;
+void *psmi_cuda_lib;
+CUresult (*psmi_cuInit)(unsigned int Flags );
+CUresult (*psmi_cuCtxDetach)(CUcontext c);
+CUresult (*psmi_cuCtxSetCurrent)(CUcontext c);
+CUresult (*psmi_cuCtxGetCurrent)(CUcontext *c);
+CUresult (*psmi_cuPointerGetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p);
+CUresult (*psmi_cuPointerSetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p);
+CUresult (*psmi_cuDeviceGet)(CUdevice* device, int ordinal);
+CUresult (*psmi_cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev);
+CUresult (*psmi_cuDriverGetVersion)(int* driverVersion);
+CUresult (*psmi_cuDeviceGetCount)(int* count);
+CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags);
+CUresult (*psmi_cuStreamDestroy)(CUstream phStream);
+CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags);
+CUresult (*psmi_cuEventDestroy)(CUevent hEvent);
+CUresult (*psmi_cuEventQuery)(CUevent hEvent);
+CUresult (*psmi_cuEventRecord)(CUevent hEvent, CUstream hStream);
+CUresult (*psmi_cuEventSynchronize)(CUevent hEvent);
+CUresult (*psmi_cuMemHostAlloc)(void** pp, size_t bytesize, unsigned int Flags);
+CUresult (*psmi_cuMemFreeHost)(void* p);
+CUresult (*psmi_cuMemcpy)(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
+CUresult (*psmi_cuMemcpyDtoD)(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
+CUresult (*psmi_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount);
+CUresult (*psmi_cuMemcpyHtoD)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount);
+CUresult (*psmi_cuMemcpyDtoHAsync)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+CUresult (*psmi_cuMemcpyHtoDAsync)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount, CUstream hStream);
+CUresult (*psmi_cuIpcGetMemHandle)(CUipcMemHandle* pHandle, CUdeviceptr dptr);
+CUresult (*psmi_cuIpcOpenMemHandle)(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags);
+CUresult (*psmi_cuIpcCloseMemHandle)(CUdeviceptr dptr);
+CUresult (*psmi_cuMemGetAddressRange)(CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr);
+CUresult (*psmi_cuDevicePrimaryCtxGetState)(CUdevice dev, unsigned int* flags, int* active);
+CUresult (*psmi_cuDevicePrimaryCtxRetain)(CUcontext* pctx, CUdevice dev);
+CUresult (*psmi_cuCtxGetDevice)(CUdevice* device);
+CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device);
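+/*
+ * Sketch of how the table above is expected to be populated (illustrative;
+ * the actual loading happens elsewhere in the library): the CUDA driver
+ * library is dlopen()ed and each symbol is resolved with the
+ * PSMI_CUDA_DLSYM() helper defined below, after which PSMI_CUDA_CALL()
+ * dispatches through the pointers:
+ *
+ *     psmi_cuda_lib = dlopen("libcuda.so.1", RTLD_LAZY);
+ *     PSMI_CUDA_DLSYM(psmi_cuda_lib, cuInit);
+ *     PSMI_CUDA_CALL(cuInit, 0);
+ */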
+#define PSMI_CUDA_CALL(func, args...) do { \
+		CUresult cudaerr; \
+		cudaerr = psmi_##func(args); \
+		if (cudaerr != CUDA_SUCCESS) { \
+			if (ctxt == NULL) \
+				_HFI_ERROR( \
+				"Check if CUDA is initialized " \
+				"before psm2_ep_open call\n"); \
+			_HFI_ERROR( \
+				"CUDA failure: %s() (at %s:%d) " \
+				"returned %d\n", \
+				#func, __FILE__, __LINE__, cudaerr); \
+			psmi_handle_error( \
+				PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \
+				"Error returned from CUDA function.\n");\
+		} \
+	} while (0)
+
+#define PSMI_CUDA_CHECK_EVENT(event, cudaerr) do { \
+		cudaerr = psmi_cuEventQuery(event); \
+		if ((cudaerr != CUDA_SUCCESS) && \
+		    (cudaerr != CUDA_ERROR_NOT_READY)) { \
+			_HFI_ERROR( \
+				"CUDA failure: %s() returned %d\n", \
+				"cuEventQuery", cudaerr); \
+			psmi_handle_error( \
+				PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \
+				"Error returned from CUDA function.\n");\
+		} \
+	} while (0)
+
+#define PSMI_CUDA_DLSYM(psmi_cuda_lib,func) do { \
+	psmi_##func = dlsym(psmi_cuda_lib, STRINGIFY(func)); \
+	if (!psmi_##func) { \
+		psmi_handle_error(PSMI_EP_NORETURN, \
+				  PSM2_INTERNAL_ERR, \
+				  " Unable to resolve %s symbol" \
+				  " in CUDA libraries.\n",STRINGIFY(func));\
+	} \
+} while (0)
+
+PSMI_ALWAYS_INLINE(
+int
+_psmi_is_cuda_mem(void *ptr))
+{
+    CUresult cres;
+    CUmemorytype mt;
+    unsigned uvm = 0;
+    cres = psmi_cuPointerGetAttribute(
+        &mt, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr) ptr);
+    if ((cres == CUDA_SUCCESS) && (mt == CU_MEMORYTYPE_DEVICE)) {
+        cres = psmi_cuPointerGetAttribute(
+            &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr);
+        if ((cres == CUDA_SUCCESS) && (uvm == 0))
+            return 1;
+        else
+            return 0;
+    } else
+        return 0;
+}
+
+PSMI_ALWAYS_INLINE(
+int
+_psmi_is_cuda_enabled())
+{
+    return is_cuda_enabled;
+}
+
+#define PSMI_IS_CUDA_ENABLED _psmi_is_cuda_enabled()
+
+PSMI_ALWAYS_INLINE(
+int
+_psmi_is_gdr_copy_enabled())
+{
+    return is_gdr_copy_enabled;
+}
+
+#define PSMI_IS_GDR_COPY_ENABLED _psmi_is_gdr_copy_enabled()
+
+#define PSMI_IS_CUDA_MEM(p) _psmi_is_cuda_mem(p)
+
+struct ips_cuda_hostbuf {
+    STAILQ_ENTRY(ips_cuda_hostbuf) req_next;
+    STAILQ_ENTRY(ips_cuda_hostbuf) next;
+    uint32_t size, offset, bytes_read;
+    /* This flag indicates whether a chb is
+     * pulled from an mpool or dynamically
+     * allocated using calloc. */
+    uint8_t is_tempbuf;
+    CUevent copy_status;
+    psm2_mq_req_t req;
+    void *host_buf;
+    CUdeviceptr gpu_buf;
+};
+
+struct ips_cuda_hostbuf_mpool_cb_context {
+    unsigned bufsz;
+};
+void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *context, void *obj);
+
+#define CUDA_HOSTBUFFER_LIMITS { \
+	    .env = "PSM_CUDA_BOUNCEBUFFERS_MAX", \
+	    .descr = "Max CUDA bounce buffers (in MB)", \
+	    .env_level = PSMI_ENVVAR_LEVEL_HIDDEN, \
+	    .minval = 1, \
+	    .maxval = 1<<30, \
+	    .mode[PSMI_MEMMODE_NORMAL]  = { 16, 256 }, \
+	    .mode[PSMI_MEMMODE_MINIMAL] = {  1,   1 }, \
+	    .mode[PSMI_MEMMODE_LARGE]   = { 32, 512 } \
+	}
+
+extern uint32_t gpudirect_send_threshold;
+extern uint32_t gpudirect_recv_threshold;
+extern uint32_t cuda_thresh_rndv;
+/* This threshold dictates when the sender turns off
+ * GDR Copy. The threshold needs to be less than
+ * CUDA RNDV threshold.
+ */
+extern uint32_t gdr_copy_threshold_send;
+/* This threshold dictates when the receiver turns off
+ * GDR Copy. The threshold needs to be less than
+ * CUDA RNDV threshold.
+ */
+extern uint32_t gdr_copy_threshold_recv;
+
+#define PSMI_USE_GDR_COPY(req, len) req->is_buf_gpu_mem && \
+				    PSMI_IS_GDR_COPY_ENABLED && \
+				    len >=1 && len <= gdr_copy_threshold_recv
+
+enum psm2_chb_match_type {
+    /* Complete data found in a single chb */
+    PSMI_CUDA_FULL_MATCH_FOUND = 0,
+    /* Data is spread across two chb's */
+    PSMI_CUDA_SPLIT_MATCH_FOUND = 1,
+    /* Data is only partially prefetched */
+    PSMI_CUDA_PARTIAL_MATCH_FOUND = 2,
+    PSMI_CUDA_CONTINUE = 3
+};
+typedef enum psm2_chb_match_type psm2_chb_match_type_t;
+
+#endif /* PSM_CUDA */
+
+#define COMPILE_TIME_ASSERT(NAME,COND) extern char NAME[1/ COND]
+
+#endif /* _PSMI_USER_H */
diff --git a/psm_utils.c b/psm_utils.c
new file mode 100644
index 0000000..521467f
--- /dev/null
+++ b/psm_utils.c
@@ -0,0 +1,2598 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <netdb.h>		/* gethostbyname */
+#include <malloc.h>		/* malloc_usable_size */
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "psm_am_internal.h"
+#include "psm_mq_internal.h"
+
+int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid);
+
+struct psmi_epid_table psmi_epid_table;
+
+/* Iterator to access the epid table.
+ * 'ep' can be NULL if remote endpoints from all endpoint handles are requested
+ */
+void psmi_epid_itor_init(struct psmi_eptab_iterator *itor, psm2_ep_t ep)
+{
+    itor->i = 0;
+    itor->ep = ep;
+    pthread_mutex_lock(&psmi_epid_table.tablock);
+}
+
+void *psmi_epid_itor_next(struct psmi_eptab_iterator *itor)
+{
+    int i;
+    struct psmi_epid_tabentry *e;
+
+    if (itor->i >= psmi_epid_table.tabsize)
+        return NULL;
+    for (i = itor->i; i < psmi_epid_table.tabsize; i++) {
+        e = &psmi_epid_table.table[i];
+        if (!e->entry || e->entry == EPADDR_DELETED)
+            continue;
+        if (itor->ep && e->ep != itor->ep)
+            continue;
+        itor->i = i + 1;
+        return e->entry;
+    }
+    itor->i = psmi_epid_table.tabsize; /* put at end of table */
+    return NULL;
+}
+
+void psmi_epid_itor_fini(struct psmi_eptab_iterator *itor)
+{
+    pthread_mutex_unlock(&psmi_epid_table.tablock);
+    itor->i = 0;
+}
+
+#define mix64(a, b, c) \
+{ \
+	a -= b; a -= c; a ^= (c>>43); \
+	b -= c; b -= a; b ^= (a<<9); \
+	c -= a; c -= b; c ^= (b>>8); \
+	a -= b; a -= c; a ^= (c>>38); \
+	b -= c; b -= a; b ^= (a<<23); \
+	c -= a; c -= b; c ^= (b>>5); \
+	a -= b; a -= c; a ^= (c>>35); \
+	b -= c; b -= a; b ^= (a<<49); \
+	c -= a; c -= b; c ^= (b>>11); \
+	a -= b; a -= c; a ^= (c>>12); \
+	b -= c; b -= a; b ^= (a<<18); \
+	c -= a; c -= b; c ^= (b>>22); \
+}
+
+psm2_error_t psmi_epid_init()
+{
+    pthread_mutexattr_t attr;
+    psmi_epid_table.table = NULL, psmi_epid_table.tabsize = 0;
+    psmi_epid_table.tabsize_used = 0;
+    pthread_mutexattr_init(&attr);
+    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
+    pthread_mutex_init(&psmi_epid_table.tablock, &attr);
+    pthread_mutexattr_destroy(&attr);
+    return PSM2_OK;
+};
+
+psm2_error_t psmi_epid_fini()
+{
+    if (psmi_epid_table.table != NULL) {
+        psmi_free(psmi_epid_table.table);
+        psmi_epid_table.table = NULL;
+    }
+    psmi_epid_table.tabsize = 0;
+    psmi_epid_table.tabsize_used = 0;
+    return PSM2_OK;
+}
+
+PSMI_ALWAYS_INLINE(
+uint64_t
+hash_this(const psm2_ep_t ep, const psm2_epid_t epid))
+{
+    uint64_t ep_i = (uint64_t) (uintptr_t) ep;
+    uint64_t epid_i = (uint64_t) epid;
+    uint64_t hash = 0x9e3779b97f4a7c13LL;
+    mix64(ep_i, epid_i, hash);
+    return hash;
+}
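+/*
+ * The epid table below is an open-addressed hash with linear probing:
+ * a lookup starts at (key % tabsize) and walks forward until the key is
+ * found or an empty slot terminates the probe chain.  Removed entries
+ * are tombstoned as EPADDR_DELETED rather than cleared, so probe chains
+ * running through them stay intact; tombstones are reclaimed when
+ * psmi_epid_add() grows and rehashes the table.
+ */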
+PSMI_ALWAYS_INLINE(
+void *
+psmi_epid_lookup_inner(psm2_ep_t ep, psm2_epid_t epid, int remove))
+{
+    uint64_t key = hash_this(ep, epid);
+    struct psmi_epid_tabentry *e;
+    void *entry = NULL;
+    int idx;
+
+    pthread_mutex_lock(&psmi_epid_table.tablock);
+    if (!psmi_epid_table.table)
+        goto ret;
+    idx = (int)(key % psmi_epid_table.tabsize);
+    while (psmi_epid_table.table[idx].entry != NULL) {
+        /* An epid can be added twice if there's more than one opened endpoint,
+         * but really we match on epid *and* on endpoint */
+        e = &psmi_epid_table.table[idx];
+        if (e->entry != EPADDR_DELETED && e->key == key) {
+            entry = e->entry;
+            if (remove)
+                psmi_epid_table.table[idx].entry =
+                    EPADDR_DELETED;
+            goto ret;
+        }
+        if (++idx == psmi_epid_table.tabsize)
+            idx = 0;
+    }
+ret:
+    pthread_mutex_unlock(&psmi_epid_table.tablock);
+    return entry;
+}
+
+void *psmi_epid_lookup(psm2_ep_t ep, psm2_epid_t epid)
+{
+    void *entry = psmi_epid_lookup_inner(ep, epid, 0);
+    if (PSMI_EP_HOSTNAME != ep)
+        _HFI_VDBG("lookup of (%p,%" PRIx64 ") returns %p\n", ep, epid,
+                  entry);
+    return entry;
+}
+
+void *psmi_epid_remove(psm2_ep_t ep, psm2_epid_t epid)
+{
+    if (PSMI_EP_HOSTNAME != ep)
+        _HFI_VDBG("remove of (%p,%" PRIx64 ")\n", ep, epid);
+    return psmi_epid_lookup_inner(ep, epid, 1);
+}
+
+psm2_error_t psmi_epid_add(psm2_ep_t ep, psm2_epid_t epid, void *entry)
+{
+    uint64_t key;
+    int idx, i, newsz;
+    struct psmi_epid_tabentry *e;
+    psm2_error_t err = PSM2_OK;
+
+    if (PSMI_EP_HOSTNAME != ep)
+        _HFI_VDBG("add of (%p,%" PRIx64 ") with entry %p\n", ep, epid,
+                  entry);
+    pthread_mutex_lock(&psmi_epid_table.tablock);
+    /* Leave this here, mostly for sanity and for the fact that the epid
+     * table is currently not used in the critical path */
+    if (++psmi_epid_table.tabsize_used >
+        (int)(psmi_epid_table.tabsize * PSMI_EPID_TABLOAD_FACTOR)) {
+        struct psmi_epid_tabentry *newtab;
+        newsz = psmi_epid_table.tabsize + PSMI_EPID_TABSIZE_CHUNK;
+        newtab = (struct psmi_epid_tabentry *)
+            psmi_calloc(ep, PER_PEER_ENDPOINT,
+                        newsz, sizeof(struct psmi_epid_tabentry));
+        if (newtab == NULL) {
+            err = PSM2_NO_MEMORY;
+            goto fail;
+        }
+        if (psmi_epid_table.table) { /* rehash the table */
+            for (i = 0; i < psmi_epid_table.tabsize; i++) {
+                e = &psmi_epid_table.table[i];
+                if (e->entry == NULL)
+                    continue;
+                /* When rehashing, mark deleted as free again */
+                if (e->entry == EPADDR_DELETED) {
+                    psmi_epid_table.tabsize_used--;
+                    continue;
+                }
+                idx = (int)(e->key % newsz);
+                while (newtab[idx].entry != NULL)
+                    if (++idx == newsz)
+                        idx = 0;
+                newtab[idx].entry = e->entry;
+                newtab[idx].key = e->key;
+                newtab[idx].ep = e->ep;
+                newtab[idx].epid = e->epid;
+            }
+            psmi_free(psmi_epid_table.table);
+        }
+        psmi_epid_table.table = newtab;
+        psmi_epid_table.tabsize = newsz;
+    }
+    key = hash_this(ep, epid);
+    idx = (int)(key % psmi_epid_table.tabsize);
+    e = &psmi_epid_table.table[idx];
+    while (e->entry && e->entry != EPADDR_DELETED) {
+        if (++idx == psmi_epid_table.tabsize)
+            idx = 0;
+        e = &psmi_epid_table.table[idx];
+    }
+    e->entry = entry;
+    e->key = key;
+    e->epid = epid;
+    e->ep = ep;
+
+fail:
+    pthread_mutex_unlock(&psmi_epid_table.tablock);
+    return err;
+}
+
+char *psmi_gethostname(void)
+{
+    /* XXX this will need a lock in a multi-threaded environment */
+    static char hostname[80] = { '\0' };
+    char *c;
+
+    if (hostname[0] == '\0') {
+        gethostname(hostname, sizeof(hostname));
+        hostname[sizeof(hostname) - 1] = '\0'; /* no guarantee of nul termination */
+        if ((c = strchr(hostname, '.')))
+            *c = '\0';
+    }
+
+    return hostname;
+}
+
+/*
+ * Hostname stuff.  We really only register the network portion of the epid
+ * since all epids from the same nid are assumed to have the same hostname.
+ */
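+/*
+ * Illustrative flow (hypothetical hostname): after resolving a peer, the
+ * caller registers the peer's hostname under the network portion of its
+ * epid so that later diagnostics can print something friendlier than a
+ * LID tuple:
+ *
+ *     psmi_epid_set_hostname(psm2_epid_nid(epid), "node042", 0);
+ *     ...
+ *     _HFI_VDBG("connected to %s\n", psmi_epaddr_get_name(epid));
+ */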
+psm2_error_t
+psmi_epid_set_hostname(uint64_t nid, const char *hostname, int overwrite)
+{
+    size_t hlen;
+    char *h;
+    psm2_error_t err = PSM2_OK;
+
+    if (hostname == NULL)
+        return PSM2_OK;
+    /* First see if a hostname already exists */
+    if ((h = psmi_epid_lookup(PSMI_EP_HOSTNAME, nid)) != NULL) {
+        if (!overwrite)
+            return PSM2_OK;
+
+        h = psmi_epid_remove(PSMI_EP_HOSTNAME, nid);
+        if (h != NULL) /* free the previous hostname if one exists */
+            psmi_free(h);
+    }
+
+    hlen = min(PSMI_EP_HOSTNAME_LEN, strlen(hostname) + 1);
+    h = (char *)psmi_malloc(PSMI_EP_NONE, PER_PEER_ENDPOINT, hlen);
+    if (h == NULL)
+        return PSM2_NO_MEMORY;
+    snprintf(h, hlen, "%s", hostname);
+    h[hlen - 1] = '\0';
+    err = psmi_epid_add(PSMI_EP_HOSTNAME, nid, h);
+    return err;
+}
+
+/* XXX These two functions are not thread safe, we'll use a rotating buffer
+ * trick whenever we need to make them thread safe */
+const char *psmi_epaddr_get_hostname(psm2_epid_t epid)
+{
+    static char hostnamebufs[4][PSMI_EP_HOSTNAME_LEN];
+    static int bufno;
+    uint64_t nid = psm2_epid_nid(epid);
+    char *h, *hostname;
+
+    hostname = hostnamebufs[bufno];
+    bufno = (bufno + 1) % 4;
+
+    /* First, if we have registered a host for this epid, just return that, or
+     * else try to return something with lid and context */
+    h = psmi_epid_lookup(PSMI_EP_HOSTNAME, nid);
+    if (h != NULL)
+        return h;
+    else {
+        snprintf(hostname, PSMI_EP_HOSTNAME_LEN - 1, "LID=%d:%d.%d",
+                 (int)PSMI_EPID_GET_LID(epid),
+                 (int)PSMI_EPID_GET_CONTEXT(epid),
+                 (int)PSMI_EPID_GET_SUBCONTEXT(epid));
+        hostname[PSMI_EP_HOSTNAME_LEN - 1] = '\0';
+        return hostname;
+    }
+}
+
+/* This one gives the hostname with a lid */
+const char *psmi_epaddr_get_name(psm2_epid_t epid)
+{
+    static char hostnamebufs[4][PSMI_EP_HOSTNAME_LEN];
+    static int bufno;
+    char *h, *hostname;
+    hostname = hostnamebufs[bufno];
+    bufno = (bufno + 1) % 4;
+
+    h = psmi_epid_lookup(PSMI_EP_HOSTNAME, psm2_epid_nid(epid));
+    if (h == NULL)
+        return psmi_epaddr_get_hostname(epid);
+    else {
+        snprintf(hostname, PSMI_EP_HOSTNAME_LEN - 1,
+                 "%s (LID=%d:%d.%d)", h,
+                 (int)PSMI_EPID_GET_LID(epid),
+                 (int)PSMI_EPID_GET_CONTEXT(epid),
+                 (int)PSMI_EPID_GET_SUBCONTEXT(epid));
+        hostname[PSMI_EP_HOSTNAME_LEN - 1] = '\0';
+    }
+    return hostname;
+}
+
+/* Wrapper, in case we port to OS xyz that doesn't have sysconf */
+uintptr_t psmi_getpagesize(void)
+{
+    static uintptr_t pagesz = (uintptr_t) -1;
+    long sz;
+    if (pagesz != (uintptr_t) -1)
+        return pagesz;
+    sz = sysconf(_SC_PAGESIZE);
+    if (sz == -1) {
+        psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+                          "Can't query system page size");
+    }
+
+    pagesz = (uintptr_t) sz;
+    return pagesz;
+}
+
+/* If PSM2_VERBOSE_ENV is set in the environment, we determine
+ * what its verbose level is and print the environment at "INFO"
+ * level if the environment's level matches the desired printlevel.
+ */
+static int psmi_getenv_verblevel = -1;
+static int psmi_getenv_is_verblevel(int printlevel)
+{
+    if (psmi_getenv_verblevel == -1) {
+        char *env = getenv("PSM2_VERBOSE_ENV");
+        if (env && *env) {
+            char *ep;
+            int val = (int)strtol(env, &ep, 0);
+            if (ep == env)
+                psmi_getenv_verblevel = 0;
+            else if (val == 2)
+                psmi_getenv_verblevel = 2;
+            else
+                psmi_getenv_verblevel = 1;
+        } else
+            psmi_getenv_verblevel = 0;
+    }
+    return (printlevel <= psmi_getenv_verblevel);
+}
+
+#define GETENV_PRINTF(_level, _fmt, ...) 
\ + do { \ + if ((_level & PSMI_ENVVAR_LEVEL_NEVER_PRINT) == 0) \ + { \ + int nlevel = _level; \ + if (psmi_getenv_is_verblevel(nlevel)) \ + nlevel = 0; \ + _HFI_ENVDBG(nlevel, _fmt, ##__VA_ARGS__); \ + } \ + } while (0) + +int +MOCKABLE(psmi_getenv)(const char *name, const char *descr, int level, + int type, union psmi_envvar_val defval, + union psmi_envvar_val *newval) +{ + int used_default = 0; + union psmi_envvar_val tval; + char *env = getenv(name); +#if _HFI_DEBUGGING + int ishex = (type == PSMI_ENVVAR_TYPE_ULONG_FLAGS || + type == PSMI_ENVVAR_TYPE_UINT_FLAGS); +#endif + + /* If we're not using the default, always reset the print + * level to '1' so the changed value gets seen at low + * verbosity */ +#define _GETENV_PRINT(used_default, fmt, val, defval) \ + do { \ + if (used_default) \ + GETENV_PRINTF(level, "%s%-25s %-40s =>%s" fmt \ + "\n", level > 1 ? "*" : " ", name, \ + descr, ishex ? "0x" : " ", val); \ + else \ + GETENV_PRINTF(1, "%s%-25s %-40s =>%s" \ + fmt " (default was%s" fmt ")\n", \ + level > 1 ? "*" : " ", name, descr, \ + ishex ? " 0x" : " ", val, \ + ishex ? " 0x" : " ", defval); \ + } while (0) + +/* _CONSUMED_ALL() is a macro which indicates if strtol() consumed all + of the input passed to it. */ +#define _CONSUMED_ALL(CHAR_PTR) (((CHAR_PTR) != NULL) && (*(CHAR_PTR) == 0)) +#define _CONVERT_TO_NUM(DEST,TYPE,STRTOL) \ + do { \ + char *ep; \ + /* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */ \ + DEST = (TYPE)STRTOL(env, &ep, 10); \ + if (! _CONSUMED_ALL(ep)) { \ + DEST = (TYPE)STRTOL(env, &ep, 16); \ + if (! _CONSUMED_ALL(ep)) { \ + used_default = 1; \ + tval = defval; \ + } \ + } \ + } while (0) + + switch (type) { + case PSMI_ENVVAR_TYPE_YESNO: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } else if (env[0] == 'Y' || env[0] == 'y') + tval.e_int = 1; + else if (env[0] == 'N' || env[0] == 'n') + tval.e_int = 0; + else { + char *ep; + tval.e_ulong = strtoul(env, &ep, 0); + if (ep == env) { + used_default = 1; + tval = defval; + } else if (tval.e_ulong != 0) + tval.e_ulong = 1; + } + _GETENV_PRINT(used_default, "%s", tval.e_long ? "YES" : "NO", + defval.e_int ? 
"YES" : "NO"); + break; + + case PSMI_ENVVAR_TYPE_STR: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } else + tval.e_str = env; + _GETENV_PRINT(used_default, "%s", tval.e_str, defval.e_str); + break; + + case PSMI_ENVVAR_TYPE_INT: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } else { + _CONVERT_TO_NUM(tval.e_int,int,strtol); + } + _GETENV_PRINT(used_default, "%d", tval.e_int, defval.e_int); + break; + + case PSMI_ENVVAR_TYPE_UINT: + case PSMI_ENVVAR_TYPE_UINT_FLAGS: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } else { + _CONVERT_TO_NUM(tval.e_int,unsigned int,strtoul); + } + if (type == PSMI_ENVVAR_TYPE_UINT_FLAGS) + _GETENV_PRINT(used_default, "%x", tval.e_uint, + defval.e_uint); + else + _GETENV_PRINT(used_default, "%u", tval.e_uint, + defval.e_uint); + break; + + case PSMI_ENVVAR_TYPE_LONG: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } else { + _CONVERT_TO_NUM(tval.e_long,long,strtol); + } + _GETENV_PRINT(used_default, "%ld", tval.e_long, defval.e_long); + break; + case PSMI_ENVVAR_TYPE_ULONG_ULONG: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } else { + _CONVERT_TO_NUM(tval.e_ulonglong,unsigned long long,strtoull); + } + _GETENV_PRINT(used_default, "%llu", + tval.e_ulonglong, defval.e_ulonglong); + break; + case PSMI_ENVVAR_TYPE_ULONG: + case PSMI_ENVVAR_TYPE_ULONG_FLAGS: + default: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } else { + _CONVERT_TO_NUM(tval.e_ulong,unsigned long,strtoul); + } + if (type == PSMI_ENVVAR_TYPE_ULONG_FLAGS) + _GETENV_PRINT(used_default, "%lx", tval.e_ulong, + defval.e_ulong); + else + _GETENV_PRINT(used_default, "%lu", tval.e_ulong, + defval.e_ulong); + break; + } +#undef _GETENV_PRINT + *newval = tval; + + return used_default; +} +MOCK_DEF_EPILOGUE(psmi_getenv); + +/* + * Parsing int parameters set in string tuples. + * Output array int *vals should be able to store 'ntup' elements. + * Values are only overwritten if they are parsed. + * Tuples are always separated by colons ':' + */ +int psmi_parse_str_tuples(const char *string, int ntup, int *vals) +{ + char *b = (char *)string; + char *e = b; + int tup_i = 0; + int n_parsed = 0; + char *buf = psmi_strdup(NULL, string); + psmi_assert_always(buf != NULL); + + while (*e && tup_i < ntup) { + b = e; + while (*e && *e != ':') + e++; + if (e > b) { /* something to parse */ + char *ep; + int len = e - b; + long int l; + strncpy(buf, b, len); + buf[len] = '\0'; + l = strtol(buf, &ep, 0); + if (ep != buf) { /* successful conversion */ + vals[tup_i] = (int)l; + n_parsed++; + } + } + if (*e == ':') + e++; /* skip delimiter */ + tup_i++; + } + psmi_free(buf); + return n_parsed; +} + +/* + * Memory footprint/usage mode. + * + * This can be used for debug or for separating large installations from + * small/medium ones. The default is to assume a medium installation. Large + * is not that much larger in memory footprint, but we make a conscious effort + * an consuming only the amount of memory we need. 
+/*
+ * Memory footprint/usage mode.
+ *
+ * This can be used for debug or for separating large installations from
+ * small/medium ones.  The default is to assume a medium installation.  Large
+ * is not that much larger in memory footprint, but we make a conscious effort
+ * at consuming only the amount of memory we need.
+ */
+int psmi_parse_memmode(void)
+{
+    union psmi_envvar_val env_mmode;
+    int used_default =
+        psmi_getenv("PSM2_MEMORY", "Memory usage mode (normal or large)",
+                    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+                    (union psmi_envvar_val)"normal", &env_mmode);
+    if (used_default || !strcasecmp(env_mmode.e_str, "normal"))
+        return PSMI_MEMMODE_NORMAL;
+    else if (!strcasecmp(env_mmode.e_str, "min"))
+        return PSMI_MEMMODE_MINIMAL;
+    else if (!strcasecmp(env_mmode.e_str, "large") ||
+             !strcasecmp(env_mmode.e_str, "big"))
+        return PSMI_MEMMODE_LARGE;
+    else {
+        _HFI_PRDBG("PSM2_MEMORY env value %s unrecognized, "
+                   "using 'normal' memory mode instead\n",
+                   env_mmode.e_str);
+        return PSMI_MEMMODE_NORMAL;
+    }
+}
+
+static
+const char *psmi_memmode_string(int mode)
+{
+    psmi_assert(mode >= PSMI_MEMMODE_NORMAL && mode < PSMI_MEMMODE_NUM);
+    switch (mode) {
+    case PSMI_MEMMODE_NORMAL:
+        return "normal";
+    case PSMI_MEMMODE_MINIMAL:
+        return "minimal";
+    case PSMI_MEMMODE_LARGE:
+        return "large";
+    default:
+        return "unknown";
+    }
+}
+
+psm2_error_t
+psmi_parse_mpool_env(const psm2_mq_t mq, int level,
+                     const struct psmi_rlimit_mpool *rlim,
+                     uint32_t *valo, uint32_t *chunkszo)
+{
+    uint32_t val;
+    const char *env = rlim->env;
+    int mode = mq->memmode;
+    psm2_error_t err = PSM2_OK;
+    union psmi_envvar_val env_val;
+
+    psmi_assert_always(mode >= PSMI_MEMMODE_NORMAL
+                       && mode < PSMI_MEMMODE_NUM);
+
+    psmi_getenv(rlim->env, rlim->descr, rlim->env_level,
+                PSMI_ENVVAR_TYPE_UINT,
+                (union psmi_envvar_val)rlim->mode[mode].obj_max, &env_val);
+
+    val = env_val.e_uint;
+    if (val < rlim->minval || val > rlim->maxval) {
+        err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+                                "Env. var %s=%u is invalid (valid settings in mode PSM2_MEMORY=%s"
+                                " are inclusively between %u and %u)",
+                                env, val, psmi_memmode_string(mode),
+                                rlim->minval, rlim->maxval);
+        goto fail;
+    }
+
+    _HFI_VDBG("%s max=%u,chunk=%u (mode=%s(%u),min=%u,max=%u)\n",
+              env, val, rlim->mode[mode].obj_chunk,
+              psmi_memmode_string(mode), mode, rlim->minval, rlim->maxval);
+
+    *valo = val;
+    *chunkszo = rlim->mode[mode].obj_chunk;
+
+fail:
+    return err;
+}
+
+uint64_t psmi_cycles_left(uint64_t start_cycles, int64_t timeout_ns)
+{
+    if (timeout_ns < 0)
+        return 0ULL;
+    else if (timeout_ns == 0ULL || timeout_ns == ~0ULL)
+        return ~0ULL;
+    else {
+        uint64_t t_end = nanosecs_to_cycles(timeout_ns);
+        uint64_t t_now = get_cycles() - start_cycles;
+
+        if (t_now >= t_end)
+            return 0ULL;
+        else
+            return (t_end - t_now);
+    }
+}
+
+uint32_t psmi_get_ipv4addr()
+{
+    struct hostent *he;
+    uint32_t addr = 0;
+
+    he = gethostbyname(psmi_gethostname());
+    if (he != NULL && he->h_addrtype == AF_INET && he->h_addr != NULL) {
+        memcpy(&addr, he->h_addr, sizeof(uint32_t));
+        return addr;
+    } else
+        return 0;
+}
+
+#define PSMI_EP_IS_PTR(ptr) ((ptr) != NULL && (ptr) < PSMI_EP_LOGEVENT)
+
+void
+psmi_syslog(psm2_ep_t ep, int to_console, int level, const char *format, ...)
+{ + va_list ap; + + /* If we've never syslogged anything from this ep at the PSM level, make + * sure we log context information */ + if (PSMI_EP_IS_PTR(ep) && !ep->did_syslog) { + char uuid_str[64]; + ep->did_syslog = 1; + + memset(&uuid_str, 0, sizeof(uuid_str)); + psmi_uuid_unparse(ep->uuid, uuid_str); + hfi_syslog("PSM", 0, LOG_WARNING, + "uuid_key=%s,unit=%d,context=%d,subcontext=%d", + uuid_str, + psmi_hal_get_unit_id(ep->context.psm_hw_ctxt), + psmi_hal_get_context(ep->context.psm_hw_ctxt), + psmi_hal_get_subctxt(ep->context.psm_hw_ctxt)); + } + + va_start(ap, format); + hfi_vsyslog("PSM", to_console, level, format, ap); + va_end(ap); +} + +/* Table of CRCs of all 8-bit messages. */ +static uint32_t crc_table[256]; + +/* Flag: has the table been computed? Initially false. */ +static int crc_table_computed; + +/* Make the table for a fast CRC. */ +static void make_crc_table(void) +{ + uint32_t c; + int n, k; + + for (n = 0; n < 256; n++) { + c = (uint32_t) n; + for (k = 0; k < 8; k++) { + if (c & 1) + c = 0xedb88320 ^ (c >> 1); + else + c = c >> 1; + } + crc_table[n] = c; + } + crc_table_computed = 1; +} + +/* Update a running CRC with the bytes buf[0..len-1]--the CRC + * should be initialized to all 1's, and the transmitted value + * is the 1's complement of the final running CRC (see the + * crc() routine below)). + */ + +static uint32_t update_crc(uint32_t crc, unsigned char *buf, int len) +{ + uint32_t c = crc; + int n; + + if_pf(!crc_table_computed) + make_crc_table(); + for (n = 0; n < len; n++) { + c = crc_table[(c ^ buf[n]) & 0xff] ^ (c >> 8); + } + return c; +} + +/* Return the CRC of the bytes buf[0..len-1]. */ +uint32_t psmi_crc(unsigned char *buf, int len) +{ + return update_crc(0xffffffff, buf, len) ^ 0xffffffff; +} + +struct psmi_faultinj_spec { + STAILQ_ENTRY(psmi_faultinj_spec) next; + char spec_name[PSMI_FAULTINJ_SPEC_NAMELEN]; + + unsigned long long num_faults; + unsigned long long num_calls; + + struct drand48_data drand48_data; + int num; + int denom; + +}; + +int psmi_multi_ep_enabled = 0; +void psmi_multi_ep_init() +{ + union psmi_envvar_val env_fi; + + psmi_getenv("PSM2_MULTI_EP", "PSM2 Multiple Endpoints (yes/no)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_YESNO, + PSMI_ENVVAR_VAL_NO, &env_fi); + + psmi_multi_ep_enabled = env_fi.e_uint; +} + +int psmi_faultinj_enabled = 0; +int psmi_faultinj_verbose = 0; +char *psmi_faultinj_outfile = NULL; + +static struct psmi_faultinj_spec psmi_faultinj_dummy; +static STAILQ_HEAD(, psmi_faultinj_spec) psmi_faultinj_head = +STAILQ_HEAD_INITIALIZER(psmi_faultinj_head); + +void psmi_faultinj_init() +{ + union psmi_envvar_val env_fi; + + psmi_getenv("PSM2_FI", "PSM Fault Injection (yes/no)", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_YESNO, + PSMI_ENVVAR_VAL_NO, &env_fi); + + psmi_faultinj_enabled = !!env_fi.e_uint; + + if (psmi_faultinj_enabled) { + char *def = NULL; + if (!psmi_getenv + ("PSM2_FI_TRACEFILE", "PSM Fault Injection output file", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)def, &env_fi)) { + psmi_faultinj_outfile = psmi_strdup(NULL, env_fi.e_str); + } + } + + return; +} + +void psmi_faultinj_fini() +{ + struct psmi_faultinj_spec *fi; + FILE *fp; + int do_fclose = 0; + + if (!psmi_faultinj_enabled || psmi_faultinj_outfile == NULL) + return; + + if (strncmp(psmi_faultinj_outfile, "stdout", 7) == 0) + fp = stdout; + else if (strncmp(psmi_faultinj_outfile, "stderr", 7) == 0) + fp = stderr; + else { + char *c = psmi_faultinj_outfile; + char buf[192]; + int append = 0; + if (*c == 
'+') {
+ append = 1;
+ ++c;
+ }
+ do_fclose = 1;
+ snprintf(buf, sizeof(buf) - 1, "%s.%s", c, hfi_get_mylabel());
+ buf[sizeof(buf) - 1] = '\0';
+ fp = fopen(buf, append ? "a" : "w");
+ }
+
+ if (fp != NULL) {
+ STAILQ_FOREACH(fi, &psmi_faultinj_head, next) {
+ fprintf(fp, "%s:%s PSM2_FI_%-12s %2.3f%% => "
+ "%2.3f%% %10lld faults/%10lld events\n",
+ __progname, hfi_get_mylabel(), fi->spec_name,
+ (double)fi->num * 100.0 / fi->denom,
+ (double)fi->num_faults * 100.0 / fi->num_calls,
+ fi->num_faults, fi->num_calls);
+ }
+ fflush(fp);
+ if (do_fclose)
+ fclose(fp);
+ }
+
+ psmi_free(psmi_faultinj_outfile);
+ return;
+}
+
+/*
+ * Intended to be used only once, not in the critical path
+ */
+struct psmi_faultinj_spec *psmi_faultinj_getspec(char *spec_name, int num,
+ int denom)
+{
+ struct psmi_faultinj_spec *fi;
+
+ if (!psmi_faultinj_enabled)
+ return &psmi_faultinj_dummy;
+
+ STAILQ_FOREACH(fi, &psmi_faultinj_head, next) {
+ if (strcmp(fi->spec_name, spec_name) == 0)
+ return fi;
+ }
+
+ /* We got here, so no spec -- allocate one */
+ fi = psmi_malloc(PSMI_EP_NONE, UNDEFINED,
+ sizeof(struct psmi_faultinj_spec));
+ psmi_assert_always(fi != NULL);
+ strncpy(fi->spec_name, spec_name, PSMI_FAULTINJ_SPEC_NAMELEN - 1);
+ fi->spec_name[PSMI_FAULTINJ_SPEC_NAMELEN - 1] = '\0';
+ fi->num = num;
+ fi->denom = denom;
+ fi->num_faults = 0;
+ fi->num_calls = 0;
+
+ /*
+ * See if we get a hint from the environment.
+ * Format is
+ * <num:denom:initial_seed>
+ *
+ * By default, we choose the initial seed to be the 'pid'. If users need
+ * repeatability, they should set initial_seed to be the 'pid' when the
+ * error was observed or force the initial_seed to be a constant number in
+ * each running process. Using 'pid' is useful because core dumps store
+ * pids and our backtrace format does as well so if a crash is observed for
+ * a specific seed, programs can reuse the 'pid' to regenerate the same
+ * error condition.
+ */
+ {
+ int fvals[3] = { num, denom, (int)getpid() };
+ union psmi_envvar_val env_fi;
+ char fvals_str[128];
+ char fname[128];
+ char fdesc[300];
+
+ snprintf(fvals_str, sizeof(fvals_str) - 1, "%d:%d:1", num,
+ denom);
+ fvals_str[sizeof(fvals_str) - 1] = '\0';
+ snprintf(fname, sizeof(fname) - 1, "PSM2_FI_%s", spec_name);
+ fname[sizeof(fname) - 1] = '\0';
+ snprintf(fdesc, sizeof(fdesc) - 1, "Fault Injection %s <%s>",
+ fname, fvals_str);
+
+ if (!psmi_getenv(fname, fdesc, PSMI_ENVVAR_LEVEL_HIDDEN,
+ PSMI_ENVVAR_TYPE_STR,
+ (union psmi_envvar_val)fvals_str, &env_fi)) {
+ /* not using default values */
+ int n_parsed =
+ psmi_parse_str_tuples(env_fi.e_str, 3, fvals);
+ if (n_parsed >= 1)
+ fi->num = fvals[0];
+ if (n_parsed >= 2)
+ fi->denom = fvals[1];
+ if (n_parsed >= 3)
+ srand48_r((long int) fvals[2], &fi->drand48_data);
+ }
+ }
+
+ STAILQ_INSERT_TAIL(&psmi_faultinj_head, fi, next);
+ return fi;
+}
+
+int psmi_faultinj_is_fault(struct psmi_faultinj_spec *fi)
+{
+ if (!psmi_faultinj_enabled) /* never fault if disabled */
+ return 0;
+ if (fi->num == 0)
+ return 0;
+
+ fi->num_calls++;
+ long int rnum;
+ lrand48_r(&fi->drand48_data, &rnum);
+ if (((int) (rnum % INT_MAX)) % fi->denom <= fi->num) {
+ fi->num_faults++;
+ return 1;
+ } else
+ return 0;
+}
+
+/* For memory allocation, we kind of break the PSM error handling rules.
+ * If the caller gets NULL, it has to assume that the error has been handled
+ * and should always return PSM2_NO_MEMORY */
+
+/*
+ * Log memory increments or decrements of type memstats_t. 
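+ *
+ * Illustrative example: when memory stats are enabled, an allocation such
+ * as psmi_malloc(ep, NETWORK_BUFFERS, 100) reaches here as
+ * psmi_log_memstats(NETWORK_BUFFERS, 100 + sizeof(struct psmi_memtype_hdr)),
+ * raising m_netbufs_total and, when a new high-water mark is hit,
+ * m_netbufs_max; the matching psmi_free() logs the same size negated.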
+ */ +struct psmi_memtype_hdr { + struct { + uint64_t size:48; + uint64_t magic:8; + uint64_t type:8; + }; + void *original_allocation; +}; + +struct psmi_stats_malloc psmi_stats_memory; + +void psmi_log_memstats(psmi_memtype_t type, int64_t nbytes) +{ +#define _add_max_total(type, nbytes) \ + psmi_stats_memory.m_ ## type ## _total += (nbytes); \ + psmi_stats_memory.m_ ## type ## _max = max( \ + psmi_stats_memory.m_ ## type ## _total, \ + psmi_stats_memory.m_ ## type ## _max); + + switch (type) { + case PER_PEER_ENDPOINT: + _add_max_total(perpeer, nbytes); + break; + case NETWORK_BUFFERS: + _add_max_total(netbufs, nbytes); + break; + case DESCRIPTORS: + _add_max_total(descriptors, nbytes); + break; + case UNEXPECTED_BUFFERS: + _add_max_total(unexpbufs, nbytes); + break; + case STATS: + _add_max_total(stats, nbytes); + break; + case UNDEFINED: + _add_max_total(undefined, nbytes); + break; + default: + psmi_assert_always(type == TOTAL); + break; + } + _add_max_total(all, nbytes); + psmi_stats_memory.m_all_max++; +#undef _add_max_total + + return; +} + +// Memory stats will only be collected under debug builds + +#ifdef PSM_DEBUG +#define psmi_stats_mask PSMI_STATSTYPE_MEMORY +#else +#define psmi_stats_mask 0 +#endif + +#ifdef malloc +#undef malloc +#endif + +#ifdef PSM_HEAP_DEBUG + +/* PSM HEAP DEBUG documentation: + + In the following code, the acronym: 'HD' is short for "Heap Debug". + + Each actual heap allocation will have a header and a trailer surrounding it, + and the header itself may have some vacant space preceding it due to alignment + needs: + + 0. This area is the actual return value of posix_memalign and is due to + alignment requirements. (This area does not exist for heap allocations + from malloc()). + 1. HD HEADER + 2. Actual allocation + 3. HD TRAILER + + malloc() / posix_memalign returns area 0 through 3 to the Heap Debug (HD) code, + then the HD code writes areas 1 and 3, and then returns a pointer to area 2 to + the caller. Thereafter, the HD code will inspect areas 1 and 3 of all heap + allocations to make sure they have retained their integrity. + + Surrounding the actual allocation like this enables: + + 1. Checking for heap overrun / underrun of all allocations. + 2. Checking for double frees. + 3. Use of an area that has been freed. + 4. Identifying orphaned heap allocations. + +Constant no-mans-land written to areas that no-one should be writing to: + + */ + +#define HD_NO_MANS_LAND -15 + +/* The following is the declaration of the HD header. */ + +/* Heap debug header magic number type: */ +typedef char HD_Hdr_Magic_Type[8]; + +typedef struct HD_Header_Struct +{ + HD_Hdr_Magic_Type magic1; /* Magic number to ensure this + allocation has integrity. + (guards against heap + overrun from above). */ + const char *allocLoc; /* Source file name/line + number where this heap + allocation was made. */ + const char *freeLoc; /* Source filename/line number + where this heap allocation + was freed. */ + struct HD_Header_Struct *nextHD_header; /* Creates a singly-linked + list of all heap + allocations. */ + uint64_t sizeOfAlloc; /* size of this heap + allocation. */ + void *systemAlloc; /* The actual return value + from malloc()/posix_memaligh(). */ + uint64_t systemAllocSize;/* The size that is actually allocated + by malloc()/posix_memalign(). */ + HD_Hdr_Magic_Type magic2; /* Second magic number to + ensure this allocation + has integrity. + (guards against heap + underrun from the actual + allocation that follows). 
*/ +} __attribute__ ((packed)) HD_Header_Type; + +typedef struct HD_free_list_struct +{ + HD_Header_Type *freedStuct; + struct HD_free_list_struct *next_free_struct; +} HD_Free_Struct_Type; + +static HD_Free_Struct_Type *HD_free_list_root = NULL; +static HD_Free_Struct_Type **HD_free_list_bottom = &HD_free_list_root; + +typedef char HD_Trlr_Magic_Type[16]; + +static const HD_Hdr_Magic_Type HD_HDR_MGC_1 = "Eric"; +static const HD_Hdr_Magic_Type HD_HDR_MGC_2 = "Emily"; +static const HD_Trlr_Magic_Type HD_TRLR_MGC = "Erin&Elaine"; + +/* Convert a pointer of an actual allocation to a pointer to its HD header: */ +static inline HD_Header_Type *HD_AA_TO_HD_HDR(void *aa) +{ + char *p = (char*)aa; + return (HD_Header_Type*)(p - sizeof(HD_Header_Type)); +} + +/* Convert a pointer to an HD header to the actual allocation: */ +static inline void *HD_HDR_TO_AA(HD_Header_Type *phdHdr) +{ + char *p = (char*)phdHdr; + return p + sizeof(HD_Header_Type); +} + +/* Get the address of the trailer that follows the actual allocation: */ +static inline void *HD_GET_HD_TRLR(HD_Header_Type *phdr) +{ + char *p = (char*)HD_HDR_TO_AA(phdr); + return p + phdr->sizeOfAlloc; +} + +static HD_Header_Type * HD_root_of_list = NULL; /* Root of singly linked list + of all heap allocations */ +static HD_Header_Type **HD_end_of_list = &HD_root_of_list; /* Pointer to the + last pointer of the singly linked list of all heap allocations. */ + +/* Number of allocations in the list. Maintained to assert the integrity + of the singly linked list of heap allocations. */ +static int n_allocations = 0; + +/* HD_check_one_struct() checks one heap allocation for integrity. */ +static inline void HD_check_one_struct(HD_Header_Type *p, int checkAA,const char *curloc) +{ + int s=0; + + /* First check the magic values in the header and trailer: */ + s |= memcmp(p->magic1,HD_HDR_MGC_1,sizeof(HD_HDR_MGC_1)) ? 1 : 0; + s |= memcmp(p->magic2,HD_HDR_MGC_2,sizeof(HD_HDR_MGC_2)) ? 2 : 0; + s |= memcmp(HD_GET_HD_TRLR(p),HD_TRLR_MGC,sizeof(HD_TRLR_MGC)) ? 4 : 0; + + if (s != 0) + { + fprintf(stderr,"header/trailer error: checking location: %s, s: %d, p: %p, " + "p->allocLoc: %s\n",curloc,s,p,p->allocLoc); + fprintf(stderr,"actual allocation starts at: %p, length: %" PRIu64 "\n", (char*)HD_HDR_TO_AA(p),p->sizeOfAlloc); + fflush(0); + abort(); + } + + /* Next, check the area between systemAlloc and the start of the header */ + signed char *pchr = (signed char *)p->systemAlloc; + while (pchr < (signed char*)p) + { + psmi_assert_always(*pchr == (signed char) HD_NO_MANS_LAND); + pchr++; + } + + /* Lastly, check the actual allocation area if directed to do so: */ + if (checkAA) + { + uint64_t i; + signed char *pchr = HD_HDR_TO_AA(p); + for (i=0;i < p->sizeOfAlloc;i++) + if (pchr[i] != (signed char) HD_NO_MANS_LAND) + { + fprintf(stderr, + "use after free; ptr: %p,\n" + " allocated from: %s,\n" + " validated from: %s\n" + " freed from: %s\n", + pchr+i,p->allocLoc,curloc,p->freeLoc); + fflush(0); + psmi_assert_always(0); + } + } +} + +/* _psmi_heapdebug_val_heapallocs() walks the singly linked list and inspects all + * heap allocations to ensure all of them have integrity still. 
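+ *
+ * Illustrative usage: sprinkle
+ *     psmi_heapdebug_val_heapallocs();
+ * around a suspected corruption site; the walk aborts with the allocation
+ * and validation locations as soon as a header, trailer, alignment pad or
+ * freed area no longer carries its expected magic/no-mans-land pattern.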
*/ +void _psmi_heapdebug_val_heapallocs(const char *curloc) +{ + /* first check current allocation list: */ + HD_Header_Type *p = HD_root_of_list; + int cnt = 0; + + while (p) + { + HD_check_one_struct(p,0,curloc); + p = p->nextHD_header; + cnt++; + } + psmi_assert_always(cnt == n_allocations); + /* Next check free list */ + HD_Free_Struct_Type *pfreestruct = HD_free_list_root; + while (pfreestruct) + { + HD_check_one_struct(pfreestruct->freedStuct,1,curloc); + pfreestruct = pfreestruct->next_free_struct; + } +} + +/* hd_est_hdr_trlr() establishes the new allocation to the singly linked list, and adds + * the header and trailer to the allocation. Lastly, it validates the existing singly-linked + * list for integrity. */ +static void hd_est_hdr_trlr(HD_Header_Type *hd_alloc, + void *systemAlloc, + uint64_t systemSize, + uint64_t actualSize, + const char *curloc) +{ +#if 0 + /* if we use this block of code, psm hangs running mpistress. See JIRA STL-5244. */ + memset(systemAlloc,HD_NO_MANS_LAND,systemSize); +#else + /* write HD_NO_MANS_LAND to the area between the system allocation and the start of the hd header. */ + signed char *pchr = systemAlloc; + for (;pchr < (signed char*) hd_alloc;pchr++) + *pchr = (signed char) HD_NO_MANS_LAND; +#endif + /* Write the HD header info: */ + memcpy(hd_alloc->magic1,HD_HDR_MGC_1,sizeof(HD_HDR_MGC_1)); + hd_alloc->allocLoc = curloc; + hd_alloc->freeLoc = NULL; + hd_alloc->nextHD_header = NULL; + hd_alloc->sizeOfAlloc = actualSize; + hd_alloc->systemAlloc = systemAlloc; + hd_alloc->systemAllocSize = systemSize; + memcpy(hd_alloc->magic2,HD_HDR_MGC_2,sizeof(HD_HDR_MGC_2)); + memcpy(HD_GET_HD_TRLR(hd_alloc),HD_TRLR_MGC,sizeof(HD_TRLR_MGC)); + *HD_end_of_list = hd_alloc; + HD_end_of_list = &hd_alloc->nextHD_header; + n_allocations++; + psmi_heapdebug_val_heapallocs(); +} + +/* hd_malloc() is the heap debug version of malloc that will create the header and trailer + * and link the allocation into the singly linked list. */ +static inline void *hd_malloc(size_t sz, const char *curloc) +{ + const uint64_t wholeSize = sizeof(HD_Header_Type) + sz + sizeof(HD_TRLR_MGC); + HD_Header_Type *hd_alloc = (HD_Header_Type*)malloc(wholeSize); + + hd_est_hdr_trlr(hd_alloc,hd_alloc,wholeSize,sz,curloc); + return HD_HDR_TO_AA(hd_alloc); +} + +/* hd_memalign() is the heap debug version of posix_memalign(). */ +static inline int hd_memalign(void **ptr,uint64_t alignment, size_t sz, const char *curloc) +{ + void *systemAlloc = NULL; + const uint64_t alignMask = alignment - 1; + uint64_t systemSize = sizeof(HD_Header_Type) + alignMask + sz + sizeof(HD_TRLR_MGC); + int rv = posix_memalign(&systemAlloc,alignment,systemSize); + char *actualAlloc = NULL; + const char *endOfSystemAlloc = ((char*)systemAlloc) + systemSize; + + if (rv) + return rv; + + uint64_t actualAllocu64 = (uint64_t) systemAlloc; + actualAllocu64 += sizeof(HD_Header_Type) + alignMask; + actualAllocu64 &= ~ alignMask; + actualAlloc = (char*)actualAllocu64; + psmi_assert_always((actualAllocu64 & alignMask) == 0); + psmi_assert_always((actualAlloc+sz+sizeof(HD_TRLR_MGC)) <= endOfSystemAlloc); + psmi_assert_always((actualAlloc - (char*)systemAlloc) >= sizeof(HD_Header_Type)); + + hd_est_hdr_trlr(HD_AA_TO_HD_HDR(actualAlloc),systemAlloc,systemSize,sz,curloc); + *ptr = actualAlloc; + return rv; +} + +/* hd_free() is the heap debug version of free(). First, hd_free() ensures that the ptr to be + * freed in fact is known by the HD code. Next, hd_free() removes the ptr from the list. 
Then,
+ * hd_free scribbles over the ptr's area and quarantines the allocation on the
+ * free list, so later validation passes can catch use-after-free. */
+static inline void hd_free(void *ptr,const char *curloc)
+{
+ HD_Header_Type *hd_alloc = HD_AA_TO_HD_HDR(ptr);
+ HD_Header_Type *p = HD_root_of_list, *q = NULL;
+
+ psmi_heapdebug_val_heapallocs();
+ while (p)
+ {
+ if (p == hd_alloc)
+ {
+ /* first, fix the next pointers: */
+ if (q)
+ {
+ q->nextHD_header = p->nextHD_header;
+ }
+ else
+ {
+ psmi_assert_always(p == HD_root_of_list);
+ HD_root_of_list = p->nextHD_header;
+ }
+ /* Now, handle the case of removing the last entry in the list. */
+ if (&p->nextHD_header == HD_end_of_list)
+ {
+ if (q)
+ {
+ q->nextHD_header = NULL;
+ HD_end_of_list = &q->nextHD_header;
+ }
+ else
+ {
+ HD_root_of_list = NULL;
+ HD_end_of_list = &HD_root_of_list;
+ }
+ }
+ /* Scribble to the actual allocation to make further access to the heap
+ area unusable. */
+ n_allocations--;
+ memset(HD_HDR_TO_AA(hd_alloc),HD_NO_MANS_LAND,hd_alloc->sizeOfAlloc);
+ hd_alloc->freeLoc = curloc;
+ /* Add this allocation to the free list. */
+ HD_Free_Struct_Type *pfreestruct = (HD_Free_Struct_Type*)malloc(sizeof(HD_Free_Struct_Type));
+ *HD_free_list_bottom = pfreestruct;
+ HD_free_list_bottom = &pfreestruct->next_free_struct;
+ pfreestruct->freedStuct = hd_alloc;
+ pfreestruct->next_free_struct = NULL;
+ psmi_heapdebug_val_heapallocs();
+ return;
+ }
+ q = p;
+ p = p->nextHD_header;
+ }
+ /* trying to free a heap allocation that we did not allocate. */
+ psmi_assert_always(0);
+}
+
+size_t hd_malloc_usable_size(void *ptr,const char *curloc)
+{
+ HD_Header_Type *hd_alloc = HD_AA_TO_HD_HDR(ptr);
+ return hd_alloc->systemAllocSize;
+}
+
+#endif
+
+#ifdef PSM_HEAP_DEBUG
+
+/* For HD code, we retarget the malloc, memalign and free calls to the hd versions
+ * of the code. 
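+ *
+ * One behavioral note (inferred from hd_malloc_usable_size() above): under
+ * HD builds, my_malloc_usable_size() reports the whole system allocation,
+ * header, alignment padding and trailer included, so it can exceed what a
+ * plain malloc_usable_size() would report for the same request.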
*/ + +#define my_malloc(SZ,CURLOC) hd_malloc(SZ,CURLOC) +#define my_memalign(PTR,ALIGN,SZ,CURLOC) hd_memalign(PTR,ALIGN,SZ,CURLOC) +#define my_free(PTR,CURLOC) hd_free(PTR,CURLOC) +#define my_malloc_usable_size(PTR,CURLOC) hd_malloc_usable_size(PTR,CURLOC) + +#else + +/* For non-HD code, we target the code to the usual functions: */ +#define my_malloc(SZ,CURLOC) malloc(SZ) +#define my_memalign(PTR,ALIGN,SZ,CURLOC) posix_memalign(PTR,ALIGN,SZ) +#define my_free(PTR,CURLOC) free(PTR) +#define my_malloc_usable_size(PTR,CURLOC) malloc_usable_size(PTR) + +#endif + +void *psmi_malloc_internal(psm2_ep_t ep, psmi_memtype_t type, + size_t sz, const char *curloc) +{ + size_t newsz = sz; + void *newa; + + if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) + newsz += sizeof(struct psmi_memtype_hdr); + + newa = my_malloc(newsz,curloc); + if (newa == NULL) { + psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, + "Out of memory for malloc at %s", curloc); + return NULL; + } + + if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) { + struct psmi_memtype_hdr *hdr = (struct psmi_memtype_hdr *)newa; + hdr->size = newsz; + hdr->type = type; + hdr->magic = 0x8c; + hdr->original_allocation = newa; + psmi_log_memstats(type, newsz); + newa = (void *)(hdr + 1); + /* _HFI_INFO("alloc is %p\n", newa); */ + } + return newa; +} + +void *psmi_realloc_internal(psm2_ep_t ep, psmi_memtype_t type, + void *ptr, size_t nsz, const char *curloc) +{ + if (ptr) + { + size_t existingSize = psmi_malloc_usable_size_internal(ptr,curloc); + if (nsz > existingSize) + { + void *newPtr = psmi_malloc_internal(ep,type,nsz,curloc); + + memcpy(newPtr,ptr,existingSize); + psmi_free_internal(ptr,curloc); + return newPtr; + } + else + /* We will not support shrinking virtual space + for performance reasons. */ + return ptr; + } + else + return psmi_malloc_internal(ep,type,nsz,curloc); +} + +#ifdef memalign +#undef memalign +#endif +void *psmi_memalign_internal(psm2_ep_t ep, psmi_memtype_t type, + size_t alignment, size_t sz, const char *curloc) +{ + size_t newsz = sz; + void *newa; + int ret, preambleSize = 0; + + if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) + { + if (sizeof(struct psmi_memtype_hdr) > alignment) + { + int n = sizeof(struct psmi_memtype_hdr) / alignment; + int r = sizeof(struct psmi_memtype_hdr) % alignment; + if (r) + n++; + preambleSize = n * alignment; + } + else + preambleSize = alignment; + newsz += preambleSize; + } + + ret = my_memalign(&newa, alignment, newsz, curloc); + if (ret) { + psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, + "Out of memory for malloc at %s", curloc); + return NULL; + } + + if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) { + void *rv = newa + preambleSize; + struct psmi_memtype_hdr *hdr = (struct psmi_memtype_hdr *)(rv-sizeof(struct psmi_memtype_hdr)); + hdr->size = newsz; + hdr->type = type; + hdr->magic = 0x8c; + hdr->original_allocation = newa; + psmi_log_memstats(type, newsz); + newa = rv; + /* _HFI_INFO("alloc is %p\n", newa); */ + } + return newa; +} + +#ifdef calloc +#undef calloc +#endif + +void *psmi_calloc_internal(psm2_ep_t ep, psmi_memtype_t type, size_t nelem, + size_t elemsz, const char *curloc) +{ + void *newa = psmi_malloc_internal(ep, type, nelem * elemsz, curloc); + if (newa == NULL) /* error handled above */ + return NULL; + memset(newa, 0, nelem * elemsz); + return newa; +} + +#ifdef strdup +#undef strdup +#endif + +void *psmi_strdup_internal(psm2_ep_t ep, const char *string, const char *curloc) +{ + size_t len = strlen(string) + 1; + void *newa = psmi_malloc_internal(ep, 
UNDEFINED, len, curloc); + if (newa == NULL) + return NULL; + memcpy(newa, string, len); /* copy with \0 */ + return newa; +} + +#ifdef free +#undef free +#endif + +void MOCKABLE(psmi_free_internal)(void *ptr,const char *curloc) +{ + if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) { + struct psmi_memtype_hdr *hdr = + (struct psmi_memtype_hdr *)ptr - 1; + /* _HFI_INFO("hdr is %p, ptr is %p\n", hdr, ptr); */ + psmi_memtype_t type = hdr->type; + int64_t size = hdr->size; + int magic = (int)hdr->magic; + psmi_log_memstats(type, -size); + psmi_assert_always(magic == 0x8c); + ptr = hdr->original_allocation; + } + my_free(ptr,curloc); +} +MOCK_DEF_EPILOGUE(psmi_free_internal); + +#ifdef malloc_usable_size +#undef malloc_usable_size +#endif + +size_t psmi_malloc_usable_size_internal(void *ptr, const char *curLoc) +{ + return my_malloc_usable_size(ptr,curLoc); +} + +PSMI_ALWAYS_INLINE( +psm2_error_t +psmi_coreopt_ctl(const void *core_obj, int optname, + void *optval, uint64_t *optlen, int get)) +{ + psm2_error_t err = PSM2_OK; + + switch (optname) { + case PSM2_CORE_OPT_DEBUG: + /* Sanity check length */ + if (*optlen < sizeof(unsigned)) { + err = psmi_handle_error(NULL, + PSM2_PARAM_ERR, + "Option value length error"); + *optlen = sizeof(unsigned); + return err; + } + + if (get) { + *((unsigned *)optval) = hfi_debug; + } else + hfi_debug = *(unsigned *)optval; + break; + case PSM2_CORE_OPT_EP_CTXT: + { + /* core object is epaddr */ + psm2_epaddr_t epaddr = (psm2_epaddr_t) core_obj; + + /* Sanity check epaddr */ + if (!epaddr) { + return psmi_handle_error(NULL, + PSM2_PARAM_ERR, + "Invalid endpoint address"); + } + + /* Sanity check length */ + if (*optlen < sizeof(unsigned long)) { + err = psmi_handle_error(NULL, + PSM2_PARAM_ERR, + "Option value length error"); + *optlen = sizeof(void *); + return err; + } + + if (get) { + *((unsigned long *)optval) = + (unsigned long)epaddr->usr_ep_ctxt; + } else + epaddr->usr_ep_ctxt = optval; + } + break; + default: + /* Unknown/unrecognized option */ + err = psmi_handle_error(NULL, + PSM2_PARAM_ERR, + "Unknown PSM2_CORE option %u.", + optname); + break; + } + return err; +} + +psm2_error_t psmi_core_setopt(const void *core_obj, int optname, + const void *optval, uint64_t optlen) +{ + return psmi_coreopt_ctl(core_obj, optname, (void *)optval, &optlen, 0); +} + +psm2_error_t psmi_core_getopt(const void *core_obj, int optname, + void *optval, uint64_t *optlen) +{ + return psmi_coreopt_ctl(core_obj, optname, optval, optlen, 1); +} + +/* PSM AM component option handling */ +PSMI_ALWAYS_INLINE( +psm2_error_t +psmi_amopt_ctl(const void *am_obj, int optname, + void *optval, uint64_t *optlen, int get)) +{ + psm2_error_t err = PSM2_OK; + + /* AM object is a psm2_epaddr (or NULL for global minimum sz) */ + /* psm2_epaddr_t epaddr = (psm2_epaddr_t) am_obj; */ + + /* All AM options are read-only. */ + if (!get) { + return err = + psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_OPT_READONLY, + "Attempted to set read-only option value"); + } + + /* Sanity check length -- all AM options are uint32_t. 
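+ * For example, a caller querying PSM2_AM_OPT_FRAG_SZ must supply
+ * *optlen >= sizeof(uint32_t); on a length error the code below stores
+ * the required size in *optlen so the caller can retry with a larger
+ * buffer.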
 */
+ if (*optlen < sizeof(uint32_t)) {
+ *optlen = sizeof(uint32_t);
+ return err = psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_PARAM_ERR,
+ "Option value length error");
+ }
+
+ switch (optname) {
+ case PSM2_AM_OPT_FRAG_SZ:
+ *((uint32_t *) optval) = psmi_am_parameters.max_request_short;
+ break;
+ case PSM2_AM_OPT_NARGS:
+ *((uint32_t *) optval) = psmi_am_parameters.max_nargs;
+ break;
+ case PSM2_AM_OPT_HANDLERS:
+ *((uint32_t *) optval) = psmi_am_parameters.max_handlers;
+ break;
+ default:
+ err =
+ psmi_handle_error(NULL, PSM2_PARAM_ERR,
+ "Unknown PSM2_AM option %u.", optname);
+ }
+
+ return err;
+}
+
+psm2_error_t psmi_am_setopt(const void *am_obj, int optname,
+ const void *optval, uint64_t optlen)
+{
+ return psmi_amopt_ctl(am_obj, optname, (void *)optval, &optlen, 0);
+}
+
+psm2_error_t psmi_am_getopt(const void *am_obj, int optname,
+ void *optval, uint64_t *optlen)
+{
+ return psmi_amopt_ctl(am_obj, optname, optval, optlen, 1);
+}
+
+#ifdef PSM_LOG
+
+#include <execinfo.h>
+#include <fnmatch.h>
+#include <time.h>
+#include <limits.h>
+#include <alloca.h>
+#include "ptl_ips/ips_proto_header.h"
+
+/* A treeNode is used to store the list of Function Name Lists that
+ are passed to the PSM_LOG facility via environment variables.
+ See psm_log.h for more information.
+
+ Note that treeNode is a node in a binary tree data structure. */
+typedef struct _treeNode
+{
+ const char *name;
+ int line1,line2;
+ struct _treeNode *left,*right;
+} treeNode;
+
+/* An epmTreeNode is used to track the number of protocol packets
+ that are sent/received, for a given opcode, from a source epid
+ to another epid. */
+typedef struct _epmTreeNode
+{
+ int opcode,count,txrx;
+ uint64_t fromepid,toepid;
+ struct _epmTreeNode *left,*right;
+} epmTreeNode;
+
+
+/* given a line range: [*line1 .. *line2], and another line, line
+ 'join' the line range to the new line if the line immediately abuts
+ the line range. If the new line does not abut the existing range,
+ return 0. Else, return 1.
+
+ For example, take the line range [ 20 .. 30 ] and the line: 19.
+ Since 19 comes immediately before 20, the line range can be joined
+ resulting in the line range: [ 19 .. 30 ]. The function returns 1 for this
+ case.
+
+ The following other examples give the new line range given the new line and
+ range [ 20 .. 30 ], and give the return value:
+
+ 31 [ 20 .. 31 ] 1
+ 18 [ 20 .. 30 ] 0
+ 32 [ 20 .. 30 ] 0
+ 25 [ 20 .. 30 ] 1 */
+static int joinOverlap(int *line1,int *line2,int line)
+{
+ long long ll_line = line;
+
+ if (ll_line+1 >= *line1 && ll_line-1 <= *line2)
+ {
+ *line1 = min(*line1,line);
+ *line2 = max(*line2,line);
+ return 1;
+ }
+ return 0;
+}
+
+/* given two line ranges, determine the range that encompasses both line ranges
+ if an overlap has occurred. Returns 0 if the two ranges do not overlap and
+ do not abut.
+
+ Some examples, if line1=20 and line2=30
+
+ [20 30] [20 30] 2
+ [19 30] [19 30] 2
+ [19 20] [19 30] 2
+ [10 15] [20 30] 0
+ [40 50] [20 30] 0 */
+static int joinOverlapRange(int *line1,int *line2,int l1,int l2)
+{
+ return joinOverlap(line1,line2,l1) + joinOverlap(line1,line2,l2);
+}
+
+/* inserts a new treeNode into the FNL tree, or merges the lines that are already
+ present in the tree. 
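+
+ Worked example (using joinOverlap above): inserting "foo" with range
+ [1..10] and then "foo" with the single line 11 merges both into one
+ node covering [1..11], since line 11 abuts the existing range.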
*/ +static void insertNodeInTree(treeNode **root,const char *name,int line1,int line2) +{ + if (*root) + { + int c = strcmp(name,(*root)->name); + if (c < 0) + insertNodeInTree(&((*root)->left),name,line1,line2); + else if (c > 0) + insertNodeInTree(&((*root)->right),name,line1,line2); + else + { + if (joinOverlapRange(&(*root)->line1,&(*root)->line2,line1,line2)) + return; + else if (line1 < (*root)->line1) + insertNodeInTree(&((*root)->left),name,line1,line2); + else if (line2 > (*root)->line2) + insertNodeInTree(&((*root)->right),name,line1,line2); + else psmi_assert_always(0); /* should never happen. */ + } + } + else + { + *root = malloc(sizeof(treeNode)); + (*root)->name = strdup(name); + (*root)->line1 = line1; + (*root)->line2 = line2; + (*root)->left = (*root)->right = NULL; + } +} + +/* Returns -1 if the data in the node is less than the data supplied as parameter, else + Returns 1 if the data in the node is greater than the data supplied as parameter, else + Returns 0. + */ +static int compareEpmNode(epmTreeNode *node,int opcode,int txrx,uint64_t fromepid,uint64_t toepid) +{ +#define COMPARE_ONE(X) if (node->X != X) return node->X < X ? -1 : 1 + COMPARE_ONE(opcode); + COMPARE_ONE(txrx); + COMPARE_ONE(fromepid); + COMPARE_ONE(toepid); + return 0; +} + +/* Inserts a new node in the tree corresponding to the parameters, or, retrieves the node in the tree. + In either case, this code returns a pointer to the count in the node. */ +static int *insertNodeInEpmTree(epmTreeNode **root,int opcode,int txrx,uint64_t fromepid,uint64_t toepid) +{ + if (*root) + { + int a = compareEpmNode((*root),opcode,txrx,fromepid,toepid); + if (a < 0) + return insertNodeInEpmTree(&((*root)->left),opcode,txrx,fromepid,toepid); + else if (a > 0) + return insertNodeInEpmTree(&((*root)->right),opcode,txrx,fromepid,toepid); + else + return &((*root)->count); + } + else + { + *root = malloc(sizeof(epmTreeNode)); + (*root)->opcode = opcode; + (*root)->txrx = txrx; + (*root)->count = 0; + (*root)->fromepid = fromepid; + (*root)->toepid = toepid; + (*root)->left = (*root)->right = NULL; + return &((*root)->count); + } +} + +/* returns 0, if the node is present, non-zero if it is absent. */ +static int lookupNodeInTree(const treeNode *root,const char *name,int line) +{ + if (root) + { + int c = strcmp(name,root->name); + if (c < 0) + return lookupNodeInTree(root->left,name,line); + else if (c > 0) + return lookupNodeInTree(root->right,name,line); + else + { + if (line < root->line1) + return lookupNodeInTree(root->left,name,line); + else if (line > root->line2) + return lookupNodeInTree(root->right,name,line); + else /* line must be >= root->line1 and line must be <= root->line2. */ + return 0; + } + } + else + { + return 1; + } +} + +/* Declare a prototype for a parserFunc - referenced in the following code: */ +typedef void parserFunc(char *,int,int,void *); + +/* breaks down a string into 'c'-delimited substrings, and calls the parser func for each substring. */ +static void parseString(char *ps,char c,parserFunc pf,void *ctx) +{ + int idx,n=0; + char *p; + + /* first, count the number of instances of c in ps, for use by the parser function: */ + for (idx=0;ps[idx];idx++) + if (ps[idx] == c) + n++; + /* next, break down ps into 'c'-delimited substrings, and call parser function, pf for each substring: */ + for (idx=0,p=ps;p && *p;idx++) + { + char *t = strchr(p,c); + if (!t) + { + break; + } + else + { + *t = 0; + pf(p,idx,n,ctx); + p = t+1; + } + } + /* finally, call pf on the final substring. 
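+
+ For example, parsing a (writable) copy of "foo:bar" with c == ':' counts
+ one delimiter (so n == 1), calls pf("foo", 0, 1, ctx) for the first
+ substring, and then makes the final call pf("bar", 1, 1, ctx).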
 */
+ pf(p,idx,n,ctx);
+}
+
+/* fncNameCtx is the context used while parsing FNL's (see psm_log.h for more info) from the environment: */
+typedef struct
+{
+ const char *currentFuncName;
+ int firstLineNumber;
+ treeNode **root;
+} funcNameCtx;
+
+/* This is the start of the parser code for parsing FNL's. Here is the grammar:
+
+ An FNL is a 'Function Name List' that is defined by the following grammar:
+
+ # A LINE1 is either a single line number or a range of line numbers:
+(1) LINE1 :: lineNumber |
+(2) lineNumber1 '-' lineNumber2
+
+ # LINES is a list of LINE1's separated by commas:
+(3) LINES :: LINE1 |
+(4) LINE1 ',' LINES
+
+ # An FN is either a function name, or a function name with a list of lines:
+(5) FN :: functionName |
+(6) functionName ';' LINES
+
+ # A FNL is a list of FN's separated by colons:
+(7) FNL :: FN |
+(8) FN ':' FNL
+
+ # Examples:
+ foo:bar the two functions foo and bar
+ foo;1-10 lines 1 to 10 of function foo.
+ bar;1,3,5 lines 1, 3 and 5 of function bar
+
+*/
+
+/* p4() inserts a (function name, line number) pair or a (function name, line
+ number range) into the FNL tree.
+*/
+static void p4(char *s,int idx,int n,void *ctx)
+{
+ funcNameCtx *pfnc = (funcNameCtx *)ctx;
+
+ if (n == 0) /* production (1) */
+ {
+ pfnc->firstLineNumber = atoi(s);
+ insertNodeInTree(pfnc->root,pfnc->currentFuncName,pfnc->firstLineNumber,pfnc->firstLineNumber);
+ }
+ else if (n == 1) /* production (2) */
+ {
+ if (idx == 0) /* lhs of production (2) */
+ pfnc->firstLineNumber = atoi(s);
+ else /* rhs of production (2). */
+ insertNodeInTree(pfnc->root,pfnc->currentFuncName,pfnc->firstLineNumber,atoi(s));
+ }
+}
+
+/* p3 puts an entry into the FNL tree for all of the lines of a given function name, or it parses the list of line number ranges and
+ uses p4 to spill each individual range (or just one line number) into the tree */
+static void p3(char *s,int idx,int n,void *ctx)
+{
+ funcNameCtx *pfnc = (funcNameCtx *)ctx;
+
+ if (n == 0 && *s == 0) /* production (5)/(7) */
+ {
+ insertNodeInTree(pfnc->root,pfnc->currentFuncName,0,INT_MAX);
+ }
+ else if (*s) /* production (2) */
+ {
+ /* break down the string into hyphen-delimited substrings, and further parse each substring with p4: */
+ parseString(s,'-',p4,ctx);
+ }
+}
+
+/* p2 parses the function name, and caches it into the context, and thereafter uses p3 to parse the line number range list. */
+static void p2(char *s,int idx,int n,void *ctx)
+{
+ funcNameCtx *pfnc = (funcNameCtx *)ctx;
+
+ if (n)
+ {
+ if (idx == 0)
+ pfnc->currentFuncName = s;
+ else
+ {
+ /* production (4) */
+ /* break down the string into comma-delimited substrings, and further parse each substring with p3: */
+ parseString(s,',',p3,ctx);
+ }
+ }
+ else
+ {
+ /* production (7)/(5). */
+ insertNodeInTree(pfnc->root,pfnc->currentFuncName=s,0,INT_MAX);
+ }
+}
+
+/* p1 parses each function name and line range list. */
+static void p1(char *s,int idx,int n,void *ctx)
+{
+ /* production (5)/(6) */
+ /* break down the string into semicolon-delimited substrings, and further parse each substring with p2: */
+ parseString(s,';',p2,ctx);
+}
+
+static void parseAndInsertInTree(const char *buf,treeNode **root)
+{
+ funcNameCtx t;
+ t.root = root;
+ char *p = alloca(strlen(buf)+1);
+ strcpy(p,buf);
+ /* productions (7)/(8) */
+ /* separates the string into colon-separated substrings, and then parses each substring with p1: */
+ parseString(p,':',p1,(void*)&t);
+}
+
+/* initialization code for the psmi log mechanism. 
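+
+ Illustrative environment settings (values are examples only):
+ PSM2_LOG_FILENAME=/tmp/mylog redirects the output file kernel, and
+ PSM2_LOG_INC_FUNCTION_NAMES="foo;1-10:bar" limits logging to lines
+ 1-10 of foo() plus all of bar(), per the FNL grammar above; when
+ PSM2_LOG_SRCH_FORMAT_STRING is set, the FNL lists are not consulted.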
*/ +static inline void psmi_initialize(const char **plmf_fileName_kernel, + const char **plmf_search_format_string, + treeNode **includeFunctionNamesTreeRoot, + treeNode **excludeFunctionNamesTreeRoot) +{ + static volatile int plmf_initialized = 0; + + if (!plmf_initialized) + { + static pthread_mutex_t plmf_init_mutex = PTHREAD_MUTEX_INITIALIZER; + + if (pthread_mutex_lock(&plmf_init_mutex)) + { + perror("cannot lock mutex for psmi_log_message facility"); + return; + } + /* CRITICAL SECTION BEGIN */ + if (!plmf_initialized) + { + /* initializing psmi log message facility here. */ + const char *env = getenv("PSM2_LOG_FILENAME"); + if (env) + *plmf_fileName_kernel = env; + env = getenv("PSM2_LOG_SRCH_FORMAT_STRING"); + if (env) + { + *plmf_search_format_string = env; + } + else + { + env = getenv("PSM2_LOG_INC_FUNCTION_NAMES"); + if (env) + { + parseAndInsertInTree(env,includeFunctionNamesTreeRoot); + } + env = getenv("PSM2_LOG_EXC_FUNCTION_NAMES"); + if (env) + { + parseAndInsertInTree(env,excludeFunctionNamesTreeRoot); + } + } + /* initialization of psmi log message facility is completed. */ + plmf_initialized = 1; + } + /* CRITICAL SECTION END */ + if (pthread_mutex_unlock(&plmf_init_mutex)) + { + perror("cannot unlock mutex for psmi_log_message facility"); + return; + } + } +} + +/* Utility function to map the integer txrx value to the given strings for emitting to the log file. */ +static const char * const TxRxString(int txrx) +{ + switch(txrx) + { + case PSM2_LOG_TX: return "Sent"; + case PSM2_LOG_RX: return "Received"; + case PSM2_LOG_PEND: return "Pending"; + default: return "Unknown"; + } +} + +/* Utility function to map an integer opcode value to the given strings for emitting to the log file. */ +static const char * const OpcodeString(int opcode) +{ + switch(opcode) + { + case OPCODE_LONG_RTS: return "RTS"; + case OPCODE_LONG_CTS: return "CTS"; + case OPCODE_LONG_DATA: return "DATA"; + case OPCODE_EXPTID: return "EXPTID"; + case OPCODE_EXPTID_COMPLETION: return "EXPTID_COMPLETION"; + default: return "UNKNOWN"; + } +} + +static const char *plmf_fileName_kernel = "/tmp/psm2_log"; +static const char *plmf_search_format_string = NULL; +static treeNode *includeFunctionNamesTreeRoot = NULL; +static treeNode *excludeFunctionNamesTreeRoot = NULL; + +void psmi_log_initialize(void) +{ + /* If not initialized, then, initialize in a single thread of execution. */ + psmi_initialize(&plmf_fileName_kernel, + &plmf_search_format_string, + &includeFunctionNamesTreeRoot, + &excludeFunctionNamesTreeRoot); +} + +#ifdef PSM_LOG_FAST_IO + +struct psmi_log_io_thread_info +{ + pthread_t thread_id; + char *buff; + unsigned long max_buff_length, curr_buff_length; + pthread_mutex_t flags_mutex; + volatile int flags; +#define PSMI_LOG_IO_FLAG_IO_IN_PROGRESS 1 /* io is currently in progress */ +#define PSMI_LOG_IO_FLAG_IO_SHUTDOWN 2 /* we are shutting down logging. */ +}; + +/* Please note that psmi_log_io_info is in thread local storage. 
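+ Each thread accumulates log records in its own private buffer; at
+ shutdown, psmi_log_fini() below walks the global table of per-thread
+ buffers and flushes each one to its own log file.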
*/ +static __thread struct psmi_log_io_thread_info psmi_log_io_info = +{ + .thread_id = 0, + .buff = NULL, + .max_buff_length = 0, + .curr_buff_length = 0, + .flags_mutex = PTHREAD_MUTEX_INITIALIZER, + .flags = 0 +}; + +static struct +{ + unsigned int nTableEntries,maxTableEntries; + pthread_mutex_t table_mutex; + struct psmi_log_io_thread_info **table; +} psmi_log_io_table = +{ + .nTableEntries = 0, + .maxTableEntries = 0, + .table_mutex = PTHREAD_MUTEX_INITIALIZER, + .table = NULL +}; + +void psmi_log_fini() +{ + if (pthread_mutex_lock(&psmi_log_io_table.table_mutex)) + { + perror("Cannot lock mutex for psmi_log_io_table"); + return; + } + /* Start critical section. */ + + unsigned int i; + for (i=0;i < psmi_log_io_table.nTableEntries;i++) + { + if (psmi_log_io_table.table[i]) + { + struct psmi_log_io_thread_info *pti = psmi_log_io_table.table[i]; + int flags; + + if (pthread_mutex_lock(&pti->flags_mutex)) + { + perror("can't lock the flags mutex."); + continue; + } + /* critical section */ + flags = (pti->flags |= PSMI_LOG_IO_FLAG_IO_SHUTDOWN); + /* end critical section */ + pthread_mutex_unlock(&pti->flags_mutex); + /* if io is currenctly in progress, allow it to complete. */ + while (flags & PSMI_LOG_IO_FLAG_IO_IN_PROGRESS) + { + sleep(1); + if (pthread_mutex_lock(&pti->flags_mutex)) + { + perror("can't lock the flags mutex."); + continue; + } + flags = pti->flags; + pthread_mutex_unlock(&pti->flags_mutex); + } + if (pti->buff) + { + char logFileName[256]; + FILE *fout; + + snprintf(logFileName,sizeof(logFileName),"%s.%d.%ld", + plmf_fileName_kernel,getpid(),pti->thread_id); + fout = fopen(logFileName,"w"); + if (!fout) + { + perror(logFileName); + continue; + } + fwrite(pti->buff,pti->curr_buff_length,1,fout); + fclose(fout); + } + } + psmi_log_io_table.table[i] = NULL; + } + psmi_log_io_table.nTableEntries = 0; + psmi_free(psmi_log_io_table.table); + psmi_log_io_table.table = NULL; + psmi_log_io_table.maxTableEntries = 0; + /* End critical section. */ + pthread_mutex_unlock(&psmi_log_io_table.table_mutex); +} + +static int psmi_log_register_tls(void) +{ + if (psmi_log_io_info.thread_id != pthread_self()) + { + psmi_log_io_info.thread_id = pthread_self(); + if (pthread_mutex_lock(&psmi_log_io_table.table_mutex)) + { + perror("cannot lock table mutex"); + return -1; + } + /* critical section start. */ + if (psmi_log_io_table.maxTableEntries < psmi_log_io_table.nTableEntries+1) + { + if (psmi_log_io_table.maxTableEntries == 0) + { + psmi_log_io_table.maxTableEntries = 2; + psmi_log_io_table.table = psmi_malloc(PSMI_EP_NONE, + PER_PEER_ENDPOINT, + psmi_log_io_table.maxTableEntries * + sizeof(struct psmi_log_io_thread_info *)); + } + else + { + psmi_log_io_table.maxTableEntries *= 2; + psmi_log_io_table.table = psmi_realloc(PSMI_EP_NONE, + PER_PEER_ENDPOINT, + psmi_log_io_table.table, + psmi_log_io_table.maxTableEntries * + sizeof(struct psmi_log_io_thread_info *)); + } + } + psmi_log_io_table.table[psmi_log_io_table.nTableEntries] = &psmi_log_io_info; + psmi_log_io_table.nTableEntries++; + /* critical section end. */ + pthread_mutex_unlock(&psmi_log_io_table.table_mutex); + } + if (pthread_mutex_lock(&psmi_log_io_info.flags_mutex)) + { + perror("cannot lock table mutex"); + return -1; + } + /* critical section start. */ + int old_flags = psmi_log_io_info.flags; + int new_flags = old_flags; + if (0 == (old_flags & PSMI_LOG_IO_FLAG_IO_SHUTDOWN)) + new_flags |= PSMI_LOG_IO_FLAG_IO_IN_PROGRESS; + psmi_log_io_info.flags = new_flags; + /* critical section end. 
 */
+ pthread_mutex_unlock(&psmi_log_io_info.flags_mutex);
+ if (new_flags & PSMI_LOG_IO_FLAG_IO_IN_PROGRESS)
+ return 0;
+ return -1;
+}
+
+static void psmi_buff_fclose(int port)
+{
+ if (pthread_mutex_lock(&psmi_log_io_info.flags_mutex))
+ {
+ perror("cannot lock table mutex");
+ return;
+ }
+ /* critical section start. */
+ psmi_log_io_info.flags &= ~PSMI_LOG_IO_FLAG_IO_IN_PROGRESS;
+ /* critical section end. */
+ pthread_mutex_unlock(&psmi_log_io_info.flags_mutex);
+}
+
+static void growBuff(size_t minExcess)
+{
+ while (psmi_log_io_info.curr_buff_length+minExcess > psmi_log_io_info.max_buff_length)
+ {
+ if (!psmi_log_io_info.buff)
+ psmi_log_io_info.buff = (char *)psmi_malloc(PSMI_EP_NONE, PER_PEER_ENDPOINT,
+ psmi_log_io_info.max_buff_length = 1 << 20);
+ else
+ {
+ psmi_log_io_info.max_buff_length *= 2;
+ psmi_log_io_info.buff = (char *)psmi_realloc(PSMI_EP_NONE, PER_PEER_ENDPOINT,
+ psmi_log_io_info.buff,
+ psmi_log_io_info.max_buff_length);
+ }
+ }
+}
+
+static int psmi_buff_vfprintf(int port, const char *format, va_list ap)
+{
+ int done = 0;
+ size_t excess = 1024;
+ int length;
+
+ while (!done)
+ {
+ va_list ap_copy;
+
+ growBuff(excess);
+
+ /* vsnprintf() consumes its va_list, so format from a copy in
+ case the buffer is too small and we must retry with more
+ space. */
+ va_copy(ap_copy, ap);
+ length = vsnprintf(psmi_log_io_info.buff + psmi_log_io_info.curr_buff_length,
+ excess, format, ap_copy);
+ va_end(ap_copy);
+ if (length >= excess)
+ excess *= 2;
+ else
+ done = 1;
+ }
+ psmi_log_io_info.curr_buff_length += length;
+ return length;
+}
+
+static int psmi_buff_fprintf(int port,const char *format, ...)
+{
+ int length;
+ va_list ap;
+
+ va_start(ap, format);
+
+ length = psmi_buff_vfprintf(port,format,ap);
+
+ va_end(ap);
+ return length;
+}
+
+static int psmi_buff_fputc(int c, int port)
+{
+ growBuff(1024);
+ psmi_log_io_info.buff[psmi_log_io_info.curr_buff_length] = c;
+ psmi_log_io_info.curr_buff_length++;
+ return 1;
+}
+#endif
+
+
+#define IS_PSMI_LOG_MAGIC(S) ((((uint64_t)(S)) <= ((uint64_t)PSM2_LOG_MIN_MAGIC)) && \
+ (((uint64_t)(S)) >= ((uint64_t)PSM2_LOG_MAX_MAGIC)))
+
+/* plmf is short for 'psm log message facility'. All of the PSM_LOG macros defined in psm_log.h
+ are serviced from this back end. */
+void psmi_log_message(const char *fileName,
+ const char *functionName,
+ int lineNumber,
+ const char *format, ...)
+{
+ va_list ap;
+
+ va_start(ap, format);
+
+ /* Next, determine if this log message is signal or noise. */
+ if (plmf_search_format_string)
+ {
+ if (!IS_PSMI_LOG_MAGIC(format))
+ {
+ if (fnmatch(plmf_search_format_string, format, 0))
+ {
+ va_end(ap);
+ /* tis noise, return. */
+ return;
+ }
+ }
+ }
+ else
+ {
+ if (includeFunctionNamesTreeRoot)
+ {
+ if (lookupNodeInTree(includeFunctionNamesTreeRoot,functionName,lineNumber))
+ {
+ va_end(ap);
+ /* tis noise, return. */
+ return;
+ }
+ }
+
+ if (excludeFunctionNamesTreeRoot)
+ {
+ if (!lookupNodeInTree(excludeFunctionNamesTreeRoot,functionName,lineNumber))
+ {
+ va_end(ap);
+ /* tis noise, return. */
+ return;
+ }
+ }
+ }
+
+ /* At this point, we think that this may be a message that we want to emit to the log.
+ But, there is one more test to apply to the cases where the format is one of the
+ special formats, for backtrace and packet stream for example. 
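+
+ For reference, the magic formats and the extra leading varargs they pop
+ (mirroring the va_arg() calls below):
+ PSM2_LOG_BT_MAGIC: void **frames, int nframes, const char *fmt, ...
+ PSM2_LOG_EPM_MAGIC: int opcode, psmi_log_tx_rx_t txrx, uint64_t fromepid,
+ uint64_t toepid, const char *fmt, ...
+ PSM2_LOG_DUMP_MAGIC: void *addr, size_t len, const char *fmt, ...
+ PSM2_LOG_PKT_STRM_MAGIC: psmi_log_tx_rx_t txrx, struct ips_message_header
+ *imh, (on RX only) uint32_t *rhf, const char *fmt, ...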
*/ + { + void **voidarray = NULL; + int nframes = 0; + const char *newFormat = format; + int opcode = 0; + psmi_log_tx_rx_t txrx = 0; + uint64_t fromepid = 0; + uint64_t toepid = 0; + void *dumpAddr[2] = {0}; + size_t dumpSize[2] = {0}; + +#ifdef PSM_LOG_FAST_IO +#define IO_PORT 0 +#define MY_FPRINTF psmi_buff_fprintf +#define MY_VFPRINTF psmi_buff_vfprintf +#define MY_FPUTC psmi_buff_fputc +#define MY_FCLOSE psmi_buff_fclose +#else + char logFileName[256]; + FILE *fout; +#define IO_PORT fout +#define MY_FPRINTF fprintf +#define MY_VFPRINTF vfprintf +#define MY_FPUTC fputc +#define MY_FCLOSE fclose +#endif + struct timespec tp; + + /* Pop arguments for the alternative forms of PSM_LOG functionality: */ + if (format == PSM2_LOG_BT_MAGIC) + { + voidarray = va_arg(ap,void **); + nframes = va_arg(ap,int); + newFormat = va_arg(ap,const char *); + } + else if (format == PSM2_LOG_EPM_MAGIC) + { + opcode = va_arg(ap,int); + txrx = va_arg(ap,psmi_log_tx_rx_t); + fromepid = va_arg(ap,uint64_t); + toepid = va_arg(ap,uint64_t); + newFormat = va_arg(ap,const char *); + } + else if (format == PSM2_LOG_DUMP_MAGIC) + { + dumpAddr[0] = va_arg(ap,void*); + dumpSize[0] = va_arg(ap,size_t); + newFormat = va_arg(ap,const char *); + } + else if (format == PSM2_LOG_PKT_STRM_MAGIC) + { + txrx = va_arg(ap,psmi_log_tx_rx_t); + dumpAddr[0] = va_arg(ap,struct ips_message_header *); + if (txrx == PSM2_LOG_RX) + { + dumpAddr[1] = va_arg(ap,uint32_t *); + dumpSize[1] = sizeof(uint64_t); + } + newFormat = va_arg(ap,const char *); + dumpSize[0] = sizeof(struct ips_message_header); + } + + /* One last test to make sure that this message is signal: */ + if (plmf_search_format_string && newFormat) + { + if (fnmatch(plmf_search_format_string, newFormat, 0)) + { + va_end(ap); + /* tis noise, return. */ + return; + } + } + +#ifdef PSM_LOG_FAST_IO + if (psmi_log_register_tls() != 0) + { + va_end(ap); + return; + } +#else + /* At this point we know that the message is not noise, and it is going to be emitted to the log. 
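+ Each message goes to a per-process, per-thread file named
+ "<fileName_kernel>.<pid>.<thread id>", i.e. /tmp/psm2_log.<pid>.<tid>
+ with the default kernel.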
*/ + snprintf(logFileName,sizeof(logFileName),"%s.%d.%ld", + plmf_fileName_kernel,getpid(), + pthread_self()); + fout = fopen(logFileName,"a"); + if (!fout) + { + va_end(ap); + return; + } +#endif + +#define M1() clock_gettime(CLOCK_REALTIME, &tp); \ + MY_FPRINTF(IO_PORT,"%f %s %s:%d: ", \ + (double)tp.tv_sec + ((double)tp.tv_nsec/1000000000.0), \ + functionName,fileName,lineNumber) + + M1(); + + if (!IS_PSMI_LOG_MAGIC(format)) + { + MY_VFPRINTF(IO_PORT,format,ap); + MY_FPUTC('\n',IO_PORT); + } + else if (format == PSM2_LOG_BT_MAGIC) + { + void *newframes[nframes]; + int newframecnt = backtrace(newframes,nframes); + int pframes = min(newframecnt,nframes); + + MY_VFPRINTF(IO_PORT,newFormat,ap); + MY_FPUTC('\n',IO_PORT); + + if (memcmp(voidarray,newframes,pframes * sizeof(void*))) + { + int i; + char **strings; + + memcpy(voidarray,newframes,sizeof(newframes)); + M1(); + MY_FPRINTF(IO_PORT, + "backtrace() returned %d addresses\n", + newframecnt); + strings = backtrace_symbols(voidarray, pframes); + if (strings == NULL) + { + perror("backtrace_symbols"); + exit(EXIT_FAILURE); + } + for (i = 0; i < pframes; i++) + { + M1(); + MY_FPRINTF(IO_PORT,"%s\n", strings[i]); + } +#undef free + free(strings); + } + } + else if (format == PSM2_LOG_EPM_MAGIC) + { + static epmTreeNode *root = 0; + static pthread_mutex_t plmf_epm_mutex = + PTHREAD_MUTEX_INITIALIZER; + int *pcount = 0; + if (pthread_mutex_lock(&plmf_epm_mutex)) + { + perror("cannot lock mutex for " + "psmi_log_message facility"); + va_end(ap); + return; + } + /* START OF CRITICAL SECTION */ + pcount = insertNodeInEpmTree(&root,opcode,txrx, + fromepid,toepid); + /* END OF CRITICAL SECTION */ + if (pthread_mutex_unlock(&plmf_epm_mutex)) + { + perror("cannot unlock mutex for " + "psmi_log_message facility"); + va_end(ap); + return; + } + (*pcount)++; + MY_FPRINTF(IO_PORT,"%s %s from: %" PRIx64 + ", to: %" PRIx64 ", count: %d, ", + TxRxString(txrx),OpcodeString(opcode), + fromepid,toepid,*pcount); + MY_VFPRINTF(IO_PORT,newFormat,ap); + MY_FPUTC('\n',IO_PORT); + } + else if (format == PSM2_LOG_PKT_STRM_MAGIC) + { + MY_FPRINTF(IO_PORT,"PKT_STRM: %s: imh: %p%s ", TxRxString(txrx), + dumpAddr[0], (txrx == PSM2_LOG_RX) ? "," : ""); + if (txrx == PSM2_LOG_RX) + MY_FPRINTF(IO_PORT,"rhf: %p ", dumpAddr[1]); + goto dumpit; + } + else if (format == PSM2_LOG_DUMP_MAGIC) + { + MY_VFPRINTF(IO_PORT,newFormat,ap); + MY_FPUTC('\n',IO_PORT); + dumpit: + M1(); + + uint8_t *pu8 = (uint8_t *)dumpAddr[0]; + size_t i,cnt=0; + for (i=0;i < dumpSize[0];i++) + { + if ((i != 0) && ((i % 8) == 0)) + { + MY_FPRINTF(IO_PORT," (%d)\n",(int)(i-8)); + M1(); + cnt = 0; + } + else if (cnt) + MY_FPUTC(',',IO_PORT); + MY_FPRINTF(IO_PORT,"0x%02x", pu8[i]); + cnt++; + } + if (cnt) + MY_FPRINTF(IO_PORT," (%d)\n",(int)(i-8)); + if (dumpSize[1]) + { + dumpSize[0] = dumpSize[1]; + dumpAddr[0] = dumpAddr[1]; + dumpSize[1] = 0; + goto dumpit; + } + } + MY_FCLOSE(IO_PORT); + } + + va_end(ap); +} +#endif /* #ifdef PSM_LOG */ diff --git a/psm_utils.h b/psm_utils.h new file mode 100644 index 0000000..fc38153 --- /dev/null +++ b/psm_utils.h @@ -0,0 +1,375 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. 
+ + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+*/
+
+#ifndef _PSMI_IN_USER_H
+#error psm_utils.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSMI_UTILS_H
+#define _PSMI_UTILS_H
+
+#include <arpa/inet.h> /* ipv4addr */
+#include <stdlib.h> /* malloc/free */
+
+/*
+ * Endpoint 'id' hash table, with iterator interface
+ */
+struct psmi_epid_table {
+ struct psmi_epid_tabentry *table;
+ int tabsize;
+ int tabsize_used;
+ pthread_mutex_t tablock;
+};
+/*
+ * Endpoint address hash table
+ */
+struct psmi_epid_tabentry {
+ void *entry;
+ uint64_t key;
+ psm2_ep_t ep;
+ psm2_epid_t epid;
+};
+
+extern struct psmi_epid_table psmi_epid_table;
+#define EPADDR_DELETED ((void *)-1) /* tag used to mark deleted entries */
+
+psm2_error_t psmi_epid_init();
+psm2_error_t psmi_epid_fini();
+void *psmi_epid_lookup(psm2_ep_t ep, psm2_epid_t epid);
+void *psmi_epid_remove(psm2_ep_t ep, psm2_epid_t epid);
+psm2_error_t psmi_epid_add(psm2_ep_t ep, psm2_epid_t epid, void *entry);
+#define PSMI_EP_HOSTNAME ((psm2_ep_t) -1) /* Special endpoint handle we use
+ * to register hostnames */
+#define PSMI_EP_CROSSTALK ((psm2_ep_t) -2) /* Second special endpoint handle
+ * to log which nodes we've seen
+ * crosstalk from */
+struct psmi_eptab_iterator {
+ int i; /* last index looked up */
+ psm2_ep_t ep;
+};
+void psmi_epid_itor_init(struct psmi_eptab_iterator *itor, psm2_ep_t ep);
+void *psmi_epid_itor_next(struct psmi_eptab_iterator *itor);
+void psmi_epid_itor_fini(struct psmi_eptab_iterator *itor);
+
+uint64_t psmi_epid_version(psm2_epid_t epid);
+
+/*
+ * Hostname manipulation
+ */
+char *psmi_gethostname(void);
+const char *psmi_epaddr_get_hostname(psm2_epid_t epid);
+const char *psmi_epaddr_get_name(psm2_epid_t epid);
+psm2_error_t psmi_epid_set_hostname(uint64_t nid, const char *hostname,
+ int overwrite);
+
+/*
+ * Memory allocation, use macros only.
+ *
+ * In all calls, ep can be a specific endpoint (valid psm2_ep_t) or PSMI_EP_NONE
+ * if no endpoint is available.
+ *
+ * psmi_malloc_usable_size(void *ptr)
+ * psmi_malloc(ep, memtype, size)
+ * psmi_realloc(ep, memtype, ptr, newsize)
+ * psmi_memalign(ep, memtype, alignment, size)
+ * psmi_calloc(ep, memtype, numelems, elemsz)
+ * psmi_strdup(ep, string)
+ * psmi_free(ptr)
+ *
+ */
+typedef enum psmi_memtype {
+ TOTAL = 0, /* Logged automatically by malloc/calloc */
+ UNDEFINED, /* For tracking "other types" of allocations */
+ PER_PEER_ENDPOINT, /* For tracking "per peer" allocations */
+ NETWORK_BUFFERS, /* For tracking network buffers */
+ DESCRIPTORS, /* For tracking send/recv descriptors */
+ UNEXPECTED_BUFFERS, /* For tracking unexpected recv buffers */
+ STATS, /* For tracking stats-related allocs */
+} psmi_memtype_t;
+
+/*
+ * We track allocation stats. 
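+ * For each tracked memtype there is a running byte total (m_*_total) and a
+ * high-water mark (m_*_max); psmi_log_memstats() in psm_utils.c maintains
+ * both on every logged allocation and free.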
+ */ +struct psmi_stats_malloc { + int64_t m_all_total; + int64_t m_all_max; + int64_t m_perpeer_total; + int64_t m_perpeer_max; + int64_t m_netbufs_total; + int64_t m_netbufs_max; + int64_t m_descriptors_total; + int64_t m_descriptors_max; + int64_t m_unexpbufs_total; + int64_t m_unexpbufs_max; + int64_t m_undefined_total; + int64_t m_undefined_max; + int64_t m_stats_total; + int64_t m_stats_max; +}; + +extern struct psmi_stats_malloc psmi_stats_memory; + +void *psmi_malloc_internal(psm2_ep_t ep, psmi_memtype_t mt, size_t sz, + const char *curloc); +void *psmi_realloc_internal(psm2_ep_t ep, psmi_memtype_t mt, void *ptr, + size_t newSz, const char *curloc); +void *psmi_memalign_internal(psm2_ep_t ep, psmi_memtype_t mt, size_t alignment, + size_t sz, const char *curloc); +void *psmi_calloc_internal(psm2_ep_t ep, psmi_memtype_t mt, size_t num, + size_t sz, const char *curloc); +void *psmi_strdup_internal(psm2_ep_t ep, const char *string, const char *curloc); + +void MOCKABLE(psmi_free_internal)(void *ptr, const char *curLoc); +MOCK_DCL_EPILOGUE(psmi_free_internal); + +size_t psmi_malloc_usable_size_internal(void *ptr, const char *curLoc); + +#ifdef PSM_HEAP_DEBUG +/* During heap debug code, we can sprinkle function calls: + psmi_heapdebug_val_heapallocs(), that will examine all of the heap allocations + to ensure integrity. */ +void _psmi_heapdebug_val_heapallocs(const char *curloc); + +#define psmi_heapdebug_val_heapallocs() _psmi_heapdebug_val_heapallocs(PSMI_CURLOC) + +#else + +#define psmi_heapdebug_val_heapallocs() /* nothing */ + +#endif + +#define psmi_strdup(ep, string) psmi_strdup_internal(ep, string, PSMI_CURLOC) +#define psmi_calloc(ep, mt, nelem, elemsz) \ + psmi_calloc_internal(ep, mt, nelem, elemsz, PSMI_CURLOC) +#define psmi_malloc(ep, mt, sz) psmi_malloc_internal(ep, mt, sz, PSMI_CURLOC) +#define psmi_realloc(ep, mt, ptr, nsz) psmi_realloc_internal(ep, mt, ptr, nsz, PSMI_CURLOC) +#define psmi_memalign(ep, mt, al, sz) \ + psmi_memalign_internal(ep, mt, al, sz, PSMI_CURLOC) +#define psmi_free(ptr) psmi_free_internal(ptr, PSMI_CURLOC) +#define psmi_malloc_usable_size(ptr) psmi_malloc_usable_size_internal(ptr, PSMI_CURLOC) +#ifndef PSM_IS_TEST +#define malloc(sz) _use_psmi_malloc_instead_of_plain_malloc +#define realloc(ptr,nsz) _use_psmi_realloc_instead_of_plain_realloc +#define memalign(algn,sz) _use_psmi_memalign_instead_of_plain_memalign +#define calloc(sz, nelm) _use_psmi_calloc_instead_of_plain_calloc +#ifdef strdup +#undef strdup +#endif +#define strdup(ptr) _use_psmi_strdup_instead_of_plain_strdup +#define free(ptr) _use_psmi_free_instead_of_plain_free +#define malloc_usable_size(ptr) _use_psmi_malloc_usable_size_instead_of_plain_malloc_usable_size +#endif /* PSM_IS_TEST */ + +void psmi_log_memstats(psmi_memtype_t type, int64_t nbytes); + +/* + * Parsing int parameters set in string tuples. + */ +int psmi_parse_str_tuples(const char *str, int ntup, int *vals); + +/* + * Resource Limiting based on PSM memory mode. 
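+ *
+ * A hypothetical descriptor (all names and numbers invented for
+ * illustration) could look like:
+ *
+ *   static const struct psmi_rlimit_mpool example_rlim = {
+ *       .env = "PSM2_EXAMPLE_MAX", .descr = "Example object maximum",
+ *       .env_level = PSMI_ENVVAR_LEVEL_USER, .minval = 1, .maxval = 1 << 20,
+ *       .mode[PSMI_MEMMODE_NORMAL]  = { .obj_chunk = 256, .obj_max = 4096 },
+ *       .mode[PSMI_MEMMODE_MINIMAL] = { .obj_chunk = 64,  .obj_max = 1024 },
+ *       .mode[PSMI_MEMMODE_LARGE]   = { .obj_chunk = 512, .obj_max = 16384 },
+ *   };
+ *
+ * psmi_parse_mpool_env() then validates any environment override against
+ * [minval, maxval] and hands back the per-mode obj_max and obj_chunk.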
+ */ +#define PSMI_MEMMODE_NORMAL 0 +#define PSMI_MEMMODE_MINIMAL 1 +#define PSMI_MEMMODE_LARGE 2 +#define PSMI_MEMMODE_NUM 3 + +struct psmi_rlimit_mpool { + const char *env; + const char *descr; + int env_level; + uint32_t minval; + uint32_t maxval; + struct { + uint32_t obj_chunk; + uint32_t obj_max; + } mode[PSMI_MEMMODE_NUM]; +}; +psm2_error_t psmi_parse_mpool_env(const psm2_mq_t mq, int level, + const struct psmi_rlimit_mpool *rlim, + uint32_t *valo, uint32_t *chunkszo); +int psmi_parse_memmode(void); + +/* + * Parsing environment variables + */ + +union psmi_envvar_val { + void *e_void; + char *e_str; + int e_int; + unsigned int e_uint; + long e_long; + unsigned long e_ulong; + unsigned long long e_ulonglong; +}; + +#define PSMI_ENVVAR_LEVEL_USER 1 +#define PSMI_ENVVAR_LEVEL_HIDDEN 2 +#define PSMI_ENVVAR_LEVEL_NEVER_PRINT 4 + +#define PSMI_ENVVAR_TYPE_YESNO 0 +#define PSMI_ENVVAR_TYPE_STR 1 +#define PSMI_ENVVAR_TYPE_INT 2 +#define PSMI_ENVVAR_TYPE_UINT 3 +#define PSMI_ENVVAR_TYPE_UINT_FLAGS 4 +#define PSMI_ENVVAR_TYPE_LONG 5 +#define PSMI_ENVVAR_TYPE_ULONG 6 +#define PSMI_ENVVAR_TYPE_ULONG_FLAGS 7 +#define PSMI_ENVVAR_TYPE_ULONG_ULONG 8 + +#define PSMI_ENVVAR_VAL_YES ((union psmi_envvar_val) 1) +#define PSMI_ENVVAR_VAL_NO ((union psmi_envvar_val) 0) + +int +MOCKABLE(psmi_getenv)(const char *name, const char *descr, int level, + int type, union psmi_envvar_val defval, + union psmi_envvar_val *newval); +MOCK_DCL_EPILOGUE(psmi_getenv); +/* + * Misc functionality + */ +uintptr_t psmi_getpagesize(void); +uint64_t psmi_cycles_left(uint64_t start_cycles, int64_t timeout_ns); +uint32_t psmi_get_ipv4addr(); +void psmi_syslog(psm2_ep_t ep, int to_console, int level, + const char *format, ...); +void psmi_uuid_unparse(const psm2_uuid_t uuid, char *out); +int psmi_uuid_compare(const psm2_uuid_t uuA, const psm2_uuid_t uuB); +void *psmi_memcpyo(void *dst, const void *src, size_t n); +uint32_t psmi_crc(unsigned char *buf, int len); + +/* + * Internal CPUID detection + */ +#define CPUID_FAMILY_MASK 0x00000f00 +#define CPUID_MODEL_MASK 0x000000f0 +#define CPUID_EXMODEL_MASK 0x000f0000 + +/* + * CPUID return values + */ +#define CPUID_FAMILY_XEON 0x00000600 +#define CPUID_MODEL_PHI_GEN2 87 +#define CPUID_MODEL_PHI_GEN2M 133 +/* + * cpuid function 0, returns "GeniuneIntel" in EBX,ECX,EDX + * due to Little Endian and Hex it is not so obvious + */ +#define CPUID_GENUINE_INTEL_EBX 0x756e6547 /* "uneG" - Little Endian "Genu" */ +#define CPUID_GENUINE_INTEL_ECX 0x6c65746e /* "Ieni" - Little Endian "ineI" */ +#define CPUID_GENUINE_INTEL_EDX 0x49656e69 /* "letn" - Little Endian "ntel" */ + +/* + * These values are internal only, not real register values + */ +#define CPUID_GENUINE_INTEL 0xf0000000 +#define CPUID_MODEL_UNDEFINED -1 + +/* + * Global model so we can tune defaults better for specific cpu's + */ +uint32_t psmi_cpu_model; + +/* + * Diagnostics, all in psm_diags.c + */ +int psmi_diags(void); + +/* + * Multiple Endpoints + */ +extern int psmi_multi_ep_enabled; +void psmi_multi_ep_init(); + +/* + * Fault injection + */ +struct psmi_faultinj_spec; +int psmi_faultinj_enabled; /* use macro to test */ +#if 1 /* possible to disable at compile time */ +#define PSMI_FAULTINJ_ENABLED() (!!psmi_faultinj_enabled) +#else +#define PSMI_FAULTINJ_ENABLED() 0 +#endif + +void psmi_faultinj_init(); +void psmi_faultinj_fini(); +struct psmi_faultinj_spec *psmi_faultinj_getspec(char *spec_name, + int num, int denom); +#define PSMI_FAULTINJ_STATIC_DECL(var, spec_name, num, denom) \ + static struct psmi_faultinj_spec *var; 
+
+/*
+ * PSM core component set/get options
+ */
+psm2_error_t psmi_core_setopt(const void *core_obj, int optname,
+ const void *optval, uint64_t optlen);
+
+psm2_error_t psmi_core_getopt(const void *core_obj, int optname,
+ void *optval, uint64_t *optlen);
+
+/*
+ * PSM AM component set/get options
+ */
+psm2_error_t psmi_am_setopt(const void *am_obj, int optname,
+ const void *optval, uint64_t optlen);
+
+psm2_error_t psmi_am_getopt(const void *am_obj, int optname,
+ void *optval, uint64_t *optlen);
+
+#endif /* _PSMI_UTILS_H */
diff --git a/psmi_wrappers.c b/psmi_wrappers.c
new file mode 100644
index 0000000..ba2b0a6
--- /dev/null
+++ b/psmi_wrappers.c
@@ -0,0 +1,94 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <stdlib.h>
+#include <unistd.h>
+#include "psmi_wrappers.h"
+#include <sys/ioctl.h>
+
+/* The following indirection wrappers for external functions
+ * are only created if this is a mocking tests build
+ */
+#ifdef PSM2_MOCK_TESTING
+
+void MOCKABLE(psmi_exit)(int status)
+{
+ exit(status);
+}
+MOCK_DEF_EPILOGUE(psmi_exit);
+
+ssize_t MOCKABLE(psmi_write)(int fd, const void *buf, size_t count)
+{
+ return write(fd, buf, count);
+}
+MOCK_DEF_EPILOGUE(psmi_write);
+
+int MOCKABLE(psmi_ioctl)(int fd, unsigned int cmd, unsigned long arg)
+{
+ return ioctl(fd, cmd, arg);
+}
+MOCK_DEF_EPILOGUE(psmi_ioctl);
+
+int MOCKABLE(psmi_sigaction)(int signum, const struct sigaction *act, struct sigaction *oldact)
+{
+ return sigaction(signum, act, oldact);
+}
+MOCK_DEF_EPILOGUE(psmi_sigaction);
+
+void MOCKABLE(psmi_rmb)(void)
+{
+ return ips_rmb();
+}
+MOCK_DEF_EPILOGUE(psmi_rmb);
+
+#endif /* def PSM2_MOCK_TESTING */
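In a PSM2_MOCK_TESTING build, each wrapper above is reachable through a level
of indirection that a unit test can re-point at a stub. A minimal sketch of
that pattern, assuming the function-pointer indirection provided by
psm2_mock_testing.h (fake_write and example_test are invented names):

    static ssize_t fake_write(int fd, const void *buf, size_t count)
    {
        (void)fd; (void)buf;
        return (ssize_t)count;  /* pretend the write succeeded */
    }

    void example_test(void)
    {
        ssize_t (*saved)(int, const void *, size_t) = psmi_write;
        psmi_write = fake_write;  /* redirect the incision point */
        /* ... exercise code that calls psmi_write() ... */
        psmi_write = saved;       /* restore the real wrapper */
    }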
diff --git a/psmi_wrappers.h b/psmi_wrappers.h
new file mode 100644
index 0000000..68f11c8
--- /dev/null
+++ b/psmi_wrappers.h
@@ -0,0 +1,98 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef _PSMI_WRAPPERS_H
+#define _PSMI_WRAPPERS_H
+
+#include <signal.h>
+#include "psm2_mock_testing.h"
+#include "opa_intf.h"
+
+#if defined( IB_IOCTL_MAGIC )
+#include <sys/ioctl.h>
+#endif
+
+/* If this is a mocking tests build, we introduce "incision points"
+ * through which we can easily mock external dependencies.
+ * For non-mocking-tests builds, we bypass those indirections
+ * for performance reasons.
+ */
+
+#ifdef PSM2_MOCK_TESTING
+void MOCKABLE(psmi_exit)(int status);
+MOCK_DCL_EPILOGUE(psmi_exit);
+
+ssize_t MOCKABLE(psmi_write)(int fd, const void *buf, size_t count);
+MOCK_DCL_EPILOGUE(psmi_write);
+
+int MOCKABLE(psmi_ioctl)(int fd, unsigned int cmd, unsigned long arg);
+MOCK_DCL_EPILOGUE(psmi_ioctl);
+
+int MOCKABLE(psmi_sigaction)(int signum, const struct sigaction *act, struct sigaction *oldact);
+MOCK_DCL_EPILOGUE(psmi_sigaction);
+
+void MOCKABLE(psmi_rmb)(void);
+MOCK_DCL_EPILOGUE(psmi_rmb);
+
+#else /* def PSM2_MOCK_TESTING */
+
+#define psmi_exit exit
+#define psmi_write write
+#define psmi_ioctl ioctl
+#define psmi_sigaction sigaction
+#define psmi_rmb ips_rmb
+
+#endif /* def PSM2_MOCK_TESTING */
+
+#endif // _PSMI_WRAPPERS_H
+
diff --git a/ptl.h b/ptl.h
new file mode 100644
index 0000000..14f8cd1
--- /dev/null
+++ b/ptl.h
@@ -0,0 +1,222 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/* Interface implemented by Packet Transport layers such as
+ * ips and active messages.
+ *
+ * This interface can be volatile; it is never seen by PSM clients, and it will
+ * probably change as the AM ptl is developed.
+ */
+
+#ifndef PSM_PTL_H
+#define PSM_PTL_H
+#include <inttypes.h>
+#include <psm2.h>
+#include <psm2_mq.h>
+#include <psm2_am.h>
+
+/* We currently have 3 PTLs, 0 is reserved. */
+#define PTL_DEVID_IPS 1
+#define PTL_DEVID_AMSH 2
+#define PTL_DEVID_SELF 3
+
+/* We can currently initialize up to 3 PTLs */
+#define PTL_MAX_INIT 3
+
+/* struct ptl is an incomplete type, and it serves as a generic or opaque
+ container. It should remain an incomplete type in the entire psm
+ source base. Concrete ptl types need to have a suffix such as ptl_self,
+ ptl_ips. */
+struct ptl;
+typedef struct ptl ptl_t;
+
+struct ptl_ctl;
+typedef struct ptl_ctl ptl_ctl_t;
+
+struct ptl_mq_req;
+typedef struct ptl_mq_req ptl_mq_req_t;
+
+struct ips_proto;
+typedef struct ips_proto ips_proto_t;
+
+/* To be filled in statically by all PTLs */
+struct ptl_ctl_init {
+ size_t(*sizeof_ptl) (void);
+
+ psm2_error_t(*init) (const psm2_ep_t ep, ptl_t *ptl, ptl_ctl_t *ctl);
+
+ psm2_error_t(*fini) (ptl_t *ptl, int force, uint64_t timeout_ns);
+
+ psm2_error_t
+ (*setopt) (const void *component_obj, int optname,
+ const void *optval, uint64_t optlen);
+
+ psm2_error_t
+ (*getopt) (const void *component_obj, int optname,
+ void *optval, uint64_t *optlen);
+};
+
+struct ptl_ctl_rcvthread {
+ uint32_t(*is_enabled) (const ptl_t *ptl);
+ void(*transfer_ownership) (ptl_t *from_ptl, ptl_t *to_ptl);
+};
+
+typedef
+struct ptl_arg {
+ union {
+ struct {
+ uint16_t u16w3;
+ uint16_t u16w2;
+ uint16_t u16w1;
+ uint16_t u16w0;
+ };
+ struct {
+ uint32_t u32w1;
+ uint32_t u32w0;
+ };
+ uint64_t u64w0;
+ uint64_t u64;
+ void *uptr;
+ };
+} ptl_arg_t;
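+
+/*
+ * Illustrative sketch (not upstream code, compiled out): a ptl_arg can
+ * carry the same 64 bits as four 16-bit words, two 32-bit words, one
+ * 64-bit word, or a pointer; which narrow word maps to which bits of u64
+ * follows the declaration order above and host endianness. The function
+ * and values below are invented for the example.
+ */
+#if 0
+static void example_ptl_arg(ptl_arg_t args[2], void *req)
+{
+    args[0].u32w0 = 42;  /* e.g. a handler index */
+    args[0].u32w1 = 0;
+    args[1].uptr = req;  /* a second arg can carry a pointer */
+}
+#endif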
+
+#include "ptl_self/ptl_fwd.h"
+#include "ptl_ips/ptl_fwd.h"
+#include "ptl_am/ptl_fwd.h"
+
+/* To be filled in as part of ptl_init */
+struct ptl_ctl {
+ ptl_t *ptl; /* pointer to ptl */
+ psm2_ep_t ep; /* pointer to ep */
+
+ /* EP-specific stuff */
+ psm2_error_t(*ep_poll) (ptl_t *ptl, int replyonly);
+
+ /* PTL-level connect
+ *
+ * This PTL-level connect is slightly different from the top-level PSM
+ * connect.
+ *
+ * pre 1: Caller has masked off epids in epid array that are already
+ * connected at the PSM level.
+ *
+ * post 0: PTL has allocated all epaddrs and whatever internal ptladdr
+ * that ptl needs.
+ * post 1: PTL marks error[i] as UNREACHABLE if PTL can't get to epid[i]
+ * post 2: PTL marks error[i] as UNKNOWN for all epid[i] that couldn't
+ * be connected before a timeout occurred.
+ * post 3: PTL returns OK if all epids are either OK or UNREACHABLE
+ * post 4: PTL defines content of epaddr[i] only if epaddr[i] is OK.
+ */
+ psm2_error_t(*ep_connect) (ptl_t *ptl,
+ int num_ep,
+ const psm2_epid_t input_array_of_epid[],
+ const int array_of_epid_mask[],
+ psm2_error_t output_array_of_errors[],
+ psm2_epaddr_t output_array_of_epaddr[],
+ uint64_t timeout_ns);
+
+ psm2_error_t (*ep_disconnect)(ptl_t *ptl,
+ int force,
+ int num_ep,
+ psm2_epaddr_t input_array_of_epaddr[],
+ const int array_of_epaddr_mask[],
+ psm2_error_t output_array_of_errors[],
+ uint64_t timeout_ns);
+
+ /* MQ stuff */
+ psm2_error_t(*mq_send) (psm2_mq_t mq, psm2_epaddr_t dest,
+ uint32_t flags, psm2_mq_tag_t *stag,
+ const void *buf, uint32_t len);
+ psm2_error_t(*mq_isend) (psm2_mq_t mq, psm2_epaddr_t dest,
+ uint32_t flags_user, uint32_t flags_internal,
+ psm2_mq_tag_t *stag, const void *buf,
+ uint32_t len, void *ctxt, psm2_mq_req_t *req);
+
+ int (*epaddr_stats_num) (void);
+ int (*epaddr_stats_init) (char *desc[], uint16_t *flags);
+ int (*epaddr_stats_get) (psm2_epaddr_t epaddr, uint64_t *stats);
+
+ /* AM stuff */
+ psm2_error_t(*am_get_parameters) (psm2_ep_t ep,
+ struct psm2_am_parameters *
+ parameters);
+ psm2_error_t(*am_short_request) (psm2_epaddr_t epaddr,
+ psm2_handler_t handler,
+ psm2_amarg_t *args, int nargs,
+ void *src, size_t len, int flags,
+ psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt);
+ psm2_error_t(*am_short_reply) (psm2_am_token_t token,
+ psm2_handler_t handler,
+ psm2_amarg_t *args, int nargs, void *src,
+ size_t len, int flags,
+ psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt);
+ /* Long messages currently unsupported */
+#if 0
+ psm2_error_t(*am_long_request) (psm2_epaddr_t epaddr,
+ psm2_handler_t handler,
+ psm2_amarg_t *args, int nargs,
+ void *src, size_t len, void *dest,
+ int flags);
+ psm2_error_t(*am_long_reply) (psm2_am_token_t token,
+ psm2_handler_t handler, psm2_amarg_t *args,
+ int nargs, void *src, size_t len,
+ void *dest, int flags);
+#endif
+ psm2_error_t (*msg_size_thresh_query) (enum psm2_info_query_thresh_et,
+ uint32_t *out, psm2_mq_t mq, psm2_epaddr_t);
+};
+#endif
diff --git a/ptl_am/Makefile b/ptl_am/Makefile
new file mode 100644
index 0000000..71c4e97
--- /dev/null
+++ b/ptl_am/Makefile
@@ -0,0 +1,91 @@
+#
+# This file is provided under a dual BSD/GPLv2 license. When using or
+# redistributing this file, you may do so under either license.
+#
+# GPL LICENSE SUMMARY
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# Contact Information:
+# Intel Corporation, www.intel.com
+#
+# BSD LICENSE
+#
+# Copyright(c) 2015 Intel Corporation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Copyright (c) 2003-2014 Intel Corporation. All rights reserved.
+#
+
+OUTDIR = .
+
+this_srcdir := $(shell readlink -m .)
+top_srcdir := $(this_srcdir)/..
+
+INCLUDES += -I$(top_srcdir)
+
+${TARGLIB}-objs := am_reqrep.o am_reqrep_shmem.o ptl.o cmarwu.o
+${TARGLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${TARGLIB}-objs})
+
+DEPS := $(${TARGLIB}-objs:.o=.d)
+
+.PHONY: all clean
+IGNORE_DEP_TARGETS = clean
+
+all .DEFAULT: ${${TARGLIB}-objs}
+
+$(OUTDIR)/%.d: $(this_srcdir)/%.c
+	$(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
+
+$(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS}
+	$(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) -c $< -o $@
+
+clean:
+	@if [ -d $(OUTDIR) ]; then \
+		cd $(OUTDIR); \
+		rm -f *.o *.d *.gcda *.gcno; \
+		cd -; \
+	fi
+
+#ifeq prevents the deps from being included during clean
+#-include line is required to pull in auto-dependencies during 2nd pass
+ifeq ($(filter $(IGNORE_DEP_TARGETS), $(MAKECMDGOALS)),)
+-include ${DEPS}
+endif
+
+install:
+	@echo "Nothing to do for install."
diff --git a/ptl_am/am_config.h b/ptl_am/am_config.h
new file mode 100644
index 0000000..d887118
--- /dev/null
+++ b/ptl_am/am_config.h
@@ -0,0 +1,82 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2018 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2018 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef PTL_AM_AM_CONFIG_H
+#define PTL_AM_AM_CONFIG_H
+
+#include "psm_config.h"
+
+/*
+ * The rendezvous threshold can be tuned based on whether CMA is used (or not).
+ */
+#define PSMI_MQ_RV_THRESH_CMA 16000
+
+/* If no kernel-assisted copy is available, this is the rendezvous threshold. */
+#define PSMI_MQ_RV_THRESH_NO_KASSIST 16000
+
+#define AMSH_HAVE_CMA 0x1
+#define AMSH_HAVE_KASSIST 0x1
+
+/* Each block reserves some space at the beginning to store auxiliary data */
+#define AMSH_BLOCK_HEADER_SIZE 4096
+
+/* AMLONG_SZ is the total size in memory of a bulk packet, including an
+ * am_pkt_bulk_t header struct.
+ * AMLONG_MTU is the number of bytes available in a bulk packet for payload. */
+#define AMLONG_SZ 8192
+#define AMLONG_MTU (AMLONG_SZ-sizeof(am_pkt_bulk_t))
+
+#define PSMI_KASSIST_MODE_DEFAULT PSMI_KASSIST_CMA_GET
+#define PSMI_KASSIST_MODE_DEFAULT_STRING "cma-get"
+
+#endif /* PTL_AM_AM_CONFIG_H */
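The AMLONG_SZ/AMLONG_MTU split above fixes how many bulk packets a long
intra-node message consumes. A minimal sketch of the arithmetic (illustrative
only; the helper name is invented, and am_pkt_bulk_t is defined elsewhere in
ptl_am):

    /* Number of AMLONG bulk packets needed for 'len' payload bytes: each
     * packet carries at most AMLONG_MTU = AMLONG_SZ - sizeof(am_pkt_bulk_t)
     * bytes of user data. */
    static inline size_t example_nr_bulk_pkts(size_t len)
    {
        return (len + AMLONG_MTU - 1) / AMLONG_MTU;
    }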
diff --git a/ptl_am/am_cuda_memhandle_cache.c b/ptl_am/am_cuda_memhandle_cache.c
new file mode 100644
index 0000000..8406a37
--- /dev/null
+++ b/ptl_am/am_cuda_memhandle_cache.c
@@ -0,0 +1,321 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef PSM_CUDA
+
+#include "psm_user.h"
+#include "am_cuda_memhandle_cache.h"
+#define RBTREE_GET_LEFTMOST(PAYLOAD_PTR) ((PAYLOAD_PTR)->start)
+#define RBTREE_GET_RIGHTMOST(PAYLOAD_PTR) ((PAYLOAD_PTR)->start+((PAYLOAD_PTR)->length))
+#define RBTREE_ASSERT psmi_assert
+#define RBTREE_MAP_COUNT(PAYLOAD_PTR) ((PAYLOAD_PTR)->nelems)
+
+#include "rbtree.c"
+
+#ifdef PSM_DEBUG
+static int cache_hit_counter;
+static int cache_miss_counter;
+#endif
+
+/*
+ * Create the mempool for cuda memhandle cache nodes.
+ */
+psm2_error_t
+am_cuda_memhandle_mpool_init(uint32_t memcache_size)
+{
+ psm2_error_t err;
+ cuda_memhandle_cache_size = memcache_size;
+ /* Create a memory pool of size PSM2_CUDA_MEMCACHE_SIZE
+ * which includes the Root and NIL items
+ */
+ cuda_memhandle_mpool = psmi_mpool_create_for_cuda(sizeof(cl_map_item_t),
+ cuda_memhandle_cache_size,
+ cuda_memhandle_cache_size, 0,
+ UNDEFINED, NULL, NULL,
+ psmi_cuda_memhandle_cache_alloc_func,
+ NULL);
+ if (cuda_memhandle_mpool == NULL) {
+ err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY,
+ "Couldn't allocate CUDA host receive buffer pool");
+ return err;
+ }
+ return PSM2_OK;
+}
+
+/*
+ * Initialize rbtree.
+ */
+psm2_error_t am_cuda_memhandle_cache_map_init()
+{
+ cl_map_item_t *root, *nil_item;
+ root = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t));
+ if (root == NULL)
+ return PSM2_NO_MEMORY;
+ nil_item = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t));
+ if (nil_item == NULL) {
+ psmi_free(root); /* avoid leaking the root item on failure */
+ return PSM2_NO_MEMORY;
+ }
+ nil_item->payload.start = 0;
+ nil_item->payload.epid = 0;
+ nil_item->payload.length = 0;
+ cuda_memhandle_cache_enabled = 1;
+ ips_cl_qmap_init(&cuda_memhandle_cachemap,root,nil_item);
+ NELEMS = 0;
+ return PSM2_OK;
+}
+
+void am_cuda_memhandle_cache_map_fini()
+{
+#ifdef PSM_DEBUG
+ _HFI_DBG("cache hit counter: %d\n", cache_hit_counter);
+ _HFI_DBG("cache miss counter: %d\n", cache_miss_counter);
+#endif
+
+ if (cuda_memhandle_cachemap.nil_item)
+ psmi_free(cuda_memhandle_cachemap.nil_item);
+ if (cuda_memhandle_cachemap.root)
+ psmi_free(cuda_memhandle_cachemap.root);
+ if (cuda_memhandle_cache_enabled)
+ psmi_mpool_destroy(cuda_memhandle_mpool);
+ return;
+}
+
+/*
+ * Insert at the head of the idle queue.
+ */
+static void
+am_cuda_idleq_insert(cl_map_item_t* memcache_item)
+{
+ if (FIRST == NULL) {
+ FIRST = memcache_item;
+ LAST = memcache_item;
+ return;
+ }
+ INEXT(FIRST) = memcache_item;
+ IPREV(memcache_item) = FIRST;
+ FIRST = memcache_item;
+ return;
+}
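+
+/*
+ * Illustrative sketch (not upstream code, compiled out): FIRST/LAST/
+ * INEXT/IPREV (from am_cuda_memhandle_cache.h) form an intrusive
+ * doubly-linked list with the most recently used item at FIRST, so
+ * eviction (which always takes LAST) removes the least recently used
+ * entry. The function below is invented for the example.
+ */
+#if 0
+static void example_idleq_order(cl_map_item_t *a, cl_map_item_t *b)
+{
+    am_cuda_idleq_insert(a);  /* FIRST == LAST == a */
+    am_cuda_idleq_insert(b);  /* FIRST == b, LAST == a */
+    psmi_assert(INEXT(a) == b && IPREV(b) == a);
+}
+#endif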
+ */ +static void +am_cuda_idleq_remove_last(cl_map_item_t* memcache_item) +{ + if (!INEXT(memcache_item)) { + LAST = NULL; + FIRST = NULL; + return; + } + LAST = INEXT(memcache_item); + IPREV(LAST) = NULL; + return; +} + +static void +am_cuda_idleq_remove(cl_map_item_t* memcache_item) +{ + if (LAST == memcache_item) { + am_cuda_idleq_remove_last(memcache_item); + return; + } + if (INEXT(memcache_item) == NULL) { + INEXT(IPREV(memcache_item)) = NULL; + return; + } + INEXT(IPREV(memcache_item)) = INEXT(memcache_item); + IPREV(INEXT(memcache_item)) = IPREV(memcache_item); + return; +} + +static void +am_cuda_idleq_reorder(cl_map_item_t* memcache_item) +{ + if (FIRST == memcache_item && LAST == memcache_item ) { + return; + } + am_cuda_idleq_remove(memcache_item); + am_cuda_idleq_insert(memcache_item); + return; +} + +/* + * After a successful cache hit, item is validated by doing a + * memcmp on the handle stored and the handle we recieve from the + * sender. If the validation fails the item is removed from the idleq, + * the rbtree, is put back into the mpool and IpcCloseMemHandle function + * is called. + */ +static psm2_error_t +am_cuda_memhandle_cache_validate(cl_map_item_t* memcache_item, + uintptr_t sbuf, CUipcMemHandle* handle, + uint32_t length, psm2_epid_t epid) +{ + if ((0 == memcmp(handle, &memcache_item->payload.cuda_ipc_handle, + sizeof(CUipcMemHandle))) + && sbuf == memcache_item->payload.start + && epid == memcache_item->payload.epid) { + return PSM2_OK; + } + ips_cl_qmap_remove_item(&cuda_memhandle_cachemap, memcache_item); + PSMI_CUDA_CALL(cuIpcCloseMemHandle, + memcache_item->payload.cuda_ipc_dev_ptr); + am_cuda_idleq_remove(memcache_item); + psmi_mpool_put(memcache_item); + return PSM2_OK_NO_PROGRESS; +} + +/* + * Current eviction policy: Least Recently Used. + */ +static void +am_cuda_memhandle_cache_evict() +{ + cl_map_item_t *p_item = LAST; + ips_cl_qmap_remove_item(&cuda_memhandle_cachemap, p_item); + PSMI_CUDA_CALL(cuIpcCloseMemHandle, p_item->payload.cuda_ipc_dev_ptr); + am_cuda_idleq_remove_last(p_item); + psmi_mpool_put(p_item); + return; +} + +static psm2_error_t +am_cuda_memhandle_cache_register(uintptr_t sbuf, CUipcMemHandle* handle, + uint32_t length, psm2_epid_t epid, + CUdeviceptr cuda_ipc_dev_ptr) +{ + if (NELEMS == cuda_memhandle_cache_size) + am_cuda_memhandle_cache_evict(); + cl_map_item_t* memcache_item = psmi_mpool_get(cuda_memhandle_mpool); + /* memcache_item cannot be NULL as we evict + * before the call to mpool_get. Check has + * been fixed to help with klockwork analysis. + */ + if (memcache_item == NULL) + return PSM2_NO_MEMORY; + memcache_item->payload.start = sbuf; + memcache_item->payload.cuda_ipc_handle = *handle; + memcache_item->payload.cuda_ipc_dev_ptr = cuda_ipc_dev_ptr; + memcache_item->payload.length = length; + memcache_item->payload.epid = epid; + ips_cl_qmap_insert_item(&cuda_memhandle_cachemap, memcache_item); + am_cuda_idleq_insert(memcache_item); + return PSM2_OK; +} + +/* + * The key used to search the cache is the senders buf address pointer. + * Upon a succesful hit in the cache, additional validation is required + * as multiple senders could potentially send the same buf address value. 
+
+/*
+ * The key used to search the cache is the sender's buf address pointer.
+ * Upon a successful hit in the cache, additional validation is required
+ * as multiple senders could potentially send the same buf address value.
+ */
+CUdeviceptr
+am_cuda_memhandle_acquire(uintptr_t sbuf, CUipcMemHandle* handle,
+ uint32_t length, psm2_epid_t epid)
+{
+ CUdeviceptr cuda_ipc_dev_ptr;
+ if(cuda_memhandle_cache_enabled) {
+ cl_qmap_t *p_map = &cuda_memhandle_cachemap;
+ cl_map_item_t *p_item;
+ unsigned long start = (unsigned long)sbuf;
+ unsigned long end = start + length;
+ p_item = ips_cl_qmap_search(p_map, start, end);
+ if (p_item->payload.start) {
+ if (am_cuda_memhandle_cache_validate(p_item, sbuf,
+ handle, length, epid) == PSM2_OK) {
+#ifdef PSM_DEBUG
+ cache_hit_counter++;
+#endif
+ am_cuda_idleq_reorder(p_item);
+ return p_item->payload.cuda_ipc_dev_ptr;
+ }
+ }
+#ifdef PSM_DEBUG
+ cache_miss_counter++;
+#endif
+ PSMI_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr,
+ *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
+ am_cuda_memhandle_cache_register(sbuf, handle,
+ length, epid, cuda_ipc_dev_ptr);
+ return cuda_ipc_dev_ptr;
+ } else {
+ PSMI_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr,
+ *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
+ return cuda_ipc_dev_ptr;
+ }
+}
+
+void
+am_cuda_memhandle_release(CUdeviceptr cuda_ipc_dev_ptr)
+{
+ if(!cuda_memhandle_cache_enabled)
+ PSMI_CUDA_CALL(cuIpcCloseMemHandle, cuda_ipc_dev_ptr);
+ return;
+}
+
+/*
+ * This is the callback function invoked when the mempool is resized or
+ * destroyed. Upon cache fini the mpool is destroyed, which in turn calls
+ * this callback to close all memhandles.
+ */
+void
+psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj)
+{
+ cl_map_item_t* memcache_item = (cl_map_item_t*)obj;
+ if (!is_alloc) {
+ if(memcache_item->payload.start)
+ PSMI_CUDA_CALL(cuIpcCloseMemHandle,
+ memcache_item->payload.cuda_ipc_dev_ptr);
+ }
+}
+
+#endif
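Putting the pieces together, a receiver maps the sender's IPC handle once and
then reuses the cached mapping on later messages. A minimal receive-side
sketch (illustrative only: error handling is omitted, any offset of sbuf
within the mapped allocation is ignored, and example_receive is an invented
name):

    void example_receive(uintptr_t sbuf, CUipcMemHandle *handle,
                         uint32_t len, psm2_epid_t epid, void *dst_host)
    {
        /* Hit: returns the cached CUdeviceptr. Miss: cuIpcOpenMemHandle
         * plus a cache insert (possibly evicting the LRU entry first). */
        CUdeviceptr src = am_cuda_memhandle_acquire(sbuf, handle, len, epid);
        PSMI_CUDA_CALL(cuMemcpyDtoH, dst_host, src, len);
        /* With the cache enabled this is a no-op; the mapping stays open
         * until eviction or cache fini. */
        am_cuda_memhandle_release(src);
    }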
diff --git a/ptl_am/am_cuda_memhandle_cache.h b/ptl_am/am_cuda_memhandle_cache.h
new file mode 100644
index 0000000..494de32
--- /dev/null
+++ b/ptl_am/am_cuda_memhandle_cache.h
@@ -0,0 +1,124 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2016 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2016 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef PSM_CUDA
+
+#ifndef _AM_CUDA_MEMHANDLE_CACHE_H
+#define _AM_CUDA_MEMHANDLE_CACHE_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <psm2.h>
+#include <cuda.h>
+
+struct _cl_map_item;
+
+typedef struct
+{
+ unsigned long start; /* start virtual address */
+ CUipcMemHandle cuda_ipc_handle; /* cuda ipc mem handle */
+ CUdeviceptr cuda_ipc_dev_ptr; /* CUDA device pointer */
+ uint16_t length; /* length */
+ psm2_epid_t epid;
+ struct _cl_map_item* i_prev; /* idle queue previous */
+ struct _cl_map_item* i_next; /* idle queue next */
+}__attribute__ ((aligned (128))) rbtree_cuda_memhandle_cache_mapitem_pl_t;
+
+typedef struct {
+ uint32_t nelems; /* number of elements in the cache */
+} rbtree_cuda_memhandle_cache_map_pl_t;
+
+#define RBTREE_MI_PL rbtree_cuda_memhandle_cache_mapitem_pl_t
+#define RBTREE_MAP_PL rbtree_cuda_memhandle_cache_map_pl_t
+
+#include "rbtree.h"
+
+cl_qmap_t cuda_memhandle_cachemap; /* Global cache */
+uint8_t cuda_memhandle_cache_enabled;
+mpool_t cuda_memhandle_mpool;
+uint32_t cuda_memhandle_cache_size;
+#define CUDA_MEMHANDLE_CACHE_SIZE 64
+
+/*
+ * Macro definitions for easy programming.
+ */
+
+#define NELEMS cuda_memhandle_cachemap.payload.nelems
+
+/*
+ * Macros for idle queue management.
+ */
+#define IHEAD cuda_memhandle_cachemap.root
+#define LAST IHEAD->payload.i_prev
+#define FIRST IHEAD->payload.i_next
+#define INEXT(x) x->payload.i_next
+#define IPREV(x) x->payload.i_prev
+
+
+psm2_error_t am_cuda_memhandle_mpool_init(uint32_t memcache_size);
+
+psm2_error_t am_cuda_memhandle_cache_map_init();
+
+CUdeviceptr
+am_cuda_memhandle_acquire(uintptr_t sbuf, CUipcMemHandle* handle,
+ uint32_t length, psm2_epid_t epid);
+void
+am_cuda_memhandle_release(CUdeviceptr cuda_ipc_dev_ptr);
+
+void psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj);
+
+void am_cuda_memhandle_cache_map_fini();
+
+#endif /* _AM_CUDA_MEMHANDLE_CACHE_H */
+
+#endif /* PSM_CUDA */
diff --git a/ptl_am/am_reqrep.c b/ptl_am/am_reqrep.c
new file mode 100644
index 0000000..5f90ec7
--- /dev/null
+++ b/ptl_am/am_reqrep.c
@@ -0,0 +1,118 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm2_am.h" +#include "psm_mq_internal.h" +#include "psm_am_internal.h" + +psm2_error_t +psmi_amsh_am_short_request(psm2_epaddr_t epaddr, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm2_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + psm2_amarg_t req_args[NSHORT_ARGS + NBULK_ARGS]; + + /* All sends are synchronous. Ignore PSM2_AM_FLAG_ASYNC. + * Treat PSM2_AM_FLAG_NOREPLY as "advisory". This was mainly + * used to optimize the IPS path though we could put a stricter interpretation + * on it to disallow any replies. + */ + + /* For now less than NSHORT_ARGS+NBULK_ARGS-1. We use the first arg to carry + * the handler index. + */ + psmi_assert(nargs <= (NSHORT_ARGS + NBULK_ARGS - 1)); + psmi_assert(epaddr->ptlctl->ptl != NULL); + + req_args[0].u32w0 = (uint32_t) handler; + psmi_mq_mtucpy((void *)&req_args[1], (const void *)args, + (nargs * sizeof(psm2_amarg_t))); + psmi_amsh_short_request(epaddr->ptlctl->ptl, epaddr, am_handler_hidx, + req_args, nargs + 1, src, len, 0); + + if (completion_fn) + completion_fn(completion_ctxt); + + return PSM2_OK; +} + +psm2_error_t +psmi_amsh_am_short_reply(psm2_am_token_t tok, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm2_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + psm2_amarg_t rep_args[NSHORT_ARGS + NBULK_ARGS]; + + /* For now less than NSHORT_ARGS+NBULK_ARGS-1. We use the first arg to carry + * the handler index. 
+ */ + psmi_assert(nargs <= (NSHORT_ARGS + NBULK_ARGS - 1)); + rep_args[0].u32w0 = (uint32_t) handler; + psmi_mq_mtucpy((void *)&rep_args[1], (const void *)args, + (nargs * sizeof(psm2_amarg_t))); + + psmi_amsh_short_reply((amsh_am_token_t *) tok, am_handler_hidx, + rep_args, nargs + 1, src, len, 0); + + if (completion_fn) + completion_fn(completion_ctxt); + + return PSM2_OK; +} diff --git a/ptl_am/am_reqrep_shmem.c b/ptl_am/am_reqrep_shmem.c new file mode 100644 index 0000000..95973c9 --- /dev/null +++ b/ptl_am/am_reqrep_shmem.c @@ -0,0 +1,2677 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. 
*/
+
+#include <sys/mman.h>	/* shm_open and signal handling */
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <limits.h>
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+#include "cmarw.h"
+#include "psmi_wrappers.h"
+
+#ifdef PSM_CUDA
+#include "am_cuda_memhandle_cache.h"
+#endif
+
+int psmi_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_NO_KASSIST;
+
+static const amsh_qinfo_t amsh_qcounts = {
+ .qreqFifoShort = 1024,
+ .qreqFifoLong = 256,
+ .qrepFifoShort = 1024,
+ .qrepFifoLong = 256
+};
+
+static const amsh_qinfo_t amsh_qelemsz = {
+ .qreqFifoShort = sizeof(am_pkt_short_t),
+ .qreqFifoLong = AMLONG_SZ,
+ .qrepFifoShort = sizeof(am_pkt_short_t),
+ .qrepFifoLong = AMLONG_SZ
+};
+
+ustatic struct {
+ void *addr;
+ size_t len;
+ struct sigaction SIGSEGV_old_act;
+ struct sigaction SIGBUS_old_act;
+} action_stash;
+
+static psm2_error_t amsh_poll(ptl_t *ptl, int replyonly);
+static void process_packet(ptl_t *ptl, am_pkt_short_t *pkt, int isreq);
+static void amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg,
+ void *buf, size_t len);
+
+/* Kassist helper functions */
+#if _HFI_DEBUGGING
+static const char *psmi_kassist_getmode(int mode);
+#endif
+static int psmi_get_kassist_mode();
+int psmi_epaddr_pid(psm2_epaddr_t epaddr);
+
+static inline void
+am_ctl_qhdr_init(volatile am_ctl_qhdr_t *q, int elem_cnt, int elem_sz)
+{
+ pthread_spin_init(&q->lock, PTHREAD_PROCESS_SHARED);
+ q->head = 0;
+ q->tail = 0;
+ q->elem_cnt = elem_cnt;
+ q->elem_sz = elem_sz;
+}
+
+static void
+am_ctl_bulkpkt_init(am_pkt_bulk_t *base_ptr, size_t elemsz, int nelems)
+{
+ int i;
+ am_pkt_bulk_t *bulkpkt;
+ uintptr_t bulkptr = (uintptr_t) base_ptr;
+
+ for (i = 0; i < nelems; i++, bulkptr += elemsz) {
+ bulkpkt = (am_pkt_bulk_t *) bulkptr;
+ bulkpkt->idx = i;
+ }
+}
+
+#define _PA(type) PSMI_ALIGNUP(amsh_qcounts.q ## type * amsh_qelemsz.q ## type, \
+ PSMI_PAGESIZE)
+static inline uintptr_t am_ctl_sizeof_block()
+{
+ return PSMI_ALIGNUP(
+ PSMI_ALIGNUP(AMSH_BLOCK_HEADER_SIZE, PSMI_PAGESIZE) +
+ /* reqctrl block */
+ PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE) +
+ _PA(reqFifoShort) + _PA(reqFifoLong) +
+ /* repctrl block */
+ PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE) +
+ /* align to page size */
+ _PA(repFifoShort) + _PA(repFifoLong), PSMI_PAGESIZE);
+}
+
+#undef _PA
+
+static void am_update_directory(struct am_ctl_nodeinfo *);
+
+static
+void amsh_atexit()
+{
+ static pthread_mutex_t mutex_once = PTHREAD_MUTEX_INITIALIZER;
+ static int atexit_once;
+ psm2_ep_t ep;
+ struct ptl_am *ptl;
+
+ pthread_mutex_lock(&mutex_once);
+ if (atexit_once) {
+ pthread_mutex_unlock(&mutex_once);
+ return;
+ } else
+ atexit_once = 1;
+ pthread_mutex_unlock(&mutex_once);
+
+ ep = psmi_opened_endpoint;
+ while (ep) {
+ ptl = (struct ptl_am *)(ep->ptl_amsh.ptl);
+ if (ptl->self_nodeinfo &&
+ ptl->amsh_keyname != NULL) {
+ _HFI_VDBG("unlinking shm file %s\n",
+ ptl->amsh_keyname);
+ shm_unlink(ptl->amsh_keyname);
+ }
+ ep = ep->user_ep_next;
+ }
+
+ return;
+}
+
+ustatic
+void amsh_mmap_fault(int signo, siginfo_t *siginfo, void *context)
+{
+ if ((unsigned long int) siginfo->si_addr >= (unsigned long int) action_stash.addr &&
+ (unsigned long int) siginfo->si_addr < (unsigned long int) action_stash.addr + (unsigned long int) action_stash.len) {
+
+ static char shm_errmsg[256];
+
+ snprintf(shm_errmsg, sizeof(shm_errmsg),
+ "%s: Unable to allocate shared memory for intra-node messaging.\n"
+ "%s: Delete stale shared memory files in /dev/shm.\n",
+ psmi_gethostname(), psmi_gethostname());
+ amsh_atexit();
+ if
(psmi_write(2, shm_errmsg, strlen(shm_errmsg) + 1) == -1) + psmi_exit(2); + else + psmi_exit(1); /* XXX revisit this... there's probably a better way to exit */ + } else { + if (signo == SIGSEGV) { + if (action_stash.SIGSEGV_old_act.sa_sigaction == (void*) SIG_DFL) { + psmi_sigaction(SIGSEGV, &action_stash.SIGSEGV_old_act, NULL); + raise(SIGSEGV); + struct sigaction act; + act.sa_sigaction = amsh_mmap_fault; + act.sa_flags = SA_SIGINFO; + psmi_sigaction(SIGSEGV, &act, NULL); + } else if (action_stash.SIGSEGV_old_act.sa_sigaction == (void*) SIG_IGN) { + return; + } else { + action_stash.SIGSEGV_old_act.sa_sigaction(signo, siginfo, context); + } + } else if (signo == SIGBUS) { + if (action_stash.SIGBUS_old_act.sa_sigaction == (void*) SIG_DFL) { + psmi_sigaction(SIGBUS, &action_stash.SIGBUS_old_act, NULL); + raise(SIGBUS); + struct sigaction act; + act.sa_sigaction = amsh_mmap_fault; + act.sa_flags = SA_SIGINFO; + psmi_sigaction(SIGBUS, &act, NULL); + } else if (action_stash.SIGBUS_old_act.sa_sigaction == (void*) SIG_IGN) { + return; + } else { + action_stash.SIGBUS_old_act.sa_sigaction(signo, siginfo, context); + } + } else { + psmi_exit(signo); + } + } +} + +/** + * Create endpoint shared-memory object, containing ep's info + * and message queues. + */ +psm2_error_t psmi_shm_create(ptl_t *ptl_gen) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + psm2_ep_t ep = ptl->ep; + char shmbuf[256]; + void *mapptr; + size_t segsz; + psm2_error_t err = PSM2_OK; + int shmfd = -1; + char *amsh_keyname; + int iterator; + /* Get which kassist mode to use. */ + ptl->psmi_kassist_mode = psmi_get_kassist_mode(); + + if (_HFI_PRDBG_ON) { + _HFI_PRDBG_ALWAYS + ("kassist_mode %d %s use_kassist %d\n", + ptl->psmi_kassist_mode, + psmi_kassist_getmode(ptl->psmi_kassist_mode), + (ptl->psmi_kassist_mode != PSMI_KASSIST_OFF)); + } + + segsz = am_ctl_sizeof_block(); + for (iterator = 0; iterator <= INT_MAX; iterator++) { + snprintf(shmbuf, + sizeof(shmbuf), + "/psm2_shm.%ld%016lx%d", + (long int) getuid(), + ep->epid, + iterator); + amsh_keyname = psmi_strdup(NULL, shmbuf); + if (amsh_keyname == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + shmfd = + shm_open(amsh_keyname, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); + if (shmfd < 0) { + if (errno == EACCES && iterator < INT_MAX) + continue; + else { + err = psmi_handle_error(NULL, + PSM2_SHMEM_SEGMENT_ERR, + "Error creating shared " + "memory object in " + "shm_open: %s", + strerror(errno)); + goto fail; + } + } else { + struct stat st; + if (fstat(shmfd, &st) == -1) { + err = psmi_handle_error(NULL, + PSM2_SHMEM_SEGMENT_ERR, + "Error validating " + "shared memory object " + "with fstat: %s", + strerror(errno)); + goto fail; + } + if (getuid() == st.st_uid) { + err = PSM2_OK; + break; + } else { + err = PSM2_SHMEM_SEGMENT_ERR; + close(shmfd); + } + } + } + if (err) { + err = psmi_handle_error(NULL, + PSM2_SHMEM_SEGMENT_ERR, + "Error creating shared memory object " + "in shm_open: namespace exhausted."); + goto fail; + } + + /* Now register the atexit handler for cleanup, whether master or slave */ + atexit(amsh_atexit); + + _HFI_PRDBG("Opened shmfile %s\n", amsh_keyname); + + if (ftruncate(shmfd, segsz) != 0) { + err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, + "Error setting size of shared memory object to %u bytes in " + "ftruncate: %s\n", + (uint32_t) segsz, + strerror(errno)); + goto fail; + } + + mapptr = mmap(NULL, segsz, + PROT_READ | PROT_WRITE, MAP_SHARED, shmfd, 0); + if (mapptr == MAP_FAILED) { + err = psmi_handle_error(NULL, 
PSM2_SHMEM_SEGMENT_ERR, + "Error mmapping shared memory: %s", + strerror(errno)); + goto fail; + } + + memset((void *) mapptr, 0, segsz); /* touch all of my pages */ + + /* Our own ep's info for ptl_am resides at the start of the + shm object. Other processes need some of this info to + understand the rest of the queue structure and other details. */ + ptl->self_nodeinfo = (struct am_ctl_nodeinfo *) mapptr; + ptl->amsh_keyname = amsh_keyname; + ptl->self_nodeinfo->amsh_shmbase = (uintptr_t) mapptr; + +fail: + if (shmfd >= 0) close(shmfd); + return err; +} + +psm2_error_t psmi_epdir_extend(ptl_t *ptl_gen) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + struct am_ctl_nodeinfo *new = NULL; + + new = (struct am_ctl_nodeinfo *) + psmi_memalign(ptl->ep, PER_PEER_ENDPOINT, 64, + (ptl->am_ep_size + AMSH_DIRBLOCK_SIZE) * + sizeof(struct am_ctl_nodeinfo)); + if (new == NULL) + return PSM2_NO_MEMORY; + + memcpy(new, ptl->am_ep, + ptl->am_ep_size * sizeof(struct am_ctl_nodeinfo)); + memset(new + ptl->am_ep_size, 0, + AMSH_DIRBLOCK_SIZE * sizeof(struct am_ctl_nodeinfo)); + + psmi_free(ptl->am_ep); + ptl->am_ep = new; + ptl->am_ep_size += AMSH_DIRBLOCK_SIZE; + + return PSM2_OK; +} + +/** + * Unmap shm regions upon proper disconnect with other processes + */ +psm2_error_t psmi_do_unmap(uintptr_t shmbase) +{ + psm2_error_t err = PSM2_OK; + if (munmap((void *)shmbase, am_ctl_sizeof_block())) { + err = + psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, + "Error with munmap of shared segment: %s", + strerror(errno)); + } + return err; +} + +/** + * Map a remote process' shared memory object. + * + * If the remote process has a shared memory object available, add it to our own + * directory and return the shmidx. If the shared memory object does not exist, + * return -1, and the connect poll function will try to map again later. + * + * If force_remap is true, then clear the entry that matches the epid. 
+ */
+psm2_error_t psmi_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shmidx_o, int force_remap)
+{
+ struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+ int i;
+ int use_kassist;
+ uint16_t shmidx;
+ char shmbuf[256];
+ void *dest_mapptr;
+ size_t segsz;
+ psm2_error_t err = PSM2_OK;
+ int dest_shmfd;
+ struct am_ctl_nodeinfo *dest_nodeinfo;
+ int iterator;
+
+ shmidx = *shmidx_o = -1;
+
+ for (i = 0; i <= ptl->max_ep_idx; i++) {
+ if (ptl->am_ep[i].epid == epid) {
+ if (force_remap) {
+ ptl->am_ep[i].epaddr = NULL;
+ ptl->am_ep[i].epid = 0;
+ break;
+ }
+ *shmidx_o = shmidx = i;
+ return err;
+ }
+ }
+
+
+ use_kassist = (ptl->psmi_kassist_mode != PSMI_KASSIST_OFF);
+
+ segsz = am_ctl_sizeof_block();
+ for (iterator = 0; iterator <= INT_MAX; iterator++) {
+ snprintf(shmbuf,
+ sizeof(shmbuf),
+ "/psm2_shm.%ld%016lx%d",
+ (long int) getuid(),
+ epid,
+ iterator);
+ dest_shmfd = shm_open(shmbuf, O_RDWR, S_IRWXU);
+ if (dest_shmfd < 0) {
+ if (errno == EACCES && iterator < INT_MAX)
+ continue;
+ else {
+ err = psmi_handle_error(NULL,
+ PSM2_SHMEM_SEGMENT_ERR,
+ "Error opening remote "
+ "shared memory object "
+ "in shm_open: %s",
+ strerror(errno));
+ goto fail;
+ }
+ } else {
+ struct stat st;
+ if (fstat(dest_shmfd, &st) == -1) {
+ err = psmi_handle_error(NULL,
+ PSM2_SHMEM_SEGMENT_ERR,
+ "Error validating "
+ "shared memory object "
+ "with fstat: %s",
+ strerror(errno));
+ goto fail;
+ }
+ if (getuid() == st.st_uid) {
+ err = PSM2_OK;
+ break;
+ } else {
+ err = PSM2_SHMEM_SEGMENT_ERR;
+ close(dest_shmfd);
+ }
+ }
+ }
+ if (err) {
+ err = psmi_handle_error(NULL,
+ PSM2_SHMEM_SEGMENT_ERR,
+ "Error opening remote shared "
+ "memory object in shm_open: "
+ "namespace exhausted.");
+ goto fail;
+ }
+
+ dest_mapptr = mmap(NULL, segsz,
+ PROT_READ | PROT_WRITE, MAP_SHARED, dest_shmfd, 0);
+ if (dest_mapptr == MAP_FAILED) {
+ err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR,
+ "Error mmapping remote shared memory: %s",
+ strerror(errno));
+ goto fail;
+ }
+ close(dest_shmfd);
+ dest_nodeinfo = (struct am_ctl_nodeinfo *)dest_mapptr;
+
+ /* We core dump right after here if we don't check the mmap */
+ action_stash.addr = dest_mapptr;
+ action_stash.len = segsz;
+
+ struct sigaction act = { .sa_sigaction = amsh_mmap_fault, .sa_flags = SA_SIGINFO };
+
+ sigaction(SIGSEGV, &act, &action_stash.SIGSEGV_old_act);
+ sigaction(SIGBUS, &act, &action_stash.SIGBUS_old_act);
+
+ {
+ volatile uint16_t *is_init = &dest_nodeinfo->is_init;
+ while (*is_init == 0)
+ usleep(1);
+ ips_sync_reads();
+ _HFI_PRDBG("Got a published remote dirpage page at "
+ "%p, size=%d\n", dest_mapptr, (int)segsz);
+ }
+
+ shmidx = -1;
+ if ((ptl->max_ep_idx + 1) == ptl->am_ep_size) {
+ err = psmi_epdir_extend(ptl_gen);
+ if (err)
+ goto fail;
+
+ for (i = 0; i <= ptl->max_ep_idx; i++) {
+ if (ptl->am_ep[i].epid != 0)
+ am_update_directory(&ptl->am_ep[i]);
+ }
+ }
+ for (i = 0; i < ptl->am_ep_size; i++) {
+ psmi_assert(ptl->am_ep[i].epid != epid);
+ if (ptl->am_ep[i].epid == 0) {
+ ptl->am_ep[i].epid = epid;
+ ptl->am_ep[i].psm_verno = dest_nodeinfo->psm_verno;
+ ptl->am_ep[i].pid = dest_nodeinfo->pid;
+ if (use_kassist) {
+ /* If we are able to use CMA assume everyone
+ * else on the node can also use it.
+ * Advertise that CMA is active via the
+ * feature flag.
+ */
+
+ if (cma_available()) {
+ ptl->am_ep[i].amsh_features |=
+ AMSH_HAVE_CMA;
+ psmi_shm_mq_rv_thresh =
+ PSMI_MQ_RV_THRESH_CMA;
+ } else {
+ ptl->psmi_kassist_mode =
+ PSMI_KASSIST_OFF;
+ use_kassist = 0;
+ psmi_shm_mq_rv_thresh =
+ PSMI_MQ_RV_THRESH_NO_KASSIST;
+ }
+ } else
+ psmi_shm_mq_rv_thresh =
+ PSMI_MQ_RV_THRESH_NO_KASSIST;
+ _HFI_PRDBG("KASSIST MODE: %s\n",
+ psmi_kassist_getmode(ptl->psmi_kassist_mode));
+ shmidx = *shmidx_o = i;
+ _HFI_PRDBG("Mapped epid %lx into shmidx %d\n", epid, shmidx);
+ ptl->am_ep[i].amsh_shmbase = (uintptr_t) dest_mapptr;
+ ptl->am_ep[i].amsh_qsizes = dest_nodeinfo->amsh_qsizes;
+ if (i > ptl->max_ep_idx)
+ ptl->max_ep_idx = i;
+ break;
+ }
+ }
+
+ /* install the old sighandler back */
+ sigaction(SIGSEGV, &action_stash.SIGSEGV_old_act, NULL);
+ sigaction(SIGBUS, &action_stash.SIGBUS_old_act, NULL);
+
+ if (shmidx == (uint16_t)-1)
+ err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR,
+ "Could not connect to local endpoint");
+fail:
+ return err;
+}
+
+/**
+ * Initialize pointer structure and locks for endpoint shared-memory AM.
+ */
+
+#define AMSH_QSIZE(type) \
+ PSMI_ALIGNUP(amsh_qelemsz.q ## type * amsh_qcounts.q ## type, \
+ PSMI_PAGESIZE)
+
+static psm2_error_t amsh_init_segment(ptl_t *ptl_gen)
+{
+ struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+ psm2_error_t err = PSM2_OK;
+
+ /* Preconditions */
+ psmi_assert_always(ptl != NULL);
+ psmi_assert_always(ptl->ep != NULL);
+ psmi_assert_always(ptl->epaddr != NULL);
+ psmi_assert_always(ptl->ep->epid != 0);
+
+ if ((err = psmi_shm_create(ptl_gen)))
+ goto fail;
+
+ ptl->self_nodeinfo->amsh_qsizes.qreqFifoShort = AMSH_QSIZE(reqFifoShort);
+ ptl->self_nodeinfo->amsh_qsizes.qreqFifoLong = AMSH_QSIZE(reqFifoLong);
+ ptl->self_nodeinfo->amsh_qsizes.qrepFifoShort = AMSH_QSIZE(repFifoShort);
+ ptl->self_nodeinfo->amsh_qsizes.qrepFifoLong = AMSH_QSIZE(repFifoLong);
+
+ /* We core dump right after here if we don't check the mmap */
+
+ struct sigaction act;
+ act.sa_sigaction = amsh_mmap_fault;
+ act.sa_flags = SA_SIGINFO;
+
+ sigaction(SIGSEGV, &act, &action_stash.SIGSEGV_old_act);
+ sigaction(SIGBUS, &act, &action_stash.SIGBUS_old_act);
+
+ /*
+ * Now that we know our epid, update it in the shmidx array
+ */
+ ptl->reqH.base = ptl->reqH.head = ptl->reqH.end = NULL;
+ ptl->repH.base = ptl->repH.head = ptl->repH.end = NULL;
+
+ am_update_directory(ptl->self_nodeinfo);
+
+ ptl->reqH.head = ptl->reqH.base = (am_pkt_short_t *)
+ (((uintptr_t)ptl->self_nodeinfo->qdir.qreqFifoShort));
+ ptl->reqH.end = (am_pkt_short_t *)
+ (((uintptr_t)ptl->self_nodeinfo->qdir.qreqFifoShort) +
+ amsh_qcounts.qreqFifoShort * amsh_qelemsz.qreqFifoShort);
+
+ ptl->repH.head = ptl->repH.base = (am_pkt_short_t *)
+ (((uintptr_t)ptl->self_nodeinfo->qdir.qrepFifoShort));
+ ptl->repH.end = (am_pkt_short_t *)
+ (((uintptr_t)ptl->self_nodeinfo->qdir.qrepFifoShort) +
+ amsh_qcounts.qrepFifoShort * amsh_qelemsz.qrepFifoShort);
+
+ am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qreqH->shortq,
+ amsh_qcounts.qreqFifoShort,
+ amsh_qelemsz.qreqFifoShort);
+ am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qreqH->longbulkq,
+ amsh_qcounts.qreqFifoLong, amsh_qelemsz.qreqFifoLong);
+ am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qrepH->shortq,
+ amsh_qcounts.qrepFifoShort,
+ amsh_qelemsz.qrepFifoShort);
+ am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qrepH->longbulkq,
+ amsh_qcounts.qrepFifoLong, amsh_qelemsz.qrepFifoLong);
+
+ /* Set bulkidx in every bulk packet */
+ am_ctl_bulkpkt_init(ptl->self_nodeinfo->qdir.qreqFifoLong,
+ amsh_qelemsz.qreqFifoLong,
+
amsh_qcounts.qreqFifoLong);
+ am_ctl_bulkpkt_init(ptl->self_nodeinfo->qdir.qrepFifoLong,
+ amsh_qelemsz.qrepFifoLong,
+ amsh_qcounts.qrepFifoLong);
+
+ /* install the old sighandler back */
+ sigaction(SIGSEGV, &action_stash.SIGSEGV_old_act, NULL);
+ sigaction(SIGBUS, &action_stash.SIGBUS_old_act, NULL);
+
+fail:
+ return err;
+}
+
+psm2_error_t psmi_shm_detach(ptl_t *ptl_gen)
+{
+ struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+ psm2_error_t err = PSM2_OK;
+ uintptr_t shmbase;
+
+ if (ptl->self_nodeinfo == NULL)
+ return err;
+
+ _HFI_VDBG("unlinking shm file %s\n", ptl->amsh_keyname + 1);
+ shmbase = ptl->self_nodeinfo->amsh_shmbase;
+ shm_unlink(ptl->amsh_keyname);
+ psmi_free(ptl->amsh_keyname);
+
+ if (munmap((void *)shmbase, am_ctl_sizeof_block())) {
+ err =
+ psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR,
+ "Error with munmap of shared segment: %s",
+ strerror(errno));
+ goto fail;
+ }
+ ptl->self_nodeinfo = NULL;
+ return PSM2_OK;
+
+fail:
+ return err;
+}
+
+/**
+ * Update the local shared-pointer directory. The directory must be
+ * updated when a new epaddr is connected to, and for every epaddr
+ * already connected whenever the shared memory segment is relocated
+ * via mremap.
+ *
+ * @param epaddr Endpoint address for which to update local directory.
+ */
+
+static
+void am_update_directory(struct am_ctl_nodeinfo *nodeinfo)
+{
+ uintptr_t base_this;
+
+ base_this = nodeinfo->amsh_shmbase +
+ AMSH_BLOCK_HEADER_SIZE;
+
+ /* Request queues */
+ nodeinfo->qdir.qreqH = (am_ctl_blockhdr_t *) base_this;
+ nodeinfo->qdir.qreqFifoShort = (am_pkt_short_t *)
+ ((uintptr_t) nodeinfo->qdir.qreqH +
+ PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE));
+
+ nodeinfo->qdir.qreqFifoLong = (am_pkt_bulk_t *)
+ ((uintptr_t) nodeinfo->qdir.qreqFifoShort +
+ nodeinfo->amsh_qsizes.qreqFifoShort);
+
+ /* Reply queues */
+ nodeinfo->qdir.qrepH = (am_ctl_blockhdr_t *)
+ ((uintptr_t) nodeinfo->qdir.qreqFifoLong +
+ nodeinfo->amsh_qsizes.qreqFifoLong);
+
+ nodeinfo->qdir.qrepFifoShort = (am_pkt_short_t *)
+ ((uintptr_t) nodeinfo->qdir.qrepH +
+ PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE));
+ nodeinfo->qdir.qrepFifoLong = (am_pkt_bulk_t *)
+ ((uintptr_t) nodeinfo->qdir.qrepFifoShort +
+ nodeinfo->amsh_qsizes.qrepFifoShort);
+
+ _HFI_VDBG("epaddr=%p Request Hdr=%p,Pkt=%p,Long=%p\n",
+ nodeinfo->epaddr,
+ nodeinfo->qdir.qreqH,
+ nodeinfo->qdir.qreqFifoShort,
+ nodeinfo->qdir.qreqFifoLong);
+ _HFI_VDBG("epaddr=%p Reply Hdr=%p,Pkt=%p,Long=%p\n",
+ nodeinfo->epaddr,
+ nodeinfo->qdir.qrepH,
+ nodeinfo->qdir.qrepFifoShort,
+ nodeinfo->qdir.qrepFifoLong);
+
+ /* Sanity check */
+ uintptr_t base_next =
+ (uintptr_t) nodeinfo->qdir.qrepFifoLong +
+ nodeinfo->amsh_qsizes.qrepFifoLong;
+
+ psmi_assert_always(base_next - base_this <= am_ctl_sizeof_block());
+}
+
+
+/* ep_epid_share_memory wrapper */
+static
+int amsh_epid_reachable(ptl_t *ptl_gen, psm2_epid_t epid)
+{
+ struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+ int result;
+ psm2_error_t err;
+ err = psm2_ep_epid_share_memory(ptl->ep, epid, &result);
+ psmi_assert_always(err == PSM2_OK);
+ return result;
+}
+
+static
+psm2_error_t
+amsh_epaddr_add(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t shmidx, psm2_epaddr_t *epaddr_o)
+{
+ struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+ psm2_epaddr_t epaddr;
+ am_epaddr_t *amaddr;
+ psm2_error_t err = PSM2_OK;
+
+ psmi_assert(psmi_epid_lookup(ptl->ep, epid) == NULL);
+
+ /* The self PTL handles loopback communication.
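+ * Callers are expected to have filtered out the local epid before
+ * getting here; a caller-side guard (illustration only, mirroring
+ * what amsh_ep_connreq_init() does) would be:
+ *
+ *   if (epid == ptl->epid)
+ *       return PSM2_EPID_UNREACHABLE;  // loopback goes to self PTL
+ *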
*/ + psmi_assert(epid != ptl->epid); + + /* note the size of the memory is am_epaddr_t */ + epaddr = (psm2_epaddr_t) psmi_calloc(ptl->ep, + PER_PEER_ENDPOINT, 1, + sizeof(am_epaddr_t)); + if (epaddr == NULL) { + return PSM2_NO_MEMORY; + } + psmi_assert_always(ptl->am_ep[shmidx].epaddr == NULL); + + if ((err = psmi_epid_set_hostname(psm2_epid_nid(epid), + psmi_gethostname(), 0))) + goto fail; + + epaddr->ptlctl = ptl->ctl; + epaddr->epid = epid; + + /* convert to am_epaddr_t */ + amaddr = (am_epaddr_t *) epaddr; + /* tell the other endpoint their location in our directory */ + amaddr->shmidx = shmidx; + /* we haven't connected yet, so we can't give them the same hint */ + amaddr->return_shmidx = -1; + amaddr->cstate_outgoing = AMSH_CSTATE_OUTGOING_NONE; + amaddr->cstate_incoming = AMSH_CSTATE_INCOMING_NONE; + + /* other setup */ + ptl->am_ep[shmidx].epaddr = epaddr; + am_update_directory(&ptl->am_ep[shmidx]); + /* Finally, add to table */ + if ((err = psmi_epid_add(ptl->ep, epid, epaddr))) + goto fail; + _HFI_VDBG("epaddr=%s added to ptl=%p\n", + psmi_epaddr_get_name(epid), ptl); + *epaddr_o = epaddr; + return PSM2_OK; +fail: + if (epaddr != ptl->epaddr) + psmi_free(epaddr); + return err; +} + +static +void +amsh_epaddr_update(ptl_t *ptl_gen, psm2_epaddr_t epaddr) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + am_epaddr_t *amaddr; + uint16_t shmidx; + struct am_ctl_nodeinfo *nodeinfo; + + amaddr = (am_epaddr_t *) epaddr; + shmidx = amaddr->shmidx; + nodeinfo = (struct am_ctl_nodeinfo *) ptl->am_ep[shmidx].amsh_shmbase; + + /* restart the connection process */ + amaddr->return_shmidx = -1; + amaddr->cstate_outgoing = AMSH_CSTATE_OUTGOING_NONE; + + /* wait for the other process to init again */ + { + volatile uint16_t *is_init = &nodeinfo->is_init; + while (*is_init == 0) + usleep(1); + ips_sync_reads(); + } + + /* get the updated values from the new nodeinfo page */ + ptl->am_ep[shmidx].psm_verno = nodeinfo->psm_verno; + ptl->am_ep[shmidx].pid = nodeinfo->pid; + ptl->am_ep[shmidx].amsh_qsizes = nodeinfo->amsh_qsizes; + am_update_directory(&ptl->am_ep[shmidx]); + return; +} + +struct ptl_connection_req { + int isdone; + int op; /* connect or disconnect */ + int numep; + int numep_left; + int phase; + + int *epid_mask; + const psm2_epid_t *epids; /* input epid list */ + psm2_epaddr_t *epaddr; + psm2_error_t *errors; /* inout errors */ + + /* Used for connect/disconnect */ + psm2_amarg_t args[4]; +}; + +static +void amsh_free_epaddr(psm2_epaddr_t epaddr) +{ + psmi_epid_remove(epaddr->ptlctl->ep, epaddr->epid); + psmi_free(epaddr); + return; +} + +#define PTL_OP_CONNECT 0 +#define PTL_OP_DISCONNECT 1 +#define PTL_OP_ABORT 2 + +static +psm2_error_t +amsh_ep_connreq_init(ptl_t *ptl_gen, int op, /* connect, disconnect or abort */ + int numep, const psm2_epid_t *array_of_epid, /* non-NULL on connect */ + const int array_of_epid_mask[], + psm2_error_t *array_of_errors, + psm2_epaddr_t *array_of_epaddr, + struct ptl_connection_req **req_o) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + int i, cstate; + psm2_epaddr_t epaddr; + psm2_epid_t epid; + struct ptl_connection_req *req = NULL; + + req = (struct ptl_connection_req *) + psmi_calloc(ptl->ep, PER_PEER_ENDPOINT, 1, + sizeof(struct ptl_connection_req)); + if (req == NULL) + return PSM2_NO_MEMORY; + req->isdone = 0; + req->op = op; + req->numep = numep; + req->numep_left = 0; + req->phase = ptl->connect_phase; + req->epid_mask = (int *) + psmi_calloc(ptl->ep, PER_PEER_ENDPOINT, numep, sizeof(int)); + if (req->epid_mask == NULL) { + 
psmi_free(req); + return PSM2_NO_MEMORY; + } + req->epaddr = array_of_epaddr; + req->epids = array_of_epid; + req->errors = array_of_errors; + + /* First check if there's really something to connect/disconnect + * for this PTL */ + for (i = 0; i < numep; i++) { + req->epid_mask[i] = AMSH_CMASK_NONE; /* no connect by default */ + if (!array_of_epid_mask[i]) + continue; + if (op == PTL_OP_CONNECT) { + epid = array_of_epid[i]; + + /* Connect only to other processes reachable by shared memory. + The self PTL handles loopback communication, so explicitly + refuse to connect to self. */ + if (!amsh_epid_reachable(ptl_gen, epid) + || epid == ptl->epid) { + array_of_errors[i] = PSM2_EPID_UNREACHABLE; + array_of_epaddr[i] = NULL; + continue; + } + + _HFI_VDBG("looking at epid %llx\n", + (unsigned long long)epid); + epaddr = psmi_epid_lookup(ptl->ep, epid); + if (epaddr != NULL) { + if (epaddr->ptlctl->ptl != ptl_gen) { + array_of_errors[i] = + PSM2_EPID_UNREACHABLE; + array_of_epaddr[i] = NULL; + continue; + } + cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing; + if (cstate == AMSH_CSTATE_OUTGOING_ESTABLISHED) { + array_of_epaddr[i] = epaddr; + array_of_errors[i] = PSM2_OK; + } else { + psmi_assert(cstate == + AMSH_CSTATE_OUTGOING_NONE); + array_of_errors[i] = PSM2_TIMEOUT; + array_of_epaddr[i] = epaddr; + req->epid_mask[i] = AMSH_CMASK_PREREQ; + } + } else { + req->epid_mask[i] = AMSH_CMASK_PREREQ; + array_of_epaddr[i] = NULL; + } + } else { /* disc or abort */ + epaddr = array_of_epaddr[i]; + if (epaddr->ptlctl->ptl != ptl_gen) + continue; + + psmi_assert(epaddr != NULL); + cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing; + if (cstate == AMSH_CSTATE_OUTGOING_ESTABLISHED) { + req->epid_mask[i] = AMSH_CMASK_PREREQ; + _HFI_VDBG + ("Just set index %d to AMSH_CMASK_PREREQ\n", + i); + } + /* XXX undef ? */ + } + if (req->epid_mask[i] != AMSH_CMASK_NONE) + req->numep_left++; + } + + if (req->numep_left == 0) { /* nothing to do */ + psmi_free(req->epid_mask); + psmi_free(req); + _HFI_VDBG("Nothing to connect, bump up phase\n"); + ptl->connect_phase++; + *req_o = NULL; + return PSM2_OK; + } else { + *req_o = req; + return PSM2_OK_NO_PROGRESS; + } +} + +static +psm2_error_t +amsh_ep_connreq_poll(ptl_t *ptl_gen, struct ptl_connection_req *req) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + int i, j, cstate; + uint16_t shmidx = (uint16_t)-1; + psm2_error_t err = PSM2_OK; + psm2_epid_t epid; + psm2_epaddr_t epaddr; + + if (req == NULL || req->isdone) + return PSM2_OK; + + psmi_assert_always(ptl->connect_phase == req->phase); + + if (req->op == PTL_OP_DISCONNECT || req->op == PTL_OP_ABORT) { + for (i = 0; i < req->numep; i++) { + if (req->epid_mask[i] == AMSH_CMASK_NONE || + req->epid_mask[i] == AMSH_CMASK_DONE) + continue; + + epaddr = req->epaddr[i]; + psmi_assert(epaddr != NULL); + if (req->epid_mask[i] == AMSH_CMASK_PREREQ) { + shmidx = ((am_epaddr_t *) epaddr)->shmidx; + /* Make sure the target of the disconnect is still there */ + if (ptl->am_ep[shmidx]. 
+ epid != epaddr->epid) {
+ req->numep_left--;
+ req->epid_mask[i] = AMSH_CMASK_DONE;
+ ((am_epaddr_t *) epaddr)->cstate_outgoing =
+ AMSH_CSTATE_OUTGOING_NONE;
+ }
+ }
+
+ if (req->epid_mask[i] == AMSH_CMASK_PREREQ) {
+ req->args[0].u16w0 = PSMI_AM_DISC_REQ;
+ req->args[0].u16w1 = shmidx;
+ req->args[0].u32w1 = ptl->connect_phase;
+ req->args[1].u64w0 = (uint64_t) ptl->epid;
+ psmi_assert(shmidx != (uint16_t)-1);
+ req->args[2].u32w0 = getpid();
+ req->args[2].u32w1 = PSM2_OK;
+ req->args[3].u64w0 =
+ (uint64_t) (uintptr_t) &req->errors[i];
+ psmi_amsh_short_request(ptl_gen, epaddr,
+ amsh_conn_handler_hidx,
+ req->args, 4, NULL, 0,
+ 0);
+ ((am_epaddr_t *) epaddr)->cstate_outgoing =
+ AMSH_CSTATE_OUTGOING_DISC_REQUESTED;
+ /**
+ * Only munmap if we have nothing more to
+ * communicate with the other node, i.e. we
+ * already received a disconnect req from the
+ * other node.
+ */
+ if (((am_epaddr_t *) epaddr)->cstate_incoming ==
+ AMSH_CSTATE_INCOMING_DISC_REQUESTED)
+ err = psmi_do_unmap(ptl->am_ep[shmidx].amsh_shmbase);
+ req->epid_mask[i] = AMSH_CMASK_POSTREQ;
+ } else if (req->epid_mask[i] == AMSH_CMASK_POSTREQ) {
+ cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing;
+ if (cstate == AMSH_CSTATE_OUTGOING_DISC_REPLIED) {
+ req->numep_left--;
+ req->epid_mask[i] = AMSH_CMASK_DONE;
+ ((am_epaddr_t *) epaddr)->cstate_outgoing =
+ AMSH_CSTATE_OUTGOING_NONE;
+ }
+ }
+ }
+ } else {
+ /* First see if we've made progress on any postreqs */
+ int n_prereq = 0;
+ for (i = 0; i < req->numep; i++) {
+ int cstate;
+ if (req->epid_mask[i] != AMSH_CMASK_POSTREQ) {
+ if (req->epid_mask[i] == AMSH_CMASK_PREREQ)
+ n_prereq++;
+ continue;
+ }
+ epaddr = req->epaddr[i];
+ psmi_assert(epaddr != NULL);
+
+ /* detect if a race has occurred due to re-using an
+ * old shm file - if so, restart the connection */
+ shmidx = ((am_epaddr_t *) epaddr)->shmidx;
+ if (ptl->am_ep[shmidx].pid !=
+ ((struct am_ctl_nodeinfo *) ptl->am_ep[shmidx].amsh_shmbase)->pid) {
+ req->epid_mask[i] = AMSH_CMASK_PREREQ;
+ ((am_epaddr_t *) epaddr)->cstate_outgoing =
+ AMSH_CSTATE_OUTGOING_NONE;
+ n_prereq++;
+ amsh_epaddr_update(ptl_gen, epaddr);
+ continue;
+ }
+
+ cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing;
+ if (cstate == AMSH_CSTATE_OUTGOING_REPLIED) {
+ req->numep_left--;
+ ((am_epaddr_t *) epaddr)->cstate_outgoing =
+ AMSH_CSTATE_OUTGOING_ESTABLISHED;
+ req->epid_mask[i] = AMSH_CMASK_DONE;
+ continue;
+ }
+ }
+ if (n_prereq > 0) {
+ psmi_assert(req->numep_left > 0);
+ /* Go through the list of peers we need to connect to and find out
+ * if each shared ep is mapped into shm */
+ for (i = 0; i < req->numep; i++) {
+ if (req->epid_mask[i] != AMSH_CMASK_PREREQ)
+ continue;
+ epid = req->epids[i];
+ epaddr = req->epaddr[i];
+ /* Go through mapped epids and find the epid we're looking for */
+ for (shmidx = -1, j = 0;
+ j <= ptl->max_ep_idx; j++) {
+ /* epid is connected and ready to go */
+ if (ptl->am_ep[j].
+ epid == epid) {
+ shmidx = j;
+ break;
+ }
+ }
+ if (shmidx == (uint16_t)-1) {
+ /* Couldn't find peer's epid in dirpage.
+ Check shmdir to see if epid is up now. */
+ if ((err = psmi_shm_map_remote(ptl_gen, epid, &shmidx, 0))) {
+ return err;
+ }
+ continue;
+ }
+ /* Before we even send the request out, check to see if
+ * versions are interoperable */
+ if (!psmi_verno_isinteroperable
+ (ptl->am_ep[shmidx].
+ psm_verno)) {
+ char buf[32];
+ uint16_t their_verno =
+ ptl->am_ep[shmidx].
+ psm_verno; + snprintf(buf, sizeof(buf), "%d.%d", + PSMI_VERNO_GET_MAJOR + (their_verno), + PSMI_VERNO_GET_MINOR + (their_verno)); + + _HFI_INFO("Local endpoint id %" PRIx64 + " has version %s " + "which is not supported by library version %d.%d", + epid, buf, PSM2_VERNO_MAJOR, + PSM2_VERNO_MINOR); + req->errors[i] = + PSM2_EPID_INVALID_VERSION; + req->numep_left--; + req->epid_mask[i] = AMSH_CMASK_DONE; + continue; + } + if (epaddr != NULL) { + psmi_assert(((am_epaddr_t *) epaddr)-> + shmidx == shmidx); + } else + if ((epaddr = + psmi_epid_lookup(ptl->ep, + epid)) == NULL) { + if ((err = + amsh_epaddr_add(ptl_gen, epid, shmidx, + &epaddr))) { + return err; + } + /* Remote pid is unknown at the moment */ + ((am_epaddr_t *) epaddr)->pid = + AMSH_PID_UNKNOWN; + } + req->epaddr[i] = epaddr; + req->args[0].u16w0 = PSMI_AM_CONN_REQ; + /* tell the other process its shmidx here */ + req->args[0].u16w1 = shmidx; + req->args[0].u32w1 = ptl->connect_phase; + req->args[1].u64w0 = (uint64_t) ptl->epid; + req->args[2].u32w0 = getpid(); + req->args[2].u32w1 = PSM2_OK; + req->args[3].u64w0 = + (uint64_t) (uintptr_t) &req->errors[i]; + req->epid_mask[i] = AMSH_CMASK_POSTREQ; + psmi_amsh_short_request(ptl_gen, epaddr, + amsh_conn_handler_hidx, + req->args, 4, NULL, 0, + 0); + _HFI_PRDBG("epaddr=%p, epid=%" PRIx64 + " at shmidx=%d\n", epaddr, epid, + shmidx); + } + } + } + + if (req->numep_left == 0) { /* we're all done */ + req->isdone = 1; + return PSM2_OK; + } else { + sched_yield(); + return PSM2_OK_NO_PROGRESS; + } +} + +static +psm2_error_t +amsh_ep_connreq_fini(ptl_t *ptl_gen, struct ptl_connection_req *req) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + psm2_error_t err = PSM2_OK; + int i; + + /* Wherever we are at in our connect process, we've been instructed to + * finish the connection process */ + if (req == NULL) + return PSM2_OK; + + /* This prevents future connect replies from referencing data structures + * that disappeared */ + ptl->connect_phase++; + + /* First process any leftovers in postreq or prereq */ + for (i = 0; i < req->numep; i++) { + if (req->epid_mask[i] == AMSH_CMASK_NONE) + continue; + else if (req->epid_mask[i] == AMSH_CMASK_POSTREQ) { + int cstate; + req->epid_mask[i] = AMSH_CMASK_DONE; + cstate = ((am_epaddr_t *) req->epaddr[i])->cstate_outgoing; + if (cstate == AMSH_CSTATE_OUTGOING_REPLIED) { + req->numep_left--; + ((am_epaddr_t *) req->epaddr[i])->cstate_outgoing = + AMSH_CSTATE_OUTGOING_ESTABLISHED; + } else { /* never actually got reply */ + req->errors[i] = PSM2_TIMEOUT; + } + } + /* If we couldn't go from prereq to postreq, that means we couldn't + * find the shmidx for an epid in time. This can only be a case of + * time out */ + else if (req->epid_mask[i] == AMSH_CMASK_PREREQ) { + req->errors[i] = PSM2_TIMEOUT; + req->numep_left--; + req->epid_mask[i] = AMSH_CMASK_DONE; + } + } + + /* Whatever is left can only be in DONE or NONE state */ + for (i = 0; i < req->numep; i++) { + if (req->epid_mask[i] == AMSH_CMASK_NONE) + continue; + psmi_assert(req->epid_mask[i] == AMSH_CMASK_DONE); + + err = psmi_error_cmp(err, req->errors[i]); + /* XXX TODO: Report errors in connection. 
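+ * psmi_error_cmp() above folds the per-peer results into one
+ * aggregate status; per-peer detail stays in req->errors[]. A
+ * caller wanting that detail would scan the array itself, e.g.
+ * (sketch only, report() being hypothetical):
+ *
+ *   for (i = 0; i < numep; i++)
+ *       if (array_of_errors[i] != PSM2_OK)
+ *           report(i, array_of_errors[i]);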
*/
+ /* Only free epaddr if they have disconnected from us */
+ int cstate = ((am_epaddr_t *) req->epaddr[i])->cstate_incoming;
+ if (cstate == AMSH_CSTATE_INCOMING_DISC_REQUESTED) {
+ if (req->op == PTL_OP_DISCONNECT || req->op == PTL_OP_ABORT) {
+ psmi_assert(req->epaddr[i] != NULL);
+ amsh_free_epaddr(req->epaddr[i]);
+ req->epaddr[i] = NULL;
+ }
+ }
+ }
+
+ psmi_free(req->epid_mask);
+ psmi_free(req);
+
+ return err;
+}
+
+/* Wrapper for 2.0's use of connect/disconnect. The plan is to move the
+ * init/poll/fini interface up to the PTL level for 2.2 */
+#define CONNREQ_ZERO_POLLS_BEFORE_YIELD 20
+static
+psm2_error_t
+amsh_ep_connreq_wrap(ptl_t *ptl_gen, int op,
+ int numep,
+ const psm2_epid_t *array_of_epid,
+ const int array_of_epid_mask[],
+ psm2_error_t *array_of_errors,
+ psm2_epaddr_t *array_of_epaddr, uint64_t timeout_ns)
+{
+ struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+ psm2_error_t err;
+ uint64_t t_start;
+ struct ptl_connection_req *req;
+ int num_polls_noprogress = 0;
+ static int shm_polite_attach = -1;
+
+ if (shm_polite_attach == -1) {
+ char *p = getenv("PSM2_SHM_POLITE_ATTACH");
+ if (p && *p && atoi(p) != 0) {
+ fprintf(stderr, "%s: Using Polite SHM segment attach\n",
+ psmi_gethostname());
+ shm_polite_attach = 1;
+ } else
+ shm_polite_attach = 0;
+ }
+
+ /* Initialize */
+ err = amsh_ep_connreq_init(ptl_gen, op, numep,
+ array_of_epid, array_of_epid_mask,
+ array_of_errors, array_of_epaddr, &req);
+ if (err != PSM2_OK_NO_PROGRESS) /* Either we're all done with connect or
+ * there was an error */
+ return err;
+
+ /* Poll until either
+ * 1. We time out
+ * 2. We are done with connecting
+ */
+ t_start = get_cycles();
+ do {
+ psmi_poll_internal(ptl->ep, 1);
+ err = amsh_ep_connreq_poll(ptl_gen, req);
+ if (err == PSM2_OK)
+ break; /* Finished before timeout */
+ else if (err != PSM2_OK_NO_PROGRESS) {
+ psmi_free(req->epid_mask);
+ psmi_free(req);
+ goto fail;
+ } else if (shm_polite_attach &&
+ ++num_polls_noprogress ==
+ CONNREQ_ZERO_POLLS_BEFORE_YIELD) {
+ num_polls_noprogress = 0;
+ PSMI_YIELD(ptl->ep->mq->progress_lock);
+ }
+ }
+ while (psmi_cycles_left(t_start, timeout_ns));
+
+ err = amsh_ep_connreq_fini(ptl_gen, req);
+
+fail:
+ return err;
+}
+
+static
+psm2_error_t
+amsh_ep_connect(ptl_t *ptl,
+ int numep,
+ const psm2_epid_t *array_of_epid,
+ const int array_of_epid_mask[],
+ psm2_error_t *array_of_errors,
+ psm2_epaddr_t *array_of_epaddr, uint64_t timeout_ns)
+{
+ return amsh_ep_connreq_wrap(ptl, PTL_OP_CONNECT, numep, array_of_epid,
+ array_of_epid_mask, array_of_errors,
+ array_of_epaddr, timeout_ns);
+}
+
+static
+psm2_error_t
+amsh_ep_disconnect(ptl_t *ptl, int force, int numep,
+ psm2_epaddr_t array_of_epaddr[],
+ const int array_of_epaddr_mask[],
+ psm2_error_t array_of_errors[], uint64_t timeout_ns)
+{
+ return amsh_ep_connreq_wrap(ptl,
+ force ? 
PTL_OP_ABORT : PTL_OP_DISCONNECT, + numep, NULL, array_of_epaddr_mask, + array_of_errors, + array_of_epaddr, + timeout_ns); +} + +#undef CSWAP +PSMI_ALWAYS_INLINE( +int32_t +cswap(volatile int32_t *p, int32_t old_value, int32_t new_value)) +{ + asm volatile ("lock cmpxchg %2, %0" : + "+m" (*p), "+a"(old_value) : "r"(new_value) : "memory"); + return old_value; +} + +PSMI_ALWAYS_INLINE( +am_pkt_short_t * +am_ctl_getslot_pkt_inner(volatile am_ctl_qhdr_t *shq, am_pkt_short_t *pkt0)) +{ + am_pkt_short_t *pkt; + uint32_t idx; +#ifndef CSWAP + pthread_spin_lock(&shq->lock); + idx = shq->tail; + pkt = (am_pkt_short_t *) ((uintptr_t) pkt0 + idx * shq->elem_sz); + if (pkt->flag == QFREE) { + ips_sync_reads(); + pkt->flag = QUSED; + shq->tail += 1; + if (shq->tail == shq->elem_cnt) + shq->tail = 0; + } else { + pkt = 0; + } + pthread_spin_unlock(&shq->lock); +#else + uint32_t idx_next; + do { + idx = shq->tail; + idx_next = (idx + 1 == shq->elem_cnt) ? 0 : idx + 1; + } while (cswap(&shq->tail, idx, idx_next) != idx); + + pkt = (am_pkt_short_t *) ((uintptr_t) pkt0 + idx * shq->elem_sz); + while (cswap(&pkt->flag, QFREE, QUSED) != QFREE); +#endif + return pkt; +} + +/* This is safe because 'flag' is at the same offset on both pkt and bulkpkt */ +#define am_ctl_getslot_bulkpkt_inner(shq, pkt0) ((am_pkt_bulk_t *) \ + am_ctl_getslot_pkt_inner(shq, (am_pkt_short_t *)(pkt0))) + +PSMI_ALWAYS_INLINE( +am_pkt_short_t * +am_ctl_getslot_pkt(ptl_t *ptl_gen, uint16_t shmidx, int is_reply)) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + volatile am_ctl_qhdr_t *shq; + am_pkt_short_t *pkt0; + if (!is_reply) { + shq = &(ptl->am_ep[shmidx].qdir.qreqH->shortq); + pkt0 = ptl->am_ep[shmidx].qdir.qreqFifoShort; + } else { + shq = &(ptl->am_ep[shmidx].qdir.qrepH->shortq); + pkt0 = ptl->am_ep[shmidx].qdir.qrepFifoShort; + } + return am_ctl_getslot_pkt_inner(shq, pkt0); +} + +PSMI_ALWAYS_INLINE( +am_pkt_bulk_t * +am_ctl_getslot_long(ptl_t *ptl_gen, uint16_t shmidx, int is_reply)) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + volatile am_ctl_qhdr_t *shq; + am_pkt_bulk_t *pkt0; + if (!is_reply) { + shq = &(ptl->am_ep[shmidx].qdir.qreqH->longbulkq); + pkt0 = ptl->am_ep[shmidx].qdir.qreqFifoLong; + } else { + shq = &(ptl->am_ep[shmidx].qdir.qrepH->longbulkq); + pkt0 = ptl->am_ep[shmidx].qdir.qrepFifoLong; + } + return am_ctl_getslot_bulkpkt_inner(shq, pkt0); +} + +psmi_handlertab_t psmi_allhandlers[] = { + {0} + , + {amsh_conn_handler} + , + {psmi_am_mq_handler} + , + {psmi_am_mq_handler_data} + , + {psmi_am_mq_handler_rtsmatch} + , + {psmi_am_mq_handler_rtsdone} + , + {psmi_am_handler} +}; + +PSMI_ALWAYS_INLINE(void advance_head(volatile am_ctl_qshort_cache_t *hdr)) +{ + QMARKFREE(hdr->head); + hdr->head++; + if (hdr->head == hdr->end) + hdr->head = hdr->base; +} + +#define AMSH_ZERO_POLLS_BEFORE_YIELD 64 +#define AMSH_POLLS_BEFORE_PSM_POLL 16 + +/* XXX this can be made faster. Instead of checking the flag of the head, keep + * a cached copy of the integer value of the tail and compare it against the + * previous one we saw. 
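+ * A sketch of that idea (hypothetical, not implemented here), with
+ * last_seen_tail being a new per-queue cache:
+ *
+ *   uint32_t tail = shq->tail;       // one shared-memory read
+ *   if (tail == last_seen_tail)
+ *       return PSM2_OK_NO_PROGRESS;  // nothing new was enqueued
+ *   last_seen_tail = tail;
+ *   // ...then walk packets from head as done below...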
+ */
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+amsh_poll_internal_inner(ptl_t *ptl_gen, int replyonly,
+ int is_internal))
+{
+ struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+ psm2_error_t err = PSM2_OK_NO_PROGRESS;
+ /* poll replies */
+ if (!QISEMPTY(ptl->repH.head->flag)) {
+ do {
+ ips_sync_reads();
+ process_packet(ptl_gen, (am_pkt_short_t *) ptl->repH.head,
+ 0);
+ advance_head(&ptl->repH);
+ err = PSM2_OK;
+ } while (!QISEMPTY(ptl->repH.head->flag));
+ }
+
+ if (!replyonly) {
+ /* Request queue not enabled for 2.0, will be re-enabled to support long
+ * replies */
+ if (!is_internal && ptl->psmi_am_reqq_fifo.first != NULL) {
+ psmi_am_reqq_drain(ptl_gen);
+ err = PSM2_OK;
+ }
+ if (!QISEMPTY(ptl->reqH.head->flag)) {
+ do {
+ ips_sync_reads();
+ process_packet(ptl_gen,
+ (am_pkt_short_t *) ptl->reqH.
+ head, 1);
+ advance_head(&ptl->reqH);
+ err = PSM2_OK;
+ } while (!QISEMPTY(ptl->reqH.head->flag));
+ }
+ }
+
+ if (is_internal) {
+ if (err == PSM2_OK) /* some progress, no yields */
+ ptl->zero_polls = 0;
+ else if (++ptl->zero_polls == AMSH_ZERO_POLLS_BEFORE_YIELD) {
+ /* no progress for AMSH_ZERO_POLLS_BEFORE_YIELD */
+ sched_yield();
+ ptl->zero_polls = 0;
+ }
+
+ if (++ptl->amsh_only_polls == AMSH_POLLS_BEFORE_PSM_POLL) {
+ psmi_poll_internal(ptl->ep, 0);
+ ptl->amsh_only_polls = 0;
+ }
+ }
+ return err; /* if we actually did something */
+}
+
+/* non-inlined version */
+static
+psm2_error_t
+amsh_poll_internal(ptl_t *ptl, int replyonly)
+{
+ return amsh_poll_internal_inner(ptl, replyonly, 1);
+}
+
+#ifdef PSM_PROFILE
+#define AMSH_POLL_UNTIL(ptl, isreply, cond) \
+ do { \
+ PSMI_PROFILE_BLOCK(); \
+ while (!(cond)) { \
+ PSMI_PROFILE_REBLOCK( \
+ amsh_poll_internal(ptl, isreply) == \
+ PSM2_OK_NO_PROGRESS); \
+ } \
+ PSMI_PROFILE_UNBLOCK(); \
+ } while (0)
+#else
+#define AMSH_POLL_UNTIL(ptl, isreply, cond) \
+ do { \
+ while (!(cond)) { \
+ amsh_poll_internal(ptl, isreply); \
+ } \
+ } while (0)
+#endif
+
+static psm2_error_t amsh_poll(ptl_t *ptl, int replyonly)
+{
+ return amsh_poll_internal_inner(ptl, replyonly, 0);
+}
+
+PSMI_ALWAYS_INLINE(
+void
+am_send_pkt_short(ptl_t *ptl, uint32_t destidx, uint32_t returnidx,
+ uint32_t bulkidx, uint16_t fmt, uint16_t nargs,
+ uint16_t handleridx, psm2_amarg_t *args,
+ const void *src, uint32_t len, int isreply))
+{
+ int i;
+ volatile am_pkt_short_t *pkt;
+ int copy_nargs;
+
+ AMSH_POLL_UNTIL(ptl, isreply,
+ (pkt =
+ am_ctl_getslot_pkt(ptl, destidx, isreply)) != NULL);
+
+ /* got a free pkt... fill it in */
+ pkt->bulkidx = bulkidx;
+ pkt->shmidx = returnidx;
+ pkt->type = fmt;
+ pkt->nargs = nargs;
+ pkt->handleridx = handleridx;
+
+ /* Limit the number of args copied here to NSHORT_ARGS. Additional args
+ are carried in the bulkpkt. */
+ copy_nargs = nargs;
+ if (copy_nargs > NSHORT_ARGS) {
+ copy_nargs = NSHORT_ARGS;
+ }
+
+ for (i = 0; i < copy_nargs; i++)
+ pkt->args[i] = args[i];
+
+ if (fmt == AMFMT_SHORT_INLINE)
+ mq_copy_tiny((uint32_t *) &pkt->args[nargs], (uint32_t *) src,
+ len);
+
+ _HFI_VDBG("pkt=%p fmt=%d bulkidx=%d,flag=%d,nargs=%d,"
+ "buf=%p,len=%d,hidx=%d,value=%d\n", pkt, (int)fmt, bulkidx,
+ pkt->flag, pkt->nargs, src, (int)len, (int)handleridx,
+ src != NULL ? 
*((uint32_t *) src) : 0); + QMARKREADY(pkt); +} + +#define amsh_shm_copy_short psmi_mq_mtucpy +#define amsh_shm_copy_long psmi_mq_mtucpy + +PSMI_ALWAYS_INLINE( +int +psmi_amsh_generic_inner(uint32_t amtype, ptl_t *ptl_gen, psm2_epaddr_t epaddr, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + const void *src, size_t len, void *dst, int flags)) +{ +#ifdef PSM_DEBUG + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; +#endif + uint16_t type; + uint32_t bulkidx; + uint16_t hidx = (uint16_t) handler; + int destidx = ((am_epaddr_t *) epaddr)->shmidx; + int returnidx = ((am_epaddr_t *) epaddr)->return_shmidx; + int is_reply = AM_IS_REPLY(amtype); + volatile am_pkt_bulk_t *bulkpkt; + + _HFI_VDBG("%s epaddr=%s, shmidx=%d, type=%d\n", + is_reply ? "reply" : "request", + psmi_epaddr_get_name(epaddr->epid), + ((am_epaddr_t *) epaddr)->shmidx, amtype); + psmi_assert(epaddr != ptl->epaddr); + + switch (amtype) { + case AMREQUEST_SHORT: + case AMREPLY_SHORT: + if (len + (nargs << 3) <= (NSHORT_ARGS << 3)) { + /* Payload fits in args packet */ + type = AMFMT_SHORT_INLINE; + bulkidx = len; + } else { + int i; + + psmi_assert(len < amsh_qelemsz.qreqFifoLong); + psmi_assert(src != NULL || nargs > NSHORT_ARGS); + type = AMFMT_SHORT; + + AMSH_POLL_UNTIL(ptl_gen, is_reply, + (bulkpkt = + am_ctl_getslot_long(ptl_gen, destidx, + is_reply)) != + NULL); + + bulkidx = bulkpkt->idx; + bulkpkt->len = len; + _HFI_VDBG("bulkpkt %p flag is %d from idx %d\n", + bulkpkt, bulkpkt->flag, destidx); + + for (i = 0; i < nargs - NSHORT_ARGS; i++) { + bulkpkt->args[i] = args[i + NSHORT_ARGS]; + } + + amsh_shm_copy_short((void *)bulkpkt->payload, src, + (uint32_t) len); + QMARKREADY(bulkpkt); + } + am_send_pkt_short(ptl_gen, destidx, returnidx, bulkidx, type, + nargs, hidx, args, src, len, is_reply); + break; + + case AMREQUEST_LONG: + case AMREPLY_LONG: + { + uint32_t bytes_left = len; + uint8_t *src_this = (uint8_t *) src; + uint8_t *dst_this = (uint8_t *) dst; + uint32_t bytes_this; + + type = AMFMT_LONG; + + _HFI_VDBG("[long][%s] src=%p,dest=%p,len=%d,hidx=%d\n", + is_reply ? 
"rep" : "req", src, dst, + (uint32_t) len, hidx); + while (bytes_left) { + bytes_this = min(bytes_left, AMLONG_MTU); + AMSH_POLL_UNTIL(ptl_gen, is_reply, + (bulkpkt = + am_ctl_getslot_long(ptl_gen, + destidx, + is_reply)) + != NULL); + bytes_left -= bytes_this; + if (bytes_left == 0) + type = AMFMT_LONG_END; + bulkidx = bulkpkt->idx; + amsh_shm_copy_long((void *)bulkpkt->payload, + src_this, bytes_this); + + bulkpkt->dest = (uintptr_t) dst; + bulkpkt->dest_off = + (uint32_t) ((uintptr_t) dst_this - + (uintptr_t) dst); + bulkpkt->len = bytes_this; + QMARKREADY(bulkpkt); + am_send_pkt_short(ptl_gen, destidx, returnidx, + bulkidx, type, nargs, hidx, + args, NULL, 0, is_reply); + src_this += bytes_this; + dst_this += bytes_this; + } + break; + } + default: + break; + } + return 1; +} + +/* A generic version that's not inlined */ +int +psmi_amsh_generic(uint32_t amtype, ptl_t *ptl, psm2_epaddr_t epaddr, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + const void *src, size_t len, void *dst, int flags) +{ + return psmi_amsh_generic_inner(amtype, ptl, epaddr, handler, args, + nargs, src, len, dst, flags); +} + +int +psmi_amsh_short_request(ptl_t *ptl, psm2_epaddr_t epaddr, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + const void *src, size_t len, int flags) +{ + return psmi_amsh_generic_inner(AMREQUEST_SHORT, ptl, epaddr, handler, + args, nargs, src, len, NULL, flags); +} + +int +psmi_amsh_long_request(ptl_t *ptl, psm2_epaddr_t epaddr, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + const void *src, size_t len, void *dest, int flags) +{ + return psmi_amsh_generic_inner(AMREQUEST_LONG, ptl, epaddr, handler, + args, nargs, src, len, dest, flags); +} + +void +psmi_amsh_short_reply(amsh_am_token_t *tok, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + const void *src, size_t len, int flags) +{ + psmi_amsh_generic_inner(AMREPLY_SHORT, tok->ptl, tok->tok.epaddr_incoming, + handler, args, nargs, src, len, NULL, flags); + return; +} + +void +psmi_amsh_long_reply(amsh_am_token_t *tok, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + const void *src, size_t len, void *dest, int flags) +{ + psmi_amsh_generic_inner(AMREPLY_LONG, tok->ptl, tok->tok.epaddr_incoming, + handler, args, nargs, src, len, dest, flags); + return; +} + +void psmi_am_reqq_init(ptl_t *ptl_gen) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + ptl->psmi_am_reqq_fifo.first = NULL; + ptl->psmi_am_reqq_fifo.lastp = &ptl->psmi_am_reqq_fifo.first; +} + +psm2_error_t psmi_am_reqq_drain(ptl_t *ptl_gen) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + am_reqq_t *reqn = ptl->psmi_am_reqq_fifo.first; + am_reqq_t *req; + psm2_error_t err = PSM2_OK_NO_PROGRESS; + + /* We're going to process the entire list, and running the generic handler + * below can cause other requests to be enqueued in the queue that we're + * processing. 
*/ + ptl->psmi_am_reqq_fifo.first = NULL; + ptl->psmi_am_reqq_fifo.lastp = &ptl->psmi_am_reqq_fifo.first; + + while ((req = reqn) != NULL) { + err = PSM2_OK; + reqn = req->next; + _HFI_VDBG + ("push of reqq=%p epaddr=%s localreq=%p remotereq=%p\n", + req, psmi_epaddr_get_hostname(req->epaddr->epid), + (void *)(uintptr_t) req->args[1].u64w0, + (void *)(uintptr_t) req->args[0].u64w0); + psmi_amsh_generic(req->amtype, req->ptl, req->epaddr, + req->handler, req->args, req->nargs, req->src, + req->len, req->dest, req->amflags); + if (req->flags & AM_FLAG_SRC_TEMP) + psmi_free(req->src); + psmi_free(req); + } + return err; +} + +void +psmi_am_reqq_add(int amtype, ptl_t *ptl_gen, psm2_epaddr_t epaddr, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + void *src, size_t len, void *dest, int amflags) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + int i; + int flags = 0; + am_reqq_t *nreq = + (am_reqq_t *) psmi_malloc(ptl->ep, UNDEFINED, sizeof(am_reqq_t)); + psmi_assert_always(nreq != NULL); + _HFI_VDBG("alloc of reqq=%p, to epaddr=%s, ptr=%p, len=%d, " + "localreq=%p, remotereq=%p\n", nreq, + psmi_epaddr_get_hostname(epaddr->epid), dest, + (int)len, (void *)(uintptr_t) args[1].u64w0, + (void *)(uintptr_t) args[0].u64w0); + + psmi_assert(nargs <= 8); + nreq->next = NULL; + nreq->amtype = amtype; + nreq->ptl = ptl_gen; + nreq->epaddr = epaddr; + nreq->handler = handler; + for (i = 0; i < nargs; i++) + nreq->args[i] = args[i]; + nreq->nargs = nargs; + if (AM_IS_LONG(amtype) && src != NULL && + len > 0 && !(amflags & AM_FLAG_SRC_ASYNC)) { + abort(); + flags |= AM_FLAG_SRC_TEMP; + nreq->src = psmi_malloc(ptl->ep, UNDEFINED, len); + psmi_assert_always(nreq->src != NULL); /* XXX mem */ + amsh_shm_copy_short(nreq->src, src, len); + } else + nreq->src = src; + nreq->len = len; + nreq->dest = dest; + nreq->amflags = amflags; + nreq->flags = flags; + + nreq->next = NULL; + *(ptl->psmi_am_reqq_fifo.lastp) = nreq; + ptl->psmi_am_reqq_fifo.lastp = &nreq->next; +} + +static +void process_packet(ptl_t *ptl_gen, am_pkt_short_t *pkt, int isreq) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + amsh_am_token_t tok; + psmi_handler_fn_t fn; + psm2_amarg_t *args = pkt->args; + uint16_t shmidx = pkt->shmidx; + int nargs = pkt->nargs; + + tok.tok.epaddr_incoming = ((shmidx != (uint16_t)-1) ? ptl->am_ep[shmidx].epaddr : 0); + tok.ptl = ptl_gen; + tok.mq = ptl->ep->mq; + tok.shmidx = shmidx; + + uint16_t hidx = (uint16_t) pkt->handleridx; + uint32_t bulkidx = pkt->bulkidx; + uintptr_t bulkptr; + am_pkt_bulk_t *bulkpkt; + + fn = (psmi_handler_fn_t) psmi_allhandlers[hidx].fn; + psmi_assert(fn != NULL); + psmi_assert((uintptr_t) pkt > ptl->self_nodeinfo->amsh_shmbase); + + if (pkt->type == AMFMT_SHORT_INLINE) { + _HFI_VDBG + ("%s inline flag=%d nargs=%d from_idx=%d pkt=%p hidx=%d\n", + isreq ? "request" : "reply", pkt->flag, nargs, shmidx, pkt, + hidx); + + fn(&tok, args, nargs, pkt->length > 0 ? + (void *)&args[nargs] : NULL, pkt->length); + } else { + int isend = 0; + switch (pkt->type) { + case AMFMT_LONG_END: + isend = 1; + case AMFMT_LONG: + case AMFMT_SHORT: + if (isreq) { + bulkptr = + (uintptr_t) ptl->self_nodeinfo->qdir. + qreqFifoLong; + bulkptr += bulkidx * amsh_qelemsz.qreqFifoLong; + } else { + bulkptr = + (uintptr_t) ptl->self_nodeinfo->qdir. 
+ qrepFifoLong;
+ bulkptr += bulkidx * amsh_qelemsz.qrepFifoLong;
+ }
+ break;
+ default:
+ bulkptr = 0;
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "Unknown/unhandled packet type 0x%x",
+ pkt->type);
+ return;
+ }
+
+ bulkpkt = (am_pkt_bulk_t *) bulkptr;
+ _HFI_VDBG("ep=%p mq=%p type=%d bulkidx=%d flag=%d/%d nargs=%d "
+ "from_idx=%d pkt=%p/%p hidx=%d\n",
+ ptl->ep, ptl->ep->mq, pkt->type, bulkidx, pkt->flag,
+ bulkpkt->flag, nargs, shmidx, pkt, bulkpkt, hidx);
+ psmi_assert(bulkpkt->flag == QREADY);
+
+ if (nargs > NSHORT_ARGS || isend == 1) {
+ /* Either there are more args in the bulkpkt, or this is the last
+ packet of a long payload. In either case, copy the args. */
+ int i;
+ args =
+ alloca((NSHORT_ARGS +
+ NBULK_ARGS) * sizeof(psm2_amarg_t));
+
+ for (i = 0; i < NSHORT_ARGS; i++) {
+ args[i] = pkt->args[i];
+ }
+
+ for (; i < nargs; i++) {
+ args[i] = bulkpkt->args[i - NSHORT_ARGS];
+ }
+ }
+
+ if (pkt->type == AMFMT_SHORT) {
+ fn(&tok, args, nargs,
+ (void *)bulkpkt->payload, bulkpkt->len);
+ QMARKFREE(bulkpkt);
+ } else {
+ amsh_shm_copy_long((void *)(bulkpkt->dest +
+ bulkpkt->dest_off),
+ bulkpkt->payload, bulkpkt->len);
+
+ /* If this is the last packet, copy args before running the
+ * handler */
+ if (isend) {
+ void *dest = (void *)bulkpkt->dest;
+ size_t len =
+ (size_t) (bulkpkt->dest_off + bulkpkt->len);
+ QMARKFREE(bulkpkt);
+ fn(&tok, args, nargs, dest, len);
+ } else
+ QMARKFREE(bulkpkt);
+ }
+ }
+ return;
+}
+
+static
+psm2_error_t
+amsh_mq_rndv(ptl_t *ptl, psm2_mq_t mq, psm2_mq_req_t req,
+ psm2_epaddr_t epaddr, psm2_mq_tag_t *tag, const void *buf,
+ uint32_t len)
+{
+ psm2_amarg_t args[5];
+ psm2_error_t err = PSM2_OK;
+
+ args[0].u32w0 = MQ_MSG_LONGRTS;
+ args[0].u32w1 = len;
+ args[1].u32w1 = tag->tag[0];
+ args[1].u32w0 = tag->tag[1];
+ args[2].u32w1 = tag->tag[2];
+ args[3].u64w0 = (uint64_t) (uintptr_t) req;
+ args[4].u64w0 = (uint64_t) (uintptr_t) buf;
+
+ psmi_assert(req != NULL);
+ req->type = MQE_TYPE_SEND;
+ req->req_data.buf = (void *)buf;
+ req->req_data.buf_len = len;
+ req->req_data.send_msglen = len;
+ req->send_msgoff = 0;
+
+#ifdef PSM_CUDA
+ /* If the send buffer is on gpu, we create a cuda IPC
+ * handle and send it as payload in the RTS
+ */
+ if (req->is_buf_gpu_mem) {
+ CUdeviceptr buf_base_ptr;
+ PSMI_CUDA_CALL(cuMemGetAddressRange, &buf_base_ptr, NULL, (CUdeviceptr)buf);
+
+ /* Offset in GPU buffer from which we copy data, we have to
+ * send it separately because this offset is lost
+ * when cuIpcGetMemHandle is called */
+ req->cuda_ipc_offset = buf - (void*)buf_base_ptr;
+ args[2].u32w0 = (uint32_t)req->cuda_ipc_offset;
+
+ PSMI_CUDA_CALL(cuIpcGetMemHandle,
+ &req->cuda_ipc_handle,
+ (CUdeviceptr) buf);
+ if (req->flags_internal & PSMI_REQ_FLAG_FASTPATH) {
+ psmi_am_reqq_add(AMREQUEST_SHORT, ptl,
+ epaddr, mq_handler_hidx,
+ args, 5, (void*)&req->cuda_ipc_handle,
+ sizeof(CUipcMemHandle), NULL, 0);
+ } else {
+ psmi_amsh_short_request(ptl, epaddr, mq_handler_hidx,
+ args, 5, (void*)&req->cuda_ipc_handle,
+ sizeof(CUipcMemHandle), 0);
+ }
+ req->cuda_ipc_handle_attached = 1;
+ } else
+#endif
+ if (req->flags_internal & PSMI_REQ_FLAG_FASTPATH) {
+ psmi_am_reqq_add(AMREQUEST_SHORT, ptl,
+ epaddr, mq_handler_hidx,
+ args, 5, NULL, 0, NULL, 0);
+ } else {
+ psmi_amsh_short_request(ptl, epaddr, mq_handler_hidx,
+ args, 5, NULL, 0, 0);
+ }
+
+ return err;
+}
+
+/*
+ * All shared am mq sends, req can be NULL
+ */
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr,
+ 
uint32_t flags_user, uint32_t flags_internal, psm2_mq_tag_t *tag,
+ const void *ubuf, uint32_t len))
+{
+ psm2_amarg_t args[3];
+ psm2_error_t err = PSM2_OK;
+ int is_blocking = (req == NULL);
+
+#ifdef PSM_CUDA
+ int gpu_mem;
+ /* All sends from a gpu buffer use the rendezvous protocol */
+ if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)ubuf)) {
+ if (!PSMI_IS_CUDA_ENABLED)
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ " Please enable PSM CUDA support when using GPU buffer \n");
+ gpu_mem = 1;
+ goto do_rendezvous;
+ } else
+ gpu_mem = 0;
+#endif
+
+ if (!flags_user && len <= AMLONG_MTU) {
+ if (len <= 32)
+ args[0].u32w0 = MQ_MSG_TINY;
+ else
+ args[0].u32w0 = MQ_MSG_SHORT;
+ args[1].u32w1 = tag->tag[0];
+ args[1].u32w0 = tag->tag[1];
+ args[2].u32w1 = tag->tag[2];
+
+ if (flags_internal & PSMI_REQ_FLAG_FASTPATH) {
+ psmi_am_reqq_add(AMREQUEST_SHORT, epaddr->ptlctl->ptl,
+ epaddr, mq_handler_hidx,
+ args, 3, (void *)ubuf, len, NULL, 0);
+ } else {
+ psmi_amsh_short_request(epaddr->ptlctl->ptl, epaddr,
+ mq_handler_hidx, args, 3, ubuf, len, 0);
+ }
+ } else if (flags_user & PSM2_MQ_FLAG_SENDSYNC)
+ goto do_rendezvous;
+ else if (len <= mq->shm_thresh_rv) {
+ uint32_t bytes_left = len;
+ uint32_t bytes_this = min(bytes_left, AMLONG_MTU);
+ uint8_t *buf = (uint8_t *) ubuf;
+ args[0].u32w0 = MQ_MSG_EAGER;
+ args[0].u32w1 = len;
+ args[1].u32w1 = tag->tag[0];
+ args[1].u32w0 = tag->tag[1];
+ args[2].u32w1 = tag->tag[2];
+ if (flags_internal & PSMI_REQ_FLAG_FASTPATH) {
+ psmi_am_reqq_add(AMREQUEST_SHORT, epaddr->ptlctl->ptl,
+ epaddr, mq_handler_hidx,
+ args, 3, buf, bytes_this, NULL, 0);
+ } else {
+ psmi_amsh_short_request(epaddr->ptlctl->ptl, epaddr,
+ mq_handler_hidx, args, 3, buf,
+ bytes_this, 0);
+ }
+ bytes_left -= bytes_this;
+ buf += bytes_this;
+ args[2].u32w0 = 0;
+ while (bytes_left) {
+ args[2].u32w0 += bytes_this;
+ bytes_this = min(bytes_left, AMLONG_MTU);
+ /* Here we kind of bend the rules, and assume that shared-memory
+ * active messages are delivered in order */
+ if (flags_internal & PSMI_REQ_FLAG_FASTPATH) {
+ psmi_am_reqq_add(AMREQUEST_SHORT, epaddr->ptlctl->ptl,
+ epaddr, mq_handler_data_hidx,
+ args, 3, buf, bytes_this, NULL, 0);
+ } else {
+ psmi_amsh_short_request(epaddr->ptlctl->ptl, epaddr,
+ mq_handler_data_hidx, args,
+ 3, buf, bytes_this, 0);
+ }
+ buf += bytes_this;
+ bytes_left -= bytes_this;
+ }
+ } else {
+do_rendezvous:
+ if (is_blocking) {
+ req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND);
+ if_pf(req == NULL)
+ return PSM2_NO_MEMORY;
+ req->req_data.send_msglen = len;
+ req->req_data.tag = *tag;
+
+ /* Since SEND command is blocking, this request is
+ * entirely internal and we will not be exposed to user.
+ * Setting as internal so it will not be added to
+ * mq->completed_q */
+ req->flags_internal |= (flags_internal | PSMI_REQ_FLAG_IS_INTERNAL);
+ }
+#ifdef PSM_CUDA
+ /* CUDA documentation dictates the use of SYNC_MEMOPS attribute
+ * when the buffer pointer received into PSM has been allocated
+ * by the application. This guarantees that all memory operations
+ * to this region of memory (used by multiple layers of the stack)
+ * always synchronize
+ */
+ if (gpu_mem) {
+ int trueflag = 1;
+ PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag,
+ CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+ (CUdeviceptr)ubuf);
+ req->is_buf_gpu_mem = 1;
+ } else
+ req->is_buf_gpu_mem = 0;
+#endif
+
+ err =
+ amsh_mq_rndv(epaddr->ptlctl->ptl, mq, req, epaddr, tag,
+ ubuf, len);
+
+ if (err == PSM2_OK && is_blocking) { /* wait... 
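+ * for the rendezvous to complete: psmi_mq_wait_internal() keeps
+ * driving the progress engine until this internal request is
+ * marked complete, so the blocking send path returns only when
+ * it is safe for the caller to reuse the buffer.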
*/ + err = psmi_mq_wait_internal(&req); + } + return err; /* skip eager accounting below */ + } + + /* All eager async sends are always "all done" */ + if (req != NULL) { + req->state = MQ_STATE_COMPLETE; + mq_qq_append(&mq->completed_q, req); + } + + mq->stats.tx_num++; + mq->stats.tx_shm_num++; + mq->stats.tx_eager_num++; + mq->stats.tx_eager_bytes += len; + + return err; +} + +static +psm2_error_t +amsh_mq_isend(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags_user, + uint32_t flags_internal, psm2_mq_tag_t *tag, const void *ubuf, + uint32_t len, void *context, psm2_mq_req_t *req_o) +{ + psm2_mq_req_t req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND); + if_pf(req == NULL) + return PSM2_NO_MEMORY; + + req->req_data.send_msglen = len; + req->req_data.tag = *tag; + req->req_data.context = context; + req->flags_user = flags_user; + req->flags_internal = flags_internal; + _HFI_VDBG("[ishrt][%s->%s][n=0][b=%p][l=%d][t=%08x.%08x.%08x]\n", + psmi_epaddr_get_name(epaddr->ptlctl->ep->epid), + psmi_epaddr_get_name(epaddr->epid), ubuf, len, + tag->tag[0], tag->tag[1], tag->tag[2]); + + amsh_mq_send_inner(mq, req, epaddr, flags_user, flags_internal, tag, ubuf, len); + + *req_o = req; + return PSM2_OK; +} + +static +psm2_error_t +amsh_mq_send(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags, + psm2_mq_tag_t *tag, const void *ubuf, uint32_t len) +{ + _HFI_VDBG("[shrt][%s->%s][n=0][b=%p][l=%d][t=%08x.%08x.%08x]\n", + psmi_epaddr_get_name(epaddr->ptlctl->ep->epid), + psmi_epaddr_get_name(epaddr->epid), ubuf, len, + tag->tag[0], tag->tag[1], tag->tag[2]); + + amsh_mq_send_inner(mq, NULL, epaddr, flags, PSMI_REQ_FLAG_NORMAL, tag, ubuf, len); + + return PSM2_OK; +} + +/* kassist-related handling */ +int psmi_epaddr_pid(psm2_epaddr_t epaddr) +{ + uint16_t shmidx = ((am_epaddr_t *) epaddr)->shmidx; + return ((struct ptl_am *)(epaddr->ptlctl->ptl))->am_ep[shmidx].pid; +} +#if _HFI_DEBUGGING +static +const char *psmi_kassist_getmode(int mode) +{ + switch (mode) { + case PSMI_KASSIST_OFF: + return "kassist off"; + case PSMI_KASSIST_CMA_GET: + return "cma get"; + case PSMI_KASSIST_CMA_PUT: + return "cma put"; + default: + return "unknown"; + } +} +#endif + +static +int psmi_get_kassist_mode() +{ + int mode = PSMI_KASSIST_MODE_DEFAULT; + /* Cuda PSM only supports KASSIST_CMA_GET */ +#ifdef PSM_CUDA + mode = PSMI_KASSIST_CMA_GET; +#else + union psmi_envvar_val env_kassist; + + if (!psmi_getenv("PSM2_KASSIST_MODE", + "PSM Shared memory kernel assist mode " + "(cma-put, cma-get, none)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val) + PSMI_KASSIST_MODE_DEFAULT_STRING, &env_kassist)) { + char *s = env_kassist.e_str; + if (strcasecmp(s, "cma-put") == 0) + mode = PSMI_KASSIST_CMA_PUT; + else if (strcasecmp(s, "cma-get") == 0) + mode = PSMI_KASSIST_CMA_GET; + else + mode = PSMI_KASSIST_OFF; + } else { + /* cma-get is the fastest, so it's the default. + Availability of CMA is checked in psmi_shm_create(); + if CMA is not available it falls back to 'none' there. */ + mode = PSMI_KASSIST_CMA_GET; + } +#endif + return mode; +} + +/* Connection handling for shared memory AM. + * + * arg0 => conn_op, result (PSM error type) + * arg1 => epid (always) + * arg2 => pid, version. + * arg3 => pointer to error for replies. 
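+ *
+ * For example, the connect path in amsh_ep_connreq_poll() above packs
+ * a request like this (abridged from that code):
+ *
+ *   args[0].u16w0 = PSMI_AM_CONN_REQ;   // conn_op
+ *   args[0].u16w1 = shmidx;             // peer's slot in our directory
+ *   args[0].u32w1 = ptl->connect_phase;
+ *   args[1].u64w0 = (uint64_t) ptl->epid;
+ *   args[2].u32w0 = getpid();
+ *   args[2].u32w1 = PSM2_OK;
+ *   args[3].u64w0 = (uint64_t)(uintptr_t)&req->errors[i];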
+ */ +static +void +amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, + size_t len) +{ + int op = args[0].u16w0; + int phase = args[0].u32w1; + psm2_epid_t epid = args[1].u64w0; + int16_t return_shmidx = args[0].u16w1; + psm2_error_t err = (psm2_error_t) args[2].u32w1; + psm2_error_t *perr = (psm2_error_t *) (uintptr_t) args[3].u64w0; + int pid = args[2].u32w0; + int force_remap = 0; + + psm2_epaddr_t epaddr; + amsh_am_token_t *tok = (amsh_am_token_t *) toki; + uint16_t shmidx = tok->shmidx; + int is_valid; + struct ptl_am *ptl = (struct ptl_am *)(tok->ptl); + ptl_t *ptl_gen = tok->ptl; + int cstate; + + /* We do this because it's an assumption below */ + psmi_assert_always(buf == NULL && len == 0); + + _HFI_VDBG("Conn op=%d, phase=%d, epid=%llx, err=%d\n", + op, phase, (unsigned long long)epid, err); + + switch (op) { + case PSMI_AM_CONN_REQ: + _HFI_VDBG("Connect from %d:%d\n", + (int)psm2_epid_nid(epid), (int)psm2_epid_context(epid)); + epaddr = psmi_epid_lookup(ptl->ep, epid); + if (epaddr && ((am_epaddr_t *) epaddr)->pid != pid) { + /* If old pid is unknown consider new pid the correct one */ + if (((am_epaddr_t *) epaddr)->pid == AMSH_PID_UNKNOWN) { + ((am_epaddr_t *) epaddr)->pid = pid; + } else { + psmi_epid_remove(ptl->ep, epid); + epaddr = NULL; + force_remap = 1; + } + } + + if (shmidx == (uint16_t)-1) { + /* incoming packet will never be from our shmidx slot 0 + thus the other process doesn't know our return info. + attach_to will lookup or create the proper shmidx */ + if ((err = psmi_shm_map_remote(ptl_gen, epid, &shmidx, force_remap))) { + psmi_handle_error(PSMI_EP_NORETURN, err, + "Fatal error in " + "connecting to shm segment"); + } + am_update_directory(&ptl->am_ep[shmidx]); + tok->shmidx = shmidx; + } + + if (epaddr == NULL) { + uintptr_t args_segoff = + (uintptr_t) args - ptl->self_nodeinfo->amsh_shmbase; + if ((err = amsh_epaddr_add(ptl_gen, epid, shmidx, &epaddr))) + /* Unfortunately, no way out of here yet */ + psmi_handle_error(PSMI_EP_NORETURN, err, + "Fatal error " + "in connecting to shm segment"); + args = + (psm2_amarg_t *) (ptl->self_nodeinfo->amsh_shmbase + + args_segoff); + + ((am_epaddr_t *) epaddr)->pid = pid; + } + + /* Rewrite args */ + ptl->connect_incoming++; + args[0].u16w0 = PSMI_AM_CONN_REP; + /* and return our shmidx for the connecting process */ + args[0].u16w1 = shmidx; + args[1].u64w0 = (psm2_epid_t) ptl->epid; + args[2].u32w0 = getpid(); + args[2].u32w1 = PSM2_OK; + ((am_epaddr_t *) epaddr)->cstate_incoming = + AMSH_CSTATE_INCOMING_ESTABLISHED; + ((am_epaddr_t *) epaddr)->return_shmidx = return_shmidx; + tok->tok.epaddr_incoming = epaddr; /* adjust token */ + psmi_amsh_short_reply(tok, amsh_conn_handler_hidx, + args, narg, NULL, 0, 0); + break; + + case PSMI_AM_CONN_REP: + if (ptl->connect_phase != phase) { + _HFI_VDBG("Out of phase connect reply\n"); + return; + } + epaddr = ptl->am_ep[shmidx].epaddr; + /* check if a race has occurred on shm-file reuse. + * if so, don't transition to the next state. + * the next call to connreq_poll() will restart the + * connection. 
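+ * (The race: the peer died and an unrelated new process re-created
+ * a shm file under the same name. It is caught by comparing the pid
+ * we cached at connect time against the pid currently published in
+ * the peer's nodeinfo page, which is exactly the check below.)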
+ */ + if (ptl->am_ep[shmidx].pid != + ((struct am_ctl_nodeinfo *) ptl->am_ep[shmidx].amsh_shmbase)->pid) + break; + + *perr = err; + ((am_epaddr_t *) epaddr)->cstate_outgoing + = AMSH_CSTATE_OUTGOING_REPLIED; + ((am_epaddr_t *) epaddr)->return_shmidx = return_shmidx; + ptl->connect_outgoing++; + _HFI_VDBG("CCC epaddr=%s connected to ptl=%p\n", + psmi_epaddr_get_name(epaddr->epid), ptl); + break; + + case PSMI_AM_DISC_REQ: + epaddr = psmi_epid_lookup(ptl->ep, epid); + if (!epaddr) { + _HFI_VDBG("Dropping disconnect request from an epid that we are not connected to\n"); + return; + } + args[0].u16w0 = PSMI_AM_DISC_REP; + args[2].u32w1 = PSM2_OK; + ((am_epaddr_t *) epaddr)->cstate_incoming = + AMSH_CSTATE_INCOMING_DISC_REQUESTED; + ptl->connect_incoming--; + /* Before sending the reply, make sure the process + * is still connected */ + + if (ptl->am_ep[shmidx].epid != epaddr->epid) + is_valid = 0; + else + is_valid = 1; + + if (is_valid) { + psmi_amsh_short_reply(tok, amsh_conn_handler_hidx, + args, narg, NULL, 0, 0); + /** + * Only munmap if we have nothing more to + * communicate with the other node, i.e. we are + * already disconnected with the other node + * or have sent a disconnect request. + */ + cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing; + if (cstate == AMSH_CSTATE_OUTGOING_DISC_REQUESTED) { + err = psmi_do_unmap(ptl->am_ep[shmidx].amsh_shmbase); + psmi_epid_remove(epaddr->ptlctl->ep, epaddr->epid); + } + } + break; + + case PSMI_AM_DISC_REP: + if (ptl->connect_phase != phase) { + _HFI_VDBG("Out of phase disconnect reply\n"); + return; + } + *perr = err; + epaddr = tok->tok.epaddr_incoming; + ((am_epaddr_t *) epaddr)->cstate_outgoing = + AMSH_CSTATE_OUTGOING_DISC_REPLIED; + ptl->connect_outgoing--; + break; + + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Unknown/unhandled connect handler op=%d", + op); + break; + } + return; +} + +static +size_t amsh_sizeof(void) +{ + return sizeof(struct ptl_am); +} + +/* Fill in AM capabilities parameters */ +psm2_error_t +psmi_amsh_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters) +{ + if (parameters == NULL) { + return PSM2_PARAM_ERR; + } + + parameters->max_handlers = PSMI_AM_NUM_HANDLERS; + parameters->max_nargs = PSMI_AM_MAX_ARGS; + parameters->max_request_short = AMLONG_MTU; + parameters->max_reply_short = AMLONG_MTU; + + return PSM2_OK; +} + +/** + * @param ep PSM Endpoint, guaranteed to have initialized epaddr and epid. 
+ * @param ptl Pointer to caller-allocated space for PTL (fill in) + * @param ctl Pointer to caller-allocated space for PTL-control + * structure (fill in) + */ +static +psm2_error_t +amsh_init(psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + psm2_error_t err = PSM2_OK; + + /* Preconditions */ + psmi_assert_always(ep != NULL); + psmi_assert_always(ep->epaddr != NULL); + psmi_assert_always(ep->epid != 0); + + ptl->ep = ep; /* back pointer */ + ptl->epid = ep->epid; /* cache epid */ + ptl->epaddr = ep->epaddr; /* cache a copy */ + ptl->ctl = ctl; + ptl->zero_polls = 0; + + ptl->connect_phase = 0; + ptl->connect_incoming = 0; + ptl->connect_outgoing = 0; + + memset(&ptl->amsh_empty_shortpkt, 0, sizeof(ptl->amsh_empty_shortpkt)); + memset(&ptl->psmi_am_reqq_fifo, 0, sizeof(ptl->psmi_am_reqq_fifo)); + + ptl->max_ep_idx = -1; + ptl->am_ep_size = AMSH_DIRBLOCK_SIZE; + + ptl->am_ep = (struct am_ctl_nodeinfo *) + psmi_memalign(ptl->ep, PER_PEER_ENDPOINT, 64, + ptl->am_ep_size * sizeof(struct am_ctl_nodeinfo)); + + if (ptl->am_ep == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + memset(ptl->am_ep, 0, ptl->am_ep_size * sizeof(struct am_ctl_nodeinfo)); + + if ((err = amsh_init_segment(ptl_gen))) + goto fail; + + ptl->self_nodeinfo->psm_verno = PSMI_VERNO; + if (ptl->psmi_kassist_mode != PSMI_KASSIST_OFF) { + if (cma_available()) { + ptl->self_nodeinfo->amsh_features |= + AMSH_HAVE_CMA; + psmi_shm_mq_rv_thresh = + PSMI_MQ_RV_THRESH_CMA; + } else { + ptl->psmi_kassist_mode = + PSMI_KASSIST_OFF; + psmi_shm_mq_rv_thresh = + PSMI_MQ_RV_THRESH_NO_KASSIST; + } + } else { + psmi_shm_mq_rv_thresh = + PSMI_MQ_RV_THRESH_NO_KASSIST; + } + ptl->self_nodeinfo->pid = getpid(); + ptl->self_nodeinfo->epid = ep->epid; + ptl->self_nodeinfo->epaddr = ep->epaddr; + + ips_mb(); + ptl->self_nodeinfo->is_init = 1; + + psmi_am_reqq_init(ptl_gen); + memset(ctl, 0, sizeof(*ctl)); + + /* Fill in the control structure */ + ctl->ep = ep; + ctl->ptl = ptl_gen; + ctl->ep_poll = amsh_poll; + ctl->ep_connect = amsh_ep_connect; + ctl->ep_disconnect = amsh_ep_disconnect; + + ctl->mq_send = amsh_mq_send; + ctl->mq_isend = amsh_mq_isend; + + ctl->am_get_parameters = psmi_amsh_am_get_parameters; + ctl->am_short_request = psmi_amsh_am_short_request; + ctl->am_short_reply = psmi_amsh_am_short_reply; + + /* No stats in shm (for now...) 
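+ * The table filled in above is the only interface the upper layers
+ * use to reach this PTL; a tagged send, for instance, is dispatched
+ * roughly as (illustration only):
+ *
+ *   err = epaddr->ptlctl->mq_send(mq, epaddr, flags, tag, ubuf, len);
+ *
+ * which lands in amsh_mq_send() here.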
*/ + ctl->epaddr_stats_num = NULL; + ctl->epaddr_stats_init = NULL; + ctl->epaddr_stats_get = NULL; +#ifdef PSM_CUDA + union psmi_envvar_val env_memcache_enabled; + psmi_getenv("PSM2_CUDA_MEMCACHE_ENABLED", + "PSM cuda ipc memhandle cache enabled (default is enabled)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) + 1, &env_memcache_enabled); + if (PSMI_IS_CUDA_ENABLED && env_memcache_enabled.e_uint) { + union psmi_envvar_val env_memcache_size; + psmi_getenv("PSM2_CUDA_MEMCACHE_SIZE", + "Size of the cuda ipc memhandle cache ", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) + CUDA_MEMHANDLE_CACHE_SIZE, &env_memcache_size); + if ((err = am_cuda_memhandle_mpool_init(env_memcache_size.e_uint) + != PSM2_OK)) + goto fail; + if ((err = am_cuda_memhandle_cache_map_init() != PSM2_OK)) + goto fail; + } +#endif +fail: + return err; +} + +static psm2_error_t amsh_fini(ptl_t *ptl_gen, int force, uint64_t timeout_ns) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + struct psmi_eptab_iterator itor; + psm2_epaddr_t epaddr; + psm2_error_t err = PSM2_OK; + psm2_error_t err_seg; + uint64_t t_start = get_cycles(); + int i = 0; + + /* Close whatever has been left open -- this will be factored out for 2.1 */ + if (ptl->connect_outgoing > 0) { + int num_disc = 0; + int *mask; + psm2_error_t *errs; + psm2_epaddr_t *epaddr_array; + + psmi_epid_itor_init(&itor, ptl->ep); + while ((epaddr = psmi_epid_itor_next(&itor))) { + if (epaddr->ptlctl->ptl != ptl_gen) + continue; + if (((am_epaddr_t *) epaddr)->cstate_outgoing == + AMSH_CSTATE_OUTGOING_ESTABLISHED) + num_disc++; + } + psmi_epid_itor_fini(&itor); + + mask = + (int *)psmi_calloc(ptl->ep, UNDEFINED, num_disc, + sizeof(int)); + errs = (psm2_error_t *) + psmi_calloc(ptl->ep, UNDEFINED, num_disc, + sizeof(psm2_error_t)); + epaddr_array = (psm2_epaddr_t *) + psmi_calloc(ptl->ep, UNDEFINED, num_disc, + sizeof(psm2_epaddr_t)); + + if (errs == NULL || epaddr_array == NULL || mask == NULL) { + if (epaddr_array) + psmi_free(epaddr_array); + if (errs) + psmi_free(errs); + if (mask) + psmi_free(mask); + err = PSM2_NO_MEMORY; + goto fail; + } + psmi_epid_itor_init(&itor, ptl->ep); + while ((epaddr = psmi_epid_itor_next(&itor))) { + if (epaddr->ptlctl->ptl == ptl_gen) { + if (((am_epaddr_t *) epaddr)->cstate_outgoing == + AMSH_CSTATE_OUTGOING_ESTABLISHED) { + mask[i] = 1; + epaddr_array[i] = epaddr; + i++; + } + } + } + psmi_epid_itor_fini(&itor); + psmi_assert(i == num_disc && num_disc > 0); + err = amsh_ep_disconnect(ptl_gen, force, num_disc, epaddr_array, + mask, errs, timeout_ns); + psmi_free(mask); + psmi_free(errs); + psmi_free(epaddr_array); + } + + if (ptl->connect_incoming > 0 || ptl->connect_outgoing > 0) { + while (ptl->connect_incoming > 0 || ptl->connect_outgoing > 0) { + if (!psmi_cycles_left(t_start, timeout_ns)) { + err = PSM2_TIMEOUT; + _HFI_VDBG("CCC timed out with from=%d,to=%d\n", + ptl->connect_incoming, ptl->connect_outgoing); + break; + } + psmi_poll_internal(ptl->ep, 1); + } + } else + _HFI_VDBG("CCC complete disconnect from=%d,to=%d\n", + ptl->connect_incoming, ptl->connect_outgoing); + + if ((err_seg = psmi_shm_detach(ptl_gen))) { + err = err_seg; + goto fail; + } + + /* This prevents poll calls between now and the point where the endpoint is + * deallocated to reference memory that disappeared */ + ptl->repH.head = &ptl->amsh_empty_shortpkt; + ptl->reqH.head = &ptl->amsh_empty_shortpkt; +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED) + am_cuda_memhandle_cache_map_fini(); +#endif + return 
PSM2_OK; +fail: + return err; + +} + +static +psm2_error_t +amsh_setopt(const void *component_obj, int optname, + const void *optval, uint64_t optlen) +{ + /* No options for AM PTL at the moment */ + return psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Unknown AM ptl option %u.", optname); +} + +static +psm2_error_t +amsh_getopt(const void *component_obj, int optname, + void *optval, uint64_t *optlen) +{ + /* No options for AM PTL at the moment */ + return psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Unknown AM ptl option %u.", optname); +} + +/* Only symbol we expose out of here */ +struct ptl_ctl_init +psmi_ptl_amsh = { + amsh_sizeof, amsh_init, amsh_fini, amsh_setopt, amsh_getopt +}; diff --git a/ptl_am/cmarw.h b/ptl_am/cmarw.h new file mode 100644 index 0000000..0317ed4 --- /dev/null +++ b/ptl_am/cmarw.h @@ -0,0 +1,73 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include +#include + +/* + * read from remote process pid + */ +int64_t cma_get(pid_t pid, const void *src, void *dst, int64_t n); + +/* + * write to remote process pid + */ +int64_t cma_put(const void *src, pid_t pid, void *dst, int64_t n); + +/* + * Test if CMA is available by trying a no-op call. + * Returns 1 if CMA is present, 0 if not. 
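+ *
+ * Typical use together with the read side (illustration only;
+ * peer_pid/remote_src/local_dst are hypothetical names):
+ *
+ *   if (cma_available()) {
+ *       if (cma_get(peer_pid, remote_src, local_dst, len) != len)
+ *           ;  // fall back to the copy-through-shm path
+ *   }
+ *
+ * cma_get()/cma_put() loop internally over short transfers and
+ * return the full byte count on success or -1 on error.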
diff --git a/ptl_am/cmarwu.c b/ptl_am/cmarwu.c
new file mode 100644
index 0000000..a9a1d83
--- /dev/null
+++ b/ptl_am/cmarwu.c
@@ -0,0 +1,207 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license. When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/uio.h>
+#include <sys/syscall.h>
+
+#include "psm_user.h"
+#include "cmarw.h"
+
+/* An iovec looks like this:
+ * struct iovec {
+ *	void *iov_base;	// Starting address
+ *	size_t iov_len;	// Number of bytes to transfer
+ * };
+ */
+
+#if 0
+#define __NR_process_vm_readv 310
+#define __NR_process_vm_writev 311
+
+#define process_vm_readv(pid, local_iov, liovcnt, remote_iov, riovcnt, flags) \
+	syscall(__NR_process_vm_readv, \
+		pid, local_iov, liovcnt, remote_iov, riovcnt, flags)
+
+#define process_vm_writev(pid, local_iov, liovcnt, remote_iov, riovcnt, flags) \
+	syscall(__NR_process_vm_writev, \
+		pid, local_iov, liovcnt, remote_iov, riovcnt, flags)
+#endif
+
+/* CMA syscall wrappers were added in glibc 2.15; for anything older than
+   that we need to define our own wrappers. Older (and maybe newer?) glibcs
+   (2.12 from RHEL 6.3 definitely has this bug) only pass up to 5 arguments
+   through the generic syscall() function, while these CMA calls take 6.
+   So for now, we work around it by emitting assembly that performs the
+   syscall directly.
+*/
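On toolchains with glibc 2.15 or newer, no hand-rolled wrapper is needed; the sketch below, which simply reads the calling process's own memory as a smoke test, shows the exported process_vm_readv() interface that the macros that follow emulate:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	char src[] = "hello, CMA";
	char dst[sizeof(src)];
	struct iovec local = { .iov_base = dst, .iov_len = sizeof(src) };
	struct iovec remote = { .iov_base = src, .iov_len = sizeof(src) };

	/* Reading from our own pid is a handy self-test; a real caller
	 * passes the peer's pid and a pointer valid in that process. */
	ssize_t nr = process_vm_readv(getpid(), &local, 1, &remote, 1, 0);
	printf("read %zd bytes: %s\n", nr, nr > 0 ? dst : "(failed)");
	return 0;
}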
+
+#if defined(__GLIBC__) && ((__GLIBC__ == 2) && (__GLIBC_MINOR__ < 15))
+
+#ifdef __x86_64__
+
+#define __NR_process_vm_readv 310
+#define __NR_process_vm_writev 311
+
+static inline ssize_t __x86_64_syscall6(int syscall,
+					pid_t pid,
+					const struct iovec *local_iov,
+					unsigned long liovcnt,
+					const struct iovec *remote_iov,
+					unsigned long riovcnt,
+					unsigned long flags)
+{
+	/* GCC inline ASM is annoying -- can't specify all the x86_64 registers
+	   directly, so declare register-specific variables and use them. */
+	register int64_t rax asm("rax") = syscall;
+	register int64_t rdi asm("rdi") = pid;
+	register int64_t rsi asm("rsi") = (intptr_t) local_iov;
+	register int64_t rdx asm("rdx") = liovcnt;
+	register int64_t r10 asm("r10") = (intptr_t) remote_iov;
+	register int64_t r8 asm("r8") = riovcnt;
+	register int64_t r9 asm("r9") = flags;
+
+	asm volatile ("syscall\n"
+		      : "=a" (rax)
+		      : "r"(rax), "r"(rdi), "r"(rsi), "r"(rdx), "r"(r10),
+			"r"(r8), "r"(r9)
+		      : "%rcx", "%r11", "cc", "memory");
+	return rax;
+}
+
+#define process_vm_readv(pid, local_iov, liovcnt, remote_iov, riovcnt, flags) \
+	__x86_64_syscall6(__NR_process_vm_readv, \
+			  pid, local_iov, liovcnt, remote_iov, riovcnt, flags)
+
+#define process_vm_writev(pid, local_iov, liovcnt, remote_iov, riovcnt, flags) \
+	__x86_64_syscall6(__NR_process_vm_writev, \
+			  pid, local_iov, liovcnt, remote_iov, riovcnt, flags)
+
+#else /* ndef __x86_64__ */
+#error "Can't compile CMA support for this architecture."
+#endif /* __x86_64__ */
+#endif /* __GLIBC__ < 2.15 */
+
+int64_t cma_get(pid_t pid, const void *src, void *dst, int64_t n)
+{
+	int64_t nr, sum;
+	struct iovec local = {
+		.iov_base = dst,
+		.iov_len = n
+	};
+	struct iovec remote = {
+		.iov_base = (void *)src,
+		.iov_len = n
+	};
+	nr = sum = 0;
+	while (sum != n) {
+		nr = process_vm_readv(pid, &local, 1, &remote, 1, 0);
+		if (nr == -1) {
+			return -1;
+		}
+		sum += nr;
+		local.iov_base += nr;
+		local.iov_len -= nr;
+		remote.iov_base += nr;
+		remote.iov_len -= nr;
+	}
+	return sum;
+}
+
+int64_t cma_put(const void *src, pid_t pid, void *dst, int64_t n)
+{
+	int64_t nr, sum;
+	struct iovec local = {
+		.iov_base = (void *)src,
+		.iov_len = n
+	};
+	struct iovec remote = {
+		.iov_base = dst,
+		.iov_len = n
+	};
+
+	nr = sum = 0;
+	while (sum != n) {
+		nr = process_vm_writev(pid, &local, 1, &remote, 1, 0);
+		if (nr == -1) {
+			return -1;
+		}
+		sum += nr;
+		local.iov_base += nr;
+		local.iov_len -= nr;
+		remote.iov_base += nr;
+		remote.iov_len -= nr;
+	}
+	return sum;
+}
+
+/* Test if CMA is available by trying a no-op call. */
+int cma_available(void)
+{
+
+	/* Make a no-op CMA syscall. If CMA is present, 0 (bytes transferred)
+	 * should be returned. If not present, expect -ENOSYS. */
+
+	int ret = process_vm_readv(getpid(), NULL, 0, NULL, 0, 0);
+
+	if (ret == 0) {
+		/* CMA is available! */
+		return 1;
+	}
+
+	return 0;
+}
diff --git a/ptl_am/psm_am_internal.h b/ptl_am/psm_am_internal.h
new file mode 100644
index 0000000..8e07a57
--- /dev/null
+++ b/ptl_am/psm_am_internal.h
@@ -0,0 +1,442 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license. When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+ + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ + +#ifndef PSMI_AM_H +#define PSMI_AM_H + +#include "am_config.h" +#include "../psm_am_internal.h" + +#define AMSH_DIRBLOCK_SIZE 128 + +typedef +struct am_epaddr { + /* + * epaddr must be the first field to have the same address as this + * structure + */ + struct psm2_epaddr epaddr; + + uint16_t shmidx; + uint16_t return_shmidx; + + uint32_t cstate_outgoing:4; + uint32_t cstate_incoming:4; + uint32_t pid:22; +} am_epaddr_t; + +/* Up to NSHORT_ARGS are supported via am_pkt_short_t; the remaining + arguments are passed using space in am_pkt_bulk_t. One additional argument + is added for passing the internal ptl_am handler index. */ +#define NSHORT_ARGS 6 +#define NBULK_ARGS (PSMI_AM_MAX_ARGS - NSHORT_ARGS + 1) + +typedef +struct amsh_am_token { + struct psmi_am_token tok; + + ptl_t *ptl; /**> What PTL was it received on */ + psm2_mq_t mq; /**> What matched queue is this for ? 
 */
+	uint16_t shmidx;	/**> what shmidx sent this */
+} amsh_am_token_t;
+
+typedef void (*psmi_handler_fn_t) (void *token, psm2_amarg_t *args, int nargs,
+				   void *src, size_t len);
+
+typedef struct psmi_handlertab {
+	psmi_handler_fn_t fn;
+} psmi_handlertab_t;
+
+#define PSMI_AM_CONN_REQ 1
+#define PSMI_AM_CONN_REP 2
+#define PSMI_AM_DISC_REQ 3
+#define PSMI_AM_DISC_REP 4
+
+#define PSMI_KASSIST_OFF 0x0
+#define PSMI_KASSIST_CMA_GET 0x1
+#define PSMI_KASSIST_CMA_PUT 0x2
+
+#define PSMI_KASSIST_CMA 0x3
+#define PSMI_KASSIST_GET 0x1
+#define PSMI_KASSIST_PUT 0x2
+#define PSMI_KASSIST_MASK 0x3
+
+int psmi_epaddr_pid(psm2_epaddr_t epaddr);
+
+/*
+ * Eventually, we will allow users to register handlers as "don't reply", which
+ * may save on some of the buffering requirements
+ */
+#define PSMI_HANDLER_NEEDS_REPLY(handler) 1
+#define PSMI_VALIDATE_REPLY(handler) assert(PSMI_HANDLER_NEEDS_REPLY(handler))
+
+int psmi_amsh_poll(ptl_t *ptl, int replyonly);
+
+/* Shared memory AM, forward decls */
+int
+psmi_amsh_short_request(ptl_t *ptl, psm2_epaddr_t epaddr,
+			psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+			const void *src, size_t len, int flags);
+
+void
+psmi_amsh_short_reply(amsh_am_token_t *tok,
+		      psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+		      const void *src, size_t len, int flags);
+
+int
+psmi_amsh_long_request(ptl_t *ptl, psm2_epaddr_t epaddr,
+		       psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+		       const void *src, size_t len, void *dest, int flags);
+
+void
+psmi_amsh_long_reply(amsh_am_token_t *tok,
+		     psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+		     const void *src, size_t len, void *dest, int flags);
+
+void psmi_am_mq_handler(void *toki, psm2_amarg_t *args, int narg, void *buf,
+			size_t len);
+void psmi_am_mq_handler_data(void *toki, psm2_amarg_t *args, int narg,
+			     void *buf, size_t len);
+void psmi_am_mq_handler_complete(void *toki, psm2_amarg_t *args, int narg,
+				 void *buf, size_t len);
+void psmi_am_mq_handler_rtsmatch(void *toki, psm2_amarg_t *args, int narg,
+				 void *buf, size_t len);
+void psmi_am_mq_handler_rtsdone(void *toki, psm2_amarg_t *args, int narg,
+				void *buf, size_t len);
+void psmi_am_handler(void *toki, psm2_amarg_t *args, int narg, void *buf,
+		     size_t len);
+
+/* AM over shared memory (forward decls) */
+psm2_error_t
+psmi_amsh_am_get_parameters(psm2_ep_t ep,
+			    struct psm2_am_parameters *parameters);
+
+psm2_error_t
+psmi_amsh_am_short_request(psm2_epaddr_t epaddr,
+			   psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+			   void *src, size_t len, int flags,
+			   psm2_am_completion_fn_t completion_fn,
+			   void *completion_ctxt);
+
+psm2_error_t
+psmi_amsh_am_short_reply(psm2_am_token_t tok,
+			 psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+			 void *src, size_t len, int flags,
+			 psm2_am_completion_fn_t completion_fn,
+			 void *completion_ctxt);
+
+#define amsh_conn_handler_hidx 1
+#define mq_handler_hidx 2
+#define mq_handler_data_hidx 3
+#define mq_handler_rtsmatch_hidx 4
+#define mq_handler_rtsdone_hidx 5
+#define am_handler_hidx 6
+
+#define AMREQUEST_SHORT 0
+#define AMREQUEST_LONG 1
+#define AMREPLY_SHORT 2
+#define AMREPLY_LONG 3
+#define AM_IS_REPLY(x) ((x)&0x2)
+#define AM_IS_REQUEST(x) (!AM_IS_REPLY(x))
+#define AM_IS_LONG(x) ((x)&0x1)
+#define AM_IS_SHORT(x) (!AM_IS_LONG(x))
+
+#define AM_FLAG_SRC_ASYNC 0x1
+#define AM_FLAG_SRC_TEMP 0x2
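The four AMREQUEST_*/AMREPLY_* codes above pack two independent bits -- bit 1 distinguishes replies from requests, bit 0 long from short -- which is exactly what the AM_IS_* macros test. A quick standalone check:

#include <assert.h>

#define AMREQUEST_SHORT 0
#define AMREQUEST_LONG 1
#define AMREPLY_SHORT 2
#define AMREPLY_LONG 3
#define AM_IS_REPLY(x) ((x)&0x2)
#define AM_IS_LONG(x) ((x)&0x1)

int main(void)
{
	/* A long request is long but not a reply... */
	assert(AM_IS_LONG(AMREQUEST_LONG) && !AM_IS_REPLY(AMREQUEST_LONG));
	/* ...and a short reply is a reply but not long. */
	assert(AM_IS_REPLY(AMREPLY_SHORT) && !AM_IS_LONG(AMREPLY_SHORT));
	return 0;
}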
+
+/*
+ * Request Fifo.
+ */
+typedef
+struct am_reqq {
+	struct am_reqq *next;
+
+	ptl_t *ptl;
+	psm2_epaddr_t epaddr;
+	int amtype;
+	psm2_handler_t handler;
+	psm2_amarg_t args[8];
+	int nargs;
+	uint32_t len;
+	void *src;
+	void *dest;
+	int amflags;
+	int flags;
+} am_reqq_t;
+
+struct am_reqq_fifo_t {
+	am_reqq_t *first;
+	am_reqq_t **lastp;
+};
+
+psm2_error_t psmi_am_reqq_drain(ptl_t *ptl);
+void psmi_am_reqq_add(int amtype, ptl_t *ptl, psm2_epaddr_t epaddr,
+		      psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+		      void *src, size_t len, void *dest, int flags);
+
+/*
+ * Shared memory Active Messages, implementation derived from
+ * Lumetta, Mainwaring, Culler. Multi-Protocol Active Messages on a Cluster of
+ * SMP's. Supercomputing 1997.
+ *
+ * We support multiple endpoints in shared memory, but we only support one
+ * shared memory context with up to AMSH_MAX_LOCAL_PROCS local endpoints. Some
+ * structures are endpoint specific (as denoted with amsh_ep_) and others are
+ * specific to the single shared memory context (amsh_ global variables).
+ *
+ * Each endpoint maintains a shared request block and a shared reply block.
+ * Each block is composed of queues for small, medium and large messages.
+ */
+
+#define QFREE 0
+#define QUSED 1
+#define QREADY 2
+#define QREADYMED 3
+#define QREADYLONG 4
+
+#define QISEMPTY(flag) (flag < QREADY)
+#if defined(__x86_64__) || defined(__i386__)
+#  define _QMARK_FLAG_FENCE()  asm volatile("" : : : "memory")	/* compiler fence */
+#else
+#  error No _QMARK_FLAG_FENCE() defined for this platform
+#endif
+
+#define _QMARK_FLAG(pkt_ptr, _flag)		\
+	do {					\
+		_QMARK_FLAG_FENCE();		\
+		(pkt_ptr)->flag = (_flag);	\
+	} while (0)
+
+#define QMARKFREE(pkt_ptr)  _QMARK_FLAG(pkt_ptr, QFREE)
+#define QMARKREADY(pkt_ptr) _QMARK_FLAG(pkt_ptr, QREADY)
+#define QMARKUSED(pkt_ptr)  _QMARK_FLAG(pkt_ptr, QUSED)
+
+#define AMFMT_SYSTEM 1
+#define AMFMT_SHORT_INLINE 2
+#define AMFMT_SHORT 3
+#define AMFMT_LONG 4
+#define AMFMT_LONG_END 5
+
+#define AMSH_CMASK_NONE 0
+#define AMSH_CMASK_PREREQ 1
+#define AMSH_CMASK_POSTREQ 2
+#define AMSH_CMASK_DONE 3
+
+#define AMSH_CSTATE_OUTGOING_NONE 1
+#define AMSH_CSTATE_OUTGOING_REPLIED 2
+#define AMSH_CSTATE_OUTGOING_ESTABLISHED 3
+#define AMSH_CSTATE_OUTGOING_DISC_REPLIED 4
+#define AMSH_CSTATE_OUTGOING_DISC_REQUESTED 5
+
+#define AMSH_CSTATE_INCOMING_NONE 1
+#define AMSH_CSTATE_INCOMING_DISC_REQUESTED 4
+#define AMSH_CSTATE_INCOMING_ESTABLISHED 5
+
+#define AMSH_PID_UNKNOWN 0
+
+/**********************************
+ * Shared memory packet formats
+ **********************************/
+typedef
+struct am_pkt_short {
+	uint32_t flag;		/**> Packet state */
+	union {
+		uint32_t bulkidx;	/**> index in bulk packet queue */
+		uint32_t length;	/**> length when no bulkidx used */
+	};
+	uint16_t shmidx;	/**> index in shared segment */
+	uint16_t type;
+	uint16_t nargs;
+	uint16_t handleridx;
+
+	psm2_amarg_t args[NSHORT_ARGS];	/* AM arguments */
+
+	/* We will eventually expose up to 8 arguments, but this isn't
+	 * implemented for now. >6 args will probably require a medium
+	 * instead of a short */
+} __attribute__ ((aligned(64)))
+am_pkt_short_t;
+PSMI_STRICT_SIZE_DECL(am_pkt_short_t, 64);
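PSMI_STRICT_SIZE_DECL pins am_pkt_short_t to a single 64-byte cacheline. The macro itself is defined elsewhere in the tree; a minimal compile-time equivalent (an assumption for illustration, not the library's actual definition) would be:

#include <stdint.h>

typedef struct { uint64_t w[8]; } pkt64_t;	/* stand-in for am_pkt_short_t */

/* Fails to compile (negative array size) if the struct size ever drifts. */
#define STRICT_SIZE_DECL(type, size) \
	typedef char strict_size_##type[(sizeof(type) == (size)) ? 1 : -1]

STRICT_SIZE_DECL(pkt64_t, 64);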
+
+typedef struct am_pkt_bulk {
+	uint32_t flag;
+	uint32_t idx;
+	uintptr_t dest;		/* Destination pointer in "longs" */
+	uint32_t dest_off;	/* Destination pointer offset */
+	uint32_t len;		/* Destination length within offset */
+	psm2_amarg_t args[NBULK_ARGS];	/* Additional "spillover" for >6 args */
+	uint8_t payload[0];
+} am_pkt_bulk_t;
+/* No strict size decl, used for mediums and longs */
+
+/****************************************************
+ * Shared memory header and block control structures
+ ***************************************************/
+
+/* Each pkt queue has the same header format, although the queue
+ * consumers don't use the 'head' index in the same manner. */
+typedef struct am_ctl_qhdr {
+	uint32_t head;		/* Touched only by 1 consumer */
+	uint8_t _pad0[64 - 4];
+
+	pthread_spinlock_t lock;
+	uint32_t tail;		/* XXX candidate for fetch-and-incr */
+	uint32_t elem_cnt;
+	uint32_t elem_sz;
+	uint8_t _pad1[64 - 3 * 4 - sizeof(pthread_spinlock_t)];
+} am_ctl_qhdr_t;
+PSMI_STRICT_SIZE_DECL(am_ctl_qhdr_t, 128);
+
+/* Each process has a reply qhdr and a request qhdr */
+typedef struct am_ctl_blockhdr {
+	volatile am_ctl_qhdr_t shortq;
+	volatile am_ctl_qhdr_t longbulkq;
+} am_ctl_blockhdr_t;
+PSMI_STRICT_SIZE_DECL(am_ctl_blockhdr_t, 128 * 2);
+
+/* We cache the "shorts" because that's what we poll on in the critical path.
+ * We take care to always update these pointers whenever the segment is
+ * remapped. */
+typedef struct am_ctl_qshort_cache {
+	volatile am_pkt_short_t *base;
+	volatile am_pkt_short_t *head;
+	volatile am_pkt_short_t *end;
+} am_ctl_qshort_cache_t;
+
+/******************************************
+ * Shared segment local directory (global)
+ ******************************************
+ *
+ * Each process keeps a directory for where request and reply structures are
+ * located at its peers. This directory must be re-initialized every time the
+ * shared segment moves in the VM, and the segment moves every time we remap()
+ * for additional memory.
+ */
+struct amsh_qdirectory {
+	am_ctl_blockhdr_t *qreqH;
+	am_pkt_short_t *qreqFifoShort;
+	am_pkt_bulk_t *qreqFifoLong;
+
+	am_ctl_blockhdr_t *qrepH;
+	am_pkt_short_t *qrepFifoShort;
+	am_pkt_bulk_t *qrepFifoLong;
+} __attribute__ ((aligned(64)));
+
+/******************************************
+ * Shared fifo element counts and sizes
+ ******************************************
+ * These values are context-wide, they can only be set early on and can't be
+ * modified at runtime. All endpoints are expected to use the same values.
+ */
+typedef
+struct amsh_qinfo {
+	int qreqFifoShort;
+	int qreqFifoLong;
+
+	int qrepFifoShort;
+	int qrepFifoLong;
+} amsh_qinfo_t;
+
+/******************************************
+ * Per-endpoint structures (ep-local)
+ ******************************************
+ * Each endpoint keeps its own information as to where it resides in the
+ * directory, and maintains its own cached copies of where the short header
+ * resides in shared memory.
+ *
+ * This structure is carefully arranged to optimize cache locality and
+ * performance. Do not modify without careful and thorough analysis.
+ */
+struct am_ctl_nodeinfo {
+	uint16_t psm_verno;
+	volatile uint16_t is_init;
+	volatile pid_t pid;
+	psm2_epid_t epid;
+	psm2_epaddr_t epaddr;
+	uintptr_t amsh_shmbase;
+	amsh_qinfo_t amsh_qsizes;
+	uint32_t amsh_features;
+	struct amsh_qdirectory qdir;
+} __attribute__((aligned(64)));
+
+struct ptl_am {
+	psm2_ep_t ep;
+	psm2_epid_t epid;
+	psm2_epaddr_t epaddr;
+	ptl_ctl_t *ctl;
+
+	int connect_phase;
+	int connect_outgoing;
+	int connect_incoming;
+
+	int zero_polls;
+	int amsh_only_polls;
+	int max_ep_idx, am_ep_size;
+	int psmi_kassist_mode;
+	char *amsh_keyname;
+
+	/* These three items carefully picked to fit in one cache line. */
+	am_ctl_qshort_cache_t reqH;
+	am_ctl_qshort_cache_t repH;
+	struct am_reqq_fifo_t psmi_am_reqq_fifo;
+
+	am_pkt_short_t amsh_empty_shortpkt;
+
+	struct am_ctl_nodeinfo *self_nodeinfo;
+	struct am_ctl_nodeinfo *am_ep;
+} __attribute__((aligned(64)));
+
+#endif
diff --git a/ptl_am/ptl.c b/ptl_am/ptl.c
new file mode 100644
index 0000000..2e42c1b
--- /dev/null
+++ b/ptl_am/ptl.c
@@ -0,0 +1,378 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license. When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
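The rtsmatch path in ptl.c below moves rendezvous payloads between GPUs through CUDA IPC handles, wrapped behind am_cuda_memhandle_acquire()/release(). For readers unfamiliar with that driver API, a bare-bones exporter/importer sketch (error handling elided; the real code adds handle caching and event synchronization) looks like:

#include <cuda.h>

/* Exporter: turn a device allocation into a handle the peer can map.
 * In PSM2 the handle travels in the RTS payload. */
static CUipcMemHandle export_buf(CUdeviceptr dptr)
{
	CUipcMemHandle h;
	cuIpcGetMemHandle(&h, dptr);
	return h;
}

/* Importer: map the peer's allocation, copy from it, then unmap. */
static void import_and_copy(CUipcMemHandle h, CUdeviceptr dst, size_t n)
{
	CUdeviceptr src;
	cuIpcOpenMemHandle(&src, h, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
	cuMemcpyDtoD(dst, src, n);
	cuIpcCloseMemHandle(src);
}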
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+#include "cmarw.h"
+
+#ifdef PSM_CUDA
+#include "am_cuda_memhandle_cache.h"
+#endif
+
+/**
+ * Callback function when a receive request is matched with the
+ * tag obtained from the RTS packet.
+ */
+static
+psm2_error_t
+ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted,
+			    amsh_am_token_t *tok)
+{
+	psm2_amarg_t args[5];
+	psm2_epaddr_t epaddr = req->rts_peer;
+	struct ptl_am *ptl = (struct ptl_am *)(epaddr->ptlctl->ptl);
+	int cma_succeed = 0;
+	int pid = 0, cuda_ipc_send_completion = 0;
+
+	PSM2_LOG_MSG("entering.");
+	psmi_assert((tok != NULL && was_posted)
+		    || (tok == NULL && !was_posted));
+
+	_HFI_VDBG("[shm][rndv][recv] req=%p dest=%p len=%d tok=%p\n",
+		  req, req->req_data.buf, req->req_data.recv_msglen, tok);
+#ifdef PSM_CUDA
+	if (req->cuda_ipc_handle_attached) {
+
+		CUdeviceptr cuda_ipc_dev_ptr = am_cuda_memhandle_acquire(
+				req->rts_sbuf - req->cuda_ipc_offset,
+				(CUipcMemHandle*)&req->cuda_ipc_handle,
+				req->req_data.recv_msglen,
+				req->rts_peer->epid);
+		cuda_ipc_dev_ptr = cuda_ipc_dev_ptr + req->cuda_ipc_offset;
+		/* cuMemcpy into the receive side buffer
+		 * based on its location */
+		if (req->is_buf_gpu_mem) {
+			PSMI_CUDA_CALL(cuMemcpyDtoD, (CUdeviceptr)req->req_data.buf,
+				       cuda_ipc_dev_ptr, req->req_data.recv_msglen);
+			PSMI_CUDA_CALL(cuEventRecord, req->cuda_ipc_event, 0);
+			PSMI_CUDA_CALL(cuEventSynchronize, req->cuda_ipc_event);
+		} else
+			PSMI_CUDA_CALL(cuMemcpyDtoH, req->req_data.buf,
+				       cuda_ipc_dev_ptr, req->req_data.recv_msglen);
+		cuda_ipc_send_completion = 1;
+		am_cuda_memhandle_release(cuda_ipc_dev_ptr - req->cuda_ipc_offset);
+		req->cuda_ipc_handle_attached = 0;
+		goto send_cts;
+	}
+#endif
+
+	if ((ptl->psmi_kassist_mode & PSMI_KASSIST_GET)
+	    && req->req_data.recv_msglen > 0
+	    && (pid = psmi_epaddr_pid(epaddr))) {
+#ifdef PSM_CUDA
+		/* If the buffer on the send side is on the host,
+		 * we alloc a bounce buffer, use kassist and then
+		 * do a cuMemcpy if the buffer on the recv side
+		 * resides on the GPU
+		 */
+		if (req->is_buf_gpu_mem) {
+			void* cuda_ipc_bounce_buf = psmi_malloc(PSMI_EP_NONE,
+					UNDEFINED, req->req_data.recv_msglen);
+			size_t nbytes = cma_get(pid, (void *)req->rts_sbuf,
+					cuda_ipc_bounce_buf, req->req_data.recv_msglen);
+			psmi_assert_always(nbytes == req->req_data.recv_msglen);
+			PSMI_CUDA_CALL(cuMemcpyHtoD, (CUdeviceptr)req->req_data.buf,
+				       cuda_ipc_bounce_buf, req->req_data.recv_msglen);
+			/* The CUDA library has recent optimizations whereby it
+			 * does not guarantee synchronous behavior for
+			 * host-to-device copies of msg sizes less than 64k.
+			 * The event record and synchronize calls guarantee
+			 * completion.
+			 */
+			PSMI_CUDA_CALL(cuEventRecord, req->cuda_ipc_event, 0);
+			PSMI_CUDA_CALL(cuEventSynchronize, req->cuda_ipc_event);
+			psmi_free(cuda_ipc_bounce_buf);
+		} else {
+			/* cma can be done in handler context or not. */
+			size_t nbytes = cma_get(pid, (void *)req->rts_sbuf,
+					req->req_data.buf, req->req_data.recv_msglen);
+			psmi_assert_always(nbytes == req->req_data.recv_msglen);
+		}
+#else
+		/* cma can be done in handler context or not. */
+		size_t nbytes = cma_get(pid, (void *)req->rts_sbuf,
+					req->req_data.buf, req->req_data.recv_msglen);
+		if (nbytes == -1) {
+			ptl->psmi_kassist_mode = PSMI_KASSIST_OFF;
+			_HFI_ERROR("Reading from remote process' memory failed. Disabling CMA support\n");
+		}
+		else {
+			psmi_assert_always(nbytes == req->req_data.recv_msglen);
+			cma_succeed = 1;
+		}
+#endif
+	}
+
+#ifdef PSM_CUDA
+send_cts:
+#endif
+	args[0].u64w0 = (uint64_t) (uintptr_t) req->ptl_req_ptr;
+	args[1].u64w0 = (uint64_t) (uintptr_t) req;
+	args[2].u64w0 = (uint64_t) (uintptr_t) req->req_data.buf;
+	args[3].u32w0 = req->req_data.recv_msglen;
+	args[3].u32w1 = tok != NULL ? 1 : 0;
+	args[4].u32w0 = ptl->psmi_kassist_mode;	// pass current kassist mode to the peer process
+
+	if (tok != NULL) {
+		psmi_am_reqq_add(AMREQUEST_SHORT, tok->ptl,
+				 tok->tok.epaddr_incoming, mq_handler_rtsmatch_hidx,
+				 args, 5, NULL, 0, NULL, 0);
+	} else
+		psmi_amsh_short_request((struct ptl *)ptl, epaddr,
+					mq_handler_rtsmatch_hidx,
+					args, 5, NULL, 0, 0);
+
+	/* 0-byte completion or we used kassist */
+	if (pid || cma_succeed ||
+	    req->req_data.recv_msglen == 0 || cuda_ipc_send_completion == 1) {
+		psmi_mq_handle_rts_complete(req);
+	}
+	PSM2_LOG_MSG("leaving.");
+	return PSM2_OK;
+}
+
+static
+psm2_error_t
+ptl_handle_rtsmatch(psm2_mq_req_t req, int was_posted)
+{
+	/* was_posted == 0 allows us to assume that we're not running this
+	 * callback within am handler context (i.e. we can poll) */
+	psmi_assert(was_posted == 0);
+	return ptl_handle_rtsmatch_request(req, 0, NULL);
+}
+
+void
+psmi_am_mq_handler(void *toki, psm2_amarg_t *args, int narg, void *buf,
+		   size_t len)
+{
+	amsh_am_token_t *tok = (amsh_am_token_t *) toki;
+	psm2_mq_req_t req;
+	psm2_mq_tag_t tag;
+	int rc;
+	uint32_t opcode = args[0].u32w0;
+	uint32_t msglen = opcode <= MQ_MSG_SHORT ? len : args[0].u32w1;
+
+	tag.tag[0] = args[1].u32w1;
+	tag.tag[1] = args[1].u32w0;
+	tag.tag[2] = args[2].u32w1;
+	psmi_assert(toki != NULL);
+	_HFI_VDBG("mq=%p opcode=%d, len=%d, msglen=%d\n",
+		  tok->mq, opcode, (int)len, msglen);
+
+	switch (opcode) {
+	case MQ_MSG_TINY:
+	case MQ_MSG_SHORT:
+	case MQ_MSG_EAGER:
+		rc = psmi_mq_handle_envelope(tok->mq, tok->tok.epaddr_incoming,
+					     &tag, msglen, 0, buf,
+					     (uint32_t) len, 1, opcode, &req);
+
+		/* for eager matching */
+		req->ptl_req_ptr = (void *)tok->tok.epaddr_incoming;
+		req->msg_seqnum = 0;	/* using seqnum 0 */
+		break;
+	default:{
+			void *sreq = (void *)(uintptr_t) args[3].u64w0;
+			uintptr_t sbuf = (uintptr_t) args[4].u64w0;
+			psmi_assert(narg == 5);
+			psmi_assert_always(opcode == MQ_MSG_LONGRTS);
+			rc = psmi_mq_handle_rts(tok->mq, tok->tok.epaddr_incoming,
+						&tag, msglen, NULL, 0, 1,
+						ptl_handle_rtsmatch, &req);
+
+			req->rts_peer = tok->tok.epaddr_incoming;
+			req->ptl_req_ptr = sreq;
+			req->rts_sbuf = sbuf;
+#ifdef PSM_CUDA
+			/* Payload in RTS would mean an IPC handle has been
+			 * sent. This would also mean the sender has to
+			 * send from a GPU buffer
+			 */
+			if (buf && len > 0) {
+				req->cuda_ipc_handle = *((CUipcMemHandle*)buf);
+				req->cuda_ipc_handle_attached = 1;
+				req->cuda_ipc_offset = args[2].u32w0;
+			}
+#endif
+
+			if (rc == MQ_RET_MATCH_OK)	/* we are in handler context, issue a reply */
+				ptl_handle_rtsmatch_request(req, 1, tok);
+			/* else will be called later */
+			break;
+		}
+	}
+	return;
+}
+
+void
+psmi_am_mq_handler_data(void *toki, psm2_amarg_t *args, int narg, void *buf,
+			size_t len)
+{
+	amsh_am_token_t *tok = (amsh_am_token_t *) toki;
+
+	psmi_assert(toki != NULL);
+
+	psm2_epaddr_t epaddr = (psm2_epaddr_t) tok->tok.epaddr_incoming;
+	psm2_mq_req_t req = mq_eager_match(tok->mq, epaddr, 0);	/* using seqnum 0 */
+	psmi_assert_always(req != NULL);
+	psmi_mq_handle_data(tok->mq, req, args[2].u32w0, buf, len);
+
+	return;
+}
+
+/**
+ * Function to handle CTS on the sender.
+ */
+void
+psmi_am_mq_handler_rtsmatch(void *toki, psm2_amarg_t *args, int narg, void *buf,
+			    size_t len)
+{
+	amsh_am_token_t *tok = (amsh_am_token_t *) toki;
+
+	psmi_assert(toki != NULL);
+
+	ptl_t *ptl = tok->ptl;
+	psm2_mq_req_t sreq = (psm2_mq_req_t) (uintptr_t) args[0].u64w0;
+#ifdef PSM_CUDA
+	/* If the send side req has a cuda ipc handle attached then we can
+	 * assume the data has been copied as soon as we get a CTS
+	 */
+	if (sreq->cuda_ipc_handle_attached) {
+		sreq->cuda_ipc_handle_attached = 0;
+		psmi_mq_handle_rts_complete(sreq);
+		return;
+	}
+#endif
+	void *dest = (void *)(uintptr_t) args[2].u64w0;
+	uint32_t msglen = args[3].u32w0;
+	psm2_amarg_t rarg[1];
+
+	_HFI_VDBG("[rndv][send] req=%p dest_req=%p src=%p dest=%p len=%d\n",
+		  sreq, (void *)(uintptr_t) args[1].u64w0, sreq->req_data.buf,
+		  dest, msglen);
+
+	if (msglen > 0) {
+		rarg[0].u64w0 = args[1].u64w0;	/* rreq */
+		int kassist_mode = ((struct ptl_am *)ptl)->psmi_kassist_mode;
+		int kassist_mode_peer = args[4].u32w0;
+		// In general, peer processes shall have the same kassist mode set,
+		// but due to dynamic CMA failure detection we must align local and
+		// remote state and make the protocol adapt to that potential change.
+		if (kassist_mode_peer == PSMI_KASSIST_OFF &&
+		    (kassist_mode & PSMI_KASSIST_MASK)) {
+			((struct ptl_am *)ptl)->psmi_kassist_mode = PSMI_KASSIST_OFF;
+			goto no_kassist;
+		}
+
+		if (kassist_mode & PSMI_KASSIST_PUT) {
+			int pid = psmi_epaddr_pid(tok->tok.epaddr_incoming);
+			size_t nbytes = cma_put(sreq->req_data.buf, pid, dest, msglen);
+			if (nbytes == -1) {
+				_HFI_ERROR("Writing to remote process' memory failed. Disabling CMA support\n");
+				((struct ptl_am *)ptl)->psmi_kassist_mode = PSMI_KASSIST_OFF;
+				goto no_kassist;
+			}
+
+			psmi_assert_always(nbytes == msglen);
+
+			/* Send response that PUT is complete */
+			psmi_amsh_short_reply(tok, mq_handler_rtsdone_hidx,
+					      rarg, 1, NULL, 0, 0);
+		} else if (!(kassist_mode & PSMI_KASSIST_MASK)) {
+			/* Only transfer if kassist is off, i.e. neither GET nor PUT. */
+no_kassist:
+			psmi_amsh_long_reply(tok, mq_handler_rtsdone_hidx, rarg,
+					     1, sreq->req_data.buf, msglen, dest, 0);
+		}
+	}
+	psmi_mq_handle_rts_complete(sreq);
+}
+
+void
+psmi_am_mq_handler_rtsdone(void *toki, psm2_amarg_t *args, int narg, void *buf,
+			   size_t len)
+{
+	psm2_mq_req_t rreq = (psm2_mq_req_t) (uintptr_t) args[0].u64w0;
+	psmi_assert(narg == 1);
+	_HFI_VDBG("[rndv][recv] req=%p dest=%p len=%d\n", rreq, rreq->req_data.buf,
+		  rreq->req_data.recv_msglen);
+	psmi_mq_handle_rts_complete(rreq);
+}
+
+void
+psmi_am_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, size_t len)
+{
+	amsh_am_token_t *tok = (amsh_am_token_t *) toki;
+	struct psm2_ep_am_handle_entry *hentry;
+
+	psmi_assert(toki != NULL);
+
+	hentry = psm_am_get_handler_function(tok->mq->ep,
+					     (psm2_handler_t) args[0].u32w0);
+
+	/* Note a guard here for hentry != NULL is not needed because at
+	 * initialization, a psmi_assert_always() assures the entry will be
+	 * non-NULL. */
+
+	/* Invoke handler function. For AM we do not support break functionality */
+	if (likely(hentry->version == PSM2_AM_HANDLER_V2)) {
+		psm2_am_handler_2_fn_t hfn2 =
+				(psm2_am_handler_2_fn_t)hentry->hfn;
+		hfn2(toki, args + 1, narg - 1, buf, len, hentry->hctx);
+	} else {
+		psm2_am_handler_fn_t hfn1 =
+				(psm2_am_handler_fn_t)hentry->hfn;
+		hfn1(toki, args + 1, narg - 1, buf, len);
+	}
+
+	return;
+}
diff --git a/ptl_am/ptl_fwd.h b/ptl_am/ptl_fwd.h
new file mode 100644
index 0000000..e1bd064
--- /dev/null
+++ b/ptl_am/ptl_fwd.h
@@ -0,0 +1,64 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license. When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ + +#ifndef _PTL_FWD_AMSH_H +#define _PTL_FWD_AMSH_H + +/* Symbol in am ptl */ +struct ptl_ctl_init psmi_ptl_amsh; + +extern int psmi_shm_mq_rv_thresh; + +#endif diff --git a/ptl_ips/Makefile b/ptl_ips/Makefile new file mode 100644 index 0000000..d455886 --- /dev/null +++ b/ptl_ips/Makefile @@ -0,0 +1,96 @@ +# +# This file is provided under a dual BSD/GPLv2 license. When using or +# redistributing this file, you may do so under either license. +# +# GPL LICENSE SUMMARY +# +# Copyright(c) 2015 Intel Corporation. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# Contact Information: +# Intel Corporation, www.intel.com +# +# BSD LICENSE +# +# Copyright(c) 2015 Intel Corporation. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Copyright (c) 2003-2014 Intel Corporation. All rights reserved. +# + +OUTDIR = . + +this_srcdir = $(shell readlink -m .) +top_srcdir := $(this_srcdir)/.. 
+INCLUDES += -I$(top_srcdir) -I$(top_srcdir)/ptl_ips
+
+${TARGLIB}-objs := ptl.o ptl_rcvthread.o ips_proto.o ips_recvq.o \
+		   ips_recvhdrq.o ips_proto_recv.o ips_proto_connect.o \
+		   ips_proto_dump.o ips_proto_mq.o \
+		   ips_writehdrq.o ips_proto_expected.o ips_tid.o \
+		   ips_scb.o ips_proto_am.o ips_opp_path_rec.o ips_tidflow.o \
+		   ips_epstate.o ips_crc32.o ips_path_rec.o ips_tidcache.o
+
+${TARGLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${TARGLIB}-objs})
+
+DEPS := $(${TARGLIB}-objs:.o=.d)
+
+.PHONY: all clean
+IGNORE_DEP_TARGETS = clean
+
+all .DEFAULT: ${${TARGLIB}-objs}
+
+$(OUTDIR)/%.d: $(this_srcdir)/%.c
+	$(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o)
+
+$(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS}
+	$(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) -c $< -o $@
+
+clean:
+	@if [ -d $(OUTDIR) ]; then \
+		cd $(OUTDIR); \
+		rm -f *.o *.d *.gcda *.gcno; \
+		cd -; \
+	fi
+
+#ifeq prevents the deps from being included during clean
+#-include line is required to pull in auto-dependencies during 2nd pass
+ifeq ($(filter $(IGNORE_DEP_TARGETS), $(MAKECMDGOALS)),)
+-include ${DEPS}
+endif
+
+install:
+	@echo "Nothing to do for install."
diff --git a/ptl_ips/ips_config.h b/ptl_ips/ips_config.h
new file mode 100644
index 0000000..c323194
--- /dev/null
+++ b/ptl_ips/ips_config.h
@@ -0,0 +1,134 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license. When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2018 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2018 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef PTL_IPS_IPS_CONFIG_H
+#define PTL_IPS_IPS_CONFIG_H
+
+#include "psm_config.h"
+
+/* Allocate new epaddrs in chunks of 128 */
+#define PTL_EPADDR_ALLOC_CHUNK 128
+
+/* Generate an expected header every 16 packets */
+#define PSM_DEFAULT_EXPECTED_HEADER 16
+
+#define DF_OPP_LIBRARY "libopasadb.so.1.0.0"
+#define DATA_VFABRIC_OFFSET 8
+
+/* Send retransmission */
+#define IPS_PROTO_SPIO_RETRY_US_DEFAULT	2	/* in uS */
+
+#define IPS_PROTO_ERRCHK_MS_MIN_DEFAULT	160	/* in millisecs */
+#define IPS_PROTO_ERRCHK_MS_MAX_DEFAULT	640	/* in millisecs */
+#define IPS_PROTO_ERRCHK_FACTOR_DEFAULT	2
+#define PSM_TID_TIMEOUT_DEFAULT	"160:640:2"	/* update from above params */
+
+/* We have to get an MTU of at least 2K, or else this breaks some assumptions
+ * in the packets that handle tid descriptors
+ */
+#define IPS_PROTOEXP_MIN_MTU 2048
+
+/* Fault injection becomes parameters to psmi_faultinj_getspec, so
+ * a comma-delimited list of
+ *   "spec_name", num, denom
+ * where num/denom means fault num out of every denom.
+ * The defines set 'denom' and assume that num is set to 1
+ *
+ * These values are all defaults, each is overridable via
+ * PSM2_FI_<spec_name> in the environment (and yes, spec_name is in lowercase
+ * *in the environment* just to minimize it appearing in the wild). The format
+ * there is <num:denom:initial_seed> so the same thing except that one can set
+ * a specific seed to the random number generator.
+ */
+#define IPS_FAULTINJ_DMALOST	20	/* 1 every 20 dma writevs gets lost */
+#define IPS_FAULTINJ_PIOLOST	100	/* 1 every 100 pio writes gets lost */
+#define IPS_FAULTINJ_PIOBUSY	10	/* 1 every 10 pio sends gets busy */
+#define IPS_FAULTINJ_RECVLOST	200	/* 1 every 200 pkts dropped at recv */
+
+
+/* TID */
+
+/* Max tids a context can support */
+#define IPS_TID_MAX_TIDS 2048
+/* Max tid-session buffer size */
+#define PSM_TIDLIST_BUFSIZE 4096
+/* Max tid-session window size */
+#define PSM_TID_WINSIZE (4*1024*1024)
+/* Max number of packets for a single TID flow, fitting tid-session window.
+ * In PSM2 packet integrity is realized by PSN (Packet Sequence Number),
+ * which is kept as an 11-bit field (for 9B KDETH),
+ * giving max value 2048 (0 - 2047) */
+#define PSM_TID_MAX_PKTS 2048
+/* Total number of combined pages from the Tid-pair to be merged */
+#define PSM_MAX_NUM_PAGES_IN_TIDPAIR 512
+
+
+/* rcv thread */
+/* All in milliseconds */
+#define RCVTHREAD_TO_MIN_FREQ	10	/* min of 10 polls per sec */
+#define RCVTHREAD_TO_MAX_FREQ	100	/* max of 100 polls per sec */
+#define RCVTHREAD_TO_SHIFT	1
+
+/* ptl.c */
+#define PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS 250
+
+/* ips_proto_recv.c */
+#define PSM_STRAY_WARN_INTERVAL_DEFAULT_SECS 30
+
+/*
+ * Easy switch to (say) _HFI_INFO if debugging in the expected protocol is
+ * needed
+ */
+#define _HFI_EXP _HFI_VDBG
+
+#endif /* PTL_IPS_IPS_CONFIG_H */
diff --git a/ptl_ips/ips_crc32.c b/ptl_ips/ips_crc32.c
new file mode 100644
index 0000000..589f327
--- /dev/null
+++ b/ptl_ips/ips_crc32.c
@@ -0,0 +1,93 @@
+/* The code in this file was derived from crc32.c in zlib 1.2.3, and
+   modified from its original form to suit our requirements. The zlib
+   license and crc32.c copyright and credits are preserved below. */
+
+/* zlib.h -- interface of the 'zlib' general purpose compression library
+  version 1.2.3, July 18th, 2005
+
+  Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler
+
+  This software is provided 'as-is', without any express or implied
+  warranty. In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  Jean-loup Gailly        Mark Adler
+  jloup@gzip.org          madler@alumni.caltech.edu
+
+  The data format used by the zlib library is described by RFCs (Request for
+  Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt
+  (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format).
+*/
+
+/* crc32.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2005 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster
+ * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing
+ * tables for updating the shift register in one step with three exclusive-ors
+ * instead of four steps with four exclusive-ors. This results in about a
+ * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
+ */
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+/* Table of CRCs of all 8-bit messages. */
+static uint32_t crc_table[256];
+
+/* Flag: has the table been computed? Initially false. */
+static int crc_table_computed;
+
+/* Make the table for a fast CRC. */
+static void make_crc_table(void)
+{
+	uint32_t c;
+	int n, k;
+
+	for (n = 0; n < 256; n++) {
+		c = (uint32_t) n;
+		for (k = 0; k < 8; k++) {
+			if (c & 1)
+				c = 0xedb88320 ^ (c >> 1);
+			else
+				c = c >> 1;
+		}
+		crc_table[n] = c;
+	}
+	crc_table_computed = 1;
+}
+
+/* Update a running CRC with the bytes buf[0..len-1]--the CRC
+ * should be initialized to all 1's, and the transmitted value
+ * is the 1's complement of the final running CRC (see the
+ * crc() routine below).
+ */
+
+uint32_t ips_crc_calculate(uint32_t len, uint8_t *data, uint32_t crc)
+{
+	uint32_t c = crc;
+	uint32_t n;
+
+	if (!crc_table_computed) {
+		make_crc_table();
+	}
+	for (n = 0; n < len; n++) {
+		c = crc_table[(c ^ data[n]) & 0xff] ^ (c >> 8);
+	}
+	return c;
+}
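As the comment above notes, callers seed the running CRC with all 1's and transmit the one's complement of the result; a small wrapper (hypothetical name, for illustration) makes the convention explicit:

#include <stdint.h>

extern uint32_t ips_crc_calculate(uint32_t len, uint8_t *data, uint32_t crc);

/* One-shot CRC-32 of a buffer, applying the init/finalize convention. */
static uint32_t crc32_buf(uint8_t *data, uint32_t len)
{
	return ~ips_crc_calculate(len, data, 0xffffffff);
}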
diff --git a/ptl_ips/ips_epstate.c b/ptl_ips/ips_epstate.c
new file mode 100644
index 0000000..af3ec7a
--- /dev/null
+++ b/ptl_ips/ips_epstate.c
@@ -0,0 +1,154 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license. When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+#include "ips_epstate.h"
+
+/* The indexes are used to map a particular endpoint to a structure at the
+ * receiver. Although we take extra care to validate the identity of endpoints
+ * when packets are received, the communication index is at an offset selected
+ * by the endpoint that allocates the index. This narrows the window in which
+ * two jobs communicating with the same set of indexes can suffer crosstalk.
+ */
+
+psm2_error_t
+ips_epstate_init(struct ips_epstate *eps, const psmi_context_t *context)
+{
+	memset(eps, 0, sizeof(*eps));
+	eps->context = context;
+	eps->eps_base_idx = ((ips_epstate_idx)get_cycles()) &
+			    (IPS_EPSTATE_CONNIDX_MAX-1);
+	return PSM2_OK;
+}
+
+psm2_error_t ips_epstate_fini(struct ips_epstate *eps)
+{
+	if (eps->eps_tab)
+		psmi_free(eps->eps_tab);
+	memset(eps, 0, sizeof(*eps));
+	return PSM2_OK;
+}
+
+/*
+ * Add ipsaddr with epid to the epstate table, return new index to caller in
+ * 'connidx'.
+ */
+psm2_error_t
+ips_epstate_add(struct ips_epstate *eps, struct ips_epaddr *ipsaddr,
+		ips_epstate_idx *connidx_o)
+{
+	int i, j;
+	ips_epstate_idx connidx;
+
+	if (++eps->eps_tabsizeused > eps->eps_tabsize) {	/* realloc */
+		struct ips_epstate_entry *newtab;
+		eps->eps_tabsize += PTL_EPADDR_ALLOC_CHUNK;
+		newtab = (struct ips_epstate_entry *)
+		    psmi_calloc(eps->context->ep, PER_PEER_ENDPOINT,
+				eps->eps_tabsize,
+				sizeof(struct ips_epstate_entry));
+		if (newtab == NULL)
+			return PSM2_NO_MEMORY;
+		else if (eps->eps_tab) {	/* NOT first alloc */
+			for (i = 0;
+			     i < eps->eps_tabsize - PTL_EPADDR_ALLOC_CHUNK; i++)
+				newtab[i] = eps->eps_tab[i];	/* deep copy */
+			psmi_free(eps->eps_tab);
+		}
+		eps->eps_tab = newtab;
+	}
+	/* Find the next free hole. We can afford to do this since connect is
+	 * not in the critical path */
+	for (i = 0, j = eps->eps_tab_nextidx; i < eps->eps_tabsize; i++, j++) {
+		if (j == eps->eps_tabsize)
+			j = 0;
+		if (eps->eps_tab[j].ipsaddr == NULL) {
+			eps->eps_tab_nextidx = j + 1;
+			if (eps->eps_tab_nextidx == eps->eps_tabsize)
+				eps->eps_tab_nextidx = 0;
+			break;
+		}
+	}
+	psmi_assert_always(i != eps->eps_tabsize);
+	connidx = (j - eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX-1);
+	_HFI_VDBG("node %s gets connidx=%d (table idx %d)\n",
+		  psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), connidx,
+		  j);
+	eps->eps_tab[j].ipsaddr = ipsaddr;
+	if (j >= IPS_EPSTATE_CONNIDX_MAX) {
+		return psmi_handle_error(eps->context->ep,
+					 PSM2_TOO_MANY_ENDPOINTS,
+					 "Can't connect to more than %d non-local endpoints",
+					 IPS_EPSTATE_CONNIDX_MAX);
+	}
+	*connidx_o = connidx;
+	return PSM2_OK;
+}
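ips_epstate_add() hands out connidx values relative to the randomized eps_base_idx, and lookup/delete undo the offset modulo IPS_EPSTATE_CONNIDX_MAX. A standalone check of that round trip:

#include <assert.h>
#include <stdint.h>

#define CONNIDX_MAX (1<<26)

int main(void)
{
	uint32_t base = 0x3fffff0;	/* any base below CONNIDX_MAX */
	uint32_t slot = 5;		/* table index chosen by the allocator */
	/* Advertised index, as computed in ips_epstate_add()... */
	uint32_t connidx = (slot - base) & (CONNIDX_MAX - 1);
	/* ...recovers the table slot, as in ips_epstate_lookup()/del(). */
	assert(((connidx + base) & (CONNIDX_MAX - 1)) == slot);
	return 0;
}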
+
+psm2_error_t ips_epstate_del(struct ips_epstate *eps, ips_epstate_idx connidx)
+{
+	ips_epstate_idx idx;
+	/* actual table index */
+	idx = (connidx + eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX-1);
+	psmi_assert_always(idx < eps->eps_tabsize);
+	_HFI_VDBG("connidx=%d, table_idx=%d\n", connidx, idx);
+	eps->eps_tab[idx].ipsaddr = NULL;
+	/* We may eventually want to release memory, but probably not */
+	eps->eps_tabsizeused--;
+	return PSM2_OK;
+}
diff --git a/ptl_ips/ips_epstate.h b/ptl_ips/ips_epstate.h
new file mode 100644
index 0000000..7308040
--- /dev/null
+++ b/ptl_ips/ips_epstate.h
@@ -0,0 +1,100 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license. When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED.
+  IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_EPSTATE_H
+#define _IPS_EPSTATE_H
+
+#include "psm_user.h"
+
+typedef uint32_t ips_epstate_idx;
+#define IPS_EPSTATE_CONNIDX_MAX (1<<26)
+
+struct ips_epaddr;
+
+struct ips_epstate_entry {
+	struct ips_epaddr *ipsaddr;
+};
+
+struct ips_epstate {
+	const psmi_context_t *context;
+	ips_epstate_idx eps_base_idx;
+	int eps_tabsize;
+	int eps_tabsizeused;
+	int eps_tab_nextidx;
+
+	struct ips_epstate_entry *eps_tab;
+};
+
+psm2_error_t ips_epstate_init(struct ips_epstate *eps,
+			      const psmi_context_t *context);
+psm2_error_t ips_epstate_fini(struct ips_epstate *eps);
+
+psm2_error_t ips_epstate_add(struct ips_epstate *eps,
+			     struct ips_epaddr *ipsaddr,
+			     ips_epstate_idx *connidx);
+psm2_error_t ips_epstate_del(struct ips_epstate *eps, ips_epstate_idx connidx);
+
+PSMI_INLINE(
+struct ips_epstate_entry *
+ips_epstate_lookup(const struct ips_epstate *eps, ips_epstate_idx idx))
+{
+	idx = (idx + eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX-1);
+	if (idx < eps->eps_tabsize)
+		return &eps->eps_tab[idx];
+	else
+		return NULL;
+}
+
+#endif /* _IPS_EPSTATE_H */
diff --git a/ptl_ips/ips_expected_proto.h b/ptl_ips/ips_expected_proto.h
new file mode 100644
index 0000000..d845193
--- /dev/null
+++ b/ptl_ips/ips_expected_proto.h
@@ -0,0 +1,400 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license. When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+/*
+ * Control and state structure for one instance of the expected protocol. The
+ * protocol depends on some upcalls from internal portions of the receive
+ * protocol (such as opcodes dedicated for expected protocol handling).
+ */
+
+/*
+ * Expected tid operations are carried out over "sessions". One session is a
+ * collection of N tids where N is determined by the expected message window
+ * size (-W option or PSM2_MQ_RNDV_HFI_WINDOW). Since naks can cause
+ * retransmissions, each session has a session index (_desc_idx) and a
+ * generation count (_desc_genc) to be able to identify whether retransmitted
+ * packets reference the correct session.
+ *
+ * The index and generation count are each 4 bytes, encoded in one ptl_arg.
+ * They could be compressed further but we have the header space, so we don't
+ * bother.
+ */
+
+#ifndef __IPS_EXPECTED_PROTO_H__
+
+#define __IPS_EXPECTED_PROTO_H__ 1
+
+#define _desc_idx u32w0
+#define _desc_genc u32w1
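The two defines above overlay the session index and generation count on the two 32-bit words of a ptl_arg. A minimal sketch of that encoding, using a hypothetical stand-in for the real ptl_arg_t (the actual union lives in the ips wire headers and carries additional views):

#include <stdint.h>
#include <assert.h>

/* Stand-in for PSM2's ptl_arg_t; the member names u32w0/u32w1 match
 * the fields the _desc_idx/_desc_genc defines refer to. */
typedef union {
        uint64_t u64;
        struct {
                uint32_t u32w0;
                uint32_t u32w1;
        };
} ptl_arg_t;

#define _desc_idx u32w0
#define _desc_genc u32w1

int main(void)
{
        ptl_arg_t rdescid;

        rdescid._desc_idx = 7;   /* which session table slot */
        rdescid._desc_genc = 3;  /* which incarnation of that slot */

        /* A retransmitted packet references the right session only if
         * both the index and the generation count match. */
        assert(rdescid._desc_idx == 7 && rdescid._desc_genc == 3);
        return 0;
}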
+/*
+ * For debug and/or other reasons, we can log the state of each tid and
+ * optionally associate it to a particular receive descriptor
+ */
+
+#define TIDSTATE_FREE 0
+#define TIDSTATE_USED 1
+
+struct ips_tidinfo {
+	uint32_t tid;
+	uint32_t state;
+	struct ips_tid_recv_desc *tidrecvc;
+};
+
+struct ips_protoexp {
+	const struct ptl *ptl;
+	struct ips_proto *proto;
+	struct psmi_timer_ctrl *timerq;
+	struct ips_tid tidc;
+	struct ips_tf tfc;
+
+	psm_transfer_type_t ctrl_xfer_type;
+	psm_transfer_type_t tid_xfer_type;
+	struct ips_scbctrl tid_scbc_rv;
+	mpool_t tid_desc_send_pool;
+	mpool_t tid_getreq_pool;
+	mpool_t tid_sreq_pool;	/* backptr into proto->ep->mq */
+	mpool_t tid_rreq_pool;	/* backptr into proto->ep->mq */
+	struct drand48_data tidflow_drand48_data;
+	uint32_t tid_flags;
+	uint32_t tid_send_fragsize;
+	uint32_t tid_page_offset_mask;
+	uint64_t tid_page_mask;
+	uint32_t hdr_pkt_interval;
+	struct ips_tidinfo *tid_info;
+
+	STAILQ_HEAD(ips_tid_send_pend,	/* pending exp. sends */
+		    ips_tid_send_desc) pend_sendq;
+	struct psmi_timer timer_send;
+
+	STAILQ_HEAD(ips_tid_get_pend, ips_tid_get_request) pend_getreqsq;	/* pending tid reqs */
+	struct psmi_timer timer_getreqs;
+
+#ifdef PSM_CUDA
+	STAILQ_HEAD(ips_tid_get_cudapend,	/* pending cuda transfers */
+		    ips_tid_get_request) cudapend_getreqsq;
+	struct ips_cuda_hostbuf_mpool_cb_context cuda_hostbuf_recv_cfg;
+	struct ips_cuda_hostbuf_mpool_cb_context cuda_hostbuf_small_recv_cfg;
+	mpool_t cuda_hostbuf_pool_recv;
+	mpool_t cuda_hostbuf_pool_small_recv;
+	CUstream cudastream_recv;
+#endif
+};
+
+/*
+ * TID member list format used in communication.
+ * Since the compiler does not guarantee the bit-field order,
+ * we use the masks and shifts defined below.
+typedef struct {
+	uint32_t length:11;	// in page unit, max 1024 pages
+	uint32_t reserved:9;	// for future usage
+	uint32_t tidctrl:2;	// hardware defined tidctrl value
+	uint32_t tid:10;	// hardware only supports 10 bits
+}
+ips_tid_session_member;
+ */
+#define IPS_TIDINFO_LENGTH_SHIFT 0
+#define IPS_TIDINFO_LENGTH_MASK 0x7ff
+#define IPS_TIDINFO_TIDCTRL_SHIFT 20
+#define IPS_TIDINFO_TIDCTRL_MASK 0x3
+#define IPS_TIDINFO_TID_SHIFT 22
+#define IPS_TIDINFO_TID_MASK 0x3ff
+
+#define IPS_TIDINFO_GET_LENGTH(tidinfo) \
+	(((tidinfo)>>IPS_TIDINFO_LENGTH_SHIFT)&IPS_TIDINFO_LENGTH_MASK)
+#define IPS_TIDINFO_GET_TIDCTRL(tidinfo) \
+	(((tidinfo)>>IPS_TIDINFO_TIDCTRL_SHIFT)&IPS_TIDINFO_TIDCTRL_MASK)
+#define IPS_TIDINFO_GET_TID(tidinfo) \
+	(((tidinfo)>>IPS_TIDINFO_TID_SHIFT)&IPS_TIDINFO_TID_MASK)
+
+typedef struct ips_tid_session_list_tag {
+	uint8_t tsess_unaligned_start;	/* unaligned bytes at the start */
+	uint8_t tsess_unaligned_end;	/* unaligned bytes at the end */
+	uint16_t tsess_tidcount;	/* number of tids for the session */
+	uint32_t tsess_tidoffset;	/* offset in first tid */
+	uint32_t tsess_srcoff;	/* source offset from beginning */
+	uint32_t tsess_length;	/* session length, including start/end */
+
+	uint32_t tsess_list[0];	/* must be last in struct */
+} ips_tid_session_list;
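Each entry of tsess_list packs one member in the mask-and-shift layout defined above. A self-contained sketch of packing and unpacking such an entry (the masks and GET macros are copied from this header so the snippet compiles on its own; the pack helper is mine, not PSM's):

#include <stdint.h>
#include <assert.h>

/* Copied from ips_expected_proto.h above. */
#define IPS_TIDINFO_LENGTH_SHIFT 0
#define IPS_TIDINFO_LENGTH_MASK 0x7ff
#define IPS_TIDINFO_TIDCTRL_SHIFT 20
#define IPS_TIDINFO_TIDCTRL_MASK 0x3
#define IPS_TIDINFO_TID_SHIFT 22
#define IPS_TIDINFO_TID_MASK 0x3ff
#define IPS_TIDINFO_GET_LENGTH(tidinfo) \
        (((tidinfo)>>IPS_TIDINFO_LENGTH_SHIFT)&IPS_TIDINFO_LENGTH_MASK)
#define IPS_TIDINFO_GET_TIDCTRL(tidinfo) \
        (((tidinfo)>>IPS_TIDINFO_TIDCTRL_SHIFT)&IPS_TIDINFO_TIDCTRL_MASK)
#define IPS_TIDINFO_GET_TID(tidinfo) \
        (((tidinfo)>>IPS_TIDINFO_TID_SHIFT)&IPS_TIDINFO_TID_MASK)

/* Hypothetical helper: pack length (in pages), tidctrl and tid into the
 * single 32-bit wire word that the GET macros decode. */
static uint32_t ips_tidinfo_pack(uint32_t length, uint32_t tidctrl,
                                 uint32_t tid)
{
        return ((length & IPS_TIDINFO_LENGTH_MASK) << IPS_TIDINFO_LENGTH_SHIFT) |
               ((tidctrl & IPS_TIDINFO_TIDCTRL_MASK) << IPS_TIDINFO_TIDCTRL_SHIFT) |
               ((tid & IPS_TIDINFO_TID_MASK) << IPS_TIDINFO_TID_SHIFT);
}

int main(void)
{
        uint32_t w = ips_tidinfo_pack(512, 2, 0x155);

        assert(IPS_TIDINFO_GET_LENGTH(w) == 512);
        assert(IPS_TIDINFO_GET_TIDCTRL(w) == 2);
        assert(IPS_TIDINFO_GET_TID(w) == 0x155);
        return 0;
}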
+
+/*
+ * Send-side expected send descriptors.
+ *
+ * Descriptors are allocated when tid grant requests are received (the 'target'
+ * side of an RDMA get request). Descriptors are added to a pending queue of
+ * expected sends and processed one at a time (scb's are requested and messages
+ * sent until all fragments of the descriptor's length are put on the wire).
+ *
+ */
+#define TIDSENDC_SDMA_VEC_DEFAULT	260
+
+struct ips_tid_send_desc {
+	struct ips_protoexp *protoexp;
+	STAILQ_ENTRY(ips_tid_send_desc) next;
+
+	/* Filled in at allocation time */
+	ptl_arg_t sdescid;	/* sender descid */
+	ptl_arg_t rdescid;	/* receiver descid */
+	ips_epaddr_t *ipsaddr;
+	psm2_mq_req_t mqreq;
+
+	/* tidflow to send tid traffic */
+	struct ips_flow tidflow;
+
+	/* Iterated during send progress */
+	void *userbuf;		/* user provided buffer */
+	void *buffer;
+	uint32_t length;	/* total length, including start/end */
+
+	uint32_t tidbytes;	/* bytes sent over tid so far */
+	uint32_t remaining_tidbytes;
+	uint32_t offset_in_tid;	/* could be more than page */
+	uint32_t remaining_bytes_in_tid;
+
+	uint16_t frame_send;
+	uint16_t tid_idx;
+	uint16_t is_complete;
+	uint16_t frag_size;
+	/* bitmap of queued control messages for flow */
+	uint16_t ctrl_msg_queued;
+
+#ifdef PSM_CUDA
+	/* As the size of a cuda_hostbuf is less than or equal to the window
+	 * size, there is a guarantee that the maximum number of host bufs we
+	 * would need to attach to a tidsendc would be 2
+	 */
+	struct ips_cuda_hostbuf *cuda_hostbuf[2];
+	/* Number of hostbufs attached */
+	uint8_t cuda_num_buf;
+#endif
+	/*
+	 * tid_session_list is 24 bytes, plus 512 tidpairs for 2048 bytes,
+	 * so the max possible tid window size mq->hfi_base_window_rv is 4M.
+	 * However, PSM must fit the tid grant message into a single transfer
+	 * unit, either PIO or SDMA, so PSM will shrink the window accordingly.
+	 */
+	uint16_t tsess_tidlist_length;
+	union {
+		ips_tid_session_list tid_list;
+		uint8_t filler[PSM_TIDLIST_BUFSIZE+
+			sizeof(ips_tid_session_list)];
+	};
+};
+
+#define TIDRECVC_STATE_FREE 0
+#define TIDRECVC_STATE_BUSY 1
+
+struct ips_expected_recv_stats {
+	uint32_t nSeqErr;
+	uint32_t nGenErr;
+	uint32_t nReXmit;
+	uint32_t nErrChkReceived;
+};
+
+struct ips_tid_recv_desc {
+	const psmi_context_t *context;
+	struct ips_protoexp *protoexp;
+
+	ptl_arg_t rdescid;	/* receiver descid */
+	ips_epaddr_t *ipsaddr;
+	struct ips_tid_get_request *getreq;
+
+	/* scb to send tid grant CTS */
+	ips_scb_t *grantscb;
+	/* scb to send tid data completion */
+	ips_scb_t *completescb;
+
+	/* tidflow to only send ctrl msg ACK and NAK */
+	struct ips_flow tidflow;
+
+	/* TF protocol state (recv) */
+	uint32_t state;
+	uint32_t tidflow_active_gen;
+	uint32_t tidflow_nswap_gen;
+	psmi_seqnum_t tidflow_genseq;
+
+#ifdef PSM_CUDA
+	struct ips_cuda_hostbuf *cuda_hostbuf;
+	uint8_t is_ptr_gpu_backed;
+#endif
+
+	void *buffer;
+	uint32_t recv_msglen;
+	uint32_t recv_tidbytes;	/* excludes start/end trim */
+
+	struct ips_expected_recv_stats stats;
+
+	/* bitmap of queued control messages for */
+	uint16_t ctrl_msg_queued;
+	/*
+	 * tid_session_list is 24 bytes, plus 512 tidpairs for 2048 bytes,
+	 * so the max possible tid window size mq->hfi_base_window_rv is 4M.
+	 * However, PSM must fit the tid grant message into a single transfer
+	 * unit, either PIO or SDMA, so PSM will shrink the window accordingly.
+	 */
+	uint16_t tsess_tidlist_length;
+	union {
+		ips_tid_session_list tid_list;
+		uint8_t filler[PSM_TIDLIST_BUFSIZE+
+			sizeof(ips_tid_session_list)];
+	};
+};
+
+/*
+ * Get requests, issued by MQ when there's a match on a large message. Unlike
+ * an RDMA get, the initiator identifies the location of the data at the target
+ * using a 'send token' instead of a virtual address. This, of course, assumes
+ * that the target has already registered the token and communicated it to the
+ * initiator beforehand (it actually sends the token as part of the initial
+ * MQ message that contains the MQ tag).
+ *
+ * The operation is semantically a two-sided RDMA get.
+ */
+typedef void (*ips_tid_completion_callback_t) (void *);
+
+struct ips_tid_get_request {
+	STAILQ_ENTRY(ips_tid_get_request) tidgr_next;
+	struct ips_protoexp *tidgr_protoexp;
+	psm2_epaddr_t tidgr_epaddr;
+
+	void *tidgr_lbuf;
+	uint32_t tidgr_length;
+	uint32_t tidgr_rndv_winsz;
+	uint32_t tidgr_sendtoken;
+	ips_tid_completion_callback_t tidgr_callback;
+	void *tidgr_ucontext;
+
+	uint32_t tidgr_offset;	/* offset in bytes */
+	uint32_t tidgr_bytesdone;
+	uint32_t tidgr_flags;
+
+#ifdef PSM_CUDA
+	int cuda_hostbuf_used;
+	uint32_t tidgr_cuda_bytesdone;
+	STAILQ_HEAD(ips_tid_getreq_cuda_hostbuf_pend,	/* pending exp. sends */
+		    ips_cuda_hostbuf) pend_cudabuf;
+#endif
+};
+
+/*
+ * Descriptor limits, structure contents of struct psmi_rlimit_mpool for
+ * normal, min and large configurations.
+ */
+#define TID_SENDSESSIONS_LIMITS {				\
+	    .env = "PSM2_TID_SENDSESSIONS_MAX",			\
+	    .descr = "Tid max send session descriptors",	\
+	    .env_level = PSMI_ENVVAR_LEVEL_HIDDEN,		\
+	    .minval = 1,					\
+	    .maxval = 1<<30,					\
+	    .mode[PSMI_MEMMODE_NORMAL] = { 256, 8192 },		\
+	    .mode[PSMI_MEMMODE_MINIMAL] = { 1, 1 },		\
+	    .mode[PSMI_MEMMODE_LARGE] = { 512, 16384 }		\
+	}
+
+/*
+ * Expected send support
+ */
+/*
+ * The expsend token is currently always a pointer to a MQ request. It is
+ * echoed on the wire throughout various phases of the expected send protocol
+ * to identify a particular send.
+ */ +psm2_error_t +MOCKABLE(ips_protoexp_init)(const psmi_context_t *context, + const struct ips_proto *proto, + uint32_t protoexp_flags, int num_of_send_bufs, + int num_of_send_desc, + struct ips_protoexp **protoexp_o); +MOCK_DCL_EPILOGUE(ips_protoexp_init); + +psm2_error_t ips_protoexp_fini(struct ips_protoexp *protoexp); +void ips_protoexp_handle_tiderr(const struct ips_recvhdrq_event *rcv_ev); +void ips_protoexp_handle_data_err(const struct ips_recvhdrq_event *rcv_ev); +void ips_protoexp_handle_tf_seqerr(const struct ips_recvhdrq_event *rcv_ev); +void ips_protoexp_handle_tf_generr(const struct ips_recvhdrq_event *rcv_ev); + +int ips_protoexp_data(struct ips_recvhdrq_event *rcv_ev); +int ips_protoexp_recv_tid_completion(struct ips_recvhdrq_event *rcv_ev); + +psm2_error_t ips_protoexp_flow_newgen(struct ips_tid_recv_desc *tidrecvc); + +PSMI_ALWAYS_INLINE( +void ips_protoexp_unaligned_copy(uint8_t *dst, uint8_t *src, uint16_t len)) +{ + while (len) { + dst[len-1] = src[len-1]; + len--; + } +} + +/* + * Peer is waiting (blocked) for this request + */ +#define IPS_PROTOEXP_TIDGET_WAIT 0x1 +#define IPS_PROTOEXP_TIDGET_PEERWAIT 0x2 +psm2_error_t ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp, + void *buf, uint32_t length, + psm2_epaddr_t epaddr, + uint32_t remote_tok, uint32_t flags, + ips_tid_completion_callback_t + callback, void *context); +psm2_error_t +ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, + ips_epaddr_t *ipsaddr, psm2_mq_req_t req, + ptl_arg_t rdescid, uint32_t tidflow_genseq, + ips_tid_session_list *tid_list, + uint32_t tid_list_size); +#endif /* #ifndef __IPS_EXPECTED_PROTO_H__ */ diff --git a/ptl_ips/ips_opp_path_rec.c b/ptl_ips/ips_opp_path_rec.c new file mode 100644 index 0000000..4b0fc3f --- /dev/null +++ b/ptl_ips/ips_opp_path_rec.c @@ -0,0 +1,609 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "ips_proto.h"
+#include <dlfcn.h>
+
+/* SLID and DLID are in network byte order */
+static psm2_error_t
+ips_opp_get_path_rec(ips_path_type_t type, struct ips_proto *proto,
+		     uint16_t slid, uint16_t dlid, uint16_t desthfi_type,
+		     ips_path_rec_t **ppath_rec)
+{
+	psm2_error_t err = PSM2_OK;
+	ibta_path_rec_t query, opp_response;
+#ifdef _HFI_DEBUGGING
+	int opp_response_set = 0;
+#endif
+	ips_path_rec_t *path_rec;
+	int opp_err;
+	ENTRY elid, *epath = NULL;
+	char eplid[128];
+	uint64_t timeout_ack_ms;
+
+	/* Query the path record cache first */
+	bzero(&query, sizeof(query));
+	bzero(eplid, sizeof(eplid));
+
+	/* Bulk service ID is the control service id + 1 */
+	switch (type) {
+	case IPS_PATH_LOW_PRIORITY:
+		query.service_id =
+		    __cpu_to_be64(proto->ep->service_id + DATA_VFABRIC_OFFSET);
+		break;
+	case IPS_PATH_NORMAL_PRIORITY:
+	case IPS_PATH_HIGH_PRIORITY:
+	default:
+		query.service_id = __cpu_to_be64(proto->ep->service_id);
+	}
+
+	query.slid = slid;
+	query.dlid = dlid;
+
+	snprintf(eplid, sizeof(eplid), "%s_%x_%x",
+		 (type == IPS_PATH_LOW_PRIORITY) ? "LOW" : "HIGH",
+		 query.slid, query.dlid);
+	elid.key = eplid;
+	hsearch_r(elid, FIND, &epath, &proto->ips_path_rec_hash);
+
+	if (!epath) {	/* Unable to find path record in cache */
+		elid.key =
+		    psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1);
+		path_rec = (ips_path_rec_t *)
+		    psmi_calloc(proto->ep, UNDEFINED, 1,
+				sizeof(ips_path_rec_t));
+		if (!elid.key || !path_rec) {
+			if (elid.key)
+				psmi_free(elid.key);
+			if (path_rec)
+				psmi_free(path_rec);
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+
+		/* Get path record between local LID and remote */
+		opp_err =
+		    proto->opp_fn.op_path_get_path_by_rec(proto->opp_ctxt,
+							  &query,
+							  &opp_response);
+		if (opp_err) {
+			psmi_free(path_rec);
+			psmi_free(elid.key);
+			err = PSM2_EPID_PATH_RESOLUTION;
+			goto fail;
+		}
+#ifdef _HFI_DEBUGGING
+		opp_response_set = 1;
+#endif
+		/* Create path record */
+		path_rec->pr_slid = opp_response.slid;
+		path_rec->pr_dlid = opp_response.dlid;
+		path_rec->pr_mtu =
+		    min(opa_mtu_enum_to_int(opp_response.mtu & 0x3f),
+			proto->epinfo.ep_mtu);
+		path_rec->pr_pkey = ntohs(opp_response.pkey);
+		path_rec->pr_sl = ntohs(opp_response.qos_class_sl);
+		path_rec->pr_static_ipd =
+		    proto->ips_ipd_delay[opp_response.rate & 0x3f];
+
+		/* Setup CCA parameters for path */
+		if (path_rec->pr_sl > PSMI_SL_MAX) {
+			psmi_free(path_rec);
+			psmi_free(elid.key);
+			err = PSM2_INTERNAL_ERR;
+			goto fail;
+		}
+		if (!(proto->ccti_ctrlmap & (1 << path_rec->pr_sl))) {
+			_HFI_CCADBG("No CCA for sl %d, disable CCA\n",
+				    path_rec->pr_sl);
+			proto->flags &= ~IPS_PROTO_FLAG_CCA;
+			proto->flags &= ~IPS_PROTO_FLAG_CCA_PRESCAN;
+		}
+		if (!psmi_hal_has_cap(PSM_HAL_CAP_STATIC_RATE_CTRL)) {
+			_HFI_CCADBG("No Static-Rate-Control, disable CCA\n");
+			proto->flags &= ~IPS_PROTO_FLAG_CCA;
+			proto->flags &= ~IPS_PROTO_FLAG_CCA_PRESCAN;
+		}
+
+		path_rec->proto = proto;
+		path_rec->pr_ccti = proto->cace[path_rec->pr_sl].ccti_min;
+		path_rec->pr_timer_cca = NULL;
+
+		/* Determine the active IPD for the path: the max of the
+		 * static rate and the CCT table entry. */
+		if (!(proto->flags & IPS_PROTO_FLAG_CCA)) {
+			path_rec->pr_active_ipd = 0;
+			path_rec->pr_cca_divisor = 0;
+		} else if ((path_rec->pr_static_ipd) &&
+			   ((path_rec->pr_static_ipd + 1) >
+			    (proto->cct[path_rec->pr_ccti] & CCA_IPD_MASK))) {
+			path_rec->pr_active_ipd = path_rec->pr_static_ipd + 1;
+			path_rec->pr_cca_divisor = 0;	/* Static rate has no CCA divisor */
+		} else {
+			/* Pick it from the CCT table */
+			path_rec->pr_active_ipd =
+			    proto->cct[path_rec->pr_ccti] & CCA_IPD_MASK;
+			path_rec->pr_cca_divisor =
+			    proto->cct[path_rec->pr_ccti] >> CCA_DIVISOR_SHIFT;
+		}
+
+		/* Compute max timeout based on the pkt life time for the path */
+		timeout_ack_ms =
+		    ((4096UL * (1UL << (opp_response.pkt_life & 0x3f))) /
+		     1000000UL);
+		timeout_ack_ms =
+		    ms_2_cycles(IPS_PROTO_ERRCHK_MS_MIN_DEFAULT +
+				timeout_ack_ms);
+		if (proto->epinfo.ep_timeout_ack_max < timeout_ack_ms)
+			proto->epinfo.ep_timeout_ack_max = timeout_ack_ms;
+
+		/* Add path record into cache */
+		strcpy(elid.key, eplid);
+		elid.data = (void *)path_rec;
+		hsearch_r(elid, ENTER, &epath, &proto->ips_path_rec_hash);
+	} else		/* Path record found in cache */
+		path_rec = (ips_path_rec_t *) epath->data;
+
+#ifdef _HFI_DEBUGGING
+	/* Dump path record stats */
+	_HFI_PRDBG("Path Record ServiceID: %" PRIx64 " %x -----> %x\n",
+		   (uint64_t) __be64_to_cpu(query.service_id),
+		   __be16_to_cpu(slid), __be16_to_cpu(dlid));
+	if (opp_response_set)
+	{
+		_HFI_PRDBG("MTU: %x, %x\n", (opp_response.mtu & 0x3f),
+			   path_rec->pr_mtu);
+		_HFI_PRDBG("PKEY: 0x%04x\n", ntohs(opp_response.pkey));
+		_HFI_PRDBG("SL: 0x%04x\n", ntohs(opp_response.qos_class_sl));
+		_HFI_PRDBG("Rate: %x, IPD: %x\n", (opp_response.rate & 0x3f),
+			   path_rec->pr_static_ipd);
+	}
+	_HFI_PRDBG("Timeout Init.: 0x%" PRIx64 " Max: 0x%" PRIx64 "\n",
+		   proto->epinfo.ep_timeout_ack,
+		   proto->epinfo.ep_timeout_ack_max);
+#endif
+	/* Return the IPS path record */
+	*ppath_rec = path_rec;
+
+fail:
+	return err;
+}
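The timeout derivation above follows the IBTA convention: pkt_life is a base-2 exponent over 4.096 microsecond units. A small, self-contained illustration of the same arithmetic (the function name is mine, not PSM's):

#include <stdio.h>
#include <stdint.h>

/* IBTA PacketLifeTime is an exponent n: lifetime = 4.096 us * 2^n.
 * 4096 * 2^n is that lifetime in nanoseconds; dividing by 1000000
 * yields whole milliseconds, as in ips_opp_get_path_rec above. */
static uint64_t pkt_life_to_ms(uint8_t pkt_life)
{
        return (4096ULL * (1ULL << (pkt_life & 0x3f))) / 1000000ULL;
}

int main(void)
{
        /* n = 16: 4.096 us * 65536 is roughly 268 ms */
        printf("pkt_life 16 -> %llu ms\n",
               (unsigned long long)pkt_life_to_ms(16));
        return 0;
}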
+
+static psm2_error_t
+ips_opp_path_rec(struct ips_proto *proto,
+		 uint16_t slid, uint16_t dlid, uint16_t desthfi_type,
+		 unsigned long timeout, ips_path_grp_t **ppathgrp)
+{
+	psm2_error_t err = PSM2_OK;
+	uint16_t pidx, cpath, num_path = (1 << proto->epinfo.ep_lmc);
+	ips_path_type_t path_type = IPS_PATH_NORMAL_PRIORITY;
+	ips_path_rec_t *path;
+	ips_path_grp_t *pathgrp;
+	uint16_t path_slid, path_dlid;
+	ENTRY elid, *epath = NULL;
+	char eplid[128];
+
+	/*
+	 * High Priority Path
+	 * ------------------
+	 *
+	 * Uses the "base" Service ID. For now there exists only 1 high priority
+	 * path between nodes even for non zero LMC fabrics.
+	 *
+	 * Normal/Low Priority Paths
+	 * -------------------------
+	 *
+	 * Currently these paths are the same i.e. they are queried for the same
+	 * Service ID/vFabric which is the Base Service ID for High Priority + 1.
+	 *
+	 * Use case Scenarios
+	 * ------------------
+	 *
+	 * Since with vFabrics we have the capability to define different QoS
+	 * parameters per vFabric it is envisioned that the IPS_PATH_HIGH_PRIORITY is
+	 * setup in a separate vFabric for high priority traffic. The NORMAL paths
+	 * are setup in a separate vFabric optimized for high bandwidth. This allows
+	 * us to potentially have control traffic (RTS, CTS etc.) not be bottlenecked
+	 * by bulk transfer data. All control messages (ACKs, NAKs, TID_GRANT etc.)
+	 * also use the high priority control vFabric.
+	 *
+	 * NOTE: In order to distinguish between the different vFabrics the user
+	 * specifies the service ID to use via mpirun (or environment variable).
+	 * This is the service ID for the high priority control traffic. The bulk
+	 * data vFabric is identified by service ID + 1. So for each MPI application
+	 * one should specify two service IDs for the high priority and bulk data.
+	 * Both these service IDs can be placed in the same vFabric which can be
+	 * configured for high priority or bandwidth traffic, giving us the default
+	 * behavior up to the InfiniPath 2.5 release.
+	 *
+	 * NOTE: All of the above would have really helped if the S20 silicon could
+	 * correctly support IBTA QoS features. Due to the S20 design we can only
+	 * have a high priority VLarb table (a low priority VLarb table results in
+	 * round robin arbitration ignoring the weights!). But if this is fixed in a
+	 * subsequent chip respin then this may potentially help our scalability
+	 * on large fabrics.
+	 *
+	 * Mesh/Torus and DOR routed networks
+	 * ----------------------------------
+	 *
+	 * In a mesh/torus fabric we always have a non zero LMC (at least 1, can be
+	 * more). We would like to take advantage of dispersive routing on these
+	 * fabrics as well to obtain better "worst case/congested" bandwidth. For
+	 * these networks currently the base LIDs are used for UPDN routing, which
+	 * is suboptimal on these networks. Higher order LIDs (+1 .. +N) use DOR
+	 * routing (Dimension Ordered Routing) to avoid deadlocks and provide
+	 * higher performance. If a fabric is disrupted then only the base UPDN
+	 * routing is available. PSM should continue to operate in this environment
+	 * albeit with degraded performance. In a disrupted fabric the OPP path
+	 * record queries may fail for some DOR routed LIDs, i.e. no path exists.
+	 * PSM should hence ignore path record failures as they indicate a disrupted
+	 * fabric and only use valid paths that are returned from the replica. This
+	 * will degenerate to only using the UPDN paths on disrupted fabrics and DOR
+	 * routes only for fully configured fabrics. Note: For a clean fabric the
+	 * base LIDs that are configured for UPDN route will not exist in the replica
+	 * as DOR routes are preferred. Hence we will only dispersively route across
+	 * the DOR routes, only using the UPDN route for disrupted fabrics.
+	 *
+	 * AS LONG AS ONE PATH EXISTS (for each of the priorities) COMMUNICATION CAN
+	 * TAKE PLACE.
+	 */
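The lookup functions in this file share a FIND-then-ENTER idiom around glibc's reentrant hash API, keyed by a printable LID-pair string; the code below applies it to the path-group cache. A stand-alone sketch of that idiom (the cached value is a placeholder string here, rather than a real ips_path_grp_t):

#define _GNU_SOURCE	/* for hcreate_r/hsearch_r */
#include <search.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        struct hsearch_data htab;
        ENTRY elid, *epath = NULL;
        char eplid[128];

        memset(&htab, 0, sizeof(htab));
        hcreate_r(64, &htab);

        /* Key is "slid_dlid" in hex, as in ips_opp_path_rec below */
        snprintf(eplid, sizeof(eplid), "%x_%x", 0x10, 0x40);

        elid.key = eplid;
        hsearch_r(elid, FIND, &epath, &htab);
        if (!epath) {
                /* Miss: insert under a heap-allocated copy of the key */
                elid.key = strdup(eplid);
                elid.data = (void *)"resolved-path-group";
                hsearch_r(elid, ENTER, &epath, &htab);
        }
        printf("%s -> %s\n", epath->key, (char *)epath->data);
        hdestroy_r(&htab);	/* note: does not free the strdup'ed key */
        return 0;
}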
+
+	/* Check if this path grp is already in hash table */
+	snprintf(eplid, sizeof(eplid), "%x_%x", slid, dlid);
+	elid.key = eplid;
+	hsearch_r(elid, FIND, &epath, &proto->ips_path_grp_hash);
+
+	if (epath) {	/* Found path group in cache */
+		*ppathgrp = (ips_path_grp_t *) epath->data;
+		return err;
+	}
+
+	/* If only base lids are used then reset num_path to 1 */
+	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_BASE)
+		num_path = 1;
+
+	/* Allocate a new pathgroup */
+	elid.key = psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1);
+	pathgrp = (ips_path_grp_t *)
+	    psmi_calloc(proto->ep, UNDEFINED, 1, sizeof(ips_path_grp_t) +
+			num_path * IPS_PATH_MAX_PRIORITY *
+			sizeof(ips_path_rec_t *));
+	if (!elid.key || !pathgrp) {
+		if (elid.key)
+			psmi_free(elid.key);
+		if (pathgrp)
+			psmi_free(pathgrp);
+		err = PSM2_NO_MEMORY;
+		goto fail;
+	}
+
+	/*
+	 * dlid is the peer base lid.
+	 * slid is the base lid for the local end point.
+	 * Store here in network byte order.
+	 */
+	pathgrp->pg_base_dlid = dlid;
+	pathgrp->pg_base_slid = slid;
+
+	pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY] =
+	    pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY] =
+	    pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY] = 0;
+
+	/* For now there is always only one high priority path between nodes. */
+	for (pidx = 0, cpath = 0; pidx < num_path && cpath == 0; pidx++) {
+		path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx);
+		path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx);
+
+		err = ips_opp_get_path_rec(IPS_PATH_HIGH_PRIORITY, proto,
+					   path_slid, path_dlid,
+					   desthfi_type, &path);
+
+		if (err == PSM2_OK) {	/* Valid high priority path found */
+			/* Resolved high priority path successfully */
+			pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY]++;
+			pathgrp->pg_path[cpath][IPS_PATH_HIGH_PRIORITY] = path;
+
+			/* Increment current path index */
+			cpath++;
+
+			PSM2_LOG_MSG("path %p slid %hu dlid %hu\n",
+				     path,
+				     __be16_to_cpu(path->pr_slid),
+				     __be16_to_cpu(path->pr_dlid));
+		}
+	}
+
+	/* Make sure we have at least 1 high priority path */
+	if (pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY] == 0) {
+		psmi_free(elid.key);
+		psmi_free(pathgrp);
+		err = psmi_handle_error(NULL, PSM2_EPID_PATH_RESOLUTION,
+					"OFED Plus path lookup failed. Unable to resolve high priority network path for LID 0x%x <---> 0x%x. Is the SM running or service ID %"
+					PRIx64 " defined?", ntohs(slid),
+					ntohs(dlid),
+					(uint64_t) proto->ep->service_id);
+		goto fail;
+	}
+
+	/* Once we have the high-priority path, set the partition key */
+	if (psmi_hal_set_pkey(proto->ep->context.psm_hw_ctxt,
+			      (uint16_t) pathgrp->pg_path[0][IPS_PATH_HIGH_PRIORITY]->pr_pkey)
+	    != 0) {
+		err = psmi_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE,
+					"Couldn't set device pkey 0x%x: %s",
+					(int)pathgrp->pg_path[0][IPS_PATH_HIGH_PRIORITY]->pr_pkey,
+					strerror(errno));
+		psmi_free(elid.key);
+		psmi_free(pathgrp);
+		goto fail;
+	}
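Each pass of the resolution loops derives a path's LIDs by offsetting the base LIDs within the 2^LMC range; dispersive routing then spreads traffic over those LIDs. A stand-alone sketch of that enumeration (htons/ntohs stand in for the kernel's __cpu_to_be16/__be16_to_cpu, and the base LIDs are made up):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

int main(void)
{
        /* Hypothetical base LIDs, stored in network byte order as in
         * ips_path_grp_t. */
        uint16_t base_slid = htons(0x10);
        uint16_t base_dlid = htons(0x40);
        uint8_t lmc = 2;	/* fabric LMC: 2^2 = 4 LIDs per port */
        uint16_t num_path = 1 << lmc;

        for (uint16_t pidx = 0; pidx < num_path; pidx++) {
                uint16_t path_slid = htons(ntohs(base_slid) + pidx);
                uint16_t path_dlid = htons(ntohs(base_dlid) + pidx);
                printf("path %u: slid 0x%x dlid 0x%x\n", pidx,
                       ntohs(path_slid), ntohs(path_dlid));
        }
        return 0;
}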
+
+	/* Next setup the bulk paths. If the subnet administrator has misconfigured
+	 * or rather not configured two separate service IDs we place the bulk
+	 * paths in the same vFabric as the control paths.
+	 */
+	path_type = IPS_PATH_NORMAL_PRIORITY;
+	for (pidx = 0, cpath = 0; pidx < num_path; pidx++) {
+		path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx);
+		path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx);
+
+retry_normal_path_res:
+		err = ips_opp_get_path_rec(path_type, proto,
+					   path_slid, path_dlid, desthfi_type,
+					   &path);
+		if (err != PSM2_OK) {
+			if (path_type == IPS_PATH_NORMAL_PRIORITY) {
+				/* Subnet may only be configured for one service ID/vFabric. Default
+				 * to using the control vFabric/service ID for bulk data as well.
+				 */
+				path_type = IPS_PATH_HIGH_PRIORITY;
+				goto retry_normal_path_res;
+			}
+
+			/* Unable to resolve this path. This is possible for
+			 * disrupted fabrics using DOR routing, so continue to
+			 * acquire the remaining paths.
+			 */
+			err = PSM2_OK;
+			continue;
+		}
+
+		/* Valid path. */
+		pathgrp->pg_path[cpath][IPS_PATH_NORMAL_PRIORITY] = path;
+		pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY]++;
+		cpath++;
+	}
+
+	/* Make sure we have at least a single bulk data transfer path */
+	if (pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY] == 0) {
+		psmi_free(elid.key);
+		psmi_free(pathgrp);
+		err = psmi_handle_error(NULL, PSM2_EPID_PATH_RESOLUTION,
+					"OFED Plus path lookup failed. Unable to resolve normal priority network path for LID 0x%x <---> 0x%x. Is the SM running or service ID %"
+					PRIx64 " defined?", ntohs(slid),
+					ntohs(dlid),
+					(uint64_t) proto->ep->service_id);
+		goto fail;
+	}
+
+	path_type = IPS_PATH_LOW_PRIORITY;
+	for (pidx = 0, cpath = 0; pidx < num_path; pidx++) {
+		path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx);
+		path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx);
+
+retry_low_path_res:
+		err = ips_opp_get_path_rec(path_type, proto,
+					   path_slid, path_dlid, desthfi_type,
+					   &path);
+		if (err != PSM2_OK) {
+			if (path_type == IPS_PATH_LOW_PRIORITY) {
+				/* Subnet may only be configured for one service ID/vFabric. Default
+				 * to using the control vFabric/service ID for bulk data as well.
+				 */
+				path_type = IPS_PATH_HIGH_PRIORITY;
+				goto retry_low_path_res;
+			}
+
+			/* Unable to resolve this path. This is possible for
+			 * disrupted fabrics using DOR routing, so continue to
+			 * acquire the remaining paths.
+			 */
+			err = PSM2_OK;
+			continue;
+		}
+
+		/* Valid path. */
+		pathgrp->pg_path[cpath][IPS_PATH_LOW_PRIORITY] = path;
+		pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY]++;
+		cpath++;
+	}
+
+	/* Make sure we have at least a single bulk data transfer path */
+	if (pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY] == 0) {
+		psmi_free(elid.key);
+		psmi_free(pathgrp);
+		err = psmi_handle_error(NULL, PSM2_EPID_PATH_RESOLUTION,
+					"OFED Plus path lookup failed. Unable to resolve low priority network path for LID 0x%x <---> 0x%x. Is the SM running or service ID %"
+					PRIx64 " defined?", ntohs(slid),
+					ntohs(dlid),
+					(uint64_t) proto->ep->service_id);
+		goto fail;
+	}
+
+	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) {
+		pathgrp->pg_next_path[IPS_PATH_NORMAL_PRIORITY] =
+		    proto->epinfo.ep_context %
+		    pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY];
+		pathgrp->pg_next_path[IPS_PATH_LOW_PRIORITY] =
+		    proto->epinfo.ep_context %
+		    pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY];
+	}
+
+	/* Add path group into cache */
+	strcpy(elid.key, eplid);
+	elid.data = (void *)pathgrp;
+	hsearch_r(elid, ENTER, &epath, &proto->ips_path_grp_hash);
+
+	*ppathgrp = pathgrp;
+
+fail:
+	if (err != PSM2_OK)
+		_HFI_PRDBG
+		    ("Unable to get path record for LID 0x%x <---> DLID 0x%x.\n",
+		     slid, dlid);
+	return err;
+}
+
+static psm2_error_t ips_opp_fini(struct ips_proto *proto)
+{
+	psm2_error_t err = PSM2_OK;
+
+	if (proto->opp_lib)
+		dlclose(proto->opp_lib);
+
+	return err;
+}
+
+psm2_error_t ips_opp_init(struct ips_proto *proto)
+{
+	psm2_error_t err = PSM2_OK;
+	char hfiName[32];
+
+	proto->opp_lib = dlopen(DF_OPP_LIBRARY, RTLD_NOW);
+	if (!proto->opp_lib) {
+		char *err = dlerror();
+		_HFI_ERROR
+		    ("Unable to open OFED Plus Plus library %s. Error: %s\n",
+		     DF_OPP_LIBRARY, err ? err : "no dlerror()");
+		goto fail;
+	}
+
+	/* Resolve the symbols that we require within the opp library */
+	proto->opp_fn.op_path_find_hca =
+	    dlsym(proto->opp_lib, "op_path_find_hfi");
+	proto->opp_fn.op_path_open = dlsym(proto->opp_lib, "op_path_open");
+	proto->opp_fn.op_path_close = dlsym(proto->opp_lib, "op_path_close");
+	proto->opp_fn.op_path_get_path_by_rec =
+	    dlsym(proto->opp_lib, "op_path_get_path_by_rec");
+
+	/* If we can't resolve any symbol then fail to load the opp module */
+	if (!proto->opp_fn.op_path_find_hca || !proto->opp_fn.op_path_open ||
+	    !proto->opp_fn.op_path_close
+	    || !proto->opp_fn.op_path_get_path_by_rec) {
+		_HFI_ERROR
+		    ("Unable to resolve symbols in OPP library. Unloading.\n");
+		goto fail;
+	}
+
+	/* If PSM2_IDENTIFY is set display the OPP library location being used.
*/ + if (getenv("PSM2_IDENTIFY")) { + Dl_info info_opp; + printf + ("PSM2 path record queries using OFED Plus Plus (%s) from %s\n", + DF_OPP_LIBRARY, dladdr(proto->opp_fn.op_path_open, + &info_opp) ? info_opp. + dli_fname : + "Unknown/unsupported version of OPP library found!"); + } + + /* Obtain handle to hfi (requires verbs on node) */ + snprintf(hfiName, sizeof(hfiName), "%s_%d", + psmi_hal_get_hfi_name(), + psmi_hal_get_unit_id(proto->ep->context.psm_hw_ctxt)); + proto->hndl = proto->opp_fn.op_path_find_hca(hfiName, &proto->device); + if (!proto->hndl) { + _HFI_ERROR + ("OPP: Unable to find HFI %s. Disabling OPP interface for path record queries.\n", + hfiName); + goto fail; + } + + /* Get OPP context */ + proto->opp_ctxt = proto->opp_fn.op_path_open(proto->device, 1); + if (!proto->opp_ctxt) { + _HFI_ERROR + ("OPP: Unable to obtain OPP context. Disabling OPP interface for path record queries.\n"); + goto fail; + } + + /* Setup default errorcheck timeout. OPP may change it later. */ + proto->epinfo.ep_timeout_ack = + ms_2_cycles(IPS_PROTO_ERRCHK_MS_MIN_DEFAULT); + proto->epinfo.ep_timeout_ack_max = + ms_2_cycles(IPS_PROTO_ERRCHK_MS_MIN_DEFAULT); + proto->epinfo.ep_timeout_ack_factor = IPS_PROTO_ERRCHK_FACTOR_DEFAULT; + + /* OPP initialized successfully */ + proto->ibta.get_path_rec = ips_opp_path_rec; + proto->ibta.fini = ips_opp_fini; + proto->flags |= IPS_PROTO_FLAG_QUERY_PATH_REC; + + return err; + +fail: + _HFI_ERROR("Make sure SM is running...\n"); + _HFI_ERROR("Make sure service ibacm is running...\n"); + _HFI_ERROR("to start ibacm: service ibacm start\n"); + _HFI_ERROR("or enable it at boot time: opaconfig -E ibacm\n\n"); + + err = psmi_handle_error(NULL, PSM2_EPID_PATH_RESOLUTION, + "Unable to initialize OFED Plus library successfully.\n"); + + if (proto->opp_lib) + dlclose(proto->opp_lib); + + return err; +} diff --git a/ptl_ips/ips_path_rec.c b/ptl_ips/ips_path_rec.c new file mode 100644 index 0000000..5b37347 --- /dev/null +++ b/ptl_ips/ips_path_rec.c @@ -0,0 +1,863 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include +#include +#include + +#include "psm_user.h" +#include "psm2_hal.h" +#include "ips_proto.h" + +/* + * These are the default values used in parsing the environment + * variable PSM2_PATH_NO_LMC_RANGE, which can be used to exclude + * a range of message sizes from the LMC LID assignments used to + * implement dispersive routing. + * + * This value is 2^32 - 1. + */ +#define DEF_LIMITS_STRING "4294967295:4294967295" +#define DEF_LIMITS_VALUE 4294967295 + +static void ips_gen_ipd_table(struct ips_proto *proto) +{ + uint8_t delay = 0, step = 1; + /* Based on our current link rate setup the IPD table */ + memset(proto->ips_ipd_delay, 0xFF, sizeof(proto->ips_ipd_delay)); + + /* + * Based on the starting rate of the link, we let the code to + * fall through to next rate without 'break' in the code. The + * decrement is doubled at each rate level... + */ + switch (proto->epinfo.ep_link_rate) { + case IBV_RATE_300_GBPS: + proto->ips_ipd_delay[IBV_RATE_100_GBPS] = delay; + delay += step; + step *= 2; + case IBV_RATE_200_GBPS: + proto->ips_ipd_delay[IBV_RATE_100_GBPS] = delay; + delay += step; + step *= 2; + case IBV_RATE_168_GBPS: + proto->ips_ipd_delay[IBV_RATE_100_GBPS] = delay; + delay += step; + step *= 2; + case IBV_RATE_120_GBPS: + proto->ips_ipd_delay[IBV_RATE_100_GBPS] = delay; + case IBV_RATE_112_GBPS: + proto->ips_ipd_delay[IBV_RATE_100_GBPS] = delay; + case IBV_RATE_100_GBPS: + proto->ips_ipd_delay[IBV_RATE_100_GBPS] = delay; + delay += step; + step *= 2; + case IBV_RATE_80_GBPS: + proto->ips_ipd_delay[IBV_RATE_80_GBPS] = delay; + case IBV_RATE_60_GBPS: + proto->ips_ipd_delay[IBV_RATE_60_GBPS] = delay; + delay += step; + step *= 2; + case IBV_RATE_40_GBPS: + proto->ips_ipd_delay[IBV_RATE_40_GBPS] = delay; + case IBV_RATE_30_GBPS: + proto->ips_ipd_delay[IBV_RATE_30_GBPS] = delay; + delay += step; + step *= 2; + case IBV_RATE_25_GBPS: + proto->ips_ipd_delay[IBV_RATE_25_GBPS] = delay; + case IBV_RATE_20_GBPS: + proto->ips_ipd_delay[IBV_RATE_20_GBPS] = delay; + delay += step; + step *= 2; + case IBV_RATE_10_GBPS: + proto->ips_ipd_delay[IBV_RATE_10_GBPS] = delay; + case IBV_RATE_5_GBPS: + proto->ips_ipd_delay[IBV_RATE_5_GBPS] = delay; + default: + break; + } +} + +static psm2_error_t ips_gen_cct_table(struct ips_proto *proto) +{ + psm2_error_t err = PSM2_OK; + uint32_t cca_divisor, ipdidx, ipdval = 1; + uint16_t *cct_table; + + /* The CCT table is static currently. 
If it's already created then return */ + if (proto->cct) + goto fail; + + /* Allocate the CCT table */ + cct_table = psmi_calloc(proto->ep, UNDEFINED, + proto->ccti_size, sizeof(uint16_t)); + if (!cct_table) { + err = PSM2_NO_MEMORY; + goto fail; + } + + if (proto->ccti_size) + { + /* The first table entry is always 0 i.e. no IPD delay */ + cct_table[0] = 0; + } + + /* Generate the remaining CCT table entries */ + for (ipdidx = 1; ipdidx < proto->ccti_size; ipdidx += 4, ipdval++) + for (cca_divisor = 0; cca_divisor < 4; cca_divisor++) { + if ((ipdidx + cca_divisor) == proto->ccti_size) + break; + cct_table[ipdidx + cca_divisor] = + (((cca_divisor ^ 0x3) << CCA_DIVISOR_SHIFT) | + (ipdval & 0x3FFF)); + _HFI_CCADBG("CCT[%d] = %x. Divisor: %x, IPD: %x\n", + ipdidx + cca_divisor, + cct_table[ipdidx + cca_divisor], + (cct_table[ipdidx + cca_divisor] >> + CCA_DIVISOR_SHIFT), + cct_table[ipdidx + + cca_divisor] & CCA_IPD_MASK); + } + + /* On link up/down CCT is re-generated. If CCT table is previously created + * free it + */ + if (proto->cct) { + psmi_free(proto->cct); + proto->cct = NULL; + } + + /* Update to the new CCT table */ + proto->cct = cct_table; + +fail: + return err; +} + +static opa_rate ips_default_hfi_rate(uint16_t hfi_type) +{ + opa_rate rate; + + switch (hfi_type) { + case PSMI_HFI_TYPE_OPA1: + rate = IBV_RATE_100_GBPS; + break; + case PSMI_HFI_TYPE_OPA2: + rate = IBV_RATE_120_GBPS; + break; + default: + rate = IBV_RATE_MAX; + } + + return rate; +} + +static opa_rate ips_rate_to_enum(int link_rate) +{ + opa_rate rate; + + switch (link_rate) { + case 300: + rate = IBV_RATE_300_GBPS; + break; + case 200: + rate = IBV_RATE_200_GBPS; + break; + case 100: + rate = IBV_RATE_100_GBPS; + break; + case 25: + rate = IBV_RATE_25_GBPS; + break; + case 168: + rate = IBV_RATE_168_GBPS; + break; + case 112: + rate = IBV_RATE_112_GBPS; + break; + case 56: + rate = IBV_RATE_56_GBPS; + break; + case 14: + rate = IBV_RATE_14_GBPS; + break; + case 120: + rate = IBV_RATE_120_GBPS; + break; + case 80: + rate = IBV_RATE_80_GBPS; + break; + case 60: + rate = IBV_RATE_60_GBPS; + break; + case 40: + rate = IBV_RATE_40_GBPS; + break; + case 30: + rate = IBV_RATE_30_GBPS; + break; + case 20: + rate = IBV_RATE_20_GBPS; + break; + case 10: + rate = IBV_RATE_10_GBPS; + break; + case 5: + rate = IBV_RATE_5_GBPS; + break; + default: + rate = IBV_RATE_MAX; + } + + return rate; +} + +static psm2_error_t +ips_none_get_path_rec(struct ips_proto *proto, + uint16_t slid, uint16_t dlid, uint16_t desthfi_type, + unsigned long timeout, ips_path_rec_t **ppath_rec) +{ + psm2_error_t err = PSM2_OK; + ips_path_rec_t *path_rec; + ENTRY elid, *epath = NULL; + char eplid[128]; + + /* Query the path record cache */ + snprintf(eplid, sizeof(eplid), "%x_%x", slid, dlid); + elid.key = eplid; + hsearch_r(elid, FIND, &epath, &proto->ips_path_rec_hash); + + if (!epath) { + elid.key = + psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1); + path_rec = (ips_path_rec_t *) + psmi_calloc(proto->ep, UNDEFINED, 1, + sizeof(ips_path_rec_t)); + if (!elid.key || !path_rec) { + if (elid.key) + psmi_free(elid.key); + if (path_rec) + psmi_free(path_rec); + return PSM2_NO_MEMORY; + } + + /* Create path record */ + path_rec->pr_slid = slid; + path_rec->pr_dlid = dlid; + path_rec->pr_mtu = proto->epinfo.ep_mtu; + path_rec->pr_pkey = proto->epinfo.ep_pkey; + path_rec->pr_sl = proto->epinfo.ep_sl; + + /* Determine the IPD based on our local link rate and default link rate for + * remote hfi type. 
+ */ + path_rec->pr_static_ipd = + proto->ips_ipd_delay[ips_default_hfi_rate(desthfi_type)]; + + _HFI_CCADBG("pr_static_ipd = %d\n", (int) path_rec->pr_static_ipd); + + /* Setup CCA parameters for path */ + if (path_rec->pr_sl > PSMI_SL_MAX) { + psmi_free(elid.key); + psmi_free(path_rec); + return PSM2_INTERNAL_ERR; + } + if (!(proto->ccti_ctrlmap & (1 << path_rec->pr_sl))) { + _HFI_CCADBG("No CCA for sl %d, disable CCA\n", + path_rec->pr_sl); + proto->flags &= ~IPS_PROTO_FLAG_CCA; + proto->flags &= ~IPS_PROTO_FLAG_CCA_PRESCAN; + } + if (!psmi_hal_has_cap(PSM_HAL_CAP_STATIC_RATE_CTRL)) { + _HFI_CCADBG("No Static-Rate-Control, disable CCA\n"); + proto->flags &= ~IPS_PROTO_FLAG_CCA; + proto->flags &= ~IPS_PROTO_FLAG_CCA_PRESCAN; + } + + path_rec->proto = proto; + path_rec->pr_ccti = proto->cace[path_rec->pr_sl].ccti_min; + path_rec->pr_timer_cca = NULL; + + /* Determine active IPD for path. Is max of static rate and CCT table */ + if (!(proto->flags & IPS_PROTO_FLAG_CCA)) { + _HFI_CCADBG("No IPS_PROTO_FLAG_CCA\n"); + + path_rec->pr_active_ipd = 0; + path_rec->pr_cca_divisor = 0; + + _HFI_CCADBG("pr_active_ipd = %d\n", (int) path_rec->pr_active_ipd); + _HFI_CCADBG("pr_cca_divisor = %d\n", (int) path_rec->pr_cca_divisor); + } else if ((path_rec->pr_static_ipd) && + ((path_rec->pr_static_ipd + 1) > + (proto->cct[path_rec->pr_ccti] & CCA_IPD_MASK))) { + _HFI_CCADBG("IPS_PROTO_FLAG_CCA set, Setting pr_active_ipd.\n"); + + path_rec->pr_active_ipd = path_rec->pr_static_ipd + 1; + path_rec->pr_cca_divisor = 0; + + _HFI_CCADBG("pr_active_ipd = %d\n", (int) path_rec->pr_active_ipd); + _HFI_CCADBG("pr_cca_divisor = %d\n", (int) path_rec->pr_cca_divisor); + } else { + /* Pick it from the CCT table */ + _HFI_CCADBG("Picking up active IPD from CCT table, index %d, value 0x%x\n", + (int) path_rec->pr_ccti, (int) proto->cct[path_rec->pr_ccti]); + + path_rec->pr_active_ipd = + proto->cct[path_rec->pr_ccti] & CCA_IPD_MASK; + path_rec->pr_cca_divisor = + proto->cct[path_rec->pr_ccti] >> CCA_DIVISOR_SHIFT; + + _HFI_CCADBG("pr_active_ipd = %d\n", (int) path_rec->pr_active_ipd); + _HFI_CCADBG("pr_cca_divisor = %d\n", (int) path_rec->pr_cca_divisor); + } + + /* Add path record into cache */ + strcpy(elid.key, eplid); + elid.data = (void *)path_rec; + hsearch_r(elid, ENTER, &epath, &proto->ips_path_rec_hash); + } else + path_rec = (ips_path_rec_t *) epath->data; + + /* Return IPS path record */ + *ppath_rec = path_rec; + + return err; +} + +static psm2_error_t +ips_none_path_rec(struct ips_proto *proto, + uint16_t slid, uint16_t dlid, uint16_t desthfi_type, + unsigned long timeout, ips_path_grp_t **ppathgrp) +{ + psm2_error_t err = PSM2_OK; + uint16_t pidx, num_path = (1 << proto->epinfo.ep_lmc); + uint16_t path_slid, path_dlid; + ips_path_rec_t *path; + ips_path_grp_t *pathgrp; + ENTRY elid, *epath = NULL; + char eplid[128]; + + /* For the "none" path record resolution all paths are assumed to be + * of equal priority however since we want to isolate all control + * traffic (acks, naks) to a separate path for non zero LMC subnets + * the "first path" between a pair of endpoints is always the "higher" + * priority paths. The rest of the paths are the normal (and low + * priority) paths. 
+ */ + + /* Query the path record cache */ + snprintf(eplid, sizeof(eplid), "%x_%x", slid, dlid); + elid.key = eplid; + hsearch_r(elid, FIND, &epath, &proto->ips_path_grp_hash); + + if (epath) { /* Find path group in cache */ + *ppathgrp = (ips_path_grp_t *) epath->data; + return err; + } + + /* If base lids are only used then reset num_path to 1 */ + if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_BASE) + num_path = 1; + + /* Allocate a new pathgroup */ + elid.key = psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1); + pathgrp = (ips_path_grp_t *) + psmi_calloc(proto->ep, UNDEFINED, 1, sizeof(ips_path_grp_t) + + num_path * IPS_PATH_MAX_PRIORITY * + sizeof(ips_path_rec_t *)); + if (!elid.key || !pathgrp) { + if (elid.key) + psmi_free(elid.key); + if (pathgrp) + psmi_free(pathgrp); + err = PSM2_NO_MEMORY; + goto fail; + } + + /* + * dlid is the peer base lid. + * slid is the base lid for the local end point. + * Store in network byte order. + */ + pathgrp->pg_base_dlid = dlid; + pathgrp->pg_base_slid = slid; + + if (num_path > 1) { + /* One control path and (num_path - 1) norm and low priority paths */ + pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY] = 1; + pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY] = num_path - 1; + pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY] = num_path - 1; + } else { + /* LMC of 0. Use the same path for all priorities */ + pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY] = 1; + pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY] = 1; + pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY] = 1; + } + + /* For "none" path record we just setup 2^lmc paths. To get better load + * balance + */ + for (pidx = 0; pidx < num_path; pidx++) { + path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx); + path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx); + + err = + ips_none_get_path_rec(proto, path_slid, path_dlid, + desthfi_type, timeout, &path); + if (err != PSM2_OK) { + psmi_free(elid.key); + psmi_free(pathgrp); + goto fail; + } + + if (num_path > 1) { + if (pidx == 0) { + /* First path is always the high priority path */ + pathgrp->pg_path[0][IPS_PATH_HIGH_PRIORITY] = + path; + } else { + pathgrp->pg_path[pidx - + 1][IPS_PATH_NORMAL_PRIORITY] = + path; + pathgrp->pg_path[pidx - + 1][IPS_PATH_LOW_PRIORITY] = + path; + } + } else { + pathgrp->pg_path[0][IPS_PATH_HIGH_PRIORITY] = path; + pathgrp->pg_path[0][IPS_PATH_NORMAL_PRIORITY] = path; + pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY] = path; + } + PSM2_LOG_MSG("path %p slid %hu dlid %hu \n", + path, + __be16_to_cpu(path->pr_slid), + __be16_to_cpu(path->pr_dlid)); + + } + + if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) { + pathgrp->pg_next_path[IPS_PATH_NORMAL_PRIORITY] = + proto->epinfo.ep_context % + pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY]; + pathgrp->pg_next_path[IPS_PATH_LOW_PRIORITY] = + proto->epinfo.ep_context % + pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY]; + } + + /* Add path record into cache */ + strcpy(elid.key, eplid); + elid.data = (void *)pathgrp; + hsearch_r(elid, ENTER, &epath, &proto->ips_path_grp_hash); + + *ppathgrp = pathgrp; + +fail: + if (err != PSM2_OK) + _HFI_PRDBG + ("Unable to get path record for LID %x <---> DLID %x.\n", + slid, dlid); + return err; +} + +static psm2_error_t ips_none_path_rec_init(struct ips_proto *proto) +{ + psm2_error_t err = PSM2_OK; + + /* Obtain the SL and PKEY to use from the environment (HFI_SL & PSM_KEY) */ + proto->epinfo.ep_sl = proto->ep->out_sl; + proto->epinfo.ep_pkey = (uint16_t) proto->ep->network_pkey; + + /* + * Parse the err_chk settings from the 
environment.
+	 * min_timeout:max_timeout:timeout_factor
+	 */
+	{
+		union psmi_envvar_val env_to;
+		char *errchk_to = PSM_TID_TIMEOUT_DEFAULT;
+		int tvals[3] = {
+			IPS_PROTO_ERRCHK_MS_MIN_DEFAULT,
+			IPS_PROTO_ERRCHK_MS_MAX_DEFAULT,
+			IPS_PROTO_ERRCHK_FACTOR_DEFAULT
+		};
+
+		if (!psmi_getenv("PSM2_ERRCHK_TIMEOUT",
+				 "Errchk timeouts in mS ",
+				 PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR,
+				 (union psmi_envvar_val)errchk_to, &env_to)) {
+			/* Not using default values, parse what we can */
+			errchk_to = env_to.e_str;
+			psmi_parse_str_tuples(errchk_to, 3, tvals);
+			/* Adjust if max is smaller than min; things would break */
+			if (tvals[1] < tvals[0])
+				tvals[1] = tvals[0];
+		}
+
+		proto->epinfo.ep_timeout_ack = ms_2_cycles(tvals[0]);
+		proto->epinfo.ep_timeout_ack_max = ms_2_cycles(tvals[1]);
+		proto->epinfo.ep_timeout_ack_factor = tvals[2];
+	}
+
+	proto->ibta.get_path_rec = ips_none_path_rec;
+	proto->ibta.fini = NULL;
+
+	/* With no path record queries set the pkey manually */
+	if (psmi_hal_set_pkey(proto->ep->context.psm_hw_ctxt,
+			      (uint16_t) proto->ep->network_pkey) != 0) {
+		err = psmi_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE,
+					"Couldn't set device pkey 0x%x: %s",
+					(int)proto->ep->network_pkey,
+					strerror(errno));
+	}
+
+	return err;
+}
+
+/* (Re)load the SL2SC and SC2VL tables */
+psm2_error_t ips_ibta_init_sl2sc2vl_table(struct ips_proto *proto)
+{
+	int ret, i;
+
+	/* Get SL2SC table for unit, port */
+	for (i = 0; i < 32; i++) {
+		if ((ret =
+		     psmi_hal_get_port_sl2sc(psmi_hal_get_unit_id(proto->ep->context.psm_hw_ctxt),
+					     psmi_hal_get_port_num(proto->ep->context.psm_hw_ctxt),
+					     (uint8_t) i)) < 0) {
+			/* Unable to get SL2SC. Set it to default */
+			ret = PSMI_SC_DEFAULT;
+		}
+
+		proto->sl2sc[i] = (uint16_t) ret;
+	}
+	/* Get SC2VL table for unit, port */
+	for (i = 0; i < 32; i++) {
+		if ((ret =
+		     psmi_hal_get_port_sc2vl(psmi_hal_get_unit_id(proto->ep->context.psm_hw_ctxt),
+					     psmi_hal_get_port_num(proto->ep->context.psm_hw_ctxt),
+					     (uint8_t) i)) < 0) {
+			/* Unable to get SC2VL. Set it to default */
+			ret = PSMI_VL_DEFAULT;
+		}
+
+		proto->sc2vl[i] = (uint16_t) ret;
+	}
+
+	return PSM2_OK;
+}
+
+/* On link up/down we need to update some state */
+psm2_error_t ips_ibta_link_updown_event(struct ips_proto *proto)
+{
+	psm2_error_t err = PSM2_OK;
+	int ret;
+
+	/* Get base lid, lmc and rate as these may have changed if the link bounced */
+	proto->epinfo.ep_base_lid =
+	    __cpu_to_be16((uint16_t) psm2_epid_nid(proto->ep->context.epid));
+
+	if ((ret = psmi_hal_get_port_lmc(psmi_hal_get_unit_id(proto->ep->context.psm_hw_ctxt),
+					 psmi_hal_get_port_num(proto->ep->context.psm_hw_ctxt))) < 0) {
+		err = psmi_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE,
+					"Could not obtain LMC for unit %u:%u. Error: %s",
+					psmi_hal_get_unit_id(proto->ep->context.psm_hw_ctxt),
+					psmi_hal_get_port_num(proto->ep->context.psm_hw_ctxt),
+					strerror(errno));
+		goto fail;
+	}
+	proto->epinfo.ep_lmc = min(ret, IPS_MAX_PATH_LMC);
+
+	if ((ret = psmi_hal_get_port_rate(psmi_hal_get_unit_id(proto->ep->context.psm_hw_ctxt),
+					  psmi_hal_get_port_num(proto->ep->context.psm_hw_ctxt))) <
+	    0) {
+		err =
+		    psmi_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE,
+				      "Could not obtain link rate for unit %u:%u. Error: %s",
+				      psmi_hal_get_unit_id(proto->ep->context.psm_hw_ctxt),
+				      psmi_hal_get_port_num(proto->ep->context.psm_hw_ctxt),
+				      strerror(errno));
+		goto fail;
+	}
+	proto->epinfo.ep_link_rate = ips_rate_to_enum(ret);
+
+	/* Load the SL2SC2VL table */
+	ips_ibta_init_sl2sc2vl_table(proto);
+
+	/* Regenerate new IPD table for the updated link rate.
*/ + ips_gen_ipd_table(proto); + + /* Generate the CCT table. */ + err = ips_gen_cct_table(proto); + +fail: + return err; +} + +psm2_error_t +MOCKABLE(ips_ibta_init)(struct ips_proto *proto) +{ + psm2_error_t err = PSM2_OK; + union psmi_envvar_val psm_path_policy; + union psmi_envvar_val disable_cca; + union psmi_envvar_val cca_prescan; + union psmi_envvar_val path_disable_lmc_interval; + + /* Get the path selection policy */ + psmi_getenv("PSM2_PATH_SELECTION", + "Policy to use if multiple paths are available between endpoints. Options are adaptive, static_src, static_dest, static_base. Default is adaptive.", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)"adaptive", &psm_path_policy); + + if (!strcasecmp((const char *)psm_path_policy.e_str, "adaptive")) + proto->flags |= IPS_PROTO_FLAG_PPOLICY_ADAPTIVE; + else if (!strcasecmp((const char *)psm_path_policy.e_str, "static_src")) + proto->flags |= IPS_PROTO_FLAG_PPOLICY_STATIC_SRC; + else if (!strcasecmp + ((const char *)psm_path_policy.e_str, "static_dest")) + proto->flags |= IPS_PROTO_FLAG_PPOLICY_STATIC_DST; + else if (!strcasecmp + ((const char *)psm_path_policy.e_str, "static_base")) + proto->flags |= IPS_PROTO_FLAG_PPOLICY_STATIC_BASE; + + if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) + _HFI_PRDBG("Using adaptive path selection.\n"); + if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC) + _HFI_PRDBG("Static path selection: Src Context\n"); + if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST) + _HFI_PRDBG("Static path selection: Dest Context\n"); + if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_BASE) + _HFI_PRDBG("Static path selection: Base LID\n"); + + psmi_getenv("PSM2_DISABLE_CCA", + "Disable use of Congestion Control Architecture (CCA) [enabled] ", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)0, &disable_cca); + if (disable_cca.e_uint) + _HFI_CCADBG("CCA is disabled for congestion control.\n"); + else { + int i; + char ccabuf[256]; + uint8_t *p; + + proto->flags |= IPS_PROTO_FLAG_CCA; +/* + * If user set any environment variable, use self CCA. + */ + if (getenv("PSM2_CCTI_INCREMENT") || getenv("PSM2_CCTI_TIMER") + || getenv("PSM2_CCTI_TABLE_SIZE")) { + goto disablecca; + } + + psmi_getenv("PSM2_CCA_PRESCAN", + "Enable Congestion Control Prescanning (disabled by default) ", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)0, &cca_prescan); + + if (cca_prescan.e_uint) + proto->flags |= IPS_PROTO_FLAG_CCA_PRESCAN; + +/* + * Check qib driver CCA setting, and try to use it if available. + * Fall to self CCA setting if errors. + */ + i = psmi_hal_get_cc_settings_bin( + psmi_hal_get_unit_id(proto->ep->context.psm_hw_ctxt), + psmi_hal_get_port_num(proto->ep->context.psm_hw_ctxt), + ccabuf, sizeof(ccabuf)); + + if (i <= 0) { + goto disablecca; + } + p = (uint8_t *) ccabuf; + memcpy(&proto->ccti_ctrlmap, p, 4); + p += 4; + memcpy(&proto->ccti_portctrl, p, 2); + p += 2; + for (i = 0; i < 32; i++) { + proto->cace[i].ccti_increase = *p; + p++; + /* skip reserved u8 */ + p++; + memcpy(&proto->cace[i].ccti_timer_cycles, p, 2); + p += 2; + proto->cace[i].ccti_timer_cycles = + us_2_cycles(proto->cace[i].ccti_timer_cycles); + proto->cace[i].ccti_threshold = *p; + p++; + proto->cace[i].ccti_min = *p; + p++; + } + + i = psmi_hal_get_cc_table_bin(psmi_hal_get_unit_id(proto->ep->context. + psm_hw_ctxt), + psmi_hal_get_port_num(proto->ep->context. 
+							  psm_hw_ctxt),
+					      &proto->cct);
+		if (i < 0) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		} else if (i == 0) {
+			goto disablecca;
+		}
+		proto->ccti_limit = i;
+		proto->ccti_size = proto->ccti_limit + 1;
+
+		_HFI_CCADBG("ccti_limit = %d\n", (int) proto->ccti_limit);
+		for (i = 0; i < proto->ccti_limit; i++)
+			_HFI_CCADBG("cct[%d] = 0x%04x\n", i, (int) proto->cct[i]);
+
+		goto finishcca;
+
+/*
+ * Disable CCA.
+ */
+disablecca:
+		proto->flags &= ~IPS_PROTO_FLAG_CCA;
+		proto->flags &= ~IPS_PROTO_FLAG_CCA_PRESCAN;
+	}
+
+finishcca:
+	/* Initialize path record/group hash table */
+
+	{
+		uint32_t lmc_disable_low, lmc_disable_high;
+		int sscanf_ret;
+
+		/* The default disable_low and disable_high values
+		 * are 2^32 - 1, the maximum allowable message size.
+		 * So by default all messages should be smaller than the
+		 * lower limit, and so will not have LMC dispersive
+		 * routing disabled.
+		 *
+		 * In addition, these limits are applied only to SDMA
+		 * and PIO messages, NOT TID messages, so this default is
+		 * bigger than any PIO size.
+		 */
+		psmi_getenv("PSM2_PATH_NO_LMC_RANGE",
+			    "Disable LMC route dispersion within this range, "
+			    "low_value:high_value\n",
+			    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+			    (union psmi_envvar_val)DEF_LIMITS_STRING,
+			    &path_disable_lmc_interval);
+
+		sscanf_ret = sscanf(path_disable_lmc_interval.e_str, "%u:%u",
+				    &lmc_disable_low, &lmc_disable_high);
+
+		/*
+		 * It's "invalid" for the low end of the range to be
+		 * larger than the high end of the range, so revert
+		 * to the "maximum message size" (2^32 - 1).
+		 */
+		if ((sscanf_ret != 2) || (lmc_disable_low > lmc_disable_high)) {
+			lmc_disable_low = lmc_disable_high = DEF_LIMITS_VALUE;
+		}
+
+		PSM2_LOG_MSG("PSM2_PATH_NO_LMC_RANGE: "
+			     "lmc_disable_low %u lmc_disable_high %u\n",
+			     lmc_disable_low, lmc_disable_high);
+
+		/*
+		 * These specify the range of message sizes in bytes, of
+		 * the messages to disable LMC dynamic LID assignment.
+		 */
+		proto->ips_lmc_disable_low = lmc_disable_low;
+		proto->ips_lmc_disable_high = lmc_disable_high;
+	}
+
+	hcreate_r(DF_PATH_REC_HASH_SIZE, &proto->ips_path_rec_hash);
+	hcreate_r(DF_PATH_GRP_HASH_SIZE, &proto->ips_path_grp_hash);
+
+	/* On startup treat it as a link up/down event to set up state. */
+	if ((err = ips_ibta_link_updown_event(proto)) != PSM2_OK)
+		goto fail;
+
+	/* Setup the appropriate query interface for the endpoint */
+	switch (proto->ep->path_res_type) {
+	case PSM2_PATH_RES_OPP:
+		err = ips_opp_init(proto);
+		if (err != PSM2_OK)
+			_HFI_ERROR
+			    ("Unable to use OFED Plus Plus for path record queries.\n");
+		break;
+	case PSM2_PATH_RES_UMAD:
+		_HFI_ERROR
+		    ("Path record queries using UMAD are not supported in PSM version %d.%dx\n",
+		     PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR);
+		err = PSM2_EPID_PATH_RESOLUTION;
+		break;
+	case PSM2_PATH_RES_NONE:
+	default:
+		err = ips_none_path_rec_init(proto);
+	}
+
+fail:
+	return err;
+}
+MOCK_DEF_EPILOGUE(ips_ibta_init);
+
+psm2_error_t ips_ibta_fini(struct ips_proto *proto)
+{
+	psm2_error_t err = PSM2_OK;
+
+	if (proto->ibta.fini)
+		err = proto->ibta.fini(proto);
+
+	/* Destroy the path record/group hash */
+	hdestroy_r(&proto->ips_path_rec_hash);
+	hdestroy_r(&proto->ips_path_grp_hash);
+
+	return err;
+}
diff --git a/ptl_ips/ips_path_rec.h b/ptl_ips/ips_path_rec.h
new file mode 100644
index 0000000..efcdc77
--- /dev/null
+++ b/ptl_ips/ips_path_rec.h
@@ -0,0 +1,190 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+ + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2009-2014 Intel Corporation. All rights reserved. */ + + +#ifndef _IPS_PATH_REC_H_ +#define _IPS_PATH_REC_H_ + +#include + +/* Default size of path record hash table */ +#define DF_PATH_REC_HASH_SIZE 2047 + +/* Default size of path group hash table */ +#define DF_PATH_GRP_HASH_SIZE 255 + +/* Default size of CCT table. Must be multiple of 64 */ +#define DF_CCT_TABLE_SIZE 128 + +/* CCT max IPD delay. */ +#define DF_CCT_MAX_IPD_DELAY_US 21 + +/* CCA divisor shift */ +#define CCA_DIVISOR_SHIFT 14 + +/* CCA ipd mask */ +#define CCA_IPD_MASK 0x3FFF + +/* A lot of these are IBTA specific defines that are available in other header + * files. To minimize dependencies with PSM build process they are listed + * here. Most of this is used to implement IBTA compliance features with PSM + * like path record query etc. 
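+ *
+ * As an illustration only (assumed usage, not mandated by this header):
+ * the MTU in a path record arrives as one of the IBTA/OPA codes below
+ * rather than as a byte count, so a consumer converts it first, e.g.
+ *
+ *     int mtu_bytes = opa_mtu_enum_to_int(IBTA_MTU_4096);    yields 4096
+ *     int invalid   = opa_mtu_enum_to_int((enum opa_mtu)0);  yields -1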
+ */ + +enum opa_mtu { + IBTA_MTU_256 = 1, + IBTA_MTU_512 = 2, + IBTA_MTU_1024 = 3, + IBTA_MTU_2048 = 4, + IBTA_MTU_4096 = 5, + OPA_MTU_8192 = 6, + OPA_MTU_10240 = 7, + IBTA_MTU_MIN = IBTA_MTU_256, + OPA_MTU_MIN = IBTA_MTU_256, + OPA_MTU_MAX = OPA_MTU_10240, +}; + +typedef enum { + IBV_RATE_MAX = 0, + IBV_RATE_2_5_GBPS = 2, + IBV_RATE_5_GBPS = 5, + IBV_RATE_10_GBPS = 3, + IBV_RATE_20_GBPS = 6, + IBV_RATE_30_GBPS = 4, + IBV_RATE_40_GBPS = 7, + IBV_RATE_60_GBPS = 8, + IBV_RATE_80_GBPS = 9, + IBV_RATE_120_GBPS = 10, + IBV_RATE_14_GBPS = 11, + IBV_RATE_56_GBPS = 12, + IBV_RATE_112_GBPS = 13, + IBV_RATE_168_GBPS = 14, + IBV_RATE_25_GBPS = 15, + IBV_RATE_100_GBPS = 16, + IBV_RATE_200_GBPS = 17, + IBV_RATE_300_GBPS = 18 +} opa_rate; + +static inline int opa_mtu_enum_to_int(enum opa_mtu mtu) +{ + switch (mtu) { + case IBTA_MTU_256: + return 256; + case IBTA_MTU_512: + return 512; + case IBTA_MTU_1024: + return 1024; + case IBTA_MTU_2048: + return 2048; + case IBTA_MTU_4096: + return 4096; + case OPA_MTU_8192: + return 8192; + case OPA_MTU_10240: + return 10240; + default: + return -1; + } +} + +/* This is same as ob_path_rec from ib_types.h. Listed here to be self + * contained to minimize dependencies during build etc. + */ +typedef struct _ibta_path_rec { + uint64_t service_id; /* net order */ + uint8_t dgid[16]; + uint8_t sgid[16]; + uint16_t dlid; /* net order */ + uint16_t slid; /* net order */ + uint32_t hop_flow_raw; /* net order */ + uint8_t tclass; + uint8_t num_path; + uint16_t pkey; /* net order */ + uint16_t qos_class_sl; /* net order */ + uint8_t mtu; /* IBTA encoded */ + uint8_t rate; /* IBTA encoded */ + uint8_t pkt_life; /* IBTA encoded */ + uint8_t preference; + uint8_t resv2[6]; +} ibta_path_rec_t; + +/* + * PSM IPS path record components for endpoint. + * + * For Torus/non-zero LMC fabrics, pr_slid and pr_dlid may be different from + * the "base lid" values for this connection. + */ +struct ips_proto; + +typedef struct ips_path_rec { + uint16_t pr_slid; + uint16_t pr_dlid; + uint16_t pr_mtu; /* < Path's MTU */ + uint16_t pr_pkey; + uint16_t pr_static_ipd; /* Static rate IPD from path record */ + uint8_t pr_sl; + + /* IBTA CCA parameters per path */ + uint8_t pr_cca_divisor; /* CCA divisor [14:15] in CCT entry */ + uint16_t pr_active_ipd; /* The current active IPD. max(static,cct) */ + uint16_t pr_ccti; /* CCA table index */ + /* Congestion timer for epr_ccti increment. */ + psmi_timer *pr_timer_cca; + struct ips_proto *proto; /* for global info */ +} ips_path_rec_t; + +psm2_error_t ips_opp_init(struct ips_proto *proto); + +#endif diff --git a/ptl_ips/ips_proto.c b/ptl_ips/ips_proto.c new file mode 100644 index 0000000..35dcce7 --- /dev/null +++ b/ptl_ips/ips_proto.c @@ -0,0 +1,2463 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ + +/* + * IPS - Interconnect Protocol Stack. + */ + +#include +#include /* writev */ +#include "psm_user.h" +#include "psm2_hal.h" +#include "ips_proto.h" +#include "ips_proto_internal.h" +#include "ips_proto_help.h" +#include "psmi_wrappers.h" +#include "psm_mq_internal.h" + +#ifdef PSM_CUDA +#include "psm_gdrcpy.h" +#endif + +/* + * Control message types have their own flag to determine whether a message of + * that type is queued or not. These flags are kept in a state bitfield. 
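+ *
+ * A minimal sketch of the intended use, following the send paths later
+ * in this file (message_type2index() maps an opcode to its flag):
+ *
+ *     uint16_t bit = message_type2index(proto, OPCODE_ACK);
+ *     if (!(*msg_queue_mask & bit))    not queued yet
+ *             *msg_queue_mask |= bit;  mark the ACK as queued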
+ */ +#define CTRL_MSG_ACK_QUEUED 0x0001 +#define CTRL_MSG_NAK_QUEUED 0x0002 +#define CTRL_MSG_BECN_QUEUED 0x0004 +#define CTRL_MSG_ERR_CHK_QUEUED 0x0008 +#define CTRL_MSG_ERR_CHK_GEN_QUEUED 0x0010 +#define CTRL_MSG_CONNECT_REQUEST_QUEUED 0x0020 +#define CTRL_MSG_CONNECT_REPLY_QUEUED 0x0040 +#define CTRL_MSG_DISCONNECT_REQUEST_QUEUED 0x0080 +#define CTRL_MSG_DISCONNECT_REPLY_QUEUED 0x0100 + +#ifdef PSM_CUDA +uint32_t gpudirect_send_threshold; +uint32_t gpudirect_recv_threshold; +#endif + +static void ctrlq_init(struct ips_ctrlq *ctrlq, struct ips_proto *proto); +static psm2_error_t proto_sdma_init(struct ips_proto *proto, + const psmi_context_t *context); + +#ifdef PSM_CUDA +void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *context, void *obj) +{ + struct ips_cuda_hostbuf *icb; + struct ips_cuda_hostbuf_mpool_cb_context *ctxt = + (struct ips_cuda_hostbuf_mpool_cb_context *) context; + + icb = (struct ips_cuda_hostbuf *)obj; + if (is_alloc) { + PSMI_CUDA_CALL(cuMemHostAlloc, + (void **) &icb->host_buf, + ctxt->bufsz, + CU_MEMHOSTALLOC_PORTABLE); + PSMI_CUDA_CALL(cuEventCreate, &icb->copy_status, CU_EVENT_DEFAULT); + } else { + if (icb->host_buf) { + PSMI_CUDA_CALL(cuMemFreeHost, icb->host_buf); + PSMI_CUDA_CALL(cuEventDestroy, icb->copy_status); + } + } + return; +} +#endif + +static uint16_t ips_proto_compute_mtu_code(int mtu) +{ + static const struct MapMTUToMtuCode + { + int mtu; + uint16_t mtu_code; + } mtumap[] = + { + { 256, IBTA_MTU_256 }, + { 512, IBTA_MTU_512 }, + { 1024, IBTA_MTU_1024}, + { 2048, IBTA_MTU_2048}, + { 4096, IBTA_MTU_4096}, + { 8192, OPA_MTU_8192 }, + {10240, OPA_MTU_10240}, + }; + int i; + + for (i=0;i < sizeof(mtumap)/sizeof(mtumap[0]);i++) + if (mtu == mtumap[i].mtu) + return mtumap[i].mtu_code; + return 0; +} + +psm2_error_t +ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, + int num_of_send_bufs, int num_of_send_desc, uint32_t imm_size, + const struct psmi_timer_ctrl *timerq, + const struct ips_epstate *epstate, + void *spioc, struct ips_proto *proto) +{ + uint32_t protoexp_flags, cksum_sz; + union psmi_envvar_val env_tid, env_cksum, env_mtu; + psm2_error_t err = PSM2_OK; + + /* + * Checksum packets within PSM. Default is off. + * This is heavy weight and done in software so not recommended for + * production runs. + */ + + psmi_getenv("PSM2_CHECKSUM", + "Enable checksum of messages (0 disables checksum)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val)0, &env_cksum); + + memset(proto, 0, sizeof(struct ips_proto)); + proto->ptl = (ptl_t *) ptl; + proto->ep = context->ep; /* cached */ + proto->mq = context->ep->mq; /* cached */ + proto->pend_sends.proto = proto; + psmi_timer_entry_init(&proto->pend_sends.timer, + ips_proto_timer_pendq_callback, + &proto->pend_sends); + STAILQ_INIT(&proto->pend_sends.pendq); + proto->epstate = (struct ips_epstate *)epstate; + proto->timerq = (struct psmi_timer_ctrl *)timerq; + proto->spioc = spioc; + + proto->epinfo.ep_baseqp = psmi_hal_get_bthqp(context->psm_hw_ctxt); + proto->epinfo.ep_context = psmi_hal_get_context(context->psm_hw_ctxt); /* "real" context */ + proto->epinfo.ep_subcontext = psmi_hal_get_subctxt(context->psm_hw_ctxt); + proto->epinfo.ep_hfi_type = psmi_hal_get_hfi_type(context->psm_hw_ctxt); + proto->epinfo.ep_jkey = psmi_hal_get_jkey(context->psm_hw_ctxt); + + /* If checksums enabled we insert checksum at end of packet */ + cksum_sz = env_cksum.e_uint ? 
PSM_CRC_SIZE_IN_BYTES : 0; + proto->epinfo.ep_mtu = context->ep->mtu; + /* Decrement checksum */ + proto->epinfo.ep_mtu -= cksum_sz; + + /* See if user specifies a lower MTU to use */ + if (!psmi_getenv + ("PSM2_MTU", "MTU specified by user: 1-7,256-8192,10240]", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)-1, &env_mtu)) { + if (env_mtu.e_int != 256 && env_mtu.e_int != 512 + && env_mtu.e_int != 1024 && env_mtu.e_int != 2048 + && env_mtu.e_int != 4096 && env_mtu.e_int != 8192 + && env_mtu.e_int != 10240) { + if (env_mtu.e_int < OPA_MTU_MIN || + env_mtu.e_int > OPA_MTU_MAX) + env_mtu.e_int = OPA_MTU_8192; + env_mtu.e_int = + opa_mtu_enum_to_int((enum opa_mtu)env_mtu.e_int); + } + if (proto->epinfo.ep_mtu > env_mtu.e_int) + proto->epinfo.ep_mtu = env_mtu.e_int; + } + + proto->epinfo.ep_mtu_code = ips_proto_compute_mtu_code(proto->epinfo.ep_mtu); + + /* + * The PIO size should not include the ICRC because it is + * stripped by HW before delivering to receiving buffer. + * We decide to use minimum 2 PIO buffers so that PSM has + * turn-around time to do PIO transfer. Each credit is a + * block of 64 bytes. Also PIO buffer size must not be + * bigger than MTU. + */ + proto->epinfo.ep_piosize = psmi_hal_get_pio_size(context->psm_hw_ctxt) - cksum_sz; + proto->epinfo.ep_piosize = + min(proto->epinfo.ep_piosize, proto->epinfo.ep_mtu); + + /* Keep PIO as multiple of cache line size */ + if (proto->epinfo.ep_piosize > PSM_CACHE_LINE_BYTES) + proto->epinfo.ep_piosize &= ~(PSM_CACHE_LINE_BYTES - 1); + + /* Save back to hfi level. */ + psmi_hal_set_effective_mtu(proto->epinfo.ep_mtu, proto->ep->context.psm_hw_ctxt); + psmi_hal_set_pio_size(proto->epinfo.ep_piosize, + proto->ep->context.psm_hw_ctxt); + + /* sdma queue size */ + proto->sdma_queue_size = psmi_hal_get_sdma_ring_size(context->psm_hw_ctxt); + /* don't use the last slot */ + + if (proto->sdma_queue_size > 8) { + /* configure sdma_avail_counter */ + union psmi_envvar_val env_sdma_avail; + int tmp_queue_size = 8; + + psmi_getenv("PSM2_MAX_PENDING_SDMA_REQS", + "PSM maximum pending SDMA requests", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val) tmp_queue_size, + &env_sdma_avail); + + if ((env_sdma_avail.e_int < 8) || (env_sdma_avail.e_int > (proto->sdma_queue_size - 1))) + proto->sdma_avail_counter = 8; + else + proto->sdma_avail_counter = env_sdma_avail.e_int; + } else { + err = PSM2_PARAM_ERR; + goto fail; + } + + + proto->sdma_fill_index = 0; + proto->sdma_done_index = 0; + proto->sdma_scb_queue = (struct ips_scb **) + psmi_calloc(proto->ep, UNDEFINED, + proto->sdma_queue_size, sizeof(struct ips_scb *)); + if (proto->sdma_scb_queue == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + + proto->timeout_send = us_2_cycles(IPS_PROTO_SPIO_RETRY_US_DEFAULT); + proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = ~0U; + proto->t_init = get_cycles(); + proto->t_fini = 0; + proto->flags = env_cksum.e_uint ? IPS_PROTO_FLAG_CKSUM : 0; + proto->runid_key = getpid(); + + proto->num_connected_outgoing = 0; + proto->num_connected_incoming = 0; + proto->num_disconnect_requests = 0; + proto->stray_warn_interval = (uint64_t) -1; + proto->done_warning = 0; + proto->done_once = 0; + proto->num_bogus_warnings = 0; + proto->psmi_logevent_tid_send_reqs.interval_secs = 15; + proto->psmi_logevent_tid_send_reqs.next_warning = 0; + proto->psmi_logevent_tid_send_reqs.count = 0; + + /* Initialize IBTA related stuff (path record, SL2VL, CCA etc.) 
*/ + if ((err = ips_ibta_init(proto))) + goto fail; + + { + /* User asks for HFI loopback? */ + union psmi_envvar_val env_loopback; + + psmi_getenv("PSM2_HFI_LOOPBACK", + "PSM uses HFI loopback (default is disabled i.e. 0)", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val)0, /* Disabled by default */ + &env_loopback); + + if (env_loopback.e_uint) + proto->flags |= IPS_PROTO_FLAG_LOOPBACK; + } + + /* Update JKey if necessary */ + if (getenv("PSM2_SELINUX")) + proto->epinfo.ep_jkey = psmi_hal_get_jkey(context->psm_hw_ctxt); + + { + /* Disable coalesced ACKs? */ + union psmi_envvar_val env_coalesce_acks; + + psmi_getenv("PSM2_COALESCE_ACKS", "Coalesce ACKs on the wire (default is enabled i.e. 1)", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val)1, /* Enabled by default */ + &env_coalesce_acks); + + if (env_coalesce_acks.e_uint) + proto->flags |= IPS_PROTO_FLAG_COALESCE_ACKS; + } + + { + /* Number of credits per flow */ + union psmi_envvar_val env_flow_credits; + int df_flow_credits = min(PSM2_FLOW_CREDITS, num_of_send_desc); + + psmi_getenv("PSM2_FLOW_CREDITS", + "Number of unacked packets (credits) per flow (default is 64)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)df_flow_credits, + &env_flow_credits); + proto->flow_credits = env_flow_credits.e_uint; + } + + /* + * Pre-calculate the PSN mask to support 24 or 31 bits PSN. + */ + if (psmi_hal_has_cap(PSM_HAL_CAP_EXTENDED_PSN)) { + proto->psn_mask = 0x7FFFFFFF; + } else { + proto->psn_mask = 0xFFFFFF; + } + + /* + * Initialize SDMA, otherwise, turn on all PIO. + */ + if (psmi_hal_has_cap(PSM_HAL_CAP_SDMA)) { + if ((err = proto_sdma_init(proto, context))) + goto fail; + } else { + proto->flags |= IPS_PROTO_FLAG_SPIO; + proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = + ~0U; + } + + /* + * Setup the protocol wide short message ep flow. + */ + if (proto->flags & IPS_PROTO_FLAG_SDMA) { + proto->msgflowid = EP_FLOW_GO_BACK_N_DMA; + } else { + proto->msgflowid = EP_FLOW_GO_BACK_N_PIO; + } + + /* + * Clone sendreq mpool configuration for pend sends config + */ + { + uint32_t chunks, maxsz; + + psmi_assert_always(proto->ep->mq->sreq_pool != NULL); + psmi_mpool_get_obj_info(proto->ep->mq->sreq_pool, &chunks, + &maxsz); + + proto->pend_sends_pool = + psmi_mpool_create(sizeof(struct ips_pend_sreq), chunks, + maxsz, 0, DESCRIPTORS, NULL, NULL); + if (proto->pend_sends_pool == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + } + + /* + * Create a pool of CCA timers for path_rec. The timers should not + * exceed the scb number num_of_send_desc(default 4K). + */ + { + uint32_t chunks, maxsz; + + chunks = 256; + maxsz = num_of_send_desc; + + proto->timer_pool = + psmi_mpool_create(sizeof(struct psmi_timer), chunks, maxsz, + 0, DESCRIPTORS, NULL, NULL); + if (proto->timer_pool == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + } + + /* + * Register ips protocol statistics + * + * We put a (*) in the output to denote stats that may cause a drop in + * performance. + * + * We put a (**) in the output of those stats that "should never happen" + */ + { + uint64_t *pio_stall_cnt = NULL; + + psmi_hal_get_pio_stall_cnt(context->psm_hw_ctxt,&pio_stall_cnt); + + struct psmi_stats_entry entries[] = { + PSMI_STATS_DECLU64("pio busy count", + &proto->stats.pio_busy_cnt), + /* Throttling by kernel */ + PSMI_STATS_DECLU64("writev busy cnt", + &proto->stats.writev_busy_cnt), + /* When local dma completion is in the way... 
*/ + PSMI_STATS_DECLU64("writev compl. eagain", + &proto->stats.writev_compl_eagain), + /* When remote completion happens before local completion */ + PSMI_STATS_DECLU64("writev compl. delay (*)", + &proto->stats.writev_compl_delay), + PSMI_STATS_DECLU64("scb unavail eager count", + &proto->stats.scb_egr_unavail_cnt), + PSMI_STATS_DECLU64("scb unavail exp count", + &proto->stats.scb_exp_unavail_cnt), + PSMI_STATS_DECLU64("rcvhdr overflows", /* Normal egr/hdr ovflw */ + &proto->stats.hdr_overflow), + PSMI_STATS_DECLU64("rcveager overflows", + &proto->stats.egr_overflow), + PSMI_STATS_DECLU64("lid zero errs (**)", /* shouldn't happen */ + &proto->stats.lid_zero_errs), + PSMI_STATS_DECLU64("unknown packets (**)", /* shouldn't happen */ + &proto->stats.unknown_packets), + PSMI_STATS_DECLU64("stray packets (*)", + &proto->stats.stray_packets), + PSMI_STATS_DECLU64("pio stalls (*)", /* shouldn't happen too often */ + pio_stall_cnt), + PSMI_STATS_DECLU64("ICRC error (*)", + &proto->error_stats.num_icrc_err), + PSMI_STATS_DECLU64("ECC error ", + &proto->error_stats.num_ecc_err), + PSMI_STATS_DECLU64("Len error", + &proto->error_stats.num_len_err), + PSMI_STATS_DECLU64("TID error ", + &proto->error_stats.num_tid_err), + PSMI_STATS_DECLU64("DC error ", + &proto->error_stats.num_dc_err), + PSMI_STATS_DECLU64("DCUNC error ", + &proto->error_stats.num_dcunc_err), + PSMI_STATS_DECLU64("KHDRLEN error ", + &proto->error_stats.num_khdrlen_err), + + }; + + err = + psmi_stats_register_type + ("OPA low-level protocol stats", + PSMI_STATSTYPE_IPSPROTO, entries, + PSMI_STATS_HOWMANY(entries), NULL); + if (err != PSM2_OK) + goto fail; + } + + /* + * Control Queue and messaging + */ + ctrlq_init(&proto->ctrlq, proto); + + /* + * Receive-side handling + */ + if ((err = ips_proto_recv_init(proto))) + goto fail; + + /* If progress thread is enabled, set the proto flag */ + { + if (psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD)) + proto->flags |= IPS_PROTO_FLAG_RCVTHREAD; + } + + /* + * Eager buffers. We don't care to receive a callback when eager buffers + * are newly released since we actively poll for new bufs. + */ + { + /* configure PSM bounce buffer size */ + union psmi_envvar_val env_bbs; + + psmi_getenv("PSM2_BOUNCE_SZ", + "PSM bounce buffer size (default is 8192B)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)8192, + &env_bbs); + + proto->scb_bufsize = env_bbs.e_uint; + } + + if ((err = ips_scbctrl_init(context, num_of_send_desc, + num_of_send_bufs, imm_size, + proto->scb_bufsize, NULL, NULL, + &proto->scbc_egr))) + goto fail; + + /* + * Expected protocol handling. + * If we enable tid-based expected rendezvous, the expected protocol code + * handles its own rv scb buffers. If not, we have to enable eager-based + * rendezvous and we allocate scb buffers for it. 
+ */ + psmi_getenv("PSM2_TID", + "Tid proto flags (0 disables protocol)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val)IPS_PROTOEXP_FLAGS_DEFAULT, + &env_tid); + protoexp_flags = env_tid.e_uint; + + if (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) { +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED) { + PSMI_CUDA_CALL(cuStreamCreate, + &proto->cudastream_send, CU_STREAM_NON_BLOCKING); + } +#endif + proto->scbc_rv = NULL; + if ((err = ips_protoexp_init(context, proto, protoexp_flags, + num_of_send_bufs, num_of_send_desc, + &proto->protoexp))) + goto fail; + } else { + proto->protoexp = NULL; + proto->scbc_rv = (struct ips_scbctrl *) + psmi_calloc(proto->ep, DESCRIPTORS, + 1, sizeof(struct ips_scbctrl)); + if (proto->scbc_rv == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + /* + * Rendezvous buffers. We want to get a callback for rendezvous bufs + * since we asynchronously try to make progress on these sends and only + * schedule them on the timerq if there are pending sends and available + * bufs. + */ + if ((err = + ips_scbctrl_init(context, num_of_send_desc, + 0 /* no bufs */ , + 0, 0 /* bufsize==0 */ , + ips_proto_rv_scbavail_callback, + proto, proto->scbc_rv))) + goto fail; + } + + /* + * Parse the tid error settings from the environment. + * : + */ + { + int tvals[2]; + char *tid_err; + union psmi_envvar_val env_tiderr; + + tid_err = "-1:0"; /* no tiderr warnings, never exits */ + tvals[0] = -1; + tvals[1] = 0; + + if (!psmi_getenv("PSM2_TID_ERROR", + "Tid error control ", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)tid_err, &env_tiderr)) { + /* not using default values */ + tid_err = env_tiderr.e_str; + psmi_parse_str_tuples(tid_err, 2, tvals); + } + if (tvals[0] >= 0) + proto->tiderr_warn_interval = sec_2_cycles(tvals[0]); + else + proto->tiderr_warn_interval = UINT64_MAX; + proto->tiderr_max = tvals[1]; + _HFI_PRDBG("Tid error control: warning every %d secs%s, " + "fatal error after %d tid errors%s\n", + tvals[0], (tvals[0] < 0) ? " (no warnings)" : "", + tvals[1], (tvals[1] == 0) ? " (never fatal)" : ""); + } + + /* Active Message interface. AM requests compete with MQ for eager + * buffers, since request establish the amount of buffering in the + * network (maximum number of requests in flight). The AM init function + * does not allow the number of send buffers to be set separately from + * the number of send descriptors, because otherwise it would have to + * impose extremely arcane constraints on the relative amounts to avoid + * a deadlock scenario. Thus, it handles it internally. The constraint + * is: In a node pair, the number of reply send buffers on at least one + * of the nodes must be at least double the number (optimal: double + 1) + * of send descriptors on the other node. 
*/ + if ((err = ips_proto_am_init(proto, + min(num_of_send_bufs, num_of_send_desc), + imm_size, + &proto->proto_am))) + goto fail; + +#if 0 + if (!host_pid) { + char ipbuf[INET_ADDRSTRLEN], *p; + host_pid = (uint32_t) getpid(); + host_ipv4addr = psmi_get_ipv4addr(); /* already be */ + if (host_ipv4addr == 0) { + _HFI_DBG("Unable to obtain local IP address, " + "not fatal but some features may be disabled\n"); + } else if (host_ipv4addr == __cpu_to_be32(0x7f000001)) { + _HFI_INFO("Localhost IP address is set to the " + "loopback address 127.0.0.1, " + "not fatal but some features may be disabled\n"); + } else { + p = (char *)inet_ntop(AF_INET, + (const void *)&host_ipv4addr, + ipbuf, sizeof(ipbuf)); + _HFI_PRDBG("Ethernet Host IP=%s and PID=%d\n", p, + host_pid); + } + + /* Store in big endian for use in ERR_CHK */ + host_pid = __cpu_to_be32(host_pid); + } +#endif +#ifdef PSM_CUDA + union psmi_envvar_val env_gpudirect_rdma; + psmi_getenv("PSM2_GPUDIRECT", + "Use GPUDirect RDMA support to allow the HFI to directly read" + " from the GPU for SDMA. Requires driver support.(default is " + " disabled i.e. 0)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val)0, /* Disabled by default */ + &env_gpudirect_rdma); + /* The following cases need to be handled: + * 1) GPU DIRECT is turned off but GDR COPY is turned on by the user or + * by default - Turn off GDR COPY + * 2) GPU DIRECT is on but GDR COPY is turned off by the user - Leave + *. this config as it is. + */ + if (!env_gpudirect_rdma.e_uint) + is_gdr_copy_enabled = 0; + + /* Default Send threshold for Gpu-direct set to 30000 */ + union psmi_envvar_val env_gpudirect_send_thresh; + psmi_getenv("PSM2_GPUDIRECT_SEND_THRESH", + "GPUDirect feature on send side will be switched off if threshold value is exceeded.", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)30000, &env_gpudirect_send_thresh); + gpudirect_send_threshold = env_gpudirect_send_thresh.e_uint; + + union psmi_envvar_val env_gpudirect_recv_thresh; + psmi_getenv("PSM2_GPUDIRECT_RECV_THRESH", + "GPUDirect feature on receive side will be switched off if threshold value is exceeded.", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)UINT_MAX, &env_gpudirect_recv_thresh); + gpudirect_recv_threshold = env_gpudirect_recv_thresh.e_uint; + + if (env_gpudirect_rdma.e_uint && device_support_gpudirect) { + if (!PSMI_IS_CUDA_ENABLED || + /* All pio, No SDMA*/ + (proto->flags & IPS_PROTO_FLAG_SPIO) || + !(protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) || + !PSMI_IS_DRIVER_GPUDIRECT_ENABLED) + err = psmi_handle_error(PSMI_EP_NORETURN, + PSM2_INTERNAL_ERR, + "Requires hfi1 driver with GPU-Direct feature enabled.\n"); + proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND; + proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV; + } else { + /* The following environment variables are here for internal + * experimentation and will not be documented for any customers. + */ + /* Use GPUDirect RDMA for SDMA send? */ + union psmi_envvar_val env_gpudirect_rdma_send; + psmi_getenv("PSM2_GPUDIRECT_RDMA_SEND", + "Use GPUDirect RDMA support to allow the HFI to directly" + " read from the GPU for SDMA. Requires driver" + " support.(default is disabled i.e. 
0)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val)0, /* Disabled by default */ + &env_gpudirect_rdma_send); + + if (env_gpudirect_rdma_send.e_uint && device_support_gpudirect) { + if (!PSMI_IS_CUDA_ENABLED || + /* All pio, No SDMA*/ + (proto->flags & IPS_PROTO_FLAG_SPIO)) + err = psmi_handle_error(PSMI_EP_NORETURN, + PSM2_INTERNAL_ERR, + "Unable to start run as PSM would require cuda, sdma" + "and TID support\n"); + proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND; + } + /* Use GPUDirect RDMA for recv? */ + union psmi_envvar_val env_gpudirect_rdma_recv; + psmi_getenv("PSM2_GPUDIRECT_RDMA_RECV", + "Use GPUDirect RDMA support to allow the HFI to directly" + " write into GPU. Requires driver support.(default is" + " disabled i.e. 0)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val)0, /* Disabled by default */ + &env_gpudirect_rdma_recv); + + if (env_gpudirect_rdma_recv.e_uint && device_support_gpudirect) { + if (!PSMI_IS_CUDA_ENABLED || + !(protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED)) + err = psmi_handle_error(PSMI_EP_NORETURN, + PSM2_INTERNAL_ERR, + "Unable to start run as PSM would require cuda," + " sdma and TID support\n"); + proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV; + } + } + + if (PSMI_IS_CUDA_ENABLED && + (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED)) { + struct psmi_rlimit_mpool rlim = CUDA_HOSTBUFFER_LIMITS; + uint32_t maxsz, chunksz, max_elements; + + if ((err = psmi_parse_mpool_env(proto->mq, 1, + &rlim, &maxsz, &chunksz))) + goto fail; + + /* the maxsz is the amount in MB, not the number of entries, + * since the element size depends on the window size */ + max_elements = (maxsz*1024*1024) / proto->mq->hfi_base_window_rv; + /* mpool requires max_elements to be power of 2. round down. 
*/ + max_elements = 1 << (31 - __builtin_clz(max_elements)); + proto->cuda_hostbuf_send_cfg.bufsz = proto->mq->hfi_base_window_rv; + proto->cuda_hostbuf_pool_send = + psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf), + chunksz, max_elements, 0, + UNDEFINED, NULL, NULL, + psmi_cuda_hostbuf_alloc_func, + (void *) + &proto->cuda_hostbuf_send_cfg); + + if (proto->cuda_hostbuf_pool_send == NULL) { + err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, + "Couldn't allocate CUDA host send buffer pool"); + goto fail; + } + + /* use the same number of elements for the small pool */ + proto->cuda_hostbuf_small_send_cfg.bufsz = CUDA_SMALLHOSTBUF_SZ; + proto->cuda_hostbuf_pool_small_send = + psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf), + chunksz, max_elements, 0, + UNDEFINED, NULL, NULL, + psmi_cuda_hostbuf_alloc_func, + (void *) + &proto->cuda_hostbuf_small_send_cfg); + + if (proto->cuda_hostbuf_pool_small_send == NULL) { + err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, + "Couldn't allocate CUDA host small send buffer pool"); + goto fail; + } + + /* Configure the amount of prefetching */ + union psmi_envvar_val env_prefetch_limit; + + psmi_getenv("PSM2_CUDA_PREFETCH_LIMIT", + "How many TID windows to prefetch at RTS time(default is 2)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val)CUDA_WINDOW_PREFETCH_DEFAULT, + &env_prefetch_limit); + proto->cuda_prefetch_limit = env_prefetch_limit.e_uint; + } +#endif +fail: + return err; +} + +psm2_error_t +ips_proto_fini(struct ips_proto *proto, int force, uint64_t timeout_in) +{ + struct psmi_eptab_iterator itor; + uint64_t t_start; + uint64_t t_grace_start, t_grace_time, t_grace_interval; + psm2_epaddr_t epaddr; + psm2_error_t err = PSM2_OK; + int i; + union psmi_envvar_val grace_intval; + + psmi_getenv("PSM2_CLOSE_GRACE_PERIOD", + "Additional grace period in seconds for closing end-point.", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)0, &grace_intval); + + if (getenv("PSM2_CLOSE_GRACE_PERIOD")) { + t_grace_time = grace_intval.e_uint * SEC_ULL; + } else if (timeout_in > 0) { + /* default to half of the close time-out */ + t_grace_time = timeout_in / 2; + } else { + /* propagate the infinite time-out case */ + t_grace_time = 0; + } + + if (t_grace_time > 0 && t_grace_time < PSMI_MIN_EP_CLOSE_TIMEOUT) + t_grace_time = PSMI_MIN_EP_CLOSE_TIMEOUT; + + /* At close we will busy wait for the grace interval to see if any + * receive progress is made. If progress is made we will wait for + * another grace interval, until either no progress is made or the + * entire grace period has passed. If the grace interval is too low + * we may miss traffic and exit too early. If the grace interval is + * too large the additional time spent while closing the program + * will become visible to the user. */ + psmi_getenv("PSM2_CLOSE_GRACE_INTERVAL", + "Grace interval in seconds for closing end-point.", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)0, &grace_intval); + + if (getenv("PSM2_CLOSE_GRACE_INTERVAL")) { + t_grace_interval = grace_intval.e_uint * SEC_ULL; + } else { + /* A heuristic is used to scale up the timeout linearly with + * the number of endpoints, and we allow one second per 1000 + * endpoints. 
*/ + t_grace_interval = (proto->ep->connections * SEC_ULL) / 1000; + } + + if (t_grace_interval < PSMI_MIN_EP_CLOSE_GRACE_INTERVAL) + t_grace_interval = PSMI_MIN_EP_CLOSE_GRACE_INTERVAL; + if (t_grace_interval > PSMI_MAX_EP_CLOSE_GRACE_INTERVAL) + t_grace_interval = PSMI_MAX_EP_CLOSE_GRACE_INTERVAL; + + PSMI_LOCK_ASSERT(proto->mq->progress_lock); + + t_start = proto->t_fini = get_cycles(); + + /* Close whatever has been left open */ + if (proto->num_connected_outgoing > 0) { + int num_disc = 0; + int *mask; + psm2_error_t *errs; + psm2_epaddr_t *epaddr_array; + + psmi_epid_itor_init(&itor, proto->ep); + while ((epaddr = psmi_epid_itor_next(&itor))) { + if (epaddr->ptlctl->ptl == proto->ptl) + num_disc++; + } + psmi_epid_itor_fini(&itor); + mask = + (int *)psmi_calloc(proto->ep, UNDEFINED, num_disc, + sizeof(int)); + errs = (psm2_error_t *) + psmi_calloc(proto->ep, UNDEFINED, num_disc, + sizeof(psm2_error_t)); + epaddr_array = (psm2_epaddr_t *) + psmi_calloc(proto->ep, UNDEFINED, num_disc, + sizeof(psm2_epaddr_t)); + + if (errs == NULL || epaddr_array == NULL || mask == NULL) { + if (epaddr_array) + psmi_free(epaddr_array); + if (errs) + psmi_free(errs); + if (mask) + psmi_free(mask); + err = PSM2_NO_MEMORY; + goto fail; + } + psmi_epid_itor_init(&itor, proto->ep); + i = 0; + while ((epaddr = psmi_epid_itor_next(&itor))) { + /* + * if cstate_outgoing is CSTATE_NONE, then we know it + * is an uni-directional connect, in that the peer + * sent a connect request to us, but we never sent one + * out to the peer epid. Ignore handling those in + * ips_proto_disconnect() as we will do the right thing + * when a disconnect request for the epaddr comes in from the peer. + */ + if (epaddr->ptlctl->ptl == proto->ptl && + ((ips_epaddr_t *) epaddr)->cstate_outgoing != CSTATE_NONE) { + mask[i] = 1; + epaddr_array[i] = epaddr; + i++; + IPS_MCTXT_REMOVE((ips_epaddr_t *) epaddr); + } + } + psmi_epid_itor_fini(&itor); + err = ips_proto_disconnect(proto, force, num_disc, epaddr_array, + mask, errs, timeout_in); + psmi_free(mask); + psmi_free(errs); + psmi_free(epaddr_array); + } + + t_grace_start = get_cycles(); + + while (psmi_cycles_left(t_grace_start, t_grace_time)) { + uint64_t t_grace_interval_start = get_cycles(); + int num_disconnect_requests = proto->num_disconnect_requests; + PSMI_BLOCKUNTIL( + proto->ep, err, + proto->num_connected_incoming == 0 || + (!psmi_cycles_left(t_start, timeout_in) && + (!psmi_cycles_left(t_grace_interval_start, + t_grace_interval) || + !psmi_cycles_left(t_grace_start, t_grace_time)))); + if (num_disconnect_requests == proto->num_disconnect_requests) { + /* nothing happened in this grace interval so break out early */ + break; + } + } + +#if _HFI_DEBUGGING + if (_HFI_PRDBG_ON) { + uint64_t t_grace_finish = get_cycles(); + + _HFI_PRDBG_ALWAYS( + "Closing endpoint disconnect left to=%d,from=%d after %d millisec of grace (out of %d)\n", + proto->num_connected_outgoing, proto->num_connected_incoming, + (int)(cycles_to_nanosecs(t_grace_finish - t_grace_start) / + MSEC_ULL), (int)(t_grace_time / MSEC_ULL)); + } +#endif + +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED) { + PSMI_CUDA_CALL(cuStreamDestroy, proto->cudastream_send); + } +#endif + + if ((err = ips_ibta_fini(proto))) + goto fail; + + if ((err = ips_proto_am_fini(&proto->proto_am))) + goto fail; + + if ((err = ips_scbctrl_fini(&proto->scbc_egr))) + goto fail; + + ips_proto_recv_fini(proto); + + if (proto->protoexp) { + if ((err = ips_protoexp_fini(proto->protoexp))) + goto fail; + } else { + ips_scbctrl_fini(proto->scbc_rv); 
+ psmi_free(proto->scbc_rv); + } + + psmi_mpool_destroy(proto->pend_sends_pool); + psmi_mpool_destroy(proto->timer_pool); + + psmi_free(proto->sdma_scb_queue); + +fail: + proto->t_fini = proto->t_init = 0; + return err; +} + +static +psm2_error_t +proto_sdma_init(struct ips_proto *proto, const psmi_context_t *context) +{ + union psmi_envvar_val env_sdma, env_hfiegr; + psm2_error_t err = PSM2_OK; + + /* + * Only initialize if RUNTIME_SDMA is enabled. + */ + psmi_assert_always(psmi_hal_has_cap(PSM_HAL_CAP_SDMA)); + + psmi_getenv("PSM2_SDMA", + "hfi send dma flags (0 disables send dma, 2 disables send pio, " + "1 for both sdma/spio, default 1)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val)1, &env_sdma); + if (env_sdma.e_uint == 0) + proto->flags |= IPS_PROTO_FLAG_SPIO; + else if (env_sdma.e_uint == 2) + proto->flags |= IPS_PROTO_FLAG_SDMA; + + if (!(proto->flags & (IPS_PROTO_FLAG_SDMA | IPS_PROTO_FLAG_SPIO))) { + /* use both spio and sdma */ + if(psmi_cpu_model == CPUID_MODEL_PHI_GEN2 || psmi_cpu_model == CPUID_MODEL_PHI_GEN2M) + { + proto->iovec_thresh_eager = MQ_HFI_THRESH_EGR_SDMA_SQ_PHI2; + proto->iovec_thresh_eager_blocking = MQ_HFI_THRESH_EGR_SDMA_PHI2; + } else { + proto->iovec_thresh_eager = MQ_HFI_THRESH_EGR_SDMA_SQ_XEON; + proto->iovec_thresh_eager_blocking = MQ_HFI_THRESH_EGR_SDMA_XEON; + } + + if (!psmi_getenv("PSM2_MQ_EAGER_SDMA_SZ", + "hfi pio-to-sdma eager switchover", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) proto->iovec_thresh_eager, + &env_hfiegr)) { + proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = + env_hfiegr.e_uint; + } + } else if (proto->flags & IPS_PROTO_FLAG_SDMA) { /* all sdma */ + proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = + 0; + } else if (proto->flags & IPS_PROTO_FLAG_SPIO) { /* all spio */ + proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = + ~0U; + } + + return err; +} + +static +void ctrlq_init(struct ips_ctrlq *ctrlq, struct ips_proto *proto) +{ + /* clear the ctrl send queue */ + memset(ctrlq, 0, sizeof(*ctrlq)); + + proto->message_type_to_index[OPCODE_ACK] = CTRL_MSG_ACK_QUEUED; + proto->message_type_to_index[OPCODE_NAK] = CTRL_MSG_NAK_QUEUED; + proto->message_type_to_index[OPCODE_BECN] = CTRL_MSG_BECN_QUEUED; + proto->message_type_to_index[OPCODE_ERR_CHK] = CTRL_MSG_ERR_CHK_QUEUED; + proto->message_type_to_index[OPCODE_ERR_CHK_GEN] = + CTRL_MSG_ERR_CHK_GEN_QUEUED; + proto->message_type_to_index[OPCODE_CONNECT_REQUEST] = + CTRL_MSG_CONNECT_REQUEST_QUEUED; + proto->message_type_to_index[OPCODE_CONNECT_REPLY] = + CTRL_MSG_CONNECT_REPLY_QUEUED; + proto->message_type_to_index[OPCODE_DISCONNECT_REQUEST] = + CTRL_MSG_DISCONNECT_REQUEST_QUEUED; + proto->message_type_to_index[OPCODE_DISCONNECT_REPLY] = + CTRL_MSG_DISCONNECT_REPLY_QUEUED; + + ctrlq->ctrlq_head = ctrlq->ctrlq_tail = 0; + ctrlq->ctrlq_overflow = 0; + ctrlq->ctrlq_proto = proto; + + /* + * We never enqueue ctrl messages with real payload. If we do, + * the queue 'elem_payload' size needs to be big enough. + * Note: enqueue nak/ack is very important for performance. 
+ */ + proto->ctrl_msg_queue_enqueue = + CTRL_MSG_ACK_QUEUED | + CTRL_MSG_NAK_QUEUED | + CTRL_MSG_BECN_QUEUED; + + psmi_timer_entry_init(&ctrlq->ctrlq_timer, + ips_proto_timer_ctrlq_callback, ctrlq); + + return; +} + +static __inline__ void _build_ctrl_message(struct ips_proto *proto, + struct ips_flow *flow, uint8_t message_type, + ips_scb_t *ctrlscb, uint32_t paylen) +{ + uint32_t tot_paywords = (sizeof(struct ips_message_header) + + HFI_CRC_SIZE_IN_BYTES + paylen) >> BYTE2DWORD_SHIFT; + uint32_t slid, dlid; + ips_epaddr_t *ipsaddr = flow->ipsaddr; + struct ips_message_header *p_hdr = &ctrlscb->ips_lrh; + ips_path_rec_t *ctrl_path = + ipsaddr->pathgrp->pg_path[ipsaddr-> + hpp_index][IPS_PATH_HIGH_PRIORITY]; + + if ((proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) && + (++ipsaddr->hpp_index >= + ipsaddr->pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY])) + ipsaddr->hpp_index = 0; + + /* + * If the size of the transfer is NOT within the "exclusion range", + * then use the "dispersive routling" slid/dlid. Otherwise + * use the base LIDS. + * + * This is a control message, so it should never be a TID transfer. + */ + slid = ctrl_path->pr_slid; + dlid = ctrl_path->pr_dlid; + if (ctrlscb->scb_flags & IPS_SEND_FLAG_NO_LMC) { + slid = ipsaddr->pathgrp->pg_base_slid; + dlid = ipsaddr->pathgrp->pg_base_dlid; + } + + /* Control messages go over the control path. */ + p_hdr->lrh[0] = __cpu_to_be16(HFI_LRH_BTH | + ((ctrl_path->pr_sl & HFI_LRH_SL_MASK) << + HFI_LRH_SL_SHIFT) | + ((proto->sl2sc[ctrl_path->pr_sl] & + HFI_LRH_SC_MASK) << HFI_LRH_SC_SHIFT)); + p_hdr->lrh[1] = dlid; + p_hdr->lrh[2] = __cpu_to_be16(tot_paywords & HFI_LRH_PKTLEN_MASK); + p_hdr->lrh[3] = slid; + + p_hdr->bth[0] = __cpu_to_be32(ctrl_path->pr_pkey | + (message_type << HFI_BTH_OPCODE_SHIFT)); + + /* If flow is congested then generate a BECN for path. */ + if_pf(flow->flags & IPS_FLOW_FLAG_GEN_BECN) { + p_hdr->bth[1] = __cpu_to_be32(ipsaddr->context | + ipsaddr-> + subcontext << + HFI_BTH_SUBCTXT_SHIFT | flow-> + flowid << HFI_BTH_FLOWID_SHIFT | + proto->epinfo. + ep_baseqp << HFI_BTH_QP_SHIFT | 1 + << HFI_BTH_BECN_SHIFT); + flow->flags &= ~IPS_FLOW_FLAG_GEN_BECN; + } + else { + p_hdr->bth[1] = __cpu_to_be32(ipsaddr->context | + ipsaddr-> + subcontext << + HFI_BTH_SUBCTXT_SHIFT | flow-> + flowid << HFI_BTH_FLOWID_SHIFT | + proto->epinfo. + ep_baseqp << HFI_BTH_QP_SHIFT); + } + + /* p_hdr->bth[2] already set by caller, or don't care */ + /* p_hdr->ack_seq_num already set by caller, or don't care */ + + p_hdr->connidx = ipsaddr->connidx_outgoing; + p_hdr->flags = 0; + + p_hdr->khdr.kdeth0 = __cpu_to_le32( + (ctrlscb->scb_flags & IPS_SEND_FLAG_INTR) | + (IPS_PROTO_VERSION << HFI_KHDR_KVER_SHIFT)); + p_hdr->khdr.kdeth1 = __cpu_to_le32(proto->epinfo.ep_jkey); + + return; +} + +psm2_error_t +ips_proto_timer_ctrlq_callback(struct psmi_timer *timer, uint64_t t_cyc_expire) +{ + struct ips_ctrlq *ctrlq = (struct ips_ctrlq *)timer->context; + struct ips_proto *proto = ctrlq->ctrlq_proto; + struct ips_ctrlq_elem *cqe; + uint32_t have_cksum = proto->flags & IPS_PROTO_FLAG_CKSUM; + psm2_error_t err; + + /* service ctrl send queue first */ + while (ctrlq->ctrlq_cqe[ctrlq->ctrlq_tail].msg_queue_mask) { + cqe = &ctrlq->ctrlq_cqe[ctrlq->ctrlq_tail]; + /* When PSM_PERF is enabled, the following line causes the + PMU to start a stop watch to measure instruction cycles of the + TX speedpath of PSM. The stop watch is stopped below. 
*/ + GENERIC_PERF_BEGIN(PSM_TX_SPEEDPATH_CTR); + if (cqe->msg_scb.flow->transfer == PSM_TRANSFER_PIO) { + err = psmi_hal_spio_transfer_frame(proto, + cqe->msg_scb.flow, &cqe->msg_scb.pbc, + cqe->msg_scb.cksum, 0, PSMI_TRUE, + have_cksum, cqe->msg_scb.cksum[0], + proto->ep->context.psm_hw_ctxt +#ifdef PSM_CUDA + , 0 +#endif + ); + } else { + err = ips_dma_transfer_frame(proto, + cqe->msg_scb.flow, &cqe->msg_scb, + cqe->msg_scb.cksum, 0, + have_cksum, cqe->msg_scb.cksum[0]); + } + /* When PSM_PERF is enabled, the following line causes the + PMU to stop a stop watch to measure instruction cycles of the + TX speedpath of PSM. The stop watch was started above. */ + GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR); + + if (err == PSM2_OK) { + PSM2_LOG_PKT_STRM(PSM2_LOG_TX,&cqe->msg_scb.ips_lrh,"PKT_STRM: err: %d", err); + ips_proto_epaddr_stats_set(proto, cqe->message_type); + *cqe->msg_queue_mask &= + ~message_type2index(proto, cqe->message_type); + cqe->msg_queue_mask = NULL; + ctrlq->ctrlq_tail = + (ctrlq->ctrlq_tail + 1) % CTRL_MSG_QEUEUE_SIZE; + } else { + psmi_assert(err == PSM2_EP_NO_RESOURCES); + + if (proto->flags & IPS_PROTO_FLAG_SDMA) + proto->stats.writev_busy_cnt++; + else + proto->stats.pio_busy_cnt++; + /* re-request a timer expiration */ + psmi_timer_request(proto->timerq, &ctrlq->ctrlq_timer, + PSMI_TIMER_PRIO_0); + return PSM2_OK; + } + } + + return PSM2_OK; +} + +/* Update cqe struct which is a single element from pending control message queue */ +PSMI_ALWAYS_INLINE( +void ips_proto_update_cqe(struct ips_ctrlq_elem *cqe, uint16_t *msg_queue_mask, + struct ips_flow *flow, ips_scb_t *ctrlscb, uint8_t message_type)){ + + cqe->message_type = message_type; + cqe->msg_queue_mask = msg_queue_mask; + psmi_mq_mtucpy(&cqe->msg_scb.ips_lrh, + &ctrlscb->ips_lrh, sizeof(ctrlscb->ips_lrh)); + cqe->msg_scb.flow = flow; + cqe->msg_scb.cksum[0] = ctrlscb->cksum[0]; +} + +psm2_error_t +ips_proto_send_ctrl_message(struct ips_flow *flow, uint8_t message_type, + uint16_t *msg_queue_mask, ips_scb_t *ctrlscb, + void *payload, uint32_t paylen) +{ + psm2_error_t err = PSM2_EP_NO_RESOURCES; + ips_epaddr_t *ipsaddr = flow->ipsaddr; + struct ips_proto *proto = ((psm2_epaddr_t) ipsaddr)->proto; + struct ips_ctrlq *ctrlq = &proto->ctrlq; + struct ips_ctrlq_elem *cqe = ctrlq->ctrlq_cqe; + uint32_t have_cksum; + + psmi_assert(message_type >= OPCODE_ACK && + message_type <= OPCODE_DISCONNECT_REPLY); + psmi_assert((paylen & 0x3) == 0); /* require 4-byte multiple */ + psmi_assert(flow->frag_size >= + (paylen + PSM_CRC_SIZE_IN_BYTES)); + + /* Drain queue if non-empty */ + if (cqe[ctrlq->ctrlq_tail].msg_queue_mask) + ips_proto_timer_ctrlq_callback(&ctrlq->ctrlq_timer, 0ULL); + + /* finish setup control message header */ + ips_set_LMC_LID_choice(proto, ctrlscb, paylen); + _build_ctrl_message(proto, flow, message_type, ctrlscb, paylen); + + /* If enabled checksum control message */ + have_cksum = proto->flags & IPS_PROTO_FLAG_CKSUM; + if (have_cksum) { + ctrlscb->ips_lrh.flags |= IPS_SEND_FLAG_PKTCKSUM; + ips_do_cksum(proto, &ctrlscb->ips_lrh, + payload, paylen, ctrlscb->cksum); + } + + /* + * for ACK/NAK/BECN, we use the fast flow to send over, otherwise, + * we use the original flow + */ + if (message_type == OPCODE_ACK || + message_type == OPCODE_NAK || + message_type == OPCODE_BECN) + { + psmi_assert(proto->msgflowid < EP_FLOW_LAST); + flow = &ipsaddr->flows[proto->msgflowid]; + } + + switch (flow->transfer) { + case PSM_TRANSFER_PIO: + /* When PSM_PERF is enabled, the following line causes the + PMU to start a stop 
watch to measure instruction cycles of the + TX speedpath of PSM. The stop watch is stopped below. */ + GENERIC_PERF_BEGIN(PSM_TX_SPEEDPATH_CTR); + err = psmi_hal_spio_transfer_frame(proto, flow, + &ctrlscb->pbc, payload, paylen, + PSMI_TRUE, have_cksum, ctrlscb->cksum[0], + proto->ep->context.psm_hw_ctxt +#ifdef PSM_CUDA + , 0 +#endif + ); + /* When PSM_PERF is enabled, the following line causes the + PMU to stop a stop watch to measure instruction cycles of the + TX speedpath of PSM. The stop watch was started above. */ + GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR); + break; + case PSM_TRANSFER_DMA: + /* When PSM_PERF is enabled, the following line causes the + PMU to start a stop watch to measure instruction cycles of the + TX speedpath of PSM. The stop watch is stopped below. */ + GENERIC_PERF_BEGIN(PSM_TX_SPEEDPATH_CTR); + err = ips_dma_transfer_frame(proto, flow, + ctrlscb, payload, paylen, + have_cksum, ctrlscb->cksum[0]); + /* When PSM_PERF is enabled, the following line causes the + PMU to stop a stop watch to measure instruction cycles of the + TX speedpath of PSM. The stop watch was started above. */ + GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR); + break; + default: + err = PSM2_INTERNAL_ERR; + break; + } + + if (err == PSM2_OK) + { + PSM2_LOG_PKT_STRM(PSM2_LOG_TX,&ctrlscb->ips_lrh,"PKT_STRM: err: %d", err); + ips_proto_epaddr_stats_set(proto, message_type); + } + + _HFI_VDBG("transfer_frame of opcode=0x%x,remote_lid=%d," + "src=%p,len=%d returns %d\n", + (int)_get_proto_hfi_opcode(&ctrlscb->ips_lrh), + __be16_to_cpu(ctrlscb->ips_lrh.lrh[1]), payload, paylen, err); + + if (err != PSM2_EP_NO_RESOURCES) + return err; + if (proto->flags & IPS_PROTO_FLAG_SDMA) + proto->stats.writev_busy_cnt++; + else + proto->stats.pio_busy_cnt++; + + if (proto->ctrl_msg_queue_enqueue & proto-> + message_type_to_index[message_type]) { + /* We only queue control msg without payload */ + psmi_assert(paylen == 0); + + if ((*msg_queue_mask) & proto-> + message_type_to_index[message_type]) { + + if (message_type == OPCODE_ACK) { + /* Pending queue should contain latest ACK type message, + * overwrite the previous one. */ + ips_proto_update_cqe(&cqe[flow->ack_index], msg_queue_mask, + flow, ctrlscb, message_type); + } + + err = PSM2_OK; + } else if (cqe[ctrlq->ctrlq_head].msg_queue_mask == NULL) { + /* entry is free */ + if (message_type == OPCODE_ACK) { + /* Track the index of last ACK type message in queue*/ + flow->ack_index = ctrlq->ctrlq_head; + } + + *msg_queue_mask |= + message_type2index(proto, message_type); + + ips_proto_update_cqe(&cqe[ctrlq->ctrlq_head], msg_queue_mask, + flow, ctrlscb, message_type); + + ctrlq->ctrlq_head = + (ctrlq->ctrlq_head + 1) % CTRL_MSG_QEUEUE_SIZE; + /* _HFI_INFO("requesting ctrlq timer for msgtype=%d!\n", message_type); */ + psmi_timer_request(proto->timerq, &ctrlq->ctrlq_timer, + PSMI_TIMER_PRIO_0); + + err = PSM2_OK; + } else { + proto->ctrl_msg_queue_overflow++; + } + } + + return err; +} + +void MOCKABLE(ips_proto_flow_enqueue)(struct ips_flow *flow, ips_scb_t *scb) +{ + ips_epaddr_t *ipsaddr = flow->ipsaddr; + struct ips_proto *proto = ((psm2_epaddr_t) ipsaddr)->proto; + + ips_scb_prepare_flow_inner(proto, ipsaddr, flow, scb); + if ((proto->flags & IPS_PROTO_FLAG_CKSUM) && + (scb->tidctrl == 0) && (scb->nfrag == 1)) { + scb->ips_lrh.flags |= IPS_SEND_FLAG_PKTCKSUM; + ips_do_cksum(proto, &scb->ips_lrh, + ips_scb_buffer(scb), scb->payload_size, &scb->cksum[0]); + } + + /* If this is the first scb on flow, pull in both timers. 
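+ * The invariant, as suggested by the asserts below: either the flow has
+ * no scbs and both timer pointers are NULL, or both point at the timers
+ * of the first scb that was enqueued on this flow.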
*/ + if (flow->timer_ack == NULL) { + psmi_assert(flow->timer_send == NULL); + flow->timer_ack = scb->timer_ack; + flow->timer_send = scb->timer_send; + } + psmi_assert(flow->timer_ack != NULL); + psmi_assert(flow->timer_send != NULL); + + /* Every flow has a pending head that points into the unacked queue. + * If sends are already pending, process those first */ + if (SLIST_EMPTY(&flow->scb_pend)) + { + PSM2_LOG_PKT_STRM(PSM2_LOG_PEND,&scb->ips_lrh,"PKT_STRM: pkt in pend list"); + SLIST_FIRST(&flow->scb_pend) = scb; + } + + /* Insert scb into flow's unacked queue */ + STAILQ_INSERT_TAIL(&flow->scb_unacked, scb, nextq); + +#ifdef PSM_DEBUG + /* update scb counters in flow. */ + flow->scb_num_pending++; + flow->scb_num_unacked++; +#endif +} +MOCK_DEF_EPILOGUE(ips_proto_flow_enqueue); + +/* + * This function attempts to flush the current list of pending + * packets through PIO. + * + * Recoverable errors: + * PSM2_OK: Packet triggered through PIO. + * PSM2_EP_NO_RESOURCES: No PIO bufs available or cable pulled. + * + * Unrecoverable errors: + * PSM2_EP_NO_NETWORK: No network, no lid, ... + * PSM2_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc. + */ +psm2_error_t +ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed) +{ + struct ips_proto *proto = ((psm2_epaddr_t) (flow->ipsaddr))->proto; + struct ips_scb_pendlist *scb_pend = &flow->scb_pend; + int num_sent = 0; + uint64_t t_cyc; + ips_scb_t *scb; + psm2_error_t err = PSM2_OK; + + psmi_assert(!SLIST_EMPTY(scb_pend)); + + /* Out of credits - ACKs/NAKs reclaim recredit or congested flow */ + if_pf((flow->credits <= 0) || (flow->flags & IPS_FLOW_FLAG_CONGESTED)) { + if (nflushed) + *nflushed = 0; + return PSM2_EP_NO_RESOURCES; + } + + while (!SLIST_EMPTY(scb_pend) && flow->credits > 0) { + scb = SLIST_FIRST(scb_pend); + psmi_assert(scb->nfrag == 1); + /* When PSM_PERF is enabled, the following line causes the + PMU to start a stop watch to measure instruction cycles of the + TX speedpath of PSM. The stop watch is stopped below. */ + GENERIC_PERF_BEGIN(PSM_TX_SPEEDPATH_CTR); + if ((err = psmi_hal_spio_transfer_frame(proto, flow, &scb->pbc, + ips_scb_buffer(scb), + scb->payload_size, + PSMI_FALSE, + scb->ips_lrh.flags & + IPS_SEND_FLAG_PKTCKSUM, + scb->cksum[0], + proto->ep->context.psm_hw_ctxt +#ifdef PSM_CUDA + , IS_TRANSFER_BUF_GPU_MEM(scb) +#endif + )) + == PSM2_OK) { + /* When PSM_PERF is enabled, the following line causes the + PMU to stop a stop watch to measure instruction cycles of the + TX speedpath of PSM. The stop watch was started above. */ + GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR); + t_cyc = get_cycles(); + scb->scb_flags &= ~IPS_SEND_FLAG_PENDING; + scb->ack_timeout = proto->epinfo.ep_timeout_ack; + scb->abs_timeout = proto->epinfo.ep_timeout_ack + t_cyc; + psmi_timer_request(proto->timerq, flow->timer_ack, + scb->abs_timeout); + num_sent++; + flow->credits--; + SLIST_REMOVE_HEAD(scb_pend, next); +#ifdef PSM_DEBUG + flow->scb_num_pending--; +#endif + PSM2_LOG_PKT_STRM(PSM2_LOG_TX,&scb->ips_lrh,"PKT_STRM: err: %d", err); + + } else + { + /* When PSM_PERF is enabled, the following line causes the + PMU to stop a stop watch to measure instruction cycles of the + TX speedpath of PSM. The stop watch was started above. 
*/ + GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR); + break; + } + } + + /* If out of flow credits re-schedule send timer */ + if (!SLIST_EMPTY(scb_pend)) { + proto->stats.pio_busy_cnt++; + psmi_timer_request(proto->timerq, flow->timer_send, + get_cycles() + proto->timeout_send); + } + + if (nflushed != NULL) + *nflushed = num_sent; + + return err; +} + +/* + * Flush all packets currently marked as pending + */ +static psm2_error_t scb_dma_send(struct ips_proto *proto, struct ips_flow *flow, + struct ips_scb_pendlist *slist, int *num_sent); + +/* + * Flush all packets queued up on a flow via send DMA. + * + * Recoverable errors: + * PSM2_OK: Able to flush entire pending queue for DMA. + * PSM2_OK_NO_PROGRESS: Flushed at least 1 but not all pending packets for DMA. + * PSM2_EP_NO_RESOURCES: No scb's available to handle unaligned packets + * or writev returned a recoverable error (no mem for + * descriptors, dma interrupted or no space left in dma + * queue). + * + * Unrecoverable errors: + * PSM2_EP_DEVICE_FAILURE: Unexpected error calling writev(), chip failure, + * rxe/txe parity error. + * PSM2_EP_NO_NETWORK: No network, no lid, ... + */ +psm2_error_t +ips_proto_flow_flush_dma(struct ips_flow *flow, int *nflushed) +{ + struct ips_proto *proto = ((psm2_epaddr_t) (flow->ipsaddr))->proto; + struct ips_scb_pendlist *scb_pend = &flow->scb_pend; + ips_scb_t *scb = NULL; + psm2_error_t err = PSM2_OK; + int nsent = 0; + + psmi_assert(!SLIST_EMPTY(scb_pend)); + + /* Out of credits - ACKs/NAKs reclaim recredit or congested flow */ + if_pf((flow->credits <= 0) || (flow->flags & IPS_FLOW_FLAG_CONGESTED)) { + if (nflushed) + *nflushed = 0; + return PSM2_EP_NO_RESOURCES; + } + + err = scb_dma_send(proto, flow, scb_pend, &nsent); + if (err != PSM2_OK && err != PSM2_EP_NO_RESOURCES && + err != PSM2_OK_NO_PROGRESS) + goto fail; + + if (nsent > 0) { + uint64_t t_cyc = get_cycles(); + int i = 0; + /* + * inflight counter proto->iovec_cntr_next_inflight should not drift + * from completion counter proto->iovec_cntr_last_completed away too + * far because we only have very small scb counter compared with + * uint32_t counter value. + */ +#ifdef PSM_DEBUG + flow->scb_num_pending -= nsent; +#endif + SLIST_FOREACH(scb, scb_pend, next) { + if (++i > nsent) + break; + + PSM2_LOG_PKT_STRM(PSM2_LOG_TX,&scb->ips_lrh,"PKT_STRM: (dma)"); + + scb->scb_flags &= ~IPS_SEND_FLAG_PENDING; + scb->ack_timeout = + scb->nfrag * proto->epinfo.ep_timeout_ack; + scb->abs_timeout = + scb->nfrag * proto->epinfo.ep_timeout_ack + t_cyc; + + psmi_assert(proto->sdma_scb_queue + [proto->sdma_fill_index] == NULL); + proto->sdma_scb_queue[proto->sdma_fill_index] = scb; + scb->dma_complete = 0; + + proto->sdma_avail_counter--; + proto->sdma_fill_index++; + if (proto->sdma_fill_index == proto->sdma_queue_size) + proto->sdma_fill_index = 0; + + /* Flow credits can temporarily go to negative for + * packets tracking purpose, because we have sdma + * chunk processing which can't send exact number + * of packets as the number of credits. + */ + flow->credits -= scb->nfrag; + } + SLIST_FIRST(scb_pend) = scb; + } + + if (SLIST_FIRST(scb_pend) != NULL) { + psmi_assert(flow->scb_num_pending > 0); + + switch (flow->protocol) { + case PSM_PROTOCOL_TIDFLOW: + /* For Tidflow we can cancel the ack timer if we have flow credits + * available and schedule the send timer. If we are out of flow + * credits then the ack timer is scheduled as we are waiting for + * an ACK to reclaim credits. This is required since multiple + * tidflows may be active concurrently. 
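+ *
+ * Summarized as a decision table (derived from the branches below):
+ *
+ *     credits > 0   cancel the ack timer, reschedule the send timer,
+ *                   and count the stall in writev_busy_cnt
+ *     credits == 0  re-arm the ack timer so that incoming ACKs can
+ *                   reclaim flow credits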
+ */
+ if (flow->credits > 0) {
+ /* Cancel ack timer and reschedule send timer. Increment
+ * writev_busy_cnt as this really is DMA buffer exhaustion.
+ */
+ psmi_timer_cancel(proto->timerq,
+ flow->timer_ack);
+ psmi_timer_request(proto->timerq,
+ flow->timer_send,
+ get_cycles() +
+ (proto->timeout_send << 1));
+ proto->stats.writev_busy_cnt++;
+ } else {
+ /* Re-instate ACK timer to reap flow credits */
+ psmi_timer_request(proto->timerq,
+ flow->timer_ack,
+ get_cycles() +
+ (proto->epinfo.
+ ep_timeout_ack >> 2));
+ }
+
+ break;
+ case PSM_PROTOCOL_GO_BACK_N:
+ default:
+ if (flow->credits > 0) {
+ /* Schedule send timer and increment writev_busy_cnt */
+ psmi_timer_request(proto->timerq,
+ flow->timer_send,
+ get_cycles() +
+ (proto->timeout_send << 1));
+ proto->stats.writev_busy_cnt++;
+ } else {
+ /* Schedule ACK timer to reap flow credits */
+ psmi_timer_request(proto->timerq,
+ flow->timer_ack,
+ get_cycles() +
+ (proto->epinfo.
+ ep_timeout_ack >> 2));
+ }
+ break;
+ }
+ } else {
+ /* Schedule ack timer */
+ psmi_timer_cancel(proto->timerq, flow->timer_send);
+ psmi_timer_request(proto->timerq, flow->timer_ack,
+ get_cycles() + proto->epinfo.ep_timeout_ack);
+ }
+
+ /* We overwrite error with its new meaning for flushing packets */
+ if (nsent > 0)
+ if (scb)
+ err = PSM2_OK_NO_PROGRESS; /* partial flush */
+ else
+ err = PSM2_OK; /* complete flush */
+ else
+ err = PSM2_EP_NO_RESOURCES; /* no flush at all */
+
+fail:
+ if (nflushed)
+ *nflushed = nsent;
+
+ return err;
+}
+
+/*
+ * Fault injection in dma sends. Since DMA through writev() is all-or-nothing,
+ * we don't inject faults on a packet-per-packet basis since the code gets
+ * quite complex. Instead, each call to flush_dma or transfer_frame is treated
+ * as an "event" and faults are generated according to the IPS_FAULTINJ_DMALOST
+ * ("dmalost") setting.
+ *
+ * The effect is as if the event was successful but the packet was dropped on
+ * the wire somewhere.
+ */
+PSMI_ALWAYS_INLINE(int dma_do_fault())
+{
+
+ if_pf(PSMI_FAULTINJ_ENABLED()) {
+ PSMI_FAULTINJ_STATIC_DECL(fi, "dmalost", 1,
+ IPS_FAULTINJ_DMALOST);
+ return psmi_faultinj_is_fault(fi);
+ }
+ else
+ return 0;
+}
+
+/*
+ * The driver defines the following sdma completion error codes, returned
+ * as negative values:
+ * #define SDMA_TXREQ_S_OK 0
+ * #define SDMA_TXREQ_S_SENDERROR 1
+ * #define SDMA_TXREQ_S_ABORTED 2
+ * #define SDMA_TXREQ_S_SHUTDOWN 3
+ *
+ * When the hfi is in freeze mode, the driver completes all pending
+ * sdma requests as aborted. Since PSM needs to recover from hfi
+ * freeze mode, this routine ignores aborted errors.
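+ *
+ * The reap loop below is a single-consumer circular ring: sdma_done_index
+ * chases sdma_fill_index and wraps at sdma_queue_size, stopping at the
+ * first slot that is still queued so completions are always retired in
+ * order. A minimal model of the idiom (helper names are illustrative,
+ * not the PSM2 HAL API):
+ *
+ *     while (done != fill) {
+ *             if (slot_status(done) == QUEUED)
+ *                     break;          // in-order: stop at first busy slot
+ *             release_slot(done);     // avail++, hand the scb back
+ *             if (++done == queue_size)
+ *                     done = 0;       // wrap
+ *     }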
+ */ +psm2_error_t ips_proto_dma_completion_update(struct ips_proto *proto) +{ + ips_scb_t *scb; + + while (proto->sdma_done_index != proto->sdma_fill_index) { + psmi_hal_sdma_ring_slot_status status; + uint32_t errorCode; + int rc = psmi_hal_get_sdma_ring_slot_status(proto->sdma_done_index, &status, &errorCode, + proto->ep->context.psm_hw_ctxt); + psmi_rmb(); + + if (rc < 0) + return PSM2_INTERNAL_ERR; + + if (status == PSM_HAL_SDMA_RING_QUEUED) + return PSM2_OK; + + /* Mark sdma request is complete */ + scb = proto->sdma_scb_queue[proto->sdma_done_index]; + if (scb) + { + psmi_assert(status == PSM_HAL_SDMA_RING_COMPLETE); + scb->dma_complete = 1; + proto->sdma_scb_queue[proto->sdma_done_index] = NULL; + } + + if (status == PSM_HAL_SDMA_RING_ERROR && (int)errorCode != -2) { + psm2_error_t err = + psmi_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE, + "SDMA completion error: %d (fd=%d, index=%d)", + 0 - ((int32_t)errorCode), + psmi_hal_get_fd(proto->ep->context. + psm_hw_ctxt), + proto->sdma_done_index); + return err; + } + + proto->sdma_avail_counter++; + proto->sdma_done_index++; + if (proto->sdma_done_index == proto->sdma_queue_size) + proto->sdma_done_index = 0; + } + + return PSM2_OK; +} + +/* + +Handles ENOMEM on a DMA completion. + + */ +static inline +psm2_error_t +handle_ENOMEM_on_DMA_completion(struct ips_proto *proto) +{ + psm2_error_t err; + time_t now = time(NULL); + + if (proto->protoexp && proto->protoexp->tidc.tid_cachemap.payload.nidle) { + uint64_t lengthEvicted = + ips_tidcache_evict(&proto->protoexp->tidc, -1); + + if (!proto->writevFailTime) + proto->writevFailTime = now; + + if (lengthEvicted) + return PSM2_OK; /* signals a retry of the writev command. */ + else { +#ifdef PSM_CUDA + if (PSMI_IS_GDR_COPY_ENABLED && gdr_cache_evict()) { + return PSM2_OK; + } else +#endif + return PSM2_EP_NO_RESOURCES; /* should signal a return of + no progress, and retry later */ + } + } +#ifdef PSM_CUDA + else if (PSMI_IS_GDR_COPY_ENABLED) { + uint64_t lengthEvicted = gdr_cache_evict(); + if (!proto->writevFailTime) + proto->writevFailTime = now; + + if (lengthEvicted) + return PSM2_OK; + else + return PSM2_EP_NO_RESOURCES; + } +#endif + else if (!proto->writevFailTime) + { + proto->writevFailTime = now; + return PSM2_EP_NO_RESOURCES; /* should signal a return of + no progress, and retry later */ + } + else + { + static const double thirtySeconds = 30.0; + + if (difftime(now, proto->writevFailTime) > + thirtySeconds) { + err = psmi_handle_error( + proto->ep, + PSM2_EP_DEVICE_FAILURE, + "SDMA completion error: out of " + "memory (fd=%d, index=%d)", + psmi_hal_get_fd(proto->ep->context.psm_hw_ctxt), + proto->sdma_done_index); + return err; + } + return PSM2_EP_NO_RESOURCES; /* should signal a return of + no progress, and retry later */ + } +} + +/* ips_dma_transfer_frame is used only for control messages, and is + * not enabled by default, and not tested by QA; expected send + * dma goes through scb_dma_send() */ +psm2_error_t +ips_dma_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, + ips_scb_t *scb, void *payload, uint32_t paylen, + uint32_t have_cksum, uint32_t cksum) +{ + ssize_t ret; + psm2_error_t err; + struct psm_hal_sdma_req_info *sdmahdr; + uint16_t iovcnt; + struct iovec iovec[2]; + + /* See comments above for fault injection */ + if_pf(dma_do_fault()) + return PSM2_OK; + + /* + * Check if there is a sdma queue slot. 
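+ * Completions are reaped lazily, so "no free slot" first forces a reap
+ * and only then fails. Schematically (names illustrative):
+ *
+ *     if (avail == 0) {
+ *             reap_completions();     // may retire finished slots
+ *             if (avail == 0)
+ *                     return NO_RESOURCES;    // caller retries later
+ *     }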
+ */ + if (proto->sdma_avail_counter == 0) { + err = ips_proto_dma_completion_update(proto); + if (err) + return err; + + if (proto->sdma_avail_counter == 0) { + return PSM2_EP_NO_RESOURCES; + } + } + + /* + * If we have checksum, put to the end of payload. We make sure + * there is enough space in payload for us to put 8 bytes checksum. + * for control message, payload is internal PSM buffer, not user buffer. + */ + if (have_cksum) { + uint32_t *ckptr = (uint32_t *) ((char *)payload + paylen); + *ckptr = cksum; + ckptr++; + *ckptr = cksum; + paylen += PSM_CRC_SIZE_IN_BYTES; + } + + /* + * Setup PBC. + */ + psmi_hal_set_pbc(proto, flow, PSMI_TRUE, + &scb->pbc, HFI_MESSAGE_HDR_SIZE, paylen); + + /* + * Setup SDMA header and io vector. + */ + size_t extra_bytes; + sdmahdr = psmi_get_sdma_req_info(scb, &extra_bytes); + sdmahdr->npkts = 1; + sdmahdr->fragsize = flow->frag_size; + sdmahdr->comp_idx = proto->sdma_fill_index; + psmi_assert(psmi_hal_dma_slot_available(proto->sdma_fill_index, proto->ep->context.psm_hw_ctxt)); + + iovcnt = 1; + iovec[0].iov_base = sdmahdr; + iovec[0].iov_len = psmi_hal_get_sdma_req_size(proto->ep->context.psm_hw_ctxt) + extra_bytes; + + if (paylen > 0) { + iovcnt++; + iovec[1].iov_base = payload; + iovec[1].iov_len = paylen; + } + +#ifdef PSM_CUDA + if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) { + sdmahdr->ctrl = 2 | + (PSM_HAL_EGR << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | + (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); + } else { +#endif + sdmahdr->ctrl = 1 | + (PSM_HAL_EGR << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | + (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); +#ifdef PSM_CUDA + } +#endif + /* + * Write into driver to do SDMA work. + */ +retry: + ret = psmi_hal_writev(iovec, iovcnt, &proto->epinfo, proto->ep->context.psm_hw_ctxt); + + if (ret > 0) { + proto->writevFailTime = 0; + psmi_assert_always(ret == 1); + + proto->sdma_avail_counter--; + proto->sdma_fill_index++; + if (proto->sdma_fill_index == proto->sdma_queue_size) + proto->sdma_fill_index = 0; + + /* + * Wait for completion of this control message if + * stack buffer payload is used. This should not be + * a performance issue because sdma control message + * is not a performance code path. + */ + if (iovcnt > 1) { + /* Setup scb ready for completion. */ + psmi_assert(proto->sdma_scb_queue + [sdmahdr->comp_idx] == NULL); + proto->sdma_scb_queue[sdmahdr->comp_idx] = scb; + scb->dma_complete = 0; + + /* Wait for completion */ + err = ips_proto_dma_wait_until(proto, scb); + } else + err = PSM2_OK; + } else { + /* + * ret == 0: Driver did not queue packet. Try later. + * ENOMEM: No kernel memory to queue request, try later? * + * ECOMM: Link may have gone down + * EINTR: Got interrupt while in writev + */ + if (errno == ENOMEM) { + err = handle_ENOMEM_on_DMA_completion(proto); + if (err == PSM2_OK) + goto retry; + } else if (ret == 0 || errno == ECOMM || errno == EINTR) { + err = psmi_context_check_status( + (const psmi_context_t *)&proto->ep->context); + /* + * During a link bounce the err returned from + * psmi_context_check_status is PSM2_EP_NO_NETWORK. In this case + * the error code which we need to return to the calling flush + * function(ips_proto_flow_flush_dma) is PSM2_EP_NO_RESOURCES to + * signal it to restart the timers to flush the packets. + * Not doing so would leave the packet on the unacked and + * pending q without the sdma descriptors ever being updated. 
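+ *
+ * Taken together, the writev() outcome handling here is a three-way
+ * classification; a condensed sketch (evict_and_retry() is a stand-in
+ * for the ENOMEM handler above, not a real function):
+ *
+ *     if (ret > 0)
+ *             writev_fail_time = 0;           // queued successfully
+ *     else if (errno == ENOMEM)
+ *             evict_and_retry();              // reclaim caches, goto retry
+ *     else if (ret == 0 || errno == ECOMM || errno == EINTR)
+ *             err = PSM2_EP_NO_RESOURCES;     // recoverable, rearm timers
+ *     else
+ *             err = PSM2_EP_DEVICE_FAILURE;   // unrecoverable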
+ */ + if (err == PSM2_OK || err == PSM2_EP_NO_NETWORK) + err = PSM2_EP_NO_RESOURCES; + } + + else + err = psmi_handle_error(proto->ep, + PSM2_EP_DEVICE_FAILURE, + "Unhandled error in writev(): " + "%s (fd=%d,iovec=%p,len=%d)", + strerror(errno), + psmi_hal_get_fd(proto->ep->context.psm_hw_ctxt), + &iovec, + 1); + } + + return err; +} + +/* + * Caller still expects num_sent to always be correctly set in case of an + * error. + * + * Recoverable errors: + * PSM2_OK: At least one packet was successfully queued up for DMA. + * PSM2_EP_NO_RESOURCES: No scb's available to handle unaligned packets + * or writev returned a recoverable error (no mem for + * descriptors, dma interrupted or no space left in dma + * queue). + * PSM2_OK_NO_PROGRESS: Cable pulled. + * + * Unrecoverable errors: + * PSM2_EP_DEVICE_FAILURE: Error calling hfi_sdma_inflight() or unexpected + * error in calling writev(), or chip failure, rxe/txe + * parity error. + * PSM2_EP_NO_NETWORK: No network, no lid, ... + */ +static +psm2_error_t +scb_dma_send(struct ips_proto *proto, struct ips_flow *flow, + struct ips_scb_pendlist *slist, int *num_sent) +{ + psm2_error_t err = PSM2_OK; + struct psm_hal_sdma_req_info *sdmahdr; + struct ips_scb *scb; + struct iovec *iovec; + uint16_t iovcnt; + + unsigned int vec_idx = 0; + unsigned int scb_idx = 0, scb_sent = 0; + unsigned int num = 0, max_elem; + uint32_t have_cksum; + uint32_t fillidx; + int16_t credits; + ssize_t ret; + + /* See comments above for fault injection */ + if_pf(dma_do_fault()) goto fail; + + /* Check how many SCBs to send based on flow credits */ + credits = flow->credits; + psmi_assert(SLIST_FIRST(slist) != NULL); + SLIST_FOREACH(scb, slist, next) { + num++; + credits -= scb->nfrag; + if (credits <= 0) + break; + } + if (proto->sdma_avail_counter < num) { + /* if there is not enough sdma slot, + * update and use what we have. + */ + err = ips_proto_dma_completion_update(proto); + if (err) + goto fail; + if (proto->sdma_avail_counter == 0) { + err = PSM2_EP_NO_RESOURCES; + goto fail; + } + if (proto->sdma_avail_counter < num) + num = proto->sdma_avail_counter; + } + + /* header, payload, checksum, tidarray */ + max_elem = 4 * num; + iovec = alloca(sizeof(struct iovec) * max_elem); + + fillidx = proto->sdma_fill_index; + SLIST_FOREACH(scb, slist, next) { + /* Can't exceed posix max writev count */ + if (vec_idx + (int)!!(scb->payload_size > 0) >= UIO_MAXIOV) + break; + + psmi_assert(vec_idx < max_elem); + psmi_assert_always(((scb->payload_size & 0x3) == 0) || + psmi_hal_has_cap(PSM_HAL_CAP_NON_DW_MULTIPLE_MSG_SIZE)); + + /* Checksum all eager packets */ + have_cksum = scb->ips_lrh.flags & IPS_SEND_FLAG_PKTCKSUM; + + /* + * Setup PBC. + */ + psmi_hal_set_pbc( + proto, + flow, + PSMI_FALSE, + &scb->pbc, + HFI_MESSAGE_HDR_SIZE, + scb->payload_size + + (have_cksum ? PSM_CRC_SIZE_IN_BYTES : 0)); + + psmi_assert(psmi_hal_dma_slot_available(fillidx, proto->ep->context. + psm_hw_ctxt)); + + size_t extra_bytes; + sdmahdr = psmi_get_sdma_req_info(scb, &extra_bytes); + + sdmahdr->npkts = + scb->nfrag > 1 ? scb->nfrag_remaining : scb->nfrag; + sdmahdr->fragsize = + scb->frag_size ? scb->frag_size : flow->frag_size; + + sdmahdr->comp_idx = fillidx; + fillidx++; + if (fillidx == proto->sdma_queue_size) + fillidx = 0; + + /* + * Setup io vector. + */ + iovec[vec_idx].iov_base = sdmahdr; + iovec[vec_idx].iov_len = psmi_hal_get_sdma_req_size(proto->ep->context. 
+ psm_hw_ctxt) + extra_bytes; + vec_idx++; + iovcnt = 1; + _HFI_VDBG("hdr=%p,%d\n", + iovec[vec_idx - 1].iov_base, + (int)iovec[vec_idx - 1].iov_len); + + if (scb->payload_size > 0) { + /* + * OPA1 supports byte-aligned payload. If it is + * single packet per scb, use payload_size, else + * multi-packets per scb, use remaining chunk_size. + * payload_size is the remaining chunk first packet + * length. + */ + iovec[vec_idx].iov_base = ips_scb_buffer(scb); + iovec[vec_idx].iov_len = scb->nfrag > 1 + ? scb->chunk_size_remaining + : scb->payload_size; + vec_idx++; + iovcnt++; +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED && IS_TRANSFER_BUF_GPU_MEM(scb)) { + /* without this attr, CUDA memory accesses + * do not synchronize with gpudirect-rdma accesses. + * We set this field only if the currently loaded driver + * supports this field. If not, we have other problems + * where we have a non gpu-direct enabled driver loaded + * and PSM2 is trying to use GPU features. + */ + if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) + sdmahdr->flags = PSM_HAL_BUF_GPU_MEM; + else + sdmahdr->flags = 0; + } else if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) + sdmahdr->flags = 0; +#endif + _HFI_VDBG("seqno=%d hdr=%p,%d payload=%p,%d\n", + scb->seq_num.psn_num, + iovec[vec_idx - 2].iov_base, + (int)iovec[vec_idx - 2].iov_len, + iovec[vec_idx - 1].iov_base, + (int)iovec[vec_idx - 1].iov_len); + } + + /* If checksum then update checksum */ + if (have_cksum) { + scb->cksum[1] = scb->cksum[0]; + iovec[vec_idx].iov_base = scb->cksum; + iovec[vec_idx].iov_len = PSM_CRC_SIZE_IN_BYTES; + vec_idx++; + iovcnt++; + + _HFI_VDBG("chsum=%p,%d\n", + iovec[vec_idx - 1].iov_base, + (int)iovec[vec_idx - 1].iov_len); + } + + /* + * If it is TID receive, attached tid info. + */ + if (scb->tidctrl) { + iovec[vec_idx].iov_base = scb->tsess; + iovec[vec_idx].iov_len = scb->tsess_length; + vec_idx++; + iovcnt++; + +#ifdef PSM_CUDA + /* + * The driver knows to check for "flags" field in + * sdma_req_info only if ctrl=2. + */ + if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) { + sdmahdr->ctrl = 2 | + (PSM_HAL_EXP << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | + (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); + } else { +#endif + sdmahdr->ctrl = 1 | + (PSM_HAL_EXP << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | + (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); +#ifdef PSM_CUDA + } +#endif + _HFI_VDBG("tid-info=%p,%d\n", + iovec[vec_idx - 1].iov_base, + (int)iovec[vec_idx - 1].iov_len); + } else { + +#ifdef PSM_CUDA + if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) { + sdmahdr->ctrl = 2 | + (PSM_HAL_EGR << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | + (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); + } else { +#endif + sdmahdr->ctrl = 1 | + (PSM_HAL_EGR << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | + (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); +#ifdef PSM_CUDA + } +#endif + } + + /* Can bound the number to send by 'num' */ + if (++scb_idx == num) + break; + } + psmi_assert(vec_idx > 0); +retry: + ret = psmi_hal_writev(iovec, vec_idx, &proto->epinfo, proto->ep->context.psm_hw_ctxt); + + if (ret > 0) { + proto->writevFailTime = 0; + /* No need for inflight system call, we can infer it's value + * from + * writev's return value */ + scb_sent += ret; + } else { + /* + * ret == 0: Driver did not queue packet. Try later. + * ENOMEM: No kernel memory to queue request, try later? 
+ * ECOMM: Link may have gone down + * EINTR: Got interrupt while in writev + */ + if (errno == ENOMEM) { + err = handle_ENOMEM_on_DMA_completion(proto); + if (err == PSM2_OK) + goto retry; + } else if (ret == 0 || errno == ECOMM || errno == EINTR) { + err = psmi_context_check_status( + (const psmi_context_t *)&proto->ep->context); + /* + * During a link bounce the err returned from + * psmi_context_check_status is PSM2_EP_NO_NETWORK. In this case + * the error code which we need to return to the calling flush + * function(ips_proto_flow_flush_dma) is PSM2_EP_NO_RESOURCES to + * signal the caller to restart the timers to flush the packets. + * Not doing so would leave the packet on the unacked and + * pending q without the sdma descriptors ever being updated. + */ + if (err == PSM2_OK || err == PSM2_EP_NO_NETWORK) + err = PSM2_EP_NO_RESOURCES; + } else { + err = psmi_handle_error( + proto->ep, + PSM2_EP_DEVICE_FAILURE, + "Unexpected error in writev(): %s (errno=%d) " + "(fd=%d,iovec=%p,len=%d)", + strerror(errno), + errno, + psmi_hal_get_fd(proto->ep->context.psm_hw_ctxt), + iovec, + vec_idx); + goto fail; + } + } + +fail: + *num_sent = scb_sent; + psmi_assert(*num_sent <= num && *num_sent >= 0); + return err; +} + +/* + * Because we only lazily reap send dma completions, it's possible that we + * receive a packet's remote acknowledgement before seeing that packet's local + * completion. As part of processing ack packets and releasing scbs, we issue + * a wait for the local completion if the scb is marked as having been sent via + * send dma. + */ +psm2_error_t +ips_proto_dma_wait_until(struct ips_proto *proto, ips_scb_t *scb) +{ + psm2_error_t err = PSM2_OK; + int spin_cnt = 0; + int did_yield = 0; + + PSMI_PROFILE_BLOCK(); + + do { + if (spin_cnt++ == proto->ep->yield_spin_cnt) { + /* Have to yield holding the PSM lock, mostly because we don't + * support another thread changing internal state at this point in + * the code. + */ + did_yield = 1; + spin_cnt = 0; + sched_yield(); + } + + err = ips_proto_dma_completion_update(proto); + if (err) + return err; + } while (scb->dma_complete == 0); + + if (did_yield) + proto->stats.writev_compl_delay++; + + PSMI_PROFILE_UNBLOCK(); + + return err; +} + +psm2_error_t +ips_proto_timer_ack_callback(struct psmi_timer *current_timer, + uint64_t current) +{ + struct ips_flow *flow = ((ips_scb_t *)current_timer->context)->flow; + struct ips_proto *proto = ((psm2_epaddr_t) (flow->ipsaddr))->proto; + uint64_t t_cyc_next = get_cycles(); + psmi_seqnum_t err_chk_seq; + ips_scb_t *scb, ctrlscb; + uint8_t message_type; + + if (STAILQ_EMPTY(&flow->scb_unacked)) + return PSM2_OK; + + scb = STAILQ_FIRST(&flow->scb_unacked); + + if (current >= scb->abs_timeout) { + int done_local = 0; + + /* We have to ensure that the send is at least locally complete before + * sending an error check or else earlier data can get to the + * destination *after* we pio or dma this err_chk. 
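+ *
+ * Once that is settled, the retransmit timeout below is classic bounded
+ * exponential backoff:
+ *
+ *     ack_timeout = min(ack_timeout * ep_timeout_ack_factor,
+ *                       ep_timeout_ack_max);
+ *     abs_timeout = now + ack_timeout;
+ *
+ * so a flow that keeps losing err_chk exchanges backs off geometrically
+ * but never waits longer than the configured maximum.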
+ */ + if (flow->transfer == PSM_TRANSFER_DMA) { + /* error is caught inside this routine */ + ips_proto_dma_completion_update(proto); + + if (scb->dma_complete) + done_local = 1; + else + proto->stats.writev_compl_eagain++; + } else + done_local = 1; /* Always done for PIO flows */ + + scb->ack_timeout = + min(scb->ack_timeout * proto->epinfo.ep_timeout_ack_factor, + proto->epinfo.ep_timeout_ack_max); + scb->abs_timeout = t_cyc_next + scb->ack_timeout; + if (done_local) { + _HFI_VDBG + ("sending err_chk flow=%d with first=%d,last=%d\n", + flow->flowid, + STAILQ_FIRST(&flow->scb_unacked)->seq_num.psn_num, + STAILQ_LAST(&flow->scb_unacked, ips_scb, + nextq)->seq_num.psn_num); + + ctrlscb.scb_flags = 0; + if (proto->flags & IPS_PROTO_FLAG_RCVTHREAD) + ctrlscb.scb_flags |= IPS_SEND_FLAG_INTR; + + err_chk_seq = (SLIST_EMPTY(&flow->scb_pend)) ? + flow->xmit_seq_num : + SLIST_FIRST(&flow->scb_pend)->seq_num; + + if (flow->protocol == PSM_PROTOCOL_TIDFLOW) { + message_type = OPCODE_ERR_CHK_GEN; + err_chk_seq.psn_seq -= 1; + /* Receive descriptor index */ + ctrlscb.ips_lrh.data[0].u64 = + scb->tidsendc->rdescid.u64; + /* Send descriptor index */ + ctrlscb.ips_lrh.data[1].u64 = + scb->tidsendc->sdescid.u64; + } else { + PSM2_LOG_MSG("sending ERR_CHK message"); + message_type = OPCODE_ERR_CHK; + err_chk_seq.psn_num = (err_chk_seq.psn_num - 1) + & proto->psn_mask; + } + ctrlscb.ips_lrh.bth[2] = + __cpu_to_be32(err_chk_seq.psn_num); + + ips_proto_send_ctrl_message(flow, message_type, + &flow->ipsaddr->ctrl_msg_queued, + &ctrlscb, ctrlscb.cksum, 0); + } + + t_cyc_next = get_cycles() + scb->ack_timeout; + } else + t_cyc_next += (scb->abs_timeout - current); + + psmi_timer_request(proto->timerq, current_timer, t_cyc_next); + + return PSM2_OK; +} + +psm2_error_t +ips_proto_timer_send_callback(struct psmi_timer *current_timer, + uint64_t current) +{ + struct ips_flow *flow = ((ips_scb_t *)current_timer->context)->flow; + struct ips_proto *proto = ((psm2_epaddr_t) (flow->ipsaddr))->proto; + + /* If flow is marked as congested adjust injection rate - see process nak + * when a congestion NAK is received. + */ + if_pf(flow->flags & IPS_FLOW_FLAG_CONGESTED) { + + /* Clear congestion flag and decrease injection rate */ + flow->flags &= ~IPS_FLOW_FLAG_CONGESTED; + if ((flow->path->pr_ccti + + proto->cace[flow->path->pr_sl].ccti_increase) <= + proto->ccti_limit) + ips_cca_adjust_rate(flow->path, + proto->cace[flow->path->pr_sl]. + ccti_increase); + } + + if (!SLIST_EMPTY(&flow->scb_pend)) + flow->flush(flow, NULL); + + return PSM2_OK; +} + +psm2_error_t ips_cca_adjust_rate(ips_path_rec_t *path_rec, int cct_increment) +{ + struct ips_proto *proto = path_rec->proto; + + /* Increment/decrement ccti for path */ + psmi_assert_always(path_rec->pr_ccti >= + proto->cace[path_rec->pr_sl].ccti_min); + path_rec->pr_ccti += cct_increment; + + /* Determine new active IPD. 
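+ * Each CCT entry packs an inter-packet delay and a shift divisor into
+ * one 16-bit word; the selection below is equivalent to (local names
+ * illustrative):
+ *
+ *     uint16_t entry   = cct[ccti];
+ *     uint16_t ipd     = entry & CCA_IPD_MASK;
+ *     uint16_t divisor = entry >> CCA_DIVISOR_SHIFT;
+ *     if (static_ipd && static_ipd + 1 > ipd) {
+ *             ipd     = static_ipd + 1;       // static floor wins
+ *             divisor = 0;
+ *     }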
*/ +#if _HFI_DEBUGGING + uint16_t prev_ipd = 0; + uint16_t prev_divisor = 0; + if (_HFI_CCADBG_ON) { + prev_ipd = path_rec->pr_active_ipd; + prev_divisor = path_rec->pr_cca_divisor; + } +#endif + if ((path_rec->pr_static_ipd) && + ((path_rec->pr_static_ipd + 1) > + (proto->cct[path_rec->pr_ccti] & CCA_IPD_MASK))) { + path_rec->pr_active_ipd = path_rec->pr_static_ipd + 1; + path_rec->pr_cca_divisor = 0; + } else { + path_rec->pr_active_ipd = + proto->cct[path_rec->pr_ccti] & CCA_IPD_MASK; + path_rec->pr_cca_divisor = + proto->cct[path_rec->pr_ccti] >> CCA_DIVISOR_SHIFT; + } + +#if _HFI_DEBUGGING + if (_HFI_CCADBG_ON) { + _HFI_CCADBG_ALWAYS("CCA: %s injection rate to <%x.%x> from <%x.%x>\n", + (cct_increment > 0) ? "Decreasing" : "Increasing", + path_rec->pr_cca_divisor, path_rec->pr_active_ipd, + prev_divisor, prev_ipd); + } +#endif + + /* Reschedule CCA timer if this path is still marked as congested */ + if (path_rec->pr_ccti > proto->cace[path_rec->pr_sl].ccti_min) { + if (path_rec->pr_timer_cca == NULL) { + path_rec->pr_timer_cca = + (struct psmi_timer *)psmi_mpool_get(proto-> + timer_pool); + psmi_assert(path_rec->pr_timer_cca != NULL); + psmi_timer_entry_init(path_rec->pr_timer_cca, + ips_cca_timer_callback, path_rec); + } + psmi_timer_request(proto->timerq, + path_rec->pr_timer_cca, + get_cycles() + + proto->cace[path_rec->pr_sl]. + ccti_timer_cycles); + } else if (path_rec->pr_timer_cca) { + psmi_mpool_put(path_rec->pr_timer_cca); + path_rec->pr_timer_cca = NULL; + } + + return PSM2_OK; +} + +psm2_error_t +ips_cca_timer_callback(struct psmi_timer *current_timer, uint64_t current) +{ + ips_path_rec_t *path_rec = (ips_path_rec_t *) current_timer->context; + + /* Increase injection rate for flow. Decrement CCTI */ + if (path_rec->pr_ccti > path_rec->proto->cace[path_rec->pr_sl].ccti_min) + return ips_cca_adjust_rate(path_rec, -1); + + psmi_mpool_put(path_rec->pr_timer_cca); + path_rec->pr_timer_cca = NULL; + return PSM2_OK; +} diff --git a/ptl_ips/ips_proto.h b/ptl_ips/ips_proto.h new file mode 100644 index 0000000..c6030f4 --- /dev/null +++ b/ptl_ips/ips_proto.h @@ -0,0 +1,733 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_PROTO_H
+#define _IPS_PROTO_H
+
+#include "ips_config.h"
+#include "psm_user.h"
+
+#include "ips_tid.h"
+#include "ips_recvhdrq.h"
+#include "ips_epstate.h"
+#include "ips_proto_am.h"
+#include "ips_tidflow.h"
+#include "ips_path_rec.h"
+
+typedef enum ips_path_type {
+ IPS_PATH_LOW_PRIORITY,
+ IPS_PATH_NORMAL_PRIORITY,
+ IPS_PATH_HIGH_PRIORITY,
+ IPS_PATH_MAX_PRIORITY
+} ips_path_type_t;
+
+/*
+ * Local Endpoint info.
+ *
+ * Contains information necessary for composing packets for the local endpoint.
+ */
+struct ips_epinfo {
+ uint16_t ep_base_lid;
+ uint8_t ep_baseqp;
+ uint8_t ep_lmc;
+ opa_rate ep_link_rate;
+ uint16_t ep_context;
+ uint16_t ep_subcontext;
+ uint16_t ep_hfi_type;
+ uint16_t ep_sl; /* HFI_SL only when path record not used */
+ uint16_t ep_mtu;
+ uint16_t ep_mtu_code;
+ uint16_t ep_piosize;
+ uint16_t ep_pkey; /* PSM2_PKEY only when path record not used */
+ uint16_t ep_jkey;
+ uint64_t ep_timeout_ack; /* PSM2_ERRCHK_TIMEOUT if no path record */
+ uint64_t ep_timeout_ack_max;
+ uint32_t ep_timeout_ack_factor;
+};
+
+/*
+ * This contains a path record table that enumerates the paths available
+ * between the local node and a remote node associated with an end point.
+ * It also maintains a state value for each message priority that indicates
+ * which path should be assigned to the next message of that priority.
+ *
+ * For LMC/Torus, keep a list of base and max dlid. Used for pkt verification.
+ *
+ * pg_base_dlid and pg_base_slid are in network byte order.
+ */
+#define IPS_MAX_PATH_LMC 3
+typedef struct ips_path_grp {
+ uint16_t pg_base_dlid;
+ uint16_t pg_base_slid;
+ uint8_t pg_num_paths[IPS_PATH_MAX_PRIORITY];
+ uint8_t pg_next_path[IPS_PATH_MAX_PRIORITY];
+ ips_path_rec_t *pg_path[0][IPS_PATH_MAX_PRIORITY];
+} ips_path_grp_t;
+
+/*
+ * Start and finish routines for constructing an ips_proto.
+ */
+struct ips_proto;
+psm2_error_t ips_proto_init(const psmi_context_t *context,
+ const struct ptl *ptl,
+ int num_of_send_bufs,
+ int num_of_send_desc,
+ uint32_t imm_size,
+ const struct psmi_timer_ctrl *timerq, /* PTL's timerq */
+ const struct ips_epstate *epstate, /* PTL's epstate */
+ void *spioc, /* PTL's opaque spio control */
+ struct ips_proto *proto); /* output protocol */
+
+psm2_error_t ips_proto_fini(struct ips_proto *proto, int force,
+ uint64_t timeout);
+
+/*
+ * Control message structures
+ *
+ * ips low-level control messages to ensure reliability of eager packets.
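+ *
+ * The queue size below is marked "power of two", presumably so that
+ * free-running head/tail counters can be reduced with a mask instead
+ * of a division; a minimal model of that idiom (not the actual ctrlq
+ * code):
+ *
+ *     #define QSIZE 64                        // power of two
+ *     struct ips_ctrlq_elem *slot(struct ips_ctrlq *q, uint32_t i)
+ *     {
+ *             return &q->ctrlq_cqe[i & (QSIZE - 1)];
+ *     }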
+ */ +#define CTRL_MSG_QEUEUE_SIZE 64 /* power of two */ + +struct ips_ctrlq_elem { + uint8_t message_type; + uint16_t *msg_queue_mask; + ips_scb_t msg_scb; +}; + +struct ips_ctrlq { + /* Queued control messages, queued when pio is busy */ + struct ips_proto *ctrlq_proto; + + uint32_t ctrlq_head; + uint32_t ctrlq_tail; + uint32_t ctrlq_overflow; + + struct ips_ctrlq_elem ctrlq_cqe[CTRL_MSG_QEUEUE_SIZE] PSMI_CACHEALIGN; + struct psmi_timer ctrlq_timer; /* when in timerq */ +}; + +/* Connect/disconnect, as implemented by ips */ + +/* + * Connections are not pairwise but we keep a single 'epaddr' for messages-from + * and messages-to a remote 'epaddr'. State transitions for connecting TO and + * FROM 'epaddrs' are the following: + * Connect TO (Connect OUTGOING): + * NONE -> WAITING -> ESTABLISHED -> WAITING_DISC -> DISCONNECTED -> NONE + * + * Connect FROM (we receive a connect request - Connect INCOMING) + * NONE -> ESTABLISHED -> NONE + */ +#define CSTATE_ESTABLISHED 1 +#define CSTATE_NONE 2 +#define CSTATE_OUTGOING_DISCONNECTED 3 +#define CSTATE_OUTGOING_WAITING 4 +#define CSTATE_OUTGOING_WAITING_DISC 5 + +psm2_error_t ips_proto_connect(struct ips_proto *proto, int numep, + const psm2_epid_t *array_of_epid, + const int *array_of_epid_mask, + psm2_error_t *array_of_errors, + psm2_epaddr_t *array_of_epaddr, + uint64_t timeout_in); + +psm2_error_t ips_proto_disconnect(struct ips_proto *proto, int force, int numep, + psm2_epaddr_t array_of_epaddr[], + const int array_of_epaddr_mask[], + psm2_error_t array_of_errors[], + uint64_t timeout_in); + +int ips_proto_isconnected(struct ips_epaddr *ipsaddr); + +/* + * Pending operation structures + */ +struct ips_pend_sreq { + STAILQ_ENTRY(ips_pend_sreq) next; + psm2_mq_req_t req; + uint32_t type; +}; + +#define IPS_PENDSEND_EAGER_DATA 1 +#define IPS_PENDSEND_EAGER_REQ 2 +#define IPS_PENDSEND_EXP_TIDS 3 +#define IPS_PENDSEND_EXP_SENDS 4 + +STAILQ_HEAD(ips_pendsendq, ips_pend_sreq); + +struct ips_pend_sends { + struct ips_proto *proto; /* back ptr */ + struct psmi_timer timer; + struct ips_pendsendq pendq; +}; + +/* + * One instance of the protocol + */ + +struct ips_protoexp; + +struct ips_proto_stats { + uint64_t pio_busy_cnt; + uint64_t writev_busy_cnt; + uint64_t writev_compl_eagain; + uint64_t writev_compl_delay; + uint64_t scb_egr_unavail_cnt; + uint64_t scb_exp_unavail_cnt; + uint64_t hdr_overflow; + uint64_t egr_overflow; + uint64_t lid_zero_errs; + uint64_t unknown_packets; + uint64_t stray_packets; +}; + +struct ips_proto_error_stats { + uint64_t num_icrc_err; + uint64_t num_ecc_err; + uint64_t num_len_err; + uint64_t num_tid_err; + uint64_t num_dc_err; + uint64_t num_dcunc_err; + uint64_t num_khdrlen_err; +}; + +/* + * Updates to these stats must be reflected in ips_ptl_epaddr_stats_init + */ +struct ips_proto_epaddr_stats { + uint64_t err_chk_send; + uint64_t err_chk_recv; + uint64_t nak_send; + uint64_t nak_recv; + uint64_t connect_req; + uint64_t disconnect_req; + uint64_t tids_grant_send; + uint64_t tids_grant_recv; + uint64_t send_rexmit; + uint64_t congestion_pkts; /* IB CCA FECN packets */ +}; + +/* OPP support structure. 
*/ +struct opp_api { + void *(*op_path_find_hca) (const char *name, void **device); + void *(*op_path_open) (void *device, int port_num); + void (*op_path_close) (void *context); + int (*op_path_get_path_by_rec) (void *context, ibta_path_rec_t *query, + ibta_path_rec_t *response); +}; + +struct ips_ibta_compliance_fn { + psm2_error_t(*get_path_rec) (struct ips_proto *proto, uint16_t slid, + uint16_t dlid, uint16_t desthfi_type, + unsigned long timeout, + ips_path_grp_t **ppathgrp); + psm2_error_t(*fini) (struct ips_proto *proto); +}; + +/* please don't change the flow id order */ +typedef enum ips_epaddr_flow { + EP_FLOW_GO_BACK_N_PIO, + EP_FLOW_GO_BACK_N_DMA, + EP_FLOW_TIDFLOW, /* Can either pio or dma for tidflow */ + EP_FLOW_LAST /* Keep this the last endpoint flow */ +} ips_epaddr_flow_t; + +typedef enum psm_transfer_type { + PSM_TRANSFER_PIO, + PSM_TRANSFER_DMA, + PSM_TRANSFER_LAST /* Keep this the last transfer type */ +} psm_transfer_type_t; + +typedef enum psm_protocol_type { + PSM_PROTOCOL_GO_BACK_N, + PSM_PROTOCOL_TIDFLOW, + PSM_PROTOCOL_LAST /* Keep this the last protocol type */ +} psm_protocol_type_t; + +struct ips_proto { + struct ptl *ptl; /* cached */ + psm2_ep_t ep; /* cached, for errors */ + psm2_mq_t mq; /* cached, for mq handling */ + /* Pending sends */ + struct ips_pend_sends pend_sends; + struct ips_epstate *epstate; + struct psmi_timer_ctrl *timerq; + + struct ips_protoexp *protoexp; + struct ips_scbctrl *scbc_rv; + struct ips_spio *spioc; + struct ips_scbctrl scbc_egr; + struct ips_epinfo epinfo; + + ips_scb_t **sdma_scb_queue; + uint16_t sdma_queue_size; + uint16_t sdma_fill_index; + uint16_t sdma_done_index; + uint16_t sdma_avail_counter; + + uint64_t timeout_send; + uint32_t flags; /* < if IPS_PROTO_FLAG_SDMA is NOT set, SPIO flow will be initialized + * < if IPS_PROTO_FLAG_SPIO is NOT set, SDMA flow will be initialized + * < so both flows (SDMA and PIO) will be initialized if both of the + * < IPS_PROTO_FLAG_S{DMA,PIO} are CLEARED + */ + uint32_t iovec_thresh_eager; + uint32_t iovec_thresh_eager_blocking; + uint32_t psn_mask; + uint32_t scb_bufsize; + uint16_t flow_credits; + mpool_t pend_sends_pool; + mpool_t timer_pool; + struct ips_ibta_compliance_fn ibta; + struct ips_proto_stats stats; + struct ips_proto_error_stats error_stats; + struct ips_proto_epaddr_stats epaddr_stats; + + struct ips_proto_am proto_am; + + struct ips_ctrlq ctrlq; + /* pure sdma mode, use dma flow, otherwise, use pio flow */ + ips_epaddr_flow_t msgflowid; + + /* Handling tid errors */ + uint32_t tiderr_cnt; + uint32_t tiderr_max; + uint64_t tiderr_tnext; + uint64_t tiderr_warn_interval; + + uint64_t t_init; + uint64_t t_fini; + uint32_t runid_key; + + int num_connected_outgoing; + int num_connected_incoming; + int num_disconnect_requests; + + /* misc state variables. */ + + /* Smallest interval in cycles between which we warn about stray + * messages This is a per-endpoint quantity, overridable with + * PSM_STRAY_WARN_INTERVAL We use the same interval to send the "die" + * message. 
+ */ + uint64_t stray_warn_interval; + int done_warning; + int done_once; + int num_bogus_warnings; + struct { + uint32_t interval_secs; + uint64_t next_warning; + uint64_t count; + } psmi_logevent_tid_send_reqs; + + /* SL2SC and SC2VL table for protocol */ + uint16_t sl2sc[32]; + uint16_t sc2vl[32]; + + /* CCA per port */ + uint16_t *cct; /* cct table */ + uint16_t ccti_size; /* ccti table size */ + uint16_t ccti_limit; /* should be <= size-1 */ + + uint16_t ccti_portctrl; /* QP or SL CC */ + uint32_t ccti_ctrlmap; /* map for valid sl */ + struct cace { /* CACongestionEntry */ + uint8_t ccti_increase; /* steps to increase */ + /* uint16_t ccti_timer;*/ /* CCTI Timer in units of 1.024 usec */ + uint64_t ccti_timer_cycles; /* converted from us_2_cycles() */ + uint8_t ccti_threshold; /* threshold to make log */ + uint8_t ccti_min; /* min value for ccti */ + } cace[32]; /* 32 service levels */ + + /* Path record support */ + uint8_t ips_ipd_delay[IBV_RATE_300_GBPS + 1]; + /* + * Disable the LMC based dispersive routing for all message + * sizes in bytes between ips_lmc_disable_low and ips_lmc_disable_high, + * inclusive. + */ + uint32_t ips_lmc_disable_low; + uint32_t ips_lmc_disable_high; + struct hsearch_data ips_path_rec_hash; + struct hsearch_data ips_path_grp_hash; + void *opp_lib; + void *hndl; + void *device; + void *opp_ctxt; + struct opp_api opp_fn; + +#ifdef PSM_CUDA + struct ips_cuda_hostbuf_mpool_cb_context cuda_hostbuf_send_cfg; + struct ips_cuda_hostbuf_mpool_cb_context cuda_hostbuf_small_send_cfg; + mpool_t cuda_hostbuf_pool_send; + mpool_t cuda_hostbuf_pool_small_send; + CUstream cudastream_send; + unsigned cuda_prefetch_limit; +#endif + +/* + * Control message queue for pending messages. + * + * Control messages are queued as pending when no PIO is available for sending + * the message. They are composed on the fly and do not need buffering. + * + * Variables here are write once (at init) and read afterwards (except the msg + * queue overflow counters). + */ + uint32_t ctrl_msg_queue_overflow; + uint32_t ctrl_msg_queue_enqueue; + uint32_t message_type_to_index[256]; +#define message_type2index(proto, msg_type) (proto->message_type_to_index[(msg_type)]) + + time_t writevFailTime; +}; + +static inline int +ips_proto_is_disabled_pio(struct ips_proto *proto) +{ + return !!(proto->flags & IPS_PROTO_FLAG_SDMA); +} + +static inline int +ips_proto_is_disabled_sdma(struct ips_proto *proto) +{ + return !!(proto->flags & IPS_PROTO_FLAG_SPIO); +} + +/* + * Test the payload length against the lmc_disable_low and lmc_disable_hi + * values, to determine if a transfer of this size should use LMC LIDs. + * Set the IPS_SEND_FLAG_NO_LMC flag in the scb. + */ +static inline void +ips_set_LMC_LID_choice(struct ips_proto *proto, ips_scb_t *scb, uint32_t len) +{ + if ((len >= proto->ips_lmc_disable_low) && + (len <= proto->ips_lmc_disable_high)) { + PSM2_LOG_MSG("DISABLE LMC paylen %u\n", len); + scb->scb_flags |= IPS_SEND_FLAG_NO_LMC; + } + + return; +} + +/* + * Endpoint address, encapsulates per-endpoint protocol metadata + * + * Directly implements the ptl epaddr. + */ +typedef psm2_error_t(*ips_flow_flush_fn_t) (struct ips_flow *, int *nflushed); + +/** + * ips_flow is a structure that combines all information regarding a send + * from one endpoint to another one. Specifically, it is the place where + * the Maximum Transmission Unit for a send is calculated, given how many + * factors could possibly influence the MTU calculation. See ips_flow_init + * documentation for more details. 
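+ *
+ * Schematically, the fragment size works out to the minimum of every
+ * constraint on the path (see the ips_flow_init comment below for the
+ * authoritative rule; variable names here are illustrative):
+ *
+ *     frag_size = min(local_ep_mtu, remote_ep_mtu, path_mtu);
+ *     if (transfer == PSM_TRANSFER_PIO)
+ *             frag_size = min(frag_size, local_pio_bufsize);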
+ */
+struct ips_flow {
+ SLIST_ENTRY(ips_flow) next; /* List of flows with pending acks */
+ ips_flow_flush_fn_t flush; /* flush function for this flow */
+
+ struct ips_epaddr *ipsaddr; /* back pointer, remote endpoint */
+ ips_path_rec_t *path; /* Path to use for flow */
+
+ uint16_t frag_size; /* < This flow's fragment size, calculated as the
+ < minimum of all relevant MTUs involved */
+
+ uint16_t flowid:2; /* flow id: pio(0) or dma(1) or tidflow(2) */
+ uint16_t transfer:3; /* spio or sdma */
+ uint16_t protocol:3; /* go-back-n or tidflow */
+ uint16_t flags:8; /* flow state flags */
+
+ uint16_t cca_ooo_pkts; /* cca out of order packets */
+ uint16_t cwin; /* Size of congestion window */
+ uint16_t ack_interval; /* interval to ack packets */
+ uint16_t ack_counter; /* counter to ack packets */
+ int16_t credits; /* Current credits available to send on flow */
+ uint32_t ack_index; /* Index of the last ACK message type in pending message queue */
+
+ psmi_seqnum_t xmit_seq_num; /* transmit packet sequence number */
+ psmi_seqnum_t xmit_ack_num; /* acked packet sequence number */
+ psmi_seqnum_t recv_seq_num; /* received packet sequence number */
+
+ psmi_timer *timer_send; /* timer for frames that got a busy PIO */
+ psmi_timer *timer_ack; /* timer for unacked frames */
+
+ STAILQ_HEAD(ips_scb_unackedq, ips_scb) scb_unacked; /* unacked queue */
+ SLIST_HEAD(ips_scb_pendlist, ips_scb) scb_pend; /* pending queue */
+
+#ifdef PSM_DEBUG
+ uint32_t scb_num_pending; /* pending scb counter */
+ uint32_t scb_num_unacked; /* unacked scb counter */
+#endif
+};
+
+#define IPS_FLOW_MSG_TOGGLE_OOO_MASK (1 << 0) /* ooo msg check */
+#define IPS_FLOW_MSG_TOGGLE_UNEXP_MASK (1 << 1) /* unexp msg check */
+/*
+ * Make sure ips_epaddr_t and psm2_epaddr_t can be converted to each other.
+ */
+struct ips_epaddr {
+ struct psm2_epaddr epaddr; /* inlined psm level epaddr */
+ struct ips_msgctl *msgctl; /* ips level msg control */
+
+ struct ips_epaddr *next; /* linklist */
+
+ struct ips_flow flows[EP_FLOW_LAST - 1]; /* pio and dma */
+ ips_path_grp_t *pathgrp; /* pointer to slid/dlid group in hash */
+
+ uint32_t connidx_outgoing; /* peer's connection idx */
+ uint32_t connidx_incoming; /* my connection idx */
+
+ uint16_t ctrl_msg_queued; /* bitmap of queued control messages to be sent */
+ uint32_t window_rv; /* RNDV window size per connection */
+
+ uint8_t hpp_index; /* high priority index */
+ uint8_t context; /* real context value */
+ uint8_t subcontext; /* sub context, 3 bits, 5 bits for future */
+ uint8_t msg_toggle; /* only 2 bits used, 6 bits for future */
+
+ /* this portion is only for connect/disconnect */
+ uint64_t s_timeout; /* used as a time in close */
+ uint32_t runid_key; /* peer process pid */
+ uint32_t credit:2; /* credit to connect/disconnect: 0 or 1 */
+ uint32_t cstate_outgoing:3; /* connection state to, max 7 */
+ uint32_t cstate_incoming:3; /* connection state from, max 7 */
+ uint32_t delay_in_ms:8; /* disconnect delay in ms */
+ uint32_t cerror_outgoing:8; /* error code during connection */
+ uint32_t cerror_incoming:8; /* error code during connection */
+};
+
+/*
+ * ips_msgctl_t is a per-connection struct.
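+ *
+ * Among other things it drives multi-rail round-robin: a sender takes
+ * ipsaddr_next and immediately rotates it along the circular rail
+ * list, as the AM send path in this patch does:
+ *
+ *     ipsaddr = msgctl->ipsaddr_next;
+ *     msgctl->ipsaddr_next = ipsaddr->next;   // rotate for next message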
+ */ +struct ips_msgctl { + struct ips_epaddr master_epaddr; /* Master rail's epaddr */ + + struct ips_epaddr *ipsaddr_next; /* next ipsaddr to send packet */ + uint16_t mq_send_seqnum; /* next sending message sequence */ + uint16_t mq_recv_seqnum; /* next receiving message sequence */ + uint16_t am_send_seqnum; /* next sending message sequence */ + uint16_t am_recv_seqnum; /* next receiving message sequence */ + uint16_t ipsaddr_count; /* number of ipsaddr to use */ + uint16_t outoforder_count; /* number of outoforder messages */ +}; + +static inline __attribute__ ((unused)) +void IPS_MCTXT_APPEND(ips_epaddr_t *head, ips_epaddr_t *node) +{ + ips_epaddr_t *cur; + + /* The new node is inserted before head. */ + node->next = head; + + /* Circle around the linked list to head's predecessor and update. */ + for (cur = head; cur->next != head; cur = cur->next); + cur->next = node; +} + +static inline __attribute__ ((unused)) +void IPS_MCTXT_REMOVE(ips_epaddr_t *node) +{ + ips_epaddr_t *cur; + + /* Circle around to node's predecessor and update. */ + for (cur = node; cur->next != node; cur = cur->next); + cur->next = node->next; + node->next = node; +} + +/* + * Initialize a flow, setting its attributes. Selects the path the flow will + * use as well as calculates the flow's fragment size defined as: + * - min(remote EP MTU, selected path's MTU, local EP MTU) for DMA sends + * - min(remote EP MTU, selected path's MTU, local EP MTU, local PIO bufsize) for PIO sends + */ +void MOCKABLE(ips_flow_init)(struct ips_flow *flow, struct ips_proto *proto, + ips_epaddr_t *ipsaddr, psm_transfer_type_t transfer_type, + psm_protocol_type_t protocol, ips_path_type_t path_type, + uint32_t flow_index); +MOCK_DCL_EPILOGUE(ips_flow_init); + +void ips_scb_prepare_flow(ips_scb_t *scb, ips_epaddr_t *ipsaddr, + struct ips_flow *flow); + +void MOCKABLE(ips_proto_flow_enqueue)(struct ips_flow *flow, ips_scb_t *scb); +MOCK_DCL_EPILOGUE(ips_proto_flow_enqueue); + +psm2_error_t ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed); +psm2_error_t ips_proto_flow_flush_dma(struct ips_flow *flow, int *nflushed); + +/* Wrapper for enqueue + flush */ +psm2_error_t ips_proto_scb_pio_send(struct ips_flow *flow, ips_scb_t *scb); + +void ips_proto_scb_dma_enqueue(struct ips_proto *proto, ips_scb_t *scb); +psm2_error_t ips_proto_scb_dma_flush(struct ips_proto *proto, + ips_epaddr_t *ipsaddr, int *nflushed); +psm2_error_t ips_proto_dma_wait_until(struct ips_proto *proto, ips_scb_t *scb); +psm2_error_t ips_proto_dma_completion_update(struct ips_proto *proto); + +psm2_error_t ips_dma_transfer_frame(struct ips_proto *proto, + struct ips_flow *flow, ips_scb_t *scb, + void *payload, uint32_t paylen, + uint32_t have_cksum, uint32_t cksum); + +/* + * Protocol receive processing + * + */ +/* Error handling for unknown packet, packet is unknown when epid doesn't match + * in epstate table */ +int ips_proto_process_unknown(const struct ips_recvhdrq_event *rcv_ev); +/* Exposed for fastpath only */ +int ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev); +int ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev); +/* Handling error cases */ +int ips_proto_process_packet_error(struct ips_recvhdrq_event *rcv_ev); + +/* + * Protocol exception handling and frame dumps + */ +void ips_proto_get_rhf_errstring(uint32_t err, char *msg, size_t len); +void ips_proto_dump_err_stats(struct ips_proto *proto); +void ips_proto_show_rhf_errors(const uint32_t *rhdr); +void ips_proto_show_header(struct ips_message_header *p_hdr, char *msg); +void 
ips_proto_dump_frame(void *frame, int lenght, char *message); +void ips_proto_dump_data(void *data, int data_length); +void ips_proto_dump_eager(uint32_t *curr_rcv_hdr); + +/* + * Checksum of ips packets + */ +uint32_t ips_crc_calculate(uint32_t len, uint8_t *data, uint32_t crc); + +/* + * Matched-Queue processing and sends + */ +psm2_error_t ips_proto_mq_push_cts_req(struct ips_proto *proto, + psm2_mq_req_t req); +psm2_error_t ips_proto_mq_push_rts_data(struct ips_proto *proto, + psm2_mq_req_t req); +int ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev); +int ips_proto_mq_handle_rts(struct ips_recvhdrq_event *rcv_ev); +int ips_proto_mq_handle_tiny(struct ips_recvhdrq_event *rcv_ev); +int ips_proto_mq_handle_short(struct ips_recvhdrq_event *rcv_ev); +int ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev); +void ips_proto_mq_handle_outoforder_queue(psm2_mq_t mq, ips_msgctl_t *msgctl); +int ips_proto_mq_handle_data(struct ips_recvhdrq_event *rcv_ev); + +psm2_error_t ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t epaddr, + uint32_t flags, psm2_mq_tag_t *tag, + const void *ubuf, uint32_t len); + +psm2_error_t ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t epaddr, + uint32_t flags_user, uint32_t flags_internal, + psm2_mq_tag_t *tag, const void *ubuf, uint32_t len, + void *context, psm2_mq_req_t *req_o); + +psm2_error_t ips_proto_msg_size_thresh_query (enum psm2_info_query_thresh_et, + uint32_t *out, psm2_mq_t mq, psm2_epaddr_t); + +int ips_proto_am(struct ips_recvhdrq_event *rcv_ev); + +/* + * IPS packet service routine table. + */ +typedef int (*ips_packet_service_fn_t)(struct ips_recvhdrq_event *rcv_ev); +extern ips_packet_service_fn_t + ips_packet_service_routine[OPCODE_FUTURE_FROM-OPCODE_RESERVED]; + +/* IBTA feature related functions (path record, sl2sc2vl etc.) */ +psm2_error_t ips_ibta_init_sl2sc2vl_table(struct ips_proto *proto); +psm2_error_t ips_ibta_link_updown_event(struct ips_proto *proto); + +psm2_error_t +MOCKABLE(ips_ibta_init)(struct ips_proto *proto); +MOCK_DCL_EPILOGUE(ips_ibta_init); + +psm2_error_t ips_ibta_fini(struct ips_proto *proto); + +PSMI_ALWAYS_INLINE( +struct psm_hal_sdma_req_info * +psmi_get_sdma_req_info(struct ips_scb *scb, size_t *extra)) +{ + *extra = 0; +#ifdef PSM_CUDA + if (!PSMI_IS_DRIVER_GPUDIRECT_ENABLED) + return (void *)(((char *)&scb->pbc) - + (sizeof(struct psm_hal_sdma_req_info) - + PSM_HAL_CUDA_SDMA_REQ_INFO_EXTRA)); + + *extra = PSM_HAL_CUDA_SDMA_REQ_INFO_EXTRA; +#endif + + return (void *)(((char *)&scb->pbc) - (sizeof(struct psm_hal_sdma_req_info))); +} + +#ifdef PSM_CUDA +PSMI_ALWAYS_INLINE( +uint32_t ips_cuda_next_window(uint32_t max_window, uint32_t offset, + uint32_t len)) +{ + uint32_t window_len; + window_len = len - offset; + if (window_len >= max_window) + window_len = max_window; + return window_len; +} +#endif + +#endif /* _IPS_PROTO_H */ diff --git a/ptl_ips/ips_proto_am.c b/ptl_ips/ips_proto_am.c new file mode 100644 index 0000000..31b4d9d --- /dev/null +++ b/ptl_ips/ips_proto_am.c @@ -0,0 +1,614 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. 
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "psm2_am.h"
+#include "psm_am_internal.h"
+#include "psm_mq_internal.h"
+#include "ips_proto.h"
+#include "ips_expected_proto.h"
+#include "ips_proto_help.h"
+
+struct ips_am_token {
+ struct psmi_am_token tok;
+
+ /* ptl-specific token stuff */
+ struct ips_epaddr *epaddr_rail;
+ struct ips_proto_am *proto_am;
+};
+
+struct ips_am_message {
+ struct ips_message_header p_hdr;
+ struct ips_am_message *next;
+ struct ips_epaddr *ipsaddr;
+ struct ips_proto_am *proto_am;
+ uint64_t *payload;
+ uint32_t paylen;
+ uint16_t seqnum;
+};
+
+/* These variables are shared for all packet flows in a PSM process; they are
+ * shared across multiple rails. There is no single AM object to hang these
+ * off of, so they are declared here as globals. */
+static struct {
+ struct ips_am_message head;
+ struct ips_am_message *tail;
+} ips_am_outoforder_q;
+
+static mpool_t ips_am_msg_pool;
+
+/* This calculation ensures that the number of reply slots will always be at
+ * least twice the number of request slots, plus one. This is optimal: the
+ * strict minimum is only twice as many, but running at that minimum is much
+ * slower.
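+ *
+ * Concretely, with nslots total send slots the split below gives
+ *
+ *     rep = nslots * 2 / 3 + 1;       // reply slots
+ *     req = nslots - rep;             // request slots
+ *
+ * e.g. nslots = 64 yields rep = 43 and req = 21, and 43 >= 2 * 21 + 1,
+ * so the reply pool always covers every outstanding request plus one.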
*/ +#define calc_optimal_num_reply_slots(nslots) (((nslots)*2 / 3) + 1) + +psm2_error_t +MOCKABLE(ips_proto_am_init)(struct ips_proto *proto, + int num_send_slots, + uint32_t imm_size, + struct ips_proto_am *proto_am) +{ + psm2_error_t err = PSM2_OK; + int send_buf_size = psmi_hal_get_pio_size(proto->ep->context.psm_hw_ctxt); + int num_rep_slots = calc_optimal_num_reply_slots(num_send_slots); + int num_req_slots = num_send_slots - num_rep_slots; + + proto_am->proto = proto; + + /* In a node pair, the number of reply send buffers on at least one of + * the nodes must be at least double the number (optimal: double + 1) of + * send descriptors on the other node. While this constraint applies + * only to the reply send buffers, allowing the caller to tune only the + * number of request send buffers would be awkward, as they have no + * knowledge of the subdivision of the memory into separate mempools for + * requests and replies. It's an internal concern at this point. */ + if ((err = ips_scbctrl_init(&proto->ep->context, + num_req_slots, + num_req_slots, + imm_size, + send_buf_size, + NULL, + NULL, + &proto_am->scbc_request))) + goto fail; + + if ((err = ips_scbctrl_init(&proto->ep->context, + num_rep_slots, + num_rep_slots, + imm_size, + send_buf_size, + NULL, + NULL, + &proto_am->scbc_reply))) + goto fail; + + if (ips_am_msg_pool == NULL) { + union psmi_envvar_val max_msgs; + + ips_am_outoforder_q.head.next = NULL; + ips_am_outoforder_q.tail = &ips_am_outoforder_q.head; + + psmi_getenv("PSM2_AM_MAX_OOO_MSGS", + "Maximum number of OOO Active Messages to queue before dropping.", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)1024, &max_msgs); + + ips_am_msg_pool = psmi_mpool_create( + sizeof(struct ips_am_message), + 32, max_msgs.e_uint, 0, UNDEFINED, NULL, NULL); + } +fail: + return err; +} +MOCK_DEF_EPILOGUE(ips_proto_am_init); + +psm2_error_t ips_proto_am_fini(struct ips_proto_am *proto_am) +{ + ips_scbctrl_fini(&proto_am->scbc_request); + ips_scbctrl_fini(&proto_am->scbc_reply); + if (ips_am_msg_pool != NULL) { + psmi_mpool_destroy(ips_am_msg_pool); + ips_am_msg_pool = NULL; + } + + return PSM2_OK; +} + +/* Fill in AM capabilities parameters */ +psm2_error_t +ips_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters) +{ + int max_nargs = min(1 << IPS_AM_HDR_NARGS_BITS, PSMI_AM_MAX_ARGS); + int max_payload = + psmi_hal_get_pio_size(ep->context.psm_hw_ctxt) - + ((max_nargs - IPS_AM_HDR_NARGS) * sizeof(psm2_amarg_t)); + + if (parameters == NULL) { + return PSM2_PARAM_ERR; + } + + parameters->max_handlers = 1 << IPS_AM_HDR_HIDX_BITS; + parameters->max_nargs = max_nargs; + parameters->max_request_short = max_payload; + parameters->max_reply_short = max_payload; + + return PSM2_OK; +} + +static +psm2_error_t +am_short_reqrep(ips_scb_t *scb, struct ips_epaddr *ipsaddr, + psm2_amarg_t *args, int nargs, uint8_t opcode, + void *src, size_t len, int flags, int pad_bytes) +{ + int i, hdr_qwords = IPS_AM_HDR_NARGS; + struct ips_proto *proto = ((psm2_epaddr_t)ipsaddr)->proto; + + psmi_assert(proto->msgflowid < EP_FLOW_LAST); + + struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid]; + + /* There are a limited number of bits for nargs in the header, making + overflow very easy. Make sure the values match. */ + psmi_assert(nargs == scb->ips_lrh.amhdr_nargs); + + _HFI_VDBG("%s src=%p len=%d, nargs=%d\n", + ((opcode == OPCODE_AM_REQUEST) || + (opcode == OPCODE_AM_REQUEST_NOREPLY)) ? 
"req" : "rep", + src, (int)len, nargs); + + if (nargs == 1) { /* fastpath */ + scb->ips_lrh.data[0].u64w0 = args[0].u64w0; + hdr_qwords--; + } else if (nargs > 1) { + /* Easily unrollable but leave as is in case we can increase + * qwords on the chip in the near future */ + for (i = 0; i < IPS_AM_HDR_NARGS; i++, hdr_qwords--) + scb->ips_lrh.data[i].u64w0 = args[i].u64w0; + + if (nargs > IPS_AM_HDR_NARGS) { + /* Slow case -- we don't have iovec and not enough + * space in the message header, so we have to copy the + * user's arguments even if the payload is marked ASYNC + */ + uintptr_t bufp = (uintptr_t) ips_scb_buffer(scb); + size_t arg_payload_len = + sizeof(psm2_amarg_t) * (nargs - IPS_AM_HDR_NARGS); + + psmi_mq_mtucpy((void *)bufp, + &args[IPS_AM_HDR_NARGS], + arg_payload_len); + bufp += arg_payload_len; + scb->payload_size = arg_payload_len; + + if (src != NULL && len > 0) { + psmi_mq_mtucpy((void *)bufp, src, len); + scb->payload_size += len; + } + + psmi_assert(pad_bytes < (1 << IPS_AM_HDR_LEN_BITS)); + scb->payload_size += pad_bytes; + scb->ips_lrh.amhdr_len = pad_bytes; + goto send_scb; + } + } + + if (len == 0) { + scb->payload_size = 0; + scb->ips_lrh.amhdr_len = 0; + } else if (len <= (hdr_qwords << 3)) { + /* Inline the payload into the header. */ + /* This path CANNOT handle length = 0 due to limited space + in the header. If IPS_SEND_FLAG_AMISTINY is set, an + amhdr_len value of 0 means a full payload, i.e. + 1 << IPS_AM_HDR_LEN_BITS bytes of packed payload. */ + psmi_assert(len > 0); + + psmi_mq_mtucpy(&scb->ips_lrh. + data[IPS_AM_HDR_NARGS - hdr_qwords], src, len); + scb->payload_size = 0; + psmi_assert(len <= (1 << IPS_AM_HDR_LEN_BITS)); + scb->ips_lrh.amhdr_len = len & ((1 << IPS_AM_HDR_LEN_BITS) - 1); + scb->scb_flags |= IPS_SEND_FLAG_AMISTINY; + } else { /* Whatever's left requires a separate payload */ + if (ips_scb_buffer(scb) == NULL) /* Just attach the buffer */ + ips_scb_buffer(scb) = src; + else /* May need to re-xmit user data, keep it around */ + psmi_mq_mtucpy(ips_scb_buffer(scb), src, len); + + psmi_assert(pad_bytes < (1 << IPS_AM_HDR_LEN_BITS)); + scb->payload_size = len + pad_bytes; + scb->ips_lrh.amhdr_len = pad_bytes; + } + +send_scb: + ips_scb_opcode(scb) = opcode; + scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->am_send_seqnum++; + ips_proto_flow_enqueue(flow, scb); + flow->flush(flow, NULL); + + return PSM2_OK; +} + +static inline int +calculate_pad_bytes(size_t len) +{ + /* Align to dword (4 bytes) */ + size_t dword_aligned_len = (len + 3) & ~3; + return dword_aligned_len - len; +} + +static inline +void +ips_am_scb_init(ips_scb_t *scb, uint8_t handler, int nargs, + int pad_bytes, + psm2_am_completion_fn_t completion_fn, void *completion_ctxt) +{ + psmi_assert(pad_bytes < (1 << IPS_AM_HDR_LEN_BITS)); + + scb->completion_am = completion_fn; + scb->cb_param = completion_ctxt; + scb->ips_lrh.amhdr_hidx = handler; + scb->ips_lrh.amhdr_len = pad_bytes; + scb->ips_lrh.amhdr_nargs = nargs; + scb->ips_lrh.flags = 0; + if (completion_fn) + scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; + return; +} + +psm2_error_t +ips_am_short_request(psm2_epaddr_t epaddr, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm2_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + struct ips_proto_am *proto_am = &epaddr->proto->proto_am; + psm2_error_t err; + ips_scb_t *scb; + ips_epaddr_t *ipsaddr; + int pad_bytes = calculate_pad_bytes(len); + int payload_sz = (nargs << 3); + + if_pt(!(flags & PSM2_AM_FLAG_ASYNC)) + payload_sz += 
len; + + if (payload_sz > (IPS_AM_HDR_NARGS << 3)) { + /* Payload can't fit in header, allocate buffer to carry data */ + int arg_sz = (nargs > IPS_AM_HDR_NARGS) ? + ((nargs - IPS_AM_HDR_NARGS) << 3) : 0; + + /* len + pad_bytes + overflow_args */ + PSMI_BLOCKUNTIL(epaddr->ptlctl->ep, + err, + ((scb = ips_scbctrl_alloc( + &proto_am->scbc_request, + 1, + len + pad_bytes + arg_sz, + IPS_SCB_FLAG_ADD_BUFFER)) != NULL)); + } else { + PSMI_BLOCKUNTIL(epaddr->ptlctl->ep, + err, + ((scb = ips_scbctrl_alloc_tiny( + &proto_am->scbc_request)) != NULL)); + } + + psmi_assert_always(scb != NULL); + ips_am_scb_init(scb, handler, nargs, pad_bytes, + completion_fn, completion_ctxt); + + /* Select the next ipsaddr for multi-rail */ + ipsaddr = ((ips_epaddr_t *)epaddr)->msgctl->ipsaddr_next; + ipsaddr->msgctl->ipsaddr_next = ipsaddr->next; + + return am_short_reqrep(scb, ipsaddr, args, + nargs, + (flags & PSM2_AM_FLAG_NOREPLY) ? + OPCODE_AM_REQUEST_NOREPLY : OPCODE_AM_REQUEST, + src, len, flags, pad_bytes); +} + +psm2_error_t +ips_am_short_reply(psm2_am_token_t tok, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm2_am_completion_fn_t completion_fn, void *completion_ctxt) +{ + struct ips_am_token *token = (struct ips_am_token *)tok; + struct ips_proto_am *proto_am = token->proto_am; + struct ips_epaddr *ipsaddr = token->epaddr_rail; + int pad_bytes = calculate_pad_bytes(len); + int scb_flags = 0; + ips_scb_t *scb; + + if (!token->tok.can_reply) { + _HFI_ERROR("Invalid AM reply for request!"); + return PSM2_AM_INVALID_REPLY; + } + + psmi_assert(ips_scbctrl_avail(&proto_am->scbc_reply)); + + if ((nargs << 3) + len <= (IPS_AM_HDR_NARGS << 3)) { + scb = ips_scbctrl_alloc_tiny(&proto_am->scbc_reply); + } else { + int payload_sz = (nargs << 3); + + payload_sz += (flags & PSM2_AM_FLAG_ASYNC) ? + 0 : (len + pad_bytes); + scb_flags |= (payload_sz > (IPS_AM_HDR_NARGS << 3)) ? + IPS_SCB_FLAG_ADD_BUFFER : 0; + + scb = + ips_scbctrl_alloc(&proto_am->scbc_reply, 1, payload_sz, + scb_flags); + } + + psmi_assert_always(scb != NULL); + ips_am_scb_init(scb, handler, nargs, pad_bytes, + completion_fn, completion_ctxt); + am_short_reqrep(scb, ipsaddr, args, nargs, OPCODE_AM_REPLY, + src, len, flags, pad_bytes); + return PSM2_OK; +} + +/* Prepares and runs a handler from a receive event. 
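+ *
+ * (Editor's note, not in the original source: two decode paths follow.
+ * For an AMISTINY message the payload is packed into the header words
+ * right after the args, and amhdr_len == 0 is re-interpreted as the full
+ * 1 << IPS_AM_HDR_LEN_BITS == 16 bytes. For a non-TINY message with
+ * nargs > IPS_AM_HDR_NARGS, the first two args come from the header and
+ * the remaining (nargs - IPS_AM_HDR_NARGS) * sizeof(psm2_amarg_t) bytes
+ * are peeled off the front of the payload before the handler runs.)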
 */
+static int
+ips_am_run_handler(const struct ips_message_header *p_hdr,
+		struct ips_epaddr *ipsaddr, struct ips_proto_am *proto_am,
+		uint64_t *payload,
+		uint32_t paylen)
+{
+	struct ips_am_token token;
+	int nargs = p_hdr->amhdr_nargs;
+	int ret;
+	struct psm2_ep_am_handle_entry *hentry;
+	psm2_amarg_t *args = (psm2_amarg_t *)p_hdr->data;
+
+	token.tok.flags = p_hdr->flags;
+	token.tok.epaddr_incoming = (psm2_epaddr_t)&ipsaddr->msgctl->master_epaddr;
+	token.tok.can_reply =
+		(_get_proto_hfi_opcode(p_hdr) == OPCODE_AM_REQUEST);
+	token.epaddr_rail = ipsaddr;
+	token.proto_am = proto_am;
+
+	if (token.tok.flags & IPS_SEND_FLAG_AMISTINY) {
+		/* Payload is packed into header after args */
+		payload = (uint64_t *)&p_hdr->data[nargs].u64;
+		paylen = p_hdr->amhdr_len;
+		/* Interpret amhdr_len == 0 as 16 bytes of payload */
+		if (paylen == 0)
+			paylen = 1 << IPS_AM_HDR_LEN_BITS;
+	} else {
+		if (nargs > IPS_AM_HDR_NARGS) {
+			/* Args are split across header and payload */
+			int payload_args_len =
+				(nargs - IPS_AM_HDR_NARGS) *
+				sizeof(psm2_amarg_t);
+
+			args = alloca(PSMI_AM_MAX_ARGS * sizeof(psm2_amarg_t));
+
+			args[0].u64 = p_hdr->data[0].u64;
+			args[1].u64 = p_hdr->data[1].u64;
+
+			memcpy(&args[2], payload, payload_args_len);
+
+			payload += nargs - IPS_AM_HDR_NARGS;
+			paylen -= payload_args_len;
+		}
+
+		/* Subtract off padding bytes (dword padding) for non-TINY. */
+		paylen -= p_hdr->amhdr_len;
+	}
+
+	hentry = psm_am_get_handler_function(proto_am->proto->ep,
+					  p_hdr->amhdr_hidx);
+
+	/* Note: a guard for hentry != NULL is not needed here because, at
+	 * initialization, a psmi_assert_always() assures the entry will be
+	 * non-NULL. */
+
+	if (likely(hentry->version == PSM2_AM_HANDLER_V2)) {
+		psm2_am_handler_2_fn_t hfn2 =
+				(psm2_am_handler_2_fn_t)hentry->hfn;
+		ret = hfn2(&token, args, nargs, payload, paylen, hentry->hctx);
+	} else {
+		psm2_am_handler_fn_t hfn1 =
+				(psm2_am_handler_fn_t)hentry->hfn;
+		ret = hfn1(&token, args, nargs, payload, paylen);
+	}
+
+	return ret;
+}
+
+static int
+ips_proto_am_handle_outoforder_queue(void)
+{
+	struct ips_am_message *msg, *prev;
+	int ret = IPS_RECVHDRQ_CONTINUE;
+
+	prev = &ips_am_outoforder_q.head;
+	msg = ips_am_outoforder_q.head.next;
+
+	while (msg != NULL) {
+		struct ips_epaddr *ipsaddr = msg->ipsaddr;
+		if (ipsaddr->msgctl->am_recv_seqnum != msg->seqnum) {
+			prev = msg;
+			msg = msg->next;
+			continue;
+		}
+
+		ipsaddr->msgctl->am_recv_seqnum++;
+
+		if (ips_am_run_handler(&msg->p_hdr,
+					ipsaddr, msg->proto_am,
+					msg->payload, msg->paylen))
+			ret = IPS_RECVHDRQ_BREAK;
+
+		prev->next = msg->next;
+		if (prev->next == NULL)
+			ips_am_outoforder_q.tail = prev;
+
+		psmi_mq_sysbuf_free(msg->proto_am->proto->mq, msg->payload);
+		psmi_mpool_put(msg);
+
+		msg = prev->next;
+	}
+
+	return ret;
+}
+
+static void
+ips_proto_am_queue_msg(struct ips_am_message *msg)
+{
+	msg->next = NULL;
+	ips_am_outoforder_q.tail->next = msg;
+	ips_am_outoforder_q.tail = msg;
+}
+
+int ips_proto_am(struct ips_recvhdrq_event *rcv_ev)
+{
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	struct ips_epaddr *ipsaddr = rcv_ev->ipsaddr;
+	struct ips_proto_am *proto_am = &rcv_ev->proto->proto_am;
+	ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr);
+	struct ips_flow *flow;
+	struct ips_am_message *msg = NULL;
+	int ret = IPS_RECVHDRQ_CONTINUE;
+	enum ips_msg_order msgorder;
+
+	psmi_assert(flowid < EP_FLOW_LAST);
+	flow = &ipsaddr->flows[flowid];
+	/*
+	 * Based on the AM request/reply traffic pattern, if we don't have a
+	 * reply scb slot then we can't process the request packet; we just silently
+	 * drop it. Otherwise, it would deadlock. Note:
+	 * ips_proto_is_expected_or_nak() cannot be called in this case.
+	 */
+	if (_get_proto_hfi_opcode(p_hdr) == OPCODE_AM_REQUEST &&
+	    !ips_scbctrl_avail(&proto_am->scbc_reply))
+		return IPS_RECVHDRQ_CONTINUE;
+
+	if (!ips_proto_is_expected_or_nak(rcv_ev))
+		return IPS_RECVHDRQ_CONTINUE;
+
+	uint16_t send_msgseq =
+	    __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK;
+	msgorder = ips_proto_check_msg_order(ipsaddr, flow, send_msgseq,
+					     &ipsaddr->msgctl->am_recv_seqnum);
+
+	if (msgorder == IPS_MSG_ORDER_FUTURE)
+		return IPS_RECVHDRQ_REVISIT;
+	else if (msgorder == IPS_MSG_ORDER_FUTURE_RECV) {
+		uint64_t *msg_payload;
+		uint64_t *payload = ips_recvhdrq_event_payload(rcv_ev);
+		uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev);
+
+		psmi_assert(paylen == 0 || payload);
+		msg = psmi_mpool_get(ips_am_msg_pool);
+		if (unlikely(msg == NULL)) {
+			/* Out of memory, drop the packet. */
+			flow->recv_seq_num.psn_num =
+				(flow->recv_seq_num.psn_num - 1) &
+				rcv_ev->proto->psn_mask;
+			return IPS_RECVHDRQ_BREAK;
+		}
+		msg_payload = psmi_mq_sysbuf_alloc(
+				proto_am->proto->mq,
+				ips_recvhdrq_event_paylen(rcv_ev));
+		if (unlikely(msg_payload == NULL)) {
+			/* Out of memory, drop the packet. */
+			flow->recv_seq_num.psn_num =
+				(flow->recv_seq_num.psn_num - 1) &
+				rcv_ev->proto->psn_mask;
+			psmi_mpool_put(msg);
+			return IPS_RECVHDRQ_BREAK;
+		}
+
+		memcpy(&msg->p_hdr, p_hdr, sizeof(struct ips_message_header));
+		memcpy(msg_payload, payload, paylen);
+
+		msg->payload = msg_payload;
+		msg->ipsaddr = ipsaddr;
+		msg->proto_am = proto_am;
+		msg->paylen = paylen;
+		msg->seqnum =
+			__le32_to_cpu(p_hdr->khdr.kdeth0) &
+			HFI_KHDR_MSGSEQ_MASK;
+
+		ips_proto_am_queue_msg(msg);
+	} else if ((msgorder == IPS_MSG_ORDER_EXPECTED) ||
+		   (msgorder == IPS_MSG_ORDER_EXPECTED_MATCH)) {
+		uint64_t *payload = ips_recvhdrq_event_payload(rcv_ev);
+		uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev);
+
+		psmi_assert(paylen == 0 || payload);
+		if (ips_am_run_handler(p_hdr, ipsaddr, proto_am,
+					payload, paylen))
+			ret = IPS_RECVHDRQ_BREAK;
+
+		ips_proto_am_handle_outoforder_queue();
+	}
+
+	/* Check whether the handler replied; if it didn't, ack the request */
+	if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) ||
+	    (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+		ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+
+	ips_proto_process_ack(rcv_ev);
+	return ret;
+}
diff --git a/ptl_ips/ips_proto_am.h b/ptl_ips/ips_proto_am.h
new file mode 100644
index 0000000..3e0a271
--- /dev/null
+++ b/ptl_ips/ips_proto_am.h
@@ -0,0 +1,93 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _IPS_PROTO_AM_H +#define _IPS_PROTO_AM_H + +#include "psm_user.h" +#include "ips_scb.h" + +struct ips_proto_am { + struct ips_proto *proto; /* back pointer */ + struct ips_scbctrl scbc_request; + struct ips_scbctrl scbc_reply; +}; + +psm2_error_t +ips_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters); + +psm2_error_t +ips_am_short_reply(psm2_am_token_t tok, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm2_am_completion_fn_t completion_fn, void *completion_ctxt); + +psm2_error_t +ips_am_short_request(psm2_epaddr_t epaddr, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm2_am_completion_fn_t completion_fn, + void *completion_ctxt); + +psm2_error_t +MOCKABLE(ips_proto_am_init)(struct ips_proto *proto, + int num_send_slots, + uint32_t imm_size, + struct ips_proto_am *proto_am); +MOCK_DCL_EPILOGUE(ips_proto_am_init); + +psm2_error_t ips_proto_am_fini(struct ips_proto_am *proto_am); + +#endif /* _IPS_PROTO_AM_H */ diff --git a/ptl_ips/ips_proto_connect.c b/ptl_ips/ips_proto_connect.c new file mode 100644 index 0000000..a608760 --- /dev/null +++ b/ptl_ips/ips_proto_connect.c @@ -0,0 +1,1559 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm2_hal.h" +#include "ips_proto.h" +#include "psm_mq_internal.h" +#include "ips_proto_internal.h" + +/* + * define connection version. this is the basic version, optimized + * version will be added later for scalability. + */ +#define IPS_CONNECT_VERNO 0x0001 + +struct ips_connect_hdr { + uint16_t connect_verno; /* should be ver 1 */ + uint16_t psm_verno; /* should be 2.0 */ + uint32_t connidx; /* ignore if 0xffffffff */ + uint64_t epid; /* epid of connector process */ +}; + +struct ips_connect_reqrep { + uint16_t connect_verno; /* should be ver 1 */ + uint16_t psm_verno; /* should be 2.0 */ + uint32_t connidx; /* ignore if 0xffffffff */ + uint64_t epid; /* epid of connector process */ + /* above should be same as ips_connect_hdr */ + + uint16_t connect_result; /* error code */ + uint16_t sl; /* service level for matching */ + uint16_t mtu; /* receive payload */ + uint16_t job_pkey; /* partition key for verification */ + + uint32_t runid_key; /* one-time stamp connect key */ + uint32_t initpsn; /* initial psn for flow */ + + char hostname[128]; /* sender's hostname string */ +}; + +/* Startup protocol in PSM/IPS + * + * Start timer. + * + * For all nodes to connect to: + * Grab connect lock + * Look up epid in table + * MATCH. + * assert cstate_outgoing != CONNECT_WAITING (no re-entrancy) + * If cstate_outgoing == CONNECT_DONE + * return the already connected address. + * else + * assert cstate_outgoing == CONNECT_NONE + * assert cstate_incoming == CONNECT_DONE + * cstate_outgoing := CONNECT_WAITING + * assert connidx_outgoing != UNKNOWN && connidx_incoming != UNKNOWN + * req->connidx := epaddr->connidx_incoming + * add to list of pending connect. 
+ *   NO MATCH
+ *       allocate epaddr and put in table
+ *       cstate_outgoing := CONNECT_WAITING
+ *       cstate_incoming := CONNECT_NONE
+ *       connidx_outgoing := UNKNOWN
+ *       req->connidx := epaddr->connidx_incoming := NEW connidx integer
+ *       add to list of pending connect
+ *   Release connect lock
+ *
+ * expected_connect_count = ep->total_connect_count + num_to_connect
+ * while (expected_connect_count != ep->total_connect_count)
+ *      check for timeout
+ *      progress();
+ *
+ * For all connection requests received (within progress loop)
+ *   If uuid doesn't match, NAK the connect and skip request
+ *   Grab connect lock
+ *   Look up epid in table
+ *   MATCH
+ *       if cstate_incoming == CONNECT_DONE
+ *           req->connidx := epaddr->connidx_incoming
+ *           compose reply and send again (this is a dupe request).
+ *       else
+ *           assert cstate_incoming == CONNECT_NONE
+ *           assert cstate_outgoing == (CONNECT_WAITING | CONNECT_DONE)
+ *           cstate_incoming := CONNECT_DONE
+ *           epaddr->connidx_outgoing := req->connidx
+ *           req->connidx := epaddr->connidx_incoming
+ *   NO MATCH
+ *       allocate epaddr and put in table
+ *       cstate_incoming := CONNECT_DONE
+ *       epaddr->connidx_outgoing = req->connidx;
+ *       rep->connidx := epaddr->connidx_incoming := NEW connidx integer
+ *       compose connect reply and send
+ *   Release connect lock
+ *
+ * For all connection replies received:
+ *    If connect_result != 0, process error and skip.
+ *    assert cstate_outgoing == CONNECT_WAITING
+ *    if cstate_incoming == CONNECT_DONE
+ *        assert rep->connidx == epaddr->connidx_outgoing
+ *    else
+ *        epaddr->connidx_outgoing := rep->connidx
+ *    cstate_outgoing := CONNECT_DONE
+ *    ep->total_connect_count ++
+ *
+ * Fill in a connection request:
+ *   1. Set the connect protocol version and the PSM version
+ *   2. Set the uuid attached to the current endpoint and add the job_pkey
+ *      with which the node wishes to communicate post-connect.
+ *   3. Set our mtu, bitwidth and endianness to detect inconsistencies
+ *
+ */
+
+/**
+ * Configure flows for an ipsaddr.
+ *
+ * @arg ipsaddr - the ipsaddr to configure the flows for
+ * @arg proto - the protocol used
+ *
+ * @pre proto's flags must be set
+ *
+ * Flows should be configured:
+ * - immediately upon creation of an ipsaddr
+ * - whenever a connection is established and the receiver's characteristics
+ *   (e.g. mtu) become known
+ */
+ustatic
+void
+ips_ipsaddr_configure_flows(struct ips_epaddr *ipsaddr, struct ips_proto *proto)
+{
+	/* The PIO flow uses the normal-priority path, leaving the low-priority
+	 * path free for bulk sdma data packets
+	 */
+	ips_flow_init(&ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO], proto,
+		      ipsaddr, PSM_TRANSFER_PIO, PSM_PROTOCOL_GO_BACK_N,
+		      IPS_PATH_NORMAL_PRIORITY, EP_FLOW_GO_BACK_N_PIO);
+
+	/* The DMA flow uses the low-priority path; multi-MTU-sized eager
+	 * messages use the same flow so transfers do not arrive out of order.
+	 */
+	ips_flow_init(&ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA], proto,
+		      ipsaddr, PSM_TRANSFER_DMA, PSM_PROTOCOL_GO_BACK_N,
+		      IPS_PATH_LOW_PRIORITY, EP_FLOW_GO_BACK_N_DMA);
+}
+
+/*
+ * Tear down any unnecessary timers that could still be active and assign NULL
+ * to pointers in flow structs. We do this mainly for the PIO and DMA flows.
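+ * (Editor's note, not in the original source: "PIO and DMA flows" means
+ * the EP_FLOW_GO_BACK_N_PIO and EP_FLOW_GO_BACK_N_DMA entries, i.e. every
+ * flow index below EP_FLOW_TIDFLOW, which is exactly the range the loop
+ * below walks.)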
+ * TidFlow teardowns are conducted in ips_protoexp_fini()
+ */
+static
+void
+ips_flow_fini(struct ips_epaddr *ipsaddr, struct ips_proto *proto)
+{
+	struct ips_flow *flow;
+	int i;
+
+	for (i = 0; i < EP_FLOW_TIDFLOW; i++) {
+		flow = &ipsaddr->flows[i];
+
+		/* Cancel any stale flow->timers in flight */
+		if (flow->timer_ack) {
+			psmi_timer_cancel(proto->timerq, flow->timer_ack);
+			flow->timer_ack = NULL;
+		}
+
+		if (flow->timer_send) {
+			psmi_timer_cancel(proto->timerq, flow->timer_send);
+			flow->timer_send = NULL;
+		}
+
+		flow->flush = NULL;
+		flow->path = NULL;
+		flow->ipsaddr = NULL;
+	}
+}
+
+static
+psm2_epaddr_t
+ips_alloc_epaddr(struct ips_proto *proto, int master, psm2_epid_t epid,
+		 const char *hostname, uint16_t hfi_type, unsigned long timeout);
+
+/*
+ * Given a connection request, set mtu, communication index and hdr length
+ * parameters.
+ *
+ * The most subtle parameter is the mtu. When set as 'req->mtu', the mtu
+ * is our connecting peer's declared mtu (which may not be the same as our
+ * mtu). The approach is to take the smaller of both mtus when communicating
+ * with that peer. Also, when using pio, the size can be further restricted by
+ * the pio send buffer sizes (i.e. 4K IB MTU but only 2K PIO buffers).
+ */
+static
+psm2_error_t
+ips_ipsaddr_set_req_params(struct ips_proto *proto,
+			   ips_epaddr_t *ipsaddr,
+			   const struct ips_connect_reqrep *req,
+			   uint32_t paylen)
+{
+	psm2_ep_t ep;
+	psm2_epaddr_t epaddr;
+	psm2_error_t err = PSM2_OK;
+	int i, start, count;
+	uint64_t *data;
+	psmi_assert_always(req->mtu > 0);
+	uint16_t common_mtu = min(req->mtu, proto->epinfo.ep_mtu);
+	int ptype, pidx;
+
+	/*
+	 * Make the RNDV window size dependent on the MTU size;
+	 * this is due to the fact that the number of send packets
+	 * within a given window must not exceed 2048 (@ref PSM_TID_MAX_PKTS).
+	 * Use the smaller of two values:
+	 * unified MTU * PSM_TID_MAX_PKTS vs the already configured window size.
+	 */
+	ipsaddr->window_rv = min(common_mtu * PSM_TID_MAX_PKTS, proto->mq->hfi_base_window_rv);
+
+	/*
+	 * For static routes, i.e. "none" path resolution, update all paths to
+	 * have the same profile (mtu, sl etc.).
+	 *
+	 * For path record queries the epr_mtu and epr_sl are set up correctly
+	 * from the path itself.
+	 */
+	for (ptype = IPS_PATH_LOW_PRIORITY;
+	     ptype < IPS_PATH_MAX_PRIORITY; ptype++)
+		for (pidx = 0;
+		     pidx < ipsaddr->pathgrp->pg_num_paths[ptype]; pidx++) {
+			if (proto->ep->path_res_type == PSM2_PATH_RES_NONE) {
+				ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_mtu =
+					common_mtu;
+			} else {
+				ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_mtu =
+				    min(common_mtu,
+					ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_mtu);
+			}
+		}
+
+	/*
+	 * We've got updated mtu/path records; re-initialize the flows to take
+	 * into account the _real_ (updated) remote endpoint characteristics.
+	 */
+	ips_ipsaddr_configure_flows(ipsaddr, proto);
+
+	/*
+	 * Save the peer's info.
+	 */
+	ipsaddr->connidx_outgoing = req->connidx;
+	ipsaddr->runid_key = req->runid_key;
+	/* ipsaddr->initpsn = req->initpsn; */
+
+	err =
+	    psmi_epid_set_hostname(psm2_epid_nid(((psm2_epaddr_t) ipsaddr)->epid),
+				   (char *)req->hostname, 0);
+	if (err)
+		return err;
+
+	/*
+	 * Check if there are other rails to set up.
+	 */
+	paylen -= sizeof(struct ips_connect_reqrep);
+	if (paylen == 0)
+		return PSM2_OK;
+
+	/*
+	 * Yes, the other rails' gid/epid pairs are attached.
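+	 * (Editor's note, not in the original source: the trailing bytes are
+	 * an array of (gid_hi, epid) pairs, one pair per extra rail, each
+	 * sizeof(uint64_t) + sizeof(psm2_epid_t) bytes wide -- 16 bytes when
+	 * epids are 64-bit -- so e.g. a request advertising two extra rails
+	 * arrives with paylen equal to sizeof(struct ips_connect_reqrep)
+	 * plus two such pairs.)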
+ */ + if (paylen % (sizeof(uint64_t) + sizeof(psm2_epid_t))) { + return PSM2_INTERNAL_ERR; + } + count = paylen / (sizeof(uint64_t) + sizeof(psm2_epid_t)); + if (count > HFI_MAX_RAILS) + return PSM2_INTERNAL_ERR; + + /* + * Both side are ordered, so just search from small to big. + */ + start = 0; + data = (uint64_t *) (req + 1); + ep = proto->ep->mctxt_next; + + struct drand48_data drand48_data; + srand48_r((long int)(ipsaddr->epaddr.epid + proto->ep->epid), &drand48_data); + + /* Loop over all slave endpoints */ + while (ep != ep->mctxt_master) { + for (i = start; i < count; i++) { + + /* There is a gid match, create the epaddr */ + if (data[2 * i] == ep->gid_hi) { + + epaddr = + ips_alloc_epaddr(&((struct ptl_ips *)(ep->ptl_ips.ptl))->proto, 0, + data[2 * i + 1], NULL, + PSMI_HFI_TYPE_OPA1, + 5000); + if (epaddr == NULL) + return PSM2_NO_MEMORY; + + /* link the ipsaddr */ + IPS_MCTXT_APPEND(ipsaddr, + (ips_epaddr_t *) epaddr); + + /* Setup message control info to the same struct */ + ((ips_epaddr_t *) epaddr)->msgctl = + ipsaddr->msgctl; + ipsaddr->msgctl->ipsaddr_count++; + + /* randomize the rail to start traffic */ + long int rnum; + lrand48_r(&drand48_data, &rnum); + if ((rnum % count) == i) { + ipsaddr->msgctl->ipsaddr_next = + (ips_epaddr_t *) epaddr; + } + + /* update the starting point, + * all previous ones are not valid anymore */ + start = i + 1; + break; + } + } + + ep = ep->mctxt_next; + } + + return PSM2_OK; +} + +static psm2_error_t +ips_proto_send_ctrl_message_request(struct ips_proto *proto, + struct ips_flow *flow, uint8_t message_type, + uint16_t *msg_queue_mask, uint64_t timeout) +{ + psm2_error_t err = PSM2_OK; + ips_scb_t ctrlscb; + + /* msg header plus gid+epid for all rails plus checksum */ + char payload[sizeof(struct ips_connect_reqrep) + + 16*HFI_MAX_RAILS + PSM_CRC_SIZE_IN_BYTES]; + uint32_t paylen; + + ctrlscb.scb_flags = 0; + paylen = ips_proto_build_connect_message(proto, + flow->ipsaddr, message_type, payload); + psmi_assert_always(paylen <= sizeof(payload)); + + do { + err = ips_proto_send_ctrl_message(flow, message_type, + msg_queue_mask, &ctrlscb, payload, paylen); + if (err == PSM2_OK) { + break; + } + if ((err = psmi_err_only(psmi_poll_internal(proto->ep, 1)))) { + break; + } + } while (get_cycles() < timeout); + + return err; +} + +static psm2_error_t +ips_proto_send_ctrl_message_reply(struct ips_proto *proto, + struct ips_flow *flow, uint8_t message_type, + uint16_t *msg_queue_mask) +{ + /* This will try up to 100 times until the message is sent. The code + * is persistent because dropping replies will lead to a lack of + * overall progress on the connection/disconnection. We do not want + * to poll from here, and we cannot afford a lengthy timeout, since + * this is called from the receive path. 
+ */ + psm2_error_t err = PSM2_OK; + int i; + ips_scb_t ctrlscb; + /* msg header plus gid+epid for all rails plus checksum */ + char payload[sizeof(struct ips_connect_reqrep) + + 16*HFI_MAX_RAILS + PSM_CRC_SIZE_IN_BYTES]; + uint32_t paylen; + + ctrlscb.scb_flags = 0; + paylen = ips_proto_build_connect_message(proto, + flow->ipsaddr, message_type, payload); + psmi_assert_always(paylen <= sizeof(payload)); + + for (i = 0; i < 100; i++) { + err = ips_proto_send_ctrl_message(flow, message_type, + msg_queue_mask, &ctrlscb, payload, paylen); + if (err == PSM2_OK) { + break; + } + } + + return err; +} + +int +ips_proto_build_connect_message(struct ips_proto *proto, + ips_epaddr_t *ipsaddr, + uint8_t opcode, void *payload) +{ + struct ips_connect_hdr *hdr = (struct ips_connect_hdr *)payload; + struct ips_connect_reqrep *req = (struct ips_connect_reqrep *)payload; + uint32_t paylen = 0; + + psmi_assert_always(proto != NULL); + + hdr->connect_verno = IPS_CONNECT_VERNO; + hdr->psm_verno = PSMI_VERNO; + hdr->connidx = (uint32_t) ipsaddr->connidx_incoming; + hdr->epid = proto->ep->epid; + + switch (opcode) { + case OPCODE_CONNECT_REPLY: + case OPCODE_CONNECT_REQUEST: + if (opcode == OPCODE_CONNECT_REQUEST) { + req->connect_result = PSM2_OK; + req->runid_key = proto->runid_key; + } else { + req->connect_result = ipsaddr->cerror_incoming; + req->runid_key = ipsaddr->runid_key; + } + + req->sl = proto->epinfo.ep_sl; + req->mtu = proto->epinfo.ep_mtu; + req->job_pkey = proto->epinfo.ep_pkey; + + strncpy(req->hostname, psmi_gethostname(), + sizeof(req->hostname) - 1); + req->hostname[sizeof(req->hostname) - 1] = '\0'; + + paylen = sizeof(struct ips_connect_reqrep); + + /* Attach all multi-context subnetids and epids. */ + if (proto->ep->mctxt_master == proto->ep) { + psm2_ep_t ep = proto->ep->mctxt_next; + uint64_t *data = (uint64_t *) (req + 1); + while (ep != proto->ep) { + *data = ep->gid_hi; + paylen += sizeof(uint64_t); + data++; + *data = ep->epid; + paylen += sizeof(uint64_t); + data++; + ep = ep->mctxt_next; + } + } + + break; + + case OPCODE_DISCONNECT_REQUEST: + case OPCODE_DISCONNECT_REPLY: + paylen = sizeof(struct ips_connect_hdr); + break; + + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Unexpected/unhandled connection opcode 0x%x\n", + opcode); + break; + } + + return paylen; +} + +void +MOCKABLE(ips_flow_init)(struct ips_flow *flow, struct ips_proto *proto, + ips_epaddr_t *ipsaddr, psm_transfer_type_t transfer_type, + psm_protocol_type_t protocol, ips_path_type_t path_type, + uint32_t flow_index) +{ + psmi_assert_always(protocol < PSM_PROTOCOL_LAST); + psmi_assert_always(flow_index < EP_FLOW_LAST); + + SLIST_NEXT(flow, next) = NULL; + if (transfer_type == PSM_TRANSFER_PIO) { + flow->flush = ips_proto_flow_flush_pio; + } else { + flow->flush = ips_proto_flow_flush_dma; + } + + flow->path = + ips_select_path(proto, path_type, ipsaddr, ipsaddr->pathgrp); + + /* Select the fragment size for this flow. Flow is the common + * denominator between the local endpoint, the remote endpoint, + * the path between those and whether it's a PIO or DMA send. + * Hence, it "owns" the maximum transmission unit in its frag_size + * member. 
+ */ + + /* min of local MTU and path MTU */ + flow->frag_size = min(proto->epinfo.ep_mtu, flow->path->pr_mtu); + /* if PIO, need to consider local pio buffer size */ + if (transfer_type == PSM_TRANSFER_PIO) { + flow->frag_size = min(flow->frag_size, proto->epinfo.ep_piosize); + _HFI_VDBG("[ipsaddr=%p] PIO flow->frag_size: %u = min(" + "proto->epinfo.ep_mtu(%u), flow->path->pr_mtu(%u), proto->epinfo.ep_piosize(%u))\n", + ipsaddr, flow->frag_size, proto->epinfo.ep_mtu, + flow->path->pr_mtu, proto->epinfo.ep_piosize); + } else { + _HFI_VDBG("[ipsaddr=%p] SDMA flow->frag_size: %u = min(" + "proto->epinfo.ep_mtu(%u), flow->path->pr_mtu(%u))\n", + ipsaddr, flow->frag_size, proto->epinfo.ep_mtu, + flow->path->pr_mtu); + } + + flow->ipsaddr = ipsaddr; + flow->transfer = transfer_type; + flow->protocol = protocol; + flow->flowid = flow_index; + flow->xmit_seq_num.psn_val = 0; + flow->recv_seq_num.psn_val = 0; + flow->xmit_ack_num.psn_val = 0; + flow->flags = 0; + flow->cca_ooo_pkts = 0; + flow->credits = flow->cwin = proto->flow_credits; + flow->ack_interval = max((proto->flow_credits >> 2) - 1, 1); + flow->ack_counter = 0; +#ifdef PSM_DEBUG + flow->scb_num_pending = 0; + flow->scb_num_unacked = 0; +#endif + + flow->timer_ack = NULL; + flow->timer_send = NULL; + + STAILQ_INIT(&flow->scb_unacked); + SLIST_INIT(&flow->scb_pend); + return; +} +MOCK_DEF_EPILOGUE(ips_flow_init); + +static +psm2_epaddr_t +ips_alloc_epaddr(struct ips_proto *proto, int master, psm2_epid_t epid, + const char *hostname, uint16_t hfi_type, unsigned long timeout) +{ + psm2_error_t err = PSM2_OK; + psm2_epaddr_t epaddr; + ips_epaddr_t *ipsaddr; + ips_path_grp_t *pathgrp; + uint16_t lid; + + /* The PSM/PTL-level epaddr, ips-level epaddr, and per-peer msgctl + * structures are collocated in memory for performance reasons -- this is + * why ips allocates memory for all three together. + * + * The PSM/PTL structure data is filled in upon successfully ep connect in + * ips_ptl_connect(). + */ + if (master) { + struct ips_msgctl *msgctl; + + /* Although an ips_msgtl is allocated here, it can be safely casted to + both an ips_epaddr and a psm2_epaddr. It is eventually freed as an + ips_epaddr. 
*/ + msgctl = + (struct ips_msgctl *)psmi_calloc(proto->ep, + PER_PEER_ENDPOINT, 1, + sizeof(struct ips_msgctl)); + if (msgctl == NULL) + return NULL; + + ipsaddr = &msgctl->master_epaddr; + epaddr = (psm2_epaddr_t) ipsaddr; + + ipsaddr->msgctl = msgctl; + + /* initialize items in ips_msgctl_t */ + msgctl->ipsaddr_next = ipsaddr; + msgctl->mq_send_seqnum = 0; + msgctl->mq_recv_seqnum = 0; + msgctl->am_send_seqnum = 0; + msgctl->am_recv_seqnum = 0; + msgctl->ipsaddr_count = 1; + msgctl->outoforder_count = 0; + } else { + epaddr = + (psm2_epaddr_t) psmi_calloc(proto->ep, PER_PEER_ENDPOINT, 1, + sizeof(struct ips_epaddr)); + psmi_assert_always(epaddr); + ipsaddr = (ips_epaddr_t *) epaddr; + } + + epaddr->ptlctl = ((struct ptl_ips *)(proto->ptl))->ctl; + epaddr->proto = proto; + epaddr->epid = epid; + + /* IPS-level epaddr */ + ipsaddr->next = ipsaddr; + + ipsaddr->ctrl_msg_queued = 0; + ipsaddr->msg_toggle = 0; + + /* Actual context of peer */ + ipsaddr->context = PSMI_EPID_GET_CONTEXT(epid); + /* Subcontext */ + ipsaddr->subcontext = PSMI_EPID_GET_SUBCONTEXT(epid); + + /* Get path record for tuple */ + lid = PSMI_EPID_GET_LID(epid); + err = proto->ibta.get_path_rec(proto, proto->epinfo.ep_base_lid, + __cpu_to_be16(lid), hfi_type, timeout, + &pathgrp); + if (err != PSM2_OK) { + psmi_free(epaddr); + return NULL; + } + ipsaddr->pathgrp = pathgrp; + + /* Setup high priority path index, control messages use the high + * priority CONTROL path. + */ + if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) + ipsaddr->hpp_index = 0; + else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST) + ipsaddr->hpp_index = ipsaddr->context % + ipsaddr->pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY]; + else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC) + ipsaddr->hpp_index = proto->epinfo.ep_context % + ipsaddr->pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY]; + else /* Base LID */ + ipsaddr->hpp_index = 0; + + /* + * Set up the flows on this ipsaddr + */ + ips_ipsaddr_configure_flows(ipsaddr, proto); + + /* clear connection state. */ + ipsaddr->cstate_outgoing = CSTATE_NONE; + ipsaddr->cstate_incoming = CSTATE_NONE; + + /* Add epaddr to PSM's epid table */ + psmi_epid_add(proto->ep, epaddr->epid, epaddr); + psmi_assert(psmi_epid_lookup(proto->ep, epaddr->epid) == epaddr); + + return epaddr; +} + +static +void ips_free_epaddr(psm2_epaddr_t epaddr, struct ips_proto *proto) +{ + ips_epaddr_t *ipsaddr = (ips_epaddr_t *) epaddr; + ips_flow_fini(ipsaddr, proto); + + _HFI_VDBG("epaddr=%p,ipsaddr=%p,connidx_incoming=%d\n", epaddr, ipsaddr, + ipsaddr->connidx_incoming); + psmi_epid_remove(epaddr->proto->ep, epaddr->epid); + ips_epstate_del(epaddr->proto->epstate, ipsaddr->connidx_incoming); + psmi_free(epaddr); + return; +} + +static +psm2_error_t +ptl_handle_connect_req(struct ips_proto *proto, + psm2_epaddr_t epaddr, struct ips_connect_reqrep *req, + uint32_t paylen); + +psm2_error_t +ips_proto_process_connect(struct ips_proto *proto, uint8_t opcode, + struct ips_message_header *p_hdr, void *payload, + uint32_t paylen) +{ + struct ips_connect_hdr *hdr = (struct ips_connect_hdr *)payload; + psm2_epaddr_t epaddr; + ips_epaddr_t *ipsaddr; + psm2_error_t err = PSM2_OK; + + PSMI_LOCK_ASSERT(proto->mq->progress_lock); + + epaddr = psmi_epid_lookup(proto->ep, hdr->epid); + ipsaddr = epaddr ? 
(ips_epaddr_t *) epaddr : NULL;
+
+	switch (opcode) {
+	case OPCODE_CONNECT_REQUEST:
+		err = ptl_handle_connect_req(proto, epaddr,
+					     (struct ips_connect_reqrep *)hdr,
+					     paylen);
+		break;
+
+	case OPCODE_CONNECT_REPLY:
+		{
+			struct ips_connect_reqrep *req =
+			    (struct ips_connect_reqrep *)payload;
+
+			if (!ipsaddr || req->runid_key != proto->runid_key) {
+				_HFI_PRDBG
+				    ("Unknown connectrep (ipsaddr=%p, %d,%d) from epid %d:%d:%d\n",
+				     ipsaddr, req->runid_key, proto->runid_key,
+				     (int)PSMI_EPID_GET_LID(hdr->epid),
+				     (int)PSMI_EPID_GET_CONTEXT(hdr->epid),
+				     (int)PSMI_EPID_GET_SUBCONTEXT(hdr->epid));
+			} else if (ipsaddr->cstate_outgoing != CSTATE_OUTGOING_WAITING) {
+				/* possible dupe */
+				_HFI_VDBG("connect dupe, expected %d got %d\n",
+					  CSTATE_OUTGOING_WAITING,
+					  ipsaddr->cstate_outgoing);
+			} else {
+				/* Reply to our request for connection (i.e. outgoing connection) */
+				if (ipsaddr->cstate_incoming != CSTATE_ESTABLISHED) {
+					err =
+					    ips_ipsaddr_set_req_params(proto,
+								       ipsaddr,
+								       req,
+								       paylen);
+					if (err)
+						goto fail;
+				}
+				ipsaddr->cstate_outgoing = CSTATE_ESTABLISHED;
+				ipsaddr->cerror_outgoing = req->connect_result;
+			}
+		}
+		break;
+
+	case OPCODE_DISCONNECT_REQUEST:
+		{
+			ips_epaddr_t ipsaddr_f;	/* fake a ptl addr */
+			int epaddr_do_free = 0;
+			psmi_assert_always(paylen ==
+					   sizeof(struct ips_connect_hdr));
+			_HFI_VDBG("Got a disconnect from %s\n",
+				  psmi_epaddr_get_name(hdr->epid));
+			proto->num_disconnect_requests++;
+			/* It's possible to get a disconnection request on an ipsaddr that
+			 * we've since removed if the request is a dupe. Instead of
+			 * silently dropping the packet, we "echo" the request in the
+			 * reply. */
+			if (ipsaddr == NULL) {
+				ips_path_grp_t *pathgrp;
+				uint16_t lid;
+
+				ipsaddr = &ipsaddr_f;
+				memset(&ipsaddr_f, 0, sizeof(ips_epaddr_t));
+				ipsaddr_f.context =
+				    PSMI_EPID_GET_CONTEXT(hdr->epid);
+				ipsaddr_f.subcontext =
+				    PSMI_EPID_GET_SUBCONTEXT(hdr->epid);
+
+				/* Get path record for peer */
+				lid = PSMI_EPID_GET_LID(hdr->epid);
+				err = proto->ibta.get_path_rec(proto,
+							       proto->epinfo.
+							       ep_base_lid,
+							       __cpu_to_be16(lid),
+							       PSMI_HFI_TYPE_OPA1,
+							       3000, &pathgrp);
+				if (err != PSM2_OK)
+					goto fail;
+
+				ipsaddr_f.pathgrp = pathgrp;
+				((psm2_epaddr_t) &ipsaddr_f)->ptlctl =
+				    ((struct ptl_ips *)(proto->ptl))->ctl;
+				((psm2_epaddr_t) &ipsaddr_f)->proto = proto;
+				/* If the send fails because of pio_busy, don't let ips queue
+				 * the request on an invalid ipsaddr; just drop the reply */
+				ipsaddr_f.ctrl_msg_queued = ~0;
+
+				psmi_assert_always(proto->msgflowid < EP_FLOW_LAST);
+
+				ips_flow_init(&ipsaddr_f.
+ flows[proto->msgflowid], proto, + &ipsaddr_f, PSM_TRANSFER_PIO, + PSM_PROTOCOL_GO_BACK_N, + IPS_PATH_LOW_PRIORITY, + EP_FLOW_GO_BACK_N_PIO); + _HFI_VDBG + ("Disconnect on unknown epaddr, just echo request\n"); + } else if (ipsaddr->cstate_incoming != CSTATE_NONE) { + ipsaddr->cstate_incoming = CSTATE_NONE; + proto->num_connected_incoming--; + if (ipsaddr->cstate_outgoing == CSTATE_NONE) { + epaddr_do_free = 1; + } + } + + psmi_assert_always(proto->msgflowid < EP_FLOW_LAST); + + ips_proto_send_ctrl_message_reply(proto, &ipsaddr-> + flows[proto-> + msgflowid], + OPCODE_DISCONNECT_REPLY, + &ipsaddr-> + ctrl_msg_queued); + /* We can safely free the ipsaddr if required since disconnect + * messages are never enqueued so no reference to ipsaddr is kept */ + if (epaddr_do_free) { + ips_free_epaddr(epaddr, proto); + epaddr = NULL; + } + } + break; + + case OPCODE_DISCONNECT_REPLY: + if (!ipsaddr) { + _HFI_VDBG + ("Unknown disconnect reply from epid %d:%d.%d\n", + (int)PSMI_EPID_GET_LID(hdr->epid), + (int)PSMI_EPID_GET_CONTEXT(hdr->epid), + (int)PSMI_EPID_GET_SUBCONTEXT(hdr->epid)); + break; + } else if (ipsaddr->cstate_outgoing == CSTATE_OUTGOING_WAITING_DISC) { + ipsaddr->cstate_outgoing = CSTATE_OUTGOING_DISCONNECTED; + /* Freed in disconnect() if cstate_incoming == NONE */ + } /* else dupe reply */ + break; + + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Unexpected/unhandled connect opcode 0x%x\n", + opcode); + } + +fail: + return err; +} + +static +psm2_error_t +ptl_handle_connect_req(struct ips_proto *proto, psm2_epaddr_t epaddr, + struct ips_connect_reqrep *req, uint32_t paylen) +{ + ips_epaddr_t *ipsaddr; + psm2_error_t err = PSM2_OK; + uint16_t connect_result; + int newconnect = 0; + + if (req->epid == proto->ep->epid) { + psmi_handle_error(PSMI_EP_NORETURN, PSM2_EPID_NETWORK_ERROR, + "Network connectivity problem: Locally detected duplicate " + "LIDs 0x%04x on hosts %s and %s. (Exiting)", + (uint32_t) psm2_epid_nid(req->epid), + psmi_epaddr_get_hostname(req->epid), + psmi_gethostname()); + /* XXX no return */ + abort(); + } else if (epaddr == NULL) { /* new ep connect before we call into connect */ + newconnect = 1; + if ((epaddr = + ips_alloc_epaddr(proto, 1, req->epid, req->hostname, + PSMI_HFI_TYPE_OPA1, + 5000)) == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + } else if (((ips_epaddr_t *) epaddr)->cstate_incoming == CSTATE_ESTABLISHED) { + ipsaddr = (ips_epaddr_t *) epaddr; + /* Duplicate lid detection. */ + if (ipsaddr->runid_key == req->runid_key) + goto do_reply; /* duplicate request, not duplicate lid */ + else { /* Some out of context message. Just drop it */ + if (!proto->done_warning) { + psmi_syslog(proto->ep, 1, LOG_INFO, + "Non-fatal connection problem: Received an out-of-context " + "connection message from host %s LID=0x%x context=%d. 
(Ignoring)", + req->hostname, + (int)psm2_epid_nid(req->epid), + psm2_epid_context(req->epid)); + proto->done_warning = 1; + } + goto no_reply; + } + } else if (((ips_epaddr_t *) epaddr)->cstate_outgoing == CSTATE_NONE) { + /* pre-created epaddr in multi-rail */ + psmi_assert_always(epaddr->proto->ep != + epaddr->proto->ep->mctxt_master); + newconnect = 1; + } + + ipsaddr = (ips_epaddr_t *) epaddr; + psmi_assert_always(ipsaddr->cstate_incoming == CSTATE_NONE); + + /* Check connect version and psm version */ + if (req->connect_verno < 0x0001) { + psmi_handle_error(PSMI_EP_NORETURN, PSM2_EPID_INVALID_VERSION, + "Connect protocol (%x,%x) is obsolete and incompatible", + (req->connect_verno >> 8) & 0xff, + req->connect_verno & 0xff); + connect_result = PSM2_EPID_INVALID_CONNECT; + } else if (!psmi_verno_isinteroperable(req->psm_verno)) { + connect_result = PSM2_EPID_INVALID_VERSION; + } else if (!(proto->flags & IPS_PROTO_FLAG_QUERY_PATH_REC) && + proto->epinfo.ep_pkey != HFI_DEFAULT_P_KEY && + proto->epinfo.ep_pkey != req->job_pkey) { + connect_result = PSM2_EPID_INVALID_PKEY; + } else if (req->sl != proto->epinfo.ep_sl) { + connect_result = PSM2_EPID_INVALID_CONNECT; + _HFI_ERROR("Connection error: Service Level mismatch (local:%d, remote:%d)\n", proto->epinfo.ep_sl, req->sl); + } else { + connect_result = PSM2_OK; + if (ipsaddr->cstate_outgoing == CSTATE_NONE) { + ips_epstate_idx idx; + psmi_assert_always(newconnect == 1); + err = ips_epstate_add(proto->epstate, ipsaddr, &idx); + if (err) + goto fail; + ipsaddr->connidx_incoming = idx; + } + } + + /* Incoming connection request */ + if (ipsaddr->cstate_outgoing != CSTATE_ESTABLISHED) { + err = ips_ipsaddr_set_req_params(proto, ipsaddr, req, paylen); + if (err) + goto fail; + } + ipsaddr->cstate_incoming = CSTATE_ESTABLISHED; + ipsaddr->cerror_incoming = connect_result; + + ipsaddr->runid_key = req->runid_key; + + proto->num_connected_incoming++; + +do_reply: + psmi_assert_always(proto->msgflowid < EP_FLOW_LAST); + ips_proto_send_ctrl_message_reply(proto, + &ipsaddr->flows[proto->msgflowid], + OPCODE_CONNECT_REPLY, + &ipsaddr->ctrl_msg_queued); +no_reply: +fail: + return err; +} + +psm2_error_t +ips_proto_connect(struct ips_proto *proto, int numep, + const psm2_epid_t *array_of_epid, + const int *array_of_epid_mask, psm2_error_t *array_of_errors, + psm2_epaddr_t *array_of_epaddr, uint64_t timeout_in) +{ + int i, n, n_first; + psm2_error_t err = PSM2_OK; + psm2_epaddr_t epaddr; + ips_epaddr_t *ipsaddr; + ips_epstate_idx idx; + int numep_toconnect = 0, numep_left; + union psmi_envvar_val credits_intval; + int connect_credits; + + psmi_getenv("PSM2_CONNECT_CREDITS", + "End-point connect request credits.", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)100, &credits_intval); + + connect_credits = credits_intval.e_uint; + + PSMI_LOCK_ASSERT(proto->mq->progress_lock); + + /* All timeout values are in cycles */ + uint64_t t_start = get_cycles(); + /* Print a timeout at the warning interval */ + union psmi_envvar_val warn_intval; + uint64_t to_warning_interval; + uint64_t to_warning_next; + + /* Setup warning interval */ + psmi_getenv("PSM2_CONNECT_WARN_INTERVAL", + "Period in seconds to warn if connections are not completed." 
+ "Default is 300 seconds, 0 to disable", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)300, &warn_intval); + + to_warning_interval = nanosecs_to_cycles(warn_intval.e_uint * SEC_ULL); + to_warning_next = t_start + to_warning_interval; + + /* Some sanity checks */ + psmi_assert_always(array_of_epid_mask != NULL); + + /* First pass: make sure array of errors is at least fully defined */ + for (i = 0; i < numep; i++) { + _HFI_VDBG("epid-connect=%s connect to %d:%d:%d\n", + array_of_epid_mask[i] ? "YES" : " NO", + (int)PSMI_EPID_GET_LID(array_of_epid[i]), + (int)PSMI_EPID_GET_CONTEXT(array_of_epid[i]), + (int)PSMI_EPID_GET_SUBCONTEXT(array_of_epid[i])); + if (array_of_epid_mask[i]) { + array_of_errors[i] = PSM2_EPID_UNKNOWN; + array_of_epaddr[i] = NULL; + } + } + + /* Second pass: see what to connect and what is connectable. */ + for (i = 0, numep_toconnect = 0; i < numep; i++) { + if (!array_of_epid_mask[i]) + continue; + + /* Can't send to epid on same lid if not loopback */ + if ((psm2_epid_nid(proto->ep->epid) == + psm2_epid_nid(array_of_epid[i])) && + !(proto->flags & IPS_PROTO_FLAG_LOOPBACK)) { + array_of_errors[i] = PSM2_EPID_UNREACHABLE; + continue; + } + + if ((PSMI_EPID_VERSION == PSMI_EPID_V2) + && (PSMI_GET_SUBNET_ID(proto->ep->gid_hi) != + PSMI_EPID_GET_SUBNET_ID(array_of_epid[i]))) { + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + " Trying to connect to a HFI (subnet id - %"PRIu64")on a" + " different subnet - %"PRIu64" \n", + PSMI_GET_SUBNET_ID(proto->ep->gid_hi), + PSMI_EPID_GET_SUBNET_ID(array_of_epid[i])); + } + + epaddr = psmi_epid_lookup(proto->ep, array_of_epid[i]); + if (epaddr == NULL) { + /* We're sending a connect request message before some other node + * has sent its connect message */ + epaddr = ips_alloc_epaddr(proto, 1, array_of_epid[i], + NULL, + PSMI_HFI_TYPE_OPA1, + (timeout_in / 1000000UL)); + if (epaddr == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + + ipsaddr = (ips_epaddr_t *) epaddr; + err = ips_epstate_add(proto->epstate, ipsaddr, &idx); + if (err) + goto fail; + ipsaddr->connidx_incoming = idx; + } else if (((ips_epaddr_t *) epaddr)->cstate_outgoing != CSTATE_NONE) { /* already connected */ + psmi_assert_always(((ips_epaddr_t *) epaddr)-> + cstate_outgoing == CSTATE_ESTABLISHED); + array_of_errors[i] = PSM2_EPID_ALREADY_CONNECTED; + array_of_epaddr[i] = epaddr; + continue; + } else if (((ips_epaddr_t *) epaddr)->cstate_incoming == + CSTATE_NONE) { + /* pre-created epaddr in multi-rail */ + psmi_assert_always(epaddr->proto->ep != + epaddr->proto->ep->mctxt_master); + ipsaddr = (ips_epaddr_t *) epaddr; + err = ips_epstate_add(proto->epstate, ipsaddr, &idx); + if (err) + goto fail; + ipsaddr->connidx_incoming = idx; + } else { + /* We've already received a connect request message from a remote + * peer, it's time to send our own. */ + ipsaddr = (ips_epaddr_t *) epaddr; + /* No re-entrancy sanity check and makes sure we are not connected + * twice (caller's precondition) */ + psmi_assert(ipsaddr->cstate_outgoing == CSTATE_NONE); + psmi_assert(ipsaddr->cstate_incoming != CSTATE_NONE); + } + + ipsaddr->cstate_outgoing = CSTATE_OUTGOING_WAITING; + ipsaddr->cerror_outgoing = PSM2_OK; + array_of_epaddr[i] = epaddr; + ipsaddr->s_timeout = get_cycles(); + ipsaddr->delay_in_ms = 1; + ipsaddr->credit = 0; + numep_toconnect++; + } + + /* Second pass: do the actual connect. + * PSM2_EPID_UNKNOWN: Not connected yet. + * PSM2_EPID_UNREACHABLE: Not to be connected. + * PSM2_OK: Successfully connected. 
+ * Start sending connect messages at a random index between 0 and numep-1 + */ + numep_left = numep_toconnect; + n_first = ((uint32_t) get_cycles()) % numep; + while (numep_left > 0) { + for (n = 0; n < numep; n++) { + int keep_polling = 1; + i = (n_first + n) % numep; + if (!array_of_epid_mask[i]) + continue; + switch (array_of_errors[i]) { + case PSM2_EPID_UNREACHABLE: + case PSM2_EPID_ALREADY_CONNECTED: + case PSM2_OK: + continue; + default: + break; + } + psmi_assert_always(array_of_epaddr[i] != NULL); + ipsaddr = (ips_epaddr_t *) array_of_epaddr[i]; + if (ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED) { + /* This is not the real error code, we only set OK here + * so we know to stop polling for the reply. The actual + * error is in ipsaddr->cerror_outgoing */ + array_of_errors[i] = PSM2_OK; + numep_left--; + connect_credits++; + ipsaddr->credit = 0; + continue; + } + while (keep_polling) { + if (!psmi_cycles_left(t_start, timeout_in)) { + err = PSM2_TIMEOUT; + goto err_timeout; + } + if (to_warning_interval + && get_cycles() >= to_warning_next) { +#if _HFI_DEBUGGING + uint64_t waiting_time = 0; + if (_HFI_INFO_ON) { + waiting_time = cycles_to_nanosecs( + get_cycles() - + t_start) / SEC_ULL; + } +#endif + const char *first_name = NULL; + int num_waiting = 0; + + for (i = 0; i < numep; i++) { + if (!array_of_epid_mask[i] || + array_of_errors[i] != + PSM2_EPID_UNKNOWN) + continue; + if (!first_name) + first_name = + psmi_epaddr_get_name + (array_of_epid[i]); + num_waiting++; + } + if (_HFI_INFO_ON) { + if (first_name) { + _HFI_INFO_ALWAYS + ("Couldn't connect to %s (and %d others). " + "Time elapsed %02i:%02i:%02i. Still trying...\n", + first_name, num_waiting, + (int)(waiting_time / 3600), + (int)((waiting_time / 60) - + ((waiting_time / + 3600) * 60)), + (int)(waiting_time - + ((waiting_time / + 60) * 60))); + } + } + to_warning_next = + get_cycles() + to_warning_interval; + } + + if (get_cycles() > ipsaddr->s_timeout) { + if (!ipsaddr->credit && connect_credits) { + ipsaddr->credit = 1; + connect_credits--; + } + if (ipsaddr->credit) { + _HFI_VDBG + ("Connect req to %u:%u:%u\n", + __be16_to_cpu(ipsaddr-> + pathgrp->pg_base_dlid), + ipsaddr->context, + ipsaddr->subcontext); + psmi_assert_always(proto->msgflowid < EP_FLOW_LAST); + if ( + ips_proto_send_ctrl_message_request + (proto, &ipsaddr-> + flows[proto->msgflowid], + OPCODE_CONNECT_REQUEST, + &ipsaddr->ctrl_msg_queued, + 0) == PSM2_OK) { + keep_polling = 0; + ipsaddr->delay_in_ms = + min(100, + ipsaddr-> + delay_in_ms << + 1); + ipsaddr->s_timeout = + get_cycles() + + nanosecs_to_cycles + (ipsaddr-> + delay_in_ms * + MSEC_ULL); + } + /* If not, send got "busy", keep trying */ + } else { + keep_polling = 0; + } + } + + if ((err = + psmi_err_only(psmi_poll_internal + (proto->ep, 1)))) + goto fail; + + if (ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED) { + /* This is not the real error code, we only set OK here + * so we know to stop polling for the reply. 
The actual + * error is in ipsaddr->cerror_outgoing */ + array_of_errors[i] = PSM2_OK; + numep_left--; + connect_credits++; + ipsaddr->credit = 0; + break; + } + } + } + } + +err_timeout: + /* Find the worst error to report */ + for (i = 0; i < numep; i++) { + if (!array_of_epid_mask[i]) + continue; + switch (array_of_errors[i]) { + /* These are benign */ + case PSM2_EPID_UNREACHABLE: + case PSM2_EPID_ALREADY_CONNECTED: + break; + case PSM2_EPID_UNKNOWN: + array_of_errors[i] = PSM2_TIMEOUT; + err = psmi_error_cmp(err, PSM2_TIMEOUT); + break; + case PSM2_OK: + /* Restore the real connect error */ + ipsaddr = (ips_epaddr_t *) array_of_epaddr[i]; + array_of_errors[i] = ipsaddr->cerror_outgoing; + psmi_assert_always(ipsaddr->cstate_outgoing == + CSTATE_ESTABLISHED); + if (ipsaddr->cerror_outgoing != PSM2_OK) { + err = psmi_error_cmp(err, ipsaddr->cerror_outgoing); + ips_free_epaddr(array_of_epaddr[i], proto); + array_of_epaddr[i] = NULL; + } else { + proto->num_connected_outgoing++; + psmi_assert_always(ipsaddr->pathgrp-> + pg_path[0] + [IPS_PATH_HIGH_PRIORITY]-> + pr_mtu > 0); + } + break; + default: + break; + } + } + +fail: + return err; +} + +/* Repercussions on MQ. + * + * If num_connected==0, everything that exists in the posted queue should + * complete and the error must be marked epid_was_closed. + * + */ + +psm2_error_t +ips_proto_disconnect(struct ips_proto *proto, int force, int numep, + psm2_epaddr_t array_of_epaddr[], + const int array_of_epaddr_mask[], + psm2_error_t array_of_errors[], uint64_t timeout_in) +{ + ips_epaddr_t *ipsaddr; + int numep_left, numep_todisc, i, n; + int n_first; + int has_pending; + uint64_t timeout; + psm2_error_t err = PSM2_OK; + uint64_t reqs_sent = 0; + union psmi_envvar_val credits_intval; + int disconnect_credits; + uint64_t t_warning, t_start; + union psmi_envvar_val warn_intval; + unsigned warning_secs; + + /* In case of a forced close, we cancel whatever timers are pending + * on the proto so that we don't have zombie timers coming back + * after the internal structures of PSM2 have been destroyed + */ + if (force) { + struct psmi_timer *t_cursor; + TAILQ_FOREACH(t_cursor, &proto->timerq->timerq, timer) { + psmi_timer_cancel(proto->timerq, t_cursor); + } + } + + psmi_assert_always(numep > 0); + + psmi_getenv("PSM2_DISCONNECT_CREDITS", + "End-point disconnect request credits.", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)100, &credits_intval); + + disconnect_credits = credits_intval.e_uint; + + /* Setup warning interval */ + psmi_getenv("PSM2_DISCONNECT_WARN_INTERVAL", + "Period in seconds to warn if disconnections are not completed." 
+ "Default is 300 seconds, 0 to disable.", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)300, &warn_intval); + + warning_secs = warn_intval.e_uint; + + PSMI_LOCK_ASSERT(proto->mq->progress_lock); + + /* First pass: see what to disconnect and what is disconnectable */ + for (i = 0, numep_todisc = 0; i < numep; i++) { + if (!array_of_epaddr_mask[i]) + continue; + psmi_assert_always(array_of_epaddr[i]->ptlctl->ptl == + proto->ptl); + ipsaddr = (ips_epaddr_t *) array_of_epaddr[i]; + ipsaddr->credit = 0; + if (ipsaddr->cstate_outgoing == CSTATE_NONE) { + array_of_errors[i] = PSM2_OK; + continue; + } else { + psmi_assert_always(ipsaddr->cstate_outgoing == + CSTATE_ESTABLISHED); + } + _HFI_VDBG("disconnecting %p\n", array_of_epaddr[i]); + array_of_errors[i] = PSM2_EPID_UNKNOWN; + numep_todisc++; + } + if (numep_todisc == 0) + goto success; + + /* Wait for everyone to ack previous packets before putting */ + if (timeout_in == 0) + timeout = ~0ULL; + else + timeout = get_cycles() + nanosecs_to_cycles(timeout_in); + + t_start = get_cycles(); + t_warning = t_start + nanosecs_to_cycles(warning_secs * SEC_ULL); + + n_first = ((uint32_t) get_cycles()) % numep; + if (!force) { + numep_left = numep_todisc; + do { + for (n = 0; n < numep; n++) { + i = (n_first + n) % numep; + if (!array_of_epaddr_mask[i] + || array_of_errors[i] == PSM2_OK) + continue; + ipsaddr = (ips_epaddr_t *) array_of_epaddr[i]; + switch (ipsaddr->cstate_outgoing) { + case CSTATE_OUTGOING_DISCONNECTED: + array_of_errors[i] = PSM2_OK; + numep_left--; + disconnect_credits++; + ipsaddr->credit = 0; + continue; + case CSTATE_OUTGOING_WAITING_DISC: + if (ipsaddr->s_timeout > get_cycles()) + continue; + ipsaddr->delay_in_ms = + min(100, ipsaddr->delay_in_ms << 1); + ipsaddr->s_timeout = get_cycles() + + nanosecs_to_cycles(ipsaddr-> + delay_in_ms * + MSEC_ULL); + psmi_assert_always(proto->msgflowid < EP_FLOW_LAST); + ips_proto_send_ctrl_message_request + (proto, + &ipsaddr->flows[proto->msgflowid], + OPCODE_DISCONNECT_REQUEST, + &ipsaddr->ctrl_msg_queued, + timeout); + reqs_sent++; + break; + case CSTATE_ESTABLISHED: + /* Still pending acks, hold off for now */ + has_pending = + !STAILQ_EMPTY(&ipsaddr->flows + [EP_FLOW_GO_BACK_N_PIO]. + scb_unacked) + || + !STAILQ_EMPTY(&ipsaddr->flows + [EP_FLOW_GO_BACK_N_DMA]. 
+ scb_unacked); + if (has_pending) + continue; + if (!ipsaddr->credit + && disconnect_credits) { + ipsaddr->credit = 1; + disconnect_credits--; + } + if (!ipsaddr->credit) + continue; + ipsaddr->delay_in_ms = 1; + ipsaddr->cstate_outgoing = + CSTATE_OUTGOING_WAITING_DISC; + ipsaddr->s_timeout = + get_cycles() + + nanosecs_to_cycles(MSEC_ULL); + psmi_assert_always(proto->msgflowid < EP_FLOW_LAST); + ips_proto_send_ctrl_message_request + (proto, + &ipsaddr->flows[proto->msgflowid], + OPCODE_DISCONNECT_REQUEST, + &ipsaddr->ctrl_msg_queued, + timeout); + reqs_sent++; + break; + default: + psmi_handle_error(PSMI_EP_NORETURN, + PSM2_INTERNAL_ERR, + "Unhandled/unknown close state %d", + ipsaddr->cstate_outgoing); + break; + } + } + if (numep_left == 0) + break; + + if ((err = + psmi_err_only(psmi_poll_internal(proto->ep, 1)))) + goto fail; + + if (warning_secs && get_cycles() > t_warning) { + _HFI_INFO + ("graceful close in progress for %d/%d peers " + "(elapsed=%d millisecs,timeout=%d millisecs,reqs=%lld)\n", + numep_left, numep_todisc, + (int)(cycles_to_nanosecs + (get_cycles() - t_start) / MSEC_ULL), + (int)(timeout_in / MSEC_ULL), + (unsigned long long)reqs_sent); + t_warning = + get_cycles() + + nanosecs_to_cycles(warning_secs * SEC_ULL); + } + } + while (timeout > get_cycles()); + + if (numep_left > 0) { + err = PSM2_TIMEOUT; + for (i = 0; i < numep; i++) { + if (!array_of_epaddr_mask[i]) + continue; + if (array_of_errors[i] == PSM2_EPID_UNKNOWN) { + array_of_errors[i] = PSM2_TIMEOUT; + _HFI_VDBG + ("disc timeout on index %d, epaddr %s\n", + i, + psmi_epaddr_get_name + (array_of_epaddr[i]->epid)); + } + } + _HFI_PRDBG("graceful close incomplete for %d/%d peers " + "(elapsed=%d millisecs,timeout=%d millisecs,reqs=%lld)\n", + numep_left, numep_todisc, + (int)(cycles_to_nanosecs + (get_cycles() - t_start) / MSEC_ULL), + (int)(timeout_in / MSEC_ULL), + (unsigned long long)reqs_sent); + } else + _HFI_PRDBG + ("graceful close complete from %d peers in %d millisecs, reqs_sent=%lld\n", + numep_todisc, + (int)(cycles_to_nanosecs(get_cycles() - t_start) / + MSEC_ULL), (unsigned long long)reqs_sent); + } else { + psmi_assert_always(proto->msgflowid < EP_FLOW_LAST); + for (n = 0; n < numep; n++) { + i = (n_first + n) % numep; + if (!array_of_epaddr_mask[i]) + continue; + ipsaddr = (ips_epaddr_t *) array_of_epaddr[i]; + psmi_assert_always(ipsaddr->cstate_outgoing == + CSTATE_ESTABLISHED); + ips_proto_send_ctrl_message_request(proto, &ipsaddr-> + flows[proto->msgflowid], + OPCODE_DISCONNECT_REQUEST, + &ipsaddr->ctrl_msg_queued, + 0); + /* Force state to DISCONNECTED */ + ipsaddr->cstate_outgoing = CSTATE_OUTGOING_DISCONNECTED; + array_of_errors[i] = PSM2_OK; + } + _HFI_VDBG("non-graceful close complete from %d peers\n", numep); + } + + for (i = 0; i < numep; i++) { + if (!array_of_epaddr_mask[i] || array_of_errors[i] != PSM2_OK) + continue; + ipsaddr = (ips_epaddr_t *) array_of_epaddr[i]; + if (ipsaddr->cstate_outgoing == CSTATE_NONE) + continue; + psmi_assert_always(ipsaddr->cstate_outgoing == + CSTATE_OUTGOING_DISCONNECTED); + proto->num_connected_outgoing--; + /* Remote disconnect req arrived already, remove this epid. If it + * hasn't arrived yet, that's okay, we'll pick it up later and just + * mark our connect-to status as being "none". 
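+		 * (Editor's note, not in the original source: the epaddr is
+		 * freed here only when both directions are down, i.e. the
+		 * peer's disconnect request already set cstate_incoming to
+		 * CSTATE_NONE; otherwise only the outgoing side is reset and
+		 * the peer's later disconnect request triggers the free.)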
*/ + if (ipsaddr->cstate_incoming == CSTATE_NONE) { + ips_free_epaddr(array_of_epaddr[i], proto); + array_of_epaddr[i] = NULL; + } else + ipsaddr->cstate_outgoing = CSTATE_NONE; + } + +fail: +success: + return err; +} + +int ips_proto_isconnected(ips_epaddr_t *ipsaddr) +{ + if (ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED || + ipsaddr->cstate_incoming == CSTATE_ESTABLISHED) + return 1; + else + return 0; +} diff --git a/ptl_ips/ips_proto_dump.c b/ptl_ips/ips_proto_dump.c new file mode 100644 index 0000000..9abde00 --- /dev/null +++ b/ptl_ips/ips_proto_dump.c @@ -0,0 +1,256 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "ips_proto.h"
+#include "ips_expected_proto.h"
+#include "ips_proto_help.h"
+
+void ips_proto_dump_frame(void *frame, int length, char *message)
+{
+	uint8_t *raw_frame = frame;
+	int counter;
+	char default_message[] = "";
+
+	if (!message)
+		message = default_message;
+
+	printf("\nHex dump of %i bytes at %p from %s\n", length, frame,
+	       message);
+
+	for (counter = 0; counter < length; counter++) {
+		if ((counter % 16) == 0)
+			printf("\n");
+
+		if ((counter % 4) == 0)
+			printf("    ");
+
+		printf("%02X ", raw_frame[counter]);
+	}
+	printf("\n");
+}
+
+void ips_proto_dump_data(void *data, int data_length)
+{
+	int counter;
+	uint8_t *payload = (uint8_t *) data;
+
+	printf("\nHex dump of data, length = %i\n", data_length);
+
+	for (counter = 0; counter < data_length; counter++) {
+		if ((counter % 16) == 0)
+			printf("\n %04d: ", counter);
+
+		if ((counter % 4) == 0)
+			printf("    ");
+
+		printf("%02X ", payload[counter]);
+	}
+	printf("\n");
+}
+
+void ips_proto_show_header(struct ips_message_header *p_hdr, char *msg)
+{
+	psmi_seqnum_t ack_seq_num;
+
+	printf("\nHeader decoding in hex: %s\n", msg ? msg : "");
+
+	printf("LRH: VL4-LVer4-SL4-Res2-LNH2: %x\n",
+	       __be16_to_cpu(p_hdr->lrh[0]));
+	printf("LRH: DLID %x\n", __be16_to_cpu(p_hdr->lrh[1]));
+	printf("LRH: Res4-PktLen12 %x\n", __be16_to_cpu(p_hdr->lrh[2]));
+	printf("LRH: SLID %x\n", __be16_to_cpu(p_hdr->lrh[3]));
+
+	printf("BTH: OpCode8-SE1-M1-PC2-TVer4-Pkey16 %x\n",
+	       __be32_to_cpu(p_hdr->bth[0]));
+	printf("BTH: F1-B1-Res6-DestQP24 %x\n", __be32_to_cpu(p_hdr->bth[1]));
+	printf("BTH: A1-PSN31 %x\n", __be32_to_cpu(p_hdr->bth[2]));
+
+	printf("IPH: jkey-hcrc %x\n", __le32_to_cpu(p_hdr->khdr.kdeth1));
+	printf("IPH: kver-sh-intr-tidctrl-tid-om-offset %x\n",
+	       __le32_to_cpu(p_hdr->khdr.kdeth0));
+
+	printf("opcode %x\n", _get_proto_hfi_opcode(p_hdr));
+
+	ack_seq_num.psn_num = p_hdr->ack_seq_num;
+	if (GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0)))
+		printf("TidFlow Flow: %x, Gen: %x, Seq: %x\n",
+		       (__be32_to_cpu(p_hdr->bth[1]) >>
+			HFI_BTH_FLOWID_SHIFT) & HFI_BTH_FLOWID_MASK,
+		       (__be32_to_cpu(p_hdr->bth[2]) >>
+			HFI_BTH_GEN_SHIFT) & HFI_BTH_GEN_MASK,
+		       (__be32_to_cpu(p_hdr->bth[2]) >>
+			HFI_BTH_SEQ_SHIFT) & HFI_BTH_SEQ_MASK);
+	else if (ips_proto_flowid(p_hdr) == EP_FLOW_TIDFLOW)
+		printf("ack_seq_num gen %x, seq %x\n",
+		       ack_seq_num.psn_gen, ack_seq_num.psn_seq);
+	else
+		printf("ack_seq_num %x\n", ack_seq_num.psn_num);
+
+	printf("src_rank/connidx %x\n", p_hdr->connidx);
+	if (GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0)))
+		printf("tid_session_gen %d\n", p_hdr->exp_rdescid_genc);
+	printf("flags %x\n", p_hdr->flags);
+}
+
+/* linux doesn't have strlcat; this is a stripped down implementation */
+/* not super-efficient, but we use it rarely, and only for short strings */
+/* not fully standards conforming!
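+ * A minimal usage sketch (illustrative only, not part of the patch):
+ * truncation is applied against the full buffer size, e.g.
+ *
+ *   char buf[8] = "ab";
+ *   strlcat(buf, "cdefgh", sizeof(buf));  // buf is now "abcdefg"
+ *
+ * and, unlike the BSD strlcat, the value returned below is the
+ * truncated result length plus one rather than the length the
+ * caller tried to create.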
*/ +static size_t strlcat(char *d, const char *s, size_t l) +{ + int dlen = strlen(d), slen, max; + if (l <= dlen) /* bug */ + return l; + slen = strlen(s); + max = l - (dlen + 1); + if (slen > max) + slen = max; + memcpy(d + dlen, s, slen); + d[dlen + slen] = '\0'; + return dlen + slen + 1; /* standard says to return full length, not actual */ +} + +/* decode RHF errors; only used one place now, may want more later */ +void ips_proto_get_rhf_errstring(uint32_t err, char *msg, size_t len) +{ + *msg = '\0'; /* if no errors, and so don't need to check what's first */ + + if (err & PSMI_HAL_RHF_ERR_ICRC) + strlcat(msg, "icrcerr ", len); + if (err & PSMI_HAL_RHF_ERR_ECC) + strlcat(msg, "eccerr ", len); + if (err & PSMI_HAL_RHF_ERR_LEN) + strlcat(msg, "lenerr ", len); + if (err & PSMI_HAL_RHF_ERR_TID) + strlcat(msg, "tiderr ", len); + if (err & PSMI_HAL_RHF_ERR_DC) + strlcat(msg, "dcerr ", len); + if (err & PSMI_HAL_RHF_ERR_DCUN) + strlcat(msg, "dcuncerr ", len); + if (err & PSMI_HAL_RHF_ERR_KHDRLEN) + strlcat(msg, "khdrlenerr ", len); +} + +void ips_proto_dump_err_stats(struct ips_proto *proto) +{ + char err_stat_msg[2048]; + char tmp_buf[128]; + int len = sizeof(err_stat_msg); + + if (!(hfi_debug & __HFI_PKTDBG)) + return; + + *err_stat_msg = '\0'; + + if (proto->error_stats.num_icrc_err || + proto->error_stats.num_ecc_err || + proto->error_stats.num_len_err || + proto->error_stats.num_tid_err || + proto->error_stats.num_dc_err || + proto->error_stats.num_dcunc_err || + proto->error_stats.num_khdrlen_err) { + + snprintf(tmp_buf, sizeof(tmp_buf), "ERROR STATS: "); + + if (proto->error_stats.num_icrc_err) { + snprintf(tmp_buf, sizeof(tmp_buf), "ICRC: %" PRIu64 " ", + proto->error_stats.num_icrc_err); + strlcat(err_stat_msg, tmp_buf, len); + } + + if (proto->error_stats.num_ecc_err) { + snprintf(tmp_buf, sizeof(tmp_buf), "ECC: %" PRIu64 " ", + proto->error_stats.num_ecc_err); + strlcat(err_stat_msg, tmp_buf, len); + } + + if (proto->error_stats.num_len_err) { + snprintf(tmp_buf, sizeof(tmp_buf), "LEN: %" PRIu64 " ", + proto->error_stats.num_len_err); + strlcat(err_stat_msg, tmp_buf, len); + } + + if (proto->error_stats.num_tid_err) { + snprintf(tmp_buf, sizeof(tmp_buf), "TID: %" PRIu64 " ", + proto->error_stats.num_tid_err); + strlcat(err_stat_msg, tmp_buf, len); + } + + if (proto->error_stats.num_dc_err) { + snprintf(tmp_buf, sizeof(tmp_buf), "DC: %" PRIu64 " ", + proto->error_stats.num_dc_err); + strlcat(err_stat_msg, tmp_buf, len); + } + + if (proto->error_stats.num_dcunc_err) { + snprintf(tmp_buf, sizeof(tmp_buf), + "DCUNC: %" PRIu64 " ", + proto->error_stats.num_dcunc_err); + strlcat(err_stat_msg, tmp_buf, len); + } + + if (proto->error_stats.num_khdrlen_err) { + snprintf(tmp_buf, sizeof(tmp_buf), + "KHDRLEN: %" PRIu64 " ", + proto->error_stats.num_khdrlen_err); + strlcat(err_stat_msg, tmp_buf, len); + } + strlcat(err_stat_msg, "\n", len); + } else + strlcat(err_stat_msg, "No previous errors.\n", len); + + _HFI_ERROR("%s", err_stat_msg); +} diff --git a/ptl_ips/ips_proto_expected.c b/ptl_ips/ips_proto_expected.c new file mode 100644 index 0000000..1f507ed --- /dev/null +++ b/ptl_ips/ips_proto_expected.c @@ -0,0 +1,2880 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2016 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm2_hal.h" + +#include "ips_scb.h" +#include "ips_tid.h" +#include "ips_tidflow.h" +#include "ips_proto.h" +#include "ips_expected_proto.h" +#include "ips_proto_help.h" +#include "psm_mq_internal.h" + +/* + * Timer callbacks. When we need work to be done out of the receive process + * loop, we schedule work on timers to be done at a later time. 
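+ * As a sketch (this is the pattern used throughout this file): the
+ * receive path queues the work item and requests a timer, and the
+ * callback then runs on a later progress call, e.g.
+ *
+ *   STAILQ_INSERT_TAIL(&protoexp->pend_sendq, tidsendc, next);
+ *   psmi_timer_request(protoexp->timerq, &protoexp->timer_send,
+ *                      PSMI_TIMER_PRIO_1);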
+ */
+static psm2_error_t
+ips_tid_pendsend_timer_callback(struct psmi_timer *timer, uint64_t current);
+
+static psm2_error_t
+ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current);
+
+static void
+ips_protoexp_do_tf_seqerr(void *vpprotoexp
+			  /* actually: struct ips_protoexp *protoexp */,
+			  void *vptidrecvc
+			  /* actually: struct ips_tid_recv_desc *tidrecvc */,
+			  struct ips_message_header *p_hdr);
+static void
+ips_protoexp_do_tf_generr(void *vpprotoexp
+			  /* actually: struct ips_protoexp *protoexp */,
+			  void *vptidrecvc
+			  /* actually: struct ips_tid_recv_desc *tidrecvc */,
+			  struct ips_message_header *p_hdr);
+
+static void ips_tid_scbavail_callback(struct ips_scbctrl *scbc, void *context);
+static void ips_tid_avail_callback(struct ips_tid *tidc, void *context);
+static void ips_tidflow_avail_callback(struct ips_tf *tfc, void *context);
+
+/* Defined at the ptl-level (breaks abstractions but needed for shared vs
+ * non-shared contexts) */
+extern int ips_ptl_recvq_isempty(const struct ptl *ptl);
+
+static psm2_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc);
+static psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc);
+
+#ifdef PSM_CUDA
+static
+void psmi_cuda_run_prefetcher(struct ips_protoexp *protoexp,
+			      struct ips_tid_send_desc *tidsendc);
+static void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp,
+					psm2_mq_req_t req,
+					struct ips_tid_send_desc *tidsendc,
+					struct ips_cuda_hostbuf *chb_prev,
+					uint32_t tsess_srcoff,
+					uint32_t tsess_length,
+					uint32_t tsess_unaligned_start,
+					psm2_chb_match_type_t type);
+#endif
+
+psm2_error_t
+MOCKABLE(ips_protoexp_init)(const psmi_context_t *context,
+			    const struct ips_proto *proto,
+			    uint32_t protoexp_flags,
+			    int num_of_send_bufs,
+			    int num_of_send_desc, struct ips_protoexp **protoexp_o)
+{
+	struct ips_protoexp *protoexp = NULL;
+	uint32_t tidmtu_max;
+	psm2_error_t err = PSM2_OK;
+
+	protoexp = (struct ips_protoexp *)
+	    psmi_calloc(context->ep, UNDEFINED, 1, sizeof(struct ips_protoexp));
+	if (protoexp == NULL) {
+		err = PSM2_NO_MEMORY;
+		goto fail;
+	}
+	*protoexp_o = protoexp;
+
+	protoexp->ptl = (const struct ptl *)proto->ptl;
+	protoexp->proto = (struct ips_proto *)proto;
+	protoexp->timerq = proto->timerq;
+	srand48_r((long int) getpid(), &protoexp->tidflow_drand48_data);
+	protoexp->tid_flags = protoexp_flags;
+
+	if (context->ep->memmode == PSMI_MEMMODE_MINIMAL) {
+		protoexp->tid_flags |= IPS_PROTOEXP_FLAG_CTS_SERIALIZED;
+	}
+
+	{
+		/*
+		 * Adjust the session window size so that the tid-grant message
+		 * can fit into a single frag-size packet for a single transfer;
+		 * PSM must send the tid-grant message as a single packet.
+		 */
+		uint32_t fragsize, winsize;
+
+		if (proto->flags & IPS_PROTO_FLAG_SDMA)
+			fragsize = proto->epinfo.ep_mtu;
+		else
+			fragsize = proto->epinfo.ep_piosize;
+
+		winsize = 2 * PSMI_PAGESIZE	/* bytes per tid-pair */
+		    /* space in packet */
+		    * min((fragsize - sizeof(ips_tid_session_list)),
+			  /* space in tidsendc/tidrecvc descriptor */
+			  PSM_TIDLIST_BUFSIZE)
+		    / sizeof(uint32_t);	/* convert to tid-pair */
+
+		if (proto->mq->hfi_base_window_rv > winsize)
+			proto->mq->hfi_base_window_rv = winsize;
+	}
+
+	/* Must be initialized already */
+	/* Comment out because of Klocwork scanning critical error.
CQ 11/16/2012
+	   psmi_assert_always(proto->ep != NULL && proto->ep->mq != NULL &&
+	   proto->ep->mq->rreq_pool != NULL &&
+	   proto->ep->mq->sreq_pool != NULL);
+	 */
+	psmi_assert_always(proto->timerq != NULL);
+
+	/* These request pools are managed by the MQ component */
+	protoexp->tid_sreq_pool = proto->ep->mq->sreq_pool;
+	protoexp->tid_rreq_pool = proto->ep->mq->rreq_pool;
+
+	/* tid traffic xfer type */
+	if (proto->flags & IPS_PROTO_FLAG_SPIO)
+		protoexp->tid_xfer_type = PSM_TRANSFER_PIO;
+	else
+		protoexp->tid_xfer_type = PSM_TRANSFER_DMA;
+
+	/* ctrl ack/nak xfer type */
+	if (proto->flags & IPS_PROTO_FLAG_SDMA)
+		protoexp->ctrl_xfer_type = PSM_TRANSFER_DMA;
+	else
+		protoexp->ctrl_xfer_type = PSM_TRANSFER_PIO;
+
+	/* Initialize tid flow control. */
+	err = ips_tf_init(protoexp, context, &protoexp->tfc,
+			  ips_tidflow_avail_callback);
+	if (err != PSM2_OK)
+		goto fail;
+
+	if (proto->flags & IPS_PROTO_FLAG_SPIO)
+		tidmtu_max = proto->epinfo.ep_piosize;
+	else
+		tidmtu_max = proto->epinfo.ep_mtu;
+
+	protoexp->tid_send_fragsize = tidmtu_max;
+
+	if ((err = ips_tid_init(context, protoexp,
+				ips_tid_avail_callback, protoexp)))
+		goto fail;
+
+	if ((err = ips_scbctrl_init(context, num_of_send_desc, 0,
+				    0, 0, ips_tid_scbavail_callback,
+				    protoexp, &protoexp->tid_scbc_rv)))
+		goto fail;
+
+	{
+		/* Determine the interval to generate headers (relevant only when
+		 * header suppression is enabled; otherwise headers are always
+		 * generated).
+		 *
+		 * The PSM2_EXPECTED_HEADERS environment variable can specify the
+		 * packet interval to generate headers at. Otherwise a header packet
+		 * is generated every
+		 * min(PSM_DEFAULT_EXPECTED_HEADER, window_size/tid_send_fragsize).
+		 * Note: A header is always generated for the last packet in the flow.
+		 */
+
+		union psmi_envvar_val env_exp_hdr;
+		uint32_t defval = min(PSM_DEFAULT_EXPECTED_HEADER,
+				      proto->mq->hfi_base_window_rv /
+				      protoexp->tid_send_fragsize);
+
+		psmi_getenv("PSM2_EXPECTED_HEADERS",
+			    "Interval to generate expected protocol headers",
+			    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+			    (union psmi_envvar_val)defval, &env_exp_hdr);
+
+		protoexp->hdr_pkt_interval = env_exp_hdr.e_uint;
+		/* Account for flow credits - should try to have at least 4 headers
+		 * generated per window.
+		 */
+		protoexp->hdr_pkt_interval =
+		    max(min
+			(protoexp->hdr_pkt_interval, proto->flow_credits >> 2),
+			1);
+
+		if (protoexp->hdr_pkt_interval != env_exp_hdr.e_uint) {
+			_HFI_VDBG
+			    ("Overriding PSM2_EXPECTED_HEADERS=%u to be '%u'\n",
+			     env_exp_hdr.e_uint, protoexp->hdr_pkt_interval);
+		}
+
+	}
+
+	{
+		union psmi_envvar_val env_rts_cts_interleave;
+
+		psmi_getenv("PSM2_RTS_CTS_INTERLEAVE",
+			    "Interleave the handling of RTS to provide a fair distribution between multiple senders",
+			    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+			    (union psmi_envvar_val)0, &env_rts_cts_interleave);
+		if (env_rts_cts_interleave.e_uint)
+			protoexp->tid_flags |= IPS_PROTOEXP_FLAG_RTS_CTS_INTERLEAVE;
+	}
+
+	/* Send descriptors.
+	 *
+	 * There can be up to 2^32 of these send descriptors.  We conservatively
+	 * allocate 256 but large node configurations can allocate up to sdesc_num
+	 * of these (they are about 2k each).
+	 * We impose a theoretical limit of 2^30.
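+	 * Rough arithmetic (illustrative, using the numbers above): the
+	 * conservative default of 256 descriptors at ~2 KB each costs about
+	 * 512 KB per endpoint, while 2^20 descriptors would already cost
+	 * ~2 GB, which is why the practical pool size stays far below the
+	 * theoretical 2^30 ceiling.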
+ */ + { + struct psmi_rlimit_mpool rlim = TID_SENDSESSIONS_LIMITS; + uint32_t maxsz, chunksz; + + if ((err = psmi_parse_mpool_env(protoexp->proto->mq, 1, + &rlim, &maxsz, &chunksz))) + goto fail; + + protoexp->tid_desc_send_pool = + psmi_mpool_create(sizeof(struct ips_tid_send_desc), chunksz, + maxsz, 0, DESCRIPTORS, NULL, NULL); + + if (protoexp->tid_desc_send_pool == NULL) { + err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, + "Couldn't allocate tid descriptor memory pool"); + goto fail; + } + } + + /* Receive descriptors are an array in tidflow structure. */ + + /* This pool can never be smaller than the max number of rreqs that can be + * allocated. */ + { + uint32_t rreq_per_chunk, rreq_max; + + psmi_assert_always(protoexp->proto->mq->rreq_pool != NULL); + + psmi_mpool_get_obj_info(protoexp->proto->mq->rreq_pool, + &rreq_per_chunk, &rreq_max); + + protoexp->tid_getreq_pool = + psmi_mpool_create(sizeof(struct ips_tid_get_request), + rreq_per_chunk, rreq_max, 0, DESCRIPTORS, + NULL, NULL); + + if (protoexp->tid_getreq_pool == NULL) { + err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, + "Couldn't allocate getreq descriptor memory pool"); + goto fail; + } + } + + /* Timers to handle requeueing of work out of the receive path */ + psmi_timer_entry_init(&protoexp->timer_send, + ips_tid_pendsend_timer_callback, protoexp); + STAILQ_INIT(&protoexp->pend_sendq); + psmi_timer_entry_init(&protoexp->timer_getreqs, + ips_tid_pendtids_timer_callback, protoexp); + STAILQ_INIT(&protoexp->pend_getreqsq); + + protoexp->tid_page_offset_mask = PSMI_PAGESIZE - 1; + protoexp->tid_page_mask = ~(PSMI_PAGESIZE - 1); + + /* + * After ips_tid_init(), we know if we use tidcache or not. + * if tid cache is used, we can't use tid debug. + */ +#ifdef PSM_DEBUG + if (protoexp->tidc.tid_array == NULL) + protoexp->tid_flags |= IPS_PROTOEXP_FLAG_TID_DEBUG; +#endif + + if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) { + int i; + protoexp->tid_info = (struct ips_tidinfo *) + psmi_calloc(context->ep, UNDEFINED, IPS_TID_MAX_TIDS, + sizeof(struct ips_tidinfo)); + if (protoexp->tid_info == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + for (i = 0; i < IPS_TID_MAX_TIDS; i++) { + protoexp->tid_info[i].state = TIDSTATE_FREE; + protoexp->tid_info[i].tidrecvc = NULL; + protoexp->tid_info[i].tid = 0xFFFFFFFF; + } + } else + protoexp->tid_info = NULL; + +#ifdef PSM_CUDA + { + if (PSMI_IS_CUDA_ENABLED && + !(proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) { + struct psmi_rlimit_mpool rlim = CUDA_HOSTBUFFER_LIMITS; + uint32_t maxsz, chunksz, max_elements; + + if ((err = psmi_parse_mpool_env(protoexp->proto->mq, 1, + &rlim, &maxsz, &chunksz))) + goto fail; + + /* the maxsz is the amount in MB, not the number of entries, + * since the element size depends on the window size */ + max_elements = (maxsz*1024*1024) / proto->mq->hfi_base_window_rv; + /* mpool requires max_elements to be power of 2. round down. 
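+			 * For example (illustrative): max_elements = 300 has
+			 * its highest set bit at 2^8, so the expression
+			 * 1 << (31 - __builtin_clz(300)) rounds it down to 256.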
*/ + max_elements = 1 << (31 - __builtin_clz(max_elements)); + protoexp->cuda_hostbuf_recv_cfg.bufsz = + proto->mq->hfi_base_window_rv; + + protoexp->cuda_hostbuf_pool_recv = + psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf), + chunksz, max_elements, 0, + UNDEFINED, NULL, NULL, + psmi_cuda_hostbuf_alloc_func, + (void *) + &protoexp->cuda_hostbuf_recv_cfg); + + if (protoexp->cuda_hostbuf_pool_recv == NULL) { + err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, + "Couldn't allocate CUDA host receive buffer pool"); + goto fail; + } + + protoexp->cuda_hostbuf_small_recv_cfg.bufsz = + CUDA_SMALLHOSTBUF_SZ; + protoexp->cuda_hostbuf_pool_small_recv = + psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf), + chunksz, max_elements, 0, + UNDEFINED, NULL, NULL, + psmi_cuda_hostbuf_alloc_func, + (void *) + &protoexp->cuda_hostbuf_small_recv_cfg); + + if (protoexp->cuda_hostbuf_pool_small_recv == NULL) { + err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, + "Couldn't allocate CUDA host small receive buffer pool"); + goto fail; + } + + PSMI_CUDA_CALL(cuStreamCreate, + &protoexp->cudastream_recv, + CU_STREAM_NON_BLOCKING); + STAILQ_INIT(&protoexp->cudapend_getreqsq); + } else { + protoexp->cuda_hostbuf_pool_recv = NULL; + protoexp->cuda_hostbuf_pool_small_recv = NULL; + } + } +#endif + psmi_assert(err == PSM2_OK); + return err; + +fail: +#ifdef PSM_CUDA + if (protoexp != NULL && protoexp->cuda_hostbuf_pool_recv != NULL) + psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_recv); + if (protoexp != NULL && protoexp->cuda_hostbuf_pool_small_recv != NULL) + psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_small_recv); +#endif + if (protoexp != NULL && protoexp->tid_getreq_pool != NULL) + psmi_mpool_destroy(protoexp->tid_getreq_pool); + if (protoexp != NULL && protoexp->tid_desc_send_pool != NULL) + psmi_mpool_destroy(protoexp->tid_desc_send_pool); + if (protoexp != NULL) + ips_scbctrl_fini(&protoexp->tid_scbc_rv); + if (protoexp != NULL) + psmi_free(protoexp); + return err; +} +MOCK_DEF_EPILOGUE(ips_protoexp_init); + +psm2_error_t ips_protoexp_fini(struct ips_protoexp *protoexp) +{ + psm2_error_t err = PSM2_OK; + +#ifdef PSM_CUDA + if(PSMI_IS_CUDA_ENABLED && + !(protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) { + psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_small_recv); + psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_recv); + PSMI_CUDA_CALL(cuStreamDestroy, protoexp->cudastream_recv); + } +#endif + psmi_mpool_destroy(protoexp->tid_getreq_pool); + psmi_mpool_destroy(protoexp->tid_desc_send_pool); + + if ((err = ips_scbctrl_fini(&protoexp->tid_scbc_rv))) + goto fail; + + if ((err = ips_tid_fini(&protoexp->tidc))) + goto fail; + + if ((err = ips_tf_fini(&protoexp->tfc))) + goto fail; + + if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) + psmi_free(protoexp->tid_info); + + psmi_free(protoexp); + +fail: + return err; +} + +/* New scbs now available. If we have pending sends or pending get requests, + * turn on the timer so it can be processed. */ +static +void ips_tid_scbavail_callback(struct ips_scbctrl *scbc, void *context) +{ + struct ips_protoexp *protoexp = (struct ips_protoexp *)context; + + if (!STAILQ_EMPTY(&protoexp->pend_sendq)) + psmi_timer_request(protoexp->timerq, + &protoexp->timer_send, PSMI_TIMER_PRIO_1); + if (!STAILQ_EMPTY(&protoexp->pend_getreqsq)) + psmi_timer_request(protoexp->timerq, + &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); + return; +} + +/* New Tids are available. 
If there are pending get requests put the + * get timer on the timerq so it can be processed. */ +static +void ips_tid_avail_callback(struct ips_tid *tidc, void *context) +{ + struct ips_protoexp *protoexp = (struct ips_protoexp *)context; + + if (!STAILQ_EMPTY(&protoexp->pend_getreqsq)) + psmi_timer_request(protoexp->timerq, + &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); + return; +} + +/* New Tid Flows are available. If there are pending get requests put the + * get timer on the timerq so it can be processed. */ +static +void ips_tidflow_avail_callback(struct ips_tf *tfc, void *context) +{ + struct ips_protoexp *protoexp = (struct ips_protoexp *)context; + + if (!STAILQ_EMPTY(&protoexp->pend_getreqsq)) + { + psmi_timer_request(protoexp->timerq, + &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); + } + return; +} + +/* + * The tid get request is always issued from within the receive progress loop, + * which is why we always enqueue the request instead of issuing it directly. + * Eventually, if we expose tid_get to users, we will want to differentiate + * when the request comes from the receive progress loop from cases where the + * tid_get is issued directly from user code. + * + */ +psm2_error_t +ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp, + void *buf, + uint32_t length, + psm2_epaddr_t epaddr, + uint32_t remote_tok, + uint32_t flags, + ips_tid_completion_callback_t callback, + void *context) +{ + struct ips_tid_get_request *getreq; + int count, tids, tidflows; + uint64_t nbytes; + + PSM2_LOG_MSG("entering"); + psmi_assert((((ips_epaddr_t *) epaddr)->window_rv % PSMI_PAGESIZE) == 0); + getreq = (struct ips_tid_get_request *) + psmi_mpool_get(protoexp->tid_getreq_pool); + + /* We can't *really* run out of these here because we always allocate as + * much as available receive reqs */ + if_pf(getreq == NULL) + { + PSM2_LOG_MSG("leaving"); + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Ran out of 'getreq' descriptors"); + } + + getreq->tidgr_protoexp = protoexp; + getreq->tidgr_epaddr = epaddr; + getreq->tidgr_lbuf = buf; + getreq->tidgr_length = length; + getreq->tidgr_sendtoken = remote_tok; + getreq->tidgr_ucontext = context; + getreq->tidgr_callback = callback; + getreq->tidgr_offset = 0; + getreq->tidgr_bytesdone = 0; + getreq->tidgr_flags = flags; + +#ifdef PSM_CUDA + psm2_mq_req_t req = (psm2_mq_req_t)context; + if ((req->is_buf_gpu_mem && + !(protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) || + ((req->is_buf_gpu_mem && + (protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV) && + gpudirect_recv_threshold && + length > gpudirect_recv_threshold))) { + getreq->cuda_hostbuf_used = 1; + getreq->tidgr_cuda_bytesdone = 0; + STAILQ_INIT(&getreq->pend_cudabuf); + } else + getreq->cuda_hostbuf_used = 0; +#endif + + /* nbytes is the bytes each channel should transfer. 
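+	 * For instance (hypothetical numbers): length = 1 MB striped over
+	 * count = 3 ipsaddrs gives nbytes = PSMI_ALIGNUP(349526, PSMI_PAGESIZE)
+	 * = 352256 bytes per channel, before the window-size clamp below.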
*/ + count = ((ips_epaddr_t *) epaddr)->msgctl->ipsaddr_count; +#ifdef PSM_CUDA + if (req->is_buf_gpu_mem) + nbytes = PSMI_ALIGNUP((length + count - 1) / count, PSMI_GPU_PAGESIZE); + else +#endif + nbytes = PSMI_ALIGNUP((length + count - 1) / count, PSMI_PAGESIZE); + getreq->tidgr_rndv_winsz = + min(nbytes, ((ips_epaddr_t *) epaddr)->window_rv); + /* must be within the tid window size */ + if (getreq->tidgr_rndv_winsz > PSM_TID_WINSIZE) + getreq->tidgr_rndv_winsz = PSM_TID_WINSIZE; + + STAILQ_INSERT_TAIL(&protoexp->pend_getreqsq, getreq, tidgr_next); + tids = ips_tid_num_available(&protoexp->tidc); + tidflows = ips_tf_available(&protoexp->tfc); + + if (tids > 0 && tidflows > 0) + ips_tid_pendtids_timer_callback(&protoexp->timer_getreqs, 0); + else if (tids != -1 && tidflows != -1) + psmi_timer_request(protoexp->timerq, &protoexp->timer_getreqs, + PSMI_TIMER_PRIO_1); + PSM2_LOG_MSG("leaving"); + return PSM2_OK; +} + +/* List of perf events */ +#define _ips_logeventid_tid_send_reqs 0 /* out of tid send descriptors */ + +#define ips_logevent_id(event) _ips_logeventid_ ## event +#define ips_logevent(proto, event, ptr) ips_logevent_inner(proto, ips_logevent_id(event), ptr) + +static +void ips_logevent_inner(struct ips_proto *proto, int eventid, void *context) +{ + uint64_t t_now = get_cycles(); + + switch (eventid) { + case ips_logevent_id(tid_send_reqs):{ + psm2_epaddr_t epaddr = (psm2_epaddr_t) context; + proto->psmi_logevent_tid_send_reqs.count++; + + if (t_now >= + proto->psmi_logevent_tid_send_reqs.next_warning) { + psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_OK, + "Non-fatal temporary exhaustion of send tid dma descriptors " + "(elapsed=%.3fs, source LID=0x%x/context=%d, count=%lld)", + (double) + cycles_to_nanosecs(t_now - + proto-> + t_init) / + 1.0e9, + (int)psm2_epid_nid(epaddr-> + epid), + (int)psm2_epid_context(epaddr-> + epid), + (long long)proto-> + psmi_logevent_tid_send_reqs. + count); + proto->psmi_logevent_tid_send_reqs. + next_warning = + t_now + + sec_2_cycles(proto-> + psmi_logevent_tid_send_reqs. + interval_secs); + } + } + break; + + default: + break; + } + + return; +} + +/* + * Expected Protocol. + * + * We're granted tids (as part of a tid get request) and expected to fulfill + * the request by associating the request's sendtoken to a tid send descriptor. + * + * It's possible to be out of tid send descriptors when somehow all allocated + * descriptors can't complete all of their sends. For example, the targets of + * the sends may be busy in computation loops and not processing incoming + * packets. 
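+ *
+ * A sketch of the exchange as implemented below (receiver on the left,
+ * sender on the right):
+ *
+ *   OPCODE_LONG_CTS (tid grant)   ----->  ips_tid_send_handle_tidreq()
+ *   ips_protoexp_data()          <-----   OPCODE_EXPTID data packets
+ *   OPCODE_EXPTID_COMPLETION      ----->  ips_protoexp_recv_tid_completion()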
+ */ + +void +ips_protoexp_send_tid_grant(struct ips_tid_recv_desc *tidrecvc) +{ + ips_epaddr_t *ipsaddr = tidrecvc->ipsaddr; + struct ips_proto *proto = tidrecvc->protoexp->proto; + psmi_assert(proto->msgflowid < EP_FLOW_LAST); + struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid]; + ips_scb_t *scb; + + scb = tidrecvc->grantscb; + ips_scb_opcode(scb) = OPCODE_LONG_CTS; + scb->ips_lrh.khdr.kdeth0 = 0; + scb->ips_lrh.mdata = tidrecvc->tidflow_genseq.psn_val; + scb->ips_lrh.data[0] = tidrecvc->rdescid; + scb->ips_lrh.data[1].u32w1 = tidrecvc->getreq->tidgr_length; + scb->ips_lrh.data[1].u32w0 = tidrecvc->getreq->tidgr_sendtoken; + + ips_scb_buffer(scb) = (void *)&tidrecvc->tid_list; + ips_scb_length(scb) = tidrecvc->tsess_tidlist_length; + + PSM2_LOG_EPM(OPCODE_LONG_CTS,PSM2_LOG_TX, proto->ep->epid, + flow->ipsaddr->epaddr.epid ,"tidrecvc->getreq->tidgr_sendtoken; %d", + tidrecvc->getreq->tidgr_sendtoken); + + ips_proto_flow_enqueue(flow, scb); + flow->flush(flow, NULL); +} + +void +ips_protoexp_send_tid_completion(struct ips_tid_recv_desc *tidrecvc, + ptl_arg_t sdescid) +{ + ips_epaddr_t *ipsaddr = tidrecvc->ipsaddr; + struct ips_proto *proto = tidrecvc->protoexp->proto; + psmi_assert(proto->msgflowid < EP_FLOW_LAST); + struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid]; + ips_scb_t *scb; + + PSM2_LOG_EPM(OPCODE_EXPTID_COMPLETION,PSM2_LOG_TX, proto->ep->epid, + flow->ipsaddr->epaddr.epid ,"sdescid._desc_idx: %d", + sdescid._desc_idx); + scb = tidrecvc->completescb; + + ips_scb_opcode(scb) = OPCODE_EXPTID_COMPLETION; + scb->ips_lrh.khdr.kdeth0 = 0; + scb->ips_lrh.data[0] = sdescid; + + /* Attached tidflow gen/seq */ + scb->ips_lrh.mdata = tidrecvc->tidflow_genseq.psn_val; + + ips_proto_flow_enqueue(flow, scb); + flow->flush(flow, NULL); + + if (tidrecvc->protoexp->tid_flags & IPS_PROTOEXP_FLAG_CTS_SERIALIZED) { + flow->flags &= ~IPS_FLOW_FLAG_SKIP_CTS; /* Let the next CTS be processed */ + ips_tid_pendtids_timer_callback(&tidrecvc->protoexp->timer_getreqs, 0); /* and make explicit progress for it. */ + } +} + +#ifdef PSM_CUDA +static +void psmi_deallocate_chb(struct ips_cuda_hostbuf* chb) +{ + PSMI_CUDA_CALL(cuMemFreeHost, chb->host_buf); + PSMI_CUDA_CALL(cuEventDestroy, chb->copy_status); + psmi_free(chb); + return; +} +#endif + +int +ips_protoexp_recv_tid_completion(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + struct ips_epaddr *ipsaddr = rcv_ev->ipsaddr; + ptl_arg_t desc_id = p_hdr->data[0]; + struct ips_tid_send_desc *tidsendc; + + PSM2_LOG_MSG("entering"); + PSM2_LOG_EPM(OPCODE_EXPTID_COMPLETION,PSM2_LOG_RX,rcv_ev->ipsaddr->epaddr.epid, + rcv_ev->proto->ep->mq->ep->epid,"desc_id._desc_idx: %d",desc_id._desc_idx); + + if (!ips_proto_is_expected_or_nak(rcv_ev)) + { + PSM2_LOG_MSG("leaving"); + return IPS_RECVHDRQ_CONTINUE; + } + + if (__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) + ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, + &ipsaddr->flows[ips_proto_flowid(p_hdr)]); + + ips_proto_process_ack(rcv_ev); + + /* + * Get the session send descriptor and complete. 
+ */ + tidsendc = (struct ips_tid_send_desc *) + psmi_mpool_find_obj_by_index(protoexp->tid_desc_send_pool, + desc_id._desc_idx); + _HFI_VDBG("desc_id=%d (%p)\n", desc_id._desc_idx, tidsendc); + if (tidsendc == NULL) { + _HFI_ERROR + ("exptid comp: Index %d is out of range\n", + desc_id._desc_idx); + PSM2_LOG_MSG("leaving"); + return IPS_RECVHDRQ_CONTINUE; + } else { + ptl_arg_t desc_tidsendc; + + psmi_mpool_get_obj_index_gen_count(tidsendc, + &desc_tidsendc._desc_idx, + &desc_tidsendc._desc_genc); + + _HFI_VDBG("desc_req:id=%d,gen=%d desc_sendc:id=%d,gen=%d\n", + desc_id._desc_idx, desc_id._desc_genc, + desc_tidsendc._desc_idx, desc_tidsendc._desc_genc); + + /* See if the reference is still live and valid */ + if (desc_tidsendc.u64 != desc_id.u64) { + _HFI_ERROR("exptid comp: Genc %d does not match\n", + desc_id._desc_genc); + PSM2_LOG_MSG("leaving"); + return IPS_RECVHDRQ_CONTINUE; + } + } + + if (!STAILQ_EMPTY(&tidsendc->tidflow.scb_unacked)) { + struct ips_message_header hdr; + + /* Hack to handle the tidflow */ + hdr.data[0] = rcv_ev->p_hdr->data[0]; + hdr.ack_seq_num = rcv_ev->p_hdr->mdata; + hdr.khdr.kdeth0 = __cpu_to_le32(3 << HFI_KHDR_TIDCTRL_SHIFT); + rcv_ev->p_hdr = &hdr; + + /* + * This call should directly complete the tidflow + * and free all scb on the unacked queue. + */ + ips_proto_process_ack(rcv_ev); + + /* Keep KW happy. */ + rcv_ev->p_hdr = NULL; + /* Prove that the scb will not leak in the unacked queue: */ + psmi_assert(STAILQ_EMPTY(&tidsendc->tidflow.scb_unacked)); + } + + psm2_mq_req_t req = tidsendc->mqreq; + /* Check if we can complete the send request. */ + req->send_msgoff += tidsendc->length; + +#ifdef PSM_CUDA + if (req->cuda_hostbuf_used) { + if (tidsendc->cuda_num_buf == 1) { + tidsendc->cuda_hostbuf[0]->bytes_read += + tidsendc->tid_list.tsess_length; + if(tidsendc->cuda_hostbuf[0]->bytes_read == + tidsendc->cuda_hostbuf[0]->size){ + STAILQ_REMOVE(&req->sendreq_prefetch, + tidsendc->cuda_hostbuf[0], + ips_cuda_hostbuf, req_next); + if (tidsendc->cuda_hostbuf[0]->is_tempbuf) + psmi_deallocate_chb(tidsendc->cuda_hostbuf[0]); + else { + tidsendc->cuda_hostbuf[0]->req = NULL; + tidsendc->cuda_hostbuf[0]->offset = 0; + tidsendc->cuda_hostbuf[0]->bytes_read = 0; + psmi_mpool_put(tidsendc->cuda_hostbuf[0]); + } + psmi_cuda_run_prefetcher(protoexp, tidsendc); + } + } else + psmi_free(tidsendc->userbuf); + } +#endif + if (req->send_msgoff == req->req_data.send_msglen) { + psmi_mq_handle_rts_complete(req); + } + + psmi_mpool_put(tidsendc); + + PSM2_LOG_MSG("leaving"); + return IPS_RECVHDRQ_CONTINUE; +} + +int ips_protoexp_data(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_proto *proto = rcv_ev->proto; + struct ips_protoexp *protoexp = proto->protoexp; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + struct ips_tid_recv_desc *tidrecvc; + ptl_arg_t desc_id; + psmi_seqnum_t sequence_num; + + psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID); + + PSM2_LOG_MSG("entering"); + + desc_id._desc_idx = ips_proto_flowid(p_hdr); + PSM2_LOG_EPM(OPCODE_EXPTID,PSM2_LOG_RX,rcv_ev->ipsaddr->epaddr.epid, + proto->ep->mq->ep->epid,"desc_id._desc_idx: %d", desc_id._desc_idx); + + desc_id._desc_genc = p_hdr->exp_rdescid_genc; + + tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx]; + + if (tidrecvc->rdescid._desc_genc != desc_id._desc_genc) { + PSM2_LOG_MSG("leaving"); + return IPS_RECVHDRQ_CONTINUE; /* skip */ + } + + /* IBTA CCA handling for expected flow. 
*/ + if (rcv_ev->is_congested & IPS_RECV_EVENT_FECN) { + /* Mark flow to generate BECN in control packet */ + tidrecvc->tidflow.flags |= IPS_FLOW_FLAG_GEN_BECN; + /* Update stats for congestion encountered */ + proto->epaddr_stats.congestion_pkts++; + /* Clear FECN event */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; + } + + sequence_num.psn_val = __be32_to_cpu(p_hdr->bth[2]); + + if_pf (PSM_HAL_ERROR_OK != psmi_hal_tidflow_check_update_pkt_seq( + protoexp,sequence_num,tidrecvc,p_hdr, + ips_protoexp_do_tf_generr,ips_protoexp_do_tf_seqerr)) + return IPS_RECVHDRQ_CONTINUE; + + /* Reset the swapped generation count as we received a valid packet */ + tidrecvc->tidflow_nswap_gen = 0; + + /* Do some sanity checking */ + psmi_assert_always(tidrecvc->state == TIDRECVC_STATE_BUSY); + int recv_completion = (tidrecvc->recv_tidbytes == + (p_hdr->exp_offset + ips_recvhdrq_event_paylen(rcv_ev))); + + /* If sender requested an ACK with the packet and it is not the last + * packet, or if the incoming flow faced congestion, respond with an + * ACK packet. The ACK when congested will have the BECN bit set. + */ + if (((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) && + !recv_completion) || + (tidrecvc->tidflow.flags & IPS_FLOW_FLAG_GEN_BECN)) { + ips_scb_t ctrlscb; + + /* Ack sender with descriptor index */ + ctrlscb.scb_flags = 0; + ctrlscb.ips_lrh.data[0] = p_hdr->exp_sdescid; + ctrlscb.ips_lrh.ack_seq_num = tidrecvc->tidflow_genseq.psn_val; + + ips_proto_send_ctrl_message(&tidrecvc->tidflow, + OPCODE_ACK, + &tidrecvc->ctrl_msg_queued, + &ctrlscb, ctrlscb.cksum, 0); + } + + /* If RSM is a HW capability, and RSM has found a TID packet marked + * with FECN, the payload will be written to the eager buffer, and + * we will have a payload pointer here. In that case, copy the payload + * into the user's buffer. If RSM did not intercept this EXPTID + * packet, the HFI will handle the packet payload. Possibly should + * assert(0 < paylen < MTU). + */ + if (psmi_hal_has_cap(PSM_HAL_CAP_RSM_FECN_SUPP) && + ips_recvhdrq_event_payload(rcv_ev) && + ips_recvhdrq_event_paylen(rcv_ev)) + psmi_mq_mtucpy(tidrecvc->buffer + p_hdr->exp_offset, + ips_recvhdrq_event_payload(rcv_ev), + ips_recvhdrq_event_paylen(rcv_ev)); + + /* If last packet then we are done. We send a tid transfer completion + * packet back to sender, free all tids and close the current tidflow + * as well as tidrecvc descriptor. + * Note: If we were out of tidflow, this will invoke the callback to + * schedule pending transfer. 
+ */ + if (recv_completion) { + /* copy unaligned data if any */ + uint8_t *dst, *src; + + if (tidrecvc->tid_list.tsess_unaligned_start) { + dst = (uint8_t *)tidrecvc->buffer; + src = (uint8_t *)p_hdr->exp_ustart; +#ifdef PSM_CUDA + if (tidrecvc->is_ptr_gpu_backed) { + PSMI_CUDA_CALL(cuMemcpyHtoD, (CUdeviceptr)dst, src, + tidrecvc->tid_list.tsess_unaligned_start); + } else +#endif + ips_protoexp_unaligned_copy(dst, src, + tidrecvc->tid_list.tsess_unaligned_start); + } + + if (tidrecvc->tid_list.tsess_unaligned_end) { + dst = (uint8_t *)tidrecvc->buffer + + tidrecvc->recv_msglen - + tidrecvc->tid_list.tsess_unaligned_end; + src = (uint8_t *)p_hdr->exp_uend; +#ifdef PSM_CUDA + if (tidrecvc->is_ptr_gpu_backed) { + PSMI_CUDA_CALL(cuMemcpyHtoD, (CUdeviceptr)dst, src, + tidrecvc->tid_list.tsess_unaligned_end); + } else +#endif + ips_protoexp_unaligned_copy(dst, src, + tidrecvc->tid_list.tsess_unaligned_end); + } + + /* reply tid transfer completion packet to sender */ + ips_protoexp_send_tid_completion(tidrecvc, p_hdr->exp_sdescid); + + /* Mark receive as done */ + ips_tid_recv_free(tidrecvc); + } + PSM2_LOG_MSG("leaving"); + + return IPS_RECVHDRQ_CONTINUE; +} + +#ifndef PSM_DEBUG +# define ips_dump_tids(tid_list, msg, ...) +#else +static +void ips_dump_tids(ips_tid_session_list *tid_list, const char *msg, ...) +{ + char buf[256]; + size_t off = 0; + int i, num_tids = tid_list->tsess_tidcount; + + va_list argptr; + va_start(argptr, msg); + off += vsnprintf(buf, sizeof(buf) - off, msg, argptr); + va_end(argptr); + + for (i = 0; i < num_tids && off < (sizeof(buf) - 1); i++) + off += snprintf(buf + off, sizeof(buf) - off, "%d%s", + IPS_TIDINFO_GET_TID(tid_list->tsess_list[i]), + i < num_tids - 1 ? "," : ""); + + _HFI_VDBG("%s\n", buf); + return; +} +#endif + +static +void ips_expsend_tiderr(struct ips_tid_send_desc *tidsendc) +{ + char buf[256]; + size_t off = 0; + int i; + + off += snprintf(buf + off, sizeof(buf) - off, + "Remaining bytes: %d Member id %d is not in tid_session_id=%d :", + tidsendc->remaining_tidbytes, tidsendc->tid_idx, + tidsendc->rdescid._desc_idx); + + for (i = 0; i < tidsendc->tid_list.tsess_tidcount + 1; i++) + off += snprintf(buf + off, sizeof(buf) - off, "%d,", + IPS_TIDINFO_GET_TID(tidsendc->tid_list. + tsess_list[i])); + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Trying to use tid idx %d and there are %d members: %s\n", + tidsendc->tid_idx, tidsendc->tid_list.tsess_tidcount, + buf); + return; +} + +#ifdef PSM_CUDA +static +psm2_error_t +psmi_cuda_reclaim_hostbufs(struct ips_tid_get_request *getreq) +{ + struct ips_protoexp *protoexp = getreq->tidgr_protoexp; + struct ips_tid_getreq_cuda_hostbuf_pend *cmemcpyhead = + &getreq->pend_cudabuf; + struct ips_cuda_hostbuf *chb; + CUresult status; + + /* Get the getreq's first memcpy op */ + while (!STAILQ_EMPTY(cmemcpyhead)) { + chb = STAILQ_FIRST(cmemcpyhead); + PSMI_CUDA_CHECK_EVENT(chb->copy_status, status); + if (status != CUDA_SUCCESS) { + /* At least one of the copies is still + * in progress. Schedule the timer, + * then leave the CUDA progress phase + * and check for other pending TID work. + */ + psmi_timer_request(protoexp->timerq, + &protoexp->timer_getreqs, + PSMI_TIMER_PRIO_1); + return PSM2_OK_NO_PROGRESS; + } + /* The getreq's oldest cudabuf is done. Reclaim it. 
*/ + getreq->tidgr_cuda_bytesdone += chb->size; + STAILQ_REMOVE_HEAD(cmemcpyhead, next); + psmi_mpool_put(chb); + } + return PSM2_OK; +} + +static +struct ips_cuda_hostbuf* psmi_allocate_chb(uint32_t window_len) +{ + struct ips_cuda_hostbuf* chb = (struct ips_cuda_hostbuf*) + psmi_calloc(PSMI_EP_NONE, + UNDEFINED, 1, + sizeof(struct ips_cuda_hostbuf)); + if (chb == NULL) { + psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, + "Couldn't allocate cuda host buffers "); + } + PSMI_CUDA_CALL(cuMemHostAlloc, + (void **) &chb->host_buf, + window_len, + CU_MEMHOSTALLOC_PORTABLE); + PSMI_CUDA_CALL(cuEventCreate, &chb->copy_status, CU_EVENT_DEFAULT); + return chb; +} + +static +void psmi_cuda_run_prefetcher(struct ips_protoexp *protoexp, + struct ips_tid_send_desc *tidsendc) +{ + struct ips_proto *proto = protoexp->proto; + struct ips_cuda_hostbuf *chb = NULL; + psm2_mq_req_t req = tidsendc->mqreq; + uint32_t offset, window_len; + + /* try to push the prefetcher forward */ + if (req->prefetch_send_msgoff < req->req_data.send_msglen) { + /* some data remains to be sent */ + offset = req->prefetch_send_msgoff; + window_len = + ips_cuda_next_window(tidsendc->ipsaddr->window_rv, + offset, req->req_data.buf_len); + if (window_len <= CUDA_SMALLHOSTBUF_SZ) + chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( + proto->cuda_hostbuf_pool_small_send); + if (chb == NULL) + chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( + proto->cuda_hostbuf_pool_send); + /* were any buffers available for the prefetcher? */ + if (chb == NULL) + return; + req->prefetch_send_msgoff += window_len; + chb->offset = offset; + chb->size = window_len; + chb->req = req; + chb->gpu_buf = (CUdeviceptr) req->req_data.buf + offset; + chb->bytes_read = 0; + PSMI_CUDA_CALL(cuMemcpyDtoHAsync, + chb->host_buf, chb->gpu_buf, + window_len, + proto->cudastream_send); + PSMI_CUDA_CALL(cuEventRecord, chb->copy_status, + proto->cudastream_send); + + STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb, req_next); + return; + } + return; +} + +static +void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, + psm2_mq_req_t req, + struct ips_tid_send_desc *tidsendc, + struct ips_cuda_hostbuf *chb_prev, + uint32_t tsess_srcoff, + uint32_t tsess_length, + uint32_t tsess_unaligned_start, + psm2_chb_match_type_t type) +{ + struct ips_proto *proto = protoexp->proto; + struct ips_cuda_hostbuf *chb = NULL; + uint32_t offset, window_len, attached=0; + + /* try to push the prefetcher forward */ + while (req->prefetch_send_msgoff < tsess_srcoff + tsess_length) { + /* some data remains to be sent */ + offset = req->prefetch_send_msgoff; + window_len = + ips_cuda_next_window(tidsendc->ipsaddr->window_rv, + offset, req->req_data.buf_len); + if (window_len <= CUDA_SMALLHOSTBUF_SZ) + chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( + proto->cuda_hostbuf_pool_small_send); + if (chb == NULL) + chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( + proto->cuda_hostbuf_pool_send); + + /* were any buffers available? 
If not force allocate */ + if (chb == NULL) { + chb = psmi_allocate_chb(window_len); + psmi_assert(chb); + chb->is_tempbuf = 1; + } + req->prefetch_send_msgoff += window_len; + chb->offset = offset; + chb->size = window_len; + chb->req = req; + chb->gpu_buf = (CUdeviceptr) req->req_data.buf + offset; + chb->bytes_read = 0; + PSMI_CUDA_CALL(cuMemcpyDtoHAsync, + chb->host_buf, chb->gpu_buf, + window_len, + proto->cudastream_send); + PSMI_CUDA_CALL(cuEventRecord, chb->copy_status, + proto->cudastream_send); + + STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb, req_next); + if (type == PSMI_CUDA_PARTIAL_MATCH_FOUND) { + if ((tsess_srcoff < chb->offset) + && ((tsess_srcoff + tsess_length) > chb->offset)) { + tidsendc->cuda_hostbuf[0] = chb_prev; + tidsendc->cuda_hostbuf[1] = chb; + tidsendc->cuda_num_buf = 2; + void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED, + tsess_length); + tidsendc->userbuf = + (void *)((uintptr_t) buffer); + tidsendc->buffer = + (void *)((uintptr_t)tidsendc->userbuf + + tsess_unaligned_start); + return; + } + } else { + if (attached) { + tidsendc->cuda_hostbuf[0] = chb_prev; + tidsendc->cuda_hostbuf[1] = chb; + tidsendc->cuda_num_buf = 2; + void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED, + tsess_length); + tidsendc->userbuf = + (void *)((uintptr_t) buffer); + tidsendc->buffer = + (void *)((uintptr_t)tidsendc->userbuf + + tsess_unaligned_start); + attached = 0; + return; + } + if ((tsess_srcoff > chb->offset) + && (tsess_srcoff < (chb->offset + chb->size)) + && ((tsess_srcoff + tsess_length) > (chb->offset + chb->size))) { + chb_prev = chb; + attached = 1; + chb = NULL; + continue; + } else if ((chb->offset <= tsess_srcoff) && + ((tsess_srcoff + tsess_length) <= + (chb->offset+chb->size))) { + tidsendc->cuda_hostbuf[0] = chb; + tidsendc->cuda_hostbuf[1] = NULL; + tidsendc->cuda_num_buf = 1; + tidsendc->userbuf = + (void *)((uintptr_t) chb->host_buf + + tsess_srcoff - chb->offset); + tidsendc->buffer = + (void *)((uintptr_t)tidsendc->userbuf + + tsess_unaligned_start ); + return; + } else + chb = NULL; + } + } +} + + +static +psm2_chb_match_type_t psmi_find_match_in_prefeteched_chb(struct ips_cuda_hostbuf* chb, + ips_tid_session_list *tid_list, + uint32_t prefetch_send_msgoff) +{ + /* To get a match: + * 1. Tid list offset + length is contained within a chb + * 2. Tid list offset + length is contained within + * the prefetched offset of this req. + * 3. Tid list offset + length is partially prefetched + * within one chb. 
(A partial match) */
+	if (chb->offset <= tid_list->tsess_srcoff) {
+		if ((chb->offset + chb->size) >=
+		    (tid_list->tsess_srcoff + tid_list->tsess_length)) {
+			return PSMI_CUDA_FULL_MATCH_FOUND;
+		} else {
+			if((chb->offset + chb->size) > tid_list->tsess_srcoff){
+				if(((chb->offset + (2 * chb->size)) >
+				    (tid_list->tsess_srcoff + tid_list->tsess_length)) &&
+				   ((prefetch_send_msgoff) >=
+				    (tid_list->tsess_srcoff + tid_list->tsess_length))){
+					return PSMI_CUDA_SPLIT_MATCH_FOUND;
+				} else if((tid_list->tsess_srcoff + tid_list->tsess_length)
+					  > prefetch_send_msgoff) {
+					return PSMI_CUDA_PARTIAL_MATCH_FOUND;
+				}
+			}
+		}
+	}
+	return PSMI_CUDA_CONTINUE;
+}
+#endif
+
+psm2_error_t
+ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp,
+			   ips_epaddr_t *ipsaddr,
+			   psm2_mq_req_t req,
+			   ptl_arg_t rdescid,
+			   uint32_t tidflow_genseq,
+			   ips_tid_session_list *tid_list,
+			   uint32_t tid_list_size)
+{
+	struct ips_tid_send_desc *tidsendc;
+	uint32_t i, j, *src, *dst;
+
+	PSM2_LOG_MSG("entering");
+	psmi_assert(tid_list_size > sizeof(ips_tid_session_list));
+	psmi_assert(tid_list_size <= sizeof(tidsendc->filler));
+	psmi_assert(tid_list->tsess_tidcount > 0);
+	psmi_assert((rdescid._desc_genc>>16) == 0);
+
+	tidsendc = (struct ips_tid_send_desc *)
+	    psmi_mpool_get(protoexp->tid_desc_send_pool);
+	if (tidsendc == NULL) {
+		PSM2_LOG_MSG("leaving");
+		ips_logevent(protoexp->proto, tid_send_reqs, ipsaddr);
+		return PSM2_EP_NO_RESOURCES;
+	}
+
+	req->ptl_req_ptr = (void *)tidsendc;
+	tidsendc->protoexp = protoexp;
+
+	/* Uniquely identify this send descriptor in space and time */
+	tidsendc->sdescid._desc_idx = psmi_mpool_get_obj_index(tidsendc);
+	tidsendc->sdescid._desc_genc = psmi_mpool_get_obj_gen_count(tidsendc);
+	tidsendc->rdescid = rdescid;
+	tidsendc->ipsaddr = ipsaddr;
+	tidsendc->mqreq = req;
+
+	/*
+	 * Copy received tidinfo to local tidsendc buffer.
+	 * While doing the copy, we try to merge the tids based on
+	 * the following rules:
+	 * 1. both tids are virtually contiguous(i and i+1 in the array);
+	 * 2. both tids have the same tidpair value;
+	 * 3. first tid (i) has tidctrl=1;
+	 * 4. second tid (i+1) has tidctrl=2;
+	 * 5. total length does not exceed 512 pages (2M);
+	 * 6. The h/w supports merged tid_ctrl's.
+	 *
+	 * The restriction of 512 pages comes from the limited number
+	 * of bits we have for KDETH.OFFSET:
+	 *   - The entire mapping space provided through TIDs is to be
+	 *     viewed as a zero-based address mapping.
+	 *   - We have 15 bits in KDETH offset field through which we
+	 *     can address up to a maximum of 2MB.
+	 *     (with 64-byte offset mode or KDETH.OM = 1)
+	 *   - Assuming a 4KB page size, 2MB/4KB = 512 pages.
+	 */
+	psmi_mq_mtucpy_host_mem(&tidsendc->tid_list, tid_list,
+				sizeof(ips_tid_session_list));
+	ips_dump_tids(tid_list, "Received %d tids: ",
+		      tid_list->tsess_tidcount);
+
+	if (psmi_hal_has_cap(PSM_HAL_CAP_MERGED_TID_CTRLS))
+	{
+		src = tid_list->tsess_list;
+		dst = tidsendc->tid_list.tsess_list;
+		dst[0] = src[0];
+		j = 0; i = 1;
+		while (i < tid_list->tsess_tidcount) {
+			if ((((dst[j]>>IPS_TIDINFO_TIDCTRL_SHIFT)+1) ==
+			     (src[i]>>IPS_TIDINFO_TIDCTRL_SHIFT)) &&
+			    (((dst[j]&IPS_TIDINFO_LENGTH_MASK)+
+			      (src[i]&IPS_TIDINFO_LENGTH_MASK)) <=
+			     PSM_MAX_NUM_PAGES_IN_TIDPAIR)) {
+				/* merge 'i' to 'j'
+				 * (We need to specify "tidctrl" value as 3
+				 * if we merge the individual tid-pairs.
+ * Doing that here) */ + dst[j] += (2 << IPS_TIDINFO_TIDCTRL_SHIFT) + + (src[i] & IPS_TIDINFO_LENGTH_MASK); + i++; + if (i == tid_list->tsess_tidcount) break; + } + j++; + /* copy 'i' to 'j' */ + dst[j] = src[i]; + i++; + } + tidsendc->tid_list.tsess_tidcount = j + 1; + tid_list = &tidsendc->tid_list; + } + else + { + tidsendc->tid_list.tsess_tidcount = tid_list->tsess_tidcount; + psmi_mq_mtucpy(&tidsendc->tid_list.tsess_list, tid_list->tsess_list, + tid_list->tsess_tidcount * sizeof(tid_list->tsess_list[0])); + tid_list = &tidsendc->tid_list; + } + + /* Initialize tidflow for window. Use path requested by remote endpoint */ + ips_flow_init(&tidsendc->tidflow, protoexp->proto, ipsaddr, + protoexp->tid_xfer_type, PSM_PROTOCOL_TIDFLOW, + IPS_PATH_LOW_PRIORITY, EP_FLOW_TIDFLOW); + tidsendc->tidflow.xmit_seq_num.psn_val = tidflow_genseq; + tidsendc->tidflow.xmit_ack_num.psn_val = tidflow_genseq; + + tidsendc->userbuf = + (void *)((uintptr_t) req->req_data.buf + tid_list->tsess_srcoff); + tidsendc->buffer = (void *)((uintptr_t)tidsendc->userbuf + + tid_list->tsess_unaligned_start); + tidsendc->length = tid_list->tsess_length; + tidsendc->ctrl_msg_queued = 0; + tidsendc->frag_size = min(protoexp->tid_send_fragsize, + tidsendc->tidflow.frag_size); + +#ifdef PSM_CUDA + /* Matching on previous prefetches and initiating next prefetch */ + struct ips_cuda_hostbuf *chb = NULL, *chb_next = NULL; + psm2_chb_match_type_t rc = PSMI_CUDA_CONTINUE; + + /* check if the prefetcher has a buffer ready to use */ + tidsendc->cuda_hostbuf[0] = NULL; + tidsendc->cuda_hostbuf[1] = NULL; + tidsendc->cuda_num_buf = 0; + if (req->cuda_hostbuf_used) { + /* To get a match: + * 1. Tid list offset + length is contained within a chb + * 2. Tid list offset + length is contained within + * the prefetched offset of this req. + * 3. Tid list offset + length is partially prefetched + * within one chb. 
(A partial match) */
+		STAILQ_FOREACH(chb, &req->sendreq_prefetch, req_next) {
+			rc = psmi_find_match_in_prefeteched_chb(chb,
+								tid_list,
+								req->prefetch_send_msgoff);
+			if (rc < PSMI_CUDA_CONTINUE)
+				break;
+		}
+		if (rc == PSMI_CUDA_FULL_MATCH_FOUND) {
+			tidsendc->userbuf =
+				(void *)((uintptr_t) chb->host_buf+
+					 tid_list->tsess_srcoff - chb->offset);
+			tidsendc->buffer =
+				(void *)((uintptr_t)tidsendc->userbuf +
+					tid_list->tsess_unaligned_start);
+			/* now associate the buffer with the tidsendc */
+			tidsendc->cuda_hostbuf[0] = chb;
+			tidsendc->cuda_hostbuf[1] = NULL;
+			tidsendc->cuda_num_buf = 1;
+		} else if (rc == PSMI_CUDA_SPLIT_MATCH_FOUND){
+			void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED,
+					tid_list->tsess_length);
+			tidsendc->userbuf =
+				(void *)((uintptr_t) buffer);
+			tidsendc->buffer =
+				(void *)((uintptr_t)tidsendc->userbuf +
+					tid_list->tsess_unaligned_start);
+			chb_next = STAILQ_NEXT(chb, req_next);
+			tidsendc->cuda_hostbuf[0] = chb;
+			tidsendc->cuda_hostbuf[1] = chb_next;
+			tidsendc->cuda_num_buf = 2;
+		} else if (rc == PSMI_CUDA_PARTIAL_MATCH_FOUND) {
+			psmi_attach_chb_to_tidsendc(protoexp, req,
+						    tidsendc,
+						    chb,
+						    tid_list->tsess_srcoff,
+						    tid_list->tsess_length,
+						    tid_list->tsess_unaligned_start,
+						    rc);
+		} else {
+			psmi_attach_chb_to_tidsendc(protoexp, req,
+						    tidsendc,
+						    NULL,
+						    tid_list->tsess_srcoff,
+						    tid_list->tsess_length,
+						    tid_list->tsess_unaligned_start,
+						    PSMI_CUDA_CONTINUE);
+		}
+	}
+#endif
+
+	/* frag size must be 64B multiples */
+	tidsendc->frag_size &= (~63);
+	tidsendc->is_complete = 0;
+	tidsendc->tid_idx = 0;
+	tidsendc->frame_send = 0;
+
+	tidsendc->tidbytes = 0;
+	tidsendc->remaining_tidbytes = tid_list->tsess_length -
+	    tid_list->tsess_unaligned_start - tid_list->tsess_unaligned_end;
+	tidsendc->remaining_bytes_in_tid =
+	    (IPS_TIDINFO_GET_LENGTH(tid_list->tsess_list[0]) << 12) -
+	    tid_list->tsess_tidoffset;
+	tidsendc->offset_in_tid = tid_list->tsess_tidoffset;
+
+	_HFI_EXP
+	    ("alloc tidsend=%4d tidrecv=%4d srcoff=%6d length=%6d,s=%d,e=%d\n",
+	     tidsendc->sdescid._desc_idx, rdescid._desc_idx,
+	     tid_list->tsess_srcoff, tid_list->tsess_length,
+	     tid_list->tsess_unaligned_start, tid_list->tsess_unaligned_end);
+
+	ips_tid_send_exp(tidsendc);
+
+	/* Add as a pending op and ring up the timer */
+	if (tidsendc->is_complete == 0) {
+		STAILQ_INSERT_TAIL(&protoexp->pend_sendq, tidsendc, next);
+		psmi_timer_request(protoexp->timerq, &protoexp->timer_send,
+				   PSMI_TIMER_PRIO_1);
+	}
+
+	PSM2_LOG_MSG("leaving");
+	/* Consider breaking out of progress engine here */
+	return PSM2_OK;
+}
+
+static
+ips_scb_t *
+ips_scb_prepare_tid_sendctrl(struct ips_flow *flow,
+			     struct ips_tid_send_desc *tidsendc)
+{
+	struct ips_protoexp *protoexp = tidsendc->protoexp;
+	uint32_t *tsess_list = tidsendc->tid_list.tsess_list;
+	uint32_t tid, omode, offset, chunk_size;
+	uint32_t startidx, endidx;
+	uint32_t frame_len, nfrag;
+	uint8_t *bufptr = tidsendc->buffer;
+	ips_scb_t *scb;
+
+	uint8_t is_payload_per_frag_leq_8dw = 0;
+	/* If payload in the first and last nfrag is less than or equal
+	 * to 8DW we disable header suppression so as to detect uncorrectable
+	 * errors which will otherwise be non-detectable (since header is
+	 * suppressed we lose RHF.EccErr)
+	 */
+	if ((scb = ips_scbctrl_alloc(&protoexp->tid_scbc_rv, 1, 0, 0)) == NULL)
+		return NULL;
+
+	/*
+	 * Make sure the next offset is in 64B multiples with the tid.
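+	 * Worked example (hypothetical numbers): with frag_size = 4096 and
+	 * offset_in_tid = 100, frame_len becomes 4096 - (100 & 63) = 4060,
+	 * so the next offset, 100 + 4060 = 4160, is 64B aligned.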
+ */ + frame_len = + min(tidsendc->remaining_bytes_in_tid, tidsendc->remaining_tidbytes); + if (frame_len > tidsendc->frag_size) { + frame_len = + tidsendc->frag_size - (tidsendc->offset_in_tid & 63); + } + /* + * Frame length is the amount of payload to be included in a particular + * frag of the scb, so we check if frame len is less than or equal + * to 8DW. If length is less then then or equal to 8DW for the first + * frag then we avoid header suppression + */ + if (frame_len <= 32) + is_payload_per_frag_leq_8dw = 1; + + /* + * Using large offset mode based on offset length. + */ + if (tidsendc->offset_in_tid < 131072) { /* 2^15 * 4 */ + psmi_assert((tidsendc->offset_in_tid % 4) == 0); + offset = tidsendc->offset_in_tid / 4; + omode = 0; + } else { + psmi_assert((tidsendc->offset_in_tid % 64) == 0); + offset = tidsendc->offset_in_tid / 64; + omode = 1; + } + startidx = tidsendc->tid_idx; + tid = IPS_TIDINFO_GET_TID(tsess_list[startidx]); + scb->ips_lrh.khdr.kdeth0 = (offset & HFI_KHDR_OFFSET_MASK) | + (omode << HFI_KHDR_OM_SHIFT) | (tid << HFI_KHDR_TID_SHIFT); + + scb->tidctrl = IPS_TIDINFO_GET_TIDCTRL(tsess_list[startidx]); + scb->tsess = (uint32_t *) &tsess_list[startidx]; + + /* + * Payload and buffer address for current packet. payload_size + * must be the first packet size because it is used to initialize + * the packet header. + */ + scb->payload_size = frame_len; + ips_scb_buffer(scb) = (void *)bufptr; + scb->frag_size = tidsendc->frag_size; + + /* + * Other packet fields. + */ + PSM2_LOG_EPM(OPCODE_EXPTID,PSM2_LOG_TX, protoexp->proto->ep->epid, + flow->ipsaddr->epaddr.epid, + "psmi_mpool_get_obj_index(tidsendc->mqreq): %d, tidsendc->rdescid._desc_idx: %d, tidsendc->sdescid._desc_idx: %d", + psmi_mpool_get_obj_index(tidsendc->mqreq),tidsendc->rdescid._desc_idx,tidsendc->sdescid._desc_idx); + ips_scb_opcode(scb) = OPCODE_EXPTID; + scb->ips_lrh.exp_sdescid = tidsendc->sdescid; + scb->ips_lrh.exp_rdescid_genc = (uint16_t)tidsendc->rdescid._desc_genc; + scb->ips_lrh.exp_offset = tidsendc->tidbytes; + + scb->tidsendc = tidsendc; + SLIST_NEXT(scb, next) = NULL; + + /* + * Loop over the tid session list, count the frag number and payload size. + */ + nfrag = 1; + chunk_size = frame_len; + while (1) { + /* Record last tididx used */ + endidx = tidsendc->tid_idx; + /* Check if all tidbytes are done */ + tidsendc->remaining_tidbytes -= frame_len; + if (!tidsendc->remaining_tidbytes) { + /* We do another frame length check for the last frag */ + if (frame_len <= 32) + is_payload_per_frag_leq_8dw = 1; + break; + } + + /* Update in current tid */ + tidsendc->remaining_bytes_in_tid -= frame_len; + tidsendc->offset_in_tid += frame_len; + psmi_assert((tidsendc->offset_in_tid >= 128*1024) ? 
+ ((tidsendc->offset_in_tid % 64) == 0) : + ((tidsendc->offset_in_tid % 4) == 0)); + + /* Done with this tid, move on to the next tid */ + if (!tidsendc->remaining_bytes_in_tid) { + tidsendc->tid_idx++; + psmi_assert_always(tidsendc->tid_idx < + tidsendc->tid_list.tsess_tidcount); + tidsendc->remaining_bytes_in_tid = + IPS_TIDINFO_GET_LENGTH(tsess_list + [tidsendc->tid_idx]) << 12; + tidsendc->offset_in_tid = 0; + } + + /* For PIO, only single packet per scb allowed */ + if (flow->transfer == PSM_TRANSFER_PIO) { + break; + } + + frame_len = + min(tidsendc->remaining_bytes_in_tid, + tidsendc->remaining_tidbytes); + if (frame_len > tidsendc->frag_size) + frame_len = tidsendc->frag_size; + nfrag++; + chunk_size += frame_len; + } + + scb->nfrag = nfrag; + if (nfrag > 1) { + scb->nfrag_remaining = scb->nfrag; + scb->chunk_size = scb->chunk_size_remaining = chunk_size; + } + scb->tsess_length = (endidx - startidx + 1) * sizeof(uint32_t); + + /* Keep track of latest buffer location so we restart at the + * right location, if we don't complete the transfer */ + tidsendc->buffer = bufptr + chunk_size; + tidsendc->tidbytes += chunk_size; + + if (flow->transfer == PSM_TRANSFER_DMA && + psmi_hal_has_cap(PSM_HAL_CAP_DMA_HSUPP_FOR_32B_MSGS)) { + is_payload_per_frag_leq_8dw = 0; + } + + /* If last packet, we want a completion notification */ + if (!tidsendc->remaining_tidbytes) { + /* last packet/chunk, attach unaligned data */ + uint8_t *dst, *src; + + if (tidsendc->tid_list.tsess_unaligned_start) { + dst = (uint8_t *)scb->ips_lrh.exp_ustart; + src = (uint8_t *)tidsendc->userbuf; +#ifdef PSM_CUDA + if (IS_TRANSFER_BUF_GPU_MEM(scb) && !tidsendc->mqreq->cuda_hostbuf_used) { + PSMI_CUDA_CALL(cuMemcpyDtoH, dst, (CUdeviceptr)src, + tidsendc->tid_list.tsess_unaligned_start); + } else +#endif + ips_protoexp_unaligned_copy(dst, src, + tidsendc->tid_list.tsess_unaligned_start); + } + + if (tidsendc->tid_list.tsess_unaligned_end) { + dst = (uint8_t *)&scb->ips_lrh.exp_uend; + src = (uint8_t *)tidsendc->userbuf + + tidsendc->length - + tidsendc->tid_list.tsess_unaligned_end; +#ifdef PSM_CUDA + if (IS_TRANSFER_BUF_GPU_MEM(scb) && !tidsendc->mqreq->cuda_hostbuf_used) { + PSMI_CUDA_CALL(cuMemcpyDtoH, dst, (CUdeviceptr)src, + tidsendc->tid_list.tsess_unaligned_end); + } else +#endif + ips_protoexp_unaligned_copy(dst, src, + tidsendc->tid_list.tsess_unaligned_end); + } + /* + * If the number of fragments is greater then one and + * "no header suppression" flag is unset then we go + * ahead and suppress the header */ + if ((scb->nfrag > 1) && (!is_payload_per_frag_leq_8dw)) + scb->scb_flags |= IPS_SEND_FLAG_HDRSUPP; + else + scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; + + tidsendc->is_complete = 1; + } else { + /* Do not suppress header every hdr_pkt_interval */ + if ((++tidsendc->frame_send % + protoexp->hdr_pkt_interval) == 0) + /* Request an ACK */ + scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; + else { + if (!is_payload_per_frag_leq_8dw) { + /* Request hdr supp */ + scb->scb_flags |= IPS_SEND_FLAG_HDRSUPP; + } + } + /* assert only single packet per scb */ + psmi_assert(scb->nfrag == 1); + } + +#ifdef PSM_CUDA + if (tidsendc->mqreq->is_buf_gpu_mem && /* request's buffer comes from GPU realm */ + !tidsendc->mqreq->cuda_hostbuf_used) { /* and it was NOT moved to HOST memory */ + scb->mq_req = tidsendc->mqreq; /* so let's mark it per scb, not to check its locality again */ + ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU; + } +#endif + + return scb; +} + +/* + * Returns: + * + * PSM2_OK: scb was allocated for at least one 
frame, the packet may be queued
+ *	   or actually sent.
+ *
+ * PSM2_OK_NO_PROGRESS: Reached a limit on the maximum number of sends we allow
+ *		       to be enqueued before polling receive queue.
+ *
+ * PSM2_EP_NO_RESOURCES: No scbs available; a callback will be issued when more
+ *			scbs become available.
+ *
+ * PSM2_TIMEOUT: PIO-busy or DMA-busy, stop trying to send for now.
+ *
+ */
+
+static
+psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc)
+{
+	ips_scb_t *scb = NULL;
+	psm2_error_t err = PSM2_OK, err_f;
+	struct ips_protoexp *protoexp = tidsendc->protoexp;
+	struct ips_proto *proto = protoexp->proto;
+	struct ips_flow *flow = &tidsendc->tidflow;
+
+#ifdef PSM_CUDA
+	struct ips_cuda_hostbuf *chb, *chb_next;
+	CUresult chb_status;
+	uint32_t offset_in_chb, i;
+	for (i = 0; i < tidsendc->cuda_num_buf; i++) {
+		chb = tidsendc->cuda_hostbuf[i];
+		if (chb) {
+			PSMI_CUDA_CHECK_EVENT(chb->copy_status, chb_status);
+			if (chb_status != CUDA_SUCCESS) {
+				err = PSM2_OK_NO_PROGRESS;
+				PSM2_LOG_MSG("leaving");
+				return err;
+			}
+		}
+	}
+
+	if (tidsendc->cuda_num_buf == 2) {
+		chb = tidsendc->cuda_hostbuf[0];
+		chb_next = tidsendc->cuda_hostbuf[1];
+		offset_in_chb = tidsendc->tid_list.tsess_srcoff - chb->offset;
+		/* Copying data from multiple cuda
+		 * host buffers into a bounce buffer.
+		 */
+		memcpy(tidsendc->buffer, chb->host_buf +
+		       offset_in_chb, chb->size-offset_in_chb);
+		memcpy(tidsendc->buffer+ chb->size -
+		       offset_in_chb, chb_next->host_buf,
+		       tidsendc->tid_list.tsess_srcoff +
+		       tidsendc->tid_list.tsess_length - chb_next->offset);
+
+		chb->bytes_read += chb->size - offset_in_chb;
+		chb_next->bytes_read += tidsendc->tid_list.tsess_srcoff +
+					tidsendc->tid_list.tsess_length -
+					chb_next->offset;
+		if(chb->bytes_read == chb->size) {
+			STAILQ_REMOVE(&tidsendc->mqreq->sendreq_prefetch, chb,
+				      ips_cuda_hostbuf, req_next);
+			if (chb->is_tempbuf)
+				psmi_deallocate_chb(chb);
+			else {
+				chb->req = NULL;
+				chb->offset = 0;
+				chb->bytes_read = 0;
+				psmi_mpool_put(chb);
+			}
+			psmi_cuda_run_prefetcher(protoexp, tidsendc);
+		}
+		if(chb_next->bytes_read == chb_next->size) {
+			STAILQ_REMOVE(&tidsendc->mqreq->sendreq_prefetch, chb_next,
+				      ips_cuda_hostbuf, req_next);
+			if (chb_next->is_tempbuf)
+				psmi_deallocate_chb(chb_next);
+			else{
+				chb_next->req = NULL;
+				chb_next->offset = 0;
+				chb_next->bytes_read = 0;
+				psmi_mpool_put(chb_next);
+			}
+			psmi_cuda_run_prefetcher(protoexp, tidsendc);
+		}
+	}
+#endif
+	/*
+	 * We aggressively try to grab as many scbs as possible, enqueue them to a
+	 * flow and flush them when either we're out of scbs or we've completely
+	 * filled the send request.
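+	 * (Summary of the error mapping in the loop and flush below: running
+	 * out of scbs yields PSM2_EP_NO_RESOURCES; a flush that itself runs
+	 * out of resources is turned into PSM2_TIMEOUT to force a resend
+	 * reschedule; packets sent while the receive queue is non-empty
+	 * yield PSM2_OK_NO_PROGRESS to force a rcvhdrq service.)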
+ */ + while (!tidsendc->is_complete) { + if_pf(tidsendc->tid_list.tsess_tidcount && + (tidsendc->tid_idx >= tidsendc->tid_list.tsess_tidcount || + tidsendc->tid_idx < 0)) + ips_expsend_tiderr(tidsendc); + + if ((scb = + ips_scb_prepare_tid_sendctrl(flow, tidsendc)) == NULL) { + proto->stats.scb_exp_unavail_cnt++; + err = PSM2_EP_NO_RESOURCES; + break; + } else { + ips_proto_flow_enqueue(flow, scb); + } + } + + if (!SLIST_EMPTY(&flow->scb_pend)) { /* Something to flush */ + int num_sent; + + err_f = flow->flush(flow, &num_sent); + + if (err != PSM2_EP_NO_RESOURCES) { + /* PSM2_EP_NO_RESOURCES is reserved for out-of-scbs */ + if (err_f == PSM2_EP_NO_RESOURCES) + err = PSM2_TIMEOUT; /* force a resend reschedule */ + else if (err_f == PSM2_OK && num_sent > 0 && + !ips_ptl_recvq_isempty(protoexp->ptl)) + err = PSM2_OK_NO_PROGRESS; /* force a rcvhdrq service */ + } + } + + PSM2_LOG_MSG("leaving"); + return err; +} + +static +psm2_error_t +ips_tid_pendsend_timer_callback(struct psmi_timer *timer, uint64_t current) +{ + struct ips_protoexp *protoexp = (struct ips_protoexp *)timer->context; + struct ips_tid_send_pend *phead = &protoexp->pend_sendq; + struct ips_tid_send_desc *tidsendc; + psm2_error_t err = PSM2_OK; + + while (!STAILQ_EMPTY(phead)) { + tidsendc = STAILQ_FIRST(phead); + + err = ips_tid_send_exp(tidsendc); + + if (tidsendc->is_complete) + STAILQ_REMOVE_HEAD(phead, next); + + if (err == PSM2_OK) { + /* Was able to complete the send, keep going */ + } else if (err == PSM2_EP_NO_RESOURCES) { + /* No more sendbufs available, sendbuf callback will requeue this + * timer */ + break; + } else if (err == PSM2_TIMEOUT) { + /* Always a case of try later: + * On PIO flow, means no send pio bufs available + * On DMA flow, means kernel can't queue request or would have to block + */ + psmi_timer_request(protoexp->proto->timerq, + &protoexp->timer_send, + get_cycles() + + protoexp->proto->timeout_send); + break; + } else { + /* Forced to reschedule later so we can check receive queue */ + psmi_assert(err == PSM2_OK_NO_PROGRESS); + psmi_timer_request(protoexp->proto->timerq, + &protoexp->timer_send, + PSMI_TIMER_PRIO_1); + break; + } + } + + return PSM2_OK; +} + +/* Right now, in the kernel we are allowing for virtually non-contiguous pages, + in a single call, and we are therefore locking one page at a time, but since + the intended use of this routine is for a single group of + virtually contiguous pages, that should change to improve + performance. That means possibly changing the calling MPI code. + Doing so gets rid of some of the loop stuff here, and in the driver, + and allows for a single call to the core VM code in the kernel, + rather than one per page, definitely improving performance. */ + +static +psm2_error_t +ips_tid_recv_alloc_frag(struct ips_protoexp *protoexp, + struct ips_tid_recv_desc *tidrecvc, + uint32_t nbytes_this) +{ + ips_tid_session_list *tid_list = &tidrecvc->tid_list; + uintptr_t bufptr = (uintptr_t) tidrecvc->buffer; + uint32_t size = nbytes_this; + psm2_error_t err = PSM2_OK; + uintptr_t pageaddr; + uint32_t tidoff, pageoff, pagelen, reglen, num_tids; + + psmi_assert(size >= 4); + + /* + * The following calculation does not work when size < 4 + * and bufptr is byte aligned, it can get negative value. + */ + tid_list->tsess_unaligned_start = (bufptr & 3) ? 
(4 - (bufptr & 3)) : 0;
+	size -= tid_list->tsess_unaligned_start;
+	bufptr += tid_list->tsess_unaligned_start;
+
+	tid_list->tsess_unaligned_end = size & 3;
+	size -= tid_list->tsess_unaligned_end;
+
+	psmi_assert(size > 0);
+
+#ifdef PSM_CUDA
+	/* Driver pins GPU pages when using GPU Direct RDMA for TID receives;
+	 * to accommodate this change, the calculations of pageaddr, pagelen
+	 * and pageoff have been modified to take the GPU page size into
+	 * consideration.
+	 */
+	if (tidrecvc->is_ptr_gpu_backed) {
+		uint64_t page_mask = ~(PSMI_GPU_PAGESIZE -1);
+		uint32_t page_offset_mask = (PSMI_GPU_PAGESIZE -1);
+		pageaddr = bufptr & page_mask;
+		pagelen = (uint32_t) (PSMI_GPU_PAGESIZE +
+			  ((bufptr + size - 1) & page_mask) -
+			  (bufptr & page_mask));
+		tidoff = pageoff = (uint32_t) (bufptr & page_offset_mask);
+	} else {
+		pageaddr = bufptr & protoexp->tid_page_mask;
+		pagelen = (uint32_t) (PSMI_PAGESIZE +
+			  ((bufptr + size - 1) & protoexp->tid_page_mask) -
+			  (bufptr & protoexp->tid_page_mask));
+		tidoff = pageoff = (uint32_t) (bufptr & protoexp->tid_page_offset_mask);
+	}
+#else
+	pageaddr = bufptr & protoexp->tid_page_mask;
+	pagelen = (uint32_t) (PSMI_PAGESIZE +
+		  ((bufptr + size - 1) & protoexp->tid_page_mask) -
+		  (bufptr & protoexp->tid_page_mask));
+	tidoff = pageoff = (uint32_t) (bufptr & protoexp->tid_page_offset_mask);
+#endif
+
+	reglen = pagelen;
+	if (protoexp->tidc.tid_array) {
+		if ((err = ips_tidcache_acquire(&protoexp->tidc,
+			    (void *)pageaddr, &reglen,
+			    (uint32_t *) tid_list->tsess_list, &num_tids,
+			    &tidoff
+#ifdef PSM_CUDA
+			    , tidrecvc->is_ptr_gpu_backed
+#endif
+			)))
+			goto fail;
+	} else {
+		if ((err = ips_tid_acquire(&protoexp->tidc,
+			    (void *)pageaddr, &reglen,
+			    (uint32_t *) tid_list->tsess_list, &num_tids
+#ifdef PSM_CUDA
+			    , tidrecvc->is_ptr_gpu_backed
+#endif
+			)))
+			goto fail;
+	}
+
+	/*
+	 * PSM2 currently provides storage space enough to hold up to
+	 * 1024 tids (PSM_TIDLIST_BUFSIZE). So, make sure we
+	 * don't get more than what we can hold from the tidcache here.
+	 *
+	 * The reason for 1024 tids comes from the PSM_TID_WINSIZE value
+	 * (currently 4MB. So, if in future, there is a change to this macro,
+	 * then you would need a change to PSM_TIDLIST_BUFSIZE as well).
+	 *
+	 * Assuming a 4KB page size, to be able to receive
+	 * a message of 4MB size, we'd need a maximum of 4MB/4KB = 1024 tids.
+	 */
+	psmi_assert(num_tids > 0);
+	psmi_assert(num_tids <= (PSM_TID_WINSIZE/PSM_TIDLIST_BUFSIZE));
+	if (reglen > pagelen) {
+		err = psmi_handle_error(protoexp->tidc.context->ep,
+			    PSM2_EP_DEVICE_FAILURE,
+			    "PSM tid registration: "
+			    "register more pages than asked");
+		goto fail;
+	} else if (reglen < pagelen) {
+		/*
+		 * driver registered fewer pages, update PSM records.
+		 */
+		tid_list->tsess_unaligned_end = 0;
+		tidrecvc->recv_tidbytes = reglen - pageoff;
+		tidrecvc->recv_msglen = tid_list->tsess_unaligned_start +
+		    tidrecvc->recv_tidbytes;
+	} else {
+		tidrecvc->recv_tidbytes = size;
+		tidrecvc->recv_msglen = nbytes_this;
+	}
+
+	tid_list->tsess_tidcount = num_tids;
+	tid_list->tsess_tidoffset = tidoff;
+
+	ips_dump_tids(tid_list, "Registered %d tids: ", num_tids);
+
+fail:
+	return err;
+}
+
+static
+psm2_error_t
+ips_tid_recv_alloc(struct ips_protoexp *protoexp,
+		   ips_epaddr_t *ipsaddr,
+		   const struct ips_tid_get_request *getreq,
+		   uint32_t nbytes_this, struct ips_tid_recv_desc **ptidrecvc)
+{
+	psm2_error_t err;
+	ips_scb_t *grantscb, *completescb;
+	struct ips_tid_recv_desc *tidrecvc;
+
+	PSM2_LOG_MSG("entering");
+	/* Allocate all necessary resources. */
+
+	/* 1.
allocate a tid grant scb. */ + grantscb = ips_scbctrl_alloc(&protoexp->tid_scbc_rv, 1, 0, 0); + if (grantscb == NULL) { + /* ips_tid_scbavail_callback() will reschedule */ + PSM2_LOG_MSG("leaving"); + return PSM2_EP_NO_RESOURCES; + } + + /* 2. allocate a tid complete scb. */ + completescb = ips_scbctrl_alloc(&protoexp->tid_scbc_rv, 1, 0, 0); + if (completescb == NULL) { + ips_scbctrl_free(grantscb); + /* ips_tid_scbavail_callback() will reschedule */ + PSM2_LOG_MSG("leaving"); + return PSM2_EP_NO_RESOURCES; + } + + /* 3. allocate a tid flow entry. */ + err = ips_tf_allocate(&protoexp->tfc, &tidrecvc); + if (err != PSM2_OK) { + ips_scbctrl_free(completescb); + ips_scbctrl_free(grantscb); + /* Unable to get a tidflow for expected protocol. */ + psmi_timer_request(protoexp->timerq, + &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); + PSM2_LOG_MSG("leaving"); + return err; + } + +#ifdef PSM_CUDA + psm2_mq_req_t req = (psm2_mq_req_t)getreq->tidgr_ucontext; + + if (req->is_buf_gpu_mem) + tidrecvc->is_ptr_gpu_backed = !getreq->cuda_hostbuf_used; + else + tidrecvc->is_ptr_gpu_backed = req->is_buf_gpu_mem; + + /* 4. allocate a cuda bounce buffer, if required */ + struct ips_cuda_hostbuf *chb = NULL; + if (getreq->cuda_hostbuf_used) { + if (nbytes_this <= CUDA_SMALLHOSTBUF_SZ) + chb = (struct ips_cuda_hostbuf *) + psmi_mpool_get( + protoexp->cuda_hostbuf_pool_small_recv); + if (chb == NULL) + chb = (struct ips_cuda_hostbuf *) + psmi_mpool_get( + protoexp->cuda_hostbuf_pool_recv); + if (chb == NULL) { + /* Unable to get a cudahostbuf for TID. + * Release the resources we're holding and reschedule.*/ + ips_tf_deallocate(&protoexp->tfc, + tidrecvc->rdescid._desc_idx); + ips_scbctrl_free(completescb); + ips_scbctrl_free(grantscb); + psmi_timer_request(protoexp->timerq, + &protoexp->timer_getreqs, + PSMI_TIMER_PRIO_1); + PSM2_LOG_MSG("leaving"); + return PSM2_EP_NO_RESOURCES; + } + + tidrecvc->cuda_hostbuf = chb; + tidrecvc->buffer = chb->host_buf; + chb->size = 0; + chb->gpu_buf = (CUdeviceptr) getreq->tidgr_lbuf + + getreq->tidgr_offset; + } else { + chb = NULL; + tidrecvc->buffer = (void *)((uintptr_t) getreq->tidgr_lbuf + + getreq->tidgr_offset); + tidrecvc->cuda_hostbuf = NULL; + } +#else + tidrecvc->buffer = + (void *)((uintptr_t) getreq->tidgr_lbuf + getreq->tidgr_offset); +#endif + + /* 5. allocate some tids from driver. */ + err = ips_tid_recv_alloc_frag(protoexp, tidrecvc, nbytes_this); + if (err != PSM2_OK) { +#ifdef PSM_CUDA + if (chb) + psmi_mpool_put(chb); +#endif + ips_tf_deallocate(&protoexp->tfc, tidrecvc->rdescid._desc_idx); + ips_scbctrl_free(completescb); + ips_scbctrl_free(grantscb); + /* Unable to register tids */ + psmi_timer_request(protoexp->timerq, + &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); + PSM2_LOG_MSG("leaving"); + return err; + } + + if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) { + int num_tids = tidrecvc->tid_list.tsess_tidcount; + int tid, i; + for (i = 0; i < num_tids; i++) { + tid = + IPS_TIDINFO_GET_TID(tidrecvc->tid_list. + tsess_list[i]) * 2 + + IPS_TIDINFO_GET_TIDCTRL(tidrecvc->tid_list. 
+ tsess_list[i]) - 1; + psmi_assert(protoexp->tid_info[tid].state == + TIDSTATE_FREE); + psmi_assert(protoexp->tid_info[tid].tidrecvc == NULL); + psmi_assert(protoexp->tid_info[tid].tid == 0xFFFFFFFF); + protoexp->tid_info[tid].state = TIDSTATE_USED; + protoexp->tid_info[tid].tidrecvc = tidrecvc; + protoexp->tid_info[tid].tid = + tidrecvc->tid_list.tsess_list[i]; + } + } + + /* Initialize recv descriptor */ + tidrecvc->ipsaddr = ipsaddr; + tidrecvc->getreq = (struct ips_tid_get_request *)getreq; + + /* Initialize tidflow, instead calling generic routine: + ips_flow_init(&tidrecvc->tidflow, protoexp->proto, ipsaddr, + protoexp->ctrl_xfer_type, PSM_PROTOCOL_TIDFLOW, + IPS_PATH_LOW_PRIORITY, EP_FLOW_TIDFLOW); + * only reset following necessary field. */ + tidrecvc->tidflow.ipsaddr = ipsaddr; + tidrecvc->tidflow.flags = 0; + + tidrecvc->tidflow_nswap_gen = 0; + tidrecvc->tidflow_genseq.psn_gen = tidrecvc->tidflow_active_gen; + tidrecvc->tidflow_genseq.psn_seq = 0; /* Always start sequence number at 0 (zero), + in order to prevent wraparound sequence numbers */ + psmi_hal_tidflow_set_entry( + tidrecvc->rdescid._desc_idx, + tidrecvc->tidflow_genseq.psn_gen, + tidrecvc->tidflow_genseq.psn_seq, + tidrecvc->context->psm_hw_ctxt); + + tidrecvc->tid_list.tsess_srcoff = getreq->tidgr_offset; + tidrecvc->tid_list.tsess_length = tidrecvc->recv_msglen; + + tidrecvc->ctrl_msg_queued = 0; + tidrecvc->state = TIDRECVC_STATE_BUSY; + + tidrecvc->stats.nSeqErr = 0; + tidrecvc->stats.nGenErr = 0; + tidrecvc->stats.nReXmit = 0; + tidrecvc->stats.nErrChkReceived = 0; + + /* This gets sent out as a control message, so we need to force 4-byte IB + * alignment */ + tidrecvc->tsess_tidlist_length = (uint16_t) + PSMI_ALIGNUP((sizeof(ips_tid_session_list) + + (tidrecvc->tid_list.tsess_tidcount * + sizeof(uint32_t))), 4); + + _HFI_EXP("alloc tidrecv=%d, paylen=%d, ntid=%d\n", + tidrecvc->rdescid._desc_idx, + tidrecvc->tsess_tidlist_length, + tidrecvc->tid_list.tsess_tidcount); + + tidrecvc->grantscb = grantscb; + tidrecvc->completescb = completescb; + + *ptidrecvc = tidrecvc; /* return to caller */ + PSM2_LOG_MSG("leaving"); + return PSM2_OK; +} + +static +psm2_error_t +ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current) +{ + struct ips_tid_get_pend *phead = + &((struct ips_protoexp *)timer->context)->pend_getreqsq; + struct ips_protoexp *protoexp; + struct ips_tid_get_request *getreq; + struct ips_tid_recv_desc *tidrecvc; + ips_epaddr_t *ipsaddr; + uint32_t nbytes_this, count; + int ret; + + PSM2_LOG_MSG("entering"); + +#ifdef PSM_CUDA + if (!(((struct ips_protoexp *)timer->context)->proto->flags + & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV) || + ((((struct ips_protoexp *)timer->context)->proto->flags & + IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV) && + gpudirect_recv_threshold)) { + /* Before processing pending TID requests, first try to free up + * any CUDA host buffers that are now idle. */ + struct ips_tid_get_cudapend *cphead = + &((struct ips_protoexp *)timer->context)->cudapend_getreqsq; + psm2_error_t err; + + /* See if any CUDA memcpys are in progress. Grab the first getreq... */ + while (!STAILQ_EMPTY(cphead)) { + getreq = STAILQ_FIRST(cphead); + + err = psmi_cuda_reclaim_hostbufs(getreq); + if (err == PSM2_OK_NO_PROGRESS) + goto cudapend_exit; + + /* This pending cuda getreq has no more CUDA ops queued up. + * Either it's completely done, or the CUDA copies have caught + * up with the TID data xfer, but the TID xfer itself is not + * finished. 
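+		 * (Concretely, in the check below: tidgr_cuda_bytesdone ==
+		 * tidgr_length means the getreq is fully done and can be
+		 * reclaimed; anything less means CUDA copies are still
+		 * pending and the queue head is left alone for now.)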
+		 */
+		if (getreq->tidgr_cuda_bytesdone == getreq->tidgr_length) {
+			/* TID xfer is done.
+			 * We should only get here if:
+			 * this involved a cuda copy, and
+			 * the TID xfer is done.
+			 */
+			psmi_assert(getreq->cuda_hostbuf_used);
+			psmi_assert(getreq->tidgr_length ==
+				    getreq->tidgr_offset);
+
+			/* Remove from the cudapend list, and reclaim */
+			getreq->tidgr_protoexp = NULL;
+			getreq->tidgr_epaddr = NULL;
+			STAILQ_REMOVE_HEAD(cphead, tidgr_next);
+
+			/* mark the req as done */
+			if (getreq->tidgr_callback)
+				getreq->tidgr_callback(getreq->tidgr_ucontext);
+			psmi_mpool_put(getreq);
+		} else
+			break;	/* CUDA xfers in progress. Leave. */
+		}
+	}
+cudapend_exit:
+#endif
+
+	while (!STAILQ_EMPTY(phead)) {
+		getreq = STAILQ_FIRST(phead);
+		ipsaddr = (ips_epaddr_t *) (getreq->tidgr_epaddr);
+		count = ipsaddr->msgctl->ipsaddr_count;
+
+ipsaddr_next:
+		ipsaddr = ipsaddr->msgctl->ipsaddr_next;
+		ipsaddr->msgctl->ipsaddr_next = ipsaddr->next;
+		protoexp = ((psm2_epaddr_t) ipsaddr)->proto->protoexp;
+
+		if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_CTS_SERIALIZED) {
+			psmi_assert(protoexp->proto->msgflowid < EP_FLOW_LAST);
+			struct ips_flow *flow = &ipsaddr->flows[protoexp->proto->msgflowid];
+			if (flow->flags & IPS_FLOW_FLAG_SKIP_CTS) {
+				break;	/* skip sending next CTS */
+			}
+		}
+
+#ifdef PSM_CUDA
+		if (getreq->cuda_hostbuf_used) {
+			/* If this is a large transfer, we may be able to
+			 * start reclaiming before all of the data is sent. */
+			psmi_cuda_reclaim_hostbufs(getreq);
+		}
+#endif
+		/*
+		 * Calculate the next window size, avoiding a too-small
+		 * last window.
+		 */
+		nbytes_this = getreq->tidgr_length - getreq->tidgr_offset;
+		if (nbytes_this >= 2 * getreq->tidgr_rndv_winsz)
+			nbytes_this = getreq->tidgr_rndv_winsz;
+		else if (nbytes_this > getreq->tidgr_rndv_winsz)
+			nbytes_this /= 2;
+
+		/*
+		 * If there is a next window and the next window
+		 * length is greater than PAGESIZE, make sure the window
+		 * starts on a page boundary.
+		 */
+#ifdef PSM_CUDA
+		psm2_mq_req_t req = (psm2_mq_req_t)getreq->tidgr_ucontext;
+		if (req->is_buf_gpu_mem){
+			if (((getreq->tidgr_offset + nbytes_this) <
+					getreq->tidgr_length) &&
+					nbytes_this > PSMI_GPU_PAGESIZE) {
+				uint32_t pageoff =
+					(((uintptr_t)getreq->tidgr_lbuf) &
+						(PSMI_GPU_PAGESIZE - 1)) +
+					getreq->tidgr_offset + nbytes_this;
+				nbytes_this -= pageoff & (PSMI_GPU_PAGESIZE - 1);
+			}
+		} else {
+#endif
+			if ((getreq->tidgr_offset + nbytes_this) <
+					getreq->tidgr_length &&
+					nbytes_this > PSMI_PAGESIZE) {
+				uint32_t pageoff =
+					(((uintptr_t)getreq->tidgr_lbuf) &
+						(PSMI_PAGESIZE - 1)) +
+					getreq->tidgr_offset + nbytes_this;
+				nbytes_this -= pageoff & (PSMI_PAGESIZE - 1);
+			}
+#ifdef PSM_CUDA
+		}
+#endif
+
+		psmi_assert(nbytes_this >= 4);
+		psmi_assert(nbytes_this <= PSM_TID_WINSIZE);
+
+		if ((ret = ips_tid_num_available(&protoexp->tidc)) <= 0) {
+			/* We're out of tids. If this process used all the resource,
+			 * the free callback will reschedule the operation, otherwise,
+			 * we reschedule it here */
+			if (ret == 0)
+			{
+				psmi_timer_request(protoexp->timerq,
+						   &protoexp->timer_getreqs,
+						   PSMI_TIMER_PRIO_1);
+			}
+		} else if ((ret = ips_tf_available(&protoexp->tfc)) <= 0) {
+			/* We're out of tidflow.
If this process used all the resource, + * the free callback will reschedule the operation, otherwise, + * we reschedule it here */ + if (ret == 0) + { + psmi_timer_request(protoexp->timerq, + &protoexp->timer_getreqs, + PSMI_TIMER_PRIO_1); + } + } else if (ips_tid_recv_alloc(protoexp, ipsaddr, + getreq, nbytes_this, &tidrecvc) == PSM2_OK) { + ips_protoexp_send_tid_grant(tidrecvc); + + if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_CTS_SERIALIZED) { + /* + * Once the CTS was sent, we mark it per 'flow' object + * not to proceed with next CTSes until that one is done. + */ + struct ips_proto *proto = tidrecvc->protoexp->proto; + psmi_assert(proto->msgflowid < EP_FLOW_LAST); + struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid]; + flow->flags |= IPS_FLOW_FLAG_SKIP_CTS; + } + + /* + * nbytes_this is the asked length for this session, + * ips_tid_recv_alloc() might register less pages, the + * real length is in tidrecvc->recv_msglen. + */ + getreq->tidgr_offset += tidrecvc->recv_msglen; + psmi_assert(getreq->tidgr_offset <= + getreq->tidgr_length); + _HFI_VDBG("GRANT tididx=%d srcoff=%d nbytes=%d/%d\n", + tidrecvc->rdescid._desc_idx, + getreq->tidgr_offset, tidrecvc->recv_msglen, + getreq->tidgr_length); + + if (getreq->tidgr_offset == getreq->tidgr_length) { +#ifdef PSM_CUDA + if (getreq->cuda_hostbuf_used) { + /* this completes the tid xfer setup. + move to the pending cuda ops queue, + set the timer to catch completion */ + STAILQ_REMOVE_HEAD(phead, tidgr_next); + STAILQ_INSERT_TAIL( + &getreq->tidgr_protoexp->cudapend_getreqsq, + getreq, tidgr_next); + psmi_timer_request(getreq->tidgr_protoexp->timerq, + &getreq->tidgr_protoexp->timer_getreqs, + PSMI_TIMER_PRIO_1); + continue; + } +#endif + getreq->tidgr_protoexp = NULL; + getreq->tidgr_epaddr = NULL; + STAILQ_REMOVE_HEAD(phead, tidgr_next); + continue; /* try next grant request */ + } + else if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_RTS_CTS_INTERLEAVE) { + /* In case of multi rail, PSM sends one CTS per request + * per card after which the request is moved to the end + * of the queue. + */ + count--; + if (count) + goto ipsaddr_next; + STAILQ_REMOVE_HEAD(phead, tidgr_next); + STAILQ_INSERT_TAIL(phead, getreq ,tidgr_next); + continue; + } + + /* created a tidrecvc, reset count */ + count = ipsaddr->msgctl->ipsaddr_count; + goto ipsaddr_next; /* try next fragment on next ipsaddr */ + } + + /* + * We need to loop until we can't get a tidrecvc on all + * ipsaddrs, then the callbacks on the home protoexp where + * getreq is linked can resume this routine. Otherwise, we + * might make this getreq to be orphaned and cause deadlock. 
+ */ + count--; + if (count) + goto ipsaddr_next; + break; + } + PSM2_LOG_MSG("leaving"); + return PSM2_OK; /* XXX err-broken */ +} + +#ifdef PSM_CUDA +static +void psmi_cudamemcpy_tid_to_device(struct ips_tid_recv_desc *tidrecvc) +{ + struct ips_protoexp *protoexp = tidrecvc->protoexp; + struct ips_cuda_hostbuf *chb; + + chb = tidrecvc->cuda_hostbuf; + chb->size += tidrecvc->recv_tidbytes + tidrecvc->tid_list.tsess_unaligned_start + + tidrecvc->tid_list.tsess_unaligned_end; + + PSMI_CUDA_CALL(cuMemcpyHtoDAsync, + chb->gpu_buf, chb->host_buf, + tidrecvc->recv_tidbytes + tidrecvc->tid_list.tsess_unaligned_start + + tidrecvc->tid_list.tsess_unaligned_end, + protoexp->cudastream_recv); + PSMI_CUDA_CALL(cuEventRecord, chb->copy_status, + protoexp->cudastream_recv); + + STAILQ_INSERT_TAIL(&tidrecvc->getreq->pend_cudabuf, chb, next); + tidrecvc->cuda_hostbuf = NULL; + ips_tid_pendtids_timer_callback(&tidrecvc->getreq->tidgr_protoexp->timer_getreqs,0); +} +#endif + +static +psm2_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc) +{ + struct ips_protoexp *protoexp = tidrecvc->protoexp; + struct ips_tid_get_request *getreq = tidrecvc->getreq; + int tidcount = tidrecvc->tid_list.tsess_tidcount; + psm2_error_t err = PSM2_OK; + + psmi_assert(getreq != NULL); + psmi_assert(tidcount > 0); + psmi_assert(tidrecvc->state == TIDRECVC_STATE_BUSY); + +#ifdef PSM_CUDA + if (tidrecvc->cuda_hostbuf) + psmi_cudamemcpy_tid_to_device(tidrecvc); +#endif + + if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) { + int tid, i; + + for (i = 0; i < tidcount; i++) { + tid = + IPS_TIDINFO_GET_TID(tidrecvc->tid_list. + tsess_list[i]) * 2 + + IPS_TIDINFO_GET_TIDCTRL(tidrecvc->tid_list. + tsess_list[i]) - 1; + psmi_assert(protoexp->tid_info[tid].state == + TIDSTATE_USED); + psmi_assert(protoexp->tid_info[tid].tidrecvc == + tidrecvc); + psmi_assert(protoexp->tid_info[tid].tid == + tidrecvc->tid_list.tsess_list[i]); + protoexp->tid_info[tid].state = TIDSTATE_FREE; + protoexp->tid_info[tid].tidrecvc = NULL; + protoexp->tid_info[tid].tid = 0xFFFFFFFF; + } + } + + ips_dump_tids(&tidrecvc->tid_list, "Deregistered %d tids: ", + tidrecvc->tid_list.tsess_tidcount); + + if (protoexp->tidc.tid_array) { + if ((err = ips_tidcache_release(&protoexp->tidc, + tidrecvc->tid_list.tsess_list, tidcount))) + goto fail; + } else { + if ((err = ips_tid_release(&protoexp->tidc, + tidrecvc->tid_list.tsess_list, tidcount))) + goto fail; + } + + getreq->tidgr_bytesdone += tidrecvc->recv_msglen; + + _HFI_EXP("req=%p bytes=%d/%d\n", + getreq->tidgr_ucontext, + getreq->tidgr_bytesdone, getreq->tidgr_length); + + tidrecvc->state = TIDRECVC_STATE_FREE; + + /* finally free the tidflow */ + ips_tf_deallocate(&protoexp->tfc, tidrecvc->rdescid._desc_idx); + + if (getreq->tidgr_bytesdone == getreq->tidgr_length) { +#ifdef PSM_CUDA + /* if cuda, we handle callbacks when the cuda xfer is done */ + if (!getreq->cuda_hostbuf_used) { + if (getreq->tidgr_callback) + getreq->tidgr_callback(getreq->tidgr_ucontext); + psmi_mpool_put(getreq); + } +#else + if (getreq->tidgr_callback) + getreq->tidgr_callback(getreq->tidgr_ucontext); + psmi_mpool_put(getreq); +#endif + } else { + /* We just released some tids. 
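+	 * (The consumer is ips_tid_pendtids_timer_callback(), which retries
+	 * pending getreqs once ips_tid_num_available() reports free TIDs
+	 * again.)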
+ * If requests are waiting on tids to be + * freed, queue up the timer */ + if (getreq->tidgr_offset < getreq->tidgr_length) { + ips_tid_pendtids_timer_callback(&getreq-> + tidgr_protoexp-> + timer_getreqs, 0); + } + } + + if (!STAILQ_EMPTY(&protoexp->pend_getreqsq)) { + psmi_timer_request(protoexp->timerq, + &protoexp->timer_getreqs, + PSMI_TIMER_PRIO_1); + } + +fail: + return err; +} + +void +ips_protoexp_handle_tiderr(const struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_tid_recv_desc *tidrecvc; + struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + + ptl_arg_t desc_id; + int tidpair = (__le32_to_cpu(p_hdr->khdr.kdeth0) >> + HFI_KHDR_TID_SHIFT) & HFI_KHDR_TID_MASK; + int tidctrl = (__le32_to_cpu(p_hdr->khdr.kdeth0) >> + HFI_KHDR_TIDCTRL_SHIFT) & HFI_KHDR_TIDCTRL_MASK; + int tid0, tid1, tid; + + psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID); + + /* Expected sends not enabled */ + if (protoexp == NULL) + return; + + /* Not doing extra tid debugging or not really a tiderr */ + if (!(protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) || + !(psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf) & PSMI_HAL_RHF_ERR_TID)) + return; + + if (psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf) != PSM_HAL_RHF_RX_TYPE_EXPECTED) { + _HFI_ERROR("receive type %d is not " + "expected in tid debugging\n", psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf)); + return; + } + + desc_id._desc_idx = ips_proto_flowid(p_hdr); + desc_id._desc_genc = p_hdr->exp_rdescid_genc; + + tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx]; + + if (tidctrl != 3) + tid0 = tid1 = tidpair * 2 + tidctrl - 1; + else { + tid0 = tidpair * 2; + tid1 = tid0 + 1; + } + + for (tid = tid0; tid <= tid1; tid++) { + if (protoexp->tid_info[tid].state == TIDSTATE_USED) + continue; + + char buf[128]; + char *s = "invalid (not even in table)"; + + if (tidrecvc->rdescid._desc_genc == + desc_id._desc_genc) + s = "valid"; + else { + snprintf(buf, sizeof(buf) - 1, + "wrong generation (gen=%d,received=%d)", + tidrecvc->rdescid._desc_genc, + desc_id._desc_genc); + buf[sizeof(buf) - 1] = '\0'; + s = buf; + } + + if (protoexp->tid_info[tid].tidrecvc != tidrecvc) { + _HFI_ERROR + ("tid %d not a known member of tidsess %d\n", + tid, desc_id._desc_idx); + } + + _HFI_ERROR("tid %d is marked unused (session=%d): %s\n", tid, + desc_id._desc_idx, s); + } + return; +} + +void +ips_protoexp_handle_data_err(const struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_tid_recv_desc *tidrecvc; + struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + int hdr_err = psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf) & PSMI_HAL_RHF_ERR_KHDRLEN; + uint8_t op_code = _get_proto_hfi_opcode(p_hdr); + char pktmsg[128]; + char errmsg[256]; + + psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID); + + /* Expected sends not enabled */ + if (protoexp == NULL) + return; + + ips_proto_get_rhf_errstring(psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf), pktmsg, + sizeof(pktmsg)); + + snprintf(errmsg, sizeof(errmsg), + "%s pkt type opcode 0x%x at hd=0x%x %s\n", + (psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf) == PSM_HAL_RHF_RX_TYPE_EAGER) ? "Eager" : + (psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf) == PSM_HAL_RHF_RX_TYPE_EXPECTED) ? "Expected" : + (psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf) == PSM_HAL_RHF_RX_TYPE_NON_KD) ? 
"Non-kd" : + "", op_code, rcv_ev->recvq->state->hdrq_head, + pktmsg); + + if (!hdr_err) { + ptl_arg_t desc_id; + psmi_seqnum_t sequence_num; + + desc_id._desc_idx = ips_proto_flowid(p_hdr); + desc_id._desc_genc = p_hdr->exp_rdescid_genc; + + tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx]; + + if (tidrecvc->rdescid._desc_genc != desc_id._desc_genc) { + /* Print this at very verbose level. Noisy links can have a few of + * these! */ + _HFI_VDBG + ("Data Error Pkt and Recv Generation Mismatch: %s", + errmsg); + return; /* skip */ + } + + if (tidrecvc->state == TIDRECVC_STATE_FREE) { + _HFI_EPDBG + ("Data Error Pkt for a Completed Rendezvous: %s", + errmsg); + return; /* skip */ + } + + /* See if CRC error for a previous packet */ + sequence_num.psn_val = __be32_to_cpu(p_hdr->bth[2]); + if (sequence_num.psn_gen == tidrecvc->tidflow_genseq.psn_gen) { + /* Try to recover the flow by restarting from previous known good + * sequence (possible if the packet with CRC error is after the "known + * good PSN" else we can't restart the flow. + */ + return ips_protoexp_do_tf_seqerr(protoexp, + tidrecvc, p_hdr); + } else { + /* Print this at very verbose level */ + _HFI_VDBG + ("Data Error Packet. GenMismatch: Yes. Tidrecvc: %p. " + "Pkt Gen.Seq: %d.%d, TF Gen.Seq: %d.%d. %s\n", + tidrecvc, sequence_num.psn_gen, + sequence_num.psn_seq, + tidrecvc->tidflow_genseq.psn_gen, + tidrecvc->tidflow_genseq.psn_seq, errmsg); + } + + } else { + _HFI_VDBG("HDR_ERROR: %s\n", errmsg); + } + +} + +psm2_error_t +ips_protoexp_flow_newgen(struct ips_tid_recv_desc *tidrecvc) +{ + psmi_assert_always(tidrecvc->state == TIDRECVC_STATE_BUSY); + ips_tfgen_allocate(&tidrecvc->protoexp->tfc, + tidrecvc->rdescid._desc_idx, + &tidrecvc->tidflow_active_gen); + + /* Update tidflow table with new generation number */ + tidrecvc->tidflow_genseq.psn_gen = tidrecvc->tidflow_active_gen; + psmi_hal_tidflow_set_entry( + tidrecvc->rdescid._desc_idx, + tidrecvc->tidflow_genseq.psn_gen, + tidrecvc->tidflow_genseq.psn_seq, + tidrecvc->context->psm_hw_ctxt); + + /* Increment swapped generation count for tidflow */ + tidrecvc->tidflow_nswap_gen++; + return PSM2_OK; +} + +void +ips_protoexp_handle_tf_seqerr(const struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + struct ips_tid_recv_desc *tidrecvc; + ptl_arg_t desc_id; + + psmi_assert_always(protoexp != NULL); + psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID); + + desc_id._desc_idx = ips_proto_flowid(p_hdr); + desc_id._desc_genc = p_hdr->exp_rdescid_genc; + + tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx]; + + if (tidrecvc->rdescid._desc_genc == desc_id._desc_genc + && tidrecvc->state == TIDRECVC_STATE_BUSY) + ips_protoexp_do_tf_seqerr(protoexp, tidrecvc, p_hdr); + + return; +} + +static +void ips_protoexp_do_tf_seqerr(void *vpprotoexp + /* actually: struct ips_protoexp *protoexp */, + void *vptidrecvc + /* actually: struct ips_tid_recv_desc *tidrecvc */, + struct ips_message_header *p_hdr) +{ + struct ips_protoexp *protoexp = (struct ips_protoexp *) vpprotoexp; + struct ips_tid_recv_desc *tidrecvc = (struct ips_tid_recv_desc *) vptidrecvc; + psmi_seqnum_t sequence_num, tf_sequence_num; + ips_scb_t ctrlscb; + + /* Update stats for sequence errors */ + tidrecvc->stats.nSeqErr++; + + sequence_num.psn_val = __be32_to_cpu(p_hdr->bth[2]); + + /* Only care about sequence error for currently active generation */ + if (tidrecvc->tidflow_active_gen != sequence_num.psn_gen) + return; + 
+	/* If a "large" number of generations have been swapped, we are losing
+	 * packets for this flow. Request throttling of tidflow by generating a
+	 * BECN. With header suppression we will miss some FECN packets
+	 * on OPA, hence keeping track of swapped generations is another
+	 * mechanism to do congestion control for tidflows.
+	 *
+	 * For mismatched sender/receiver/link speeds we can get into a
+	 * deadly embrace where minimal progress is made due to generation
+	 * mismatch errors. This can occur if we wrap around the generation
+	 * count without making progress. Hence in cases where the swapped
+	 * generation count is > 254 stop sending BECN (and the NAK) so the
+	 * sender -> receiver pipeline is flushed with an error check and things
+	 * can sync up. This should be an extremely rare event.
+	 */
+
+	if_pf(tidrecvc->tidflow_nswap_gen >= 254)
+		return;	/* Do not send NAK. Let error check kick in. */
+
+	if_pf((tidrecvc->tidflow_nswap_gen > 4) &&
+	      (protoexp->proto->flags & IPS_PROTO_FLAG_CCA)) {
+		_HFI_CCADBG("Generating BECN. Number of swapped gen: %d.\n",
+			    tidrecvc->tidflow_nswap_gen);
+		/* Mark flow to generate BECN in control packet */
+		tidrecvc->tidflow.flags |= IPS_FLOW_FLAG_GEN_BECN;
+
+		/* Update stats for congestion encountered */
+		protoexp->proto->epaddr_stats.congestion_pkts++;
+	}
+
+	/* Get the latest seq from hardware tidflow table, if that value is
+	 * reliable. The value is not reliable if context sharing is used,
+	 * because context sharing might drop packet even though hardware
+	 * has received it successfully. The hardware table may also be
+	 * incorrect if RSM is intercepting TID & FECN & SH packets.
+	 * We can handle this condition by taking the most recent PSN whether
+	 * it comes from the tidflow table or from PSM's own accounting.
+	 */
+	if (!tidrecvc->context->tf_ctrl) {
+		uint64_t tf;
+		uint32_t seqno=0;
+
+		psmi_hal_tidflow_get(tidrecvc->rdescid._desc_idx, &tf,
+				     tidrecvc->context->psm_hw_ctxt);
+		psmi_hal_tidflow_get_seqnum(tf, &seqno);
+		tf_sequence_num.psn_val = seqno;
+
+		if (psmi_hal_has_cap(PSM_HAL_CAP_RSM_FECN_SUPP)) {
+			if (tf_sequence_num.psn_val > tidrecvc->tidflow_genseq.psn_seq)
+				tidrecvc->tidflow_genseq.psn_seq = tf_sequence_num.psn_seq;
+		}
+		else
+			tidrecvc->tidflow_genseq.psn_seq = tf_sequence_num.psn_seq;
+	}
+
+	/* Swap generation for the flow. */
+	ips_protoexp_flow_newgen(tidrecvc);
+
+	ctrlscb.scb_flags = 0;
+	ctrlscb.ips_lrh.data[0] = p_hdr->exp_sdescid;
+	/* Keep peer generation but use my last received sequence */
+	sequence_num.psn_seq = tidrecvc->tidflow_genseq.psn_seq;
+	ctrlscb.ips_lrh.ack_seq_num = sequence_num.psn_val;
+
+	/* My new generation and last received sequence */
+	ctrlscb.ips_lrh.data[1].u32w0 = tidrecvc->tidflow_genseq.psn_val;
+
+	ips_proto_send_ctrl_message(&tidrecvc->tidflow,
+				    OPCODE_NAK,
+				    &tidrecvc->ctrl_msg_queued,
+				    &ctrlscb, ctrlscb.cksum, 0);
+
+	/* Update stats for retransmit */
+	tidrecvc->stats.nReXmit++;
+
+	return;
+}
+
+void
+ips_protoexp_handle_tf_generr(const struct ips_recvhdrq_event *rcv_ev)
+{
+	struct ips_protoexp *protoexp = rcv_ev->proto->protoexp;
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	struct ips_tid_recv_desc *tidrecvc;
+	ptl_arg_t desc_id;
+
+	psmi_assert_always(protoexp != NULL);
+	psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID);
+
+	/* For a generation error our NAK crossed on the wire or this is a stale
+	 * packet. Error recovery should sync things up again. Just drop this
+	 * packet.
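+	 * (The handler below, ips_protoexp_do_tf_generr(), therefore only
+	 * bumps the nGenErr counter; actual recovery is driven by the
+	 * sequence-error path above.)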
+ */ + desc_id._desc_idx = ips_proto_flowid(p_hdr); + desc_id._desc_genc = p_hdr->exp_rdescid_genc; + + tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx]; + + if (tidrecvc->rdescid._desc_genc == desc_id._desc_genc + && tidrecvc->state == TIDRECVC_STATE_BUSY) + ips_protoexp_do_tf_generr(protoexp, tidrecvc, p_hdr); + + return; +} + +static +void ips_protoexp_do_tf_generr(void *vpprotoexp + /* actually: struct ips_protoexp *protoexp */, + void *vptidrecvc + /* actually: struct ips_tid_recv_desc *tidrecvc */, + struct ips_message_header *p_hdr) +{ + struct ips_tid_recv_desc *tidrecvc = (struct ips_tid_recv_desc *) vptidrecvc; + /* Update stats for generation errors */ + tidrecvc->stats.nGenErr++; + + /* If packet faced congestion we may want to generate + * a CN packet to rate control sender. + */ + + return; +} diff --git a/ptl_ips/ips_proto_header.h b/ptl_ips/ips_proto_header.h new file mode 100644 index 0000000..6677162 --- /dev/null +++ b/ptl_ips/ips_proto_header.h @@ -0,0 +1,181 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _IPS_PROTO_HEADER_H +#define _IPS_PROTO_HEADER_H + +/* Although defined as macros, the *_BITS values below are NOT meant to be + changed. They are defined this way so that their values are written in + exactly one place. 
These macros are used in struct ips_message_header
+   below, as well as in the active messages code for the purpose of
+   establishing how many arguments/handlers are supported, and to assert that
+   values written into the header fields are not too large for the number of
+   bits available. The preprocessor check below ensures less than 32 bits are
+   used.
+ */
+
+/* Number of bits to use for the amhdr_len field. */
+#define IPS_AM_HDR_LEN_BITS 4
+
+/* Number of bits to use for the amhdr_hidx field. Bounds the number of
+ * handlers supported (1 << IPS_AM_HDR_HIDX_BITS). */
+#define IPS_AM_HDR_HIDX_BITS 8
+
+/* Number of bits to use for the amhdr_nargs field. Bounds the number of
+   arguments supported (1 << IPS_AM_HDR_NARGS_BITS). */
+#define IPS_AM_HDR_NARGS_BITS 4
+
+#if (IPS_AM_HDR_LEN_BITS + IPS_AM_HDR_HIDX_BITS + IPS_AM_HDR_NARGS_BITS) > 32
+#error "Bad IPS header definition: AM fields must use 32 bits or less"
+#endif
+
+/* Number of AM arguments that can be packed into struct ips_message_header.
+   Remaining arguments up to the max (1 << IPS_AM_HDR_NARGS_BITS) are placed in
+   the data payload. */
+#define IPS_AM_HDR_NARGS \
+	(sizeof(((struct ips_message_header *)0)->data) / sizeof(psm2_amarg_t))
+
+/* The actual size of the message header is determined by three parameters:
+ * IPS_HEADER_QUEUE_IWORDS (fixed at 5 by hardware)
+ *    OPA words contain LRH and BTH
+ * IPS_HEADER_QUEUE_HWORDS (fixed at 2 by ips protocol)
+ *    IPS hardware words contain ips-protocol-specific data
+ * IPS_HEADER_QUEUE_UWORDS (fixed at 7 by ips protocol)
+ *    IPS user words contain ips-protocol-specific data
+ *
+ * The header message size is determined as IWORDS + HWORDS + UWORDS
+ */
+struct ips_message_header {
+	__be16 lrh[4];
+	__be32 bth[3];
+
+	/* fields below this point are in host byte order */
+	struct hfi_kdeth khdr;
+
+	struct {
+		__u32 flags:6;
+		__u32 connidx:26;	/* connection idx */
+	};
+
+	union {
+		struct {
+			struct {
+				__u32 ack_seq_num:31;
+				__u32 reserved:1;
+			};
+
+			union {
+				struct {	/* for active message */
+					__u32 amhdr_len:IPS_AM_HDR_LEN_BITS;
+					__u32 amhdr_nargs:IPS_AM_HDR_NARGS_BITS;
+					__u32 amhdr_hidx:IPS_AM_HDR_HIDX_BITS;
+				};
+				__u32 mdata;	/* for misc data */
+			};
+
+			/* Inline arguments and/or message payload */
+			union {
+				ptl_arg_t data[2];
+				__u32 uwords[4];
+			};
+		};
+
+		/* for message header packet only */
+		struct {
+			__u32 pad1;
+			__u32 tag[3];	/* 96 bits psm tag */
+			ptl_arg_t hdr_data;
+		};
+
+		/* for expected tid packet only */
+		struct {
+			__u8 exp_ustart[3];	/* unaligned start bytes */
+			__u8 exp_uend[3];	/* unaligned end bytes */
+			__u16 exp_rdescid_genc;	/* tidrecvc gen count */
+			ptl_arg_t exp_sdescid;	/* sender descriptor id */
+			__u32 exp_cksum;	/* optional checksum */
+			__u32 exp_offset;	/* packet offset */
+		};
+	};
+};
+
+/*
+ * OpCodes in BTH[0], 24-31 bits. Order is important!!!
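+ * The contiguous ordering matters because ips_proto_process_packet()
+ * (see ips_proto_help.h) dispatches by using (opcode - OPCODE_RESERVED)
+ * as an index into ips_packet_service_routine[], so gaps or reordering
+ * here would misroute packets.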
+ */ +#define OPCODE_RESERVED 0xC0 /* reserved */ +#define OPCODE_TINY 0xC1 /* 0 <= msglen <= 8 */ +#define OPCODE_SHORT 0xC2 /* 8 < msglen <= MTU */ +#define OPCODE_EAGER 0xC3 /* eager packet */ +#define OPCODE_LONG_RTS 0xC4 /* ready to send */ +#define OPCODE_LONG_CTS 0xC5 /* confirm to send */ +#define OPCODE_LONG_DATA 0xC6 /* long data packets */ +#define OPCODE_EXPTID 0xC7 /* expected tid data */ +#define OPCODE_EXPTID_COMPLETION 0xC8 /* expected tid completion */ +#define OPCODE_ACK 0xC9 /* explicit ACK packet */ +#define OPCODE_NAK 0xCA /* explicit NAK packet */ +#define OPCODE_BECN 0xCB /* congestion control */ +#define OPCODE_ERR_CHK 0xCC /* query eager receiving */ +#define OPCODE_ERR_CHK_GEN 0xCD /* query tid receiving */ +#define OPCODE_CONNECT_REQUEST 0xCE /* connect request */ +#define OPCODE_CONNECT_REPLY 0xCF /* connect reply */ +#define OPCODE_DISCONNECT_REQUEST 0xD0 /* disconnect request */ +#define OPCODE_DISCONNECT_REPLY 0xD1 /* disconnect reply */ +#define OPCODE_AM_REQUEST_NOREPLY 0xD2 /* AM request w/o reply */ +#define OPCODE_AM_REQUEST 0xD3 /* AM request */ +#define OPCODE_AM_REPLY 0xD4 /* AM reply */ +#define OPCODE_FUTURE_FROM 0xD5 /* reserved for expansion */ +#define OPCODE_FUTURE_TO 0xDF /* reserved for expansion */ + +#endif /* _IPS_PROTO_HEADER_H */ diff --git a/ptl_ips/ips_proto_help.h b/ptl_ips/ips_proto_help.h new file mode 100644 index 0000000..42567f5 --- /dev/null +++ b/ptl_ips/ips_proto_help.h @@ -0,0 +1,626 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2017 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2017 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _IPS_PROTO_HELP_H +#define _IPS_PROTO_HELP_H + +#include "ptl_ips.h" + +/* hfi_opcode is not the ips-level opcode. */ +PSMI_ALWAYS_INLINE( +uint8_t +_get_proto_hfi_opcode(const struct ips_message_header *p_hdr)) +{ + return ((__be32_to_cpu(p_hdr->bth[0]) >> + HFI_BTH_OPCODE_SHIFT) & HFI_BTH_OPCODE_MASK); +} + +PSMI_ALWAYS_INLINE( +uint8_t +ips_flow_gen_ackflags(ips_scb_t *scb, struct ips_flow *flow)) +{ + /* + * Setup ACK request if more than ack_interval packets + * have not been requested an ACK + */ + if (scb->scb_flags & IPS_SEND_FLAG_ACKREQ || scb->nfrag > 1) { + flow->ack_counter = 0; + } else { + flow->ack_counter++; + if (flow->ack_counter > flow->ack_interval) { + flow->ack_counter = 0; + scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; + } + } + + /* Bottom 6 bits wind up in protocol header fields, other bits + * control other aspects of packet composition */ + return (uint8_t) (scb->scb_flags & IPS_SEND_FLAG_PROTO_OPTS); +} + +PSMI_ALWAYS_INLINE( +ips_epaddr_flow_t +ips_proto_flowid(struct ips_message_header *p_hdr)) +{ + return (ips_epaddr_flow_t) ((__be32_to_cpu(p_hdr->bth[1]) >> + HFI_BTH_FLOWID_SHIFT) & + HFI_BTH_FLOWID_MASK); +} + +PSMI_ALWAYS_INLINE( +int +ips_do_cksum(struct ips_proto *proto, struct ips_message_header *p_hdr, + void *payload, uint32_t paylen, uint32_t *cksum)) +{ + uint16_t paywords; + + /* Update the payload words in header */ + paywords = (sizeof(struct ips_message_header) + paylen + + PSM_CRC_SIZE_IN_BYTES + HFI_CRC_SIZE_IN_BYTES) >> + BYTE2DWORD_SHIFT; + p_hdr->lrh[2] = __cpu_to_be16(paywords & HFI_LRH_PKTLEN_MASK); + + /* Need to regenerate KDETH checksum after updating payload length */ + /* ips_kdeth_cksum(p_hdr); */ + + *cksum = 0xffffffff; + + /* Checksum header */ + *cksum = ips_crc_calculate(sizeof(struct ips_message_header), + (uint8_t *) p_hdr, *cksum); + + /* Checksum payload (if any) */ + if (paylen) { + psmi_assert_always(payload); + *cksum = ips_crc_calculate(paylen, (uint8_t *) payload, *cksum); + } + + return 0; +} + +PSMI_ALWAYS_INLINE( +uint32_t +ips_proto_dest_context_from_header(struct ips_proto *proto, + struct ips_message_header *p_hdr)) +{ + return (__be32_to_cpu(p_hdr->bth[1]) & 0xFF); +} + +PSMI_ALWAYS_INLINE( +void +ips_proto_hdr(struct ips_proto *proto, struct ips_epaddr *ipsaddr, + struct ips_flow *flow, ips_scb_t *scb, uint8_t flags)) +{ + uint16_t slid, dlid; + uint32_t paywords = (sizeof(struct ips_message_header) + + scb->payload_size + HFI_CRC_SIZE_IN_BYTES) >> + BYTE2DWORD_SHIFT; + struct ips_message_header *p_hdr = &scb->ips_lrh; +#if 0 + /* + * This scb has been used by this connection last time, + * so some of the header fields are already set. 
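+	 * (This fast-path branch is compiled out with #if 0 and retained
+	 * in the source for reference only.)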
+ */ + if (scb->flow == flow) { + p_hdr->lrh[2] = __cpu_to_be16(paywords & HFI_LRH_PKTLEN_MASK); + + p_hdr->bth[0] = __cpu_to_be32(flow->path->pr_pkey | + (scb-> + opcode << BTH_OPCODE_SHIFT) | + (extra_bytes << + BTH_EXTRA_BYTE_SHIFT)); + p_hdr->bth[2] = + __cpu_to_be32(flow->xmit_seq_num. + psn | (scb->scb_flags & IPS_SEND_FLAG_ACKREQ)); + + p_hdr->khdr.kdeth0 = __cpu_to_le32(scb->offset | + (scb-> + offset_mode << + HFI_KHDR_OM_SHIFT) | (scb-> + tid << + HFI_KHDR_TID_SHIFT) + | (scb-> + tidctrl << + HFI_KHDR_TIDCTRL_SHIFT) | + (scb-> + flags & IPS_SEND_FLAG_INTR) + | (scb-> + flags & + IPS_SEND_FLAG_HDR_SUPPRESS) + | (IPS_PROTO_VERSION << + HFI_KHDR_KVER_SHIFT)); + + /* ips_kdeth_cksum(p_hdr); // Generate KDETH checksum */ + + p_hdr->ack_seq_num = flow->recv_seq_num.psn; + p_hdr->flags = flags; + + return; + } +#endif + slid = flow->path->pr_slid; + dlid = flow->path->pr_dlid; + if (scb->scb_flags & IPS_SEND_FLAG_NO_LMC) { + slid = ipsaddr->pathgrp->pg_base_slid; + dlid = ipsaddr->pathgrp->pg_base_dlid; + } + + /* Setup LRH fields */ + p_hdr->lrh[0] = __cpu_to_be16(HFI_LRH_BTH | + ((flow->path->pr_sl & HFI_LRH_SL_MASK) << + HFI_LRH_SL_SHIFT) | + ((proto->sl2sc[flow->path->pr_sl] & + HFI_LRH_SC_MASK) << HFI_LRH_SC_SHIFT)); + p_hdr->lrh[1] = dlid; + p_hdr->lrh[2] = __cpu_to_be16(paywords & HFI_LRH_PKTLEN_MASK); + p_hdr->lrh[3] = slid; + + /* Setup BTH fields */ + p_hdr->bth[0] = __cpu_to_be32(flow->path->pr_pkey | + (scb->opcode << HFI_BTH_OPCODE_SHIFT)); + p_hdr->bth[2] = __cpu_to_be32(flow->xmit_seq_num.psn_num | + (scb->scb_flags & IPS_SEND_FLAG_ACKREQ)); + + if (scb->tidctrl) { /* expected receive packet */ + psmi_assert(scb->tidsendc != NULL); + p_hdr->bth[1] = __cpu_to_be32(ipsaddr->context | + (ipsaddr-> + subcontext << + HFI_BTH_SUBCTXT_SHIFT) | + (scb->tidsendc-> + rdescid._desc_idx + << HFI_BTH_FLOWID_SHIFT) + | (proto->epinfo. + ep_baseqp << + HFI_BTH_QP_SHIFT)); + + /* Setup KHDR fields */ + p_hdr->khdr.kdeth0 = __cpu_to_le32(p_hdr->khdr.kdeth0 | + (scb->tidctrl << + HFI_KHDR_TIDCTRL_SHIFT) | + (scb->scb_flags & + IPS_SEND_FLAG_INTR) + | (scb->scb_flags & + IPS_SEND_FLAG_HDRSUPP) | + (IPS_PROTO_VERSION << + HFI_KHDR_KVER_SHIFT)); + } else { /* eager receive packet */ + p_hdr->bth[1] = __cpu_to_be32(ipsaddr->context | + (ipsaddr-> + subcontext << + HFI_BTH_SUBCTXT_SHIFT) | + (flow->flowid + << HFI_BTH_FLOWID_SHIFT) + | (proto->epinfo. 
+ ep_baseqp << + HFI_BTH_QP_SHIFT)); + + /* Setup KHDR fields */ + p_hdr->khdr.kdeth0 = __cpu_to_le32(p_hdr->khdr.kdeth0 | + (scb->scb_flags & + IPS_SEND_FLAG_INTR) + | (IPS_PROTO_VERSION << + HFI_KHDR_KVER_SHIFT)); + + p_hdr->ack_seq_num = flow->recv_seq_num.psn_num; + } + + p_hdr->khdr.job_key = __cpu_to_le32(proto->epinfo.ep_jkey); + p_hdr->connidx = ipsaddr->connidx_outgoing; + p_hdr->flags = flags; + + scb->flow = flow; + + return; +} + +/* + * Assumes that the following fields are already set in scb: + * payload + * payload_size + * flags + */ +PSMI_INLINE( +void +ips_scb_prepare_flow_inner(struct ips_proto *proto, struct ips_epaddr *ipsaddr, + struct ips_flow *flow, ips_scb_t *scb)) +{ + psmi_assert((scb->payload_size & 3) == 0); + ips_proto_hdr(proto, ipsaddr, flow, scb, + ips_flow_gen_ackflags(scb, flow)); + + scb->ack_timeout = proto->epinfo.ep_timeout_ack; + scb->abs_timeout = TIMEOUT_INFINITE; + scb->scb_flags |= IPS_SEND_FLAG_PENDING; + + if (flow->protocol == PSM_PROTOCOL_TIDFLOW) { + flow->xmit_seq_num.psn_seq += scb->nfrag; + scb->seq_num = flow->xmit_seq_num; + scb->seq_num.psn_seq--; + } else { + flow->xmit_seq_num.psn_num = + (flow->xmit_seq_num.psn_num + scb->nfrag) & proto->psn_mask; + scb->seq_num.psn_num = + (flow->xmit_seq_num.psn_num - 1) & proto->psn_mask; + } + + return; +} + +PSMI_ALWAYS_INLINE( +void +ips_proto_epaddr_stats_set(struct ips_proto *proto, uint8_t msgtype)) +{ + switch (msgtype) { + case OPCODE_ACK: + break; + case OPCODE_ERR_CHK: + case OPCODE_ERR_CHK_GEN: + proto->epaddr_stats.err_chk_send++; + break; + case OPCODE_NAK: + proto->epaddr_stats.nak_send++; + break; + case OPCODE_CONNECT_REQUEST: + proto->epaddr_stats.connect_req++; + break; + case OPCODE_DISCONNECT_REQUEST: + proto->epaddr_stats.disconnect_req++; + break; + default: + break; + } + return; +} + +/* + * Exported there solely for inlining is_expected_or_nak and mq_tiny handling + */ +extern +psm2_error_t ips_proto_send_ctrl_message(struct ips_flow *flow, + uint8_t message_type, uint16_t *msg_queue_mask, + ips_scb_t *ctrlscb, void *payload, uint32_t paylen); + +PSMI_ALWAYS_INLINE( +void +ips_proto_send_ack(struct ips_recvhdrq *recvq, struct ips_flow *flow)) +{ + if_pt(recvq->proto->flags & IPS_PROTO_FLAG_COALESCE_ACKS) { + if (flow->flags & IPS_FLOW_FLAG_PENDING_NAK) { + flow->flags &= ~IPS_FLOW_FLAG_PENDING_NAK; /* ACK clears NAK */ + } else if (!(flow->flags & IPS_FLOW_FLAG_PENDING_ACK)) { + SLIST_INSERT_HEAD(&recvq->pending_acks, flow, next); + } + + flow->flags |= IPS_FLOW_FLAG_PENDING_ACK; + } + else { + ips_scb_t ctrlscb; + + ctrlscb.scb_flags = 0; + ctrlscb.ips_lrh.ack_seq_num = flow->recv_seq_num.psn_num; + /* Coalesced ACKs disabled. Send ACK immediately */ + ips_proto_send_ctrl_message(flow, OPCODE_ACK, + &flow->ipsaddr->ctrl_msg_queued, + &ctrlscb, ctrlscb.cksum, 0); + } +} + +PSMI_ALWAYS_INLINE( +void +ips_proto_send_nak(struct ips_recvhdrq *recvq, struct ips_flow *flow)) +{ + if_pt(recvq->proto->flags & IPS_PROTO_FLAG_COALESCE_ACKS) { + if (flow->flags & IPS_FLOW_FLAG_PENDING_ACK) { + flow->flags &= ~IPS_FLOW_FLAG_PENDING_ACK; /* NAK clears ACK */ + } else if (!(flow->flags & IPS_FLOW_FLAG_PENDING_NAK)) { + SLIST_INSERT_HEAD(&recvq->pending_acks, flow, next); + } + + flow->flags |= IPS_FLOW_FLAG_PENDING_NAK; + } + else { + ips_scb_t ctrlscb; + + ctrlscb.scb_flags = 0; + ctrlscb.ips_lrh.ack_seq_num = flow->recv_seq_num.psn_num; + /* Coalesced ACKs disabled. 
Send NAK immediately */
+		ips_proto_send_ctrl_message(flow, OPCODE_NAK,
+					    &flow->ipsaddr->ctrl_msg_queued,
+					    &ctrlscb, ctrlscb.cksum, 0);
+	}
+}
+
+/* return 1 if packet is next expected in flow
+ * return 0 if packet is not next expected in flow (and NAK the packet).
+ */
+PSMI_ALWAYS_INLINE(
+int
+ips_proto_is_expected_or_nak(struct ips_recvhdrq_event *rcv_ev))
+{
+	struct ips_proto *proto = rcv_ev->proto;
+	ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr);
+	struct ips_flow *flow;
+	psmi_seqnum_t sequence_num;
+
+	psmi_assert((flowid == EP_FLOW_GO_BACK_N_PIO) ||
+		    (flowid == EP_FLOW_GO_BACK_N_DMA)
+	    );
+	flow = &ipsaddr->flows[flowid];
+	/* If the packet faced congestion, generate a BECN in the NAK. */
+	if_pf((rcv_ev->is_congested & IPS_RECV_EVENT_FECN) &&
+	      ((flow->cca_ooo_pkts & 0xf) == 0)) {
+		/* Generate a BECN for every 16th OOO packet marked with a FECN. */
+		flow->flags |= IPS_FLOW_FLAG_GEN_BECN;
+		flow->cca_ooo_pkts++;
+		rcv_ev->proto->epaddr_stats.congestion_pkts++;
+		rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; /* Clear FECN event */
+	}
+
+	sequence_num.psn_val = __be32_to_cpu(p_hdr->bth[2]);
+	if_pf(flow->recv_seq_num.psn_num == sequence_num.psn_num) {
+		flow->flags &= ~IPS_FLOW_FLAG_NAK_SEND;
+
+		flow->recv_seq_num.psn_num =
+		    (flow->recv_seq_num.psn_num + 1) & proto->psn_mask;
+		flow->cca_ooo_pkts = 0;
+
+		/* don't process ack, caller will do it. */
+		return 1;
+
+	}
+
+	int16_t diff = (int16_t) (sequence_num.psn_num -
+				  flow->recv_seq_num.psn_num);
+	if (diff > 0) {
+		if (!(flow->flags & IPS_FLOW_FLAG_NAK_SEND)) {
+			/* Queue/Send NAK to peer */
+			ips_proto_send_nak((struct ips_recvhdrq *)
+					   rcv_ev->recvq, flow);
+			flow->flags |= IPS_FLOW_FLAG_NAK_SEND;
+			flow->cca_ooo_pkts = 0;
+		} else if (proto->flags & IPS_PROTO_FLAG_CCA) {
+			flow->cca_ooo_pkts = diff;
+			if (flow->cca_ooo_pkts > flow->ack_interval) {
+				ips_scb_t ctrlscb;
+
+				rcv_ev->proto->epaddr_stats.congestion_pkts++;
+				flow->flags |= IPS_FLOW_FLAG_GEN_BECN;
+				_HFI_CCADBG
+				    ("BECN Generation. Expected: %d, Got: %d.\n",
+				     flow->recv_seq_num.psn_num,
+				     sequence_num.psn_num);
+
+				ctrlscb.scb_flags = 0;
+				ctrlscb.ips_lrh.data[0].u32w0 =
+				    flow->cca_ooo_pkts;
+				/* Send Control message to throttle flow. Will clear flow flag and
+				 * reset cca_ooo_pkts.
+				 */
+				ips_proto_send_ctrl_message(flow,
+							    OPCODE_BECN,
+							    &flow->ipsaddr->
+							    ctrl_msg_queued,
+							    &ctrlscb, ctrlscb.cksum, 0);
+			}
+		}
+	}
+
+	/* process ack if packet is not in sequence. */
+	ips_proto_process_ack(rcv_ev);
+
+	return 0;
+}
+
+/*
+ * Note: some code depends on the literal values specified in this enum.
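+ * (For instance, IPS_MSG_ORDER_FUTURE is the only negative value and
+ * IPS_MSG_ORDER_FUTURE_RECV is zero, so sign and zero tests can pick
+ * them out cheaply; check all call sites before renumbering.)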
+ */
+enum ips_msg_order {
+	IPS_MSG_ORDER_PAST = 3,		/* Old message, recv & drop */
+	IPS_MSG_ORDER_EXPECTED_MATCH = 2,	/* Expected message, recv on match */
+	IPS_MSG_ORDER_EXPECTED = 1,	/* Expected message, always recv */
+	IPS_MSG_ORDER_FUTURE_RECV = 0,	/* Future message, buffer in OOO Q */
+	IPS_MSG_ORDER_FUTURE = -1,	/* Future message, leave on RHQ */
+};
+
+PSMI_ALWAYS_INLINE(
+enum ips_msg_order
+ips_proto_check_msg_order(ips_epaddr_t *ipsaddr,
+			  struct ips_flow *flow,
+			  uint16_t send_seqnum,
+			  uint16_t *recv_seqnum))
+
+{
+	int16_t diff = (int16_t) (*recv_seqnum - send_seqnum);
+
+	if (likely(diff == 0)) {
+		*recv_seqnum += 1;
+
+		ipsaddr->msg_toggle ^= IPS_FLOW_MSG_TOGGLE_UNEXP_MASK;
+		if (ipsaddr->msg_toggle & IPS_FLOW_MSG_TOGGLE_UNEXP_MASK)
+			return IPS_MSG_ORDER_EXPECTED_MATCH;
+
+		return IPS_MSG_ORDER_EXPECTED;
+	} else if (diff > 0) {
+		return IPS_MSG_ORDER_PAST;
+	}
+
+	ipsaddr->msg_toggle ^= IPS_FLOW_MSG_TOGGLE_OOO_MASK;
+	if (!(ipsaddr->msg_toggle & IPS_FLOW_MSG_TOGGLE_OOO_MASK)) {
+		/*
+		 * The second time we see the same OOO message, receive it
+		 * and put it into the OOO queue.
+		 */
+		return IPS_MSG_ORDER_FUTURE_RECV;
+	}
+
+	/* The first time we see an OOO message, leave it on the RHQ and
+	 * retry next time. But we need to revert the receiving flow PSN. */
+	uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask;
+	flow->recv_seq_num.psn_num =
+	    (flow->recv_seq_num.psn_num - 1) & psn_mask;
+	return IPS_MSG_ORDER_FUTURE;
+}
+
+PSMI_INLINE(
+int
+ips_proto_process_packet(const struct ips_recvhdrq_event *rcv_ev))
+{
+	uint32_t index;
+
+	/* NOTE: Fault injection will currently not work with hardware
+	 * suppression. See the note below for the reason: we currently
+	 * do not update the hardware tidflow table if FI is dropping
+	 * the packet.
+	 *
+	 * We need to look into the packet before dropping it and
+	 * if it's an expected packet AND we have hardware suppression
+	 * then we need to update the hardware tidflow table and the
+	 * associated tidrecvc state to fake having received a packet
+	 * until some point in the window defined by the loss rate.
+	 * This way the subsequent err chk will be NAKd and we can resync
+	 * the flow with the sender.
+	 *
+	 * Note: For real errors the hardware generates seq/gen errors
+	 * which are handled appropriately by the protocol.
+	 */
+
+	if_pf(PSMI_FAULTINJ_ENABLED()) {
+		PSMI_FAULTINJ_STATIC_DECL(fi_recv, "recvlost", 1,
+					  IPS_FAULTINJ_RECVLOST);
+		if (psmi_faultinj_is_fault(fi_recv))
+			return IPS_RECVHDRQ_CONTINUE;
+	}
+
+	/* see file ips_proto_header.h for details */
+	index = _get_proto_hfi_opcode(rcv_ev->p_hdr) - OPCODE_RESERVED;
+	if (index >= (OPCODE_FUTURE_FROM - OPCODE_RESERVED))
+		index = 0;
+
+	return ips_packet_service_routine[index]
+	    ((struct ips_recvhdrq_event *)rcv_ev);
+}
+
+/*
+ * Breaks header encapsulation but needed in mq sends so we can pay
+ * "near-equal" attention to putting sends on the wire and servicing the
+ * receive queue.
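+ * A send path that sees PSM2_EP_NO_RESOURCES hands the error to
+ * ips_recv_progress_if_busy() below, which polls the receive side once
+ * and reports PSM2_OK so the caller simply retries later.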
+ */
+
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+ips_recv_progress_if_busy(ptl_t *ptl_gen, psm2_error_t err))
+{
+	struct ptl_ips *ptl = (struct ptl_ips *) ptl_gen;
+
+	if (err == PSM2_EP_NO_RESOURCES) {
+		ptl->ctl->ep_poll(ptl_gen, 0);
+		return PSM2_OK;
+	} else
+		return err;
+}
+
+/* Find the next lowest power of two for a 32-bit number */
+PSMI_ALWAYS_INLINE(
+unsigned int
+ips_next_low_pow2(unsigned int v))
+{
+
+	const unsigned int b[] = { 0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000 };
+	const unsigned int S[] = { 1, 2, 4, 8, 16 };
+	register unsigned int r = 1;
+	int i;
+
+	for (i = 4; i >= 0; i--) {
+		if (v & b[i]) {
+			v >>= S[i];
+			r <<= S[i];
+		}
+	}
+
+	return r;
+}
+
+PSMI_ALWAYS_INLINE(
+ips_path_rec_t *
+ips_select_path(struct ips_proto *proto, ips_path_type_t path_type,
+		ips_epaddr_t *ipsaddr, ips_path_grp_t *pathgrp))
+{
+	uint32_t path_idx;
+
+	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) {
+		/* If dispersive routes are configured then select the routes
+		 * in round-robin order. We may want to use congestion
+		 * information to select the least loaded path.
+		 */
+		path_idx = pathgrp->pg_next_path[path_type];
+		if (++pathgrp->pg_next_path[path_type] >=
+		    pathgrp->pg_num_paths[path_type])
+			pathgrp->pg_next_path[path_type] = 0;
+	} else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST)
+		path_idx =	/* Key on destination context */
+		    ipsaddr->context % pathgrp->pg_num_paths[path_type];
+	else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC)
+		path_idx =	/* Key off src context */
+		    proto->epinfo.ep_context % pathgrp->pg_num_paths[path_type];
+	else			/* Base LID routed - Default in InfiniPath 2.5 (Oct 09). */
+		path_idx = 0;
+
+	return pathgrp->pg_path[path_idx][path_type];
+}
+
+#endif /* _IPS_PROTO_HELP_H */
diff --git a/ptl_ips/ips_proto_internal.h b/ptl_ips/ips_proto_internal.h
new file mode 100644
index 0000000..1b89b61
--- /dev/null
+++ b/ptl_ips/ips_proto_internal.h
@@ -0,0 +1,95 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _IPS_PROTO_INTERNAL_H +#define _IPS_PROTO_INTERNAL_H + +#include "ips_expected_proto.h" +#include "ips_proto_help.h" + +/* + * Connect protocol. + * + * On receive, handled by upcalling into the connect interface. + * On send, handled by ips_proto by having connect compose the message. + */ +psm2_error_t ips_proto_process_connect(struct ips_proto *proto, + uint8_t opcode, + struct ips_message_header *p_hdr, + void *payload, uint32_t paylen); +int ips_proto_build_connect_message(struct ips_proto *proto, + ips_epaddr_t *ptladdr, + uint8_t opcode, void *payload); + +psm2_error_t ips_proto_timer_ack_callback(struct psmi_timer *, uint64_t); +psm2_error_t ips_proto_timer_send_callback(struct psmi_timer *, uint64_t); +psm2_error_t ips_proto_timer_ctrlq_callback(struct psmi_timer *, uint64_t); +psm2_error_t ips_proto_timer_pendq_callback(struct psmi_timer *, uint64_t); +psm2_error_t ips_cca_timer_callback(struct psmi_timer *current_timer, + uint64_t current); + +psm2_error_t ips_cca_adjust_rate(ips_path_rec_t *path_rec, int cct_increment); +void ips_proto_rv_scbavail_callback(struct ips_scbctrl *scbc, void *context); + +psm2_error_t ips_proto_recv_init(struct ips_proto *proto); +psm2_error_t ips_proto_recv_fini(struct ips_proto *proto); + +int ips_proto_process_err_chk(struct ips_recvhdrq_event *rcv_ev); +int ips_proto_process_err_chk_gen(struct ips_recvhdrq_event *rcv_ev); +int ips_proto_process_becn(struct ips_recvhdrq_event *rcv_ev); +int ips_proto_connect_disconnect(struct ips_recvhdrq_event *rcv_ev); +int ips_proto_process_unknown_opcode(struct ips_recvhdrq_event *rcv_ev); + +#endif /* _IPS_PROTO_INTERNAL_H */ diff --git a/ptl_ips/ips_proto_mq.c b/ptl_ips/ips_proto_mq.c new file mode 100644 index 0000000..32471fd --- /dev/null +++ b/ptl_ips/ips_proto_mq.c @@ -0,0 +1,1929 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm2_hal.h" +#ifdef PSM_CUDA +#include "psm_gdrcpy.h" +#endif +#include "ips_scb.h" +#include "ips_proto.h" +#include "psm_mq_internal.h" +#include "ips_expected_proto.h" +#include "ips_proto_help.h" + +PSMI_NEVER_INLINE(ips_scb_t * + ips_poll_scb(struct ips_proto *proto, + int npkts, int len, uint32_t flags, int istiny)) +{ + ips_scb_t *scb = NULL; + psmi_assert(npkts > 0); + psm2_error_t err; + + proto->stats.scb_egr_unavail_cnt++; + + PSMI_BLOCKUNTIL(proto->ep, err, + ((scb = + (istiny ? + ips_scbctrl_alloc_tiny(&proto->scbc_egr) : + ips_scbctrl_alloc(&proto->scbc_egr, npkts, len, + flags))) != NULL)); + psmi_assert(scb != NULL); + return scb; +} + +PSMI_ALWAYS_INLINE(ips_scb_t *mq_alloc_tiny(struct ips_proto *proto)) +{ + ips_scb_t *scb = ips_scbctrl_alloc_tiny(&proto->scbc_egr); + /* common case should branch right through */ + if_pt(scb != NULL) + return scb; + else + return ips_poll_scb(proto, 1, 0, 0, 1); +} + +PSMI_ALWAYS_INLINE( +ips_scb_t * +mq_alloc_pkts(struct ips_proto *proto, int npkts, int len, uint32_t flags)) +{ + psmi_assert(npkts > 0); + ips_scb_t *scb = ips_scbctrl_alloc(&proto->scbc_egr, npkts, len, flags); + if_pt(scb != NULL) { + return scb; + } + else { + return ips_poll_scb(proto, npkts, len, flags, + 0 /* not tiny scb */); + } +} + +static +int ips_proto_mq_eager_complete(void *reqp, uint32_t nbytes) +{ + psm2_mq_req_t req = (psm2_mq_req_t) reqp; + + /* This code path is executed when the send is on a device buffer + * and the receive is completed using eager buffers. As there is no + * completion notification sent to the sender, this is the only place + * where send side chb's can be freed and put back into the mpool. + */ +#ifdef PSM_CUDA + struct ips_cuda_hostbuf *chb; + if (req->cuda_hostbuf_used) { + while (!STAILQ_EMPTY(&req->sendreq_prefetch)) { + /* If any prefetched buffers weren't used, they + must be reclaimed here. 
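			 * Draining sendreq_prefetch returns each
			 * ips_cuda_hostbuf to its mpool so later
			 * device-buffer sends can prefetch again.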
*/ + chb = STAILQ_FIRST(&req->sendreq_prefetch); + STAILQ_REMOVE_HEAD(&req->sendreq_prefetch, + req_next); + psmi_mpool_put(chb); + } + } +#endif + + req->send_msgoff += nbytes; + /* + * the reason to use >= is because + * we may have DW pad in nbytes. + */ + if (req->send_msgoff >= req->req_data.send_msglen) { + req->state = MQ_STATE_COMPLETE; + ips_barrier(); + if(!psmi_is_req_internal(req)) + mq_qq_append(&req->mq->completed_q, req); + } + return IPS_RECVHDRQ_CONTINUE; +} + +static +int ips_proto_mq_rv_complete(void *reqp) +{ + psm2_mq_req_t req = (psm2_mq_req_t) reqp; + psmi_mq_handle_rts_complete(req); + + return IPS_RECVHDRQ_CONTINUE; +} + +static +void ips_proto_mq_rv_complete_exp(void *reqp) +{ + ips_proto_mq_rv_complete(reqp); + return; +} + +PSMI_ALWAYS_INLINE( +void +ips_shortcpy(void *vdest, const void *vsrc, uint32_t nchars)) +{ + unsigned char *dest = vdest; + const unsigned char *src = vsrc; + +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED && (PSMI_IS_CUDA_MEM(vdest) || PSMI_IS_CUDA_MEM((void *) vsrc))) { + PSMI_CUDA_CALL(cuMemcpy, + (CUdeviceptr)vdest, (CUdeviceptr)vsrc, nchars); + return; + } +#endif + + if (nchars >> 2) + hfi_dwordcpy((uint32_t *) dest, (uint32_t *) src, nchars >> 2); + dest += (nchars >> 2) << 2; + src += (nchars >> 2) << 2; + switch (nchars & 0x03) { + case 3: + *dest++ = *src++; + case 2: + *dest++ = *src++; + case 1: + *dest++ = *src++; + } + return; +} + +#ifdef PSM_CUDA +PSMI_ALWAYS_INLINE( +void +ips_shortcpy_host_mem(void *vdest, const void *vsrc, uint32_t nchars)) +{ + unsigned char *dest = vdest; + const unsigned char *src = vsrc; + + if (nchars >> 2) + hfi_dwordcpy((uint32_t *) dest, (uint32_t *) src, nchars >> 2); + dest += (nchars >> 2) << 2; + src += (nchars >> 2) << 2; + switch (nchars & 0x03) { + case 3: + *dest++ = *src++; + case 2: + *dest++ = *src++; + case 1: + *dest++ = *src++; + } + return; +} +#endif + +extern psm2_error_t ips_ptl_poll(ptl_t *ptl, int _ignored); + +/* + * Mechanism to capture PIO-ing or DMA-ing the MQ message envelope + * + * Recoverable errors: + * PSM2_OK: If PIO, envelope is sent. + * If DMA, all queued up packets on flow were flushed. + * + * Recoverable errors converted to PSM2_OK just before return: + * PSM2_OK_NO_PROGRESS: DMA-only, flushed 1 but not all queued packets. + * PSM2_EP_NO_RESOURCES: + * If PIO, no pio available or cable currently pulled. + * If DMA, can be that no scb's available to handle unaligned packets + * or writev returned a recoverable error (no mem for + * descriptors, dma interrupted or no space left in dma queue). + * + * Unrecoverable errors (PIO or DMA). + * PSM2_EP_DEVICE_FAILURE: Unexpected error calling writev(), chip failure, + * rxe/txe parity error. + * PSM2_EP_NO_NETWORK: No network, no lid, ... + */ +PSMI_ALWAYS_INLINE( +psm2_error_t +ips_mq_send_envelope(struct ips_proto *proto, struct ips_flow *flow, + struct ips_scb *scb, int do_flush)) +{ + psm2_error_t err = PSM2_OK; + + ips_proto_flow_enqueue(flow, scb); + + if ((flow->transfer == PSM_TRANSFER_PIO) || do_flush) + err = flow->flush(flow, NULL); + + if (do_flush) + err = ips_recv_progress_if_busy(proto->ptl, err); + + /* As per the PSM error model (or lack thereof), PSM clients expect to see + * only PSM2_OK as a recoverable error */ + if (err == PSM2_EP_NO_RESOURCES || err == PSM2_OK_NO_PROGRESS) + err = PSM2_OK; + return err; +} + +/* + * We don't use message striping for middle message protocol, + * Tests on sandy-bridge two HFIs show lower bandwidth if + * message striping is used. 
+ */ +ustatic +psm2_error_t +ips_ptl_mq_eager(struct ips_proto *proto, psm2_mq_req_t req, + struct ips_flow *flow, psm2_mq_tag_t *tag, const void *ubuf, + uint32_t len) +{ + ips_epaddr_t *ipsaddr = flow->ipsaddr; + psm2_error_t err = PSM2_OK; + uintptr_t buf = (uintptr_t) ubuf; + uint32_t nbytes_left, pktlen, offset, chunk_size; + uint16_t msgseq, padding; + ips_scb_t *scb; + uint32_t is_non_dw_mul_allowed = 0; + + psmi_assert(len > 0); + psmi_assert(req != NULL); + + if (flow->transfer == PSM_TRANSFER_DMA) { + psmi_assert((proto->flags & IPS_PROTO_FLAG_SPIO) == 0); + /* max chunk size is the rv window size */ + chunk_size = ipsaddr->window_rv; + if (psmi_hal_has_cap(PSM_HAL_CAP_NON_DW_MULTIPLE_MSG_SIZE)) + is_non_dw_mul_allowed = 1; + } else { + psmi_assert((proto->flags & IPS_PROTO_FLAG_SDMA) == 0); + chunk_size = flow->frag_size; + } + msgseq = ipsaddr->msgctl->mq_send_seqnum++; + + nbytes_left = len; + offset = 0; + do { + if (is_non_dw_mul_allowed) { + /* No need to care about padding if non-double word + * multiple message size is allowed. + */ + padding = 0; + } else { + padding = nbytes_left & 0x3; + } + + if (padding) { + psmi_assert(nbytes_left > flow->frag_size); + /* over reading should be OK on sender because + * the padding area is within the whole buffer, + * receiver will discard the extra bytes via + * padcnt in packet header + */ + padding = 4 - padding; + pktlen = flow->frag_size - padding; + } else { + pktlen = min(chunk_size, nbytes_left); + psmi_assert(((pktlen & 0x3) == 0) || (is_non_dw_mul_allowed)); + } + + scb = mq_alloc_pkts(proto, 1, 0, 0); + psmi_assert(scb != NULL); + ips_scb_opcode(scb) = OPCODE_EAGER; + ips_set_LMC_LID_choice(proto, scb, len); + scb->ips_lrh.khdr.kdeth0 = msgseq; + ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag); + scb->ips_lrh.hdr_data.u32w1 = len; + scb->ips_lrh.hdr_data.u32w0 = offset; /* initial offset */ + + _HFI_VDBG + ("payload=%p, thislen=%d, frag_size=%d, nbytes_left=%d\n", + (void *)buf, pktlen, flow->frag_size, nbytes_left); + ips_scb_buffer(scb) = (void *)buf; + +#ifdef PSM_CUDA + /* PSM would never send packets using eager protocol + * if GPU Direct RDMA is turned off, which makes setting + * these flags safe. + */ + if (req->is_buf_gpu_mem) { + ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU; + ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; + } +#endif + + buf += pktlen; + offset += pktlen; + nbytes_left -= pktlen; + + pktlen += padding; + psmi_assert(((pktlen & 0x3) == 0) || (is_non_dw_mul_allowed)); + + scb->frag_size = flow->frag_size; + scb->nfrag = (pktlen + flow->frag_size - 1) / flow->frag_size; + if (scb->nfrag > 1) { + ips_scb_length(scb) = flow->frag_size; + scb->nfrag_remaining = scb->nfrag; + scb->chunk_size = + scb->chunk_size_remaining = pktlen; + } else + ips_scb_length(scb) = pktlen; + + if (nbytes_left == 0) { /* last segment/packet */ + ips_scb_cb(scb) = ips_proto_mq_eager_complete; + ips_scb_cb_param(scb) = req; + + /* Set ACKREQ if single packet per scb. For multi + * packets per scb, it is SDMA, driver will set + * ACKREQ in last packet, we only need ACK for + * last packet. 
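			 * That last-packet ACK also fires
			 * ips_proto_mq_eager_complete() installed above,
			 * which completes the request once send_msgoff
			 * covers send_msglen.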
+ */ + if (scb->nfrag == 1) + ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ; + } else { + req->send_msgoff += pktlen; + } + + ips_proto_flow_enqueue(flow, scb); + if (flow->transfer == PSM_TRANSFER_PIO) { + /* we need to flush the pio pending queue as quick as possible */ + err = flow->flush(flow, NULL); + } + + } while (nbytes_left); + + /* after all sdma setup, flush sdma queue, + * we want one system call to handle as many scbs as possible. + */ + if (flow->transfer == PSM_TRANSFER_DMA) { + err = flow->flush(flow, NULL); + } + + /* Before return, try to make some progress as long as the operation is + * not a fast path isend. If this is a fast path isend we cannot call + * progress functions since that will cause recursion into recvhdrq_progress + * and cause messages to be lost. Instead, for fast path if the operation + * was successfully enqueued, but flush returned PSM2_OK_NO_PROGRESS we return + * PSM2_OK since the user will progress the queue once the fast path call is + * complete. + */ + if (err == PSM2_EP_NO_RESOURCES || err == PSM2_OK_NO_PROGRESS) { + if (likely(!(req->flags_internal & PSMI_REQ_FLAG_FASTPATH))) { + err = ips_recv_progress_if_busy(proto->ptl, PSM2_EP_NO_RESOURCES); + } else if (err == PSM2_EP_NO_RESOURCES) { + err = PSM2_OK; + } + } + + return err; +} + +static +psm2_error_t +ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, + ips_epaddr_t *ipsaddr, const void *buf, uint32_t len) +{ + psmi_assert(proto->msgflowid < EP_FLOW_LAST); + struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid]; + psm2_error_t err = PSM2_OK; + ips_scb_t *scb; + + PSM2_LOG_MSG("entering"); + req->req_data.buf = (void *)buf; + req->req_data.buf_len = len; + req->req_data.send_msglen = len; + req->recv_msgoff = 0; + req->rts_peer = (psm2_epaddr_t) ipsaddr; + + scb = mq_alloc_pkts(proto, 1, 0, 0); + psmi_assert(scb); + ips_scb_opcode(scb) = OPCODE_LONG_RTS; + ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ; + if (req->type & MQE_TYPE_WAITING) + ips_scb_flags(scb) |= IPS_SEND_FLAG_BLOCKING; + scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->mq_send_seqnum++; + ips_scb_copy_tag(scb->ips_lrh.tag, req->req_data.tag.tag); + scb->ips_lrh.hdr_data.u32w1 = len; + scb->ips_lrh.hdr_data.u32w0 = psmi_mpool_get_obj_index(req); + + if (len <= flow->frag_size && +#ifdef PSM_CUDA + !req->is_buf_gpu_mem && +#endif + !(len & 0x3)) { + ips_scb_buffer(scb) = (void *)buf; + ips_scb_length(scb) = len; + req->send_msgoff = len; + } else { + ips_scb_length(scb) = 0; + req->send_msgoff = 0; + } + +#ifdef PSM_CUDA + /* Used to indicate to the receiver that the send + * is issued on a device buffer. This helps the + * receiver select TID instead of using eager buffers. + */ + if (req->is_buf_gpu_mem) { + ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; + scb->mq_req = req; /* request comes from GPU domain (device) ... 
*/ + } + req->cuda_hostbuf_used = 0; + if ((!(proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND) && + req->is_buf_gpu_mem && + (len > GPUDIRECT_THRESH_RV)) || + ((proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND) && + req->is_buf_gpu_mem && + (len > gpudirect_send_threshold))) { + /* send from intermediate host buffer */ + struct ips_cuda_hostbuf *chb; + uint32_t offset, window_len; + int prefetch_lookahead = 0; + + STAILQ_INIT(&req->sendreq_prefetch); + offset = 0; + req->cuda_hostbuf_used = 1; + /* start prefetching */ + req->prefetch_send_msgoff = 0; + while ((offset < len) && + (prefetch_lookahead < proto->cuda_prefetch_limit)) { + chb = NULL; + window_len = + ips_cuda_next_window(ipsaddr->window_rv, + offset, len); + + if (window_len <= CUDA_SMALLHOSTBUF_SZ) + chb = (struct ips_cuda_hostbuf *) + psmi_mpool_get( + proto->cuda_hostbuf_pool_small_send); + if (chb == NULL) + chb = (struct ips_cuda_hostbuf *) + psmi_mpool_get( + proto->cuda_hostbuf_pool_send); + + /* any buffers available? */ + if (chb == NULL) + break; + + req->prefetch_send_msgoff += window_len; + + chb->offset = offset; + chb->size = window_len; + chb->req = req; + chb->gpu_buf = (CUdeviceptr) buf + offset; + chb->bytes_read = 0; + + PSMI_CUDA_CALL(cuMemcpyDtoHAsync, + chb->host_buf, chb->gpu_buf, + window_len, + proto->cudastream_send); + PSMI_CUDA_CALL(cuEventRecord, + chb->copy_status, + proto->cudastream_send); + + STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb, + req_next); + offset += window_len; + prefetch_lookahead++; + } + } +#endif + + PSM2_LOG_EPM_COND((len > proto->mq->hfi_thresh_rv) && + proto->protoexp, + OPCODE_LONG_RTS,PSM2_LOG_TX,proto->ep->epid, req->rts_peer->epid, + "scb->ips_lrh.hdr_data.u32w0: %d",scb->ips_lrh.hdr_data.u32w0); + + /* If this is a fast path isend, then we cannot poll or + * allow progressing of the mq from within the fast path + * call otherwise messages will be lost. Therefore given fast path + * we will avoid calling poll_internal and not set PSMI_TRUE which would + * call ips_recv_progress_if_busy. + */ + if (unlikely(req->flags_internal & PSMI_REQ_FLAG_FASTPATH)) { + if ((err = ips_mq_send_envelope(proto, flow, scb, PSMI_FALSE))) + goto fail; + } else { + if ((err = ips_mq_send_envelope(proto, flow, scb, PSMI_TRUE))) + goto fail; + + /* Assume that we already put a few rndv requests in flight. This helps + * for bibw microbenchmarks and doesn't hurt the 'blocking' case since + * we're going to poll anyway */ + psmi_poll_internal(proto->ep, 1); + } + +fail: + _HFI_VDBG + ("[rndv][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x][req=%p/%d]: %s\n", + psmi_epaddr_get_name(proto->ep->epid), + psmi_epaddr_get_name(req->rts_peer->epid), buf, len, + req->req_data.tag.tag[0], req->req_data.tag.tag[1], req->req_data.tag.tag[2], req, + psmi_mpool_get_obj_index(req), psm2_error_get_string(err)); + PSM2_LOG_MSG("leaving"); + return err; +} + +#ifdef PSM_CUDA +static inline +int psmi_cuda_is_buffer_gpu_mem(void *ubuf) +{ + return (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(ubuf)); +} + +/* + * CUDA documentation dictates the use of SYNC_MEMOPS attribute + * when the buffer pointer received into PSM has been allocated + * by the application. This guarantees that all memory operations + * to this region of memory (used by multiple layers of the stack) + * always synchronize. 
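+ * (Per the CUDA driver API documentation for
+ * CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, this stops, for example, an
+ * asynchronous copy touching the buffer from being reordered against a
+ * later DMA read of the same allocation by the HFI.)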
+ */ +static inline +void psmi_cuda_set_attr_sync_memops(void *ubuf) +{ + int trueflag = 1; + + PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag, + CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr) ubuf); +} + +static inline +int psmi_cuda_is_needed_rendezvous(struct ips_proto *proto, uint32_t len) +{ + if (!(proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND) || + !PSMI_IS_GDR_COPY_ENABLED || + len < 1 || len > cuda_thresh_rndv){ + return 1; + } + + return 0; +} +#endif + +/* Find the correct flow (PIO/DMA) */ +static inline +ips_epaddr_flow_t +flow_select_type(struct ips_proto *proto, uint32_t len, int gpu_mem, + uint32_t eager_thresh) +{ + ips_epaddr_flow_t flow_type; + uint32_t pio_gdr_threshold; + +#ifdef PSM_CUDA + if (gpu_mem) { + pio_gdr_threshold = gdr_copy_threshold_send; + } else +#endif + { + pio_gdr_threshold = eager_thresh; + } + + if (len <= pio_gdr_threshold) { /* PIO or GDRcopy */ + flow_type = EP_FLOW_GO_BACK_N_PIO; + /* + * If PIO was disabled through the environment variable, + * override the flow value. + */ + if (unlikely(ips_proto_is_disabled_pio(proto))) + flow_type = EP_FLOW_GO_BACK_N_DMA; + } else { /* Send DMA */ + flow_type = EP_FLOW_GO_BACK_N_DMA; + /* + * If Send DMA was disabled through the environment variable, + * override the flow value. + */ + if (unlikely(ips_proto_is_disabled_sdma(proto))) + flow_type = EP_FLOW_GO_BACK_N_PIO; + } + + return flow_type; +} + +psm2_error_t ips_proto_msg_size_thresh_query (enum psm2_info_query_thresh_et qt, + uint32_t *out, psm2_mq_t mq, psm2_epaddr_t epaddr) +{ + struct ptl_ips *ptl = (struct ptl_ips *) epaddr->ptlctl->ptl; + psm2_error_t rv = PSM2_INTERNAL_ERR; + + switch (qt) + { + case PSM2_INFO_QUERY_THRESH_IPS_PIO_DMA: + *out = ptl->proto.iovec_thresh_eager; + rv = PSM2_OK; + break; + case PSM2_INFO_QUERY_THRESH_IPS_TINY: + *out = mq->hfi_thresh_tiny; + rv = PSM2_OK; + break; + case PSM2_INFO_QUERY_THRESH_IPS_PIO_FRAG_SIZE: + { + ips_epaddr_t *ipsaddr = ((ips_epaddr_t *) epaddr)->msgctl->ipsaddr_next; + *out = ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO].frag_size; + } + rv = PSM2_OK; + break; + case PSM2_INFO_QUERY_THRESH_IPS_DMA_FRAG_SIZE: + { + ips_epaddr_t *ipsaddr = ((ips_epaddr_t *) epaddr)->msgctl->ipsaddr_next; + *out = ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA].frag_size; + } + rv = PSM2_OK; + break; + case PSM2_INFO_QUERY_THRESH_IPS_RNDV: + *out = mq->hfi_thresh_rv; + rv = PSM2_OK; + break; + default: + break; + } + + return rv; +} + +psm2_error_t +ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user, + uint32_t flags_internal, psm2_mq_tag_t *tag, const void *ubuf, + uint32_t len, void *context, psm2_mq_req_t *req_o) +{ + psm2_error_t err = PSM2_OK; + ips_epaddr_flow_t flow_type; + struct ips_proto *proto; + struct ips_flow *flow; + ips_epaddr_t *ipsaddr; + ips_scb_t *scb; + psm2_mq_req_t req; + + req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND); + if_pf(req == NULL) + return PSM2_NO_MEMORY; + + req->flags_user = flags_user; + req->flags_internal = flags_internal; + ipsaddr = ((ips_epaddr_t *) mepaddr)->msgctl->ipsaddr_next; + ipsaddr->msgctl->ipsaddr_next = ipsaddr->next; + proto = ((psm2_epaddr_t) ipsaddr)->proto; + + psmi_assert(proto->msgflowid < EP_FLOW_LAST); + req->req_data.send_msglen = len; + req->req_data.tag = *tag; + req->req_data.context = context; + +#ifdef PSM_CUDA + req->is_buf_gpu_mem = psmi_cuda_is_buffer_gpu_mem((void*)ubuf); + if (req->is_buf_gpu_mem) { + psmi_cuda_set_attr_sync_memops((void*)ubuf); + if (psmi_cuda_is_needed_rendezvous(proto, len)) + goto do_rendezvous; + } +#else + 
req->is_buf_gpu_mem = 0;
+#endif
+	flow_type = flow_select_type(proto, len, req->is_buf_gpu_mem,
+				     proto->iovec_thresh_eager);
+	flow = &ipsaddr->flows[flow_type];
+
+	if (flags_user & PSM2_MQ_FLAG_SENDSYNC) {
+		goto do_rendezvous;
+	} else if (len <= mq->hfi_thresh_tiny) {
+		scb = mq_alloc_tiny(proto);
+		psmi_assert(scb);
+		ips_scb_opcode(scb) = OPCODE_TINY;
+		ips_set_LMC_LID_choice(proto, scb, len);
+		scb->ips_lrh.khdr.kdeth0 =
+		    ((len & HFI_KHDR_TINYLEN_MASK) << HFI_KHDR_TINYLEN_SHIFT) |
+		    ipsaddr->msgctl->mq_send_seqnum++;
+		ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag);
+
+		const void *user_buffer = ubuf;
+#ifdef PSM_CUDA
+		if (req->is_buf_gpu_mem) {
+			/* The following function pins the GPU pages
+			 * and mmaps the pages into the process virtual
+			 * space. This allows PSM to issue a standard
+			 * memcpy to move data between HFI resources
+			 * and the GPU.
+			 */
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU;
+			user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD,
+					(unsigned long)ubuf, len, 0, proto);
+		}
+		mq_copy_tiny_host_mem((uint32_t *) &scb->ips_lrh.hdr_data,
+				      (uint32_t *) user_buffer, len);
+#else
+		mq_copy_tiny((uint32_t *) &scb->ips_lrh.hdr_data,
+			     (uint32_t *) user_buffer, len);
+#endif
+
+		/* If this is a fast path isend, then we cannot allow
+		 * progressing of the mq from within the fast path
+		 * call otherwise messages will be lost. Therefore given fast path
+		 * we will set PSMI_FALSE which will prevent the call to
+		 * ips_recv_progress_if_busy.
+		 */
+		err = ips_mq_send_envelope(proto, flow, scb, !(flags_internal & PSMI_REQ_FLAG_FASTPATH));
+		if (err != PSM2_OK)
+			return err;
+
+		/* We can mark this op complete since all the data is now copied
+		 * into an SCB that remains live until it is remotely acked */
+		req->state = MQ_STATE_COMPLETE;
+		mq_qq_append(&mq->completed_q, req);
+		_HFI_VDBG
+		    ("[itiny][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x][req=%p]\n",
+		     psmi_epaddr_get_name(mq->ep->epid),
+		     psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), ubuf,
+		     len, tag->tag[0], tag->tag[1], tag->tag[2], req);
+	} else if (len <= flow->frag_size) {
+		uint32_t paylen = len & ~0x3;
+
+		scb = mq_alloc_pkts(proto, 1, 0, 0);
+		psmi_assert(scb);
+		ips_scb_opcode(scb) = OPCODE_SHORT;
+		ips_set_LMC_LID_choice(proto, scb, len);
+		scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->mq_send_seqnum++;
+		scb->ips_lrh.hdr_data.u32w1 = len;
+		ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag);
+		const void * user_buffer = ubuf;
+#ifdef PSM_CUDA
+		if (req->is_buf_gpu_mem && len <= gdr_copy_threshold_send) {
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU;
+			user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD,
+					(unsigned long)ubuf, len, 0, proto);
+		}
+#endif
+
+		ips_scb_buffer(scb) = (void *)user_buffer;
+
+		ips_scb_length(scb) = paylen;
+		if (len > paylen) {
+			/* there are nonDW bytes, copy to header */
+			mq_copy_tiny
+			    ((uint32_t *)&scb->ips_lrh.hdr_data.u32w0,
+			     (uint32_t *)((uintptr_t)ubuf + paylen),
+			     len - paylen);
+
+			/* for complete callback */
+			req->send_msgoff = len - paylen;
+		} else {
+			req->send_msgoff = 0;
+		}
+
+		/*
+		 * Need ack for send side completion because we
+		 * send from user buffer.
+		 */
+		ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ;
+#ifdef PSM_CUDA
+		if (req->is_buf_gpu_mem && len > gdr_copy_threshold_send) {
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU;
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU;
+		}
+#endif
+		/* If this is a fast path isend, then we cannot allow
+		 * progressing of the mq from within the fast path
+		 * call otherwise messages will be lost.
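		 * (Progressing here can recurse into recvhdrq_progress
		 * while we are still inside the application's fast-path
		 * call.)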
+		 * Therefore given fast path
+		 * we will set PSMI_FALSE which will prevent the call to
+		 * ips_recv_progress_if_busy.
+		 */
+		err = ips_mq_send_envelope(proto, flow, scb, !(flags_internal & PSMI_REQ_FLAG_FASTPATH));
+		if (err != PSM2_OK)
+			return err;
+
+		/*
+		 * It is safe to check whether the buffer address in 'scb'
+		 * has changed: when this scb completes, the address is set
+		 * to NULL as the scb is returned to the scb pool, and even
+		 * if the same scb is re-used it can never be set to this
+		 * same 'buf' address.
+		 */
+		if (ips_scb_buffer(scb) == (void *)user_buffer) {
+			/* continue to send from user buffer */
+			ips_scb_cb(scb) = ips_proto_mq_eager_complete;
+			ips_scb_cb_param(scb) = req;
+		} else {
+			/* mark the message done */
+			req->state = MQ_STATE_COMPLETE;
+			mq_qq_append(&mq->completed_q, req);
+		}
+		_HFI_VDBG
+		    ("[ishrt][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x][req=%p]\n",
+		     psmi_epaddr_get_name(mq->ep->epid),
+		     psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), ubuf,
+		     len, tag->tag[0], tag->tag[1], tag->tag[2], req);
+	} else if (len <= mq->hfi_thresh_rv) {
+		req->send_msgoff = 0;
+		err = ips_ptl_mq_eager(proto, req, flow, tag, ubuf, len);
+		if (err != PSM2_OK)
+			return err;
+
+		_HFI_VDBG
+		    ("[ilong][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x][req=%p]\n",
+		     psmi_epaddr_get_name(mq->ep->epid),
+		     psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), ubuf,
+		     len, tag->tag[0], tag->tag[1], tag->tag[2], req);
+	} else {		/* skip eager accounting below */
+do_rendezvous:
+		err = ips_ptl_mq_rndv(proto, req, ipsaddr, ubuf, len);
+		*req_o = req;
+		return err;
+	}
+
+	*req_o = req;
+	mq->stats.tx_num++;
+	mq->stats.tx_eager_num++;
+	mq->stats.tx_eager_bytes += len;
+
+	return err;
+}
+
+psm2_error_t
+ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
+		  psm2_mq_tag_t *tag, const void *ubuf, uint32_t len)
+{
+	psm2_error_t err = PSM2_OK;
+	ips_epaddr_flow_t flow_type;
+	struct ips_proto *proto;
+	struct ips_flow *flow;
+	ips_epaddr_t *ipsaddr;
+	ips_scb_t *scb;
+	int gpu_mem = 0;
+
+	ipsaddr = ((ips_epaddr_t *) mepaddr)->msgctl->ipsaddr_next;
+	ipsaddr->msgctl->ipsaddr_next = ipsaddr->next;
+	proto = ((psm2_epaddr_t) ipsaddr)->proto;
+
+	psmi_assert(proto->msgflowid < EP_FLOW_LAST);
+
+#ifdef PSM_CUDA
+	gpu_mem = psmi_cuda_is_buffer_gpu_mem((void*)ubuf);
+	if (gpu_mem) {
+		psmi_cuda_set_attr_sync_memops((void*)ubuf);
+		if (psmi_cuda_is_needed_rendezvous(proto, len))
+			goto do_rendezvous;
+	}
+#endif
+	flow_type = flow_select_type(proto, len, gpu_mem,
+				     proto->iovec_thresh_eager_blocking);
+	flow = &ipsaddr->flows[flow_type];
+
+	if (flags & PSM2_MQ_FLAG_SENDSYNC) {
+		goto do_rendezvous;
+	} else if (len <= mq->hfi_thresh_tiny) {
+		scb = mq_alloc_tiny(proto);
+		psmi_assert(scb);
+		ips_scb_opcode(scb) = OPCODE_TINY;
+		ips_set_LMC_LID_choice(proto, scb, len);
+		scb->ips_lrh.khdr.kdeth0 =
+		    ((len & HFI_KHDR_TINYLEN_MASK) << HFI_KHDR_TINYLEN_SHIFT) |
+		    ipsaddr->msgctl->mq_send_seqnum++;
+		ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag);
+#ifdef PSM_CUDA
+		const void *user_buffer = ubuf;
+		if (gpu_mem){
+			/* The following function pins the GPU pages
+			 * and mmaps the pages into the process virtual
+			 * space.
+			 * This allows PSM to issue a standard
+			 * memcpy to move data between HFI resources
+			 * and the GPU.
+			 */
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU;
+			user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD,
+					(unsigned long)ubuf, len, 0, proto);
+		}
+		mq_copy_tiny_host_mem((uint32_t *) &scb->ips_lrh.hdr_data,
+				      (uint32_t *) user_buffer, len);
+#else
+		mq_copy_tiny
+		    ((uint32_t *) &scb->ips_lrh.hdr_data,
+		     (uint32_t *) ubuf, len);
+#endif
+		err = ips_mq_send_envelope(proto, flow, scb, PSMI_TRUE);
+		if (err != PSM2_OK)
+			return err;
+
+		_HFI_VDBG("[tiny][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x]\n",
+			  psmi_epaddr_get_name(mq->ep->epid),
+			  psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid),
+			  ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]);
+	} else if (len <= flow->frag_size) {
+		uint32_t paylen = len & ~0x3;
+
+		scb = mq_alloc_pkts(proto, 1, 0, 0);
+		psmi_assert(scb);
+		ips_scb_opcode(scb) = OPCODE_SHORT;
+		ips_set_LMC_LID_choice(proto, scb, len);
+		scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->mq_send_seqnum++;
+		scb->ips_lrh.hdr_data.u32w1 = len;
+		ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag);
+
+		const void * user_buffer = ubuf;
+#ifdef PSM_CUDA
+		if (gpu_mem && len <= gdr_copy_threshold_send) {
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU;
+			user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD,
+					(unsigned long)ubuf, len, 0, proto);
+		}
+#endif
+
+		ips_scb_buffer(scb) = (void *)user_buffer;
+		ips_scb_length(scb) = paylen;
+		if (len > paylen) {
+			/* there are nonDW bytes, copy to header */
+			mq_copy_tiny
+			    ((uint32_t *)&scb->ips_lrh.hdr_data.u32w0,
+			     (uint32_t *)((uintptr_t)ubuf + paylen),
+			     len - paylen);
+		}
+
+		/*
+		 * Need ack for send side completion because we
+		 * send from user buffer.
+		 */
+		ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ;
+#ifdef PSM_CUDA
+		if (gpu_mem && len > gdr_copy_threshold_send) {
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU;
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU;
+		}
+#endif
+		err = ips_mq_send_envelope(proto, flow, scb, PSMI_TRUE);
+		if (err != PSM2_OK)
+			return err;
+
+		/*
+		 * It is safe to check whether the buffer address in 'scb'
+		 * has changed: when this scb completes, the address is set
+		 * to NULL as the scb is returned to the scb pool, and even
+		 * if the same scb is re-used it can never be set to this
+		 * same 'ubuf' address.
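		 * So if ips_scb_buffer(scb) still equals user_buffer below,
		 * the send has not completed yet: either block until the
		 * scb drains, or copy the payload into a bounce buffer and
		 * return immediately.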
+		 */
+		if (ips_scb_buffer(scb) == (void *)user_buffer) {
+			if (flow->transfer != PSM_TRANSFER_PIO ||
+			    paylen > proto->scb_bufsize ||
+			    !ips_scbctrl_bufalloc(scb)) {
+				/* sdma transfer (can't change user buffer),
+				 * or, payload is larger than bounce buffer,
+				 * or, can't allocate bounce buffer,
+				 * send from user buffer till complete */
+				PSMI_BLOCKUNTIL(mq->ep, err,
+					ips_scb_buffer(scb) != (void*)user_buffer);
+				if (err > PSM2_OK_NO_PROGRESS)
+					return err;
+				err = PSM2_OK;
+			} else {
+				/* copy to bounce buffer */
+#ifdef PSM_CUDA
+				ips_shortcpy_host_mem
+#else
+				ips_shortcpy
+#endif
+					(ips_scb_buffer(scb),
+					 (void*)user_buffer, paylen);
+			}
+		}
+		_HFI_VDBG("[shrt][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x]\n",
+			  psmi_epaddr_get_name(mq->ep->epid),
+			  psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid),
+			  ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]);
+	} else if (len <= mq->hfi_thresh_rv) {
+		psm2_mq_req_t req;
+
+		/* Block until we can get a req */
+		PSMI_BLOCKUNTIL(mq->ep, err,
+				(req =
+				 psmi_mq_req_alloc(mq, MQE_TYPE_SEND)));
+		if (err > PSM2_OK_NO_PROGRESS)
+			return err;
+
+#ifdef PSM_CUDA
+		if (gpu_mem) {
+			req->is_buf_gpu_mem = 1;
+		} else
+			req->is_buf_gpu_mem = 0;
+#endif
+
+		req->type |= MQE_TYPE_WAITING;
+		req->req_data.send_msglen = len;
+		req->req_data.tag = *tag;
+		req->send_msgoff = 0;
+		req->flags_user = flags;
+		req->flags_internal |= PSMI_REQ_FLAG_IS_INTERNAL;
+
+		err = ips_ptl_mq_eager(proto, req, flow, tag, ubuf, len);
+		if (err != PSM2_OK)
+			return err;
+
+		psmi_mq_wait_internal(&req);
+
+		_HFI_VDBG("[long][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x]\n",
+			  psmi_epaddr_get_name(mq->ep->epid),
+			  psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid),
+			  ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]);
+	} else {
+		psm2_mq_req_t req;
+do_rendezvous:
+		/* Block until we can get a req */
+		PSMI_BLOCKUNTIL(mq->ep, err,
+				(req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND)));
+		if (err > PSM2_OK_NO_PROGRESS)
+			return err;
+
+		req->type |= MQE_TYPE_WAITING;
+		req->req_data.tag = *tag;
+		req->flags_user = flags;
+		req->flags_internal |= PSMI_REQ_FLAG_IS_INTERNAL;
+
+#ifdef PSM_CUDA
+		/* CUDA documentation dictates the use of SYNC_MEMOPS attribute
+		 * when the buffer pointer received into PSM has been allocated
+		 * by the application. This guarantees that all memory operations
+		 * to this region of memory (used by multiple layers of the stack)
+		 * always synchronize.
+		 */
+		if (gpu_mem) {
+			req->is_buf_gpu_mem = 1;
+		} else
+			req->is_buf_gpu_mem = 0;
+#endif
+
+		err = ips_ptl_mq_rndv(proto, req, ipsaddr, ubuf, len);
+		if (err != PSM2_OK)
+			return err;
+		psmi_mq_wait_internal(&req);
+		return err;	/* skip accounting, done separately at completion time */
+	}
+
+	mq->stats.tx_num++;
+	mq->stats.tx_eager_num++;
+	mq->stats.tx_eager_bytes += len;
+
+	return err;
+}
+
+static
+psm2_error_t
+ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted)
+{
+	psm2_epaddr_t epaddr = req->rts_peer;
+	struct ips_proto *proto = epaddr->proto;
+
+	/* We have a match.
+	 * The match may already have been set by the first packet.
+	 * If we're doing eager-based r-v, just send back the sreq and length and
+	 * have the sender complete the send.
+	 */
+	PSM2_LOG_MSG("entering");
+#ifdef PSM_CUDA
+	/* Cases where we do not use TIDs:
+	 * 1) Recv on a host buffer, Send on a gpu buffer and len is less than 3 bytes
+	 * 2) Recv on a host buffer, Send on a host buffer and len is less than hfi_thresh_rv
+	 * 3) Recv on gpu buf and len is less than 3 bytes
+	 * 4) Expected protocol not initialized.
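	 * In all of these cases the CTS goes back without a TID session
	 * payload and the data itself moves over the eager/LONG_DATA path
	 * (see ips_proto_mq_push_cts_req() and ips_proto_mq_push_rts_data()).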
+	 */
+	if ((!req->is_buf_gpu_mem && ((req->is_sendbuf_gpu_mem &&
+	     req->req_data.recv_msglen <= GPUDIRECT_THRESH_RV)||
+	    (!req->is_sendbuf_gpu_mem &&
+	     req->req_data.recv_msglen <= proto->mq->hfi_thresh_rv))) ||
+	    (req->is_buf_gpu_mem && req->req_data.recv_msglen <= GPUDIRECT_THRESH_RV) ||
+	    proto->protoexp == NULL) {	/* no expected tid receive */
+#else
+	if (req->req_data.recv_msglen <= proto->mq->hfi_thresh_rv ||/* below rv threshold */
+	    proto->protoexp == NULL) {	/* no expected tid receive */
+#endif
+		/* there is no order requirement, try to push the CTS request
+		 * directly; if that fails, queue it for a later try. */
+		if (ips_proto_mq_push_cts_req(proto, req) != PSM2_OK) {
+			struct ips_pend_sends *pends = &proto->pend_sends;
+			struct ips_pend_sreq *sreq =
+			    psmi_mpool_get(proto->pend_sends_pool);
+			psmi_assert(sreq != NULL);
+			if (sreq == NULL)
+			{
+				PSM2_LOG_MSG("leaving");
+				return PSM2_NO_MEMORY;
+			}
+			sreq->type = IPS_PENDSEND_EAGER_REQ;
+			sreq->req = req;
+
+			STAILQ_INSERT_TAIL(&pends->pendq, sreq, next);
+			psmi_timer_request(proto->timerq, &pends->timer,
+					   PSMI_TIMER_PRIO_1);
+		}
+	} else {
+		ips_protoexp_tid_get_from_token(proto->protoexp, req->req_data.buf,
+						req->req_data.recv_msglen, epaddr,
+						req->rts_reqidx_peer,
+						req->type & MQE_TYPE_WAITING_PEER ?
+						IPS_PROTOEXP_TIDGET_PEERWAIT :
+						0, ips_proto_mq_rv_complete_exp,
+						req);
+	}
+
+	PSM2_LOG_MSG("leaving");
+	return PSM2_OK;
+}
+
+psm2_error_t
+ips_proto_mq_push_cts_req(struct ips_proto *proto, psm2_mq_req_t req)
+{
+	ips_epaddr_t *ipsaddr = (ips_epaddr_t *) (req->rts_peer);
+	struct ips_flow *flow;
+	ips_scb_t *scb;
+	ptl_arg_t *args;
+
+	PSM2_LOG_MSG("entering");
+	psmi_assert(proto->msgflowid < EP_FLOW_LAST);
+	flow = &ipsaddr->flows[proto->msgflowid];
+	scb = ips_scbctrl_alloc(&proto->scbc_egr, 1, 0, 0);
+	if (scb == NULL)
+	{
+		PSM2_LOG_MSG("leaving");
+		return PSM2_OK_NO_PROGRESS;
+	}
+	args = (ptl_arg_t *) scb->ips_lrh.data;
+
+	ips_scb_opcode(scb) = OPCODE_LONG_CTS;
+	scb->ips_lrh.khdr.kdeth0 = 0;
+	args[0].u32w0 = psmi_mpool_get_obj_index(req);
+	args[1].u32w1 = req->req_data.recv_msglen;
+	args[1].u32w0 = req->rts_reqidx_peer;
+
+	PSM2_LOG_EPM(OPCODE_LONG_CTS,PSM2_LOG_TX, proto->ep->epid,
+		    flow->ipsaddr->epaddr.epid ,"req->rts_reqidx_peer: %d",
+		    req->rts_reqidx_peer);
+
+	ips_proto_flow_enqueue(flow, scb);
+	flow->flush(flow, NULL);
+
+	/* have already received enough bytes */
+	if (req->recv_msgoff == req->req_data.recv_msglen) {
+		ips_proto_mq_rv_complete(req);
+	}
+
+	PSM2_LOG_MSG("leaving");
+	return PSM2_OK;
+}
+
+psm2_error_t
+ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req)
+{
+	psm2_error_t err = PSM2_OK;
+	uintptr_t buf = (uintptr_t) req->req_data.buf + req->recv_msgoff;
+	ips_epaddr_t *ipsaddr = (ips_epaddr_t *) (req->rts_peer);
+	uint32_t nbytes_left = req->req_data.send_msglen - req->recv_msgoff;
+	uint32_t nbytes_sent = 0;
+	uint32_t nbytes_this, chunk_size;
+	uint16_t frag_size, unaligned_bytes;
+	struct ips_flow *flow;
+	ips_scb_t *scb;
+
+	psmi_assert(nbytes_left > 0);
+
+	PSM2_LOG_MSG("entering.");
+	if (
+#ifdef PSM_CUDA
+		req->is_buf_gpu_mem ||
+#endif
+		req->req_data.send_msglen > proto->iovec_thresh_eager) {
+		/* use SDMA transfer */
+		psmi_assert((proto->flags & IPS_PROTO_FLAG_SPIO) == 0);
+		flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA];
+		frag_size = flow->path->pr_mtu;
+		/* max chunk size is the rv window size */
+		chunk_size = ipsaddr->window_rv;
+	} else {
+		/* use PIO transfer */
+		psmi_assert((proto->flags & IPS_PROTO_FLAG_SDMA) == 0);
+		flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO];
+		chunk_size = frag_size = flow->frag_size;
+	}
+
+	do {
+		/*
+		 * don't try to call a progression routine such as
+		 * ips_recv_progress_if_busy() in this loop;
+		 * it would cause a recursive call of this function.
+		 */
+
+		/*
+		 * When the tid code path is enabled, we don't allocate scbc_rv
+		 * objects. If the message is less than the hfi_thresh_rv,
+		 * we normally use eager protocol to do the transfer.
+		 * However, if it is sync send, we use the rendezvous
+		 * rts/cts/rts-data protocol.
+		 * In this case, because scbc_rv is null,
+		 * we use scbc_egr instead.
+		 */
+
+		scb = ips_scbctrl_alloc(proto->scbc_rv ? proto->scbc_rv
+					: &proto->scbc_egr, 1, 0, 0);
+		if (scb == NULL) {
+			err = PSM2_OK_NO_PROGRESS;
+			break;
+		}
+		ips_scb_opcode(scb) = OPCODE_LONG_DATA;
+		scb->ips_lrh.khdr.kdeth0 = 0;
+		scb->ips_lrh.data[0].u32w0 = req->rts_reqidx_peer;
+		scb->ips_lrh.data[1].u32w1 = req->req_data.send_msglen;
+
+		/* attach unaligned bytes to the packet header */
+		unaligned_bytes = nbytes_left & 0x3;
+		if (unaligned_bytes) {
+			mq_copy_tiny((uint32_t *)&scb->ips_lrh.mdata,
+				     (uint32_t *)buf, unaligned_bytes);
+
+			/* position to send */
+			buf += unaligned_bytes;
+			req->recv_msgoff += unaligned_bytes;
+			psmi_assert(req->recv_msgoff < 4);
+
+			/* for complete callback */
+			req->send_msgoff += unaligned_bytes;
+
+			nbytes_left -= unaligned_bytes;
+			nbytes_sent += unaligned_bytes;
+		}
+		scb->ips_lrh.data[1].u32w0 = req->recv_msgoff;
+		ips_scb_buffer(scb) = (void *)buf;
+
+		scb->frag_size = frag_size;
+		nbytes_this = min(chunk_size, nbytes_left);
+		if (nbytes_this > 0)
+			scb->nfrag = (nbytes_this + frag_size - 1) / frag_size;
+		else
+			scb->nfrag = 1;
+
+		if (scb->nfrag > 1) {
+			ips_scb_length(scb) = frag_size;
+			scb->nfrag_remaining = scb->nfrag;
+			scb->chunk_size =
+				scb->chunk_size_remaining = nbytes_this;
+		} else
+			ips_scb_length(scb) = nbytes_this;
+
+		buf += nbytes_this;
+		req->recv_msgoff += nbytes_this;
+		nbytes_sent += nbytes_this;
+		nbytes_left -= nbytes_this;
+		if (nbytes_left == 0) {
+			/* because of scb callback, use eager complete */
+			ips_scb_cb(scb) = ips_proto_mq_eager_complete;
+			ips_scb_cb_param(scb) = req;
+
+			/* Set ACKREQ if single packet per scb. For multi
+			 * packets per scb, it is SDMA, driver will set
+			 * ACKREQ in last packet, we only need ACK for
+			 * last packet.
+			 */
+			if (scb->nfrag == 1)
+				ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ;
+		} else {
+			req->send_msgoff += nbytes_this;
+		}
+
+		ips_proto_flow_enqueue(flow, scb);
+		if (flow->transfer == PSM_TRANSFER_PIO) {
+			/* we need to flush the pio pending queue as quick as possible */
+			flow->flush(flow, NULL);
+		}
+
+	} while (nbytes_left);
+
+	/* for sdma, if some bytes are queued, flush them */
+	if (flow->transfer == PSM_TRANSFER_DMA && nbytes_sent) {
+		flow->flush(flow, NULL);
+	}
+
+	PSM2_LOG_MSG("leaving.");
+
+	return err;
+}
+
+int
+ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev)
+{
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	struct ips_proto *proto = rcv_ev->proto;
+	psm2_mq_t mq = proto->ep->mq;
+	struct ips_flow *flow;
+	psm2_mq_req_t req;
+	uint32_t paylen;
+
+	/*
+	 * if PSN does not match, drop the packet.
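	 * ips_proto_is_expected_or_nak() returns 1 and advances the flow's
	 * receive PSN when the packet is in sequence; otherwise it has
	 * already queued a NAK, so the packet is dropped here and the
	 * go-back-N sender retransmits.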
+ */ + PSM2_LOG_MSG("entering"); + if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev)) + { + PSM2_LOG_MSG("leaving"); + return IPS_RECVHDRQ_CONTINUE; + } + req = psmi_mpool_find_obj_by_index(mq->sreq_pool, p_hdr->data[1].u32w0); + psmi_assert(req != NULL); + + /* + * if there is payload, it is expected tid protocol + * with tid session info as the payload. + */ + paylen = ips_recvhdrq_event_paylen(rcv_ev); + if (paylen > 0) { + ips_tid_session_list *payload = + ips_recvhdrq_event_payload(rcv_ev); + psmi_assert(paylen == 0 || payload); + PSM2_LOG_EPM(OPCODE_LONG_CTS,PSM2_LOG_RX,rcv_ev->ipsaddr->epaddr.epid, + mq->ep->epid,"p_hdr->data[1].u32w0 %d", + p_hdr->data[1].u32w0); + proto->epaddr_stats.tids_grant_recv++; + + psmi_assert(p_hdr->data[1].u32w1 > mq->hfi_thresh_rv); + psmi_assert(proto->protoexp != NULL); + + /* ptl_req_ptr will be set to each tidsendc */ + if (req->ptl_req_ptr == NULL) { + req->req_data.send_msglen = p_hdr->data[1].u32w1; + } + psmi_assert(req->req_data.send_msglen == p_hdr->data[1].u32w1); + + if (ips_tid_send_handle_tidreq(proto->protoexp, + rcv_ev->ipsaddr, req, p_hdr->data[0], + p_hdr->mdata, payload, paylen) == 0) { + proto->psmi_logevent_tid_send_reqs.next_warning = 0; + } else { + flow = &rcv_ev->ipsaddr->flows[ips_proto_flowid(p_hdr)]; + flow->recv_seq_num.psn_num -= 1; /* Decrement seq number to NAK proper CTS */ + ips_proto_send_nak((struct ips_recvhdrq *)rcv_ev->recvq, flow); + static unsigned int msg_cnt = 0; + if (msg_cnt++ == 0) { /* Report the message only once */ + _HFI_INFO("PSM2 memory shortage detected. Please consider modifying PSM2_MEMORY setting\n"); + } + return PSM2_EP_NO_RESOURCES; + } + } else { + req->rts_reqidx_peer = p_hdr->data[0].u32w0; /* eager receive only */ + req->req_data.send_msglen = p_hdr->data[1].u32w1; + + if (req->send_msgoff >= req->req_data.send_msglen) { + /* already sent enough bytes, may truncate so using >= */ + ips_proto_mq_rv_complete(req); + } else if (ips_proto_mq_push_rts_data(proto, req) != PSM2_OK) { + /* there is no order requirement, tried to push RTS data + * directly and not done, so queue it for later try. */ + struct ips_pend_sreq *sreq = + psmi_mpool_get(proto->pend_sends_pool); + psmi_assert(sreq != NULL); + + sreq->type = IPS_PENDSEND_EAGER_DATA; + sreq->req = req; + STAILQ_INSERT_TAIL(&proto->pend_sends.pendq, sreq, next); + /* Make sure it's processed by timer */ + psmi_timer_request(proto->timerq, &proto->pend_sends.timer, + PSMI_TIMER_PRIO_1); + } + } + + flow = &rcv_ev->ipsaddr->flows[ips_proto_flowid(p_hdr)]; + if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) || + (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) + ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow); + + ips_proto_process_ack(rcv_ev); + + PSM2_LOG_MSG("leaving"); + return IPS_RECVHDRQ_CONTINUE; +} + +int +ips_proto_mq_handle_rts(struct ips_recvhdrq_event *rcv_ev) +{ + int ret = IPS_RECVHDRQ_CONTINUE; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + struct ips_flow *flow = &ipsaddr->flows[ips_proto_flowid(p_hdr)]; + psm2_mq_t mq = rcv_ev->proto->mq; + ips_msgctl_t *msgctl = ipsaddr->msgctl; + enum ips_msg_order msgorder; + char *payload; + uint32_t paylen; + psm2_mq_req_t req; + + /* + * if PSN does not match, drop the packet. 
+ */ + PSM2_LOG_MSG("entering"); + if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev)) + { + PSM2_LOG_MSG("leaving"); + return IPS_RECVHDRQ_CONTINUE; + } + + msgorder = ips_proto_check_msg_order(ipsaddr, flow, + __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK, + &ipsaddr->msgctl->mq_recv_seqnum); + if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE)) + { + PSM2_LOG_MSG("leaving"); + return IPS_RECVHDRQ_REVISIT; + } + + payload = ips_recvhdrq_event_payload(rcv_ev); + paylen = ips_recvhdrq_event_paylen(rcv_ev); + /* either no payload or whole message */ + psmi_assert(paylen == 0 || paylen >= p_hdr->data[1].u32w1); + + /* + * We can't have past message sequence here. For eager message, + * it must always have an eager queue matching because even in + * truncation case the code logic will wait till all packets + * have been received. + */ + psmi_assert(msgorder != IPS_MSG_ORDER_PAST); + + _HFI_VDBG("tag=%llx reqidx_peer=%d, msglen=%d\n", + (long long)p_hdr->data[0].u64, + p_hdr->data[1].u32w0, p_hdr->data[1].u32w1); + + int rc = psmi_mq_handle_rts(mq, + (psm2_epaddr_t) &ipsaddr->msgctl-> + master_epaddr, + (psm2_mq_tag_t *) p_hdr->tag, + p_hdr->data[1].u32w1, payload, paylen, + msgorder, ips_proto_mq_rts_match_callback, + &req); + if (unlikely(rc == MQ_RET_UNEXP_NO_RESOURCES)) { + uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask; + + flow->recv_seq_num.psn_num = + (flow->recv_seq_num.psn_num - 1) & psn_mask; + ipsaddr->msgctl->mq_recv_seqnum--; + + PSM2_LOG_MSG("leaving"); + return IPS_RECVHDRQ_REVISIT; + } + + req->rts_peer = (psm2_epaddr_t) ipsaddr; + req->rts_reqidx_peer = p_hdr->data[1].u32w0; + if (req->req_data.send_msglen > mq->hfi_thresh_rv) + { + PSM2_LOG_EPM(OPCODE_LONG_RTS,PSM2_LOG_RX,req->rts_peer->epid,mq->ep->epid, + "req->rts_reqidx_peer: %d",req->rts_reqidx_peer); + } + if (p_hdr->flags & IPS_SEND_FLAG_BLOCKING) + req->type |= MQE_TYPE_WAITING_PEER; + +#ifdef PSM_CUDA + if (p_hdr->flags & IPS_SEND_FLAG_USER_BUF_GPU) + req->is_sendbuf_gpu_mem = 1; + else + req->is_sendbuf_gpu_mem = 0; +#endif + + if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE_RECV)) { + /* for out of order matching only */ + req->msg_seqnum = + __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK; + req->ptl_req_ptr = (void *)msgctl; + + msgctl->outoforder_count++; + mq_qq_append(&mq->outoforder_q, req); + + ret = IPS_RECVHDRQ_BREAK; + } else { + ipsaddr->msg_toggle = 0; + + if (rc == MQ_RET_MATCH_OK) + ips_proto_mq_rts_match_callback(req, 1); + + /* XXX if blocking, break out of progress loop */ + + if (msgctl->outoforder_count) + ips_proto_mq_handle_outoforder_queue(mq, msgctl); + + if (rc == MQ_RET_UNEXP_OK) + ret = IPS_RECVHDRQ_BREAK; + } + + if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) || + (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) + ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow); + + ips_proto_process_ack(rcv_ev); + + PSM2_LOG_MSG("leaving"); + return ret; +} + +int +ips_proto_mq_handle_tiny(struct ips_recvhdrq_event *rcv_ev) +{ + int ret = IPS_RECVHDRQ_CONTINUE; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + struct ips_flow *flow = &ipsaddr->flows[ips_proto_flowid(p_hdr)]; + psm2_mq_t mq = rcv_ev->proto->mq; + ips_msgctl_t *msgctl = ipsaddr->msgctl; + enum ips_msg_order msgorder; + char *payload; + uint32_t paylen; + psm2_mq_req_t req; + + /* + * if PSN does not match, drop the packet. 
+ */ + if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev)) + return IPS_RECVHDRQ_CONTINUE; + + msgorder = ips_proto_check_msg_order(ipsaddr, flow, + __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK, + &ipsaddr->msgctl->mq_recv_seqnum); + if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE)) + return IPS_RECVHDRQ_REVISIT; + + payload = (void *)&p_hdr->hdr_data; + paylen = (__le32_to_cpu(p_hdr->khdr.kdeth0) >> + HFI_KHDR_TINYLEN_SHIFT) & HFI_KHDR_TINYLEN_MASK; + + /* + * We can't have past message sequence here. For eager message, + * it must always have an eager queue matching because even in + * truncation case the code logic will wait till all packets + * have been received. + */ + psmi_assert(msgorder != IPS_MSG_ORDER_PAST); + + _HFI_VDBG("tag=%08x.%08x.%08x opcode=%d, msglen=%d\n", + p_hdr->tag[0], p_hdr->tag[1], p_hdr->tag[2], + OPCODE_TINY, p_hdr->hdr_data.u32w1); + + /* store in req below too! */ + int rc = psmi_mq_handle_envelope(mq, + (psm2_epaddr_t) &ipsaddr->msgctl->master_epaddr, + (psm2_mq_tag_t *) p_hdr->tag, paylen, 0, + payload, paylen, msgorder, OPCODE_TINY, &req); + if (unlikely(rc == MQ_RET_UNEXP_NO_RESOURCES)) { + uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask; + + flow->recv_seq_num.psn_num = + (flow->recv_seq_num.psn_num - 1) & psn_mask; + ipsaddr->msgctl->mq_recv_seqnum--; + + return IPS_RECVHDRQ_REVISIT; + } + + if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE_RECV)) { + /* for out of order matching only */ + req->msg_seqnum = + __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK; + req->ptl_req_ptr = (void *)msgctl; + + msgctl->outoforder_count++; + mq_qq_append(&mq->outoforder_q, req); + + ret = IPS_RECVHDRQ_BREAK; + } else { + ipsaddr->msg_toggle = 0; + + if (msgctl->outoforder_count) + ips_proto_mq_handle_outoforder_queue(mq, msgctl); + + if (rc == MQ_RET_UNEXP_OK) + ret = IPS_RECVHDRQ_BREAK; + } + + if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) || + (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) + ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow); + + ips_proto_process_ack(rcv_ev); + + return ret; +} + +int +ips_proto_mq_handle_short(struct ips_recvhdrq_event *rcv_ev) +{ + int ret = IPS_RECVHDRQ_CONTINUE; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + struct ips_flow *flow = &ipsaddr->flows[ips_proto_flowid(p_hdr)]; + psm2_mq_t mq = rcv_ev->proto->mq; + ips_msgctl_t *msgctl = ipsaddr->msgctl; + enum ips_msg_order msgorder; + char *payload; + uint32_t paylen; + psm2_mq_req_t req; + + /* + * if PSN does not match, drop the packet. + */ + if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev)) + return IPS_RECVHDRQ_CONTINUE; + + msgorder = ips_proto_check_msg_order(ipsaddr, flow, + __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK, + &ipsaddr->msgctl->mq_recv_seqnum); + if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE)) + return IPS_RECVHDRQ_REVISIT; + + payload = ips_recvhdrq_event_payload(rcv_ev); + paylen = ips_recvhdrq_event_paylen(rcv_ev); + psmi_assert(paylen == 0 || payload); + + /* + * We can't have past message sequence here. For eager message, + * it must always have an eager queue matching because even in + * truncation case the code logic will wait till all packets + * have been received. + */ + psmi_assert(msgorder != IPS_MSG_ORDER_PAST); + + _HFI_VDBG("tag=%08x.%08x.%08x opcode=%d, msglen=%d\n", + p_hdr->tag[0], p_hdr->tag[1], p_hdr->tag[2], + OPCODE_SHORT, p_hdr->hdr_data.u32w1); + + /* store in req below too! 
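+ *
+ * psmi_mq_handle_envelope() below reports one of three outcomes, which
+ * all of these handlers treat the same way:
+ *   MQ_RET_MATCH_OK           - matched a posted receive; data delivered
+ *   MQ_RET_UNEXP_OK           - queued as unexpected; break the progress loop
+ *   MQ_RET_UNEXP_NO_RESOURCES - roll the flow's PSN back one packet and
+ *                               return IPS_RECVHDRQ_REVISIT
+ *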
*/ + int rc = psmi_mq_handle_envelope(mq, + (psm2_epaddr_t) &ipsaddr->msgctl->master_epaddr, + (psm2_mq_tag_t *) p_hdr->tag, + p_hdr->hdr_data.u32w1, p_hdr->hdr_data.u32w0, + payload, paylen, msgorder, OPCODE_SHORT, &req); + if (unlikely(rc == MQ_RET_UNEXP_NO_RESOURCES)) { + uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask; + + flow->recv_seq_num.psn_num = + (flow->recv_seq_num.psn_num - 1) & psn_mask; + ipsaddr->msgctl->mq_recv_seqnum--; + + return IPS_RECVHDRQ_REVISIT; + } + + if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE_RECV)) { + /* for out of order matching only */ + req->msg_seqnum = + __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK; + req->ptl_req_ptr = (void *)msgctl; + + msgctl->outoforder_count++; + mq_qq_append(&mq->outoforder_q, req); + + ret = IPS_RECVHDRQ_BREAK; + } else { + ipsaddr->msg_toggle = 0; + + if (msgctl->outoforder_count) + ips_proto_mq_handle_outoforder_queue(mq, msgctl); + + if (rc == MQ_RET_UNEXP_OK) + ret = IPS_RECVHDRQ_BREAK; + } + + if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) || + (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) + ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow); + + ips_proto_process_ack(rcv_ev); + + return ret; +} + +int +ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev) +{ + int ret = IPS_RECVHDRQ_CONTINUE; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + struct ips_flow *flow = &ipsaddr->flows[ips_proto_flowid(p_hdr)]; + psm2_mq_t mq = rcv_ev->proto->mq; + ips_msgctl_t *msgctl = ipsaddr->msgctl; + enum ips_msg_order msgorder; + char *payload; + uint32_t paylen; + psm2_mq_req_t req; + + /* + * if PSN does not match, drop the packet. + */ + if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev)) + return IPS_RECVHDRQ_CONTINUE; + + msgorder = ips_proto_check_msg_order(ipsaddr, flow, + __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK, + &ipsaddr->msgctl->mq_recv_seqnum); + if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE)) + return IPS_RECVHDRQ_REVISIT; + + payload = ips_recvhdrq_event_payload(rcv_ev); + paylen = ips_recvhdrq_event_paylen(rcv_ev); + psmi_assert(paylen == 0 || payload); + + if (msgorder == IPS_MSG_ORDER_PAST || + msgorder == IPS_MSG_ORDER_FUTURE_RECV) { + req = mq_eager_match(mq, msgctl, + __le32_to_cpu(p_hdr->khdr.kdeth0)&HFI_KHDR_MSGSEQ_MASK); + /* + * It is future message sequence or past message sequence, + * and there is request matching in eager queue, we handle + * the packet data and return. We can't go continue to + * match envelope. + * Past message sequence must always have a matching!!! + * error is caught below. + */ + if (req) { +#ifdef PSM_CUDA + if (PSMI_USE_GDR_COPY(req, req->req_data.send_msglen)) { + req->req_data.buf = gdr_convert_gpu_to_host_addr(GDR_FD, + (unsigned long)req->user_gpu_buffer, + req->req_data.send_msglen, 1, rcv_ev->proto); + } +#endif + psmi_mq_handle_data(mq, req, + p_hdr->data[1].u32w0, payload, paylen); + + if (msgorder == IPS_MSG_ORDER_FUTURE_RECV) + ret = IPS_RECVHDRQ_BREAK; + + if ((__be32_to_cpu(p_hdr->bth[2]) & + IPS_SEND_FLAG_ACKREQ) || + (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) + ips_proto_send_ack((struct ips_recvhdrq *) + rcv_ev->recvq, flow); + + ips_proto_process_ack(rcv_ev); + + return ret; + } + + psmi_assert(msgorder == IPS_MSG_ORDER_FUTURE_RECV); + /* + * For future message sequence, since there is no eager + * queue matching yet, this must be the first packet for + * the message sequence. 
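+ * (Continuation packets of an eager message that already matched are
+ * handled above through mq_eager_match() and psmi_mq_handle_data(), so
+ * only a message's first packet can reach the envelope matching below.)
+ *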
And of course, expected message + * sequence is always the first packet for the sequence. + */ + } + + /* + * We can't have past message sequence here. For eager message, + * it must always have an eager queue matching because even in + * truncation case the code logic will wait till all packets + * have been received. + */ + psmi_assert(msgorder != IPS_MSG_ORDER_PAST); + + _HFI_VDBG("tag=%08x.%08x.%08x opcode=%d, msglen=%d\n", + p_hdr->tag[0], p_hdr->tag[1], p_hdr->tag[2], + OPCODE_EAGER, p_hdr->hdr_data.u32w1); + + /* store in req below too! */ + int rc = psmi_mq_handle_envelope(mq, + (psm2_epaddr_t) &ipsaddr->msgctl->master_epaddr, + (psm2_mq_tag_t *) p_hdr->tag, + p_hdr->hdr_data.u32w1, p_hdr->hdr_data.u32w0, + payload, paylen, msgorder, OPCODE_EAGER, &req); + if (unlikely(rc == MQ_RET_UNEXP_NO_RESOURCES)) { + uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask; + + flow->recv_seq_num.psn_num = + (flow->recv_seq_num.psn_num - 1) & psn_mask; + ipsaddr->msgctl->mq_recv_seqnum--; + + return IPS_RECVHDRQ_REVISIT; + } + + /* for both outoforder matching and eager matching */ + req->msg_seqnum = + __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK; + req->ptl_req_ptr = (void *)msgctl; + + if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE_RECV)) { + msgctl->outoforder_count++; + mq_qq_append(&mq->outoforder_q, req); + + ret = IPS_RECVHDRQ_BREAK; + } else { + ipsaddr->msg_toggle = 0; + + if (msgctl->outoforder_count) + ips_proto_mq_handle_outoforder_queue(mq, msgctl); + + if (rc == MQ_RET_UNEXP_OK) + ret = IPS_RECVHDRQ_BREAK; + } + + if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) || + (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) + ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow); + + ips_proto_process_ack(rcv_ev); + + return ret; +} + +/* + * Progress the out of order queue to see if any message matches + * current receiving sequence number. + */ +void +ips_proto_mq_handle_outoforder_queue(psm2_mq_t mq, ips_msgctl_t *msgctl) +{ + psm2_mq_req_t req; + + do { + req = + mq_ooo_match(&mq->outoforder_q, msgctl, + msgctl->mq_recv_seqnum); + if (req == NULL) + return; + + msgctl->outoforder_count--; + msgctl->mq_recv_seqnum++; + + psmi_mq_handle_outoforder(mq, req); + + } while (msgctl->outoforder_count > 0); + + return; +} + +int +ips_proto_mq_handle_data(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + psm2_mq_t mq = rcv_ev->proto->mq; + char *payload; + uint32_t paylen; + psm2_mq_req_t req; + struct ips_flow *flow; + + /* + * if PSN does not match, drop the packet. + */ + if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev)) + return IPS_RECVHDRQ_CONTINUE; + + req = psmi_mpool_find_obj_by_index(mq->rreq_pool, p_hdr->data[0].u32w0); + psmi_assert(req != NULL); + psmi_assert(p_hdr->data[1].u32w1 == req->req_data.send_msglen); + + /* + * if a packet has very small offset, it must have unaligned data + * attached in the packet header, and this must be the first packet + * for that message. 
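+ *
+ * Example: for send_msglen = 1003, the sender puts 1003 & 0x3 = 3 bytes
+ * into p_hdr->mdata and sets the offset to 3, so mq_copy_tiny() below
+ * lands them at the start of the receive buffer and every later data
+ * packet stays dword-aligned.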
+ */ + if (p_hdr->data[1].u32w0 < 4 && p_hdr->data[1].u32w0 > 0) { + psmi_assert(p_hdr->data[1].u32w0 == (req->req_data.send_msglen&0x3)); + mq_copy_tiny((uint32_t *)req->req_data.buf, + (uint32_t *)&p_hdr->mdata, + p_hdr->data[1].u32w0); + req->send_msgoff += p_hdr->data[1].u32w0; + } + + payload = ips_recvhdrq_event_payload(rcv_ev); + paylen = ips_recvhdrq_event_paylen(rcv_ev); + psmi_assert(paylen == 0 || payload); + + psmi_mq_handle_data(mq, req, p_hdr->data[1].u32w0, payload, paylen); + + flow = &rcv_ev->ipsaddr->flows[ips_proto_flowid(p_hdr)]; + if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) || + (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) + ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow); + + ips_proto_process_ack(rcv_ev); + + return IPS_RECVHDRQ_CONTINUE; +} diff --git a/ptl_ips/ips_proto_params.h b/ptl_ips/ips_proto_params.h new file mode 100644 index 0000000..ae2b894 --- /dev/null +++ b/ptl_ips/ips_proto_params.h @@ -0,0 +1,235 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. 
*/
+
+#ifndef _IPS_PROTO_PARAMS_H
+#define _IPS_PROTO_PARAMS_H
+
+/*
+ * send method: dma, pio;
+ * recv method: tid, egr;
+ *
+ * send-recv mode combinations: 1=on, 0=off
+ * A: dma=1, pio=1, tid=1, egr=1;
+ * B: dma=0, pio=1, tid=1, egr=1;
+ * C: dma=1, pio=0, tid=1, egr=1;
+ * D: dma=1, pio=1, tid=0, egr=1;
+ * E: dma=0, pio=1, tid=0, egr=1;
+ * F: dma=1, pio=0, tid=0, egr=1;
+ *
+ * message packet type:
+ * T: tiny; S: short; E: eager;
+ * LR: long rts; LC: long cts; LD: long data;
+ * ED: expected data; EC: expected completion;
+ * C: ctrl msg;
+ *
+ * send,recv method for each packet type and each send-recv mode
+ * -------------------------------------------------------------------
+ * |    |    A    |    B    |    C    |    D    |    E    |    F    |
+ * -------------------------------------------------------------------
+ * | T  | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |
+ * -------------------------------------------------------------------
+ * | S  | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |
+ * -------------------------------------------------------------------
+ * | E  | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr | threshold
+ * -------------------------------------------------------------------
+ * | LR | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |
+ * -------------------------------------------------------------------
+ * | LC | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |
+ * -------------------------------------------------------------------
+ * | LD |    x    |    x    |    x    | pio,egr | pio,egr | dma,egr | threshold
+ * -------------------------------------------------------------------
+ * | ED | dma,tid | pio,tid | dma,tid |    x    |    x    |    x    |
+ * -------------------------------------------------------------------
+ * | EC | pio,egr | pio,egr | dma,egr |    x    |    x    |    x    |
+ * -------------------------------------------------------------------
+ * | C  | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |
+ * -------------------------------------------------------------------
+ */
+
+/* Constants */
+#define BYTE2DWORD_SHIFT 2
+#define LOWER_16_BITS 0xFFFF
+#define PSM_CACHE_LINE_BYTES 64
+#define PSM2_FLOW_CREDITS 64
+#define PSM_CRC_SIZE_IN_BYTES 8
+
+/*
+ * version of the protocol header (also known to the chip).
+ * This value for OPA is defined in the spec.
+ */
+#define IPS_PROTO_VERSION 0x1
+
+/* time conversion macros */
+#define us_2_cycles(us) nanosecs_to_cycles(1000ULL*(us))
+#define ms_2_cycles(ms) nanosecs_to_cycles(1000000ULL*(ms))
+#define sec_2_cycles(sec) nanosecs_to_cycles(1000000000ULL*(sec))
+
+/* Per-flow flags */
+#define IPS_FLOW_FLAG_NAK_SEND 0x01
+#define IPS_FLOW_FLAG_PENDING_ACK 0x02
+#define IPS_FLOW_FLAG_PENDING_NAK 0x04
+#define IPS_FLOW_FLAG_GEN_BECN 0x08
+#define IPS_FLOW_FLAG_CONGESTED 0x10
+#define IPS_FLOW_FLAG_SKIP_CTS 0x20
+
+/* tid session expected send flags */
+#define EXP_SEND_FLAG_CLEAR_ALL 0x00
+#define EXP_SEND_FLAG_FREE_TIDS 0x01
+
+#define TIMEOUT_INFINITE 0xFFFFFFFFFFFFFFFFULL /* 64-bit all-ones */
+
+/*
+ * scb flags for the wire;
+ * only the lower 6 bits are wire-protocol options
+ */
+#define IPS_SEND_FLAG_NONE 0x00
+#define IPS_SEND_FLAG_BLOCKING 0x01 /* blocking send */
+#define IPS_SEND_FLAG_PKTCKSUM 0x02 /* Has packet checksum */
+#define IPS_SEND_FLAG_AMISTINY 0x04 /* AM is tiny, exclusive */
+
+#ifdef PSM_CUDA
+/* This flag is used to indicate to the receiver that
+ * the send was issued on a device buffer. This helps in
+ * selecting the TID path on the receive side regardless of
+ * the receive buffer's locality. It is used
+ * in the special case where the send is on a device
+ * buffer and the receive is on a host buffer.
+ */
+#define IPS_SEND_FLAG_USER_BUF_GPU 0x08
+#endif
+
+#define IPS_SEND_FLAG_PROTO_OPTS 0x3f /* only 6 bits of wire flags */
+
+/* scb flags */
+#define IPS_SEND_FLAG_PENDING 0x0100
+#define IPS_SEND_FLAG_PERSISTENT 0x0200
+#define IPS_SEND_FLAG_NO_LMC 0x0400
+
+#ifdef PSM_CUDA
+/* This flag is used to indicate that the send is on
+ * a GPU buffer. This helps the PIO/SDMA paths detect
+ * that the payload is a GPU buffer without having to call
+ * cudaPointerGetAttributes.
+ */
+#define IPS_SEND_FLAG_PAYLOAD_BUF_GPU 0x0800
+#endif
+
+/* 0x10000000, interrupt when done */
+#define IPS_SEND_FLAG_INTR (1<<HFI_KHDR_INTR_SHIFT)
+
+	if (interval_secs > 0)
+		proto->stray_warn_interval = sec_2_cycles(interval_secs);
+	else
+		proto->stray_warn_interval = 0;
+
+	return PSM2_OK;
+}
+
+psm2_error_t ips_proto_recv_fini(struct ips_proto *proto)
+{
+	ips_report_strays(proto);
+	return PSM2_OK;
+}
+
+#define cycles_to_sec_f(cycles) \
+	(((double)cycles_to_nanosecs(cycles)) / 1000000000.0)
+
+struct ips_stray_epid {
+	psm2_epid_t epid;
+	uint32_t err_check_bad_sent;
+	uint32_t ipv4_addr;
+	uint32_t pid;
+	uint32_t num_messages;
+	uint64_t t_warn_next;
+	uint64_t t_first;
+	uint64_t t_last;
+};
+
+static
+void ips_report_strays(struct ips_proto *proto)
+{
+	struct ips_stray_epid *sepid;
+	struct psmi_eptab_iterator itor;
+	psmi_epid_itor_init(&itor, PSMI_EP_CROSSTALK);
+
+#if _HFI_DEBUGGING
+	double t_first = 0;
+	double t_last = 0;
+	double t_runtime = 0;
+	if (_HFI_INFO_ON) {
+		t_runtime = cycles_to_sec_f(proto->t_fini - proto->t_init);
+	}
+#endif
+
+	while ((sepid = psmi_epid_itor_next(&itor))) {
+		char ipbuf[INET_ADDRSTRLEN], *ip = NULL;
+		char bufpid[32];
+		uint32_t lid = psm2_epid_nid(sepid->epid);
+#if _HFI_DEBUGGING
+		if (_HFI_INFO_ON) {
+			t_first =
+			    cycles_to_sec_f(sepid->t_first - proto->t_init);
+			t_last =
+			    cycles_to_sec_f(sepid->t_last - proto->t_init);
+		}
+#endif
+		if (sepid->ipv4_addr)
+			ip = (char *)
+			    inet_ntop(AF_INET, &sepid->ipv4_addr, ipbuf,
+				      sizeof(ipbuf));
+		if (!ip)
+			snprintf(ipbuf, sizeof(ipbuf), "%d (%x)", lid, lid);
+
+		if (sepid->pid)
+			snprintf(bufpid, sizeof(bufpid), "PID=%d", sepid->pid);
+		else
+			snprintf(bufpid, sizeof(bufpid), "PID unknown");
+
+		if (_HFI_INFO_ON) {
+			_HFI_INFO_ALWAYS
+			    ("Process %s on host %s=%s sent %d stray message(s) and "
+			     "was told so %d time(s) (first stray message at %.1fs "
+			     "(%d%%), last at %.1fs (%d%%) into application run)\n",
+			     bufpid, ip ? "IP" : "LID", ipbuf, sepid->num_messages,
+			     sepid->err_check_bad_sent, t_first,
+			     (int)(t_first * 100.0 / t_runtime), t_last,
+			     (int)(t_last * 100.0 / t_runtime));
+		}
+
+		psmi_epid_remove(PSMI_EP_CROSSTALK, sepid->epid);
+		psmi_free(sepid);
+	}
+	psmi_epid_itor_fini(&itor);
+	return;
+}
+
+/* New scbs are now available. If we have pending sends because we were out
+ * of scbs, put the pendq on the timerq so it can be processed. 
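+ */
+
+/*
+ * A sketch (illustrative only, not part of this patch) of the producer
+ * side this callback pairs with: when a sender runs out of scbs it parks
+ * the request on proto->pend_sends.pendq exactly like this. The calls and
+ * the IPS_PENDSEND_* types are the real ones used by the MQ send path;
+ * the wrapper function itself is hypothetical.
+ */
+static inline void
+example_queue_pending_send(struct ips_proto *proto, psm2_mq_req_t req,
+			   int type)	/* IPS_PENDSEND_EAGER_REQ or _DATA */
+{
+	struct ips_pend_sreq *sreq = psmi_mpool_get(proto->pend_sends_pool);
+
+	psmi_assert(sreq != NULL);
+	sreq->type = type;
+	sreq->req = req;
+	STAILQ_INSERT_TAIL(&proto->pend_sends.pendq, sreq, next);
+	/* make sure the pendq is drained on the next timer pass */
+	psmi_timer_request(proto->timerq, &proto->pend_sends.timer,
+			   PSMI_TIMER_PRIO_1);
+}
+
+/* Callback invoked by scb control when new scbs become available; it only
+ * re-arms the pendq timer, and the timer callback below does the real work.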
*/
+void ips_proto_rv_scbavail_callback(struct ips_scbctrl *scbc, void *context)
+{
+	struct ips_proto *proto = (struct ips_proto *)context;
+	struct ips_pend_sreq *sreq = STAILQ_FIRST(&proto->pend_sends.pendq);
+	if (sreq != NULL)
+		psmi_timer_request(proto->timerq,
+				   &proto->pend_sends.timer, PSMI_TIMER_PRIO_1);
+	return;
+}
+
+psm2_error_t
+ips_proto_timer_pendq_callback(struct psmi_timer *timer, uint64_t current)
+{
+	psm2_error_t err = PSM2_OK;
+	struct ips_pend_sends *pend_sends =
+	    (struct ips_pend_sends *)timer->context;
+	struct ips_pendsendq *phead = &pend_sends->pendq;
+	struct ips_proto *proto = (struct ips_proto *)pend_sends->proto;
+	struct ips_pend_sreq *sreq;
+
+	while (!STAILQ_EMPTY(phead)) {
+		sreq = STAILQ_FIRST(phead);
+		switch (sreq->type) {
+		case IPS_PENDSEND_EAGER_REQ:
+			err = ips_proto_mq_push_cts_req(proto, sreq->req);
+			break;
+		case IPS_PENDSEND_EAGER_DATA:
+			err = ips_proto_mq_push_rts_data(proto, sreq->req);
+			break;
+
+		default:
+			psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+					  "Unknown pendq state %d\n",
+					  sreq->type);
+		}
+
+		if (err == PSM2_OK) {
+			STAILQ_REMOVE_HEAD(phead, next);
+			psmi_mpool_put(sreq);
+		} else { /* out of scbs; wait for the next scb_avail callback */
+			/* printf("!!!!! breaking out of pendq progress\n"); */
+			break;
+		}
+	}
+
+	return err;
+}
+
+PSMI_INLINE(
+int
+between(int first_seq, int last_seq, int seq))
+{
+	if (last_seq >= first_seq) {
+		if (seq < first_seq || seq > last_seq) {
+			return 0;
+		}
+	} else {
+		if (seq > last_seq && seq < first_seq) {
+			return 0;
+		}
+	}
+	return 1;
+}
+
+PSMI_INLINE(
+int
+pio_dma_ack_valid(struct ips_proto *proto, struct ips_flow *flow,
+		  psmi_seqnum_t ack_seq_num))
+{
+	uint32_t last_num;
+	struct ips_scb_unackedq *unackedq = &flow->scb_unacked;
+
+	if (STAILQ_EMPTY(unackedq))
+		return 0;
+
+	/* scb_pend will be moved back when a nak is received, but the
+	 * packet may actually have been received and acked after the nak,
+	 * so we use the tail of the unacked queue, which may include
+	 * packets not yet sent out. This is overly conservative, but it
+	 * should be OK. 
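+	 *
+	 * between(first, last, seq) treats the window as circular. For
+	 * example, in an 11-bit sequence space with first = 2040 and
+	 * last = 5 (wrapped), between(2040, 5, 2) == 1 while
+	 * between(2040, 5, 1000) == 0.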
*/ + last_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num.psn_num; + + return between(flow->xmit_ack_num.psn_num, + last_num, ack_seq_num.psn_num); +} + +PSMI_INLINE( +struct ips_flow * +get_tidflow(struct ips_proto *proto, ips_epaddr_t *ipsaddr, + struct ips_message_header *p_hdr, psmi_seqnum_t ack_seq_num)) +{ + struct ips_protoexp *protoexp = proto->protoexp; + ptl_arg_t desc_id = p_hdr->data[0]; + struct ips_tid_send_desc *tidsendc; + ptl_arg_t desc_tidsendc; + struct ips_flow *flow; + uint32_t last_seq; + struct ips_scb_unackedq *unackedq; + + tidsendc = (struct ips_tid_send_desc *) + psmi_mpool_find_obj_by_index(protoexp->tid_desc_send_pool, + desc_id._desc_idx); + if (tidsendc == NULL) { + _HFI_ERROR + ("OPCODE_ACK: Index %d is out of range in tidflow ack\n", + desc_id._desc_idx); + return NULL; + } + + /* Ensure generation matches */ + psmi_mpool_get_obj_index_gen_count(tidsendc, + &desc_tidsendc._desc_idx, + &desc_tidsendc._desc_genc); + if (desc_tidsendc.u64 != desc_id.u64) + return NULL; + + /* Ensure ack is within window */ + flow = &tidsendc->tidflow; + unackedq = &flow->scb_unacked; + + /* No unacked scbs */ + if (STAILQ_EMPTY(unackedq)) + return NULL; + + /* Generation for ack should match */ + if (STAILQ_FIRST(unackedq)->seq_num.psn_gen != ack_seq_num.psn_gen) + return NULL; + + /* scb_pend will be moved back when an nak is received, but + * the packet may actually be received and acked after the nak, + * so we use the tail of unacked queue, which may include packets + * not being sent out yet, this is over do, but it should be OK. */ + last_seq = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num.psn_seq; + + if (between(flow->xmit_ack_num.psn_seq, + last_seq, ack_seq_num.psn_seq) == 0) + return NULL; + + return flow; +} + +/* NAK post process for tid flow */ +void ips_tidflow_nak_post_process(struct ips_proto *proto, + struct ips_flow *flow) +{ + ips_scb_t *scb; + uint32_t first_seq, ack_seq; + + scb = STAILQ_FIRST(&flow->scb_unacked); + first_seq = __be32_to_cpu(scb->ips_lrh.bth[2]) & HFI_BTH_SEQ_MASK; + ack_seq = (flow->xmit_ack_num.psn_seq - 1) & HFI_BTH_SEQ_MASK; + + /* If the ack SEQ falls into a multi-packets scb, + * don't re-send the packets already acked. 
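+	 *
+	 * For example, if the scb starts at seq 10 and the NAK acknowledges
+	 * up through seq 13, npkt = ((13 - 10) & HFI_BTH_SEQ_MASK) + 1 = 4
+	 * packets are stripped from the front of the scb before it is
+	 * resent.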
*/ + if (scb->nfrag > 1 && + between(first_seq, scb->seq_num.psn_seq, ack_seq)) { + uint32_t om, offset_in_tid, remaining_bytes_in_tid; + uint32_t npkt, pktlen, nbytes; + uint32_t idx, loop; + + /* how many packets acked in this scb */ + npkt = ((ack_seq - first_seq) & HFI_BTH_SEQ_MASK) + 1; + + /* Get offset/om from current packet header */ + offset_in_tid = __le32_to_cpu(scb->ips_lrh.khdr.kdeth0) & + HFI_KHDR_OFFSET_MASK; + om = (__le32_to_cpu(scb->ips_lrh.khdr.kdeth0) >> + HFI_KHDR_OM_SHIFT) & 0x1; + if (om) + offset_in_tid *= 64; + else + offset_in_tid *= 4; + /* bytes remaining in current tid */ + remaining_bytes_in_tid = + (IPS_TIDINFO_GET_LENGTH(scb->tsess[0]) << 12) - + offset_in_tid; + + /* packet length in current header */ + pktlen = scb->payload_size; + psmi_assert(min(remaining_bytes_in_tid, + scb->frag_size) >= pktlen); + psmi_assert((((__be16_to_cpu(scb->ips_lrh.lrh[2]) & + HFI_LRH_PKTLEN_MASK) << BYTE2DWORD_SHIFT) - + sizeof(struct ips_message_header) - + HFI_CRC_SIZE_IN_BYTES) == pktlen); + + /* Loop to find the position to start */ + idx = 0; + nbytes = 0; + loop = npkt; + while (loop) { + remaining_bytes_in_tid -= pktlen; + offset_in_tid += pktlen; + nbytes += pktlen; + first_seq++; + loop--; + + if (remaining_bytes_in_tid == 0) { + idx++; + remaining_bytes_in_tid = + IPS_TIDINFO_GET_LENGTH(scb-> + tsess[idx]) << 12; + offset_in_tid = 0; + } + + pktlen = min(remaining_bytes_in_tid, scb->frag_size); + } + psmi_assert((first_seq & HFI_BTH_SEQ_MASK) == + ((ack_seq + 1) & HFI_BTH_SEQ_MASK)); + + /* 0. update scb info */ + psmi_assert(scb->nfrag_remaining > npkt); + scb->nfrag_remaining -= npkt; + psmi_assert(scb->chunk_size_remaining > nbytes); + scb->chunk_size_remaining -= nbytes; + ips_scb_buffer(scb) = (void *)((char *)ips_scb_buffer(scb) + nbytes); + + /* 1. if last packet in sequence, set ACK, clear SH */ + if (scb->nfrag_remaining == 1) { + psmi_assert(scb->chunk_size_remaining <= + scb->frag_size); + scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; + scb->scb_flags &= ~IPS_SEND_FLAG_HDRSUPP; + + /* last packet is what remaining */ + pktlen = scb->chunk_size_remaining; + } + + /* 2. set new packet sequence number */ + scb->ips_lrh.bth[2] = __cpu_to_be32( + ((first_seq & HFI_BTH_SEQ_MASK) << HFI_BTH_SEQ_SHIFT) | + ((scb->seq_num.psn_gen & + HFI_BTH_GEN_MASK) << HFI_BTH_GEN_SHIFT) | + (scb->scb_flags & IPS_SEND_FLAG_ACKREQ)); + + /* 3. set new packet offset */ + scb->ips_lrh.exp_offset += nbytes; + + /* 4. if packet length is changed, set new length */ + if (scb->payload_size != pktlen) { + scb->payload_size = pktlen; + scb->ips_lrh.lrh[2] = __cpu_to_be16(( + (scb->payload_size + + sizeof(struct ips_message_header) + + HFI_CRC_SIZE_IN_BYTES) >> + BYTE2DWORD_SHIFT) & HFI_LRH_PKTLEN_MASK); + } + + /* 5. set new tidctrl and tidinfo array */ + scb->tsess = &scb->tsess[idx]; + scb->tsess_length -= idx * sizeof(uint32_t); + scb->tidctrl = IPS_TIDINFO_GET_TIDCTRL(scb->tsess[0]); + + /* 6. calculate new offset mode */ + if (offset_in_tid < 131072) { /* 2^15 * 4 */ + offset_in_tid /= 4; + om = 0; + } else { + offset_in_tid /= 64; + om = 1; + } + + /* 7. 
set new tidinfo */ + scb->ips_lrh.khdr.kdeth0 = __cpu_to_le32( + (offset_in_tid & HFI_KHDR_OFFSET_MASK) | + (om << HFI_KHDR_OM_SHIFT) | + (IPS_TIDINFO_GET_TID(scb->tsess[0]) + << HFI_KHDR_TID_SHIFT) | + (scb->tidctrl << HFI_KHDR_TIDCTRL_SHIFT) | + (scb->scb_flags & IPS_SEND_FLAG_INTR) | + (scb->scb_flags & IPS_SEND_FLAG_HDRSUPP) | + (IPS_PROTO_VERSION << HFI_KHDR_KVER_SHIFT)); + } + + /* Update unacked scb's to use the new generation */ + while (scb) { + /* update with new generation */ + scb->ips_lrh.bth[2] = __cpu_to_be32( + (__be32_to_cpu(scb->ips_lrh.bth[2]) & + (~(HFI_BTH_GEN_MASK << HFI_BTH_GEN_SHIFT))) | + ((flow->xmit_seq_num.psn_gen & + HFI_BTH_GEN_MASK) << HFI_BTH_GEN_SHIFT)); + scb->seq_num.psn_gen = flow->xmit_seq_num.psn_gen; + scb = SLIST_NEXT(scb, next); + } +} + +/* NAK post process for dma flow */ +void ips_dmaflow_nak_post_process(struct ips_proto *proto, + struct ips_flow *flow) +{ + ips_scb_t *scb; + uint32_t first_num, ack_num; + uint16_t padding = 0; + + scb = STAILQ_FIRST(&flow->scb_unacked); + first_num = __be32_to_cpu(scb->ips_lrh.bth[2]) & proto->psn_mask; + ack_num = (flow->xmit_ack_num.psn_num - 1) & proto->psn_mask; + + + /* If the ack PSN falls into a multi-packets scb, + * don't re-send the packets already acked. */ + psmi_assert(scb->nfrag > 1); + if (between(first_num, scb->seq_num.psn_num, ack_num)) { + uint32_t npkt, pktlen, nbytes; + + /* how many packets acked in this scb */ + npkt = ((ack_num - first_num) & proto->psn_mask) + 1; + + /* how many bytes already acked in this scb, for eager receive + * packets, all payload size is frag_size except the last packet + * which is not acked yet */ + pktlen = scb->frag_size; + nbytes = (((ack_num - first_num) & + proto->psn_mask) + 1) * pktlen; + + /* 0. update scb info */ + psmi_assert(scb->nfrag_remaining > npkt); + scb->nfrag_remaining -= npkt; + psmi_assert(scb->chunk_size_remaining > nbytes); + scb->chunk_size_remaining -= nbytes; + ips_scb_buffer(scb) = (void *)((char *)ips_scb_buffer(scb) + nbytes); + + /* 1. if last packet in sequence, set IPS_SEND_FLAG_ACKREQ */ + if (scb->chunk_size_remaining <= scb->frag_size) { + psmi_assert(scb->nfrag_remaining == 1); + scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; + + /* last packet is what remaining */ + /* check if padding is required*/ + padding = scb->chunk_size_remaining & 0x3; + if_pf(padding) { + /* how much to pad with also equals how many bytes we need + * to rewind the source buffer offset by to keep it dw aligned */ + padding = 4 - padding; + ips_scb_buffer(scb) = (void *)((char*)ips_scb_buffer(scb) - padding); + scb->chunk_size_remaining += padding; + } + pktlen = scb->chunk_size_remaining; + } + + /* 2. set new packet sequence number */ + scb->ips_lrh.bth[2] = __cpu_to_be32( + ((ack_num + 1) & proto->psn_mask) | + (scb->scb_flags & IPS_SEND_FLAG_ACKREQ)); + + /* 3. set new packet offset adjusted with padding */ + scb->ips_lrh.hdr_data.u32w0 += nbytes - padding; + + /* 4. if packet length is changed, set new length */ + if (scb->payload_size != pktlen) { + scb->payload_size = pktlen; + scb->ips_lrh.lrh[2] = __cpu_to_be16(( + (scb->payload_size + + sizeof(struct ips_message_header) + + HFI_CRC_SIZE_IN_BYTES) >> + BYTE2DWORD_SHIFT) & HFI_LRH_PKTLEN_MASK); + } + } +} + +/* process an incoming ack message. 
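It walks the
+ * flow's unacked queue and retires every scb that the ack covers.
+ */
+
+/*
+ * A minimal sketch (illustrative only, not part of this patch) of the
+ * per-scb window test the retirement loop below applies; between() and
+ * all fields are the ones defined earlier in this file, only the wrapper
+ * function is hypothetical.
+ */
+static inline int
+example_ack_covers_first_scb(struct ips_flow *flow,
+			     psmi_seqnum_t ack_seq_num)
+{
+	ips_scb_t *first = STAILQ_FIRST(&flow->scb_unacked);
+	ips_scb_t *last;
+
+	if (first == NULL)
+		return 0;	/* nothing outstanding: stale ack */
+	last = STAILQ_LAST(&flow->scb_unacked, ips_scb, nextq);
+	/* circular test: the ack must fall within [first, last] */
+	return between(first->seq_num.psn_num, last->seq_num.psn_num,
+		       ack_seq_num.psn_num);
+}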
+/* Kept as a separate function from the NAK path to allow better
+ * optimization by the compiler. */
+int
+ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev)
+{
+	struct ips_proto *proto = rcv_ev->proto;
+	ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	struct ips_flow *flow = NULL;
+	struct ips_scb_unackedq *unackedq;
+	struct ips_scb_pendlist *scb_pend;
+	psmi_seqnum_t ack_seq_num, last_seq_num;
+	ips_epaddr_flow_t flowid;
+	ips_scb_t *scb;
+	uint32_t tidctrl;
+
+	ack_seq_num.psn_num = p_hdr->ack_seq_num;
+	tidctrl = GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0));
+	if (!tidctrl && ((flowid = ips_proto_flowid(p_hdr)) < EP_FLOW_TIDFLOW)) {
+		ack_seq_num.psn_num =
+		    (ack_seq_num.psn_num - 1) & proto->psn_mask;
+		psmi_assert(flowid < EP_FLOW_LAST);
+		flow = &ipsaddr->flows[flowid];
+		if (!pio_dma_ack_valid(proto, flow, ack_seq_num))
+			goto ret;
+	} else {
+		ack_seq_num.psn_seq -= 1;
+		flow = get_tidflow(proto, ipsaddr, p_hdr, ack_seq_num);
+		if (!flow)	/* Invalid ack for flow */
+			goto ret;
+	}
+	flow->xmit_ack_num.psn_num = p_hdr->ack_seq_num;
+
+	unackedq = &flow->scb_unacked;
+	scb_pend = &flow->scb_pend;
+
+	if (STAILQ_EMPTY(unackedq))
+		goto ret;
+
+	last_seq_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num;
+
+	INC_TIME_SPEND(TIME_SPEND_USER2);
+
+	/* For tidflow, psn_gen matches. So for all flows, tid/pio/dma,
+	 * we can use the general psn_num to compare the PSN. */
+	while (between((scb = STAILQ_FIRST(unackedq))->seq_num.psn_num,
+		       last_seq_num.psn_num, ack_seq_num.psn_num)
+	    ) {
+
+		/* take it out of the xmit queue and ... */
+		if (scb == SLIST_FIRST(scb_pend)) {
+#ifdef PSM_DEBUG
+			flow->scb_num_pending--;
+#endif
+			SLIST_REMOVE_HEAD(scb_pend, next);
+		}
+
+		STAILQ_REMOVE_HEAD(unackedq, nextq);
+#ifdef PSM_DEBUG
+		flow->scb_num_unacked--;
+		psmi_assert(flow->scb_num_unacked >= flow->scb_num_pending);
+#endif
+		flow->credits += scb->nfrag;
+
+		if (flow->transfer == PSM_TRANSFER_DMA &&
+		    scb->dma_complete == 0)
+			ips_proto_dma_wait_until(proto, scb);
+
+		if (scb->callback)
+			(*scb->callback) (scb->cb_param, scb->nfrag > 1 ?
+					  scb->chunk_size : scb->payload_size);
+
+		if (!(scb->scb_flags & IPS_SEND_FLAG_PERSISTENT))
+			ips_scbctrl_free(scb);
+
+		/* set all index pointers to NULL if all frames have been
+		 * acked */
+		if (STAILQ_EMPTY(unackedq)) {
+			psmi_timer_cancel(proto->timerq, flow->timer_ack);
+			flow->timer_ack = NULL;
+			psmi_timer_cancel(proto->timerq, flow->timer_send);
+			flow->timer_send = NULL;
+
+			SLIST_FIRST(scb_pend) = NULL;
+			psmi_assert(flow->scb_num_pending == 0);
+			/* Reset congestion window - all packets ACK'd */
+			flow->credits = flow->cwin = proto->flow_credits;
+			flow->ack_interval = max((flow->credits >> 2) - 1, 1);
+			flow->flags &= ~IPS_FLOW_FLAG_CONGESTED;
+			goto ret;
+		} else if (flow->timer_ack == scb->timer_ack) {
+			/*
+			 * Exchange timers with the last scb on the unackedq.
+			 * The timer in an scb is used by the flow; cancelling
+			 * the current timer and then requesting a new one
+			 * takes more time, so instead we exchange the timers
+			 * between the scb being freed and the last scb on
+			 * the unacked queue. 
+ */ + psmi_timer *timer; + ips_scb_t *last = STAILQ_LAST(unackedq, ips_scb, nextq); + + timer = scb->timer_ack; + scb->timer_ack = last->timer_ack; + last->timer_ack = timer; + timer = scb->timer_send; + scb->timer_send = last->timer_send; + last->timer_send = timer; + + scb->timer_ack->context = scb; + scb->timer_send->context = scb; + last->timer_ack->context = last; + last->timer_send->context = last; + } + } + + psmi_assert(!STAILQ_EMPTY(unackedq)); /* sanity for above loop */ + + /* CCA: If flow is congested adjust rate */ + if_pf(rcv_ev->is_congested & IPS_RECV_EVENT_BECN) { + if ((flow->path->pr_ccti + + proto->cace[flow->path->pr_sl].ccti_increase) <= + proto->ccti_limit) { + ips_cca_adjust_rate(flow->path, + proto->cace[flow->path->pr_sl]. + ccti_increase); + /* Clear congestion event */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_BECN; + } + } + else { + /* Increase congestion window if flow is not congested */ + if_pf(flow->cwin < proto->flow_credits) { + flow->credits += + min(flow->cwin << 1, + proto->flow_credits) - flow->cwin; + flow->cwin = min(flow->cwin << 1, proto->flow_credits); + flow->ack_interval = max((flow->credits >> 2) - 1, 1); + } + } + + /* Reclaimed some credits - attempt to flush flow */ + if (!SLIST_EMPTY(scb_pend)) + flow->flush(flow, NULL); + + /* + * If the next packet has not even been put on the wire, cancel the + * retransmission timer since we're still presumably waiting on free + * pio bufs + */ + if (STAILQ_FIRST(unackedq)->abs_timeout == TIMEOUT_INFINITE) + psmi_timer_cancel(proto->timerq, flow->timer_ack); + +ret: + return IPS_RECVHDRQ_CONTINUE; +} + +/* process an incoming nack message. Separate function to allow */ +/* for better optimization by compiler */ +int ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_proto *proto = rcv_ev->proto; + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + struct ips_flow *flow = NULL; + struct ips_scb_unackedq *unackedq; + struct ips_scb_pendlist *scb_pend; + psmi_seqnum_t ack_seq_num, last_seq_num; + psm_protocol_type_t protocol; + ips_epaddr_flow_t flowid; + ips_scb_t *scb; + uint32_t tidctrl; + + INC_TIME_SPEND(TIME_SPEND_USER3); + + ack_seq_num.psn_num = p_hdr->ack_seq_num; + tidctrl = GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0)); + if (!tidctrl && ((flowid = ips_proto_flowid(p_hdr)) < EP_FLOW_TIDFLOW)) { + protocol = PSM_PROTOCOL_GO_BACK_N; + psmi_assert(flowid < EP_FLOW_LAST); + flow = &ipsaddr->flows[flowid]; + if (!pio_dma_ack_valid(proto, flow, ack_seq_num)) + goto ret; + ack_seq_num.psn_num = + (ack_seq_num.psn_num - 1) & proto->psn_mask; + flow->xmit_ack_num.psn_num = p_hdr->ack_seq_num; + } else { + protocol = PSM_PROTOCOL_TIDFLOW; + flow = get_tidflow(proto, ipsaddr, p_hdr, ack_seq_num); + if (!flow) + goto ret; /* Invalid ack for flow */ + ack_seq_num.psn_seq--; + + psmi_assert(flow->xmit_seq_num.psn_gen == ack_seq_num.psn_gen); + psmi_assert(flow->xmit_ack_num.psn_gen == ack_seq_num.psn_gen); + /* Update xmit_ack_num with both new generation and new + * acked sequence; update xmit_seq_num with the new flow + * generation, don't change the sequence number. 
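+	 *
+	 * For example, with old state (gen=5, seq=100) and a NAK whose
+	 * data[1].u32w0 carries (gen=6, seq=99): xmit_ack_num becomes
+	 * (gen=6, seq=99) and xmit_seq_num becomes (gen=6, seq=100) - the
+	 * sequence is kept, only the generation moves.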
*/ + flow->xmit_ack_num = (psmi_seqnum_t) p_hdr->data[1].u32w0; + flow->xmit_seq_num.psn_gen = flow->xmit_ack_num.psn_gen; + psmi_assert(flow->xmit_seq_num.psn_gen != ack_seq_num.psn_gen); + } + + unackedq = &flow->scb_unacked; + scb_pend = &flow->scb_pend; + + if (STAILQ_EMPTY(unackedq)) + goto ret; + + last_seq_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num; + + proto->epaddr_stats.nak_recv++; + + _HFI_VDBG("got a nack %d on flow %d, " + "first is %d, last is %d\n", ack_seq_num.psn_num, + flow->flowid, + STAILQ_EMPTY(unackedq) ? -1 : STAILQ_FIRST(unackedq)->seq_num. + psn_num, STAILQ_EMPTY(unackedq) ? -1 : STAILQ_LAST(unackedq, + ips_scb, + nextq)-> + seq_num.psn_num); + + /* For tidflow, psn_gen matches. So for all flows, tid/pio/dma, + * we can use general psn_num to compare the PSN. */ + while (between((scb = STAILQ_FIRST(unackedq))->seq_num.psn_num, + last_seq_num.psn_num, ack_seq_num.psn_num) + ) { + /* take it out of the xmit queue and .. */ + if (scb == SLIST_FIRST(scb_pend)) { +#ifdef PSM_DEBUG + flow->scb_num_pending--; +#endif + SLIST_REMOVE_HEAD(scb_pend, next); + } + + STAILQ_REMOVE_HEAD(unackedq, nextq); +#ifdef PSM_DEBUG + flow->scb_num_unacked--; + psmi_assert(flow->scb_num_unacked >= flow->scb_num_pending); +#endif + + if (flow->transfer == PSM_TRANSFER_DMA && + scb->dma_complete == 0) + ips_proto_dma_wait_until(proto, scb); + + if (scb->callback) + (*scb->callback) (scb->cb_param, scb->nfrag > 1 ? + scb->chunk_size : scb->payload_size); + + if (!(scb->scb_flags & IPS_SEND_FLAG_PERSISTENT)) + ips_scbctrl_free(scb); + + /* set all index pointer to NULL if all frames has been acked */ + if (STAILQ_EMPTY(unackedq)) { + psmi_timer_cancel(proto->timerq, flow->timer_ack); + flow->timer_ack = NULL; + psmi_timer_cancel(proto->timerq, flow->timer_send); + flow->timer_send = NULL; + + SLIST_FIRST(scb_pend) = NULL; + psmi_assert(flow->scb_num_pending == 0); + /* Reset congestion window if all packets acknowledged */ + flow->credits = flow->cwin = proto->flow_credits; + flow->ack_interval = max((flow->credits >> 2) - 1, 1); + flow->flags &= ~IPS_FLOW_FLAG_CONGESTED; + goto ret; + } else if (flow->timer_ack == scb->timer_ack) { + /* + * Exchange timers with last scb on unackedq. + * timer in scb is used by flow, cancelling current + * timer and then requesting a new timer takes more + * time, instead, we exchange the timer between current + * freeing scb and the last scb on unacked queue. 
+ */ + psmi_timer *timer; + ips_scb_t *last = STAILQ_LAST(unackedq, ips_scb, nextq); + + timer = scb->timer_ack; + scb->timer_ack = last->timer_ack; + last->timer_ack = timer; + timer = scb->timer_send; + scb->timer_send = last->timer_send; + last->timer_send = timer; + + scb->timer_ack->context = scb; + scb->timer_send->context = scb; + last->timer_ack->context = last; + last->timer_send->context = last; + } + } + + psmi_assert(!STAILQ_EMPTY(unackedq)); /* sanity for above loop */ + + if (protocol == PSM_PROTOCOL_TIDFLOW) + ips_tidflow_nak_post_process(proto, flow); + else if (scb->nfrag > 1) + ips_dmaflow_nak_post_process(proto, flow); + + /* Always cancel ACK timer as we are going to restart the flow */ + psmi_timer_cancel(proto->timerq, flow->timer_ack); + + /* What's now pending is all that was unacked */ + SLIST_FIRST(scb_pend) = scb; +#ifdef PSM_DEBUG + flow->scb_num_pending = flow->scb_num_unacked; +#endif + while (scb && !(scb->scb_flags & IPS_SEND_FLAG_PENDING)) { + /* Wait for the previous dma completion */ + if (flow->transfer == PSM_TRANSFER_DMA && + scb->dma_complete == 0) + ips_proto_dma_wait_until(proto, scb); + + scb->scb_flags |= IPS_SEND_FLAG_PENDING; + scb = SLIST_NEXT(scb, next); + } + + /* If NAK with congestion bit set - delay re-transmitting and THEN adjust + * CCA rate. + */ + if_pf(rcv_ev->is_congested & IPS_RECV_EVENT_BECN) { + uint64_t offset; + + /* Clear congestion event and mark flow as congested */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_BECN; + flow->flags |= IPS_FLOW_FLAG_CONGESTED; + + /* For congested flow use slow start i.e. reduce congestion window. + * For TIDFLOW we cannot reduce congestion window as peer expects + * header packets at regular intervals (protoexp->hdr_pkt_interval). + */ + if (flow->protocol != PSM_PROTOCOL_TIDFLOW) + flow->credits = flow->cwin = 1; + else + flow->credits = flow->cwin; + + flow->ack_interval = max((flow->credits >> 2) - 1, 1); + + /* During congestion cancel send timer and delay retransmission by + * random interval + */ + psmi_timer_cancel(proto->timerq, flow->timer_send); + if (SLIST_FIRST(scb_pend)->ack_timeout != TIMEOUT_INFINITE) + offset = (SLIST_FIRST(scb_pend)->ack_timeout >> 1); + else + offset = 0; + struct drand48_data drand48_data; + srand48_r((long int)(ipsaddr->epaddr.epid + proto->ep->epid), &drand48_data); + double rnum; + drand48_r(&drand48_data, &rnum); + psmi_timer_request(proto->timerq, flow->timer_send, + (get_cycles() + + (uint64_t) (offset * + (rnum + 1.0)))); + } + else { + int num_resent = 0; + + /* Reclaim all credits upto congestion window only */ + flow->credits = flow->cwin; + flow->ack_interval = max((flow->credits >> 2) - 1, 1); + + /* Flush pending scb's */ + flow->flush(flow, &num_resent); + + proto->epaddr_stats.send_rexmit += num_resent; + } + +ret: + return IPS_RECVHDRQ_CONTINUE; +} + +int +ips_proto_process_err_chk(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_recvhdrq *recvq = (struct ips_recvhdrq *)rcv_ev->recvq; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); + struct ips_flow *flow; + psmi_seqnum_t seq_num; + int16_t seq_off; + + INC_TIME_SPEND(TIME_SPEND_USER4); + PSM2_LOG_MSG("entering"); + psmi_assert(flowid < EP_FLOW_LAST); + flow = &ipsaddr->flows[flowid]; + recvq->proto->epaddr_stats.err_chk_recv++; + /* Ignore FECN bit since this is the control path */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; + + seq_num.psn_val = __be32_to_cpu(p_hdr->bth[2]); + seq_off = 
(int16_t) (flow->recv_seq_num.psn_num - seq_num.psn_num); + + if_pf(seq_off <= 0) { + _HFI_VDBG("naking for seq=%d, off=%d on flowid %d\n", + seq_num.psn_num, seq_off, flowid); + + if (seq_off < -flow->ack_interval) + flow->flags |= IPS_FLOW_FLAG_GEN_BECN; + + ips_proto_send_nak(recvq, flow); + flow->flags |= IPS_FLOW_FLAG_NAK_SEND; + } + else { + ips_scb_t ctrlscb; + + ctrlscb.scb_flags = 0; + ctrlscb.ips_lrh.ack_seq_num = flow->recv_seq_num.psn_num; + ips_proto_send_ctrl_message(flow, OPCODE_ACK, + &ipsaddr->ctrl_msg_queued, + &ctrlscb, ctrlscb.cksum, 0); + } + + PSM2_LOG_MSG("leaving"); + return IPS_RECVHDRQ_CONTINUE; +} + +int +ips_proto_process_err_chk_gen(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_recvhdrq *recvq = (struct ips_recvhdrq *)rcv_ev->recvq; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + struct ips_protoexp *protoexp = recvq->proto->protoexp; + struct ips_tid_recv_desc *tidrecvc; + psmi_seqnum_t err_seqnum, recvseq; + ptl_arg_t desc_id = p_hdr->data[0]; + ptl_arg_t send_desc_id = p_hdr->data[1]; + int16_t seq_off; + uint8_t ack_type; + ips_scb_t ctrlscb; + + INC_TIME_SPEND(TIME_SPEND_USER4); + PSM2_LOG_MSG("entering"); + recvq->proto->epaddr_stats.err_chk_recv++; + + /* Ignore FECN bit since this is the control path */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; + + /* Get the flowgenseq for err chk gen */ + err_seqnum.psn_val = __be32_to_cpu(p_hdr->bth[2]); + + /* Get receive descriptor */ + psmi_assert(desc_id._desc_idx < HFI_TF_NFLOWS); + tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx]; + + if (tidrecvc->rdescid._desc_genc != desc_id._desc_genc) { + /* Receive descriptor mismatch in time and space. + * Stale err chk gen, drop packet + */ + _HFI_DBG + ("ERR_CHK_GEN: gen mismatch Pkt: 0x%x, Current: 0x%x\n", + desc_id._desc_genc, tidrecvc->rdescid._desc_genc); + PSM2_LOG_MSG("leaving"); + return IPS_RECVHDRQ_CONTINUE; + } + psmi_assert(tidrecvc->state == TIDRECVC_STATE_BUSY); + + /* + * We change tidrecvc->tidflow_genseq here only when a new generation + * is allocated and programmed into hardware. Otherwise we use local + * variable recvseq to create the reply. + */ + recvseq = tidrecvc->tidflow_genseq; + + /* Get the latest seq from hardware tidflow table. But + * only do this when context sharing is not used, because + * context sharing might drop packet even though hardware + * has received it successfully. 
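+	 *
+	 * The 11-bit sequence comparison further below wraps: e.g. with
+	 * recvseq.psn_seq = 2040 and err_seqnum.psn_seq = 5, seq_off =
+	 * 5 - 2040 = -2035, corrected to -2035 + 2048 = 13; since 13 < 1024
+	 * this counts as lost packets and is NAKed with a freshly
+	 * allocated generation.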
+ */ + if (!tidrecvc->context->tf_ctrl) + { + uint64_t tf; + uint32_t seqno=0; + + psmi_hal_tidflow_get(tidrecvc->rdescid._desc_idx, &tf, + tidrecvc->context->psm_hw_ctxt); + psmi_hal_tidflow_get_seqnum(tf, &seqno); + recvseq.psn_seq = seqno; + } + + if (err_seqnum.psn_gen != recvseq.psn_gen) { + ack_type = OPCODE_NAK; + /* NAK without allocating a new generation */ + + /* My current generation and last received seq */ + ctrlscb.ips_lrh.data[1].u32w0 = recvseq.psn_val; + } else { + /* Either lost packets or lost ack, we need to deal + * with wrap around of the seq value from 2047 to 0 + * because seq is only 11 bits */ + seq_off = (int16_t)(err_seqnum.psn_seq - recvseq.psn_seq); + if (seq_off < 0) + seq_off += 2048; /* seq is 11 bits */ + + if (seq_off < 1024) { + ack_type = OPCODE_NAK; + /* NAK with allocating a new generation */ + + /* set latest seq */ + tidrecvc->tidflow_genseq.psn_seq = recvseq.psn_seq; + /* allocate and set a new generation */ + ips_protoexp_flow_newgen(tidrecvc); + /* get the new generation */ + recvseq.psn_gen = tidrecvc->tidflow_genseq.psn_gen; + + /* My new generation and last received seq */ + ctrlscb.ips_lrh.data[1].u32w0 = recvseq.psn_val; + } else + /* ACK with last received seq, + * no need to set ips_lrh.data[1].u32w0 */ + ack_type = OPCODE_ACK; + } + + ctrlscb.scb_flags = 0; + ctrlscb.ips_lrh.data[0].u64 = send_desc_id.u64; + /* Keep peer generation but use my last received sequence */ + err_seqnum.psn_seq = recvseq.psn_seq; + ctrlscb.ips_lrh.ack_seq_num = err_seqnum.psn_val; + + /* May want to generate a BECN if a lot of swapped generations */ + if_pf((tidrecvc->tidflow_nswap_gen > 4) && + (protoexp->proto->flags & IPS_PROTO_FLAG_CCA)) { + _HFI_CCADBG + ("ERR_CHK_GEN: Generating BECN. Number of swapped generations: %d.\n", + tidrecvc->tidflow_nswap_gen); + /* Mark flow to generate BECN in control packet */ + tidrecvc->tidflow.flags |= IPS_FLOW_FLAG_GEN_BECN; + + /* Update stats for congestion encountered */ + recvq->proto->epaddr_stats.congestion_pkts++; + } + + ips_proto_send_ctrl_message(&tidrecvc->tidflow, + ack_type, &tidrecvc->ctrl_msg_queued, + &ctrlscb, ctrlscb.cksum, 0); + + /* Update stats for expected window */ + tidrecvc->stats.nErrChkReceived++; + if (ack_type == OPCODE_NAK) + tidrecvc->stats.nReXmit++; /* Update stats for retransmit (Sent a NAK) */ + + PSM2_LOG_MSG("leaving"); + return IPS_RECVHDRQ_CONTINUE; +} + +int +ips_proto_process_becn(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_proto *proto = rcv_ev->proto; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + int flowid = ips_proto_flowid(p_hdr); + struct ips_flow *flow; + + psmi_assert(flowid < EP_FLOW_LAST); + flow = &ipsaddr->flows[flowid]; + if ((flow->path->pr_ccti + + proto->cace[flow->path->pr_sl].ccti_increase) <= proto->ccti_limit) { + ips_cca_adjust_rate(flow->path, + proto->cace[flow->path->pr_sl].ccti_increase); + /* Clear congestion event */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_BECN; + } + + return IPS_RECVHDRQ_CONTINUE; +} + +static void ips_bad_opcode(uint8_t op_code, struct ips_message_header *proto) +{ + _HFI_DBG("Discarding message with bad opcode 0x%x\n", op_code); + + if (hfi_debug & __HFI_DBG) { + ips_proto_show_header(proto, "received bad opcode"); + ips_proto_dump_frame(proto, sizeof(struct ips_message_header), + "Opcode error protocol header dump"); + } +} + +int +ips_proto_process_unknown_opcode(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_message_header *protocol_header = rcv_ev->p_hdr; + struct 
ips_proto *proto = rcv_ev->proto; + + proto->stats.unknown_packets++; + ips_bad_opcode(_get_proto_hfi_opcode(protocol_header), protocol_header); + + return IPS_RECVHDRQ_CONTINUE; +} + +int +ips_proto_connect_disconnect(struct ips_recvhdrq_event *rcv_ev) +{ + psm2_error_t err = PSM2_OK; + char *payload = ips_recvhdrq_event_payload(rcv_ev); + uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev); + + psmi_assert(payload); + err = ips_proto_process_connect(rcv_ev->proto, + _get_proto_hfi_opcode(rcv_ev->p_hdr), + rcv_ev->p_hdr, + payload, + paylen); + if (err != PSM2_OK) + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Process connect/disconnect error: %d, opcode %d\n", + err, _get_proto_hfi_opcode(rcv_ev->p_hdr)); + + return IPS_RECVHDRQ_CONTINUE; +} + +/* Return 1 if packet is ok. */ +/* Return 0 if packet should be skipped */ +int ips_proto_process_unknown(const struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + struct ips_proto *proto = rcv_ev->proto; + psm2_ep_t ep_err; + char *pkt_type; + int opcode = (int)_get_proto_hfi_opcode(p_hdr); + + /* + * If the protocol is disabled or not yet enabled, no processing happens + * We set it t_init to 0 when disabling the protocol + */ + if (proto->t_init == 0) + return IPS_RECVHDRQ_CONTINUE; + + /* Connect messages don't have to be from a known epaddr */ + switch (opcode) { + case OPCODE_CONNECT_REQUEST: + case OPCODE_CONNECT_REPLY: + case OPCODE_DISCONNECT_REQUEST: + case OPCODE_DISCONNECT_REPLY: + ips_proto_connect_disconnect( + (struct ips_recvhdrq_event *)rcv_ev); + return IPS_RECVHDRQ_CONTINUE; + default: + break; + } + + /* Packet from "unknown" peer. Log the packet and payload if at appropriate + * verbose level. + */ + { + char *payload = ips_recvhdrq_event_payload(rcv_ev); + uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev) + + ((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3); + + ips_proto_dump_err_stats(proto); + + if (hfi_debug & __HFI_PKTDBG) { + ips_proto_dump_frame(rcv_ev->p_hdr, + HFI_MESSAGE_HDR_SIZE, "header"); + if (paylen) + ips_proto_dump_frame(payload, paylen, "data"); + } + } + + /* Other messages are definitely crosstalk. */ + /* out-of-context expected messages are always fatal */ + if (psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf) == PSM_HAL_RHF_RX_TYPE_EXPECTED) { + ep_err = PSMI_EP_NORETURN; + pkt_type = "expected"; + } else if (psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf) == PSM_HAL_RHF_RX_TYPE_EAGER) { + ep_err = PSMI_EP_LOGEVENT; + pkt_type = "eager"; + } else { + ep_err = PSMI_EP_NORETURN; + pkt_type = "unknown"; + } + + proto->stats.stray_packets++; + + /* If we have debug mode, print the complete packet every time */ + if (hfi_debug & __HFI_PKTDBG) + ips_proto_show_header(p_hdr, "invalid connidx"); + + /* At this point we are out of luck. 
*/ + psmi_handle_error(ep_err, PSM2_EPID_NETWORK_ERROR, + "Received %s message(s) ptype=0x%x opcode=0x%x" + " from an unknown process", pkt_type, psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf), opcode); + + return 0; /* Always skip this packet unless the above call was a noreturn + * call */ +} + +/* get the error string as a number and a string */ +static void rhf_errnum_string(char *msg, size_t msglen, long err) +{ + int len; + char *errmsg; + + len = snprintf(msg, msglen, "RHFerror %lx: ", err); + if (len > 0 && len < msglen) { + errmsg = msg + len; + msglen -= len; + } else + errmsg = msg; + *errmsg = 0; + ips_proto_get_rhf_errstring(err, errmsg, msglen); +} + +/* + * Error handling + */ +int ips_proto_process_packet_error(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_proto *proto = rcv_ev->proto; + int pkt_verbose_err = hfi_debug & __HFI_PKTDBG; + int tiderr = psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf) & PSMI_HAL_RHF_ERR_TID; + int tf_seqerr = psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf) & PSMI_HAL_RHF_ERR_TFSEQ; + int tf_generr = psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf) & PSMI_HAL_RHF_ERR_TFGEN; + int data_err = psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf) & + (PSMI_HAL_RHF_ERR_ICRC | PSMI_HAL_RHF_ERR_ECC | PSMI_HAL_RHF_ERR_LEN | + PSMI_HAL_RHF_ERR_DC | PSMI_HAL_RHF_ERR_DCUN | PSMI_HAL_RHF_ERR_KHDRLEN); + char pktmsg[128]; + + *pktmsg = 0; + /* + * Tid errors on eager pkts mean we get a headerq overflow, perfectly + * safe. Tid errors on expected or other packets means trouble. + */ + if (tiderr && psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf) == PSM_HAL_RHF_RX_TYPE_EAGER) { + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + + /* Payload dropped - Determine flow for this header and see if + * we need to generate a NAK. + * + * ALL PACKET DROPS IN THIS CATEGORY CAN BE FLAGGED AS DROPPED DUE TO + * CONGESTION AS THE EAGER BUFFER IS FULL. + * + * Possible eager packet type: + * + * Ctrl Message - ignore + * MQ message - Can get flow and see if we need to NAK. + * AM message - Can get flow and see if we need to NAK. 
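+	 *
+	 * For the MQ/AM cases the recovery below boils down to (a sketch
+	 * using the fields exactly as the handler does):
+	 *
+	 *   diff = (int16_t)(pkt_psn - flow->recv_seq_num.psn_num);
+	 *   if (diff >= 0 && !(flow->flags & IPS_FLOW_FLAG_NAK_SEND))
+	 *           ips_proto_send_nak(recvq, flow); // ask the peer to rewind
+	 *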
+ */ + + proto->stats.hdr_overflow++; + if (data_err) + return 0; + + switch (_get_proto_hfi_opcode(p_hdr)) { + case OPCODE_TINY: + case OPCODE_SHORT: + case OPCODE_EAGER: + case OPCODE_LONG_RTS: + case OPCODE_LONG_CTS: + case OPCODE_LONG_DATA: + case OPCODE_AM_REQUEST: + case OPCODE_AM_REQUEST_NOREPLY: + case OPCODE_AM_REPLY: + { + ips_epaddr_flow_t flowid = + ips_proto_flowid(p_hdr); + struct ips_epstate_entry *epstaddr; + struct ips_flow *flow; + psmi_seqnum_t sequence_num; + int16_t diff; + + /* Obtain ipsaddr for packet */ + epstaddr = + ips_epstate_lookup(rcv_ev->recvq->epstate, + rcv_ev->p_hdr->connidx); + if_pf(epstaddr == NULL + || epstaddr->ipsaddr == NULL) + return 0; /* Unknown packet - drop */ + + rcv_ev->ipsaddr = epstaddr->ipsaddr; + + psmi_assert(flowid < EP_FLOW_LAST); + flow = &rcv_ev->ipsaddr->flows[flowid]; + sequence_num.psn_val = + __be32_to_cpu(p_hdr->bth[2]); + diff = + (int16_t) (sequence_num.psn_num - + flow->recv_seq_num.psn_num); + + if (diff >= 0 + && !(flow-> + flags & IPS_FLOW_FLAG_NAK_SEND)) { + /* Mark flow as congested and attempt to generate NAK */ + flow->flags |= IPS_FLOW_FLAG_GEN_BECN; + proto->epaddr_stats.congestion_pkts++; + + flow->flags |= IPS_FLOW_FLAG_NAK_SEND; + flow->cca_ooo_pkts = 0; + ips_proto_send_nak((struct ips_recvhdrq + *)rcv_ev->recvq, + flow); + } + + /* Safe to process ACKs from header */ + ips_proto_process_ack(rcv_ev); + } + break; + case OPCODE_EXPTID: + /* If RSM is matching packets that are TID&FECN&SH, + * it is possible to have a EXPTID packet encounter + * the eager full condition and have the payload + * dropped (but the header delivered). + * Treat this condition as a data error (corruption,etc) + * and send a NAK. + */ + if (psmi_hal_has_cap(PSM_HAL_CAP_RSM_FECN_SUPP)) + ips_protoexp_handle_data_err(rcv_ev); + break; + default: + break; + } + } else if (tf_generr) /* handle generr, ignore tiderr if any */ + ips_protoexp_handle_tf_generr(rcv_ev); + else if (tf_seqerr) + ips_protoexp_handle_tf_seqerr(rcv_ev); + else if (tiderr) { /* tid error, but not on an eager pkt */ + psm2_ep_t ep_err = PSMI_EP_LOGEVENT; + uint16_t tid, offset; + uint64_t t_now = get_cycles(); + + proto->tiderr_cnt++; + + /* Whether and how we will be logging this event */ + if (proto->tiderr_max > 0 + && proto->tiderr_cnt >= proto->tiderr_max) + ep_err = PSMI_EP_NORETURN; + else if (proto->tiderr_warn_interval != UINT64_MAX && + proto->tiderr_tnext <= t_now) + proto->tiderr_tnext = + get_cycles() + proto->tiderr_warn_interval; + else + ep_err = NULL; + + if (ep_err != NULL) { + rhf_errnum_string(pktmsg, sizeof(pktmsg), + psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf)); + + tid = (__le32_to_cpu(rcv_ev->p_hdr->khdr.kdeth0) >> + HFI_KHDR_TID_SHIFT) & HFI_KHDR_TID_MASK; + offset = __le32_to_cpu(rcv_ev->p_hdr->khdr.kdeth0) & + HFI_KHDR_OFFSET_MASK; + + psmi_handle_error(ep_err, PSM2_EP_DEVICE_FAILURE, + "%s with tid=%d,offset=%d,count=%d: %s %s", + "TID Error", + tid, offset, proto->tiderr_cnt, + pktmsg, ep_err == PSMI_EP_NORETURN ? + "(Terminating...)" : ""); + } + + ips_protoexp_handle_tiderr(rcv_ev); + } else if (data_err) { +#if _HFI_DEBUGGING + if (_HFI_DBG_ON) { + uint8_t op_code + = _get_proto_hfi_opcode(rcv_ev->p_hdr); + + if (!pkt_verbose_err) { + rhf_errnum_string(pktmsg, sizeof(pktmsg), + psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf)); + _HFI_DBG_ALWAYS + ("Error %s pkt type opcode 0x%x at hd=0x%x %s\n", + (psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf) == + PSM_HAL_RHF_RX_TYPE_EAGER) ? 
"eager" : ( + psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf) == + PSM_HAL_RHF_RX_TYPE_EXPECTED) + ? "expected" : (psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf) == + PSM_HAL_RHF_RX_TYPE_NON_KD) ? "non-kd" : + "", op_code, + rcv_ev->recvq->state->hdrq_head, pktmsg); + } + } +#endif + + if (psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf) == PSM_HAL_RHF_RX_TYPE_EXPECTED) + ips_protoexp_handle_data_err(rcv_ev); + } else { /* not a tid or data error -- some other error */ +#if _HFI_DEBUGGING + if (_HFI_DBG_ON) { + uint8_t op_code = + __be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 24 & 0xFF; + + if (!pkt_verbose_err) + rhf_errnum_string(pktmsg, sizeof(pktmsg), + psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf)); + + /* else RHFerr decode printed below */ + _HFI_DBG_ALWAYS + ("Error pkt type 0x%x opcode 0x%x at hd=0x%x %s\n", + psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf), op_code, + rcv_ev->recvq->state->hdrq_head, pktmsg); + } +#endif + } + if (pkt_verbose_err) { + if (!*pktmsg) + rhf_errnum_string(pktmsg, sizeof(pktmsg), + psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf)); + ips_proto_show_header(rcv_ev->p_hdr, pktmsg); + } + + return 0; +} diff --git a/ptl_ips/ips_recvhdrq.c b/ptl_ips/ips_recvhdrq.c new file mode 100644 index 0000000..16908ba --- /dev/null +++ b/ptl_ips/ips_recvhdrq.c @@ -0,0 +1,830 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm2_hal.h" + +#include "ips_epstate.h" +#include "ips_proto.h" +#include "ips_expected_proto.h" +#include "ips_proto_help.h" +#include "ips_proto_internal.h" + +/* + * Receive header queue initialization. + */ +psm2_error_t +ips_recvhdrq_init(const psmi_context_t *context, + const struct ips_epstate *epstate, + const struct ips_proto *proto, + const struct ips_recvhdrq_callbacks *callbacks, + uint32_t subcontext, + struct ips_recvhdrq *recvq, + struct ips_recvhdrq_state *recvq_state, + psmi_hal_cl_q psm_hal_cl_hdrq) +{ + psm2_error_t err = PSM2_OK; + + memset(recvq, 0, sizeof(*recvq)); + recvq->proto = (struct ips_proto *)proto; + recvq->state = recvq_state; + recvq->context = context; + recvq->subcontext = subcontext; + recvq->psm_hal_cl_hdrq = psm_hal_cl_hdrq; + pthread_spin_init(&recvq->hdrq_lock, PTHREAD_PROCESS_SHARED); + recvq->hdrq_elemlast = ((psmi_hal_get_rx_hdr_q_cnt(context->psm_hw_ctxt) - 1) * + (psmi_hal_get_rx_hdr_q_ent_size(context->psm_hw_ctxt) >> BYTE2DWORD_SHIFT)); + + recvq->epstate = epstate; + recvq->recvq_callbacks = *callbacks; /* deep copy */ + SLIST_INIT(&recvq->pending_acks); + + recvq->state->hdrq_head = 0; + recvq->state->rcv_egr_index_head = NO_EAGER_UPDATE; + recvq->state->num_hdrq_done = 0; + recvq->state->num_egrq_done = 0; + recvq->state->hdr_countdown = 0; + recvq->state->hdrq_cachedlastscan = 0; + + { + union psmi_envvar_val env_hdr_update; + psmi_getenv("PSM2_HEAD_UPDATE", + "header queue update interval (0 to update after all entries are processed). Default is 64", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val) 64, &env_hdr_update); + + /* Cap max header update interval to size of header/eager queue */ + recvq->state->head_update_interval = + min(env_hdr_update.e_uint, psmi_hal_get_rx_hdr_q_cnt(context->psm_hw_ctxt) - 1); + recvq->state->egrq_update_interval = 1; + } + return err; +} + + +/* flush the eager buffers, by setting the eager index head to eager index tail + if eager buffer queue is full. + + Called when we had eager buffer overflows (ERR_TID/HFI_RHF_H_TIDERR + was set in RHF errors), and no good eager packets were received, so + that eager head wasn't advanced. +*/ +#if 0 +static void ips_flush_egrq_if_required(struct ips_recvhdrq *recvq) +{ + const uint32_t tail = ips_recvq_tail_get(&recvq->egrq); + const uint32_t head = ips_recvq_head_get(&recvq->egrq); + uint32_t egr_cnt = recvq->egrq.elemcnt; + + if ((head % egr_cnt) == ((tail + 1) % egr_cnt)) { + _HFI_DBG("eager array full after overflow, flushing " + "(head %llx, tail %llx)\n", + (long long)head, (long long)tail); + recvq->proto->stats.egr_overflow++; + } + return; +} +#endif + +/* + * Helpers for ips_recvhdrq_progress. 
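+ *
+ * A note on PSM2_HEAD_UPDATE, with assumed example numbers: the interval
+ * chosen in ips_recvhdrq_init() above batches hardware head pointer
+ * writes. With the default of 64 and, say, an 8192-entry header queue,
+ * the head index is published once per 64 retired entries rather than
+ * once per packet, and the cap
+ *
+ *   head_update_interval = min(env, hdrq_cnt - 1) = min(64, 8191) = 64
+ *
+ * keeps the interval below the queue depth, so the queue is always seen
+ * to drain before it can wrap.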
 */
+
+static __inline__ int
+_get_proto_subcontext(const struct ips_message_header *p_hdr)
+{
+	return ((__be32_to_cpu(p_hdr->bth[1]) >>
+		 HFI_BTH_SUBCTXT_SHIFT) & HFI_BTH_SUBCTXT_MASK);
+}
+
+/* Determine if the FECN bit is set. IBTA 1.2.1 CCA Annex A */
+
+static __inline__ uint8_t
+_is_cca_fecn_set(const struct ips_message_header *p_hdr)
+{
+	return (__be32_to_cpu(p_hdr->bth[1]) >> HFI_BTH_FECN_SHIFT) & 0x1;
+}
+
+/* Determine if the BECN bit is set. IBTA 1.2.1 CCA Annex A */
+static __inline__ uint8_t
+_is_cca_becn_set(const struct ips_message_header *p_hdr)
+{
+	return (__be32_to_cpu(p_hdr->bth[1]) >> HFI_BTH_BECN_SHIFT) & 0x1;
+}
+
+static __inline__ void _dump_invalid_pkt(struct ips_recvhdrq_event *rcv_ev)
+{
+	char *payload = ips_recvhdrq_event_payload(rcv_ev);
+	uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev) +
+	    ((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3);
+
+#ifdef PSM_DEBUG
+	ips_proto_show_header((struct ips_message_header *)
+			      rcv_ev->p_hdr, "received invalid pkt");
+#endif
+	if (hfi_debug & __HFI_PKTDBG) {
+		ips_proto_dump_frame(rcv_ev->p_hdr, HFI_MESSAGE_HDR_SIZE,
+				     "header");
+		if (paylen)
+			ips_proto_dump_frame(payload, paylen, "data");
+	}
+
+}
+
+static __inline__ void
+_update_error_stats(struct ips_proto *proto, uint32_t err)
+{
+	if (err & PSMI_HAL_RHF_ERR_ICRC)
+		proto->error_stats.num_icrc_err++;
+	if (err & PSMI_HAL_RHF_ERR_ECC)
+		proto->error_stats.num_ecc_err++;
+	if (err & PSMI_HAL_RHF_ERR_LEN)
+		proto->error_stats.num_len_err++;
+	if (err & PSMI_HAL_RHF_ERR_TID)
+		proto->error_stats.num_tid_err++;
+	if (err & PSMI_HAL_RHF_ERR_DC)
+		proto->error_stats.num_dc_err++;
+	if (err & PSMI_HAL_RHF_ERR_DCUN)
+		proto->error_stats.num_dcunc_err++;
+	if (err & PSMI_HAL_RHF_ERR_KHDRLEN)
+		proto->error_stats.num_khdrlen_err++;
+}
+
+#ifdef PSM_DEBUG
+
+static int _check_headers(struct ips_recvhdrq_event *rcv_ev, psmi_hal_cl_q cl_q)
+{
+	struct ips_recvhdrq *recvq = (struct ips_recvhdrq *)rcv_ev->recvq;
+	struct ips_proto *proto = rcv_ev->proto;
+	uint32_t *lrh = (uint32_t *) rcv_ev->p_hdr;
+	uint32_t dest_context;
+	const uint16_t pkt_dlid = __be16_to_cpu(rcv_ev->p_hdr->lrh[1]);
+	const uint16_t base_dlid =
+	    __be16_to_cpu(recvq->proto->epinfo.ep_base_lid);
+
+	/* Check that the receive header queue entry has a sane sequence number */
+	if (psmi_hal_check_rhf_sequence_number(psmi_hal_rhf_get_seq(rcv_ev->psm_hal_rhf))
+	    != PSM_HAL_ERROR_OK) {
+		unsigned int seqno=0;
+
+		psmi_hal_get_rhf_expected_sequence_number(&seqno, cl_q, recvq->context->psm_hw_ctxt);
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+				  "ErrPkt: Invalid header queue entry! RHF Sequence in Hdrq Seq: %d, Recvq State Seq: %d. LRH[0]: 0x%08x, LRH[1] (PktCount): 0x%08x\n",
+				  psmi_hal_rhf_get_seq(rcv_ev->psm_hal_rhf),
+				  seqno, lrh[0], lrh[1]);
+		return -1;
+	}
+
+	/* Verify that the packet was destined for our context */
+	dest_context = ips_proto_dest_context_from_header(proto, rcv_ev->p_hdr);
+	if_pf(dest_context != recvq->proto->epinfo.ep_context) {
+
+		struct ips_recvhdrq_state *state = recvq->state;
+
+		/* Packet not targeted at us. Drop packet and continue */
+		ips_proto_dump_err_stats(proto);
+		_dump_invalid_pkt(rcv_ev);
+
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+				  "ErrPkt: Received packet for context %d on context %d. Receive Header Queue offset: 0x%x. 
Exiting.\n", + dest_context, recvq->proto->epinfo.ep_context, + state->hdrq_head); + + return -1; + } + + /* Verify that rhf packet length matches the length in LRH */ + if_pf(psmi_hal_rhf_get_packet_length(rcv_ev->psm_hal_rhf) != + (__be16_to_cpu(rcv_ev->p_hdr->lrh[2]) << BYTE2DWORD_SHIFT)) { + _HFI_EPDBG + ("ErrPkt: RHF Packet Len (0x%x) does not match LRH (0x%x).\n", + psmi_hal_rhf_get_packet_length(rcv_ev->psm_hal_rhf) >> 2, + __be16_to_cpu(rcv_ev->p_hdr->lrh[2])); + + ips_proto_dump_err_stats(proto); + _dump_invalid_pkt(rcv_ev); + return -1; + } + + /* Verify that the DLID matches our local LID. */ + if_pf(!((base_dlid <= pkt_dlid) && + (pkt_dlid <= + (base_dlid + (1 << recvq->proto->epinfo.ep_lmc))))) { + _HFI_EPDBG + ("ErrPkt: DLID in LRH (0x%04x) does not match local LID (0x%04x) Skipping packet!\n", + rcv_ev->p_hdr->lrh[1], recvq->proto->epinfo.ep_base_lid); + ips_proto_dump_err_stats(proto); + _dump_invalid_pkt(rcv_ev); + return -1; + } + + return 0; +} +#endif + +static __inline__ int do_pkt_cksum(struct ips_recvhdrq_event *rcv_ev) +{ + char *payload = ips_recvhdrq_event_payload(rcv_ev); + uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev) + + ((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3); + uint32_t *ckptr; + uint32_t recv_cksum, cksum, dest_subcontext; + /* With checksum every packet has a payload */ + psmi_assert_always(payload); + + ckptr = (uint32_t *) (payload + paylen); + recv_cksum = ckptr[0]; + + /* Calculate checksum hdr + payload (includes any padding words) */ + cksum = 0xffffffff; + cksum = ips_crc_calculate(HFI_MESSAGE_HDR_SIZE, + (uint8_t *) rcv_ev->p_hdr, cksum); + if (paylen) + cksum = ips_crc_calculate(paylen, (uint8_t *) payload, cksum); + + if ((cksum != recv_cksum) || (ckptr[0] != ckptr[1])) { + struct ips_epstate_entry *epstaddr; + uint32_t lcontext; + psmi_hal_cl_idx hd, tl; + + epstaddr = + ips_epstate_lookup(rcv_ev->recvq->epstate, + rcv_ev->p_hdr->connidx); + epstaddr = (epstaddr && epstaddr->ipsaddr) ? epstaddr : NULL; + lcontext = epstaddr ? rcv_ev->proto->epinfo.ep_context : -1; + + hd = psmi_hal_get_cl_q_head_index(PSM_HAL_CL_Q_RX_HDR_Q, + rcv_ev->recvq->context->psm_hw_ctxt); + tl = psmi_hal_get_cl_q_tail_index(PSM_HAL_CL_Q_RX_HDR_Q, + rcv_ev->recvq->context->psm_hw_ctxt); + + dest_subcontext = _get_proto_subcontext(rcv_ev->p_hdr); + + _HFI_ERROR + ("ErrPkt: SharedContext: %s. Local Context: %i, Checksum mismatch from LID %d! Received Checksum: 0x%08x, Expected: 0x%08x & 0x%08x. Opcode: 0x%08x, Error Flag: 0x%08x. hdrq hd 0x%x tl 0x%x rhf 0x%" + PRIx64 ", rhfseq 0x%x\n", + (dest_subcontext != + rcv_ev->recvq->subcontext) ? "Yes" : "No", lcontext, + epstaddr ? 
__be16_to_cpu(epstaddr->ipsaddr->pathgrp-> + pg_base_dlid) : -1, cksum, + ckptr[0], ckptr[1], _get_proto_hfi_opcode(rcv_ev->p_hdr), + psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf), hd, tl, rcv_ev->psm_hal_rhf.raw_rhf, + psmi_hal_rhf_get_seq(rcv_ev->psm_hal_rhf)); + /* Dump packet */ + _dump_invalid_pkt(rcv_ev); + return 0; /* Packet checksum error */ + } + + return 1; +} + +PSMI_ALWAYS_INLINE( +void +process_pending_acks(struct ips_recvhdrq *recvq)) +{ + ips_scb_t ctrlscb; + struct ips_message_header *msg_hdr = NULL; + + /* If any pending acks, dispatch them now */ + while (!SLIST_EMPTY(&recvq->pending_acks)) { + struct ips_flow *flow = SLIST_FIRST(&recvq->pending_acks); + + SLIST_REMOVE_HEAD(&recvq->pending_acks, next); + SLIST_NEXT(flow, next) = NULL; + + ctrlscb.scb_flags = 0; + msg_hdr = &ctrlscb.ips_lrh; + msg_hdr->ack_seq_num = flow->recv_seq_num.psn_num; + + if (flow->flags & IPS_FLOW_FLAG_PENDING_ACK) { + psmi_assert_always((flow-> + flags & IPS_FLOW_FLAG_PENDING_NAK) + == 0); + + flow->flags &= ~IPS_FLOW_FLAG_PENDING_ACK; + ips_proto_send_ctrl_message(flow, OPCODE_ACK, + &flow->ipsaddr-> + ctrl_msg_queued, + &ctrlscb, ctrlscb.cksum, 0); + } else { + psmi_assert_always(flow-> + flags & IPS_FLOW_FLAG_PENDING_NAK); + + flow->flags &= ~IPS_FLOW_FLAG_PENDING_NAK; + ips_proto_send_ctrl_message(flow, OPCODE_NAK, + &flow->ipsaddr-> + ctrl_msg_queued, + &ctrlscb, ctrlscb.cksum, 0); + } + } +} + +/* + * Core receive progress function + * + * recvhdrq_progress is the core function that services the receive header + * queue and optionally, the eager queue. At the lowest level, it identifies + * packets marked with errors by the chip and also detects and corrects when + * eager overflow conditions occur. At the highest level, it queries the + * 'epstate' interface to classify packets from "known" and "unknown" + * endpoints. In order to support shared contexts, it can also handle packets + * destined for other contexts (or "subcontexts"). + */ +psm2_error_t ips_recvhdrq_progress(struct ips_recvhdrq *recvq) +{ + /* When PSM_PERF is enabled, the following line causes the + PMU to start a stop watch to measure instruction cycles of the + RX speedpath of PSM. The stop watch is stopped below. */ + GENERIC_PERF_BEGIN(PSM_RX_SPEEDPATH_CTR); + struct ips_recvhdrq_state *state = recvq->state; + PSMI_CACHEALIGN struct ips_recvhdrq_event rcv_ev = {.proto = + recvq->proto, + .recvq = recvq + }; + struct ips_epstate_entry *epstaddr; + uint32_t num_hdrq_done = 0; + const uint32_t num_hdrq_todo = psmi_hal_get_rx_hdr_q_cnt(recvq->context->psm_hw_ctxt); + uint32_t dest_subcontext; + const uint32_t hdrq_elemsz = psmi_hal_get_rx_hdr_q_ent_size(recvq->context->psm_hw_ctxt) >> BYTE2DWORD_SHIFT; + int ret = IPS_RECVHDRQ_CONTINUE; + int done = 0, empty = 0; + int do_hdr_update = 0; + const psmi_hal_cl_q psm_hal_hdr_q = recvq->psm_hal_cl_hdrq; + const psmi_hal_cl_q psm_hal_egr_q = psm_hal_hdr_q + 1; + + /* Returns whether the currently set 'rcv_hdr'/head is a readable entry */ +#define next_hdrq_is_ready() (! 
empty ) + + if (psmi_hal_cl_q_empty(state->hdrq_head, psm_hal_hdr_q, recvq->context->psm_hw_ctxt)) + return PSM2_OK; + + PSM2_LOG_MSG("entering"); + + done = !next_hdrq_is_ready(); + + rcv_ev.psm_hal_hdr_q = psm_hal_hdr_q; + + while (!done) { + psmi_hal_get_receive_event(state->hdrq_head, recvq->context->psm_hw_ctxt, + &rcv_ev); + rcv_ev.has_cksum = + ((recvq->proto->flags & IPS_PROTO_FLAG_CKSUM) && + (rcv_ev.p_hdr->flags & IPS_SEND_FLAG_PKTCKSUM)); + _HFI_VDBG + ("new packet: rcv_hdr %p, rhf %" PRIx64 "\n", + rcv_ev.p_hdr, rcv_ev.psm_hal_rhf.raw_rhf); + + /* If the hdrq_head is before cachedlastscan, that means that we have + * already prescanned this for BECNs and FECNs, so we should not check + * again + */ + if_pt((recvq->proto->flags & IPS_PROTO_FLAG_CCA) && + (state->hdrq_head >= state->hdrq_cachedlastscan)) { + /* IBTA CCA handling: + * If FECN bit set handle IBTA CCA protocol. For the + * flow that suffered congestion we flag it to generate + * a control packet with the BECN bit set - This is + * currently an unsolicited ACK. + * + * For all MQ packets the FECN processing/BECN + * generation is done in the is_expected_or_nak + * function as each eager packet is inspected there. + * + * For TIDFLOW/Expected data transfers the FECN + * bit/BECN generation is done in protoexp_data. Since + * header suppression can result in even FECN packets + * being suppressed the expected protocol generated + * additional BECN packets if a "large" number of + * generations are swapped without progress being made + * for receive. "Large" is set empirically to 4. + * + * FECN packets are ignored for all control messages + * (except ACKs and NAKs) since they indicate + * congestion on the control path which is not rate + * controlled. The CCA specification allows FECN on + * ACKs to be disregarded as well. + */ + + rcv_ev.is_congested = + _is_cca_fecn_set(rcv_ev. + p_hdr) & IPS_RECV_EVENT_FECN; + rcv_ev.is_congested |= + (_is_cca_becn_set(rcv_ev.p_hdr) << + (IPS_RECV_EVENT_BECN - 1)); + } else + rcv_ev.is_congested = 0; + +#ifdef PSM_DEBUG + if_pf(_check_headers(&rcv_ev, psm_hal_hdr_q)) + goto skip_packet; +#endif + dest_subcontext = _get_proto_subcontext(rcv_ev.p_hdr); + + /* If the destination is not our subcontext, process + * message as subcontext message (shared contexts) */ + if (dest_subcontext != recvq->subcontext) { + rcv_ev.ipsaddr = NULL; + + ret = recvq->recvq_callbacks.callback_subcontext + (&rcv_ev, dest_subcontext); + if (ret == IPS_RECVHDRQ_REVISIT) + { + PSM2_LOG_MSG("leaving"); + /* When PSM_PERF is enabled, the following line causes the + PMU to stop a stop watch to measure instruction cycles of + the RX speedpath of PSM. The stop watch was started + above. */ + GENERIC_PERF_END(PSM_RX_SPEEDPATH_CTR); + return PSM2_OK_NO_PROGRESS; + } + + goto skip_packet; + } + + if_pf(psmi_hal_rhf_get_all_err_flags(rcv_ev.psm_hal_rhf)) { + + _update_error_stats(recvq->proto, psmi_hal_rhf_get_all_err_flags(rcv_ev.psm_hal_rhf)); + + recvq->recvq_callbacks.callback_error(&rcv_ev); + + if ((psmi_hal_rhf_get_rx_type(rcv_ev.psm_hal_rhf) != PSM_HAL_RHF_RX_TYPE_EAGER) || + (!(psmi_hal_rhf_get_all_err_flags(rcv_ev.psm_hal_rhf) & PSMI_HAL_RHF_ERR_TID))) + goto skip_packet; + + /* no pending eager update, header + * is not currently under tracing. 
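+			 *
+			 * The countdown armed below measures, in dwords, how
+			 * much of the header queue must drain before the
+			 * eager-full condition is re-checked. A sketch with
+			 * illustrative numbers: for 2048 entries of 32 dwords,
+			 * hdr_size = hdrq_elemlast + hdrq_elemsz
+			 *          = 2047*32 + 32 = 65536;
+			 * with hhead = 65000 and htail = 200,
+			 *
+			 *   hdr_countdown = 200 + 65536 - 65000 = 736 dwords
+			 *
+			 * i.e. 736/32 = 23 entries to retire before the
+			 * false-egr-full check fires.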
*/ + if (state->hdr_countdown == 0 && + state->rcv_egr_index_head == NO_EAGER_UPDATE) { + uint32_t egr_cnt = psmi_hal_get_rx_egr_tid_cnt(recvq->context->psm_hw_ctxt); + psmi_hal_cl_idx etail=0, ehead=0; + + ehead = psmi_hal_get_cl_q_head_index( + psm_hal_egr_q, + rcv_ev.recvq->context->psm_hw_ctxt); + etail = psmi_hal_get_cl_q_tail_index( + psm_hal_egr_q, + rcv_ev.recvq->context->psm_hw_ctxt); + if (ehead == ((etail + 1) % egr_cnt)) { + /* eager is full, + * trace existing header entries */ + uint32_t hdr_size = + recvq->hdrq_elemlast + + hdrq_elemsz; + psmi_hal_cl_idx htail=0; + + htail = psmi_hal_get_cl_q_tail_index( + psm_hal_hdr_q, + rcv_ev.recvq->context->psm_hw_ctxt); + const uint32_t hhead = state->hdrq_head; + + state->hdr_countdown = + (htail > hhead) ? + (htail - hhead) : + (htail + hdr_size - hhead); + } + } + + /* Eager packet and tiderr. + * Don't consider updating egr head, unless we're in + * the congested state. If we're congested, we should + * try to keep the eager buffers free. */ + + if (!rcv_ev.is_congested) + goto skip_packet_no_egr_update; + else + goto skip_packet; + } + + /* If checksum is enabled, verify that it is valid */ + if_pf(rcv_ev.has_cksum && !do_pkt_cksum(&rcv_ev)) + goto skip_packet; + + if (_HFI_VDBG_ON) + { + psmi_hal_cl_idx egr_buff_q_head, egr_buff_q_tail; + + egr_buff_q_head = psmi_hal_get_cl_q_head_index( + psm_hal_egr_q, + rcv_ev.recvq->context->psm_hw_ctxt); + egr_buff_q_tail = psmi_hal_get_cl_q_tail_index( + psm_hal_egr_q, + rcv_ev.recvq->context->psm_hw_ctxt); + + _HFI_VDBG_ALWAYS( + "hdrq_head %d, p_hdr: %p, opcode %x, payload %p paylen %d; " + "egrhead %x egrtail %x; " + "useegrbit %x egrindex %x, egroffset %x, egrindexhead %x\n", + state->hdrq_head, + rcv_ev.p_hdr, + _get_proto_hfi_opcode(rcv_ev.p_hdr), + ips_recvhdrq_event_payload(&rcv_ev), + ips_recvhdrq_event_paylen(&rcv_ev), + egr_buff_q_head,egr_buff_q_tail, + psmi_hal_rhf_get_use_egr_buff(rcv_ev.psm_hal_rhf), + psmi_hal_rhf_get_egr_buff_index(rcv_ev.psm_hal_rhf), + psmi_hal_rhf_get_egr_buff_offset(rcv_ev.psm_hal_rhf), + state->rcv_egr_index_head); + } + + PSM2_LOG_PKT_STRM(PSM2_LOG_RX,rcv_ev.p_hdr,&rcv_ev.psm_hal_rhf.raw_rhf, + "PKT_STRM:"); + + /* Classify packet from a known or unknown endpoint */ + epstaddr = ips_epstate_lookup(recvq->epstate, + rcv_ev.p_hdr->connidx); + if_pf((epstaddr == NULL) || (epstaddr->ipsaddr == NULL)) { + rcv_ev.ipsaddr = NULL; + recvq->recvq_callbacks. + callback_packet_unknown(&rcv_ev); + } else { + rcv_ev.ipsaddr = epstaddr->ipsaddr; + ret = ips_proto_process_packet(&rcv_ev); + if (ret == IPS_RECVHDRQ_REVISIT) + { + PSM2_LOG_MSG("leaving"); + /* When PSM_PERF is enabled, the following line causes the + PMU to stop a stop watch to measure instruction cycles of + the RX speedpath of PSM. The stop watch was started + above. */ + GENERIC_PERF_END(PSM_RX_SPEEDPATH_CTR); + return PSM2_OK_NO_PROGRESS; + } + } + +skip_packet: + /* + * if eager buffer is used, record the index. + */ + if (psmi_hal_rhf_get_use_egr_buff(rcv_ev.psm_hal_rhf)) { + /* set only when a new entry is used */ + if (psmi_hal_rhf_get_egr_buff_offset(rcv_ev.psm_hal_rhf) == 0) { + state->rcv_egr_index_head = + psmi_hal_rhf_get_egr_buff_index(rcv_ev.psm_hal_rhf); + state->num_egrq_done++; + } + /* a header entry is using an eager entry, stop tracing. 
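+		 * (Consuming an eager entry proves forward progress, so any
+		 * armed false-egr-full countdown is cancelled here.) The
+		 * recorded index feeds the lazy eager head update further
+		 * down; as a sketch of that pattern:
+		 *
+		 *   if (num_egrq_done >= egrq_update_interval &&
+		 *       rcv_egr_index_head != NO_EAGER_UPDATE)
+		 *           publish rcv_egr_index_head as the egr queue head
+		 *
+		 * batching MMIO head writes for the eager queue just as
+		 * head_update_interval batches them for the header queue.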
*/ + state->hdr_countdown = 0; + } + +skip_packet_no_egr_update: + /* Note that state->hdrq_head is sampled speculatively by the code + * in ips_ptl_shared_poll() when context sharing, so it is not safe + * for this shared variable to temporarily exceed the last element. */ + _HFI_VDBG + ("head %d, elemsz %d elemlast %d\n", + state->hdrq_head, hdrq_elemsz, + recvq->hdrq_elemlast); + psmi_hal_retire_hdr_q_entry(&state->hdrq_head, psm_hal_hdr_q, + recvq->context->psm_hw_ctxt, + hdrq_elemsz, recvq->hdrq_elemlast, &empty); + state->num_hdrq_done++; + num_hdrq_done++; + done = (!next_hdrq_is_ready() || (ret == IPS_RECVHDRQ_BREAK) + || (num_hdrq_done == num_hdrq_todo)); + + do_hdr_update = (state->head_update_interval ? + (state->num_hdrq_done == + state->head_update_interval) : done); + if (do_hdr_update) { + + psmi_hal_set_cl_q_head_index( + state->hdrq_head, + psm_hal_hdr_q, + rcv_ev.recvq->context->psm_hw_ctxt); + /* Reset header queue entries processed */ + state->num_hdrq_done = 0; + } + if (state->num_egrq_done >= state->egrq_update_interval) { + /* Lazy update of egrq */ + if (state->rcv_egr_index_head != NO_EAGER_UPDATE) { + psmi_hal_set_cl_q_head_index( + state->rcv_egr_index_head, + psm_hal_egr_q, + recvq->context->psm_hw_ctxt); + state->rcv_egr_index_head = NO_EAGER_UPDATE; + state->num_egrq_done = 0; + } + } + if (state->hdr_countdown > 0) { + /* a header entry is consumed. */ + state->hdr_countdown -= hdrq_elemsz; + if (state->hdr_countdown == 0) { + /* header entry count reaches zero. */ + psmi_hal_cl_idx tail=0; + + tail = psmi_hal_get_cl_q_tail_index( + psm_hal_egr_q, + recvq->context->psm_hw_ctxt); + + psmi_hal_cl_idx head=0; + + head = psmi_hal_get_cl_q_head_index( + psm_hal_egr_q, + recvq->context->psm_hw_ctxt); + + uint32_t egr_cnt = psmi_hal_get_rx_egr_tid_cnt(recvq->context->psm_hw_ctxt); + /* Checks eager-full again. This is a real false-egr-full */ + if (head == ((tail + 1) % egr_cnt)) { + + psmi_hal_set_cl_q_tail_index( + tail, + psm_hal_egr_q, + recvq->context->psm_hw_ctxt); + + _HFI_DBG + ("eager array full after overflow, flushing " + "(head %llx, tail %llx)\n", + (long long)head, (long long)tail); + recvq->proto->stats.egr_overflow++; + } else + _HFI_ERROR + ("PSM BUG: EgrOverflow: eager queue is not full\n"); + } + } + } + /* while (hdrq_entries_to_read) */ + + /* Process any pending acks before exiting */ + process_pending_acks(recvq); + + PSM2_LOG_MSG("leaving"); + /* When PSM_PERF is enabled, the following line causes the + PMU to stop a stop watch to measure instruction cycles of + the RX speedpath of PSM. The stop watch was started + above. */ + GENERIC_PERF_END(PSM_RX_SPEEDPATH_CTR); + return num_hdrq_done ? PSM2_OK : PSM2_OK_NO_PROGRESS; +} + +/* This function is designed to implement RAPID CCA. It iterates + through the recvq, checking each element for set FECN or BECN bits. + In the case of finding one, the proper response is executed, and the bits + are cleared. 
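+
+   As an outline (paraphrasing the handlers below), per scanned header:
+
+       FECN set  ->  look up the flow and send an unsolicited OPCODE_BECN
+                     control message back to the sender;
+       BECN set  ->  raise the local CCT index via ips_cca_adjust_rate()
+                     to throttle the flagged flow.
+
+   The prescan exists so that congestion notifications are answered ahead
+   of in-order packet processing; hdrq_cachedlastscan then keeps the main
+   progress loop from re-checking entries already scanned here.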
+*/ +psm2_error_t ips_recvhdrq_scan_cca (struct ips_recvhdrq *recvq) +{ + +/* Looks at hdr and determines if it is the last item in the queue */ + +#define is_last_hdr(idx) \ + psmi_hal_cl_q_empty(idx, psm_hal_hdr_q, recvq->context->psm_hw_ctxt) + + struct ips_recvhdrq_state *state = recvq->state; + PSMI_CACHEALIGN struct ips_recvhdrq_event rcv_ev = {.proto = recvq->proto, + .recvq = recvq + }; + + uint32_t num_hdrq_done = state->hdrq_cachedlastscan / + psmi_hal_get_rx_hdr_q_ent_size(recvq->context->psm_hw_ctxt) >> BYTE2DWORD_SHIFT; + const int num_hdrq_todo = psmi_hal_get_rx_hdr_q_cnt(recvq->context->psm_hw_ctxt); + const uint32_t hdrq_elemsz = psmi_hal_get_rx_hdr_q_ent_size(recvq->context->psm_hw_ctxt) >> BYTE2DWORD_SHIFT; + + int done; + uint32_t scan_head = state->hdrq_head + state->hdrq_cachedlastscan; + const psmi_hal_cl_q psm_hal_hdr_q = recvq->psm_hal_cl_hdrq; + + /* Skip the first element, since we're going to process it soon anyway */ + if ( state->hdrq_cachedlastscan == 0 ) + { + scan_head += hdrq_elemsz; + num_hdrq_done++; + } + + PSM2_LOG_MSG("entering"); + done = !is_last_hdr(scan_head); + rcv_ev.psm_hal_hdr_q = psm_hal_hdr_q; + while (!done) { + psmi_hal_get_receive_event(scan_head, recvq->context->psm_hw_ctxt, + &rcv_ev); + rcv_ev.has_cksum = + ((recvq->proto->flags & IPS_PROTO_FLAG_CKSUM) && + (rcv_ev.p_hdr->flags & IPS_SEND_FLAG_PKTCKSUM)); + + _HFI_VDBG + ("scanning new packet for CCA: rcv_hdr %p, rhf %" PRIx64 "\n", + rcv_ev.p_hdr, rcv_ev.psm_hal_rhf.raw_rhf); + + if_pt ( _is_cca_fecn_set(rcv_ev.p_hdr) & IPS_RECV_EVENT_FECN ) { + struct ips_epstate_entry *epstaddr = ips_epstate_lookup(recvq->epstate, + rcv_ev.p_hdr->connidx); + + if (epstaddr != NULL && epstaddr->ipsaddr != NULL) + { + rcv_ev.ipsaddr = epstaddr->ipsaddr; + + /* Send BECN back */ + ips_epaddr_t *ipsaddr = rcv_ev.ipsaddr; + struct ips_message_header *p_hdr = rcv_ev.p_hdr; + ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); + struct ips_flow *flow; + ips_scb_t ctrlscb; + + psmi_assert(flowid < EP_FLOW_LAST); + flow = &ipsaddr->flows[flowid]; + ctrlscb.scb_flags = 0; + ctrlscb.ips_lrh.data[0].u32w0 = + flow->cca_ooo_pkts; + + rcv_ev.proto->epaddr_stats.congestion_pkts++; + /* Clear FECN event */ + rcv_ev.is_congested &= ~IPS_RECV_EVENT_FECN; + + ips_proto_send_ctrl_message(flow, + OPCODE_BECN, + &flow->ipsaddr-> + ctrl_msg_queued, + &ctrlscb, ctrlscb.cksum, 0); + } + } + else if_pt (0 != (_is_cca_becn_set(rcv_ev.p_hdr) << (IPS_RECV_EVENT_BECN - 1))) { + struct ips_epstate_entry *epstaddr = ips_epstate_lookup(recvq->epstate, + rcv_ev.p_hdr->connidx); + + if (epstaddr != NULL && epstaddr->ipsaddr != NULL) + { + rcv_ev.ipsaddr = epstaddr->ipsaddr; + + /* Adjust flow */ + struct ips_proto *proto = rcv_ev.proto; + struct ips_message_header *p_hdr = rcv_ev.p_hdr; + ips_epaddr_t *ipsaddr = rcv_ev.ipsaddr; + struct ips_flow *flow; + ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); + + psmi_assert(flowid < EP_FLOW_LAST); + flow = &ipsaddr->flows[flowid]; + if ((flow->path->pr_ccti + + proto->cace[flow->path->pr_sl].ccti_increase) <= proto->ccti_limit) { + ips_cca_adjust_rate(flow->path, + proto->cace[flow->path->pr_sl].ccti_increase); + /* Clear congestion event */ + rcv_ev.is_congested &= ~IPS_RECV_EVENT_BECN; + } + } + } + + num_hdrq_done++; + scan_head += hdrq_elemsz; + state->hdrq_cachedlastscan += hdrq_elemsz; + + done = (num_hdrq_done == num_hdrq_todo && !is_last_hdr(scan_head) ); + + } + /* while (hdrq_entries_to_read) */ + + + PSM2_LOG_MSG("leaving"); + return PSM2_OK; +} diff --git 
a/ptl_ips/ips_recvhdrq.h b/ptl_ips/ips_recvhdrq.h new file mode 100644 index 0000000..daef846 --- /dev/null +++ b/ptl_ips/ips_recvhdrq.h @@ -0,0 +1,213 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. 
*/ + +#include "psm_user.h" +#include "ips_proto_params.h" +#include "ips_proto_header.h" + +#ifndef _IPS_RECVHDRQ_H +#define _IPS_RECVHDRQ_H + +struct ips_recvhdrq; +struct ips_recvhdrq_state; +struct ips_epstate; + +/* process current packet, continue on next packet */ +#define IPS_RECVHDRQ_CONTINUE 0 +/* process current packet, break and return to caller */ +#define IPS_RECVHDRQ_BREAK 1 +/* keep current packet, revisit the same packet next time */ +#define IPS_RECVHDRQ_REVISIT 2 + +/* CCA related receive events */ +#define IPS_RECV_EVENT_FECN 0x1 +#define IPS_RECV_EVENT_BECN 0x2 + +struct ips_recvhdrq_event { + struct ips_proto *proto; + const struct ips_recvhdrq *recvq; /* where message received */ + psmi_hal_rhf_t psm_hal_rhf; + struct ips_message_header *p_hdr; /* protocol header in rcv_hdr */ + struct ips_epaddr *ipsaddr; /* peer ipsaddr, if available */ + uint8_t has_cksum; /* payload has cksum */ + uint8_t is_congested; /* Packet faced congestion */ + psmi_hal_cl_q psm_hal_hdr_q; +}; + +struct ips_recvhdrq_callbacks { + int (*callback_packet_unknown) (const struct ips_recvhdrq_event *); + int (*callback_subcontext) (struct ips_recvhdrq_event *, + uint32_t subcontext); + int (*callback_error) (struct ips_recvhdrq_event *); +}; + +psm2_error_t +ips_recvhdrq_init(const psmi_context_t *context, + const struct ips_epstate *epstate, + const struct ips_proto *proto, + const struct ips_recvhdrq_callbacks *callbacks, + uint32_t subcontext, + struct ips_recvhdrq *recvq, + struct ips_recvhdrq_state *recvq_state, + psmi_hal_cl_q cl_q); + +psm2_error_t ips_recvhdrq_progress(struct ips_recvhdrq *recvq); + + /* This function is designed to implement RAPID CCA. It iterates + * through the recvq, checking each element for set FECN or BECN bits. + * In the case of finding one, the proper response is executed, and the bits + * are cleared. + */ +psm2_error_t ips_recvhdrq_scan_cca(struct ips_recvhdrq *recvq); + +/* + * Structure containing state for recvhdrq reading. This is logically + * part of ips_recvhdrq but needs to be separated out for context + * sharing so that it can be put in a shared memory page and hence + * be available to all processes sharing the context. Generally, do not + * put pointers in here since the address map of each process can be + * different. + */ +#define NO_EAGER_UPDATE ~0U +struct ips_recvhdrq_state { + psmi_hal_cl_idx hdrq_head; /* software copy of head */ + psmi_hal_cl_idx rcv_egr_index_head; /* software copy of eager index head */ + uint32_t head_update_interval; /* Header update interval */ + uint32_t num_hdrq_done; /* Num header queue done */ + uint32_t egrq_update_interval; /* Eager buffer update interval */ + uint32_t num_egrq_done; /* num eager buffer done */ + uint32_t hdr_countdown; /* for false-egr-full tracing */ + uint32_t hdrq_cachedlastscan; /* last element to be prescanned */ +}; + +/* + * Structure to read from recvhdrq + */ +struct ips_recvhdrq { + struct ips_proto *proto; + const psmi_context_t *context; /* error handling, epid id, etc. 
 */
+	struct ips_recvhdrq_state *state;
+	uint32_t subcontext;	/* messages that don't match subcontext call
+				 * recv_callback_subcontext */
+	psmi_hal_cl_q psm_hal_cl_hdrq;
+	/* Header queue handling */
+	pthread_spinlock_t hdrq_lock;	/* Lock for thread-safe polling */
+	uint32_t hdrq_elemlast;	/* last element precomputed */
+	/* Lookup endpoints epid -> ptladdr (rank) */
+	const struct ips_epstate *epstate;
+
+	/* Callbacks to handle recvq events */
+	struct ips_recvhdrq_callbacks recvq_callbacks;
+
+	/* List of flows with pending acks for receive queue */
+	SLIST_HEAD(pending_flows, ips_flow) pending_acks;
+
+	volatile __u64 *spi_status;
+};
+
+PSMI_INLINE(int ips_recvhdrq_isempty(const struct ips_recvhdrq *recvq))
+{
+	return psmi_hal_cl_q_empty(recvq->state->hdrq_head,
+				   recvq->psm_hal_cl_hdrq,
+				   recvq->context->psm_hw_ctxt);
+}
+
+PSMI_INLINE(
+void *
+ips_recvhdrq_event_payload(const struct ips_recvhdrq_event *rcv_ev))
+{
+	if (psmi_hal_rhf_get_use_egr_buff(rcv_ev->psm_hal_rhf))
+		return psmi_hal_get_egr_buff(
+			psmi_hal_rhf_get_egr_buff_index(rcv_ev->psm_hal_rhf),
+			rcv_ev->psm_hal_hdr_q + 1 /* The circular list q (cl_q) for the
+						     egr buff for any rx hdrq event is
+						     always one more than the hdrq cl q */,
+			rcv_ev->recvq->context->psm_hw_ctxt)+
+			(psmi_hal_rhf_get_egr_buff_offset(rcv_ev->psm_hal_rhf)*64);
+	else
+		return NULL;
+}
+
+PSMI_INLINE(
+uint32_t
+ips_recvhdrq_event_paylen(const struct ips_recvhdrq_event *rcv_ev))
+{
+	uint32_t cksum_len = rcv_ev->has_cksum ? PSM_CRC_SIZE_IN_BYTES : 0;
+
+	return psmi_hal_rhf_get_packet_length(rcv_ev->psm_hal_rhf) -
+	    (sizeof(struct ips_message_header) +
+	     HFI_CRC_SIZE_IN_BYTES + cksum_len);
+	/* PSM does not use bth[0].PadCnt; it figures out the real datalen another way */
+}
+
+PSMI_INLINE(int ips_recvhdrq_trylock(struct ips_recvhdrq *recvq))
+{
+	int ret = pthread_spin_trylock(&recvq->hdrq_lock);
+	return !ret;
+}
+
+PSMI_INLINE(int ips_recvhdrq_lock(struct ips_recvhdrq *recvq))
+{
+	int ret = pthread_spin_lock(&recvq->hdrq_lock);
+	return !ret;
+}
+
+PSMI_INLINE(int ips_recvhdrq_unlock(struct ips_recvhdrq *recvq))
+{
+	int ret = pthread_spin_unlock(&recvq->hdrq_lock);
+	return !ret;
+}
+
+#endif /* _IPS_RECVHDRQ_H */
diff --git a/ptl_ips/ips_recvq.c b/ptl_ips/ips_recvq.c
new file mode 100644
index 0000000..1fb4bf5
--- /dev/null
+++ b/ptl_ips/ips_recvq.c
@@ -0,0 +1,92 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license. When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "ips_recvq.h" + +/* We return a table of pointer indexes. + * + * From the point of view of the returned pointer, index -1 always points to + * the address to call psmi_free on (since we force page-alignment). + */ +void **ips_recvq_egrbuf_table_alloc(psm2_ep_t ep, void *baseptr, + uint32_t bufnum, uint32_t bufsize) +{ + unsigned i; + void *ptr_alloc; + uintptr_t *buft; + uintptr_t base = (uintptr_t) baseptr; + + ptr_alloc = psmi_malloc(ep, UNDEFINED, + PSMI_PAGESIZE + sizeof(uintptr_t) * (bufnum + + 1)); + if (ptr_alloc == NULL) + return NULL; + /* First pointer is to the actual allocated address, so we can free it but + * buft[1] is first on the page boundary + */ + buft = (uintptr_t *) PSMI_ALIGNUP(ptr_alloc + 1, PSMI_PAGESIZE); + buft[-1] = (uintptr_t) ptr_alloc; + for (i = 0; i < bufnum; i++) + buft[i] = (uintptr_t) ((char *)base + i * bufsize); + return (void **)buft; +} + +void ips_recvq_egrbuf_table_free(void **buftable) +{ + uintptr_t *buft = (uintptr_t *) buftable; + void *ptr_alloc = (void *)buft[-1]; + psmi_free(ptr_alloc); +} diff --git a/ptl_ips/ips_recvq.h b/ptl_ips/ips_recvq.h new file mode 100644 index 0000000..7d1a990 --- /dev/null +++ b/ptl_ips/ips_recvq.h @@ -0,0 +1,73 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _IPS_RECVQ_H +#define _IPS_RECVQ_H + +#include "psm_user.h" + +/* + * Tables to map eager indexes into their buffer addresses + * + * If function returns NULL, no memory has been allocated and the error handler + * has been executed on 'ep' and hence assume status PSM2_NO_MEMORY. + */ +void **ips_recvq_egrbuf_table_alloc(psm2_ep_t ep, + void *base, uint32_t bufnum, + uint32_t bufsize); +void ips_recvq_egrbuf_table_free(void **buftable); + + +#endif /* _IPS_RECVQ_H */ diff --git a/ptl_ips/ips_scb.c b/ptl_ips/ips_scb.c new file mode 100644 index 0000000..52b9a93 --- /dev/null +++ b/ptl_ips/ips_scb.c @@ -0,0 +1,341 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm2_hal.h" +#include "ips_proto.h" +#include "ips_scb.h" +#include "ips_proto_internal.h" + +psm2_error_t +ips_scbctrl_init(const psmi_context_t *context, + uint32_t numscb, uint32_t numbufs, + uint32_t imm_size, uint32_t bufsize, + ips_scbctrl_avail_callback_fn_t scb_avail_callback, + void *scb_avail_context, struct ips_scbctrl *scbc) +{ + int i; + struct ips_scb *scb; + size_t scb_size; + size_t alloc_sz; + uintptr_t base, imm_base; + psm2_ep_t ep = context->ep; + /* scbc->context = context; */ + psm2_error_t err = PSM2_OK; + + psmi_assert_always(numscb > 0); + scbc->sbuf_num = scbc->sbuf_num_cur = numbufs; + SLIST_INIT(&scbc->sbuf_free); + scbc->sbuf_buf_size = bufsize; + scbc->sbuf_buf_base = NULL; + scbc->sbuf_buf_alloc = NULL; + scbc->sbuf_buf_last = NULL; + + /* send buffers are not mandatory but when allocating them, make sure they + * are on a page boundary */ + if (numbufs > 0) { + struct ips_scbbuf *sbuf; + + bufsize = PSMI_ALIGNUP(bufsize, 64); + + alloc_sz = numbufs * bufsize + PSMI_PAGESIZE; + scbc->sbuf_buf_alloc = + psmi_calloc(ep, NETWORK_BUFFERS, 1, alloc_sz); + if (scbc->sbuf_buf_alloc == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + base = (uintptr_t) scbc->sbuf_buf_alloc; + base = PSMI_ALIGNUP(base, PSMI_PAGESIZE); + scbc->sbuf_buf_base = (void *)base; + scbc->sbuf_buf_last = (void *)(base + bufsize * (numbufs - 1)); + _HFI_VDBG + ("sendbufs=%d, (size=%d),base=[%p..%p)\n", + numbufs, bufsize, + (void *)scbc->sbuf_buf_base, (void *)scbc->sbuf_buf_last); + + for (i = 0; i < numbufs; i++) { + sbuf = (struct ips_scbbuf *)(base + bufsize * i); + SLIST_NEXT(sbuf, next) = NULL; + SLIST_INSERT_HEAD(&scbc->sbuf_free, sbuf, next); + } + } + + imm_base = 0; + scbc->scb_imm_size = imm_size; + if (scbc->scb_imm_size) { + scbc->scb_imm_size = PSMI_ALIGNUP(imm_size, 64); + alloc_sz = numscb * scbc->scb_imm_size + 64; + scbc->scb_imm_buf = psmi_memalign(ep, NETWORK_BUFFERS, 64, + alloc_sz); + + if (scbc->scb_imm_buf == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + + memset(scbc->scb_imm_buf, 0, alloc_sz); + imm_base = PSMI_ALIGNUP(scbc->scb_imm_buf, 64); + } else + scbc->scb_imm_buf = NULL; + + scbc->scb_num = scbc->scb_num_cur = numscb; + SLIST_INIT(&scbc->scb_free); + + scb_size = PSMI_ALIGNUP(sizeof(*scb), 64); + alloc_sz = numscb * scb_size; + + scbc->scb_base = psmi_memalign(ep, NETWORK_BUFFERS, 64, alloc_sz); + if (scbc->scb_base == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + + memset(scbc->scb_base, 0, alloc_sz); + base = (uintptr_t) scbc->scb_base; + + /* + * Allocate ack/send timer for each scb object. 
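+	 * Two timers are carved out of a single array per scb; for scb i
+	 * the pairing established in the loop below is
+	 *
+	 *   scb[i].timer_ack  = &scbc->timers[2*i]
+	 *   scb[i].timer_send = &scbc->timers[2*i+1]
+	 *
+	 * so one psmi_calloc() of 2*numscb entries covers them all.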
+	 */
+	scbc->timers = (struct psmi_timer *)
+		psmi_calloc(ep, UNDEFINED, 2*numscb,
+			    sizeof(struct psmi_timer));
+	if (scbc->timers == NULL) {
+		err = PSM2_NO_MEMORY;
+		goto fail;
+	}
+
+	for (i = 0; i < numscb; i++) {
+		scb = (struct ips_scb *)(base + i * scb_size);
+
+		scb->scbc = scbc;
+		if (scbc->scb_imm_buf)
+			scb->imm_payload =
+			    (void *)(imm_base + (i * scbc->scb_imm_size));
+		else
+			scb->imm_payload = NULL;
+
+		SLIST_INSERT_HEAD(&scbc->scb_free, scb, next);
+
+		/*
+		 * Initialize timers.
+		 * Associate the timers to each scb; the association is
+		 * not fixed because PSM may later exchange the timers
+		 * between scbs. The reason for exchanging is that a
+		 * timer may still be in use by a flow while its scb is
+		 * to be freed. See ack/nak processing in file
+		 * ips_proto_recv.c.
+		 */
+		scb->timer_ack = &scbc->timers[2*i];
+		psmi_timer_entry_init(scb->timer_ack,
+				      ips_proto_timer_ack_callback, scb);
+
+		scb->timer_send = &scbc->timers[2*i+1];
+		psmi_timer_entry_init(scb->timer_send,
				      ips_proto_timer_send_callback, scb);
+	}
+	scbc->scb_avail_callback = scb_avail_callback;
+	scbc->scb_avail_context = scb_avail_context;
+
+
+fail:
+	return err;
+}
+
+psm2_error_t ips_scbctrl_fini(struct ips_scbctrl *scbc)
+{
+	if (scbc->scb_base != NULL) {
+		psmi_free(scbc->scb_base);
+	}
+	if (scbc->sbuf_buf_alloc) {
+		psmi_free(scbc->sbuf_buf_alloc);
+	}
+	return PSM2_OK;
+}
+
+int ips_scbctrl_bufalloc(ips_scb_t *scb)
+{
+	struct ips_scbctrl *scbc = scb->scbc;
+
+	psmi_assert(scbc->sbuf_num > 0);
+	psmi_assert(!((ips_scb_buffer(scb) >= scbc->sbuf_buf_base) &&
+		      (ips_scb_buffer(scb) <= scbc->sbuf_buf_last)));
+	psmi_assert(scb->payload_size <= scbc->sbuf_buf_size);
+
+	if (scb->payload_size <= scbc->scb_imm_size) {
+		/* Attach immediate buffer */
+		ips_scb_buffer(scb) = scb->imm_payload;
+		return 1;
+	}
+
+	if (SLIST_EMPTY(&scbc->sbuf_free))
+		return 0;
+	else {
+		psmi_assert(scbc->sbuf_num_cur);
+		ips_scb_buffer(scb) = SLIST_FIRST(&scbc->sbuf_free);
+		scbc->sbuf_num_cur--;
+
+		/* If under memory pressure request ACK for packet to reclaim
+		 * credits.
+		 */
+		if (scbc->sbuf_num_cur < (scbc->sbuf_num >> 1))
+			scb->scb_flags |= IPS_SEND_FLAG_ACKREQ;
+
+		SLIST_REMOVE_HEAD(&scbc->sbuf_free, next);
+		return 1;
+	}
+}
+
+int ips_scbctrl_avail(struct ips_scbctrl *scbc)
+{
+	return (!SLIST_EMPTY(&scbc->scb_free) && scbc->sbuf_num_cur > 0);
+}
+
+ips_scb_t *MOCKABLE(ips_scbctrl_alloc)(struct ips_scbctrl *scbc, int scbnum, int len,
+				       uint32_t flags)
+{
+	ips_scb_t *scb, *scb_head = NULL;
+
+	psmi_assert(flags & IPS_SCB_FLAG_ADD_BUFFER ? 
(scbc->sbuf_num > 0) : 1); + psmi_assert(scbc->sbuf_buf_size >= len); + + while (scbnum--) { + if (SLIST_EMPTY(&scbc->scb_free)) + break; + scb = SLIST_FIRST(&scbc->scb_free); + /* Need to set this here as bufalloc may request + * an ACK under memory pressure + */ + scb->scb_flags = 0; + if (flags & IPS_SCB_FLAG_ADD_BUFFER) { + scb->payload_size = len; + if (!ips_scbctrl_bufalloc(scb)) + break; + } else { + ips_scb_buffer(scb) = NULL; + scb->payload_size = 0; + } + + scb->tidsendc = NULL; + scb->callback = NULL; + scb->tidctrl = 0; + scb->nfrag = 1; + scb->frag_size = 0; +#ifdef PSM_CUDA + scb->mq_req = NULL; +#endif + + scbc->scb_num_cur--; + if (scbc->scb_num_cur < (scbc->scb_num >> 1)) + scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; + + SLIST_REMOVE_HEAD(&scbc->scb_free, next); + SLIST_NEXT(scb, next) = scb_head; + scb_head = scb; + } + return scb_head; +} +MOCK_DEF_EPILOGUE(ips_scbctrl_alloc); + +void ips_scbctrl_free(ips_scb_t *scb) +{ + struct ips_scbctrl *scbc = scb->scbc; + if (scbc->sbuf_num && (ips_scb_buffer(scb) >= scbc->sbuf_buf_base) && + (ips_scb_buffer(scb) <= scbc->sbuf_buf_last)) { + scbc->sbuf_num_cur++; + SLIST_INSERT_HEAD(&scbc->sbuf_free, scb->sbuf, next); + } + + ips_scb_buffer(scb) = NULL; + scb->tidsendc = NULL; + scb->payload_size = 0; + scbc->scb_num_cur++; + if (SLIST_EMPTY(&scbc->scb_free)) { + SLIST_INSERT_HEAD(&scbc->scb_free, scb, next); + if (scbc->scb_avail_callback != NULL) + scbc->scb_avail_callback(scbc, scbc->scb_avail_context); + } else + SLIST_INSERT_HEAD(&scbc->scb_free, scb, next); + + return; +} + +ips_scb_t *MOCKABLE(ips_scbctrl_alloc_tiny)(struct ips_scbctrl *scbc) +{ + ips_scb_t *scb; + if (SLIST_EMPTY(&scbc->scb_free)) + return NULL; + scb = SLIST_FIRST(&scbc->scb_free); + + SLIST_REMOVE_HEAD(&scbc->scb_free, next); + SLIST_NEXT(scb, next) = NULL; + + ips_scb_buffer(scb) = NULL; + scb->payload_size = 0; + scb->scb_flags = 0; + scb->tidsendc = NULL; + scb->callback = NULL; + scb->tidctrl = 0; + scb->nfrag = 1; + scb->frag_size = 0; +#ifdef PSM_CUDA + scb->mq_req = NULL; +#endif + + scbc->scb_num_cur--; + if (scbc->scb_num_cur < (scbc->scb_num >> 1)) + scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; + return scb; +} +MOCK_DEF_EPILOGUE(ips_scbctrl_alloc_tiny); diff --git a/ptl_ips/ips_scb.h b/ptl_ips/ips_scb.h new file mode 100644 index 0000000..8d914d1 --- /dev/null +++ b/ptl_ips/ips_scb.h @@ -0,0 +1,219 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _IPS_SCB_H +#define _IPS_SCB_H + +#include "psm2_mock_testing.h" +#include "psm_user.h" +#include "ips_proto_header.h" + +/* ips_alloc_scb flags */ +#define IPS_SCB_FLAG_NONE 0x0 +#define IPS_SCB_FLAG_ADD_BUFFER 0x1 + +/* macros to update scb */ +#define ips_scb_opcode(scb) scb->opcode +#define ips_scb_buffer(scb) scb->payload +#define ips_scb_length(scb) scb->payload_size +#define ips_scb_flags(scb) scb->scb_flags +#define ips_scb_dma_cntr(scb) scb->dma_cntr +#define ips_scb_epaddr(scb) scb->epaddr +#define ips_scb_cb(scb) scb->callback +#define ips_scb_cb_param(scb) scb->cb_param + +#define ips_scb_copy_tag(dst, src) \ + (dst)[0] = (src)[0]; \ + (dst)[1] = (src)[1]; \ + (dst)[2] = (src)[2]; + +struct ips_scbbuf; +struct ips_scb; +struct ips_scbctrl; +struct ips_tid_send_desc; + +typedef void (*ips_scbctrl_avail_callback_fn_t) (struct ips_scbctrl *, + void *context); + +STAILQ_HEAD(ips_scb_stailq, ips_scb); +SLIST_HEAD(ips_scb_slist, ips_scb); + +struct ips_scbctrl { + /* const psmi_context_t *context; */ + + /* Send control blocks for each send */ + uint32_t scb_num; + uint32_t scb_num_cur; + SLIST_HEAD(scb_free, ips_scb) scb_free; + void *scb_base; + ips_scbctrl_avail_callback_fn_t scb_avail_callback; + void *scb_avail_context; + + /* Immediate data for send buffers */ + uint32_t scb_imm_size; + void *scb_imm_buf; + psmi_timer *timers; /* ack/send timers */ + + /* + * Send buffers (or bounce buffers) to keep user data if we need to + * retransmit. 
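+	 * A bounce buffer is attached only when the payload does not fit
+	 * the scb's immediate buffer (see ips_scbctrl_bufalloc()); as a
+	 * sketch of that decision:
+	 *
+	 *   payload_size <= scb_imm_size -> payload = scb->imm_payload
+	 *   payload_size >  scb_imm_size -> payload = first free sbuf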
+	 */
+	uint32_t sbuf_num;
+	uint32_t sbuf_num_cur;
+	SLIST_HEAD(sbuf_free, ips_scbbuf) sbuf_free;
+	void *sbuf_buf_alloc;
+	uint32_t sbuf_buf_size;
+	void *sbuf_buf_base;
+	void *sbuf_buf_last;
+};
+
+struct ips_scbbuf {
+	SLIST_ENTRY(ips_scbbuf) next;
+};
+
+typedef struct ips_scb ips_scb_t;
+
+struct ips_scb {
+	union {
+		SLIST_ENTRY(ips_scb) next;
+		STAILQ_ENTRY(ips_scb) nextq;
+	};
+	union {
+		void *payload;
+		struct ips_scbbuf *sbuf;
+	};
+	uint64_t ack_timeout;	/* in cycles */
+	uint64_t abs_timeout;	/* in cycles */
+
+	psmi_timer *timer_send;	/* for sending packets */
+	psmi_timer *timer_ack;	/* for acking packets */
+
+	/* Used when composing packet */
+	psmi_seqnum_t seq_num;
+	uint32_t cksum[2];
+	uint32_t scb_flags;
+	uint32_t payload_size;	/* remaining first packet size */
+	uint32_t chunk_size;	/* total buffer size if nfrag > 1 */
+	/* initially chunk_size_remaining = chunk_size. */
+	uint32_t chunk_size_remaining;	/* buffer size to re-transmit */
+	uint16_t nfrag;	/* total packets in sequence */
+	/* initially nfrag_remaining = nfrag */
+	uint16_t nfrag_remaining;	/* number of packets to re-transmit */
+	uint16_t dma_complete;
+	uint16_t tidctrl;
+	uint16_t frag_size;	/* max packet size in sequence */
+	uint16_t opcode;
+	uint16_t tsess_length;
+	uint32_t *tsess;
+	struct ips_flow *flow;
+	struct ips_tid_send_desc *tidsendc;
+
+	struct ips_scbctrl *scbc;
+	void *imm_payload;
+
+	union {
+		int (*callback) (void *, uint32_t);
+		psm2_am_completion_fn_t completion_am;
+	};
+	void *cb_param;
+#ifdef PSM_CUDA
+	psm2_mq_req_t mq_req;	/* back pointer to original request */
+#endif
+	/* sdma header placeholder; PSM2 code should access
+	 * the psm_hal_sdma_req_info only using the psmi_get_sdma_req_info()
+	 * accessor function. */
+	/*
+	 * The size of struct psm_hal_sdma_req_info is variable (10 bytes for
+	 * GPU-Direct and 8 bytes for non GPU-Direct).
+	 * When the GPU-Direct feature is used, all 10 bytes of the space are used.
+	 * Otherwise, we only use up to 8 bytes. 
The usage is controlled by + * psmi_get_sdma_req_info() in ips_proto.h + */ + struct psm_hal_sdma_req_info _DO_NOT_USE_; + struct { + struct psm_hal_pbc pbc; + struct ips_message_header ips_lrh; + } PSMI_CACHEALIGN; +}; + +/* Make sure pbc is at the right place before the message header */ + +COMPILE_TIME_ASSERT(PBC_ABUTS_IPS_MSG_HDR,(sizeof(struct psm_hal_pbc) == + (size_t) (offsetof(struct ips_scb, ips_lrh) - + offsetof(struct ips_scb, pbc)))); + +#ifdef PSM_CUDA +#define IS_TRANSFER_BUF_GPU_MEM(scb) (ips_scb_flags(scb) & IPS_SEND_FLAG_PAYLOAD_BUF_GPU) +#endif + +void ips_scbctrl_free(ips_scb_t *scb); +int ips_scbctrl_bufalloc(ips_scb_t *scb); +int ips_scbctrl_avail(struct ips_scbctrl *scbc); +ips_scb_t *MOCKABLE(ips_scbctrl_alloc)(struct ips_scbctrl *scbc, + int scbnum, int len, uint32_t flags); +MOCK_DCL_EPILOGUE(ips_scbctrl_alloc); +ips_scb_t *MOCKABLE(ips_scbctrl_alloc_tiny)(struct ips_scbctrl *scbc); +MOCK_DCL_EPILOGUE(ips_scbctrl_alloc_tiny); + +psm2_error_t ips_scbctrl_init(const psmi_context_t *context, + uint32_t numscb, uint32_t numbufs, + uint32_t imm_size, uint32_t bufsize, + ips_scbctrl_avail_callback_fn_t, + void *avail_context, struct ips_scbctrl *); +psm2_error_t ips_scbctrl_fini(struct ips_scbctrl *); + +psm2_error_t ips_scbctrl_writev(struct ips_scb_slist *slist, int fd); + +#endif /* _IPS_SCB_H */ diff --git a/ptl_ips/ips_stats.h b/ptl_ips/ips_stats.h new file mode 100644 index 0000000..046e0c3 --- /dev/null +++ b/ptl_ips/ips_stats.h @@ -0,0 +1,83 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _IPS_STATS_H +#define _IPS_STATS_H + +struct psm2_epaddr; /* for non-PSM clients */ + +/* Old stats */ +typedef struct { + uint64_t err_chk_send; + uint64_t err_chk_recv; + uint64_t send_failed; + uint64_t recv_dropped; + union { + uint64_t recv_copied; /* obsolete */ + uint64_t nak_sent; + }; + uint64_t nak_recv; + uint64_t total_send_eager; + uint64_t total_send_exp; + uint64_t acks_sent; + uint64_t retransmits; + uint64_t recv_matched; + uint64_t recv_unmatched; + uint64_t scb_alloc_yields; +} ips_sess_stat; + +int ips_get_stat(struct psm2_epaddr *epaddr, ips_sess_stat *stats); + +#endif /* _IPS_STATS_H */ diff --git a/ptl_ips/ips_subcontext.h b/ptl_ips/ips_subcontext.h new file mode 100644 index 0000000..4f5afcb --- /dev/null +++ b/ptl_ips/ips_subcontext.h @@ -0,0 +1,79 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef __IPS_SUBCONTEXT_H +#define __IPS_SUBCONTEXT_H + +#include "psm_user.h" +#include "ips_recvhdrq.h" +#include "ips_writehdrq.h" + +/* This data structure is allocated in ureg page of each subcontext process */ + +struct ips_subcontext_ureg { + /* head/eager head/tail register storage, one per cacheline + (member is unused by PSM, but needed here to match driver structures). */ + uint64_t subcontext_uregbase[40 /* i.e. ur_maxreg * 8 */]; + struct ips_writehdrq_state writeq_state; /* used in all ureg pages */ +} __attribute__ ((aligned(64))); + +struct ips_hwcontext_ctrl { + pthread_spinlock_t context_lock; /* lock shared by all subctxts */ + struct ips_recvhdrq_state recvq_state; /* state shared by all subctxts */ + uint32_t rx_hdrq_rhf_seq; /* rhf seq for the hw hdrq shared + by all subctxts */ +} __attribute__ ((aligned(64))); + +#endif diff --git a/ptl_ips/ips_tid.c b/ptl_ips/ips_tid.c new file mode 100644 index 0000000..365de2a --- /dev/null +++ b/ptl_ips/ips_tid.c @@ -0,0 +1,275 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm2_hal.h" +#include "ips_tid.h" +#include "ips_proto.h" +#include "ips_expected_proto.h" + +psm2_error_t +ips_tid_init(const psmi_context_t *context, struct ips_protoexp *protoexp, + ips_tid_avail_cb_fn_t cb, void *cb_context) +{ + struct ips_tid *tidc = &protoexp->tidc; + + struct psmi_stats_entry entries[] = { + PSMI_STATS_DECL("tid update count", MPSPAWN_STATS_REDUCTION_ALL, + NULL, &tidc->tid_num_total), + }; + + tidc->context = context; + tidc->protoexp = protoexp; + tidc->tid_num_total = 0; + tidc->tid_num_inuse = 0; + tidc->tid_avail_cb = cb; + tidc->tid_avail_context = cb_context; + tidc->tid_array = NULL; + + /* + * PSM uses tid registration caching only if driver has enabled it. + */ + if (!psmi_hal_has_cap(PSM_HAL_CAP_TID_UNMAP)) { + int i; + cl_qmap_t *p_map; + cl_map_item_t *root,*nil_item; + + tidc->tid_array = (uint32_t *) + psmi_calloc(context->ep, UNDEFINED, + psmi_hal_get_tid_exp_cnt(context->psm_hw_ctxt), + sizeof(uint32_t)); + if (tidc->tid_array == NULL) + return PSM2_NO_MEMORY; + + /* + * first is root node, last is terminator node. + */ + p_map = &tidc->tid_cachemap; + root = (cl_map_item_t *) + psmi_calloc(context->ep, UNDEFINED, + psmi_hal_get_tid_exp_cnt(context->psm_hw_ctxt) + 2, + sizeof(cl_map_item_t)); + + if (root == NULL) + return PSM2_NO_MEMORY; + + nil_item = &root + [psmi_hal_get_tid_exp_cnt(context->psm_hw_ctxt) + 1]; + + ips_tidcache_map_init(p_map,root,nil_item); + + NTID = 0; + NIDLE = 0; + IPREV(IHEAD) = INEXT(IHEAD) = IHEAD; + for (i = 1; i <= psmi_hal_get_tid_exp_cnt(context->psm_hw_ctxt); i++) { + INVALIDATE(i) = 1; + } + + /* + * if not shared context, all tids are used by the same + * process. Otherwise, subcontext process can only cache + * its own portion. Driver makes the same tid number + * assignment to subcontext processes. + */ + tidc->tid_cachesize = psmi_hal_get_tid_exp_cnt(context->psm_hw_ctxt); + if (psmi_hal_get_subctxt_cnt(context->psm_hw_ctxt) > 0) { + uint16_t remainder = tidc->tid_cachesize % + psmi_hal_get_subctxt_cnt(context->psm_hw_ctxt); + tidc->tid_cachesize /= psmi_hal_get_subctxt_cnt(context->psm_hw_ctxt); + if (psmi_hal_get_subctxt(context->psm_hw_ctxt) < remainder) + tidc->tid_cachesize++; + } + } + + /* + * Setup shared control structure. + */ + tidc->tid_ctrl = (struct ips_tid_ctrl *)context->tid_ctrl; + if (!tidc->tid_ctrl) { + tidc->tid_ctrl = (struct ips_tid_ctrl *) + psmi_calloc(context->ep, UNDEFINED, 1, + sizeof(struct ips_tid_ctrl)); + if (tidc->tid_ctrl == NULL) { + return PSM2_NO_MEMORY; + } + } + + /* + * Only the master process can initialize. 
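+ * PTHREAD_PROCESS_SHARED below is what makes a spinlock that lives in a
+ * shared page usable from every subcontext process. A minimal sketch of
+ * the same pattern, with hypothetical names (illustration only):
+ *
+ *   struct shared_ctrl { pthread_spinlock_t lock; uint32_t avail; };
+ *   struct shared_ctrl *ctrl = map_shared_page();   mapped by all ranks
+ *   if (my_subcontext == 0)                         master only
+ *       pthread_spin_init(&ctrl->lock, PTHREAD_PROCESS_SHARED);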
+ */ + if (psmi_hal_get_subctxt(context->psm_hw_ctxt) == 0) { + pthread_spin_init(&tidc->tid_ctrl->tid_ctrl_lock, + PTHREAD_PROCESS_SHARED); + + tidc->tid_ctrl->tid_num_max = + psmi_hal_get_tid_exp_cnt(context->psm_hw_ctxt); + tidc->tid_ctrl->tid_num_avail = tidc->tid_ctrl->tid_num_max; + } + + return psmi_stats_register_type(PSMI_STATS_NO_HEADING, + PSMI_STATSTYPE_TIDS, + entries, + PSMI_STATS_HOWMANY(entries), tidc); +} + +psm2_error_t ips_tid_fini(struct ips_tid *tidc) +{ + if (tidc->tid_array) + ips_tidcache_cleanup(tidc); + + if (!tidc->context->tid_ctrl) + psmi_free(tidc->tid_ctrl); + + return PSM2_OK; +} + +psm2_error_t +ips_tid_acquire(struct ips_tid *tidc, + const void *buf, uint32_t *length, + uint32_t *tid_array, uint32_t *tidcnt +#ifdef PSM_CUDA + , uint8_t is_cuda_ptr +#endif + ) +{ + struct ips_tid_ctrl *ctrl = tidc->tid_ctrl; + psm2_error_t err = PSM2_OK; + uint16_t flags = 0; + int rc; + + psmi_assert(((uintptr_t) buf & 0xFFF) == 0); + psmi_assert(((*length) & 0xFFF) == 0); + + if (tidc->context->tid_ctrl) + pthread_spin_lock(&ctrl->tid_ctrl_lock); + + if (!ctrl->tid_num_avail) { + err = PSM2_EP_NO_RESOURCES; + goto fail; + } + + /* Clip length if it exceeds worst case tid allocation, + where each entry in the tid array can accommodate only + 1 page. */ + if (*length > 4096*tidc->tid_ctrl->tid_num_max) + { + *length = 4096*tidc->tid_ctrl->tid_num_max; + } + +#ifdef PSM_CUDA + if (is_cuda_ptr) + flags = PSM_HAL_BUF_GPU_MEM; +#endif + + rc = psmi_hal_update_tid(tidc->context->psm_hw_ctxt, + (uint64_t) (uintptr_t) buf, length, + (uint64_t) (uintptr_t) tid_array, tidcnt, flags); + + if (rc < 0) { + /* Unable to pin pages? retry later */ + err = PSM2_EP_DEVICE_FAILURE; + goto fail; + } + + psmi_assert_always((*tidcnt) > 0); + psmi_assert(ctrl->tid_num_avail >= (*tidcnt)); + ctrl->tid_num_avail -= (*tidcnt); + tidc->tid_num_total += (*tidcnt); + tidc->tid_num_inuse += (*tidcnt); + +fail: + if (tidc->context->tid_ctrl) + pthread_spin_unlock(&ctrl->tid_ctrl_lock); + + return err; +} + +psm2_error_t +ips_tid_release(struct ips_tid *tidc, + uint32_t *tid_array, uint32_t tidcnt) +{ + struct ips_tid_ctrl *ctrl = tidc->tid_ctrl; + psm2_error_t err = PSM2_OK; + + psmi_assert(tidcnt > 0); + if (tidc->context->tid_ctrl) + pthread_spin_lock(&ctrl->tid_ctrl_lock); + + if (psmi_hal_free_tid(tidc->context->psm_hw_ctxt, + (uint64_t) (uintptr_t) tid_array, tidcnt) < 0) { + if (tidc->context->tid_ctrl) + pthread_spin_unlock(&ctrl->tid_ctrl_lock); + + /* If failed to unpin pages, it's fatal error */ + err = psmi_handle_error(tidc->context->ep, + PSM2_EP_DEVICE_FAILURE, + "Failed to tid free %d tids", + tidcnt); + goto fail; + } + + ctrl->tid_num_avail += tidcnt; + if (tidc->context->tid_ctrl) + pthread_spin_unlock(&ctrl->tid_ctrl_lock); + + tidc->tid_num_inuse -= tidcnt; + /* If an available callback is registered invoke it */ + if (((tidc->tid_num_inuse + tidcnt) == ctrl->tid_num_max) + && tidc->tid_avail_cb) + tidc->tid_avail_cb(tidc, tidc->tid_avail_context); + +fail: + return err; +} diff --git a/ptl_ips/ips_tid.h b/ptl_ips/ips_tid.h new file mode 100644 index 0000000..42cad27 --- /dev/null +++ b/ptl_ips/ips_tid.h @@ -0,0 +1,154 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +/* included header files */ + +#ifndef _IPS_TID_H +#define _IPS_TID_H + +#include "psm_user.h" +#include "ips_tidcache.h" + +struct ips_tid; + +typedef void (*ips_tid_avail_cb_fn_t) (struct ips_tid *, void *context); + +struct ips_tid_ctrl { + pthread_spinlock_t tid_ctrl_lock; + uint32_t tid_num_max; + uint32_t tid_num_avail; +} __attribute__ ((aligned(64))); + +struct ips_tid { + const psmi_context_t *context; + struct ips_protoexp *protoexp; + + void *tid_avail_context; + struct ips_tid_ctrl *tid_ctrl; + + ips_tid_avail_cb_fn_t tid_avail_cb; + uint64_t tid_num_total; + uint32_t tid_num_inuse; + uint32_t tid_cachesize; /* items can be cached */ + cl_qmap_t tid_cachemap; /* RB tree implementation */ + /* + * tids storage. + * This is used in tid registration caching case for + * tid invalidation, acquire, replace and release, + * entries should be the assigned tid number. + */ + uint32_t *tid_array; +}; + +psm2_error_t ips_tid_init(const psmi_context_t *context, + struct ips_protoexp *protoexp, + ips_tid_avail_cb_fn_t cb, void *cb_context); +psm2_error_t ips_tid_fini(struct ips_tid *tidc); + +/* Acquiring tids. 
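+ * For illustration, a caller typically derives the page-aligned region
+ * first; a sketch assuming 4 KB pages and hypothetical buf/len inputs:
+ *
+ *   uintptr_t start = (uintptr_t)buf & ~(uintptr_t)0xFFF;
+ *   uint32_t length = (uint32_t)((((uintptr_t)buf + len + 0xFFF)
+ *                      & ~(uintptr_t)0xFFF) - start);
+ *
+ * which satisfies the two constraints below: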
+ * Buffer base has to be aligned on page boundary + * Buffer length has to be multiple pages + */ +psm2_error_t ips_tidcache_acquire(struct ips_tid *tidc, + const void *buf, /* input buffer, aligned to page boundary */ + uint32_t *length, /* buffer length, aligned to page size */ + uint32_t *tid_array, /* output tidarray, */ + uint32_t *tidcnt, /* output of tid count */ + uint32_t *pageoff /* output of offset in first tid */ +#ifdef PSM_CUDA + , uint8_t is_cuda_ptr +#endif + ); + +psm2_error_t ips_tidcache_release(struct ips_tid *tidc, + uint32_t *tid_array, /* input tidarray, */ + uint32_t tidcnt); /* input of tid count */ + +psm2_error_t ips_tidcache_cleanup(struct ips_tid *tidc); +psm2_error_t ips_tidcache_invalidation(struct ips_tid *tidc); + +psm2_error_t ips_tid_acquire(struct ips_tid *tidc, + const void *buf, /* input buffer, aligned to page boundary */ + uint32_t *length, /* buffer length, aligned to page size */ + uint32_t *tid_array, /* output tidarray, */ + uint32_t *tidcnt +#ifdef PSM_CUDA + , uint8_t is_cuda_ptr +#endif + ); /* output of tid count */ + +psm2_error_t ips_tid_release(struct ips_tid *tidc, + uint32_t *tid_array, /* input tidarray, */ + uint32_t tidcnt); /* input of tid count */ + +PSMI_INLINE(int ips_tid_num_available(struct ips_tid *tidc)) +{ + if (tidc->tid_ctrl->tid_num_avail == 0) { + if (tidc->tid_ctrl->tid_num_max == tidc->tid_num_inuse) + return -1; + else + return 0; + } + + return tidc->tid_ctrl->tid_num_avail; +} + +/* Note that the caller is responsible for making sure that NIDLE is non-zero + before calling ips_tidcache_evict. If NIDLE is 0 at the time of call, + ips_tidcache_evict is unstable. + */ +uint64_t ips_tidcache_evict(struct ips_tid *tidc, uint64_t length); + +#endif /* _IPS_TID_H */ diff --git a/ptl_ips/ips_tidcache.c b/ptl_ips/ips_tidcache.c new file mode 100644 index 0000000..393ab3a --- /dev/null +++ b/ptl_ips/ips_tidcache.c @@ -0,0 +1,674 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "psm_user.h" +#include "psm2_hal.h" +#include "ips_proto.h" +#include "ips_expected_proto.h" + +#define RBTREE_GET_LEFTMOST(PAYLOAD_PTR) ((PAYLOAD_PTR)->start) +#define RBTREE_GET_RIGHTMOST(PAYLOAD_PTR) ((PAYLOAD_PTR)->start+((PAYLOAD_PTR)->length<<12)) +#define RBTREE_ASSERT psmi_assert +#define RBTREE_MAP_COUNT(PAYLOAD_PTR) ((PAYLOAD_PTR)->ntid) + +#include "rbtree.c" + +void ips_tidcache_map_init(cl_qmap_t *p_map, + cl_map_item_t* const root, + cl_map_item_t* const nil_item) +{ + ips_cl_qmap_init(p_map,root,nil_item); +} + +/* + * + * Force to remove a tid, check invalidation event afterwards. + */ +static psm2_error_t +ips_tidcache_remove(struct ips_tid *tidc, uint32_t tidcnt) +{ + cl_qmap_t *p_map = &tidc->tid_cachemap; + uint32_t idx; + uint64_t events_mask; + psm2_error_t err; + + /* + * call driver to free the tids. + */ + if (psmi_hal_free_tid(tidc->context->psm_hw_ctxt, + (uint64_t) (uintptr_t) tidc->tid_array, tidcnt) < 0) { + /* If failed to unpin pages, it's fatal error */ + err = psmi_handle_error(tidc->context->ep, + PSM2_EP_DEVICE_FAILURE, + "Failed to tid free %d tids", 1); + return err; + } + + while (tidcnt) { + tidcnt--; + idx = 2*IPS_TIDINFO_GET_TID(tidc->tid_array[tidcnt]) + + IPS_TIDINFO_GET_TIDCTRL(tidc->tid_array[tidcnt]); + + /* + * sanity check. + */ + psmi_assert(idx != 0); + psmi_assert(idx <= tidc->tid_ctrl->tid_num_max); + psmi_assert(INVALIDATE(idx) == 0); + psmi_assert(REFCNT(idx) == 0); + + /* + * mark the tid invalidated. + */ + INVALIDATE(idx) = 1; + + /* + * remove the tid from RB tree. + */ + IDLE_REMOVE(idx); + ips_cl_qmap_remove_item(p_map, &p_map->root[idx]); + } + + /* + * Because the freed tid is not from invalidation list, + * it is possible that kernel just invalidated the tid, + * then we need to check and process the invalidation + * before we can re-use this tid. The reverse order + * will wrongly invalidate this tid again. + */ + err = psmi_hal_get_hfi_event_bits(&events_mask,tidc->context->psm_hw_ctxt); + + if_pf (err) + return PSM2_INTERNAL_ERR; + + if (events_mask & PSM_HAL_HFI_EVENT_TID_MMU_NOTIFY) { + err = ips_tidcache_invalidation(tidc); + if (err) + return err; + } + + return PSM2_OK; +} + +/* + * Register a new buffer with driver, and cache the tidinfo. + */ +static psm2_error_t +ips_tidcache_register(struct ips_tid *tidc, + unsigned long start, uint32_t length, uint32_t *firstidx +#ifdef PSM_CUDA + , uint8_t is_cuda_ptr +#endif + ) +{ + cl_qmap_t *p_map = &tidc->tid_cachemap; + uint32_t tidoff, tidlen; + uint32_t idx, tidcnt; + uint16_t flags = 0; + psm2_error_t err; + + /* + * make sure we have at least one free tid to + * register the new buffer. + */ + if (NTID == tidc->tid_cachesize) { + /* all tids are in active use, error? 
*/ + if (NIDLE == 0) + return PSM2_OK_NO_PROGRESS; + + /* + * free the first tid in idle queue. + */ + idx = IPREV(IHEAD); + tidc->tid_array[0] = p_map->root[idx].payload.tidinfo; + err = ips_tidcache_remove(tidc, 1); + if (err) + return err; + } + psmi_assert(NTID < tidc->tid_cachesize); + + /* Clip length if it exceeds worst case tid allocation, + where each entry in the tid array can accommodate only + 1 page. */ + if (length > 4096*tidc->tid_ctrl->tid_num_max) + { + length = 4096*tidc->tid_ctrl->tid_num_max; + } + /* + * register the new buffer. + */ + +retry: + tidcnt = 0; + +#ifdef PSM_CUDA + if (is_cuda_ptr) + flags = PSM_HAL_BUF_GPU_MEM; +#endif + + if (psmi_hal_update_tid(tidc->context->psm_hw_ctxt, + (uint64_t) start, &length, + (uint64_t) tidc->tid_array, &tidcnt, + flags) < 0) { + /* if driver reaches lockable memory limit */ + if ((errno == ENOMEM +#ifdef PSM_CUDA + /* This additional check is in place for just the cuda + * version. It is a temporary workaround for a known + * issue where nvidia driver returns EINVAL instead of + * ENOMEM when there is no BAR1 space left to pin pages. + * PSM frees tidcache enteries when the driver sends + * EINVAL there by unpinning pages and freeing some + * BAR1 space.*/ + || (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)start) && errno == EINVAL) +#endif + ) && NIDLE) { + uint64_t lengthEvicted = ips_tidcache_evict(tidc,length); + + if (lengthEvicted >= length) + goto retry; + } else if (errno == EFAULT) + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + " Unhandled error in TID Update: %s\n", strerror(errno)); +#ifdef PSM_CUDA + else if (PSMI_IS_CUDA_ENABLED && errno == ENOTSUP) + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + " Nvidia driver apis mismatch: %s\n", strerror(errno)); +#endif + + /* Unable to pin pages? retry later */ + return PSM2_EP_DEVICE_FAILURE; + } + psmi_assert_always(tidcnt > 0); + psmi_assert((tidcnt+NTID) <= tidc->tid_cachesize); + + /* + * backward processing because we want to return + * the first RB index in the array. + */ + idx = 0; + tidoff = length; + while (tidcnt) { + /* + * Driver only returns tidctrl=1 or tidctrl=2. + */ + tidcnt--; + idx = 2*IPS_TIDINFO_GET_TID(tidc->tid_array[tidcnt]) + + IPS_TIDINFO_GET_TIDCTRL(tidc->tid_array[tidcnt]); + tidlen = IPS_TIDINFO_GET_LENGTH(tidc->tid_array[tidcnt]); + + /* + * sanity check. + */ + psmi_assert(idx != 0); + psmi_assert(idx <= tidc->tid_ctrl->tid_num_max); + psmi_assert(INVALIDATE(idx) != 0); + psmi_assert(REFCNT(idx) == 0); + + /* + * clear the tid invalidated. + */ + INVALIDATE(idx) = 0; + + /* + * put the tid into a RB node. + */ + tidoff -= tidlen << 12; + START(idx) = start + tidoff; + LENGTH(idx) = tidlen; + p_map->root[idx].payload.tidinfo = tidc->tid_array[tidcnt]; + + /* + * put the node into RB tree and idle queue head. + */ + IDLE_INSERT(idx); + ips_cl_qmap_insert_item(p_map, &p_map->root[idx]); + } + psmi_assert(idx != 0); + psmi_assert(tidoff == 0); + *firstidx = idx; + + return PSM2_OK; +} + +/* + * Get mmu notifier invalidation info and update PSM's caching. + */ +psm2_error_t +ips_tidcache_invalidation(struct ips_tid *tidc) +{ + cl_qmap_t *p_map = &tidc->tid_cachemap; + uint32_t i, j, idx, tidcnt; + psm2_error_t err; + + /* + * get a list of invalidated tids from driver, + * driver will clear the event bit before return. 
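+ * For clarity: the 2*tid+tidctrl arithmetic used below (and throughout
+ * this file) maps each (tid, tidctrl) pair to a distinct nonzero slot in
+ * the RB-tree item array, since the driver only hands back tidctrl
+ * values of 1 or 2. Sketch:
+ *
+ *   uint32_t info = tidc->tid_array[i];
+ *   uint32_t idx = 2 * IPS_TIDINFO_GET_TID(info)
+ *                  + IPS_TIDINFO_GET_TIDCTRL(info);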
+ */ + tidcnt = 0; + if (psmi_hal_get_tidcache_invalidation(tidc->context->psm_hw_ctxt, + (uint64_t) (uintptr_t) tidc->tid_array, + &tidcnt) < 0) { + /* If failed to get invalidation info, it's fatal error */ + err = psmi_handle_error(tidc->context->ep, + PSM2_EP_DEVICE_FAILURE, + "Failed to get invalidation info"); + return err; + } + psmi_assert(tidcnt > 0 && tidcnt <= tidc->tid_ctrl->tid_num_max); + + j = 0; + for (i = 0; i < tidcnt; i++) { + /* + * Driver only returns tidctrl=1 or tidctrl=2. + */ + idx = 2*IPS_TIDINFO_GET_TID(tidc->tid_array[i]) + + IPS_TIDINFO_GET_TIDCTRL(tidc->tid_array[i]); + psmi_assert(idx != 0); + psmi_assert(idx <= tidc->tid_ctrl->tid_num_max); + + /* + * sanity check. + */ + psmi_assert(p_map->root[idx].payload.tidinfo == tidc->tid_array[i]); + psmi_assert(LENGTH(idx) == + IPS_TIDINFO_GET_LENGTH(tidc->tid_array[i])); + + /* + * if the tid is already invalidated, ignore it, + * but do sanity check. + */ + if (INVALIDATE(idx) != 0) { + psmi_assert(REFCNT(idx) == 0); + continue; + } + + /* + * mark the tid invalidated. + */ + INVALIDATE(idx) = 1; + + /* + * if the tid is idle, remove the tid from RB tree + * and idle queue, put on free list. + */ + if (REFCNT(idx) == 0) { + IDLE_REMOVE(idx); + ips_cl_qmap_remove_item(p_map, &p_map->root[idx]); + + if (i != j) + tidc->tid_array[j] = tidc->tid_array[i]; + j++; + } + } + + if (j > 0) { + /* + * call driver to free the tids. + */ + if (psmi_hal_free_tid(tidc->context->psm_hw_ctxt, + (uint64_t) (uintptr_t) tidc->tid_array, j) < 0) { + /* If failed to unpin pages, it's fatal error */ + err = psmi_handle_error(tidc->context->ep, + PSM2_EP_DEVICE_FAILURE, + "Failed to tid free %d tids", j); + return err; + } + } + + return PSM2_OK; +} + +psm2_error_t +ips_tidcache_acquire(struct ips_tid *tidc, + const void *buf, uint32_t *length, + uint32_t *tid_array, uint32_t *tidcnt, + uint32_t *tidoff +#ifdef PSM_CUDA + , uint8_t is_cuda_ptr +#endif + ) +{ + cl_qmap_t *p_map = &tidc->tid_cachemap; + cl_map_item_t *p_item; + unsigned long start = (unsigned long)buf; + unsigned long end = start + (*length); + uint32_t idx, nbytes; + uint64_t event_mask; + psm2_error_t err; + + /* + * Before every tid caching search, we need to update the + * tid caching if there is invalidation event, otherwise, + * the cached address may be invalidated and we might have + * wrong matching. + */ + err = psmi_hal_get_hfi_event_bits(&event_mask,tidc->context->psm_hw_ctxt); + + if_pf (err) + return PSM2_INTERNAL_ERR; + + if (event_mask & PSM_HAL_HFI_EVENT_TID_MMU_NOTIFY) { + err = ips_tidcache_invalidation(tidc); + if (err) + return err; + } + + /* + * Now we can do matching from the caching, because obsolete + * address in caching has been removed or identified. + */ +retry: + p_item = ips_cl_qmap_search(p_map, start, end); + idx = 2*IPS_TIDINFO_GET_TID(p_item->payload.tidinfo) + + IPS_TIDINFO_GET_TIDCTRL(p_item->payload.tidinfo); + + /* + * There is tid matching. + */ + if (idx) { + /* + * if there is a caching match, but the tid has been + * invalidated, we can't match this tid, and we also + * can't register this address, we need to wait this + * tid to be freed. + */ + if (INVALIDATE(idx) != 0) + return PSM2_OK_NO_PROGRESS; + + /* + * if the page offset within the tid is not less than + * 128K, the address offset within the page is not 64B + * multiple, PSM can't handle this tid with any offset + * mode. We need to free this tid and re-register with + * the asked page address. 
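+ * Restated as a predicate (illustration; delta is start - START(idx)):
+ * the cached tid can be reused whenever
+ *
+ *   static inline int tid_offset_usable(uint64_t delta, uint32_t tidoff)
+ *   {
+ *       return delta < 131072 || (tidoff & 63) == 0;
+ *   }
+ *
+ * returns nonzero; the code below frees and re-registers otherwise.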
+ */ + if (((start - START(idx)) >= 131072) && ((*tidoff) & 63)) { + /* + * If the tid is currently used, retry later. + */ + if (REFCNT(idx) != 0) + return PSM2_OK_NO_PROGRESS; + + /* + * free this tid. + */ + tidc->tid_array[0] = p_map->root[idx].payload.tidinfo; + err = ips_tidcache_remove(tidc, 1); + if (err) + return err; + + /* try to match a node again */ + goto retry; + } + } + + /* + * If there is no match node, or 'start' falls out of node range, + * whole or partial buffer from 'start' is not registered yet. + */ + if (!idx || START(idx) > start) { + if (!idx) + nbytes = end - start; + else + nbytes = START(idx) - start; + + /* + * Because we don't have any match tid yet, if + * there is an error, we return from here, PSM + * will try later. + */ + err = ips_tidcache_register(tidc, start, nbytes, &idx +#ifdef PSM_CUDA + , is_cuda_ptr +#endif + ); + if (err) + return err; + } + + /* + * sanity check. + */ + psmi_assert(START(idx) <= start); + psmi_assert(INVALIDATE(idx) == 0); + + *tidoff += start - START(idx); + *tidcnt = 1; + + tid_array[0] = p_map->root[idx].payload.tidinfo; + REFCNT(idx)++; + if (REFCNT(idx) == 1) + IDLE_REMOVE(idx); + start = END(idx); + + while (start < end) { + p_item = ips_cl_qmap_successor(p_map, &p_map->root[idx]); + idx = 2*IPS_TIDINFO_GET_TID(p_item->payload.tidinfo) + + IPS_TIDINFO_GET_TIDCTRL(p_item->payload.tidinfo); + if (!idx || START(idx) != start) { + if (!idx) + nbytes = end - start; + else + nbytes = (START(idx) > end) ? + (end - start) : + (START(idx) - start); + + /* + * Because we already have at least one match tid, + * if it is error to register new pages, we break + * here and return the tids we already have. + */ + err = ips_tidcache_register(tidc, start, nbytes, &idx +#ifdef PSM_CUDA + , is_cuda_ptr +#endif + ); + if (err) + break; + } else if (INVALIDATE(idx) != 0) { + /* + * the tid has been invalidated, it is still in + * caching because it is still being used, but + * any new usage is not allowed, we ignore it and + * return the tids we already have. + */ + psmi_assert(REFCNT(idx) != 0); + break; + } + + /* + * sanity check. + */ + psmi_assert(START(idx) == start); + psmi_assert(INVALIDATE(idx) == 0); + + tid_array[(*tidcnt)++] = p_map->root[idx].payload.tidinfo; + REFCNT(idx)++; + if (REFCNT(idx) == 1) + IDLE_REMOVE(idx); + start = END(idx); + } + + if (start < end) + *length = start - (unsigned long)buf; + /* otherwise, all pages are registered */ + psmi_assert((*tidcnt) > 0); + + return PSM2_OK; +} + +psm2_error_t +ips_tidcache_release(struct ips_tid *tidc, + uint32_t *tid_array, uint32_t tidcnt) +{ + cl_qmap_t *p_map = &tidc->tid_cachemap; + uint32_t i, j, idx; + psm2_error_t err; + + psmi_assert(tidcnt > 0); + + j = 0; + for (i = 0; i < tidcnt; i++) { + /* + * Driver only returns tidctrl=1 or tidctrl=2. + */ + idx = 2*IPS_TIDINFO_GET_TID(tid_array[i]) + + IPS_TIDINFO_GET_TIDCTRL(tid_array[i]); + psmi_assert(idx != 0); + psmi_assert(idx <= tidc->tid_ctrl->tid_num_max); + psmi_assert(REFCNT(idx) != 0); + + REFCNT(idx)--; + if (REFCNT(idx) == 0) { + if (INVALIDATE(idx) != 0) { + ips_cl_qmap_remove_item(p_map, &p_map->root[idx]); + + tidc->tid_array[j] = tid_array[i]; + j++; + } else { + IDLE_INSERT(idx); + } + } + } + + if (j > 0) { + /* + * call driver to free the tids. 
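+ * Only tids whose last reference dropped while already invalidated are
+ * in this batch. The loop above is, roughly:
+ *
+ *   if (--REFCNT(idx) == 0) {
+ *       if (INVALIDATE(idx))
+ *           tidc->tid_array[j++] = tid_array[i];   queued to free here
+ *       else
+ *           IDLE_INSERT(idx);                      parked as idle
+ *   }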
+ */ + if (psmi_hal_free_tid(tidc->context->psm_hw_ctxt, + (uint64_t) (uintptr_t) tidc->tid_array, j) < 0) { + /* If failed to unpin pages, it's fatal error */ + err = psmi_handle_error(tidc->context->ep, + PSM2_EP_DEVICE_FAILURE, + "Failed to tid free %d tids", j); + return err; + } + } + + return PSM2_OK; +} + +/* + * + * Call driver to free all cached tids. + */ +psm2_error_t +ips_tidcache_cleanup(struct ips_tid *tidc) +{ + cl_qmap_t *p_map = &tidc->tid_cachemap; + psm2_error_t err; + int i, j; + + j = 0; + for (i = 1; i <= tidc->tid_ctrl->tid_num_max; i++) { + psmi_assert(REFCNT(i) == 0); + if (INVALIDATE(i) == 0) { + tidc->tid_array[j++] = p_map->root[i].payload.tidinfo; + } + } + + if (j > 0) { + /* + * call driver to free the tids. + */ + if (psmi_hal_free_tid(tidc->context->psm_hw_ctxt, + (uint64_t) (uintptr_t) tidc->tid_array, j) < 0) { + /* If failed to unpin pages, it's fatal error */ + err = psmi_handle_error(tidc->context->ep, + PSM2_EP_DEVICE_FAILURE, + "Failed to tid free %d tids", j); + return err; + } + } + + psmi_free(tidc->tid_array); + psmi_free(tidc->tid_cachemap.root); + + return PSM2_OK; +} + + +/* Note that the caller is responsible for making sure that NIDLE is non-zero + before calling ips_tidcache_evict. If NIDLE is 0 at the time of call, + ips_tidcache_evict is unstable. + */ +uint64_t +ips_tidcache_evict(struct ips_tid *tidc,uint64_t length) +{ + cl_qmap_t *p_map = &tidc->tid_cachemap; + uint32_t idx = IHEAD, tidcnt = 0, tidlen = 0; + /* + * try to free the required + * pages from idle queue tids + */ + + do { + idx = IPREV(idx); + psmi_assert(idx != 0); + tidc->tid_array[tidcnt] = + p_map->root[idx].payload.tidinfo; + tidcnt++; + + tidlen += IPS_TIDINFO_GET_LENGTH + (p_map->root[idx].payload.tidinfo)<<12; + } while (tidcnt < NIDLE && tidlen < length); + + /* + * free the selected tids on successfully finding some:. + */ + if (tidcnt > 0 && ips_tidcache_remove(tidc, tidcnt)) + return 0; + + return tidlen; +} diff --git a/ptl_ips/ips_tidcache.h b/ptl_ips/ips_tidcache.h new file mode 100644 index 0000000..20d45bf --- /dev/null +++ b/ptl_ips/ips_tidcache.h @@ -0,0 +1,158 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef _IPS_TIDCACHE_H
+#define _IPS_TIDCACHE_H
+
+#include
+#include
+#include
+#include
+
+/*
+ * Design notes.
+ *
+ * PSM needs to call into the driver to program receive-buffer pages into
+ * HFI gen1 hardware; each tid can be programmed with physically
+ * contiguous, power-of-two page runs, from 1 page to 512 pages. This
+ * procedure takes time.
+ *
+ * Many applications tend to re-use the same receive buffer, so caching
+ * the programmed tids in the user-space process saves that time and
+ * improves application performance.
+ *
+ * This PSM tid registration caching design requires cooperation between
+ * PSM and the driver. Here is what happens between PSM and the driver:
+ *
+ * 1. PSM calls into the driver with a chunk of buffer, giving its
+ *    virtual address and length.
+ * 2. The driver pins the buffer pages, programs the hardware with the
+ *    physical pages, and gets a list of tids.
+ * 3. The driver caches the tids with the corresponding virtual address
+ *    in user space for each tid, and returns the list of tids to PSM.
+ * 4. PSM also caches the list of tids with the corresponding virtual
+ *    address for each tid, and uses the list of tids for transmission.
+ * 5. When a process frees a buffer, the kernel VM catches the event and
+ *    calls the callback in the driver to notify it that the virtual
+ *    address range is gone in the process.
+ * 6. The driver searches its cache system, finds the tids with the
+ *    removed virtual address, puts these tids in an invalidation queue,
+ *    and notifies PSM of the event.
+ * 7. PSM picks up the event and removes the tids from its own cache
+ *    as well.
+ * 8. PSM must check for such invalidation events every time before it
+ *    searches its caching system to match tids for a 'new' buffer chunk.
+ * 9. When the caching system is full and a new buffer chunk is asked to
+ *    be registered, PSM picks a victim to remove.
+ */
+
+typedef struct
+{
+ unsigned long start; /* start virtual address */
+ uint32_t tidinfo; /* tid encoding */
+ uint16_t length; /* length in pages */
+ uint16_t invalidate; /* invalidate flag */
+ uint16_t refcount; /* usage reference count */
+ uint16_t i_prev; /* idle queue previous */
+ uint16_t i_next; /* idle queue next */
+} rbtree_tidcache_mapitem_pl_t;
+
+typedef struct {
+ uint32_t ntid; /* number of cached tids */
+ uint32_t nidle; /* number of idle tids */
+} rbtree_tidcache_map_pl_t;
+
+#define RBTREE_MI_PL rbtree_tidcache_mapitem_pl_t
+#define RBTREE_MAP_PL rbtree_tidcache_map_pl_t
+
+#include "rbtree.h"
+
+/*
+ * Macro definitions for easy programming.
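+ *
+ * The idle queue is a circular doubly-linked list threaded through the
+ * RB-tree item array, with slot 0 (IHEAD) as sentinel: IDLE_INSERT()
+ * pushes at the head, so eviction walking IPREV() from IHEAD reclaims
+ * the least recently idled tids first. A standalone sketch of the same
+ * scheme (hypothetical array-based version, for illustration):
+ *
+ *   uint16_t inext[N], iprev[N];          slot 0 is the sentinel
+ *   inext[0] = iprev[0] = 0;              empty list
+ *   insert slot x at the head:
+ *     inext[x] = inext[0]; iprev[x] = 0;
+ *     iprev[inext[0]] = x; inext[0] = x;
+ *   least recently idled slot: iprev[0]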
+ */ + +#define NTID p_map->payload.ntid +#define REFCNT(x) p_map->root[x].payload.refcount +#define INVALIDATE(x) p_map->root[x].payload.invalidate + +#define LENGTH(x) p_map->root[x].payload.length +#define START(x) p_map->root[x].payload.start +#define END(x) (START(x) + (LENGTH(x)<<12)) + +/* + * Macro for idle tid queue management. + */ +#define NIDLE p_map->payload.nidle +#define IHEAD 0 +#define INEXT(x) p_map->root[x].payload.i_next +#define IPREV(x) p_map->root[x].payload.i_prev + +#define IDLE_REMOVE(x) do { \ + INEXT(IPREV(x)) = INEXT(x); \ + IPREV(INEXT(x)) = IPREV(x); \ + NIDLE--; \ + } while (0) + +#define IDLE_INSERT(x) do { \ + INEXT(x) = INEXT(IHEAD); \ + IPREV(x) = IHEAD; \ + IPREV(INEXT(IHEAD)) = x; \ + INEXT(IHEAD) = x; \ + NIDLE++; \ + } while (0) + +extern void ips_tidcache_map_init(cl_qmap_t *p_map, + cl_map_item_t* const root, + cl_map_item_t* const nil_item); + +#endif diff --git a/ptl_ips/ips_tidflow.c b/ptl_ips/ips_tidflow.c new file mode 100644 index 0000000..24556e4 --- /dev/null +++ b/ptl_ips/ips_tidflow.c @@ -0,0 +1,268 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. 
*/ + +#include "psm_user.h" +#include "psm2_hal.h" +#include "ips_proto.h" +#include "ips_expected_proto.h" +#include "ips_tidflow.h" + +psm2_error_t ips_tf_init(struct ips_protoexp *protoexp, + const psmi_context_t *context, + struct ips_tf *tfc, + ips_tf_avail_cb_fn_t cb) +{ + int tf_idx; + +#if TF_ADD + struct psmi_stats_entry entries[] = { + PSMI_STATS_DECL("tidflow update count", + MPSPAWN_STATS_REDUCTION_ALL, + NULL, &tfc->tf_num_total), + }; +#endif + + tfc->context = context; + tfc->tf_num_total = 0; + tfc->tf_num_inuse = 0; + tfc->tf_avail_cb = cb; + tfc->tf_avail_context = (void *)protoexp; + if (psmi_hal_has_cap(PSM_HAL_CAP_EXTENDED_PSN)) { + tfc->tf_gen_mask = 0xFFFFF; + } else { + tfc->tf_gen_mask = 0x1FFF; + } + + /* Allocate and Initialize tidrecvc array. */ + tfc->tidrecvc = (struct ips_tid_recv_desc *) + psmi_calloc(context->ep, UNDEFINED, 1, + sizeof(struct ips_tid_recv_desc)*HFI_TF_NFLOWS); + if (tfc->tidrecvc == NULL) + return PSM2_NO_MEMORY; + + for (tf_idx = 0; tf_idx < HFI_TF_NFLOWS; tf_idx++) { + tfc->tidrecvc[tf_idx].context = context; + tfc->tidrecvc[tf_idx].protoexp = protoexp; + tfc->tidrecvc[tf_idx].rdescid._desc_idx = tf_idx; + tfc->tidrecvc[tf_idx].rdescid._desc_genc = tf_idx; + tfc->tidrecvc[tf_idx].tidflow.flowid = EP_FLOW_TIDFLOW; + tfc->tidrecvc[tf_idx].tidflow.frag_size = protoexp->proto->epinfo.ep_mtu; + } + + /* Shared control structure, it will be in shared memory + * for context sharing, otherwise calloc() it */ + tfc->tf_ctrl = (struct ips_tf_ctrl *)context->tf_ctrl; + if (!tfc->tf_ctrl) { + tfc->tf_ctrl = (struct ips_tf_ctrl *) + psmi_calloc(context->ep, UNDEFINED, 1, + sizeof(struct ips_tf_ctrl)); + if (tfc->tf_ctrl == NULL) { + return PSM2_NO_MEMORY; + } + } + + /* + * Only the master process can initialize. 
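+ * Each tidflow index carries a generation so that packets from an
+ * earlier use of the same index can be recognized as stale. The
+ * generation allocation later in this file advances next_gen modulo
+ * tf_gen_mask, i.e. it is equivalent to this sketch:
+ *
+ *   *tfgen = entry->next_gen;
+ *   entry->next_gen = (entry->next_gen + 1) % tfc->tf_gen_mask;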
+ */ + if (psmi_hal_get_subctxt(context->psm_hw_ctxt) == 0) { + pthread_spin_init(&tfc->tf_ctrl->tf_ctrl_lock, + PTHREAD_PROCESS_SHARED); + tfc->tf_ctrl->tf_num_max = HFI_TF_NFLOWS; + tfc->tf_ctrl->tf_num_avail = HFI_TF_NFLOWS; + + for (tf_idx = 0; tf_idx < HFI_TF_NFLOWS; tf_idx++) { + /* Update flow state */ + tfc->tf_ctrl->tf[tf_idx].state = TF_STATE_DEALLOCATED; + tfc->tf_ctrl->tf[tf_idx].tf_idx = tf_idx; + tfc->tf_ctrl->tf[tf_idx].next_gen = 0; + tfc->tf_ctrl->tf[tf_idx].next_free = tf_idx + 1; + + psmi_hal_tidflow_reset(tfc->context->psm_hw_ctxt, tf_idx, + tfc->tf_gen_mask, 0x7FF); + } + tfc->tf_ctrl->tf_head = 0; + } + +#if TF_ADD + /* TF_ADD: Add a new stats type for tid flows in psm_stats.h */ + return psmi_stats_register_type(PSMI_STATS_NO_HEADING, + PSMI_STATSTYPE_TIDS, + entries, + PSMI_STATS_HOWMANY(entries), tidc); +#else + return PSM2_OK; +#endif +} + +psm2_error_t ips_tf_fini(struct ips_tf *tfc) +{ + if (!tfc->context->tf_ctrl) + psmi_free(tfc->tf_ctrl); + psmi_free(tfc->tidrecvc); + return PSM2_OK; +} + +/* Allocate a tidflow */ +psm2_error_t ips_tf_allocate(struct ips_tf *tfc, + struct ips_tid_recv_desc **tidrecvc) +{ + struct ips_tf_ctrl *ctrl = tfc->tf_ctrl; + struct ips_tf_entry *entry; + + if (tfc->context->tf_ctrl) + pthread_spin_lock(&ctrl->tf_ctrl_lock); + + if (!ctrl->tf_num_avail) { + psmi_assert(ctrl->tf_head == HFI_TF_NFLOWS); + *tidrecvc = NULL; + + if (tfc->context->tf_ctrl) + pthread_spin_unlock(&ctrl->tf_ctrl_lock); + + return PSM2_EP_NO_RESOURCES; + } + + entry = &ctrl->tf[ctrl->tf_head]; + ctrl->tf_head = entry->next_free; + ctrl->tf_num_avail--; + + if (tfc->context->tf_ctrl) + pthread_spin_unlock(&ctrl->tf_ctrl_lock); + + tfc->tf_num_total++; + tfc->tf_num_inuse++; + + psmi_assert(entry->state == TF_STATE_DEALLOCATED); + entry->state = TF_STATE_ALLOCATED; + + *tidrecvc = &(tfc->tidrecvc[entry->tf_idx]); + /* initial tidflow generation */ + (*tidrecvc)->tidflow_active_gen = entry->next_gen; + + psmi_assert((*tidrecvc)->rdescid._desc_idx == entry->tf_idx); + psmi_assert_always(entry->next_gen < tfc->tf_gen_mask); + + entry->next_gen++; + if (entry->next_gen == tfc->tf_gen_mask) + entry->next_gen = 0; + + return PSM2_OK; +} + +/* Deallocate a tidflow */ +psm2_error_t ips_tf_deallocate(struct ips_tf *tfc, uint32_t tf_idx) +{ + struct ips_tf_ctrl *ctrl = tfc->tf_ctrl; + struct ips_tf_entry *entry; + + psmi_assert(tf_idx < HFI_TF_NFLOWS); + psmi_assert(tf_idx >= 0); + + entry = &ctrl->tf[tf_idx]; + psmi_assert(entry->state == TF_STATE_ALLOCATED); + entry->state = TF_STATE_DEALLOCATED; + + /* + * The wire protocol only uses 16bits tidrecvc generation + * count in exptid packet, this should be bigger enough, + * u16w3 is the lower 16bits of _desc_genc + */ + tfc->tidrecvc[tf_idx].rdescid.u16w3++; + + /* Mark invalid generation for flow (stale packets will be dropped) */ + psmi_hal_tidflow_reset(tfc->context->psm_hw_ctxt, tf_idx, + tfc->tf_gen_mask, 0x7FF); + + if (tfc->context->tf_ctrl) + pthread_spin_lock(&ctrl->tf_ctrl_lock); + + entry->next_free = ctrl->tf_head; + ctrl->tf_head = tf_idx; + ctrl->tf_num_avail++; + + if (tfc->context->tf_ctrl) + pthread_spin_unlock(&ctrl->tf_ctrl_lock); + + tfc->tf_num_inuse--; + /* If an available callback is registered invoke it */ + if (((tfc->tf_num_inuse + 1) == ctrl->tf_num_max) && tfc->tf_avail_cb) + tfc->tf_avail_cb(tfc, tfc->tf_avail_context); + + return PSM2_OK; +} + +/* Allocate a generation for a flow */ +psm2_error_t ips_tfgen_allocate(struct ips_tf *tfc, + uint32_t tf_idx, uint32_t *tfgen) +{ + struct ips_tf_entry 
*entry; + int ret = PSM2_OK; + + psmi_assert(tf_idx < HFI_TF_NFLOWS); + psmi_assert(tf_idx >= 0); + + entry = &tfc->tf_ctrl->tf[tf_idx]; + psmi_assert(entry->state == TF_STATE_ALLOCATED); + + *tfgen = entry->next_gen; + + entry->next_gen++; + if (entry->next_gen == tfc->tf_gen_mask) + entry->next_gen = 0; + + psmi_assert_always(*tfgen < tfc->tf_gen_mask); + + return ret; +} diff --git a/ptl_ips/ips_tidflow.h b/ptl_ips/ips_tidflow.h new file mode 100644 index 0000000..f942ab3 --- /dev/null +++ b/ptl_ips/ips_tidflow.h @@ -0,0 +1,129 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. 
*/ + +#ifndef _IPS_TIDFLOW_H +#define _IPS_TIDFLOW_H + +#include "psm_user.h" + +struct ips_tf; +struct ips_protoexp; + +typedef void (*ips_tf_avail_cb_fn_t) (struct ips_tf *, void *context); +typedef enum { + TF_STATE_INVALID = 0, + TF_STATE_ALLOCATED = 1, + TF_STATE_DEALLOCATED = 2 +} tf_state_t; + +struct ips_tf_entry { + tf_state_t state; + uint32_t tf_idx; + uint32_t next_gen; + uint32_t next_free; +}; + +struct ips_tf_ctrl { + pthread_spinlock_t tf_ctrl_lock; + uint32_t tf_num_max; + uint32_t tf_num_avail; + uint32_t tf_head; + struct ips_tf_entry tf[HFI_TF_NFLOWS]; +} __attribute__ ((aligned(64))); + +struct ips_tf { + const psmi_context_t *context; + ips_tf_avail_cb_fn_t tf_avail_cb; + void *tf_avail_context; + struct ips_tf_ctrl *tf_ctrl; + + uint64_t tf_num_total; + uint32_t tf_num_inuse; + uint32_t tf_gen_mask; + + /* Pointer to array of size HFI_TF_NFLOWS */ + struct ips_tid_recv_desc *tidrecvc; +}; + +PSMI_ALWAYS_INLINE(int ips_tf_available(struct ips_tf *tf)) +{ + if (tf->tf_ctrl->tf_num_avail == 0) { + if (tf->tf_ctrl->tf_num_max == tf->tf_num_inuse) + return -1; + else + return 0; + } + + return tf->tf_ctrl->tf_num_avail; +} + +psm2_error_t ips_tf_init(struct ips_protoexp *protoexp, + const psmi_context_t *context, + struct ips_tf *tfc, + ips_tf_avail_cb_fn_t cb); +psm2_error_t ips_tf_fini(struct ips_tf *tfc); + +/* Allocate a tidflow */ +psm2_error_t ips_tf_allocate(struct ips_tf *tfc, + struct ips_tid_recv_desc **tidrecvc); + +/* Deallocate a tidflow */ +psm2_error_t ips_tf_deallocate(struct ips_tf *tfc, uint32_t tf_idx); + +/* Allocate a generation for a flow */ +psm2_error_t ips_tfgen_allocate(struct ips_tf *tfc, + uint32_t tf_idx, uint32_t *tfgen); + +#endif diff --git a/ptl_ips/ips_writehdrq.c b/ptl_ips/ips_writehdrq.c new file mode 100644 index 0000000..fbcf8d6 --- /dev/null +++ b/ptl_ips/ips_writehdrq.c @@ -0,0 +1,78 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm2_hal.h" +#include "ips_writehdrq.h" +#include "ips_proto_params.h" + +psm2_error_t +ips_writehdrq_init(const psmi_context_t *context, + struct ips_writehdrq *writeq, + struct ips_writehdrq_state *state, + uint32_t subcontext) +{ + uint32_t elemsz = psmi_hal_get_rx_hdr_q_ent_size(context->psm_hw_ctxt), + elemcnt = psmi_hal_get_rx_hdr_q_cnt(context->psm_hw_ctxt); + + memset(writeq, 0, sizeof(*writeq)); + writeq->context = context; + writeq->state = state; + writeq->hdrq_elemlast = (elemcnt - 1) * (elemsz >> BYTE2DWORD_SHIFT); + + writeq->state->enabled = 1; + return PSM2_OK; +} + diff --git a/ptl_ips/ips_writehdrq.h b/ptl_ips/ips_writehdrq.h new file mode 100644 index 0000000..20819cc --- /dev/null +++ b/ptl_ips/ips_writehdrq.h @@ -0,0 +1,88 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _IPS_WRITEHDRQ_H +#define _IPS_WRITEHDRQ_H + +#include "psm_user.h" +#include "ips_recvq.h" + +/* + * Structure containing state for writehdrq writing. This is logically + * part of ips_writehdrq but needs to be separated out for context + * sharing so that it can be put in a shared memory page and hence + * be available to all processes sharing the port. Generally, do not + * put pointers in here since the address map of each process can be + * different. + */ +struct ips_writehdrq_state { + uint32_t hdrq_rhf_seq; /* last seq */ + uint32_t egrq_offset; /* in bytes unit, not 64B */ + uint32_t enabled; /* enables writing */ +}; + +struct ips_writehdrq { + const psmi_context_t *context; + struct ips_writehdrq_state *state; + uint32_t hdrq_elemlast; +}; + +psm2_error_t +ips_writehdrq_init(const psmi_context_t *context, + struct ips_writehdrq *writeq, + struct ips_writehdrq_state *state, + uint32_t subcontext); + +#endif /* _IPS_WRITEHDRQ_H */ diff --git a/ptl_ips/ptl.c b/ptl_ips/ptl.c new file mode 100644 index 0000000..39b5631 --- /dev/null +++ b/ptl_ips/ptl.c @@ -0,0 +1,867 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +/* This file implements the PSM PTL for ips */ +#include "psm_user.h" +#include "psm2_hal.h" +#include "ptl_ips.h" +#include "psm_mq_internal.h" + +int ips_ptl_recvq_isempty(const struct ptl *ptl); + +static +int +ips_subcontext_ignore(struct ips_recvhdrq_event *rcv_ev, + uint32_t subcontext) +{ + return IPS_RECVHDRQ_CONTINUE; +} + +static +int +ips_subcontext_process(struct ips_recvhdrq_event *rcv_ev, + uint32_t subcontext) +{ + struct ptl_shared *recvshc = ((struct ptl_ips *)(rcv_ev->proto->ptl))->recvshc; + if_pt(subcontext != recvshc->subcontext && + subcontext < recvshc->subcontext_cnt) { + return psmi_hal_forward_packet_to_subcontext(&recvshc->writeq[subcontext], + rcv_ev, subcontext, + rcv_ev->recvq->context->psm_hw_ctxt); + } + else { + _HFI_VDBG + ("Drop pkt for subcontext %d out of %d (I am %d) : errors 0x%x\n", + (int)subcontext, (int)recvshc->subcontext_cnt, + (int)recvshc->subcontext, psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf)); + return IPS_RECVHDRQ_BREAK; + } +} + +static psm2_error_t shrecvq_init(ptl_t *ptl, const psmi_context_t *context); +static psm2_error_t shrecvq_fini(ptl_t *ptl); + +static size_t ips_ptl_sizeof(void) +{ + return sizeof(struct ptl_ips); +} + +static +int ips_ptl_epaddr_stats_num(void) +{ + return sizeof(struct ips_proto_epaddr_stats) / sizeof(uint64_t); +} + +static +int ips_ptl_epaddr_stats_init(char **desc, uint16_t *flags) +{ + int num_stats = + sizeof(struct ips_proto_epaddr_stats) / sizeof(uint64_t); + int i; + + /* All stats are uint64_t */ + for (i = 0; i < num_stats; i++) + flags[i] = MPSPAWN_STATS_REDUCTION_ALL | + MPSPAWN_STATS_SKIP_IF_ZERO; + + desc[0] = "errchecks sent"; + desc[1] = "errchecks recv"; + desc[2] = "naks sent"; + desc[3] = "naks recv"; + desc[4] = "connect reqs sent"; + desc[5] = "disconnect reqs sent"; + desc[6] = "tid grants sent"; + desc[7] = "tid grants recv"; + desc[8] = "send rexmit"; + desc[9] = "congestion packets"; + + return num_stats; +} + +int ips_ptl_epaddr_stats_get(psm2_epaddr_t epaddr, uint64_t *stats_o) +{ + int i, num_stats = + sizeof(struct ips_proto_epaddr_stats) / sizeof(uint64_t); + uint64_t *stats_i = (uint64_t *) &epaddr->proto->epaddr_stats; + + for (i = 0; i < num_stats; i++) + stats_o[i] = stats_i[i]; + + return num_stats; +} + +static +psm2_error_t +psmi_context_check_status_callback(struct psmi_timer *t, uint64_t current) +{ + struct ptl_ips *ptl = (struct ptl_ips *)t->context; + const uint64_t current_count = get_cycles(); + psm2_error_t err; + + err = psmi_context_check_status(ptl->context); + if (err == PSM2_OK || err == PSM2_OK_NO_PROGRESS) + { + int rc = psmi_hal_spio_process_events((struct ptl *)ptl); + err = rc >= 0 ? 
PSM2_OK : PSM2_INTERNAL_ERR; + } + psmi_timer_request_always(&ptl->timerq, &ptl->status_timer, + current_count + ptl->status_cyc_timeout); + + return err; +} + +static +psm2_error_t ips_ptl_init(const psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) +{ + struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; + psm2_error_t err = PSM2_OK; + uint32_t num_of_send_bufs = ep->hfi_num_sendbufs; + uint32_t num_of_send_desc = ep->hfi_num_descriptors; + uint32_t imm_size = ep->hfi_imm_size; + const psmi_context_t *context = &ep->context; + const int enable_shcontexts = (psmi_hal_get_subctxt_cnt(context->psm_hw_ctxt) > 0); + const uint64_t current_count = get_cycles(); + + /* Preconditions */ + psmi_assert_always(ep != NULL); + psmi_assert_always(ep->epaddr != NULL); + psmi_assert_always(ep->epid != 0); + psmi_assert_always(ep->hfi_num_sendbufs > 0); + + memset(ptl, 0, sizeof(struct ptl_ips)); + + ptl->ep = ep; /* back pointer */ + ptl->epid = ep->epid; /* cache epid */ + ptl->epaddr = ep->epaddr; /* cache a copy */ + ptl->ctl = ctl; + ptl->context = context; + + memset(ctl, 0, sizeof(*ctl)); + /* Fill in the control structure */ + ctl->ep = ep; + ctl->ptl = ptl_gen; + ctl->ep_poll = enable_shcontexts ? ips_ptl_shared_poll : ips_ptl_poll; + ctl->ep_connect = ips_ptl_connect; + ctl->ep_disconnect = ips_ptl_disconnect; + ctl->mq_send = ips_proto_mq_send; + ctl->mq_isend = ips_proto_mq_isend; + + ctl->am_get_parameters = ips_am_get_parameters; + + ctl->am_short_request = ips_am_short_request; + ctl->am_short_reply = ips_am_short_reply; + + ctl->epaddr_stats_num = ips_ptl_epaddr_stats_num; + ctl->epaddr_stats_init = ips_ptl_epaddr_stats_init; + ctl->epaddr_stats_get = ips_ptl_epaddr_stats_get; + + ctl->msg_size_thresh_query = ips_proto_msg_size_thresh_query; + + /* + * Runtime flags in 'ptl' are different from runtime flags in 'context'. + * In 'context', runtime flags reflect what the driver is capable of. + * In 'ptl', runtime flags reflect the features we can or want to use in + * the driver's supported runtime flags. + */ + + /* + * This timer is to be used to check the context's status at every + * PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS. This is useful to detect when + * the link transitions from the DOWN state to the UP state. We can thus + * stop aggregating link failure messages once we detect that the link is + * up. + */ + psmi_timer_entry_init(&ptl->status_timer, + psmi_context_check_status_callback, ptl); + + /* cache the context's status timeout in cycles */ + ptl->status_cyc_timeout = + ms_2_cycles(PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS); + + /* + * Retransmissions and pending operations are kept in a timer structure + * (queue). The timerq is shared to various internal IPS interfaces so + * that they too may schedule events on the timer queue. The timerq is + * drained in the progress function. + */ + if ((err = psmi_timer_init(&ptl->timerq))) + goto fail; + + /* start the context's status timer */ + psmi_timer_request_always(&ptl->timerq, &ptl->status_timer, + current_count + ptl->status_cyc_timeout); + + /* + * Epstate maps endpoint ids (epid integers) to ipsaddr (structs). Mappings + * are added/removed by the connect portion of the ips protocol and lookup + * is made by the receive queue processing component. + */ + if ((err = ips_epstate_init(&ptl->epstate, context))) + goto fail; + + /* + * Context sharing, setup subcontext ureg page. 
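+	 *
+	 * The ureg pages live in shared memory so that every process attached
+	 * to the same hardware context can update a peer's software header
+	 * queue state.  As a rough sketch (not the literal HAL code; wq, sc
+	 * and hw_ctxt are shorthand), forwarding a packet that belongs to
+	 * peer subcontext sc comes down to:
+	 *
+	 *   struct ips_writehdrq *wq = &recvshc->writeq[sc];
+	 *   psmi_hal_forward_packet_to_subcontext(wq, rcv_ev, sc, hw_ctxt);
+	 *
+	 * which is the path taken by ips_subcontext_process() above.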
+ */
+	if (enable_shcontexts) {
+		struct ptl_shared *recvshc;
+
+		recvshc = (struct ptl_shared *)
+		    psmi_calloc(ep, UNDEFINED, 1, sizeof(struct ptl_shared));
+		if (recvshc == NULL) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+
+		ptl->recvshc = recvshc;
+		recvshc->ptl = ptl_gen;
+
+		/* Initialize recvshc fields */
+		recvshc->context = psmi_hal_get_context(context->psm_hw_ctxt);
+		recvshc->subcontext = psmi_hal_get_subctxt(context->psm_hw_ctxt);
+		recvshc->subcontext_cnt = psmi_hal_get_subctxt_cnt(context->psm_hw_ctxt);
+		psmi_assert_always(recvshc->subcontext_cnt <=
+				   PSM_HAL_MAX_SHARED_CTXTS);
+		psmi_assert_always(recvshc->subcontext <
+				   recvshc->subcontext_cnt);
+
+		/*
+		 * Using ep->context to avoid const modifier since this function
+		 * will modify the content in ep->context.
+		 */
+		if ((err = psmi_hal_subcontext_ureg_get(ptl_gen,
+				recvshc->subcontext_ureg, context->psm_hw_ctxt)))
+			goto fail;
+
+		/* Note that the GEN1 HAL instance initializes struct ips_subcontext_ureg
+		   during context open. */
+
+		recvshc->context_lock = &recvshc->hwcontext_ctrl->context_lock;
+		if (recvshc->subcontext == 0) {
+			if (pthread_spin_init(recvshc->context_lock,
+					      PTHREAD_PROCESS_SHARED) != 0) {
+				err =
+				    psmi_handle_error(ptl->ep,
+						      PSM2_EP_DEVICE_FAILURE,
+						      "Couldn't initialize process-shared spin lock");
+				goto fail;
+			}
+		}
+	}
+	/*
+	 * Hardware send pio used by eager and control messages.
+	 */
+	if ((err = psmi_hal_spio_init(context, ptl_gen, &ptl->spioc)))
+		goto fail;
+
+	/*
+	 * Actual ips protocol handling.
+	 */
+	if ((err =
+	     ips_proto_init(context, ptl_gen, num_of_send_bufs, num_of_send_desc,
+			    imm_size, &ptl->timerq, &ptl->epstate, ptl->spioc,
+			    &ptl->proto)))
+		goto fail;
+
+	/*
+	 * Hardware receive hdr/egr queue, services incoming packets and issues
+	 * callbacks for protocol handling in proto_recv. It uses the epstate
+	 * interface to determine if a packet is known or unknown.
+	 */
+	if (!enable_shcontexts) {
+		struct ips_recvhdrq_callbacks recvq_callbacks;
+		recvq_callbacks.callback_packet_unknown =
+		    ips_proto_process_unknown;
+		recvq_callbacks.callback_subcontext = ips_subcontext_ignore;
+		recvq_callbacks.callback_error = ips_proto_process_packet_error;
+		if ((err =
+		     ips_recvhdrq_init(context, &ptl->epstate, &ptl->proto,
+				       &recvq_callbacks,
+				       0, &ptl->recvq,
+				       &ptl->recvq_state,
+				       PSM_HAL_CL_Q_RX_HDR_Q)))
+			goto fail;
+	}
+	/*
+	 * Software receive hdr/egr queue, used in shared contexts.
+	 */
+	else if ((err = shrecvq_init(ptl_gen, context)))
+		goto fail;
+
+	/*
+	 * Receive thread, always initialized but does not necessarily create
+	 * a pthread.
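+	 * Whether a pthread is really spawned is decided later, in
+	 * ips_ptl_rcvthread_init(); in sketch form, the gate used there is:
+	 *
+	 *   if (psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD) &&
+	 *       !psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED))
+	 *           pthread_create(...);  -- otherwise only stats are registered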
+ */ + if ((err = ips_ptl_rcvthread_init(ptl_gen, &ptl->recvq))) + goto fail; +fail: + return err; +} + +static psm2_error_t ips_ptl_fini(ptl_t *ptl_gen, int force, uint64_t timeout_in) +{ + struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; + psm2_error_t err = PSM2_OK; + const int enable_shcontexts = (psmi_hal_get_subctxt_cnt(ptl->context->psm_hw_ctxt) > 0); + + if ((err = ips_proto_fini(&ptl->proto, force, timeout_in))) + goto fail; + + /* We have to cancel the thread after terminating the protocol because + * connect/disconnect packets use interrupts and the kernel doesn't + * like to have no pollers waiting */ + if ((err = ips_ptl_rcvthread_fini(ptl_gen))) + goto fail; + + if ((err = ips_epstate_fini(&ptl->epstate))) + goto fail; + + if ((err = psmi_hal_spio_fini(&ptl->spioc, ptl->context->psm_hw_ctxt))) + goto fail; + + if ((err = psmi_timer_fini(&ptl->timerq))) + goto fail; + + + if (enable_shcontexts && (err = shrecvq_fini(ptl_gen))) + goto fail; + +fail: + return err; +} + +static +psm2_error_t +ips_ptl_optctl(const void *core_obj, int optname, + void *optval, uint64_t *optlen, int get) +{ + psm2_error_t err = PSM2_OK; + + switch (optname) { + case PSM2_IB_OPT_EP_SL: + { + /* Core object is psm2_epaddr */ + psm2_epaddr_t epaddr = (psm2_epaddr_t) core_obj; + ips_epaddr_t *ipsaddr = (ips_epaddr_t *) epaddr; + + /* If endpoint does not use IB ignore for set, complain for get */ + if (epaddr->ptlctl->ep_connect != ips_ptl_connect) { + if (get) + err = + psmi_handle_error(PSMI_EP_LOGEVENT, + PSM2_PARAM_ERR, + "Invalid EP transport"); + goto exit_fn; + } + + /* Sanity check option length */ + if (*optlen < sizeof(uint8_t)) { + err = + psmi_handle_error(PSMI_EP_LOGEVENT, + PSM2_PARAM_ERR, + "Option value length error"); + *optlen = sizeof(unsigned); + goto exit_fn; + } + + if (get) { + /* Get returns the SL for the PIO flow */ + *((uint8_t *) optval) = + (uint8_t) ipsaddr-> + flows[EP_FLOW_GO_BACK_N_PIO].path->pr_sl; + } else { + uint16_t new_sl; + + /* Sanity check if SL is within range */ + new_sl = (uint16_t) *(uint8_t *) optval; + if (new_sl > PSMI_SL_MAX) { + err = + psmi_handle_error(PSMI_EP_LOGEVENT, + PSM2_PARAM_ERR, + "Invalid SL value %u. %d<= SL <=%d.", + new_sl, PSMI_SL_MIN, PSMI_SL_MAX); + goto exit_fn; + } + + /* Set new SL for all flows */ + ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO].path-> + pr_sl = new_sl; + ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA].path-> + pr_sl = new_sl; + } + } + break; + case PSM2_IB_OPT_DF_SL: + { + /* Set default SL to be used by an endpoint for all communication */ + /* Core object is psm2_epaddr */ + psm2_ep_t ep = (psm2_ep_t) core_obj; + + /* Make sure ep is specified */ + if (!ep) { + err = + psmi_handle_error(PSMI_EP_LOGEVENT, + PSM2_PARAM_ERR, + "Invalid PSM Endpoint"); + goto exit_fn; + } + + /* Sanity check option length */ + if (*optlen < sizeof(uint8_t)) { + err = + psmi_handle_error(PSMI_EP_LOGEVENT, + PSM2_PARAM_ERR, + "Option value length error"); + *optlen = sizeof(uint8_t); + goto exit_fn; + } + + if (get) { + *((uint8_t *) optval) = + ((struct ptl_ips *)(ep->ptl_ips.ptl))->proto.epinfo.ep_sl; + } else { + uint16_t new_sl; + + /* Sanity check if SL is within range */ + new_sl = (uint16_t) *(uint8_t *) optval; + if (new_sl > PSMI_SL_MAX) { + err = + psmi_handle_error(PSMI_EP_LOGEVENT, + PSM2_PARAM_ERR, + "Invalid SL value %u. 
%d<= SL <=%d.", + new_sl, PSMI_SL_MIN, PSMI_SL_MAX); + goto exit_fn; + } + + ((struct ptl_ips *)(ep->ptl_ips.ptl))->proto.epinfo.ep_sl = + (uint8_t) new_sl; + } + } + break; + default: + err = + psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Unknown PSM2_IB option %u.", optname); + } + +exit_fn: + return err; +} + +static +psm2_error_t +ips_ptl_setopt(const void *component_obj, int optname, + const void *optval, uint64_t optlen) +{ + return ips_ptl_optctl(component_obj, optname, (void *)optval, &optlen, + 0); +} + +static +psm2_error_t +ips_ptl_getopt(const void *component_obj, int optname, + void *optval, uint64_t *optlen) +{ + return ips_ptl_optctl(component_obj, optname, optval, optlen, 1); +} + +static +uint32_t +ips_ptl_rcvthread_is_enabled(const ptl_t *ptl) +{ + return psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED); +} + +psm2_error_t ips_ptl_poll(ptl_t *ptl_gen, int _ignored) +{ + struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; + const uint64_t current_count = get_cycles(); + const int do_lock = PSMI_LOCK_DISABLED && + psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED); + psm2_error_t err = PSM2_OK_NO_PROGRESS; + psm2_error_t err2; + + if (!ips_recvhdrq_isempty(&ptl->recvq)) { + if (do_lock && !ips_recvhdrq_trylock(&ptl->recvq)) + return err; + if (ptl->recvq.proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN) { + ips_recvhdrq_scan_cca(&ptl->recvq); + } + err = ips_recvhdrq_progress(&ptl->recvq); + if (do_lock) + ips_recvhdrq_unlock(&ptl->recvq); + if_pf(err > PSM2_OK_NO_PROGRESS) + return err; + err2 = + psmi_timer_process_if_expired(&(ptl->timerq), + current_count); + if (err2 != PSM2_OK_NO_PROGRESS) + return err2; + else + return err; + } + + /* + * Process timer expirations after servicing receive queues (some packets + * may have been acked, some requests-to-send may have been queued). + * + * It's safe to look at the timer without holding the lock because it's not + * incorrect to be wrong some of the time. + */ + if (psmi_timer_is_expired(&(ptl->timerq), current_count)) { + if (do_lock) + ips_recvhdrq_lock(&ptl->recvq); + err = psmi_timer_process_expired(&(ptl->timerq), current_count); + if (do_lock) + ips_recvhdrq_unlock(&ptl->recvq); + } + + return err; +} + +PSMI_INLINE(int ips_try_lock_shared_context(struct ptl_shared *recvshc)) +{ + return pthread_spin_trylock(recvshc->context_lock); +} + +PSMI_INLINE(void ips_lock_shared_context(struct ptl_shared *recvshc)) +{ + pthread_spin_lock(recvshc->context_lock); +} + +PSMI_INLINE(void ips_unlock_shared_context(struct ptl_shared *recvshc)) +{ + pthread_spin_unlock(recvshc->context_lock); +} + +psm2_error_t ips_ptl_shared_poll(ptl_t *ptl_gen, int _ignored) +{ + struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; + const uint64_t current_count = get_cycles(); + psm2_error_t err = PSM2_OK_NO_PROGRESS; + psm2_error_t err2; + struct ptl_shared *recvshc = ptl->recvshc; + psmi_assert(recvshc != NULL); + + /* The following header queue checks are speculative (but safe) + * until this process has acquired the lock. The idea is to + * minimize lock contention due to processes spinning on the + * shared context. */ + if (ips_recvhdrq_isempty(&recvshc->recvq)) { + if (!ips_recvhdrq_isempty(&ptl->recvq) && + ips_try_lock_shared_context(recvshc) == 0) { + /* check that subcontext is empty while under lock to avoid + * re-ordering of incoming packets (since packets from + * hardware context will be processed immediately). 
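+		 * This is the usual double-checked test; in sketch form (the
+		 * real calls are the ips_recvhdrq_* functions used here):
+		 *
+		 *   if (empty(sub_q) && !empty(hw_q) && trylock(ctx) == 0) {
+		 *       if (empty(sub_q))      -- re-check under the lock
+		 *           progress(hw_q);    -- may append to some sub_q
+		 *       unlock(ctx);
+		 *   }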
*/ + if_pt(ips_recvhdrq_isempty(&recvshc->recvq)) { + if (ptl->recvq.proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN) { + ips_recvhdrq_scan_cca(&ptl->recvq); + } + err = ips_recvhdrq_progress(&ptl->recvq); + } + ips_unlock_shared_context(recvshc); + } + } + + if_pf(err > PSM2_OK_NO_PROGRESS) + return err; + + if (!ips_recvhdrq_isempty(&recvshc->recvq)) { + if (recvshc->recvq.proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN) { + ips_recvhdrq_scan_cca(&recvshc->recvq); + } + err2 = ips_recvhdrq_progress(&recvshc->recvq); + if (err2 != PSM2_OK_NO_PROGRESS) { + err = err2; + } + } + + if_pf(err > PSM2_OK_NO_PROGRESS) + return err; + + /* + * Process timer expirations after servicing receive queues (some packets + * may have been acked, some requests-to-send may have been queued). + */ + err2 = psmi_timer_process_if_expired(&(ptl->timerq), current_count); + if (err2 != PSM2_OK_NO_PROGRESS) + err = err2; + + return err; +} + +int ips_ptl_recvq_isempty(const ptl_t *ptl_gen) +{ + struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; + struct ptl_shared *recvshc = ptl->recvshc; + + if (recvshc != NULL && !ips_recvhdrq_isempty(&recvshc->recvq)) + return 0; + return ips_recvhdrq_isempty(&ptl->recvq); +} + +/* + * Legacy ips_get_stat -- do nothing. + */ +int ips_get_stat(psm2_epaddr_t epaddr, ips_sess_stat *stats) +{ + memset(stats, 0, sizeof(ips_sess_stat)); + return 0; +} + +static psm2_error_t shrecvq_init(ptl_t *ptl_gen, const psmi_context_t *context) +{ + struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; + struct ptl_shared *recvshc = ptl->recvshc; + struct ips_recvhdrq_callbacks recvq_callbacks; + psm2_error_t err = PSM2_OK; + int i; + + /* Initialize (shared) hardware context recvq (ptl->recvq) */ + /* NOTE: uses recvq in ptl structure for shared h/w context */ + recvq_callbacks.callback_packet_unknown = ips_proto_process_unknown; + recvq_callbacks.callback_subcontext = ips_subcontext_process; + recvq_callbacks.callback_error = ips_proto_process_packet_error; + if ((err = ips_recvhdrq_init(context, &ptl->epstate, &ptl->proto, + &recvq_callbacks, + recvshc->subcontext, + &ptl->recvq, + &recvshc->hwcontext_ctrl->recvq_state, + PSM_HAL_CL_Q_RX_HDR_Q))) { + goto fail; + } + + /* Initialize software subcontext (recvshc->recvq). Subcontexts do */ + /* not require the rcvhdr copy feature. 
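+	 * Each process therefore drains two queues: the shared hardware
+	 * queue (ptl->recvq, only while holding context_lock) and its
+	 * private software mirror (recvshc->recvq), which peers fill via
+	 * ips_writehdrq.  The main difference between the two
+	 * ips_recvhdrq_init() calls is the queue selector:
+	 * PSM_HAL_CL_Q_RX_HDR_Q for the hardware queue versus
+	 * PSM_HAL_GET_SC_CL_Q_RX_HDR_Q(subcontext) here.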
*/
+	recvq_callbacks.callback_subcontext = ips_subcontext_ignore;
+	if ((err = ips_recvhdrq_init(context, &ptl->epstate, &ptl->proto,
+				     &recvq_callbacks,
+				     recvshc->subcontext,
+				     &recvshc->recvq, &recvshc->recvq_state,
+				     PSM_HAL_GET_SC_CL_Q_RX_HDR_Q(recvshc->subcontext)))) {
+		goto fail;
+	}
+
+	/* Initialize each recvshc->writeq for shared contexts */
+	for (i = 0; i < recvshc->subcontext_cnt; i++) {
+		if ((err = ips_writehdrq_init(context,
+					      &recvshc->writeq[i],
+					      &recvshc->subcontext_ureg[i]->
+					      writeq_state,
+					      i))) {
+			goto fail;
+		}
+	}
+
+	if (err == PSM2_OK)
+		_HFI_DBG
+		    ("Context sharing in use: lid %d, context %d, sub-context %d\n",
+		     (int)psm2_epid_nid(ptl->epid), recvshc->context,
+		     recvshc->subcontext);
+fail:
+	return err;
+}
+
+static psm2_error_t shrecvq_fini(ptl_t *ptl_gen)
+{
+	struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen;
+	psm2_error_t err = PSM2_OK;
+	int i;
+
+	/* disable my write header queue before deallocation */
+	i = ptl->recvshc->subcontext;
+	ptl->recvshc->subcontext_ureg[i]->writeq_state.enabled = 0;
+	psmi_free(ptl->recvshc);
+	return err;
+}
+
+psm2_error_t
+ips_ptl_connect(ptl_t *ptl_gen, int numep, const psm2_epid_t *array_of_epid,
+		const int *array_of_epid_mask, psm2_error_t *array_of_errors,
+		psm2_epaddr_t *array_of_epaddr, uint64_t timeout_in)
+{
+	struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen;
+	psm2_error_t err;
+	psm2_ep_t ep;
+	psm2_epid_t *epid_array = NULL;
+	psm2_error_t *error_array = NULL;
+	psm2_epaddr_t *epaddr_array = NULL;
+	ips_epaddr_t *ipsaddr_master, *ipsaddr;
+	int *mask_array = NULL;
+	int i;
+
+	PSMI_LOCK_ASSERT(ptl->ep->mq->progress_lock);
+	err = ips_proto_connect(&ptl->proto, numep, array_of_epid,
+				array_of_epid_mask, array_of_errors,
+				array_of_epaddr, timeout_in);
+	if (err)
+		return err;
+
+	psmi_assert_always(ptl->ep->mctxt_master == ptl->ep);
+	if (ptl->ep->mctxt_next == ptl->ep)
+		return err;
+
+	/* make the additional multi-context connections. */
+	epid_array = (psm2_epid_t *)
+	    psmi_malloc(ptl->ep, UNDEFINED, sizeof(psm2_epid_t) * numep);
+	mask_array = (int *)
+	    psmi_malloc(ptl->ep, UNDEFINED, sizeof(int) * numep);
+	error_array = (psm2_error_t *)
+	    psmi_malloc(ptl->ep, UNDEFINED, sizeof(psm2_error_t) * numep);
+	epaddr_array = (psm2_epaddr_t *)
+	    psmi_malloc(ptl->ep, UNDEFINED, sizeof(psm2_epaddr_t) * numep);
+	if (!epid_array || !mask_array || !error_array || !epaddr_array) {
+		err = PSM2_NO_MEMORY;
+		goto fail;
+	}
+
+	ep = ptl->ep->mctxt_next;
+	while (ep != ep->mctxt_master) {
+
+		/* Setup the mask array and epid array. */
+		for (i = 0; i < numep; i++) {
+			if (array_of_epid_mask[i]
+			    && array_of_errors[i] == PSM2_OK) {
+				ipsaddr_master =
+				    (ips_epaddr_t *) array_of_epaddr[i];
+				ipsaddr = ipsaddr_master->next;
+				mask_array[i] = 0;
+				while (ipsaddr != ipsaddr_master) {
+					if (((psm2_epaddr_t) ipsaddr)->proto->
+					    ep == ep) {
+						mask_array[i] = 1;
+						epid_array[i] =
+						    ((psm2_epaddr_t) ipsaddr)->
+						    epid;
+						break;
+					}
+					ipsaddr = ipsaddr->next;
+				}
+			} else {
+				mask_array[i] = 0;
+			}
+		}
+
+		/* Make the real protocol connections.
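+		 * In sketch form, this whole secondary pass is:
+		 *
+		 *   for (ep = master->mctxt_next; ep != master; ep = ep->mctxt_next)
+		 *       ips_proto_connect(&ep_ptl(ep)->proto, numep,
+		 *                         epid_array, mask_array, ...);
+		 *
+		 * where ep_ptl() is shorthand for the (struct ptl_ips *) cast
+		 * below, and mask_array[i] was enabled above only when the
+		 * ipsaddr chain of array_of_epaddr[i] has a member owned by
+		 * this ep.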
*/ + err = + ips_proto_connect(&((struct ptl_ips *)(ep->ptl_ips.ptl))->proto, + numep, epid_array, mask_array, error_array, + epaddr_array, timeout_in); + if (err) + goto fail; + + ep = ep->mctxt_next; + } + +fail: + if (epid_array) + psmi_free(epid_array); + if (mask_array) + psmi_free(mask_array); + if (error_array) + psmi_free(error_array); + if (epaddr_array) + psmi_free(epaddr_array); + + return err; +} + +psm2_error_t +ips_ptl_disconnect(ptl_t *ptl_gen, int force, int numep, + psm2_epaddr_t array_of_epaddr[], + const int array_of_epaddr_mask[], + psm2_error_t array_of_errors[], uint64_t timeout_in) +{ + struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; + int *array_of_epaddr_mask_internal, i; + psm2_error_t err; + + /* + * Copy true values from array_of_epaddr_mask, provided that their + * respective epaddr is an ips one. + * Newly created mask will be used for the protocol disconnect call + * instead. + */ + PSMI_LOCK_ASSERT(ptl->ep->mq->progress_lock); + array_of_epaddr_mask_internal = psmi_calloc(ptl->ep, UNDEFINED, + sizeof(int), numep); + if (!array_of_epaddr_mask_internal) + return PSM2_NO_MEMORY; + + for (i = 0; i < numep; ++i) { + if (array_of_epaddr_mask[i] && array_of_epaddr[i] + && array_of_epaddr[i]->ptlctl->ptl == ptl_gen) { + array_of_epaddr_mask_internal[i] = 1; + } + } + + err = ips_proto_disconnect(&ptl->proto, force, numep, array_of_epaddr, + array_of_epaddr_mask_internal, + array_of_errors, timeout_in); + + psmi_free(array_of_epaddr_mask_internal); + return err; +} + +/* Only symbol we expose out of here */ +struct ptl_ctl_init +psmi_ptl_ips = { + ips_ptl_sizeof, ips_ptl_init, ips_ptl_fini, ips_ptl_setopt, + ips_ptl_getopt +}; + +struct ptl_ctl_rcvthread +psmi_ptl_ips_rcvthread = { + ips_ptl_rcvthread_is_enabled, + ips_ptl_rcvthread_transfer_ownership, +}; diff --git a/ptl_ips/ptl_fwd.h b/ptl_ips/ptl_fwd.h new file mode 100644 index 0000000..3702fba --- /dev/null +++ b/ptl_ips/ptl_fwd.h @@ -0,0 +1,67 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _PTL_FWD_IPS_H +#define _PTL_FWD_IPS_H +#include "ptl.h" + +typedef struct ips_epaddr ips_epaddr_t; +typedef struct ips_msgctl ips_msgctl_t; + +/* Symbol in ips ptl */ +struct ptl_ctl_init psmi_ptl_ips; + +struct ptl_ctl_rcvthread psmi_ptl_ips_rcvthread; +#endif /* _PTL_FWD_IPS_H */ diff --git a/ptl_ips/ptl_ips.h b/ptl_ips/ptl_ips.h new file mode 100644 index 0000000..24ef035 --- /dev/null +++ b/ptl_ips/ptl_ips.h @@ -0,0 +1,185 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _IPS_PTL_H +#define _IPS_PTL_H + +#include "psm_user.h" + +#include "ips_proto.h" +#include "ips_stats.h" +#include "ips_subcontext.h" + +struct ptl_shared; + +/* + * PTL at the ips level (for OPA) + * + * This PTL structure glues all the ips components together. + * + * * ips timer, shared by various components, allows each component to + * schedule time-based expiration callbacks on the timerq. + * * HW receive queue + * * send control block to handle eager messages + * * instantiation of the ips protocol + * * endpoint state, to map endpoint indexes into structures + * + * Receive-side + * + * ----[ proto ] + * / ^ ^ + * | | | + * | packet packet + * | known unknown + * add_endpt \ / + * | | + * `----> [epstate] + * ^ + * | + * lookup_endpt + * | + * [recvq] + * | + * poll + * + */ +/* Updates to this struct must be reflected in PTL_IPS_SIZE in ptl_fwd.h */ +/* IPS knows it functions as a PTL whenever ptl->ep is non-NULL */ +struct ptl_ips { + psm2_ep_t ep; /* back ptr */ + psm2_epid_t epid; /* cached from ep */ + psm2_epaddr_t epaddr; /* cached from ep */ + ips_epaddr_t *ipsaddr; /* cached from epaddr */ + ptl_ctl_t *ctl; /* cached from init */ + const psmi_context_t *context; /* cached from init */ + + void *spioc; /* PIO send control (opaque ptr) */ + struct ips_proto proto; /* protocol instance: timerq, epstate, spio */ + + struct psmi_timer_ctrl timerq; + struct ips_epstate epstate; /* map incoming packets */ + struct ips_recvhdrq_state recvq_state; + struct ips_recvhdrq recvq; /* HW recvq: epstate, proto */ + + /* timer to check the context's status */ + struct psmi_timer status_timer; + + /* context's status check timeout in cycles -- cached */ + uint64_t status_cyc_timeout; + /* Shared contexts context */ + struct ptl_shared *recvshc; + /* Rcv thread context */ + struct ptl_rcvthread *rcvthread; +} +#ifndef PACK_STRUCT_STL +#define PACK_STRUCT_STL /* nothing */ +#endif + __attribute__ ((PACK_STRUCT_STL aligned(16))); + +/* + * Sample implementation of shared contexts context. + * + * In shared mode, the hardware queue is serviced by more than one process. + * Each process also mirrors the hardware queue in software (represented by an + * ips_recvhdrq). For packets we service in the hardware queue that are not + * destined for us, we write them in other processes's receive queues + * (represented by an ips_writehdrq). 
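+ * In sketch form, the forwarding decision made while servicing the
+ * hardware queue (the real code is ips_subcontext_process() in ptl.c) is:
+ *
+ *   if (pkt.subcontext != my_subcontext && pkt.subcontext < subcontext_cnt)
+ *       forward pkt into writeq[pkt.subcontext];  -- peer's ips_writehdrq
+ *   else
+ *       drop it;  -- out of range, or already handled on the local queue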
+ * + */ +struct ptl_shared { + ptl_t *ptl; /* backptr to main ptl */ + uint32_t context; + uint32_t subcontext; + uint32_t subcontext_cnt; + + pthread_spinlock_t *context_lock; + struct ips_subcontext_ureg *subcontext_ureg[PSM_HAL_MAX_SHARED_CTXTS]; + struct ips_hwcontext_ctrl *hwcontext_ctrl; + struct ips_recvhdrq recvq; /* subcontext receive queue */ + struct ips_recvhdrq_state recvq_state; /* subcontext receive queue state */ + struct ips_writehdrq writeq[PSM_HAL_MAX_SHARED_CTXTS]; /* peer subcontexts */ +}; + +/* + * Connect/disconnect are wrappers around psm proto's connect/disconnect, + * mostly to abstract away PSM-specific stuff from ips internal structures + */ +psm2_error_t ips_ptl_connect(ptl_t *ptl, int numep, + const psm2_epid_t *array_of_epid, + const int *array_of_epid_mask, + psm2_error_t *array_of_errors, + psm2_epaddr_t *array_of_epaddr, + uint64_t timeout_in); + +psm2_error_t ips_ptl_disconnect(ptl_t *ptl, int force, int numep, + psm2_epaddr_t array_of_epaddr[], + const int array_of_epaddr_mask[], + psm2_error_t array_of_errors[], + uint64_t timeout_in); + +/* + * Generic Poll function for ips-level ptl + */ +psm2_error_t ips_ptl_poll(ptl_t *ptl, int _ignored); +psm2_error_t ips_ptl_shared_poll(ptl_t *ptl, int _ignored); + +/* + * Support for receive thread + */ +psm2_error_t ips_ptl_rcvthread_init(ptl_t *ptl, struct ips_recvhdrq *recvq); +psm2_error_t ips_ptl_rcvthread_fini(ptl_t *ptl); +void ips_ptl_rcvthread_transfer_ownership(ptl_t *from_ptl, ptl_t *to_ptl); + +#endif /* _IPS_PTL_H */ diff --git a/ptl_ips/ptl_rcvthread.c b/ptl_ips/ptl_rcvthread.c new file mode 100644 index 0000000..4adb65a --- /dev/null +++ b/ptl_ips/ptl_rcvthread.c @@ -0,0 +1,516 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include + +#include "psm_user.h" +#include "psm2_hal.h" +#include "psm_mq_internal.h" +#include "ptl_ips.h" +#include "ips_proto.h" + +struct ptl_rcvthread; + +static void *ips_ptl_pollintr(void *recvthreadc); +static psm2_error_t rcvthread_initstats(ptl_t *ptl); +static psm2_error_t rcvthread_initsched(struct ptl_rcvthread *rcvc); + +struct ptl_rcvthread { + const psmi_context_t *context; + const ptl_t *ptl; + struct ips_recvhdrq *recvq; + + pthread_t hdrq_threadid; + uint64_t t_start_cyc; + int pipefd[2]; + + /* stats and some for scheduling */ + uint64_t pollcnt; + uint64_t pollcnt_to; + uint64_t pollcyc; + uint64_t pollok; + + /* For scheduling interrupt thread */ + int timeout_period_min; + int timeout_period_max; + int timeout_shift; + uint64_t pollok_last; + uint64_t pollcnt_last; + uint32_t last_timeout; +}; + +#ifdef PSM_CUDA + /* This is a global cuda context (extern declaration in psm_user.h) + * stored to provide hints during a cuda failure + * due to a null cuda context. + */ + CUcontext ctxt; +#endif + +/* + * The receive thread knows about the ptl interface, so it can muck with it + * directly. + */ +psm2_error_t ips_ptl_rcvthread_init(ptl_t *ptl_gen, struct ips_recvhdrq *recvq) +{ + struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; + psm2_error_t err = PSM2_OK; + struct ptl_rcvthread *rcvc; + + ptl->rcvthread = + psmi_calloc(ptl->ep, UNDEFINED, 1, sizeof(struct ptl_rcvthread)); + if (ptl->rcvthread == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + rcvc = ptl->rcvthread; + + rcvc->recvq = recvq; + rcvc->ptl = ptl_gen; + rcvc->context = ptl->context; + rcvc->t_start_cyc = get_cycles(); + +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED) + PSMI_CUDA_CALL(cuCtxGetCurrent, &ctxt); +#endif + + if (psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD) && + (!psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED))){ + + if ((err = rcvthread_initsched(rcvc))) + goto fail; + + /* Create a pipe so we can synchronously terminate the thread */ + if (pipe(rcvc->pipefd) != 0) { + err = psmi_handle_error(ptl->ep, PSM2_EP_DEVICE_FAILURE, + "Cannot create a pipe for receive thread: %s\n", + strerror(errno)); + goto fail; + } + + psmi_hal_add_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED); + if (pthread_create(&rcvc->hdrq_threadid, NULL, + ips_ptl_pollintr, ptl->rcvthread)) { + close(rcvc->pipefd[0]); + close(rcvc->pipefd[1]); + err = psmi_handle_error(ptl->ep, PSM2_EP_DEVICE_FAILURE, + "Cannot start receive thread: %s\n", + strerror(errno)); + goto fail; + } + + } + + if ((err = rcvthread_initstats(ptl_gen))) + goto fail; + +fail: + return err; +} + +psm2_error_t ips_ptl_rcvthread_fini(ptl_t *ptl_gen) +{ + struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; + struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)ptl->rcvthread; + uint64_t t_now; + psm2_error_t err = PSM2_OK; + + PSMI_LOCK_ASSERT(ptl->ep->mq->progress_lock); + + if (ptl->rcvthread == NULL) + return err; + + if 
(rcvc->hdrq_threadid && psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED)) { + t_now = get_cycles(); + + /* Disable interrupts then kill the receive thread */ + if (psmi_context_interrupt_isenabled + ((psmi_context_t *) ptl->context)) + if ((err = + psmi_context_interrupt_set((psmi_context_t *) ptl-> + context, 0))) + goto fail; + + /* Close the pipe so we can have the thread synchronously exit. + On Linux just closing the pipe does not wake up the receive + thread. + */ + if (write(rcvc->pipefd[1], (const void *)&t_now, + sizeof(uint64_t)) == -1 || + close(rcvc->pipefd[1]) == -1) { + _HFI_VDBG + ("unable to close pipe to receive thread cleanly\n"); + } + pthread_join(rcvc->hdrq_threadid, NULL); + psmi_hal_sub_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED); + rcvc->hdrq_threadid = 0; + if (_HFI_PRDBG_ON) { + _HFI_PRDBG_ALWAYS + ("rcvthread poll success %lld/%lld times, " + "thread cancelled in %.3f us\n", + (long long)rcvc->pollok, (long long)rcvc->pollcnt, + (double)cycles_to_nanosecs(get_cycles() - t_now) / 1e3); + } + } + + psmi_free(ptl->rcvthread); + ptl->rcvthread = NULL; +fail: + return err; +} + +void ips_ptl_rcvthread_transfer_ownership(ptl_t *from_ptl_gen, ptl_t *to_ptl_gen) +{ + struct ptl_rcvthread *rcvc; + + psmi_hal_sub_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED); + struct ptl_ips *from_ptl = (struct ptl_ips *)from_ptl_gen; + struct ptl_ips *to_ptl = (struct ptl_ips *)to_ptl_gen; + to_ptl->rcvthread = from_ptl->rcvthread; + from_ptl->rcvthread = NULL; + + rcvc = to_ptl->rcvthread; + + rcvc->recvq = &to_ptl->recvq; + rcvc->context = to_ptl->context; + rcvc->ptl = to_ptl_gen; +} + +psm2_error_t rcvthread_initsched(struct ptl_rcvthread *rcvc) +{ + union psmi_envvar_val env_to; + char buf[192]; + char *rcv_freq = buf; + int no_timeout = 0; + int tvals[3] = { RCVTHREAD_TO_MIN_FREQ, + RCVTHREAD_TO_MAX_FREQ, + RCVTHREAD_TO_SHIFT + }; + snprintf(buf, sizeof(buf) - 1, "%d:%d:%d", RCVTHREAD_TO_MIN_FREQ, + RCVTHREAD_TO_MAX_FREQ, RCVTHREAD_TO_SHIFT); + buf[sizeof(buf) - 1] = '\0'; + + if (!psmi_getenv("PSM2_RCVTHREAD_FREQ", + "Thread timeouts (per sec) ", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)rcv_freq, &env_to)) { + /* not using default values */ + int nparsed = psmi_parse_str_tuples(env_to.e_str, 3, tvals); + int invalid = 0; + + if (nparsed < 1 || (nparsed > 0 && tvals[0] == 0) || + (nparsed > 1 && tvals[1] == 0)) { + no_timeout = 1; + } else { + if (nparsed > 0 && tvals[0] > 1000) + invalid = 1; + if (nparsed > 1 + && (tvals[1] > 1000 || tvals[1] < tvals[0])) + invalid = 1; + if (nparsed > 2 && tvals[2] > 10) + invalid = 1; + } + + if (invalid) { + _HFI_INFO + ("Overriding invalid request for RcvThread frequency" + " settings of %s to be <%d:%d:%d>\n", env_to.e_str, + RCVTHREAD_TO_MIN_FREQ, RCVTHREAD_TO_MAX_FREQ, + RCVTHREAD_TO_SHIFT); + tvals[0] = RCVTHREAD_TO_MIN_FREQ; + tvals[1] = RCVTHREAD_TO_MAX_FREQ; + tvals[2] = RCVTHREAD_TO_SHIFT; + } + } + + if (no_timeout) { + rcvc->last_timeout = -1; + _HFI_PRDBG("PSM2_RCVTHREAD_FREQ set to only interrupt " + "(no timeouts)\n"); + } else { + /* Convert freq to period in microseconds (for poll()) */ + rcvc->timeout_period_max = 1000 / tvals[0]; + rcvc->timeout_period_min = 1000 / tvals[1]; + rcvc->timeout_shift = tvals[2]; + /* Start in the middle of min and max */ + rcvc->last_timeout = (rcvc->timeout_period_min + + rcvc->timeout_period_max) / 2; + _HFI_PRDBG("PSM2_RCVTHREAD_FREQ converted to period " + "min=%dms,max=%dms,shift=%d\n", + rcvc->timeout_period_min, 
rcvc->timeout_period_max, + rcvc->timeout_shift); + } + return PSM2_OK; +} + +static +int rcvthread_next_timeout(struct ptl_rcvthread *rcvc) +{ + uint64_t pollok_diff = rcvc->pollok - rcvc->pollok_last; + + if (pollok_diff > 0) { + if (rcvc->last_timeout > rcvc->timeout_period_min) + /* By default, be less aggressive, but there's a more aggressive + * alternative if need be */ +#if 1 + rcvc->last_timeout >>= rcvc->timeout_shift; +#else + rcvc->last_timeout = rcvc->timeout_period_min; +#endif + } else { /* we had less progress */ + if (rcvc->last_timeout < rcvc->timeout_period_max) + rcvc->last_timeout <<= rcvc->timeout_shift; + } + + rcvc->pollok_last = rcvc->pollok; + rcvc->pollcnt_last = rcvc->pollcnt; + return (int)rcvc->last_timeout; +} + +extern int ips_in_rcvthread; + +/* + * Receiver thread support. + * + * By default, polling in the driver asks the chip to generate an interrupt on + * every packet. When the driver supports POLLURG we can switch the poll mode + * to one that requests interrupts only for packets that contain an urgent bit + * (and optionally enable interrupts for hdrq overflow events). When poll + * returns an event, we *try* to make progress on the receive queue but simply + * go back to sleep if we notice that the main thread is already making + * progress. + */ +static +void *ips_ptl_pollintr(void *rcvthreadc) +{ + struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)rcvthreadc; + struct ips_recvhdrq *recvq = rcvc->recvq; + int fd_pipe = rcvc->pipefd[0]; + psm2_ep_t ep; + struct pollfd pfd[2]; + int ret; + int next_timeout = rcvc->last_timeout; + uint64_t t_cyc; + psm2_error_t err; + +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED && ctxt != NULL) + PSMI_CUDA_CALL(cuCtxSetCurrent, ctxt); +#endif + + PSM2_LOG_MSG("entering"); + /* No reason to have many of these, keep this as a backup in case the + * recvhdrq init function is misused */ + psmi_assert_always(psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED)); + + /* Switch driver to a mode where it can interrupt on urgent packets */ + if (psmi_context_interrupt_set((psmi_context_t *) + rcvc->context, 1) == PSM2_EP_NO_RESOURCES) { + _HFI_PRDBG + ("hfi_poll_type feature not present in driver, turning " + "off internal progress thread\n"); + return NULL; + } + + _HFI_PRDBG("Enabled communication thread on URG packets\n"); + + while (1) { + pfd[0].fd = psmi_hal_get_fd(rcvc->context->psm_hw_ctxt); + pfd[0].events = POLLIN; + pfd[0].revents = 0; + pfd[1].fd = fd_pipe; + pfd[1].events = POLLIN; + pfd[1].revents = 0; + + ret = poll(pfd, 2, next_timeout); + t_cyc = get_cycles(); + if_pf(ret < 0) { + if (errno == EINTR) + _HFI_DBG("got signal, keep polling\n"); + else + psmi_handle_error(PSMI_EP_NORETURN, + PSM2_INTERNAL_ERR, + "Receive thread poll() error: %s", + strerror(errno)); + } else if (pfd[1].revents) { + /* Any type of event on this fd means exit, should be POLLHUP */ + _HFI_DBG("close thread: revents=0x%x\n", pfd[1].revents); + close(fd_pipe); + break; + } else { + rcvc->pollcnt++; + if (!PSMI_LOCK_TRY(psmi_creation_lock)) { + if (ret == 0 || pfd[0].revents & (POLLIN | POLLERR)) { + if (PSMI_LOCK_DISABLED) { + /* We do this check without acquiring the lock, no sense to + * adding the overhead and it doesn't matter if we're + * wrong. 
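+				 * Being wrong is benign in both directions: a
+				 * stale non-empty answer only costs a failed
+				 * trylock or an empty progress pass, and a
+				 * stale empty answer just leaves the packet to
+				 * the main thread or the next poll wakeup.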
*/ + if (ips_recvhdrq_isempty(recvq)) + continue; + if(recvq->proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN) { + ips_recvhdrq_scan_cca(recvq); + } + if (!ips_recvhdrq_trylock(recvq)) + continue; + err = ips_recvhdrq_progress(recvq); + if (err == PSM2_OK) + rcvc->pollok++; + else + rcvc->pollcyc += get_cycles() - t_cyc; + ips_recvhdrq_unlock(recvq); + } else { + + ep = psmi_opened_endpoint; + + if (!PSMI_LOCK_TRY(ep->mq->progress_lock)) { + if(recvq->proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN ) { + ips_recvhdrq_scan_cca(recvq); + } + PSMI_UNLOCK(ep->mq->progress_lock); + } + /* Go through all master endpoints. */ + do{ + if (!PSMI_LOCK_TRY(ep->mq->progress_lock)) { + /* If we time out, we service shm and hfi. If not, we + * assume to have received an hfi interrupt and service + * only hfi. + */ + err = psmi_poll_internal(ep, + ret == + 0 ? PSMI_TRUE : + PSMI_FALSE); + + if (err == PSM2_OK) + rcvc->pollok++; + else + rcvc->pollcyc += get_cycles() - t_cyc; + PSMI_UNLOCK(ep->mq->progress_lock); + } + + /* get next endpoint from multi endpoint list */ + ep = ep->user_ep_next; + } while(NULL != ep); + } + } + PSMI_UNLOCK(psmi_creation_lock); + } + if (ret == 0) { /* change timeout only on timed out poll */ + rcvc->pollcnt_to++; + next_timeout = rcvthread_next_timeout(rcvc); + } + } + } + + PSM2_LOG_MSG("leaving"); + return NULL; +} + +static uint64_t rcvthread_stats_pollok(void *context) +{ + struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)context; + double ratio = 0.0; + uint64_t ratio_u; + if (rcvc->pollcnt > 0) + ratio = (double)rcvc->pollok * 100.0 / rcvc->pollcnt; + memcpy(&ratio_u, &ratio, sizeof(uint64_t)); + return ratio_u; +} + +static uint64_t rcvthread_stats_pollcyc(void *context) +{ + struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)context; + /* log in milliseconds */ + return (uint64_t) ((double)cycles_to_nanosecs(rcvc->pollcyc) / 1.0e6); +} + +static psm2_error_t rcvthread_initstats(ptl_t *ptl_gen) +{ + struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; + struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)ptl->rcvthread; + struct psmi_stats_entry entries[] = { + PSMI_STATS_DECL("intrthread schedule count", + MPSPAWN_STATS_REDUCTION_ALL | + MPSPAWN_STATS_SKIP_IF_ZERO, + NULL, &rcvc->pollcnt), + PSMI_STATS_DECL("intrthread schedule success (%)", + MPSPAWN_STATS_REDUCTION_ALL | + MPSPAWN_STATS_TYPE_DOUBLE, + rcvthread_stats_pollok, NULL), + PSMI_STATS_DECL("intrthread timeout count", + MPSPAWN_STATS_REDUCTION_ALL | + MPSPAWN_STATS_SKIP_IF_ZERO, + NULL, &rcvc->pollcnt_to), + PSMI_STATS_DECL("intrthread wasted time (ms)", + MPSPAWN_STATS_REDUCTION_ALL, + rcvthread_stats_pollcyc, NULL) + }; + + /* If we don't want a thread, make sure we still initialize the counters + * but set them to NaN instead */ + if (!psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED)) { + int i; + static uint64_t ctr_nan = MPSPAWN_NAN; + for (i = 0; i < (int)PSMI_STATS_HOWMANY(entries); i++) { + entries[i].getfn = NULL; + entries[i].u.val = &ctr_nan; + } + } + + return psmi_stats_register_type(PSMI_STATS_NO_HEADING, + PSMI_STATSTYPE_RCVTHREAD, + entries, + PSMI_STATS_HOWMANY(entries), rcvc); +} diff --git a/ptl_self/Makefile b/ptl_self/Makefile new file mode 100644 index 0000000..6af8bf7 --- /dev/null +++ b/ptl_self/Makefile @@ -0,0 +1,89 @@ +# +# This file is provided under a dual BSD/GPLv2 license. When using or +# redistributing this file, you may do so under either license. +# +# GPL LICENSE SUMMARY +# +# Copyright(c) 2015 Intel Corporation. 
+# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# Contact Information: +# Intel Corporation, www.intel.com +# +# BSD LICENSE +# +# Copyright(c) 2015 Intel Corporation. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Copyright (c) 2003-2014 Intel Corporation. All rights reserved. +# + +OUTDIR = . + +this_srcdir = $(shell readlink -m .) +top_srcdir := $(this_srcdir)/.. +INCLUDES += -I$(top_srcdir) + +${TARGLIB}-objs := ptl.o +${TARGLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${TARGLIB}-objs}) +DEPS := $(${TARGLIB}-objs:.o=.d) + +.PHONY: all clean +IGNORE_DEP_TARGETS = clean + +all .DEFAULT: ${${TARGLIB}-objs} + +$(OUTDIR)/%.d: $(this_srcdir)/%.c + $(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o) + +$(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS} + $(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) -c $< -o $@ + +clean: + @if [ -d $(OUTDIR) ]; then \ + cd $(OUTDIR); \ + rm -f *.o *.d *.gcda *.gcno; \ + cd -; \ + fi + +#ifeq prevents the deps from being included during clean +#-include line is required to pull in auto-dependecies during 2nd pass +ifeq ($(filter $(IGNORE_DEP_TARGETS), $(MAKECMDGOALS)),) +-include ${DEPS} +endif + +install: + @echo "Nothing to do for install." diff --git a/ptl_self/ptl.c b/ptl_self/ptl.c new file mode 100644 index 0000000..49d898d --- /dev/null +++ b/ptl_self/ptl.c @@ -0,0 +1,419 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +/* + * This file implements the PSM PTL for self (loopback) + */ + +#include "psm_user.h" +#include "psm_mq_internal.h" +#include "psm_am_internal.h" + +struct ptl_self { + psm2_ep_t ep; + psm2_epid_t epid; + psm2_epaddr_t epaddr; + ptl_ctl_t *ctl; +} __attribute__((aligned(16))); + +static +psm2_error_t +ptl_handle_rtsmatch(psm2_mq_req_t recv_req, int was_posted) +{ + psm2_mq_req_t send_req = (psm2_mq_req_t) recv_req->ptl_req_ptr; + + if (recv_req->req_data.recv_msglen > 0) { + psmi_mq_mtucpy(recv_req->req_data.buf, send_req->req_data.buf, + recv_req->req_data.recv_msglen); + } + + psmi_mq_handle_rts_complete(recv_req); + + /* If the send is already marked complete, that's because it was internally + * buffered. 
*/
+ if (send_req->state == MQ_STATE_COMPLETE) {
+ psmi_mq_stats_rts_account(send_req);
+ if (send_req->req_data.buf != NULL && send_req->req_data.send_msglen > 0)
+ psmi_mq_sysbuf_free(send_req->mq, send_req->req_data.buf);
+ /* req was left "live" even though the sender was told that the
+ * send was done */
+ psmi_mq_req_free(send_req);
+ } else
+ psmi_mq_handle_rts_complete(send_req);
+
+ _HFI_VDBG("[self][complete][b=%p][sreq=%p][rreq=%p]\n",
+ recv_req->req_data.buf, send_req, recv_req);
+ return PSM2_OK;
+}
+
+static
+psm2_error_t self_mq_send_testwait(psm2_mq_req_t *ireq)
+{
+ uint8_t *ubuf;
+ psm2_mq_req_t req = *ireq;
+
+ PSMI_LOCK_ASSERT(req->mq->progress_lock);
+
+ /* We're waiting on a send request, and the matching receive has not been
+ * posted yet. This is a deadlock condition in MPI, but we accommodate it
+ * here in the "self ptl" by using system-allocated memory.
+ */
+ req->testwait_callback = NULL; /* no more calls here */
+
+ ubuf = req->req_data.buf;
+ if (ubuf != NULL && req->req_data.send_msglen > 0) {
+ req->req_data.buf = psmi_mq_sysbuf_alloc(req->mq, req->req_data.send_msglen);
+ if (req->req_data.buf == NULL)
+ return PSM2_NO_MEMORY;
+ psmi_mq_mtucpy(req->req_data.buf, ubuf, req->req_data.send_msglen);
+ }
+
+ /* Mark it complete, but don't free the req; it's freed when the receiver
+ * does the match */
+ req->state = MQ_STATE_COMPLETE;
+ *ireq = PSM2_MQ_REQINVALID;
+ return PSM2_OK;
+}
+
+/* Self is different. We do everything as rendezvous. */
+static
+psm2_error_t
+self_mq_isend(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags_user,
+ uint32_t flags_internal, psm2_mq_tag_t *tag, const void *ubuf,
+ uint32_t len, void *context, psm2_mq_req_t *req_o)
+{
+ psm2_mq_req_t send_req;
+ psm2_mq_req_t recv_req;
+ int rc;
+
+ send_req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND);
+ if_pf(send_req == NULL)
+ return PSM2_NO_MEMORY;
+
+#ifdef PSM_CUDA
+ /* CUDA documentation dictates the use of the SYNC_MEMOPS attribute
+ * when the buffer pointer received into PSM has been allocated
+ * by the application. This guarantees that all memory operations
+ * on this region of memory (used by multiple layers of the stack)
+ * always synchronize.
+ */
+ if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)ubuf)) {
+ int trueflag = 1;
+ PSMI_CUDA_CALL(cuPointerSetAttribute, &trueflag,
+ CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+ (CUdeviceptr)ubuf);
+ send_req->is_buf_gpu_mem = 1;
+ } else
+ send_req->is_buf_gpu_mem = 0;
+#endif
+
+ rc = psmi_mq_handle_rts(mq, epaddr, tag,
+ len, NULL, 0, 1,
+ ptl_handle_rtsmatch, &recv_req);
+ send_req->req_data.tag = *tag;
+ send_req->req_data.buf = (void *)ubuf;
+ send_req->req_data.send_msglen = len;
+ send_req->req_data.context = context;
+ recv_req->ptl_req_ptr = (void *)send_req;
+ recv_req->rts_sbuf = (uintptr_t) ubuf;
+ recv_req->rts_peer = epaddr;
+ if (rc == MQ_RET_MATCH_OK)
+ ptl_handle_rtsmatch(recv_req, 1);
+ else
+ send_req->testwait_callback = self_mq_send_testwait;
+
+ _HFI_VDBG("[self][b=%p][m=%d][t=%08x.%08x.%08x][match=%s][req=%p]\n",
+ ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2],
+ rc == MQ_RET_MATCH_OK ? "YES" : "NO", send_req);
+ *req_o = send_req;
+ return PSM2_OK;
+}
+
+static
+psm2_error_t
+self_mq_send(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags,
+ psm2_mq_tag_t *tag, const void *ubuf, uint32_t len)
+{
+ psm2_error_t err;
+ psm2_mq_req_t req;
+ err = self_mq_isend(mq, epaddr, flags, PSMI_REQ_FLAG_NORMAL, tag, ubuf, len, NULL, &req);
+ psmi_mq_wait_internal(&req);
+ return err;
+}
+
+/* Fill in AM capabilities parameters */
+static psm2_error_t
+self_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters)
+{
+ if (parameters == NULL) {
+ return PSM2_PARAM_ERR;
+ }
+
+ /* Self is just a loopback and has no restrictions. */
+ parameters->max_handlers = INT_MAX;
+ parameters->max_nargs = INT_MAX;
+ parameters->max_request_short = INT_MAX;
+ parameters->max_reply_short = INT_MAX;
+
+ return PSM2_OK;
+}
+
+static
+psm2_error_t
+self_am_short_request(psm2_epaddr_t epaddr,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ void *src, size_t len, int flags,
+ psm2_am_completion_fn_t completion_fn,
+ void *completion_ctxt)
+{
+ struct psm2_ep_am_handle_entry *hentry;
+ psm2_ep_t ep = ((struct ptl_self *)(epaddr->ptlctl->ptl))->ep;
+ struct psmi_am_token tok;
+
+ tok.epaddr_incoming = epaddr;
+
+ hentry = psm_am_get_handler_function(ep, handler);
+
+ /* Note that a guard for hentry != NULL is not needed here because at
+ * initialization, a psmi_assert_always() assures the entry will be
+ * non-NULL. */
+
+ if (likely(hentry->version == PSM2_AM_HANDLER_V2)) {
+ psm2_am_handler_2_fn_t hfn2 =
+ (psm2_am_handler_2_fn_t)hentry->hfn;
+ hfn2(&tok, args, nargs, src, len, hentry->hctx);
+ } else {
+ psm2_am_handler_fn_t hfn1 =
+ (psm2_am_handler_fn_t)hentry->hfn;
+ hfn1(&tok, args, nargs, src, len);
+ }
+
+ if (completion_fn) {
+ completion_fn(completion_ctxt);
+ }
+
+ return PSM2_OK;
+}
+
+static
+psm2_error_t
+self_am_short_reply(psm2_am_token_t token,
+ psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+ void *src, size_t len, int flags,
+ psm2_am_completion_fn_t completion_fn, void *completion_ctxt)
+{
+ struct psm2_ep_am_handle_entry *hentry;
+ struct psmi_am_token *tok = token;
+ struct ptl_self *ptl = (struct ptl_self *)tok->epaddr_incoming->ptlctl->ptl;
+ psm2_ep_t ep = ptl->ep;
+
+ hentry = psm_am_get_handler_function(ep, handler);
+
+ /* Note that a guard for hentry != NULL is not needed here because at
+ * initialization, a psmi_assert_always() assures the entry will be
+ * non-NULL.
*/ + + if (likely(hentry->version == PSM2_AM_HANDLER_V2)) { + psm2_am_handler_2_fn_t hfn2 = + (psm2_am_handler_2_fn_t)hentry->hfn; + hfn2(token, args, nargs, src, len, hentry->hctx); + } else { + psm2_am_handler_fn_t hfn1 = + (psm2_am_handler_fn_t)hentry->hfn; + hfn1(token, args, nargs, src, len); + } + + if (completion_fn) { + completion_fn(completion_ctxt); + } + + return PSM2_OK; +} + +static +psm2_error_t +self_connect(ptl_t *ptl_gen, + int numep, + const psm2_epid_t array_of_epid[], + const int array_of_epid_mask[], + psm2_error_t array_of_errors[], + psm2_epaddr_t array_of_epaddr[], uint64_t timeout_ns) +{ + struct ptl_self *ptl = (struct ptl_self *)ptl_gen; + psmi_assert_always(ptl->epaddr != NULL); + psm2_error_t err = PSM2_OK; + int i; + + PSMI_LOCK_ASSERT(ptl->ep->mq->progress_lock); + + for (i = 0; i < numep; i++) { + if (!array_of_epid_mask[i]) + continue; + + if (array_of_epid[i] == ptl->epid) { + array_of_epaddr[i] = ptl->epaddr; + array_of_epaddr[i]->ptlctl = ptl->ctl; + array_of_epaddr[i]->epid = ptl->epid; + if (psmi_epid_set_hostname(psm2_epid_nid(ptl->epid), + psmi_gethostname(), 0)) { + err = PSM2_NO_MEMORY; + goto fail; + } + psmi_epid_add(ptl->ep, ptl->epid, ptl->epaddr); + array_of_errors[i] = PSM2_OK; + } else { + array_of_epaddr[i] = NULL; + array_of_errors[i] = PSM2_EPID_UNREACHABLE; + } + } + +fail: + return err; +} + +static +psm2_error_t +self_disconnect(ptl_t *ptl_gen, int force, int numep, + psm2_epaddr_t array_of_epaddr[], + const int array_of_epaddr_mask[], + psm2_error_t array_of_errors[], uint64_t timeout_in) +{ + struct ptl_self *ptl = (struct ptl_self *)ptl_gen; + int i; + for (i = 0; i < numep; i++) { + if (array_of_epaddr_mask[i] == 0) + continue; + + if (array_of_epaddr[i] == ptl->epaddr) { + psmi_epid_remove(ptl->ep, ptl->epid); + array_of_errors[i] = PSM2_OK; + } + } + return PSM2_OK; +} + +static +size_t self_ptl_sizeof(void) +{ + return sizeof(struct ptl_self); +} + +ustatic +psm2_error_t self_ptl_init(const psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) +{ + struct ptl_self *ptl = (struct ptl_self *)ptl_gen; + psmi_assert_always(ep != NULL); + psmi_assert_always(ep->epaddr != NULL); + psmi_assert_always(ep->epid != 0); + + ptl->ep = ep; + ptl->epid = ep->epid; + ptl->epaddr = ep->epaddr; + ptl->ctl = ctl; + + memset(ctl, 0, sizeof(*ctl)); + /* Fill in the control structure */ + ctl->ptl = ptl_gen; + ctl->ep = ep; + ctl->ep_poll = NULL; + ctl->ep_connect = self_connect; + ctl->ep_disconnect = self_disconnect; + + ctl->mq_send = self_mq_send; + ctl->mq_isend = self_mq_isend; + + ctl->am_get_parameters = self_am_get_parameters; + ctl->am_short_request = self_am_short_request; + ctl->am_short_reply = self_am_short_reply; + + /* No stats in self */ + ctl->epaddr_stats_num = NULL; + ctl->epaddr_stats_init = NULL; + ctl->epaddr_stats_get = NULL; + + return PSM2_OK; +} + +static psm2_error_t self_ptl_fini(ptl_t *ptl, int force, uint64_t timeout_ns) +{ + return PSM2_OK; /* nothing to do */ +} + +static +psm2_error_t +self_ptl_setopt(const void *component_obj, int optname, + const void *optval, uint64_t optlen) +{ + /* No options for SELF PTL at the moment */ + return psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Unknown SELF ptl option %u.", optname); +} + +static +psm2_error_t +self_ptl_getopt(const void *component_obj, int optname, + void *optval, uint64_t *optlen) +{ + /* No options for SELF PTL at the moment */ + return psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Unknown SELF ptl option %u.", optname); +} + +/* Only symbol we expose out of here */ +struct 
ptl_ctl_init
+psmi_ptl_self = {
+ self_ptl_sizeof, self_ptl_init, self_ptl_fini, self_ptl_setopt,
+ self_ptl_getopt
+};
diff --git a/ptl_self/ptl_fwd.h b/ptl_self/ptl_fwd.h
new file mode 100644
index 0000000..77ee7f9
--- /dev/null
+++ b/ptl_self/ptl_fwd.h
@@ -0,0 +1,62 @@
+/*
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2015 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Intel Corporation, www.intel.com
+
+ BSD LICENSE
+
+ Copyright(c) 2015 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _PTL_FWD_SELF_H
+#define _PTL_FWD_SELF_H
+
+/* Symbol in self ptl */
+struct ptl_ctl_init psmi_ptl_self;
+
+#endif
diff --git a/rpm_release_extension b/rpm_release_extension
new file mode 100644
index 0000000..0d6dd55
--- /dev/null
+++ b/rpm_release_extension
@@ -0,0 +1 @@
+91_1
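
For reference, a minimal caller-side sketch of how the self PTL added in ptl_self/ptl.c is exercised through the public PSM2 MQ API (psm2.h / psm2_mq.h). This is illustrative only, not part of the patch: error checking is elided, the compile line (cc loopback.c -lpsm2) is an assumption, and running without an hfi1 context is assumed to work when the environment restricts PSM2 to the self device (e.g. PSM2_DEVICES=self). Connecting an endpoint to its own epid is what self_connect() resolves to the loopback epaddr; posting the receive before the send lets ptl_handle_rtsmatch() perform the copy immediately, whereas sending first would exercise the self_mq_send_testwait() system-buffer path instead.

#include <psm2.h>
#include <psm2_mq.h>
#include <stdio.h>

int main(void)
{
	int ver_major = PSM2_VERNO_MAJOR, ver_minor = PSM2_VERNO_MINOR;
	psm2_uuid_t uuid;
	struct psm2_ep_open_opts opts;
	psm2_ep_t ep;
	psm2_epid_t epid;
	psm2_epaddr_t self_addr;
	psm2_error_t conn_err;
	int mask = 1;
	psm2_mq_t mq;
	psm2_mq_tag_t tag, tagsel;
	psm2_mq_req_t rreq;
	char sbuf[] = "loopback", rbuf[sizeof(sbuf)];

	psm2_init(&ver_major, &ver_minor);
	psm2_uuid_generate(uuid);
	psm2_ep_open_opts_get_defaults(&opts);
	psm2_ep_open(uuid, &opts, &ep, &epid);

	/* Connecting to our own epid is matched by self_connect() above and
	 * yields the self PTL's loopback epaddr. */
	psm2_ep_connect(ep, 1, &epid, &mask, &conn_err, &self_addr, 0);
	psm2_mq_init(ep, PSM2_MQ_ORDERMASK_ALL, NULL, 0, &mq);

	tag.tag[0] = 0; tag.tag[1] = 0; tag.tag[2] = 42;
	tagsel.tag[0] = tagsel.tag[1] = tagsel.tag[2] = 0xFFFFFFFF;

	/* Post the receive first so the send's psmi_mq_handle_rts() matches
	 * immediately and ptl_handle_rtsmatch() copies the bytes. */
	psm2_mq_irecv2(mq, PSM2_MQ_ANY_ADDR, &tag, &tagsel, 0, rbuf,
		       sizeof(rbuf), NULL, &rreq);
	psm2_mq_send2(mq, self_addr, 0, &tag, sbuf, sizeof(sbuf));
	psm2_mq_wait2(&rreq, NULL);
	printf("received: %s\n", rbuf);

	psm2_mq_finalize(mq);
	psm2_ep_close(ep, PSM2_EP_CLOSE_GRACEFUL, 0);
	psm2_finalize();
	return 0;
}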